X-Git-Url: http://www.iramuteq.org/git?a=blobdiff_plain;f=textstat.py;h=56e0942a91b73e5abc3117d7c2f6ec699a00b7f4;hb=refs%2Fheads%2F3.0;hp=c25b9dcea30647650b437659f22f85903056131d;hpb=9b78e6210e7fc88a7e77d178c4090aabb23580d9;p=iramuteq diff --git a/textstat.py b/textstat.py old mode 100644 new mode 100755 index c25b9dc..56e0942 --- a/textstat.py +++ b/textstat.py @@ -1,21 +1,32 @@ -#!/bin/env python # -*- coding: utf-8 -*- #Author: Pierre Ratinaud -#Copyright (c) 2008-2012 Pierre Ratinaud +#Copyright (c) 2008-2020 Pierre Ratinaud +#modification pour python 3 : Laurent Mérat, 6x7 - mai 2020 #License: GNU/GPL -from chemins import ffr -from analysetxt import AnalyseText -from functions import sortedby, progressbar, exec_rcode, check_Rresult +#------------------------------------ +# import des modules python +#------------------------------------ import tempfile from time import sleep import logging -logger = logging.getLogger('iramuteq.textstat') +import langue +langue.run() + +#------------------------------------ +# import des fichiers du projet +#------------------------------------ +from chemins import ffr +from analysetxt import AnalyseText +from functions import sortedby, progressbar, exec_rcode, check_Rresult +logger = logging.getLogger('iramuteq.textstat') + class Stat(AnalyseText) : + def doanalyse(self) : self.make_stats() @@ -23,13 +34,12 @@ class Stat(AnalyseText) : return self.parametres def make_stats(self): - if self.dlg : - if not 'dlg' in dir(self) : - self.dlg = progressbar(self, 7) - +# if self.dlg : +# if not 'dlg' in dir(self) : + self.dlg = progressbar(self, 7) formes = self.corpus.lems tot = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].freq > 1] - tot = sortedby(tot, 2,1) + tot = sortedby(tot, 2, 1) tot = [[i, val] for i, val in enumerate(tot)] hapax = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].freq == 1] hapax = sortedby(hapax, 1, 1) @@ -37,25 +47,25 @@ class Stat(AnalyseText) : act = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].act == 1] act = sortedby(act, 2, 1) act = [[i, val] for i, val in enumerate(act)] - supp = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].act == 2] + supp = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].act == 2] supp = sortedby(supp, 2, 1) - supp = [[i, val] for i, val in enumerate(supp)] - - self.result = {u'total' : dict(tot), u'formes_actives' : dict(act), u'formes_supplémentaires' : dict(supp), u'hapax' : dict(hapax), u'glob' : ''} + ucesize = self.corpus.getucesize() + with open(self.pathout['stsize.csv'], 'w') as f : + f.write('\n'.join([repr(val) for val in ucesize])) + self.result = {'total' : dict(tot), 'formes_actives' : dict(act), 'formes_supplémentaires' : dict(supp), 'hapax' : dict(hapax), 'glob' : ''} occurrences = sum([val[1][1] for val in tot]) + len(hapax) phapax = (float(len(hapax)) / float(occurrences)) * 100 phapax_forme = (float(len(hapax)) / (float(len(formes)))) * 100 moy_occu_mot = float(occurrences) / float(len(formes)) - txt = ''.join([_(u'Abstract').decode('utf8'), '\n']) - txt += ''.join([_(u'Number of texts').decode('utf8'),' : ', '%i\n' % len(self.corpus.ucis)]) - txt += ''.join([_(u"Number of occurrences").decode('utf8'),' : %i\n' % occurrences]) - txt += ''.join([_(u'Number of forms').decode('utf8'), ' : %i\n' % (len(formes))]) - txt += ''.join([_(u"Number of hapax").decode('utf8'),' : %i (%.2f%%' % (len(hapax),phapax), _(u'of occurrences').decode('utf8'), ' - %.2f%% ' % phapax_forme, _(u'of forms').decode('utf8'), ')\n']) - #print float(occurrences), float(len(self.corpus.ucis)) - txt += ''.join([_(u"Mean of occurrences by text").decode('utf8'), ' : %.2f' % (float(occurrences)/float(len(self.corpus.ucis)))]) + txt = ''.join([_('Abstract'), '\n']) + txt += ''.join([_('Number of texts'),' : ', '%i\n' % len(self.corpus.ucis)]) + txt += ''.join([_("Number of occurrences"),' : %i\n' % occurrences]) + txt += ''.join([_('Number of forms'), ' : %i\n' % (len(formes))]) + txt += ''.join([_("Number of hapax"),' : %i (%.2f%% ' % (len(hapax),phapax), _('of occurrences'), ' - %.2f%% ' % phapax_forme, _('of forms'), ')\n']) + txt += ''.join([_("Mean of occurrences by text"), ' : %.2f' % (float(occurrences)/float(len(self.corpus.ucis)))]) if self.dlg : - self.dlg.Update(7, u'Ecriture...') + self.dlg.Update(7, 'Ecriture...') self.result['glob'] = txt self.print_result() # for Zipf grap @@ -73,8 +83,14 @@ class Stat(AnalyseText) : plot(tot[,1], log = 'xy', xlab='log(rangs)', ylab = 'log(frequences)', col = 'red', pch=16) dev.off() """ % (ffr(self.pathout['zipf.png'])) + txt += """ + stsize <- read.csv2("%s", header=F) + open_file_graph("%s", width = 400, height = 400) + barplot(table(stsize[,1])) + dev.off() + """ % (ffr(self.pathout['stsize.csv']), ffr(self.pathout['segments_size.png'])) tmpscript = tempfile.mktemp(dir=self.parent.TEMPDIR) - with open(tmpscript, 'w') as f : + with open(tmpscript, 'w', encoding='utf8') as f : f.write(txt) pid = exec_rcode(self.parent.RPath, tmpscript, wait = False) while pid.poll() == None : @@ -87,9 +103,9 @@ class Stat(AnalyseText) : for key in self.result : if key != 'glob' : dico = self.result[key] - toprint = [[dico[val][0],`dico[val][1]`, dico[val][2]] for val in dico] - with open(self.pathout['%s.csv' % key], 'w') as f : - f.write('\n'.join([';'.join([val for val in ligne]) for ligne in toprint]).encode(self.parent.syscoding)) + toprint = [[dico[val][0],repr(dico[val][1]), dico[val][2]] for val in dico] + with open(self.pathout['%s.csv' % key], 'w', encoding='utf8') as f : + f.write('\n'.join([';'.join([val for val in ligne]) for ligne in toprint])) else : - with open(self.pathout['%s.txt' % 'glob'], 'w') as f : - f.write(self.result['glob'].encode(self.parent.syscoding)) + with open(self.pathout['%s.txt' % 'glob'], 'w', encoding='utf8') as f : + f.write(self.result['glob'])