X-Git-Url: http://www.iramuteq.org/git?a=blobdiff_plain;f=textstat.py;h=56e0942a91b73e5abc3117d7c2f6ec699a00b7f4;hb=refs%2Fheads%2F3.0;hp=7fdf5bd06cc2d3d051c61f3e1e8759292e53ed3c;hpb=b84be44daa86062735190970d5c6b855f227a7d7;p=iramuteq diff --git a/textstat.py b/textstat.py index 7fdf5bd..56e0942 100755 --- a/textstat.py +++ b/textstat.py @@ -1,23 +1,32 @@ -#!/bin/env python # -*- coding: utf-8 -*- #Author: Pierre Ratinaud -#Copyright (c) 2008-2012 Pierre Ratinaud +#Copyright (c) 2008-2020 Pierre Ratinaud +#modification pour python 3 : Laurent Mérat, 6x7 - mai 2020 #License: GNU/GPL -from chemins import ffr -from analysetxt import AnalyseText -from functions import sortedby, progressbar, exec_rcode, check_Rresult +#------------------------------------ +# import des modules python +#------------------------------------ import tempfile from time import sleep import logging -import gettext -_ = gettext.gettext -logger = logging.getLogger('iramuteq.textstat') +import langue +langue.run() + +#------------------------------------ +# import des fichiers du projet +#------------------------------------ +from chemins import ffr +from analysetxt import AnalyseText +from functions import sortedby, progressbar, exec_rcode, check_Rresult +logger = logging.getLogger('iramuteq.textstat') + class Stat(AnalyseText) : + def doanalyse(self) : self.make_stats() @@ -25,13 +34,12 @@ class Stat(AnalyseText) : return self.parametres def make_stats(self): - if self.dlg : - if not 'dlg' in dir(self) : - self.dlg = progressbar(self, 7) - +# if self.dlg : +# if not 'dlg' in dir(self) : + self.dlg = progressbar(self, 7) formes = self.corpus.lems tot = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].freq > 1] - tot = sortedby(tot, 2,1) + tot = sortedby(tot, 2, 1) tot = [[i, val] for i, val in enumerate(tot)] hapax = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].freq == 1] hapax = sortedby(hapax, 1, 1) @@ -41,27 +49,23 @@ class Stat(AnalyseText) : act = [[i, val] for i, val in enumerate(act)] supp = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].act == 2] supp = sortedby(supp, 2, 1) - supp = [[i, val] for i, val in enumerate(supp)] - ucesize = self.corpus.getucesize() with open(self.pathout['stsize.csv'], 'w') as f : - f.write('\n'.join([`val` for val in ucesize])) - - self.result = {u'total' : dict(tot), u'formes_actives' : dict(act), u'formes_supplémentaires' : dict(supp), u'hapax' : dict(hapax), u'glob' : ''} + f.write('\n'.join([repr(val) for val in ucesize])) + self.result = {'total' : dict(tot), 'formes_actives' : dict(act), 'formes_supplémentaires' : dict(supp), 'hapax' : dict(hapax), 'glob' : ''} occurrences = sum([val[1][1] for val in tot]) + len(hapax) phapax = (float(len(hapax)) / float(occurrences)) * 100 phapax_forme = (float(len(hapax)) / (float(len(formes)))) * 100 moy_occu_mot = float(occurrences) / float(len(formes)) - txt = ''.join([_(u'Abstract').decode('utf8'), '\n']) - txt += ''.join([_(u'Number of texts').decode('utf8'),' : ', '%i\n' % len(self.corpus.ucis)]) - txt += ''.join([_(u"Number of occurrences").decode('utf8'),' : %i\n' % occurrences]) - txt += ''.join([_(u'Number of forms').decode('utf8'), ' : %i\n' % (len(formes))]) - txt += ''.join([_(u"Number of hapax").decode('utf8'),' : %i (%.2f%%' % (len(hapax),phapax), _(u'of occurrences').decode('utf8'), ' - %.2f%% ' % phapax_forme, _(u'of forms').decode('utf8'), ')\n']) - #print float(occurrences), float(len(self.corpus.ucis)) - txt += ''.join([_(u"Mean of occurrences by text").decode('utf8'), ' : %.2f' % (float(occurrences)/float(len(self.corpus.ucis)))]) + txt = ''.join([_('Abstract'), '\n']) + txt += ''.join([_('Number of texts'),' : ', '%i\n' % len(self.corpus.ucis)]) + txt += ''.join([_("Number of occurrences"),' : %i\n' % occurrences]) + txt += ''.join([_('Number of forms'), ' : %i\n' % (len(formes))]) + txt += ''.join([_("Number of hapax"),' : %i (%.2f%% ' % (len(hapax),phapax), _('of occurrences'), ' - %.2f%% ' % phapax_forme, _('of forms'), ')\n']) + txt += ''.join([_("Mean of occurrences by text"), ' : %.2f' % (float(occurrences)/float(len(self.corpus.ucis)))]) if self.dlg : - self.dlg.Update(7, u'Ecriture...') + self.dlg.Update(7, 'Ecriture...') self.result['glob'] = txt self.print_result() # for Zipf grap @@ -84,9 +88,9 @@ class Stat(AnalyseText) : open_file_graph("%s", width = 400, height = 400) barplot(table(stsize[,1])) dev.off() - """ % (self.pathout['stsize.csv'], self.pathout['segments_size.png']) + """ % (ffr(self.pathout['stsize.csv']), ffr(self.pathout['segments_size.png'])) tmpscript = tempfile.mktemp(dir=self.parent.TEMPDIR) - with open(tmpscript, 'w') as f : + with open(tmpscript, 'w', encoding='utf8') as f : f.write(txt) pid = exec_rcode(self.parent.RPath, tmpscript, wait = False) while pid.poll() == None : @@ -99,9 +103,9 @@ class Stat(AnalyseText) : for key in self.result : if key != 'glob' : dico = self.result[key] - toprint = [[dico[val][0],`dico[val][1]`, dico[val][2]] for val in dico] - with open(self.pathout['%s.csv' % key], 'w') as f : - f.write('\n'.join([';'.join([val for val in ligne]) for ligne in toprint]).encode(self.parent.syscoding)) + toprint = [[dico[val][0],repr(dico[val][1]), dico[val][2]] for val in dico] + with open(self.pathout['%s.csv' % key], 'w', encoding='utf8') as f : + f.write('\n'.join([';'.join([val for val in ligne]) for ligne in toprint])) else : - with open(self.pathout['%s.txt' % 'glob'], 'w') as f : - f.write(self.result['glob'].encode(self.parent.syscoding, errors='replace')) + with open(self.pathout['%s.txt' % 'glob'], 'w', encoding='utf8') as f : + f.write(self.result['glob'])