X-Git-Url: http://www.iramuteq.org/git?a=blobdiff_plain;f=textstat.py;h=56e0942a91b73e5abc3117d7c2f6ec699a00b7f4;hb=refs%2Fheads%2F3.0;hp=c25b9dcea30647650b437659f22f85903056131d;hpb=9b78e6210e7fc88a7e77d178c4090aabb23580d9;p=iramuteq

diff --git a/textstat.py b/textstat.py
old mode 100644
new mode 100755
index c25b9dc..56e0942
--- a/textstat.py
+++ b/textstat.py
@@ -1,21 +1,32 @@
-#!/bin/env python
 # -*- coding: utf-8 -*-
 #Author: Pierre Ratinaud
-#Copyright (c) 2008-2012 Pierre Ratinaud
+#Copyright (c) 2008-2020 Pierre Ratinaud
+#modification pour python 3 : Laurent Mérat, 6x7 - mai 2020
 #License: GNU/GPL
 
-from chemins import ffr
-from analysetxt import AnalyseText
-from functions import sortedby, progressbar, exec_rcode, check_Rresult 
+#------------------------------------
+# import des modules python
+#------------------------------------
 import tempfile
 from time import sleep
 import logging
 
-logger = logging.getLogger('iramuteq.textstat')
+import langue
+langue.run()
+
+#------------------------------------
+# import des fichiers du projet
+#------------------------------------
+from chemins import ffr
+from analysetxt import AnalyseText
+from functions import sortedby, progressbar, exec_rcode, check_Rresult
 
 
+logger = logging.getLogger('iramuteq.textstat')
+
 
 class Stat(AnalyseText) :
+
     def doanalyse(self) :
         self.make_stats()
 
@@ -23,13 +34,12 @@ class Stat(AnalyseText) :
         return self.parametres
 
     def make_stats(self):
-        if self.dlg :
-            if not 'dlg' in dir(self) :
-                self.dlg = progressbar(self, 7)
-
+#        if self.dlg :
+#        if not 'dlg' in dir(self) :
+        self.dlg = progressbar(self, 7)
         formes = self.corpus.lems
         tot = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].freq > 1]
-        tot = sortedby(tot, 2,1)
+        tot = sortedby(tot, 2, 1)
         tot = [[i, val] for i, val in enumerate(tot)]
         hapax = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].freq == 1]
         hapax = sortedby(hapax, 1, 1)
@@ -37,25 +47,25 @@ class Stat(AnalyseText) :
         act = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].act == 1]
         act = sortedby(act, 2, 1)
         act = [[i, val] for i, val in enumerate(act)]
-        supp = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].act == 2]        
+        supp = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].act == 2]
         supp = sortedby(supp, 2, 1)
-
         supp = [[i, val] for i, val in enumerate(supp)]
-
-        self.result = {u'total' : dict(tot), u'formes_actives' : dict(act), u'formes_supplÃ©mentaires' : dict(supp), u'hapax' : dict(hapax), u'glob' : ''}
+        ucesize = self.corpus.getucesize()
+        with open(self.pathout['stsize.csv'], 'w') as f :
+            f.write('\n'.join([repr(val) for val in ucesize]))
+        self.result = {'total' : dict(tot), 'formes_actives' : dict(act), 'formes_supplÃ©mentaires' : dict(supp), 'hapax' : dict(hapax), 'glob' : ''}
         occurrences = sum([val[1][1] for val in tot]) + len(hapax)
         phapax = (float(len(hapax)) / float(occurrences)) * 100
         phapax_forme = (float(len(hapax)) / (float(len(formes)))) * 100
         moy_occu_mot = float(occurrences) / float(len(formes))
-        txt = ''.join([_(u'Abstract').decode('utf8'), '\n'])
-        txt += ''.join([_(u'Number of texts').decode('utf8'),' : ', '%i\n' % len(self.corpus.ucis)])
-        txt += ''.join([_(u"Number of occurrences").decode('utf8'),' : %i\n' % occurrences])
-        txt += ''.join([_(u'Number of forms').decode('utf8'), ' : %i\n' % (len(formes))])
-        txt += ''.join([_(u"Number of hapax").decode('utf8'),' : %i (%.2f%%' % (len(hapax),phapax), _(u'of occurrences').decode('utf8'), ' - %.2f%% ' % phapax_forme, _(u'of forms').decode('utf8'), ')\n']) 
-        #print float(occurrences), float(len(self.corpus.ucis))
-        txt += ''.join([_(u"Mean of occurrences by text").decode('utf8'), ' : %.2f' % (float(occurrences)/float(len(self.corpus.ucis)))])
+        txt = ''.join([_('Abstract'), '\n'])
+        txt += ''.join([_('Number of texts'),' : ', '%i\n' % len(self.corpus.ucis)])
+        txt += ''.join([_("Number of occurrences"),' : %i\n' % occurrences])
+        txt += ''.join([_('Number of forms'), ' : %i\n' % (len(formes))])
+        txt += ''.join([_("Number of hapax"),' : %i (%.2f%% ' % (len(hapax),phapax), _('of occurrences'), ' - %.2f%% ' % phapax_forme, _('of forms'), ')\n'])
+        txt += ''.join([_("Mean of occurrences by text"), ' : %.2f' % (float(occurrences)/float(len(self.corpus.ucis)))])
         if self.dlg :
-            self.dlg.Update(7, u'Ecriture...')
+            self.dlg.Update(7, 'Ecriture...')
         self.result['glob'] = txt
         self.print_result()
         # for Zipf grap
@@ -73,8 +83,14 @@ class Stat(AnalyseText) :
         plot(tot[,1], log = 'xy', xlab='log(rangs)', ylab = 'log(frequences)', col = 'red', pch=16)
         dev.off()
         """ % (ffr(self.pathout['zipf.png']))
+        txt += """
+        stsize <- read.csv2("%s", header=F)
+        open_file_graph("%s", width = 400, height = 400)
+        barplot(table(stsize[,1]))
+        dev.off()
+        """ % (ffr(self.pathout['stsize.csv']), ffr(self.pathout['segments_size.png']))
         tmpscript = tempfile.mktemp(dir=self.parent.TEMPDIR)
-        with open(tmpscript, 'w') as f :
+        with open(tmpscript, 'w', encoding='utf8') as f :
             f.write(txt)
         pid = exec_rcode(self.parent.RPath, tmpscript, wait = False)
         while pid.poll() == None :
@@ -87,9 +103,9 @@ class Stat(AnalyseText) :
         for key in self.result :
             if key != 'glob' :
                 dico = self.result[key]
-                toprint = [[dico[val][0],`dico[val][1]`, dico[val][2]] for val in dico]
-                with open(self.pathout['%s.csv' % key], 'w') as f :
-                    f.write('\n'.join([';'.join([val for val in ligne]) for ligne in toprint]).encode(self.parent.syscoding))
+                toprint = [[dico[val][0],repr(dico[val][1]), dico[val][2]] for val in dico]
+                with open(self.pathout['%s.csv' % key], 'w',  encoding='utf8') as f :
+                    f.write('\n'.join([';'.join([val for val in ligne]) for ligne in toprint]))
             else :
-                with open(self.pathout['%s.txt' % 'glob'], 'w') as f :
-                    f.write(self.result['glob'].encode(self.parent.syscoding))
+                with open(self.pathout['%s.txt' % 'glob'], 'w',  encoding='utf8') as f :
+                    f.write(self.result['glob'])