www.iramuteq.org Git - iramuteq/blob - textstat.py

   1 # -*- coding: utf-8 -*-
   2 #Author: Pierre Ratinaud
   3 #Copyright (c) 2008-2020 Pierre Ratinaud
   4 #modification pour python 3 : Laurent Mérat, 6x7 - mai 2020
   5 #License: GNU/GPL
   6
   7 #------------------------------------
   8 # import des modules python
   9 #------------------------------------
  10 import tempfile
  11 from time import sleep
  12 import logging
  13
  14 import langue
  15 langue.run()
  16
  17 #------------------------------------
  18 # import des fichiers du projet
  19 #------------------------------------
  20 from chemins import ffr
  21 from analysetxt import AnalyseText
  22 from functions import sortedby, progressbar, exec_rcode, check_Rresult
  23
  24
  25 logger = logging.getLogger('iramuteq.textstat')
  26
  27
  28 class Stat(AnalyseText) :
  29
  30     def doanalyse(self) :
  31         self.make_stats()
  32
  33     def preferences(self) :
  34         return self.parametres
  35
  36     def make_stats(self):
  37 #        if self.dlg :
  38 #        if not 'dlg' in dir(self) :
  39         self.dlg = progressbar(self, 7)
  40         formes = self.corpus.lems
  41         tot = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].freq > 1]
  42         tot = sortedby(tot, 2, 1)
  43         tot = [[i, val] for i, val in enumerate(tot)]
  44         hapax = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].freq == 1]
  45         hapax = sortedby(hapax, 1, 1)
  46         hapax = [[i, val] for i, val in enumerate(hapax)]
  47         act = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].act == 1]
  48         act = sortedby(act, 2, 1)
  49         act = [[i, val] for i, val in enumerate(act)]
  50         supp = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].act == 2]
  51         supp = sortedby(supp, 2, 1)
  52         supp = [[i, val] for i, val in enumerate(supp)]
  53         ucesize = self.corpus.getucesize()
  54         with open(self.pathout['stsize.csv'], 'w') as f :
  55             f.write('\n'.join([repr(val) for val in ucesize]))
  56         self.result = {'total' : dict(tot), 'formes_actives' : dict(act), 'formes_supplémentaires' : dict(supp), 'hapax' : dict(hapax), 'glob' : ''}
  57         occurrences = sum([val[1][1] for val in tot]) + len(hapax)
  58         phapax = (float(len(hapax)) / float(occurrences)) * 100
  59         phapax_forme = (float(len(hapax)) / (float(len(formes)))) * 100
  60         moy_occu_mot = float(occurrences) / float(len(formes))
  61         txt = ''.join([_('Abstract'), '\n'])
  62         txt += ''.join([_('Number of texts'),' : ', '%i\n' % len(self.corpus.ucis)])
  63         txt += ''.join([_("Number of occurrences"),' : %i\n' % occurrences])
  64         txt += ''.join([_('Number of forms'), ' : %i\n' % (len(formes))])
  65         txt += ''.join([_("Number of hapax"),' : %i (%.2f%% ' % (len(hapax),phapax), _('of occurrences'), ' - %.2f%% ' % phapax_forme, _('of forms'), ')\n'])
  66         txt += ''.join([_("Mean of occurrences by text"), ' : %.2f' % (float(occurrences)/float(len(self.corpus.ucis)))])
  67         if self.dlg :
  68             self.dlg.Update(7, 'Ecriture...')
  69         self.result['glob'] = txt
  70         self.print_result()
  71         # for Zipf grap
  72         txt = """
  73         source("%s")
  74         tot <- read.csv2("%s", header = FALSE, row.names = 1)
  75         """ % (ffr(self.parent.RscriptsPath['Rgraph']), ffr(self.pathout['total.csv']))
  76         if len(hapax) :
  77             txt += """
  78             hapax <- read.csv2("%s", header = FALSE, row.names = 1)
  79             tot <- rbind(tot, hapax)
  80             """ % ffr(self.pathout['hapax.csv'])
  81         txt += """
  82         open_file_graph("%s", width = 400, height = 400)
  83         plot(tot[,1], log = 'xy', xlab='log(rangs)', ylab = 'log(frequences)', col = 'red', pch=16)
  84         dev.off()
  85         """ % (ffr(self.pathout['zipf.png']))
  86         txt += """
  87         stsize <- read.csv2("%s", header=F)
  88         open_file_graph("%s", width = 400, height = 400)
  89         barplot(table(stsize[,1]))
  90         dev.off()
  91         """ % (ffr(self.pathout['stsize.csv']), ffr(self.pathout['segments_size.png']))
  92         tmpscript = tempfile.mktemp(dir=self.parent.TEMPDIR)
  93         with open(tmpscript, 'w', encoding='utf8') as f :
  94             f.write(txt)
  95         pid = exec_rcode(self.parent.RPath, tmpscript, wait = False)
  96         while pid.poll() == None :
  97             sleep(0.2)
  98         check_Rresult(self.parent, pid)
  99         if self.dlg :
 100             self.dlg.Destroy()
 101
 102     def print_result(self) :
 103         for key in self.result :
 104             if key != 'glob' :
 105                 dico = self.result[key]
 106                 toprint = [[dico[val][0],repr(dico[val][1]), dico[val][2]] for val in dico]
 107                 with open(self.pathout['%s.csv' % key], 'w',  encoding='utf8') as f :
 108                     f.write('\n'.join([';'.join([val for val in ligne]) for ligne in toprint]))
 109             else :
 110                 with open(self.pathout['%s.txt' % 'glob'], 'w',  encoding='utf8') as f :
 111                     f.write(self.result['glob'])