# -*- coding: utf-8 -*-
#Author: Pierre Ratinaud
#Copyright (c) 2008-2012 Pierre Ratinaud
7 from chemins import ffr
8 from analysetxt import AnalyseText
9 from functions import sortedby, progressbar, exec_rcode, check_Rresult
11 from time import sleep
14 logger = logging.getLogger('iramuteq.textstat')
18 class Stat(AnalyseText) :
def preferences(self):
    """Return the parameters of this analysis.

    Returns:
        The object stored in ``self.parametres``, unchanged (no copy is
        made).
    """
    return self.parametres
# NOTE(review): this is the body of a method whose `def` line is not visible
# in this excerpt.  Every line carries a stray leading number and has lost its
# indentation; gaps in that numbering suggest that some statements (the
# method header, the openings of the R-script strings, the write of the
# script and the body of the polling loop) are missing here.  The comments
# below describe only what the visible lines do.
#
# Create the progress dialog only if the instance does not already have one.
27 if not 'dlg' in dir(self) :
28 self.dlg = progressbar(self, 7)
# Build four tables of [form, frequency, grammatical tag] rows from the
# corpus lemmas: frequency > 1 (tot), frequency == 1 (hapax), act == 1 (act)
# and act == 2 (supp).  Each table is ordered by `sortedby` (helper not shown
# here; the meaning of its two numeric arguments should be confirmed) and
# then turned into [position, row] pairs with enumerate.
30 formes = self.corpus.lems
31 tot = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].freq > 1]
32 tot = sortedby(tot, 2,1)
33 tot = [[i, val] for i, val in enumerate(tot)]
34 hapax = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].freq == 1]
35 hapax = sortedby(hapax, 1, 1)
36 hapax = [[i, val] for i, val in enumerate(hapax)]
37 act = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].act == 1]
38 act = sortedby(act, 2, 1)
39 act = [[i, val] for i, val in enumerate(act)]
40 supp = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].act == 2]
41 supp = sortedby(supp, 2, 1)
43 supp = [[i, val] for i, val in enumerate(supp)]
# dict() turns each list of [position, row] pairs into a position -> row
# mapping.  'glob' starts as an empty string and is replaced by the text
# summary further down.
45 self.result = {u'total' : dict(tot), u'formes_actives' : dict(act), u'formes_supplémentaires' : dict(supp), u'hapax' : dict(hapax), u'glob' : ''}
# Total occurrences = summed frequencies of the forms seen more than once,
# plus one occurrence for each hapax (their frequency is 1 by construction).
46 occurrences = sum([val[1][1] for val in tot]) + len(hapax)
# Share of hapax among occurrences and among distinct forms, in percent.
# NOTE(review): these divisions raise ZeroDivisionError when the corpus has
# no forms (and the per-text mean below does so when it has no texts).
47 phapax = (float(len(hapax)) / float(occurrences)) * 100
48 phapax_forme = (float(len(hapax)) / (float(len(formes)))) * 100
# NOTE(review): moy_occu_mot is not used by any line visible in this excerpt.
49 moy_occu_mot = float(occurrences) / float(len(formes))
# Assemble the plain-text summary.  `_` is presumably the gettext translation
# function installed elsewhere; its result is decoded from UTF-8 before being
# joined -- confirm against the application setup.
50 txt = ''.join([_(u'Abstract').decode('utf8'), '\n'])
51 txt += ''.join([_(u'Number of texts').decode('utf8'),' : ', '%i\n' % len(self.corpus.ucis)])
52 txt += ''.join([_(u"Number of occurrences").decode('utf8'),' : %i\n' % occurrences])
53 txt += ''.join([_(u'Number of forms').decode('utf8'), ' : %i\n' % (len(formes))])
54 txt += ''.join([_(u"Number of hapax").decode('utf8'),' : %i (%.2f%%' % (len(hapax),phapax), _(u'of occurrences').decode('utf8'), ' - %.2f%% ' % phapax_forme, _(u'of forms').decode('utf8'), ')\n'])
55 #print float(occurrences), float(len(self.corpus.ucis))
56 txt += ''.join([_(u"Mean of occurrences by text").decode('utf8'), ' : %.2f' % (float(occurrences)/float(len(self.corpus.ucis)))])
# Move the progress dialog to step 7 and store the summary as the 'glob'
# result.  The lines that follow are fragments of R source held in Python
# string literals (their opening quotes are not visible here): they read
# total.csv, append the rows of hapax.csv, and plot the first column on
# log-log axes into zipf.png.
58 self.dlg.Update(7, u'Ecriture...')
59 self.result['glob'] = txt
64 tot <- read.csv2("%s", header = FALSE, row.names = 1)
65 """ % (ffr(self.parent.RscriptsPath['Rgraph']), ffr(self.pathout['total.csv']))
68 hapax <- read.csv2("%s", header = FALSE, row.names = 1)
69 tot <- rbind(tot, hapax)
70 """ % ffr(self.pathout['hapax.csv'])
72 open_file_graph("%s", width = 400, height = 400)
73 plot(tot[,1], log = 'xy', xlab='log(rangs)', ylab = 'log(frequences)', col = 'red', pch=16)
75 """ % (ffr(self.pathout['zipf.png']))
# Pick a temporary file name inside the application's TEMPDIR and open it for
# writing (the statement that writes into it is not visible in this excerpt).
# NOTE(review): tempfile.mktemp() only returns a name and is documented as
# unsafe, because another process can create the file first; consider
# tempfile.mkstemp() or NamedTemporaryFile(delete=False).
76 tmpscript = tempfile.mktemp(dir=self.parent.TEMPDIR)
77 with open(tmpscript, 'w') as f :
# Launch the script through exec_rcode without waiting, then poll until
# poll() stops returning None and hand the handle to check_Rresult.
# `pid` presumably is a subprocess.Popen-like object -- confirm in
# functions.exec_rcode.
# NOTE(review): prefer `is None` over `== None` for this comparison.
79 pid = exec_rcode(self.parent.RPath, tmpscript, wait = False)
80 while pid.poll() == None :
82 check_Rresult(self.parent, pid)
def print_result(self):
    """Write every entry of ``self.result`` to its output file.

    Table entries (mappings of a row index to a
    ``[form, frequency, grammatical tag]`` row) are written to
    ``self.pathout['<key>.csv']``, one ``form;frequency;tag`` line per row,
    in ascending index order.  The ``'glob'`` entry is the plain-text
    summary, not a table, and is written verbatim to
    ``self.pathout['glob.txt']``.

    All files are encoded with ``self.parent.syscoding``.

    Raises:
        KeyError: if ``self.pathout`` has no entry for a required file name.
        UnicodeEncodeError: if the text cannot be encoded with
            ``self.parent.syscoding``.
    """
    # Local import keeps this method self-contained; io.open gives the same
    # encoding-aware text-mode handle on Python 2 and Python 3, whereas
    # writing pre-encoded bytes to open(..., 'w') only works on Python 2.
    import io

    encoding = self.parent.syscoding
    for key in self.result:
        if key == 'glob':
            # The summary is a string; indexing it like a table would fail.
            summary = self.result['glob']
            if isinstance(summary, bytes):
                # e.g. the empty placeholder '' on Python 2
                summary = summary.decode(encoding)
            with io.open(self.pathout['%s.txt' % 'glob'], 'w',
                         encoding=encoding) as f:
                f.write(summary)
            continue
        table = self.result[key]
        # Sort by row index so the file order never depends on the
        # iteration order of the dict.
        lines = [
            u';'.join([table[idx][0], repr(table[idx][1]), table[idx][2]])
            for idx in sorted(table)
        ]
        with io.open(self.pathout['%s.csv' % key], 'w',
                     encoding=encoding) as f:
            f.write(u'\n'.join(lines))