1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique, progressbar
16 from operator import itemgetter
17 from uuid import uuid4
18 from chemins import PathOut
19 from dialog import CorpusPref, SubTextFromMetaDial, MergeClusterFrame
21 from colors import colors
25 log = logging.getLogger('iramuteq.corpus')
28 def copycorpus(corpus) :
29 log.info('copy corpus')
30 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
31 copy_corpus.ucis = corpus.ucis
32 copy_corpus.formes = corpus.formes
33 copy_corpus.pathout = corpus.pathout
34 copy_corpus.conn_all()
38 return Uce(uce.ident, uce.para, uce.uci)
42 nuci = Uci(uci.ident, '')
43 nuci.etoiles = copy(uci.etoiles)
44 nuci.uces = [CopyUce(uce) for uce in uci.uces]
45 nuci.paras = copy(uci.paras)
54 def __init__(self, parent, parametres = {}, read = False) :
56 self.parametres = parametres
58 self.connformes = None
60 self.conncorpus = None
67 self.idformesuces = {}
72 self.pathout = PathOut(dirout = parametres['pathout'])
75 def add_word(self, word) :
76 if word in self.formes :
77 self.formes[word].freq += 1
78 if self.formes[word].ident in self.idformesuces :
79 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
80 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
82 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
84 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
86 if word in self.parent.lexique :
87 gramtype = self.parent.lexique[word][1]
88 lem = self.parent.lexique[word][0]
95 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
96 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
98 def add_word_from_forme(self, word, stident):
99 if word.forme in self.formes :
100 self.formes[word.forme].freq += 1
101 if self.formes[word.forme].ident in self.idformesuces :
102 if stident in self.idformesuces[self.formes[word.forme].ident] :
103 self.idformesuces[self.formes[word.forme].ident][stident] += 1
105 self.idformesuces[self.formes[word.forme].ident][stident] = 1
107 self.idformesuces[self.formes[word.forme].ident] = {stident: 1}
109 self.formes[word.forme] = Word(word.forme, word.gram, len(self.formes), word.lem)
110 self.idformesuces[self.formes[word.forme].ident] = {stident : 1}
113 """connect corpus to db"""
114 if self.connformes is None :
115 log.info('connexion corpus')
116 self.connuces = sqlite3.connect(self.pathout['uces.db'])
117 self.cuces = self.connuces.cursor()
118 self.connformes = sqlite3.connect(self.pathout['formes.db'])
119 self.cformes = self.connformes.cursor()
120 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
121 self.ccorpus = self.conncorpus.cursor()
122 self.cformes.execute('PRAGMA temp_store=MEMORY;')
123 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
124 self.cformes.execute('PRAGMA synchronous = OFF;')
125 self.cuces.execute('PRAGMA temp_store=MEMORY;')
126 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
127 self.cuces.execute('PRAGMA synchronous = OFF;')
128 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
129 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
130 self.ccorpus.execute('PRAGMA synchronous = OFF;')
132 def read_corpus(self) :
133 log.info('read corpus')
134 self.parametres['syscoding'] = sys.getdefaultencoding()
135 if self.conncorpus is None :
137 res = self.ccorpus.execute('SELECT * FROM etoiles;')
139 self.ucis.append(Uci(row[0], row[1], row[2]))
140 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
142 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
143 res = self.ccorpus.execute('SELECT * FROM formes;')
144 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the uce ids in which a form occurs.

    wordid is either the id of a form or the form itself (a string); a
    string is first looked up in self.formes to obtain its id.

    The `uces` table of the forms database is queried for that id.  A stored
    value is either a single integer or a whitespace-separated string of
    integers; all values are flattened into one list of ints.
    """
    if isinstance(wordid, basestring):
        wordid = self.formes[wordid].ident
    # repr() replaces the deprecated backtick syntax; the bound value is the same.
    res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    # chain.from_iterable consumes the rows lazily instead of unpacking a
    # temporary list into the argument list of chain().
    return list(itertools.chain.from_iterable(
        [row[0]] if isinstance(row[0], int) else [int(val) for val in row[0].split()]
        for row in res))
def getworducis(self, wordid):
    """Return the distinct `uci` values of the uces in which a form occurs.

    The uce ids come from getworduces(); each one is resolved with
    getucefromid() and its `uci` attribute is collected.  Duplicates are
    dropped and the order of the returned list is unspecified.
    """
    found = set()
    for uce_id in self.getworduces(wordid):
        found.add(self.getucefromid(uce_id).uci)
    return list(found)
157 def getformeuceseff(self, formeid) :
158 if isinstance(formeid, basestring) :
159 formeid = self.formes[formeid].ident
160 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
161 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
162 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
163 res = self.cformes.execute(query)
164 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
166 for i, uce in enumerate(uces) :
167 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem):
    """Return the distinct uce ids in which any form of a lemma occurs.

    The ids of the lemma's forms (self.lems[lem].formes) are written into an
    `IN (...)` clause against the `uces` table of the forms database.  A
    stored value is either a single integer or a whitespace-separated string
    of integers; all values are flattened and de-duplicated.  The order of
    the returned list is unspecified.
    """
    # repr() replaces the deprecated backtick syntax; the SQL text is unchanged.
    formesid = ', '.join([repr(val) for val in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
    res = self.cformes.execute(query)
    # chain.from_iterable avoids building a list only to unpack it into chain().
    return list(set(itertools.chain.from_iterable(
        [row[0]] if isinstance(row[0], int) else [int(val) for val in row[0].split()]
        for row in res)))
176 def gettgenst(self, tgen):
179 if lem in self.lems :
180 formesid += self.lems[lem].formes
182 print 'abscent : %s' % lem
183 query = 'SELECT uces FROM uces where id IN %s ORDER BY id' % str(tuple(formesid))
184 res = self.cformes.execute(query)
185 return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))))
187 def gettgenstprof(self, tgen, classe, i, clnb):
190 if lem in self.lems :
191 lemst = self.getlemuces(lem)
193 if not lem in self.tgenlem :
194 self.tgenlem[lem] = [0] * clnb
195 self.tgenlem[lem][i] = len(set(lemst).intersection(classe))
197 print 'abscent: ',lem
198 return list(set(tgenst))
def gettgentxt(self, tgen):
    """Return the distinct `uci` values of the uces matched by a tgen.

    The uce ids come from gettgenst(tgen); each is resolved through
    getucefromid().  The order of the returned list is unspecified.
    """
    uce_ids = self.gettgenst(tgen)
    return list({self.getucefromid(uce_id).uci for uce_id in uce_ids})
def getlemucis(self, lem):
    """Return the distinct `uci` values of the uces containing a lemma.

    The uce ids come from getlemuces(lem); each is resolved through
    getucefromid().  The order of the returned list is unspecified.
    """
    found = set()
    for uce_id in self.getlemuces(lem):
        found.add(self.getucefromid(uce_id).uci)
    return list(found)
208 def getlemuceseff(self, lem, luces = None) :
209 formesid = ', '.join([`val` for val in self.lems[lem].formes])
210 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
211 res = self.cformes.execute(query)
212 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
213 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
214 res = self.cformes.execute(query)
215 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
217 for i, uce in enumerate(uces) :
218 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster):
    """Count the members of self.lc[cluster] that are also uces of the lemma.

    `cluster` is used directly as an index into self.lc; the lemma's uces
    come from getlemuces(lem).
    """
    cluster_uces = set(self.lc[cluster])
    shared = cluster_uces.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem):
    """Return the `freq` attribute of the entry stored for `lem` in self.lems."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid):
    """Return the entry of self.idformes stored under `formeid`.

    self.idformes is built on first use by calling make_idformes().
    """
    if self.idformes is None:
        # The id -> form index does not exist yet: build it now.
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the sum of the `freq` attribute over every entry of self.formes."""
    total = 0
    for forme in self.formes:
        total += self.formes[forme].freq
    return total
def getucemean(self):
    """Return gettotocc() divided by getucenb(), as a float."""
    occurrences = float(self.gettotocc())
    uce_count = self.getucenb()
    return occurrences / uce_count
241 return self.ucis[-1].uces[-1].ident + 1
244 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return one size per entry of self.ucis.

    For each uci, the sizes returned by getucesize() are summed over the
    slice running from the ident of its first uce to the ident of its last
    uce, inclusive (uce idents are used as list positions).
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:last + 1]))
    return totals
def getucesize(self):
    """Return, for each row of getalluces(), the number of
    whitespace-separated tokens in the row's second column."""
    sizes = []
    for row in self.getalluces():
        tokens = row[1].split()
        sizes.append(len(tokens))
    return sizes
def getconcorde(self, uces):
    """Query the `uces` table for the rows whose id is in `uces`.

    Returns what self.cuces.execute() returns; iterating it yields the
    matching rows ordered by id.

    The ids are written into the SQL text itself.
    NOTE(review): callers presumably pass integer ids only - confirm.
    """
    # repr() replaces the deprecated backtick syntax; the SQL text is unchanged.
    ids = ', '.join([repr(i) for i in uces])
    return self.cuces.execute('select * from uces where id IN (%s) ORDER BY id;' % ids)
257 def getuciconcorde(self, ucis) :
258 uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis]
259 uces = [[val[0], '\n'.join([row[1] for row in self.getconcorde(val[1])])] for val in uces]
def getwordconcorde(self, word):
    """Return getconcorde() applied to the uce ids of `word`
    (as given by getworduces())."""
    uce_ids = self.getworduces(word)
    return self.getconcorde(uce_ids)
def getlemconcorde(self, lem):
    """Return getconcorde() applied to the uce ids of the lemma `lem`
    (as given by getlemuces())."""
    uce_ids = self.getlemuces(lem)
    return self.getconcorde(uce_ids)
def getalluces(self):
    """Run `SELECT * FROM uces` on self.cuces and return the result of
    execute(), which yields every row of the table."""
    query = 'SELECT * FROM uces'
    return self.cuces.execute(query)
def getallucis(self):
    """Return one [ident, text] pair per entry of self.ucis.

    The text of a uci is the second column of the getalluces() rows of its
    uces, joined with newlines; uce idents are used as positions in the
    list of rows.
    """
    texts = [row[1] for row in self.getalluces()]
    result = []
    for uci in self.ucis:
        parts = [texts[uce.ident] for uce in uci.uces]
        result.append([uci.ident, '\n'.join(parts)])
    return result
def getucesfrometoile(self, etoile):
    """Return the idents of the uces of every uci whose `etoiles` contain
    `etoile`, in the order of self.ucis."""
    # The membership test only depends on the uci, so it is evaluated once
    # per uci rather than once per uce.
    return [uce.ident
            for uci in self.ucis if etoile in uci.etoiles
            for uce in uci.uces]
278 def getetoileuces(self) :
279 log.info('get uces etoiles')
282 for uci in self.ucis :
283 etoiles = uci.etoiles[1:]
285 if et in etoileuces :
286 etoileuces[et] += [uce.ident for uce in uci.uces]
288 etoileuces[et] = [uce.ident for uce in uci.uces]
290 for et in uci.paras :
291 if et in etoileuces :
292 etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara]
294 etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara]
300 def getetoileucis(self):
302 for uci in self.ucis :
303 etoiles = uci.etoiles[1:]
305 if et in etoileuces :
306 etoileuces[et] += [uci.ident]
308 etoileuces[et] = [uci.ident]
def getucefromid(self, uceid):
    """Return the entry of self.iduces stored under `uceid`.

    self.iduces is built on first use by calling make_iduces().
    """
    if self.iduces is None:
        # The id -> uce index does not exist yet: build it now.
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return the number of entries of self.formes whose `freq` equals 1."""
    return sum(1 for entry in self.formes.values() if entry.freq == 1)
def getactivesnb(self, key):
    """Return the number of entries of self.lems whose `act` equals `key`."""
    count = 0
    for lem in self.lems:
        if self.lems[lem].act == key:
            count += 1
    return count
320 # def make_lems(self, lem = True) :
321 # log.info('make lems')
323 # for forme in self.formes :
324 # if self.formes[forme].lem in self.lems :
325 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
326 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
328 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the `etoiles` of the uci that owns the uce `uceid`.

    A uce ident -> uci ident mapping is built into self.uceuci on first use;
    the uci ident is then used as a position in self.ucis.
    """
    if self.uceuci is None:
        mapping = {}
        for uci in self.ucis:
            for uce in uci.uces:
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    uci_pos = self.uceuci[uceid]
    return self.ucis[uci_pos].etoiles
334 def make_lems(self, lem = True) :
335 log.info('make lems')
338 for forme in self.formes :
339 if self.formes[forme].lem in self.lems :
340 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
341 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
343 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
345 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
347 def make_lems_from_dict(self, dictionnaire, dolem = True) :
348 log.info('make lems from dict')
350 for forme in self.formes :
351 if self.formes[forme].forme in dictionnaire :
352 lem = dictionnaire[forme][0]
353 gram = dictionnaire[forme][1]
354 elif forme.isdigit() :
360 self.formes[forme].lem = lem
361 self.formes[forme].gram = gram
363 if self.formes[forme].lem in self.lems :
364 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
365 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
367 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
369 self.lems[forme] = Lem(self, self.formes[forme])
def make_idformes(self):
    """Build self.idformes, mapping each form's `ident` to its entry in
    self.formes.  Any previous value of self.idformes is replaced."""
    index = {}
    for forme in self.formes:
        entry = self.formes[forme]
        index[entry.ident] = entry
    self.idformes = index
def make_iduces(self):
    """Build self.iduces, mapping each uce's `ident` to the uce itself.

    Does nothing when self.iduces is already set.
    """
    if self.iduces is not None:
        return
    index = {}
    for uci in self.ucis:
        for uce in uci.uces:
            index[uce.ident] = uce
    self.iduces = index
378 def make_lexitable(self, mineff, etoiles, gram = 0) :
383 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams]
384 etuces = [[] for et in etoiles]
385 for uci in self.ucis :
386 get = list(set(uci.etoiles).intersection(etoiles))
388 log.info('2 variables sur une ligne')
390 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
391 etuces = [set(val) for val in etuces]
394 deff = self.getlemuceseff(lem)
396 line = [lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
397 if sum(line[1:]) >= mineff :
399 tab.insert(0, [''] + etoiles)
402 def make_tgen_table(self, tgen, etoiles, tot = None):
403 lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
404 sets = [set(cl) for cl in lclasses]
405 totoccurrences = dict([[val, 0] for val in etoiles])
407 for forme in self.formes :
408 formeuceeff = self.getformeuceseff(forme)
409 for i, classe in enumerate(lclasses) :
410 concern = sets[i].intersection(formeuceeff.keys())
412 totoccurrences[etoiles[i]] += sum([formeuceeff[uce] for uce in concern])
413 #tgenoccurrences = dict([[val, 0] for val in etoiles])
416 tgenoccurrences[t] = dict([[val, 0] for val in etoiles])
418 lemuceeff = self.getlemuceseff(lem)
419 for i, classe in enumerate(lclasses) :
420 concern = sets[i].intersection(lemuceeff.keys())
422 tgenoccurrences[t][etoiles[i]] += sum([lemuceeff[uce] for uce in concern])
423 return tgenoccurrences, totoccurrences
425 def make_tgen_profile(self, tgen, ucecl, uci = False) :
426 log.info('tgen/classes')
430 #FIXME : NE MARCHE PLUS CHANGER CA
431 tab = [[lem] + [len(set(self.gettgentxt(tgen[lem])).intersection(classe)) for i, classe in enumerate(ucecl)] for lem in tgen]
433 tab = [[lem] + [len(set(self.gettgenstprof(tgen[lem], classe, i, clnb)).intersection(classe)) for i, classe in enumerate(ucecl)] for lem in tgen]
434 tab = [[line[0]] + [val for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
438 #while nam + `i` in tgen :
441 #last = [nam] + [`len(classe)` for classe in ucecl]
443 #line0 = ['tgen'] + ['_'.join(['cluster', `i+1`]) for i in range(len(ucecl))]
445 #with open(fileout, 'w') as f :
446 # f.write('\n'.join(['\t'.join(line) for line in tab]).encode(self.parametres['syscoding']))
448 def make_efftype_from_etoiles(self, etoiles) :
450 etuces = [[] for et in etoiles]
451 for uci in self.ucis :
452 get = list(set(uci.etoiles).intersection(etoiles))
454 return '2 variables sur la meme ligne'
456 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
457 etuces = [set(val) for val in etuces]
458 for lem in self.lems :
459 deff = self.getlemuceseff(lem)
461 gram = self.lems[lem].gram
463 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
465 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
466 tabout = [[gram] + dtype[gram] for gram in dtype]
467 tabout.insert(0, [''] + etoiles)
470 def make_uceactsize(self, actives) :
471 res = self.getalluces()
474 deff = self.getlemuceseff(lem)
476 ucesize[uce] = ucesize.get(uce, 0) + 1
479 def make_uc(self, actives, lim1, lim2) :
480 uceactsize = self.make_uceactsize(actives)
486 for uce in [uce for uci in self.ucis for uce in uci.uces] :
487 if uce.para == lastpara :
489 last1 += uceactsize.get(uce.ident,0)
490 uc1[-1].append(uce.ident)
492 uc1.append([uce.ident])
495 last2 += uceactsize.get(uce.ident, 0)
496 uc2[-1].append(uce.ident)
498 uc2.append([uce.ident])
501 last1 = uceactsize.get(uce.ident, 0)
502 last2 = uceactsize.get(uce.ident, 0)
504 uc1.append([uce.ident])
505 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build the two uc groupings and write their matrices and uce lists.

    make_uc(actives, sizeuc1, sizeuc2) returns two groupings, each a list of
    lists of uce ids.  For each grouping:

    - write_ucmatrix() writes its matrix to uc1out / uc2out;
    - a `;`-separated file is written to listuce1out / listuce2out with the
      header `uce;uc` and one `uce id;group index` line per uce (no trailing
      newline).

    Returns the tuple (number of groups in uc1, number of groups in uc2).
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    for uc, listout in ((uc1, listuce1out), (uc2, listuce2out)):
        # repr() replaces the deprecated backtick syntax; the output is unchanged.
        rows = [['uce', 'uc']] + [[repr(uce), repr(i)] for i, ucl in enumerate(uc) for uce in ucl]
        with open(listout, 'w') as f:
            f.write('\n'.join([';'.join(line) for line in rows]))
    return len(uc1), len(uc2)
521 def write_ucmatrix(self, uc, actives, fileout) :
522 log.info('write uc matrix %s' % fileout)
523 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
526 with open(fileout + '~', 'w+') as f :
527 for i, lem in enumerate(actives) :
528 for uce in self.getlemuces(lem):
529 if (uces_uc[uce], i) not in deja_la :
531 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
532 deja_la[(uces_uc[uce], i)] = 0
534 with open(fileout, 'w') as ffin :
535 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
538 os.remove(fileout + '~')
541 def export_corpus(self, outf) :
542 #outf = 'export_corpus.txt'
544 res = self.getalluces()
548 with open(outf,'w') as f :
550 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
551 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
552 elif self.iduces[uce[0]].uci != actuci :
553 actuci = self.iduces[uce[0]].uci
554 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
555 actpara = self.iduces[uce[0]].para
556 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
559 actpara = self.iduces[uce[0]].para
560 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
561 elif self.iduces[uce[0]].para != actpara :
562 actpara = self.iduces[uce[0]].para
564 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
def export_meta_table(self, outf):
    """Write a tab-separated table built from the `etoiles` of self.ucis to `outf`.

    One line per entry of self.ucis: its position in the list followed by
    its `etoiles` without the first element.  A header line names the
    columns column_0 ... column_(N-1), N being the length of the longest
    line; shorter lines are not padded.  The text is encoded with
    self.parametres['syscoding'] before being written.
    """
    # repr() replaces the deprecated backtick syntax; the output is unchanged.
    metas = [[repr(i)] + text.etoiles[1:] for i, text in enumerate(self.ucis)]
    # max() raises ValueError on an empty sequence: with no uci at all the
    # header is simply empty.
    longueur_max = max([len(val) for val in metas]) if metas else 0
    first = ['column_%i' % i for i in range(longueur_max)]
    metas.insert(0, first)
    with open(outf, 'w') as f:
        f.write('\n'.join(['\t'.join(line) for line in metas]).encode(self.parametres['syscoding']))
574 def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) :
576 for i, lc in enumerate(self.lc) :
579 for uce in self.lc0 :
582 res = self.getalluces()
585 res = self.getallucis()
586 with open(outf, 'w') as f :
590 actuci = self.iduces[uce[0]].uci
594 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
596 etline = ' '.join(self.ucis[actuci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
598 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[actuci].etoiles[1:]])
599 f.write(etline.encode(self.parametres['syscoding']) + '\n')
600 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
602 def export_classe(self, outf, classe, lem = False, uci = False) :
603 sts = self.lc[classe - 1]
605 res = self.getconcorde(sts)
608 res = self.getuciconcorde(sts)
609 with open(outf, 'w') as f :
613 f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n')
615 f.write(' '.join(self.ucis[uce[0]].etoiles).encode(self.parametres['syscoding']) + '\n')
617 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
618 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
620 def export_owledge(self, rep, classe, lem = False, uci = False) :
621 sts = self.lc[classe - 1]
623 res = self.getconcorde(sts)
626 res = self.getuciconcorde(sts)
630 outf = '.'.join([`ident`, 'txt'])
631 outf = os.path.join(rep, outf)
633 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
634 with open(outf, 'w') as f :
635 f.write(guce.encode('cp1252', errors = 'replace'))
637 def export_tropes(self, fileout, classe, lem = False, uci = False) :
638 sts = self.lc[classe - 1]
640 res = self.getconcorde(sts)
643 res = self.getuciconcorde(sts)
644 with open(fileout, 'w') as f :
648 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
649 f.write(guce.encode('cp1252', errors = 'replace'))
652 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
653 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
655 with open(outfile + '~', 'w+') as f :
656 for i, lem in enumerate(actives) :
657 for uce in sorted(self.getlemuces(lem)) :
659 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
661 with open(outfile, 'w') as ffin :
662 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
665 os.remove(outfile + '~')
667 with open(listuce, 'w') as f :
668 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
670 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
671 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
673 with open(outfile + '~', 'w+') as f :
674 for i, lem in enumerate(actives) :
675 for uci in sorted(self.getlemucis(lem)) :
677 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
679 with open(outfile, 'w') as ffin :
680 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
683 os.remove(outfile + '~')
685 with open(listuci, 'w') as f :
686 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
688 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
689 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
691 duces = dict([[uce, i] for i, uce in enumerate(uces)])
692 with open(outfile + '~', 'w+') as f :
693 for i, lem in enumerate(actives) :
694 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
696 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
698 with open(outfile, 'w') as ffin :
699 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uces), len(actives), nbl))
702 os.remove(outfile + '~')
704 def make_table_with_classe(self, uces, list_act, uci = False) :
705 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
706 uces = dict([[uce, i] for i, uce in enumerate(uces)])
708 getlem = self.getlemucis
710 getlem = self.getlemuces
711 for i, lem in enumerate(list_act) :
712 lemuces = list(set(getlem(lem)).intersection(uces))
714 table_uce[uces[uce]][i] = 1
715 table_uce.insert(0, list_act)
718 def make_pondtable_with_classe(self, uces, list_act) :
719 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
720 uces = dict([[uce, i] for i, uce in enumerate(uces)])
721 for i, lem in enumerate(list_act) :
722 uceseff = self.getlemuceseff(lem)
723 lemuces = list(set(uceseff.keys()).intersection(uces))
725 table_uce[uces[uce]][i] = uceseff[uce]
726 table_uce.insert(0, list_act)
729 def parse_active(self, gramact, gramsup = None) :
730 log.info('parse actives')
731 for lem in self.lems :
732 if lem.startswith('_') and lem.endswith('_') :
733 self.lems[lem].act = 2
734 elif self.lems[lem].gram in gramact :
735 self.lems[lem].act = 1
736 elif gramsup is not None and self.lems[lem].gram not in gramact:
737 if self.lems[lem].gram in gramsup :
738 self.lems[lem].act = 2
740 self.lems[lem].act = 0
742 self.lems[lem].act = 2
744 def make_actives_limit(self, limit, key = 1) :
745 if self.idformes is None :
747 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
749 def make_actives_nb(self, nbmax, key) :
750 log.info('make_actives_nb : %i - %i' % (nbmax,key))
751 if self.idformes is None :
753 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
754 self.activenb = len(allactives)
755 allactives = sorted(allactives, reverse = True)
756 if self.activenb == 0 :
758 if len(allactives) <= nbmax :
759 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
760 return [val[1] for val in allactives], allactives[-1][0]
762 effs = [val[0] for val in allactives]
763 if effs.count(effs[nbmax - 1]) > 1 :
764 lim = effs[nbmax - 1] + 1
768 stop = effs.index(lim)
775 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
776 return [val[1] for val in allactives[0:stop]], lim
778 def make_and_write_profile(self, actives, ucecl, fileout, uci = False) :
779 log.info('formes/classes')
781 tab = [[lem] + [len(set(self.getlemucis(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
783 tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
784 tab = [[line[0]] + [`val` for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
785 with open(fileout, 'w') as f :
786 f.write('\n'.join([';'.join(line) for line in tab]).encode(self.parametres['syscoding']))
788 def make_etoiles(self) :
790 for uci in self.ucis :
791 etoiles.update(uci.etoiles[1:])
794 def make_themes(self):
796 for uci in self.ucis :
797 themes.update(uci.paras)
800 def make_etoiles_dict(self) :
801 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
803 for etoile in etoiles :
804 et = etoile.split('_')
807 endet = '_'.join(et[1:])
808 if etoile in det[et[0]] :
809 det[et[0]][etoile] += 1
811 det[et[0]][etoile] = 1
816 endet = '_'.join(et[1:])
817 det[et[0]] = {etoile :1}
822 def make_theme_dict(self):
823 themes = [val for uci in self.ucis for val in uci.paras]
825 for theme in themes :
826 th = theme.split('_')
829 endth = '_'.join(th[1:])
830 if theme in det[th[0]] :
831 det[th[0]][theme] += 1
833 det[th[0]][theme] = 1
838 endth = '_'.join(th[1:])
839 det[th[0]] = {theme:1}
844 def make_etline(self, listet) :
845 etuces = [[] for et in listet]
846 for uci in self.ucis :
847 get = list(set(uci.etoiles).intersection(listet))
849 return '2 variables sur la meme ligne'
851 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
854 def make_and_write_profile_et(self, ucecl, fileout, uci = False) :
855 log.info('etoiles/classes')
857 etoileuces = self.getetoileuces()
859 etoileuces = self.getetoileucis()
860 etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1])
861 with open(fileout, 'w') as f :
862 f.write('\n'.join([';'.join([et] + [`len(set(etoileuces[et]).intersection(classe))` for classe in ucecl]) for et in etoileuces]).encode(self.parametres['syscoding']))
863 #etoiles = self.make_etoiles()
864 #with open(fileout, 'w') as f :
865 # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
867 def make_colored_corpus(self, uci = False) :
869 for i, lc in enumerate(self.lc) :
872 for uce in self.lc0 :
874 color = ['black'] + colors[len(self.lc) - 1]
876 <meta http-equiv="content-Type" content="text/html; charset=%s" />
878 ''' % sys.getdefaultencoding()
880 res = self.getalluces()
885 if self.iduces[uce[0]].uci != actuci :
886 actuci = self.iduces[uce[0]].uci
887 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
888 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
890 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
892 res = self.getallucis()
895 if self.ucis[uce[0]].ident != actuci :
896 actuci = self.ucis[uce[0]].ident
897 txt += '<br><hr>' + ' '.join(self.ucis[self.ucis[uce[0]].ident].etoiles) + '<br><br>'
898 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
900 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
901 return txt + '\n</body></html>'
903 def count_from_list(self, l, d) :
911 def count_from_list_cl(self, l, d, a, clnb) :
920 def find_segments(self, taille_segment, taille_limite) :
922 for uce in self.getalluces() :
924 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
925 l = [[d[val], val] for val in d if d[val] >= 3]
928 if len(l) > taille_limite :
929 l = l[-taille_limite:]
932 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite, uci = False):
935 concorde = self.getconcorde
937 concorde = self.getuciconcorde
938 for uce in concorde(list_uce) :
940 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
941 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
944 if len(l) > taille_limite :
945 l = l[-taille_limite:]
948 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
950 for b, classe in enumerate(self.lc) :
951 for uce in self.getconcorde(classe) :
954 uce = [self.formes[forme].lem for forme in uce]
955 for taille_segment in range(lenmin,lenmax) :
956 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
957 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
958 with open(fileout, 'w') as f :
959 f.write('\n'.join([';'.join(line) for line in result]))
961 def make_proftype(self, outf) :
963 for lem in self.lems :
964 gram = self.lems[lem].gram
966 res[gram] = [0 for val in self.lc]
967 lemuceeff = self.getlemuceseff(lem)
968 for i, classe in enumerate(self.lc) :
969 concern = set(classe).intersection(lemuceeff.keys())
970 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
971 res = [[gram] + [`val` for val in res[gram]] for gram in res]
973 with open(outf, 'w') as f :
974 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
977 def make_ucecl_from_R(self, filein) :
978 with open(filein, 'rU') as f :
983 line = line.replace('\n', '').replace('"', '').split(';')
984 self.lc.append([int(line[0]) - 1, int(line[1])])
985 classesl = [val[1] for val in self.lc]
987 self.lc = sorted(self.lc, key=itemgetter(1))
988 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
989 self.lc0 = self.lc.pop(0)
992 def get_stat_by_cluster(self, outf, lclasses = None) :
993 log.info('get_stat_by_cluster')
994 if lclasses is None :
997 occurrences = dict([[i + 1, 0] for i in range(len(lclasses))])
998 formescl = dict([[i + 1, 0] for i in range(len(lclasses))])
999 hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))])
1000 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)])
1001 sets = [set(cl) for cl in lclasses]
1002 for forme in self.formes :
1003 formeuceeff = self.getformeuceseff(forme)
1004 for i, classe in enumerate(lclasses) :
1005 concern = sets[i].intersection(formeuceeff.keys())
1007 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
1009 if self.formes[forme].freq == 1 :
1011 log.info('%f' % (time() - t1))
1012 if outf is not None :
1013 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
1014 with open(outf, 'w') as f :
1017 return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences]
1019 def get_stat_by_et(self, outf, etoiles) :
1020 lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
1021 stats = self.get_stat_by_cluster(None, lclasses)
1022 stats = [[etoiles[i]] + val for i, val in enumerate(stats)]
1024 def gethapaxbyet(self, etoiles) :
1025 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
1027 for uce in hapaxuces :
1028 if uce in hucesdict :
1032 etuces = [[] for et in etoiles]
1033 for uci in self.ucis :
1034 get = list(set(uci.etoiles).intersection(etoiles))
1036 return '2 variables sur la meme ligne'
1038 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
1039 etuces = [set(val) for val in etuces]
1040 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
1042 def gethapaxuces(self) :
1043 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
1044 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
1046 for i,uce in enumerate(hapaxuces) :
1047 if uce in hucesdict :
1048 hucesdict[uce][0] += 1
1049 hucesdict[uce][1].append(hapax[i])
1051 hucesdict[uce] = [1,[hapax[i]]]
1053 for uce in hucesdict :
1054 if hucesdict[uce][0] in huces :
1055 huces[hucesdict[uce][0]].append(uce)
1057 huces[hucesdict[uce][0]] = [uce]
1058 huces = zip(huces, huces.values())
1059 huces.sort(reverse=True)
1063 for nb in huces[0:4] :
1064 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
1066 res = self.getconcorde([uce])
1068 ucetxt = ' ' + row[1] + ' '
1070 for hap in hucesdict[uce][1] :
1071 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
1072 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
1073 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
1074 txt += '<p>'+ucetxt+'</p>\n'
1078 with open('/tmp/testhapxuce.html','w') as f :
def export_dictionary(self, fileout, syscoding):
    """Dump the forms dictionary to *fileout* as tab-separated text.

    One line is written per entry of ``self.formes``, with the columns
    form, lemma, grammatical type and frequency.  Lines are ordered by
    decreasing ``[freq, form, lem, gram]``.  The joined text is encoded
    with *syscoding* before being written.
    """
    entries = []
    for key in self.formes:
        entries.append([self.formes[key].freq, key, self.formes[key].lem, self.formes[key].gram])
    ranked = sorted(entries, reverse=True)
    # Move the frequency to the last column, as its repr() string.
    rows = [entry[1:] + [repr(entry[0])] for entry in ranked]
    with open(fileout, 'w') as out:
        out.write('\n'.join(['\t'.join(row) for row in rows]).encode(syscoding))
1088 def export_lems(self, fileout, syscoding) :
1089 self.make_idformes()
1090 listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems]
1092 with open(fileout, 'w') as f :
1093 f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding))
1098 def __init__(self, corpus) :
1099 ucinb = corpus.getucinb()
1100 ucisize = corpus.getucisize()
1101 ucimean = float(sum(ucisize))/float(ucinb)
1102 detoile = corpus.make_etoiles_dict()
1105 def __init__(self, iduci, line, paraset = None) :
1107 self.etoiles = line.split()
1109 if paraset is not None :
1110 self.paras = paraset.split()
1115 def __init__(self, iduce, idpara, iduci) :
1121 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
1124 self.gram = gramtype
1127 if freq is not None :
def __init__(self, parent, forme):
    """Create a lemma seeded with its first form.

    *forme* supplies the initial state: its ``ident`` and ``freq`` become
    the first entry of the ``formes`` mapping (form ident -> frequency),
    and its ``gram``, ``freq`` and ``act`` are copied onto the lemma.
    *parent* is accepted but not used here.
    """
    first_ident, first_freq = forme.ident, forme.freq
    self.formes = {first_ident: first_freq}
    self.gram = forme.gram
    self.freq = forme.freq
    self.act = forme.act
def add_forme(self, forme):
    """Attach one more form to this lemma.

    The form's frequency is stored in ``self.formes`` under the form's
    ident and added to the lemma's total frequency.
    """
    self.formes[forme.ident] = forme.freq
    # Keep the lemma total in step with the forms it aggregates.
    self.freq += forme.freq
1143 def decouperlist(chaine, longueur, longueurOptimale) :
1145 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
1146 Si on trouve un '$', c'est fini.
1147 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
1149 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
1150 dsep = dict([[val[0],val[1]] for val in separateurs])
1151 trouve = False # si on a trouvé un bon séparateur
1152 iDecoupe = 0 # indice du caractere ou il faut decouper
1154 longueur = min(longueur, len(chaine) - 1)
1155 chaineTravail = chaine[:longueur + 1]
1157 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
1160 indice = chaineTravail.index(u'$')
1162 iDecoupe = indice - 1
1167 caractere = chaineTravail[nbCar]
1168 distance = abs(longueurOptimale - nbCar) + 1
1169 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
1170 if caractere in dsep :
1171 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
1172 meilleur[0] = caractere
1173 meilleur[1] = dsep[caractere]
1178 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
1180 meilleur[1] = dsep[' ']
1187 #if meilleur[0] != ' ' :
1188 # fin = chaine[iDecoupe + 1:]
1189 # retour = chaineTravail[:iDecoupe]
1191 fin = chaine[iDecoupe + 1:]
1192 retour = chaineTravail[:iDecoupe + 1]
1193 return len(retour) > 0, retour, fin
1194 # si on a rien trouvé
1195 return False, chaine, ''
1197 def testetoile(line) :
1198 return line.startswith(u'****')
1201 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt):
    """Split *txt* on whitespace and append a ``$`` end-of-text token.

    Returns a new list of tokens whose last element is always ``u'$'``.
    """
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
1206 def prep_txtcharact(txt) :
1211 Class for building a corpus
1213 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
1214 log.info('begin building corpus...')
1215 self.lexique = lexique
1216 self.expressions = expressions
1218 self.corpus = Corpus(self, parametres_corpus)
1219 self.infile = infile
1221 self.lim = parametres_corpus.get('lim', 1000000)
1222 self.encoding = parametres_corpus['encoding']
1223 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
1224 self.corpus.pathout.createdir(parametres_corpus['pathout'])
1225 self.corpus.parametres['uuid'] = str(uuid4())
1226 self.corpus.parametres['corpus_name'] = parametres_corpus['corpus_name']#os.path.split(self.corpus.parametres['pathout'])[1]
1227 self.corpus.parametres['type'] = 'corpus'
1228 if self.corpus.parametres['keep_ponct'] :
1229 self.ponctuation_espace = [' ', '']
1231 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
1233 self.tolist = self.corpus.parametres.get('tolist', 0)
1240 def prep_makeuce(self) :
1241 method = self.corpus.parametres.get('ucemethod', 0)
1243 self.decouper = decouperlist
1244 self.prep_txt = prep_txtlist
1245 self.ucesize = self.corpus.parametres.get('ucesize', 40)
1247 self.decouper = decoupercharact
1248 self.prep_txt = prep_txtcharact
1249 self.ucesize = self.corpus.parametres.get('ucesize', 240)
1250 log.info('method uce : %s' % method)
1255 self.read_corpus(self.infile)
1256 except Warning, args :
1257 log.info('pas kool %s' % args)
1261 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
1262 self.time = time() - t1
1264 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
1265 log.info('time : %f' % (time() - t1))
1268 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
1269 self.cf = self.conn_f.cursor()
1270 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1271 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
1272 self.conn_f.commit()
1273 self.cf = self.conn_f.cursor()
1274 self.cf.execute('PRAGMA temp_store=MEMORY;')
1275 self.cf.execute('PRAGMA journal_mode=MEMORY;')
1276 self.cf.execute('PRAGMA synchronous = OFF;')
1277 self.cf.execute('begin')
1278 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
1279 self.c = self.conn.cursor()
1280 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1282 self.c = self.conn.cursor()
1283 self.c.execute('PRAGMA temp_store=MEMORY;')
1284 self.c.execute('PRAGMA journal_mode=MEMORY;')
1285 self.c.execute('PRAGMA synchronous = OFF;')
1286 self.c.execute('begin')
1289 #commit index and close db
1291 self.conn_f.commit()
1292 self.cf.execute('CREATE INDEX iduces ON uces (id);')
1293 self.cf.execute('CREATE INDEX ideff ON eff (id);')
1297 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
1298 self.ccorpus = self.conn_corpus.cursor()
1299 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
1300 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
1301 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
1302 self.conn_corpus.commit()
1303 self.ccorpus = self.conn_corpus.cursor()
1304 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
1305 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
1306 self.ccorpus.execute('PRAGMA synchronous = OFF;')
1307 self.ccorpus.execute('begin')
1308 self.backup_corpus()
1309 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
1310 self.conn_corpus.commit()
1311 self.conn_corpus.close()
1312 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self):
    """Fill ``self.cleans`` with the cleaning steps enabled in the settings.

    Steps are appended in a fixed order: lower-casing, first clean,
    character filtering, expressions, apostrophes, hyphens.  Every option
    except ``'charact'`` defaults to enabled when absent from the corpus
    parameters; ``'charact'`` must be present.  When character filtering
    is enabled, ``self.rule`` is set from ``'keep_caract'`` (or the
    built-in default character list) for ``docharact`` to use.
    """
    params = self.corpus.parametres
    steps = self.cleans
    if params.get('lower', 1):
        steps.append(self.dolower)
    if params.get('firstclean', 1):
        steps.append(self.firstclean)
    # No default for 'charact': a missing key raises KeyError.
    if params['charact']:
        self.rule = params.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        steps.append(self.docharact)
    trailing = (
        ('expressions', 'make_expression'),
        ('apos', 'doapos'),
        ('tiret', 'dotiret'),
    )
    for option, method_name in trailing:
        if params.get(option, 1):
            steps.append(getattr(self, method_name))
1329 def make_expression(self,txt) :
1330 exp = self.expressions.keys()
1331 exp.sort(reverse=True)
1332 for expression in exp :
1333 if expression in txt :
1334 txt = txt.replace(expression, self.expressions[expression][0])
1337 def dolower(self, txt) :
def docharact(self, txt):
    """Replace every run of characters matched by ``self.rule`` with a space.

    ``self.rule`` is the inside of a regular-expression character class;
    it is wrapped as ``[<rule>]+`` and each match in *txt* is replaced by
    a single space.  With a rule starting with ``^`` (as the default set
    in ``buildcleans`` does), the class is negated, so the characters
    listed in the rule are the ones kept.
    """
    pattern = u"[" + self.rule + "]+"
    return re.compile(pattern).sub(' ', txt)
def doapos(self, txt):
    """Return *txt* with every straight apostrophe replaced by a space."""
    apostrophe = u'\''
    blank = u' '
    return txt.replace(apostrophe, blank)
def dotiret(self, txt):
    """Return *txt* with every hyphen replaced by a space."""
    hyphen = u'-'
    blank = u' '
    return txt.replace(hyphen, blank)
def firstclean(self, txt):
    """Normalise typography and put spaces around punctuation in *txt*.

    The substitutions are applied one after another, in the listed order:
    the typographic apostrophe becomes a straight one, the ``oe`` ligature
    is expanded, ellipses (three dots or the single ellipsis character)
    become the ``£$£`` token, and ``? . ! , ; :`` are each surrounded by
    spaces.
    """
    # Order matters: '...' must be handled before the single '.'.
    substitutions = (
        (u'’', "'"),
        (u'œ', u'oe'),
        ('...', u' £$£ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        (u'…', u' £$£ '),
    )
    for old, new in substitutions:
        txt = txt.replace(old, new)
    return txt
1356 def make_cleans(self, txt) :
1357 for clean in self.cleans :
1361 def backup_uce(self) :
1362 if self.corpus.idformesuces != {} :
1363 log.info('backup %i' % len(self.corpus.idformesuces))
1364 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
1365 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
1366 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
1367 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
1368 self.corpus.idformesuces = {}
1371 def backup_corpus(self) :
1372 log.info('start backup corpus')
1374 for uci in self.corpus.ucis :
1375 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
1376 for uce in uci.uces :
1377 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
1378 for forme in self.corpus.formes :
1379 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
1380 log.info('%f' % (time() - t))
def dofinish(self):
    """Record the final build statistics in the corpus parameters.

    Stores in ``self.corpus.parametres``:

    - ``'date'``: the current date and time, in ``ctime()`` format;
    - ``'time'``: ``self.time`` (seconds) formatted as hours, minutes and
      seconds;
    - ``'ucinb'``, ``'ucenb'``, ``'occurrences'``: the values returned by
      the corpus' ``getucinb()``, ``getucenb()`` and ``gettotocc()``;
    - ``'formesnb'``: the number of entries in ``self.corpus.formes``;
    - ``'hapax'``: the hapax count with its share of the forms and of
      the occurrences, as percentages.

    When the corpus has no forms or no occurrences, the corresponding
    percentage is reported as 0 instead of raising ZeroDivisionError.
    """
    params = self.corpus.parametres
    params['date'] = datetime.datetime.now().ctime()
    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    params['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
    params['ucinb'] = self.corpus.getucinb()
    params['ucenb'] = self.corpus.getucenb()
    params['occurrences'] = self.corpus.gettotocc()
    params['formesnb'] = len(self.corpus.formes)
    hapaxnb = self.corpus.gethapaxnb()
    # Guard both ratios: an empty corpus has nothing to divide by.
    formesnb = params['formesnb']
    occurrences = params['occurrences']
    pourhapaxf = (float(hapaxnb) / formesnb) * 100 if formesnb else 0.0
    pourhapaxocc = (float(hapaxnb) / occurrences) * 100 if occurrences else 0.0
    params['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
1396 class BuildSubCorpus(BuildCorpus):
1397 def __init__(self, corpus, parametres, dlg = None) :
1398 log.info('begin subcorpus...')
1402 self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : corpus.parametres['originalpath'], 'encoding' : corpus.parametres['encoding']})
1404 self.parametres = parametres
1405 self.encoding = corpus.parametres['encoding']
1406 self.corpus.parametres['corpus_name'] = parametres['corpus_name']
1407 self.corpus.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = parametres['pathout'])
1408 self.corpus.pathout.createdir(parametres['pathout'])
1409 self.corpus.parametres['pathout'] = parametres['pathout']
1410 self.corpus.parametres['meta'] = parametres.get('meta', False)
1411 self.corpus.parametres['uuid'] = str(uuid4())
1412 if parametres.get('frommeta', False) :
1413 print 'make subtexts'
1414 self.corpus.ucis = [CopyUci(uci) for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()]
1415 elif parametres.get('fromtheme', False) :
1416 print 'make subtexts from theme'
1418 for uci in self.ori.ucis :
1419 if uci.paras != [] :
1422 for et in uci.paras :
1423 if et in parametres['meta'] :
1424 newuce += [CopyUce(uce) for uce in uci.uces if uce.para == idpara]
1430 nuci.paras = newpara
1431 self.corpus.ucis.append(nuci)
1434 elif parametres.get('fromclusters', False) :
1435 self.parametres['uceids'] = [st for i in self.parametres['meta'] for st in self.parametres['lc'][i]]
1437 elif parametres.get('fromuceids', False) :
1443 def fromuceids(self):
1445 dictucekeep = dict(zip(self.parametres['uceids'], self.parametres['uceids']))
1447 for uci in self.ori.ucis :
1448 if uci.paras == [] :
1449 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1452 nuci.uces = keepuces
1453 self.corpus.ucis.append(nuci)
1458 for et in uci.paras :
1459 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1467 nuci.paras = newpara
1468 self.corpus.ucis.append(nuci)
1470 def read_corpus(self, infile = None):
1471 self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces]
1477 print 'redo text, para and st ident'
1478 for uci in self.corpus.ucis :
1479 uci.ident = ident_uci
1481 for uce in uci.uces :
1483 if uce.para != lastpara :
1486 uce.para = ident_para
1488 uce.para = ident_para
1489 newuceident[uce.ident] = ident_uce
1490 uce.ident = ident_uce
1492 print 'backup st text and forms'
1493 for row in self.ori.getconcorde(self.olduceid) :
1494 self.c.execute('INSERT INTO uces VALUES(?,?);', (`newuceident[row[0]]`, row[1]))
1495 for word in row[1].split() :
1496 self.corpus.add_word_from_forme(self.ori.formes[word], newuceident[row[0]])
1500 class BuildFromAlceste(BuildCorpus) :
1501 def read_corpus(self, infile) :
1502 if self.dlg is not None :
1503 self.dlg.Pulse('textes : 0 - segments : 0')
1506 if self.corpus.parametres['ucimark'] == 0 :
1507 self.testuci = testetoile
1508 elif self.corpus.parametres['ucimark'] == 1 :
1509 self.testuci = testint
1515 with codecs.open(infile, 'r', self.encoding) as f :
1516 for linenb, line in enumerate(f) :
1517 line = line.rstrip('\n\r')#FIXME .lstrip(codecs.BOM).lstrip(codecs.BOM_UTF8)
1518 if self.testuci(line) :
1521 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1523 self.corpus.ucis.append(Uci(iduci, line))
1526 if self.corpus.ucis[-1].uces == [] :
1527 log.info(u'Empty text : %i' % linenb)
1529 self.corpus.ucis.pop()
1530 self.corpus.ucis.append(Uci(iduci, line))
1531 if self.dlg is not None :
1532 if not (iduci + 1) % 10 :
1533 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
1534 elif line.startswith(u'-*') :
1537 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1540 self.corpus.ucis[-1].paras.append(line.split()[0])
1542 raise Exception('paragrapheOT %i' % linenb)
1543 elif line.strip() != '' and iduci != -1 :
1545 if txt != [] and iduci != -1 :
1546 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1551 self.corpus.ucis.pop()
1552 log.info(Exception("Empty text %i" % linenb))
1554 raise Exception('EmptyText %i' % linenb)
1555 if iduci != -1 and iduce != -1:
1558 log.info(_(u"No Text in corpus. Are you sure of the formatting ?"))
1559 raise Exception('TextBeforeTextMark %i' % linenb)
1560 except UnicodeDecodeError :
1561 raise Exception("CorpusEncoding")
1563 def treattxt(self, txt, iduce, idpara, iduci) :
1564 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1565 txt = 'laphrasepoursplitter'.join(txt)
1566 txt = self.make_cleans(txt)
1567 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1568 ucetxt = txt.split('laphrasepoursplitter')
1571 txt = self.make_cleans(txt)
1572 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1573 if self.corpus.ucis[-1].paras == [] :
1577 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1578 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1579 if not self.tolist :
1585 self.corpus.add_word(word)
1586 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
1587 if self.last > self.lim :
1590 return iduce, idpara
1592 def make_uces(self, txt, douce = True, keep_ponct = False) :
1593 txt = ' '.join(txt.split())
1596 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1598 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1601 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1602 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1607 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1609 #decouper (list_sep)
1610 #make_uces (decouper)
1611 #treat_txt (make_uces)
1615 def __init__(self, parent, dlg = None) :
1616 self.parent = parent
1619 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1620 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1621 parametres['corpus_name'] = os.path.split(parametres['pathout'])[1]
1622 dial = CorpusPref(parent, parametres)
1623 dial.CenterOnParent()
1624 dial.txtpath.SetLabel(parent.filename)
1625 #dial.repout_choices.SetValue(parametres['pathout'])
1626 self.res = dial.ShowModal()
1627 if self.dlg is not None :
1628 self.dlg = progressbar(self.parent, self.dlg)
1629 if self.res == 5100 :
1630 parametres = dial.doparametres()
1631 parametres['originalpath'] = parent.filename
1632 PathOut().createdir(parametres['pathout'])
1633 if parametres.get('dictionary', False) :
1634 filein = parametres['dictionary']
1637 if dial.corpusname.GetValue() != '' :
1638 parametres['corpus_name'] = dial.corpusname.GetValue()
1640 ReadLexique(self.parent, lang = parametres['lang'], filein = filein)
1641 if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
1642 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1644 self.parent.expressions = {}
1645 self.parametres = parametres
1648 if self.dlg is not None :
def doanalyse(self):
    """Run a ``BuildFromAlceste`` build and return the resulting corpus.

    The builder receives the parent's file name, lexicon and expressions,
    together with this object's parameters and progress dialog.
    """
    builder = BuildFromAlceste(
        self.parent.filename,
        self.parametres,
        self.parent.lexique,
        self.parent.expressions,
        dlg=self.dlg,
    )
    return builder.corpus
1655 def __init__(self, parent, corpus, parametres = None, dlg = None):
1656 self.parent = parent
1659 corpus_name = 'Sub' + corpus.parametres['corpus_name']
1660 if dlg is not None :
1661 busy = wx.BusyInfo(_("Please wait...").decode('utf8'), self)
1663 parametres['corpus_name'] = corpus_name
1664 if parametres.get('frommeta', False) :
1665 parametres['meta'] = corpus.make_etoiles()
1666 elif parametres.get('fromtheme', False) :
1667 parametres['meta'] = corpus.make_themes()
1668 elif parametres.get('fromclusters', False) :
1669 parametres['meta'] = [' '.join(['classe', `i`]) for i in range(1,parametres['clnb'] + 1)]
1671 parametres['meta'] = []
1672 if 'fromclusters' not in parametres :
1673 parametres['meta'].sort()
1674 if dlg is not None :
1676 dial = SubTextFromMetaDial(parent, parametres)
1677 self.res = dial.ShowModal()
1678 if self.res == 5100 :
1679 if dial.subcorpusname.GetValue() != '' :
1680 corpus_name = ''.join([l for l in dial.subcorpusname.GetValue() if l.isalnum() or l in ['_']])
1681 if corpus_name != '' :
1682 parametres['corpus_name'] = corpus_name
1684 parametres['corpus_name'] = 'Sub' + corpus.parametres['corpus_name']
1685 pathout = os.path.join(corpus.parametres['pathout'], parametres['corpus_name'])
1687 while os.path.exists(pathout + '_%i' % i) :
1689 parametres['pathout'] = pathout + '_%i' % i
1690 meta = dial.m_listBox1.GetSelections()
1691 if not 'fromclusters' in parametres :
1692 parametres['meta'] = [parametres['meta'][val] for val in meta]
1694 parametres['meta'] = meta
1695 self.parametres = parametres
def doanalyse(self):
    """Run a ``BuildSubCorpus`` build on ``self.ori`` and return its corpus."""
    builder = BuildSubCorpus(self.ori, parametres=self.parametres, dlg=self.dlg)
    return builder.corpus
1703 class BuildMergeFromClusters(BuildCorpus):
1704 def __init__(self, analyses, parametres, dlg = None) :
1705 log.info('begin subcorpus...')
1708 self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : 'MergeFromClusters', 'encoding' : 'merge'})
1710 self.analyses = analyses
1712 self.parametres = parametres
1713 #self.encoding = corpus.parametres['encoding']
1714 self.corpus.parametres['corpus_name'] = parametres['corpus_name']
1715 self.corpus.pathout = PathOut(filename = 'MFC', dirout = parametres['pathout'])
1716 self.corpus.pathout.createdir(parametres['pathout'])
1717 self.corpus.parametres['pathout'] = parametres['pathout']
1718 self.corpus.parametres['meta'] = parametres.get('meta', False)
1719 self.corpus.parametres['uuid'] = str(uuid4())
1720 for i, analyse in enumerate(analyses) :
1723 corpus_uuid = analyse['corpus']
1724 #if corpus_uuid not in self.parent.history.openedcorpus :
1725 irapath = parametres['corpusira'][i]
1726 corpus = Corpus(self, parametres = DoConf(irapath).getoptions('corpus'), read = True)
1727 ucepath = os.path.join(analyse['pathout'], 'uce.csv')
1728 corpus.make_ucecl_from_R(ucepath)
1730 for j, cl in enumerate(parametres['clusters'][i]) :
1731 #print cl, self.ori.lc[cl-1]
1732 self.parametres['uceids'] = self.ori.lc[cl-1]#[st for st in self.ori['lc'][cl-1]]
1733 self.lcl[i] += self.ori.lc[cl-1]
1734 self.et = parametres['newet'][i][j]
1740 def fromuceids(self):
1742 dictucekeep = dict(zip(self.parametres['uceids'], self.parametres['uceids']))
1744 for uci in self.ori.ucis :
1745 if uci.paras == [] :
1746 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1749 nuci.uces = keepuces
1750 nuci.etoiles.append(self.et)
1751 nuci.analyseid = self.analyseid
1752 self.corpus.ucis.append(nuci)
1757 for et in uci.paras :
1758 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1766 nuci.paras = newpara
1767 nuci.etoiles.append(self.et)
1768 nuci.analyseid = self.analyseid
1769 self.corpus.ucis.append(nuci)
1770 #print nuci.etoiles, nuci.ident, nuci.uces
1772 def read_corpus(self, infile = None):
1773 #self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces]
1779 print 'redo text, para and st ident'
1780 for uci in self.corpus.ucis :
1781 #print uci.ident, ident_uci, [uce.ident for uce in uci.uces], uci.etoiles
1782 uci.ident = ident_uci
1784 for uce in uci.uces :
1786 if uce.para != lastpara :
1789 uce.para = ident_para
1791 uce.para = ident_para
1792 newuceident['%i-%i' %(uci.analyseid, uce.ident)] = ident_uce
1793 uce.ident = ident_uce
1796 print 'backup st text and forms'
1798 for i, analyse in enumerate(self.analyses) :
1799 #print analyse, self.parametres['corpusira']
1800 irapath = self.parametres['corpusira'][i]
1801 old = Corpus(self, parametres = DoConf(irapath).getoptions('corpus'), read = True)
1802 for row in old.getconcorde(self.lcl[i]) :
1803 self.c.execute('INSERT INTO uces VALUES(?,?);', (newuceident['%i-%i' % (i,row[0])], row[1]))
1804 for word in row[1].split() :
1805 self.corpus.add_word_from_forme(old.formes[word], newuceident['%i-%i' % (i,row[0])])
1811 class MergeClusters :
1812 def __init__(self, parent, parametres = None, dlg = None):
1813 self.parent = parent
1816 corpus_name = 'MergeFromClusters'
1817 if dlg is not None :
1818 busy = wx.BusyInfo(_("Please wait...").decode('utf8'), self)
1820 parametres['corpus_name'] = corpus_name
1821 if dlg is not None :
1823 dial = MergeClusterFrame(parent)
1824 dial.m_textCtrl4.SetValue(corpus_name)
1825 self.res = dial.ShowModal()
1826 if self.res == 5100 :
1831 if dial.m_textCtrl4.GetValue() != '' :
1832 corpus_name = ''.join([l for l in dial.m_textCtrl4.GetValue() if l.isalnum() or l in ['_']])
1833 if corpus_name != '' :
1834 parametres['corpus_name'] = corpus_name
1836 parametres['corpus_name'] = 'MergeFromClusters'
1837 for cl in dial.selected :
1839 #if corpus_uuid not in self.parent.history.openedcorpus :
1840 irapath = self.parent.history.corpus[corpus_uuid]['ira']
1841 #corpus = Corpus(self.parent, parametres = DoConf(irapath).getoptions('corpus'), read = True)
1842 #self.parent.history.openedcorpus[corpus_uuid] = corpus
1843 if cl[0] not in self.analyses :
1844 analyse = DoConf(dial.irapath[cl[0]]).getoptions()
1845 #ucepath = os.path.join(os.path.dirname(dial.irapath[cl[0]]), 'uce.csv')
1846 #corpus = copycorpus(self.parent.history.openedcorpus[corpus_uuid])
1847 #corpus.make_ucecl_from_R(ucepath)
1848 self.analyses[cl[0]] = analyse
1849 self.clusters[cl[0]] = [cl[2]]
1850 self.newet[cl[0]] = [dial.selected[cl]]
1851 self.corpusira[cl[0]] = irapath
1853 self.clusters[cl[0]].append(cl[2])
1854 self.newet[cl[0]].append(dial.selected[cl])
1857 analyses = [val for val in self.clusters]
1858 clusters = [self.clusters[val] for val in analyses]
1859 self.newet = [self.newet[val] for val in analyses]
1860 corpusira = [self.corpusira[val] for val in analyses]
1861 analyses = [self.analyses[val] for val in analyses]
1862 pathout = os.path.dirname(os.path.dirname(analyses[0]['pathout']))
1863 self.analyses = analyses
1865 pathout = os.path.join(pathout, parametres['corpus_name'])
1867 while os.path.exists(pathout + '_%i' % i) :
1869 parametres['pathout'] = pathout + '_%i' % i
1870 self.parametres = parametres
1871 self.parametres['clusters'] = clusters
1872 self.parametres['newet'] = self.newet
1873 self.parametres['corpusira'] = corpusira
def doanalyse(self):
    """Run a ``BuildMergeFromClusters`` build and return its corpus.

    The builder is given the selected analyses, the prepared parameters
    and the progress dialog held by this object.
    """
    builder = BuildMergeFromClusters(self.analyses, parametres=self.parametres, dlg=self.dlg)
    return builder.corpus