1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique, progressbar
16 from operator import itemgetter
17 from uuid import uuid4
18 from chemins import PathOut
19 from dialog import CorpusPref, SubTextFromMetaDial
21 from colors import colors
25 log = logging.getLogger('iramuteq.corpus')
28 def copycorpus(corpus) :
29 log.info('copy corpus')
30 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
31 copy_corpus.ucis = corpus.ucis
32 copy_corpus.formes = corpus.formes
33 copy_corpus.pathout = corpus.pathout
34 copy_corpus.conn_all()
38 return Uce(uce.ident, uce.para, uce.uci)
42 nuci = Uci(uci.ident, '')
43 nuci.etoiles = copy(uci.etoiles)
44 nuci.uces = [CopyUce(uce) for uce in uci.uces]
53 def __init__(self, parent, parametres = {}, read = False) :
55 self.parametres = parametres
57 self.connformes = None
59 self.conncorpus = None
66 self.idformesuces = {}
71 self.pathout = PathOut(dirout = parametres['pathout'])
74 def add_word(self, word) :
75 if word in self.formes :
76 self.formes[word].freq += 1
77 if self.formes[word].ident in self.idformesuces :
78 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
79 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
81 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
83 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
85 if word in self.parent.lexique :
86 gramtype = self.parent.lexique[word][1]
87 lem = self.parent.lexique[word][0]
94 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
95 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
def add_word_from_forme(self, word, stident):
    """Record one occurrence of `word` for the uce id `stident`.

    `word` must expose `forme`, `gram` and `lem`.  A form already present in
    self.formes gets its `freq` incremented and its counter
    self.idformesuces[ident][stident] increased (created when missing).
    An unknown form is stored as a new Word built with len(self.formes) as
    its third argument, and its counter starts at {stident: 1}.
    """
    if word.forme not in self.formes:
        # First sighting of this form: create it and start its per-uce counter.
        created = Word(word.forme, word.gram, len(self.formes), word.lem)
        self.formes[word.forme] = created
        self.idformesuces[created.ident] = {stident: 1}
        return
    known = self.formes[word.forme]
    known.freq += 1
    counts = self.idformesuces.setdefault(known.ident, {})
    counts[stident] = counts.get(stident, 0) + 1
112 """connect corpus to db"""
113 if self.connformes is None :
114 log.info('connexion corpus')
115 self.connuces = sqlite3.connect(self.pathout['uces.db'])
116 self.cuces = self.connuces.cursor()
117 self.connformes = sqlite3.connect(self.pathout['formes.db'])
118 self.cformes = self.connformes.cursor()
119 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
120 self.ccorpus = self.conncorpus.cursor()
121 self.cformes.execute('PRAGMA temp_store=MEMORY;')
122 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
123 self.cformes.execute('PRAGMA synchronous = OFF;')
124 self.cuces.execute('PRAGMA temp_store=MEMORY;')
125 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
126 self.cuces.execute('PRAGMA synchronous = OFF;')
127 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
128 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
129 self.ccorpus.execute('PRAGMA synchronous = OFF;')
131 def read_corpus(self) :
132 log.info('read corpus')
133 self.parametres['syscoding'] = sys.getdefaultencoding()
134 if self.conncorpus is None :
136 res = self.ccorpus.execute('SELECT * FROM etoiles;')
138 self.ucis.append(Uci(row[0], row[1], row[2]))
139 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
141 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
142 res = self.ccorpus.execute('SELECT * FROM formes;')
143 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the uce ids stored for a form in the `uces` table.

    wordid -- the id of the form, or the form itself as a string, in which
              case its id is read from self.formes[wordid].ident.

    The query runs on self.cformes.  A cell that is an int is kept as is;
    any other cell is split on whitespace and each piece converted with
    int().  The ids are returned as a flat list, in row order.
    """
    try:
        text_types = basestring  # Python 2
    except NameError:
        text_types = str  # Python 3 has no basestring
    if isinstance(wordid, text_types):
        wordid = self.formes[wordid].ident
    # The id is bound as its repr() string, exactly as the former backtick
    # expression did.
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    found = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            found.append(cell)
        else:
            found.extend(int(val) for val in cell.split())
    return found
def getworducis(self, wordid):
    """Return the distinct `uci` values of the uces given by getworduces(wordid).

    Every uce id is resolved through self.getucefromid(); duplicates are
    removed with a set, so the order of the returned list is not defined.
    """
    owners = set()
    for uceid in self.getworduces(wordid):
        owners.add(self.getucefromid(uceid).uci)
    return list(owners)
156 def getformeuceseff(self, formeid) :
157 if isinstance(formeid, basestring) :
158 formeid = self.formes[formeid].ident
159 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
160 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
161 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
162 res = self.cformes.execute(query)
163 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
165 for i, uce in enumerate(uces) :
166 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem):
    """Return the distinct uce ids stored for all the forms of lemma `lem`.

    The form ids are taken from self.lems[lem].formes and interpolated into
    the SQL text (they are internal ids, not user input).  A cell that is an
    int is kept as is; any other cell is split on whitespace and each piece
    converted with int().  The result is a list without duplicates, in no
    guaranteed order.
    """
    ids_sql = ', '.join([repr(formeid) for formeid in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % ids_sql
    segments = set()
    for row in self.cformes.execute(query):
        cell = row[0]
        if isinstance(cell, int):
            segments.add(cell)
        else:
            segments.update(int(val) for val in cell.split())
    return list(segments)
175 def gettgenst(self, tgen):
178 if lem in self.lems :
179 formesid += self.lems[lem].formes
181 print 'abscent : %s' % lem
182 query = 'SELECT uces FROM uces where id IN %s ORDER BY id' % str(tuple(formesid))
183 res = self.cformes.execute(query)
184 return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))))
186 def gettgenstprof(self, tgen, classe, i, clnb):
189 if lem in self.lems :
190 lemst = self.getlemuces(lem)
192 if not lem in self.tgenlem :
193 self.tgenlem[lem] = [0] * clnb
194 self.tgenlem[lem][i] = len(set(lemst).intersection(classe))
196 print 'abscent: ',lem
197 return list(set(tgenst))
def gettgentxt(self, tgen):
    """Return the distinct `uci` values of the uces given by gettgenst(tgen).

    Each uce id is resolved through self.getucefromid(); the returned list
    has no duplicates and no defined order.
    """
    owners = [self.getucefromid(uceid).uci for uceid in self.gettgenst(tgen)]
    return list(set(owners))
def getlemucis(self, lem):
    """Return the distinct `uci` values of the uces given by getlemuces(lem).

    Each uce id is resolved through self.getucefromid(); the returned list
    has no duplicates and no defined order.
    """
    resolve = self.getucefromid
    return list(set(resolve(uceid).uci for uceid in self.getlemuces(lem)))
207 def getlemuceseff(self, lem, luces = None) :
208 formesid = ', '.join([`val` for val in self.lems[lem].formes])
209 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
210 res = self.cformes.execute(query)
211 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
212 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
213 res = self.cformes.execute(query)
214 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
216 for i, uce in enumerate(uces) :
217 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster):
    """Return how many distinct entries of self.lc[cluster] are also in getlemuces(lem)."""
    members = set(self.lc[cluster])
    shared = members.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem):
    """Return the `freq` attribute of the entry stored for `lem` in self.lems.

    Raises KeyError when `lem` is not a key of self.lems.
    """
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid):
    """Return the entry of self.idformes stored under `formeid`.

    self.idformes is built on demand by self.make_idformes() when it is
    still None.
    """
    if self.idformes is None:
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the sum of the `freq` attributes of all values of self.formes."""
    total = 0
    for word in self.formes.values():
        total += word.freq
    return total
def getucemean(self):
    """Return gettotocc() divided by getucenb(), as a float.

    Raises ZeroDivisionError when getucenb() returns 0.
    """
    occurrences = float(self.gettotocc())
    return occurrences / self.getucenb()
240 return self.ucis[-1].uces[-1].ident + 1
243 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return one total per uci of self.ucis, summed from getucesize().

    For each uci the sizes are sliced by uce ident, from the ident of its
    first uce up to and including the ident of its last uce.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:last + 1]))
    return totals
def getucesize(self):
    """Return, for each row of getalluces(), the number of whitespace-separated tokens in row[1]."""
    sizes = []
    for row in self.getalluces():
        tokens = row[1].split()
        sizes.append(len(tokens))
    return sizes
def getconcorde(self, uces):
    """Return the rows of the `uces` table whose id is in `uces`, ordered by id.

    The value returned is whatever self.cuces.execute() returns.

    NOTE: the ids are interpolated into the SQL text through repr(), so
    `uces` must only hold trusted values (presumably integer ids -- TODO
    confirm against callers).
    """
    ids_sql = ', '.join([repr(uceid) for uceid in uces])
    query = 'select * from uces where id IN (%s) ORDER BY id;' % ids_sql
    return self.cuces.execute(query)
256 def getuciconcorde(self, ucis) :
257 uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis]
258 uces = [[val[0], '\n'.join([row[1] for row in self.getconcorde(val[1])])] for val in uces]
def getwordconcorde(self, word):
    """Return getconcorde() applied to the uce ids given by getworduces(word)."""
    uceids = self.getworduces(word)
    return self.getconcorde(uceids)
def getlemconcorde(self, lem):
    """Return getconcorde() applied to the uce ids given by getlemuces(lem)."""
    uceids = self.getlemuces(lem)
    return self.getconcorde(uceids)
def getalluces(self):
    """Run 'SELECT * FROM uces' on self.cuces and return what execute() returns."""
    cursor = self.cuces
    return cursor.execute('SELECT * FROM uces')
def getallucis(self):
    """Return one [uci.ident, text] pair per uci of self.ucis.

    `text` joins with newlines the second column of the getalluces() rows,
    picked by position using the ident of each uce of the uci.
    """
    texts = [row[1] for row in self.getalluces()]
    pairs = []
    for uci in self.ucis:
        parts = [texts[uce.ident] for uce in uci.uces]
        pairs.append([uci.ident, '\n'.join(parts)])
    return pairs
def getucesfrometoile(self, etoile):
    """Return the idents of the uces of every uci whose `etoiles` contains `etoile`.

    Idents are listed in the order of self.ucis, then of each uci's uces.
    """
    selected = []
    for uci in self.ucis:
        if etoile in uci.etoiles:
            selected.extend(uce.ident for uce in uci.uces)
    return selected
277 def getetoileuces(self) :
278 log.info('get uces etoiles')
281 for uci in self.ucis :
282 etoiles = uci.etoiles[1:]
284 if et in etoileuces :
285 etoileuces[et] += [uce.ident for uce in uci.uces]
287 etoileuces[et] = [uce.ident for uce in uci.uces]
289 for et in uci.paras :
290 if et in etoileuces :
291 etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara]
293 etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara]
299 def getetoileucis(self):
301 for uci in self.ucis :
302 etoiles = uci.etoiles[1:]
304 if et in etoileuces :
305 etoileuces[et] += [uci.ident]
307 etoileuces[et] = [uci.ident]
def getucefromid(self, uceid):
    """Return the entry of self.iduces stored under `uceid`.

    self.iduces is built on demand by self.make_iduces() when it is still
    None.
    """
    if self.iduces is None:
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return how many values of self.formes have a `freq` equal to 1."""
    return sum(1 for word in self.formes.values() if word.freq == 1)
def getactivesnb(self, key):
    """Return how many values of self.lems have an `act` equal to `key`."""
    count = 0
    for entry in self.lems.values():
        if entry.act == key:
            count += 1
    return count
319 # def make_lems(self, lem = True) :
320 # log.info('make lems')
322 # for forme in self.formes :
323 # if self.formes[forme].lem in self.lems :
324 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
325 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
327 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the `etoiles` of the uci that owns the uce `uceid`.

    The map from uce ident to uci ident is built on first use and kept in
    self.uceuci; the uci ident is then used as an index into self.ucis.
    """
    if self.uceuci is None:
        owner = {}
        for uci in self.ucis:
            for uce in uci.uces:
                owner[uce.ident] = uci.ident
        self.uceuci = owner
    position = self.uceuci[uceid]
    return self.ucis[position].etoiles
333 def make_lems(self, lem = True) :
334 log.info('make lems')
337 for forme in self.formes :
338 if self.formes[forme].lem in self.lems :
339 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
340 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
342 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
344 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
346 def make_lems_from_dict(self, dictionnaire, dolem = True) :
347 log.info('make lems from dict')
349 for forme in self.formes :
350 if self.formes[forme].forme in dictionnaire :
351 lem = dictionnaire[forme][0]
352 gram = dictionnaire[forme][1]
353 elif forme.isdigit() :
359 self.formes[forme].lem = lem
360 self.formes[forme].gram = gram
362 if self.formes[forme].lem in self.lems :
363 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
364 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
366 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
368 self.lems[forme] = Lem(self, self.formes[forme])
def make_idformes(self):
    """(Re)build self.idformes, mapping the `ident` of each value of self.formes to that value."""
    index = {}
    for word in self.formes.values():
        index[word.ident] = word
    self.idformes = index
def make_iduces(self):
    """Build self.iduces (uce ident -> uce object) unless it is already set.

    Nothing is done when self.iduces is not None.
    """
    if self.iduces is not None:
        return
    index = {}
    for uci in self.ucis:
        for uce in uci.uces:
            index[uce.ident] = uce
    self.iduces = index
377 def make_lexitable(self, mineff, etoiles, gram = 0) :
382 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams]
383 etuces = [[] for et in etoiles]
384 for uci in self.ucis :
385 get = list(set(uci.etoiles).intersection(etoiles))
387 log.info('2 variables sur une ligne')
389 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
390 etuces = [set(val) for val in etuces]
393 deff = self.getlemuceseff(lem)
395 line = [lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
396 if sum(line[1:]) >= mineff :
398 tab.insert(0, [''] + etoiles)
401 def make_tgen_table(self, tgen, etoiles, tot = None):
402 lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
403 sets = [set(cl) for cl in lclasses]
404 totoccurrences = dict([[val, 0] for val in etoiles])
406 for forme in self.formes :
407 formeuceeff = self.getformeuceseff(forme)
408 for i, classe in enumerate(lclasses) :
409 concern = sets[i].intersection(formeuceeff.keys())
411 totoccurrences[etoiles[i]] += sum([formeuceeff[uce] for uce in concern])
412 #tgenoccurrences = dict([[val, 0] for val in etoiles])
415 tgenoccurrences[t] = dict([[val, 0] for val in etoiles])
417 lemuceeff = self.getlemuceseff(lem)
418 for i, classe in enumerate(lclasses) :
419 concern = sets[i].intersection(lemuceeff.keys())
421 tgenoccurrences[t][etoiles[i]] += sum([lemuceeff[uce] for uce in concern])
422 return tgenoccurrences, totoccurrences
424 def make_tgen_profile(self, tgen, ucecl, uci = False) :
425 log.info('tgen/classes')
429 #FIXME : NE MARCHE PLUS CHANGER CA
430 tab = [[lem] + [len(set(self.gettgentxt(tgen[lem])).intersection(classe)) for i, classe in enumerate(ucecl)] for lem in tgen]
432 tab = [[lem] + [len(set(self.gettgenstprof(tgen[lem], classe, i, clnb)).intersection(classe)) for i, classe in enumerate(ucecl)] for lem in tgen]
433 tab = [[line[0]] + [val for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
437 #while nam + `i` in tgen :
440 #last = [nam] + [`len(classe)` for classe in ucecl]
442 #line0 = ['tgen'] + ['_'.join(['cluster', `i+1`]) for i in range(len(ucecl))]
444 #with open(fileout, 'w') as f :
445 # f.write('\n'.join(['\t'.join(line) for line in tab]).encode(self.parametres['syscoding']))
447 def make_efftype_from_etoiles(self, etoiles) :
449 etuces = [[] for et in etoiles]
450 for uci in self.ucis :
451 get = list(set(uci.etoiles).intersection(etoiles))
453 return '2 variables sur la meme ligne'
455 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
456 etuces = [set(val) for val in etuces]
457 for lem in self.lems :
458 deff = self.getlemuceseff(lem)
460 gram = self.lems[lem].gram
462 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
464 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
465 tabout = [[gram] + dtype[gram] for gram in dtype]
466 tabout.insert(0, [''] + etoiles)
469 def make_uceactsize(self, actives) :
470 res = self.getalluces()
473 deff = self.getlemuceseff(lem)
475 ucesize[uce] = ucesize.get(uce, 0) + 1
478 def make_uc(self, actives, lim1, lim2) :
479 uceactsize = self.make_uceactsize(actives)
485 for uce in [uce for uci in self.ucis for uce in uci.uces] :
486 if uce.para == lastpara :
488 last1 += uceactsize.get(uce.ident,0)
489 uc1[-1].append(uce.ident)
491 uc1.append([uce.ident])
494 last2 += uceactsize.get(uce.ident, 0)
495 uc2[-1].append(uce.ident)
497 uc2.append([uce.ident])
500 last1 = uceactsize.get(uce.ident, 0)
501 last2 = uceactsize.get(uce.ident, 0)
503 uc1.append([uce.ident])
504 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build the two uc groupings and write their matrices and uce lists.

    self.make_uc(actives, sizeuc1, sizeuc2) provides the two groupings.
    Each one is written with self.write_ucmatrix() to uc1out / uc2out, then
    a 'uce;uc' table (one line per uce, giving the index of the group that
    contains it) is written to listuce1out / listuce2out.

    Returns the tuple (len(uc1), len(uc2)).
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i', len(uc1), len(uc2))
    # Both matrices are written before either list file, as before.
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    for grouping, listout in ((uc1, listuce1out), (uc2, listuce2out)):
        lines = ['uce;uc']
        for i, ucl in enumerate(grouping):
            for uce in ucl:
                lines.append(';'.join([repr(uce), repr(i)]))
        with open(listout, 'w') as f:
            f.write('\n'.join(lines))
    return len(uc1), len(uc2)
520 def write_ucmatrix(self, uc, actives, fileout) :
521 log.info('write uc matrix %s' % fileout)
522 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
525 with open(fileout + '~', 'w+') as f :
526 for i, lem in enumerate(actives) :
527 for uce in self.getlemuces(lem):
528 if (uces_uc[uce], i) not in deja_la :
530 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
531 deja_la[(uces_uc[uce], i)] = 0
533 with open(fileout, 'w') as ffin :
534 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
537 os.remove(fileout + '~')
540 def export_corpus(self, outf) :
541 #outf = 'export_corpus.txt'
543 res = self.getalluces()
547 with open(outf,'w') as f :
549 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
550 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
551 elif self.iduces[uce[0]].uci != actuci :
552 actuci = self.iduces[uce[0]].uci
553 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
554 actpara = self.iduces[uce[0]].para
555 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
558 actpara = self.iduces[uce[0]].para
559 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
560 elif self.iduces[uce[0]].para != actpara :
561 actpara = self.iduces[uce[0]].para
563 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
def export_meta_table(self, outf):
    """Write a tab-separated table of the ucis' etoiles to `outf`.

    Each uci of self.ucis gives one line: its position in self.ucis followed
    by etoiles[1:].  A header line 'column_0 ... column_N' comes first, sized
    on the longest line.  The text is encoded with
    self.parametres['syscoding'] before being written.
    """
    metas = [[repr(i)] + text.etoiles[1:] for i, text in enumerate(self.ucis)]
    # max() raises ValueError on an empty sequence: with no uci the header
    # is simply empty.
    longueur_max = max([len(val) for val in metas] or [0])
    first = ['column_%i' % i for i in range(longueur_max)]
    lines = ['\t'.join(line) for line in [first] + metas]
    with open(outf, 'w') as f:
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
573 def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) :
575 for i, lc in enumerate(self.lc) :
578 for uce in self.lc0 :
581 res = self.getalluces()
584 res = self.getallucis()
585 with open(outf, 'w') as f :
589 actuci = self.iduces[uce[0]].uci
593 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
595 etline = ' '.join(self.ucis[actuci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
597 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[actuci].etoiles[1:]])
598 f.write(etline.encode(self.parametres['syscoding']) + '\n')
599 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
601 def export_classe(self, outf, classe, lem = False, uci = False) :
602 sts = self.lc[classe - 1]
604 res = self.getconcorde(sts)
607 res = self.getuciconcorde(sts)
608 with open(outf, 'w') as f :
612 f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n')
614 f.write(' '.join(self.ucis[uce[0]].etoiles).encode(self.parametres['syscoding']) + '\n')
616 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
617 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
619 def export_owledge(self, rep, classe, lem = False, uci = False) :
620 sts = self.lc[classe - 1]
622 res = self.getconcorde(sts)
625 res = self.getuciconcorde(sts)
629 outf = '.'.join([`ident`, 'txt'])
630 outf = os.path.join(rep, outf)
632 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
633 with open(outf, 'w') as f :
634 f.write(guce.encode('cp1252', errors = 'replace'))
636 def export_tropes(self, fileout, classe, lem = False, uci = False) :
637 sts = self.lc[classe - 1]
639 res = self.getconcorde(sts)
642 res = self.getuciconcorde(sts)
643 with open(fileout, 'w') as f :
647 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
648 f.write(guce.encode('cp1252', errors = 'replace'))
651 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
652 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
654 with open(outfile + '~', 'w+') as f :
655 for i, lem in enumerate(actives) :
656 for uce in sorted(self.getlemuces(lem)) :
658 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
660 with open(outfile, 'w') as ffin :
661 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
664 os.remove(outfile + '~')
666 with open(listuce, 'w') as f :
667 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
669 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
670 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
672 with open(outfile + '~', 'w+') as f :
673 for i, lem in enumerate(actives) :
674 for uci in sorted(self.getlemucis(lem)) :
676 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
678 with open(outfile, 'w') as ffin :
679 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
682 os.remove(outfile + '~')
684 with open(listuci, 'w') as f :
685 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
687 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
688 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
690 duces = dict([[uce, i] for i, uce in enumerate(uces)])
691 with open(outfile + '~', 'w+') as f :
692 for i, lem in enumerate(actives) :
693 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
695 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
697 with open(outfile, 'w') as ffin :
698 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uces), len(actives), nbl))
701 os.remove(outfile + '~')
703 def make_table_with_classe(self, uces, list_act, uci = False) :
704 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
705 uces = dict([[uce, i] for i, uce in enumerate(uces)])
707 getlem = self.getlemucis
709 getlem = self.getlemuces
710 for i, lem in enumerate(list_act) :
711 lemuces = list(set(getlem(lem)).intersection(uces))
713 table_uce[uces[uce]][i] = 1
714 table_uce.insert(0, list_act)
717 def make_pondtable_with_classe(self, uces, list_act) :
718 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
719 uces = dict([[uce, i] for i, uce in enumerate(uces)])
720 for i, lem in enumerate(list_act) :
721 uceseff = self.getlemuceseff(lem)
722 lemuces = list(set(uceseff.keys()).intersection(uces))
724 table_uce[uces[uce]][i] = uceseff[uce]
725 table_uce.insert(0, list_act)
def parse_active(self, gramact, gramsup = None):
    """Set the `act` attribute of every entry of self.lems.

    - key starting and ending with '_': act = 2
    - `gram` in gramact: act = 1
    - otherwise, when gramsup is given: act = 2 if `gram` is in gramsup,
      else act = 0
    - otherwise (gramsup is None): act = 2
    """
    log.info('parse actives')
    for name, entry in self.lems.items():
        if name.startswith('_') and name.endswith('_'):
            entry.act = 2
            continue
        if entry.gram in gramact:
            entry.act = 1
            continue
        # From here on `gram` is known not to be in gramact.
        if gramsup is None or entry.gram in gramsup:
            entry.act = 2
        else:
            entry.act = 0
743 def make_actives_limit(self, limit, key = 1) :
744 if self.idformes is None :
746 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
748 def make_actives_nb(self, nbmax, key) :
749 log.info('make_actives_nb : %i - %i' % (nbmax,key))
750 if self.idformes is None :
752 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
753 self.activenb = len(allactives)
754 allactives = sorted(allactives, reverse = True)
755 if self.activenb == 0 :
757 if len(allactives) <= nbmax :
758 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
759 return [val[1] for val in allactives], allactives[-1][0]
761 effs = [val[0] for val in allactives]
762 if effs.count(effs[nbmax - 1]) > 1 :
763 lim = effs[nbmax - 1] + 1
767 stop = effs.index(lim)
774 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
775 return [val[1] for val in allactives[0:stop + 1]], lim
777 def make_and_write_profile(self, actives, ucecl, fileout, uci = False) :
778 log.info('formes/classes')
780 tab = [[lem] + [len(set(self.getlemucis(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
782 tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
783 tab = [[line[0]] + [`val` for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
784 with open(fileout, 'w') as f :
785 f.write('\n'.join([';'.join(line) for line in tab]).encode(self.parametres['syscoding']))
787 def make_etoiles(self) :
789 for uci in self.ucis :
790 etoiles.update(uci.etoiles[1:])
793 def make_themes(self):
795 for uci in self.ucis :
796 themes.update(uci.paras)
799 def make_etoiles_dict(self) :
800 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
802 for etoile in etoiles :
803 et = etoile.split('_')
806 endet = '_'.join(et[1:])
807 if etoile in det[et[0]] :
808 det[et[0]][etoile] += 1
810 det[et[0]][etoile] = 1
815 endet = '_'.join(et[1:])
816 det[et[0]] = {etoile :1}
821 def make_theme_dict(self):
822 themes = [val for uci in self.ucis for val in uci.paras]
824 for theme in themes :
825 th = theme.split('_')
828 endth = '_'.join(th[1:])
829 if theme in det[th[0]] :
830 det[th[0]][theme] += 1
832 det[th[0]][theme] = 1
837 endth = '_'.join(th[1:])
838 det[th[0]] = {theme:1}
843 def make_etline(self, listet) :
844 etuces = [[] for et in listet]
845 for uci in self.ucis :
846 get = list(set(uci.etoiles).intersection(listet))
848 return '2 variables sur la meme ligne'
850 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
853 def make_and_write_profile_et(self, ucecl, fileout, uci = False) :
854 log.info('etoiles/classes')
856 etoileuces = self.getetoileuces()
858 etoileuces = self.getetoileucis()
859 etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1])
860 with open(fileout, 'w') as f :
861 f.write('\n'.join([';'.join([et] + [`len(set(etoileuces[et]).intersection(classe))` for classe in ucecl]) for et in etoileuces]).encode(self.parametres['syscoding']))
862 #etoiles = self.make_etoiles()
863 #with open(fileout, 'w') as f :
864 # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
866 def make_colored_corpus(self, uci = False) :
868 for i, lc in enumerate(self.lc) :
871 for uce in self.lc0 :
873 color = ['black'] + colors[len(self.lc) - 1]
875 <meta http-equiv="content-Type" content="text/html; charset=%s" />
877 ''' % sys.getdefaultencoding()
879 res = self.getalluces()
884 if self.iduces[uce[0]].uci != actuci :
885 actuci = self.iduces[uce[0]].uci
886 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
887 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
889 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
891 res = self.getallucis()
894 if self.ucis[uce[0]].ident != actuci :
895 actuci = self.ucis[uce[0]].ident
896 txt += '<br><hr>' + ' '.join(self.ucis[self.ucis[uce[0]].ident].etoiles) + '<br><br>'
897 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
899 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
900 return txt + '\n</body></html>'
902 def count_from_list(self, l, d) :
910 def count_from_list_cl(self, l, d, a, clnb) :
919 def find_segments(self, taille_segment, taille_limite) :
921 for uce in self.getalluces() :
923 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
924 l = [[d[val], val] for val in d if d[val] >= 3]
927 if len(l) > taille_limite :
928 l = l[-taille_limite:]
931 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite, uci = False):
934 concorde = self.getconcorde
936 concorde = self.getuciconcorde
937 for uce in concorde(list_uce) :
939 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
940 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
943 if len(l) > taille_limite :
944 l = l[-taille_limite:]
947 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
949 for b, classe in enumerate(self.lc) :
950 for uce in self.getconcorde(classe) :
953 uce = [self.formes[forme].lem for forme in uce]
954 for taille_segment in range(lenmin,lenmax) :
955 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
956 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
957 with open(fileout, 'w') as f :
958 f.write('\n'.join([';'.join(line) for line in result]))
960 def make_proftype(self, outf) :
962 for lem in self.lems :
963 gram = self.lems[lem].gram
965 res[gram] = [0 for val in self.lc]
966 lemuceeff = self.getlemuceseff(lem)
967 for i, classe in enumerate(self.lc) :
968 concern = set(classe).intersection(lemuceeff.keys())
969 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
970 res = [[gram] + [`val` for val in res[gram]] for gram in res]
972 with open(outf, 'w') as f :
973 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
976 def make_ucecl_from_R(self, filein) :
977 with open(filein, 'rU') as f :
982 line = line.replace('\n', '').replace('"', '').split(';')
983 self.lc.append([int(line[0]) - 1, int(line[1])])
984 classesl = [val[1] for val in self.lc]
986 self.lc = sorted(self.lc, key=itemgetter(1))
987 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
988 self.lc0 = self.lc.pop(0)
991 def get_stat_by_cluster(self, outf, lclasses = None) :
992 log.info('get_stat_by_cluster')
993 if lclasses is None :
996 occurrences = dict([[i + 1, 0] for i in range(len(lclasses))])
997 formescl = dict([[i + 1, 0] for i in range(len(lclasses))])
998 hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))])
999 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)])
1000 sets = [set(cl) for cl in lclasses]
1001 for forme in self.formes :
1002 formeuceeff = self.getformeuceseff(forme)
1003 for i, classe in enumerate(lclasses) :
1004 concern = sets[i].intersection(formeuceeff.keys())
1006 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
1008 if self.formes[forme].freq == 1 :
1010 log.info('%f' % (time() - t1))
1011 if outf is not None :
1012 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
1013 with open(outf, 'w') as f :
1016 return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences]
1018 def get_stat_by_et(self, outf, etoiles) :
1019 lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
1020 stats = self.get_stat_by_cluster(None, lclasses)
1021 stats = [[etoiles[i]] + val for i, val in enumerate(stats)]
1023 def gethapaxbyet(self, etoiles) :
1024 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
1026 for uce in hapaxuces :
1027 if uce in hucesdict :
1031 etuces = [[] for et in etoiles]
1032 for uci in self.ucis :
1033 get = list(set(uci.etoiles).intersection(etoiles))
1035 return '2 variables sur la meme ligne'
1037 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
1038 etuces = [set(val) for val in etuces]
1039 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
# Build an HTML report of the segments holding the most hapax, with the
# hapax forms highlighted in red.
1041 def gethapaxuces(self) :
# Parallel lists: hapax[i] is a lemma of frequency 1 and hapaxuces[i] is the
# first segment returned by getlemuces() for it.
1042 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
1043 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
# hucesdict[segment] = [number of hapax, list of the hapax lemmas].
1045 for i,uce in enumerate(hapaxuces) :
1046 if uce in hucesdict :
1047 hucesdict[uce][0] += 1
1048 hucesdict[uce][1].append(hapax[i])
1050 hucesdict[uce] = [1,[hapax[i]]]
# huces groups the segments by their hapax count.
1052 for uce in hucesdict :
1053 if hucesdict[uce][0] in huces :
1054 huces[hucesdict[uce][0]].append(uce)
1056 huces[hucesdict[uce][0]] = [uce]
# Python 2: zip() returns a list, which is then sorted by decreasing count.
1057 huces = zip(huces, huces.values())
1058 huces.sort(reverse=True)
# Only the four highest hapax counts are reported.
1062 for nb in huces[0:4] :
1063 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
1065 res = self.getconcorde([uce])
# Pad with spaces so that forms at either end of the segment can be matched.
1067 ucetxt = ' ' + row[1] + ' '
1069 for hap in hucesdict[uce][1] :
# Text of the first form listed for the hapax lemma.
1070 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
1071 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
1072 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
1073 txt += '<p>'+ucetxt+'</p>\n'
# NOTE(review): hard-coded path under /tmp -- looks like debugging output and
# is not portable; confirm whether this should be a parameter.
1077 with open('/tmp/testhapxuce.html','w') as f :
def export_dictionary(self, fileout, syscoding) :
    """Write the forms dictionary to ``fileout`` as tab-separated lines.

    Each line holds, in this order: form, lemma, grammatical type and
    frequency. Records are sorted in decreasing order, frequency first.

    fileout   -- path of the file to create (overwritten if it exists)
    syscoding -- name of the encoding applied to the output
    """
    listformes = [[self.formes[forme].freq, forme, self.formes[forme].lem, self.formes[forme].gram] for forme in self.formes]
    listformes.sort(reverse = True)
    # Move the frequency to the end of each record, as text.
    listformes = [forme[1:] + [repr(forme[0])] for forme in listformes]
    # Binary mode: the payload is already encoded, so the platform must not
    # translate newlines behind our back (this corrupts multi-byte encodings
    # on Windows) and the write must accept bytes.
    with open(fileout, 'wb') as f :
        f.write('\n'.join(['\t'.join(forme) for forme in listformes]).encode(syscoding))
# Write every lemma with its forms to `fileout`, one lemma per line:
# lemma, then (form, frequency) pairs, all tab-separated, encoded with
# `syscoding`.
1087 def export_lems(self, fileout, syscoding) :
# self.idformes is used below to map a form id to its record.
1088 self.make_idformes()
# Backticks are the Python 2 spelling of repr().
1089 listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems]
# NOTE(review): encoded bytes are written through a text-mode handle;
# binary mode ('wb') would avoid newline translation -- confirm.
1091 with open(fileout, 'w') as f :
1092 f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding))
def __init__(self, corpus) :
    """Query basic per-uci figures from ``corpus``.

    Asks the corpus for its uci count, the uci sizes and the starred
    variables dictionary, and derives the mean uci size. The values are
    local to the constructor: nothing is stored on the instance.

    Raises ZeroDivisionError when the corpus reports zero uci.
    """
    uci_count = corpus.getucinb()
    uci_sizes = corpus.getucisize()
    total_size = float(sum(uci_sizes))
    mean_size = total_size / float(uci_count)
    etoiles_by_name = corpus.make_etoiles_dict()
# Build a text unit (uci) from its identifier and its marker line.
#   iduci   -- identifier of the text
#   line    -- marker line; its whitespace-separated tokens become `etoiles`
#   paraset -- optional whitespace-separated paragraph markers
1104 def __init__(self, iduci, line, paraset = None) :
1106 self.etoiles = line.split()
# Paragraph markers are only taken from `paraset` when one is supplied.
1108 if paraset is not None :
1109 self.paras = paraset.split()
# Constructor taking a segment id, a paragraph id and a text id.
# NOTE(review): presumably stored as ident / para / uci -- confirm.
1114 def __init__(self, iduce, idpara, iduci) :
# Constructor of a form record.
#   word, gramtype, idword -- the form, its grammatical type, its identifier
#   lem  -- optional lemma
#   freq -- optional initial frequency
1120 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
1123 self.gram = gramtype
# An explicitly supplied frequency gets its own branch.
1126 if freq is not None :
def __init__(self, parent, forme) :
    """Create a lemma seeded with its first form.

    parent -- accepted but not used by the constructor
    forme  -- form record providing ``ident``, ``freq``, ``gram`` and ``act``

    The lemma starts with a one-entry ``formes`` mapping (form id ->
    frequency) and copies the grammatical type, frequency and ``act`` flag
    of that form.
    """
    seed_freq = forme.freq
    self.formes = dict([(forme.ident, seed_freq)])
    self.gram = forme.gram
    self.freq = seed_freq
    self.act = forme.act
def add_forme(self, forme) :
    """Attach one more form to the lemma.

    The form's frequency is recorded under its identifier in
    ``self.formes`` and added to the lemma's total frequency.
    """
    extra = forme.freq
    self.formes[forme.ident] = extra
    self.freq = self.freq + extra
# Find where to cut `chaine` into a segment of about `longueurOptimale` items
# and at most `longueur` + 1 items.
# Returns a 3-tuple: (whether a non-empty segment was produced, the segment,
# the remainder), or (False, chaine, '') when no cut point was found.
# English rendering of the original French description below: start from the
# last character and walk back to the start of the string; if a '$' is
# found, we are done; otherwise look for the best candidate, i.e. the
# highest weight/distance ratio.
1142 def decouperlist(chaine, longueur, longueurOptimale) :
1144 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
1145 Si on trouve un '$', c'est fini.
1146 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
# Each separator comes with a weight; heavier separators are preferred cuts.
1148 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
1149 dsep = dict([[val[0],val[1]] for val in separateurs])
1150 trouve = False # whether a good separator was found
1151 iDecoupe = 0 # index of the character where the cut must happen
# Never look past the end of the input.
1153 longueur = min(longueur, len(chaine) - 1)
1154 chaineTravail = chaine[:longueur + 1]
1156 meilleur = ['', 0, 0] # type, weight and position of the best separator
# A '$' item inside the working window fixes the cut just before it.
1159 indice = chaineTravail.index(u'$')
1161 iDecoupe = indice - 1
1166 caractere = chaineTravail[nbCar]
# The +1 keeps both distances strictly positive so the ratios are defined.
1167 distance = abs(longueurOptimale - nbCar) + 1
1168 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
1169 if caractere in dsep :
# Keep the candidate with the highest weight/distance ratio.
1170 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
1171 meilleur[0] = caractere
1172 meilleur[1] = dsep[caractere]
# NOTE(review): items that are not separators are presumably scored with the
# weight of a plain space -- confirm against the surrounding branch.
1177 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
1179 meilleur[1] = dsep[' ']
1186 #if meilleur[0] != ' ' :
1187 # fin = chaine[iDecoupe + 1:]
1188 # retour = chaineTravail[:iDecoupe]
# The item at iDecoupe stays in the returned segment.
1190 fin = chaine[iDecoupe + 1:]
1191 retour = chaineTravail[:iDecoupe + 1]
1192 return len(retour) > 0, retour, fin
1193 # nothing was found
1194 return False, chaine, ''
1196 def testetoile(line) :
1197 return line.startswith(u'****')
# True when the first four characters are all digits and the line contains
# a '*' somewhere.
1200 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt) :
    """Split ``txt`` on whitespace and append the ``$`` end marker.

    Returns a new list of tokens whose last element is always ``u'$'``.
    """
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
# Text preparation used together with the character-based splitter
# (prep_makeuce pairs it with decoupercharact).
1205 def prep_txtcharact(txt) :
1210 Class for building a corpus
# Prepare the construction of a corpus from a text file.
#   infile            -- path of the text file to read
#   parametres_corpus -- dict of corpus parameters; 'encoding', 'originalpath',
#                        'pathout' and 'corpus_name' are read directly and
#                        must be present
#   lexique           -- optional lexicon
#   expressions       -- optional expressions dictionary
#   dlg               -- optional progress dialog
1212 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
1213 log.info('begin building corpus...')
1214 self.lexique = lexique
1215 self.expressions = expressions
# The builder passes itself as the corpus parent.
1217 self.corpus = Corpus(self, parametres_corpus)
1218 self.infile = infile
# Threshold compared with self.last in treattxt.
1220 self.lim = parametres_corpus.get('lim', 1000000)
1221 self.encoding = parametres_corpus['encoding']
# Output directory handling: create it before anything is written.
1222 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
1223 self.corpus.pathout.createdir(parametres_corpus['pathout'])
# Each built corpus gets a fresh random identifier.
1224 self.corpus.parametres['uuid'] = str(uuid4())
1225 self.corpus.parametres['corpus_name'] = parametres_corpus['corpus_name']#os.path.split(self.corpus.parametres['pathout'])[1]
1226 self.corpus.parametres['type'] = 'corpus'
# Tokens listed in ponctuation_espace are dropped from the segments; when
# punctuation is kept only blanks and empty strings are dropped.
1227 if self.corpus.parametres['keep_ponct'] :
1228 self.ponctuation_espace = [' ', '']
1230 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
1232 self.tolist = self.corpus.parametres.get('tolist', 0)
# Select the segmentation helpers from the 'ucemethod' parameter (default 0).
# Two configurations are visible: decouperlist / prep_txtlist with a default
# 'ucesize' of 40, and decoupercharact / prep_txtcharact with a default
# 'ucesize' of 240.
# NOTE(review): which method value selects which pair is decided by
# conditions around these assignments -- confirm.
1239 def prep_makeuce(self) :
1240 method = self.corpus.parametres.get('ucemethod', 0)
1242 self.decouper = decouperlist
1243 self.prep_txt = prep_txtlist
1244 self.ucesize = self.corpus.parametres.get('ucesize', 40)
1246 self.decouper = decoupercharact
1247 self.prep_txt = prep_txtcharact
1248 self.ucesize = self.corpus.parametres.get('ucesize', 240)
1249 log.info('method uce : %s' % method)
# Read the input file; a Warning raised while reading is only logged.
1254 self.read_corpus(self.infile)
# Python 2 spelling of 'except Warning as args'.
1255 except Warning, args :
1256 log.info('pas kool %s' % args)
# Record where the corpus description file lives and how long the build took.
1260 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
1261 self.time = time() - t1
# Save the corpus parameters under the 'corpus' section of that file.
1263 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
1264 log.info('time : %f' % (time() - t1))
# Open the forms database and create its two tables: `uces` (form id ->
# segment ids) and `eff` (form id -> counts), both stored as text.
1267 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
1268 self.cf = self.conn_f.cursor()
1269 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1270 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
1271 self.conn_f.commit()
1272 self.cf = self.conn_f.cursor()
# SQLite settings: temporary store and journal in memory, synchronous off.
1273 self.cf.execute('PRAGMA temp_store=MEMORY;')
1274 self.cf.execute('PRAGMA journal_mode=MEMORY;')
1275 self.cf.execute('PRAGMA synchronous = OFF;')
# Explicit transaction; it is committed elsewhere.
1276 self.cf.execute('begin')
# Same setup for the segments database (segment id -> segment text).
1277 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
1278 self.c = self.conn.cursor()
1279 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1281 self.c = self.conn.cursor()
1282 self.c.execute('PRAGMA temp_store=MEMORY;')
1283 self.c.execute('PRAGMA journal_mode=MEMORY;')
1284 self.c.execute('PRAGMA synchronous = OFF;')
1285 self.c.execute('begin')
1288 #commit index and close db
# Indexes on the `id` column of both tables are created after the data has
# been committed.
1290 self.conn_f.commit()
1291 self.cf.execute('CREATE INDEX iduces ON uces (id);')
1292 self.cf.execute('CREATE INDEX ideff ON eff (id);')
# Create corpus.db with three tables: `etoiles` (per-text variables and
# paragraph markers), `luces` (text / paragraph / segment ids) and `formes`
# (form records).
1296 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
1297 self.ccorpus = self.conn_corpus.cursor()
1298 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
1299 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
1300 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
1301 self.conn_corpus.commit()
1302 self.ccorpus = self.conn_corpus.cursor()
# SQLite settings: temporary store and journal in memory, synchronous off.
1303 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
1304 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
1305 self.ccorpus.execute('PRAGMA synchronous = OFF;')
1306 self.ccorpus.execute('begin')
# Dump the in-memory corpus, index `luces` by text id, then commit and close.
1307 self.backup_corpus()
1308 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
1309 self.conn_corpus.commit()
1310 self.conn_corpus.close()
1311 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self) :
    """Queue the text-cleaning steps enabled in the corpus parameters.

    Steps are appended to ``self.cleans`` in a fixed order: lower-casing,
    first clean-up, character filtering, expressions, apostrophes, hyphens.
    Every step but character filtering is enabled by default; the
    'charact' parameter has no default and must be present. When character
    filtering is on, ``self.rule`` receives the content of the character
    class to use ('keep_caract' parameter, or the built-in default).
    """
    options = self.corpus.parametres
    enable = self.cleans.append
    for option, method_name in (('lower', 'dolower'), ('firstclean', 'firstclean')) :
        if options.get(option, 1) :
            enable(getattr(self, method_name))
    if options['charact'] :
        self.rule = options.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        enable(self.docharact)
    for option, method_name in (('expressions', 'make_expression'), ('apos', 'doapos'), ('tiret', 'dotiret')) :
        if options.get(option, 1) :
            enable(getattr(self, method_name))
# Replace every known multi-word expression found in `txt` by its
# substitute, the first element of the matching entry in self.expressions.
1328 def make_expression(self,txt) :
# Python 2: keys() returns a list that can be sorted in place. Reverse
# lexicographic order puts an expression before any of its own prefixes.
1329 exp = self.expressions.keys()
1330 exp.sort(reverse=True)
1331 for expression in exp :
1332 if expression in txt :
1333 txt = txt.replace(expression, self.expressions[expression][0])
# Cleaning step registered by buildcleans when the 'lower' parameter is set.
1336 def dolower(self, txt) :
def docharact(self, txt) :
    """Replace each run of characters matched by the filter with one space.

    ``self.rule`` is the content of a regular-expression character class
    (a leading ``^`` makes it a list of characters to keep); the brackets
    and the ``+`` quantifier are added here.
    """
    pattern = u''.join([u"[", self.rule, "]+"])
    return re.sub(pattern, ' ', txt)
def doapos(self, txt) :
    """Return ``txt`` with every apostrophe replaced by a space."""
    pieces = txt.split(u'\'')
    return u' '.join(pieces)
def dotiret(self, txt) :
    """Return ``txt`` with every hyphen replaced by a space."""
    pieces = txt.split(u'-')
    return u' '.join(pieces)
def firstclean(self, txt) :
    """Normalise typography and isolate punctuation marks with spaces.

    The substitutions run in order, and the order matters: the three-dot
    sequence must be turned into the ``£$£`` token before single dots are
    spaced out.
    """
    substitutions = (
        (u'’', "'"),
        (u'œ', u'oe'),
        ('...', u' £$£ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        (u'…', u' £$£ '),
    )
    for old, new in substitutions :
        txt = txt.replace(old, new)
    return txt
# Run the cleaning steps collected in self.cleans, in registration order.
1355 def make_cleans(self, txt) :
1356 for clean in self.cleans :
# Flush the in-memory index (form id -> {segment id: count}) to the forms
# database and empty it.
1360 def backup_uce(self) :
1361 if self.corpus.idformesuces != {} :
1362 log.info('backup %i' % len(self.corpus.idformesuces))
# Two parallel rows per form: space-separated segment ids and the matching
# counts. keys() and values() of one unmodified dict share the same order.
# Backticks are the Python 2 spelling of repr().
1363 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
1364 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
1365 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
1366 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
# Start a fresh index for the following segments.
1367 self.corpus.idformesuces = {}
# Dump texts, segments and forms of the in-memory corpus into corpus.db
# through the self.ccorpus cursor.
1370 def backup_corpus(self) :
1371 log.info('start backup corpus')
1373 for uci in self.corpus.ucis :
# One `etoiles` row per text: id, variables and paragraph markers, both
# space-joined.
1374 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
1375 for uce in uci.uces :
# One `luces` row per segment; the ids are passed as repr() strings.
1376 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
1377 for forme in self.corpus.formes :
1378 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
1379 log.info('%f' % (time() - t))
def dofinish(self) :
    """Record the summary figures of the built corpus in its parameters.

    Stores the build date, the build duration (``self.time``, in seconds,
    formatted as hours/minutes/seconds), the uci and uce counts, the number
    of occurrences and of forms, and a hapax summary line.

    An empty corpus (no form or no occurrence) yields 0 % in the hapax
    summary instead of raising ZeroDivisionError.
    """
    params = self.corpus.parametres
    params['date'] = datetime.datetime.now().ctime()
    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    params['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
    params['ucinb'] = self.corpus.getucinb()
    params['ucenb'] = self.corpus.getucenb()
    params['occurrences'] = self.corpus.gettotocc()
    formesnb = len(self.corpus.formes)
    params['formesnb'] = formesnb
    hapaxnb = self.corpus.gethapaxnb()
    # Guard both ratios: their denominators are zero for an empty corpus.
    pourhapaxf = (float(hapaxnb) / formesnb) * 100 if formesnb else 0.0
    occurrences = params['occurrences']
    pourhapaxocc = (float(hapaxnb) / occurrences) * 100 if occurrences else 0.0
    params['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
# Builder that derives a sub-corpus from an existing corpus.
1395 class BuildSubCorpus(BuildCorpus):
# Select the texts/segments to keep according to `parametres`.
#   corpus     -- source corpus; its 'originalpath' and 'encoding' are reused
#   parametres -- dict; 'corpus_name' and 'pathout' are required; exactly one
#                 of 'frommeta', 'fromtheme', 'fromclusters', 'fromuceids'
#                 selects the extraction mode
#   dlg        -- optional progress dialog
1396 def __init__(self, corpus, parametres, dlg = None) :
1397 log.info('begin subcorpus...')
1401 self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : corpus.parametres['originalpath'], 'encoding' : corpus.parametres['encoding']})
1403 self.parametres = parametres
1404 self.encoding = corpus.parametres['encoding']
1405 self.corpus.parametres['corpus_name'] = parametres['corpus_name']
1406 self.corpus.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = parametres['pathout'])
1407 self.corpus.pathout.createdir(parametres['pathout'])
1408 self.corpus.parametres['pathout'] = parametres['pathout']
1409 self.corpus.parametres['meta'] = parametres.get('meta', False)
1410 self.corpus.parametres['uuid'] = str(uuid4())
# Mode 1: keep the texts carrying at least one of the selected variables.
1411 if parametres.get('frommeta', False) :
1412 print 'make subtexts'
1413 self.corpus.ucis = [CopyUci(uci) for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()]
# Mode 2: keep, inside each text, the segments of the selected paragraph
# markers.
1414 elif parametres.get('fromtheme', False) :
1415 print 'make subtexts from theme'
1417 for uci in self.ori.ucis :
1418 if uci.paras != [] :
1421 for et in uci.paras :
1422 if et in parametres['meta'] :
1423 newuce += [CopyUce(uce) for uce in uci.uces if uce.para == idpara]
1429 nuci.paras = newpara
1430 self.corpus.ucis.append(nuci)
# Mode 3: flatten the segment ids of the selected clusters into 'uceids'.
1433 elif parametres.get('fromclusters', False) :
1434 self.parametres['uceids'] = [st for i in self.parametres['meta'] for st in self.parametres['lc'][i]]
# Mode 4: the caller supplies the segment ids directly.
1436 elif parametres.get('fromuceids', False) :
# Copy into the sub-corpus the segments whose id is listed in
# self.parametres['uceids'], together with the texts that own them.
1442 def fromuceids(self):
# Dict used only for fast membership tests on the ids to keep.
1444 dictucekeep = dict(zip(self.parametres['uceids'], self.parametres['uceids']))
1446 for uci in self.ori.ucis :
# Texts without paragraph markers: filter their segments directly.
1447 if uci.paras == [] :
1448 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1451 nuci.uces = keepuces
1452 self.corpus.ucis.append(nuci)
# Texts with paragraph markers are handled marker by marker.
# NOTE(review): the visible filter does not depend on `et`; presumably a
# paragraph condition applies nearby -- confirm.
1457 for et in uci.paras :
1458 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1466 nuci.paras = newpara
1467 self.corpus.ucis.append(nuci)
# Renumber the texts, paragraphs and segments of the sub-corpus, then copy
# the segment texts and rebuild the forms from the original corpus.
# `infile` is not used by the visible statements.
1469 def read_corpus(self, infile = None):
# Original segment ids, needed to fetch the texts from the source corpus.
1470 self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces]
1476 print 'redo text, para and st ident'
1477 for uci in self.corpus.ucis :
1478 uci.ident = ident_uci
1480 for uce in uci.uces :
1482 if uce.para != lastpara :
1485 uce.para = ident_para
1487 uce.para = ident_para
# Remember the old id -> new id mapping before overwriting the id.
1488 newuceident[uce.ident] = ident_uce
1489 uce.ident = ident_uce
1491 print 'backup st text and forms'
# Rows are used as (old segment id, segment text).
1492 for row in self.ori.getconcorde(self.olduceid) :
1493 self.c.execute('INSERT INTO uces VALUES(?,?);', (`newuceident[row[0]]`, row[1]))
1494 for word in row[1].split() :
1495 self.corpus.add_word_from_forme(self.ori.formes[word], newuceident[row[0]])
# Builder reading a text file in which marker lines delimit the texts.
1499 class BuildFromAlceste(BuildCorpus) :
# Parse `infile` line by line: text-marker lines open a new text, lines
# starting with '-*' are paragraph markers, other non-empty lines are
# content.
# Raises Exception with the messages 'paragrapheOT <line>', 'EmptyText
# <line>', 'TextBeforeTextMark <line>' or "CorpusEncoding".
1500 def read_corpus(self, infile) :
1501 if self.dlg is not None :
1502 self.dlg.Pulse('textes : 0 - segments : 0')
# 'ucimark' selects how a text-marker line is recognised: 0 -> testetoile,
# 1 -> testint.
1505 if self.corpus.parametres['ucimark'] == 0 :
1506 self.testuci = testetoile
1507 elif self.corpus.parametres['ucimark'] == 1 :
1508 self.testuci = testint
1514 with codecs.open(infile, 'r', self.encoding) as f :
1515 for linenb, line in enumerate(f) :
1516 line = line.rstrip('\n\r')#FIXME .lstrip(codecs.BOM).lstrip(codecs.BOM_UTF8)
1517 if self.testuci(line) :
# Pending content is processed on behalf of the previous text (iduci - 1).
1520 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1522 self.corpus.ucis.append(Uci(iduci, line))
# A text that produced no segment is logged and replaced by the new one.
1525 if self.corpus.ucis[-1].uces == [] :
1526 log.info(u'Empty text : %i' % linenb)
1528 self.corpus.ucis.pop()
1529 self.corpus.ucis.append(Uci(iduci, line))
# Refresh the progress dialog every tenth text.
1530 if self.dlg is not None :
1531 if not (iduci + 1) % 10 :
1532 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
# Paragraph marker: process pending content, then record the marker.
1533 elif line.startswith(u'-*') :
1536 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1539 self.corpus.ucis[-1].paras.append(line.split()[0])
1541 raise Exception('paragrapheOT %i' % linenb)
# Content line, only meaningful once a first text marker has been seen.
1542 elif line.strip() != '' and iduci != -1 :
# End of file: process what is left for the last text.
1544 if txt != [] and iduci != -1 :
1545 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1550 self.corpus.ucis.pop()
1551 log.info(Exception("Empty text %i" % linenb))
1553 raise Exception('EmptyText %i' % linenb)
1554 if iduci != -1 and iduce != -1:
1557 log.info(_(u"No Text in corpus. Are you sure of the formatting ?"))
1558 raise Exception('TextBeforeTextMark %i' % linenb)
# A decoding failure is reported as a generic "CorpusEncoding" exception.
1559 except UnicodeDecodeError :
1560 raise Exception("CorpusEncoding")
# Clean the pending lines `txt`, cut them into segments, store each segment
# and register its words.
# Returns the updated (iduce, idpara) counters.
1562 def treattxt(self, txt, iduce, idpara, iduci) :
# With 'ucemethod' 2 and 'douce' set, the lines are glued with a sentinel
# word so that the cleaned text can be split back on it afterwards.
1563 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1564 txt = 'laphrasepoursplitter'.join(txt)
1565 txt = self.make_cleans(txt)
1566 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1567 ucetxt = txt.split('laphrasepoursplitter')
# Otherwise the cleaned text is cut by make_uces.
1570 txt = self.make_cleans(txt)
1571 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1572 if self.corpus.ucis[-1].paras == [] :
# Each segment is attached to the current text and written to the segments
# database.
1576 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1577 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1578 if not self.tolist :
1584 self.corpus.add_word(word)
1585 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
# NOTE(review): presumably flushes the index once self.last exceeds
# self.lim -- confirm.
1586 if self.last > self.lim :
1589 return iduce, idpara
# Cut `txt` into segments with the configured splitter (self.decouper) and
# drop the tokens listed in self.ponctuation_espace.
#   douce      -- NOTE(review): presumably, when false, the whole text is
#                 returned as a single segment -- confirm
#   keep_ponct -- not used by the visible statements
1591 def make_uces(self, txt, douce = True, keep_ponct = False) :
# Collapse every run of whitespace to a single space.
1592 txt = ' '.join(txt.split())
# The splitter may look up to 15 items past the target segment size.
1595 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1597 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
# Continue on the remainder returned by the previous cut.
1600 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1601 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1606 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1608 #decouper (list_sep)
1609 #make_uces (decouper)
1610 #treat_txt (make_uces)
# Ask the user for the corpus settings and prepare the build parameters.
#   parent -- application object; provides filename, UserConfigPath, DictPath
#             and receives `expressions`
#   dlg    -- optional progress dialog
1614 def __init__(self, parent, dlg = None) :
1615 self.parent = parent
# Defaults come from the 'corpus' section of the user's corpus.cfg.
1618 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1619 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1620 parametres['corpus_name'] = os.path.split(parametres['pathout'])[1]
1621 dial = CorpusPref(parent, parametres)
1622 dial.CenterOnParent()
1623 dial.txtpath.SetLabel(parent.filename)
1624 #dial.repout_choices.SetValue(parametres['pathout'])
1625 self.res = dial.ShowModal()
1626 if self.dlg is not None :
1627 self.dlg = progressbar(self.parent, self.dlg)
# NOTE(review): 5100 is presumably the dialog's OK return code -- confirm
# and prefer the named constant.
1628 if self.res == 5100 :
1629 parametres = dial.doparametres()
1630 parametres['originalpath'] = parent.filename
1631 PathOut().createdir(parametres['pathout'])
# An optional user dictionary is passed to ReadLexique below.
1632 if parametres.get('dictionary', False) :
1633 filein = parametres['dictionary']
# A non-empty name typed in the dialog overrides the default corpus name.
1636 if dial.corpusname.GetValue() != '' :
1637 parametres['corpus_name'] = dial.corpusname.GetValue()
1639 ReadLexique(self.parent, lang = parametres['lang'], filein = filein)
# The expressions dictionary is loaded only for a known language whose file
# exists; otherwise an empty dict is used.
1640 if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
1641 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1643 self.parent.expressions = {}
1644 self.parametres = parametres
1647 if self.dlg is not None :
def doanalyse(self) :
    """Build the corpus from the parent's file and return it.

    The build uses the parameters prepared by the constructor together
    with the parent's lexicon and expressions dictionary.
    """
    owner = self.parent
    builder = BuildFromAlceste(owner.filename, self.parametres, owner.lexique, owner.expressions, dlg = self.dlg)
    return builder.corpus
# Ask the user which part of `corpus` to extract and prepare the parameters
# of the sub-corpus build.
#   parent     -- application object, used as dialog parent
#   corpus     -- source corpus
#   parametres -- dict of options; one of 'frommeta', 'fromtheme',
#                 'fromclusters' selects what is offered for selection
#   dlg        -- optional progress dialog
1654 def __init__(self, parent, corpus, parametres = None, dlg = None):
1655 self.parent = parent
# Default name: the source corpus name prefixed with 'Sub'.
1658 corpus_name = 'Sub' + corpus.parametres['corpus_name']
1659 if dlg is not None :
1660 busy = wx.BusyInfo(_("Please wait...").decode('utf8'), self)
1662 parametres['corpus_name'] = corpus_name
# 'meta' holds the choices offered in the dialog: variables, themes, or
# cluster labels 'classe 1' .. 'classe <clnb>'.
1663 if parametres.get('frommeta', False) :
1664 parametres['meta'] = corpus.make_etoiles()
1665 elif parametres.get('fromtheme', False) :
1666 parametres['meta'] = corpus.make_themes()
1667 elif parametres.get('fromclusters', False) :
1668 parametres['meta'] = [' '.join(['classe', `i`]) for i in range(1,parametres['clnb'] + 1)]
1670 parametres['meta'] = []
# Cluster labels keep their numeric order; other choices are sorted.
1671 if 'fromclusters' not in parametres :
1672 parametres['meta'].sort()
1673 if dlg is not None :
1675 dial = SubTextFromMetaDial(parent, parametres)
1676 self.res = dial.ShowModal()
# NOTE(review): 5100 is presumably the dialog's OK return code -- confirm
# and prefer the named constant.
1677 if self.res == 5100 :
# Keep only alphanumeric characters and '_' from the name typed by the user.
1678 if dial.subcorpusname.GetValue() != '' :
1679 corpus_name = ''.join([l for l in dial.subcorpusname.GetValue() if l.isalnum() or l in ['_']])
1680 if corpus_name != '' :
1681 parametres['corpus_name'] = corpus_name
1683 parametres['corpus_name'] = 'Sub' + corpus.parametres['corpus_name']
# The output directory lives inside the source corpus directory; a numeric
# suffix is searched so that an existing directory is not reused.
1684 pathout = os.path.join(corpus.parametres['pathout'], parametres['corpus_name'])
1686 while os.path.exists(pathout + '_%i' % i) :
1688 parametres['pathout'] = pathout + '_%i' % i
1689 meta = dial.m_listBox1.GetSelections()
# Selections are positions: mapped back to labels, except for clusters where
# the positions themselves are kept.
1690 if not 'fromclusters' in parametres :
1691 parametres['meta'] = [parametres['meta'][val] for val in meta]
1693 parametres['meta'] = meta
1694 self.parametres = parametres
def doanalyse(self):
    """Build the sub-corpus from the original corpus and return it.

    The build uses the parameters prepared by the constructor.
    """
    builder = BuildSubCorpus(self.ori, parametres = self.parametres, dlg = self.dlg)
    return builder.corpus