1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique, progressbar
16 from operator import itemgetter
17 from uuid import uuid4
18 from chemins import PathOut
19 from dialog import CorpusPref, SubTextFromMetaDial
21 from colors import colors
25 log = logging.getLogger('iramuteq.corpus')
28 def copycorpus(corpus) :
29 log.info('copy corpus')
30 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
31 copy_corpus.ucis = corpus.ucis
32 copy_corpus.formes = corpus.formes
33 copy_corpus.pathout = corpus.pathout
34 copy_corpus.conn_all()
38 return Uce(uce.ident, uce.para, uce.uci)
42 nuci = Uci(uci.ident, '')
43 nuci.etoiles = copy(uci.etoiles)
44 nuci.uces = [CopyUce(uce) for uce in uci.uces]
53 def __init__(self, parent, parametres = {}, read = False) :
55 self.parametres = parametres
57 self.connformes = None
59 self.conncorpus = None
66 self.idformesuces = {}
71 self.pathout = PathOut(dirout = parametres['pathout'])
74 def add_word(self, word) :
75 if word in self.formes :
76 self.formes[word].freq += 1
77 if self.formes[word].ident in self.idformesuces :
78 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
79 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
81 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
83 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
85 if word in self.parent.lexique :
86 gramtype = self.parent.lexique[word][1]
87 lem = self.parent.lexique[word][0]
94 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
95 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
def add_word_from_forme(self, word, stident):
    """Count one occurrence of ``word`` under the key ``stident``.

    ``word`` must expose ``forme``, ``gram`` and ``lem``.  When ``word.forme``
    is already a key of ``self.formes`` the stored entry's ``freq`` is
    incremented and ``self.idformesuces[ident][stident]`` is increased by one
    (created at 1 when missing).  Otherwise a new ``Word`` is stored, built
    with ``len(self.formes)`` as its third argument, and its counter table is
    initialised to ``{stident: 1}``.
    """
    key = word.forme
    if key not in self.formes:
        # Unknown form: create it and start its per-stident counter.
        self.formes[key] = Word(key, word.gram, len(self.formes), word.lem)
        self.idformesuces[self.formes[key].ident] = {stident: 1}
        return
    entry = self.formes[key]
    entry.freq += 1
    counters = self.idformesuces.setdefault(entry.ident, {})
    counters[stident] = counters.get(stident, 0) + 1
112 """connect corpus to db"""
113 if self.connformes is None :
114 log.info('connexion corpus')
115 self.connuces = sqlite3.connect(self.pathout['uces.db'])
116 self.cuces = self.connuces.cursor()
117 self.connformes = sqlite3.connect(self.pathout['formes.db'])
118 self.cformes = self.connformes.cursor()
119 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
120 self.ccorpus = self.conncorpus.cursor()
121 self.cformes.execute('PRAGMA temp_store=MEMORY;')
122 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
123 self.cformes.execute('PRAGMA synchronous = OFF;')
124 self.cuces.execute('PRAGMA temp_store=MEMORY;')
125 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
126 self.cuces.execute('PRAGMA synchronous = OFF;')
127 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
128 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
129 self.ccorpus.execute('PRAGMA synchronous = OFF;')
131 def read_corpus(self) :
132 log.info('read corpus')
133 self.parametres['syscoding'] = sys.getdefaultencoding()
134 if self.conncorpus is None :
136 res = self.ccorpus.execute('SELECT * FROM etoiles;')
138 self.ucis.append(Uci(row[0], row[1], row[2]))
139 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
141 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
142 res = self.ccorpus.execute('SELECT * FROM formes;')
143 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the uce identifiers recorded for a form in the forms database.

    ``wordid`` is either a form identifier or the form itself as a string;
    a string is first translated to its ``ident`` through ``self.formes``.
    Each row read from the ``uces`` table holds either one integer or a
    whitespace-separated string of integers; all values are returned as a
    single flat list of ints, in row order.
    """
    # (str, unicode) on Python 2, i.e. exactly the instances of basestring.
    if isinstance(wordid, (str, type(u''))):
        wordid = self.formes[wordid].ident
    rows = self.cformes.execute(
        'SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    found = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            found.append(cell)
        else:
            found.extend([int(token) for token in cell.split()])
    return found
def getworducis(self, wordid):
    """Return the distinct ``uci`` values of the uces listed for a form.

    The uce identifiers come from ``getworduces``; each is resolved with
    ``getucefromid`` and its ``uci`` attribute collected.  The result is a
    list built from a set, so its order is not meaningful.
    """
    owners = set()
    for uceid in self.getworduces(wordid):
        owners.add(self.getucefromid(uceid).uci)
    return list(owners)
156 def getformeuceseff(self, formeid) :
157 if isinstance(formeid, basestring) :
158 formeid = self.formes[formeid].ident
159 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
160 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
161 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
162 res = self.cformes.execute(query)
163 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
165 for i, uce in enumerate(uces) :
166 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem):
    """Return the distinct uce identifiers recorded for all forms of ``lem``.

    The identifiers iterated from ``self.lems[lem].formes`` are inlined in
    an ``IN (...)`` clause on the ``uces`` table of the forms database.
    Every row holds either one integer or a whitespace-separated string of
    integers; the union of all values is returned as a list (built from a
    set, so its order is not meaningful).
    """
    idlist = ', '.join([repr(formeid) for formeid in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % idlist
    uces = set()
    for row in self.cformes.execute(query):
        cell = row[0]
        if isinstance(cell, int):
            uces.add(cell)
        else:
            uces.update([int(token) for token in cell.split()])
    return list(uces)
175 def gettgenst(self, tgen):
178 if lem in self.lems :
179 formesid += ', '.join([`val` for val in self.lems[lem].formes])
181 print 'abscent: ',lem
182 #formesid = ', '.join([`val` for lem in tgen for val in self.lems[lem].formes if lem in self.lems])
183 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
184 res = self.cformes.execute(query)
185 return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))))
def gettgentxt(self, tgen):
    """Return the distinct ``uci`` values of the uces returned by ``gettgenst``.

    Each identifier from ``gettgenst(tgen)`` is resolved with
    ``getucefromid``; the ``uci`` attributes are de-duplicated through a
    set, so the order of the returned list is not meaningful.
    """
    owners = set()
    for uceid in self.gettgenst(tgen):
        owners.add(self.getucefromid(uceid).uci)
    return list(owners)
def getlemucis(self, lem):
    """Return the distinct ``uci`` values of the uces listed for ``lem``.

    The uce identifiers come from ``getlemuces``; each is resolved with
    ``getucefromid``.  The result is a list built from a set, so its order
    is not meaningful.
    """
    owners = set()
    for uceid in self.getlemuces(lem):
        owners.add(self.getucefromid(uceid).uci)
    return list(owners)
195 def getlemuceseff(self, lem, luces = None) :
196 formesid = ', '.join([`val` for val in self.lems[lem].formes])
197 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
198 res = self.cformes.execute(query)
199 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
200 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
201 res = self.cformes.execute(query)
202 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
204 for i, uce in enumerate(uces) :
205 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster):
    """Return how many members of ``self.lc[cluster]`` are uces of ``lem``.

    ``cluster`` is used directly as an index into ``self.lc``; the count is
    the size of the intersection with the result of ``getlemuces(lem)``.
    """
    members = set(self.lc[cluster])
    shared = members.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem):
    """Return the ``freq`` attribute of the entry stored for ``lem`` in ``self.lems``."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid):
    """Return the entry of ``self.idformes`` stored under ``formeid``.

    The ``idformes`` index is built on first use by calling
    ``make_idformes`` when it is still ``None``.
    """
    index = self.idformes
    if index is None:
        self.make_idformes()
        index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the sum of the ``freq`` attribute over every entry of ``self.formes``."""
    total = 0
    for entry in self.formes.values():
        total += entry.freq
    return total
def getucemean(self):
    """Return ``gettotocc()`` divided by ``getucenb()`` as a float."""
    occurrences = float(self.gettotocc())
    return occurrences / self.getucenb()
228 return self.ucis[-1].uces[-1].ident + 1
231 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return, for each uci, the summed size of its uces.

    The per-uce sizes come from ``getucesize``; for every uci the idents of
    its first and last uce are used as inclusive slice bounds into that
    list, so the idents are treated as positions in it.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:last + 1]))
    return totals
def getucesize(self):
    """Return the number of whitespace-separated tokens of every uce.

    Rows come from ``getalluces``; the text is read from the second column
    of each row.
    """
    sizes = []
    for row in self.getalluces():
        sizes.append(len(row[1].split()))
    return sizes
def getconcorde(self, uces):
    """Return the result of selecting the given ids from the ``uces`` table.

    The ``repr`` of every item of ``uces`` is inlined in an ``IN (...)``
    clause; the value returned by ``self.cuces.execute`` is handed back
    unchanged, rows being ordered by id.
    """
    idlist = ', '.join([repr(uceid) for uceid in uces])
    query = 'select * from uces where id IN (%s) ORDER BY id;' % idlist
    return self.cuces.execute(query)
244 def getuciconcorde(self, ucis) :
245 uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis]
246 uces = [[val[0], '\n'.join([row[1] for row in self.getconcorde(val[1])])] for val in uces]
def getwordconcorde(self, word):
    """Return ``getconcorde`` applied to the uces returned by ``getworduces(word)``."""
    uceids = self.getworduces(word)
    return self.getconcorde(uceids)
def getlemconcorde(self, lem):
    """Return ``getconcorde`` applied to the uces returned by ``getlemuces(lem)``."""
    uceids = self.getlemuces(lem)
    return self.getconcorde(uceids)
def getalluces(self):
    """Return the result of ``SELECT * FROM uces`` executed on ``self.cuces``."""
    query = 'SELECT * FROM uces'
    return self.cuces.execute(query)
def getallucis(self):
    """Return ``[uci.ident, text]`` pairs for every uci of the corpus.

    The second column of every row from ``getalluces`` is collected into a
    list; the text of a uci joins with newlines the entries found at the
    positions given by the idents of its uces.
    """
    texts = [row[1] for row in self.getalluces()]
    pairs = []
    for uci in self.ucis:
        joined = '\n'.join([texts[uce.ident] for uce in uci.uces])
        pairs.append([uci.ident, joined])
    return pairs
def getucesfrometoile(self, etoile):
    """Return the idents of all uces whose uci lists ``etoile`` in its ``etoiles``.

    Ucis are visited in ``self.ucis`` order and uces in each uci's own order.
    """
    selected = []
    for uci in self.ucis:
        if etoile in uci.etoiles:
            selected.extend([uce.ident for uce in uci.uces])
    return selected
265 def getetoileuces(self) :
266 log.info('get uces etoiles')
269 for uci in self.ucis :
270 etoiles = uci.etoiles[1:]
272 if et in etoileuces :
273 etoileuces[et] += [uce.ident for uce in uci.uces]
275 etoileuces[et] = [uce.ident for uce in uci.uces]
277 for et in uci.paras :
278 if et in etoileuces :
279 etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara]
281 etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara]
287 def getetoileucis(self):
289 for uci in self.ucis :
290 etoiles = uci.etoiles[1:]
292 if et in etoileuces :
293 etoileuces[et] += [uci.ident]
295 etoileuces[et] = [uci.ident]
def getucefromid(self, uceid):
    """Return the entry of ``self.iduces`` stored under ``uceid``.

    The ``iduces`` index is built on first use by calling ``make_iduces``
    when it is still ``None``.
    """
    index = self.iduces
    if index is None:
        self.make_iduces()
        index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return the number of entries of ``self.formes`` whose ``freq`` equals 1."""
    count = 0
    for entry in self.formes.values():
        if entry.freq == 1:
            count += 1
    return count
def getactivesnb(self, key):
    """Return the number of entries of ``self.lems`` whose ``act`` equals ``key``."""
    count = 0
    for entry in self.lems.values():
        if entry.act == key:
            count += 1
    return count
307 # def make_lems(self, lem = True) :
308 # log.info('make lems')
310 # for forme in self.formes :
311 # if self.formes[forme].lem in self.lems :
312 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
313 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
315 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the ``etoiles`` of the uci that contains the uce ``uceid``.

    A ``uce.ident -> uci.ident`` mapping is built once and cached in
    ``self.uceuci``; the uci ident is then used as an index into
    ``self.ucis``.
    """
    if self.uceuci is None:
        owners = {}
        for uci in self.ucis:
            for uce in uci.uces:
                owners[uce.ident] = uci.ident
        self.uceuci = owners
    owner = self.uceuci[uceid]
    return self.ucis[owner].etoiles
321 def make_lems(self, lem = True) :
322 log.info('make lems')
325 for forme in self.formes :
326 if self.formes[forme].lem in self.lems :
327 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
328 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
330 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
332 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
334 def make_lems_from_dict(self, dictionnaire, dolem = True) :
335 log.info('make lems from dict')
337 for forme in self.formes :
338 if self.formes[forme].forme in dictionnaire :
339 lem = dictionnaire[forme][0]
340 gram = dictionnaire[forme][1]
341 elif forme.isdigit() :
347 self.formes[forme].lem = lem
348 self.formes[forme].gram = gram
350 if self.formes[forme].lem in self.lems :
351 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
352 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
354 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
356 self.lems[forme] = Lem(self, self.formes[forme])
def make_idformes(self):
    """Build ``self.idformes``, mapping each entry's ``ident`` to the entry itself.

    The entries are the values of ``self.formes``; any previous index is
    replaced.
    """
    by_ident = {}
    for forme in self.formes:
        entry = self.formes[forme]
        by_ident[entry.ident] = entry
    self.idformes = by_ident
def make_iduces(self):
    """Build ``self.iduces`` (``uce.ident -> uce``) unless it already exists.

    Nothing is done when ``self.iduces`` is not ``None``.
    """
    if self.iduces is not None:
        return
    by_ident = {}
    for uci in self.ucis:
        for uce in uci.uces:
            by_ident[uce.ident] = uce
    self.iduces = by_ident
365 def make_lexitable(self, mineff, etoiles, gram = 0) :
370 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams]
371 etuces = [[] for et in etoiles]
372 for uci in self.ucis :
373 get = list(set(uci.etoiles).intersection(etoiles))
375 log.info('2 variables sur une ligne')
377 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
378 etuces = [set(val) for val in etuces]
381 deff = self.getlemuceseff(lem)
383 line = [lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
384 if sum(line[1:]) >= mineff :
386 tab.insert(0, [''] + etoiles)
389 def make_tgen_table(self, tgen, etoiles, tot = None):
390 lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
391 sets = [set(cl) for cl in lclasses]
392 totoccurrences = dict([[val, 0] for val in etoiles])
394 for forme in self.formes :
395 formeuceeff = self.getformeuceseff(forme)
396 for i, classe in enumerate(lclasses) :
397 concern = sets[i].intersection(formeuceeff.keys())
399 totoccurrences[etoiles[i]] += sum([formeuceeff[uce] for uce in concern])
400 #tgenoccurrences = dict([[val, 0] for val in etoiles])
403 tgenoccurrences[t] = dict([[val, 0] for val in etoiles])
405 lemuceeff = self.getlemuceseff(lem)
406 for i, classe in enumerate(lclasses) :
407 concern = sets[i].intersection(lemuceeff.keys())
409 tgenoccurrences[t][etoiles[i]] += sum([lemuceeff[uce] for uce in concern])
410 return tgenoccurrences, totoccurrences
412 def make_tgen_profile(self, tgen, ucecl, uci = False) :
413 log.info('tgen/classes')
415 tab = [[lem] + [len(set(self.gettgentxt(tgen[lem])).intersection(classe)) for classe in ucecl] for lem in tgen]
417 tab = [[lem] + [len(set(self.gettgenst(tgen[lem])).intersection(classe)) for classe in ucecl] for lem in tgen]
418 tab = [[line[0]] + [val for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
422 #while nam + `i` in tgen :
425 #last = [nam] + [`len(classe)` for classe in ucecl]
427 #line0 = ['tgen'] + ['_'.join(['cluster', `i+1`]) for i in range(len(ucecl))]
429 #with open(fileout, 'w') as f :
430 # f.write('\n'.join(['\t'.join(line) for line in tab]).encode(self.parametres['syscoding']))
432 def make_efftype_from_etoiles(self, etoiles) :
434 etuces = [[] for et in etoiles]
435 for uci in self.ucis :
436 get = list(set(uci.etoiles).intersection(etoiles))
438 return '2 variables sur la meme ligne'
440 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
441 etuces = [set(val) for val in etuces]
442 for lem in self.lems :
443 deff = self.getlemuceseff(lem)
445 gram = self.lems[lem].gram
447 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
449 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
450 tabout = [[gram] + dtype[gram] for gram in dtype]
451 tabout.insert(0, [''] + etoiles)
454 def make_uceactsize(self, actives) :
455 res = self.getalluces()
458 deff = self.getlemuceseff(lem)
460 ucesize[uce] = ucesize.get(uce, 0) + 1
463 def make_uc(self, actives, lim1, lim2) :
464 uceactsize = self.make_uceactsize(actives)
470 for uce in [uce for uci in self.ucis for uce in uci.uces] :
471 if uce.para == lastpara :
473 last1 += uceactsize.get(uce.ident,0)
474 uc1[-1].append(uce.ident)
476 uc1.append([uce.ident])
479 last2 += uceactsize.get(uce.ident, 0)
480 uc2[-1].append(uce.ident)
482 uc2.append([uce.ident])
485 last1 = uceactsize.get(uce.ident, 0)
486 last2 = uceactsize.get(uce.ident, 0)
488 uc1.append([uce.ident])
489 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build two uc groupings and write their matrices and uce/uc listings.

    ``make_uc(actives, sizeuc1, sizeuc2)`` provides the two groupings.  Each
    one is written with ``write_ucmatrix`` (to ``uc1out`` / ``uc2out``) and
    then listed as ``uce;uc`` lines (to ``listuce1out`` / ``listuce2out``),
    where ``uc`` is the position of the group containing the uce.

    Returns the pair ``(len(uc1), len(uc2))``.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    for groups, matrix_path in ((uc1, uc1out), (uc2, uc2out)):
        self.write_ucmatrix(groups, actives, matrix_path)
    # Both listings are prepared before any listing file is opened.
    listings = []
    for groups in (uc1, uc2):
        rows = ['uce;uc']
        for position, members in enumerate(groups):
            for uce in members:
                rows.append(';'.join([repr(uce), repr(position)]))
        listings.append(rows)
    for list_path, rows in zip((listuce1out, listuce2out), listings):
        with open(list_path, 'w') as handle:
            handle.write('\n'.join(rows))
    return len(uc1), len(uc2)
505 def write_ucmatrix(self, uc, actives, fileout) :
506 log.info('write uc matrix %s' % fileout)
507 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
510 with open(fileout + '~', 'w+') as f :
511 for i, lem in enumerate(actives) :
512 for uce in self.getlemuces(lem):
513 if (uces_uc[uce], i) not in deja_la :
515 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
516 deja_la[(uces_uc[uce], i)] = 0
518 with open(fileout, 'w') as ffin :
519 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
522 os.remove(fileout + '~')
525 def export_corpus(self, outf) :
526 #outf = 'export_corpus.txt'
528 res = self.getalluces()
532 with open(outf,'w') as f :
534 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
535 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
536 elif self.iduces[uce[0]].uci != actuci :
537 actuci = self.iduces[uce[0]].uci
538 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
539 actpara = self.iduces[uce[0]].para
540 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
543 actpara = self.iduces[uce[0]].para
544 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
545 elif self.iduces[uce[0]].para != actpara :
546 actpara = self.iduces[uce[0]].para
548 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
def export_meta_table(self, outf):
    """Write a tab-separated table of the ucis' metadata to ``outf``.

    Each data line starts with the position of the uci in ``self.ucis``
    followed by its ``etoiles`` minus the first one.  The first line is a
    header ``column_0 ... column_{n-1}`` where ``n`` is the length of the
    longest data line.  The text is encoded with
    ``self.parametres['syscoding']`` before being written.
    """
    rows = []
    for position, text in enumerate(self.ucis):
        rows.append([repr(position)] + text.etoiles[1:])
    width = max(len(row) for row in rows)
    header = ['column_%i' % index for index in range(width)]
    table = [header] + rows
    with open(outf, 'w') as handle:
        content = '\n'.join(['\t'.join(line) for line in table])
        handle.write(content.encode(self.parametres['syscoding']))
558 def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) :
560 for i, lc in enumerate(self.lc) :
563 for uce in self.lc0 :
566 res = self.getalluces()
569 res = self.getallucis()
570 with open(outf, 'w') as f :
574 actuci = self.iduces[uce[0]].uci
578 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
580 etline = ' '.join(self.ucis[actuci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
582 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[actuci].etoiles[1:]])
583 f.write(etline.encode(self.parametres['syscoding']) + '\n')
584 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
586 def export_classe(self, outf, classe, lem = False, uci = False) :
587 sts = self.lc[classe - 1]
589 res = self.getconcorde(sts)
592 res = self.getuciconcorde(sts)
593 with open(outf, 'w') as f :
597 f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n')
599 f.write(' '.join(self.ucis[uce[0]].etoiles).encode(self.parametres['syscoding']) + '\n')
601 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
602 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
604 def export_owledge(self, rep, classe, lem = False, uci = False) :
605 sts = self.lc[classe - 1]
607 res = self.getconcorde(sts)
610 res = self.getuciconcorde(sts)
614 outf = '.'.join([`ident`, 'txt'])
615 outf = os.path.join(rep, outf)
617 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
618 with open(outf, 'w') as f :
619 f.write(guce.encode('cp1252', errors = 'replace'))
621 def export_tropes(self, fileout, classe, lem = False, uci = False) :
622 sts = self.lc[classe - 1]
624 res = self.getconcorde(sts)
627 res = self.getuciconcorde(sts)
628 with open(fileout, 'w') as f :
632 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
633 f.write(guce.encode('cp1252', errors = 'replace'))
636 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
637 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
639 with open(outfile + '~', 'w+') as f :
640 for i, lem in enumerate(actives) :
641 for uce in sorted(self.getlemuces(lem)) :
643 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
645 with open(outfile, 'w') as ffin :
646 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
649 os.remove(outfile + '~')
651 with open(listuce, 'w') as f :
652 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
654 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
655 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
657 with open(outfile + '~', 'w+') as f :
658 for i, lem in enumerate(actives) :
659 for uci in sorted(self.getlemucis(lem)) :
661 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
663 with open(outfile, 'w') as ffin :
664 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
667 os.remove(outfile + '~')
669 with open(listuci, 'w') as f :
670 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
672 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
673 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
675 duces = dict([[uce, i] for i, uce in enumerate(uces)])
676 with open(outfile + '~', 'w+') as f :
677 for i, lem in enumerate(actives) :
678 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
680 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
682 with open(outfile, 'w') as ffin :
683 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
686 os.remove(outfile + '~')
688 def make_table_with_classe(self, uces, list_act, uci = False) :
689 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
690 uces = dict([[uce, i] for i, uce in enumerate(uces)])
692 getlem = self.getlemucis
694 getlem = self.getlemuces
695 for i, lem in enumerate(list_act) :
696 lemuces = list(set(getlem(lem)).intersection(uces))
698 table_uce[uces[uce]][i] = 1
699 table_uce.insert(0, list_act)
702 def make_pondtable_with_classe(self, uces, list_act) :
703 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
704 uces = dict([[uce, i] for i, uce in enumerate(uces)])
705 for i, lem in enumerate(list_act) :
706 uceseff = self.getlemuceseff(lem)
707 lemuces = list(set(uceseff.keys()).intersection(uces))
709 table_uce[uces[uce]][i] = uceseff[uce]
710 table_uce.insert(0, list_act)
def parse_active(self, gramact, gramsup=None):
    """Set the ``act`` attribute of every entry of ``self.lems``.

    Rules, applied in this order:

    * key starting and ending with ``_``           -> 2
    * ``gram`` found in ``gramact``                -> 1
    * ``gramsup`` is ``None``                      -> 2
    * ``gram`` found in ``gramsup``                -> 2
    * anything else                                -> 0
    """
    log.info('parse actives')
    for lem in self.lems:
        entry = self.lems[lem]
        if lem.startswith('_') and lem.endswith('_'):
            status = 2
        elif entry.gram in gramact:
            status = 1
        elif gramsup is None:
            status = 2
        elif entry.gram in gramsup:
            status = 2
        else:
            status = 0
        entry.act = status
728 def make_actives_limit(self, limit, key = 1) :
729 if self.idformes is None :
731 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
733 def make_actives_nb(self, nbmax, key) :
734 log.info('make_actives_nb : %i - %i' % (nbmax,key))
735 if self.idformes is None :
737 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
738 self.activenb = len(allactives)
739 allactives = sorted(allactives, reverse = True)
740 if self.activenb == 0 :
742 if len(allactives) <= nbmax :
743 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
744 return [val[1] for val in allactives], allactives[-1][0]
746 effs = [val[0] for val in allactives]
747 if effs.count(effs[nbmax - 1]) > 1 :
748 lim = effs[nbmax - 1] + 1
752 stop = effs.index(lim)
759 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
760 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout, uci=False):
    """Write the lemma-by-class count table to ``fileout``.

    For every lemma of ``actives`` the units it occurs in are fetched with
    ``getlemucis`` when ``uci`` is true, otherwise with ``getlemuces``, and
    intersected with each class of ``ucecl``.  Lemmas whose counts sum to
    less than 3 are left out.  Each kept line is ``lem;count;count;...``;
    the text is encoded with ``self.parametres['syscoding']``.

    The lookup runs once per lemma: the previous version repeated it (one
    database query) for every class.
    """
    log.info('formes/classes')
    lookup = self.getlemucis if uci else self.getlemuces
    tab = []
    for lem in actives:
        units = set(lookup(lem))
        counts = [len(units.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3:
            tab.append([lem] + [repr(val) for val in counts])
    with open(fileout, 'w') as f:
        f.write('\n'.join([';'.join(line) for line in tab]).encode(self.parametres['syscoding']))
772 def make_etoiles(self) :
774 for uci in self.ucis :
775 etoiles.update(uci.etoiles[1:])
778 def make_themes(self):
780 for uci in self.ucis :
781 themes.update(uci.paras)
784 def make_etoiles_dict(self) :
785 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
787 for etoile in etoiles :
788 et = etoile.split('_')
791 endet = '_'.join(et[1:])
792 if etoile in det[et[0]] :
793 det[et[0]][etoile] += 1
795 det[et[0]][etoile] = 1
800 endet = '_'.join(et[1:])
801 det[et[0]] = {etoile :1}
806 def make_theme_dict(self):
807 themes = [val for uci in self.ucis for val in uci.paras]
809 for theme in themes :
810 th = theme.split('_')
813 endth = '_'.join(th[1:])
814 if theme in det[th[0]] :
815 det[th[0]][theme] += 1
817 det[th[0]][theme] = 1
822 endth = '_'.join(th[1:])
823 det[th[0]] = {theme:1}
828 def make_etline(self, listet) :
829 etuces = [[] for et in listet]
830 for uci in self.ucis :
831 get = list(set(uci.etoiles).intersection(listet))
833 return '2 variables sur la meme ligne'
835 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
def make_and_write_profile_et(self, ucecl, fileout, uci=False):
    """Write the etoile-by-class count table to ``fileout``.

    The etoile -> identifiers mapping comes from ``getetoileucis`` when
    ``uci`` is true, otherwise from ``getetoileuces``.  Etoiles attached to
    a single identifier or none are dropped.  Each line is
    ``etoile;count;count;...`` where a count is the number of the etoile's
    identifiers found in the matching class of ``ucecl``; the text is
    encoded with ``self.parametres['syscoding']``.
    """
    log.info('etoiles/classes')
    if uci:
        found = self.getetoileucis()
    else:
        found = self.getetoileuces()
    kept = dict([[et, found[et]] for et in found if len(found[et]) > 1])
    with open(fileout, 'w') as handle:
        lines = []
        for et in kept:
            members = set(kept[et])
            counts = [repr(len(members.intersection(classe))) for classe in ucecl]
            lines.append(';'.join([et] + counts))
        handle.write('\n'.join(lines).encode(self.parametres['syscoding']))
847 #etoiles = self.make_etoiles()
848 #with open(fileout, 'w') as f :
849 # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
851 def make_colored_corpus(self, uci = False) :
853 for i, lc in enumerate(self.lc) :
856 for uce in self.lc0 :
858 color = ['black'] + colors[len(self.lc) - 1]
860 <meta http-equiv="content-Type" content="text/html; charset=%s" />
862 ''' % sys.getdefaultencoding()
864 res = self.getalluces()
869 if self.iduces[uce[0]].uci != actuci :
870 actuci = self.iduces[uce[0]].uci
871 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
872 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
874 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
876 res = self.getallucis()
879 if self.ucis[uce[0]].ident != actuci :
880 actuci = self.ucis[uce[0]].ident
881 txt += '<br><hr>' + ' '.join(self.ucis[self.ucis[uce[0]].ident].etoiles) + '<br><br>'
882 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
884 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
885 return txt + '\n</body></html>'
887 def count_from_list(self, l, d) :
895 def count_from_list_cl(self, l, d, a, clnb) :
904 def find_segments(self, taille_segment, taille_limite) :
906 for uce in self.getalluces() :
908 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
909 l = [[d[val], val] for val in d if d[val] >= 3]
912 if len(l) > taille_limite :
913 l = l[-taille_limite:]
916 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite, uci = False):
919 concorde = self.getconcorde
921 concorde = self.getuciconcorde
922 for uce in concorde(list_uce) :
924 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
925 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
928 if len(l) > taille_limite :
929 l = l[-taille_limite:]
932 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
934 for b, classe in enumerate(self.lc) :
935 for uce in self.getconcorde(classe) :
938 uce = [self.formes[forme].lem for forme in uce]
939 for taille_segment in range(lenmin,lenmax) :
940 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
941 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
942 with open(fileout, 'w') as f :
943 f.write('\n'.join([';'.join(line) for line in result]))
945 def make_proftype(self, outf) :
947 for lem in self.lems :
948 gram = self.lems[lem].gram
950 res[gram] = [0 for val in self.lc]
951 lemuceeff = self.getlemuceseff(lem)
952 for i, classe in enumerate(self.lc) :
953 concern = set(classe).intersection(lemuceeff.keys())
954 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
955 res = [[gram] + [`val` for val in res[gram]] for gram in res]
957 with open(outf, 'w') as f :
958 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
961 def make_ucecl_from_R(self, filein) :
962 with open(filein, 'rU') as f :
967 line = line.replace('\n', '').replace('"', '').split(';')
968 self.lc.append([int(line[0]) - 1, int(line[1])])
969 classesl = [val[1] for val in self.lc]
971 self.lc = sorted(self.lc, key=itemgetter(1))
972 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
973 self.lc0 = self.lc.pop(0)
976 def get_stat_by_cluster(self, outf, lclasses = None) :
977 log.info('get_stat_by_cluster')
978 if lclasses is None :
981 occurrences = dict([[i + 1, 0] for i in range(len(lclasses))])
982 formescl = dict([[i + 1, 0] for i in range(len(lclasses))])
983 hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))])
984 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)])
985 sets = [set(cl) for cl in lclasses]
986 for forme in self.formes :
987 formeuceeff = self.getformeuceseff(forme)
988 for i, classe in enumerate(lclasses) :
989 concern = sets[i].intersection(formeuceeff.keys())
991 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
993 if self.formes[forme].freq == 1 :
995 log.info('%f' % (time() - t1))
996 if outf is not None :
997 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
998 with open(outf, 'w') as f :
1001 return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences]
1003 def get_stat_by_et(self, outf, etoiles) :
1004 lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
1005 stats = self.get_stat_by_cluster(None, lclasses)
1006 stats = [[etoiles[i]] + val for i, val in enumerate(stats)]
1008 def gethapaxbyet(self, etoiles) :
1009 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
1011 for uce in hapaxuces :
1012 if uce in hucesdict :
1016 etuces = [[] for et in etoiles]
1017 for uci in self.ucis :
1018 get = list(set(uci.etoiles).intersection(etoiles))
1020 return '2 variables sur la meme ligne'
1022 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
1023 etuces = [set(val) for val in etuces]
1024 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
# gethapaxuces: build an HTML report of the uces that contain the most hapax
# (lemmas with frequency 1), with each hapax form highlighted in red.
1026 def gethapaxuces(self) :
# hapaxuces[i] is the first uce returned for the hapax lemma hapax[i] (parallel lists).
1027 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
1028 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
# hucesdict: uce id -> [number of hapax in the uce, list of those hapax lemmas].
1030 for i,uce in enumerate(hapaxuces) :
1031 if uce in hucesdict :
1032 hucesdict[uce][0] += 1
1033 hucesdict[uce][1].append(hapax[i])
1035 hucesdict[uce] = [1,[hapax[i]]]
# huces: hapax count -> list of uce ids having exactly that count.
1037 for uce in hucesdict :
1038 if hucesdict[uce][0] in huces :
1039 huces[hucesdict[uce][0]].append(uce)
1041 huces[hucesdict[uce][0]] = [uce]
# Turn the dict into (count, uces) pairs sorted by decreasing count.
# (zip() returning a sortable list is Python 2 behaviour.)
1042 huces = zip(huces, huces.values())
1043 huces.sort(reverse=True)
# Only the four highest hapax counts are reported.
1047 for nb in huces[0:4] :
1048 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
1050 res = self.getconcorde([uce])
# Pad with spaces so that whole-word replacement also matches at both ends.
1052 ucetxt = ' ' + row[1] + ' '
1054 for hap in hucesdict[uce][1] :
# First form recorded for the hapax lemma, then highlight it in the segment text.
1055 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
1056 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
1057 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
1058 txt += '<p>'+ucetxt+'</p>\n'
# NOTE(review): hard-coded, POSIX-only output path -- looks like debugging output.
1062 with open('/tmp/testhapxuce.html','w') as f :
def export_dictionary(self, fileout, syscoding) :
    """Write the forms dictionary to *fileout* as tab-separated lines.

    One line per form, ordered by decreasing frequency (ties broken by the
    remaining fields, also in decreasing order): form, lemma, grammatical
    type, frequency. The text is encoded with *syscoding* before writing.
    """
    rows = []
    for forme in self.formes :
        entry = self.formes[forme]
        rows.append([entry.freq, forme, entry.lem, entry.gram])
    rows.sort(reverse = True)
    # The frequency moves from the first to the last column, as its repr().
    lines = ['\t'.join(row[1:] + [repr(row[0])]) for row in rows]
    with open(fileout, 'w') as f :
        f.write('\n'.join(lines).encode(syscoding))
# export_lems: write one tab-separated line per lemma to `fileout`, encoded with
# `syscoding`: the lemma followed by (form, frequency) pairs for each of its forms.
1072 def export_lems(self, fileout, syscoding) :
# Makes self.idformes available for the form lookup below.
1073 self.make_idformes()
# self.lems[lem].formes maps form id -> frequency; the id is resolved to its text
# through self.idformes. Backticks are the Python 2 spelling of repr().
1074 listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems]
1076 with open(fileout, 'w') as f :
1077 f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding))
# Constructor of a statistics helper (its class header is not shown here): gathers
# basic figures about the texts (uci) of `corpus`.
1082 def __init__(self, corpus) :
1083 ucinb = corpus.getucinb()
1084 ucisize = corpus.getucisize()
# Mean of the values in ucisize over the number of texts.
# NOTE(review): raises ZeroDivisionError when ucinb is 0.
1085 ucimean = float(sum(ucisize))/float(ucinb)
1086 detoile = corpus.make_etoiles_dict()
# Constructor of a text unit (presumably class Uci, given the Uci(iduci, line) calls
# elsewhere in this file -- confirm).
#   iduci   -- identifier of the text
#   line    -- marker line; its whitespace-separated tokens become self.etoiles
#   paraset -- optional whitespace-separated string; when given, its tokens become self.paras
1089 def __init__(self, iduci, line, paraset = None) :
1091 self.etoiles = line.split()
1093 if paraset is not None :
1094 self.paras = paraset.split()
# Constructor taking a segment id, a paragraph id and a text id (presumably class Uce,
# matching the Uce(ident, para, uci) calls in this file -- confirm).
1099 def __init__(self, iduce, idpara, iduci) :
# Constructor of a form entry (presumably class Word, matching the
# Word(word, gramtype, len(self.formes), lem) call in Corpus.add_word -- confirm).
#   word     -- text of the form
#   gramtype -- grammatical type, stored as self.gram
#   idword   -- numeric identifier of the form
#   lem      -- optional lemma
#   freq     -- optional initial frequency (handled when not None)
1105 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
1108 self.gram = gramtype
1111 if freq is not None :
def __init__(self, parent, forme) :
    """Start a lemma entry from its first form.

    *parent* is accepted but not used here. The grammatical type, the active
    flag and the frequency are copied from *forme*, and ``self.formes`` is
    created as ``{form id: form frequency}``.
    """
    self.gram = forme.gram
    self.act = forme.act
    self.freq = forme.freq
    self.formes = {forme.ident : forme.freq}
def add_forme(self, forme) :
    """Attach one more form to this lemma.

    Records the form's frequency under its id in ``self.formes`` and adds
    that frequency to the lemma's total ``self.freq``.
    """
    count = forme.freq
    self.formes[forme.ident] = count
    self.freq = self.freq + count
# decouperlist: cut one text segment off the front of `chaine`.
#   chaine           -- sequence of tokens (see prep_txtlist, which appends the u'$' end marker)
#   longueur         -- maximum number of items examined
#   longueurOptimale -- preferred cut position; separators closer to it score higher
# Returns (ok, segment, rest); (False, chaine, '') when no cut point was found.
1127 def decouperlist(chaine, longueur, longueurOptimale) :
# The three lines below are the original (French) docstring text: start from the last
# character and walk back to the start; a '$' ends the search; otherwise pick the
# candidate with the best weight/distance ratio.
1129 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
1130 Si on trouve un '$', c'est fini.
1131 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
# Separator -> weight: sentence-ending marks weigh most, a plain space weighs least.
1133 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
1134 dsep = dict([[val[0],val[1]] for val in separateurs])
# trouve: whether a good separator was found
1135 trouve = False # si on a trouvé un bon séparateur
# iDecoupe: index at which to cut
1136 iDecoupe = 0 # indice du caractere ou il faut decouper
# Never look past the end of the input.
1138 longueur = min(longueur, len(chaine) - 1)
1139 chaineTravail = chaine[:longueur + 1]
# meilleur: [separator, weight, position] of the best separator so far
1141 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
# If the end marker is inside the window, cut just before it.
1144 indice = chaineTravail.index(u'$')
1146 iDecoupe = indice - 1
1151 caractere = chaineTravail[nbCar]
# Score = weight / (distance to the optimal length + 1); +1 avoids a zero divisor.
1152 distance = abs(longueurOptimale - nbCar) + 1
1153 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
1154 if caractere in dsep :
1155 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
1156 meilleur[0] = caractere
1157 meilleur[1] = dsep[caractere]
# Same comparison, using the weight of a plain space.
1162 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
1164 meilleur[1] = dsep[' ']
1171 #if meilleur[0] != ' ' :
1172 # fin = chaine[iDecoupe + 1:]
1173 # retour = chaineTravail[:iDecoupe]
# The item at the cut index stays in the returned segment; the rest starts after it.
1175 fin = chaine[iDecoupe + 1:]
1176 retour = chaineTravail[:iDecoupe + 1]
1177 return len(retour) > 0, retour, fin
# nothing was found
1178 # si on a rien trouvé
1179 return False, chaine, ''
1181 def testetoile(line) :
1182 return line.startswith(u'****')
1185 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt) :
    """Split *txt* on whitespace and append the ``$`` end marker.

    The marker is what ``decouperlist`` searches for to detect the end of
    the text.
    """
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
# prep_txtcharact: text preparation paired with the character-based cutter
# (prep_makeuce assigns it together with decoupercharact).
# NOTE(review): the body is not shown here; presumably the counterpart of prep_txtlist.
1190 def prep_txtcharact(txt) :
1195 Class for building a corpus
# Constructor of the corpus builder.
#   infile            -- path of the text file to import
#   parametres_corpus -- option dict; must contain 'encoding', 'originalpath', 'pathout',
#                        'corpus_name' and 'keep_ponct' (all read with [] below)
#   lexique           -- optional lexicon, stored as self.lexique
#   expressions       -- optional expressions dictionary, stored as self.expressions
#   dlg               -- optional progress dialog
1197 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
1198 log.info('begin building corpus...')
1199 self.lexique = lexique
1200 self.expressions = expressions
# The builder passes itself as the Corpus parent.
1202 self.corpus = Corpus(self, parametres_corpus)
1203 self.infile = infile
# Threshold compared with self.last in treattxt; defaults to 1,000,000.
1205 self.lim = parametres_corpus.get('lim', 1000000)
1206 self.encoding = parametres_corpus['encoding']
1207 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
1208 self.corpus.pathout.createdir(parametres_corpus['pathout'])
# Fresh unique identifier for this corpus.
1209 self.corpus.parametres['uuid'] = str(uuid4())
1210 self.corpus.parametres['corpus_name'] = parametres_corpus['corpus_name']#os.path.split(self.corpus.parametres['pathout'])[1]
1211 self.corpus.parametres['type'] = 'corpus'
# Tokens listed in ponctuation_espace are dropped from segments; when punctuation
# is kept, only the blank/empty tokens are filtered out.
1212 if self.corpus.parametres['keep_ponct'] :
1213 self.ponctuation_espace = [' ', '']
1215 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
1217 self.tolist = self.corpus.parametres.get('tolist', 0)
# prep_makeuce: select the segmentation strategy from the 'ucemethod' option
# (default 0) and set self.decouper, self.prep_txt and self.ucesize accordingly.
# NOTE(review): the branch conditions are not shown; two configurations are visible.
1224 def prep_makeuce(self) :
1225 method = self.corpus.parametres.get('ucemethod', 0)
# List-based cutting: default segment size 40.
1227 self.decouper = decouperlist
1228 self.prep_txt = prep_txtlist
1229 self.ucesize = self.corpus.parametres.get('ucesize', 40)
# Character-based cutting: default segment size 240.
1231 self.decouper = decoupercharact
1232 self.prep_txt = prep_txtcharact
1233 self.ucesize = self.corpus.parametres.get('ucesize', 240)
1234 log.info('method uce : %s' % method)
1239 self.read_corpus(self.infile)
1240 except Warning, args :
1241 log.info('pas kool %s' % args)
1245 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
1246 self.time = time() - t1
1248 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
1249 log.info('time : %f' % (time() - t1))
1252 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
1253 self.cf = self.conn_f.cursor()
1254 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1255 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
1256 self.conn_f.commit()
1257 self.cf = self.conn_f.cursor()
1258 self.cf.execute('PRAGMA temp_store=MEMORY;')
1259 self.cf.execute('PRAGMA journal_mode=MEMORY;')
1260 self.cf.execute('PRAGMA synchronous = OFF;')
1261 self.cf.execute('begin')
1262 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
1263 self.c = self.conn.cursor()
1264 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1266 self.c = self.conn.cursor()
1267 self.c.execute('PRAGMA temp_store=MEMORY;')
1268 self.c.execute('PRAGMA journal_mode=MEMORY;')
1269 self.c.execute('PRAGMA synchronous = OFF;')
1270 self.c.execute('begin')
1273 #commit index and close db
1275 self.conn_f.commit()
1276 self.cf.execute('CREATE INDEX iduces ON uces (id);')
1277 self.cf.execute('CREATE INDEX ideff ON eff (id);')
1281 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
1282 self.ccorpus = self.conn_corpus.cursor()
1283 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
1284 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
1285 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
1286 self.conn_corpus.commit()
1287 self.ccorpus = self.conn_corpus.cursor()
1288 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
1289 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
1290 self.ccorpus.execute('PRAGMA synchronous = OFF;')
1291 self.ccorpus.execute('begin')
1292 self.backup_corpus()
1293 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
1294 self.conn_corpus.commit()
1295 self.conn_corpus.close()
1296 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self) :
    """Fill ``self.cleans`` with the cleaning steps enabled in the options.

    Steps are appended in a fixed order: lower-casing, first clean, character
    filtering, expressions, apostrophes, hyphens. Every option defaults to
    enabled except 'charact', which must be present in the parameters. When
    character filtering is on, ``self.rule`` is set from 'keep_caract' (or
    the built-in default rule).
    """
    options = self.corpus.parametres
    register = self.cleans.append
    if options.get('lower', 1) :
        register(self.dolower)
    if options.get('firstclean', 1) :
        register(self.firstclean)
    # 'charact' is read with [] on purpose: a missing key raises KeyError.
    if options['charact'] :
        default_rule = u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_"
        self.rule = options.get('keep_caract', default_rule)
        register(self.docharact)
    if options.get('expressions', 1) :
        register(self.make_expression)
    if options.get('apos', 1) :
        register(self.doapos)
    if options.get('tiret', 1):
        register(self.dotiret)
# make_expression: replace every known multi-word expression found in `txt` by the
# first element of its entry in self.expressions.
1313 def make_expression(self,txt) :
# Reverse lexicographic order: an expression is tried before any shorter expression
# that is a prefix of it. (keys() returning a sortable list is Python 2 behaviour.)
1314 exp = self.expressions.keys()
1315 exp.sort(reverse=True)
1316 for expression in exp :
1317 if expression in txt :
1318 txt = txt.replace(expression, self.expressions[expression][0])
# dolower: cleaning step registered by buildcleans when the 'lower' option is on.
# NOTE(review): body not shown here; presumably returns txt lower-cased.
1321 def dolower(self, txt) :
def docharact(self, txt) :
    """Replace every run of characters matched by ``self.rule`` with one space.

    ``self.rule`` is the inside of a regular-expression character class; it
    is wrapped as ``[rule]+`` before substitution. The rule set by
    ``buildcleans`` starts with ``^``, so the class is negated and the rule
    then lists the characters to keep.
    """
    pattern = re.compile(u"[" + self.rule + "]+")
    return pattern.sub(' ', txt)
def doapos(self, txt) :
    """Return *txt* with every straight apostrophe turned into a space."""
    return u' '.join(txt.split(u'\''))
def dotiret(self, txt) :
    """Return *txt* with every hyphen turned into a space."""
    hyphen, blank = u'-', u' '
    return txt.replace(hyphen, blank)
def firstclean(self, txt) :
    """Normalise typographic characters and put spaces around punctuation.

    The substitutions run in order: curly apostrophe to straight apostrophe,
    the ligature to 'oe', three dots to the ' £$£ ' marker, then each of
    ? . ! , ; : gets a space on both sides, and finally the single-character
    ellipsis becomes ' £$£ ' too. The three-dot rule must run before the
    single-dot rule.
    """
    substitutions = (
        (u'’', "'"),
        (u'œ', u'oe'),
        ('...', u' £$£ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        (u'…', u' £$£ '),
    )
    for old, new in substitutions :
        txt = txt.replace(old, new)
    return txt
# make_cleans: run `txt` through the cleaning steps stored in self.cleans (filled by
# buildcleans), in order.
# NOTE(review): the loop body and the return are not shown here; presumably each step
# is applied as txt = clean(txt) and the final txt is returned.
1340 def make_cleans(self, txt) :
1341 for clean in self.cleans :
# backup_uce: flush the in-memory form -> segment index (self.corpus.idformesuces)
# to the `uces` and `eff` tables through the self.cf cursor, then empty it.
# Does nothing when the index is empty.
1345 def backup_uce(self) :
1346 if self.corpus.idformesuces != {} :
1347 log.info('backup %i' % len(self.corpus.idformesuces))
# One row per form id: the space-separated segment ids (touce) and, in the same order,
# the space-separated counts (toeff). Backticks are the Python 2 spelling of repr().
1348 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
1349 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
1350 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
1351 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
# Free the memory once the rows have been handed to SQLite.
1352 self.corpus.idformesuces = {}
# backup_corpus: write the corpus structure through the self.ccorpus cursor:
#   etoiles -- one row per text: id, space-joined etoiles, space-joined paragraph labels
#   luces   -- one row per segment: text id, paragraph id, segment id
#   formes  -- one row per form: id, form, lemma, grammatical type, frequency
# No commit is issued on the lines shown.
1355 def backup_corpus(self) :
1356 log.info('start backup corpus')
1358 for uci in self.corpus.ucis :
1359 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
1360 for uce in uci.uces :
# Ids are passed as their repr() strings (Python 2 backticks).
1361 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
1362 for forme in self.corpus.formes :
1363 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
# Elapsed time since `t` (set on a line not shown here).
1364 log.info('%f' % (time() - t))
def dofinish(self) :
    """Record the final build summary in ``self.corpus.parametres``.

    Stores the build date, the elapsed time (``self.time``, in seconds)
    formatted as hours/minutes/seconds, the number of texts, segments,
    occurrences and forms, and a human-readable hapax summary.

    The hapax percentages are reported as 0 when the corpus has no forms or
    no occurrences, instead of raising ZeroDivisionError.
    """
    corpus = self.corpus
    params = corpus.parametres
    params['date'] = datetime.datetime.now().ctime()
    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    params['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
    params['ucinb'] = corpus.getucinb()
    params['ucenb'] = corpus.getucenb()
    params['occurrences'] = corpus.gettotocc()
    formesnb = len(corpus.formes)
    params['formesnb'] = formesnb
    hapaxnb = corpus.gethapaxnb()
    occurrences = params['occurrences']
    # Guard both ratios: an empty corpus must not crash the summary.
    pourhapaxf = (float(hapaxnb) / formesnb) * 100 if formesnb else 0.0
    pourhapaxocc = (float(hapaxnb) / occurrences) * 100 if occurrences else 0.0
    params['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
# BuildSubCorpus: builds a new corpus from a selection of an existing one. The selection
# mode comes from the flags in `parametres`: 'frommeta', 'fromtheme', 'fromclusters'
# or 'fromuceids'.
1380 class BuildSubCorpus(BuildCorpus):
#   corpus     -- source corpus (read below as self.ori; that assignment is not shown)
#   parametres -- selection options; must contain 'corpus_name' and 'pathout'
#   dlg        -- optional progress dialog
1381 def __init__(self, corpus, parametres, dlg = None) :
1382 log.info('begin subcorpus...')
# The new corpus inherits the original path and encoding of the source corpus.
1386 self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : corpus.parametres['originalpath'], 'encoding' : corpus.parametres['encoding']})
1388 self.parametres = parametres
1389 self.encoding = corpus.parametres['encoding']
1390 self.corpus.parametres['corpus_name'] = parametres['corpus_name']
1391 self.corpus.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = parametres['pathout'])
1392 self.corpus.pathout.createdir(parametres['pathout'])
1393 self.corpus.parametres['pathout'] = parametres['pathout']
1394 self.corpus.parametres['meta'] = parametres.get('meta', False)
1395 self.corpus.parametres['uuid'] = str(uuid4())
# Mode 1: keep the texts that carry at least one of the selected etoiles.
1396 if parametres.get('frommeta', False) :
1397 print 'make subtexts'
1398 self.corpus.ucis = [CopyUci(uci) for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()]
# Mode 2: within texts that have paragraph labels, keep the segments of the paragraphs
# whose label is selected.
1399 elif parametres.get('fromtheme', False) :
1400 print 'make subtexts from theme'
1402 for uci in self.ori.ucis :
1403 if uci.paras != [] :
1406 for et in uci.paras :
1407 if et in parametres['meta'] :
1408 newuce += [CopyUce(uce) for uce in uci.uces if uce.para == idpara]
1414 nuci.paras = newpara
1415 self.corpus.ucis.append(nuci)
# Mode 3: flatten the segment ids of the selected clusters (parametres['lc'], indexed by
# the values in 'meta') into 'uceids'.
1418 elif parametres.get('fromclusters', False) :
1419 self.parametres['uceids'] = [st for i in self.parametres['meta'] for st in self.parametres['lc'][i]]
# Mode 4: an explicit list of segment ids.
1421 elif parametres.get('fromuceids', False) :
# fromuceids: keep only the segments whose id is listed in self.parametres['uceids'].
1427 def fromuceids(self):
# The dict is used only for membership tests on segment ids.
1429 dictucekeep = dict(zip(self.parametres['uceids'], self.parametres['uceids']))
1431 for uci in self.ori.ucis :
# Text without paragraph labels.
1432 if uci.paras == [] :
1433 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1436 nuci.uces = keepuces
1437 self.corpus.ucis.append(nuci)
# Text with paragraph labels: walk them while filtering.
1442 for et in uci.paras :
1443 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1451 nuci.paras = newpara
1452 self.corpus.ucis.append(nuci)
# read_corpus: renumber the kept texts, paragraphs and segments, then copy the segment
# texts and rebuild the forms index under the new ids. `infile` is not used on the
# lines shown.
1454 def read_corpus(self, infile = None):
# Segment ids as they were in the source corpus, in the new corpus order.
1455 self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces]
1461 print 'redo text, para and st ident'
1462 for uci in self.corpus.ucis :
1463 uci.ident = ident_uci
1465 for uce in uci.uces :
# A change of paragraph value marks a paragraph boundary.
1467 if uce.para != lastpara :
1470 uce.para = ident_para
1472 uce.para = ident_para
# newuceident: old segment id -> new segment id.
1473 newuceident[uce.ident] = ident_uce
1474 uce.ident = ident_uce
1476 print 'backup st text and forms'
# row[0] is the old segment id and row[1] its text, as returned by getconcorde.
1477 for row in self.ori.getconcorde(self.olduceid) :
1478 self.c.execute('INSERT INTO uces VALUES(?,?);', (`newuceident[row[0]]`, row[1]))
# Re-index each word from the source corpus' form entry under the new segment id.
1479 for word in row[1].split() :
1480 self.corpus.add_word_from_forme(self.ori.formes[word], newuceident[row[0]])
# BuildFromAlceste: corpus builder that parses a marked-up text file: text marker
# lines, optional paragraph lines starting with '-*', and plain text lines.
1484 class BuildFromAlceste(BuildCorpus) :
# read_corpus: read `infile` line by line (decoded with self.encoding) and fill
# self.corpus with texts, paragraph labels and segments.
# Raises Exception('CorpusEncoding') when decoding fails; other Exceptions below flag
# malformed input and carry the line number.
1485 def read_corpus(self, infile) :
1486 if self.dlg is not None :
1487 self.dlg.Pulse('textes : 0 - segments : 0')
# 'ucimark' selects how a text marker line is recognised: 0 -> testetoile, 1 -> testint.
1490 if self.corpus.parametres['ucimark'] == 0 :
1491 self.testuci = testetoile
1492 elif self.corpus.parametres['ucimark'] == 1 :
1493 self.testuci = testint
1499 with codecs.open(infile, 'r', self.encoding) as f :
1500 for linenb, line in enumerate(f) :
1501 line = line.rstrip('\n\r')#FIXME .lstrip(codecs.BOM).lstrip(codecs.BOM_UTF8)
# A new text starts: flush the pending text of the previous one first.
1502 if self.testuci(line) :
1505 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1507 self.corpus.ucis.append(Uci(iduci, line))
# The previous text produced no segment: log it and replace it with the new one.
1510 if self.corpus.ucis[-1].uces == [] :
1511 log.info(u'Empty text : %i' % linenb)
1513 self.corpus.ucis.pop()
1514 self.corpus.ucis.append(Uci(iduci, line))
# Refresh the progress dialog every 10 texts.
1515 if self.dlg is not None :
1516 if not (iduci + 1) % 10 :
1517 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
# Paragraph marker line: its first token is recorded as the paragraph label.
1518 elif line.startswith(u'-*') :
1521 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1524 self.corpus.ucis[-1].paras.append(line.split()[0])
1526 raise Exception('paragrapheOT %i' % linenb)
# Non-blank text line inside a text (iduci is -1 before the first text marker).
1527 elif line.strip() != '' and iduci != -1 :
# End of file: flush what is still pending.
1529 if txt != [] and iduci != -1 :
1530 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1535 self.corpus.ucis.pop()
1536 log.info(Exception("Empty text %i" % linenb))
1538 raise Exception('EmptyText %i' % linenb)
1539 if iduci != -1 and iduce != -1:
1542 log.info(_(u"No Text in corpus. Are you sure of the formatting ?"))
1543 raise Exception('TextBeforeTextMark %i' % linenb)
1544 except UnicodeDecodeError :
1545 raise Exception("CorpusEncoding")
# treattxt: clean the accumulated lines `txt`, cut them into segments, register each
# segment and its words, and return the updated (iduce, idpara) counters.
1547 def treattxt(self, txt, iduce, idpara, iduci) :
# With ucemethod 2 and 'douce' set, lines are joined with a sentinel word so they can
# be split back after cleaning, each piece becoming one segment.
1548 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1549 txt = 'laphrasepoursplitter'.join(txt)
1550 txt = self.make_cleans(txt)
1551 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1552 ucetxt = txt.split('laphrasepoursplitter')
# Otherwise clean the text and let make_uces cut it.
1555 txt = self.make_cleans(txt)
1556 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1557 if self.corpus.ucis[-1].paras == [] :
# Register the segment on the current text and store its text in the uces table.
1561 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1562 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1563 if not self.tolist :
1569 self.corpus.add_word(word)
1570 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
# Past the self.lim threshold (the action taken is on lines not shown).
1571 if self.last > self.lim :
1574 return iduce, idpara
# make_uces: cut `txt` into a list of segment strings using self.decouper, with
# self.ucesize as target size and self.ucesize + 15 as upper bound.
# NOTE(review): `keep_ponct` is not used on the lines shown.
1576 def make_uces(self, txt, douce = True, keep_ponct = False) :
# Collapse all whitespace runs into single spaces.
1577 txt = ' '.join(txt.split())
1580 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
# Tokens listed in self.ponctuation_espace are dropped from the segment text.
1582 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1585 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1586 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
# Single-segment result: the whole text, filtered the same way.
1591 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1593 #decouper (list_sep)
1594 #make_uces (decouper)
1595 #treat_txt (make_uces)
# Constructor of the interactive corpus-import helper (class header not shown here):
# loads the saved corpus options, shows the CorpusPref dialog and, when it is
# accepted, prepares self.parametres, the lexicon and the expressions dictionary.
#   parent -- main application object (provides filename, UserConfigPath, DictPath, ...)
#   dlg    -- optional progress dialog handle
1599 def __init__(self, parent, dlg = None) :
1600 self.parent = parent
# Defaults come from the user's corpus.cfg; the output directory derives from the file name.
1603 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1604 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1605 parametres['corpus_name'] = os.path.split(parametres['pathout'])[1]
1606 dial = CorpusPref(parent, parametres)
1607 dial.CenterOnParent()
1608 dial.txtpath.SetLabel(parent.filename)
1609 #dial.repout_choices.SetValue(parametres['pathout'])
# Modal dialog: blocks until the user answers.
1610 self.res = dial.ShowModal()
1611 if self.dlg is not None :
1612 self.dlg = progressbar(self.parent, self.dlg)
# NOTE(review): 5100 is presumably wx.ID_OK -- confirm; a named constant would be clearer.
1613 if self.res == 5100 :
1614 parametres = dial.doparametres()
1615 parametres['originalpath'] = parent.filename
1616 PathOut().createdir(parametres['pathout'])
# Optional user-supplied dictionary file.
1617 if parametres.get('dictionary', False) :
1618 filein = parametres['dictionary']
# A non-empty name typed in the dialog overrides the default corpus name.
1621 if dial.corpusname.GetValue() != '' :
1622 parametres['corpus_name'] = dial.corpusname.GetValue()
1624 ReadLexique(self.parent, lang = parametres['lang'], filein = filein)
# Load the expressions file for the language when it exists; otherwise use none.
1625 if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
1626 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1628 self.parent.expressions = {}
1629 self.parametres = parametres
1632 if self.dlg is not None :
def doanalyse(self) :
    """Build the corpus from the parent's file and return the corpus object.

    Uses the parameters collected by the constructor together with the
    parent's lexicon and expressions dictionary.
    """
    source = self.parent
    builder = BuildFromAlceste(source.filename, self.parametres, source.lexique, source.expressions, dlg = self.dlg)
    return builder.corpus
# Constructor of the interactive sub-corpus helper (class header not shown here):
# prepares the list of selectable items, shows the SubTextFromMetaDial dialog and,
# when it is accepted, stores the resulting options in self.parametres.
#   parent     -- main application object
#   corpus     -- source corpus
#   parametres -- options; the flags 'frommeta' / 'fromtheme' / 'fromclusters' pick the mode
#   dlg        -- optional progress dialog handle
1639 def __init__(self, parent, corpus, parametres = None, dlg = None):
1640 self.parent = parent
# Default name: source corpus name prefixed with 'Sub'.
1643 corpus_name = 'Sub' + corpus.parametres['corpus_name']
1644 if dlg is not None :
1645 busy = wx.BusyInfo(_("Please wait...").decode('utf8'), self)
1647 parametres['corpus_name'] = corpus_name
# parametres['meta'] holds the selectable items: etoiles, themes, or 'classe N' labels
# numbered from 1 to parametres['clnb'].
1648 if parametres.get('frommeta', False) :
1649 parametres['meta'] = corpus.make_etoiles()
1650 elif parametres.get('fromtheme', False) :
1651 parametres['meta'] = corpus.make_themes()
1652 elif parametres.get('fromclusters', False) :
1653 parametres['meta'] = [' '.join(['classe', `i`]) for i in range(1,parametres['clnb'] + 1)]
1655 parametres['meta'] = []
# Cluster labels keep their numeric order; the other lists are sorted.
1656 if 'fromclusters' not in parametres :
1657 parametres['meta'].sort()
1658 if dlg is not None :
1660 dial = SubTextFromMetaDial(parent, parametres)
1661 self.res = dial.ShowModal()
# NOTE(review): 5100 is presumably wx.ID_OK -- confirm.
1662 if self.res == 5100 :
1663 if dial.subcorpusname.GetValue() != '' :
# Keep only alphanumeric characters and '_' from the typed name.
1664 corpus_name = ''.join([l for l in dial.subcorpusname.GetValue() if l.isalnum() or l in ['_']])
1665 if corpus_name != '' :
1666 parametres['corpus_name'] = corpus_name
1668 parametres['corpus_name'] = 'Sub' + corpus.parametres['corpus_name']
1669 pathout = os.path.join(corpus.parametres['pathout'], parametres['corpus_name'])
# Look for a '_<i>' suffix that is not already used by an existing directory.
1671 while os.path.exists(pathout + '_%i' % i) :
1673 parametres['pathout'] = pathout + '_%i' % i
1674 meta = dial.m_listBox1.GetSelections()
# Selections are list-box indices: translated to labels, except in cluster mode where
# the indices themselves are kept.
1675 if not 'fromclusters' in parametres :
1676 parametres['meta'] = [parametres['meta'][val] for val in meta]
1678 parametres['meta'] = meta
1679 self.parametres = parametres
def doanalyse(self):
    """Build the sub-corpus from ``self.ori`` and return the corpus object.

    Uses the parameters collected by the constructor.
    """
    builder = BuildSubCorpus(self.ori, parametres = self.parametres, dlg = self.dlg)
    return builder.corpus