1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique
16 from operator import itemgetter
17 from uuid import uuid4
18 from chemins import PathOut
19 from dialog import CorpusPref, SubTextFromMetaDial
21 from colors import colors
25 log = logging.getLogger('iramuteq.corpus')
28 def copycorpus(corpus) :
29 log.info('copy corpus')
30 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
31 copy_corpus.ucis = corpus.ucis
32 copy_corpus.formes = corpus.formes
33 copy_corpus.pathout = corpus.pathout
34 copy_corpus.conn_all()
38 return Uce(uce.ident, uce.para, uce.uci)
42 nuci = Uci(uci.ident, '')
43 nuci.etoiles = copy(uci.etoiles)
44 nuci.uces = [CopyUce(uce) for uce in uci.uces]
53 def __init__(self, parent, parametres = {}, read = False) :
55 self.parametres = parametres
57 self.connformes = None
59 self.conncorpus = None
66 self.idformesuces = {}
71 self.pathout = PathOut(dirout = parametres['pathout'])
74 def add_word(self, word) :
75 if word in self.formes :
76 self.formes[word].freq += 1
77 if self.formes[word].ident in self.idformesuces :
78 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
79 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
81 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
83 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
85 if word in self.parent.lexique :
86 gramtype = self.parent.lexique[word][1]
87 lem = self.parent.lexique[word][0]
94 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
95 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
97 def add_word_from_forme(self, word, stident):
98 if word.forme in self.formes :
99 self.formes[word.forme].freq += 1
100 if self.formes[word.forme].ident in self.idformesuces :
101 if stident in self.idformesuces[self.formes[word.forme].ident] :
102 self.idformesuces[self.formes[word.forme].ident][stident] += 1
104 self.idformesuces[self.formes[word.forme].ident][stident] = 1
106 self.idformesuces[self.formes[word.forme].ident] = {stident: 1}
108 self.formes[word.forme] = Word(word.forme, word.gram, len(self.formes), word.lem)
109 self.idformesuces[self.formes[word.forme].ident] = {stident : 1}
112 """connect corpus to db"""
113 if self.connformes is None :
114 log.info('connexion corpus')
115 self.connuces = sqlite3.connect(self.pathout['uces.db'])
116 self.cuces = self.connuces.cursor()
117 self.connformes = sqlite3.connect(self.pathout['formes.db'])
118 self.cformes = self.connformes.cursor()
119 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
120 self.ccorpus = self.conncorpus.cursor()
121 self.cformes.execute('PRAGMA temp_store=MEMORY;')
122 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
123 self.cformes.execute('PRAGMA synchronous = OFF;')
124 self.cuces.execute('PRAGMA temp_store=MEMORY;')
125 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
126 self.cuces.execute('PRAGMA synchronous = OFF;')
127 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
128 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
129 self.ccorpus.execute('PRAGMA synchronous = OFF;')
131 def read_corpus(self) :
132 log.info('read corpus')
133 self.parametres['syscoding'] = sys.getdefaultencoding()
134 if self.conncorpus is None :
136 res = self.ccorpus.execute('SELECT * FROM etoiles;')
138 self.ucis.append(Uci(row[0], row[1], row[2]))
139 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
141 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
142 res = self.ccorpus.execute('SELECT * FROM formes;')
143 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the integers stored in the `uces` table for one form.

    wordid -- the value matched against the `id` column, or a string key
              of self.formes, in which case the `ident` attribute of that
              entry is used instead.

    A selected cell is either an int (kept as is) or a string of
    whitespace-separated integers (split and converted one by one).
    """
    if isinstance(wordid, basestring):
        wordid = self.formes[wordid].ident
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    found = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            found.append(cell)
        else:
            found.extend([int(val) for val in cell.split()])
    return found
def getworducis(self, wordid):
    """Return the distinct `uci` attributes of the uces holding a form.

    wordid is handed unchanged to self.getworduces(); each id it returns is
    resolved through self.getucefromid().  The result comes out of a set, so
    its order carries no meaning.
    """
    owners = [self.getucefromid(uceid).uci for uceid in self.getworduces(wordid)]
    return list(set(owners))
156 def getformeuceseff(self, formeid) :
157 if isinstance(formeid, basestring) :
158 formeid = self.formes[formeid].ident
159 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
160 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
161 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
162 res = self.cformes.execute(query)
163 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
165 for i, uce in enumerate(uces) :
166 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem):
    """Return the distinct integers stored in `uces` for the forms of a lemma.

    lem -- key of self.lems; the items obtained by iterating the `formes`
           attribute of that entry are used as the `id` values of the query.

    Cells are decoded as in the other `uces` readers: an int is kept, a
    string is split on whitespace and each piece converted to int.  The
    result comes out of a set, so its order carries no meaning.
    """
    idlist = ', '.join([repr(val) for val in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % idlist
    collected = []
    for row in self.cformes.execute(query):
        cell = row[0]
        if isinstance(cell, int):
            collected.append(cell)
        else:
            collected.extend([int(val) for val in cell.split()])
    return list(set(collected))
def gettgenst(self, tgen):
    """Return the distinct integers stored in `uces` for a group of lemmas.

    tgen -- iterable of lemma keys.  Keys that are absent from self.lems
            are ignored.

    The membership test has to come before the inner loop: when it is
    placed after it, self.lems[lem] is evaluated first and an unknown
    lemma raises KeyError instead of being skipped.

    Returns a list without duplicates; it comes out of a set, so its order
    carries no meaning.  An empty list is returned when no lemma of tgen is
    known.
    """
    ids = [repr(val) for lem in tgen if lem in self.lems for val in self.lems[lem].formes]
    if not ids:
        # Nothing to look up: skip the query rather than send "IN ()".
        return []
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % ', '.join(ids)
    collected = []
    for row in self.cformes.execute(query):
        cell = row[0]
        if isinstance(cell, int):
            collected.append(cell)
        else:
            # Non-int cells hold whitespace-separated integers.
            collected.extend([int(val) for val in cell.split()])
    return list(set(collected))
def gettgentxt(self, tgen):
    """Return the distinct `uci` attributes of the uces matched by tgen.

    The ids come from self.gettgenst(tgen) and are resolved one by one with
    self.getucefromid().  The result comes out of a set, so its order
    carries no meaning.
    """
    owners = [self.getucefromid(uceid).uci for uceid in self.gettgenst(tgen)]
    return list(set(owners))
def getlemucis(self, lem):
    """Return the distinct `uci` attributes of the uces holding a lemma.

    The ids come from self.getlemuces(lem) and are resolved one by one with
    self.getucefromid().  The result comes out of a set, so its order
    carries no meaning.
    """
    owners = [self.getucefromid(uceid).uci for uceid in self.getlemuces(lem)]
    return list(set(owners))
189 def getlemuceseff(self, lem, luces = None) :
190 formesid = ', '.join([`val` for val in self.lems[lem].formes])
191 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
192 res = self.cformes.execute(query)
193 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
194 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
195 res = self.cformes.execute(query)
196 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
198 for i, uce in enumerate(uces) :
199 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster):
    """Count the members of self.lc[cluster] that self.getlemuces(lem) returns."""
    members = set(self.lc[cluster])
    shared = members.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem):
    """Return the `freq` attribute of the entry stored under lem in self.lems."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid):
    """Return the entry of self.idformes stored under formeid.

    self.make_idformes() is called first when self.idformes is still None.
    """
    if self.idformes is None:
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the sum of the `freq` attribute over every entry of self.formes."""
    total = 0
    for forme in self.formes:
        total = total + self.formes[forme].freq
    return total
def getucemean(self):
    """Return self.gettotocc() divided by self.getucenb(), as a float."""
    occurrences = float(self.gettotocc())
    return occurrences / self.getucenb()
222 return self.ucis[-1].uces[-1].ident + 1
225 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return one size per uci, in the order of self.ucis.

    The size of a uci is the sum of the entries of self.getucesize() from
    the ident of its first uce up to and including the ident of its last
    uce, i.e. the idents are used as positions in that list.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:(last + 1)]))
    return totals
def getucesize(self):
    """Return the token count of every row yielded by self.getalluces().

    The count is the number of whitespace-separated pieces in the second
    column of the row.
    """
    sizes = []
    for row in self.getalluces():
        sizes.append(len(row[1].split()))
    return sizes
def getconcorde(self, uces):
    """Run a query selecting the rows of the `uces` table whose id is in uces.

    uces -- iterable of ids; their repr() is inlined in the IN (...) list.

    Returns whatever self.cuces.execute() returns for that query; rows are
    requested ordered by id.
    """
    idlist = ', '.join([repr(uceid) for uceid in uces])
    query = 'select * from uces where id IN (%s) ORDER BY id;' % idlist
    return self.cuces.execute(query)
238 def getuciconcorde(self, ucis) :
239 uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis]
240 uces = [[val[0], '\n'.join([row[1] for row in self.getconcorde(val[1])])] for val in uces]
def getwordconcorde(self, word):
    """Return self.getconcorde() applied to the ids from self.getworduces(word)."""
    uceids = self.getworduces(word)
    return self.getconcorde(uceids)
def getlemconcorde(self, lem):
    """Return self.getconcorde() applied to the ids from self.getlemuces(lem)."""
    uceids = self.getlemuces(lem)
    return self.getconcorde(uceids)
def getalluces(self):
    """Run 'SELECT * FROM uces' on self.cuces and return the result of execute()."""
    cursor = self.cuces
    return cursor.execute('SELECT * FROM uces')
def getallucis(self):
    """Return one [ident, text] pair per uci, in the order of self.ucis.

    The second column of every row of self.getalluces() is collected in a
    list; the text of a uci joins, with newlines, the entries of that list
    found at the position given by the ident of each of its uces.
    """
    texts = [row[1] for row in self.getalluces()]
    pairs = []
    for uci in self.ucis:
        parts = [texts[uce.ident] for uce in uci.uces]
        pairs.append([uci.ident, '\n'.join(parts)])
    return pairs
def getucesfrometoile(self, etoile):
    """Return the ident of every uce whose uci lists etoile in its `etoiles`.

    Idents are returned in the order of self.ucis, then of each uci's uces.
    """
    idents = []
    for uci in self.ucis:
        for uce in uci.uces:
            if etoile in uci.etoiles:
                idents.append(uce.ident)
    return idents
259 def getetoileuces(self) :
260 log.info('get uces etoiles')
263 for uci in self.ucis :
264 etoiles = uci.etoiles[1:]
266 if et in etoileuces :
267 etoileuces[et] += [uce.ident for uce in uci.uces]
269 etoileuces[et] = [uce.ident for uce in uci.uces]
271 for et in uci.paras :
272 if et in etoileuces :
273 etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara]
275 etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara]
281 def getetoileucis(self):
283 for uci in self.ucis :
284 etoiles = uci.etoiles[1:]
286 if et in etoileuces :
287 etoileuces[et] += [uci.ident]
289 etoileuces[et] = [uci.ident]
def getucefromid(self, uceid):
    """Return the entry of self.iduces stored under uceid.

    self.make_iduces() is called first when self.iduces is still None.
    """
    if self.iduces is None:
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return how many entries of self.formes have a `freq` equal to 1."""
    count = 0
    for forme in self.formes:
        if self.formes[forme].freq == 1:
            count += 1
    return count
def getactivesnb(self, key):
    """Return how many entries of self.lems have an `act` equal to key."""
    count = 0
    for lem in self.lems:
        if self.lems[lem].act == key:
            count += 1
    return count
301 # def make_lems(self, lem = True) :
302 # log.info('make lems')
304 # for forme in self.formes :
305 # if self.formes[forme].lem in self.lems :
306 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
307 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
309 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the `etoiles` of the uci owning the uce identified by uceid.

    On first use (self.uceuci is None) a dict mapping each uce ident to the
    ident of its uci is built and kept in self.uceuci.  The uci ident found
    there is then used as a position in self.ucis.
    """
    if self.uceuci is None:
        mapping = {}
        for uci in self.ucis:
            for uce in uci.uces:
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    owner = self.uceuci[uceid]
    return self.ucis[owner].etoiles
315 def make_lems(self, lem = True) :
316 log.info('make lems')
319 for forme in self.formes :
320 if self.formes[forme].lem in self.lems :
321 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
322 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
324 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
326 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
328 def make_lems_from_dict(self, dictionnaire, dolem = True) :
329 log.info('make lems from dict')
331 for forme in self.formes :
332 if self.formes[forme].forme in dictionnaire :
333 lem = dictionnaire[forme][0]
334 gram = dictionnaire[forme][1]
335 elif forme.isdigit() :
341 self.formes[forme].lem = lem
342 self.formes[forme].gram = gram
344 if self.formes[forme].lem in self.lems :
345 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
346 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
348 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
350 self.lems[forme] = Lem(self, self.formes[forme])
def make_idformes(self):
    """Rebuild self.idformes: a dict from each entry's `ident` to the entry itself."""
    by_ident = {}
    for forme in self.formes:
        word = self.formes[forme]
        by_ident[word.ident] = word
    self.idformes = by_ident
def make_iduces(self):
    """Build self.iduces, a dict from uce ident to uce, unless it already exists.

    Nothing is done when self.iduces is not None.
    """
    if self.iduces is not None:
        return
    by_ident = {}
    for uci in self.ucis:
        for uce in uci.uces:
            by_ident[uce.ident] = uce
    self.iduces = by_ident
359 def make_lexitable(self, mineff, etoiles, gram = 0) :
364 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams]
365 etuces = [[] for et in etoiles]
366 for uci in self.ucis :
367 get = list(set(uci.etoiles).intersection(etoiles))
369 log.info('2 variables sur une ligne')
371 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
372 etuces = [set(val) for val in etuces]
375 deff = self.getlemuceseff(lem)
377 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
378 tab.insert(0, [''] + etoiles)
381 def make_tgen_table(self, tgen, etoiles, tot = None):
382 lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
383 sets = [set(cl) for cl in lclasses]
384 totoccurrences = dict([[val, 0] for val in etoiles])
386 for forme in self.formes :
387 formeuceeff = self.getformeuceseff(forme)
388 for i, classe in enumerate(lclasses) :
389 concern = sets[i].intersection(formeuceeff.keys())
391 totoccurrences[etoiles[i]] += sum([formeuceeff[uce] for uce in concern])
392 #tgenoccurrences = dict([[val, 0] for val in etoiles])
395 tgenoccurrences[t] = dict([[val, 0] for val in etoiles])
397 lemuceeff = self.getlemuceseff(lem)
398 for i, classe in enumerate(lclasses) :
399 concern = sets[i].intersection(lemuceeff.keys())
401 tgenoccurrences[t][etoiles[i]] += sum([lemuceeff[uce] for uce in concern])
402 return tgenoccurrences, totoccurrences
404 def make_tgen_profile(self, tgen, ucecl, uci = False) :
405 log.info('tgen/classes')
407 tab = [[lem] + [len(set(self.gettgentxt(tgen[lem])).intersection(classe)) for classe in ucecl] for lem in tgen]
409 tab = [[lem] + [len(set(self.gettgenst(tgen[lem])).intersection(classe)) for classe in ucecl] for lem in tgen]
410 tab = [[line[0]] + [val for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
414 #while nam + `i` in tgen :
417 #last = [nam] + [`len(classe)` for classe in ucecl]
419 #line0 = ['tgen'] + ['_'.join(['cluster', `i+1`]) for i in range(len(ucecl))]
421 #with open(fileout, 'w') as f :
422 # f.write('\n'.join(['\t'.join(line) for line in tab]).encode(self.parametres['syscoding']))
424 def make_efftype_from_etoiles(self, etoiles) :
426 etuces = [[] for et in etoiles]
427 for uci in self.ucis :
428 get = list(set(uci.etoiles).intersection(etoiles))
430 return '2 variables sur la meme ligne'
432 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
433 etuces = [set(val) for val in etuces]
434 for lem in self.lems :
435 deff = self.getlemuceseff(lem)
437 gram = self.lems[lem].gram
439 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
441 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
442 tabout = [[gram] + dtype[gram] for gram in dtype]
443 tabout.insert(0, [''] + etoiles)
446 def make_uceactsize(self, actives) :
447 res = self.getalluces()
450 deff = self.getlemuceseff(lem)
452 ucesize[uce] = ucesize.get(uce, 0) + 1
455 def make_uc(self, actives, lim1, lim2) :
456 uceactsize = self.make_uceactsize(actives)
462 for uce in [uce for uci in self.ucis for uce in uci.uces] :
463 if uce.para == lastpara :
465 last1 += uceactsize.get(uce.ident,0)
466 uc1[-1].append(uce.ident)
468 uc1.append([uce.ident])
471 last2 += uceactsize.get(uce.ident, 0)
472 uc2[-1].append(uce.ident)
474 uc2.append([uce.ident])
477 last1 = uceactsize.get(uce.ident, 0)
478 last2 = uceactsize.get(uce.ident, 0)
480 uc1.append([uce.ident])
481 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build two uc groupings, write their matrices and their uce/uc listings.

    self.make_uc(actives, sizeuc1, sizeuc2) supplies the two groupings; each
    is passed to self.write_ucmatrix() with its output path (uc1out and
    uc2out).  For each grouping a listing is then written (listuce1out and
    listuce2out): a 'uce;uc' header followed by one 'uce;index' line per
    uce, where index is the position of the group that contains it.

    Returns the number of groups of each grouping as a pair.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    # Both listings are prepared before any listing file is opened.
    listings = []
    for groups in (uc1, uc2):
        rows = [['uce', 'uc']]
        for i, ucl in enumerate(groups):
            for uce in ucl:
                rows.append([repr(uce), repr(i)])
        listings.append(rows)
    with open(listuce1out, 'w') as f:
        f.write('\n'.join([';'.join(line) for line in listings[0]]))
    with open(listuce2out, 'w') as f:
        f.write('\n'.join([';'.join(line) for line in listings[1]]))
    return len(uc1), len(uc2)
497 def write_ucmatrix(self, uc, actives, fileout) :
498 log.info('write uc matrix %s' % fileout)
499 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
502 with open(fileout + '~', 'w+') as f :
503 for i, lem in enumerate(actives) :
504 for uce in self.getlemuces(lem):
505 if (uces_uc[uce], i) not in deja_la :
507 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
508 deja_la[(uces_uc[uce], i)] = 0
510 with open(fileout, 'w') as ffin :
511 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
514 os.remove(fileout + '~')
517 def export_corpus(self, outf) :
518 #outf = 'export_corpus.txt'
520 res = self.getalluces()
524 with open(outf,'w') as f :
526 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
527 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
528 elif self.iduces[uce[0]].uci != actuci :
529 actuci = self.iduces[uce[0]].uci
530 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
531 actpara = self.iduces[uce[0]].para
532 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
535 actpara = self.iduces[uce[0]].para
536 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
537 elif self.iduces[uce[0]].para != actpara :
538 actpara = self.iduces[uce[0]].para
540 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
def export_meta_table(self, outf):
    """Write a tab-separated table of the `etoiles` of every uci to outf.

    Each uci gives one line: the repr() of its position in self.ucis,
    followed by its etoiles without the first one.  A header line
    'column_0', 'column_1', ... as wide as the longest line comes first.
    The text is encoded with self.parametres['syscoding'] before writing.
    """
    rows = []
    for i, text in enumerate(self.ucis):
        rows.append([repr(i)] + text.etoiles[1:])
    width = max([len(row) for row in rows])
    header = ['column_%i' % i for i in range(width)]
    table = [header] + rows
    with open(outf, 'w') as f:
        content = '\n'.join(['\t'.join(line) for line in table])
        f.write(content.encode(self.parametres['syscoding']))
550 def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) :
552 for i, lc in enumerate(self.lc) :
555 for uce in self.lc0 :
558 res = self.getalluces()
561 res = self.getallucis()
562 with open(outf, 'w') as f :
566 actuci = self.iduces[uce[0]].uci
570 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
572 etline = ' '.join(self.ucis[actuci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
574 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[actuci].etoiles[1:]])
575 f.write(etline.encode(self.parametres['syscoding']) + '\n')
576 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
578 def export_classe(self, outf, classe, lem = False, uci = False) :
579 sts = self.lc[classe - 1]
581 res = self.getconcorde(sts)
584 res = self.getuciconcorde(sts)
585 with open(outf, 'w') as f :
589 f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n')
591 f.write(' '.join(self.ucis[uce[0]].etoiles).encode(self.parametres['syscoding']) + '\n')
593 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
594 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
596 def export_owledge(self, rep, classe, lem = False, uci = False) :
597 sts = self.lc[classe - 1]
599 res = self.getconcorde(sts)
602 res = self.getuciconcorde(sts)
606 outf = '.'.join([`ident`, 'txt'])
607 outf = os.path.join(rep, outf)
609 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
610 with open(outf, 'w') as f :
611 f.write(guce.encode('cp1252', errors = 'replace'))
613 def export_tropes(self, fileout, classe, lem = False, uci = False) :
614 sts = self.lc[classe - 1]
616 res = self.getconcorde(sts)
619 res = self.getuciconcorde(sts)
620 with open(fileout, 'w') as f :
624 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
625 f.write(guce.encode('cp1252', errors = 'replace'))
628 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
629 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
631 with open(outfile + '~', 'w+') as f :
632 for i, lem in enumerate(actives) :
633 for uce in sorted(self.getlemuces(lem)) :
635 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
637 with open(outfile, 'w') as ffin :
638 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
641 os.remove(outfile + '~')
643 with open(listuce, 'w') as f :
644 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
646 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
647 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
649 with open(outfile + '~', 'w+') as f :
650 for i, lem in enumerate(actives) :
651 for uci in sorted(self.getlemucis(lem)) :
653 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
655 with open(outfile, 'w') as ffin :
656 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
659 os.remove(outfile + '~')
661 with open(listuci, 'w') as f :
662 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
664 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
665 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
667 duces = dict([[uce, i] for i, uce in enumerate(uces)])
668 with open(outfile + '~', 'w+') as f :
669 for i, lem in enumerate(actives) :
670 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
672 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
674 with open(outfile, 'w') as ffin :
675 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
678 os.remove(outfile + '~')
680 def make_table_with_classe(self, uces, list_act, uci = False) :
681 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
682 uces = dict([[uce, i] for i, uce in enumerate(uces)])
684 getlem = self.getlemucis
686 getlem = self.getlemuces
687 for i, lem in enumerate(list_act) :
688 lemuces = list(set(getlem(lem)).intersection(uces))
690 table_uce[uces[uce]][i] = 1
691 table_uce.insert(0, list_act)
694 def make_pondtable_with_classe(self, uces, list_act) :
695 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
696 uces = dict([[uce, i] for i, uce in enumerate(uces)])
697 for i, lem in enumerate(list_act) :
698 uceseff = self.getlemuceseff(lem)
699 lemuces = list(set(uceseff.keys()).intersection(uces))
701 table_uce[uces[uce]][i] = uceseff[uce]
702 table_uce.insert(0, list_act)
705 def parse_active(self, gramact, gramsup = None) :
706 log.info('parse actives')
707 for lem in self.lems :
708 if lem.startswith('_') and lem.endswith('_') :
709 self.lems[lem].act = 2
710 elif self.lems[lem].gram in gramact :
711 self.lems[lem].act = 1
712 elif gramsup is not None and self.lems[lem].gram not in gramact:
713 if self.lems[lem].gram in gramsup :
714 self.lems[lem].act = 2
716 self.lems[lem].act = 0
718 self.lems[lem].act = 2
720 def make_actives_limit(self, limit, key = 1) :
721 if self.idformes is None :
723 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
725 def make_actives_nb(self, nbmax, key) :
726 log.info('make_actives_nb : %i - %i' % (nbmax,key))
727 if self.idformes is None :
729 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
730 self.activenb = len(allactives)
731 allactives = sorted(allactives, reverse = True)
732 if self.activenb == 0 :
734 if len(allactives) <= nbmax :
735 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
736 return [val[1] for val in allactives], allactives[-1][0]
738 effs = [val[0] for val in allactives]
739 if effs.count(effs[nbmax - 1]) > 1 :
740 lim = effs[nbmax - 1] + 1
744 stop = effs.index(lim)
751 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
752 return [val[1] for val in allactives[0:stop + 1]], lim
754 def make_and_write_profile(self, actives, ucecl, fileout, uci = False) :
755 log.info('formes/classes')
757 tab = [[lem] + [len(set(self.getlemucis(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
759 tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
760 tab = [[line[0]] + [`val` for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
761 with open(fileout, 'w') as f :
762 f.write('\n'.join([';'.join(line) for line in tab]).encode(self.parametres['syscoding']))
764 def make_etoiles(self) :
766 for uci in self.ucis :
767 etoiles.update(uci.etoiles[1:])
770 def make_themes(self):
772 for uci in self.ucis :
773 themes.update(uci.paras)
776 def make_etoiles_dict(self) :
777 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
779 for etoile in etoiles :
780 et = etoile.split('_')
783 endet = '_'.join(et[1:])
784 if etoile in det[et[0]] :
785 det[et[0]][etoile] += 1
787 det[et[0]][etoile] = 1
792 endet = '_'.join(et[1:])
793 det[et[0]] = {etoile :1}
798 def make_theme_dict(self):
799 themes = [val for uci in self.ucis for val in uci.paras]
801 for theme in themes :
802 th = theme.split('_')
805 endth = '_'.join(th[1:])
806 if theme in det[th[0]] :
807 det[th[0]][theme] += 1
809 det[th[0]][theme] = 1
814 endth = '_'.join(th[1:])
815 det[th[0]] = {theme:1}
820 def make_etline(self, listet) :
821 etuces = [[] for et in listet]
822 for uci in self.ucis :
823 get = list(set(uci.etoiles).intersection(listet))
825 return '2 variables sur la meme ligne'
827 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
830 def make_and_write_profile_et(self, ucecl, fileout, uci = False) :
831 log.info('etoiles/classes')
833 etoileuces = self.getetoileuces()
835 etoileuces = self.getetoileucis()
836 etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1])
837 with open(fileout, 'w') as f :
838 f.write('\n'.join([';'.join([et] + [`len(set(etoileuces[et]).intersection(classe))` for classe in ucecl]) for et in etoileuces]).encode(self.parametres['syscoding']))
839 #etoiles = self.make_etoiles()
840 #with open(fileout, 'w') as f :
841 # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
843 def make_colored_corpus(self, uci = False) :
845 for i, lc in enumerate(self.lc) :
848 for uce in self.lc0 :
850 color = ['black'] + colors[len(self.lc) - 1]
852 <meta http-equiv="content-Type" content="text/html; charset=%s" />
854 ''' % sys.getdefaultencoding()
856 res = self.getalluces()
861 if self.iduces[uce[0]].uci != actuci :
862 actuci = self.iduces[uce[0]].uci
863 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
864 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
866 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
868 res = self.getallucis()
871 if self.ucis[uce[0]].ident != actuci :
872 actuci = self.ucis[uce[0]].ident
873 txt += '<br><hr>' + ' '.join(self.ucis[self.ucis[uce[0]].ident].etoiles) + '<br><br>'
874 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
876 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
877 return txt + '\n</body></html>'
879 def count_from_list(self, l, d) :
887 def count_from_list_cl(self, l, d, a, clnb) :
896 def find_segments(self, taille_segment, taille_limite) :
898 for uce in self.getalluces() :
900 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
901 l = [[d[val], val] for val in d if d[val] >= 3]
904 if len(l) > taille_limite :
905 l = l[-taille_limite:]
908 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite, uci = False):
911 concorde = self.getconcorde
913 concorde = self.getuciconcorde
914 for uce in concorde(list_uce) :
916 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
917 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
920 if len(l) > taille_limite :
921 l = l[-taille_limite:]
924 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
926 for b, classe in enumerate(self.lc) :
927 for uce in self.getconcorde(classe) :
930 uce = [self.formes[forme].lem for forme in uce]
931 for taille_segment in range(lenmin,lenmax) :
932 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
933 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
934 with open(fileout, 'w') as f :
935 f.write('\n'.join([';'.join(line) for line in result]))
937 def make_proftype(self, outf) :
939 for lem in self.lems :
940 gram = self.lems[lem].gram
942 res[gram] = [0 for val in self.lc]
943 lemuceeff = self.getlemuceseff(lem)
944 for i, classe in enumerate(self.lc) :
945 concern = set(classe).intersection(lemuceeff.keys())
946 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
947 res = [[gram] + [`val` for val in res[gram]] for gram in res]
949 with open(outf, 'w') as f :
950 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
953 def make_ucecl_from_R(self, filein) :
954 with open(filein, 'rU') as f :
959 line = line.replace('\n', '').replace('"', '').split(';')
960 self.lc.append([int(line[0]) - 1, int(line[1])])
961 classesl = [val[1] for val in self.lc]
963 self.lc = sorted(self.lc, key=itemgetter(1))
964 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
965 self.lc0 = self.lc.pop(0)
968 def get_stat_by_cluster(self, outf, lclasses = None) :
969 log.info('get_stat_by_cluster')
970 if lclasses is None :
973 occurrences = dict([[i + 1, 0] for i in range(len(lclasses))])
974 formescl = dict([[i + 1, 0] for i in range(len(lclasses))])
975 hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))])
976 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)])
977 sets = [set(cl) for cl in lclasses]
978 for forme in self.formes :
979 formeuceeff = self.getformeuceseff(forme)
980 for i, classe in enumerate(lclasses) :
981 concern = sets[i].intersection(formeuceeff.keys())
983 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
985 if self.formes[forme].freq == 1 :
987 log.info('%f' % (time() - t1))
988 if outf is not None :
989 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
990 with open(outf, 'w') as f :
993 return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences]
995 def get_stat_by_et(self, outf, etoiles) :
996 lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
997 stats = self.get_stat_by_cluster(None, lclasses)
998 stats = [[etoiles[i]] + val for i, val in enumerate(stats)]
1000 def gethapaxbyet(self, etoiles) :
1001 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
1003 for uce in hapaxuces :
1004 if uce in hucesdict :
1008 etuces = [[] for et in etoiles]
1009 for uci in self.ucis :
1010 get = list(set(uci.etoiles).intersection(etoiles))
1012 return '2 variables sur la meme ligne'
1014 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
1015 etuces = [set(val) for val in etuces]
1016 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
1018 def gethapaxuces(self) :
1019 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
1020 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
1022 for i,uce in enumerate(hapaxuces) :
1023 if uce in hucesdict :
1024 hucesdict[uce][0] += 1
1025 hucesdict[uce][1].append(hapax[i])
1027 hucesdict[uce] = [1,[hapax[i]]]
1029 for uce in hucesdict :
1030 if hucesdict[uce][0] in huces :
1031 huces[hucesdict[uce][0]].append(uce)
1033 huces[hucesdict[uce][0]] = [uce]
1034 huces = zip(huces, huces.values())
1035 huces.sort(reverse=True)
1039 for nb in huces[0:4] :
1040 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
1042 res = self.getconcorde([uce])
1044 ucetxt = ' ' + row[1] + ' '
1046 for hap in hucesdict[uce][1] :
1047 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
1048 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
1049 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
1050 txt += '<p>'+ucetxt+'</p>\n'
1054 with open('/tmp/testhapxuce.html','w') as f :
def export_dictionary(self, fileout, syscoding) :
    """Dump the dictionary of forms to `fileout` as tab-separated text.

    Each entry of self.formes yields one line holding, in this order:
    the form, its lemma, its grammatical type and its frequency.
    Lines are sorted in descending order on (frequency, form, lemma,
    grammatical type), so the most frequent forms come first.

    fileout   -- path of the file to (over)write
    syscoding -- name of the encoding applied to the text before writing
    """
    rows = []
    for name in self.formes :
        entry = self.formes[name]
        # frequency goes first so that it drives the sort
        rows.append([entry.freq, name, entry.lem, entry.gram])
    rows.sort(reverse = True)
    # move the frequency to the last column, as text
    lines = ['\t'.join(row[1:] + [repr(row[0])]) for row in rows]
    with open(fileout, 'w') as out :
        out.write('\n'.join(lines).encode(syscoding))
1064 def export_lems(self, fileout, syscoding) :
1065 self.make_idformes()
1066 listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems]
1068 with open(fileout, 'w') as f :
1069 f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding))
1074 def __init__(self, corpus) :
1075 ucinb = corpus.getucinb()
1076 ucisize = corpus.getucisize()
1077 ucimean = float(sum(ucisize))/float(ucinb)
1078 detoile = corpus.make_etoiles_dict()
1081 def __init__(self, iduci, line, paraset = None) :
1083 self.etoiles = line.split()
1085 if paraset is not None :
1086 self.paras = paraset.split()
1091 def __init__(self, iduce, idpara, iduci) :
1097 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
1100 self.gram = gramtype
1103 if freq is not None :
def __init__(self, parent, forme) :
    """Create a lemma from its first form.

    parent -- accepted for the caller's convenience, not used here
    forme  -- object providing `ident`, `freq`, `gram` and `act`

    self.formes maps the identifier of each attached form to its
    frequency; `gram`, `freq` and `act` start as copies of the values
    carried by `forme`.
    """
    ident, freq = forme.ident, forme.freq
    self.formes = {ident : freq}
    self.gram = forme.gram
    self.freq = freq
    self.act = forme.act
def add_forme(self, forme) :
    """Attach one more form to this lemma.

    The frequency of `forme` is stored under its identifier in
    self.formes (replacing any previous value for that identifier) and
    is added to the lemma's total frequency.
    """
    added = forme.freq
    self.formes.update({forme.ident : added})
    self.freq = self.freq + added
1119 def decouperlist(chaine, longueur, longueurOptimale) :
1121 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
1122 Si on trouve un '$', c'est fini.
1123 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
1125 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
1126 dsep = dict([[val[0],val[1]] for val in separateurs])
1127 trouve = False # si on a trouvé un bon séparateur
1128 iDecoupe = 0 # indice du caractere ou il faut decouper
1130 longueur = min(longueur, len(chaine) - 1)
1131 chaineTravail = chaine[:longueur + 1]
1133 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
1136 indice = chaineTravail.index(u'$')
1138 iDecoupe = indice - 1
1143 caractere = chaineTravail[nbCar]
1144 distance = abs(longueurOptimale - nbCar) + 1
1145 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
1146 if caractere in dsep :
1147 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
1148 meilleur[0] = caractere
1149 meilleur[1] = dsep[caractere]
1154 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
1156 meilleur[1] = dsep[' ']
1163 #if meilleur[0] != ' ' :
1164 # fin = chaine[iDecoupe + 1:]
1165 # retour = chaineTravail[:iDecoupe]
1167 fin = chaine[iDecoupe + 1:]
1168 retour = chaineTravail[:iDecoupe + 1]
1169 return len(retour) > 0, retour, fin
1170 # si on a rien trouvé
1171 return False, chaine, ''
1173 def testetoile(line) :
1174 return line.startswith(u'****')
1177 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt) :
    """Turn `txt` into a list of whitespace-separated tokens ending with u'$'.

    The trailing u'$' is the end marker searched for by decouperlist.
    """
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
1182 def prep_txtcharact(txt) :
1187 Class for building a corpus
1189 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
1190 log.info('begin building corpus...')
1191 self.lexique = lexique
1192 self.expressions = expressions
1194 self.corpus = Corpus(self, parametres_corpus)
1195 self.infile = infile
1197 self.lim = parametres_corpus.get('lim', 1000000)
1198 self.encoding = parametres_corpus['encoding']
1199 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
1200 self.corpus.pathout.createdir(parametres_corpus['pathout'])
1201 self.corpus.parametres['uuid'] = str(uuid4())
1202 self.corpus.parametres['corpus_name'] = parametres_corpus['corpus_name']#os.path.split(self.corpus.parametres['pathout'])[1]
1203 self.corpus.parametres['type'] = 'corpus'
1204 if self.corpus.parametres['keep_ponct'] :
1205 self.ponctuation_espace = [' ', '']
1207 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
1209 self.tolist = self.corpus.parametres.get('tolist', 0)
1216 def prep_makeuce(self) :
1217 method = self.corpus.parametres.get('ucemethod', 0)
1219 self.decouper = decouperlist
1220 self.prep_txt = prep_txtlist
1221 self.ucesize = self.corpus.parametres.get('ucesize', 40)
1223 self.decouper = decoupercharact
1224 self.prep_txt = prep_txtcharact
1225 self.ucesize = self.corpus.parametres.get('ucesize', 240)
1226 log.info('method uce : %s' % method)
1231 self.read_corpus(self.infile)
1232 except Warning, args :
1233 log.info('pas kool %s' % args)
1237 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
1238 self.time = time() - t1
1240 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
1241 log.info('time : %f' % (time() - t1))
1244 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
1245 self.cf = self.conn_f.cursor()
1246 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1247 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
1248 self.conn_f.commit()
1249 self.cf = self.conn_f.cursor()
1250 self.cf.execute('PRAGMA temp_store=MEMORY;')
1251 self.cf.execute('PRAGMA journal_mode=MEMORY;')
1252 self.cf.execute('PRAGMA synchronous = OFF;')
1253 self.cf.execute('begin')
1254 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
1255 self.c = self.conn.cursor()
1256 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1258 self.c = self.conn.cursor()
1259 self.c.execute('PRAGMA temp_store=MEMORY;')
1260 self.c.execute('PRAGMA journal_mode=MEMORY;')
1261 self.c.execute('PRAGMA synchronous = OFF;')
1262 self.c.execute('begin')
1265 #commit index and close db
1267 self.conn_f.commit()
1268 self.cf.execute('CREATE INDEX iduces ON uces (id);')
1269 self.cf.execute('CREATE INDEX ideff ON eff (id);')
1273 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
1274 self.ccorpus = self.conn_corpus.cursor()
1275 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
1276 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
1277 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
1278 self.conn_corpus.commit()
1279 self.ccorpus = self.conn_corpus.cursor()
1280 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
1281 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
1282 self.ccorpus.execute('PRAGMA synchronous = OFF;')
1283 self.ccorpus.execute('begin')
1284 self.backup_corpus()
1285 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
1286 self.conn_corpus.commit()
1287 self.conn_corpus.close()
1288 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self) :
    """Fill self.cleans with the cleaning steps enabled in the corpus parameters.

    Steps are appended in a fixed order: dolower, firstclean, docharact,
    make_expression, doapos, dotiret.  Every step except docharact is
    enabled unless its option ('lower', 'firstclean', 'expressions',
    'apos', 'tiret') is present and false.  docharact is driven by the
    mandatory 'charact' option (a missing key raises KeyError); when it
    is enabled, self.rule receives the 'keep_caract' option or the
    built-in default character class.
    """
    options = self.corpus.parametres

    def enable(option, step_name) :
        # default-on step: only an explicit false value switches it off
        if options.get(option, 1) :
            self.cleans.append(getattr(self, step_name))

    enable('lower', 'dolower')
    enable('firstclean', 'firstclean')
    if options['charact'] :
        self.rule = options.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        self.cleans.append(self.docharact)
    enable('expressions', 'make_expression')
    enable('apos', 'doapos')
    enable('tiret', 'dotiret')
1305 def make_expression(self,txt) :
1306 for expression in self.expressions:
1307 if expression in txt :
1308 txt = txt.replace(expression, self.expressions[expression][0])
1311 def dolower(self, txt) :
def docharact(self, txt) :
    """Replace each run of characters matched by the self.rule class with one space.

    self.rule is the inside of a regular-expression character class
    (without the brackets).  The default rule set by buildcleans starts
    with '^', so in that case the runs being replaced are the characters
    that are NOT in the list of kept characters.
    """
    pattern = u''.join([u"[", self.rule, u"]+"])
    return re.compile(pattern).sub(' ', txt)
def doapos(self, txt) :
    """Return `txt` with every straight apostrophe replaced by a space."""
    apostrophe = u"'"
    blank = u' '
    return txt.replace(apostrophe, blank)
def dotiret(self, txt) :
    """Return `txt` with every hyphen ('-') replaced by a space."""
    hyphen = u'-'
    blank = u' '
    return txt.replace(hyphen, blank)
def firstclean(self, txt) :
    """Normalise a few characters and put spaces around punctuation.

    The substitutions are applied one after the other, in the order of
    the table below.  The order matters: a three-dot ellipsis must be
    turned into the u'£$£' token before single dots are spaced out.
    u'£$£' is the token listed among the separators used by decouperlist.
    """
    substitutions = (
        (u'’', "'"),           # typographic apostrophe -> straight one
        (u'œ', u'oe'),
        ('...', u' £$£ '),     # ellipsis written as three dots
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        (u'…', u' £$£ '),      # single-character ellipsis
    )
    for old, new in substitutions :
        txt = txt.replace(old, new)
    return txt
1330 def make_cleans(self, txt) :
1331 for clean in self.cleans :
1335 def backup_uce(self) :
1336 if self.corpus.idformesuces != {} :
1337 log.info('backup %i' % len(self.corpus.idformesuces))
1338 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
1339 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
1340 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
1341 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
1342 self.corpus.idformesuces = {}
1345 def backup_corpus(self) :
1346 log.info('start backup corpus')
1348 for uci in self.corpus.ucis :
1349 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
1350 for uce in uci.uces :
1351 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
1352 for forme in self.corpus.formes :
1353 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
1354 log.info('%f' % (time() - t))
def dofinish(self) :
    """Store the final summary of the build in self.corpus.parametres.

    Keys written:
      'date'        -- current local time, in ctime() format
      'time'        -- self.time (seconds) rendered as 'Hh Mm Ss'
      'ucinb'       -- value of corpus.getucinb()
      'ucenb'       -- value of corpus.getucenb()
      'occurrences' -- value of corpus.gettotocc()
      'formesnb'    -- number of entries in corpus.formes
      'hapax'       -- hapax count with its share of the forms and of
                       the occurrences

    When the corpus has no form or no occurrence, the corresponding
    share is reported as 0 % instead of raising ZeroDivisionError.
    """
    corpus = self.corpus
    params = corpus.parametres
    params['date'] = datetime.datetime.now().ctime()
    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    params['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
    params['ucinb'] = corpus.getucinb()
    params['ucenb'] = corpus.getucenb()
    params['occurrences'] = corpus.gettotocc()
    params['formesnb'] = len(corpus.formes)
    hapaxnb = corpus.gethapaxnb()

    def percent(part, total) :
        # an empty corpus has nothing to divide by: report 0 %
        if not total :
            return 0.0
        return (float(part) / total) * 100

    pourhapaxf = percent(hapaxnb, params['formesnb'])
    pourhapaxocc = percent(hapaxnb, params['occurrences'])
    params['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
1370 class BuildSubCorpus(BuildCorpus):
1371 def __init__(self, corpus, parametres, dlg = None) :
1372 log.info('begin subcorpus...')
1376 self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : corpus.parametres['originalpath'], 'encoding' : corpus.parametres['encoding']})
1378 self.parametres = parametres
1379 self.encoding = corpus.parametres['encoding']
1380 self.corpus.parametres['corpus_name'] = parametres['corpus_name']
1381 self.corpus.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = parametres['pathout'])
1382 self.corpus.pathout.createdir(parametres['pathout'])
1383 self.corpus.parametres['pathout'] = parametres['pathout']
1384 self.corpus.parametres['meta'] = parametres.get('meta', False)
1385 self.corpus.parametres['uuid'] = str(uuid4())
1386 if parametres.get('frommeta', False) :
1387 print 'make subtexts'
1388 self.corpus.ucis = [CopyUci(uci) for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()]
1389 elif parametres.get('fromtheme', False) :
1390 print 'make subtexts from theme'
1392 for uci in self.ori.ucis :
1393 if uci.paras != [] :
1396 for et in uci.paras :
1397 if et in parametres['meta'] :
1398 newuce += [CopyUce(uce) for uce in uci.uces if uce.para == idpara]
1404 nuci.paras = newpara
1405 self.corpus.ucis.append(nuci)
1408 elif parametres.get('fromclusters', False) :
1409 self.parametres['uceids'] = [st for i in self.parametres['meta'] for st in self.parametres['lc'][i]]
1411 elif parametres.get('fromuceids', False) :
1417 def fromuceids(self):
1419 dictucekeep = dict(zip(self.parametres['uceids'], self.parametres['uceids']))
1421 for uci in self.ori.ucis :
1422 if uci.paras == [] :
1423 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1426 nuci.uces = keepuces
1427 self.corpus.ucis.append(nuci)
1432 for et in uci.paras :
1433 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1441 nuci.paras = newpara
1442 self.corpus.ucis.append(nuci)
1444 def read_corpus(self, infile = None):
1445 self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces]
1451 print 'redo text, para and st ident'
1452 for uci in self.corpus.ucis :
1453 uci.ident = ident_uci
1455 for uce in uci.uces :
1457 if uce.para != lastpara :
1460 uce.para = ident_para
1462 uce.para = ident_para
1463 newuceident[uce.ident] = ident_uce
1464 uce.ident = ident_uce
1466 print 'backup st text and forms'
1467 for row in self.ori.getconcorde(self.olduceid) :
1468 self.c.execute('INSERT INTO uces VALUES(?,?);', (`newuceident[row[0]]`, row[1]))
1469 for word in row[1].split() :
1470 self.corpus.add_word_from_forme(self.ori.formes[word], newuceident[row[0]])
1474 class BuildFromAlceste(BuildCorpus) :
1475 def read_corpus(self, infile) :
1476 if self.dlg is not None :
1477 self.dlg.Pulse('textes : 0 - segments : 0')
1480 if self.corpus.parametres['ucimark'] == 0 :
1481 self.testuci = testetoile
1482 elif self.corpus.parametres['ucimark'] == 1 :
1483 self.testuci = testint
1489 with codecs.open(infile, 'r', self.encoding) as f :
1490 for linenb, line in enumerate(f) :
1491 line = line.rstrip('\n\r')#FIXME .lstrip(codecs.BOM).lstrip(codecs.BOM_UTF8)
1492 if self.testuci(line) :
1495 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1497 self.corpus.ucis.append(Uci(iduci, line))
1500 if self.corpus.ucis[-1].uces == [] :
1501 log.info(u'Empty text : %i' % linenb)
1503 self.corpus.ucis.pop()
1504 self.corpus.ucis.append(Uci(iduci, line))
1505 if self.dlg is not None :
1506 if not (iduci + 1) % 10 :
1507 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
1508 elif line.startswith(u'-*') :
1511 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1514 self.corpus.ucis[-1].paras.append(line.split()[0])
1516 raise Exception('paragrapheOT %i' % linenb)
1517 elif line.strip() != '' and iduci != -1 :
1519 if txt != [] and iduci != -1 :
1520 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1525 self.corpus.ucis.pop()
1526 log.info(Exception("Empty text %i" % linenb))
1528 raise Exception('EmptyText %i' % linenb)
1529 if iduci != -1 and iduce != -1:
1532 log.info(_(u"No Text in corpus. Are you sure of the formatting ?"))
1533 raise Exception('TextBeforeTextMark %i' % linenb)
1534 except UnicodeDecodeError :
1535 raise Exception("CorpusEncoding")
1537 def treattxt(self, txt, iduce, idpara, iduci) :
1538 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1539 txt = 'laphrasepoursplitter'.join(txt)
1540 txt = self.make_cleans(txt)
1541 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1542 ucetxt = txt.split('laphrasepoursplitter')
1545 txt = self.make_cleans(txt)
1546 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1547 if self.corpus.ucis[-1].paras == [] :
1551 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1552 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1553 if not self.tolist :
1559 self.corpus.add_word(word)
1560 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
1561 if self.last > self.lim :
1564 return iduce, idpara
1566 def make_uces(self, txt, douce = True, keep_ponct = False) :
1567 txt = ' '.join(txt.split())
1570 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1572 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1575 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1576 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1581 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1583 #decouper (list_sep)
1584 #make_uces (decouper)
1585 #treat_txt (make_uces)
1589 def __init__(self, parent, dlg = None) :
1590 self.parent = parent
1592 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1593 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1594 parametres['corpus_name'] = os.path.split(parametres['pathout'])[1]
1595 dial = CorpusPref(parent, parametres)
1596 dial.CenterOnParent()
1597 dial.txtpath.SetLabel(parent.filename)
1598 #dial.repout_choices.SetValue(parametres['pathout'])
1599 self.res = dial.ShowModal()
1600 if self.res == 5100 :
1601 parametres = dial.doparametres()
1602 parametres['originalpath'] = parent.filename
1603 PathOut().createdir(parametres['pathout'])
1604 if parametres.get('dictionary', False) :
1605 filein = parametres['dictionary']
1608 if dial.corpusname.GetValue() != '' :
1609 parametres['corpus_name'] = dial.corpusname.GetValue()
1611 ReadLexique(self.parent, lang = parametres['lang'], filein = filein)
1612 if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
1613 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1615 self.parent.expressions = {}
1616 self.parametres = parametres
1619 if self.dlg is not None :
def doanalyse(self) :
    """Build the corpus from the parent's file and return it.

    A BuildFromAlceste object is created with the parent's file name,
    the parameters kept in self.parametres, the parent's lexicon and
    expressions, and self.dlg; its `corpus` attribute is the result.
    """
    owner = self.parent
    builder = BuildFromAlceste(owner.filename, self.parametres, owner.lexique, owner.expressions, dlg = self.dlg)
    return builder.corpus
1626 def __init__(self, parent, corpus, parametres = None, dlg = None):
1627 self.parent = parent
1630 corpus_name = 'Sub' + corpus.parametres['corpus_name']
1631 if dlg is not None :
1632 busy = wx.BusyInfo(_("Please wait...").decode('utf8'), self)
1634 parametres['corpus_name'] = corpus_name
1635 if parametres.get('frommeta', False) :
1636 parametres['meta'] = corpus.make_etoiles()
1637 elif parametres.get('fromtheme', False) :
1638 parametres['meta'] = corpus.make_themes()
1639 elif parametres.get('fromclusters', False) :
1640 parametres['meta'] = [' '.join(['classe', `i`]) for i in range(1,parametres['clnb'] + 1)]
1642 parametres['meta'] = []
1643 if 'fromclusters' not in parametres :
1644 parametres['meta'].sort()
1645 if dlg is not None :
1647 dial = SubTextFromMetaDial(parent, parametres)
1648 self.res = dial.ShowModal()
1649 if self.res == 5100 :
1650 if dial.subcorpusname.GetValue() != '' :
1651 corpus_name = ''.join([l for l in dial.subcorpusname.GetValue() if l.isalnum() or l in ['_']])
1652 if corpus_name != '' :
1653 parametres['corpus_name'] = corpus_name
1655 parametres['corpus_name'] = 'Sub' + corpus.parametres['corpus_name']
1656 pathout = os.path.join(corpus.parametres['pathout'], parametres['corpus_name'])
1658 while os.path.exists(pathout + '_%i' % i) :
1660 parametres['pathout'] = pathout + '_%i' % i
1661 meta = dial.m_listBox1.GetSelections()
1662 if not 'fromclusters' in parametres :
1663 parametres['meta'] = [parametres['meta'][val] for val in meta]
1665 parametres['meta'] = meta
1666 self.parametres = parametres
def doanalyse(self):
    """Build the sub-corpus from self.ori and return it.

    A BuildSubCorpus object is created with the original corpus, the
    parameters kept in self.parametres and self.dlg; its `corpus`
    attribute is the result.
    """
    builder = BuildSubCorpus(self.ori, parametres = self.parametres, dlg = self.dlg)
    return builder.corpus