1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique, progressbar
16 from operator import itemgetter
17 from uuid import uuid4
18 from chemins import PathOut
19 from dialog import CorpusPref, SubTextFromMetaDial, MergeClusterFrame
21 from colors import colors
25 log = logging.getLogger('iramuteq.corpus')
28 def copycorpus(corpus) :
29 log.info('copy corpus')
30 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
31 copy_corpus.ucis = corpus.ucis
32 copy_corpus.formes = corpus.formes
33 copy_corpus.pathout = corpus.pathout
34 copy_corpus.conn_all()
38 return Uce(uce.ident, uce.para, uce.uci)
42 nuci = Uci(uci.ident, '')
43 nuci.etoiles = copy(uci.etoiles)
44 nuci.uces = [CopyUce(uce) for uce in uci.uces]
45 nuci.paras = copy(uci.paras)
54 def __init__(self, parent, parametres = {}, read = False) :
56 self.parametres = parametres
58 self.connformes = None
60 self.conncorpus = None
67 self.idformesuces = {}
72 self.pathout = PathOut(dirout = parametres['pathout'])
75 def add_word(self, word) :
76 if word in self.formes :
77 self.formes[word].freq += 1
78 if self.formes[word].ident in self.idformesuces :
79 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
80 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
82 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
84 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
86 if word in self.parent.lexique :
87 gramtype = self.parent.lexique[word][1]
88 lem = self.parent.lexique[word][0]
95 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
96 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
98 def add_word_from_forme(self, word, stident):
99 if word.forme in self.formes :
100 self.formes[word.forme].freq += 1
101 if self.formes[word.forme].ident in self.idformesuces :
102 if stident in self.idformesuces[self.formes[word.forme].ident] :
103 self.idformesuces[self.formes[word.forme].ident][stident] += 1
105 self.idformesuces[self.formes[word.forme].ident][stident] = 1
107 self.idformesuces[self.formes[word.forme].ident] = {stident: 1}
109 self.formes[word.forme] = Word(word.forme, word.gram, len(self.formes), word.lem)
110 self.idformesuces[self.formes[word.forme].ident] = {stident : 1}
113 """connect corpus to db"""
114 if self.connformes is None :
115 log.info('connexion corpus')
116 self.connuces = sqlite3.connect(self.pathout['uces.db'])
117 self.cuces = self.connuces.cursor()
118 self.connformes = sqlite3.connect(self.pathout['formes.db'])
119 self.cformes = self.connformes.cursor()
120 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
121 self.ccorpus = self.conncorpus.cursor()
122 self.cformes.execute('PRAGMA temp_store=MEMORY;')
123 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
124 self.cformes.execute('PRAGMA synchronous = OFF;')
125 self.cuces.execute('PRAGMA temp_store=MEMORY;')
126 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
127 self.cuces.execute('PRAGMA synchronous = OFF;')
128 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
129 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
130 self.ccorpus.execute('PRAGMA synchronous = OFF;')
132 def read_corpus(self) :
133 log.info('read corpus')
134 self.parametres['syscoding'] = sys.getdefaultencoding()
135 if self.conncorpus is None :
137 res = self.ccorpus.execute('SELECT * FROM etoiles;')
139 self.ucis.append(Uci(row[0], row[1], row[2]))
140 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
142 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
143 res = self.ccorpus.execute('SELECT * FROM formes;')
144 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the uce ids stored in the forms database for one form.

    ``wordid`` is either a form identifier or the form itself as a string;
    a string is first translated to its identifier through ``self.formes``.
    Each matching row of the ``uces`` table holds either a single integer or
    a whitespace-separated string of integers; all of them are returned as
    one flat list of ints, in row order.
    """
    try:
        text_types = basestring  # Python 2: covers both str and unicode
    except NameError:
        text_types = str  # Python 3 has no basestring
    if isinstance(wordid, text_types):
        wordid = self.formes[wordid].ident
    # repr() replaces the backtick shorthand, which Python 3 cannot parse.
    rows = self.cformes.execute(
        'SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    found = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            found.append(cell)
        else:
            found.extend(int(val) for val in cell.split())
    return found
def getworducis(self, wordid):
    """Return the distinct ``uci`` values of the uces returned for a form.

    Every id from ``self.getworduces(wordid)`` is resolved with
    ``self.getucefromid`` and its ``uci`` attribute collected once.
    The order of the returned list is not defined.
    """
    ucis = set()
    for uceid in self.getworduces(wordid):
        ucis.add(self.getucefromid(uceid).uci)
    return list(ucis)
157 def getformeuceseff(self, formeid) :
158 if isinstance(formeid, basestring) :
159 formeid = self.formes[formeid].ident
160 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
161 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
162 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
163 res = self.cformes.execute(query)
164 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
166 for i, uce in enumerate(uces) :
167 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem):
    """Return the distinct uce ids recorded for all forms of a lemma.

    The form ids come from ``self.lems[lem].formes`` and are inlined in the
    ``IN (...)`` clause of the query. Each row of the ``uces`` table holds
    either a single integer or a whitespace-separated string of integers.
    The order of the returned list is not defined.
    """
    # repr() replaces the backtick shorthand, which Python 3 cannot parse.
    formesid = ', '.join([repr(val) for val in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
    found = set()
    for row in self.cformes.execute(query):
        cell = row[0]
        if isinstance(cell, int):
            found.add(cell)
        else:
            found.update(int(val) for val in cell.split())
    return list(found)
176 def gettgenst(self, tgen):
179 if lem in self.lems :
180 formesid += self.lems[lem].formes
182 print 'abscent : %s' % lem
183 query = 'SELECT uces FROM uces where id IN %s ORDER BY id' % str(tuple(formesid))
184 res = self.cformes.execute(query)
185 return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))))
187 def gettgenstprof(self, tgen, classe, i, clnb):
190 if lem in self.lems :
191 lemst = self.getlemuces(lem)
193 if not lem in self.tgenlem :
194 self.tgenlem[lem] = [0] * clnb
195 self.tgenlem[lem][i] = len(set(lemst).intersection(classe))
197 print 'abscent: ',lem
198 return list(set(tgenst))
def gettgentxt(self, tgen):
    """Return the distinct ``uci`` values of the uces matched by ``tgen``.

    The ids come from ``self.gettgenst(tgen)``; each is resolved with
    ``self.getucefromid``. The order of the returned list is not defined.
    """
    ucis = set()
    for uceid in self.gettgenst(tgen):
        ucis.add(self.getucefromid(uceid).uci)
    return list(ucis)
def getlemucis(self, lem):
    """Return the distinct ``uci`` values of the uces returned for a lemma.

    The ids come from ``self.getlemuces(lem)``; each is resolved with
    ``self.getucefromid``. The order of the returned list is not defined.
    """
    ucis = set()
    for uceid in self.getlemuces(lem):
        ucis.add(self.getucefromid(uceid).uci)
    return list(ucis)
208 def getlemuceseff(self, lem, luces = None) :
209 formesid = ', '.join([`val` for val in self.lems[lem].formes])
210 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
211 res = self.cformes.execute(query)
212 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
213 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
214 res = self.cformes.execute(query)
215 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
217 for i, uce in enumerate(uces) :
218 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster):
    """Return how many ids of ``self.lc[cluster]`` are also in
    ``self.getlemuces(lem)``."""
    members = set(self.lc[cluster])
    shared = members.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem):
    """Return the ``freq`` attribute of the entry ``self.lems[lem]``."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid):
    """Return the entry of ``self.idformes`` stored under ``formeid``.

    The index is built on first use by calling ``self.make_idformes()``.
    """
    index_missing = self.idformes is None
    if index_missing:
        self.make_idformes()
    return self.idformes[formeid]
def gettotocc(self):
    """Return the sum of the ``freq`` attribute over every entry of
    ``self.formes``."""
    total = 0
    for forme in self.formes:
        total += self.formes[forme].freq
    return total
def getucemean(self):
    """Return ``self.gettotocc()`` divided by ``self.getucenb()`` as a float."""
    total = float(self.gettotocc())
    count = self.getucenb()
    return total / count
241 return self.ucis[-1].uces[-1].ident + 1
244 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return, for each uci, the summed sizes of its uces.

    Sizes come from ``self.getucesize()`` and are sliced from the ``ident``
    of the uci's first uce to the ``ident`` of its last uce, inclusive.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:last + 1]))
    return totals
def getucesize(self):
    """Return the number of whitespace-separated tokens in the second
    column of each row yielded by ``self.getalluces()``."""
    sizes = []
    for row in self.getalluces():
        sizes.append(len(row[1].split()))
    return sizes
def getconcorde(self, uces):
    """Run a query on the ``uces`` table for the given ids, ordered by id.

    The ids in ``uces`` are inlined in the ``IN (...)`` clause. The value
    returned is whatever ``self.cuces.execute`` returns.
    """
    # repr() replaces the backtick shorthand, which Python 3 cannot parse.
    ids = ', '.join([repr(i) for i in uces])
    return self.cuces.execute('select * from uces where id IN (%s) ORDER BY id;' % ids)
257 def getuciconcorde(self, ucis) :
258 uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis]
259 uces = [[val[0], '\n'.join([row[1] for row in self.getconcorde(val[1])])] for val in uces]
262 def getuciconcorde_uces(self, uciid, uceid) :
263 uces = [uce.ident for uce in self.ucis[uciid].uces]
264 uces = [row for row in self.getconcorde(uces)]
def getwordconcorde(self, word):
    """Return ``self.getconcorde`` applied to the uce ids of ``word``."""
    uces = self.getworduces(word)
    return self.getconcorde(uces)
def getlemconcorde(self, lem):
    """Return ``self.getconcorde`` applied to the uce ids of ``lem``."""
    uces = self.getlemuces(lem)
    return self.getconcorde(uces)
def getalluces(self):
    """Run ``SELECT * FROM uces`` on ``self.cuces`` and return the result
    of ``execute``."""
    query = 'SELECT * FROM uces'
    return self.cuces.execute(query)
def getallucis(self):
    """Return one ``[uci.ident, text]`` pair per entry of ``self.ucis``.

    ``text`` is the newline-joined second column of the rows of
    ``self.getalluces()``, picked by the ``ident`` of each uce of the uci
    (the ident is used as a list position).
    """
    texts = [row[1] for row in self.getalluces()]
    pairs = []
    for uci in self.ucis:
        parts = [texts[uce.ident] for uce in uci.uces]
        pairs.append([uci.ident, '\n'.join(parts)])
    return pairs
def getucesfrometoile(self, etoile):
    """Return the ``ident`` of every uce whose uci lists ``etoile`` in its
    ``etoiles``, in corpus order."""
    selected = []
    for uci in self.ucis:
        if etoile in uci.etoiles:
            selected.extend(uce.ident for uce in uci.uces)
    return selected
283 def getetoileuces(self) :
284 log.info('get uces etoiles')
287 for uci in self.ucis :
288 etoiles = uci.etoiles[1:]
290 if et in etoileuces :
291 etoileuces[et] += [uce.ident for uce in uci.uces]
293 etoileuces[et] = [uce.ident for uce in uci.uces]
295 for et in uci.paras :
296 if et in etoileuces :
297 etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara]
299 etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara]
305 def getetoileucis(self):
307 for uci in self.ucis :
308 etoiles = uci.etoiles[1:]
310 if et in etoileuces :
311 etoileuces[et] += [uci.ident]
313 etoileuces[et] = [uci.ident]
def getucefromid(self, uceid):
    """Return the entry of ``self.iduces`` stored under ``uceid``.

    The index is built on first use by calling ``self.make_iduces()``.
    """
    index_missing = self.iduces is None
    if index_missing:
        self.make_iduces()
    return self.iduces[uceid]
def gethapaxnb(self):
    """Return the number of entries of ``self.formes`` whose ``freq`` is 1."""
    count = 0
    for forme in self.formes:
        if self.formes[forme].freq == 1:
            count += 1
    return count
def getactivesnb(self, key):
    """Return the number of entries of ``self.lems`` whose ``act`` equals
    ``key``."""
    count = 0
    for lem in self.lems:
        if self.lems[lem].act == key:
            count += 1
    return count
325 # def make_lems(self, lem = True) :
326 # log.info('make lems')
328 # for forme in self.formes :
329 # if self.formes[forme].lem in self.lems :
330 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
331 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
333 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the ``etoiles`` of the uci that owns the uce ``uceid``.

    A uce-ident to uci-ident map is built once and kept in ``self.uceuci``;
    the uci ident is then used as a position in ``self.ucis``.
    """
    if self.uceuci is None:
        owners = {}
        for uci in self.ucis:
            for uce in uci.uces:
                owners[uce.ident] = uci.ident
        self.uceuci = owners
    return self.ucis[self.uceuci[uceid]].etoiles
339 def make_lems(self, lem = True) :
340 log.info('make lems')
343 for forme in self.formes :
344 if self.formes[forme].lem in self.lems :
345 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
346 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
348 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
350 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
352 def make_lems_from_dict(self, dictionnaire, dolem = True) :
353 log.info('make lems from dict')
355 for forme in self.formes :
356 if self.formes[forme].forme in dictionnaire :
357 lem = dictionnaire[forme][0]
358 gram = dictionnaire[forme][1]
359 elif forme.isdigit() :
365 self.formes[forme].lem = lem
366 self.formes[forme].gram = gram
368 if self.formes[forme].lem in self.lems :
369 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
370 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
372 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
374 self.lems[forme] = Lem(self, self.formes[forme])
def make_idformes(self):
    """Build ``self.idformes``, mapping each entry's ``ident`` to the entry
    itself for every value of ``self.formes``."""
    index = {}
    for forme in self.formes:
        word = self.formes[forme]
        index[word.ident] = word
    self.idformes = index
def make_iduces(self):
    """Build ``self.iduces``, mapping each uce's ``ident`` to the uce.

    Does nothing when ``self.iduces`` is already set.
    """
    if self.iduces is not None:
        return
    index = {}
    for uci in self.ucis:
        for uce in uci.uces:
            index[uce.ident] = uce
    self.iduces = index
383 def make_lexitable(self, mineff, etoiles, gram = 0) :
384 log.info('making lexical table...')
389 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams]
390 etuces = [[] for et in etoiles]
391 for uci in self.ucis :
392 get = list(set(uci.etoiles).intersection(etoiles))
394 log.info('2 variables sur une ligne')
396 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
397 etuces = [set(val) for val in etuces]
400 deff = self.getlemuceseff(lem)
402 line = [lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
403 if sum(line[1:]) >= mineff :
405 tab.insert(0, [''] + etoiles)
408 def make_tgen_table(self, tgen, etoiles, tot = None):
409 lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
410 sets = [set(cl) for cl in lclasses]
411 totoccurrences = dict([[val, 0] for val in etoiles])
413 for forme in self.formes :
414 formeuceeff = self.getformeuceseff(forme)
415 for i, classe in enumerate(lclasses) :
416 concern = sets[i].intersection(formeuceeff.keys())
418 totoccurrences[etoiles[i]] += sum([formeuceeff[uce] for uce in concern])
419 #tgenoccurrences = dict([[val, 0] for val in etoiles])
422 tgenoccurrences[t] = dict([[val, 0] for val in etoiles])
424 lemuceeff = self.getlemuceseff(lem)
425 for i, classe in enumerate(lclasses) :
426 concern = sets[i].intersection(lemuceeff.keys())
428 tgenoccurrences[t][etoiles[i]] += sum([lemuceeff[uce] for uce in concern])
429 return tgenoccurrences, totoccurrences
431 def make_tgen_profile(self, tgen, ucecl, uci = False) :
432 log.info('tgen/classes')
436 #FIXME : NE MARCHE PLUS CHANGER CA
437 tab = [[lem] + [len(set(self.gettgentxt(tgen[lem])).intersection(classe)) for i, classe in enumerate(ucecl)] for lem in tgen]
439 tab = [[lem] + [len(set(self.gettgenstprof(tgen[lem], classe, i, clnb)).intersection(classe)) for i, classe in enumerate(ucecl)] for lem in tgen]
440 tab = [[line[0]] + [val for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
444 #while nam + `i` in tgen :
447 #last = [nam] + [`len(classe)` for classe in ucecl]
449 #line0 = ['tgen'] + ['_'.join(['cluster', `i+1`]) for i in range(len(ucecl))]
451 #with open(fileout, 'w') as f :
452 # f.write('\n'.join(['\t'.join(line) for line in tab]).encode(self.parametres['syscoding']))
454 def make_efftype_from_etoiles(self, etoiles) :
456 etuces = [[] for et in etoiles]
457 for uci in self.ucis :
458 get = list(set(uci.etoiles).intersection(etoiles))
460 return '2 variables sur la meme ligne'
462 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
463 etuces = [set(val) for val in etuces]
464 for lem in self.lems :
465 deff = self.getlemuceseff(lem)
467 gram = self.lems[lem].gram
469 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
471 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
472 tabout = [[gram] + dtype[gram] for gram in dtype]
473 tabout.insert(0, [''] + etoiles)
476 def make_uceactsize(self, actives) :
477 res = self.getalluces()
480 deff = self.getlemuceseff(lem)
482 ucesize[uce] = ucesize.get(uce, 0) + 1
485 def make_uc(self, actives, lim1, lim2) :
486 uceactsize = self.make_uceactsize(actives)
492 for uce in [uce for uci in self.ucis for uce in uci.uces] :
493 if uce.para == lastpara :
495 last1 += uceactsize.get(uce.ident,0)
496 uc1[-1].append(uce.ident)
498 uc1.append([uce.ident])
501 last2 += uceactsize.get(uce.ident, 0)
502 uc2[-1].append(uce.ident)
504 uc2.append([uce.ident])
507 last1 = uceactsize.get(uce.ident, 0)
508 last2 = uceactsize.get(uce.ident, 0)
510 uc1.append([uce.ident])
511 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build two uc groupings, write their matrices and their uce/uc lists.

    ``self.make_uc(actives, sizeuc1, sizeuc2)`` provides the two groupings;
    each one is passed to ``self.write_ucmatrix`` with its output path
    (``uc1out`` / ``uc2out``). For each grouping a ``;``-separated file
    (``listuce1out`` / ``listuce2out``) is then written with a ``uce;uc``
    header and one line per uce giving the uce and the position of the
    group that contains it.

    Returns the pair ``(len(uc1), len(uc2))``.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i', len(uc1), len(uc2))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    for groups, listout in ((uc1, listuce1out), (uc2, listuce2out)):
        lines = ['uce;uc']
        for i, ucl in enumerate(groups):
            for uce in ucl:
                # repr() replaces the backtick shorthand (Python-2-only).
                lines.append(';'.join([repr(uce), repr(i)]))
        with open(listout, 'w') as f:
            f.write('\n'.join(lines))
    return len(uc1), len(uc2)
527 def write_ucmatrix(self, uc, actives, fileout) :
528 log.info('write uc matrix %s' % fileout)
529 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
532 with open(fileout + '~', 'w+') as f :
533 for i, lem in enumerate(actives) :
534 for uce in self.getlemuces(lem):
535 if (uces_uc[uce], i) not in deja_la :
537 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
538 deja_la[(uces_uc[uce], i)] = 0
540 with open(fileout, 'w') as ffin :
541 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
544 os.remove(fileout + '~')
547 def export_corpus(self, outf) :
548 #outf = 'export_corpus.txt'
550 res = self.getalluces()
554 with open(outf,'w') as f :
556 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
557 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
558 elif self.iduces[uce[0]].uci != actuci :
559 actuci = self.iduces[uce[0]].uci
560 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
561 actpara = self.iduces[uce[0]].para
562 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
565 actpara = self.iduces[uce[0]].para
566 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
567 elif self.iduces[uce[0]].para != actpara :
568 actpara = self.iduces[uce[0]].para
570 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
def export_meta_table(self, outf):
    """Write a tab-separated table of the etoiles of every uci to ``outf``.

    Each data line holds the position of the uci in ``self.ucis`` followed
    by ``etoiles[1:]`` of that uci. The first line is a header
    ``column_0 .. column_N-1`` sized on the longest data line. The text is
    encoded with ``self.parametres['syscoding']`` before being written.
    """
    # repr() replaces the backtick shorthand (Python-2-only syntax).
    metas = [[repr(i)] + text.etoiles[1:] for i, text in enumerate(self.ucis)]
    # max() raises ValueError on an empty sequence: an empty corpus gets an
    # empty header instead of a crash.
    longueur_max = max([len(val) for val in metas]) if metas else 0
    first = ['column_%i' % i for i in range(longueur_max)]
    metas.insert(0, first)
    with open(outf, 'w') as f:
        f.write('\n'.join(['\t'.join(line) for line in metas]).encode(self.parametres['syscoding']))
580 def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) :
582 for i, lc in enumerate(self.lc) :
585 for uce in self.lc0 :
588 res = self.getalluces()
591 res = self.getallucis()
592 with open(outf, 'w') as f :
596 actuci = self.iduces[uce[0]].uci
600 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
602 etline = ' '.join(self.ucis[actuci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
604 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[actuci].etoiles[1:]])
605 f.write(etline.encode(self.parametres['syscoding']) + '\n')
606 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
608 def export_classe(self, outf, classe, lem = False, uci = False) :
609 sts = self.lc[classe - 1]
611 res = self.getconcorde(sts)
614 res = self.getuciconcorde(sts)
615 with open(outf, 'w') as f :
619 f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n')
621 f.write(' '.join(self.ucis[uce[0]].etoiles).encode(self.parametres['syscoding']) + '\n')
623 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
624 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
626 def export_owledge(self, rep, classe, lem = False, uci = False) :
627 sts = self.lc[classe - 1]
629 res = self.getconcorde(sts)
632 res = self.getuciconcorde(sts)
636 outf = '.'.join([`ident`, 'txt'])
637 outf = os.path.join(rep, outf)
639 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
640 with open(outf, 'w') as f :
641 f.write(guce.encode('cp1252', errors = 'replace'))
643 def export_tropes(self, fileout, classe, lem = False, uci = False) :
644 sts = self.lc[classe - 1]
646 res = self.getconcorde(sts)
649 res = self.getuciconcorde(sts)
650 with open(fileout, 'w') as f :
654 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
655 f.write(guce.encode('cp1252', errors = 'replace'))
658 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
659 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
661 with open(outfile + '~', 'w+') as f :
662 for i, lem in enumerate(actives) :
663 for uce in sorted(self.getlemuces(lem)) :
665 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
667 with open(outfile, 'w') as ffin :
668 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
671 os.remove(outfile + '~')
673 with open(listuce, 'w') as f :
674 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
676 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
677 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
679 with open(outfile + '~', 'w+') as f :
680 for i, lem in enumerate(actives) :
681 for uci in sorted(self.getlemucis(lem)) :
683 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
685 with open(outfile, 'w') as ffin :
686 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
689 os.remove(outfile + '~')
691 with open(listuci, 'w') as f :
692 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
694 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
695 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
697 duces = dict([[uce, i] for i, uce in enumerate(uces)])
698 with open(outfile + '~', 'w+') as f :
699 for i, lem in enumerate(actives) :
700 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
702 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
704 with open(outfile, 'w') as ffin :
705 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uces), len(actives), nbl))
708 os.remove(outfile + '~')
710 def make_table_with_classe(self, uces, list_act, uci = False) :
711 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
712 uces = dict([[uce, i] for i, uce in enumerate(uces)])
714 getlem = self.getlemucis
716 getlem = self.getlemuces
717 for i, lem in enumerate(list_act) :
718 lemuces = list(set(getlem(lem)).intersection(uces))
720 table_uce[uces[uce]][i] = 1
721 table_uce.insert(0, list_act)
724 def make_pondtable_with_classe(self, uces, list_act) :
725 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
726 uces = dict([[uce, i] for i, uce in enumerate(uces)])
727 for i, lem in enumerate(list_act) :
728 uceseff = self.getlemuceseff(lem)
729 lemuces = list(set(uceseff.keys()).intersection(uces))
731 table_uce[uces[uce]][i] = uceseff[uce]
732 table_uce.insert(0, list_act)
735 def parse_active(self, gramact, gramsup = None) :
736 log.info('parse actives')
737 for lem in self.lems :
738 if lem.startswith('_') and lem.endswith('_') :
739 self.lems[lem].act = 2
740 elif self.lems[lem].gram in gramact :
741 self.lems[lem].act = 1
742 elif gramsup is not None and self.lems[lem].gram not in gramact:
743 if self.lems[lem].gram in gramsup :
744 self.lems[lem].act = 2
746 self.lems[lem].act = 0
748 self.lems[lem].act = 2
750 def make_actives_limit(self, limit, key = 1) :
751 if self.idformes is None :
753 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
755 def make_actives_nb(self, nbmax, key) :
756 log.info('make_actives_nb : %i - %i' % (nbmax,key))
757 if self.idformes is None :
759 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
760 self.activenb = len(allactives)
761 allactives = sorted(allactives, reverse = True)
762 if self.activenb == 0 :
764 if len(allactives) <= nbmax :
765 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
766 return [val[1] for val in allactives], allactives[-1][0]
768 effs = [val[0] for val in allactives]
769 if effs.count(effs[nbmax - 1]) > 1 :
770 lim = effs[nbmax - 1] + 1
774 stop = effs.index(lim)
781 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
782 return [val[1] for val in allactives[0:stop]], lim
784 def make_and_write_profile(self, actives, ucecl, fileout, uci = False) :
785 log.info('formes/classes')
787 tab = [[lem] + [len(set(self.getlemucis(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
789 tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
790 tab = [[line[0]] + [`val` for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
791 with open(fileout, 'w') as f :
792 f.write('\n'.join([';'.join(line) for line in tab]).encode(self.parametres['syscoding']))
795 def make_etoiles(self) :
797 for uci in self.ucis :
798 etoiles.update(uci.etoiles[1:])
801 def make_themes(self):
803 for uci in self.ucis :
804 themes.update(uci.paras)
807 def make_etoiles_dict(self) :
808 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
810 for etoile in etoiles :
811 et = etoile.split('_')
814 endet = '_'.join(et[1:])
815 if etoile in det[et[0]] :
816 det[et[0]][etoile] += 1
818 det[et[0]][etoile] = 1
823 endet = '_'.join(et[1:])
824 det[et[0]] = {etoile :1}
829 def make_theme_dict(self):
830 themes = [val for uci in self.ucis for val in uci.paras]
832 for theme in themes :
833 th = theme.split('_')
836 endth = '_'.join(th[1:])
837 if theme in det[th[0]] :
838 det[th[0]][theme] += 1
840 det[th[0]][theme] = 1
845 endth = '_'.join(th[1:])
846 det[th[0]] = {theme:1}
851 def make_etline(self, listet) :
852 etuces = [[] for et in listet]
853 for uci in self.ucis :
854 get = list(set(uci.etoiles).intersection(listet))
856 return '2 variables sur la meme ligne'
858 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
861 def make_and_write_profile_et(self, ucecl, fileout, uci = False) :
862 log.info('etoiles/classes')
864 etoileuces = self.getetoileuces()
866 etoileuces = self.getetoileucis()
867 print 'etoilesuces ok'
868 etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if
869 len(etoileuces[et]) > 1 ]) #and not et.startswith(u'*reference_')
870 print len(etoileuces)
871 print 'etoilesuces ok2'
872 with open(fileout, 'w') as f :
874 f.write('\n'.join([';'.join([et] + [`len(set(etoileuces[et]).intersection(classe))` for classe in ucecl]) for et in etoileuces]).encode(self.parametres['syscoding']))
875 #etoiles = self.make_etoiles()
876 #with open(fileout, 'w') as f :
877 # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
879 def make_colored_corpus(self, uci = False) :
881 for i, lc in enumerate(self.lc) :
884 for uce in self.lc0 :
886 color = ['black'] + colors[len(self.lc) - 1]
888 <meta http-equiv="content-Type" content="text/html; charset=%s" />
890 ''' % sys.getdefaultencoding()
892 res = self.getalluces()
897 if self.iduces[uce[0]].uci != actuci :
898 actuci = self.iduces[uce[0]].uci
899 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
900 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
902 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
904 res = self.getallucis()
907 if self.ucis[uce[0]].ident != actuci :
908 actuci = self.ucis[uce[0]].ident
909 txt += '<br><hr>' + ' '.join(self.ucis[self.ucis[uce[0]].ident].etoiles) + '<br><br>'
910 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
912 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
913 return txt + '\n</body></html>'
915 def make_cut_corpus(self, uci = False) :
918 res = self.getalluces()
923 if self.iduces[uce[0]].uci != actuci :
924 actuci = self.iduces[uce[0]].uci
925 txt += u'\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + u'\n'
926 txt += ''.join([u'\n',uce[1],u'\n'])
928 txt += ''.join([u'\n',uce[1],u'\n'])
930 res = self.getallucis()
933 if self.ucis[uce[0]].ident != actuci :
934 actuci = self.ucis[uce[0]].ident
935 txt += u'\n' + ' '.join(self.ucis[self.ucis[uce[0]].ident].etoiles) + u'\n'
936 txt += ''.join([u'\n',uce[1],u'\n'])
938 txt += ''.join([u'\n',uce[1],u'\n'])
941 def count_from_list(self, l, d) :
949 def count_from_list_cl(self, l, d, a, clnb) :
958 def find_segments(self, taille_segment, taille_limite) :
960 for uce in self.getalluces() :
962 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
963 l = [[d[val], val] for val in d if d[val] >= 3]
966 if len(l) > taille_limite :
967 l = l[-taille_limite:]
970 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite, uci = False):
973 concorde = self.getconcorde
975 concorde = self.getuciconcorde
976 for uce in concorde(list_uce) :
978 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
979 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
982 if len(l) > taille_limite :
983 l = l[-taille_limite:]
986 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
988 for b, classe in enumerate(self.lc) :
989 for uce in self.getconcorde(classe) :
992 uce = [self.formes[forme].lem for forme in uce]
993 for taille_segment in range(lenmin,lenmax) :
994 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
995 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
996 with open(fileout, 'w') as f :
997 f.write('\n'.join([';'.join(line) for line in result]))
999 def make_proftype(self, outf) :
1001 for lem in self.lems :
1002 gram = self.lems[lem].gram
1003 if not gram in res :
1004 res[gram] = [0 for val in self.lc]
1005 lemuceeff = self.getlemuceseff(lem)
1006 for i, classe in enumerate(self.lc) :
1007 concern = set(classe).intersection(lemuceeff.keys())
1008 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
1009 res = [[gram] + [`val` for val in res[gram]] for gram in res]
1011 with open(outf, 'w') as f :
1012 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
1015 def make_ucecl_from_R(self, filein) :
1016 with open(filein, 'rU') as f :
1021 line = line.replace('\n', '').replace('"', '').split(';')
1022 self.lc.append([int(line[0]) - 1, int(line[1])])
1023 classesl = [val[1] for val in self.lc]
1024 clnb = max(classesl)
1025 self.lc = sorted(self.lc, key=itemgetter(1))
1026 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
1027 self.lc0 = self.lc.pop(0)
1030 def get_stat_by_cluster(self, outf, lclasses = None) :
1031 log.info('get_stat_by_cluster')
1032 if lclasses is None :
1035 occurrences = dict([[i + 1, 0] for i in range(len(lclasses))])
1036 formescl = dict([[i + 1, 0] for i in range(len(lclasses))])
1037 hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))])
1038 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)])
1039 sets = [set(cl) for cl in lclasses]
1040 for forme in self.formes :
1041 formeuceeff = self.getformeuceseff(forme)
1042 for i, classe in enumerate(lclasses) :
1043 concern = sets[i].intersection(formeuceeff.keys())
1045 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
1047 if self.formes[forme].freq == 1 :
1049 log.info('%f' % (time() - t1))
1050 if outf is not None :
1051 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
1052 with open(outf, 'w') as f :
1055 return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences]
def get_stat_by_et(self, outf, etoiles):
    """Compute the per-cluster statistics for the segments of each tag.

    For every entry of *etoiles* the matching segment ids are fetched with
    ``getucesfrometoile``; the resulting groups are handed to
    ``get_stat_by_cluster`` (with no output file) and each returned row is
    prefixed with its tag.

    NOTE(review): in the lines visible here the labelled rows are neither
    returned nor written to *outf* - confirm against the full file.
    """
    groups = []
    for etoile in etoiles:
        groups.append(self.getucesfrometoile(etoile))
    rows = self.get_stat_by_cluster(None, groups)
    labelled = []
    for position, row in enumerate(rows):
        labelled.append([etoiles[position]] + row)
1062 def gethapaxbyet(self, etoiles) :
1063 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
1065 for uce in hapaxuces :
1066 if uce in hucesdict :
1070 etuces = [[] for et in etoiles]
1071 for uci in self.ucis :
1072 get = list(set(uci.etoiles).intersection(etoiles))
1074 return '2 variables sur la meme ligne'
1076 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
1077 etuces = [set(val) for val in etuces]
1078 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
1080 def gethapaxuces(self) :
1081 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
1082 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
1084 for i,uce in enumerate(hapaxuces) :
1085 if uce in hucesdict :
1086 hucesdict[uce][0] += 1
1087 hucesdict[uce][1].append(hapax[i])
1089 hucesdict[uce] = [1,[hapax[i]]]
1091 for uce in hucesdict :
1092 if hucesdict[uce][0] in huces :
1093 huces[hucesdict[uce][0]].append(uce)
1095 huces[hucesdict[uce][0]] = [uce]
1096 huces = zip(huces, huces.values())
1097 huces.sort(reverse=True)
1101 for nb in huces[0:4] :
1102 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
1104 res = self.getconcorde([uce])
1106 ucetxt = ' ' + row[1] + ' '
1108 for hap in hucesdict[uce][1] :
1109 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
1110 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
1111 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
1112 txt += '<p>'+ucetxt+'</p>\n'
1116 with open('/tmp/testhapxuce.html','w') as f :
def export_dictionary(self, fileout, syscoding):
    """Write the dictionary of forms to *fileout*, highest frequency first.

    Each output line is tab-separated: form, lemma, grammatical type and
    frequency. The whole text is encoded with *syscoding* before writing.
    """
    rows = []
    for forme, entry in self.formes.items():
        rows.append([entry.freq, forme, entry.lem, entry.gram])
    # Descending sort on the full row: frequency first, remaining fields
    # break ties.
    rows = sorted(rows, reverse=True)
    with open(fileout, 'w') as f:
        lines = []
        for row in rows:
            # Move the frequency to the last column, rendered as text.
            lines.append('\t'.join(row[1:] + [repr(row[0])]))
        f.write('\n'.join(lines).encode(syscoding))
1126 def export_lems(self, fileout, syscoding) :
1127 self.make_idformes()
1128 listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems]
1130 with open(fileout, 'w') as f :
1131 f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding))
def __init__(self, corpus):
    """Query *corpus* and compute the mean of the sizes it reports.

    Calls ``getucinb``, ``getucisize`` and ``make_etoiles_dict`` on
    *corpus*, in that order.

    NOTE(review): in the lines visible here every result is bound to a
    local name only; nothing is stored on the instance - confirm intent.
    """
    count = corpus.getucinb()
    sizes = corpus.getucisize()
    total = float(sum(sizes))
    mean_size = total / float(count)
    etoiles_dict = corpus.make_etoiles_dict()
1143 def __init__(self, iduci, line, paraset = None) :
1145 self.etoiles = line.split()
1147 if paraset is not None :
1148 self.paras = paraset.split()
1153 def __init__(self, iduce, idpara, iduci) :
1159 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
1162 self.gram = gramtype
1165 if freq is not None :
def __init__(self, parent, forme):
    """Create a lemma record seeded from a single form.

    The form's identifier is mapped to its frequency in ``self.formes``;
    the grammatical type, frequency and ``act`` flag are copied from the
    form. *parent* is accepted but not used here.
    """
    self.formes = dict([(forme.ident, forme.freq)])
    self.gram, self.freq, self.act = forme.gram, forme.freq, forme.act
def add_forme(self, forme):
    """Attach one more form to this lemma and add its frequency to the total."""
    ident, freq = forme.ident, forme.freq
    self.formes[ident] = freq
    self.freq = self.freq + freq
1181 def decouperlist(chaine, longueur, longueurOptimale) :
1183 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
1184 Si on trouve un '$', c'est fini.
1185 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
1187 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
1188 dsep = dict([[val[0],val[1]] for val in separateurs])
1189 trouve = False # si on a trouvé un bon séparateur
1190 iDecoupe = 0 # indice du caractere ou il faut decouper
1192 longueur = min(longueur, len(chaine) - 1)
1193 chaineTravail = chaine[:longueur + 1]
1195 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
1198 indice = chaineTravail.index(u'$')
1200 iDecoupe = indice - 1
1205 caractere = chaineTravail[nbCar]
1206 distance = abs(longueurOptimale - nbCar) + 1
1207 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
1208 if caractere in dsep :
1209 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
1210 meilleur[0] = caractere
1211 meilleur[1] = dsep[caractere]
1216 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
1218 meilleur[1] = dsep[' ']
1225 #if meilleur[0] != ' ' :
1226 # fin = chaine[iDecoupe + 1:]
1227 # retour = chaineTravail[:iDecoupe]
1229 fin = chaine[iDecoupe + 1:]
1230 retour = chaineTravail[:iDecoupe + 1]
1231 return len(retour) > 0, retour, fin
1232 # si on a rien trouvé
1233 return False, chaine, ''
1235 def testetoile(line) :
1236 return line.startswith(u'****')
1239 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt):
    """Split *txt* on whitespace and append the ``$`` end marker."""
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
1244 def prep_txtcharact(txt) :
1249 Class for building a corpus
1251 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
1252 log.info('begin building corpus...')
1253 self.lexique = lexique
1254 self.expressions = expressions
1256 self.corpus = Corpus(self, parametres_corpus)
1257 self.infile = infile
1259 self.lim = parametres_corpus.get('lim', 1000000)
1260 self.encoding = parametres_corpus['encoding']
1261 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
1262 self.corpus.pathout.createdir(parametres_corpus['pathout'])
1263 self.corpus.parametres['uuid'] = str(uuid4())
1264 self.corpus.parametres['corpus_name'] = parametres_corpus['corpus_name']#os.path.split(self.corpus.parametres['pathout'])[1]
1265 self.corpus.parametres['type'] = 'corpus'
1266 if self.corpus.parametres['keep_ponct'] :
1267 self.ponctuation_espace = [' ', '']
1269 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
1271 self.tolist = self.corpus.parametres.get('tolist', 0)
1278 def prep_makeuce(self) :
1279 method = self.corpus.parametres.get('ucemethod', 0)
1281 self.decouper = decouperlist
1282 self.prep_txt = prep_txtlist
1283 self.ucesize = self.corpus.parametres.get('ucesize', 40)
1285 self.decouper = decoupercharact
1286 self.prep_txt = prep_txtcharact
1287 self.ucesize = self.corpus.parametres.get('ucesize', 240)
1288 log.info('method uce : %s' % method)
1293 self.read_corpus(self.infile)
1294 except Warning, args :
1295 log.info('pas kool %s' % args)
1299 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
1300 self.time = time() - t1
1302 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
1303 log.info('time : %f' % (time() - t1))
1306 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
1307 self.cf = self.conn_f.cursor()
1308 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1309 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
1310 self.conn_f.commit()
1311 self.cf = self.conn_f.cursor()
1312 self.cf.execute('PRAGMA temp_store=MEMORY;')
1313 self.cf.execute('PRAGMA journal_mode=MEMORY;')
1314 self.cf.execute('PRAGMA synchronous = OFF;')
1315 self.cf.execute('begin')
1316 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
1317 self.c = self.conn.cursor()
1318 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1320 self.c = self.conn.cursor()
1321 self.c.execute('PRAGMA temp_store=MEMORY;')
1322 self.c.execute('PRAGMA journal_mode=MEMORY;')
1323 self.c.execute('PRAGMA synchronous = OFF;')
1324 self.c.execute('begin')
1327 #commit index and close db
1329 self.conn_f.commit()
1330 self.cf.execute('CREATE INDEX iduces ON uces (id);')
1331 self.cf.execute('CREATE INDEX ideff ON eff (id);')
1335 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
1336 self.ccorpus = self.conn_corpus.cursor()
1337 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
1338 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
1339 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
1340 self.conn_corpus.commit()
1341 self.ccorpus = self.conn_corpus.cursor()
1342 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
1343 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
1344 self.ccorpus.execute('PRAGMA synchronous = OFF;')
1345 self.ccorpus.execute('begin')
1346 self.backup_corpus()
1347 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
1348 self.conn_corpus.commit()
1349 self.conn_corpus.close()
1350 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self):
    """Append to ``self.cleans`` the cleaning steps enabled in the corpus parameters.

    Steps are registered in a fixed order: lower-casing, first clean,
    character filtering, expressions, apostrophes, hyphens. Every option
    except 'charact' defaults to enabled when absent; 'charact' must be
    present in the parameters. When character filtering is enabled,
    ``self.rule`` is set from 'keep_caract' (or the built-in default).
    """
    options = self.corpus.parametres

    def enabled(key):
        # Missing options count as switched on.
        return options.get(key, 1)

    if enabled('lower'):
        self.cleans.append(self.dolower)
    if enabled('firstclean'):
        self.cleans.append(self.firstclean)
    if options['charact']:
        self.rule = options.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        self.cleans.append(self.docharact)
    if enabled('expressions'):
        self.cleans.append(self.make_expression)
    if enabled('apos'):
        self.cleans.append(self.doapos)
    if enabled('tiret'):
        self.cleans.append(self.dotiret)
1367 def make_expression(self,txt) :
1368 exp = self.expressions.keys()
1369 exp.sort(reverse=True)
1370 for expression in exp :
1371 if expression in txt :
1372 txt = txt.replace(expression, self.expressions[expression][0])
1375 def dolower(self, txt) :
def docharact(self, txt):
    """Replace each run of characters matched by ``[<self.rule>]+`` with one space.

    ``self.rule`` is used as the inside of a regular-expression character
    class; when it starts with '^' the class is negated, so everything
    NOT listed in the rule is replaced.
    """
    pattern = re.compile(u"[%s]+" % (self.rule,))
    return pattern.sub(' ', txt)
def doapos(self, txt):
    """Return *txt* with every apostrophe turned into a space."""
    pieces = txt.split(u'\'')
    return u' '.join(pieces)
def dotiret(self, txt):
    """Return *txt* with every hyphen turned into a space."""
    pieces = txt.split(u'-')
    return u' '.join(pieces)
def firstclean(self, txt):
    """Normalise typographic characters and isolate punctuation with spaces.

    The substitutions are applied one after another in the order listed;
    the three-dot ellipsis must be handled before the single dot so that
    it becomes the ' £$£ ' marker instead of three separate dots.
    """
    substitutions = (
        (u'’', "'"),
        (u'œ', u'oe'),
        ('...', u' £$£ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        (u'…', u' £$£ '),
    )
    for old, new in substitutions:
        txt = txt.replace(old, new)
    return txt
1394 def make_cleans(self, txt) :
1395 for clean in self.cleans :
1399 def backup_uce(self) :
1400 if self.corpus.idformesuces != {} :
1401 log.info('backup %i' % len(self.corpus.idformesuces))
1402 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
1403 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
1404 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
1405 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
1406 self.corpus.idformesuces = {}
1409 def backup_corpus(self) :
1410 log.info('start backup corpus')
1412 for uci in self.corpus.ucis :
1413 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
1414 for uce in uci.uces :
1415 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
1416 for forme in self.corpus.formes :
1417 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
1418 log.info('%f' % (time() - t))
def dofinish(self):
    """Record the summary statistics of the finished build in the corpus parameters.

    Fills 'date', 'time' (elapsed build time from ``self.time``, in
    seconds), 'ucinb', 'ucenb', 'occurrences', 'formesnb' and the 'hapax'
    summary line in ``self.corpus.parametres``.

    The elapsed time is rounded to whole seconds before being split into
    hours/minutes/seconds, so a value such as 59.6 s is shown as
    '0h 1m 0s' rather than '0h 0m 60s'. Percentages are reported as 0
    when the corpus has no forms or no occurrences instead of raising
    ZeroDivisionError.
    """
    corpus = self.corpus
    parametres = corpus.parametres
    parametres['date'] = datetime.datetime.now().ctime()

    # Round once, up front: rounding only the seconds field afterwards
    # could display '60s'.
    total_seconds = int(round(self.time))
    minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    parametres['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)

    parametres['ucinb'] = corpus.getucinb()
    parametres['ucenb'] = corpus.getucenb()
    parametres['occurrences'] = corpus.gettotocc()
    parametres['formesnb'] = len(corpus.formes)
    hapaxnb = corpus.gethapaxnb()

    def percent(part, total):
        # Guard against an empty corpus (no forms / no occurrences).
        if not total:
            return 0.0
        return (float(part) / total) * 100

    pourhapaxf = percent(hapaxnb, parametres['formesnb'])
    pourhapaxocc = percent(hapaxnb, parametres['occurrences'])
    parametres['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
1434 class BuildSubCorpus(BuildCorpus):
1435 def __init__(self, corpus, parametres, dlg = None) :
1436 log.info('begin subcorpus...')
1440 self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : corpus.parametres['originalpath'], 'encoding' : corpus.parametres['encoding']})
1442 self.parametres = parametres
1443 self.encoding = corpus.parametres['encoding']
1444 self.corpus.parametres['corpus_name'] = parametres['corpus_name']
1445 self.corpus.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = parametres['pathout'])
1446 self.corpus.pathout.createdir(parametres['pathout'])
1447 self.corpus.parametres['pathout'] = parametres['pathout']
1448 self.corpus.parametres['meta'] = parametres.get('meta', False)
1449 self.corpus.parametres['uuid'] = str(uuid4())
1450 if parametres.get('frommeta', False) :
1451 print 'make subtexts'
1452 self.corpus.ucis = [CopyUci(uci) for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()]
1453 elif parametres.get('fromtheme', False) :
1454 print 'make subtexts from theme'
1456 for uci in self.ori.ucis :
1457 if uci.paras != [] :
1460 for et in uci.paras :
1461 if et in parametres['meta'] :
1462 newuce += [CopyUce(uce) for uce in uci.uces if uce.para == idpara]
1468 nuci.paras = newpara
1469 self.corpus.ucis.append(nuci)
1472 elif parametres.get('fromclusters', False) :
1473 self.parametres['uceids'] = [st for i in self.parametres['meta'] for st in self.parametres['lc'][i]]
1475 elif parametres.get('fromuceids', False) :
1481 def fromuceids(self):
1483 dictucekeep = dict(zip(self.parametres['uceids'], self.parametres['uceids']))
1485 for uci in self.ori.ucis :
1486 if uci.paras == [] :
1487 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1490 nuci.uces = keepuces
1491 self.corpus.ucis.append(nuci)
1496 for et in uci.paras :
1497 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep and uce.para == idpara]
1505 nuci.paras = newpara
1506 self.corpus.ucis.append(nuci)
1508 def read_corpus(self, infile = None):
1509 self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces]
1515 print 'redo text, para and st ident'
1516 for uci in self.corpus.ucis :
1517 uci.ident = ident_uci
1519 for uce in uci.uces :
1521 if uce.para != lastpara :
1524 uce.para = ident_para
1526 uce.para = ident_para
1527 newuceident[uce.ident] = ident_uce
1528 uce.ident = ident_uce
1530 print 'backup st text and forms'
1531 for row in self.ori.getconcorde(self.olduceid) :
1532 self.c.execute('INSERT INTO uces VALUES(?,?);', (`newuceident[row[0]]`, row[1]))
1533 for word in row[1].split() :
1534 self.corpus.add_word_from_forme(self.ori.formes[word], newuceident[row[0]])
1538 class BuildFromAlceste(BuildCorpus) :
1539 def read_corpus(self, infile) :
1540 if self.dlg is not None :
1541 self.dlg.Pulse('textes : 0 - segments : 0')
1544 if self.corpus.parametres['ucimark'] == 0 :
1545 self.testuci = testetoile
1546 elif self.corpus.parametres['ucimark'] == 1 :
1547 self.testuci = testint
1553 with codecs.open(infile, 'r', self.encoding) as f :
1554 for linenb, line in enumerate(f) :
1555 line = line.rstrip('\n\r')#FIXME .lstrip(codecs.BOM).lstrip(codecs.BOM_UTF8)
1556 if self.testuci(line) :
1559 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1561 self.corpus.ucis.append(Uci(iduci, line))
1564 if self.corpus.ucis[-1].uces == [] :
1565 log.info(u'Empty text : %i' % linenb)
1567 self.corpus.ucis.pop()
1568 self.corpus.ucis.append(Uci(iduci, line))
1569 if self.dlg is not None :
1570 if not (iduci + 1) % 10 :
1571 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
1572 elif line.startswith(u'-*') :
1575 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1578 self.corpus.ucis[-1].paras.append(line.split()[0])
1580 raise Exception('paragrapheOT %i' % linenb)
1581 elif line.strip() != '' and iduci != -1 :
1583 if txt != [] and iduci != -1 :
1584 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1589 self.corpus.ucis.pop()
1590 log.info(Exception("Empty text %i" % linenb))
1592 raise Exception('EmptyText %i' % linenb)
1593 if iduci != -1 and iduce != -1:
1596 log.info(_(u"No Text in corpus. Are you sure of the formatting ?"))
1597 raise Exception('TextBeforeTextMark %i' % linenb)
1598 except UnicodeDecodeError :
1599 raise Exception("CorpusEncoding")
1601 def treattxt(self, txt, iduce, idpara, iduci) :
1602 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1603 txt = 'laphrasepoursplitter'.join(txt)
1604 txt = self.make_cleans(txt)
1605 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1606 ucetxt = txt.split('laphrasepoursplitter')
1609 txt = self.make_cleans(txt)
1610 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1611 if self.corpus.ucis[-1].paras == [] :
1615 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1616 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1617 if not self.tolist :
1623 self.corpus.add_word(word)
1624 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
1625 if self.last > self.lim :
1628 return iduce, idpara
1630 def make_uces(self, txt, douce = True, keep_ponct = False) :
1631 txt = ' '.join(txt.split())
1634 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1636 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1639 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1640 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1645 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1647 #decouper (list_sep)
1648 #make_uces (decouper)
1649 #treat_txt (make_uces)
1653 def __init__(self, parent, dlg = None) :
1654 self.parent = parent
1657 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1658 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1659 parametres['corpus_name'] = os.path.split(parametres['pathout'])[1]
1660 dial = CorpusPref(parent, parametres)
1661 dial.CenterOnParent()
1662 dial.txtpath.SetLabel(parent.filename)
1663 #dial.repout_choices.SetValue(parametres['pathout'])
1664 self.res = dial.ShowModal()
1665 if self.dlg is not None :
1666 self.dlg = progressbar(self.parent, self.dlg)
1667 if self.res == 5100 :
1668 parametres = dial.doparametres()
1669 parametres['originalpath'] = parent.filename
1670 PathOut().createdir(parametres['pathout'])
1671 if parametres.get('dictionary', False) :
1672 filein = parametres['dictionary']
1675 if dial.corpusname.GetValue() != '' :
1676 parametres['corpus_name'] = dial.corpusname.GetValue()
1678 ReadLexique(self.parent, lang = parametres['lang'], filein = filein)
1679 if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
1680 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1682 self.parent.expressions = {}
1683 self.parametres = parametres
1686 if self.dlg is not None :
def doanalyse(self):
    """Run BuildFromAlceste on the parent's file and return its ``corpus`` attribute."""
    parent = self.parent
    builder = BuildFromAlceste(parent.filename, self.parametres, parent.lexique, parent.expressions, dlg = self.dlg)
    return builder.corpus
1693 def __init__(self, parent, corpus, parametres = None, dlg = None):
1694 self.parent = parent
1697 corpus_name = 'Sub' + corpus.parametres['corpus_name']
1698 if dlg is not None :
1699 busy = wx.BusyInfo(_("Please wait...").decode('utf8'), self)
1701 parametres['corpus_name'] = corpus_name
1702 if parametres.get('frommeta', False) :
1703 parametres['meta'] = corpus.make_etoiles()
1704 elif parametres.get('fromtheme', False) :
1705 parametres['meta'] = corpus.make_themes()
1706 elif parametres.get('fromclusters', False) :
1707 parametres['meta'] = [' '.join(['classe', `i`]) for i in range(1,parametres['clnb'] + 1)]
1709 parametres['meta'] = []
1710 if 'fromclusters' not in parametres :
1711 parametres['meta'].sort()
1712 if dlg is not None :
1714 dial = SubTextFromMetaDial(parent, parametres)
1715 self.res = dial.ShowModal()
1716 if self.res == 5100 :
1717 if dial.subcorpusname.GetValue() != '' :
1718 corpus_name = ''.join([l for l in dial.subcorpusname.GetValue() if l.isalnum() or l in ['_']])
1719 if corpus_name != '' :
1720 parametres['corpus_name'] = corpus_name
1722 parametres['corpus_name'] = 'Sub' + corpus.parametres['corpus_name']
1723 pathout = os.path.join(corpus.parametres['pathout'], parametres['corpus_name'])
1725 while os.path.exists(pathout + '_%i' % i) :
1727 parametres['pathout'] = pathout + '_%i' % i
1728 meta = dial.m_listBox1.GetSelections()
1729 if not 'fromclusters' in parametres :
1730 parametres['meta'] = [parametres['meta'][val] for val in meta]
1732 parametres['meta'] = meta
1733 self.parametres = parametres
def doanalyse(self):
    """Run BuildSubCorpus on the original corpus and return its ``corpus`` attribute."""
    builder = BuildSubCorpus(self.ori, parametres = self.parametres, dlg = self.dlg)
    return builder.corpus
1741 class BuildMergeFromClusters(BuildCorpus):
1742 def __init__(self, analyses, parametres, dlg = None) :
1743 log.info('begin subcorpus...')
1746 self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : 'MergeFromClusters', 'encoding' : 'merge'})
1748 self.analyses = analyses
1750 self.parametres = parametres
1751 #self.encoding = corpus.parametres['encoding']
1752 self.corpus.parametres['corpus_name'] = parametres['corpus_name']
1753 self.corpus.pathout = PathOut(filename = 'MFC', dirout = parametres['pathout'])
1754 self.corpus.pathout.createdir(parametres['pathout'])
1755 self.corpus.parametres['pathout'] = parametres['pathout']
1756 self.corpus.parametres['meta'] = parametres.get('meta', False)
1757 self.corpus.parametres['uuid'] = str(uuid4())
1758 for i, analyse in enumerate(analyses) :
1761 corpus_uuid = analyse['corpus']
1762 #if corpus_uuid not in self.parent.history.openedcorpus :
1763 irapath = parametres['corpusira'][i]
1764 corpus = Corpus(self, parametres = DoConf(irapath).getoptions('corpus'), read = True)
1765 ucepath = os.path.join(analyse['pathout'], 'uce.csv')
1766 corpus.make_ucecl_from_R(ucepath)
1768 for j, cl in enumerate(parametres['clusters'][i]) :
1769 #print cl, self.ori.lc[cl-1]
1770 self.parametres['uceids'] = self.ori.lc[cl-1]#[st for st in self.ori['lc'][cl-1]]
1771 self.lcl[i] += self.ori.lc[cl-1]
1772 self.et = parametres['newet'][i][j]
1778 def fromuceids(self):
1780 dictucekeep = dict(zip(self.parametres['uceids'], self.parametres['uceids']))
1782 for uci in self.ori.ucis :
1783 if uci.paras == [] :
1784 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1787 nuci.uces = keepuces
1788 nuci.etoiles.append(self.et)
1789 nuci.analyseid = self.analyseid
1790 self.corpus.ucis.append(nuci)
1795 for et in uci.paras :
1796 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1804 nuci.paras = newpara
1805 nuci.etoiles.append(self.et)
1806 nuci.analyseid = self.analyseid
1807 self.corpus.ucis.append(nuci)
1808 #print nuci.etoiles, nuci.ident, nuci.uces
1810 def read_corpus(self, infile = None):
1811 #self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces]
1817 print 'redo text, para and st ident'
1818 for uci in self.corpus.ucis :
1819 #print uci.ident, ident_uci, [uce.ident for uce in uci.uces], uci.etoiles
1820 uci.ident = ident_uci
1822 for uce in uci.uces :
1824 if uce.para != lastpara :
1827 uce.para = ident_para
1829 uce.para = ident_para
1830 newuceident['%i-%i' %(uci.analyseid, uce.ident)] = ident_uce
1831 uce.ident = ident_uce
1834 print 'backup st text and forms'
1836 for i, analyse in enumerate(self.analyses) :
1837 #print analyse, self.parametres['corpusira']
1838 irapath = self.parametres['corpusira'][i]
1839 old = Corpus(self, parametres = DoConf(irapath).getoptions('corpus'), read = True)
1840 for row in old.getconcorde(self.lcl[i]) :
1841 self.c.execute('INSERT INTO uces VALUES(?,?);', (newuceident['%i-%i' % (i,row[0])], row[1]))
1842 for word in row[1].split() :
1843 self.corpus.add_word_from_forme(old.formes[word], newuceident['%i-%i' % (i,row[0])])
1849 class MergeClusters :
1850 def __init__(self, parent, parametres = None, dlg = None):
1851 self.parent = parent
1854 corpus_name = 'MergeFromClusters'
1855 if dlg is not None :
1856 busy = wx.BusyInfo(_("Please wait...").decode('utf8'), self)
1858 parametres['corpus_name'] = corpus_name
1859 if dlg is not None :
1861 dial = MergeClusterFrame(parent)
1862 dial.m_textCtrl4.SetValue(corpus_name)
1863 self.res = dial.ShowModal()
1864 if self.res == 5100 :
1869 if dial.m_textCtrl4.GetValue() != '' :
1870 corpus_name = ''.join([l for l in dial.m_textCtrl4.GetValue() if l.isalnum() or l in ['_']])
1871 if corpus_name != '' :
1872 parametres['corpus_name'] = corpus_name
1874 parametres['corpus_name'] = 'MergeFromClusters'
1875 for cl in dial.selected :
1877 #if corpus_uuid not in self.parent.history.openedcorpus :
1878 irapath = self.parent.history.corpus[corpus_uuid]['ira']
1879 #corpus = Corpus(self.parent, parametres = DoConf(irapath).getoptions('corpus'), read = True)
1880 #self.parent.history.openedcorpus[corpus_uuid] = corpus
1881 if cl[0] not in self.analyses :
1882 analyse = DoConf(dial.irapath[cl[0]]).getoptions()
1883 #ucepath = os.path.join(os.path.dirname(dial.irapath[cl[0]]), 'uce.csv')
1884 #corpus = copycorpus(self.parent.history.openedcorpus[corpus_uuid])
1885 #corpus.make_ucecl_from_R(ucepath)
1886 self.analyses[cl[0]] = analyse
1887 self.clusters[cl[0]] = [cl[2]]
1888 self.newet[cl[0]] = [dial.selected[cl]]
1889 self.corpusira[cl[0]] = irapath
1891 self.clusters[cl[0]].append(cl[2])
1892 self.newet[cl[0]].append(dial.selected[cl])
1895 analyses = [val for val in self.clusters]
1896 clusters = [self.clusters[val] for val in analyses]
1897 self.newet = [self.newet[val] for val in analyses]
1898 corpusira = [self.corpusira[val] for val in analyses]
1899 analyses = [self.analyses[val] for val in analyses]
1900 pathout = os.path.dirname(os.path.dirname(analyses[0]['pathout']))
1901 self.analyses = analyses
1903 pathout = os.path.join(pathout, parametres['corpus_name'])
1905 while os.path.exists(pathout + '_%i' % i) :
1907 parametres['pathout'] = pathout + '_%i' % i
1908 self.parametres = parametres
1909 self.parametres['clusters'] = clusters
1910 self.parametres['newet'] = self.newet
1911 self.parametres['corpusira'] = corpusira
def doanalyse(self):
    """Run BuildMergeFromClusters on the selected analyses and return its ``corpus`` attribute."""
    builder = BuildMergeFromClusters(self.analyses, parametres = self.parametres, dlg = self.dlg)
    return builder.corpus