1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique
16 from operator import itemgetter
17 from uuid import uuid4
18 from chemins import PathOut
19 from dialog import CorpusPref, SubTextFromMetaDial
21 from colors import colors
25 log = logging.getLogger('iramuteq.corpus')
28 def copycorpus(corpus) :
29 log.info('copy corpus')
30 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
31 copy_corpus.ucis = corpus.ucis
32 copy_corpus.formes = corpus.formes
33 copy_corpus.pathout = corpus.pathout
34 copy_corpus.conn_all()
38 return Uce(uce.ident, uce.para, uce.uci)
42 nuci = Uci(uci.ident, '')
43 nuci.etoiles = copy(uci.etoiles)
44 nuci.uces = [CopyUce(uce) for uce in uci.uces]
53 def __init__(self, parent, parametres = {}, read = False) :
55 self.parametres = parametres
57 self.connformes = None
59 self.conncorpus = None
66 self.idformesuces = {}
71 self.pathout = PathOut(dirout = parametres['pathout'])
74 def add_word(self, word) :
75 if word in self.formes :
76 self.formes[word].freq += 1
77 if self.formes[word].ident in self.idformesuces :
78 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
79 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
81 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
83 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
85 if word in self.parent.lexique :
86 gramtype = self.parent.lexique[word][1]
87 lem = self.parent.lexique[word][0]
94 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
95 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
97 def add_word_from_forme(self, word, stident):
98 if word.forme in self.formes :
99 self.formes[word.forme].freq += 1
100 if self.formes[word.forme].ident in self.idformesuces :
101 if stident in self.idformesuces[self.formes[word.forme].ident] :
102 self.idformesuces[self.formes[word.forme].ident][stident] += 1
104 self.idformesuces[self.formes[word.forme].ident][stident] = 1
106 self.idformesuces[self.formes[word.forme].ident] = {stident: 1}
108 self.formes[word.forme] = Word(word.forme, word.gram, len(self.formes), word.lem)
109 self.idformesuces[self.formes[word.forme].ident] = {stident : 1}
112 """connect corpus to db"""
113 if self.connformes is None :
114 log.info('connexion corpus')
115 self.connuces = sqlite3.connect(self.pathout['uces.db'])
116 self.cuces = self.connuces.cursor()
117 self.connformes = sqlite3.connect(self.pathout['formes.db'])
118 self.cformes = self.connformes.cursor()
119 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
120 self.ccorpus = self.conncorpus.cursor()
121 self.cformes.execute('PRAGMA temp_store=MEMORY;')
122 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
123 self.cformes.execute('PRAGMA synchronous = OFF;')
124 self.cuces.execute('PRAGMA temp_store=MEMORY;')
125 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
126 self.cuces.execute('PRAGMA synchronous = OFF;')
127 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
128 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
129 self.ccorpus.execute('PRAGMA synchronous = OFF;')
131 def read_corpus(self) :
132 log.info('read corpus')
133 self.parametres['syscoding'] = sys.getdefaultencoding()
134 if self.conncorpus is None :
136 res = self.ccorpus.execute('SELECT * FROM etoiles;')
138 self.ucis.append(Uci(row[0], row[1], row[2]))
139 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
141 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
142 res = self.ccorpus.execute('SELECT * FROM formes;')
143 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the uce ids recorded for a form in the ``uces`` table.

    ``wordid`` is either a form identifier or the form itself as a string;
    a string is first resolved to its identifier through ``self.formes``.
    Each selected cell is either a single integer or a whitespace-separated
    string of integers; all of them are flattened into one list of ints.
    """
    if isinstance(wordid, basestring):
        wordid = self.formes[wordid].ident
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    found = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            # the cell already holds one uce id
            found.append(cell)
        else:
            found.extend([int(val) for val in cell.split()])
    return found
def getworducis(self, wordid):
    """Return the distinct ``uci`` values of the uces containing a form.

    The uce ids come from ``getworduces``; each one is resolved with
    ``getucefromid``. The order of the returned list is not defined.
    """
    ucis = set()
    for uceid in self.getworduces(wordid):
        ucis.add(self.getucefromid(uceid).uci)
    return list(ucis)
156 def getformeuceseff(self, formeid) :
157 if isinstance(formeid, basestring) :
158 formeid = self.formes[formeid].ident
159 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
160 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
161 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
162 res = self.cformes.execute(query)
163 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
165 for i, uce in enumerate(uces) :
166 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem):
    """Return the distinct uce ids recorded for all forms of a lemma.

    The form identifiers are the keys of ``self.lems[lem].formes``; they are
    inlined in the ``IN (...)`` clause of the query. Cells are either a
    single integer or a whitespace-separated string of integers. The order
    of the returned list is not defined.
    """
    idlist = ', '.join([repr(ident) for ident in self.lems[lem].formes])
    rows = self.cformes.execute('SELECT uces FROM uces where id IN (%s) ORDER BY id' % idlist)
    uceids = set()
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            uceids.add(cell)
        else:
            uceids.update([int(val) for val in cell.split()])
    return list(uceids)
def getlemucis(self, lem):
    """Return the distinct ``uci`` values of the uces containing a lemma.

    Uce ids come from ``getlemuces`` and are resolved with ``getucefromid``.
    The order of the returned list is not defined.
    """
    ucis = set()
    for uceid in self.getlemuces(lem):
        ucis.add(self.getucefromid(uceid).uci)
    return list(ucis)
179 def getlemuceseff(self, lem, luces = None) :
180 formesid = ', '.join([`val` for val in self.lems[lem].formes])
181 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
182 res = self.cformes.execute(query)
183 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
184 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
185 res = self.cformes.execute(query)
186 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
188 for i, uce in enumerate(uces) :
189 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster):
    """Count the uces of ``self.lc[cluster]`` that also contain the lemma."""
    members = set(self.lc[cluster])
    shared = members.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem):
    """Return the ``freq`` attribute stored for the lemma ``lem``."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid):
    """Return the entry of ``self.idformes`` for a form identifier.

    The index is built on first use by calling ``make_idformes``.
    """
    if self.idformes is not None:
        return self.idformes[formeid]
    self.make_idformes()
    return self.idformes[formeid]
def gettotocc(self):
    """Return the sum of the ``freq`` of every form in ``self.formes``."""
    total = 0
    for forme in self.formes:
        total += self.formes[forme].freq
    return total
def getucemean(self):
    """Return ``gettotocc()`` divided by ``getucenb()`` as a float."""
    occurrences = float(self.gettotocc())
    return occurrences / self.getucenb()
212 return self.ucis[-1].uces[-1].ident + 1
215 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return, for each uci, the summed size of its uces.

    The sizes come from ``getucesize`` and are sliced from the ident of the
    uci's first uce to the ident of its last uce (inclusive), so uce idents
    are used as positions in that list.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:(last + 1)]))
    return totals
def getucesize(self):
    """Return the number of whitespace-separated tokens in the second
    column of every row yielded by ``getalluces``."""
    sizes = []
    for row in self.getalluces():
        sizes.append(len(row[1].split()))
    return sizes
def getconcorde(self, uces):
    """Run and return the query selecting the rows of the ``uces`` table
    whose id is in ``uces``, ordered by id.

    The ids are inlined in the ``IN (...)`` clause; the return value is
    whatever ``self.cuces.execute`` returns.
    """
    idlist = ', '.join([repr(uceid) for uceid in uces])
    query = 'select * from uces where id IN (%s) ORDER BY id;' % idlist
    return self.cuces.execute(query)
228 def getuciconcorde(self, ucis) :
229 uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis]
230 uces = [[val[0], '\n'.join([row[1] for row in self.getconcorde(val[1])])] for val in uces]
def getwordconcorde(self, word):
    """Return ``getconcorde`` applied to the uce ids of ``word``."""
    uceids = self.getworduces(word)
    return self.getconcorde(uceids)
def getlemconcorde(self, lem):
    """Return ``getconcorde`` applied to the uce ids of the lemma ``lem``."""
    uceids = self.getlemuces(lem)
    return self.getconcorde(uceids)
def getalluces(self):
    """Run ``SELECT * FROM uces`` on ``self.cuces`` and return the result
    of ``execute``."""
    rows = self.cuces.execute('SELECT * FROM uces')
    return rows
def getallucis(self):
    """Return one ``[uci.ident, text]`` pair per uci.

    ``text`` joins, with newlines, the second column of the ``getalluces``
    rows belonging to the uci; uce idents are used as positions in the
    list of rows.
    """
    texts = [row[1] for row in self.getalluces()]
    result = []
    for uci in self.ucis:
        parts = [texts[uce.ident] for uce in uci.uces]
        result.append([uci.ident, '\n'.join(parts)])
    return result
def getucesfrometoile(self, etoile):
    """Return the idents of the uces of every uci whose ``etoiles`` contain
    ``etoile``, in corpus order."""
    selected = []
    for uci in self.ucis:
        for uce in uci.uces:
            if etoile in uci.etoiles:
                selected.append(uce.ident)
    return selected
249 def getetoileuces(self) :
250 log.info('get uces etoiles')
253 for uci in self.ucis :
254 etoiles = uci.etoiles[1:]
256 if et in etoileuces :
257 etoileuces[et] += [uce.ident for uce in uci.uces]
259 etoileuces[et] = [uce.ident for uce in uci.uces]
261 for et in uci.paras :
262 if et in etoileuces :
263 etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara]
265 etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara]
271 def getetoileucis(self):
273 for uci in self.ucis :
274 etoiles = uci.etoiles[1:]
276 if et in etoileuces :
277 etoileuces[et] += [uci.ident]
279 etoileuces[et] = [uci.ident]
def getucefromid(self, uceid):
    """Return the entry of ``self.iduces`` for a uce id.

    The index is built on first use by calling ``make_iduces``.
    """
    if self.iduces is not None:
        return self.iduces[uceid]
    self.make_iduces()
    return self.iduces[uceid]
def gethapaxnb(self):
    """Return the number of forms in ``self.formes`` whose ``freq`` is 1."""
    count = 0
    for forme in self.formes:
        if self.formes[forme].freq == 1:
            count += 1
    return count
def getactivesnb(self, key):
    """Return the number of lemmas in ``self.lems`` whose ``act`` equals
    ``key``."""
    count = 0
    for lem in self.lems:
        if self.lems[lem].act == key:
            count += 1
    return count
291 # def make_lems(self, lem = True) :
292 # log.info('make lems')
294 # for forme in self.formes :
295 # if self.formes[forme].lem in self.lems :
296 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
297 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
299 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the ``etoiles`` of the uci that owns the uce ``uceid``.

    A uce-ident -> uci-ident map is cached in ``self.uceuci`` on first use;
    the uci ident is then used as a position in ``self.ucis``.
    """
    if self.uceuci is None:
        owners = []
        for uci in self.ucis:
            for uce in uci.uces:
                owners.append((uce.ident, uci.ident))
        self.uceuci = dict(owners)
    uciid = self.uceuci[uceid]
    return self.ucis[uciid].etoiles
305 def make_lems(self, lem = True) :
306 log.info('make lems')
309 for forme in self.formes :
310 if self.formes[forme].lem in self.lems :
311 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
312 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
314 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
316 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
318 def make_lems_from_dict(self, dictionnaire, dolem = True) :
319 log.info('make lems from dict')
321 for forme in self.formes :
322 if self.formes[forme].forme in dictionnaire :
323 lem = dictionnaire[forme][0]
324 gram = dictionnaire[forme][1]
325 elif forme.isdigit() :
331 self.formes[forme].lem = lem
332 self.formes[forme].gram = gram
334 if self.formes[forme].lem in self.lems :
335 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
336 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
338 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
340 self.lems[forme] = Lem(self, self.formes[forme])
def make_idformes(self):
    """Build ``self.idformes``, mapping each form's ``ident`` to the form
    object stored in ``self.formes``."""
    pairs = []
    for forme in self.formes:
        word = self.formes[forme]
        pairs.append((word.ident, word))
    self.idformes = dict(pairs)
def make_iduces(self):
    """Build ``self.iduces`` (uce ident -> uce object) from ``self.ucis``.

    Does nothing when the index already exists.
    """
    if self.iduces is not None:
        return
    pairs = []
    for uci in self.ucis:
        for uce in uci.uces:
            pairs.append((uce.ident, uce))
    self.iduces = dict(pairs)
349 def make_lexitable(self, mineff, etoiles, gram = 0) :
354 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams]
355 etuces = [[] for et in etoiles]
356 for uci in self.ucis :
357 get = list(set(uci.etoiles).intersection(etoiles))
359 log.info('2 variables sur une ligne')
361 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
362 etuces = [set(val) for val in etuces]
365 deff = self.getlemuceseff(lem)
367 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
368 tab.insert(0, [''] + etoiles)
371 def make_tgen_table(self, tgen, etoiles, tot = None):
372 lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
373 sets = [set(cl) for cl in lclasses]
374 totoccurrences = dict([[val, 0] for val in etoiles])
376 for forme in self.formes :
377 formeuceeff = self.getformeuceseff(forme)
378 for i, classe in enumerate(lclasses) :
379 concern = sets[i].intersection(formeuceeff.keys())
381 totoccurrences[etoiles[i]] += sum([formeuceeff[uce] for uce in concern])
382 #tgenoccurrences = dict([[val, 0] for val in etoiles])
385 tgenoccurrences[t] = dict([[val, 0] for val in etoiles])
387 lemuceeff = self.getlemuceseff(lem)
388 for i, classe in enumerate(lclasses) :
389 concern = sets[i].intersection(lemuceeff.keys())
391 tgenoccurrences[t][etoiles[i]] += sum([lemuceeff[uce] for uce in concern])
392 return tgenoccurrences, totoccurrences
394 def make_efftype_from_etoiles(self, etoiles) :
396 etuces = [[] for et in etoiles]
397 for uci in self.ucis :
398 get = list(set(uci.etoiles).intersection(etoiles))
400 return '2 variables sur la meme ligne'
402 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
403 etuces = [set(val) for val in etuces]
404 for lem in self.lems :
405 deff = self.getlemuceseff(lem)
407 gram = self.lems[lem].gram
409 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
411 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
412 tabout = [[gram] + dtype[gram] for gram in dtype]
413 tabout.insert(0, [''] + etoiles)
416 def make_uceactsize(self, actives) :
417 res = self.getalluces()
420 deff = self.getlemuceseff(lem)
422 ucesize[uce] = ucesize.get(uce, 0) + 1
425 def make_uc(self, actives, lim1, lim2) :
426 uceactsize = self.make_uceactsize(actives)
432 for uce in [uce for uci in self.ucis for uce in uci.uces] :
433 if uce.para == lastpara :
435 last1 += uceactsize.get(uce.ident,0)
436 uc1[-1].append(uce.ident)
438 uc1.append([uce.ident])
441 last2 += uceactsize.get(uce.ident, 0)
442 uc2[-1].append(uce.ident)
444 uc2.append([uce.ident])
447 last1 = uceactsize.get(uce.ident, 0)
448 last2 = uceactsize.get(uce.ident, 0)
450 uc1.append([uce.ident])
451 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build two uc groupings and write their matrices and membership lists.

    ``make_uc`` returns the two groupings (lists of lists of uce ids).
    Each grouping is passed to ``write_ucmatrix`` with its output path, then
    a ``uce;uc`` listing is written for each: a header line followed by one
    ``<uce>;<group index>`` line per uce, without a trailing newline.

    Returns the pair ``(len(uc1), len(uc2))``.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    for groups, matrixout in ((uc1, uc1out), (uc2, uc2out)):
        self.write_ucmatrix(groups, actives, matrixout)
    # Build both listings before touching the listing files.
    listings = []
    for groups in (uc1, uc2):
        rows = [['uce', 'uc']]
        for rank, members in enumerate(groups):
            for uce in members:
                rows.append([repr(uce), repr(rank)])
        listings.append(rows)
    for path, rows in zip((listuce1out, listuce2out), listings):
        with open(path, 'w') as f:
            f.write('\n'.join([';'.join(row) for row in rows]))
    return len(uc1), len(uc2)
467 def write_ucmatrix(self, uc, actives, fileout) :
468 log.info('write uc matrix %s' % fileout)
469 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
472 with open(fileout + '~', 'w+') as f :
473 for i, lem in enumerate(actives) :
474 for uce in self.getlemuces(lem):
475 if (uces_uc[uce], i) not in deja_la :
477 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
478 deja_la[(uces_uc[uce], i)] = 0
480 with open(fileout, 'w') as ffin :
481 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
484 os.remove(fileout + '~')
487 def export_corpus(self, outf) :
488 #outf = 'export_corpus.txt'
490 res = self.getalluces()
494 with open(outf,'w') as f :
496 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
497 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
498 elif self.iduces[uce[0]].uci != actuci :
499 actuci = self.iduces[uce[0]].uci
500 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
501 actpara = self.iduces[uce[0]].para
502 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
505 actpara = self.iduces[uce[0]].para
506 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
507 elif self.iduces[uce[0]].para != actpara :
508 actpara = self.iduces[uce[0]].para
510 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
512 def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) :
514 for i, lc in enumerate(self.lc) :
517 for uce in self.lc0 :
520 res = self.getalluces()
523 res = self.getallucis()
524 with open(outf, 'w') as f :
528 actuci = self.iduces[uce[0]].uci
532 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
534 etline = ' '.join(self.ucis[actuci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
536 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[actuci].etoiles[1:]])
537 f.write(etline.encode(self.parametres['syscoding']) + '\n')
538 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
540 def export_classe(self, outf, classe, lem = False, uci = False) :
541 sts = self.lc[classe - 1]
543 res = self.getconcorde(sts)
546 res = self.getuciconcorde(sts)
547 with open(outf, 'w') as f :
551 f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n')
553 f.write(' '.join(self.ucis[uce[0]].etoiles).encode(self.parametres['syscoding']) + '\n')
555 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
556 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
558 def export_owledge(self, rep, classe, lem = False, uci = False) :
559 sts = self.lc[classe - 1]
561 res = self.getconcorde(sts)
564 res = self.getuciconcorde(sts)
568 outf = '.'.join([`ident`, 'txt'])
569 outf = os.path.join(rep, outf)
571 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
572 with open(outf, 'w') as f :
573 f.write(guce.encode('cp1252', errors = 'replace'))
575 def export_tropes(self, fileout, classe, lem = False, uci = False) :
576 sts = self.lc[classe - 1]
578 res = self.getconcorde(sts)
581 res = self.getuciconcorde(sts)
582 with open(fileout, 'w') as f :
586 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
587 f.write(guce.encode('cp1252', errors = 'replace'))
590 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
591 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
593 with open(outfile + '~', 'w+') as f :
594 for i, lem in enumerate(actives) :
595 for uce in sorted(self.getlemuces(lem)) :
597 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
599 with open(outfile, 'w') as ffin :
600 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
603 os.remove(outfile + '~')
605 with open(listuce, 'w') as f :
606 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
608 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
609 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
611 with open(outfile + '~', 'w+') as f :
612 for i, lem in enumerate(actives) :
613 for uci in sorted(self.getlemucis(lem)) :
615 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
617 with open(outfile, 'w') as ffin :
618 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
621 os.remove(outfile + '~')
623 with open(listuci, 'w') as f :
624 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
626 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
627 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
629 duces = dict([[uce, i] for i, uce in enumerate(uces)])
630 with open(outfile + '~', 'w+') as f :
631 for i, lem in enumerate(actives) :
632 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
634 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
636 with open(outfile, 'w') as ffin :
637 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
640 os.remove(outfile + '~')
642 def make_table_with_classe(self, uces, list_act, uci = False) :
643 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
644 uces = dict([[uce, i] for i, uce in enumerate(uces)])
646 getlem = self.getlemucis
648 getlem = self.getlemuces
649 for i, lem in enumerate(list_act) :
650 lemuces = list(set(getlem(lem)).intersection(uces))
652 table_uce[uces[uce]][i] = 1
653 table_uce.insert(0, list_act)
656 def make_pondtable_with_classe(self, uces, list_act) :
657 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
658 uces = dict([[uce, i] for i, uce in enumerate(uces)])
659 for i, lem in enumerate(list_act) :
660 uceseff = self.getlemuceseff(lem)
661 lemuces = list(set(uceseff.keys()).intersection(uces))
663 table_uce[uces[uce]][i] = uceseff[uce]
664 table_uce.insert(0, list_act)
667 def parse_active(self, gramact, gramsup = None) :
668 log.info('parse actives')
669 for lem in self.lems :
670 if lem.startswith('_') and lem.endswith('_') :
671 self.lems[lem].act = 2
672 elif self.lems[lem].gram in gramact :
673 self.lems[lem].act = 1
674 elif gramsup is not None and self.lems[lem].gram not in gramact:
675 if self.lems[lem].gram in gramsup :
676 self.lems[lem].act = 2
678 self.lems[lem].act = 0
680 self.lems[lem].act = 2
682 def make_actives_limit(self, limit, key = 1) :
683 if self.idformes is None :
685 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
687 def make_actives_nb(self, nbmax, key) :
688 log.info('make_actives_nb : %i - %i' % (nbmax,key))
689 if self.idformes is None :
691 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
692 self.activenb = len(allactives)
693 allactives = sorted(allactives, reverse = True)
694 if self.activenb == 0 :
696 if len(allactives) <= nbmax :
697 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
698 return [val[1] for val in allactives], allactives[-1][0]
700 effs = [val[0] for val in allactives]
701 if effs.count(effs[nbmax - 1]) > 1 :
702 lim = effs[nbmax - 1] + 1
706 stop = effs.index(lim)
713 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
714 return [val[1] for val in allactives[0:stop + 1]], lim
716 def make_and_write_profile(self, actives, ucecl, fileout, uci = False) :
717 log.info('formes/classes')
719 tab = [[lem] + [len(set(self.getlemucis(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
721 tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
722 tab = [[line[0]] + [`val` for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
723 with open(fileout, 'w') as f :
724 f.write('\n'.join([';'.join(line) for line in tab]).encode(self.parametres['syscoding']))
726 def make_etoiles(self) :
728 for uci in self.ucis :
729 etoiles.update(uci.etoiles[1:])
732 def make_themes(self):
734 for uci in self.ucis :
735 themes.update(uci.paras)
738 def make_etoiles_dict(self) :
739 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
741 for etoile in etoiles :
742 et = etoile.split('_')
745 endet = '_'.join(et[1:])
746 if etoile in det[et[0]] :
747 det[et[0]][etoile] += 1
749 det[et[0]][etoile] = 1
754 endet = '_'.join(et[1:])
755 det[et[0]] = {etoile :1}
760 def make_etline(self, listet) :
761 etuces = [[] for et in listet]
762 for uci in self.ucis :
763 get = list(set(uci.etoiles).intersection(listet))
765 return '2 variables sur la meme ligne'
767 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
770 def make_and_write_profile_et(self, ucecl, fileout, uci = False) :
771 log.info('etoiles/classes')
773 etoileuces = self.getetoileuces()
775 etoileuces = self.getetoileucis()
776 etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1])
777 with open(fileout, 'w') as f :
778 f.write('\n'.join([';'.join([et] + [`len(set(etoileuces[et]).intersection(classe))` for classe in ucecl]) for et in etoileuces]).encode(self.parametres['syscoding']))
779 #etoiles = self.make_etoiles()
780 #with open(fileout, 'w') as f :
781 # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
783 def make_colored_corpus(self, uci = False) :
785 for i, lc in enumerate(self.lc) :
788 for uce in self.lc0 :
790 color = ['black'] + colors[len(self.lc) - 1]
792 <meta http-equiv="content-Type" content="text/html; charset=%s" />
794 ''' % sys.getdefaultencoding()
796 res = self.getalluces()
801 if self.iduces[uce[0]].uci != actuci :
802 actuci = self.iduces[uce[0]].uci
803 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
804 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
806 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
808 res = self.getallucis()
811 if self.ucis[uce[0]].ident != actuci :
812 actuci = self.ucis[uce[0]].ident
813 txt += '<br><hr>' + ' '.join(self.ucis[self.ucis[uce[0]].ident].etoiles) + '<br><br>'
814 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
816 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
817 return txt + '\n</body></html>'
819 def count_from_list(self, l, d) :
827 def count_from_list_cl(self, l, d, a, clnb) :
836 def find_segments(self, taille_segment, taille_limite) :
838 for uce in self.getalluces() :
840 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
841 l = [[d[val], val] for val in d if d[val] >= 3]
844 if len(l) > taille_limite :
845 l = l[-taille_limite:]
848 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite, uci = False):
851 concorde = self.getconcorde
853 concorde = self.getuciconcorde
854 for uce in concorde(list_uce) :
856 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
857 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
860 if len(l) > taille_limite :
861 l = l[-taille_limite:]
864 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
866 for b, classe in enumerate(self.lc) :
867 for uce in self.getconcorde(classe) :
870 uce = [self.formes[forme].lem for forme in uce]
871 for taille_segment in range(lenmin,lenmax) :
872 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
873 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
874 with open(fileout, 'w') as f :
875 f.write('\n'.join([';'.join(line) for line in result]))
877 def make_proftype(self, outf) :
879 for lem in self.lems :
880 gram = self.lems[lem].gram
882 res[gram] = [0 for val in self.lc]
883 lemuceeff = self.getlemuceseff(lem)
884 for i, classe in enumerate(self.lc) :
885 concern = set(classe).intersection(lemuceeff.keys())
886 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
887 res = [[gram] + [`val` for val in res[gram]] for gram in res]
889 with open(outf, 'w') as f :
890 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
893 def make_ucecl_from_R(self, filein) :
894 with open(filein, 'rU') as f :
899 line = line.replace('\n', '').replace('"', '').split(';')
900 self.lc.append([int(line[0]) - 1, int(line[1])])
901 classesl = [val[1] for val in self.lc]
903 self.lc = sorted(self.lc, key=itemgetter(1))
904 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
905 self.lc0 = self.lc.pop(0)
908 def get_stat_by_cluster(self, outf, lclasses = None) :
909 log.info('get_stat_by_cluster')
910 if lclasses is None :
913 occurrences = dict([[i + 1, 0] for i in range(len(lclasses))])
914 formescl = dict([[i + 1, 0] for i in range(len(lclasses))])
915 hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))])
916 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)])
917 sets = [set(cl) for cl in lclasses]
918 for forme in self.formes :
919 formeuceeff = self.getformeuceseff(forme)
920 for i, classe in enumerate(lclasses) :
921 concern = sets[i].intersection(formeuceeff.keys())
923 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
925 if self.formes[forme].freq == 1 :
927 log.info('%f' % (time() - t1))
928 if outf is not None :
929 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
930 with open(outf, 'w') as f :
933 return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences]
def get_stat_by_et(self, outf, etoiles):
    """Return the per-cluster statistics computed for each etoile.

    For every etoile the matching uce ids are collected with
    ``getucesfrometoile``; the resulting groups are passed to
    ``get_stat_by_cluster`` (with no output file), and each returned row is
    prefixed with its etoile.

    ``outf`` is accepted for interface compatibility but is not used here.

    The table used to be built and then dropped (the function returned
    ``None``); it is now returned so the computation is usable. Callers
    that ignored the return value are unaffected.
    """
    lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
    stats = self.get_stat_by_cluster(None, lclasses)
    return [[etoiles[i]] + val for i, val in enumerate(stats)]
940 def gethapaxbyet(self, etoiles) :
941 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
943 for uce in hapaxuces :
944 if uce in hucesdict :
948 etuces = [[] for et in etoiles]
949 for uci in self.ucis :
950 get = list(set(uci.etoiles).intersection(etoiles))
952 return '2 variables sur la meme ligne'
954 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
955 etuces = [set(val) for val in etuces]
956 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
958 def gethapaxuces(self) :
959 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
960 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
962 for i,uce in enumerate(hapaxuces) :
963 if uce in hucesdict :
964 hucesdict[uce][0] += 1
965 hucesdict[uce][1].append(hapax[i])
967 hucesdict[uce] = [1,[hapax[i]]]
969 for uce in hucesdict :
970 if hucesdict[uce][0] in huces :
971 huces[hucesdict[uce][0]].append(uce)
973 huces[hucesdict[uce][0]] = [uce]
974 huces = zip(huces, huces.values())
975 huces.sort(reverse=True)
979 for nb in huces[0:4] :
980 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
982 res = self.getconcorde([uce])
984 ucetxt = ' ' + row[1] + ' '
986 for hap in hucesdict[uce][1] :
987 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
988 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
989 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
990 txt += '<p>'+ucetxt+'</p>\n'
994 with open('/tmp/testhapxuce.html','w') as f :
def export_dictionary(self, fileout, syscoding):
    """Write the forms of the corpus to ``fileout``, one per line.

    Each line holds the form, its lemma, its gram type and the repr of its
    frequency, separated by tabs. Lines are sorted in descending order of
    ``[freq, forme, lem, gram]`` and the text is encoded with ``syscoding``
    before being written.
    """
    entries = []
    for forme in self.formes:
        word = self.formes[forme]
        entries.append([word.freq, forme, word.lem, word.gram])
    entries.sort(reverse = True)
    # move the frequency to the last column, as text
    rows = [entry[1:] + [repr(entry[0])] for entry in entries]
    with open(fileout, 'w') as f:
        text = '\n'.join(['\t'.join(row) for row in rows])
        f.write(text.encode(syscoding))
1004 def export_lems(self, fileout, syscoding) :
1005 self.make_idformes()
1006 listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems]
1008 with open(fileout, 'w') as f :
1009 f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding))
1014 def __init__(self, corpus) :
1015 ucinb = corpus.getucinb()
1016 ucisize = corpus.getucisize()
1017 ucimean = float(sum(ucisize))/float(ucinb)
1018 detoile = corpus.make_etoiles_dict()
1021 def __init__(self, iduci, line, paraset = None) :
1023 self.etoiles = line.split()
1025 if paraset is not None :
1026 self.paras = paraset.split()
1031 def __init__(self, iduce, idpara, iduci) :
1037 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
1040 self.gram = gramtype
1043 if freq is not None :
def __init__(self, parent, forme) :
    """Initialise the object from a first form.

    parent -- accepted but not used here
    forme  -- object providing ``ident``, ``freq``, ``gram`` and ``act``

    ``self.formes`` maps form identifiers to their frequency; the
    grammatical type, frequency and ``act`` value are copied from the
    given form.
    """
    first_freq = forme.freq
    self.formes = dict([(forme.ident, first_freq)])
    self.gram = forme.gram
    self.freq = first_freq
    self.act = forme.act
def add_forme(self, forme) :
    """Register one more form and add its frequency to the total.

    forme -- object providing ``ident`` and ``freq``
    """
    count = forme.freq
    self.formes.update({forme.ident : count})
    self.freq = self.freq + count
1059 def decouperlist(chaine, longueur, longueurOptimale) :
1061 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
1062 Si on trouve un '$', c'est fini.
1063 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
1065 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
1066 dsep = dict([[val[0],val[1]] for val in separateurs])
1067 trouve = False # si on a trouvé un bon séparateur
1068 iDecoupe = 0 # indice du caractere ou il faut decouper
1070 longueur = min(longueur, len(chaine) - 1)
1071 chaineTravail = chaine[:longueur + 1]
1073 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
1076 indice = chaineTravail.index(u'$')
1078 iDecoupe = indice - 1
1083 caractere = chaineTravail[nbCar]
1084 distance = abs(longueurOptimale - nbCar) + 1
1085 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
1086 if caractere in dsep :
1087 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
1088 meilleur[0] = caractere
1089 meilleur[1] = dsep[caractere]
1094 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
1096 meilleur[1] = dsep[' ']
1103 #if meilleur[0] != ' ' :
1104 # fin = chaine[iDecoupe + 1:]
1105 # retour = chaineTravail[:iDecoupe]
1107 fin = chaine[iDecoupe + 1:]
1108 retour = chaineTravail[:iDecoupe + 1]
1109 return len(retour) > 0, retour, fin
1110 # si on a rien trouvé
1111 return False, chaine, ''
1113 def testetoile(line) :
1114 return line.startswith(u'****')
1117 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt) :
    """Split ``txt`` on whitespace and append a final ``u'$'`` item.

    Returns a new list: the whitespace-separated tokens of ``txt``
    followed by the single string ``u'$'``.
    """
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
1122 def prep_txtcharact(txt) :
1127 Class for building a corpus
1129 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
1130 log.info('begin building corpus...')
1131 self.lexique = lexique
1132 self.expressions = expressions
1134 self.corpus = Corpus(self, parametres_corpus)
1135 self.infile = infile
1137 self.lim = parametres_corpus.get('lim', 1000000)
1138 self.encoding = parametres_corpus['encoding']
1139 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
1140 self.corpus.pathout.createdir(parametres_corpus['pathout'])
1141 self.corpus.parametres['uuid'] = str(uuid4())
1142 self.corpus.parametres['corpus_name'] = parametres_corpus['corpus_name']#os.path.split(self.corpus.parametres['pathout'])[1]
1143 self.corpus.parametres['type'] = 'corpus'
1144 if self.corpus.parametres['keep_ponct'] :
1145 self.ponctuation_espace = [' ', '']
1147 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
1149 self.tolist = self.corpus.parametres.get('tolist', 0)
1156 def prep_makeuce(self) :
1157 method = self.corpus.parametres.get('ucemethod', 0)
1159 self.decouper = decouperlist
1160 self.prep_txt = prep_txtlist
1161 self.ucesize = self.corpus.parametres.get('ucesize', 40)
1163 self.decouper = decoupercharact
1164 self.prep_txt = prep_txtcharact
1165 self.ucesize = self.corpus.parametres.get('ucesize', 240)
1166 log.info('method uce : %s' % method)
1171 self.read_corpus(self.infile)
1172 except Warning, args :
1173 log.info('pas kool %s' % args)
1177 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
1178 self.time = time() - t1
1180 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
1181 log.info('time : %f' % (time() - t1))
1184 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
1185 self.cf = self.conn_f.cursor()
1186 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1187 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
1188 self.conn_f.commit()
1189 self.cf = self.conn_f.cursor()
1190 self.cf.execute('PRAGMA temp_store=MEMORY;')
1191 self.cf.execute('PRAGMA journal_mode=MEMORY;')
1192 self.cf.execute('PRAGMA synchronous = OFF;')
1193 self.cf.execute('begin')
1194 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
1195 self.c = self.conn.cursor()
1196 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1198 self.c = self.conn.cursor()
1199 self.c.execute('PRAGMA temp_store=MEMORY;')
1200 self.c.execute('PRAGMA journal_mode=MEMORY;')
1201 self.c.execute('PRAGMA synchronous = OFF;')
1202 self.c.execute('begin')
1205 #commit index and close db
1207 self.conn_f.commit()
1208 self.cf.execute('CREATE INDEX iduces ON uces (id);')
1209 self.cf.execute('CREATE INDEX ideff ON eff (id);')
1213 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
1214 self.ccorpus = self.conn_corpus.cursor()
1215 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
1216 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
1217 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
1218 self.conn_corpus.commit()
1219 self.ccorpus = self.conn_corpus.cursor()
1220 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
1221 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
1222 self.ccorpus.execute('PRAGMA synchronous = OFF;')
1223 self.ccorpus.execute('begin')
1224 self.backup_corpus()
1225 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
1226 self.conn_corpus.commit()
1227 self.conn_corpus.close()
1228 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self) :
    """Fill ``self.cleans`` with the cleaning callables enabled in the corpus parameters.

    Callables are appended in a fixed order: lower-casing, first clean,
    character filtering, expressions, apostrophes, hyphens. Every option
    except 'charact' is treated as enabled when it is absent from the
    parameters; 'charact' must be present (a missing key raises KeyError).
    When character filtering is enabled, ``self.rule`` is set from the
    'keep_caract' parameter or from the built-in default rule.
    """
    params = self.corpus.parametres
    steps = self.cleans
    if params.get('lower', 1) :
        steps.append(self.dolower)
    if params.get('firstclean', 1) :
        steps.append(self.firstclean)
    if params['charact'] :
        self.rule = params.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        steps.append(self.docharact)
    # The remaining options all follow the same "enabled by default" pattern.
    trailing = (('expressions', 'make_expression'), ('apos', 'doapos'), ('tiret', 'dotiret'))
    for option, cleaner_name in trailing :
        if params.get(option, 1) :
            steps.append(getattr(self, cleaner_name))
1245 def make_expression(self,txt) :
1246 for expression in self.expressions:
1247 if expression in txt :
1248 txt = txt.replace(expression, self.expressions[expression][0])
1251 def dolower(self, txt) :
def docharact(self, txt) :
    """Replace each run of characters matched by ``self.rule`` with one space.

    ``self.rule`` is placed verbatim inside a ``[...]+`` character class,
    so a rule that begins with ``^`` names the characters left untouched.
    """
    #rule = u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_-"
    pattern = re.compile(u"[%s]+" % self.rule)
    return pattern.sub(' ', txt)
def doapos(self, txt) :
    """Return ``txt`` with every straight apostrophe (') replaced by a space."""
    pieces = txt.split(u'\'')
    return u' '.join(pieces)
def dotiret(self, txt) :
    """Return ``txt`` with every hyphen (-) replaced by a space."""
    pieces = txt.split(u'-')
    return u' '.join(pieces)
def firstclean(self, txt) :
    """Normalise a few characters and surround punctuation with spaces.

    The substitutions are applied one after the other, in the order listed
    below. The three-dot sequence is handled before the single dot so that
    it becomes the ``£$£`` marker instead of three separate dots.
    """
    substitutions = (
        (u'’', "'"),
        (u'œ', u'oe'),
        ('...', u' £$£ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        (u'…', u' £$£ '),
    )
    for old, new in substitutions :
        txt = txt.replace(old, new)
    return txt
1270 def make_cleans(self, txt) :
1271 for clean in self.cleans :
1275 def backup_uce(self) :
1276 if self.corpus.idformesuces != {} :
1277 log.info('backup %i' % len(self.corpus.idformesuces))
1278 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
1279 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
1280 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
1281 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
1282 self.corpus.idformesuces = {}
1285 def backup_corpus(self) :
1286 log.info('start backup corpus')
1288 for uci in self.corpus.ucis :
1289 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
1290 for uce in uci.uces :
1291 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
1292 for forme in self.corpus.formes :
1293 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
1294 log.info('%f' % (time() - t))
def dofinish(self) :
    """Store the final summary values in the corpus parameters.

    Writes these keys into ``self.corpus.parametres``: 'date' (current
    date and time), 'time' (``self.time`` seconds rendered as hours,
    minutes and seconds), 'ucinb', 'ucenb', 'occurrences', 'formesnb' and
    'hapax' (hapax count with its share of the forms and of the
    occurrences).
    """
    corpus = self.corpus
    params = corpus.parametres
    params['date'] = datetime.datetime.now().ctime()
    # Break the elapsed seconds down into h / m / s.
    total_minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(total_minutes, 60)
    params['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
    params['ucinb'] = corpus.getucinb()
    params['ucenb'] = corpus.getucenb()
    params['occurrences'] = corpus.gettotocc()
    formes_count = len(corpus.formes)
    params['formesnb'] = formes_count
    hapaxnb = corpus.gethapaxnb()
    share_of_formes = (float(hapaxnb) / formes_count) * 100
    share_of_occurrences = (float(hapaxnb) / params['occurrences']) * 100
    summary = (hapaxnb, share_of_formes, share_of_occurrences)
    params['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % summary
1310 class BuildSubCorpus(BuildCorpus):
1311 def __init__(self, corpus, parametres, dlg = None) :
1313 log.info('begin subcorpus...')
1317 self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : corpus.parametres['originalpath'], 'encoding' : corpus.parametres['encoding']})
1319 self.encoding = corpus.parametres['encoding']
1320 self.corpus.parametres['corpus_name'] = parametres['corpus_name']
1321 self.corpus.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = parametres['pathout'])
1322 self.corpus.pathout.createdir(parametres['pathout'])
1323 self.corpus.parametres['pathout'] = parametres['pathout']
1324 self.corpus.parametres['meta'] = parametres.get('meta', False)
1325 self.corpus.parametres['uuid'] = str(uuid4())
1326 if parametres.get('frommeta', False) :
1327 print 'make subtexts'
1328 self.corpus.ucis = [CopyUci(uci) for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()]
1329 elif parametres.get('fromtheme', False) :
1330 print 'make subtexts from theme'
1332 for uci in self.ori.ucis :
1333 if uci.paras != [] :
1336 for et in uci.paras :
1337 if et in parametres['meta'] :
1338 newuce += [CopyUce(uce) for uce in uci.uces if uce.para == idpara]
1344 nuci.paras = newpara
1345 self.corpus.ucis.append(nuci)
1348 elif parametres.get('fromcluster', False) :
1350 elif parametres.get('fromuceids', False) :
1352 dictucekeep = dict(zip(parametres['uceids'], parametres['uceids']))
1354 for uci in self.ori.ucis :
1355 if uci.paras == [] :
1356 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1359 nuci.uces = keepuces
1360 self.corpus.ucis.append(nuci)
1365 for et in uci.paras :
1366 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1374 nuci.paras = newpara
1375 self.corpus.ucis.append(nuci)
1381 def read_corpus(self, infile = None):
1382 self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces]
1388 print 'redo text, para and st ident'
1389 for uci in self.corpus.ucis :
1390 uci.ident = ident_uci
1392 for uce in uci.uces :
1394 if uce.para != lastpara :
1397 uce.para = ident_para
1399 uce.para = ident_para
1400 newuceident[uce.ident] = ident_uce
1401 uce.ident = ident_uce
1403 print 'backup st text and forms'
1404 for row in self.ori.getconcorde(self.olduceid) :
1405 self.c.execute('INSERT INTO uces VALUES(?,?);', (`newuceident[row[0]]`, row[1]))
1406 for word in row[1].split() :
1407 self.corpus.add_word_from_forme(self.ori.formes[word], newuceident[row[0]])
1411 class BuildFromAlceste(BuildCorpus) :
1412 def read_corpus(self, infile) :
1413 if self.dlg is not None :
1414 self.dlg.Pulse('textes : 0 - segments : 0')
1417 if self.corpus.parametres['ucimark'] == 0 :
1418 self.testuci = testetoile
1419 elif self.corpus.parametres['ucimark'] == 1 :
1420 self.testuci = testint
1426 with codecs.open(infile, 'r', self.encoding) as f :
1427 for linenb, line in enumerate(f) :
1428 line = line.rstrip('\n\r')#FIXME .lstrip(codecs.BOM).lstrip(codecs.BOM_UTF8)
1429 if self.testuci(line) :
1432 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1434 self.corpus.ucis.append(Uci(iduci, line))
1437 if self.corpus.ucis[-1].uces == [] :
1438 log.info(u'Empty text : %i' % linenb)
1440 self.corpus.ucis.pop()
1441 self.corpus.ucis.append(Uci(iduci, line))
1442 if self.dlg is not None :
1443 if not (iduci + 1) % 10 :
1444 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
1445 elif line.startswith(u'-*') :
1448 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1451 self.corpus.ucis[-1].paras.append(line.split()[0])
1453 raise Exception('paragrapheOT %i' % linenb)
1454 elif line.strip() != '' and iduci != -1 :
1456 if txt != [] and iduci != -1 :
1457 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1462 self.corpus.ucis.pop()
1463 log.info(Exception("Empty text %i" % linenb))
1465 raise Exception('EmptyText %i' % linenb)
1466 if iduci != -1 and iduce != -1:
1469 log.info(_(u"No Text in corpus. Are you sure of the formatting ?"))
1470 raise Exception('TextBeforeTextMark %i' % linenb)
1471 except UnicodeDecodeError :
1472 raise Exception("CorpusEncoding")
1474 def treattxt(self, txt, iduce, idpara, iduci) :
1475 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1476 txt = 'laphrasepoursplitter'.join(txt)
1477 txt = self.make_cleans(txt)
1478 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1479 ucetxt = txt.split('laphrasepoursplitter')
1482 txt = self.make_cleans(txt)
1483 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1484 if self.corpus.ucis[-1].paras == [] :
1488 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1489 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1490 if not self.tolist :
1496 self.corpus.add_word(word)
1497 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
1498 if self.last > self.lim :
1501 return iduce, idpara
1503 def make_uces(self, txt, douce = True, keep_ponct = False) :
1504 txt = ' '.join(txt.split())
1507 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1509 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1512 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1513 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1518 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1520 #decouper (list_sep)
1521 #make_uces (decouper)
1522 #treat_txt (make_uces)
1526 def __init__(self, parent, dlg = None) :
1527 self.parent = parent
1529 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1530 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1531 parametres['corpus_name'] = os.path.split(parametres['pathout'])[1]
1532 dial = CorpusPref(parent, parametres)
1533 dial.CenterOnParent()
1534 dial.txtpath.SetLabel(parent.filename)
1535 #dial.repout_choices.SetValue(parametres['pathout'])
1536 self.res = dial.ShowModal()
1537 if self.res == 5100 :
1538 parametres = dial.doparametres()
1539 parametres['originalpath'] = parent.filename
1540 PathOut().createdir(parametres['pathout'])
1541 ReadLexique(self.parent, lang = parametres['lang'])
1542 if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
1543 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1545 self.parent.expressions = {}
1546 self.parametres = parametres
1548 if self.dlg is not None :
def doanalyse(self) :
    """Build the corpus with BuildFromAlceste and return it.

    The builder receives the parent's file name, the stored parameters,
    the parent's lexicon and expressions, and the dialog held in
    ``self.dlg``; its ``corpus`` attribute is returned.
    """
    owner = self.parent
    builder = BuildFromAlceste(owner.filename, self.parametres, owner.lexique, owner.expressions, dlg = self.dlg)
    return builder.corpus
1556 def __init__(self, parent, corpus, parametres = None, dlg = None):
1557 self.parent = parent
1560 corpus_name = 'Sub' + corpus.parametres['corpus_name']
1561 if dlg is not None :
1562 busy = wx.BusyInfo(_("Please wait...").decode('utf8'), self)
1564 parametres['corpus_name'] = corpus_name
1565 if parametres.get('frommeta', False) :
1566 parametres['meta'] = corpus.make_etoiles()
1567 elif parametres.get('fromtheme', False) :
1568 parametres['meta'] = corpus.make_themes()
1570 parametres['meta'] = []
1571 parametres['meta'].sort()
1572 if dlg is not None :
1574 dial = SubTextFromMetaDial(parent, parametres)
1575 self.res = dial.ShowModal()
1576 if self.res == 5100 :
1577 if dial.subcorpusname.GetValue() != '' :
1578 corpus_name = ''.join([l for l in dial.subcorpusname.GetValue() if l.isalnum() or l in ['_']])
1579 if corpus_name != '' :
1580 parametres['corpus_name'] = corpus_name
1582 parametres['corpus_name'] = 'Sub' + corpus.parametres['corpus_name']
1583 pathout = os.path.join(corpus.parametres['pathout'], parametres['corpus_name'])
1585 while os.path.exists(pathout + '_%i' % i) :
1587 parametres['pathout'] = pathout + '_%i' % i
1588 meta = dial.m_listBox1.GetSelections()
1589 parametres['meta'] = [parametres['meta'][val] for val in meta]
1590 self.parametres = parametres
def doanalyse(self):
    """Build the sub-corpus with BuildSubCorpus and return it.

    The builder receives ``self.ori``, the stored parameters and the
    dialog held in ``self.dlg``; its ``corpus`` attribute is returned.
    """
    builder = BuildSubCorpus(self.ori, parametres = self.parametres, dlg = self.dlg)
    return builder.corpus
1598 if __name__ == '__main__' :
1600 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : 'utf8'}
1601 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)