1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique
16 from operator import itemgetter
17 from uuid import uuid4
18 from chemins import PathOut
19 from dialog import CorpusPref
20 from colors import colors
24 log = logging.getLogger('iramuteq.corpus')
27 def copycorpus(corpus) :
28 log.info('copy corpus')
29 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
30 copy_corpus.ucis = corpus.ucis
31 copy_corpus.formes = corpus.formes
32 copy_corpus.pathout = corpus.pathout
33 copy_corpus.conn_all()
42 def __init__(self, parent, parametres = {}, read = False) :
44 self.parametres = parametres
46 self.connformes = None
48 self.conncorpus = None
55 self.idformesuces = {}
60 self.pathout = PathOut(dirout = parametres['pathout'])
63 def add_word(self, word) :
64 if word in self.formes :
65 self.formes[word].freq += 1
66 if self.formes[word].ident in self.idformesuces :
67 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
68 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
70 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
72 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
74 if word in self.parent.lexique :
75 gramtype = self.parent.lexique[word][1]
76 lem = self.parent.lexique[word][0]
83 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
84 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
87 """connect corpus to db"""
88 if self.connformes is None :
89 log.info('connexion corpus')
90 self.connuces = sqlite3.connect(self.pathout['uces.db'])
91 self.cuces = self.connuces.cursor()
92 self.connformes = sqlite3.connect(self.pathout['formes.db'])
93 self.cformes = self.connformes.cursor()
94 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
95 self.ccorpus = self.conncorpus.cursor()
96 self.cformes.execute('PRAGMA temp_store=MEMORY;')
97 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
98 self.cformes.execute('PRAGMA synchronous = OFF;')
99 self.cuces.execute('PRAGMA temp_store=MEMORY;')
100 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
101 self.cuces.execute('PRAGMA synchronous = OFF;')
102 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
103 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
104 self.ccorpus.execute('PRAGMA synchronous = OFF;')
106 def read_corpus(self) :
107 log.info('read corpus')
108 self.parametres['syscoding'] = sys.getdefaultencoding()
109 if self.conncorpus is None :
111 res = self.ccorpus.execute('SELECT * FROM etoiles;')
113 self.ucis.append(Uci(row[0], row[1], row[2]))
114 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
116 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
117 res = self.ccorpus.execute('SELECT * FROM formes;')
118 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the uce identifiers stored in the ``uces`` table for one form.

    ``wordid`` is either the form's identifier or the form itself as a
    string; a string is translated to its identifier through
    ``self.formes``.  The ``uces`` column of each matching row holds either
    a single integer or a whitespace-separated string of integers; all of
    them are returned as one flat list of ints, in row order.
    """
    if isinstance(wordid, basestring):
        wordid = self.formes[wordid].ident
    # Bind the identifier itself rather than its repr(): the textual form
    # only matches through SQLite's column-affinity conversion and does not
    # match at all for a Python 2 long, whose repr() ends with "L".
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (wordid,))
    found = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            found.append(cell)
        else:
            found.extend([int(val) for val in cell.split()])
    return found
127 def getformeuceseff(self, formeid) :
128 if isinstance(formeid, basestring) :
129 formeid = self.formes[formeid].ident
130 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
131 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
132 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
133 res = self.cformes.execute(query)
134 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
136 for i, uce in enumerate(uces) :
137 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem):
    """Return the distinct uce identifiers recorded for the forms of a lemma.

    The form identifiers obtained by iterating ``self.lems[lem].formes`` are
    spliced into an ``IN (...)`` clause.  The ``uces`` column of each
    matching row is then parsed: an integer cell counts as one identifier,
    any other cell is split on whitespace and each piece converted with
    ``int``.  Duplicates are removed through a set, so the order of the
    returned list carries no meaning.
    """
    id_list = ', '.join(repr(formeid) for formeid in self.lems[lem].formes)
    rows = self.cformes.execute('SELECT uces FROM uces where id IN (%s) ORDER BY id' % id_list)
    collected = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            collected.append(cell)
        else:
            collected.extend([int(val) for val in cell.split()])
    return list(set(collected))
def getlemucis(self, lem):
    """Return the distinct ``uci`` values of the uces in which a lemma occurs.

    Each uce identifier given by ``getlemuces`` is resolved with
    ``getucefromid`` and its ``uci`` attribute collected in a set; the
    resulting list is therefore free of duplicates and unordered.
    """
    ucis = set(self.getucefromid(uceid).uci for uceid in self.getlemuces(lem))
    return list(ucis)
150 def getlemuceseff(self, lem, luces = None) :
151 formesid = ', '.join([`val` for val in self.lems[lem].formes])
152 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
153 res = self.cformes.execute(query)
154 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
155 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
156 res = self.cformes.execute(query)
157 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
159 for i, uce in enumerate(uces) :
160 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster):
    """Count the uces of ``self.lc[cluster]`` that also appear in ``getlemuces(lem)``."""
    cluster_uces = set(self.lc[cluster])
    shared = cluster_uces.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem):
    """Return the ``freq`` attribute of the entry stored under ``lem`` in ``self.lems``."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid):
    """Return the form object registered under identifier ``formeid``.

    The identifier index ``self.idformes`` is built on first use by calling
    ``make_idformes``.  An unknown identifier raises ``KeyError``.
    """
    index = self.idformes
    if index is None:
        self.make_idformes()
        index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the sum of the ``freq`` attributes of every entry in ``self.formes``."""
    total = 0
    for forme in self.formes:
        total += self.formes[forme].freq
    return total
def getucemean(self):
    """Return ``gettotocc()`` divided by ``getucenb()`` as a float."""
    occurrences = float(self.gettotocc())
    return occurrences / self.getucenb()
183 return self.ucis[-1].uces[-1].ident + 1
186 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return, for each uci, the summed size of its uces.

    The per-uce sizes come from ``getucesize`` and are sliced from the
    ``ident`` of the uci's first uce up to and including that of its last
    one, i.e. uce identifiers are used as positions in that list.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:last + 1]))
    return totals
def getucesize(self):
    """Return the number of whitespace-separated tokens of every uce.

    Rows come from ``getalluces``; the text is read from the second column
    of each row.
    """
    sizes = []
    for row in self.getalluces():
        sizes.append(len(row[1].split()))
    return sizes
def getconcorde(self, uces):
    """Run a query selecting the rows of the ``uces`` table whose id is in ``uces``.

    The identifiers are written into the SQL text with ``repr`` and the
    value returned by ``self.cuces.execute`` is handed back unchanged.
    """
    id_list = ', '.join(repr(uceid) for uceid in uces)
    query = 'select * from uces where id IN (%s);' % id_list
    return self.cuces.execute(query)
def getwordconcorde(self, word):
    """Return ``getconcorde`` applied to the uces given by ``getworduces(word)``."""
    word_uces = self.getworduces(word)
    return self.getconcorde(word_uces)
def getlemconcorde(self, lem):
    """Return ``getconcorde`` applied to the uces given by ``getlemuces(lem)``."""
    lem_uces = self.getlemuces(lem)
    return self.getconcorde(lem_uces)
def getalluces(self):
    """Execute ``SELECT * FROM uces`` on ``self.cuces`` and return the result of ``execute``."""
    cursor = self.cuces
    return cursor.execute('SELECT * FROM uces')
def getucesfrometoile(self, etoile):
    """Return the ``ident`` of every uce belonging to a uci whose ``etoiles`` contain ``etoile``.

    Identifiers are listed in the order of ``self.ucis`` and, within a uci,
    in the order of its ``uces``.
    """
    selected = []
    for uci in self.ucis:
        if etoile not in uci.etoiles:
            continue
        for uce in uci.uces:
            selected.append(uce.ident)
    return selected
211 def getetoileuces(self) :
212 log.info('get uces etoiles')
215 for uci in self.ucis :
216 etoiles = uci.etoiles[1:]
218 if et in etoileuces :
219 etoileuces[et] += [uce.ident for uce in uci.uces]
221 etoileuces[et] = [uce.ident for uce in uci.uces]
223 for et in uci.paras :
224 if et in etoileuces :
225 etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara]
227 etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara]
def getucefromid(self, uceid):
    """Return the uce object registered under identifier ``uceid``.

    The identifier index ``self.iduces`` is built on first use by calling
    ``make_iduces``.  An unknown identifier raises ``KeyError``.
    """
    index = self.iduces
    if index is None:
        self.make_iduces()
        index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return how many entries of ``self.formes`` have a ``freq`` equal to 1."""
    count = 0
    for forme in self.formes:
        if self.formes[forme].freq == 1:
            count += 1
    return count
def getactivesnb(self, key):
    """Return how many entries of ``self.lems`` have an ``act`` attribute equal to ``key``."""
    count = 0
    for lem in self.lems:
        if self.lems[lem].act == key:
            count += 1
    return count
242 # def make_lems(self, lem = True) :
243 # log.info('make lems')
245 # for forme in self.formes :
246 # if self.formes[forme].lem in self.lems :
247 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
248 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
250 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the ``etoiles`` of the uci that contains the uce ``uceid``.

    A uce-ident -> uci-ident mapping is built and cached in ``self.uceuci``
    on first use; the uci ident is then used as an index into ``self.ucis``.
    """
    if self.uceuci is None:
        mapping = {}
        for uci in self.ucis:
            for uce in uci.uces:
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    owner = self.uceuci[uceid]
    return self.ucis[owner].etoiles
256 def make_lems(self, lem = True) :
257 log.info('make lems')
260 for forme in self.formes :
261 if self.formes[forme].lem in self.lems :
262 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
263 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
265 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
267 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self):
    """Build ``self.idformes``, mapping each form's ``ident`` to the form object itself."""
    index = {}
    for forme in self.formes:
        word = self.formes[forme]
        index[word.ident] = word
    self.idformes = index
def make_iduces(self):
    """Build ``self.iduces``, mapping each uce's ``ident`` to the uce object.

    Nothing is done when ``self.iduces`` has already been set.
    """
    if self.iduces is not None:
        return
    index = {}
    for uci in self.ucis:
        for uce in uci.uces:
            index[uce.ident] = uce
    self.iduces = index
276 def make_lexitable(self, mineff, etoiles, gram = 0) :
281 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams]
282 etuces = [[] for et in etoiles]
283 for uci in self.ucis :
284 get = list(set(uci.etoiles).intersection(etoiles))
286 log.info('2 variables sur une ligne')
288 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
289 etuces = [set(val) for val in etuces]
292 deff = self.getlemuceseff(lem)
294 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
295 tab.insert(0, [''] + etoiles)
298 def make_efftype_from_etoiles(self, etoiles) :
300 etuces = [[] for et in etoiles]
301 for uci in self.ucis :
302 get = list(set(uci.etoiles).intersection(etoiles))
304 return '2 variables sur la meme ligne'
306 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
307 etuces = [set(val) for val in etuces]
308 for lem in self.lems :
309 deff = self.getlemuceseff(lem)
311 gram = self.lems[lem].gram
313 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
315 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
316 tabout = [[gram] + dtype[gram] for gram in dtype]
317 tabout.insert(0, [''] + etoiles)
320 def make_uceactsize(self, actives) :
321 res = self.getalluces()
324 deff = self.getlemuceseff(lem)
326 ucesize[uce] = ucesize.get(uce, 0) + 1
329 def make_uc(self, actives, lim1, lim2) :
330 uceactsize = self.make_uceactsize(actives)
336 for uce in [uce for uci in self.ucis for uce in uci.uces] :
337 if uce.para == lastpara :
339 last1 += uceactsize.get(uce.ident,0)
340 uc1[-1].append(uce.ident)
342 uc1.append([uce.ident])
345 last2 += uceactsize.get(uce.ident, 0)
346 uc2[-1].append(uce.ident)
348 uc2.append([uce.ident])
351 last1 = uceactsize.get(uce.ident, 0)
352 last2 = uceactsize.get(uce.ident, 0)
354 uc1.append([uce.ident])
355 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build the two uc groupings and write their matrices and uce/uc lists.

    ``make_uc`` supplies two groupings (lists of lists of uce identifiers).
    Each one is passed to ``write_ucmatrix`` with its output path, then a
    ``uce;uc`` listing is written for each: a header line followed by one
    ``<uce>;<index of its group>`` line per uce, without a trailing newline.

    Returns the number of groups in each grouping as a 2-tuple.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    for grouping, matrix_path in ((uc1, uc1out), (uc2, uc2out)):
        self.write_ucmatrix(grouping, actives, matrix_path)
    for grouping, list_path in ((uc1, listuce1out), (uc2, listuce2out)):
        rows = [';'.join(['uce', 'uc'])]
        for position, members in enumerate(grouping):
            for uce in members:
                rows.append(';'.join([repr(uce), repr(position)]))
        with open(list_path, 'w') as handle:
            handle.write('\n'.join(rows))
    return len(uc1), len(uc2)
371 def write_ucmatrix(self, uc, actives, fileout) :
372 log.info('write uc matrix %s' % fileout)
373 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
376 with open(fileout + '~', 'w+') as f :
377 for i, lem in enumerate(actives) :
378 for uce in self.getlemuces(lem):
379 if (uces_uc[uce], i) not in deja_la :
381 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
382 deja_la[(uces_uc[uce], i)] = 0
384 with open(fileout, 'w') as ffin :
385 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
388 os.remove(fileout + '~')
391 def export_corpus(self, outf) :
392 #outf = 'export_corpus.txt'
394 res = self.getalluces()
398 with open(outf,'w') as f :
400 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
401 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
402 elif self.iduces[uce[0]].uci != actuci :
403 actuci = self.iduces[uce[0]].uci
404 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
405 actpara = self.iduces[uce[0]].para
406 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
409 actpara = self.iduces[uce[0]].para
410 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
411 elif self.iduces[uce[0]].para != actpara :
412 actpara = self.iduces[uce[0]].para
414 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
416 def export_corpus_classes(self, outf, alc = True, lem = False) :
418 for i, lc in enumerate(self.lc) :
421 for uce in self.lc0 :
423 res = self.getalluces()
425 with open(outf, 'w') as f :
428 actuci = self.iduces[uce[0]].uci
430 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
432 etline = ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
434 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[self.iduces[uce[0]].uci].etoiles[1:]])
435 f.write(etline.encode(self.parametres['syscoding']) + '\n')
436 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
438 def export_classe(self, outf, classe, lem = False) :
439 sts = self.lc[classe - 1]
440 res = self.getconcorde(sts)
442 with open(outf, 'w') as f :
445 f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n')
447 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
448 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
450 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
451 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
453 with open(outfile + '~', 'w+') as f :
454 for i, lem in enumerate(actives) :
455 for uce in sorted(self.getlemuces(lem)) :
457 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
459 with open(outfile, 'w') as ffin :
460 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
463 os.remove(outfile + '~')
465 with open(listuce, 'w') as f :
466 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
468 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
469 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
471 with open(outfile + '~', 'w+') as f :
472 for i, lem in enumerate(actives) :
473 for uci in sorted(self.getlemucis(lem)) :
475 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
477 with open(outfile, 'w') as ffin :
478 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
481 os.remove(outfile + '~')
483 with open(listuci, 'w') as f :
484 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
486 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
487 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
489 duces = dict([[uce, i] for i, uce in enumerate(uces)])
490 with open(outfile + '~', 'w+') as f :
491 for i, lem in enumerate(actives) :
492 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
494 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
496 with open(outfile, 'w') as ffin :
497 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
500 os.remove(outfile + '~')
502 def make_table_with_classe(self, uces, list_act) :
503 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
504 uces = dict([[uce, i] for i, uce in enumerate(uces)])
505 for i, lem in enumerate(list_act) :
506 lemuces = list(set(self.getlemuces(lem)).intersection(uces))
508 table_uce[uces[uce]][i] = 1
509 table_uce.insert(0, list_act)
512 def make_pondtable_with_classe(self, uces, list_act) :
513 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
514 uces = dict([[uce, i] for i, uce in enumerate(uces)])
515 for i, lem in enumerate(list_act) :
516 uceseff = self.getlemuceseff(lem)
517 lemuces = list(set(uceseff.keys()).intersection(uces))
519 table_uce[uces[uce]][i] = uceseff[uce]
520 table_uce.insert(0, list_act)
523 def parse_active(self, gramact, gramsup = None) :
524 log.info('parse actives')
525 for lem in self.lems :
526 if lem.startswith('_') and lem.endswith('_') :
527 self.lems[lem].act = 2
528 elif self.lems[lem].gram in gramact :
529 self.lems[lem].act = 1
530 elif gramsup is not None and self.lems[lem].gram not in gramact:
531 if self.lems[lem].gram in gramsup :
532 self.lems[lem].act = 2
534 self.lems[lem].act = 0
536 self.lems[lem].act = 2
538 def make_actives_limit(self, limit, key = 1) :
539 if self.idformes is None :
541 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
543 def make_actives_nb(self, nbmax, key) :
544 log.info('make_actives_nb : %i - %i' % (nbmax,key))
545 if self.idformes is None :
547 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
548 self.activenb = len(allactives)
549 allactives = sorted(allactives, reverse = True)
550 if self.activenb == 0 :
552 if len(allactives) <= nbmax :
553 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
554 return [val[1] for val in allactives], allactives[-1][0]
556 effs = [val[0] for val in allactives]
557 if effs.count(effs[nbmax - 1]) > 1 :
558 lim = effs[nbmax - 1] + 1
562 stop = effs.index(lim)
569 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
570 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout):
    """Write the lemma-by-class contingency table to ``fileout``.

    For every lemma in ``actives``, counts how many of its uces (from
    ``getlemuces``) fall in each class of ``ucecl``.  Lemmas whose counts
    sum to less than 3 are left out.  The remaining rows are written as
    ``lem;n1;n2;...`` lines joined by newlines and encoded with
    ``self.parametres['syscoding']``.
    """
    log.info('formes/classes')
    lines = []
    for lem in actives:
        # Query the lemma's uces once and reuse the set for every class
        # instead of hitting the database again per class.
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3:
            lines.append(';'.join([lem] + [repr(val) for val in counts]))
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
579 def make_etoiles(self) :
581 for uci in self.ucis :
582 etoiles.update(uci.etoiles[1:])
585 def make_etoiles_dict(self) :
586 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
588 for etoile in etoiles :
589 et = etoile.split('_')
592 endet = '_'.join(et[1:])
593 if etoile in det[et[0]] :
594 det[et[0]][etoile] += 1
596 det[et[0]][etoile] = 1
601 endet = '_'.join(et[1:])
602 det[et[0]] = {etoile :1}
607 def make_etline(self, listet) :
608 etuces = [[] for et in listet]
609 for uci in self.ucis :
610 get = list(set(uci.etoiles).intersection(listet))
612 return '2 variables sur la meme ligne'
614 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
def make_and_write_profile_et(self, ucecl, fileout):
    """Write the etoile-by-class contingency table to ``fileout``.

    Starting from the mapping returned by ``getetoileuces``, only the
    entries listing more than one uce are kept.  For each of them a
    ``etoile;n1;n2;...`` line is produced, where each count is the number of
    its uces found in the corresponding class of ``ucecl``.  The lines are
    joined by newlines and encoded with ``self.parametres['syscoding']``.
    """
    log.info('etoiles/classes')
    all_etoiles = self.getetoileuces()
    kept = {}
    for et in all_etoiles:
        if len(all_etoiles[et]) > 1:
            kept[et] = all_etoiles[et]
    lines = []
    for et in kept:
        counts = [repr(len(set(kept[et]).intersection(classe))) for classe in ucecl]
        lines.append(';'.join([et] + counts))
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
623 #etoiles = self.make_etoiles()
624 #with open(fileout, 'w') as f :
625 # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
627 def make_colored_corpus(self) :
629 for i, lc in enumerate(self.lc) :
632 for uce in self.lc0 :
634 color = ['black'] + colors[len(self.lc) - 1]
636 <meta http-equiv="content-Type" content="text/html; charset=%s" />
638 ''' % sys.getdefaultencoding()
639 res = self.getalluces()
644 if self.iduces[uce[0]].uci != actuci :
645 actuci = self.iduces[uce[0]].uci
646 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
647 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
649 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
650 return txt + '\n</body></html>'
652 def count_from_list(self, l, d) :
660 def count_from_list_cl(self, l, d, a, clnb) :
669 def find_segments(self, taille_segment, taille_limite) :
671 for uce in self.getalluces() :
673 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
674 l = [[d[val], val] for val in d if d[val] >= 3]
677 if len(l) > taille_limite :
678 l = l[-taille_limite:]
681 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
683 for uce in self.getconcorde(list_uce) :
685 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
686 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
689 if len(l) > taille_limite :
690 l = l[-taille_limite:]
693 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
695 for b, classe in enumerate(self.lc) :
696 for uce in self.getconcorde(classe) :
699 uce = [self.formes[forme].lem for forme in uce]
700 for taille_segment in range(lenmin,lenmax) :
701 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
702 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
703 with open(fileout, 'w') as f :
704 f.write('\n'.join([';'.join(line) for line in result]))
706 def make_proftype(self, outf) :
708 for lem in self.lems :
709 gram = self.lems[lem].gram
711 res[gram] = [0 for val in self.lc]
712 lemuceeff = self.getlemuceseff(lem)
713 for i, classe in enumerate(self.lc) :
714 concern = set(classe).intersection(lemuceeff.keys())
715 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
716 res = [[gram] + [`val` for val in res[gram]] for gram in res]
718 with open(outf, 'w') as f :
719 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
722 def make_ucecl_from_R(self, filein) :
723 with open(filein, 'rU') as f :
728 line = line.replace('\n', '').replace('"', '').split(';')
729 self.lc.append([int(line[0]) - 1, int(line[1])])
730 classesl = [val[1] for val in self.lc]
732 self.lc = sorted(self.lc, key=itemgetter(1))
733 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
734 self.lc0 = self.lc.pop(0)
737 def get_stat_by_cluster(self, outf, lclasses = None) :
738 log.info('get_stat_by_cluster')
739 if lclasses is None :
742 occurrences = dict([[i + 1, 0] for i in range(len(lclasses))])
743 formescl = dict([[i + 1, 0] for i in range(len(lclasses))])
744 hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))])
745 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)])
746 sets = [set(cl) for cl in lclasses]
747 for forme in self.formes :
748 formeuceeff = self.getformeuceseff(forme)
749 for i, classe in enumerate(lclasses) :
750 concern = sets[i].intersection(formeuceeff.keys())
752 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
754 if self.formes[forme].freq == 1 :
756 log.info('%f' % (time() - t1))
757 if outf is not None :
758 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
759 with open(outf, 'w') as f :
762 return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences]
764 def get_stat_by_et(self, outf, etoiles) :
765 lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
766 stats = self.get_stat_by_cluster(None, lclasses)
767 stats = [[etoiles[i]] + val for i, val in enumerate(stats)]
769 def gethapaxbyet(self, etoiles) :
770 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
772 for uce in hapaxuces :
773 if uce in hucesdict :
777 etuces = [[] for et in etoiles]
778 for uci in self.ucis :
779 get = list(set(uci.etoiles).intersection(etoiles))
781 return '2 variables sur la meme ligne'
783 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
784 etuces = [set(val) for val in etuces]
785 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
787 def gethapaxuces(self) :
788 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
789 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
791 for i,uce in enumerate(hapaxuces) :
792 if uce in hucesdict :
793 hucesdict[uce][0] += 1
794 hucesdict[uce][1].append(hapax[i])
796 hucesdict[uce] = [1,[hapax[i]]]
798 for uce in hucesdict :
799 if hucesdict[uce][0] in huces :
800 huces[hucesdict[uce][0]].append(uce)
802 huces[hucesdict[uce][0]] = [uce]
803 huces = zip(huces, huces.values())
804 huces.sort(reverse=True)
808 for nb in huces[0:4] :
809 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
811 res = self.getconcorde([uce])
813 ucetxt = ' ' + row[1] + ' '
815 for hap in hucesdict[uce][1] :
816 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
817 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
818 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
819 txt += '<p>'+ucetxt+'</p>\n'
823 with open('/tmp/testhapxuce.html','w') as f :
def export_dictionary(self, fileout, syscoding):
    """Write the forms of ``self.formes`` to ``fileout`` as tab-separated lines.

    Each line holds the form, its ``lem``, its ``gram`` and the repr of its
    ``freq``.  Lines are ordered by decreasing ``[freq, forme, lem, gram]``
    list comparison, joined by newlines and encoded with ``syscoding``.
    """
    records = []
    for forme in self.formes:
        word = self.formes[forme]
        records.append([word.freq, forme, word.lem, word.gram])
    lines = []
    for record in sorted(records, reverse=True):
        lines.append('\t'.join(record[1:] + [repr(record[0])]))
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(syscoding))
833 def export_lems(self, fileout, syscoding) :
835 listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems]
837 with open(fileout, 'w') as f :
838 f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding))
844 def __init__(self, corpus) :
845 ucinb = corpus.getucinb()
846 ucisize = corpus.getucisize()
847 ucimean = float(sum(ucisize))/float(ucinb)
848 detoile = corpus.make_etoiles_dict()
851 def __init__(self, iduci, line, paraset = None) :
853 self.etoiles = line.split()
855 if paraset is not None :
856 self.paras = paraset.split()
861 def __init__(self, iduce, idpara, iduci) :
867 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
873 if freq is not None :
879 def __init__(self, parent, forme) :
880 self.formes = {forme.ident : forme.freq}
881 self.gram = forme.gram
882 self.freq = forme.freq
def add_forme(self, forme):
    """Register ``forme`` under its ``ident`` in ``self.formes`` and add its ``freq`` to ``self.freq``."""
    occurrences = forme.freq
    self.formes[forme.ident] = occurrences
    self.freq += occurrences
889 def decouperlist(chaine, longueur, longueurOptimale) :
891 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
892 Si on trouve un '$', c'est fini.
893 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
895 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
896 dsep = dict([[val[0],val[1]] for val in separateurs])
897 trouve = False # si on a trouvé un bon séparateur
898 iDecoupe = 0 # indice du caractere ou il faut decouper
900 longueur = min(longueur, len(chaine) - 1)
901 chaineTravail = chaine[:longueur + 1]
903 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
906 indice = chaineTravail.index(u'$')
908 iDecoupe = indice - 1
913 caractere = chaineTravail[nbCar]
914 distance = abs(longueurOptimale - nbCar) + 1
915 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
916 if caractere in dsep :
917 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
918 meilleur[0] = caractere
919 meilleur[1] = dsep[caractere]
924 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
926 meilleur[1] = dsep[' ']
933 #if meilleur[0] != ' ' :
934 # fin = chaine[iDecoupe + 1:]
935 # retour = chaineTravail[:iDecoupe]
937 fin = chaine[iDecoupe + 1:]
938 retour = chaineTravail[:iDecoupe + 1]
939 return len(retour) > 0, retour, fin
940 # si on a rien trouvé
941 return False, chaine, ''
943 def testetoile(line) :
944 return line.startswith(u'****')
947 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt):
    """Split ``txt`` on whitespace and append a final ``$`` token to the resulting list."""
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
952 def prep_txtcharact(txt) :
957 Class for building a corpus
959 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
960 log.info('begin building corpus...')
961 self.lexique = lexique
962 self.expressions = expressions
964 self.corpus = Corpus(self, parametres_corpus)
967 self.lim = parametres_corpus.get('lim', 1000000)
968 self.encoding = parametres_corpus['encoding']
969 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
970 self.corpus.pathout.createdir(parametres_corpus['pathout'])
971 self.corpus.parametres['uuid'] = str(uuid4())
972 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
973 self.corpus.parametres['type'] = 'corpus'
974 if self.corpus.parametres['keep_ponct'] :
975 self.ponctuation_espace = [' ', '']
977 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
979 self.tolist = self.corpus.parametres.get('tolist', 0)
986 def prep_makeuce(self) :
987 method = self.corpus.parametres.get('ucemethod', 0)
989 self.decouper = decouperlist
990 self.prep_txt = prep_txtlist
991 self.ucesize = self.corpus.parametres.get('ucesize', 40)
993 self.decouper = decoupercharact
994 self.prep_txt = prep_txtcharact
995 self.ucesize = self.corpus.parametres.get('ucesize', 240)
996 log.info('method uce : %s' % method)
1001 self.read_corpus(self.infile)
1002 except Warning, args :
1003 log.info('pas kool %s' % args)
1007 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
1008 self.time = time() - t1
1010 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
1011 log.info('time : %f' % (time() - t1))
1014 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
1015 self.cf = self.conn_f.cursor()
1016 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1017 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
1018 self.conn_f.commit()
1019 self.cf = self.conn_f.cursor()
1020 self.cf.execute('PRAGMA temp_store=MEMORY;')
1021 self.cf.execute('PRAGMA journal_mode=MEMORY;')
1022 self.cf.execute('PRAGMA synchronous = OFF;')
1023 self.cf.execute('begin')
1024 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
1025 self.c = self.conn.cursor()
1026 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1028 self.c = self.conn.cursor()
1029 self.c.execute('PRAGMA temp_store=MEMORY;')
1030 self.c.execute('PRAGMA journal_mode=MEMORY;')
1031 self.c.execute('PRAGMA synchronous = OFF;')
1032 self.c.execute('begin')
1035 #commit index and close db
1037 self.conn_f.commit()
1038 self.cf.execute('CREATE INDEX iduces ON uces (id);')
1039 self.cf.execute('CREATE INDEX ideff ON eff (id);')
1043 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
1044 self.ccorpus = self.conn_corpus.cursor()
1045 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
1046 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
1047 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
1048 self.conn_corpus.commit()
1049 self.ccorpus = self.conn_corpus.cursor()
1050 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
1051 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
1052 self.ccorpus.execute('PRAGMA synchronous = OFF;')
1053 self.ccorpus.execute('begin')
1054 self.backup_corpus()
1055 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
1056 self.conn_corpus.commit()
1057 self.conn_corpus.close()
1058 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self):
    """Register in ``self.cleans`` the cleaners enabled by the corpus parameters.

    Cleaners are appended in a fixed order: lower-casing, first clean,
    character filtering, expressions, apostrophes, dashes.  Every option
    except ``charact`` is treated as enabled when it is absent from the
    parameters; ``charact`` is read by direct indexing and must be present.
    When character filtering is enabled, ``self.rule`` is set from the
    ``keep_caract`` parameter (or the built-in default class content).
    """
    params = self.corpus.parametres

    # (parameter name, cleaner method name) handled before character filtering.
    leading = (('lower', 'dolower'), ('firstclean', 'firstclean'))
    # Same, for the cleaners that come after character filtering.
    trailing = (('expressions', 'make_expression'),
                ('apos', 'doapos'),
                ('tiret', 'dotiret'))

    for option, cleaner_name in leading:
        if params.get(option, 1):
            self.cleans.append(getattr(self, cleaner_name))

    if params['charact']:
        # Content of a negated character class: everything NOT listed is
        # what docharact will replace.
        self.rule = params.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        self.cleans.append(self.docharact)

    for option, cleaner_name in trailing:
        if params.get(option, 1):
            self.cleans.append(getattr(self, cleaner_name))
1075 def make_expression(self,txt) :
1076 for expression in self.expressions:
1077 if expression in txt :
1078 txt = txt.replace(expression, self.expressions[expression][0])
1081 def dolower(self, txt) :
def docharact(self, txt):
    """Replace every run of characters matched by ``self.rule`` with one space.

    ``self.rule`` is the *content* of a regex character class (the brackets
    are added here), so a rule starting with ``^`` lists the characters to
    keep and everything else is blanked out.
    """
    char_class = u"[" + self.rule + "]+"
    blank = ' '
    return re.sub(char_class, blank, txt)
def doapos(self, txt):
    """Return ``txt`` with each straight apostrophe turned into a space."""
    apostrophe, blank = u'\'', u' '
    return txt.replace(apostrophe, blank)
def dotiret(self, txt):
    """Return ``txt`` with each hyphen turned into a space."""
    dash, blank = u'-', u' '
    return txt.replace(dash, blank)
def firstclean(self, txt):
    """Normalise typography and isolate punctuation marks with spaces.

    The typographic apostrophe becomes a straight one, the ligature
    ``œ`` becomes ``oe``, ellipses (three dots or the single ellipsis
    character) become the placeholder ``£$£``, and ``? . ! , ; :`` are
    each surrounded by spaces.
    """
    # Order matters: the three-dot ellipsis must be consumed before the
    # single full stop, otherwise it would be split into three periods.
    substitutions = (
        (u'’', "'"),
        (u'œ', u'oe'),
        ('...', u' £$£ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        (u'…', u' £$£ '),
    )
    for old, new in substitutions:
        txt = txt.replace(old, new)
    return txt
1100 def make_cleans(self, txt) :
1101 for clean in self.cleans :
1105 def backup_uce(self) :
1106 if self.corpus.idformesuces != {} :
1107 log.info('backup %i' % len(self.corpus.idformesuces))
1108 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
1109 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
1110 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
1111 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
1112 self.corpus.idformesuces = {}
1115 def backup_corpus(self) :
1116 log.info('start backup corpus')
1118 for uci in self.corpus.ucis :
1119 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
1120 for uce in uci.uces :
1121 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
1122 for forme in self.corpus.formes :
1123 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
1124 log.info('%f' % (time() - t))
def dofinish(self):
    """Store the end-of-build summary in ``self.corpus.parametres``.

    Writes the keys ``date``, ``time``, ``ucinb``, ``ucenb``,
    ``occurrences``, ``formesnb`` and ``hapax``.  ``self.time`` is the
    elapsed build duration; it is split with ``divmod`` into hours,
    minutes and seconds for display.

    The hapax percentages are reported as 0.00 when the corpus has no
    forms or no occurrences, instead of raising ``ZeroDivisionError``.
    """
    params = self.corpus.parametres
    params['date'] = datetime.datetime.now().ctime()

    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    params['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)

    params['ucinb'] = self.corpus.getucinb()
    params['ucenb'] = self.corpus.getucenb()
    params['occurrences'] = self.corpus.gettotocc()
    formes_count = len(self.corpus.formes)
    params['formesnb'] = formes_count

    hapaxnb = self.corpus.gethapaxnb()
    occurrences = params['occurrences']
    # Guard both ratios: an empty vocabulary or a zero occurrence count
    # would otherwise abort the whole build at the very last step.
    pourhapaxf = (float(hapaxnb) / formes_count) * 100 if formes_count else 0.0
    pourhapaxocc = (float(hapaxnb) / occurrences) * 100 if occurrences else 0.0
    params['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
1141 class BuildFromAlceste(BuildCorpus) :
1142 def read_corpus(self, infile) :
1143 if self.dlg is not None :
1144 self.dlg.Pulse('textes : 0 - segments : 0')
1147 if self.corpus.parametres['ucimark'] == 0 :
1148 self.testuci = testetoile
1149 elif self.corpus.parametres['ucimark'] == 1 :
1150 self.testuci = testint
1156 with codecs.open(infile, 'r', self.encoding) as f :
1157 for linenb, line in enumerate(f) :
1158 line = line.rstrip('\n\r')
1159 if self.testuci(line) :
1162 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1164 self.corpus.ucis.append(Uci(iduci, line))
1167 if self.corpus.ucis[-1].uces == [] :
1168 log.info(u'Empty text : %i' % linenb)
1170 self.corpus.ucis.pop()
1171 self.corpus.ucis.append(Uci(iduci, line))
1172 if self.dlg is not None :
1173 if not (iduci + 1) % 10 :
1174 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
1175 elif line.startswith(u'-*') :
1178 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1181 self.corpus.ucis[-1].paras.append(line.split()[0])
1183 raise Exception('paragrapheOT %i' % linenb)
1184 elif line.strip() != '' and iduci != -1 :
1186 if txt != [] and iduci != -1 :
1187 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1192 self.corpus.ucis.pop()
1193 log.info(Exception("Empty text %i" % linenb))
1195 raise Exception('EmptyText %i' % linenb)
1196 if iduci != -1 and iduce != -1:
1199 log.info(_(u"No Text in corpora. Are you sure of the formatting ?"))
1200 raise Exception('TextBeforeTextMark %i' % linenb)
1201 except UnicodeDecodeError :
1202 raise Exception("CorpusEncoding")
1204 def treattxt(self, txt, iduce, idpara, iduci) :
1205 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1206 txt = 'laphrasepoursplitter'.join(txt)
1207 txt = self.make_cleans(txt)
1208 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1209 ucetxt = txt.split('laphrasepoursplitter')
1212 txt = self.make_cleans(txt)
1213 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1214 if self.corpus.ucis[-1].paras == [] :
1218 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1219 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1220 if not self.tolist :
1226 self.corpus.add_word(word)
1227 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
1228 if self.last > self.lim :
1231 return iduce, idpara
1233 def make_uces(self, txt, douce = True, keep_ponct = False) :
1234 txt = ' '.join(txt.split())
1237 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1239 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1242 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1243 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1248 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1250 #decouper (list_sep)
1251 #make_uces (decouper)
1252 #treat_txt (make_uces)
1256 def __init__(self, parent, dlg = None) :
1257 self.parent = parent
1259 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1260 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1261 dial = CorpusPref(parent, parametres)
1262 dial.CenterOnParent()
1263 dial.txtpath.SetLabel(parent.filename)
1264 #dial.repout_choices.SetValue(parametres['pathout'])
1265 self.res = dial.ShowModal()
1266 if self.res == 5100 :
1267 parametres = dial.doparametres()
1268 parametres['originalpath'] = parent.filename
1269 PathOut().createdir(parametres['pathout'])
1270 ReadLexique(self.parent, lang = parametres['lang'])
1271 if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
1272 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1274 self.parent.expressions = {}
1275 self.parametres = parametres
1277 if self.dlg is not None :
def doanalyse(self):
    """Run the Alceste-format builder on the parent's file and return its corpus.

    The builder receives the parent's file name, the stored parameters,
    the parent's lexicon and expressions, and the progress dialog.
    """
    owner = self.parent
    builder = BuildFromAlceste(owner.filename, self.parametres, owner.lexique, owner.expressions, dlg = self.dlg)
    return builder.corpus
1285 if __name__ == '__main__' :
1287 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
1288 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)