1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2010, Pierre Ratinaud
12 from colors import colors
13 from functions import decoupercharact, ReadDicoAsDico, sortedby
14 from ttparser import get_ucis_from_tt
15 #from ConfigParser import RawConfigParser
21 """ Yield successive n-sized chunks from l.
23 for i in xrange(0, len(l), n):
27 def __init__(self, parent) :
29 self.parametre = {'syscoding': sys.getdefaultencoding()}
37 self.ucis_paras_uces = None
42 #self.supplementaires = []
47 def open_corpus(self) :
48 with codecs.open(self.parametre['filename'], "r", self.parametre['encodage']) as f :
49 self.content = f.read()
53 ucifile = os.path.join(os.path.dirname(self.parametre['filename']), 'ucis.txt')
54 uci = open(ucifile, 'w')
55 #db = os.path.join(os.path.dirname(self.parametre['filename']), 'corpus.db')
56 #conn = sqlite3.connect(db)
58 #conn.text_factory = str
60 #c.execute('''CREATE TABLE corpus (id integer, varet TEXT)''')
65 with codecs.open(self.parametre['filename'], "r", self.parametre['encodage']) as open_corpus :
66 for line in open_corpus :
67 if line.startswith(u'****') :
69 uci.write(line.replace('/n', ' '))
70 #self.ucis.append([line.rstrip(), `ucinb`])
73 if word not in [' ','.', u'£', ';', '?', '!', ',', ':',''] :
75 self.feed_dict_big(word, ucinb)
78 #c.execute('INSERT INTO uci values (?,?)', (ucinb, line.rstrip()))
83 line = line.lower().replace('-', ' ').replace(u'\'',' ').replace(u'’',' ').replace('...',u' £ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').rstrip().split()
86 print len(self.formes)
87 print sum([self.formes[forme][0] for forme in self.formes])
88 formes_out2 = os.path.join(os.path.dirname(self.parametre['filename']), 'formes_formes.csv')
89 formes_uces = os.path.join(os.path.dirname(self.parametre['filename']), 'formes_uces.csv')
90 with open(formes_out2, 'w') as f :
91 f.write('\n'.join([';'.join([forme, `self.formes[forme][0]`, self.formes[forme][2]]) for forme in self.formes]))
92 with open(formes_uces, 'w') as f:
93 f.write('\n'.join([' '.join([' '.join([`uce`, `self.formes[forme][1][uce]`]) for uce in self.formes[forme][1]]) for forme in self.formes]))
94 #uciout = os.path.join(os.path.dirname(self.parametre['filename']), 'uciout.csv')
95 #with open(uciout,'w') as f :
96 # f.write('\n'.join(['\t'.join(line) for line in self.ucis]))
101 def read_corpus_out(self, corpus_out) :
102 #print 'test encodage'
103 #self.parametre['syscoding'] = 'cp1252'
104 with codecs.open(corpus_out ,'r', self.parametre['syscoding']) as f:
106 if sys.platform == 'win32' :
110 self.ucis_paras_uces = [[[uce.split() for uce in para.splitlines()] for para in uci.split(u'$$$')] for uci in content.split(sep)]
111 #print self.ucis_paras_uces
113 def read_formes_out(self, forme_out) :
115 print 'test encodage'
117 if os.path.exists(forme_out) :
118 with codecs.open(forme_out, 'r', self.parametre['syscoding']) as f :
120 cc = [forme.split(u'$') for forme in content.splitlines()]
121 self.formes = dict([[forme[0], [int(forme[1]), dict([[eval(uce.split(':')[0]), int(uce.split(':')[1])] for uce in forme[2].split(';')]), forme[3], int(forme[4])]] for forme in cc])
123 formes_out2 = os.path.join(os.path.dirname(forme_out), 'formes_formes.csv')
124 formes_uces = os.path.join(os.path.dirname(forme_out), 'formes_uces.csv')
125 with codecs.open(formes_uces, 'r', self.parametre['syscoding']) as f:
127 uces = [list(chunks(line.split(),4)) for line in uces.splitlines()]
128 with codecs.open(formes_out2, 'r', self.parametre['syscoding']) as f :
129 self.formes = f.read()
130 self.formes = [[line.split(';'), dict([[(int(uce[0]),int(uce[1]), int(uce[2])), int(uce[3])] for uce in uces[i]])] for i, line in enumerate(self.formes.splitlines())]
131 self.formes = dict([[line[0][0], [int(line[0][1]), line[1], line[0][2], int(line[0][3])]] for line in self.formes])
133 def read_corpus_from_shelves(self, db) :
135 self.parametre = d['parametre']
136 if not 'syscoding' in self.parametre :
137 self.parametre['syscoding'] = sys.getdefaultencoding()
138 self.lems = d['lems']
139 if 'ucis_paras_uces' in d :
140 self.ucis_paras_uces = d['ucis_paras_uces']
142 corpus_out = os.path.join(os.path.dirname(db), 'corpus.txt')
143 self.read_corpus_out(corpus_out)
145 self.formes = d['formes']
147 formes_out = os.path.join(os.path.dirname(db), 'formes.txt')
148 self.read_formes_out(formes_out)
151 # db_out = os.path.join(os.path.dirname(db), 'formes.db')
152 # conn = sqlite3.connect(db_out)
154 # c.execute('''SELECT * FROM formes''')
155 # self.formes = dict([[forme[0], [int(forme[1]), dict([[eval(uce.split(':')[0]), int(uce.split(':')[1])] for uce in forme[2].split(';')]), forme[3], int(forme[4])]] for forme in c])
157 self.etoiles = d['etoiles']
158 self.actives = d['actives']
159 self.ucis = d['ucis']
165 def save_corpus(self, db) :
167 d['parametre'] = self.parametre
168 #d['formes'] = self.formes
169 d['lems'] = self.lems
170 #d['ucis_paras_uces'] = self.ucis_paras_uces
171 d['etoiles'] = self.etoiles
172 d['actives'] = self.actives
173 d['ucis'] = self.ucis
177 corpus_out = os.path.join(os.path.dirname(db), 'corpus.txt')
178 with open(corpus_out, 'w') as f :
179 f.write('\n\n'.join([u'$$$'.join(['\n'.join([' '.join(uce) for uce in para]) for para in uci]) for uci in self.ucis_paras_uces]))
181 formes_out2 = os.path.join(os.path.dirname(db), 'formes_formes.csv')
182 formes_uces = os.path.join(os.path.dirname(db), 'formes_uces.csv')
184 with open(formes_out2, 'w') as f :
185 f.write('\n'.join([';'.join([forme, `self.formes[forme][0]`, self.formes[forme][2], `self.formes[forme][3]`]) for forme in self.formes]))
186 with open(formes_uces, 'w') as f:
187 f.write('\n'.join([' '.join([' '.join([`uce[0]`,`uce[1]`, `uce[2]`, `self.formes[forme][1][uce]`]) for uce in self.formes[forme][1]]) for forme in self.formes]))
190 #toprint = json.dumps(self.formes)
191 #with open(os.path.join(os.path.dirname(db), 'json.db'), 'w') as f:
196 # db_out = os.path.join(os.path.dirname(db), 'formes.db')
197 # conn = sqlite3.connect(db_out)
199 # conn.text_factory = str
201 # c.execute('''CREATE TABLE formes (formes TEXT, freq integer, uces TEXT, type TEXT, identifiant integer)''')
203 # for formes in self.formes :
204 # c.execute('INSERT INTO formes values (?,?,?,?,?)', (formes, self.formes[formes][0], ';'.join([':'.join([str(uce), str(self.formes[formes][1][uce])]) for uce in self.formes[formes][1]]), self.formes[formes][2], self.formes[forme][3]))
208 def make_len_uce(self, nbtotoc):
209 if self.parametre['nbforme_uce'] == None or self.parametre['nbforme_uce'] == 0 :
211 if len(self.ucis) == 1:
212 self.parametre['eff_min_uce'] = 30
213 elif 200000 <= nbtotoc < 400000:
214 self.parametre['eff_min_uce'] = (0.0016 * float(nbtotoc) / float(len(self.ucis))) + 20
215 elif nbtotoc < 200000:
216 self.parametre['eff_min_uce'] = (0.0016 * float(nbtotoc) / float(len(self.ucis))) + 30
218 self.parametre['eff_min_uce'] = (float(nbtotoc) / float(len(self.ucis))) / float(15)
220 self.parametre['eff_min_uce'] = self.parametre['nbforme_uce']
221 # print 'ATTENTION ASSIGNATION DE LA TAILLE DES UCE'
225 def quick_clean1(self) :
227 self.content = self.content.lower()
228 keep_caract = u"a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇß’ñ.:,;!?\n*'_-"
229 list_keep = u"[^" + keep_caract + "]+"
230 # print 'NETTOYAGE CABLE PLUS SUB'
231 #print ('#########ATTENTION CHINOIS plus keep_caract#################')
233 self.content = re.sub(list_keep, ' ', self.content)
234 #self.content = re.sub(list_keep, ' ', self.content)
236 #self.content = self.content.replace(u'[’]+', '\'')
237 self.content = re.sub(u'[’]+', '\'', self.content)
238 self.content = re.sub(u'[\r\n]+', '\n', self.content)
239 self.content = self.content.replace(u'-*',u'#*')
def find_expression(self, expressions):
    """Substitute every known expression found in the corpus text.

    ``expressions`` maps an expression (the text to look for) to a
    sequence whose first item is the replacement text.  Every expression
    present in ``self.content`` is reported on stdout and then replaced
    everywhere in ``self.content``.

    Empty keys are skipped: an empty string is a substring of any text
    and ``replace('', x)`` would insert ``x`` between all characters.
    """
    print('find expression')
    for expression in expressions:
        if not expression:
            # Guard against a blank entry in the expression dictionary.
            continue
        if expression in self.content:
            replacement = expressions[expression][0]
            # Single formatted argument: same output on Python 2 and 3.
            print('%s %s' % (expression, replacement))
            #self.content = self.content.replace(' '+expression+' ', ' '+replacement+' ')
            self.content = self.content.replace(expression, replacement)
def quick_clean2(self):
    """Second cleaning pass over the raw corpus text.

    Apostrophes become spaces, runs of hyphens become one space, runs of
    spaces are collapsed, and ``self.content`` is finally turned from a
    single string into the list of its lines.
    """
    print('quick clean 2')
    text = self.content.replace('\'', ' ')
    # Hyphens first, then spaces, so the spaces they introduce get collapsed.
    for pattern in (u'[-]+', u'[ ]+'):
        text = re.sub(pattern, ' ', text)
    self.content = text.splitlines()
256 def make_ucis(self) :
258 self.ucis = [[self.content[i].strip().split(),i] for i in range(0,len(self.content)) if self.content[i].startswith(u'****')]
259 return [a[1] for a in self.ucis]
261 def find_uci_with_digit(self, line) :
262 if line[0:4].isdigit() and u'*' in line :
def make_ucis_with_digit(self):
    """Locate the uci header lines recognised by ``find_uci_with_digit``.

    ``self.ucis`` receives one ``[tokens, line_index]`` entry per header
    line of ``self.content``; the list of header line indexes is returned.
    """
    headers = []
    for position, line in enumerate(self.content):
        if self.find_uci_with_digit(line):
            tokens = line.replace('\n', ' ').strip().split()
            headers.append([tokens, position])
    self.ucis = headers
    return [entry[1] for entry in headers]
271 def make_lines(self, ucinb) :
273 return [[ucinb[i]+1,ucinb[i+1]] for i in range(0,len(ucinb)-1)] + [[ucinb[len(ucinb)-1] + 1,len(self.content)]]
def make_ucis_words(self, lines):
    """Return, for each ``[start, stop]`` pair, the tokens of that text span.

    The lines ``self.content[start:stop]`` are joined with spaces,
    lower-cased, apostrophes and punctuation marks are isolated by
    spaces, and the result is split on whitespace.
    """
    print('make ucis_words')
    # Applied in order: the ellipsis must be handled before the single dot.
    substitutions = (
        (u'\'', '\' '),
        (u'’', '\' '),
        ('...', u' £ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
    )
    ucis_words = []
    for bounds in lines:
        text = ' '.join(self.content[bounds[0]:bounds[1]]).lower()
        for old, new in substitutions:
            text = text.replace(old, new)
        ucis_words.append(text.strip().split())
    return ucis_words
def make_ucis_txt(self, lines):
    """Return, for each ``[start, stop]`` pair, the normalised text span.

    The lines ``self.content[start:stop]`` are joined with spaces,
    lower-cased, apostrophes and punctuation marks are isolated by
    spaces, and surrounding whitespace is stripped.  Unlike
    ``make_ucis_words`` the text is not split into tokens.
    """
    print('make ucis_txt')
    # Applied in order: the ellipsis must be handled before the single dot.
    substitutions = (
        (u'\'', '\' '),
        (u'’', '\' '),
        ('...', u' £ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
    )
    ucis_txt = []
    for bounds in lines:
        text = ' '.join(self.content[bounds[0]:bounds[1]]).lower()
        for old, new in substitutions:
            text = text.replace(old, new)
        ucis_txt.append(text.strip())
    return ucis_txt
def make_ucis_lines(self, lines):
    """Return the slice ``self.content[start:stop]`` for each pair in ``lines``."""
    print('make ucis lines')
    ucis_lines = []
    for bounds in lines:
        ucis_lines.append(self.content[bounds[0]:bounds[1]])
    return ucis_lines
def make_para_coords(self, ucis_lines):
    """Find the paragraph markers inside every uci.

    For each uci (a list of lines) the result holds one
    ``[first_token, line_index]`` entry per line starting with ``#*``.
    """
    print('make para coords')
    coords = []
    for uci in ucis_lines:
        marks = []
        for position, line in enumerate(uci):
            if line.startswith(u'#*'):
                marks.append([line.split()[0], position])
        coords.append(marks)
    return coords
291 def make_ucis_paras_txt(self, para_coords, ucis_lines, ucis_txt) :
292 print 'make_ucis_paras_txt'
293 if para_coords != [[] for val in para_coords] :
294 paranb = [[para[1] for para in uci] for uci in para_coords]
296 #print 'len paranb', len(paranb)
297 #print len(self.ucis)
298 for i, uci in enumerate(paranb) :
299 uciline = ucis_lines[i]
304 para = [[uci[i]+1, uci[i+1]] for i in range(0,len(uci)-1)]
305 para.append([uci[len(uci)-1]+1, len(uciline) ])
307 self.parametre['para'] = True
308 return [[' '.join(ucis_lines[nb][l[0]:l[1]]).lower().replace(u'\'','\' ').replace(u'’','\' ').replace('...',u' £ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').strip() for l in paras[nb]] for nb in range(0,len(paras))]
310 print '############pas de para####################'
311 self.parametre['para'] = False
312 return [[val] for val in ucis_txt]
314 def make_ucis_paras_txt_phrases(self, para_coords, ucis_lines, ucis_txt) :
315 print 'make_ucis_paras_txt'
316 if para_coords != [[] for val in para_coords] :
317 paranb = [[para[1] for para in uci] for uci in para_coords]
319 for i, uci in enumerate(paranb) :
320 uciline = ucis_lines[i]
321 para = [[uci[i]+1, uci[i+1]] for i in range(0,len(uci)-1)]
322 para.append([uci[len(uci)-1]+1, len(uciline) ])
324 self.parametre['para'] = True
325 return [[' '.join(ucis_lines[nb][l[0]:l[1]]).lower().replace(u'\'','\' ').replace(u'’','\' ').strip() for l in paras[nb]] for nb in range(0,len(paras))]
327 print '############pas de para####################'
328 self.parametre['para'] = False
329 return [[val] for val in ucis_txt]
def make_ucis_paras_uces_sentences(self, ucis_paras_txt, make_uce = True):
    """Split every paragraph into sentences and tokenise each sentence.

    ``ucis_paras_txt`` is a list of ucis, each a list of paragraph
    strings.  Each paragraph is cut into sentences with NLTK's Punkt
    sentence tokenizer; each sentence becomes the list of its tokens with
    isolated punctuation marks removed.  The nested result
    (uci -> paragraph -> sentence -> tokens) is stored in
    ``self.ucis_paras_uces``.

    ``make_uce`` is not used by this method.
    """
    print('make_ucis_paras_sentences')
    ponctuation_espace = [' ','.', u'£', ';', '?', '!', ',', ':','']
    # Applied in order: the ellipsis must be handled before the single dot.
    # The ellipsis placeholder is surrounded by spaces so that it becomes
    # a token of its own and is dropped by the punctuation filter instead
    # of staying glued to the preceding word.
    substitutions = (
        ('...', u' £ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
    )
    tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
    self.ucis_paras_uces = []
    for uci in ucis_paras_txt:
        paras = []
        for para in uci:
            sentences = []
            for sent in tokenizer.tokenize(para):
                text = sent.strip()
                for old, new in substitutions:
                    text = text.replace(old, new)
                sentences.append([val for val in text.split() if val not in ponctuation_espace])
            paras.append(sentences)
        self.ucis_paras_uces.append(paras)
343 def get_tot_occ_from_ucis_txt(self, ucis_txt):
345 ponctuation_espace = [' ','.', u'£', ';', '?', '!', ',', ':','']
346 return sum([len([val for val in uci.split() if val.strip() not in ponctuation_espace]) for uci in ucis_txt])
348 def decouper_para(self, txt, listeSeparateurs, ls) :
350 meilleur = ['', 0, 0]
351 if len(txt) <= self.parametre['eff_min_uce'] :
352 return False, txt, []
354 while i <= self.parametre['eff_min_uce'] :
355 rapport = abs(self.parametre['eff_min_uce'] - i) + 1
357 if forme in ls and i != 0 :
358 poids = float(listeSeparateurs[ls.index(forme)][1]) / float(rapport)
360 poids = 0.1/float(rapport)
363 if poids >= meilleur[1] :
368 if meilleur[0] in ls :
369 return True, txt[:meilleur[2]],txt[meilleur[2] + 1:]
371 return True, txt[:meilleur[2]],txt[meilleur[2]:]
373 def make_ucis_paras_uces(self, ucis_paras_txt, make_uce = True) :
374 print 'make_ucis_paras_uces'
375 ponctuation_espace = [' ','.', u'£', ';', '?', '!', ',', ':','']
376 listeSeparateurs = [[u'.', 60.0], [u'?', 60.0], [u'!', 60.0], [u'£', 60], [u':', 50.0], [u';', 40.0], [u',', 10.0], [u' ', 0.1]]
378 print 'decoupage uce'
379 taille_uce = self.parametre['eff_min_uce']
380 # print 'plus de recomptage UCE'
381 self.ucis_paras_uces = []
382 for i, uci in enumerate(ucis_paras_txt) :
383 self.ucis_paras_uces.append([])
384 for j, para in enumerate(uci) :
385 #print '###########ATTENTION CHINOIS para to list################'
386 #para = ' '.join(list(para))
387 self.ucis_paras_uces[i].append([])
388 reste, texte_uce, suite = decouper(para+u'$', 250, 240, listeSeparateurs)
390 uce = [val.strip() for val in texte_uce.strip().split() if val.strip() not in ponctuation_espace]
391 self.ucis_paras_uces[i][j].append(uce)
392 reste, texte_uce, suite = decouper(suite, 250, 240, listeSeparateurs)
395 for uce in self.ucis_paras_uces[i][j] :
397 if len(nuce)>=taille_uce:
409 self.ucis_paras_uces[i][j] = newpara
411 self.ucis_paras_uces = [[[[val.strip() for val in para.strip().split() if val not in ponctuation_espace]] for para in uci] for uci in ucis_paras_txt]
413 # def feed_dict(self, val, i, j, k, id) :
414 # if val in self.formes :
415 # self.formes[val][0] +=1
416 # self.formes[val][1].append([i,j,k])
418 # if val in self.parent.lexique :
419 # type_forme = self.parent.lexique[val][1]
425 # self.formes[val] = [1, [[i,j,k]], type_forme, id]
426 def feed_dict_big(self, val, ucinb) :
427 if val in self.formes :
428 self.formes[val][0] +=1
429 if ucinb in self.formes[val][1] :
430 self.formes[val][1][ucinb] += 1
432 self.formes[val][1][ucinb] = 1
433 #self.formes[val][1].append([i,j,k])
435 if val in self.parent.lexique :
436 type_forme = self.parent.lexique[val][1]
442 self.formes[val] = [1, {ucinb: 1}, type_forme]
444 def feed_dict(self, val, i, j, k, id) :
445 if val in self.formes :
446 self.formes[val][0] +=1
447 if (i,j,k) in self.formes[val][1] :
448 self.formes[val][1][(i,j,k)] += 1
450 self.formes[val][1][(i,j,k)] = 1
451 #self.formes[val][1].append([i,j,k])
453 if val in self.parent.lexique :
454 type_forme = self.parent.lexique[val][1]
460 self.formes[val] = [1, {(i,j,k): 1}, type_forme, id]
def check_uce_et(self):
    """List the forms written between underscores, with their locations.

    Returns one ``[forme, locations]`` pair for every key of
    ``self.formes`` that both starts and ends with ``_``; ``locations``
    is the second item of that form's entry.
    """
    marked = []
    for forme in self.formes:
        if forme.startswith('_') and forme.endswith('_'):
            marked.append([forme, self.formes[forme][1]])
    return marked
465 def make_forms_and_uces(self) :
466 print 'make forms and uces'
470 for i, uci in enumerate(self.ucis_paras_uces) :
471 for j, para in enumerate(uci) :
472 for k, uce in enumerate(para) :
473 ijk = (i,j,k)#'.'.join([`i`,`j`,`k`])
474 orderuces[ijk] = compt
478 id = len(self.formes)
479 self.feed_dict(word, i, j, k, id)
480 #FIXME pas la bonne facon de compter la taille des uces
481 #passer par self.formes et self.lems
482 if ijk in uces and self.formes[word][2] in self.typeactive :
484 elif ijk not in uces and self.formes[word][2] in self.typeactive :
486 elif ijk not in uces :
490 self.etintxt = self.check_uce_et()
491 for forme in self.etintxt :
492 del(self.formes[forme[0]])
493 return uces, orderuces
495 def min_eff_formes(self) :
496 if not self.parametre['lem'] :
497 lformes = [self.formes[forme][0] for forme in self.formes if self.formes[forme][2] in self.typeactive]
498 if len(lformes) <= self.parametre['max_actives'] :
499 self.parametre['eff_min_forme'] = 3
501 lformes.sort(reverse = True)
502 self.parametre['eff_min_forme'] = lformes[self.parametre['max_actives']]
503 print self.parametre['eff_min_forme']
505 lems = self.make_lem_eff()
506 llems = [lems[lem][0] for lem in lems if lems[lem][2] in self.typeactive]
507 if len(llems) <= self.parametre['max_actives'] :
508 self.parametre['eff_min_forme'] = 3
510 llems.sort(reverse = True)
511 self.parametre['eff_min_forme'] = llems[self.parametre['max_actives']]
512 print self.parametre['eff_min_forme']
514 def make_lems(self, lexique) :
515 if self.parametre['lem'] :
517 for word in self.formes :
519 if lexique[word][0] in self.lems :
520 self.lems[lexique[word][0]].append(word)
522 self.lems[lexique[word][0]] = [word]
524 if word in self.lems :
525 self.lems[word].append(word)
527 self.lems[word] = [word]
529 print 'pas de lemmatisation : lems = formes'
530 for word in self.formes :
531 self.lems[word] = [word]
533 def make_lem_eff(self) :
536 for lem in self.lems :
537 lems[lem] = [sum([self.formes[word][0] for word in self.lems[lem]]), self.lems[lem], self.formes[self.lems[lem][0]][2]]
540 def make_lexique(self) :
543 for lem in self.lems :
544 for forme in self.lems[lem] :
545 self.lexique[forme] = lem
547 # def return_lem(self, word) :
548 # if word in self.lexique :
549 # return self.lexique[word]
553 def make_ucis_paras_uces_lems(self):
554 print 'make_ucis_paras_uces_lems'
555 if self.lexique is None :
557 return [[[[self.lexique.get(word, word) for word in uce] for uce in para] for para in uci] for uci in self.ucis_paras_uces]
def make_var_actives(self):
    """Build ``self.actives``, the list of lemmas kept as active variables.

    A lemma is kept when the type of its first form belongs to
    ``self.typeactive`` and the summed frequency of all its forms is
    strictly greater than ``self.parametre['eff_min_forme']``.
    """
    print('creation liste act')
    selected = []
    for lem in self.lems:
        words = self.lems[lem]
        if self.formes[words[0]][2] not in self.typeactive:
            continue
        total = sum([self.formes[mot][0] for mot in words])
        if total > self.parametre['eff_min_forme']:
            selected.append(lem)
    self.actives = selected
def make_var_supp(self):
    """Build ``self.supp``, the list of lemmas kept as supplementary variables.

    A lemma is kept when the type of its first form belongs to
    ``self.supplementaires`` and the summed frequency of all its forms is
    strictly greater than ``self.parametre['eff_min_forme']``.
    """
    print('creation var supp')
    selected = []
    for lem in self.lems:
        words = self.lems[lem]
        if self.formes[words[0]][2] not in self.supplementaires:
            continue
        total = sum([self.formes[mot][0] for mot in words])
        if total > self.parametre['eff_min_forme']:
            selected.append(lem)
    self.supp = selected
567 def make_and_write_sparse_matrix_from_uci(self, fileout) :
568 print 'make_and_write_sparse_martrix_from_uci'
569 with open(fileout+'~', 'w') as f :
570 for i, lem in enumerate(self.actives) :
571 ucis = list(set([uce[0] for form in self.lems[lem] for uce in self.formes[form][1]]))
574 f.write(''.join([' '.join([`uci+1`,`i+1`,`1`]),'\n']))
575 with open(fileout+'~', 'r') as f :
578 for i, line in enumerate(f) :
581 with open(fileout, 'w') as f :
582 txt = "%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % ( len(self.ucis), len(self.actives), nrow)
584 os.remove(fileout+'~')
587 def make_pondtable_with_uci(self, lformes, fileout) :
588 table_uci = [[0 for val in lformes] for line in range(0,len(self.ucis))]
589 for i, lem in enumerate(lformes) :
590 for form in self.lems[lem] :
591 ucit = [val for val in self.formes[form][1]]
593 table_uci[uci[0]][i] += self.formes[form][1][uci]
594 table_uci = [[str(val) for val in line] for line in table_uci]
595 table_uci.insert(0,lformes)
596 with open(fileout, 'w') as f :
597 f.write('\n'.join([';'.join(line) for line in table_uci]))
def make_tableet_with_uci(self, fileout):
    """Write the uci x starred-variable presence table to ``fileout``.

    The first row holds the values returned by
    ``self.get_unique_etoiles()``.  Each following row corresponds to one
    uci and holds ``1`` in the column of every starred variable listed in
    the first entry of that uci in ``self.etoiles`` (``uci[0][0]``),
    ``0`` elsewhere.  Fields are separated by ``;`` and rows by newlines.
    """
    et = self.get_unique_etoiles()
    # Column lookup built once instead of calling et.index() for every
    # value; setdefault keeps the first position, like list.index.
    column = {}
    for position, name in enumerate(et):
        column.setdefault(name, position)
    table_out = [[0 for val in et] for line in range(0, len(self.ucis))]
    for i, uci in enumerate(self.etoiles):
        row = table_out[i]
        for valet in uci[0][0]:
            row[column[valet]] = 1
    rows = [et] + [[str(val) for val in line] for line in table_out]
    with open(fileout, 'w') as f:
        f.write('\n'.join([';'.join(line) for line in rows]))
612 def make_table_with_uce(self, orderuces) :
613 print 'make_table_with_uce'
615 table_uce = [[0 for val in self.actives] for line in range(0, len(orderuces))]
616 for i, lem in enumerate(self.actives) :
617 for form in self.lems[lem] :
618 for uce in self.formes[form][1] :
619 #ijk = '.'.join([str(val) for val in uce])
620 table_uce[orderuces[uce]][i] = 1
623 # def make_sparse_matrix_with_uce(self, orderuces) :
624 # print 'make_sparse_matrix_with_uce'
626 # for i, lem in enumerate(self.actives) :
627 # for form in self.lems[lem] :
628 # for uce in self.formes[form][1] :
629 # #ijk = '.'.join([str(val) for val in uce])
630 # smat.append((`orderuces[uce]+1`,`i+1`,`1`))
631 # smat = list(set(smat))
635 # def write_sparse_matrix(self, fileout, smat, nrow, ncol) :
636 # print 'write_sparse_matrix'
637 # txt = "%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % ( nrow, ncol, len(smat))
638 # with open(fileout, 'w') as f :
639 # f.write(txt+'\n'.join([' '.join(line) for line in smat]))
641 def make_and_write_sparse_matrix_from_uce(self, orderuces, fileout) :
642 print 'make_and_write_sparse_martrix_from_uce'
643 with open(fileout+'~', 'w') as f :
644 for i, lem in enumerate(self.actives) :
645 uces = list(set([uce for form in self.lems[lem] for uce in self.formes[form][1]]))
647 f.write(''.join([' '.join([`orderuces[uce]+1`,`i+1`,`1`]),'\n']))
649 with open(fileout+'~', 'r') as f :
652 for i, line in enumerate(f) :
655 with open(fileout, 'w') as f :
656 txt = "%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % ( len(orderuces), len(self.actives), nrow)
658 os.remove(fileout+'~')
660 def make_and_write_sparse_matrix_from_uce_list(self, listin, fileout) :
661 print 'make_and_write_sparse_martrix_from_uce'
662 orderuces = [(i,j,k) for i, uci in enumerate(self.ucis_paras_uces) for j, para in enumerate(uci) for k, uce in enumerate(para)]
663 orderuces = dict([[uce,i] for i, uce in enumerate(orderuces)])
664 with open(fileout+'~', 'w') as f :
665 for i, forme in enumerate(listin) :
666 uces = [uce for uce in self.formes[forme][1]]
668 f.write(''.join([' '.join([`orderuces[uce]+1`,`i+1`,`1`]),'\n']))
670 with open(fileout+'~', 'r') as f :
673 for i, line in enumerate(f) :
676 with open(fileout, 'w') as f :
677 txt = "%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % ( len(orderuces), len(listin), nrow)
679 os.remove(fileout+'~')
682 def make_table_with_classe(self, uces, list_act) :
683 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
684 uces = dict([[uce, i] for i, uce in enumerate(uces)])
685 for i, lem in enumerate(list_act) :
686 for form in self.lems[lem] :
687 for uce in self.formes[form][1] :
689 table_uce[uces[uce]][i] = 1
690 table_uce.insert(0, list_act)
693 def make_and_write_sparse_matrix_from_classe(self, uces, list_act, fileout) :
694 print 'make_and_write_sparse_martrix_from_classe'
695 duces = dict([[uce, i] for i, uce in enumerate(uces)])
696 with open(fileout+'~', 'w') as f :
697 for i, lem in enumerate(list_act) :
698 uces_ok = list(set([uce for form in self.lems[lem] for uce in self.formes[form][1]]).intersection(uces))
700 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
702 with open(fileout+'~', 'r') as f :
705 for i, line in enumerate(f) :
708 with open(fileout, 'w') as f :
709 txt = "%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % ( len(uces), len(list_act), nrow)
711 os.remove(fileout+'~')
713 def make_uc(self, uces, orderuce, min_word_by_uc):
714 print 'start make uc'
715 ucenb= [uces[val] for val in orderuce]
718 for i, uci in enumerate(self.ucis_paras_uces) :
719 for j, para in enumerate(uci) :
721 for k, uce in enumerate(para) :
723 if uc[-1] >= min_word_by_uc :
724 uc.append(uces[uce_id])
726 uc[-1] += uces[uce_id]
727 uces_uc[uce_id] = len(uc)-1
730 return lenuc, uces_uc
732 def make_and_write_sparse_matrix_from_uc(self, uces_uc, fileout) :
733 print 'make_and_write_sparse_martrix_from_uc'
735 with open(fileout+'~', 'w') as f :
736 for i, lem in enumerate(self.actives) :
737 uces = list(set([uce for form in self.lems[lem] for uce in self.formes[form][1]]))
739 if (uces_uc[uce],i) not in deja_la :
740 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
741 deja_la[(uces_uc[uce],i)]=''
743 with open(fileout+'~', 'r') as f :
746 for i, line in enumerate(f) :
749 with open(fileout, 'w') as f :
750 txt = "%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (max(uces_uc.values()) + 1, len(self.actives), nrow)
752 os.remove(fileout+'~')
755 # def make_tab_uc(self, uces_uc, uc) :
756 # print 'make_tab_uc'
757 # tabuc = [[0 for val in self.actives] for line in uc]
758 # for i, word in enumerate(self.actives) :
759 # for forme in self.lems[word] :
760 # valforme = self.formes[forme]
761 # for j, uce in enumerate(valforme[1]):
762 # #uce = '.'.join([str(val) for val in uci])
763 # ligne = uces_uc[uce]
764 # tabuc[ligne][i] = 1
767 def write_tab(self, tab, fileout) :
768 print 'commence ecrire'
771 writer = csv.writer(open(fileout, 'wb'), delimiter=';', quoting = csv.QUOTE_NONNUMERIC)
772 writer.writerows(tab)
774 def make_concord(self, words, txt, color) :
777 for forme in self.lems[word] :
778 txt = txt.replace(' '+forme+' ', ' <font color=%s>' % color +forme+'</font> ')
781 def make_colored_corpus(self) :
782 #colors = ['black', 'red', 'blue', 'green', 'orange', 'yellow', 'brown', 'pink', 'grey']
784 for i, lc in enumerate(self.lc) :
787 for uce in self.lc0 :
789 color = ['black'] + colors[len(self.lc) - 1]
791 <meta http-equiv="content-Type" content="text/html; charset=%s" />
793 ''' % sys.getdefaultencoding()
794 res = [[' '.join(self.ucis[i][0]), '<br><hr>'.join(['<font color="%s">' % color[ucecl[(i,j, k)]] + ' '.join(uce) + '</font>' for j, paras in enumerate(uci) for k, uce in enumerate(paras) ])] for i, uci in enumerate(self.ucis_paras_uces)]
795 txt += '<br>'.join(['<br>'.join(uci) for uci in res])
796 txt += '</body></html>'
798 #with open(filename,'w') as f :
801 def export_corpus_classes(self, filename, alc = False, lem = False) :
803 ucis_paras_uces = self.make_ucis_paras_uces_lems()
805 ucis_paras_uces = self.ucis_paras_uces
807 for i, lc in enumerate(self.lc) :
810 for uce in self.lc0 :
812 ucecltri = ucecl.keys()
813 #ucecltri = [[int(val) for val in uce] for uce in ucecltri]
816 #for i, uce in enumerate(ucecltri) :
818 # print self.etoiles[uce[0]][uce[1]][uce[2]]
819 # print ' '.join(ucis_paras_uces[uce[0]][uce[1]][uce[2]])
820 res = [[u'**** *classe_%i ' % ucecl[uce] + ' '.join(self.etoiles[uce[0]][uce[1]][uce[2]]), ' '.join(ucis_paras_uces[uce[0]][uce[1]][uce[2]])] for uce in ucecltri]
822 vd = [self.etoiles[uce[0]][uce[1]][uce[2]] for uce in ucecltri]
823 vd = [['<' + '='.join(et.split('_')) + '>' for et in l] for l in vd]
824 res = [['<classe=%i>' % ucecl[uce], ' '.join(ucis_paras_uces[uce[0]][uce[1]][uce[2]])] for uce in ucecltri]
825 res = [[' '.join([res[i][0],' '.join(vd[i])]), res[i][1]] for i, d in enumerate(res)]
826 with open(filename,'w') as f :
827 f.write('\n'.join(['\n'.join(uce) for uce in res]))
829 def get_concord(self, duce, word, uces, color):
831 lformes = self.lems[word]
832 for forme_ori in lformes :
833 forme = self.formes[forme_ori]
834 for ucenb in forme[1] :
837 ucinb, paranb, ucenb = ucenb
839 nuce = ' ' + duce[ijk] + ' '
840 nuce = nuce.replace(' '+forme_ori+' ', ' <font color=%s>' % color +forme_ori+'</font> ')
841 duce[ijk] = nuce.strip()
843 nuce = ' ' + ' '.join(self.ucis_paras_uces[ucinb][paranb][ucenb]) + ' '
844 nuce = nuce.replace(' '+forme_ori+' ', ' <font color = %s>' % color +forme_ori+'</font> ')
845 duce[ijk] = nuce.strip()
848 def count_from_list(self, l, d) :
856 def count_from_list_cl(self, l, d, a, clnb) :
865 def find_segments(self, taille_segment, taille_limite) :
866 print 'find_segments'
868 for para in self.ucis_paras_uces :
871 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
872 l = [[d[val], val] for val in d if d[val] >= 3]
875 if len(l) > taille_limite :
876 l = l[-taille_limite:]
879 def find_segments_doublon(self, taille_segment, taille_limite) :
880 print 'find_segments'
882 for para in self.ucis_paras_uces :
885 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
886 l = [[d[val], val] for val in d if d[val] > 1]
889 if len(l) > taille_limite :
890 l = l[-taille_limite:]
893 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
895 ucel = [self.ucis_paras_uces[uce[0]][uce[1]][uce[2]] for uce in list_uce]
897 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
898 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
901 if len(l) > taille_limite :
902 l = l[-taille_limite:]
905 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
907 ucis_paras_uces = self.make_ucis_paras_uces_lems()
909 ucis_paras_uces = self.ucis_paras_uces
911 cl_uces = [[ucis_paras_uces[uce[0]][uce[1]][uce[2]] for uce in list_uce] for list_uce in self.lc]
912 for b, classe in enumerate(cl_uces) :
914 for taille_segment in range(lenmin,lenmax) :
915 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
916 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
917 with open(fileout, 'w') as f :
918 f.write('\n'.join([';'.join(line) for line in result]))
920 def read_uce_from_R(self, filein) :
921 with open(filein, 'rU') as f :
926 line = line.replace('\n', '').replace('"', '').split(';')
927 ucecl.append([int(line[0]) - 1, int(line[1])])
def make_lc(self, uces, classes, clnb):
    """Group the uces by class number.

    ``classes[j]`` is the class of ``uces[j]``.  ``self.lc`` receives one
    list per class ``1..clnb`` (class ``n`` at index ``n - 1``) and
    ``self.lc0`` the uces whose class is ``0``.
    """
    def members(label):
        # Uces whose class equals ``label``, in their original order.
        return [uce for j, uce in enumerate(uces) if label == classes[j]]

    self.lc = [members(i + 1) for i in range(0, clnb)]
    self.lc0 = members(0)
def build_profile(self, clnb, classes, lformes, fileout):
    """Write, for each lemma, the number of uces per class that contain it.

    For every lemma of ``lformes`` the uces of all its forms (keys of
    ``self.formes[word][1]``) are intersected with the members of each of
    the first ``clnb`` classes of ``self.lc``.  Lemmas found in more than
    3 classified uces in total are written to ``fileout`` as
    ``lemma;count_class1;...;count_classN`` rows separated by newlines.

    ``classes`` is not used by this method.
    """
    print('build_profile')
    # Class membership does not change during the loops: build each set once.
    class_sets = [set(members) for members in self.lc[:clnb]]
    rows = []
    for forme in lformes:
        found = [set() for val in range(0, clnb)]
        for word in self.lems[forme]:
            word_uces = set(self.formes[word][1])
            for i in range(0, clnb):
                found[i] |= word_uces & class_sets[i]
        counts = [len(val) for val in found]
        if sum(counts) > 3:
            rows.append([forme] + [str(val) for val in counts])
    with open(fileout, 'w') as f:
        f.write('\n'.join([';'.join(line) for line in rows]))
949 def make_etoiles(self, para_coords) :
950 if self.parametre['para'] :
951 self.etoiles = [[[uci[0][1:]+[para_coords[j][i][0]] for uce in self.ucis_paras_uces[j][i]] for i, para in enumerate(para_coords[j])] for j, uci in enumerate(self.ucis)]
953 self.etoiles = [[[uci[0][1:] for uce in self.ucis_paras_uces[j][i]] for i, para in enumerate(self.ucis_paras_uces[j])] for j, uci in enumerate(self.ucis)]
954 print '#####_etoile_######'
955 for forme in self.etintxt :
956 ucel = [tuple(val) for val in forme[1]]
957 for uce in set(ucel) :
958 self.etoiles[uce[0]][uce[1]][uce[2]].append(forme[0])
# Build a table with one row per distinct value found in self.etoiles and one
# column per class, then write it to `fileout` (fields joined with ';', rows
# with '\n'). Rows whose counters sum to less than 1 are dropped.
# NOTE(review): `classes` and `uces` are not used by any statement visible here.
960 def build_profile_et(self, clnb, classes, uces, fileout) :
961 print 'build_profile_et'
# Every distinct element of every innermost list of self.etoiles.
962 unique_et = list(set([uce[i] for uci in self.etoiles for para in uci for uce in para for i in range(0,len(uce))]))
# One zero-initialised counter per (value, class) pair.
963 tabout = [[0 for val in range(0,clnb)] for line in unique_et]
964 for i, et in enumerate(unique_et) :
965 for j in range(0,clnb) :
966 for uce in self.lc[j] :
967 #coord = uce.split('.')
# NOTE(review): no visible statement binds `coord` before the test below (the
# embedded numbering skips 968) - confirm against the full file.
969 #coord = [int(val) for val in coord]
# `coord` is used as a three-level index into self.etoiles.
970 if et in self.etoiles[coord[0]][coord[1]][coord[2]] :
# NOTE(review): the body of this `if` is not visible (numbering skips 971);
# presumably it increments tabout[i][j] - confirm.
# Keep only the rows with at least one hit, prefixed with their value.
972 tabout = [[unique_et[i]] + [str(val) for val in tabout[i]] for i,line in enumerate(tabout) if sum(line) >= 1]
973 with open(fileout, 'w') as f :
974 f.write('\n'.join([';'.join(line) for line in tabout]))
def make_lem_type_list(self) :
    """Fill self.lem_type_list with one [lem, type] pair per lem.

    The type is field 2 of the self.formes entry of the lem's first form.
    """
    pairs = []
    for lem in self.lems :
        first_form = self.lems[lem][0]
        pairs.append([lem, self.formes[first_form][2]])
    self.lem_type_list = pairs
def extractnr(self, fileout = '/home/pierre/fac/identite/nr.csv') :
    """Write the entries of self.lem_type_list whose type is 'nr'.

    fileout -- destination path; defaults to the historical hard-coded
               location so existing calls without argument are unchanged.

    Each kept entry is written as its fields joined with ';', one entry
    per line.
    """
    with open(fileout, 'w') as f :
        f.write('\n'.join([';'.join(line) for line in self.lem_type_list if line[1] == 'nr']))
def get_actives_nb(self) :
    """Return the number of lems whose type is not in self.supplementaires.

    The type of a lem is field 2 of the self.formes entry of its first form.
    """
    count = 0
    for lem in self.lems :
        first_form = self.lems[lem][0]
        if self.formes[first_form][2] not in self.supplementaires :
            count += 1
    return count
def get_supp_nb(self) :
    """Return the number of lems whose type is in self.supplementaires.

    The type of a lem is field 2 of the self.formes entry of its first form.
    """
    lem_types = (self.formes[self.lems[lem][0]][2] for lem in self.lems)
    return sum(1 for typ in lem_types if typ in self.supplementaires)
def get_tot_occurrences(self) :
    """Return the sum of field 0 over all entries of self.formes."""
    total = 0
    for forme in self.formes :
        total += self.formes[forme][0]
    return total
def get_unique_etoiles(self):
    """Return the distinct elements of the innermost lists of self.etoiles.

    The result is a list in no particular order.
    """
    seen = set()
    for uci in self.etoiles :
        for para in uci :
            for uce in para :
                for pos in range(0, len(uce)) :
                    seen.add(uce[pos])
    return list(seen)
def get_hapax(self) :
    """Return the keys of self.formes whose field 0 equals 1."""
    once = []
    for forme in self.formes :
        if self.formes[forme][0] == 1 :
            once.append(forme)
    return once
999 # def get_hapax_by_cluster(self):
1000 # print 'get_hapax_by_cluster'
1001 # hapax = self.get_hapax()
1002 # res = dict([[i+1, 0] for i in range(len(self.lc))])
1003 # sets = [dict(zip(cl,cl)) for cl in self.lc]
1004 # #classement = [self.lc0] + self.lc
1007 # uce = self.formes[hx][1].keys()[0]
1008 # for i, cl in enumerate(self.lc) :
1009 # if '.'.join([str(val) for val in uce]) in sets[i] :
1011 # toprint = '\n'.join([';'.join([`i`, `res[i]`]) for i in res])
1012 # outf = os.path.join(os.path.dirname(self.dictpathout['ira']), 'hapax_par_classe.csv')
1013 # with open(outf, 'w') as f :
# Compute per-class statistics over self.lc and format them as a
# ';'-separated table whose header is:
# classe;occurrences;nb formes;hapax;uce;hapax/nb formes
# `outf` is opened for writing at the end.
# NOTE(review): the embedded numbering skips 1018-1019, 1033, 1035, 1039 and
# 1044, so the statements that update res2 and res3, the loop header that
# binds `hx` and the write to `f` are not visible here - confirm against the
# full file.
1016 def get_stat_by_cluster(self, outf) :
1017 print 'get_occurrence_by_cluster'
1020 # return tuple([int(val) for val in uce.split('.')])
# res, res2 and res3 map class number (1-based) to a counter starting at 0;
# res4 maps class number to the number of uces in that class.
1021 res = dict([[i+1, 0] for i in range(len(self.lc))])
1022 res2 = dict([[i+1, 0] for i in range(len(self.lc))])
1023 res3 = dict([[i+1, 0] for i in range(len(self.lc))])
1024 res4 = dict([[i+1,len(cl)] for i, cl in enumerate(self.lc)])
# Set and dict views of each class, used for membership tests below.
1025 sets = [set(cl) for cl in self.lc]
1026 dicts = [dict(zip(cl,cl)) for cl in self.lc]
1027 for forme in self.formes :
1028 for i, cl in enumerate(self.lc) :
# uces of class i+1 in which this form appears.
1029 concern = sets[i].intersection(self.formes[forme][1].keys())
1030 for uce in concern :
# Add the value stored for this form in this uce to the class total.
1031 res[i+1] += self.formes[forme][1][uce]
1032 if len(concern) != 0 :
# NOTE(review): body of this `if` not visible; presumably it counts the form
# for the class in res2 - confirm.
1034 hapax = self.get_hapax()
# First key of the uce mapping of `hx` (Python 2: keys() returns a list).
1036 uce = self.formes[hx][1].keys()[0]
1037 for i, cl in enumerate(self.lc) :
1038 if uce in dicts[i] :
# NOTE(review): body of this `if` not visible; presumably it counts the hapax
# for the class in res3 - confirm.
# One row per class. NOTE(review): the last column divides by res2[i] and
# raises ZeroDivisionError if that counter is still 0 - verify this cannot
# happen.
1040 toprint = '\n'.join([';'.join([`i`, `res[i]`, `res2[i]`, `res3[i]`, `res4[i]`, `float(res3[i])/float(res2[i])`]) for i in res])
# Prepend the header row.
1041 toprint = '\n'.join([';'.join([u'classe', u'occurrences', 'nb formes', u'hapax', u'uce', 'hapax/nb formes']), toprint])
1042 #outf = os.path.join(os.path.dirname(self.dictpathout['ira']), 'stat_par_classe.csv')
1043 with open(outf, 'w') as f :
1046 # def get_formenb_by_cluster(self) :
1047 # print 'get_formenb_by_cluster'
1049 # res = dict([[i+1, 0] for i in range(len(self.lc))])
1050 # sets = [set(cl) for cl in self.lc]
1051 # for forme in self.formes :
1052 # uces = ['.'.join([str(val) for val in uce]) for uce in self.formes[forme][1]]
1053 # for i, cl in enumerate(sets) :
1054 # if len(cl.intersection(uces)) != 0 :
1056 # toprint = '\n'.join([';'.join([`i`, `res[i]`]) for i in res])
1057 # outf = os.path.join(os.path.dirname(self.dictpathout['ira']), 'nbformes_par_classe.csv')
1058 # with open(outf, 'w') as f :
# Build a lem x value table: for each retained lem and each element of `let`,
# add up the values stored in self.formes for the lem's forms in the uces
# whose self.etoiles entry contains that element.
# NOTE(review): the embedded numbering skips 1063, 1071 and 1075; no return
# statement is visible here - confirm against the full file.
1061 def make_eff_from_etoiles(self, let, mineff) :
# Lems whose forms have a summed field 0 strictly greater than mineff.
1062 forme_ok = [forme for forme in self.lems if sum([self.formes[word][0] for word in self.lems[forme]]) > mineff]
1064 #forme_ok = [forme for forme in self.formes if self.formes[forme][0] >= mineff]
# One zero-initialised row per retained lem, one column per element of `let`.
1065 tabout = [[0 for et in let] for forme in forme_ok]
1066 for i, forme in enumerate(forme_ok) :
1067 for word in self.lems[forme] :
# Keys of self.formes[word][1] are used as three-level indexes into self.etoiles.
1068 for coord in self.formes[word][1] :
1069 for j, et in enumerate(let) :
1070 if et in self.etoiles[coord[0]][coord[1]][coord[2]]:
1072 tabout[i][j] += self.formes[word][1][coord]
# Second filter, on the row total this time (>= mineff, whereas the first
# filter above is strict), then prefix each row with its lem.
1073 tabout = [[forme] + tabout[i] for i, forme in enumerate(forme_ok) if sum(tabout[i]) >= mineff]
# Header row: an empty corner cell followed by the elements of `let`.
1074 tabout.insert(0, [''] + let)
# Build a type x value table from self.formes, grouping forms by their field 2.
# NOTE(review): the embedded numbering skips 1078, 1084, 1093 and 1096, so the
# initialisation of `dtypes`, the alternative branch of the `if`, the update
# of `tabout` and any return statement are not visible here - confirm against
# the full file.
1077 def make_efftype_from_etoiles(self, let) :
1079 for forme in self.formes :
# Type already known: add field 0 and extend the list of uce keys.
1080 if self.formes[forme][2] in dtypes :
1081 dtypes[self.formes[forme][2]][0] += self.formes[forme][0]
1082 #dtypes[self.formes[forme][2]][1] += self.formes[forme][1][:]
1083 dtypes[self.formes[forme][2]][1] += [uce for uce in self.formes[forme][1]]
1085 #dtypes[self.formes[forme][2]] = [self.formes[forme][0], self.formes[forme][1][:]]
# Create the entry of a type: [field 0 of the form, list of its uce keys].
1086 dtypes[self.formes[forme][2]] = [self.formes[forme][0], [uce for uce in self.formes[forme][1]]]
1087 ltypes = [typ for typ in dtypes]
# One zero-initialised row per type, one column per element of `let`.
1088 tabout = [[0 for et in let] for typ in dtypes]
1089 for i, typ in enumerate(ltypes) :
1090 for coord in dtypes[typ][1] :
1091 for j, et in enumerate(let) :
# `coord` is used as a three-level index into self.etoiles.
1092 if et in self.etoiles[coord[0]][coord[1]][coord[2]]:
# Prefix each row with its type, then add the header row.
1094 tabout = [[typ] + tabout[i] for i, typ in enumerate(ltypes)]
1095 tabout.insert(0, [''] + let)
# For a value `et`, collect the 1-based sequence numbers of the uces belonging
# to the ucis whose first field contains `et`.
# NOTE(review): the embedded numbering skips 1101, 1102 and 1105, so the
# initialisation of `linenb`, the statement that binds `et` (presumably a loop
# over `listet`) and any return statement are not visible here - confirm
# against the full file.
1098 def make_etline(self, listet) :
# All (uci, paragraph, uce) index triples, in corpus order ...
1099 orderuces = [(i,j,k) for i, uci in enumerate(self.ucis_paras_uces) for j, para in enumerate(uci) for k, uce in enumerate(para)]
# ... mapped to their 0-based position in that order.
1100 orderuces = dict([[uce,i] for i, uce in enumerate(orderuces)])
# repr() of the 1-based position of every uce of the ucis matching `et`.
1103 linenb.append([`orderuces[(i,j,k)] + 1` for i, uci in enumerate(self.ucis_paras_uces) for j,para in enumerate(uci) for k, uce in enumerate(para) if et in self.ucis[i][0]])
# Put the value itself at the head of its row.
1104 linenb[-1].insert(0,et)
def write_etoiles(self, fileout) :
    """Write one line per uce to `fileout`.

    Each line is the first field of the uce's uci without its first
    element, joined with ';'. Lines are separated by '\\n'.
    """
    with open(fileout, 'w') as f :
        rows = []
        for i, uci in enumerate(self.ucis) :
            for para in self.ucis_paras_uces[i] :
                for uce in para :
                    rows.append(';'.join(uci[0][1:]))
        f.write('\n'.join(rows))
# Run the corpus preparation steps and return (ucis_txt, ucis_paras_txt).
# `dlg` receives progress updates; `parent` provides DictPath, from which the
# expression dictionary path is read.
# NOTE(review): the embedded numbering has many gaps in this method (1112,
# 1114-1115, 1117, 1123-1124, 1126-1128, 1130, 1132, 1136, 1138, 1145-1146,
# 1148), so guards and calls are missing from this view. In particular `dlg`
# defaults to None but dlg.Update is called below, and the two groups of
# assignments to ucis_txt / ucis_paras_txt are presumably the branches of a
# test on `fromtt` - confirm against the full file.
# NOTE(review): `cmd` is not used by any statement visible here.
1111 def start_analyse(self, parent, dlg = None, cmd = False, fromtt = False) :
1113 dlg.Update(1, u'Nettoyage 1')
# Expression detection is skipped when `fromtt` is true.
1116 if self.parametre['expressions'] and not fromtt:
1118 dlg.Update(2, u'Expressions...')
1119 lang = self.parametre['lang']
# Falls back to 'french_exp' when there is no entry for lang + '_exp'.
1120 dico_path = parent.DictPath.get(lang + '_exp', 'french_exp')
1121 expressions = ReadDicoAsDico(dico_path)
1122 self.find_expression(expressions)
1125 dlg.Update(3, u'Nettoyage 2')
1129 dlg.Update(4, u'Construction des tableaux')
1131 ucisnb = self.make_ucis()
# No uci found by make_ucis: retry with make_ucis_with_digit.
1133 if self.ucis == [] :
1134 ucisnb = self.make_ucis_with_digit()
1135 lines = self.make_lines(ucisnb)
1137 #ucis_mots = make_ucis_words(lines)
1139 ucis_txt = self.make_ucis_txt(lines)
1140 #print 'ATTENTION : CHECK DOUBLON'
1141 #self.check_double(ucis_txt)
1142 ucis_lines = self.make_ucis_lines(lines)
1143 self.para_coords = self.make_para_coords(ucis_lines)
1144 ucis_paras_txt = self.make_ucis_paras_txt(self.para_coords, ucis_lines, ucis_txt)
# Alternative source: texts obtained from get_ucis_from_tt, each uci being
# wrapped as its own single paragraph, with empty paragraph coordinates.
1147 ucis_txt = get_ucis_from_tt(self)
1149 ucis_paras_txt = [[uci] for uci in ucis_txt]
1150 self.para_coords = [[] for val in ucis_paras_txt]
1151 #print('ATTENTION PHRASE')
1152 #ucis_paras_txt = self.corpus.make_ucis_paras_txt_phrases(para_coords, ucis_lines, ucis_txt)
1153 return ucis_txt, ucis_paras_txt
# Debugging helper: detect the uci texts that occur more than once in
# `ucis_txt` and dump several reports to files.
# NOTE(review): the embedded numbering has gaps (1156-1157, 1159-1160, 1162,
# 1164-1165, 1177-1179, 1182-1184), so the initialisation of `ducis` and
# `uci_ok`, the membership test around the two `ducis[uci]` statements and the
# increment of the occurrence count are not visible here - confirm against
# the full file.
# NOTE(review): the three output paths under /media/cledemoi/ are hard-coded.
1155 def check_double(self, ucis_txt):
1158 for i, uci in enumerate(ucis_txt) :
# Text already seen: record one more position for it.
1161 ducis[uci][1].append(i)
# First occurrence: [occurrence count, list of positions].
1163 ducis[uci] = [1, [i]]
1166 list_uci_ok = [uci for uci in ducis]
1167 print 'len(list_uci_ok)', len(list_uci_ok)
1168 print 'len set list uci', len(set(list_uci_ok))
# Pairs of (first field of the uci joined with spaces, uci text) for the
# positions listed in `uci_ok`.
1169 toprint = [[' '.join(self.ucis[i][0]), ucis_txt[i]] for i in uci_ok]
1170 print 'len toprint', len(toprint)
1171 with open('/media/cledemoi/voile_2003_2004_ssdoublons.txt', 'w') as f:
1172 f.write('\n'.join(['\n'.join(val) for val in toprint]))
1173 lucis = [ducis[uci] for uci in ducis]
1174 #lucis = sortedby(lucis, 2, 0)
# Keep the entries whose count is greater than 1.
1175 lucis = [val for val in lucis if val[0] > 1]
1176 print 'len lucis', len(lucis)
1180 # if val[0] in ducis :
1181 # ducis[val[0]] += 1
# Report rows: count, text with ';' replaced by spaces, positions joined with ';'.
1185 uci_pas_ok = [[ducis[uci][0], uci.replace(';', ' '), ';'.join([str(val) for val in ducis[uci][1]])] for uci in ducis if ducis[uci][0] > 1]
1186 #uci_pas_ok = sortedby(uci_pas_ok, 0, 2)
1187 uci_pas_ok = [[str(val[0]), val[1], val[2]] for val in uci_pas_ok]
1188 with open('/media/cledemoi/doublons.txt', 'w') as f:
1189 f.write('\n'.join([';'.join(val) for val in uci_pas_ok]))
# For each duplicated text, the first fields of all the ucis that carry it.
1190 etpasok = [[' '.join(self.ucis[i][0]) for i in ducis[uci][1]] for uci in ducis if ducis[uci][0] > 1]
1191 with open('/media/cledemoi/etdoublons.txt', 'w') as f:
1192 f.write('\n'.join([';'.join(line) for line in etpasok]))
def make_et_table(self) :
    """Write the first field of every uci to 'tableau_et.csv'.

    The file is created in the directory of self.dictpathout['ira'].
    Each line is the first field of one uci, joined with ';'.
    """
    outdir = os.path.dirname(self.dictpathout['ira'])
    fileout = os.path.join(outdir, 'tableau_et.csv')
    with open(fileout, 'w') as f :
        rows = [';'.join(uci[0]) for uci in self.ucis]
        f.write('\n'.join(rows))
# Write 'uci_stat.csv' in the directory of self.dictpathout['ira'], one
# ';'-separated row per element of the local list `lc`.
# NOTE(review): the embedded numbering skips 1201 and 1204, so the
# initialisation of `lc` and the statement that adds each rewritten class to
# it are not visible here - confirm against the full file.
1200 def make_uci_stat(self) :
1202 for i, classe in enumerate(self.lc) :
# Split each uce on '.' and append the 0-based class index as a string.
# NOTE(review): this expects the uces of self.lc to be strings, whereas other
# methods index them like tuples - verify which form self.lc holds here.
1203 classe = [val.split('.') + [str(i)] for val in classe]
1205 fileout = os.path.join(os.path.dirname(self.dictpathout['ira']), 'uci_stat.csv')
1206 with open(fileout,'w') as f :
1207 f.write('\n'.join([';'.join(line) for line in lc]))
# Write 'taille_uci.csv' in the directory of self.dictpathout['ira']: one
# "index;size" row per uci, the size being the summed len() of its uces.
1209 def make_size_uci(self) :
# [uci index, total length of all uces in all paragraphs of the uci].
1210 sizes = [[i, sum([len(uce) for para in uci for uce in para])] for i, uci in enumerate(self.ucis_paras_uces)]
1211 outf = os.path.join(os.path.dirname(self.dictpathout['ira']), 'taille_uci.csv')
1212 for i, size in sizes :
# NOTE(review): the embedded numbering skips 1213; the two prints below are
# presumably guarded by a test on `size` - confirm against the full file.
1214 print self.ucis_paras_uces[i]
1215 print self.etoiles[i]
1216 with open(outf, 'w') as f :
1217 f.write('\n'.join([';'.join([str(val) for val in line]) for line in sizes]))
def prof_type(self) :
    """Write the type x class table to self.dictpathout['type_cl'].

    For every class of self.lc and every type (field 2 of a self.formes
    entry), adds up the values stored for the forms of that type in the
    uces of the class. The file has a header row
    ('type', 'classe 1', ...) followed by one ';'-separated row per type,
    sorted by type so that the output is deterministic.
    """
    nb_classes = len(self.lc)
    # res[k] maps type -> total for class k (1-based).
    res = dict([[i + 1, {}] for i in range(nb_classes)])
    sets = [set(cl) for cl in self.lc]
    for forme in self.formes :
        ftype = self.formes[forme][2]
        uce_counts = self.formes[forme][1]
        for i, cl_set in enumerate(sets) :
            by_type = res[i + 1]
            # Only the uces of this class in which the form appears.
            for uce in cl_set.intersection(uce_counts) :
                by_type[ftype] = by_type.get(ftype, 0) + uce_counts[uce]
    # Collect the types of every class. The outer loop must come first:
    # with the clauses reversed, `res[i]` was read before `i` was bound by
    # the comprehension.
    types = sorted(set([typ for i in res for typ in res[i]]))
    colnames = ['type'] + ['classe ' + repr(i + 1) for i in range(nb_classes)]
    toprint = [[typ] + [repr(res[i + 1].get(typ, 0)) for i in range(nb_classes)] for typ in types]
    toprint.insert(0, colnames)
    fileout = self.dictpathout['type_cl']
    with open(fileout, 'w') as f :
        f.write('\n'.join([';'.join(line) for line in toprint]))
# Write 'type_stat.csv' in the directory of self.dictpathout['ira']: for each
# type, a summary row "type;number of forms;summed field 0" followed by one
# "form;field 0" row per form of that type.
# NOTE(review): the embedded numbering skips 1248 and 1254, so the
# initialisation of `tt` and the alternative branch of the `if` are not
# visible here - confirm against the full file.
1247 def make_type_tot(self):
1249 for lem in self.lems :
1250 for forme in self.lems[lem] :
# Type already known: add field 0 of the form and record the form.
1251 if self.formes[forme][2] in tt :
1252 tt[self.formes[forme][2]][0] += self.formes[forme][0]
1253 tt[self.formes[forme][2]][1].append(forme)
# Create the entry of a type: [field 0 of the form, [form]].
1255 tt[self.formes[forme][2]] = [self.formes[forme][0], [forme]]
# Summary row of each type.
1256 res = [';'.join([typ,str(len(tt[typ][1])),str(tt[typ][0])]) for typ in tt]
# Detail rows of each type, built by iterating `tt` in the same order as above.
1257 res2 = ['\n'.join([';'.join([forme, str(self.formes[forme][0])]) for forme in tt[typ][1]]) for typ in tt]
# Put each summary row immediately before its detail rows.
1258 res = ['\n'.join([res[i], res2[i]]) for i, val in enumerate(res)]
1259 fileout = os.path.join(os.path.dirname(self.dictpathout['ira']), 'type_stat.csv')
1260 with open(fileout, 'w') as f:
1261 f.write('\n'.join(res))
1264 def count_uci_from_list(self, list_in):
1265 #liste_in = '/home/pierre/fac/lerass/bouquin_indentite/liste_mot_chercher_uci.txt'
1266 with codecs.open(list_in,'r', 'utf8') as f :
1268 content = content.splitlines()
1270 for forme in content :
1271 if forme in self.formes :
1272 ucis.append(self.formes[forme][1])
1275 #ucis = [self.formes[forme][1] for forme in content]
1276 ucis = [uc[0] for val in ucis for uc in val]
1277 print len(list(set(ucis)))