X-Git-Url: http://www.iramuteq.org/git?a=blobdiff_plain;f=corpus.py;h=489d4f18de1c71b1ce80f3699985b5cfefd62b21;hb=refs%2Fheads%2F3.0;hp=e6b0bf2345dbeed70bd26eb10ae4ec2fff90d899;hpb=1301403740fe3e9487f67a07870796f9e3dfb1f9;p=iramuteq diff --git a/corpus.py b/corpus.py index e6b0bf2..555a034 100644 --- a/corpus.py +++ b/corpus.py @@ -144,7 +144,7 @@ class Corpus : def read_corpus(self) : log.info('read corpus') - self.parametres['syscoding'] = sys.getdefaultencoding() + self.parametres['syscoding'] = 'utf8' if self.conncorpus is None : self.conn_all() res = self.ccorpus.execute('SELECT * FROM etoiles;') @@ -258,14 +258,14 @@ class Corpus : def getucisize(self) : ucesize = self.getucesize() - return [sum(ucesize[uci.uces[0].ident:(uci.uces[-1].ident + 1)]) for uci in self.ucis] + return [sum(ucesize[uci.uces[0].ident:(uci.uces[-1].ident + 1)]) for uci in self.ucis if len(uci.uces) != 0] def getucesize(self) : res = self.getalluces() return [len(uce[1].split()) for uce in res] def getconcorde(self, uces) : - return self.cuces.execute('select * from uces where id IN (%s) ORDER BY id;' % ', '.join([repr(i) for i in uces])) + return self.cuces.execute('select * from uces where id IN (%s) ORDER BY id;' % ', '.join([repr(i) for i in uces])) def getuciconcorde(self, ucis) : uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis] @@ -293,6 +293,11 @@ class Corpus : def getucesfrometoile(self, etoile) : return [uce.ident for uci in self.ucis for uce in uci.uces if etoile in uci.etoiles] + def getucisfrometoile(self, etoile): + uces = [uce.ident for uci in self.ucis for uce in uci.uces if etoile in uci.etoiles] + return list(set([self.getucefromid(val).uci for val in uces])) + + def getetoileuces(self) : log.info('get uces etoiles') etoileuces = {} @@ -566,7 +571,7 @@ class Corpus : self.make_iduces() actuci = '' actpara = False - with open(outf,'w') as f : + with open(outf,'w', encoding='utf8') as f : for uce in res : if self.iduces[uce[0]].uci == actuci and 
self.iduces[uce[0]].para == actpara : f.write(uce[1] + '\n') @@ -589,7 +594,7 @@ class Corpus : longueur_max = max([len(val) for val in metas]) first = ['column_%i' % i for i in range(longueur_max)] metas.insert(0, first) - with open(outf, 'w') as f : + with open(outf, 'w', encoding='utf8') as f : f.write('\n'.join(['\t'.join(line) for line in metas])) def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) : @@ -604,7 +609,7 @@ class Corpus : self.make_iduces() else : res = self.getallucis() - with open(outf, 'w') as f : + with open(outf, 'w', encoding='utf8') as f : for uce in res : guce = uce[1] if not uci : @@ -627,7 +632,7 @@ class Corpus : self.make_iduces() else : res = self.getuciconcorde(sts) - with open(outf, 'w') as f : + with open(outf, 'w', encoding='utf8') as f : for uce in res : guce = uce[1] if not uci : @@ -652,7 +657,7 @@ class Corpus : outf = os.path.join(rep, outf) if lem : guce = ' '.join([self.formes[forme].lem for forme in guce.split()]) - with open(outf, 'w') as f : + with open(outf, 'w', encoding='utf8') as f : f.write(guce) #.encode('cp1252', errors = 'replace')) def export_tropes(self, fileout, classe, lem = False, uci = False) : @@ -662,7 +667,7 @@ class Corpus : self.make_iduces() else : res = self.getuciconcorde(sts) - with open(fileout, 'w') as f : + with open(fileout, 'w', encoding='utf8') as f : for uce in res : guce = uce[1] if lem : @@ -803,7 +808,7 @@ class Corpus : else : tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives] tab = [[line[0]] + [repr(val) for val in line[1:]] for line in tab if sum(line[1:]) >= 3] - with open(fileout, 'w') as f : + with open(fileout, 'w', encoding='utf8') as f : f.write('\n'.join([';'.join(line) for line in tab])) def make_etoiles(self) : @@ -878,8 +883,8 @@ class Corpus : etoileuces = self.getetoileuces() else : etoileuces = self.getetoileucis() - etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if 
len(etoileuces[et]) > 1]) - with open(fileout, 'w') as f : + etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 0]) + with open(fileout, 'w', encoding='utf8') as f : f.write('\n'.join([';'.join([et] + [repr(len(set(etoileuces[et]).intersection(classe))) for classe in ucecl]) for et in etoileuces])) #.encode(self.parametres['syscoding']) #etoiles = self.make_etoiles() #with open(fileout, 'w') as f : @@ -894,9 +899,9 @@ class Corpus : ucecl[uce] = 0 color = ['black'] + colors[len(self.lc) - 1] txt = ''' - + -''' % sys.getdefaultencoding() +''' if not uci : res = self.getalluces() self.make_iduces() @@ -1002,7 +1007,7 @@ class Corpus : for taille_segment in range(lenmin,lenmax) : d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc)) result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin] - with open(fileout, 'w') as f : + with open(fileout, 'w', encoding='utf8') as f : f.write('\n'.join([';'.join(line) for line in result])) def make_proftype(self, outf) : @@ -1017,7 +1022,7 @@ class Corpus : res[gram][i] += sum([lemuceeff[uce] for uce in concern]) res = [[gram] + [repr(val) for val in res[gram]] for gram in res] res.sort() - with open(outf, 'w') as f : + with open(outf, 'w', encoding='utf8') as f : f.write('\n'.join([';'.join(line) for line in res])) def make_ucecl_from_R(self, filein) : @@ -1057,7 +1062,7 @@ class Corpus : log.info('%f' % (time() - t1)) if outf is not None : toprint = '\n'.join([';'.join([repr(i), repr(occurrences[i]), repr(formescl[i]), repr(hapaxcl[i]), repr(lenclasses[i]), repr(float(hapaxcl[i])/float(formescl[i]))]) for i in occurrences]) - with open(outf, 'w') as f : + with open(outf, 'w', encoding='utf8') as f : f.write(toprint) else : return [[repr(occurrences[i]), repr(formescl[i]), repr(hapaxcl[i]), repr(lenclasses[i]), repr(float(hapaxcl[i])/float(formescl[i]))] for i in occurrences] @@ -1121,21 
+1126,21 @@ class Corpus : txt += """ """ - with open('/tmp/testhapxuce.html','w') as f : + with open('/tmp/testhapxuce.html','w', encoding='utf8') as f : f.write(txt) def export_dictionary(self, fileout, syscoding) : listformes = [[self.formes[forme].freq, forme, self.formes[forme].lem, self.formes[forme].gram] for forme in self.formes] listformes.sort(reverse = True) listformes = [forme[1:] + [repr(forme[0])] for forme in listformes] - with open(fileout, 'w') as f : + with open(fileout, 'w', encoding='utf8') as f : f.write('\n'.join(['\t'.join(forme) for forme in listformes])) def export_lems(self, fileout, syscoding) : self.make_idformes() listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, repr(self.lems[lem].formes[forme])]) for forme in self.lems[lem].formes])] for lem in self.lems] listlem.sort() - with open(fileout, 'w') as f : + with open(fileout, 'w', encoding='utf8') as f : f.write('\n'.join(['\t'.join(lem) for lem in listlem])) @@ -1505,7 +1510,7 @@ class BuildSubCorpus(BuildCorpus): newuces = [] newpara = [] for et in uci.paras : - keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep] + keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep and uce.para == idpara] idpara += 1 if keepuces != [] : newuces += keepuces