X-Git-Url: http://www.iramuteq.org/git?a=blobdiff_plain;f=corpus.py;h=489d4f18de1c71b1ce80f3699985b5cfefd62b21;hb=refs%2Fheads%2F3.0;hp=e6b0bf2345dbeed70bd26eb10ae4ec2fff90d899;hpb=1301403740fe3e9487f67a07870796f9e3dfb1f9;p=iramuteq

diff --git a/corpus.py b/corpus.py
index e6b0bf2..555a034 100644
--- a/corpus.py
+++ b/corpus.py
@@ -144,7 +144,7 @@ class Corpus :
 
     def read_corpus(self) :
         log.info('read corpus')
-        self.parametres['syscoding'] = sys.getdefaultencoding()
+        self.parametres['syscoding'] = 'utf8'
         if self.conncorpus is None :
             self.conn_all()
         res = self.ccorpus.execute('SELECT * FROM etoiles;')
@@ -258,14 +258,14 @@ class Corpus :
 
     def getucisize(self) :
         ucesize = self.getucesize()
-        return [sum(ucesize[uci.uces[0].ident:(uci.uces[-1].ident + 1)]) for uci in self.ucis]
+        return [sum(ucesize[uci.uces[0].ident:(uci.uces[-1].ident + 1)]) for uci in self.ucis if len(uci.uces) != 0]
 
     def getucesize(self) :
         res = self.getalluces()
         return [len(uce[1].split()) for uce in res]
 
     def getconcorde(self, uces) :
-        return self.cuces.execute('select * from uces where id IN (%s) ORDER BY id;' % ', '.join([repr(i) for i in uces])) 
+        return self.cuces.execute('select * from uces where id IN (%s) ORDER BY id;' % ', '.join([repr(i) for i in uces]))
 
     def getuciconcorde(self, ucis) :
         uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis]
@@ -293,6 +293,11 @@ class Corpus :
     def getucesfrometoile(self, etoile) :
         return [uce.ident for uci in self.ucis for uce in uci.uces if etoile in uci.etoiles]
 
+    def getucisfrometoile(self, etoile):
+        uces = [uce.ident for uci in self.ucis for uce in uci.uces if etoile in uci.etoiles]
+        return list(set([self.getucefromid(val).uci for val in uces]))
+
+
     def getetoileuces(self) :
         log.info('get uces etoiles')
         etoileuces = {}
@@ -566,7 +571,7 @@ class Corpus :
         self.make_iduces()
         actuci = ''
         actpara = False
-        with open(outf,'w') as f :
+        with open(outf,'w', encoding='utf8') as f :
             for uce in res :
                 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
                     f.write(uce[1] + '\n')
@@ -589,7 +594,7 @@ class Corpus :
         longueur_max = max([len(val) for val in metas])
         first = ['column_%i' % i for i in range(longueur_max)]
         metas.insert(0, first)
-        with open(outf, 'w') as f :
+        with open(outf, 'w', encoding='utf8') as f :
             f.write('\n'.join(['\t'.join(line) for line in metas]))
 
     def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) :
@@ -604,7 +609,7 @@ class Corpus :
             self.make_iduces()
         else :
             res = self.getallucis()
-        with open(outf, 'w') as f :
+        with open(outf, 'w', encoding='utf8') as f :
             for uce in res :
                 guce = uce[1]
                 if not uci :
@@ -627,7 +632,7 @@ class Corpus :
             self.make_iduces()
         else :
             res = self.getuciconcorde(sts)
-        with open(outf, 'w') as f :
+        with open(outf, 'w', encoding='utf8') as f :
             for uce in res :
                 guce = uce[1]
                 if not uci :
@@ -652,7 +657,7 @@ class Corpus :
             outf = os.path.join(rep, outf)
             if lem :
                 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
-            with open(outf, 'w') as f :
+            with open(outf, 'w', encoding='utf8') as f :
                 f.write(guce) #.encode('cp1252', errors = 'replace'))
 
     def export_tropes(self, fileout, classe, lem = False, uci = False) :
@@ -662,7 +667,7 @@ class Corpus :
             self.make_iduces()
         else :
             res = self.getuciconcorde(sts)
-        with open(fileout, 'w') as f :
+        with open(fileout, 'w', encoding='utf8') as f :
             for uce in res :
                 guce = uce[1]
                 if lem :
@@ -803,7 +808,7 @@ class Corpus :
         else :
             tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
         tab = [[line[0]] + [repr(val) for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
-        with open(fileout, 'w') as f :
+        with open(fileout, 'w', encoding='utf8') as f :
             f.write('\n'.join([';'.join(line) for line in tab]))
 
     def make_etoiles(self) :
@@ -878,8 +883,8 @@ class Corpus :
             etoileuces = self.getetoileuces()
         else :
             etoileuces = self.getetoileucis()
-        etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1])
-        with open(fileout, 'w') as f :
+        etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 0])
+        with open(fileout, 'w', encoding='utf8') as f :
             f.write('\n'.join([';'.join([et] + [repr(len(set(etoileuces[et]).intersection(classe))) for classe in ucecl]) for et in etoileuces])) #.encode(self.parametres['syscoding'])
         #etoiles = self.make_etoiles()
         #with open(fileout, 'w') as f :
@@ -894,9 +899,9 @@ class Corpus :
             ucecl[uce] = 0
         color = ['black'] + colors[len(self.lc) - 1]
         txt = '''<html>
-        <meta http-equiv="content-Type" content="text/html; charset=%s" />
+        <meta http-equiv="content-Type" content="text/html; charset=utf8" />
         <body>
-''' % sys.getdefaultencoding()
+'''
         if not uci :
             res = self.getalluces()
             self.make_iduces()
@@ -1002,7 +1007,7 @@ class Corpus :
                 for taille_segment in range(lenmin,lenmax) :
                     d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
         result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
-        with open(fileout, 'w') as f :
+        with open(fileout, 'w', encoding='utf8') as f :
             f.write('\n'.join([';'.join(line) for line in result]))
 
     def make_proftype(self, outf) :
@@ -1017,7 +1022,7 @@ class Corpus :
                 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
         res = [[gram] + [repr(val) for val in res[gram]] for gram in res]
         res.sort()
-        with open(outf, 'w') as f :
+        with open(outf, 'w', encoding='utf8') as f :
             f.write('\n'.join([';'.join(line) for line in res]))
 
     def make_ucecl_from_R(self, filein) :
@@ -1057,7 +1062,7 @@ class Corpus :
         log.info('%f' % (time() - t1))
         if outf is not None :
             toprint = '\n'.join([';'.join([repr(i), repr(occurrences[i]), repr(formescl[i]), repr(hapaxcl[i]), repr(lenclasses[i]), repr(float(hapaxcl[i])/float(formescl[i]))]) for i in occurrences])
-            with open(outf, 'w') as f :
+            with open(outf, 'w', encoding='utf8') as f :
                 f.write(toprint)
         else :
             return [[repr(occurrences[i]), repr(formescl[i]), repr(hapaxcl[i]), repr(lenclasses[i]), repr(float(hapaxcl[i])/float(formescl[i]))] for i in occurrences]
@@ -1121,21 +1126,21 @@ class Corpus :
         txt += """
         </body></html>
         """
-        with open('/tmp/testhapxuce.html','w') as f :
+        with open('/tmp/testhapxuce.html','w', encoding='utf8') as f :
             f.write(txt)
 
     def export_dictionary(self, fileout, syscoding) :
         listformes = [[self.formes[forme].freq, forme, self.formes[forme].lem, self.formes[forme].gram] for forme in self.formes]
         listformes.sort(reverse = True)
         listformes = [forme[1:] + [repr(forme[0])] for forme in listformes]
-        with open(fileout, 'w') as f :
+        with open(fileout, 'w', encoding='utf8') as f :
             f.write('\n'.join(['\t'.join(forme) for forme in listformes]))
 
     def export_lems(self, fileout, syscoding) :
         self.make_idformes()
         listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, repr(self.lems[lem].formes[forme])]) for forme in self.lems[lem].formes])] for lem in self.lems]
         listlem.sort()
-        with open(fileout, 'w') as f :
+        with open(fileout, 'w', encoding='utf8') as f :
             f.write('\n'.join(['\t'.join(lem) for lem in listlem]))
 
 
@@ -1505,7 +1510,7 @@ class BuildSubCorpus(BuildCorpus):
                 newuces = []
                 newpara = []
                 for et in uci.paras :
-                    keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
+                    keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeepand and uce.para == idpara]
                     idpara += 1
                     if keepuces != [] :
                         newuces += keepuces