corrections

[iramuteq] / corpus.py
diff --git a/corpus.py b/corpus.py

index 7ab0ebb..555a034 100644 (file)
--- a/corpus.py
+++ b/corpus.py
@@ -20,6 +20,9 @@ from operator import itemgetter
  from uuid import uuid4
  import datetime
  from copy import copy
  from uuid import uuid4
  import datetime
  from copy import copy
+#------test spacy------------
+#import spacy
+#nlp = spacy.load("fr_core_news_lg")
  
  #------------------------------------
  # import des fichiers du projet
  
  #------------------------------------
  # import des fichiers du projet
@@ -141,7 +144,7 @@ class Corpus :
  
      def read_corpus(self) :
          log.info('read corpus')
  
      def read_corpus(self) :
          log.info('read corpus')
-        self.parametres['syscoding'] = sys.getdefaultencoding()
+        self.parametres['syscoding'] = 'utf8'
          if self.conncorpus is None :
              self.conn_all()
          res = self.ccorpus.execute('SELECT * FROM etoiles;')
          if self.conncorpus is None :
              self.conn_all()
          res = self.ccorpus.execute('SELECT * FROM etoiles;')
@@ -255,14 +258,14 @@ class Corpus :
  
      def getucisize(self) :
          ucesize = self.getucesize()
  
      def getucisize(self) :
          ucesize = self.getucesize()
-        return [sum(ucesize[uci.uces[0].ident:(uci.uces[-1].ident + 1)]) for uci in self.ucis]
+        return [sum(ucesize[uci.uces[0].ident:(uci.uces[-1].ident + 1)]) for uci in self.ucis if len(uci.uces) != 0]
  
      def getucesize(self) :
          res = self.getalluces()
          return [len(uce[1].split()) for uce in res]
  
      def getconcorde(self, uces) :
  
      def getucesize(self) :
          res = self.getalluces()
          return [len(uce[1].split()) for uce in res]
  
      def getconcorde(self, uces) :
-        return self.cuces.execute('select * from uces where id IN (%s) ORDER BY id;' % ', '.join([repr(i) for i in uces])) 
+        return self.cuces.execute('select * from uces where id IN (%s) ORDER BY id;' % ', '.join([repr(i) for i in uces]))
  
      def getuciconcorde(self, ucis) :
          uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis]
  
      def getuciconcorde(self, ucis) :
          uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis]
@@ -290,6 +293,11 @@ class Corpus :
      def getucesfrometoile(self, etoile) :
          return [uce.ident for uci in self.ucis for uce in uci.uces if etoile in uci.etoiles]
  
      def getucesfrometoile(self, etoile) :
          return [uce.ident for uci in self.ucis for uce in uci.uces if etoile in uci.etoiles]
  
+    def getucisfrometoile(self, etoile):
+        uces = [uce.ident for uci in self.ucis for uce in uci.uces if etoile in uci.etoiles]
+        return list(set([self.getucefromid(val).uci for val in uces]))
+
+
      def getetoileuces(self) :
          log.info('get uces etoiles')
          etoileuces = {}
      def getetoileuces(self) :
          log.info('get uces etoiles')
          etoileuces = {}
@@ -563,7 +571,7 @@ class Corpus :
          self.make_iduces()
          actuci = ''
          actpara = False
          self.make_iduces()
          actuci = ''
          actpara = False
-        with open(outf,'w') as f :
+        with open(outf,'w', encoding='utf8') as f :
              for uce in res :
                  if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
                      f.write(uce[1] + '\n')
              for uce in res :
                  if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
                      f.write(uce[1] + '\n')
@@ -586,7 +594,7 @@ class Corpus :
          longueur_max = max([len(val) for val in metas])
          first = ['column_%i' % i for i in range(longueur_max)]
          metas.insert(0, first)
          longueur_max = max([len(val) for val in metas])
          first = ['column_%i' % i for i in range(longueur_max)]
          metas.insert(0, first)
-        with open(outf, 'w') as f :
+        with open(outf, 'w', encoding='utf8') as f :
              f.write('\n'.join(['\t'.join(line) for line in metas]))
  
      def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) :
              f.write('\n'.join(['\t'.join(line) for line in metas]))
  
      def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) :
@@ -601,7 +609,7 @@ class Corpus :
              self.make_iduces()
          else :
              res = self.getallucis()
              self.make_iduces()
          else :
              res = self.getallucis()
-        with open(outf, 'w') as f :
+        with open(outf, 'w', encoding='utf8') as f :
              for uce in res :
                  guce = uce[1]
                  if not uci :
              for uce in res :
                  guce = uce[1]
                  if not uci :
@@ -624,7 +632,7 @@ class Corpus :
              self.make_iduces()
          else :
              res = self.getuciconcorde(sts)
              self.make_iduces()
          else :
              res = self.getuciconcorde(sts)
-        with open(outf, 'w') as f :
+        with open(outf, 'w', encoding='utf8') as f :
              for uce in res :
                  guce = uce[1]
                  if not uci :
              for uce in res :
                  guce = uce[1]
                  if not uci :
@@ -649,7 +657,7 @@ class Corpus :
              outf = os.path.join(rep, outf)
              if lem :
                  guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
              outf = os.path.join(rep, outf)
              if lem :
                  guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
-            with open(outf, 'w') as f :
+            with open(outf, 'w', encoding='utf8') as f :
                  f.write(guce) #.encode('cp1252', errors = 'replace'))
  
      def export_tropes(self, fileout, classe, lem = False, uci = False) :
                  f.write(guce) #.encode('cp1252', errors = 'replace'))
  
      def export_tropes(self, fileout, classe, lem = False, uci = False) :
@@ -659,7 +667,7 @@ class Corpus :
              self.make_iduces()
          else :
              res = self.getuciconcorde(sts)
              self.make_iduces()
          else :
              res = self.getuciconcorde(sts)
-        with open(fileout, 'w') as f :
+        with open(fileout, 'w', encoding='utf8') as f :
              for uce in res :
                  guce = uce[1]
                  if lem :
              for uce in res :
                  guce = uce[1]
                  if lem :
@@ -800,7 +808,7 @@ class Corpus :
          else :
              tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
          tab = [[line[0]] + [repr(val) for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
          else :
              tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
          tab = [[line[0]] + [repr(val) for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
-        with open(fileout, 'w') as f :
+        with open(fileout, 'w', encoding='utf8') as f :
              f.write('\n'.join([';'.join(line) for line in tab]))
  
      def make_etoiles(self) :
              f.write('\n'.join([';'.join(line) for line in tab]))
  
      def make_etoiles(self) :
@@ -875,8 +883,8 @@ class Corpus :
              etoileuces = self.getetoileuces()
          else :
              etoileuces = self.getetoileucis()
              etoileuces = self.getetoileuces()
          else :
              etoileuces = self.getetoileucis()
-        etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1])
-        with open(fileout, 'w') as f :
+        etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 0])
+        with open(fileout, 'w', encoding='utf8') as f :
              f.write('\n'.join([';'.join([et] + [repr(len(set(etoileuces[et]).intersection(classe))) for classe in ucecl]) for et in etoileuces])) #.encode(self.parametres['syscoding'])
          #etoiles = self.make_etoiles()
          #with open(fileout, 'w') as f :
              f.write('\n'.join([';'.join([et] + [repr(len(set(etoileuces[et]).intersection(classe))) for classe in ucecl]) for et in etoileuces])) #.encode(self.parametres['syscoding'])
          #etoiles = self.make_etoiles()
          #with open(fileout, 'w') as f :
@@ -891,9 +899,9 @@ class Corpus :
              ucecl[uce] = 0
          color = ['black'] + colors[len(self.lc) - 1]
          txt = '''<html>
              ucecl[uce] = 0
          color = ['black'] + colors[len(self.lc) - 1]
          txt = '''<html>
-        <meta http-equiv="content-Type" content="text/html; charset=%s" />
+        <meta http-equiv="content-Type" content="text/html; charset=utf8" />
          <body>
          <body>
-''' % sys.getdefaultencoding()
+'''
          if not uci :
              res = self.getalluces()
              self.make_iduces()
          if not uci :
              res = self.getalluces()
              self.make_iduces()
@@ -999,7 +1007,7 @@ class Corpus :
                  for taille_segment in range(lenmin,lenmax) :
                      d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
          result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
                  for taille_segment in range(lenmin,lenmax) :
                      d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
          result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
-        with open(fileout, 'w') as f :
+        with open(fileout, 'w', encoding='utf8') as f :
              f.write('\n'.join([';'.join(line) for line in result]))
  
      def make_proftype(self, outf) :
              f.write('\n'.join([';'.join(line) for line in result]))
  
      def make_proftype(self, outf) :
@@ -1014,7 +1022,7 @@ class Corpus :
                  res[gram][i] += sum([lemuceeff[uce] for uce in concern])
          res = [[gram] + [repr(val) for val in res[gram]] for gram in res]
          res.sort()
                  res[gram][i] += sum([lemuceeff[uce] for uce in concern])
          res = [[gram] + [repr(val) for val in res[gram]] for gram in res]
          res.sort()
-        with open(outf, 'w') as f :
+        with open(outf, 'w', encoding='utf8') as f :
              f.write('\n'.join([';'.join(line) for line in res]))
  
      def make_ucecl_from_R(self, filein) :
              f.write('\n'.join([';'.join(line) for line in res]))
  
      def make_ucecl_from_R(self, filein) :
@@ -1054,7 +1062,7 @@ class Corpus :
          log.info('%f' % (time() - t1))
          if outf is not None :
              toprint = '\n'.join([';'.join([repr(i), repr(occurrences[i]), repr(formescl[i]), repr(hapaxcl[i]), repr(lenclasses[i]), repr(float(hapaxcl[i])/float(formescl[i]))]) for i in occurrences])
          log.info('%f' % (time() - t1))
          if outf is not None :
              toprint = '\n'.join([';'.join([repr(i), repr(occurrences[i]), repr(formescl[i]), repr(hapaxcl[i]), repr(lenclasses[i]), repr(float(hapaxcl[i])/float(formescl[i]))]) for i in occurrences])
-            with open(outf, 'w') as f :
+            with open(outf, 'w', encoding='utf8') as f :
                  f.write(toprint)
          else :
              return [[repr(occurrences[i]), repr(formescl[i]), repr(hapaxcl[i]), repr(lenclasses[i]), repr(float(hapaxcl[i])/float(formescl[i]))] for i in occurrences]
                  f.write(toprint)
          else :
              return [[repr(occurrences[i]), repr(formescl[i]), repr(hapaxcl[i]), repr(lenclasses[i]), repr(float(hapaxcl[i])/float(formescl[i]))] for i in occurrences]
@@ -1118,21 +1126,21 @@ class Corpus :
          txt += """
          </body></html>
          """
          txt += """
          </body></html>
          """
-        with open('/tmp/testhapxuce.html','w') as f :
+        with open('/tmp/testhapxuce.html','w', encoding='utf8') as f :
              f.write(txt)
  
      def export_dictionary(self, fileout, syscoding) :
          listformes = [[self.formes[forme].freq, forme, self.formes[forme].lem, self.formes[forme].gram] for forme in self.formes]
          listformes.sort(reverse = True)
          listformes = [forme[1:] + [repr(forme[0])] for forme in listformes]
              f.write(txt)
  
      def export_dictionary(self, fileout, syscoding) :
          listformes = [[self.formes[forme].freq, forme, self.formes[forme].lem, self.formes[forme].gram] for forme in self.formes]
          listformes.sort(reverse = True)
          listformes = [forme[1:] + [repr(forme[0])] for forme in listformes]
-        with open(fileout, 'w') as f :
+        with open(fileout, 'w', encoding='utf8') as f :
              f.write('\n'.join(['\t'.join(forme) for forme in listformes]))
  
      def export_lems(self, fileout, syscoding) :
          self.make_idformes()
          listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, repr(self.lems[lem].formes[forme])]) for forme in self.lems[lem].formes])] for lem in self.lems]
          listlem.sort()
              f.write('\n'.join(['\t'.join(forme) for forme in listformes]))
  
      def export_lems(self, fileout, syscoding) :
          self.make_idformes()
          listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, repr(self.lems[lem].formes[forme])]) for forme in self.lems[lem].formes])] for lem in self.lems]
          listlem.sort()
-        with open(fileout, 'w') as f :
+        with open(fileout, 'w', encoding='utf8') as f :
              f.write('\n'.join(['\t'.join(lem) for lem in listlem]))
  
  
              f.write('\n'.join(['\t'.join(lem) for lem in listlem]))
  
  
@@ -1502,7 +1510,7 @@ class BuildSubCorpus(BuildCorpus):
                  newuces = []
                  newpara = []
                  for et in uci.paras :
                  newuces = []
                  newpara = []
                  for et in uci.paras :
-                    keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
+                    keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep and uce.para == idpara]
                      idpara += 1
                      if keepuces != [] :
                          newuces += keepuces
                      idpara += 1
                      if keepuces != [] :
                          newuces += keepuces
@@ -1546,6 +1554,7 @@ class BuildSubCorpus(BuildCorpus):
  class BuildFromAlceste(BuildCorpus) :
  
      def read_corpus(self, infile) :
  class BuildFromAlceste(BuildCorpus) :
  
      def read_corpus(self, infile) :
+
          if self.dlg is not None :
              self.dlg.Pulse('textes : 0 - segments : 0')
          self.limitshow = 0
          if self.dlg is not None :
              self.dlg.Pulse('textes : 0 - segments : 0')
          self.limitshow = 0
@@ -1565,6 +1574,8 @@ class BuildFromAlceste(BuildCorpus) :
                      if self.testuci(line) :
                          iduci += 1
                          if txt != [] :
                      if self.testuci(line) :
                          iduci += 1
                          if txt != [] :
+                            #doc = nlp(' '.join(txt))
+                            #print([[word, word.pos_, word.lemma_] for word in doc])
                              iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
                              txt = []
                              self.corpus.ucis.append(Uci(iduci, line))
                              iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
                              txt = []
                              self.corpus.ucis.append(Uci(iduci, line))