1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2008-2020 Pierre Ratinaud
4 #modification pour python 3 : Laurent Mérat, 6x7 - mai 2020
7 #------------------------------------
8 # import des modules python
9 #------------------------------------
13 #------------------------------------
14 # import des modules wx
15 #------------------------------------
18 #------------------------------------
19 # import des fichiers du projet
20 #------------------------------------
21 from dialog import ExtractDialog
22 from corpus import Corpus, copycorpus
# Default parameters used when this module is run directly: the __main__ block at
# the bottom of the file passes this dict to ExtractMods.
# 'filein' is the path of the input corpus; 'mods' lists the modalities to extract.
# NOTE(review): the embedded line numbers jump from 25 to 28, so some entries of this
# dict are not visible here (the classes below also read 'encodein' and 'encodeout')
# -- confirm against the full file.
25 parametres = {'filein' : 'corpus/lru2.txt',
28 'mods' : ['*annee_2010', '*annee_2011']}
# NOTE(review): the `def` lines enclosing the two tests below are not visible in this
# listing, so the function names and return values cannot be documented from here.
# Tests whether `line` starts with '**** ' (the same prefix converttabletocorpus
# puts in front of each text's variables further down).
32 if line.startswith('**** ') :
# Tests whether `line` starts with '-*' (presumably a thematic marker, since
# SplitFromThem strips '-*' from its `them` values -- confirm).
38 if line.startswith('-*') :
# testvar(line, variable): find `variable` among the elements of `line` and return
# the string 'variable_modality' with every '*' removed.
# NOTE(review): embedded line numbers 44 and 47 are missing from this listing.
# `line` is indexed as a sequence of tokens below, so it is presumably split before
# line 45, and line 48 is presumably guarded by a test on `variable in vars` --
# confirm against the full file. As shown, vars.index() raises ValueError when
# `variable` is absent.
43 def testvar(line, variable) :
# Split every element after the first one on '_' (name part, modality part).
45 varmod = [val.split('_') for val in line[1:]]
# Keep only the first part of each element. Note: `vars` shadows the builtin vars().
46 vars = [var[0] for var in varmod]
# Take the second part of the element whose first part equals `variable`, join it to
# `variable` with '_' and drop the '*' characters.
48 return '_'.join([variable, varmod[vars.index(variable)][1]]).replace('*','')
# testmod(line, mods): the visible return statement yields a modality `mod` with its
# '*' characters removed.
# NOTE(review): embedded line numbers 53-55 are missing from this listing, so how
# `mod` is chosen from `mods` and what is returned when nothing matches cannot be
# documented from here.
52 def testmod(line, mods) :
56 return mod.replace('*','')
# Constructor: opens an ExtractDialog for `option`, reads the parameters chosen in
# the dialog and runs the matching extraction class, then prepares a 'Done !' message.
# `parent` is handed to both dialogs; `option` is one of 'splitvar', 'mods', 'them'.
# NOTE(review): the class header and embedded lines 64, 66 and 74 are not visible in
# this listing; `res` is presumably tested on the missing line 66 -- confirm.
62 def __init__(self, parent, option) :
63 dial = ExtractDialog(parent, option)
# Show the dialog modally and keep its return code.
65 res = dial.ShowModal()
# Build the parameter dict from the dialog.
67 parametres = dial.make_param()
# Dispatch on the requested operation; any other value of `option` runs nothing.
68 if option == 'splitvar' :
69 SplitFromVar(parametres)
70 elif option == 'mods' :
71 ExtractMods(parametres)
72 elif option == 'them' :
73 SplitFromThem(parametres)
# `dial` is rebound to a plain OK message box.
75 dial = wx.MessageDialog(parent, 'Done !', style = wx.OK)
# Constructor of the class that splits a corpus by variable (presumably SplitFromVar;
# the class header is not visible in this listing).
# Reads 'filein', 'var', 'encodein' and 'encodeout' from `parametres`; a missing key
# raises KeyError.
84 def __init__(self, parametres) :
85 self.filein = parametres['filein']
86 self.var = parametres['var']
87 self.encodein = parametres['encodein']
88 self.encodeout = parametres['encodeout']
# Output files are created in the directory of the input file.
89 self.basepath = os.path.dirname(self.filein)
# NOTE(review): embedded lines 90-94, 96-97, 99-100 and 105+ are missing; the lines
# below presumably belong to a separate parsing method whose `def` is not visible.
# Read the input corpus with the requested input encoding.
95 with codecs.open(self.filein, 'r', self.encodein) as fin :
# Value returned by testvar for this line and the chosen variable.
98 varmod = testvar(line, self.var)
# One output file per distinct value, opened the first time it is seen and kept in
# `filedict`.
101 if varmod not in filedict :
102 filename = os.path.join(self.basepath, varmod + '.txt')
# NOTE(review): opened without an explicit encoding; self.encodeout is not used in
# the visible lines -- confirm whether the output encoding is applied elsewhere.
103 filedict[varmod] = open(filename, 'w')
104 fileout = filedict[varmod]
# SplitFromThem: copies parts of the input corpus into a single output file, chosen
# according to the `them` values.
# NOTE(review): several embedded line numbers are missing inside this class (114,
# 123-129, 131-132, 134-136, 138-143, 145, 147, 149+), so the selection logic itself
# is not visible in this listing.
113 class SplitFromThem :
# Reads 'filein', 'them', 'encodein' and 'encodeout' from `parametres`; a missing key
# raises KeyError.
115 def __init__(self, parametres) :
116 self.filein = parametres['filein']
117 self.them = parametres['them']
118 self.encodein = parametres['encodein']
119 self.encodeout = parametres['encodeout']
120 self.basepath = os.path.dirname(self.filein)
# Output path: the `them` values with '-*' removed, joined by '_', next to the input.
# NOTE(review): no '.txt' extension is appended here, unlike the other output file
# names built in this module -- confirm this is intended.
121 self.pathout = os.path.join(self.basepath, '_'.join([them.replace('-*','') for them in self.them]))
# NOTE(review): opened without an explicit encoding; self.encodeout is not used in
# the visible lines.
122 self.fileout = open(self.pathout, 'w')
# Read the input corpus with the requested input encoding.
130 with codecs.open(self.filein, 'r', self.encodein) as fin :
133 self.writetext(self.fileout, lastet, text)
# strip() already removes surrounding whitespace, newlines included, so the
# rstrip('\n\r') has no further effect.
137 l = line.strip().rstrip('\n\r')
144 self.writetext(self.fileout, lastet, text)
# Writes `lastet` followed by `text` to the output file.
# NOTE(review): the `fileout` parameter is ignored in the visible line; self.fileout
# is written instead (the visible callers pass self.fileout, so the result is the same).
146 def writetext(self, fileout, lastet, text):
148 self.fileout.write(lastet + text)
# Constructor of the class that extracts texts by modality (presumably ExtractMods;
# the class header is not visible in this listing).
# Reads 'filein', 'mods', 'encodein' and 'encodeout' from `parametres` (KeyError if
# missing); 'onefile' is optional and defaults to False.
153 def __init__(self, parametres) :
154 self.onefile = parametres.get('onefile', False)
155 self.filein = parametres['filein']
156 self.mods = parametres['mods']
157 self.encodein = parametres['encodein']
158 self.encodeout = parametres['encodeout']
159 self.basepath = os.path.dirname(self.filein)
# Single output file named after all modalities ('*' removed, joined by '_').
# NOTE(review): embedded line 160 is missing; these two lines are presumably guarded
# by a test on self.onefile -- confirm.
161 filename = os.path.join(self.basepath, '_'.join([mod.replace('*','') for mod in self.mods])+'.txt')
# NOTE(review): opened without an explicit encoding; self.encodeout is not used in
# the visible lines.
162 self.fileout = open(filename, 'w')
# NOTE(review): embedded lines 163-167, 169-170, 172-173, 179, 181-184 and 186+ are
# missing; the lines below presumably belong to a separate parsing method.
168 with codecs.open(self.filein, 'r', self.encodein) as fin :
# Value returned by testmod for this line and the requested modalities.
171 modinline = testmod(line, self.mods)
# Without 'onefile': one output file per modality, opened the first time it is seen
# and kept in `filedict`.
174 if not self.onefile :
175 if modinline not in filedict :
176 filename = os.path.join(self.basepath, modinline + '.txt')
177 filedict[modinline] = open(filename, 'w')
178 fileout = filedict[modinline]
# Otherwise (presumably the else branch on the missing line 179) everything goes to
# the single shared output file.
180 fileout = self.fileout
185 if not self.onefile :
# SubCorpus: a Corpus restricted to the segments whose identifiers are in `sgts`.
# NOTE(review): embedded lines 193, 196, 206, 208, 212 and 215 are missing from this
# listing. self.sgts and self.formes are used below but never assigned in the visible
# lines (presumably `self.sgts = sgts` on line 196 and an initialisation of
# self.formes on line 208) -- confirm against the full file.
192 class SubCorpus(Corpus) :
# `parent` and the parameters of `corpus` are forwarded to Corpus.__init__; `sgts` is
# the collection of segment identifiers to keep.
194 def __init__(self, parent, corpus, sgts) :
195 Corpus.__init__(self, parent, corpus.parametres)
# Work on a copy of the source corpus and build its lemmas from the 'lem' parameter.
197 self.corpus = copycorpus(corpus)
198 self.corpus.make_lems(self.parametres['lem'])
# Distinct `uci` values of the selected segments (order not preserved by set()).
199 textes = list(set([corpus.getucefromid(sgt).uci for sgt in sgts]))
200 self.ucis = [corpus.ucis[i] for i in textes]
# Keep only the selected segments in each retained text.
# NOTE(review): the texts come from the `corpus` argument, not from self.corpus, so
# rebinding texte.uces presumably also changes the caller's corpus -- confirm.
201 for texte in self.ucis :
202 texte.uces = [uce for uce in texte.uces if uce.ident in self.sgts]
203 self.make_formes(corpus)
204 self.pathout = corpus.pathout
# Record the selected segments in the parameters under 'sub'.
205 self.parametres['sub'] = self.sgts
# Fills self.formes from the copied corpus, with frequencies recomputed over the
# selected segments. The `corpus` argument is not used in the visible lines.
207 def make_formes(self, corpus) :
209 for forme in self.corpus.formes :
# Per-segment counts of this form, as returned by getformeuceseff.
210 sgtseff = self.corpus.getformeuceseff(forme)
# Selected segments in which the form occurs.
211 sgts = set(self.sgts).intersection(list(sgtseff.keys()))
# NOTE(review): the two lines below are presumably guarded by a test on `sgts` on the
# missing line 212. The same object is stored in both dicts, so setting `freq` also
# changes the entry held by self.corpus.formes.
213 self.formes[forme] = self.corpus.formes[forme]
214 self.formes[forme].freq = sum([sgtseff[sgt] for sgt in sgts])
# Returns, as a list, the selected segments that are also returned by the copied
# corpus' getlemuces(lem).
216 def getlemuces(self, lem) :
217 return list(set(self.sgts).intersection(self.corpus.getlemuces(lem)))
# converttabletocorpus(table, fileout, enc='UTF8'): turns a table (a sequence of rows)
# into corpus text. Each row becomes a '**** *var_value ...' header line built from
# all its cells but the last, followed by the last cell as the text itself.
# NOTE(review): embedded lines 221 and 223 are missing; `var` is used before any
# visible assignment (presumably taken from the first row of `table` on line 221 --
# confirm). `fileout` and `enc` are not used in the visible lines: the result is
# printed, and the file-writing code is commented out.
220 def converttabletocorpus(table, fileout, enc='UTF8') :
# Drop the last entry of `var` (the last column holds the text, not a variable).
222 var = var[0:len(var)-1]
# Pair each variable name with the matching cell of every row, last cell excluded.
224 et = [list(zip(var, line[0:len(line)-1])) for line in table]
# Build one header per row: '**** ' followed by space-separated '*name_value' items.
225 et = ['**** ' + ' '.join(['*' + '_'.join(val) for val in line]) for line in et]
# Header, newline, then the last cell of the row.
226 txt = ['\n'.join([et[i], line[-1]]) for i, line in enumerate(table)]
227 print('\n'.join(txt))
228 #with open(fileout, 'w') as f :
231 # execution directe ???
# Direct execution: run the modality extraction on the module-level `parametres`.
# NOTE(review): ExtractMods is called with two arguments here, while the only visible
# constructor that reads 'mods' takes a single `parametres` argument (and reads the
# optional 'onefile' key from it); if that is ExtractMods.__init__, this call raises
# TypeError -- confirm, and pass 'onefile' through `parametres` instead if so.
232 if __name__ == '__main__' :
233 #SplitFromVar(parametres)
234 ExtractMods(parametres, True)