2 # -*- coding: utf-8 -*-
3 #Author: Pierre Ratinaud
4 #Copyright (c) 2008-2013, Pierre Ratinaud
9 from dialog import ExtractDialog
10 from corpus import Corpus, copycorpus
14 parametres = {'filein' : 'corpus/lru2.txt',
17 'mods' : [u'*annee_2010', u'*annee_2011']}
20 if line.startswith(u'**** ') :
26 if line.startswith(u'-*') :
31 def testvar(line, variable) :
33 varmod = [val.split('_') for val in line[1:]]
34 vars = [var[0] for var in varmod]
36 return '_'.join([variable, varmod[vars.index(variable)][1]]).replace(u'*','')
40 def testmod(line, mods) :
44 return mod.replace(u'*','')
49 def __init__(self, parent, option) :
50 dial = ExtractDialog(parent, option)
52 res = dial.ShowModal()
54 parametres = dial.make_param()
55 if option == 'splitvar' :
56 SplitFromVar(parametres)
57 elif option == 'mods' :
58 ExtractMods(parametres)
59 elif option == 'them' :
60 SplitFromThem(parametres)
62 dial = wx.MessageDialog(parent, 'Done !', style = wx.OK)
69 def __init__(self, parametres) :
70 self.filein = parametres['filein']
71 self.var = parametres['var']
72 self.encodein = parametres['encodein']
73 self.encodeout = parametres['encodeout']
74 self.basepath = os.path.dirname(self.filein)
80 with codecs.open(self.filein, 'r', self.encodein) as fin :
83 varmod = testvar(line, self.var)
86 if varmod not in filedict :
87 filename = os.path.join(self.basepath, varmod + '.txt')
88 filedict[varmod] = open(filename, 'w')
89 fileout = filedict[varmod]
93 fileout.write(line.encode(self.encodeout))
98 def __init__(self, parametres) :
99 self.filein = parametres['filein']
100 self.them = parametres['them']
101 self.encodein = parametres['encodein']
102 self.encodeout = parametres['encodeout']
103 self.basepath = os.path.dirname(self.filein)
104 self.pathout = os.path.join(self.basepath, '_'.join([them.replace(u'-*','') for them in self.them]))
105 self.fileout = open(self.pathout, 'w')
113 with codecs.open(self.filein, 'r', self.encodein) as fin :
116 self.writetext(self.fileout, lastet, text)
120 l = line.strip().rstrip('\n\r')
127 self.writetext(self.fileout, lastet, text)
129 def writetext(self, fileout, lastet, text):
131 self.fileout.write(lastet.encode(self.encodeout) + text.encode(self.encodeout))
135 def __init__(self, parametres) :
136 self.onefile = parametres.get('onefile', False)
137 self.filein = parametres['filein']
138 self.mods = parametres['mods']
139 self.encodein = parametres['encodein']
140 self.encodeout = parametres['encodeout']
141 self.basepath = os.path.dirname(self.filein)
143 filename = os.path.join(self.basepath, '_'.join([mod.replace(u'*','') for mod in self.mods])+'.txt')
144 self.fileout = open(filename, 'w')
150 with codecs.open(self.filein, 'r', self.encodein) as fin :
153 modinline = testmod(line, self.mods)
156 if not self.onefile :
157 if modinline not in filedict :
158 filename = os.path.join(self.basepath, modinline + '.txt')
159 filedict[modinline] = open(filename, 'w')
160 fileout = filedict[modinline]
162 fileout = self.fileout
166 fileout.write(line.encode(self.encodeout))
167 if not self.onefile :
174 class SubCorpus(Corpus) :
175 def __init__(self, parent, corpus, sgts) :
176 Corpus.__init__(self, parent, corpus.parametres)
178 self.corpus = copycorpus(corpus)
179 self.corpus.make_lems(self.parametres['lem'])
180 textes = list(set([corpus.getucefromid(sgt).uci for sgt in sgts]))
181 self.ucis = [corpus.ucis[i] for i in textes]
182 for texte in self.ucis :
183 texte.uces = [uce for uce in texte.uces if uce.ident in self.sgts]
184 self.make_formes(corpus)
185 self.pathout = corpus.pathout
186 self.parametres['sub'] = self.sgts
188 def make_formes(self, corpus) :
190 for forme in self.corpus.formes :
191 sgtseff = self.corpus.getformeuceseff(forme)
192 sgts = set(self.sgts).intersection(sgtseff.keys())
194 self.formes[forme] = self.corpus.formes[forme]
195 self.formes[forme].freq = sum([sgtseff[sgt] for sgt in sgts])
def getlemuces(self, lem) :
    """Return the segments of this sub-corpus that are reported for *lem*.

    The result is the intersection of ``self.sgts`` with whatever
    ``self.corpus.getlemuces(lem)`` yields, returned as a list. Because it
    is built from a set, the order is unspecified and duplicates are
    dropped.
    """
    kept_segments = set(self.sgts)
    lem_segments = self.corpus.getlemuces(lem)
    return list(kept_segments.intersection(lem_segments))
200 def converttabletocorpus(table, fileout, enc='UTF8') :
202 var = var[0:len(var)-1]
204 et = [zip(var, line[0:len(line)-1]) for line in table]
205 et = ['**** ' + ' '.join(['*' + '_'.join(val) for val in line]) for line in et]
206 txt = ['\n'.join([et[i], line[-1]]) for i, line in enumerate(table)]
208 #with open(fileout, 'w') as f :
if __name__ == '__main__' :
    # Manual run against the module-level sample `parametres`.
    #SplitFromVar(parametres)
    # The constructor that handles 'mods' accepts a single `parametres`
    # argument and reads the flag with parametres.get('onefile', False),
    # so a second positional argument would raise TypeError.
    # NOTE(review): the old positional True is assumed to mean "write
    # everything into one file" -- confirm against ExtractMods.
    ExtractMods(dict(parametres, onefile=True))