#!/usr/bin/env python
# -*- coding: utf-8 -*-
#Author: Pierre Ratinaud
#Copyright (c) 2008-2013, Pierre Ratinaud
#License: GNU GPL
#
# Reconstructed from the git blobdiff of tools.py (new file, blob 3b1b0d9)
# in the iramuteq repository.
"""Corpus extraction helpers.

This module provides:

* ``istext`` / ``testvar`` / ``testmod``: small predicates working on the
  ``**** `` header lines of a corpus file;
* ``SplitFromVar``: split a corpus file into one file per modality of a
  variable;
* ``ExtractMods``: extract the texts carrying some modalities, into one file
  per modality or into a single file;
* ``Extract``: dialog-driven front end for the two classes above;
* ``SubCorpus``: a ``Corpus`` restricted to a set of segments.
"""

import codecs
import io
import os

import wx

from dialog import ExtractDialog
from corpus import Corpus, copycorpus


# Sample parameters used when the module is run as a script.
parametres = {'filein' : 'corpus/lru2.txt',
              'encodein' : 'utf8',
              'encodeout' : 'utf8',
              'mods' : [u'*annee_2010', u'*annee_2011']}


def _open_output(filename, encoding):
    """Open *filename* for writing text encoded with *encoding*.

    ``io.open`` takes unicode strings and encodes them itself, so the same
    code runs on Python 2.6+ and Python 3 (writing pre-encoded bytes to a
    text-mode file raises ``TypeError`` on Python 3).
    """
    return io.open(filename, 'w', encoding=encoding)


def istext(line):
    """Return True when *line* is a text header, i.e. starts with '**** '."""
    return line.startswith(u'**** ')


def testvar(line, variable):
    """Look for *variable* in the header *line*.

    The first whitespace-separated token of the line is skipped; each
    remaining token is split on '_' and its first part is compared with
    *variable*.

    Returns ``variable + '_' + modality`` with every '*' removed for the
    first matching token, or ``False`` when no token matches. Tokens that
    equal *variable* but carry no '_modality' part are ignored instead of
    raising ``IndexError``.
    """
    for token in line.split()[1:]:
        parts = token.split('_')
        if parts[0] == variable and len(parts) > 1:
            return '_'.join([variable, parts[1]]).replace(u'*', '')
    return False


def testmod(line, mods):
    """Return the first entry of *mods* found in the header *line*.

    The first token of the line is skipped. Priority follows the order of
    *mods*, not the order of the tokens in the line. The returned value has
    every '*' removed; ``False`` is returned when nothing matches.
    """
    tokens = line.split()[1:]
    for mod in mods:
        if mod in tokens:
            return mod.replace(u'*', '')
    return False


class Extract:
    """Ask the user for extraction parameters and run the extraction.

    All the work happens in the constructor: an ``ExtractDialog`` is shown
    modally and, when it is validated, ``SplitFromVar`` (option
    ``'splitvar'``) or ``ExtractMods`` (any other option) is run with the
    parameters built by the dialog.
    """

    def __init__(self, parent, option):
        dial = ExtractDialog(parent, option)
        try:
            dial.CenterOnParent()
            if dial.ShowModal() == wx.ID_OK:
                params = dial.make_param()
                if option == 'splitvar':
                    SplitFromVar(params)
                else:
                    ExtractMods(params)
        finally:
            # wx dialogs are not released automatically after ShowModal().
            # NOTE(review): assumes ExtractDialog derives from wx.Dialog.
            dial.Destroy()


class SplitFromVar:
    """Split a corpus file into one output file per modality of a variable.

    *parametres* must provide ``'filein'`` (input path), ``'var'`` (variable
    searched in the header lines), ``'encodein'`` and ``'encodeout'``.
    Output files are named ``<var>_<modality>.txt`` ('*' removed) and are
    written next to the input file. The split runs from the constructor.
    """

    def __init__(self, parametres):
        self.filein = parametres['filein']
        self.var = parametres['var']
        self.encodein = parametres['encodein']
        self.encodeout = parametres['encodeout']
        self.basepath = os.path.dirname(self.filein)
        self.doparse()

    def doparse(self):
        """Copy every text whose header carries the variable to its file."""
        keepline = False
        fileout = None
        filedict = {}
        try:
            with codecs.open(self.filein, 'r', self.encodein) as fin:
                for line in fin:
                    if istext(line):
                        # A header decides the fate of all following lines
                        # up to the next header.
                        varmod = testvar(line, self.var)
                        keepline = bool(varmod)
                        if keepline:
                            if varmod not in filedict:
                                filename = os.path.join(self.basepath,
                                                        varmod + '.txt')
                                filedict[varmod] = _open_output(
                                    filename, self.encodeout)
                            fileout = filedict[varmod]
                    if keepline:
                        fileout.write(line)
        finally:
            # Close every output even when reading or writing failed.
            for handle in filedict.values():
                handle.close()


class ExtractMods:
    """Extract the texts whose header carries one of the given modalities.

    *parametres* must provide ``'filein'``, ``'mods'`` (list of modalities),
    ``'encodein'`` and ``'encodeout'``; the optional ``'onefile'`` entry
    (default ``False``) selects a single output file named after all the
    modalities joined with '_' instead of one file per modality. Output
    files are written next to the input file. The extraction runs from the
    constructor.
    """

    def __init__(self, parametres):
        self.onefile = parametres.get('onefile', False)
        self.filein = parametres['filein']
        self.mods = parametres['mods']
        self.encodein = parametres['encodein']
        self.encodeout = parametres['encodeout']
        self.basepath = os.path.dirname(self.filein)
        if self.onefile:
            basename = '_'.join([mod.replace(u'*', '') for mod in self.mods])
            filename = os.path.join(self.basepath, basename + '.txt')
            self.fileout = _open_output(filename, self.encodeout)
        self.doparse()

    def doparse(self):
        """Copy every text carrying one of the modalities to its output."""
        keepline = False
        fileout = None
        filedict = {}
        try:
            with codecs.open(self.filein, 'r', self.encodein) as fin:
                for line in fin:
                    if istext(line):
                        # A header decides the fate of all following lines
                        # up to the next header.
                        modinline = testmod(line, self.mods)
                        keepline = bool(modinline)
                        if keepline:
                            if self.onefile:
                                fileout = self.fileout
                            else:
                                if modinline not in filedict:
                                    filename = os.path.join(
                                        self.basepath, modinline + '.txt')
                                    filedict[modinline] = _open_output(
                                        filename, self.encodeout)
                                fileout = filedict[modinline]
                    if keepline:
                        fileout.write(line)
        finally:
            # Close every output even when reading or writing failed.
            for handle in filedict.values():
                handle.close()
            if self.onefile:
                self.fileout.close()


class SubCorpus(Corpus):
    """Corpus restricted to the segments listed in *sgts*.

    NOTE(review): ``self.ucis`` holds objects taken from the *corpus*
    argument (not from the copy) and their ``uces`` lists are filtered in
    place, so the parent corpus looks modified as well -- confirm this is
    intended.
    """

    def __init__(self, parent, corpus, sgts):
        Corpus.__init__(self, parent, corpus.parametres)
        self.sgts = sgts
        self.corpus = copycorpus(corpus)
        self.corpus.make_lems(self.parametres['lem'])
        # Identifiers of the texts owning at least one selected segment.
        textes = list(set([corpus.getucefromid(sgt).uci for sgt in sgts]))
        self.ucis = [corpus.ucis[i] for i in textes]
        kept = set(self.sgts)
        for texte in self.ucis:
            texte.uces = [uce for uce in texte.uces if uce.ident in kept]
        self.make_formes(corpus)
        self.pathout = corpus.pathout
        # NOTE(review): presumably self.parametres is the dict passed to
        # Corpus.__init__, in which case this also updates corpus.parametres.
        self.parametres['sub'] = self.sgts

    def make_formes(self, corpus):
        """Build ``self.formes`` with the forms present in the selection.

        For each form of the copied corpus found in at least one selected
        segment, the form object is kept and its ``freq`` is set to the sum
        of its counts over the selected segments. The *corpus* argument is
        not used; it is kept for interface compatibility.
        """
        self.formes = {}
        selected = set(self.sgts)
        for forme in self.corpus.formes:
            sgtseff = self.corpus.getformeuceseff(forme)
            sgts = selected.intersection(sgtseff.keys())
            if sgts:
                self.formes[forme] = self.corpus.formes[forme]
                self.formes[forme].freq = sum([sgtseff[sgt] for sgt in sgts])

    def getlemuces(self, lem):
        """Return the selected segments in which lemma *lem* occurs."""
        return list(set(self.sgts).intersection(self.corpus.getlemuces(lem)))


if __name__ == '__main__':
    # SplitFromVar(parametres) would additionally need a 'var' entry.
    #SplitFromVar(parametres)
    # ExtractMods takes a single parameter dict; the former second positional
    # argument (True) raised TypeError. It presumably meant "one output
    # file", which is requested through the 'onefile' entry.
    ExtractMods(dict(parametres, onefile=True))