1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2010, Pierre Ratinaud
# Hard-coded absolute path of the input text file (specific to one machine).
8 filein = '/home/pierre/fac/cablegate/allcables-all.txt'
# Open the input for reading through codecs so its content is decoded using `enc`.
# NOTE(review): `enc` is not defined in the lines visible here - presumably
# assigned just above; confirm. No matching close() is visible either.
11 infile = codecs.open(filein, 'r', enc)
# Constructor (fragment: only part of the original body is present here).
# `parent` is accepted as an argument; how it is used is not visible in these lines.
15 def __init__(self, parent) :
# Settings dictionary, seeded with the interpreter's default string encoding.
# NOTE(review): the 'filename' and 'encodage' keys read by open_corpus() are
# not set in the visible lines - presumably filled in elsewhere; confirm.
17 self.parametre = {'syscoding': sys.getdefaultencoding()}
# Starts out as None; nothing visible here populates it.
25 self.ucis_paras_uces = None
30 #self.supplementaires = []
def open_corpus(self):
    """Open the corpus file for reading and return the file object.

    The path is read from ``self.parametre['filename']`` and the text
    encoding from ``self.parametre['encodage']``.  The file is opened with
    ``codecs.open`` in mode ``"r"``, so reading or iterating the returned
    object yields decoded text.

    The handle is returned still open; the caller is responsible for
    closing it.

    Raises:
        KeyError: if ``'filename'`` or ``'encodage'`` is missing from
            ``self.parametre``.
        IOError / OSError: if the file cannot be opened.
        LookupError: if the encoding name is unknown.
    """
    path = self.parametre['filename']
    encoding = self.parametre['encodage']
    return codecs.open(path, "r", encoding)
38 def buildcorpus(self) :
40 ucifile = os.path.join(os.path.basedir(self.parametre['filename']), 'ucis.txt')
41 uci = open(ucifile, 'w')
43 for line in self.open_corpus() :
44 if line.startswith(u'****') and i==0 :
47 elif line.startswith(u'****') and i=!0 :
56 addlinetouci(uci, prepare(line))
57 line = line.lower().replace(u'\'','\' ').replace(u'’','\' ').replace('...',u' £ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':', ' : ').strip()
58 line = line.replace('\n', ' ').replace('\r', ' ')
60 content[-1].append(line)