1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2011, Pierre Ratinaud
# NOTE(review): sys.setdefaultencoding() exists only on Python 2, and only
# after reload(sys); `sys` and `locale` must already be imported -- confirm.
sys.setdefaultencoding(locale.getpreferredencoding())
# Path of the corpus file to process.
corpus_file = '/home/pierre/fac/lerass/debat/debat_ppoira.txt'
# encoding: cp1252 on Windows
# NOTE(review): `enc` is not bound in the visible part of the file; it has to
# name the encoding of the corpus before this point -- confirm.
with codecs.open(corpus_file, 'r', enc) as f:
    # Read the whole corpus and keep it as a list of lines: the rest of the
    # script indexes into `content` line by line.
    content = f.read().splitlines()
def make_ucis(content):
    """Locate the header lines (those starting with ``****``) in *content*.

    content -- list of text lines.

    Returns a pair ``(ucis, positions)``.  ``ucis`` holds one
    ``[tokens, index]`` entry per header line, where ``tokens`` is the
    stripped line split on whitespace and ``index`` is its position in
    *content*.  ``positions`` is the list of those indexes, in order.
    """
    ucis = [[line.strip().split(), index]
            for index, line in enumerate(content)
            if line.startswith(u'****')]
    return ucis, [index for _, index in ucis]
def make_lines(content, ucinb):
    """Return the slice bounds of the text that follows each header line.

    content -- list of text lines.
    ucinb   -- positions of the header lines in *content*, in increasing
               order.

    Returns one ``[start, end]`` pair per header: ``start`` is the line right
    after the header and ``end`` is the position of the next header (or
    ``len(content)`` for the last one), so ``content[start:end]`` is the text
    belonging to that header.  Returns an empty list when there is no header.
    """
    if not ucinb:
        # Without this guard the "last header" lookup fails on an empty list.
        return []
    # Each block stops where the next header starts; the last one runs to the
    # end of the content.
    ends = list(ucinb[1:]) + [len(content)]
    return [[start + 1, end] for start, end in zip(ucinb, ends)]
def make_ucis_txt(content, lines):
    """Join the lines of each block into a single space-separated string.

    content -- list of text lines.
    lines   -- list of ``[start, end]`` slice bounds into *content*.

    Returns one string per entry of *lines*; an empty slice gives an empty
    string.
    """
    return [' '.join(content[bounds[0]:bounds[1]]) for bounds in lines]
def make_etoile(ucis):
    """Return, for each header, its tokens without the leading one.

    ucis -- list of ``[tokens, index]`` entries; only ``tokens`` is read.

    The first token of every entry is dropped and the remaining tokens are
    returned as one list per entry, in the same order as *ucis*.
    """
    return [uci[0][1:] for uci in ucis]
def make_unique_etoiles(etoiles):
    """Return the distinct labels found across all the label lists.

    etoiles -- list of label lists.

    Returns a list holding each label once; its order is unspecified because
    the labels go through a set.
    """
    return list(set(etoile for uci in etoiles for etoile in uci))
def treat_var_mod(variables):
    """Group labels of the form ``<variable>_<modality>`` by variable.

    variables -- iterable of label strings.

    Returns a dict mapping the part of a label before its first underscore
    to the list of full labels sharing that prefix, in order of first
    appearance and without duplicates.  Labels that contain no underscore
    have no modality part and are skipped.
    """
    var_mod = {}
    for variable in variables:
        forme = variable.split(u'_')
        if len(forme) < 2:
            # No underscore: nothing to use as a modality.
            continue
        known = var_mod.setdefault(forme[0], [])
        # The lists hold full labels, so membership is tested on the full
        # label rather than on the modality alone.
        if variable not in known:
            known.append(variable)
    return var_mod
def extract_uci(variable, var_mod, ucis_txt, etoiles, enc=None):
    """Write one text file per label listed for *variable*.

    variable -- key into *var_mod*.
    var_mod  -- dict mapping a variable to its list of labels.
    ucis_txt -- list of unit texts.
    etoiles  -- list of label lists, parallel to *ucis_txt*.
    enc      -- encoding of the output files; None lets io.open use
                locale.getpreferredencoding().

    For every label, the units whose label list contains it are written to
    ``<label without its first character>.txt`` in the current directory.
    Each unit is preceded by a ``****`` line repeating its labels, and units
    are separated by a blank line.  Raises KeyError when *variable* is not a
    key of *var_mod*.
    """
    # Imported here so the function does not depend on the file's import block.
    import io

    for et in var_mod[variable]:
        tojoin = [u'\n'.join([u' '.join([u'****'] + labels), uci])
                  for uci, labels in zip(ucis_txt, etoiles)
                  if et in labels]
        # io.open encodes explicitly, so writing unicode text does not rely
        # on the interpreter-wide default encoding.
        with io.open(et[1:] + u'.txt', 'w', encoding=enc) as f:
            f.write(u'\n\n'.join(tojoin))
# Header lines of the corpus and their positions in `content`.
ucis, ucisnb = make_ucis(content)
# Text of every unit: slice bounds first, then the joined lines.
lines = make_lines(content, ucisnb)
ucis_txt = make_ucis_txt(content, lines)
# Labels carried by each header, then the variable -> labels grouping.
etoiles = make_etoile(ucis)
uetoiles = make_unique_etoiles(etoiles)
var_mod = treat_var_mod(uetoiles)
# NOTE(review): `variable` is not bound in the visible part of the file --
# confirm it is set before this point.
extract_uci(variable, var_mod, ucis_txt, etoiles)