2 # -*- coding: utf-8 -*-
3 #Author: Pierre Ratinaud
4 #Copyright (c) 2012-2013 Pierre Ratinaud
12 #txtdir = 'dev/factiva_txt'
13 #fileout = 'dev/factiva_txt_out.txt'
15 #encodage_out = 'utf8'
17 mois = {u'janvier' : '01',
def parsetxtpaste(txt):
    """
    Parse Factiva articles out of text copied from the article viewer.

    (Thanks to Lucie Loubère for the copy/paste trick.)

    txt -- the whole pasted text, as one unicode string.

    Returns a list with one ``[tags, text]`` pair per article:
      * ``tags`` is a list that starts with ``u'****'`` and is followed by
        the ``*source_...``, ``*am_...``, ``*amj_...`` and ``*annee_...``
        variables found in the article's ``SN`` and ``PD`` fields;
      * ``text`` is the body: every kept line is appended after a ``'\\n'``
        (so a non-empty text starts with a newline) and ``*`` characters
        are replaced by spaces so they cannot be mistaken for variables.

    Lines met before the first article header are ignored, and a ``PD``
    line that cannot be parsed simply yields no date tags.
    """
    # NOTE(review): the reviewed excerpt did not show the lines that
    # initialise/toggle `keepline`, open the loop and return `ucis`; they
    # are restored here from how the visible lines use those names --
    # confirm against the complete file.
    end_markers = ['NS', 'RE', 'IPD', 'CO', 'IN']  # field codes that end the body
    text_markers = ['LP', 'TD']  # field codes that start (or continue) the body
    keepline = False
    ucis = []
    for line in txt.splitlines():
        stripped = line.strip()
        if line.startswith(u'Article'):
            lp = line.split()
            # An article header has 'Article', 'Next' or 'Previous' as its
            # third word; shorter lines are ordinary text.
            if len(lp) > 2 and lp[2] in (u'Article', u'Next', u'Previous'):
                ucis.append([[u'****'], ''])
                keepline = False
        if not ucis:
            # Nothing to attach this line to yet: ucis[-1] would raise.
            continue
        current = ucis[-1]
        if line.startswith('SN '):  # source
            # NOTE(review): the value is read from column 4, i.e. one
            # character past the 'SN ' prefix -- confirm the field layout.
            jsource = re.sub(u'[\'" !.?;,:+°&-]', '', line[4:])
            current[0].append(u'_'.join([u'*source', jsource]).lower())
        elif line.startswith('PD '):  # date, expected as "<day> <month> <year>"
            parts = line[4:].split(' ')
            try:
                datejour = '%02d' % int(parts[0])
                datemois = parts[1].lower()
                dateannee = parts[2]
            except (IndexError, ValueError):
                # Malformed date: keep the article, just without date tags.
                pass
            else:
                # Month names absent from `mois` are kept verbatim.
                datemois = mois.get(datemois, datemois)
                current[0].append('_'.join([u'*am', dateannee, datemois]))
                current[0].append(
                    '_'.join([u'*amj', dateannee, datemois, datejour]))
                current[0].append('_'.join([u'*annee', dateannee]))
        elif stripped in end_markers:  # end of body
            keepline = False
        elif line.startswith('RF '):  # end of body
            keepline = False
        elif stripped in text_markers:  # start of body
            keepline = True
        if keepline and stripped not in text_markers and stripped != '':
            current[1] = '\n'.join([current[1], line.replace(u'*', ' ')])
    return ucis
def print_ucis(ucis, ofile, encodage):
    """
    Write the parsed articles to an already opened output file.

    ucis     -- list of ``[tags, text]`` pairs; ``tags`` is a list of strings
                and ``text`` a string.
    ofile    -- writable file object that accepts encoded byte strings.
    encodage -- name of the output encoding; characters it cannot represent
                are replaced rather than raising.

    Articles whose text is empty or whitespace-only are dropped.  Each
    remaining article is written as its tags joined by spaces on one line,
    followed by its text; articles are separated by a blank line and the
    output always ends with a newline (so an empty list writes a lone
    newline).
    """
    # Drop empty articles.
    kept = [uci for uci in ucis if uci[1].strip() != '']
    blocks = ['\n'.join([' '.join(uci[0]), uci[1]]) for uci in kept]
    toprint = '\n\n'.join(blocks)
    # The terminator must be a byte string like the encoded text: b'\n' is
    # the same object type as '\n' on Python 2 and avoids a bytes + str
    # TypeError on Python 3.
    ofile.write(toprint.encode(encodage, errors='replace') + b'\n')
class ParseFactivaPaste:
    """
    Convert every ``.txt`` Factiva paste found under a directory into a
    single corpus file.

    All the work is done by the constructor; the instance keeps no state.
    """

    def __init__(self, txtdir, fileout, encodage_in, encodage_out):
        """
        Parse the text files under *txtdir* and write the corpus.

        txtdir       -- directory walked recursively for ``*.txt`` files.
        fileout      -- path of the corpus file to create (overwritten).
        encodage_in  -- encoding used to decode the input files.
        encodage_out -- encoding used for the corpus file.

        Progress (file name, articles parsed, running total) is printed on
        standard output.
        """
        # NOTE(review): the reviewed excerpt did not show the lines that
        # accumulate `files`, initialise/update `tot` and open the per-file
        # loop; they are restored here from the visible uses of those
        # names -- confirm against the complete file.
        files = []
        for root, subfolders, subfiles in os.walk(txtdir):
            # splitext only accepts a real '.txt' extension, unlike
            # split('.')[-1], which also matched a file named just 'txt'.
            nf = [os.path.join(root, f) for f in subfiles
                  if os.path.splitext(f)[1] == '.txt']
            nf.sort()
            files += nf
        tot = 0
        # Binary mode: print_ucis writes already-encoded byte strings.
        with open(fileout, 'wb') as outf:
            for f in files:
                print(f)
                # codecs.open always opens the underlying file in binary
                # mode, so the former 'U' flag had no effect (and is
                # rejected by recent Python versions).
                with codecs.open(f, 'r', encodage_in) as infile:
                    content = infile.read()
                ucis = parsetxtpaste(content)
                print_ucis(ucis, outf, encodage_out)
                tot += len(ucis)
                # One pre-formatted string prints identically whether
                # `print` is a statement or a function.
                print('ok %i articles  - total :  %i' % (len(ucis), tot))
113 #for dat in ['2001','2002','2003','2004', '2005','2006','2007','2008','2009','2010','2011'] :
114 # path = os.path.join(txtdir,dat)
115 # outfile = os.path.join(txtdir, 'corpus_' + dat + '.txt')
116 # doparse(path, outfile)
if __name__ == '__main__':
    # Command-line entry point.
    # The former call went to `doparse`, which this file does not define,
    # with variables that only exist as commented-out assignments; the
    # class below takes the same four arguments.
    # NOTE(review): the 'utf8' defaults mirror the commented-out
    # `encodage_out` setting at the top of the file -- confirm they suit
    # your data.
    import sys

    if len(sys.argv) < 3:
        sys.exit('usage: %s TXTDIR FILEOUT [ENCODAGE_IN [ENCODAGE_OUT]]'
                 % sys.argv[0])
    txtdir, fileout = sys.argv[1:3]
    encodage_in = sys.argv[3] if len(sys.argv) > 3 else 'utf8'
    encodage_out = sys.argv[4] if len(sys.argv) > 4 else 'utf8'
    ParseFactivaPaste(txtdir, fileout, encodage_in, encodage_out)