2 # -*- coding: utf-8 -*-
3 #Author: Pierre Ratinaud
4 #Copyright (c) 2012-2013 Pierre Ratinaud
11 #txtdir = 'dev/factiva_txt'
12 #fileout = 'dev/factiva_txt_out.txt'
14 #encodage_out = 'utf8'
def parsetxtpaste(txt):
    """
    Parse Factiva articles pasted from the article viewing window.

    (Thanks to Lucie Loubère for the trick :)

    The text is scanned line by line:

    * a line starting with 'Article' whose third whitespace-separated
      token is also 'Article' opens a new article;
    * an 'SN ' line gives the source, stored as a '*source_<name>' tag
      (spaces, apostrophes and hyphens removed, lower-cased);
    * a 'PD ' line gives the date, stored as '*ma_<field1><field2>' and
      '*annee_<field2>' tags, where the fields are the second and third
      space-separated items after the 4-character prefix;
    * a line reading exactly 'LP' or 'TD' starts the body text;
    * a line reading exactly NS, RE, IPD, CO or IN, or a line starting
      with 'RF ', ends the body text.

    Lines found before the first article header are ignored, and a 'PD '
    line with fewer than three fields adds no date tag, so malformed
    pastes no longer raise IndexError.

    txt -- the pasted text, as a single string.

    Returns a list with one [tags, text] pair per article: tags is a list
    of strings beginning with '****', text is the body with every kept
    line preceded by a newline.
    """
    no = ['NS', 'RE', 'IPD', 'CO', 'IN']  # tags that signal the end of the body
    ucis = []
    keepline = False
    for line in txt.splitlines():
        stripped = line.strip()
        if line.startswith('Article'):
            lp = line.split()
            if len(lp) > 2 and lp[2] == 'Article':
                ucis.append([[u'****'], ''])
                # The header itself must never land in the body text.
                keepline = False
        if not ucis:
            # Nothing to attach this line to until an article is open.
            continue
        if line.startswith('SN '):  # source
            name = line[4:]
            for char in (' ', '\'', u'´', u'’', '-'):
                name = name.replace(char, '')
            ucis[-1][0].append('*source_' + name.lower())
        elif line.startswith('PD '):  # date
            fields = line[4:].split(' ')
            if len(fields) > 2:
                ucis[-1][0].append('*ma_' + fields[1] + fields[2])
                ucis[-1][0].append(u'*annee_' + fields[2])
        elif stripped in no or line.startswith('RF '):  # end of body
            keepline = False
        elif stripped in ['LP', 'TD']:  # start of body
            keepline = True
        # The LP/TD markers and blank lines are never part of the body.
        if keepline and stripped not in ['LP', 'TD', '']:
            ucis[-1][1] = '\n'.join([ucis[-1][1], line])
    return ucis
def print_ucis(ucis, ofile, encodage):
    """
    Write parsed articles to an open file, one '****' header per article.

    ucis     -- list of [tags, text] pairs; articles whose text is empty
                or whitespace-only are dropped.
    ofile    -- open file-like object; it receives encoded bytes.
    encodage -- name of the encoding used for the output.

    Each article is written as its tags joined by spaces, a newline, then
    its text; articles are separated by a blank line. The batch also ends
    with a blank line, so that calling this function several times on the
    same handle does not glue the last article of one batch to the header
    of the next. Nothing is written when no article is left.
    """
    # drop empty articles
    kept = [uci for uci in ucis if uci[1].strip() != '']
    if not kept:
        return
    toprint = '\n\n'.join(['\n'.join([' '.join(uci[0]), uci[1]]) for uci in kept])
    ofile.write((toprint + '\n\n').encode(encodage))
class ParseFactivaPaste:
    """
    Convert a directory of pasted Factiva text files into one corpus file.

    All the work is done by the constructor; the instance keeps no state.
    """

    def __init__(self, txtdir, fileout, encodage_in, encodage_out):
        """
        Parse every file of txtdir and write the result to fileout.

        txtdir       -- directory holding the pasted text files.
        fileout      -- path of the corpus file to create (overwritten).
        encodage_in  -- encoding of the input files.
        encodage_out -- encoding of the output file.

        Files are processed in sorted name order so that the output does
        not depend on the arbitrary order of os.listdir; entries that are
        not regular files are skipped.
        """
        paths = [os.path.join(txtdir, name) for name in sorted(os.listdir(txtdir))]
        # Binary mode: print_ucis writes already-encoded bytes.
        with open(fileout, 'wb') as outf:
            for path in paths:
                if not os.path.isfile(path):
                    continue
                # Plain 'r': the 'U' flag is rejected by Python 3.11+, and
                # the parser splits lines with splitlines() anyway.
                with codecs.open(path, 'r', encodage_in) as infile:
                    content = infile.read()
                ucis = parsetxtpaste(content)
                print_ucis(ucis, outf, encodage_out)
72 #for dat in ['2001','2002','2003','2004', '2005','2006','2007','2008','2009','2010','2011'] :
73 # path = os.path.join(txtdir,dat)
74 # outfile = os.path.join(txtdir, 'corpus_' + dat + '.txt')
75 # doparse(path, outfile)
if __name__ == '__main__':
    # Command-line entry point:
    #     <script> txtdir fileout [encodage_in [encodage_out]]
    # Both encodings default to 'utf8'. The previous code called an
    # undefined doparse() with variables that were never assigned.
    import sys
    args = sys.argv[1:]
    if len(args) < 2 or len(args) > 4:
        sys.exit('usage: %s txtdir fileout [encodage_in [encodage_out]]' % sys.argv[0])
    txtdir, fileout = args[0], args[1]
    encodage_in = args[2] if len(args) > 2 else 'utf8'
    encodage_out = args[3] if len(args) > 3 else 'utf8'
    ParseFactivaPaste(txtdir, fileout, encodage_in, encodage_out)