www.iramuteq.org Git - iramuteq/blob - parse_factiva_txt.py

   1 #!/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #Author: Pierre Ratinaud
   4 #Copyright (c) 2012-2013 Pierre Ratinaud
   5 #License: GNU/GPL
   6
   7 import os
   8 import codecs
   9 import re
  10
  11
  12 #txtdir = 'dev/factiva_txt'
  13 #fileout = 'dev/factiva_txt_out.txt'
  14 #encodage_in = 'utf8'
  15 #encodage_out = 'utf8'
  16
  17 mois = {u'janvier' : '01',
  18         u'février' : '02',
  19         u'mars' : '03',
  20         u'avril' : '04',
  21         u'mai' : '05',
  22         u'juin' : '06',
  23         u'juillet' : '07',
  24         u'août' : '08',
  25         u'septembre' : '09',
  26         u'octobre' : '10',
  27         u'novembre' : '11',
  28         u'décembre' : '12',
  29         u'january' : '01',
  30         u'february': '02',
  31         u'march' : '03',
  32         u'april': '04',
  33         u'may': '05',
  34         u'june' : '06',
  35         u'july': '07',
  36         u'august': '08',
  37         u'september' : '09',
  38         u'october': '10',
  39         u'november': '11',
  40         u'december': '12'}
  41
  42
  43 def parsetxtpaste(txt):
  44     """
  45     parser de texte pour factiva
  46     à partir d'un copier/coller de la fenêtre de visualisation
  47     merci à Lucie Loubère pour l'astuce :)
  48     """
  49     no = ['NS','RE','IPD','CO','IN']  # les balises qui signalent une fin
  50     txt = txt.splitlines()
  51     keepline = False
  52     ucis = []
  53     for line in txt :
  54         if line.startswith(u'Article') :
  55             lp = line.split()
  56             if len(lp) > 2  :
  57                 if lp[2] == u'Article' or lp[2] == u'Next' or lp[2] == u'Previous':
  58                     ucis.append([[u'****'],''])
  59                     keepline = False
  60         if line.startswith('SN ') : #source
  61             jsource = re.sub(u'[\'" !\.?;,:\+\-°&]', '', line[4:])
  62             source = u'_'.join([u'*source', jsource]).lower()
  63             #source = '*source_' + line[4:].replace(' ','').replace('\'','').replace(u'´','').replace(u'’','').replace('-','').lower()
  64             ucis[-1][0].append(source)
  65         elif line.startswith('PD ') : #date
  66             datemois = line[4:].split(' ')[1].lower()
  67             datemois = mois.get(datemois, datemois)
  68             dateannee = line[4:].split(' ')[2]
  69             datejour = '%02d' % int(line[4:].split(' ')[0])
  70             am = '_'.join([u'*am', dateannee, datemois])
  71             amj = '_'.join([u'*amj', dateannee, datemois, datejour])
  72             ucis[-1][0].append(am)
  73             ucis[-1][0].append(amj)
  74             annee = '_'.join([u'*annee', dateannee])
  75             ucis[-1][0].append(annee)
  76         elif line.strip() in no : #fin
  77             keepline = False
  78         elif line.startswith('RF ') : #fin
  79             keepline = False
  80         elif line.strip() in ['LP', 'TD'] : #debut texte
  81             keepline = True
  82         else :
  83             pass
  84         if keepline and line.strip() not in ['LP', 'TD', ''] :
  85             ucis[-1][1] = '\n'.join([ucis[-1][1],line])
  86     return ucis
  87
  88
  89 def print_ucis(ucis, ofile, encodage) :
  90     #elimination des articles vides
  91     ucis = [uci for uci in ucis if uci[1].strip() != '']
  92     toprint = '\n\n'.join(['\n'.join([' '.join(uci[0]),uci[1]]) for uci in ucis])
  93     ofile.write(toprint.encode(encodage, errors='replace') + '\n')
  94
  95 class ParseFactivaPaste :
  96     def __init__(self, txtdir, fileout, encodage_in, encodage_out) :
  97         files = os.listdir(txtdir)
  98         files = [f for f in files if f.split('.')[-1] == 'txt']
  99         tot = 0
 100         with open(fileout,'w') as outf :
 101             for f in files :
 102                 print f
 103                 f = os.path.join(txtdir, f)
 104                 print f
 105                 with codecs.open(f, 'rU', encodage_in) as infile :
 106                     content = infile.read()
 107                 ucis = parsetxtpaste(content)
 108                 print_ucis(ucis, outf, encodage_out)
 109                 tot += len(ucis)
 110                 print 'ok', len(ucis), 'articles', ' - total : ', tot
 111
 112 #for dat in ['2001','2002','2003','2004', '2005','2006','2007','2008','2009','2010','2011'] :
 113 #    path = os.path.join(txtdir,dat)
 114 #    outfile = os.path.join(txtdir, 'corpus_' + dat + '.txt')
 115 #    doparse(path, outfile)
 116
 117
 118 if __name__ == '__main__' :
 119     doparse(txtdir, fileout, encodage_in, encodage_out)
 120     print 'fini'