www.iramuteq.org Git - iramuteq/blob - parse_europress.py

   1 # -*- coding: utf-8 -*-
   2 #Author: Pierre Ratinaud
   3 #Copyright (c) 2008-2020 Pierre Ratinaud
   4 #modification pour python 3 : Laurent Mérat, 6x7 - mai 2020
   5 #License: GNU/GPL
   6
   7 #------------------------------------
   8 # import des modules python
   9 #------------------------------------
  10 import codecs
  11 import os
  12
  13 #from BeautifulSoup import BeautifulSoup #???
  14
  15 #------------------------------------
  16 # import des fichiers du projet
  17 #------------------------------------
  18 from html.parser import HTMLParser
  19
  20
  21 mois = {'janvier' : '01',
  22         'février' : '02',
  23         'mars' : '03',
  24         'avril' : '04',
  25         'mai' : '05',
  26         'juin' : '06',
  27         'juillet' : '07',
  28         'août' : '08',
  29         'septembre' : '09',
  30         'octobre' : '10',
  31         'novembre' : '11',
  32         'décembre' : '12',
  33         'january' : '01',
  34         'february': '02',
  35         'march' : '03',
  36         'april': '04',
  37         'may': '05',
  38         'june' : '06',
  39         'july': '07',
  40         'august': '08',
  41         'september' : '09',
  42         'october': '10',
  43         'november': '11',
  44         'december': '12'}
  45
  46
  47 def finddate(data):
  48     data = data.split()
  49     try :
  50         day = int(data[0])
  51         year = int(data[2])
  52         month = mois[data[1]]
  53     except :
  54         return None
  55     else :
  56         return [repr(year), month, '%02d' % day]
  57
  58 def makedate(date):
  59     year = date[0:4]
  60     month = date[4:6]
  61     day = date[6:]
  62     return [year, month, day]
  63
  64
  65 # create a subclass and override the handler methods
  66 class MyHTMLParser(HTMLParser):
  67
  68     def handle_starttag(self, tag, attrs):
  69         #print "Encountered a start tag:", tag
  70         if tag == 'span' :
  71             if len(attrs) > 0 :
  72                 if attrs[0][1] == 'DocPublicationName' :
  73                     #print 'DocPublicationName'
  74                     self.headercount = 0
  75                     self.currentattr = 'DocPublicationName'
  76                 elif attrs[0][1] == 'DocHeader' :
  77                     self.headercount += 1
  78                     self.currentattr = 'DocHeader'
  79                 elif attrs[0][1] in ['TitreArticleVisu', 'titreArticleVisu', 'titreArticle'] :
  80                     self.outfile.write('\n\n')
  81                     self.meta.append('\n')
  82                     self.outfile.write(' '.join(self.meta))
  83                     self.meta = ['****']
  84                     self.nb += 1
  85                     self.currentattr = 'TitreArticleVisu'
  86                 elif attrs[0][1] == 'PubliC_lblNodoc' :
  87                     self.currentattr = 'PubliC_lblNodoc'
  88         elif tag == 'table' :
  89             self.currentattr = None
  90         elif tag == 'div' :
  91             if len(attrs)>0 :
  92                 if attrs[0][1] == 'publiC-lblNodoc' :
  93                     self.currentattr = 'PubliC_lblNodoc'
  94                 elif attrs[0][1] == 'DocText' :
  95                     self.currentattr = 'TitreArticleVisu'
  96                 elif attrs[0][1] == 'titreArticle' :
  97                     self.currentattr = 'TitreArticleVisu'
  98         elif tag == 'p' :
  99             if len(attrs) > 0 :
 100                 if attrs[0][1] == 'titreArticleVisu' :
 101         #            self.outfile.write('\n\n')
 102         #            self.meta.append('\n')
 103         #            self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace'))
 104         #            self.meta = ['****']
 105         #            self.nb += 1
 106                     self.currentattr = 'TitreArticleVisu'
 107
 108     def handle_endtag(self, tag):
 109         pass
 110         #print "Encountered an end tag :", tag
 111
 112     def handle_data(self, data):
 113         #print self.currentattr
 114         if self.currentattr == 'DocPublicationName' :
 115             #print data
 116             PublicationName = data.strip().replace(' ', '_').replace('(','').replace(')','').replace('-','').replace('.','').replace('/','').replace("'",'').replace(';', '').replace(':', '').replace('·','').lower()
 117             PublicationName = PublicationName.split(',')[0]
 118             if len([val for val in self.meta if val.startswith('*source_')]) == 0 :
 119                 self.meta.append('*source_' + PublicationName)
 120             self.currentattr = None
 121 #        elif self.currentattr == 'DocHeader' :
 122 #            date = finddate(data)
 123 #            if date is not None :
 124 #                self.meta += ['*date_' + '-'.join(date), '*am_' + '-'.join(date[0:2]), '*annee_' + date[0]]
 125         elif self.currentattr == 'TitreArticleVisu' :
 126             #print data
 127             if data.startswith('©') :
 128                 self.currentattr = None
 129                 return
 130             self.content.append(' '.join(data.replace('\n', ' ').split()) + ' ')
 131             #self.outfile.write(' '.join(data.replace('\n', ' ').split()).encode('utf8', errors='replace') + ' ')
 132         elif self.currentattr == 'PubliC_lblNodoc' :
 133             date = data.split('·')[1]#data[5:13]
 134             date = makedate(date)
 135             self.meta += ['*date_' + '-'.join(date), '*am_' + '-'.join(date[0:2]), '*annee_' + date[0]]
 136             self.meta.append('\n')
 137             self.outfile.write('\n\n')
 138             self.outfile.write(' '.join(self.meta))
 139             self.outfile.write(' '.join(self.content))
 140             self.content = []
 141             self.meta = ['****']
 142             self.nb += 1
 143             self.currentattr = None
 144
 145     def doinit(self, outfile):
 146         self.currentattr = None
 147         self.meta = ['****']
 148         self.content = []
 149         self.nb = 0
 150         self.outfile = outfile
 151         print('init ok')
 152
 153 def ParseEuropress(txtdir, fileout, encodage_in, encodage_out) :
 154     files = []
 155     if os.path.isdir(txtdir) :
 156         for root, subfolders, subfiles in os.walk(txtdir) :
 157             nf = [os.path.join(root, f) for f in subfiles if f.split('.')[-1] in ['html', 'HTML'] ]
 158             nf.sort()
 159             files += nf
 160         if len(files) == 0 :
 161             return 'nofile'
 162     elif os.path.isfile(txtdir) :
 163         files.append(txtdir)
 164     tot = 0
 165     parser = MyHTMLParser()
 166     with open(fileout,'w') as outf :
 167         for f in files :
 168             print(f)
 169             parser.doinit(outf)
 170             with codecs.open(f, 'r', encodage_in) as infile :
 171                 content = infile.read()
 172                 content = HTMLParser().unescape(content)
 173             parser.feed(content)
 174             tot += parser.nb
 175     return tot