X-Git-Url: http://www.iramuteq.org/git?a=blobdiff_plain;f=parse_europress.py;h=b882de5f2210bef7e2de3358d3eb58cda6907cf7;hb=refs%2Fheads%2F3.0;hp=9927063624194935dde6ad72a5764379ed0ac4ac;hpb=e4ec2234f0a1224c628c7d6017211cc820913385;p=iramuteq diff --git a/parse_europress.py b/parse_europress.py old mode 100644 new mode 100755 index 9927063..b882de5 --- a/parse_europress.py +++ b/parse_europress.py @@ -1,39 +1,47 @@ # -*- coding: utf-8 -*- #Author: Pierre Ratinaud -#Copyright (c) 2014 Pierre Ratinaud +#Copyright (c) 2008-2020 Pierre Ratinaud +#modification pour python 3 : Laurent Mérat, 6x7 - mai 2020 #License: GNU/GPL - -#from BeautifulSoup import BeautifulSoup +#------------------------------------ +# import des modules python +#------------------------------------ import codecs import os -from HTMLParser import HTMLParser + +#from BeautifulSoup import BeautifulSoup #??? + +#------------------------------------ +# import des fichiers du projet +#------------------------------------ +from html.parser import HTMLParser -mois = {u'janvier' : '01', - u'février' : '02', - u'mars' : '03', - u'avril' : '04', - u'mai' : '05', - u'juin' : '06', - u'juillet' : '07', - u'août' : '08', - u'septembre' : '09', - u'octobre' : '10', - u'novembre' : '11', - u'décembre' : '12', - u'january' : '01', - u'february': '02', - u'march' : '03', - u'april': '04', - u'may': '05', - u'june' : '06', - u'july': '07', - u'august': '08', - u'september' : '09', - u'october': '10', - u'november': '11', - u'december': '12'} +mois = {'janvier' : '01', + 'février' : '02', + 'mars' : '03', + 'avril' : '04', + 'mai' : '05', + 'juin' : '06', + 'juillet' : '07', + 'août' : '08', + 'septembre' : '09', + 'octobre' : '10', + 'novembre' : '11', + 'décembre' : '12', + 'january' : '01', + 'february': '02', + 'march' : '03', + 'april': '04', + 'may': '05', + 'june' : '06', + 'july': '07', + 'august': '08', + 'september' : '09', + 'october': '10', + 'november': '11', + 'december': '12'} def finddate(data): @@ -45,7 +53,7 @@ def finddate(data): except : return None else : - return [`year`, month, '%02d' % day] + return [repr(year), month, '%02d' % day] def makedate(date): year = date[0:4] @@ -56,6 +64,7 @@ def makedate(date): # create a subclass and override the handler methods class MyHTMLParser(HTMLParser): + def handle_starttag(self, tag, attrs): #print "Encountered a start tag:", tag if tag == 'span' : @@ -67,11 +76,11 @@ class MyHTMLParser(HTMLParser): elif attrs[0][1] == 'DocHeader' : self.headercount += 1 self.currentattr = 'DocHeader' - elif attrs[0][1] in ['TitreArticleVisu', 'titreArticleVisu'] : + elif attrs[0][1] in ['TitreArticleVisu', 'titreArticleVisu', 'titreArticle'] : self.outfile.write('\n\n') self.meta.append('\n') - self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace')) - self.meta = [u'****'] + self.outfile.write(' '.join(self.meta)) + self.meta = ['****'] self.nb += 1 self.currentattr = 'TitreArticleVisu' elif attrs[0][1] == 'PubliC_lblNodoc' : @@ -82,79 +91,85 @@ class MyHTMLParser(HTMLParser): if len(attrs)>0 : if attrs[0][1] == 'publiC-lblNodoc' : self.currentattr = 'PubliC_lblNodoc' + elif attrs[0][1] == 'DocText' : + self.currentattr = 'TitreArticleVisu' + elif attrs[0][1] == 'titreArticle' : + self.currentattr = 'TitreArticleVisu' elif tag == 'p' : if len(attrs) > 0 : if attrs[0][1] == 'titreArticleVisu' : # self.outfile.write('\n\n') # self.meta.append('\n') # self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace')) - # self.meta = [u'****'] + # self.meta = ['****'] # self.nb += 1 self.currentattr = 'TitreArticleVisu' def handle_endtag(self, tag): pass #print "Encountered an end tag :", tag + def handle_data(self, data): + #print self.currentattr if self.currentattr == 'DocPublicationName' : #print data - PublicationName = data.strip().replace(' ', '_').replace('(','').replace(')','').replace('-','').replace('.','').replace('/','').replace("'",'').replace(';', '').replace(':', '').replace(u'·','').lower() + PublicationName = data.strip().replace(' ', '_').replace('(','').replace(')','').replace('-','').replace('.','').replace('/','').replace("'",'').replace(';', '').replace(':', '').replace('·','').lower() PublicationName = PublicationName.split(',')[0] - self.meta.append(u'*source_' + PublicationName) + if len([val for val in self.meta if val.startswith('*source_')]) == 0 : + self.meta.append('*source_' + PublicationName) self.currentattr = None # elif self.currentattr == 'DocHeader' : # date = finddate(data) # if date is not None : -# self.meta += [u'*date_' + '-'.join(date), u'*am_' + '-'.join(date[0:2]), u'*annee_' + date[0]] +# self.meta += ['*date_' + '-'.join(date), '*am_' + '-'.join(date[0:2]), '*annee_' + date[0]] elif self.currentattr == 'TitreArticleVisu' : #print data - if data.startswith(u'©') : + if data.startswith('©') : self.currentattr = None return self.content.append(' '.join(data.replace('\n', ' ').split()) + ' ') #self.outfile.write(' '.join(data.replace('\n', ' ').split()).encode('utf8', errors='replace') + ' ') elif self.currentattr == 'PubliC_lblNodoc' : - date = data[5:13] + date = data.split('·')[1]#data[5:13] date = makedate(date) - self.meta += [u'*date_' + '-'.join(date), u'*am_' + '-'.join(date[0:2]), u'*annee_' + date[0]] + self.meta += ['*date_' + '-'.join(date), '*am_' + '-'.join(date[0:2]), '*annee_' + date[0]] self.meta.append('\n') self.outfile.write('\n\n') - self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace')) - self.outfile.write(' '.join(self.content).encode('utf8')) + self.outfile.write(' '.join(self.meta)) + self.outfile.write(' '.join(self.content)) self.content = [] - self.meta = [u'****'] + self.meta = ['****'] self.nb += 1 self.currentattr = None - + def doinit(self, outfile): self.currentattr = None - self.meta = [u'****'] + self.meta = ['****'] self.content = [] self.nb = 0 self.outfile = outfile - print 'init ok' - + print('init ok') def ParseEuropress(txtdir, fileout, encodage_in, encodage_out) : - files = [] + files = [] + if os.path.isdir(txtdir) : for root, subfolders, subfiles in os.walk(txtdir) : nf = [os.path.join(root, f) for f in subfiles if f.split('.')[-1] in ['html', 'HTML'] ] nf.sort() files += nf if len(files) == 0 : return 'nofile' - tot = 0 - parser = MyHTMLParser() - with open(fileout,'w') as outf : - for f in files : - print f - parser.doinit(outf) - with codecs.open(f, 'rU', encodage_in) as infile : - content = infile.read() - content = HTMLParser().unescape(content) - parser.feed(content) - tot += parser.nb - return tot - -#ParseEuropress('/home/pierre/fac/etudiant/DeNadai/corpus_loi_travail', -# '/home/pierre/fac/etudiant/DeNadai/corpus_loi_W.txt', 'utf8', 'utf8') + elif os.path.isfile(txtdir) : + files.append(txtdir) + tot = 0 + parser = MyHTMLParser() + with open(fileout,'w') as outf : + for f in files : + print(f) + parser.doinit(outf) + with codecs.open(f, 'r', encodage_in) as infile : + content = infile.read() + content = HTMLParser().unescape(content) + parser.feed(content) + tot += parser.nb + return tot