# -*- coding: utf-8 -*-
#Author: Pierre Ratinaud
#Copyright (c) 2008-2020 Pierre Ratinaud
#Python 3 port: Laurent Mérat, 6x7 - May 2020
#License: GNU/GPL

#------------------------------------
# standard library imports
#------------------------------------
import codecs
import os
from html.parser import HTMLParser

#from BeautifulSoup import BeautifulSoup #???


# Lower-case month name (French and English) -> two-digit month number.
mois = {
    'janvier': '01',
    'février': '02',
    'mars': '03',
    'avril': '04',
    'mai': '05',
    'juin': '06',
    'juillet': '07',
    'août': '08',
    'septembre': '09',
    'octobre': '10',
    'novembre': '11',
    'décembre': '12',
    'january': '01',
    'february': '02',
    'march': '03',
    'april': '04',
    'may': '05',
    'june': '06',
    'july': '07',
    'august': '08',
    'september': '09',
    'october': '10',
    'november': '11',
    'december': '12',
}
def makedate(date):
    """Split a compact ``YYYYMMDD`` string into ``[year, month, day]``.

    The pieces are plain slices (``[0:4]``, ``[4:6]``, ``[6:]``); nothing is
    validated, so a shorter string simply yields empty trailing items.
    """
    return [date[0:4], date[4:6], date[6:]]


class MyHTMLParser(HTMLParser):
    """Turn an Europress HTML export into IRaMuTeQ-style corpus text.

    The parser is a small state machine: ``handle_starttag`` looks at the
    value of a tag's *first* attribute and records in ``self.currentattr``
    what the following text means; ``handle_data`` then consumes that text.
    ``doinit`` must be called before feeding each file.

    State kept on the instance:
      * ``currentattr`` -- meaning of the text currently being read, or None
      * ``meta``        -- metadata tokens of the pending article ('****' first)
      * ``content``     -- text fragments of the pending article
      * ``nb``          -- number of article headers written so far
      * ``outfile``     -- writable text file object receiving the corpus
    """

    # First-attribute values of a <span> that open a new article title.
    _TITLE_MARKERS = ('TitreArticleVisu', 'titreArticleVisu', 'titreArticle')
    # Source-name clean-up: spaces become '_', punctuation is dropped.
    _SOURCE_CLEANUP = str.maketrans(' ', '_', "()-./';:·")

    def handle_starttag(self, tag, attrs):
        """Update the reading state according to *tag* and its first attribute."""
        # Only the first attribute's value is inspected, whatever its name.
        marker = attrs[0][1] if attrs else None
        if tag == 'span':
            if marker == 'DocPublicationName':
                self.headercount = 0
                self.currentattr = 'DocPublicationName'
            elif marker == 'DocHeader':
                self.headercount += 1
                self.currentattr = 'DocHeader'
            elif marker in self._TITLE_MARKERS:
                # A title span flushes the metadata line gathered so far.
                # NOTE(review): the document-id handler in handle_data also
                # writes a header and increments nb -- confirm that a single
                # export never triggers both for the same article.
                self.outfile.write('\n\n')
                self.meta.append('\n')
                self.outfile.write(' '.join(self.meta))
                self.meta = ['****']
                self.nb += 1
                self.currentattr = 'TitreArticleVisu'
            elif marker == 'PubliC_lblNodoc':
                self.currentattr = 'PubliC_lblNodoc'
        elif tag == 'table':
            self.currentattr = None
        elif tag == 'div':
            if marker == 'publiC-lblNodoc':
                self.currentattr = 'PubliC_lblNodoc'
            elif marker in ('DocText', 'titreArticle'):
                self.currentattr = 'TitreArticleVisu'
        elif tag == 'p':
            if marker == 'titreArticleVisu':
                self.currentattr = 'TitreArticleVisu'

    def handle_endtag(self, tag):
        """End tags carry no information for this parser."""
        pass

    def handle_data(self, data):
        """Consume a text node according to the current reading state."""
        state = self.currentattr
        if state == 'DocPublicationName':
            name = data.strip().translate(self._SOURCE_CLEANUP).lower()
            name = name.split(',')[0]
            # Keep only the first source seen for the pending article.
            if not any(val.startswith('*source_') for val in self.meta):
                self.meta.append('*source_' + name)
            self.currentattr = None
        elif state == 'TitreArticleVisu':
            if data.startswith('©'):
                # The copyright line ends the article body.
                self.currentattr = None
                return
            # Collapse every run of whitespace (newlines included) to one space.
            self.content.append(' '.join(data.split()) + ' ')
        elif state == 'PubliC_lblNodoc':
            # The date is taken from the second '·'-separated field of the
            # document id text.
            fields = data.split('·')
            if len(fields) < 2:
                # No separator: this text node cannot be a document id.
                # Ignore it instead of aborting the import with IndexError.
                return
            date = makedate(fields[1])
            self.meta += ['*date_' + '-'.join(date),
                          '*am_' + '-'.join(date[0:2]),
                          '*annee_' + date[0]]
            self.meta.append('\n')
            self.outfile.write('\n\n')
            # Space-separated, like the header written for title spans.
            self.outfile.write(' '.join(self.meta))
            self.outfile.write(' '.join(self.content))
            self.content = []
            self.meta = ['****']
            self.nb += 1
            self.currentattr = None

    def doinit(self, outfile):
        """Reset all per-file state and attach *outfile* as the output stream."""
        # Drop any unparsed input left over from a previously fed file.
        self.reset()
        self.currentattr = None
        # Defined up front so a 'DocHeader' span seen before any
        # 'DocPublicationName' span cannot raise AttributeError.
        self.headercount = 0
        self.meta = ['****']
        self.content = []
        self.nb = 0
        self.outfile = outfile
        print('init ok')


def ParseEuropress(txtdir, fileout, encodage_in, encodage_out):
    """Convert Europress HTML export(s) into a single corpus text file.

    Parameters:
        txtdir: a directory (walked recursively for files whose extension is
            ``html`` or ``HTML``) or the path of a single file.
        fileout: path of the text file to write.
        encodage_in: encoding used to read the HTML files.
        encodage_out: encoding used to write *fileout*.

    Returns:
        The string ``'nofile'`` when *txtdir* is a directory containing no
        HTML file (nothing is written in that case); otherwise the total
        number of article headers written, as an int.
    """
    # Local import: HTMLParser.unescape() was removed in Python 3.9 and
    # html.unescape() is its documented replacement.
    from html import unescape

    files = []
    if os.path.isdir(txtdir):
        for root, _subfolders, subfiles in os.walk(txtdir):
            found = [os.path.join(root, name) for name in subfiles
                     if name.split('.')[-1] in ('html', 'HTML')]
            found.sort()
            files += found
        if not files:
            return 'nofile'
    elif os.path.isfile(txtdir):
        files.append(txtdir)

    tot = 0
    parser = MyHTMLParser()
    # Honour the requested output encoding; unencodable characters are
    # replaced rather than aborting the whole export.
    with open(fileout, 'w', encoding=encodage_out, errors='replace') as outf:
        for f in files:
            print(f)
            parser.doinit(outf)
            with codecs.open(f, 'r', encodage_in) as infile:
                content = infile.read()
            parser.feed(unescape(content))
            tot += parser.nb
    return tot