# -*- coding: utf-8 -*-
#Author: Pierre Ratinaud
#Copyright (c) 2008-2020 Pierre Ratinaud
#Python 3 port: Laurent Mérat, 6x7 - May 2020
#License: GNU/GPL

#------------------------------------
# python module imports
#------------------------------------
import codecs
import os

#from BeautifulSoup import BeautifulSoup #???
from html.parser import HTMLParser


# Month name -> zero-padded month number ('01'..'12').  Europresse
# exports carry dates in either French or English, so both sets of
# names map to the same numbers.
_FRENCH_MONTHS = ('janvier', 'février', 'mars', 'avril', 'mai', 'juin',
                  'juillet', 'août', 'septembre', 'octobre',
                  'novembre', 'décembre')
_ENGLISH_MONTHS = ('january', 'february', 'march', 'april', 'may', 'june',
                   'july', 'august', 'september', 'october',
                   'november', 'december')
mois = {name: '%02d' % (i % 12 + 1)
        for i, name in enumerate(_FRENCH_MONTHS + _ENGLISH_MONTHS)}
# Reconstructed MyHTMLParser members (the enclosing class definition is
# only partially visible in this chunk; these are the fully-visible ones).

# Translation table used to normalise publication names: spaces become
# underscores and common punctuation is dropped, all in a single pass
# (replaces a chain of ten .replace() calls).
_CLEAN = str.maketrans(' ', '_', "()-./';:·")

def handle_endtag(self, tag):
    # Nothing to do on end tags: state changes happen on start tags
    # (CSS class selection) and on the text data itself.
    pass

def handle_data(self, data):
    """Route a text node according to the CSS class seen in handle_starttag.

    self.currentattr (set by handle_starttag) selects the behaviour:
    publication name, article text, or the document-id line that
    triggers flushing a finished article to self.outfile.
    """
    if self.currentattr == 'DocPublicationName':
        # Normalise the source name and keep only the part before the
        # first comma, e.g. 'Le Monde (Paris), no 123' -> 'le_monde_paris'.
        name = data.strip().translate(_CLEAN).lower()
        name = name.split(',')[0]
        # At most one *source_ metadata variable per article.
        if not any(val.startswith('*source_') for val in self.meta):
            self.meta.append('*source_' + name)
        self.currentattr = None
    elif self.currentattr == 'TitreArticleVisu':
        if data.startswith('©'):
            # The copyright notice marks the end of the article body.
            self.currentattr = None
            return
        # Collapse whitespace runs (including newlines) to single spaces.
        self.content.append(' '.join(data.split()) + ' ')
    elif self.currentattr == 'PubliC_lblNodoc':
        # The document-id line carries the date between '·' separators.
        date = makedate(data.split('·')[1])
        self.meta += ['*date_' + '-'.join(date),
                      '*am_' + '-'.join(date[0:2]),
                      '*annee_' + date[0]]
        self.meta.append('\n')
        # Flush the finished article: blank separator, metadata line,
        # then the accumulated text.
        self.outfile.write('\n\n')
        self.outfile.write(' '.join(self.meta))
        self.outfile.write(' '.join(self.content))
        self.content = []
        self.meta = ['****']
        self.nb += 1
        self.currentattr = None

def doinit(self, outfile):
    """Reset the parser state before feeding a new HTML file.

    outfile is the (already open) text file articles are written to;
    nb counts the articles flushed for the current file.
    """
    self.currentattr = None
    self.meta = ['****']
    self.content = []
    self.nb = 0
    self.outfile = outfile
    print('init ok')
def ParseEuropress(txtdir, fileout, encodage_in, encodage_out):
    """Parse Europresse HTML export files into a single corpus file.

    txtdir may be a directory (walked recursively for *.html / *.HTML
    files, sorted per directory) or a single HTML file.  Each file is
    decoded with encodage_in, HTML entities are unescaped, and the
    articles are written to fileout using encodage_out.

    Returns the total number of articles written, or the string
    'nofile' when txtdir is a directory containing no HTML file.
    """
    # html.unescape replaces HTMLParser().unescape, which was
    # deprecated in Python 3.4 and removed in Python 3.9.
    from html import unescape

    files = []
    if os.path.isdir(txtdir):
        for root, subfolders, subfiles in os.walk(txtdir):
            nf = [os.path.join(root, f) for f in subfiles
                  if f.split('.')[-1] in ['html', 'HTML']]
            nf.sort()
            files += nf
        if len(files) == 0:
            return 'nofile'
    elif os.path.isfile(txtdir):
        files.append(txtdir)
    tot = 0
    parser = MyHTMLParser()
    # Bug fix: encodage_out was previously ignored (plain open(fileout, 'w')
    # wrote in the locale's default encoding).
    with open(fileout, 'w', encoding=encodage_out) as outf:
        for f in files:
            print(f)
            parser.doinit(outf)
            with open(f, 'r', encoding=encodage_in) as infile:
                content = infile.read()
            content = unescape(content)
            parser.feed(content)
            tot += parser.nb
    return tot