1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2014 Pierre Ratinaud
7 #from BeautifulSoup import BeautifulSoup
10 from HTMLParser import HTMLParser
# Month lookup table: maps a month name (French 'janvier' here) to its two-digit month number as a string.
# NOTE(review): only the first entry of this literal is visible in this view; the remaining entries
# and the closing brace are on lines not shown -- confirm against the full file.
13 mois = {u'janvier' : '01',
48 return [`year`, month, '%02d' % day]
54 return [year, month, day]
57 # create a subclass and override the handler methods
# HTMLParser subclass that remembers, in self.currentattr, which kind of element was opened last
# (identified by the value of the tag's first attribute) and uses that state in handle_data to
# collect metadata tokens (self.meta) and article text (self.content), written to self.outfile.
# NOTE(review): this listing has lost its indentation and skips several original lines
# (e.g. 61-62, 65, 68, 74-75, 79, 81-82, 89-90, 96, 100, 105, 116, 119, 130, 132, 138-139), so
# enclosing conditions such as a test on `tag` or on len(attrs) may exist but are not visible here.
58 class MyHTMLParser(HTMLParser):
# Start-tag handler: selects the parsing state from attrs[0][1], the value of the first attribute.
# NOTE(review): attrs[0] raises IndexError for a tag without attributes unless a guard sits on
# one of the lines missing from this view -- confirm.
59 def handle_starttag(self, tag, attrs):
60 #print "Encountered a start tag:", tag
# The next text node will be treated as the publication (source) name.
63 if attrs[0][1] == 'DocPublicationName' :
64 #print 'DocPublicationName'
66 self.currentattr = 'DocPublicationName'
# 'DocHeader' state is set here, but its handle_data branch is commented out below.
67 elif attrs[0][1] == 'DocHeader' :
69 self.currentattr = 'DocHeader'
# Any of the three title spellings: write a blank-line separator, then the metadata collected so
# far (space-joined, UTF-8 encoded, unencodable characters replaced), and start collecting text.
70 elif attrs[0][1] in ['TitreArticleVisu', 'titreArticleVisu', 'titreArticle'] :
71 self.outfile.write('\n\n')
72 self.meta.append('\n')
73 self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace'))
# NOTE(review): original lines 74-75 are not visible; presumably self.meta is reset there -- confirm.
76 self.currentattr = 'TitreArticleVisu'
# The next text node will be parsed for the document date (see handle_data).
77 elif attrs[0][1] == 'PubliC_lblNodoc' :
78 self.currentattr = 'PubliC_lblNodoc'
# NOTE(review): the condition guarding this reset (original line 79) is not visible in this view.
80 self.currentattr = None
# Alternative spelling 'publiC-lblNodoc' (hyphen) is mapped onto the same 'PubliC_lblNodoc' state.
83 if attrs[0][1] == 'publiC-lblNodoc' :
84 self.currentattr = 'PubliC_lblNodoc'
# 'DocText' and 'titreArticle' both switch to the text-collecting state, without writing metadata.
85 elif attrs[0][1] == 'DocText' :
86 self.currentattr = 'TitreArticleVisu'
87 elif attrs[0][1] == 'titreArticle' :
88 self.currentattr = 'TitreArticleVisu'
# Here the metadata write-out is disabled (commented out); only the state is switched.
91 if attrs[0][1] == 'titreArticleVisu' :
92 # self.outfile.write('\n\n')
93 # self.meta.append('\n')
94 # self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace'))
95 # self.meta = [u'****']
97 self.currentattr = 'TitreArticleVisu'
# End-tag handler. NOTE(review): its body (original line 100) is not visible in this view; only a
# commented-out debug print remains.
99 def handle_endtag(self, tag):
101 #print "Encountered an end tag :", tag
# Text handler: interprets `data` according to the state set by handle_starttag.
102 def handle_data(self, data):
103 #print self.currentattr
104 if self.currentattr == 'DocPublicationName' :
# Normalise the publication name: strip, spaces -> underscores, drop ( ) - . / ' ; : and the
# middle dot, lowercase; then keep only the part before the first comma.
106 PublicationName = data.strip().replace(' ', '_').replace('(','').replace(')','').replace('-','').replace('.','').replace('/','').replace("'",'').replace(';', '').replace(':', '').replace(u'·','').lower()
107 PublicationName = PublicationName.split(',')[0]
# Add a '*source_' token only if self.meta does not already contain one.
108 if len([val for val in self.meta if val.startswith(u'*source_')]) == 0 :
109 self.meta.append(u'*source_' + PublicationName)
110 self.currentattr = None
111 # elif self.currentattr == 'DocHeader' :
112 # date = finddate(data)
113 # if date is not None :
114 # self.meta += [u'*date_' + '-'.join(date), u'*am_' + '-'.join(date[0:2]), u'*annee_' + date[0]]
115 elif self.currentattr == 'TitreArticleVisu' :
# Text starting with the copyright sign ends text collection.
117 if data.startswith(u'©') :
118 self.currentattr = None
# Otherwise (presumably an `else` on missing line 119 -- confirm) the text is whitespace-normalised
# (newlines and runs of blanks collapsed to single spaces) and appended with a trailing space.
120 self.content.append(' '.join(data.replace('\n', ' ').split()) + ' ')
121 #self.outfile.write(' '.join(data.replace('\n', ' ').split()).encode('utf8', errors='replace') + ' ')
122 elif self.currentattr == 'PubliC_lblNodoc' :
# Take the second middle-dot-separated field as the raw date.
# NOTE(review): raises IndexError if `data` contains no middle dot -- confirm inputs always have one.
123 date = data.split(u'·')[1]#data[5:13]
# makedate is defined elsewhere in the file; its result is used as a sequence of strings whose
# first element is the year and whose first two elements form the year-month token.
124 date = makedate(date)
125 self.meta += [u'*date_' + '-'.join(date), u'*am_' + '-'.join(date[0:2]), u'*annee_' + date[0]]
# Flush the record: separator, metadata line, then the collected text, all encoded as UTF-8.
126 self.meta.append('\n')
127 self.outfile.write('\n\n')
128 self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace'))
129 self.outfile.write(' '.join(self.content).encode('utf8'))
# Start a fresh metadata list for the next record.
# NOTE(review): original lines 130 and 132 are not visible; presumably self.content is reset there -- confirm.
131 self.meta = [u'****']
133 self.currentattr = None
# Initialises the parser state and stores the output file object; call before feeding data.
# NOTE(review): original lines 138-139 are not visible (self.content is used above but its
# initialisation is not shown here) -- confirm.
135 def doinit(self, outfile):
136 self.currentattr = None
137 self.meta = [u'****']
140 self.outfile = outfile
# Entry point: reads one HTML file, or every .html/.HTML file under a directory, and writes the
# result to `fileout` using MyHTMLParser.
#   txtdir       -- path to a directory (walked recursively) or to a single file
#   fileout      -- path of the output file, opened in 'w' mode
#   encodage_in  -- encoding passed to codecs.open when reading each input file
#   encodage_out -- not used on any line visible in this view (TODO confirm against the full file)
# NOTE(review): several original lines are missing here (145, 149-152, 154-155, 158-160, 164+),
# including the file-list handling, the loop over input files and the call that feeds the parser.
144 def ParseEuropress(txtdir, fileout, encodage_in, encodage_out) :
146 if os.path.isdir(txtdir) :
147 for root, subfolders, subfiles in os.walk(txtdir) :
# Full paths of the files in this directory whose last dot-separated part is 'html' or 'HTML'.
148 nf = [os.path.join(root, f) for f in subfiles if f.split('.')[-1] in ['html', 'HTML'] ]
# Single-file case; its body is on lines not visible in this view.
153 elif os.path.isfile(txtdir) :
156 parser = MyHTMLParser()
157 with open(fileout,'w') as outf :
# Read one input file as text decoded with encodage_in ('rU' mode), then replace HTML character
# references in it via HTMLParser().unescape before parsing.
161 with codecs.open(f, 'rU', encodage_in) as infile :
162 content = infile.read()
163 content = HTMLParser().unescape(content)
168 #ParseEuropress('/home/pierre/fac/HDR/psychanalyse',
169 # '/home/pierre/fac/HDR/psycha.txt', 'utf8', 'utf8')