2 # -*- coding: utf-8 -*-
3 #Author: Pierre Ratinaud
4 #Copyright (c) 2012 Pierre Ratinaud
7 from HTMLParser import HTMLParser
8 import wx.lib.sized_controls as sc
9 import wx.lib.filebrowsebutton as filebrowse
# Relative path of the directory whose entries are listed and opened by the
# script part of this file (presumably Factiva HTML exports, judging by the
# directory name -- TODO confirm).
17 htmldir = 'dev/factiva_html'
# HTMLParser subclass that accumulates text into self.data: handle_data
# appends the received text to the last element of self.data.
# NOTE(review): this excerpt has gaps (the embedded original line numbers
# skip), so several assignments and the conditions guarding some statements
# are not visible here; the comments below only describe what is shown.
20 class MyHTMLParser(HTMLParser):
# Initialise the base-class parser state (the enclosing method header is not
# visible in this excerpt -- presumably __init__).
22 HTMLParser.__init__(self)
# Callback invoked by HTMLParser for every opening tag; attrs is the list of
# (attribute name, attribute value) pairs of that tag.
31 def handle_starttag(self, tag, attrs):
# Tags other than 'div', 'p' and 'b' are singled out here; what is done with
# them is not visible in this excerpt (presumably they are skipped).
33 if tag not in ['div', 'p', 'b'] :
# Split the value of the first attribute on whitespace; tagname[0] is then
# its first token.
43 tagname = attrs[0][1].split()
# tagtype is assigned on a line not visible here (presumably the name of the
# first attribute, attrs[0][0] -- TODO confirm). This branch matches a first
# attribute value whose first token is 'article'.
44 if tagtype == 'class' and tagname[0] == 'article' :
# Same test for a first token equal to 'author'.
49 elif tagtype == 'class' and tagname[0] == 'author' :
# Same split as above, applied in a later part of the method.
54 tagname = attrs[0][1].split()
# Matches a first attribute value whose first token is 'articleParagraph'.
55 if tagtype == 'class' and tagname[0] == 'articleParagraph' :
# Callback invoked by HTMLParser with the text found between tags.
61 def handle_data(self, data) :
62 #print data.encode('utf-8')
64 #print data.encode('utf-8')
67 #print 'data', data.encode('utf8')
# Append the text to the last element of self.data (the guarding condition is
# not visible in this excerpt).
70 self.data[-1].append(data)
# When self.count is 2 and self.author is falsy, insert the placeholder
# 'PAS DAUTEUR' (French, roughly "no author") before the text is appended.
# NOTE(review): the meaning of self.count is not visible here -- confirm.
75 if self.count == 2 and not self.author :
76 self.data[-1].append('PAS DAUTEUR')
78 self.data[-1].append(data)
# Another append of the text under a condition not visible in this excerpt.
81 self.data[-1].append(data)
84 # print "Encountered a start tag:", tag
85 #def handle_endtag(self, tag):
86 # print "Encountered an end tag :", tag
87 #def handle_data(self, data):
88 # print "Encountered some data :", data
# Script part: list the entries of htmldir, read them, and print the content
# collected in parser.data as formatted text.
# NOTE(review): this excerpt has gaps (the embedded original line numbers
# skip); the loop that binds f, the call that feeds the parser and the body
# of the range(0,8) loop are not visible here.
91 files = os.listdir(htmldir)
# A single parser instance is created at this point.
93 parser = MyHTMLParser()
# Turn the entry name into a path inside htmldir (f is presumably bound by a
# loop over files -- TODO confirm).
95 f= os.path.join(htmldir, f)
# Read the whole file, decoded as UTF-8.
96 with codecs.open(f, 'r', 'utf8') as infile :
97 content = infile.read()
# For every item art of parser.data build a two-element list:
#   [0] a header '**** *date_<art[4]> *s_<art[5]>' in which the spaces inside
#       art[4] and art[5] are replaced by underscores;
#   [1] the elements art[10] up to, but not including, the last element of
#       art, joined with single spaces.
# NOTE(review): art[4] / art[5] are presumably the date and the source of the
# article, judging by the '*date_' and '*s_' prefixes -- confirm.
100 out = [[' '.join(['****','*date_'+art[4].replace(' ','_'),'*s_'+art[5].replace(' ','_')]), ' '.join(art[10:len(art)-1])] for art in parser.data]
# Loop over the integers 0..7; its body is not visible in this excerpt.
102 for i in range(0,8) :
# Merge header and text of each item into one string separated by a space.
105 out = [' '.join(art) for art in out]
# Print all items separated by three newlines, encoded as UTF-8 bytes
# (Python 2 print statement).
106 print '\n\n\n'.join(out).encode('utf8')