www.iramuteq.org Git - iramuteq/blob - parse_factiva_html.py

   1 #!/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #Author: Pierre Ratinaud
   4 #Copyright (c) 2012 Pierre Ratinaud
   5 #License: GNU/GPL
   6
   7 from HTMLParser import HTMLParser
   8 import wx.lib.sized_controls as sc
   9 import wx.lib.filebrowsebutton as filebrowse
  10 import os
  11 import codecs
  12 import sys
  13 import re
  14 import wx
  15
  16
# Directory listed by the script below to find the Factiva HTML files to
# parse; being a relative path, it is resolved against the current working
# directory.
htmldir = 'dev/factiva_html'
  18
  19
  20 class MyHTMLParser(HTMLParser):
  21     def __init__(self) :
  22         HTMLParser.__init__(self)
  23         self.recording = 0
  24         self.data = []
  25         self.need = True
  26         self.author = False
  27         self.start = False
  28         self.text = False
  29         self.count = 0
  30
  31     def handle_starttag(self, tag, attrs):
  32         self.need=True
  33         if tag not in ['div', 'p', 'b'] :
  34             self.need=False
  35             self.text = False
  36             return
  37         else :
  38             print attrs
  39             self.need = True
  40             if tag == 'div' :
  41                 if attrs != [] :
  42                     tagtype = attrs[0][0]
  43                     tagname = attrs[0][1].split()
  44                     if tagtype == 'class' and tagname[0] == 'article' :
  45                         self.author = False
  46                         self.start = True
  47                         self.count = 0
  48                         self.data.append([])
  49                     elif tagtype == 'class' and tagname[0] == 'author' :
  50                         self.author = True
  51             if tag == 'p' :
  52                 if attrs != [] :
  53                     tagtype = attrs[0][0]
  54                     tagname = attrs[0][1].split()
  55                     if tagtype == 'class' and tagname[0] == 'articleParagraph' :
  56                         self.text = True
  57             if tag == 'b' :
  58                 self.text = True
  59             return
  60
  61     def handle_data(self, data) :
  62         #print data.encode('utf-8')
  63         if self.need :
  64             #print data.encode('utf-8')
  65             if self.start :
  66                 pass
  67                 #print 'data', data.encode('utf8')
  68             if self.author :
  69                 if self.count < 7 :
  70                     self.data[-1].append(data)
  71                     self.count += 1
  72                 else :
  73                     self.author = False
  74             elif self.text :
  75                 if self.count == 2 and not self.author :
  76                     self.data[-1].append('PAS DAUTEUR')
  77                     self.count += 1
  78                     self.data[-1].append(data)
  79                 else :
  80                     self.count += 1
  81                     self.data[-1].append(data)
  82
  83
  84     #    print "Encountered a start tag:", tag
  85     #def handle_endtag(self, tag):
  86     #    print "Encountered an end tag :", tag
  87     #def handle_data(self, data):
  88     #    print "Encountered some data  :", data
  89
  90
  91 files = os.listdir(htmldir)
  92
  93 parser = MyHTMLParser()
  94 for f in files :
  95     f= os.path.join(htmldir, f)
  96     with codecs.open(f, 'r', 'utf8') as infile :
  97         content = infile.read()
  98 parser.feed(content)
  99
 100 out = [[' '.join(['****','*date_'+art[4].replace(' ','_'),'*s_'+art[5].replace(' ','_')]), ' '.join(art[10:len(art)-1])] for art in parser.data]
 101
 102 for i in range(0,8) :
 103     print parser.data[i]
 104
 105 out = [' '.join(art) for art in out]
 106 print '\n\n\n'.join(out).encode('utf8')