www.iramuteq.org Git - iramuteq/blob - autres/parse_factiva_html.py

   1 # -*- coding: utf-8 -*-
   2 #Author: Pierre Ratinaud
   3 #Copyright (c) 2008-2020 Pierre Ratinaud
   4 #License: GNU/GPL
   5
   6 #------------------------------------
   7 # import des modules python
   8 #------------------------------------
   9 import os
  10 import codecs
  11 import sys
  12 import re
  13
  14 #------------------------------------
  15 # import des modules wx
  16 #------------------------------------
  17 import wx
  18 import wx.lib.sized_controls as sc
  19 import wx.lib.filebrowsebutton as filebrowse
  20
  21 #------------------------------------
  22 # import des fichiers du projet
  23 #------------------------------------
  24 from html.parser import HTMLParser
  25
  26
# Directory whose entries are listed with os.listdir() and read by the
# script section of this file. The path is relative, so it is resolved
# against the current working directory of the process.
htmldir = 'dev/factiva_html'
  28
  29
  30 class MyHTMLParser(HTMLParser):
  31
  32     def __init__(self) :
  33         HTMLParser.__init__(self)
  34         self.recording = 0
  35         self.data = []
  36         self.need = True
  37         self.author = False
  38         self.start = False
  39         self.text = False
  40         self.count = 0
  41
  42     def handle_starttag(self, tag, attrs):
  43         self.need=True
  44         if tag not in ['div', 'p', 'b'] :
  45             self.need=False
  46             self.text = False
  47             return
  48         else :
  49             print(attrs)
  50             self.need = True
  51             if tag == 'div' :
  52                 if attrs != [] :
  53                     tagtype = attrs[0][0]
  54                     tagname = attrs[0][1].split()
  55                     if tagtype == 'class' and tagname[0] == 'article' :
  56                         self.author = False
  57                         self.start = True
  58                         self.count = 0
  59                         self.data.append([])
  60                     elif tagtype == 'class' and tagname[0] == 'author' :
  61                         self.author = True
  62             if tag == 'p' :
  63                 if attrs != [] :
  64                     tagtype = attrs[0][0]
  65                     tagname = attrs[0][1].split()
  66                     if tagtype == 'class' and tagname[0] == 'articleParagraph' :
  67                         self.text = True
  68             if tag == 'b' :
  69                 self.text = True
  70             return
  71
  72     def handle_data(self, data) :
  73         #print data.encode('utf-8')
  74         if self.need :
  75             #print data.encode('utf-8')
  76             if self.start :
  77                 pass
  78                 #print 'data', data.encode('utf8')
  79             if self.author :
  80                 if self.count < 7 :
  81                     self.data[-1].append(data)
  82                     self.count += 1
  83                 else :
  84                     self.author = False
  85             elif self.text :
  86                 if self.count == 2 and not self.author :
  87                     self.data[-1].append('PAS DAUTEUR')
  88                     self.count += 1
  89                     self.data[-1].append(data)
  90                 else :
  91                     self.count += 1
  92                     self.data[-1].append(data)
  93     #    print "Encountered a start tag:", tag
  94     #def handle_endtag(self, tag):
  95     #    print "Encountered an end tag :", tag
  96     #def handle_data(self, data):
  97     #    print "Encountered some data  :", data
  98
  99 # execution en direct ???
 100 files = os.listdir(htmldir)
 101 parser = MyHTMLParser()
 102 for f in files :
 103     f= os.path.join(htmldir, f)
 104     with codecs.open(f, 'r', 'utf8') as infile :
 105         content = infile.read()
 106 parser.feed(content)
 107 out = [[' '.join(['****','*date_'+art[4].replace(' ','_'),'*s_'+art[5].replace(' ','_')]), ' '.join(art[10:len(art)-1])] for art in parser.data]
 108 for i in range(0,8):
 109     print(parser.data[i])
 110 out = [' '.join(art) for art in out]
 111 print('\n\n\n'.join(out))