www.iramuteq.org Git - iramuteq/blob - parse_dmi.py

   1 # -*- coding: utf-8 -*-
   2 #Author: Pierre Ratinaud
   3 #Copyright (c) 2008-2020 Pierre Ratinaud
   4 #modification pour python 3 : Laurent Mérat, 6x7 - mai 2020
   5 #License: GNU/GPL
   6
   7 #appel seulement par iramuteq.py : from parse_dmi import ImportDMI
   8
   9 #------------------------------------
  10 # import des modules python
  11 #------------------------------------
  12 import csv, codecs, io
  13 import itertools
  14 import os
  15
  16 import langue
  17 langue.run()
  18
  19 #------------------------------------
  20 # import des modules wx
  21 #------------------------------------
  22 import wx
  23
  24 #------------------------------------
  25 # import des fichiers du projet
  26 #------------------------------------
  27 from parse_factiva_xml import PrefImport
  28 from functions import BugReport
  29
  30
  31 class UTF8Recoder:
  32     """
  33     Iterator that reads an encoded stream and reencodes the input to UTF-8
  34     """
  35
  36     def __init__(self, f, encoding):
  37         self.reader = codecs.getreader(encoding)(f)
  38
  39     def __iter__(self):
  40         return self
  41
  42     def __next__(self):
  43         return self.reader.next() #.encode("utf-8")
  44
  45
  46 class UnicodeReader:
  47     """
  48     A CSV reader which will iterate over lines in the CSV file "f",
  49     which is encoded in the given encoding.
  50     """
  51
  52     def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
  53         f = UTF8Recoder(f, encoding)
  54         self.reader = csv.reader(f, dialect=dialect, **kwds)
  55
  56     def __next__(self):
  57         row = next(self.reader)
  58         return [str(s, "utf-8") for s in row]
  59
  60     def __iter__(self):
  61         return self
  62
  63 class UnicodeWriter:
  64     """
  65     A CSV writer which will write rows to CSV file "f",
  66     which is encoded in the given encoding.
  67     """
  68
  69     def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
  70         # Redirect output to a queue
  71         self.queue = io.StringIO()
  72         self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
  73         self.stream = f
  74         self.encoder = codecs.getincrementalencoder(encoding)()
  75
  76     def writerow(self, row):
  77         self.writer.writerow([s for s in row])
  78         # Fetch UTF-8 output from the queue ...
  79         data = self.queue.getvalue()
  80         data = data.decode("utf-8")
  81         # ... and reencode it into the target encoding
  82         data = self.encoder.encode(data)
  83         # write to the target stream
  84         self.stream.write(data)
  85         # empty queue
  86         self.queue.truncate(0)
  87
  88     def writerows(self, rows):
  89         for row in rows:
  90             self.writerow(row)
  91
  92
  93 class ParseDMI :
  94
  95     def __init__(self, filein, fileout, encodeout ='utf8', onlyrt = True, cleanurl = True, cleanRT = True, cleanAt = True, lang= 'es'):
  96         self.outf = open(fileout, 'w')
  97         self.encodeout = encodeout
  98         with open(filein, 'r') as f:
  99             reader = UnicodeReader(f)
 100             linenb = 0
 101             for row in reader:
 102                 if linenb == 0 :
 103                     first = row
 104                     self.create_dateid = first.index('created_at')
 105                     textid = first.index('text')
 106                     langid = first.index('lang')
 107                 else :
 108                     text = row[textid]
 109                     text = self.washtweet(text)
 110                     isrt = self.isRT(text)
 111                     if cleanurl :
 112                         text = self.cleanurl(text)
 113                     if cleanRT :
 114                         text = self.cleanRT(text)
 115                     if cleanAt :
 116                         text = self.cleanAt(text)
 117                     if onlyrt and not isrt :
 118                         if lang == 'all' :
 119                             self.write_tweet(row, text)
 120                         elif row[langid] == lang :
 121                             self.write_tweet(row, text)
 122                     if not onlyrt :
 123                         if lang == 'all' :
 124                             self.write_tweet(row, text)
 125                         elif row[langid] == lang :
 126                             self.write_tweet(row, text)
 127                 linenb += 1
 128         self.outf.close()
 129
 130     def write_tweet(self, row, text):
 131         meta = self.makemetadata(row, {'date' : self.create_dateid})
 132         self.outf.write('\n'.join([meta, text, '']))
 133
 134     def makemetadata(self, row, parametres = {}):
 135         line = ['****']
 136         for val in parametres :
 137             if val == 'date' :
 138                 line.append('_'.join(['*date', row[parametres[val]].split()[0]]))
 139             else :
 140                 line.append('_'.join([val,row[parametres[val]]]))
 141         return ' '.join(line)
 142
 143     def washtweet(self, text) :
 144         text = text.replace('RT“', 'RT ')
 145         text = text.replace('*', ' ')
 146         for val in '”«»“"' :
 147             text = text.replace(val, ' " ')
 148         text.strip()
 149         return text
 150
 151     def isRT(self, tweet):
 152         if tweet.startswith('RT ') :
 153             return True
 154         else :
 155             return False
 156
 157     def cleanurl(self, tweet) :
 158         return ' '.join([word for word in tweet.split() if not word.startswith('http')])
 159
 160     def cleanAt(self, tweet) :
 161         return ' '.join([word for word in tweet.split() if not word.startswith('@')])
 162
 163     def cleanRT(self, text) :
 164         text = ''.join([' ',text, ' '])
 165         text.replace('rt','_rt_')
 166         text = text.replace('RT', '_rt_')
 167         text.strip()
 168         # ???
 169         #tweet = text.split()
 170         #tovire = [[i, i+1] for i, word in enumerate(tweet) if word == 'RT' and i!=len(tweet) - 1]
 171         #tovire = itertools.chain(*tovire)
 172         #text = ' '.join([word for i, word in enumerate(tweet) if i not in tovire])
 173         return text
 174
 175
 176 class ImportDMI :
 177
 178     def __init__(self, parent, parametres):
 179         self.ira = parent
 180         self.parametres = parametres
 181         self.parse()
 182
 183     def parse(self):
 184         self.dial =  PrefImport(self.ira, methode='dmi')
 185         val = self.dial.ShowModal()
 186         if val == wx.ID_OK :
 187             csvfile = self.dial.dbb.GetValue()
 188             corp_out = self.dial.fbb.GetValue()
 189             nort = self.dial.paneldmi.check_removeR_rt.GetValue()
 190             remove_url = self.dial.paneldmi.check_remove_url.GetValue()
 191             remove_mention = self.dial.paneldmi.check_remove_mention.GetValue()
 192             remove_rt_in_tweets = self.dial.paneldmi.check_remove_rt_in_tweets.GetValue()
 193             self.dial.Destroy()
 194             busy = wx.BusyInfo(_("Please wait..."))
 195             wx.SafeYield()
 196             try :
 197                 ParseDMI(csvfile, corp_out, 'utf8', onlyrt=nort, cleanurl=remove_url, cleanAt=remove_mention, cleanRT=remove_rt_in_tweets)
 198                 del busy
 199                 msg = '\n'.join([_("Corpus created :"), corp_out, _("Do you want to open it in IRaMuTeQ ?")])
 200                 dlg = wx.MessageDialog(self.ira, msg, _('Information'), wx.YES_NO | wx.ICON_INFORMATION | wx.STAY_ON_TOP)
 201                 dlg.CenterOnParent()
 202                 val = dlg.ShowModal()
 203                 if val == wx.ID_YES :
 204                     dlg.Destroy()
 205                     self.ira.filename = os.path.abspath(corp_out)
 206                     self.ira.OpenText()
 207                 else :
 208                     dlg.Destroy()
 209             except :
 210                 del busy
 211                 BugReport(self.ira)
 212         else :
 213             self.dial.Destroy()
 214