1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2008-2020 Pierre Ratinaud
4 #modification pour python 3 : Laurent Mérat, 6x7 - mai 2020
7 #appel seulement par iramuteq.py : from parse_dmi import ImportDMI
9 #------------------------------------
10 # import des modules python
11 #------------------------------------
12 import csv, codecs, io
19 #------------------------------------
20 # import des modules wx
21 #------------------------------------
24 #------------------------------------
25 # import des fichiers du projet
26 #------------------------------------
27 from parse_factiva_xml import PrefImport
28 from functions import BugReport
33 Iterator that reads an encoded stream and reencodes the input to UTF-8
def __init__(self, f, encoding):
    """Wrap the byte stream *f* in a stream reader that decodes *encoding*."""
    # codecs.getreader returns the StreamReader factory for the codec.
    make_reader = codecs.getreader(encoding)
    self.reader = make_reader(f)
# Hand back the next item produced by the wrapped stream reader.
# NOTE(review): the enclosing method header is not visible in this excerpt.
# Python 3 iterators expose __next__ rather than .next(); confirm that
# self.reader (built with codecs.getreader) still provides .next().
43 return self.reader.next() #.encode("utf-8")
48 A CSV reader which will iterate over lines in the CSV file "f",
49 which is encoded in the given encoding.
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
    """Build a csv reader over *f* after routing it through UTF8Recoder.

    *dialect* and any extra keyword arguments are forwarded to csv.reader.
    """
    recoded_stream = UTF8Recoder(f, encoding)
    self.reader = csv.reader(recoded_stream, dialect=dialect, **kwds)
# Pull the next parsed row from the underlying csv reader.
# NOTE(review): the enclosing method header is not visible in this excerpt.
57 row = next(self.reader)
# NOTE(review): str(s, "utf-8") only decodes bytes-like input; under Python 3
# csv.reader yields str fields - confirm this conversion is still needed.
58 return [str(s, "utf-8") for s in row]
65 A CSV writer which will write rows to CSV file "f",
66 which is encoded in the given encoding.
# Prepare an in-memory csv writer and an incremental encoder for *encoding*.
# dialect and **kwds are forwarded to csv.writer.
# NOTE(review): f is not used in the lines visible here; presumably it is
# stored as the target stream (writerow writes to self.stream) - confirm.
69 def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
70 # Redirect output to a queue
71 self.queue = io.StringIO()
72 self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
# Stateful encoder used by writerow to convert the queued text.
74 self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
    """Serialise one row as CSV and write it, encoded, to the target stream.

    The row is rendered into the in-memory text queue by the csv writer,
    encoded with the incremental encoder, and written to ``self.stream``.
    The queue is emptied afterwards so the next call starts from a clean
    buffer.

    row -- iterable of field values for one CSV record.
    """
    self.writer.writerow(list(row))
    # The queue is an io.StringIO, so getvalue() already returns text:
    # there is nothing to decode before handing it to the encoder
    # (calling .decode() on a str raises AttributeError in Python 3).
    data = self.queue.getvalue()
    # Re-encode the text into the target encoding.
    data = self.encoder.encode(data)
    # Write to the target stream.
    self.stream.write(data)
    # Empty the queue. truncate() does not move the stream position, so
    # rewind first; otherwise the next row would be preceded by NUL padding.
    self.queue.seek(0)
    self.queue.truncate(0)
# Write several rows. The body is not visible in this excerpt; presumably it
# forwards each element of rows to writerow - confirm.
88 def writerows(self, rows):
# Read the CSV file *filein* and write the selected tweets to *fileout*.
# Several lines of this method are not visible in this excerpt, so the notes
# below only cover what the remaining lines show.
#   filein    -- path of the CSV file to read
#   fileout   -- path of the text file to create
#   encodeout -- stored on the instance; not passed to open() in visible lines
#   onlyrt    -- combined with the isRT() result to choose the writing branch
#   cleanurl, cleanRT, cleanAt -- presumably guard the matching cleaning
#                calls below (the guards themselves are not visible) - confirm
#   lang      -- compared against the row's 'lang' column
95 def __init__(self, filein, fileout, encodeout ='utf8', onlyrt = True, cleanurl = True, cleanRT = True, cleanAt = True, lang= 'es'):
# NOTE(review): both files are opened without an explicit encoding, and no
# close of self.outf is visible in this excerpt.
96 self.outf = open(fileout, 'w')
97 self.encodeout = encodeout
98 with open(filein, 'r') as f:
99 reader = UnicodeReader(f)
# Locate the columns of interest by name; `first` is presumably the header
# row read from the reader (its assignment is not visible) - confirm.
104 self.create_dateid = first.index('created_at')
105 textid = first.index('text')
106 langid = first.index('lang')
# Normalise the tweet text, then record whether it starts as a retweet.
109 text = self.washtweet(text)
110 isrt = self.isRT(text)
# Cleaning steps: URLs, retweet markers, then @-mentions.
112 text = self.cleanurl(text)
114 text = self.cleanRT(text)
116 text = self.cleanAt(text)
# Writing branches; the intermediate conditions are not visible here. In the
# visible branches a tweet is written when its 'lang' column equals lang.
117 if onlyrt and not isrt :
119 self.write_tweet(row, text)
120 elif row[langid] == lang :
121 self.write_tweet(row, text)
124 self.write_tweet(row, text)
125 elif row[langid] == lang :
126 self.write_tweet(row, text)
def write_tweet(self, row, text):
    """Append one tweet to the output file: a metadata line, then the text."""
    header = self.makemetadata(row, {'date' : self.create_dateid})
    # The trailing empty element makes the record end with a newline.
    record = '\n'.join((header, text, ''))
    self.outf.write(record)
# Build the space-separated metadata line for one CSV row.
#   row        -- sequence of cell values for the tweet
#   parametres -- maps a variable name to the index of its column in row
# Returns the collected items joined with single spaces.
# NOTE(review): the initialisation of `line` and the branch conditions are not
# visible in this excerpt. The default for parametres is a mutable dict; the
# visible lines only read it.
134 def makemetadata(self, row, parametres = {}):
136 for val in parametres :
# Date form: keep only the first whitespace-separated token of the cell and
# emit it as '*date_<token>'.
138 line.append('_'.join(['*date', row[parametres[val]].split()[0]]))
# Generic form: '<name>_<cell value>'.
140 line.append('_'.join([val,row[parametres[val]]]))
141 return ' '.join(line)
# Normalise a few characters in the raw tweet text.
# NOTE(review): the loop that defines `val` and the return statement are not
# visible in this excerpt.
143 def washtweet(self, text) :
# Separate an 'RT' marker from an opening curly quote glued to it.
144 text = text.replace('RT“', 'RT ')
# Replace asterisks with spaces.
145 text = text.replace('*', ' ')
# Replace each `val` with a straight double quote surrounded by spaces.
147 text = text.replace(val, ' " ')
# Test whether the tweet text begins with the 'RT ' marker.
# NOTE(review): the statements executed in each branch are not visible in this
# excerpt; the caller stores the result as a flag named isrt.
151 def isRT(self, tweet):
152 if tweet.startswith('RT ') :
def cleanurl(self, tweet):
    """Return *tweet* without the words that start with 'http'.

    Words are split on whitespace and re-joined with single spaces.
    """
    kept = []
    for token in tweet.split():
        if token.startswith('http'):
            continue
        kept.append(token)
    return ' '.join(kept)
def cleanAt(self, tweet):
    """Return *tweet* without the words that start with '@'.

    Words are split on whitespace and re-joined with single spaces.
    """
    words = tweet.split()
    return ' '.join(filter(lambda word: not word.startswith('@'), words))
# Rewrite retweet markers inside the tweet text as '_rt_'.
# NOTE(review): some lines of this method, including its return, are not
# visible in this excerpt.
163 def cleanRT(self, text) :
# Pad the text with one space on each side.
164 text = ''.join([' ',text, ' '])
# NOTE(review): str.replace returns a new string and the result is discarded
# here, so lowercase 'rt' is left untouched - confirm whether that is intended.
165 text.replace('rt','_rt_')
# Every occurrence of uppercase 'RT', including inside longer words, becomes '_rt_'.
166 text = text.replace('RT', '_rt_')
# Disabled earlier approach: drop each 'RT' token (unless it is the last word)
# together with the word that follows it.
169 #tweet = text.split()
170 #tovire = [[i, i+1] for i, word in enumerate(tweet) if word == 'RT' and i!=len(tweet) - 1]
171 #tovire = itertools.chain(*tovire)
172 #text = ' '.join([word for i, word in enumerate(tweet) if i not in tovire])
# Run the DMI import from the GUI: ask for the files and options in a dialog,
# build the corpus with ParseDMI, then offer to open the result.
# NOTE(review): several lines are not visible in this excerpt, including the
# assignment of self.ira and the checks on the dialog results. wx, os and _
# are used here but not imported in the visible part of the file.
178 def __init__(self, parent, parametres):
180 self.parametres = parametres
# Preferences dialog configured for the 'dmi' import method.
184 self.dial = PrefImport(self.ira, methode='dmi')
185 val = self.dial.ShowModal()
# Input CSV path and output corpus path chosen in the dialog.
187 csvfile = self.dial.dbb.GetValue()
188 corp_out = self.dial.fbb.GetValue()
# State of the cleaning checkboxes on the DMI panel.
189 nort = self.dial.paneldmi.check_removeR_rt.GetValue()
190 remove_url = self.dial.paneldmi.check_remove_url.GetValue()
191 remove_mention = self.dial.paneldmi.check_remove_mention.GetValue()
192 remove_rt_in_tweets = self.dial.paneldmi.check_remove_rt_in_tweets.GetValue()
# Show a busy indicator while the conversion runs.
194 busy = wx.BusyInfo(_("Please wait..."))
# lang is not passed, so ParseDMI falls back to its default 'es'.
197 ParseDMI(csvfile, corp_out, 'utf8', onlyrt=nort, cleanurl=remove_url, cleanAt=remove_mention, cleanRT=remove_rt_in_tweets)
# Report the created corpus and ask whether to open it.
199 msg = '\n'.join([_("Corpus created :"), corp_out, _("Do you want to open it in IRaMuTeQ ?")])
200 dlg = wx.MessageDialog(self.ira, msg, _('Information'), wx.YES_NO | wx.ICON_INFORMATION | wx.STAY_ON_TOP)
202 val = dlg.ShowModal()
203 if val == wx.ID_YES :
# Store the absolute path of the new corpus on the main frame.
205 self.ira.filename = os.path.abspath(corp_out)