2 # -*- coding: utf-8 -*-
3 #Author: Pierre Ratinaud
4 #Copyright (c) 2014, Pierre Ratinaud
7 import csv, codecs, cStringIO
9 from parse_factiva_xml import PrefImport
12 from functions import BugReport
14 #filein = '/home/pierre/workspace/iramuteq/dev/dmi-tcat/travail_dmi.csv'
15 #fileout = '/home/pierre/workspace/iramuteq/dev/dmi-tcat/corpus.txt'
19 Iterator that reads an encoded stream and reencodes the input to UTF-8
# Constructor: wraps the byte stream `f` in the codecs stream reader for
# `encoding` and stores it as self.reader.
21 def __init__(self, f, encoding):
22 self.reader = codecs.getreader(encoding)(f)
# NOTE(review): the embedded numbering jumps from 22 to 28, so the header of
# the method that owns the statement below is not visible in this view.
# The statement pulls the next item from self.reader (Python 2 `.next()`
# protocol) and returns it re-encoded as UTF-8.
28 return self.reader.next().encode("utf-8")
32 A CSV reader which will iterate over lines in the CSV file "f",
33 which is encoded in the given encoding.
# Constructor: the input stream `f` is first wrapped in UTF8Recoder (built with
# the source `encoding`), then handed to csv.reader together with the dialect
# and any extra keyword options.
36 def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
37 f = UTF8Recoder(f, encoding)
38 self.reader = csv.reader(f, dialect=dialect, **kwds)
# NOTE(review): numbering jumps from 38 to 41; the header of the method that
# owns the two statements below is not visible in this view.
# They fetch the next CSV row and decode every cell from UTF-8 into a
# Python 2 `unicode` object.
41 row = self.reader.next()
42 return [unicode(s, "utf-8") for s in row]
49 A CSV writer which will write rows to CSV file "f",
50 which is encoded in the given encoding.
# Constructor: rows are written by csv.writer into an in-memory cStringIO
# buffer (self.queue); an incremental encoder for the target `encoding` is
# kept in self.encoder.
53 def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
54 # Redirect output to a queue
55 self.queue = cStringIO.StringIO()
56 self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
# NOTE(review): embedded line 57 is not visible here. writerow() writes to
# self.stream, and `f` is otherwise unused in the visible lines -- confirm
# that the missing line stores `f` as self.stream.
58 self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
    """Write one row of unicode cells to the target stream.

    The cells are encoded to UTF-8 and formatted by the csv writer into the
    in-memory queue; the queued text is then re-encoded with the target
    encoder, written to ``self.stream``, and the queue is reset for the
    next row.
    """
    utf8_cells = []
    for cell in row:
        utf8_cells.append(cell.encode("utf-8"))
    self.writer.writerow(utf8_cells)

    # Take the UTF-8 CSV text out of the queue and convert it to the
    # target encoding before it reaches the real stream.
    buffered = self.queue.getvalue().decode("utf-8")
    self.stream.write(self.encoder.encode(buffered))

    # Reset the queue so the next row starts from an empty buffer.
    self.queue.truncate(0)
72 def writerows(self, rows):
# Constructor: reads the CSV file `filein` through UnicodeReader and writes the
# selected, cleaned tweets to `fileout` via write_tweet().
#   encodeout -- encoding applied by write_tweet() to the text it writes.
#   onlyrt, cleanurl, cleanRT, cleanAt, lang -- processing options; only the
#       uses of `onlyrt` and `lang` are visible in this view (see notes below).
# NOTE(review): the embedded numbering has several gaps inside this method, so
# the loop and the conditions guarding some statements are not visible.
77 def __init__(self, filein, fileout, encodeout ='utf8', onlyrt = True, cleanurl = True, cleanRT = True, cleanAt = True, lang= 'es'):
# Output handle kept on the instance; write_tweet() writes to it.
# NOTE(review): no close() of self.outf appears in the visible lines --
# confirm the file is closed once parsing is done.
78 self.outf = open(fileout, 'w')
79 self.encodeout = encodeout
80 with open(filein, 'rb') as f:
81 reader = UnicodeReader(f)
# NOTE(review): lines 82-85 are not visible; `first` is defined there
# (presumably the CSV header row -- confirm).
# Column positions are looked up by name; a missing column makes
# list.index() raise ValueError.
86 self.create_dateid = first.index('created_at')
87 textid = first.index('text')
88 langid = first.index('lang')
# NOTE(review): lines 89-90 are not visible; `row` and `text` come from there.
# The retweet test runs on the washed text, before the cleaning steps below.
91 text = self.washtweet(text)
92 isrt = self.isRT(text)
# NOTE(review): lines 93, 95 and 97 are not visible; presumably each cleaning
# call is guarded by the constructor flag of the same name -- confirm.
94 text = self.cleanurl(text)
96 text = self.cleanRT(text)
98 text = self.cleanAt(text)
# NOTE(review): despite its name, a true `onlyrt` selects tweets that are NOT
# retweets in this condition.
99 if onlyrt and not isrt :
# NOTE(review): line 100 (the `if` matching the `elif` below) is not visible.
101 self.write_tweet(row, text)
# Written only when the row's `lang` column equals the requested language.
102 elif row[langid] == lang :
103 self.write_tweet(row, text)
# NOTE(review): lines 104-105 are not visible; the statements below appear to
# repeat the same language selection under another condition.
106 self.write_tweet(row, text)
107 elif row[langid] == lang :
108 self.write_tweet(row, text)
def write_tweet(self, row, text):
    """Write one tweet to the output file.

    The record is the metadata line built by ``makemetadata`` from the
    row's creation-date column, then ``text``, then a trailing newline;
    it is encoded with ``self.encodeout`` before being written.
    """
    date_column = {'date' : self.create_dateid}
    header = self.makemetadata(row, date_column)
    record = '\n'.join((header, text, ''))
    self.outf.write(record.encode(self.encodeout))
# Builds a metadata string for one CSV row. `parametres` maps a variable name
# to the index of the row column that holds its value.
# NOTE(review): `parametres = {}` is a mutable default argument; it is only
# read in the visible lines.
116 def makemetadata(self, row, parametres = {}):
# NOTE(review): lines 117, 119 and 121 are not visible. `line` is initialised
# there, and the two appends below are presumably alternative branches
# (date vs. any other variable) -- confirm.
118 for val in parametres :
# Date form: only the first whitespace-separated token of the cell is kept and
# prefixed with "*date_".
120 line.append('_'.join([u'*date', row[parametres[val]].split()[0]]))
# Generic form: "<val>_<cell>".
122 line.append('_'.join([val,row[parametres[val]]]))
# The collected items are joined with single spaces.
123 return ' '.join(line)
# Normalises the raw tweet text before any other processing.
# NOTE(review): the return statement (line 130 or later) is not visible here;
# the caller uses the result as the washed text.
125 def washtweet(self, text) :
# "RT" glued to an opening curly quote becomes "RT " followed by a space.
126 text = text.replace(u'RT“', u'RT ')
# Asterisks are blanked out; presumably because '*' prefixes metadata
# variables in the output (see u'*date' in makemetadata) -- confirm.
127 text = text.replace(u'*', ' ')
# Every curly, angle or straight double quote becomes a straight double quote
# surrounded by spaces.
128 for val in u'”«»“"' :
129 text = text.replace(val, ' " ')
# Retweet test: checks whether the tweet text begins with the marker "RT "
# (upper case, followed by a space).
# NOTE(review): the branch bodies (lines 135-137) are not visible here; the
# caller uses the result as a boolean.
133 def isRT(self, tweet):
134 if tweet.startswith('RT ') :
def cleanurl(self, tweet):
    """Return ``tweet`` without the words that start with ``http``.

    The text is split on whitespace, so the surviving words come back
    separated by single spaces.
    """
    kept = []
    for token in tweet.split():
        if token.startswith('http'):
            continue
        kept.append(token)
    return ' '.join(kept)
def cleanAt(self, tweet):
    """Return ``tweet`` without the words that start with ``@`` (mentions).

    The text is split on whitespace and the remaining words are joined
    back with single spaces.
    """
    words = tweet.split()
    without_mentions = (word for word in words if not word.startswith('@'))
    return ' '.join(without_mentions)
# Handles the "RT" markers found inside the tweet text.
# NOTE(review): line 149 and the return statement are not visible here, so what
# finally happens to the "_rt_" placeholder cannot be told from this view.
145 def cleanRT(self, text) :
# Pad the text with one space on each side.
146 text = ''.join([' ',text, ' '])
# NOTE(review): the result of this replace() is discarded (strings are
# immutable), so lower-case 'rt' is never rewritten. Either assign the result
# or drop the line; assigning it would also hit 'rt' inside ordinary words.
147 text.replace('rt','_rt_')
# Every upper-case "RT" substring (also inside longer words) becomes "_rt_".
148 text = text.replace('RT', '_rt_')
# Earlier word-based approach, left disabled: it dropped each "RT" word together
# with the word that follows it.
150 #tweet = text.split()
151 #tovire = [[i, i+1] for i, word in enumerate(tweet) if word == 'RT' and i!=len(tweet) - 1]
152 #tovire = itertools.chain(*tovire)
153 #text = ' '.join([word for i, word in enumerate(tweet) if i not in tovire])
# GUI entry point of the import: shows the PrefImport dialog, runs ParseDMI with
# the options chosen there, then offers to open the resulting corpus.
# NOTE(review): the class header and several lines (158, 160-162, 165, 172,
# 174-175, 177, 180, 183, 185+) are not visible in this view. `self.ira` is
# assigned in one of them, and `wx` / `os` are not imported in the visible part
# of the file -- confirm both.
157 def __init__(self, parent, parametres):
159 self.parametres = parametres
# Import-preferences dialog in its 'dmi' mode.
163 self.dial = PrefImport(self.ira, methode='dmi')
164 val = self.dial.ShowModal()
# NOTE(review): the test on `val` (line 165) is not visible.
# Input CSV path and output corpus path chosen in the dialog.
166 csvfile = self.dial.dbb.GetValue()
167 corp_out = self.dial.fbb.GetValue()
# Checkbox states; `nort` is passed to ParseDMI as `onlyrt` below.
168 nort = self.dial.paneldmi.check_removeR_rt.GetValue()
169 remove_url = self.dial.paneldmi.check_remove_url.GetValue()
170 remove_mention = self.dial.paneldmi.check_remove_mention.GetValue()
171 remove_rt_in_tweets = self.dial.paneldmi.check_remove_rt_in_tweets.GetValue()
# Busy indicator shown while the conversion runs.
173 busy = wx.BusyInfo(_("Please wait...").decode('utf8'))
# ParseDMI is instantiated only for its side effect (writing `corp_out`); the
# instance is discarded.
# NOTE(review): `lang` is not passed, so the constructor default 'es' applies.
176 ParseDMI(csvfile, corp_out, 'utf8', onlyrt=nort, cleanurl=remove_url, cleanAt=remove_mention, cleanRT=remove_rt_in_tweets)
# Confirmation dialog: reports the created corpus and asks whether to open it.
178 msg = '\n'.join([_(u"Corpus created :").decode('utf8'), corp_out, _(u"Do you want to open it in IRaMuTeQ ?").decode('utf8')])
179 dlg = wx.MessageDialog(self.ira, msg, _(u'Information').decode('utf8'), wx.YES_NO | wx.ICON_INFORMATION | wx.STAY_ON_TOP)
181 val = dlg.ShowModal()
# On "Yes" the absolute path of the new corpus is stored on the main window
# object; the follow-up statements are not visible here.
182 if val == wx.ID_YES :
184 self.ira.filename = os.path.abspath(corp_out)
194 #ParseDMI(filein, fileout, 'utf8')