1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2008-2020 Pierre Ratinaud
4 #modification pour python 3 : Laurent Mérat, 6x7 - mai 2020
7 #------------------------------------
8 # import des modules python
9 #------------------------------------
11 from subprocess import Popen, call, PIPE
23 from shutil import copyfile
26 #from dialog import BugDialog
28 from operator import itemgetter
30 #------------------------------------
31 # import des modules wx
32 #------------------------------------
36 #------------------------------------
37 # import des fichiers du projet
38 #------------------------------------
39 from configparser import ConfigParser
# Module-level logger shared with the rest of the iramuteq application.
log = logging.getLogger('iramuteq')

# Similarity / association indices offered for the similarity analysis.
indices_simi = ['cooccurrence' ,'pourcentage de cooccurrence','Russel','Jaccard', 'Kulczynski1', 'Kulczynski2', 'Mountford', 'Fager', 'simple matching', 'Hamman', 'Faith', 'Tanimoto', 'Dice', 'Phi', 'Stiles', 'Michael', 'Mozley', 'Yule', 'Yule2', 'Ochiai', 'Simpson', 'Braun-Blanquet','Chi-squared', 'Phi-squared', 'Tschuprow', 'Cramer', 'Pearson', 'binomial']
def open_folder(folder):
    # Open `folder` in the platform's file manager.
    if sys.platform == "win32":
        # NOTE(review): the win32 branch body is not visible in this view —
        # presumably os.startfile(folder); confirm against the full file.
        # On macOS use `open`, elsewhere assume freedesktop `xdg-open`.
        opener ="open" if sys.platform == "darwin" else "xdg-open"
        #call([opener, folder])
        call(["%s %s &" % (opener, folder)], shell=True)
def normpath_win32(path) :
    # Normalize a Windows path: collapse doubled backslashes to single ones,
    # then (in lines elided from this view) presumably restore a leading
    # UNC '\\' prefix — TODO confirm against the full file.
    if not sys.platform == 'win32' :
        # NOTE(review): the non-win32 early return is elided in this view.
        while '\\\\' in path :
            path = path.replace('\\\\', '\\')
        if path.startswith('\\') and not path.startswith('\\\\') :
def __init__(self, path = None, encoding = 'utf8'):
    # Tgen (user-defined word groups) file handler.
    # NOTE(review): assignment of self.path / the tgen dict is elided here.
    self.encoding = encoding  # encoding used to read/write the tgen file
def __getitem__(self, key):
    # Dict-style access to one tgen entry (body elided in this view;
    # presumably returns self.tgen[key] — confirm against the full file).
def read(self, path = None):
    # Load tgen definitions from a tab-separated file: first column is the
    # tgen name, remaining columns are its word forms.
    # NOTE(review): default-path handling and the f.read() line are elided.
    with codecs.open(path, 'r', self.encoding) as f :
        tgen = [line.split('\t') for line in tgen.splitlines()]
        tgen = dict([[line[0], line[1:]] for line in tgen])
def write(self, path = None):
    # Serialize the tgen dict back to tab-separated text, one tgen per line:
    # name, then its word forms.
    # NOTE(review): the `path is None` fallback is elided in this view.
    with open(path, 'w') as f :
        f.write('\n'.join(['\t'.join([val] + self.tgen[val]) for val in self.tgen]))
def writetable(self, pathout, tgens, totocc):
    # Write a tab-separated table of tgen counts per metadata star (etoile)
    # plus a final totals row; several write/loop lines are elided here.
    etoiles = list(totocc.keys())
    with open(pathout, 'w') as f :
        # header row: 'tgens' then one column per etoile
        line = '\t'.join(['tgens'] + etoiles) + '\n'
        # one row per tgen `t` (enclosing loop elided in this view)
        line = '\t'.join([t] + [repr(tgens[t][et]) for et in etoiles]) + '\n'
        # make sure the totals-row name does not collide with a tgen name
        while totname + repr(i) in tgens :
            totname = totname + repr(i)
        line = '\t'.join([totname] + [repr(totocc[et]) for et in etoiles]) + '\n'
def __init__(self, filein, syscoding = 'utf8') :
    # History of corpora / matrices / analyses, persisted as JSON in `filein`.
    self.syscoding = syscoding
    self.openedcorpus = {}   # corpora currently opened, keyed by uuid
    self.openedmatrix = {}   # matrices currently opened, keyed by uuid
# (interior of History.read — the enclosing `def` is not visible in this
#  view; loads the JSON history file and rebuilds the lookup indexes)
with open(self.filein, 'r') as fjson :
# d = shelve.open(self.filein, protocol=1)
self.history = d.get('history', [])   # list of corpus records
self.matrix = d.get('matrix', [])     # list of matrix records
# index maps derived from the loaded lists:
self.ordercorpus = dict([[corpus['uuid'], i] for i, corpus in enumerate(self.history)])
self.corpus = dict([[corpus['uuid'], corpus] for corpus in self.history])
self.analyses = dict([[analyse['uuid'], analyse] for corpus in self.history for analyse in corpus.get('analyses', [])])
self.matrixanalyse = dict([[mat['uuid'], mat] for mat in self.matrix])
self.ordermatrix = dict([[matrix['uuid'], i] for i, matrix in enumerate(self.matrix)])
# (interior of History.save — dumps the in-memory history back to disk
#  as indented JSON; `default=str` stringifies non-serializable values)
d['history'] = self.history
d['matrix'] = self.matrix
with open(self.filein, 'w') as f :
    f.write(json.dumps(d, indent=4, default=str))
#d = shelve.open(self.filein, protocol=1)
def add(self, analyse) :
    # Record a new corpus or analysis in the history.
    # NOTE(review): several returns / else clauses are elided in this view,
    # so the nesting below is an approximation of the original control flow.
    log.info('add to history %s' % analyse.get('corpus_name', 'pas un corpus'))
    tosave = {'uuid' : analyse['uuid'], 'ira': analyse['ira'], 'type' : analyse['type']}
    if tosave['uuid'] in self.corpus :
        log.info('problem : this uuid is already in history : %s' % tosave['uuid'])
    if analyse.get('corpus', False) :
        # the record is an analysis attached to a corpus
        if analyse['uuid'] in self.analyses :
            tosave['corpus'] = analyse['corpus']
        tosave['name'] = analyse['name']
        acorpus_uuid = analyse['corpus']
        if acorpus_uuid in self.corpus :
            if 'analyses' in self.history[self.ordercorpus[acorpus_uuid]] :
                self.history[self.ordercorpus[acorpus_uuid]]['analyses'].append(tosave)
            # (else branch: first analysis of that corpus)
            self.history[self.ordercorpus[acorpus_uuid]]['analyses'] = [tosave]
        # corpus not found: keep the analysis as an orphan
        self.orph.append(tosave)
    # (else branch: the record is a corpus itself)
    tosave['corpus_name'] = analyse['corpus_name']
    #self.ordercorpus[tosave['uuid']] = len(history)
    #self.corpus[tosave['uuid']] = analyse
    self.history.append(tosave)
def addMatrix(self, analyse) :
    # Append a matrix record to the history.
    # NOTE(review): the construction of `tosave` is elided in this view.
    #tosave['matrix_name'] = analyse['matrix_name']
    tosave['analyses'] = []
    self.matrix.append(tosave)
def addMatrixAnalyse(self, analyse) :
    # Attach an analysis record to its parent matrix
    # (the save step that follows appears elided in this view).
    tosave = {'uuid' : analyse['uuid'], 'ira': analyse['ira'], 'type' : analyse['type'], 'matrix' : analyse['matrix']}
    tosave['name'] = analyse['name']
    if tosave['matrix'] in self.ordermatrix :
        self.matrix[self.ordermatrix[tosave['matrix']]]['analyses'].append(tosave)
def addmultiple(self, analyses) :
    # Add several analyses at once, attaching each to its parent corpus.
    log.info('add multiple')
    for analyse in analyses :
        tosave = {'uuid' : analyse['uuid'], 'ira': analyse['ira'], 'type' : analyse['type']}
        corpus = analyse['corpus']
        tosave['corpus'] = corpus
        tosave['name'] = analyse['name']
        if corpus in self.corpus :
            if 'analyses' in self.history[self.ordercorpus[corpus]] :
                self.history[self.ordercorpus[corpus]]['analyses'].append(tosave)
            # NOTE(review): an `else :` separating the next line is elided.
            self.history[self.ordercorpus[corpus]]['analyses'] = [tosave]
def delete(self, analyse, corpus = False) :
    # Remove a corpus, analysis, or matrix record from the history.
    # NOTE(review): the guard around the first pop (probably `if corpus :`)
    # is elided in this view.
    log.info('delete %s' % analyse.get('name', 'noname'))
    self.history.pop(self.ordercorpus[analyse['uuid']])
    if analyse['uuid'] in self.openedcorpus :
        del self.openedcorpus[analyse['uuid']]
        log.info('delete corpus : %s' % analyse['uuid'])
    elif analyse['uuid'] in self.analyses :
        # plain analysis: remove it from its corpus' analysis list
        todel = [i for i, ana in enumerate(self.corpus[analyse['corpus']]['analyses']) if ana['uuid'] == analyse['uuid']][0]
        self.history[self.ordercorpus[analyse['corpus']]]['analyses'].pop(todel)
    elif analyse['uuid'] in self.matrixanalyse :
        # the record is a matrix itself: filter it out
        self.matrix = [mat for mat in self.matrix if mat['uuid'] != analyse['uuid']]
    elif analyse.get('matrix', False) in self.matrixanalyse :
        # analysis attached to a matrix (the pop of `topop` is elided)
        analyses = self.matrix[self.ordermatrix[analyse['matrix']]]['analyses']
        topop = [i for i, val in enumerate(analyses) if analyse['uuid'] == val['uuid']][0]
        self.matrix[self.ordermatrix[analyse['matrix']]]['analyses'] = analyses
def addtab(self, analyse) :
    """Register `analyse` as opened in a notebook tab, keyed by its uuid."""
    uuid = analyse['uuid']
    self.opened[uuid] = analyse
def rmtab(self, analyse) :
    """Forget an opened tab; raises KeyError if the uuid is unknown."""
    uuid = analyse['uuid']
    self.opened.pop(uuid)
def update(self, analyse) :
    # Merge updated parametres into the stored record with this uuid,
    # dispatching on the kind of record.
    if 'matrix_name' in analyse :
        self.matrixanalyse[analyse['uuid']].update(analyse)
    elif 'corpus_name' in analyse :
        self.corpus[analyse['uuid']].update(analyse)
    elif 'corpus' in analyse :
        self.analyses[analyse['uuid']].update(analyse)
        # NOTE(review): an elif guarding the next two lines (matrix-attached
        # analyses) is elided in this view.
        toupdate = [an for an in self.matrixanalyse[analyse['matrix']]['analyses'] if an['uuid'] == analyse['uuid']]
        toupdate[0].update(analyse)
# (interior of History.clean — drop history records whose result files no
#  longer exist on disk; the delete call for analyses is elided)
corpustodel = [corpus for corpus in self.history if not os.path.exists(corpus['ira'])]
for corpus in corpustodel :
    print('cleaning :', corpus['corpus_name'])
    self.delete(corpus, corpus = True)
anatodel = [analyse for corpus in self.history for analyse in corpus.get('analyses', []) if not os.path.exists(analyse.get('ira', '/'))]
for analyse in anatodel :
    print('cleaning :', analyse['name'])
# (interior of a History statistics method — counter initializations and
#  several branches are elided in this view)
for corpus in self.history :
    analysenb += len(corpus.get('analyses', []))
    analyses = corpus.get('analyses', [])
    for analyse in analyses :
        if os.path.exists(analyse['ira']) :
            ana = DoConf(analyse['ira']).getoptions()
            # 'time' is stored as e.g. '0h 0m 0s'; accumulate in seconds
            time = ana['time'].split()
            ha += int(time[0].replace('h','')) * 3600
            ma += int(time[1].replace('m','')) * 60
            sa += int(time[2].replace('s',''))
    if os.path.exists(corpus['ira']) :
        param = DoConf(corpus['ira']).getoptions()
        time = param.get('time','0h 0m 0s')
        # NOTE(review): a time.split() between these lines is elided.
        hours += int(time[0].replace('h','')) * 3600
        minutes += int(time[1].replace('m','')) * 60
        secondes += int(time[2].replace('s',''))
        if param.get('originalpath', False) :
            # count duplicates of the same original corpus file
            if param['originalpath'] in corpusnb :
                corpusnb[param['originalpath']] += 1
                tokens += int(param['occurrences'])
            # NOTE(review): else branch marker elided before this line.
            corpusnb[param['originalpath']] = 1
# NOTE(review): the body of the next test is elided in this view.
if corpus['ira'] in todel :
print('Nbr total de corpus : %s' % len(self.history))
corpus_nb = len(corpusnb) + len(todel)
print('Nbr de corpus différents : %s' % corpus_nb)
lentodel = len(todel)
print('Nbr de corpus à supprimer : %s' % lentodel)
print('Nbr de sous corpus : %s' % subnb)
print("Nbr total d'occurrences : %s" % tokens)
print('Moyenne occurrences par corpus : %f' % (tokens/corpus_nb))
print('---------------------')
print("Nbr total d'analyses : %s" % analysenb)
print('Temps total indexation : %f h' % ((hours+minutes+secondes) / 3600))
print('Temps total analyses : %f h' % ((ha+ma+sa) / 3600))
# (interior of History.__str__ — the `def` line is elided in this view)
return str(self.history)
def __init__(self, configfile=None, diff = None, parametres = None) :
    # Thin wrapper around ConfigParser for iramuteq analysis .ini files.
    self.configfile = configfile
    # interpolation disabled — why this parameter ??? (original author note)
    self.conf = ConfigParser(interpolation=None)
    if configfile is not None :
        configfile = normpath_win32(configfile)
        # NOTE(review): the file handle passed here is never closed.
        self.conf.read_file(codecs.open(configfile, 'r', 'utf8'))
    if parametres is not None :
        self.doparametres(parametres)
def doparametres(self, parametres) :
    # Body elided in this view — confirm against the full file.
def getsections(self) :
    """Return the section names of the wrapped configuration file."""
    parser = self.conf
    return parser.sections()
def getoptions(self, section = None, diff = None):
    # Convert one section of the config into a dict, casting digit strings
    # to int, 'True'/'False' to bool, and '(...)'/'[...]' literals with
    # ast.literal_eval; everything else stays a string.
    # NOTE(review): initialization of `parametres` and the `if section is
    # None :` guard are elided in this view.
    section = self.conf.sections()[0]
    for option in self.conf.options(section) :
        if self.conf.get(section, option).isdigit() :
            parametres[option] = int(self.conf.get(section, option))
        elif self.conf.get(section, option) == 'False' :
            parametres[option] = False
        elif self.conf.get(section, option) == 'True' :
            parametres[option] = True
        elif self.conf.get(section, option).startswith('(') and self.conf.get(section, option).endswith(')') :
            parametres[option] = ast.literal_eval(self.conf.get(section, option))
        elif self.conf.get(section, option).startswith('[') and self.conf.get(section, option).endswith(']') :
            parametres[option] = ast.literal_eval(self.conf.get(section, option))
        # NOTE(review): an `else :` before this fallback is elided.
        parametres[option] = self.conf.get(section, option)
    # remember which section the options came from
    if 'type' not in parametres :
        parametres['type'] = section
def makeoptions(self, sections, parametres, outfile = None) :
    # Store the parametres dicts (one per section) into self.conf and build
    # the equivalent ini text, then write it to `outfile`.
    # NOTE(review): `txt` initialization and some else/write lines elided.
    for i, section in enumerate(sections) :
        txt += '[%s]\n' % section
        if not self.conf.has_section(section) :
            self.conf.add_section(section)
        for option in parametres[i] :
            if isinstance(parametres[i][option], int) :
                self.conf.set(section, option, repr(parametres[i][option]))
                txt += '%s = %i\n' % (option, parametres[i][option])
            elif isinstance(parametres[i][option], str) :
                self.conf.set(section, option, parametres[i][option])
                txt += '%s = %s\n' % (option, parametres[i][option])
            elif isinstance(parametres[i][option], wx.Colour) :
                self.conf.set(section, option, str(parametres[i][option]))
                txt += '%s = %s\n' % (option, str(parametres[i][option]))
            elif option == 'analyses' :
                # NOTE(review): this branch's body and the final `else`
                # are elided; the next two lines belong to that else.
                self.conf.set(section, option, repr(parametres[i][option]))
                txt += '%s = %s\n' % (option, repr(parametres[i][option]))
    # fall back to the instance's config file when no outfile is given
    # (the `if outfile is None :` guard is elided in this view)
    outfile = self.configfile
    outfile = normpath_win32(outfile)
    with open(outfile, 'w') as f :
def totext(self, parametres) :
    # Render a parametres dict as displayable 'key \t\t: value' lines.
    # NOTE(review): `txt` list initialization is elided in this view.
    for val in parametres :
        if isinstance(parametres[val], int) :
            txt.append(' \t\t: '.join([val, repr(parametres[val])]))
        elif isinstance(parametres[val], str) :
            txt.append(' \t\t: '.join([val, parametres[val]]))
        elif val in ['listet', 'stars'] :
            # NOTE(review): this branch's body and the final else are
            # elided; the next line belongs to that else.
            txt.append(' \t\t: '.join([val, repr(parametres[val])]))
    return '\n'.join(txt)
def write_tab(tab, fileout) :
    """Write `tab` (an iterable of rows) to `fileout` as a semicolon-
    separated CSV file, quoting every non-numeric value.

    Fixes: the original opened the file inline and never closed it,
    leaving flushing to the garbage collector; it also omitted
    ``newline=''``, which the csv module requires to avoid doubled line
    endings on Windows.
    """
    with open(fileout, 'w', newline='') as f :
        csvWriter = csv.writer(f, delimiter=';', quoting = csv.QUOTE_NONNUMERIC)
        csvWriter.writerows(tab)
class BugDialog(wx.Dialog):
    """Simple always-on-top modal dialog used to display error reports."""

    def __init__(self, *args, **kwds):
        # begin wxGlade: MyDialog.__init__
        kwds["style"] = wx.DEFAULT_DIALOG_STYLE | wx.STAY_ON_TOP
        kwds["size"] = wx.Size(500, 200)
        wx.Dialog.__init__(self, *args, **kwds)
        self.SetTitle(kwds['title'])  # caller must supply a 'title' kwarg
        # multi-line read area that receives the error text
        self.text_ctrl_1 = wx.TextCtrl(self, -1, "", style=wx.TE_MULTILINE)
        self.text_ctrl_1.SetBackgroundColour('#DDE8EB')
        self.button_1 = wx.Button(self, wx.ID_OK, "")
        self.__set_properties()

    def __set_properties(self):
        # begin wxGlade: MyDialog.__set_properties
        self.SetMinSize(wx.Size(500, 200))
        self.text_ctrl_1.SetMinSize(wx.Size(500, 200))

    def __do_layout(self):
        # begin wxGlade: MyDialog.__do_layout
        sizer_1 = wx.BoxSizer(wx.VERTICAL)
        sizer_1.Add(self.text_ctrl_1, 1, wx.EXPAND, 0)
        sizer_1.Add(self.button_1, 0, wx.ALIGN_CENTER_HORIZONTAL, 0)
        self.SetSizer(sizer_1)
def CreateIraFile(DictPathOut, clusternb, corpname='corpus_name', section = 'analyse'):
    # Write the analysis metadata (.ira) file describing a finished run.
    AnalyseConf = ConfigParser()
    AnalyseConf.read(DictPathOut['ira'])
    AnalyseConf.add_section(section)
    date = datetime.datetime.now().ctime()
    AnalyseConf.set(section, 'date', str(date))
    # NOTE(review): ConfigParser.set requires str values — confirm that
    # `clusternb` is always a string at the call sites.
    AnalyseConf.set(section, 'clusternb', clusternb)
    AnalyseConf.set(section, 'corpus_name', corpname)
    # NOTE(review): the fileout.close() appears in a line elided here.
    fileout = open(DictPathOut['ira'], 'w')
    AnalyseConf.write(fileout)
def multisort(liste2d, ordre, indices_tri):
    """Method intended to replace 'comp', which disappeared in Python 3:
    sort tuples on one of their elements.

    In principle it must return the elements sorted the same way as
    before, as described in the docstring of 'sortedby'.
    Probably to be improved for more general use by accepting a variable
    number of parameters ??? (original author note)
    """
    # NOTE(review): `indices_tri.Tuple(int, ...)` is not a list/tuple
    # method and `attrgetter` with integer keys looks wrong for a list of
    # lists (sortedby uses itemgetter) — this function appears broken or
    # unfinished; confirm whether it is ever called.
    indices_triTuple = indices_tri.Tuple(int, ...)
    for key in reversed(indices_tri):
        liste2d.sort(key=attrgetter(key), reverse=ordre)
def sortedby(liste2d, direct, *indices):
    """
    sortedby: sort a list of lists (e.g. a table) by one or more indices
    (columns of the table) and return the sorted list

    for list = [[2,3],[1,2],[3,1]]:
    sortedby(list,1) will return [[3, 1], [1, 2], [2, 3]],
    sortedby(list,0) will return [[1, 2], [2, 3], [3, 1]]
    """
    # NOTE(review): `direct == 2` selects descending order; the return
    # statement is elided in this view (liste2d is sorted in place).
    # nlist = map(lambda x, indices=indices:
    #             map(lambda i, x=x: x[i], indices) + [x],
    # iramuteq passé à 2to3
    # nlist = list(map(lambda x, indices=indices:
    #             list(map(lambda i, x=x: x[i], indices)) + [x],
    # sort on the last index first so earlier indices take precedence
    for key in reversed(indices):
        liste2d.sort(key=itemgetter(key), reverse=(direct==2))
    # sorted_list = multisort(liste2d, direct, *indices)
    # nlist.sort(reverse=True)
    # sorted_list = multisort(liste2d, direct, *indices)
    # return [l[-1] for l in nlist]
def add_type(line, dictlem):
    # Append the lemma type from `dictlem` to a profile row when its form
    # (column 4) is known; the else branch and return are elided here.
    if line[4] in dictlem:
        line.append(dictlem[line[4]])
def treat_line_alceste(i, line) :
    # Normalize one row of an Alceste-style profile: format the p-value in
    # column 5 (decimal comma -> point, 'NS (...)' above 0.05) and reorder
    # the columns. Several branch bodies are elided in this view.
    if line[0] == '*' or line[0] == '*****' :
        # (separator rows handled in elided lines)
        elif float(line[5].replace(',', '.')) < 0.0001:
        elif float(line[5].replace(',', '.')) > 0.05:
            line[5] = 'NS (%s)' % str(float(line[5].replace(',', '.')))[0:7]
        # (else branch elided before this line)
        line[5] = str(float(line[5].replace(',', '.')))[0:7]
    return [i, int(line[0]), int(line[1]), float(line[2]), float(line[3]), line[6], line[4], line[5]]
def ReadProfileAsDico(File, Alceste=False, encoding = sys.getdefaultencoding()):
    # Parse an R-produced profile file into {cluster_label: rows}; the
    # Alceste branch and DictProfile initialization are elided in this view.
    print('lecture des profiles')
    # NOTE(review): this reader handle is never closed in the visible code.
    FileReader = codecs.open(File, 'r', encoding)
    Filecontent = FileReader.readlines()
    #rows = [row.replace('\n', '').replace("'", '').replace('\"', '').replace(',', '.').replace('\r','').split(';') for row in Filecontent]
    rows = [row.replace('\n', '').replace("'", '').replace('\"', '').replace('\r','').split(';') for row in Filecontent]
    ClusterNb = rows[0][2]
    # '**' rows name the clusters, '****' rows carry their size info
    clusters = [row[2] for row in rows if row[0] == '**']
    valclusters = [row[1:4] for row in rows if row[0] == '****']
    lp = [i for i, line in enumerate(rows) if line[0] == '****']
    # slice the row list into one block of lines per cluster
    prof = [rows[lp[i] + 1:lp[i+1] - 1] for i in range(0, len(lp)-1)] + [rows[lp[-1] + 1:len(rows)]]
    prof = [[add_type(row, dictlem) for row in pr] for pr in prof]
    prof = [[treat_line_alceste(i,line) for i, line in enumerate(pr)] for pr in prof]
    # (non-Alceste path: pad each row with an empty type column)
    prof = [[line + [''] for line in pr] for pr in prof]
    prof = [[treat_line_alceste(i,line) for i, line in enumerate(pr)] for pr in prof]
    for i, cluster in enumerate(clusters):
        DictProfile[cluster] = [valclusters[i]] + prof[i]
def GetTxtProfile(dictprofile, cluster_size) :
    # Render the profile dict as plain text, one fixed-width table per
    # class; `proflist` initialization is elided in this view.
    for classe in range(0, len(dictprofile)) :
        prof = dictprofile[str(classe + 1)]
        clinfo = cluster_size[classe]
        proflist.append('\n'.join([' '.join(['classe %i' % (classe + 1), '-', '%s uce sur %s - %s%%' % (clinfo[0], clinfo[1], clinfo[2])]), '\n'.join(['%5s|%5s|%6s|%6s|%8s|%8s|%20s\t%10s' % tuple([str(val) for val in line]) for line in prof if len(line)==8])]))
    return '\n\n'.join(proflist)
def formatExceptionInfo(maxTBlevel=5):
    # Return (exception class name, first argument, formatted traceback
    # lines) for the exception currently being handled.
    cla, exc, trbk = sys.exc_info()
    excName = cla.__name__
    # NOTE(review): the try/except that chooses between these two
    # assignments is elided in this view.
    excArgs = exc.args[0]
    excArgs = "<no args>"
    excTb = traceback.format_tb(trbk, maxTBlevel)
    return (excName, excArgs, excTb)
# function written by the IUT students
def decoupercharact(chaine, longueur, longueurOptimale, separateurs = None) :
    """
    Start from the last character and move back toward the beginning of
    the string. If a '$' is found, we are done. Otherwise look for the
    best candidate separator, i.e. the highest weight/distance ratio.
    """
    # default punctuation weights (the `if separateurs is None :` guard is
    # elided in this view)
    separateurs = [['.', 60.0], ['?', 60.0], ['!', 60.0], ['£$£', 60], [':', 50.0], [';', 40.0], [',', 10.0], [' ', 0.1]]
    trouve = False                 # whether a good separator was found
    iDecoupe = 0                   # index of the character where to cut
    # cut the string to at most `longueur` characters
    longueur = min(longueur, len(chaine) - 1)
    chaineTravail = chaine[:longueur + 1]
    meilleur = ['', 0, 0]          # type, weight and position of the best separator
    # first check whether a '$' is present
    indice = chaineTravail.find('$')
    # if nothing was found, look for the best separator
    # (the enclosing scan loop over `nbCar` is elided in this view)
    caractere = chaineTravail[nbCar]
    distance = abs(longueurOptimale - nbCar) + 1
    meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
    # check whether the current character is a punctuation mark
    for s in separateurs:
        if caractere == s[0]:
            # if it is a punctuation mark
            if s[1] / distance > float(meilleur[1]) / meilleureDistance:
                # (record it as the new best — body elided)
                # and end the search
                # move to the previous character
    # split at the chosen position: words before the cut, remainder after
    fin = chaine[iDecoupe + 1:]
    retour = chaineTravail[:iDecoupe]
    return len(retour) > 0, retour.split(), fin
    # if nothing was found
    return False, chaine.split(), ''
# User-facing (French) messages for known corpus-format errors, keyed by
# the tag carried by the raised Exception; BugReport() appends the
# offending line number to the messages that end with 'la ligne '.
exceptions = {'paragrapheOT' : "Un problème de formatage (présence d'un marqueur de paragraphe (-*) en dehors d'un texte) est survenu à la ligne ",
'EmptyText' : "Texte vide (probablement un problème de formatage du corpus). Le problème est apparu à la ligne ",
'CorpusEncoding' : "Problème d'encodage.",
'TextBeforeTextMark' : "Problème de formatage : du texte avant le premier marqueur de texte (****). Le problème est survenu à la ligne ",
'MissingAnalyse' : 'Aucun fichier à cet emplacement :\n',
def BugReport(parent, error = None):
    # Display the current exception to the user in a BugDialog, with
    # friendlier text for the known corpus-format exceptions above.
    # NOTE(review): several else branches and the dialog ShowModal/Destroy
    # calls are elided in this view.
    # close any progress dialog still open (its Destroy() call is elided)
    for ch in parent.GetChildren():
        if "<class 'wx._windows.ProgressDialog'>" == str(type(ch)):
            excName, exc, excTb = formatExceptionInfo()
    if excName == 'Exception' :
        # messages raised by our own parsers: 'tag linenumber' or 'tag'
        if len(exc.split()) == 2 :
            mss, linenb = exc.split()
            if mss in exceptions :
                txt = exceptions[mss] + linenb
        if exc in exceptions :
            txt = exceptions[exc]
        title = "Information"
        # (else: unexpected exception -> generic bug text; the branch
        # marker is elided before the next lines)
        txt = '\n !== BUG ==! \n'
        txt += '*************************************\n'
        txt += '\n'.join(excTb).replace('    ', ' ')
        txt += excName + '\n'
    dial = BugDialog(parent, **{'title' : title})
    if 'Rerror' in dir(parent) :
        # (append R's stderr to the report — body elided)
        dial.text_ctrl_1.write(txt)
    dial.CenterOnParent()
def PlaySound(parent):
    # Play the end-of-analysis sound if enabled in the preferences.
    if parent.pref.getboolean('iramuteq', 'sound') :
        # (platform guard partly elided in this view)
        if "gtk2" in wx.PlatformInfo:
            # GTK2 builds: delegate to the external `aplay` command
            error = Popen(['aplay','-q',os.path.join(parent.AppliPath,'son_fin.wav')])
        # (else: use wx's own sound support)
        sound = wx.adv.Sound(os.path.join(parent.AppliPath, 'son_fin.wav'))
        sound.Play(wx.adv.SOUND_SYNC)
def ReadDicoAsDico(dicopath):
    """Load a UTF-8 tab-separated dictionary file.

    Each non-empty line is split on tabs after stripping line endings and
    double quotes; the first field becomes the key, the remaining fields
    the value list. Later duplicate keys overwrite earlier ones.
    """
    with codecs.open(dicopath, 'r', 'UTF8') as f:
        raw = f.readlines()
    entries = {}
    for row in raw:
        if row == '':
            continue
        fields = row.rstrip('\n\r').replace('\n', '').replace('"', '').split('\t')
        entries[fields[0]] = fields[1:]
    return entries
def ReadLexique(parent, lang = 'french', filein = None):
    # Load the lexical dictionary for `lang` onto parent.lexique.
    # NOTE(review): the branch structure (lang vs explicit filein, and a
    # third case) is elided in this view — only the assignments remain.
    parent.lexique = ReadDicoAsDico(parent.DictPath.get(lang, 'french'))
    parent.lexique = ReadDicoAsDico(filein)
    parent.lexique = ReadDicoAsDico(filein)
def ReadList(filein, encoding = sys.getdefaultencoding(), sep = ';'):
    # Read a separated-values file into a dict; the row loop, the dict
    # construction and the return are partly elided in this view.
    with codecs.open(filein, 'r', encoding) as f :
        content = [line.replace('\n', '').replace('\r','').replace('\"', '').replace(',', '.').split(sep) for line in content.splitlines()]
    #file = codecs.open(filein, 'r', encoding)
    #content = file.readlines()
    # drop the header row
    first = content.pop(0)
    #first = first.replace('\n', '').replace('\r','').replace('\"', '').split(sep)
    #line = line.replace('\n', '').replace('\r','').replace('\"', '').replace(',', '.')
    #line = line.split(';')
    # numeric cells are rounded to 5 decimal places
    don = float('%.5f' % float(val))
def exec_RCMD(rpath, command) :
    # Run `R CMD INSTALL <command>` using the R binary at `rpath`.
    # NOTE(review): the return of `error` is elided in this view.
    log.info('R CMD INSTALL %s' % command)
    # double the backslashes so the Windows path survives quoting
    rpath = rpath.replace('\\','\\\\')
    error = call(["%s" % rpath, 'CMD', 'INSTALL', "%s" % command])
def exec_rcode(rpath, rcode, wait = True, graph = False):
    # Execute an R script either synchronously (`call`) or asynchronously
    # (`Popen`), with platform-specific command lines. The wait/graph
    # branch markers and return statements are elided in this view.
    log.info("R Script : %s" % rcode)
    # old macOS (< 10.5) may need X11 for graphics — detection elided
    if sys.platform == 'darwin' :
        macversion = platform.mac_ver()[0].split('.')
        if int(macversion[1]) < 5 :
            rpath = rpath.replace('\\','\\\\')
    env = os.environ.copy()
    if sys.platform == 'darwin' and 'LC_ALL' not in env:
        env['LC_ALL'] = 'en_US.UTF-8'
    # --- synchronous, no graphics (branch markers elided) ---
    if sys.platform == 'win32':
        error = call(["%s" % rpath, "--vanilla","--slave","-f", "%s" % rcode])
        error = call([rpath, '--slave', "--vanilla", "--encoding=UTF-8", "-f %s" % rcode], env = env)
    # --- asynchronous, no graphics ---
    if sys.platform == 'win32':
        pid = Popen(["%s" % rpath, '--vanilla','--slave','-f', "%s" % rcode])
        pid = Popen([rpath, '--slave', "--vanilla", "--encoding=UTF-8", "-f %s" % rcode], stderr = PIPE, env = env, encoding='UTF-8') #PIPE or STDOUT ?
    # --- synchronous, with graphics ---
    if sys.platform == 'win32':
        error = call(["%s" % rpath, '--vanilla','--slave','-f', "%s" % rcode])
    elif sys.platform == 'darwin' and needX11:
        os.environ['DISPLAY'] = ':0.0'
        error = call([rpath, '--vanilla','--slave', "--encoding=UTF-8","-f %s" % rcode], env = env, encoding='UTF-8')
    # (else)
    error = call([rpath, '--vanilla','--slave', "--encoding=UTF-8","-f %s" % rcode], env = env, encoding='UTF-8')
    # --- asynchronous, with graphics ---
    if sys.platform == 'win32':
        pid = Popen(["%s" % rpath, '--vanilla','--slave','-f', "%s" % rcode])
    elif sys.platform == 'darwin' and needX11:
        os.environ['DISPLAY'] = ':0.0'
        pid = Popen([rpath, '--vanilla','--slave', "--encoding=UTF-8","-f %s" % rcode], stderr = PIPE, env = env, encoding='UTF-8')
    # (else)
    pid = Popen([rpath, '--vanilla','--slave', "--encoding=UTF-8","-f %s" % rcode], stderr = PIPE, env = env, encoding='UTF-8')
def check_Rresult(parent, pid) :
    # Inspect the outcome of an R run (Popen object or call() return code)
    # and raise, exposing R's stderr; several branches are elided here.
    if isinstance(pid, Popen) :
        if pid.returncode != 0 :
            error = pid.communicate()
            error = [str(error[0]), error[1]]
            if error[1] is None :
                # (replace None stderr — body partly elided)
                parent.Rerror = '\n'.join([str(pid.returncode), '\n'.join(error)])
            # surface R's error output to the caller
            raise Exception('\n'.join(['Erreur R', '\n'.join(error[1:])]))
    # (non-Popen path: plain non-zero return code — guard elided)
    raise Exception('Erreur R')
def launchcommand(mycommand):
    # Body elided in this view — presumably spawns `mycommand`; confirm
    # against the full file.
def print_liste(filename,liste):
    # Write each row of `liste` as one semicolon-joined line.
    # NOTE(review): the `for graph in liste :` loop line is elided here.
    with open(filename,'w') as f :
        f.write(';'.join(graph) +'\n')
def read_list_file(filename, encoding = sys.getdefaultencoding()):
    # Inverse of print_liste: read semicolon-separated rows, skipping
    # blank lines. NOTE(review): the `return ncontent` is elided here.
    with codecs.open(filename,'r', encoding) as f:
        content=f.readlines()
        ncontent=[line.replace('\n','').split(';') for line in content if line.strip() != '']
def progressbar(self, maxi):
    # Build the shared wx.ProgressDialog; maximum-value handling, the
    # closing parenthesis of the constructor call and the return are
    # elided in this view.
    ira = wx.GetApp().GetTopWindow()
    prog = wx.ProgressDialog("Traitements",
    "Veuillez patienter...",
    style=wx.PD_APP_MODAL | wx.PD_AUTO_HIDE | wx.PD_ELAPSED_TIME | wx.PD_CAN_ABORT
    # the ABORT is not handled every time ??? (original author note)
    prog.SetSize((400,150))
    #prog.SetIcon(ira._icon)
def treat_var_mod(variables) :
    # Group 'variable_modality' strings into {variable: [modalities]};
    # the result dict and the loop over `vars` are elided in this view.
    variables = list(set(variables))
    varmod = [variable.split('_') for variable in variables]
    # variable names are the first '_'-separated field
    vars = list(set([var[0] for var in varmod if len(var) >=2]))
    mods = ['_'.join(v) for v in varmod if v[0] == var]
    # for variable in variables :
    #     if '_' in variable :
    #         forme = variable.split('_')
    #         if not var in var_mod :
    #             var_mod[var] = [variable]
    #             if not mod in var_mod[var] :
    #                 var_mod[var].append(variable)
def doconcorde(corpus, uces, mots, uci = False) :
    # Build an HTML concordance: for each segment in `uces`, highlight
    # every form of the lemmas in `mots` in red. Returns (ucis_txt,
    # ucestxt). Branch markers and list initializations are elided here.
    # (uce vs uci source — the if/else around these two lines is elided)
    ucestxt1 = [row for row in corpus.getconcorde(uces)]
    ucestxt1 = [row for row in corpus.getuciconcorde(uces)]
    ucestxt1 = dict(ucestxt1)
    # expand each lemma into all of its surface forms
    listmot = [corpus.getlems()[lem].formes for lem in mots]
    listmot = [corpus.getforme(fid).forme for lem in listmot for fid in lem]
    mothtml = ['<font color=red><b>%s</b></font>' % mot for mot in listmot]
    dmots = dict(list(zip(listmot, mothtml)))
    # (per-uce loop elided in this view)
    ucetxt = ucestxt1[uce].split()
    ucetxt = ' '.join([dmots.get(mot, mot) for mot in ucetxt])
    # header line with the segment's metadata stars
    uciid = corpus.getucefromid(uce).uci
    ucis_txt.append('<p><b>' + ' '.join(corpus.ucis[corpus.getucefromid(uce).uci].etoiles) + '<a href="%i_%i"> *%i_%i</a></b></p>' % (uciid, uce, uciid, uce))
    ucis_txt.append('<p><b>' + ' '.join(corpus.ucis[uce].etoiles) + '</b></p>')
    ucestxt.append(ucetxt)
    return ucis_txt, ucestxt
def getallstcarac(corpus, analyse) :
    # NOTE(review): `Alceste` and `self` are undefined in this function's
    # visible scope — this looks unfinished or mis-extracted; confirm
    # against the full file.
    pathout = PathOut(analyse['ira'])
    profils = ReadProfileAsDico(pathout['PROFILE_OUT'], Alceste, self.encoding)
def read_chd(filein, fileout):
    # Convert the R CHD merge table into a nested {'name','children',
    # 'size'} tree and dump it as JSON. The `mere` dict initialization and
    # some branch markers are elided in this view.
    with open(filein, 'r') as f :
        #content = [line[3:].replace('"',"").replace(' ','') for line in content.splitlines()]
        content = [line.split('\t') for line in content.splitlines()]
    chd = {'name':1, 'children':[]}
    for i, line in enumerate(content) :
        # first row: create the root's two children; `mere` maps a node
        # label to its dict so later rows can attach to it
        chd['children'] = [{'name': line[1],'size' : content[i+1][0]}, {'name':line[2], 'size': content[i+1][1]}]
        mere[line[1]] = chd['children'][0]
        mere[line[2]] = chd['children'][1]
        # later rows: attach the two children under their parent node
        if 'children' in mere[line[0]]:
            mere[line[0]]['children'].append({'name': line[1],'size' : content[i+1][0]})
            mere[line[1]] = mere[line[0]]['children'][-1]
            mere[line[0]]['children'].append({'name': line[2],'size' : content[i+1][1]})
            mere[line[2]] = mere[line[0]]['children'][-1]
        # (else: parent had no children yet — marker elided)
        mere[line[0]]['children'] = [{'name': line[1],'size' : content[i+1][0]}, {'name':line[2], 'size': content[i+1][1]}]
        mere[line[1]] = mere[line[0]]['children'][-2]
        mere[line[2]] = mere[line[0]]['children'][-1]
    with open(fileout, 'w') as f :
        f.write(json.dumps(chd))
# Display name -> Google Translate language code, used by gettranslation().
# Note the 'Hmong' code carries a trailing space ("hmn ") as written.
translation_languages = {"Afrikaans":"af", "Albanian":"sq", "Amharic":"am", "Arabic":"ar", "Armenian":"hy", "Azeerbaijani":"az", "Basque":"eu", "Belarusian":"be", "Bengali":"bn", "Bosnian":"bs", "Bulgarian":"bg", "Catalan":"ca", "Cebuano":"ceb", "Chichewa":"ny", "Chinese (Simplified)":"zh-CN", "Chinese (Traditional)":"zh-TW", "Corsican":"co", "Croatian":"hr", "Czech":"cs", "Danish":"da", "Dutch":"nl", "English":"en", "Esperanto":"eo", "Estonian":"et", "Filipino":"tl", "Finnish":"fi", "French":"fr", "Frisian":"fy", "Galician":"gl", "Georgian":"ka", "German":"de", "Greek":"el", "Gujarati":"gu", "Haitian Creole":"ht", "Hausa":"ha", "Hawaiian":"haw", "Hebrew":"iw", "Hindi":"hi", "Hmong":"hmn ", "Hungarian":"hu", "Icelandic":"is", "Igbo":"ig", "Indonesian":"id", "Irish":"ga", "Italian":"it", "Japanese":"ja", "Javanese":"jw", "Kannada":"kn", "Kazakh":"kk", "Khmer":"km", "Korean":"ko", "Kurdish":"ku", "Kyrgyz":"ky", "Lao":"lo", "Latin":"la", "Latvian":"lv", "Lithuanian":"lt", "Luxembourgish":"lb", "Macedonian":"mk", "Malagasy":"mg", "Malay":"ms", "Malayalam":"ml", "Maltese":"mt", "Maori":"mi", "Marathi":"mr", "Mongolian":"mn", "Burmese":"my", "Nepali":"ne", "Norwegian":"no", "Pashto":"ps", "Persian":"fa", "Polish":"pl", "Portuguese":"pt", "Punjabi":"ma", "Romanian":"ro", "Russian":"ru", "Samoan":"sm", "Scots Gaelic":"gd", "Serbian":"sr", "Sesotho":"st", "Shona":"sn", "Sindhi":"sd", "Sinhala":"si", "Slovak":"sk", "Slovenian":"sl", "Somali":"so", "Spanish":"es", "Sundanese":"su", "Swahili":"sw", "Swedish":"sv", "Tajik":"tg", "Tamil":"ta", "Telugu":"te", "Thai":"th", "Turkish":"tr", "Ukrainian":"uk", "Urdu":"ur", "Uzbek":"uz", "Vietnamese":"vi", "Welsh":"cy", "Xhosa":"xh", "Yiddish":"yi", "Yoruba":"yo", "Zulu":"zu", }
def gettranslation(words, lf, lt) :
    # Translate `words` from language `lf` to `lt` via the public Google
    # Translate endpoint; parts of the User-Agent header literal are
    # elided in this view.
    import urllib.request, urllib.error, urllib.parse
    agent = {'User-Agent':
    .NET CLR 3.0.04506.30\
    base_link = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=%s&tl=%s&dt=t&q=%s"
    # one request for all words, newline-separated then URL-quoted
    totrans = urllib.parse.quote('\n'.join(words))
    link = base_link % (lf, lt, totrans)
    request = urllib.request.Request(link, headers=agent)
    raw_data = urllib.request.urlopen(request).read()
    data = json.loads(raw_data)
    # normalize the returned forms back into iramuteq-safe tokens
    return [line[0].replace("'", '_').replace(' | ', '|').replace(' ', '_').replace('-','_').replace('\n','') for line in data[0]]
def makenprof(prof, trans, deb=0) :
    # Rebuild a profile block substituting translated forms; the `nprof`
    # initialization, the column assignment and the return are elided.
    nprof.append(prof[0])
    for i, val in enumerate(trans) :
        line = prof[deb+i+1][:]
def treatempty(val) :
    # Replace blank values with a placeholder; the return statements
    # (presumably '_' and `val`) are elided in this view.
    if val.strip() == '' :
def translateprofile(corpus, dictprofile, lf='it', lt='fr', maxword = 50) :
    # Translate up to `maxword` active and supplementary forms of each
    # class profile from `lf` to `lt`, deduplicating translated forms by
    # suffixing '+'. Initializations and several branches are elided.
    for i in range(len(dictprofile)) :
        prof = dictprofile[repr(i+1)]
        # locate the separator rows between actives / supplementaries /
        # etoiles (the try/except around these lookups is elided)
        lenact = prof.index(['*****', '*', '*', '*', '*', '*', '', ''])
        lenact = prof.index(['*', '*', '*', '*', '*', '*', '', ''])
        lensup += prof.index(['*', '*', '*', '*', '*', '*', '', ''])
        lensup = lensup - lenact
        lensup += len(prof) - lenact
        # cap the number of active forms sent for translation
        if lenact > maxword :
            actori = [line[6] for line in prof[1:nlenact]]
            act = [val.replace('_', ' ') for val in actori]
            act = gettranslation(act, lf, lt)
            # keep a lems mapping and disambiguate duplicate translations
            for j, val in enumerate(actori) :
                if act[j] not in lems :
                    while act[j] in lems :
                        act[j] = act[j] + "+"
            nprof[repr(i+1)] = makenprof(prof, act)
        # same treatment for the supplementary forms
        if lensup > maxword :
            supori = [line[6] for line in prof[(1+lenact):(lenact+nlensup)]]
            sup = [val.replace('_', ' ') for val in supori]
            sup = [treatempty(val) for val in sup]
            sup = gettranslation(sup, lf, lt)
            for j, val in enumerate(supori) :
                if sup[j] not in lems :
                    while sup[j] in lems :
                        sup[j] = sup[j] + "+"
            nprof[repr(i+1)].append(['*****', '*', '*', '*', '*', '*', '', ''])
            nprof[repr(i+1)] += makenprof(prof, sup, deb=lenact)
        # copy the etoiles section through untranslated
        lenet = prof.index(['*', '*', '*', '*', '*', '*', '', ''])
        nprof[repr(i+1)].append(['*', '*', '*', '*', '*', '*', '', ''])
        nprof[repr(i+1)] += prof[(lenet+1):]
def write_translation_profile(prof, lems, language, dictpathout) :
    # Persist a translated profile as CSV plus its word mapping, and
    # register both files in translations.txt. The else branch creating a
    # fresh `translist` is elided in this view.
    if os.path.exists(dictpathout['translations.txt']) :
        with codecs.open(dictpathout['translations.txt'], 'r', 'utf8') as f :
            translist = f.read()
        translist = [line.split('\t') for line in translist.splitlines()]
    # rebuild the profile rows in the on-disk CSV layout
    toprint.append(['','','','','',''])
    toprint.append(['***', 'nb classes', repr(len(prof)), '***', '', ''])
    for i in range(len(prof)) :
        toprint.append(['**', 'classe', repr(i+1), '**', '', ''])
        toprint.append(['****'] + prof[repr(i+1)][0] + ['****'])
        rest = [[repr(line[1]), repr(line[2]), repr(line[3]), repr(line[4]), line[6], line[7].replace('< 0,0001', '0.00009').replace('NS (','').replace(')','')] for line in prof[repr(i+1)][1:]]
        # NOTE(review): `i` is reused as the loop variable here, shadowing
        # the class index of the outer loop — confirm this is intended.
        for i, line in enumerate(prof[repr(i+1)][1:]) :
            rest[i] = ['*', '*', '*', '*', '*', '*']
            elif line[0] == '*****' :
                rest[i] = ['*****','*','*', '*', '*', '*']
    with open(dictpathout['translation_profile_%s.csv' % language], 'w') as f :
        f.write('\n'.join([';'.join(line) for line in toprint]))
    with open(dictpathout['translation_words_%s.csv' % language], 'w') as f :
        f.write('\n'.join(['\t'.join([val, lems[val]]) for val in lems]))
    # register the new files once in the translations index
    if 'translation_profile_%s.csv' % language not in [val[0] for val in translist] :
        translist.append(['translation_profile_%s.csv' % language, 'translation_words_%s.csv' % language])
    with open(dictpathout['translations.txt'], 'w') as f :
        f.write('\n'.join(['\t'.join(line) for line in translist]))
def makesentidict(infile, language) :
    # Build tgen-style emotion/sentiment word lists from an NRC-style
    # lexicon spreadsheet export (tab-separated).
    with codecs.open(infile,'r', 'utf8') as f :
        content = [line.split('\t') for line in content.splitlines()]
    titles = content.pop(0)
    senti = ['Positive', 'Negative', 'Anger', 'Anticipation', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise', 'Trust']
    # map each sentiment name to its column index (loop marker elided)
    sentid[sent] = titles.index(sent)
    # use the French-translation column as the word form
    frtitle = [val for val in titles if '(fr)' in val]
    frid = titles.index(frtitle[0])
    sentidict = [[line[frid].lower(), [line[sentid[sent]] for sent in senti]] for line in content]
    # one list per sentiment: label first, then every flagged word
    pos = ['positive'] + [line[0] for line in sentidict if line[1][0] == '1']
    neg = ['negative'] + [line[0] for line in sentidict if line[1][1] == '1']
    anger = ['anger'] + [line[0] for line in sentidict if line[1][2] == '1']
    anticipation = ['anticipation'] + [line[0] for line in sentidict if line[1][3] == '1']
    disgust = ['disgust'] + [line[0] for line in sentidict if line[1][4] == '1']
    fear = ['fear'] + [line[0] for line in sentidict if line[1][5] == '1']
    joy = ['joy'] + [line[0] for line in sentidict if line[1][6] == '1']
    sadness = ['sadness'] + [line[0] for line in sentidict if line[1][7] == '1']
    surprise = ['surprise'] + [line[0] for line in sentidict if line[1][8] == '1']
    trust = ['trust'] + [line[0] for line in sentidict if line[1][9] == '1']
    # NOTE(review): hard-coded output path — not portable to Windows.
    with open('/tmp/tgenemo.csv', 'w') as f :
        for val in [pos, neg, anger, anticipation, disgust, fear, joy, sadness, surprise, trust] :
            f.write('\t'.join(val) + '\n')
def countsentfromprof(prof, encoding, sentidict) :
    # Count sentiment words in a profile CSV; the f.read() and the rest of
    # the computation are elided in this view.
    with codecs.open(prof, 'r', encoding) as f :
        content = [line.split(';') for line in content.splitlines()]
        content = [[line[0], [int(val) for val in line[1:]]] for line in content]
        content = dict(content)
1087 def iratolexico(infile, outfile, encoding) :
1088 with codecs.open(infile, 'r', encoding) as f :
1090 if line.startswith('**** ') :