1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2008-2020 Pierre Ratinaud
4 #modification pour python 3 : Laurent Mérat, 6x7 - mai 2020
8 Removes HTML or XML character references and entities from a text string.
10 @param text The HTML (or XML) source text.
11 @return The plain text, as a Unicode string, if necessary.
14 #------------------------------------
15 # import des modules python
16 #------------------------------------
26 from uuid import uuid4
29 #------------------------------------
30 # import des fichiers du projet
31 #------------------------------------
32 from functions import DoConf
33 from chemins import PathOut
36 log = logging.getLogger('iramuteq.tableau')
41 #apos is not in the dictionary
42 html.entities.name2codepoint['apos'] = ord("'")
48 return chr(int(text[3:-1], 16))
50 return chr(int(text[2:-1]))
55 text = chr(html.entities.name2codepoint[text[1:-1]])
58 return text # leave as is
59 return re.sub("&#?\w+;", fixup, text)
61 def UpdateDico(Dico, word, line):
64 Dico[word][1].append(line)
66 Dico[word] = [1, [line]]
68 def copymatrix(tableau):
69 log.info('copy matrix')
70 copymat = Tableau(tableau.parent, parametres = tableau.parametres)
71 copymat.linecontent = copy(tableau.linecontent)
72 copymat.csvtable = copy(tableau.csvtable)
73 copymat.pathout = copy(tableau.pathout)
74 copymat.colnames = copy(tableau.colnames)
75 copymat.rownb = copy(tableau.rownb)
76 copymat.colnb = copy(tableau.colnb)
77 if copymat.csvtable is None :
84 def __init__(self, parent, filename = '', filetype = 'csv', encodage = 'utf-8', parametres = None) :
86 if parametres is None :
87 self.parametres = DoConf(self.parent.ConfigPath['matrix']).getoptions('matrix')
88 self.parametres['pathout'] = PathOut(filename, 'matrix').mkdirout()
89 self.parametres['originalpath'] = filename
90 self.parametres['filetype'] = filetype
91 self.parametres['encodage'] = encodage
92 #self.parametre['pathout'] = os.path.dirname(os.path.abspath(filename))
93 self.parametres['mineff'] = 3
94 self.parametres['syscoding'] = sys.getdefaultencoding()
95 self.parametres['type'] = 'matrix'
96 self.parametres['matrix_name'] = os.path.basename(filename)
97 self.parametres['uuid'] = str(uuid4())
98 self.parametres['shelves'] = os.path.join(self.parametres['pathout'], 'shelve')
99 self.parametres['ira'] = os.path.join(self.parametres['pathout'], 'Matrix.ira')
101 self.parametres = parametres
102 self.pathout = PathOut(filename = filename, dirout = self.parametres['pathout'])
106 self.listactives = None
108 self.linecontent = []
109 self.isbinary = False
111 self.firstrowiscolnames = True
113 self.firstcolisrownames = True
118 #self.parametres = self.parametre
120 def read_tableau(self, fileout) :
121 d=shelve.open(fileout)
122 #self.parametres = d['parametres']
123 #if 'syscoding' not in self.parametres :
124 # self.parametres['syscoding'] = sys.getdefaultencoding()
125 self.actives = d['actives']
126 self.sups = d['sups']
127 self.classes = d['classes']
128 self.listactives = d['listactives']
130 self.listet = d['listet']
131 if 'selected_col' in d :
132 self.selected_col = d['selected_col']
134 self.datas = d['datas']
136 self.lchi = d['lchi']
138 self.content = d['content']
144 self.colnames = self.csvtable[0][1:]
145 self.rownb = len(self.linecontent)
146 self.colnb = len(self.linecontent[0])
148 def save_tableau(self, fileout) :
149 d=shelve.open(fileout)
150 d['parametres'] = self.parametres
151 d['actives'] = self.actives
152 d['sups'] = self.sups
153 d['classes'] = self.classes
154 d['listactives'] = self.listactives
155 if 'listet' in dir(self) :
156 d['listet'] = self.listet
157 if 'selected_col' in dir(self) :
158 d['selected_col'] = self.selected_col
159 if 'datas' in dir(self) :
160 d['datas'] = self.datas
161 if 'lchi' in dir(self) :
162 d['lchi'] = self.lchi
163 d['content'] = self.content
166 def make_content(self) :
167 self.pathout.createdir(self.parametres['pathout'])
168 if self.parametres['filetype'] == 'csv' :
170 elif self.parametres['filetype'] == 'xls' :
172 elif self.parametres['filetype'] == 'ods' :
174 self.parametres['csvfile'] = os.path.join(self.parametres['pathout'], 'csvfile.csv')
176 DoConf().makeoptions(['matrix'],[self.parametres], self.parametres['ira'])
177 self.parent.history.addMatrix(self.parametres)
179 def make_content_simple(self):
180 self.parametres['csvfile'] = os.path.join(self.parametres['pathout'], 'csvfile.csv')
182 DoConf().makeoptions(['matrix'],[self.parametres], self.parametres['ira'])
183 self.parent.history.addMatrix(self.parametres)
187 #print '############## ENCODING IN EXCEL #######################'
188 #datafile = xlrd.open_workbook(self.parametre['filename'], encoding_override="azerazerazer")
189 datafile = xlrd.open_workbook(self.parametres['originalpath'])
190 datatable = datafile.sheet_by_index(self.parametres['sheetnb']-1)
191 self.linecontent = [[str(datatable.cell_value(rowx = i, colx = j)).replace('"','').replace(';',' ').replace('\n',' ').replace('\r', ' ').replace('\t', ' ').strip() for j in range(datatable.ncols)] for i in range(datatable.nrows)]
194 doc = ooolib.Calc(opendoc=self.parametres['originalpath'])
195 doc.set_sheet_index(0)
196 (cols, rows) = doc.get_sheet_dimensions()
197 for row in range(1, rows + 1):
199 for col in range(1, cols + 1):
200 data = doc.get_cell_value(col, row)
201 if data is not None :
202 ligne.append(unescape(data[1].replace('"','').replace(';',' ').replace('\n', ' ').replace('\t', ' ').strip()))
205 self.linecontent.append(ligne)
208 with codecs.open(self.parametres['originalpath'], 'r', self.parametres['encodage']) as f :
210 self.linecontent = [line.split(self.parametres['colsep']) for line in content.splitlines()]
211 self.linecontent = [[val.replace('"','').replace(';',' ').replace('\t', ' ').strip() for val in line] for line in self.linecontent]
def write_csvfile(self):
    """Dump ``self.csvtable`` to the file named by ``self.parametres['csvfile']``.

    The cells of each row are joined with a tab and the rows are joined with
    a newline (no trailing newline). The file is written as UTF-8 and any
    previous content is overwritten.
    """
    with open(self.parametres['csvfile'], 'w', encoding='utf8') as handle:
        rows = ['\t'.join(cells) for cells in self.csvtable]
        payload = '\n'.join(rows)
        handle.write(payload)
217 def make_tmpfile(self) :
218 self.rownb = len(self.linecontent)
219 self.colnb = len(self.linecontent[0])
220 if self.firstrowiscolnames :
221 self.colnames = self.linecontent[0]
222 self.linecontent.pop(0)
225 self.colnames = ['_'.join(['colonne', repr(i)]) for i in range(self.colnb)]
226 if self.firstcolisrownames :
227 self.rownames = [row[0] for row in self.linecontent]
228 self.linecontent = [row[1:] for row in self.linecontent]
230 self.idname = self.colnames[0]
232 self.check_rownames()
234 self.rownames = [repr(i) for i in range(self.rownb)]
235 self.idname = 'identifiant'
236 self.csvtable = [[self.idname] + self.colnames] + [[self.rownames[i]] + self.linecontent[i] for i in range(len(self.rownames))]
def read_csvfile(self):
    """Load the tab-separated file named by ``self.parametres['csvfile']``.

    Sets two attributes:

    * ``self.csvtable`` -- every line of the file, split on tabs;
    * ``self.linecontent`` -- the same table without its first line and
      without the first cell of each remaining line.

    An empty file yields two empty lists instead of raising ``IndexError``.
    """
    with open(self.parametres['csvfile'], 'r', encoding='utf8') as f:
        self.csvtable = [line.split('\t') for line in f.read().splitlines()]
    # Skip the first line while slicing rather than building it and popping
    # it afterwards: same result for a non-empty file, no crash on an empty
    # one, and no O(n) shift of the list.
    self.linecontent = [row[1:] for row in self.csvtable[1:]]
def extractfrommod(self, col, val):
    """Return the rows of ``self.csvtable`` whose column ``col`` equals ``val``.

    The first line of ``self.csvtable`` is skipped, and ``col`` is shifted by
    one because the first cell of each row is not counted as a data column.

    The result is ``['', *self.colnames]`` followed by the matching rows.

    NOTE(review): the header cells are concatenated flat in front of the
    rows, not wrapped as a row of their own -- confirm against the callers
    whether ``[[''] + self.colnames] + rows`` was intended.
    """
    cell_index = col + 1
    header = [''] + self.colnames
    matching = [row for row in self.csvtable[1:] if row[cell_index] == val]
    return header + matching
248 def splitfromvar(self, col):
250 for line in self.csvtable[1:] :
253 newtabs[mod].append(line)
255 newtabs[mod] = [line]
257 newtabs[mod].insert(0, [''] + self.colnames)
260 def check_rownames(self) :
261 if len(self.rownames) == len(list(set(self.rownames))) :
262 print('row names ok')
264 print('les noms de lignes ne sont pas uniques, ils sont remplaces')
265 self.rownames = [repr(i) for i in range(self.rownb)]
def make_unique_list(self):
    """Return the distinct non-blank cell values found in ``self.linecontent``.

    A cell is ignored when it is empty once stripped of whitespace. The
    values come out of a set, so the order of the returned list is arbitrary.
    """
    distinct = set()
    for row in self.linecontent:
        for cell in row:
            if cell.strip() != '':
                distinct.add(cell)
    return list(distinct)
270 def make_dico(self, selcol) :
272 for i, line in enumerate(selcol) :
274 if forme.strip() != '' :
275 UpdateDico(dico, forme, i)
278 def select_col(self, listcol) :
279 dc = dict(list(zip(listcol, listcol)))
280 selcol = [[val for i, val in enumerate(row) if i in dc] for row in self.linecontent]
def countmultiple(self, liscol):
    """Return ``self.make_dico`` applied to the columns listed in ``liscol``.

    The columns are first extracted with ``self.select_col``; whatever
    ``self.make_dico`` returns for that selection is handed back unchanged.
    """
    selection = self.select_col(liscol)
    return self.make_dico(selection)
def getactlistfromselection(self, listact):
    """Rebuild ``self.actives`` from the columns in ``listact`` and summarise it.

    ``self.actives`` is replaced by the result of ``self.make_dico`` on the
    selected columns. The return value is a list of ``[key, first_item]``
    pairs, one per entry of ``self.actives``, where ``first_item`` is element
    0 of the entry's value.
    """
    self.actives = self.make_dico(self.select_col(listact))
    return [[form, entry[0]] for form, entry in self.actives.items()]
def make_listactives(self):
    """Fill ``self.listactives`` with the keys of ``self.actives`` worth keeping.

    A key is kept when it is not the literal ``'NA'`` and element 0 of its
    entry is at least ``self.parametres['mineff']``.
    """
    kept = []
    for form, entry in self.actives.items():
        if form == 'NA':
            continue
        if entry[0] >= self.parametres['mineff']:
            kept.append(form)
    self.listactives = kept
294 def write01(self, fileout, dico, linecontent) :
295 if self.listactives is None :
296 self.listactives = [val for val in dico if val != 'NA' and dico[val][0] >= self.parametres['mineff']]
297 out = [['0' for forme in self.listactives] for line in linecontent]
298 for i, forme in enumerate(self.listactives) :
299 for line in dico[forme][1] :
301 #out = [[self.rownames[i]] + out[i] for i in range(len(linecontent))]
302 #out.insert(0,[self.idname] + self.listactives)
303 out.insert(0, self.listactives)
304 with open(fileout, 'w', encoding='utf8') as f :
305 f.write('\n'.join([';'.join(line) for line in out]))
def make_01_from_selection(self, listact, listsup = None, dowrite = True):
    """Build ``self.actives`` (and optionally ``self.sups``) and write the matrix.

    The columns in ``listact`` are selected and turned into ``self.actives``
    via ``self.make_dico``; ``self.write01`` is then called with the path
    ``self.pathout['mat01.csv']``, that dictionary and the selected columns.
    When ``listsup`` is given, the same selection/dictionary step is done for
    it and stored in ``self.sups``.

    NOTE(review): ``dowrite`` is accepted but never read -- the file is
    written regardless of its value. Confirm with the callers before
    changing this.
    """
    active_cols = self.select_col(listact)
    self.actives = self.make_dico(active_cols)
    self.write01(self.pathout['mat01.csv'], self.actives, active_cols)
    if listsup is None:
        return
    self.sups = self.make_dico(self.select_col(listsup))
315 def make_01_alc_format(self, fileout) :
316 for i, ligne in enumerate(self.linecontent) :
320 UpdateDico(self.sups, forme, i)
322 UpdateDico(self.actives, forme, i)
323 self.listactives = [val for val in self.actives if self.actives[val][0] >= self.parametres['mineff']]
324 table = [['0' for i in range(len(self.listactives))] for j in range(self.rownb)]
325 for i, val in enumerate(self.listactives) :
326 for j, line in enumerate(self.linecontent) :
329 #table = [[self.rownames[i]] + table[i] for i in range(len(self.rownames))]
330 #table.insert(0, [self.idname] + self.listactives)
331 table.insert(0, self.listactives)
332 with open(fileout, 'w', encoding='utf8') as f:
333 f.write('\n'.join([';'.join(line) for line in table]))
def printtable(self, filename, Table, sep = ';'):
    """Write ``Table`` to ``filename`` as UTF-8 delimited text.

    Each row's cells are joined with ``sep`` and the rows are joined with a
    newline (no trailing newline). An existing file is overwritten.
    """
    with open(filename, 'w', encoding='utf8') as out:
        out.write('\n'.join(map(sep.join, Table)))
339 def buildprofil(self) :
340 with open(self.pathout['uce'], 'r', encoding='utf8') as filein :
341 content = filein.readlines()
345 for i, line in enumerate(content) :
346 line = line.replace('\n', '').replace('"', '').split(';')
347 UpdateDico(dicocl, line[1], i)
348 lsucecl.append([int(line[0]) - 1, int(line[1])])
349 self.classes = lsucecl
350 nlist = [[nbuce, cl] for nbuce, cl in lsucecl if cl != 0]
351 self.ucecla = len(nlist)
353 self.clnb = len(dicocl) - 1
355 self.clnb = len(dicocl)
357 for active in self.listactives :
359 line0 = [0] * self.clnb
361 for i in range(0, self.clnb) :
362 for uce, cl in nlist:
364 if active in self.linecontent[uce]:
366 if sum(line[1:]) > self.parametres['mineff']:
367 tablecont.append([line[0]] + [repr(don) for don in line if type(don) == type(1)])
369 for sup in self.sups :
371 line0 = [0] * self.clnb
373 for i in range(0, self.clnb) :
374 for uce, cl in nlist:
376 if sup in self.linecontent[uce]:
378 tablecontet.append([line[0]] + [repr(don) for don in line if type(don) == type(1)])
380 self.printtable(self.pathout['ContEtOut'], tablecontet)
381 self.printtable(self.pathout['Contout'], tablecont)
def get_colnames(self):
    """Return a shallow copy of ``self.colnames``.

    A full slice is returned so that callers can modify the result without
    touching the list stored on the instance.
    """
    names = self.colnames
    return names[:]
386 def make_table_from_classe(self, cl, la) :
387 ln = [line[0] for line in self.classes if line[1] == cl]
388 out = [['0' for col in la] for line in ln]
389 for i, act in enumerate(la) :
390 for j, line in enumerate(ln) :
391 if line in self.actives[act][1] :
393 out.insert(0,[act for act in la])