1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2010 Pierre Ratinaud
15 from functions import DoConf
16 from uuid import uuid4
17 from chemins import PathOut
20 log = logging.getLogger('iramuteq.tableau')
23 # Removes HTML or XML character references and entities from a text string.
25 # @param text The HTML (or XML) source text.
26 # @return The plain text, as a Unicode string, if necessary.
30 #'apos' is not in the name2codepoint dictionary
31 htmlentitydefs.name2codepoint['apos'] = ord("'")
37 return unichr(int(text[3:-1], 16))
39 return unichr(int(text[2:-1]))
44 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
47 return text # leave as is
48 return re.sub("&#?\w+;", fixup, text)
50 def UpdateDico(Dico, word, line):
53 Dico[word][1].append(line)
55 Dico[word] = [1, [line]]
57 def copymatrix(tableau):
58 log.info('copy matrix')
59 copymat = Tableau(tableau.parent, parametres = tableau.parametres)
60 copymat.linecontent = copy(tableau.linecontent)
61 copymat.csvtable = copy(tableau.csvtable)
62 copymat.pathout = copy(tableau.pathout)
63 copymat.colnames = copy(tableau.colnames)
64 copymat.rownb = copy(tableau.rownb)
65 copymat.colnb = copy(tableau.colnb)
66 if copymat.csvtable is None :
71 def __init__(self, parent, filename = '', filetype = 'csv', encodage = 'utf-8', parametres = None) :
73 if parametres is None :
74 self.parametres = DoConf(self.parent.ConfigPath['matrix']).getoptions('matrix')
75 self.parametres['pathout'] = PathOut(filename, 'matrix').mkdirout()
76 self.parametres['originalpath'] = filename
77 self.parametres['filetype'] = filetype
78 self.parametres['encodage'] = encodage
79 #self.parametre['pathout'] = os.path.dirname(os.path.abspath(filename))
80 self.parametres['mineff'] = 3
81 self.parametres['syscoding'] = sys.getdefaultencoding()
82 self.parametres['type'] = 'matrix'
83 self.parametres['matrix_name'] = os.path.basename(filename)
84 self.parametres['uuid'] = str(uuid4())
85 self.parametres['shelves'] = os.path.join(self.parametres['pathout'], 'shelve.db')
86 self.parametres['ira'] = os.path.join(self.parametres['pathout'], 'Matrix.ira')
88 self.parametres = parametres
89 self.pathout = PathOut(filename = filename, dirout = self.parametres['pathout'])
93 self.listactives = None
98 self.firstrowiscolnames = True
100 self.firstcolisrownames = True
105 #self.parametres = self.parametre
107 def read_tableau(self, fileout) :
108 d=shelve.open(fileout)
109 #self.parametres = d['parametres']
110 #if 'syscoding' not in self.parametres :
111 # self.parametres['syscoding'] = sys.getdefaultencoding()
112 self.actives = d['actives']
113 self.sups = d['sups']
114 self.classes = d['classes']
115 self.listactives = d['listactives']
117 self.listet = d['listet']
118 if 'selected_col' in d :
119 self.selected_col = d['selected_col']
121 self.datas = d['datas']
123 self.lchi = d['lchi']
125 self.content = d['content']
131 self.colnames = self.csvtable[0][1:]
132 self.rownb = len(self.linecontent)
133 self.colnb = len(self.linecontent[0])
135 def save_tableau(self, fileout) :
136 d=shelve.open(fileout)
137 d['parametres'] = self.parametres
138 d['actives'] = self.actives
139 d['sups'] = self.sups
140 d['classes'] = self.classes
141 d['listactives'] = self.listactives
142 if 'listet' in dir(self) :
143 d['listet'] = self.listet
144 if 'selected_col' in dir(self) :
145 d['selected_col'] = self.selected_col
146 if 'datas' in dir(self) :
147 d['datas'] = self.datas
148 if 'lchi' in dir(self) :
149 d['lchi'] = self.lchi
150 d['content'] = self.content
153 def make_content(self) :
154 self.pathout.createdir(self.parametres['pathout'])
155 if self.parametres['filetype'] == 'csv' :
157 elif self.parametres['filetype'] == 'xls' :
159 elif self.parametres['filetype'] == 'ods' :
161 self.parametres['csvfile'] = os.path.join(self.parametres['pathout'], 'csvfile.csv')
163 print self.parametres
164 DoConf().makeoptions(['matrix'],[self.parametres], self.parametres['ira'])
165 self.parent.history.addMatrix(self.parametres)
167 def make_content_simple(self):
168 self.parametres['csvfile'] = os.path.join(self.parametres['pathout'], 'csvfile.csv')
170 DoConf().makeoptions(['matrix'],[self.parametres], self.parametres['ira'])
171 self.parent.history.addMatrix(self.parametres)
175 #print '############## ENCODING IN EXCEL #######################'
176 #datafile = xlrd.open_workbook(self.parametre['filename'], encoding_override="azerazerazer")
177 datafile = xlrd.open_workbook(self.parametres['originalpath'])
178 datatable = datafile.sheet_by_index(self.parametres['sheetnb']-1)
179 self.linecontent = [[str(datatable.cell_value(rowx = i, colx = j)).replace(u'"','').replace(u';',' ').replace(u'\n',' ').replace('\r', ' ').replace('\t', ' ').strip() for j in range(datatable.ncols)] for i in range(datatable.nrows)]
182 doc = ooolib.Calc(opendoc=self.parametres['originalpath'])
183 doc.set_sheet_index(0)
184 (cols, rows) = doc.get_sheet_dimensions()
185 for row in range(1, rows + 1):
187 for col in range(1, cols + 1):
188 data = doc.get_cell_value(col, row)
189 if data is not None :
190 ligne.append(unescape(data[1].replace(u'"','').replace(u';',' ').replace(u'\n', ' ').replace('\t', ' ').strip()))
193 self.linecontent.append(ligne)
196 with codecs.open(self.parametres['originalpath'], 'r', self.parametres['encodage']) as f :
198 self.linecontent = [line.split(self.parametres['colsep']) for line in content.splitlines()]
199 self.linecontent = [[val.replace(u'"','').replace(u';',' ').replace('\t', ' ').strip() for val in line] for line in self.linecontent]
def write_csvfile(self) :
    """Dump ``self.csvtable`` to the file named by ``self.parametres['csvfile']``.

    Cells of a row are joined with tabs and rows are joined with newlines;
    no trailing newline is written.
    """
    target = self.parametres['csvfile']
    with open(target, 'w') as handle :
        rows = ['\t'.join(cells) for cells in self.csvtable]
        handle.write('\n'.join(rows))
205 def make_tmpfile(self) :
206 self.rownb = len(self.linecontent)
207 self.colnb = len(self.linecontent[0])
208 if self.firstrowiscolnames :
209 self.colnames = self.linecontent[0]
210 self.linecontent.pop(0)
213 self.colnames = ['_'.join([u'colonne', `i`]) for i in range(self.colnb)]
214 if self.firstcolisrownames :
215 self.rownames = [row[0] for row in self.linecontent]
216 self.linecontent = [row[1:] for row in self.linecontent]
218 self.idname = self.colnames[0]
220 self.check_rownames()
222 self.rownames = [`i` for i in range(self.rownb)]
223 self.idname = u'identifiant'
224 self.csvtable = [[self.idname] + self.colnames] + [[self.rownames[i]] + self.linecontent[i] for i in range(len(self.rownames))]
def read_csvfile(self):
    """Reload the table from the tab-separated file ``parametres['csvfile']``.

    The file is decoded with ``parametres['syscoding']``.  Afterwards:

    * ``self.csvtable`` holds every line split on tabs (first line included);
    * ``self.linecontent`` holds every line but the first, each without its
      first cell.
    """
    path = self.parametres['csvfile']
    encoding = self.parametres['syscoding']
    with codecs.open(path, 'r', encoding) as source:
        raw = source.read()
    self.csvtable = [row.split('\t') for row in raw.splitlines()]
    # Strip the leading cell of every row, then discard the first row.
    self.linecontent = [row[1:] for row in self.csvtable]
    self.linecontent.pop(0)
def extractfrommod(self, col, val):
    """Return the sub-table of rows whose variable *col* equals *val*.

    @param col index of the variable among the data columns; the cell
               compared is ``line[col + 1]`` because index 0 of each
               ``csvtable`` row is the row identifier.
    @param val value the cell must be equal to.
    @return a list of rows: first a header row ``[''] + self.colnames``,
            then every matching row of ``self.csvtable[1:]``.
    """
    # The header must be a row of its own (as splitfromvar does); adding it
    # with a plain list concatenation would scatter the column names as
    # top-level items in front of the data rows.
    header = [''] + self.colnames
    matching = [line for line in self.csvtable[1:] if line[col + 1] == val]
    return [header] + matching
236 def splitfromvar(self, col):
238 for line in self.csvtable[1:] :
241 newtabs[mod].append(line)
243 newtabs[mod] = [line]
245 newtabs[mod].insert(0, [''] + self.colnames)
248 def check_rownames(self) :
249 if len(self.rownames) == len(list(set(self.rownames))) :
250 print u'row names ok'
252 print u'les noms de lignes ne sont pas uniques, ils sont remplaces'
253 self.rownames = [`i` for i in range(self.rownb)]
def make_unique_list(self) :
    """Return the distinct non-blank cell values found in ``self.linecontent``.

    A cell is ignored when it is empty once stripped of surrounding
    whitespace.  The result is a list built from a set.
    """
    distinct = set()
    for row in self.linecontent :
        for cell in row :
            if cell.strip() != '' :
                distinct.add(cell)
    return list(distinct)
258 def make_dico(self, selcol) :
260 for i, line in enumerate(selcol) :
262 if forme.strip() != '' :
263 UpdateDico(dico, forme, i)
266 def select_col(self, listcol) :
267 dc = dict(zip(listcol, listcol))
268 selcol = [[val for i, val in enumerate(row) if i in dc] for row in self.linecontent]
def countmultiple(self, liscol):
    """Build the dictionary of forms for the columns listed in *liscol*.

    The columns are extracted with ``self.select_col`` and the result is
    handed to ``self.make_dico``, whose return value is returned unchanged.
    """
    selection = self.select_col(liscol)
    return self.make_dico(selection)
def getactlistfromselection(self, listact) :
    """Recompute ``self.actives`` from the columns in *listact*.

    ``self.actives`` is replaced by ``self.make_dico`` applied to the
    selected columns.

    @return a list of ``[form, first item of its entry]`` pairs, one per key
            of ``self.actives``.
    """
    self.actives = self.make_dico(self.select_col(listact))
    pairs = []
    for forme, entry in self.actives.items() :
        pairs.append([forme, entry[0]])
    return pairs
def make_listactives(self) :
    """Fill ``self.listactives`` with the forms frequent enough to be kept.

    A form of ``self.actives`` is kept when it is not ``'NA'`` and its count
    is at least ``self.parametres['mineff']``.
    """
    threshold = self.parametres['mineff']
    # Each entry is [count, [line indexes]]: compare the count, not the
    # whole entry (a list compared to a number is never a meaningful test).
    self.listactives = [
        val for val in self.actives
        if val != 'NA' and self.actives[val][0] >= threshold
    ]
282 def write01(self, fileout, dico, linecontent) :
283 if self.listactives is None :
284 self.listactives = [val for val in dico if val != 'NA' and dico[val] >= self.parametres['mineff']]
285 out = [['0' for forme in self.listactives] for line in linecontent]
286 for i, forme in enumerate(self.listactives) :
287 for line in dico[forme][1] :
289 #out = [[self.rownames[i]] + out[i] for i in range(len(linecontent))]
290 #out.insert(0,[self.idname] + self.listactives)
291 out.insert(0, self.listactives)
292 with open(fileout, 'w') as f :
293 f.write('\n'.join([';'.join(line) for line in out]))
def make_01_from_selection(self, listact, listsup = None, dowrite = True) :
    """Build the active (and optionally supplementary) form dictionaries.

    @param listact columns used for the active forms; the resulting
                   dictionary is stored in ``self.actives`` and passed to
                   ``self.write01`` together with the selected columns, the
                   output path being ``self.pathout['mat01.csv']``.
    @param listsup optional columns for supplementary forms; when given,
                   their dictionary is stored in ``self.sups``.
    @param dowrite accepted but not consulted: ``write01`` is always called.
    """
    active_cols = self.select_col(listact)
    self.actives = self.make_dico(active_cols)
    self.write01(self.pathout['mat01.csv'], self.actives, active_cols)
    if listsup is None :
        return
    self.sups = self.make_dico(self.select_col(listsup))
303 def make_01_alc_format(self, fileout) :
304 for i, ligne in enumerate(self.linecontent) :
308 UpdateDico(self.sups, forme, i)
310 UpdateDico(self.actives, forme, i)
311 self.listactives = [val for val in self.actives if self.actives[val][0] >= self.parametres['mineff']]
312 table = [['0' for i in range(len(self.listactives))] for j in range(self.rownb)]
313 for i, val in enumerate(self.listactives) :
314 for j, line in enumerate(self.linecontent) :
317 #table = [[self.rownames[i]] + table[i] for i in range(len(self.rownames))]
318 #table.insert(0, [self.idname] + self.listactives)
319 table.insert(0, self.listactives)
320 with open(fileout, 'w') as f:
321 f.write('\n'.join([';'.join(line) for line in table]))
def printtable(self, filename, Table, sep = ';'):
    """Write *Table* to *filename* as delimited text.

    @param filename path of the file to (over)write.
    @param Table    iterable of rows, each an iterable of strings.
    @param sep      separator placed between the cells of a row.

    Rows are separated by newlines; no trailing newline is written.
    """
    with open(filename, 'w') as out :
        rows = [sep.join(cells) for cells in Table]
        out.write('\n'.join(rows))
327 def buildprofil(self) :
328 with open(self.pathout['uce'], 'rU') as filein :
329 content = filein.readlines()
333 for i, line in enumerate(content) :
334 line = line.replace('\n', '').replace('"', '').split(';')
335 UpdateDico(dicocl, line[1], i)
336 lsucecl.append([int(line[0]) - 1, int(line[1])])
337 self.classes = lsucecl
338 nlist = [[nbuce, cl] for nbuce, cl in lsucecl if cl != 0]
339 self.ucecla = len(nlist)
341 self.clnb = len(dicocl) - 1
343 self.clnb = len(dicocl)
346 for active in self.listactives :
348 line0 = [0] * self.clnb
350 for i in range(0, self.clnb) :
351 for uce, cl in nlist:
353 if active in self.linecontent[uce]:
355 if sum(line[1:]) > self.parametres['mineff']:
356 tablecont.append([line[0]] + [`don` for don in line if type(don) == type(1)])
359 for sup in self.sups :
361 line0 = [0] * self.clnb
363 for i in range(0, self.clnb) :
364 for uce, cl in nlist:
366 if sup in self.linecontent[uce]:
368 tablecontet.append([line[0]] + [`don` for don in line if type(don) == type(1)])
370 self.printtable(self.pathout['ContEtOut'], tablecontet)
371 self.printtable(self.pathout['Contout'], tablecont)
def get_colnames(self) :
    """Return a shallow copy of ``self.colnames``.

    The copy is taken with a full slice, so changing the returned sequence
    does not affect the one stored on the instance.
    """
    names = self.colnames
    return names[:]
376 def make_table_from_classe(self, cl, la) :
377 ln = [line[0] for line in self.classes if line[1] == cl]
378 out = [['0' for col in la] for line in ln]
379 for i, act in enumerate(la) :
380 for j, line in enumerate(ln) :
381 if line in self.actives[act][1] :
383 out.insert(0,[act for act in la])
388 #filename = 'corpus/cent3.csv'
389 #filename = 'corpus/agir2sortie.csv'
390 #tab = Tableau('',filename, encodage='utf-8')
391 #tab.parametre['csvfile'] = tab.parametre['filename']
392 #tab.parametre['sep'] = '\t'
393 #tab.firstrowiscolnames = True
394 #tab.firstcolisrownames = False
396 #tab.make_01('corpus/matrice01.csv')