X-Git-Url: http://www.iramuteq.org/git?a=blobdiff_plain;f=tableau.py;h=204df9ce69520c3895a5cd8d93e17e42278808ed;hb=refs%2Fheads%2F3.0;hp=6fa6abc868263e823a8cdce4027e47d72b869edc;hpb=6919f2ef8d85c176c7be824b606c4b71142e10fd;p=iramuteq diff --git a/tableau.py b/tableau.py index 6fa6abc..204df9c 100644 --- a/tableau.py +++ b/tableau.py @@ -1,47 +1,59 @@ # -*- coding: utf-8 -*- #Author: Pierre Ratinaud -#Copyright (c) 2010 Pierre Ratinaud +#Copyright (c) 2008-2020 Pierre Ratinaud +#modification pour python 3 : Laurent Mérat, 6x7 - mai 2020 #License: GNU/GPL +""" +Removes HTML or XML character references and entities from a text string. + +@param text The HTML (or XML) source text. +@return The plain text, as a Unicode string, if necessary. +""" + +#------------------------------------ +# import des modules python +#------------------------------------ import codecs import sys import xlrd import ooolib import os -import tempfile +from copy import copy import re -import htmlentitydefs -import shelve -from functions import DoConf +import html.entities +#import shelve +import json from uuid import uuid4 -from chemins import PathOut import logging +#------------------------------------ +# import des fichiers du projet +#------------------------------------ +from functions import DoConf +from chemins import PathOut + + log = logging.getLogger('iramuteq.tableau') -## -# Removes HTML or XML character references and entities from a text string. -# -# @param text The HTML (or XML) source text. -# @return The plain text, as a Unicode string, if necessary. def unescape(text): def fixup(m): #apos is not in the dictionnary - htmlentitydefs.name2codepoint['apos'] = ord("'") + html.entities.name2codepoint['apos'] = ord("'") text = m.group(0) if text[:2] == "&#": # character reference try: if text[:3] == "&#x": - return unichr(int(text[3:-1], 16)) + return chr(int(text[3:-1], 16)) else: - return unichr(int(text[2:-1])) + return chr(int(text[2:-1])) except ValueError: pass else: try: - text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) + text = chr(html.entities.name2codepoint[text[1:-1]]) except KeyError: pass return text # leave as is @@ -53,25 +65,27 @@ def UpdateDico(Dico, word, line): Dico[word][1].append(line) else: Dico[word] = [1, [line]] - + def copymatrix(tableau): log.info('copy matrix') copymat = Tableau(tableau.parent, parametres = tableau.parametres) - copymat.linecontent = tableau.linecontent - copymat.csvtable = tableau.csvtable - copymat.pathout = tableau.pathout - copymat.colnames = tableau.colnames - copymat.rownb = tableau.rownb - copymat.colnb = tableau.colnb + copymat.linecontent = copy(tableau.linecontent) + copymat.csvtable = copy(tableau.csvtable) + copymat.pathout = copy(tableau.pathout) + copymat.colnames = copy(tableau.colnames) + copymat.rownb = copy(tableau.rownb) + copymat.colnb = copy(tableau.colnb) if copymat.csvtable is None : copymat.open() return copymat + class Tableau() : + def __init__(self, parent, filename = '', filetype = 'csv', encodage = 'utf-8', parametres = None) : self.parent = parent if parametres is None : - self.parametres = DoConf(os.path.join(self.parent.UserConfigPath,'matrix.cfg')).getoptions('matrix') + self.parametres = DoConf(self.parent.ConfigPath['matrix']).getoptions('matrix') self.parametres['pathout'] = PathOut(filename, 'matrix').mkdirout() self.parametres['originalpath'] = filename self.parametres['filetype'] = filetype @@ -82,7 +96,7 @@ class Tableau() : self.parametres['type'] = 'matrix' self.parametres['matrix_name'] = os.path.basename(filename) self.parametres['uuid'] = str(uuid4()) - self.parametres['shelves'] = os.path.join(self.parametres['pathout'], 'shelve.db') + self.parametres['shelves'] = os.path.join(self.parametres['pathout'], 'shelve') self.parametres['ira'] = os.path.join(self.parametres['pathout'], 'Matrix.ira') else : self.parametres = parametres @@ -105,10 +119,8 @@ class Tableau() : #self.parametres = self.parametre def read_tableau(self, fileout) : - d=shelve.open(fileout) - #self.parametres = d['parametres'] - #if 'syscoding' not in self.parametres : - # self.parametres['syscoding'] = sys.getdefaultencoding() + with open(fileout, 'r', encoding='utf8') as f : + d = json.load(f) self.actives = d['actives'] self.sups = d['sups'] self.classes = d['classes'] @@ -123,17 +135,16 @@ class Tableau() : self.lchi = d['lchi'] if 'content' in d : self.content = d['content'] - d.close() - + def open(self): - print 'open matrix' + print('open matrix') self.read_csvfile() self.colnames = self.csvtable[0][1:] self.rownb = len(self.linecontent) self.colnb = len(self.linecontent[0]) def save_tableau(self, fileout) : - d=shelve.open(fileout) + d = {} d['parametres'] = self.parametres d['actives'] = self.actives d['sups'] = self.sups @@ -148,7 +159,8 @@ class Tableau() : if 'lchi' in dir(self) : d['lchi'] = self.lchi d['content'] = self.content - d.close() + with open(fileout, 'w', encoding='utf8') as f : + json.dump(d, f) def make_content(self) : self.pathout.createdir(self.parametres['pathout']) @@ -163,13 +175,19 @@ class Tableau() : DoConf().makeoptions(['matrix'],[self.parametres], self.parametres['ira']) self.parent.history.addMatrix(self.parametres) + def make_content_simple(self): + self.parametres['csvfile'] = os.path.join(self.parametres['pathout'], 'csvfile.csv') + self.make_tmpfile() + DoConf().makeoptions(['matrix'],[self.parametres], self.parametres['ira']) + self.parent.history.addMatrix(self.parametres) + def read_xls(self) : #FIXME : encodage #print '############## ENCODING IN EXCEL #######################' #datafile = xlrd.open_workbook(self.parametre['filename'], encoding_override="azerazerazer") datafile = xlrd.open_workbook(self.parametres['originalpath']) datatable = datafile.sheet_by_index(self.parametres['sheetnb']-1) - self.linecontent = [[str(datatable.cell_value(rowx = i, colx = j)).replace(u'"','').replace(u';','').replace(u'\n',' ').strip() for j in range(datatable.ncols)] for i in range(datatable.nrows)] + self.linecontent = [[str(datatable.cell_value(rowx = i, colx = j)).replace('"','').replace(';',' ').replace('\n',' ').replace('\r', ' ').replace('\t', ' ').strip() for j in range(datatable.ncols)] for i in range(datatable.nrows)] def read_ods(self) : doc = ooolib.Calc(opendoc=self.parametres['originalpath']) @@ -180,19 +198,19 @@ class Tableau() : for col in range(1, cols + 1): data = doc.get_cell_value(col, row) if data is not None : - ligne.append(unescape(data[1].replace(u'"','').replace(u';','').replace(u'\n', ' ').strip())) + ligne.append(unescape(data[1].replace('"','').replace(';',' ').replace('\n', ' ').replace('\t', ' ').strip())) else : ligne.append('') self.linecontent.append(ligne) def read_csv(self) : with codecs.open(self.parametres['originalpath'], 'r', self.parametres['encodage']) as f : - content = f.read() + content = f.read() self.linecontent = [line.split(self.parametres['colsep']) for line in content.splitlines()] - self.linecontent = [[val.replace(u'"','').strip() for val in line] for line in self.linecontent] + self.linecontent = [[val.replace('"','').replace(';',' ').replace('\t', ' ').strip() for val in line] for line in self.linecontent] def write_csvfile(self) : - with open(self.parametres['csvfile'], 'w') as f : + with open(self.parametres['csvfile'], 'w', encoding='utf8') as f : f.write('\n'.join(['\t'.join(line) for line in self.csvtable])) def make_tmpfile(self) : @@ -203,7 +221,7 @@ class Tableau() : self.linecontent.pop(0) self.rownb -= 1 else : - self.colnames = ['_'.join([u'colonne', `i`]) for i in range(self.colnb)] + self.colnames = ['_'.join(['colonne', repr(i)]) for i in range(self.colnb)] if self.firstcolisrownames : self.rownames = [row[0] for row in self.linecontent] self.linecontent = [row[1:] for row in self.linecontent] @@ -212,23 +230,38 @@ class Tableau() : self.colnames.pop(0) self.check_rownames() else : - self.rownames = [`i` for i in range(self.rownb)] - self.idname = u'identifiant' - self.csvtable = [[self.idname] + self.colnames] + [[self.rownames[i]] + self.linecontent[i] for i in range(len(self.rownames))] + self.rownames = [repr(i) for i in range(self.rownb)] + self.idname = 'identifiant' + self.csvtable = [[self.idname] + self.colnames] + [[self.rownames[i]] + self.linecontent[i] for i in range(len(self.rownames))] self.write_csvfile() def read_csvfile(self): - with codecs.open(self.parametres['csvfile'], 'r', self.parametres['syscoding']) as f: + with open(self.parametres['csvfile'], 'r', encoding='utf8') as f: self.csvtable = [line.split('\t') for line in f.read().splitlines()] self.linecontent = [line[1:] for line in self.csvtable] self.linecontent.pop(0) + def extractfrommod(self, col, val): + return ([''] + self.colnames) + [line for line in self.csvtable[1:] if line[col + 1] == val] + + def splitfromvar(self, col): + newtabs = {} + for line in self.csvtable[1:] : + mod = line[col+1] + if mod in newtabs : + newtabs[mod].append(line) + else : + newtabs[mod] = [line] + for mod in newtabs : + newtabs[mod].insert(0, [''] + self.colnames) + return newtabs + def check_rownames(self) : if len(self.rownames) == len(list(set(self.rownames))) : - print u'row names ok' + print('row names ok') else : - print u'les noms de lignes ne sont pas uniques, ils sont remplaces' - self.rownames = [`i` for i in range(self.rownb)] + print('les noms de lignes ne sont pas uniques, ils sont remplaces') + self.rownames = [repr(i) for i in range(self.rownb)] def make_unique_list(self) : return list(set([val for line in self.linecontent for val in line if val.strip() != ''])) @@ -240,31 +273,34 @@ class Tableau() : if forme.strip() != '' : UpdateDico(dico, forme, i) return dico - + def select_col(self, listcol) : - dc = dict(zip(listcol, listcol)) + dc = dict(list(zip(listcol, listcol))) selcol = [[val for i, val in enumerate(row) if i in dc] for row in self.linecontent] return selcol + def countmultiple(self, liscol): + return self.make_dico(self.select_col(liscol)) + def getactlistfromselection(self, listact) : selcol = self.select_col(listact) self.actives = self.make_dico(selcol) - return [[val, self.actives[val][0]] for val in self.actives] + return [[val, self.actives[val][0]] for val in self.actives] def make_listactives(self) : - self.listactives = [val for val in self.actives if val != 'NA' and self.actives[val] >= self.parametres['mineff']] - + self.listactives = [val for val in self.actives if val != 'NA' and self.actives[val][0] >= self.parametres['mineff']] + def write01(self, fileout, dico, linecontent) : if self.listactives is None : - self.listactives = [val for val in dico if val != 'NA' and dico[val] >= self.parametres['mineff']] + self.listactives = [val for val in dico if val != 'NA' and dico[val][0] >= self.parametres['mineff']] out = [['0' for forme in self.listactives] for line in linecontent] for i, forme in enumerate(self.listactives) : for line in dico[forme][1] : out[line][i] = '1' - #out = [[self.rownames[i]] + out[i] for i in range(len(linecontent))] + #out = [[self.rownames[i]] + out[i] for i in range(len(linecontent))] #out.insert(0,[self.idname] + self.listactives) out.insert(0, self.listactives) - with open(fileout, 'w') as f : + with open(fileout, 'w', encoding='utf8') as f : f.write('\n'.join([';'.join(line) for line in out])) def make_01_from_selection(self, listact, listsup = None, dowrite = True) : @@ -279,10 +315,10 @@ class Tableau() : for i, ligne in enumerate(self.linecontent) : for forme in ligne: if len(forme) >= 1: - if forme[0] == u'*': + if forme[0] == '*': UpdateDico(self.sups, forme, i) else: - UpdateDico(self.actives, forme, i) + UpdateDico(self.actives, forme, i) self.listactives = [val for val in self.actives if self.actives[val][0] >= self.parametres['mineff']] table = [['0' for i in range(len(self.listactives))] for j in range(self.rownb)] for i, val in enumerate(self.listactives) : @@ -292,15 +328,15 @@ class Tableau() : #table = [[self.rownames[i]] + table[i] for i in range(len(self.rownames))] #table.insert(0, [self.idname] + self.listactives) table.insert(0, self.listactives) - with open(fileout, 'w') as f: + with open(fileout, 'w', encoding='utf8') as f: f.write('\n'.join([';'.join(line) for line in table])) def printtable(self, filename, Table, sep = ';'): - with open(filename, 'w') as f : + with open(filename, 'w', encoding='utf8') as f : f.write('\n'.join([sep.join(line) for line in Table])) - + def buildprofil(self) : - with open(self.pathout['uce'], 'rU') as filein : + with open(self.pathout['uce'], 'r', encoding='utf8') as filein : content = filein.readlines() content.pop(0) lsucecl = [] @@ -316,7 +352,6 @@ class Tableau() : self.clnb = len(dicocl) - 1 else: self.clnb = len(dicocl) - tablecont = [] for active in self.listactives : line = [active] @@ -328,8 +363,7 @@ class Tableau() : if active in self.linecontent[uce]: line[i + 1] += 1 if sum(line[1:]) > self.parametres['mineff']: - tablecont.append([line[0]] + [`don` for don in line if type(don) == type(1)]) - + tablecont.append([line[0]] + [repr(don) for don in line if type(don) == type(1)]) tablecontet = [] for sup in self.sups : line = [sup] @@ -340,10 +374,10 @@ class Tableau() : if cl == i + 1 : if sup in self.linecontent[uce]: line[i + 1] += 1 - tablecontet.append([line[0]] + [`don` for don in line if type(don) == type(1)]) - + tablecontet.append([line[0]] + [repr(don) for don in line if type(don) == type(1)]) + self.printtable(self.pathout['ContEtOut'], tablecontet) - self.printtable(self.pathout['Contout'], tablecont) + self.printtable(self.pathout['Contout'], tablecont) def get_colnames(self) : return self.colnames[:] @@ -357,15 +391,3 @@ class Tableau() : out[j][i] = '1' out.insert(0,[act for act in la]) return out - - - -#filename = 'corpus/cent3.csv' -#filename = 'corpus/agir2sortie.csv' -#tab = Tableau('',filename, encodage='utf-8') -#tab.parametre['csvfile'] = tab.parametre['filename'] -#tab.parametre['sep'] = '\t' -#tab.firstrowiscolnames = True -#tab.firstcolisrownames = False -#tab.read_data() -#tab.make_01('corpus/matrice01.csv')