X-Git-Url: http://www.iramuteq.org/git?a=blobdiff_plain;f=import_txm.py;h=37ffa2e71185a53aeecc002291ee45be18b3fbb6;hb=refs%2Fheads%2F3.0;hp=bc55fc2a378ff6e18424a9ac622567d5dae798a4;hpb=54fef96ad151ba25920f3e589b39a83c3f62ae2c;p=iramuteq diff --git a/import_txm.py b/import_txm.py index bc55fc2..37ffa2e 100644 --- a/import_txm.py +++ b/import_txm.py @@ -1,20 +1,16 @@ -#!/bin/env python # -*- coding: utf-8 -*- #Author: Pierre Ratinaud -#Copyright (c) 2013 Pierre Ratinaud +#Copyright (c) 2008-2020 Pierre Ratinaud +#modification pour python 3 : Laurent Mérat, 6x7 - mai 2020 #License: GNU/GPL - +#------------------------------------ +# import des modules python +#------------------------------------ import os import xml.sax import glob - - -#infiledir = '/home/pierre/TXM/corpora/voeux-bin/txm/VOEUX/' -#fileout = 'VOEUXExportfromTXM.txt' - - class TXMParser(xml.sax.ContentHandler) : def __init__(self, fileout, encodage_out) : self.fileout = fileout @@ -43,37 +39,34 @@ class TXMParser(xml.sax.ContentHandler) : def characters(self, content) : if self.name == 'txm:form' : - if content not in [u'', u' ', u'\n', '\r'] : + if content not in ['', ' ', '\n', '\r'] : self.sent.append(content.rstrip('\n\r')) #self.fileout.write(content.encode('utf8')) def text2stars(self, attrs) : - stars = ['_'.join(val).replace(' ', '_').replace("'", '_').replace('/','').replace('.','').replace(';', '').replace(':', '').replace(u'·','') for val in attrs.items()] - stars = [''.join([u'*', val]) for val in stars] - stars = u'**** ' + ' '.join(stars) - self.fileout.write(stars.encode(self.encodage_out)) + stars = ['_'.join(val).replace(' ', '_').replace("'", '_').replace('/','').replace('.','').replace(';', '').replace(':', '').replace('·','') for val in list(attrs.items())] + stars = [''.join(['*', val]) for val in stars] + stars = '**** ' + ' '.join(stars) + self.fileout.write(stars) self.fileout.write('\n') def printsent(self) : if self.sent != [] : sent = ' ' + ' '.join(self.sent) - for val in [u' .', u' ,', u' ;', u' :', u' ?', u' !', u' -'] : + for val in [' .', ' ,', ' ;', ' :', ' ?', ' !', ' -'] : sent = sent.replace(val, val.strip()) sent = sent.replace("' ", "'") - self.fileout.write(sent.encode(self.encodage_out)) + self.fileout.write(sent) self.sent = [] - - -class TXM2IRA : - def __init__(self, pathin, fileout, encodage_in, encodage_out) : +def TXM2IRA(pathin, fileout, encodage_in, encodage_out) : parser = xml.sax.make_parser() files = glob.glob(os.path.join(pathin,'*.xml')) + if len(files) == 0 : + return 'nofile' with open(fileout, 'w') as fout : parser.setContentHandler(TXMParser(fout, encodage_out)) for f in files : parser.parse(open(f, 'r')) fout.write('\n\n') - print 'done' - -#TXM2IRA(infiledir, fileout) + return None