1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2008-2020 Pierre Ratinaud
4 #modification pour python 3 : Laurent Mérat, 6x7 - mai 2020
7 #------------------------------------
8 # import des modules python
9 #------------------------------------
# SAX handler (subclass of xml.sax.ContentHandler) that writes text to the
# `fileout` object it is given: a "**** *name_value ..." header line built
# from element attributes (text2stars), newlines, and space-joined word forms
# collected from character data.
# NOTE(review): the numbers embedded at the start of each line are not
# consecutive (e.g. 17 -> 20, 20 -> 26, 34 -> 38, 51 -> 55), so some statements
# of this class appear to be missing from this listing. The comments below
# describe only the statements that are shown.
14 class TXMParser(xml.sax.ContentHandler) :
# Keep the output target and the requested output encoding.
#   fileout      -- object whose write() is called with str values below.
#   encodage_out -- stored here; no shown statement reads it back.
# NOTE(review): self.sent and self.name are read by later methods but are not
# assigned in any shown line - presumably set in omitted lines; confirm.
15 def __init__(self, fileout, encodage_out) :
16 self.fileout = fileout
17 self.encodage_out = encodage_out
# SAX callback for an opening tag; `name` is the element name and `attrs` its
# attributes.
20 def startElement(self, name, attrs) :
26 if name == 'taxonomy' :
# NOTE(review): the embedded numbers jump from 26 to 29 here, so the condition
# that actually guards this call may be one that is not shown - confirm.
# The call emits the element's attributes as a "****" header line.
29 self.text2stars(attrs)
# SAX callback for a closing tag.
33 def endElement(self, name) :
34 if name == 's' or name == 'w' :
# NOTE(review): the embedded numbers jump from 34 to 38; whether this newline
# is written under the condition above or under an omitted one cannot be told
# from this listing.
38 self.fileout.write('\n')
# SAX callback for character data. Only acts while self.name is 'txm:form':
# chunks that are exactly '', ' ', '\n' or '\r' are dropped; anything else is
# appended to self.sent with trailing '\n' / '\r' characters removed.
40 def characters(self, content) :
41 if self.name == 'txm:form' :
42 if content not in ['', ' ', '\n', '\r'] :
43 self.sent.append(content.rstrip('\n\r'))
44 #self.fileout.write(content.encode('utf8'))
# Write the attributes as one header line of the form "**** *n1_v1 *n2_v2 ...".
46 def text2stars(self, attrs) :
# Each item of attrs.items() is joined with '_'; spaces and apostrophes then
# become '_' and the characters / . ; : and the middle dot are removed.
47 stars = ['_'.join(val).replace(' ', '_').replace("'", '_').replace('/','').replace('.','').replace(';', '').replace(':', '').replace('·','') for val in list(attrs.items())]
# Prefix every cleaned item with '*'.
48 stars = [''.join(['*', val]) for val in stars]
# Assemble the final line, starting with the "**** " marker.
49 stars = '**** ' + ' '.join(stars)
50 self.fileout.write(stars)
51 self.fileout.write('\n')
# NOTE(review): the embedded numbers jump from 51 to 55; the header of the
# definition these statements belong to is not shown in this listing.
# Join the collected forms with single spaces, preceded by one leading space.
55 sent = ' ' + ' '.join(self.sent)
# Drop the space in front of each listed punctuation mark (' .' -> '.').
56 for val in [' .', ' ,', ' ;', ' :', ' ?', ' !', ' -'] :
57 sent = sent.replace(val, val.strip())
# Drop the space that follows an apostrophe.
58 sent = sent.replace("' ", "'")
59 self.fileout.write(sent)
# Parse XML files from `pathin` with a TXMParser handler that writes into the
# file `fileout`.
#   pathin       -- directory whose '*.xml' entries are collected with glob.
#   fileout      -- path of the output file, opened for writing in text mode.
#   encodage_in  -- not referenced by any shown statement.
#   encodage_out -- passed through to TXMParser.
# NOTE(review): the numbers embedded at the start of each line are not
# consecutive (64 -> 67, 68 -> 70), so statements appear to be missing from
# this listing, and the function may continue past the last line shown.
62 def TXM2IRA(pathin, fileout, encodage_in, encodage_out) :
63 parser = xml.sax.make_parser()
64 files = glob.glob(os.path.join(pathin,'*.xml'))
# No encoding argument is given, so open() falls back to its default text
# encoding for the output file.
67 with open(fileout, 'w') as fout :
68 parser.setContentHandler(TXMParser(fout, encodage_out))
# NOTE(review): `f` is not bound by any shown statement - presumably an omitted
# line (69) iterates over `files`; confirm. The file object created by open()
# here is not explicitly closed in the shown code.
70 parser.parse(open(f, 'r'))