2 # -*- coding: utf-8 -*-
3 #Author: Pierre Ratinaud
4 #Copyright (c) 2013 Pierre Ratinaud
# SAX content handler (subclass of xml.sax.ContentHandler) whose callbacks
# write text to the output file object it is given.
# NOTE(review): the line numbers embedded in this file jump (12-15, then 18),
# so some original lines are not visible in this view. Other methods read
# self.name and self.sent, neither of which is assigned in the visible lines;
# presumably they are initialised in the missing lines -- confirm.
12 class TXMParser(xml.sax.ContentHandler) :
# Keep the output target and the output encoding for later writes.
#   fileout      -- object whose write() method receives the produced text.
#   encodage_out -- encoding name passed to .encode() before writing.
13 def __init__(self, fileout, encodage_out) :
14 self.fileout = fileout
15 self.encodage_out = encodage_out
# ContentHandler callback for the start of an XML element.
#   name  -- name of the element being opened.
#   attrs -- attributes of that element; forwarded to text2stars().
# NOTE(review): the embedded line numbers jump 18 -> 24 -> 27, so statements
# between these lines are missing from this view. Because of that gap it
# cannot be established here that the text2stars() call is guarded by the
# 'taxonomy' test -- confirm against the complete file.
18 def startElement(self, name, attrs) :
24 if name == 'taxonomy' :
27 self.text2stars(attrs)
# ContentHandler callback for the end of an XML element.
#   name -- name of the element being closed.
# NOTE(review): original lines 33-35 are missing from this view (numbering
# jumps 32 -> 36). What happens when an 's' or 'w' element closes, and which
# condition guards the newline write below, cannot be determined from here --
# confirm against the complete file.
31 def endElement(self, name) :
32 if name == 's' or name == 'w' :
36 self.fileout.write('\n')
def characters(self, content):
    """Collect character data of ``txm:form`` elements as sentence tokens.

    ContentHandler callback for character data. A chunk is kept only while
    ``self.name`` equals ``'txm:form'`` (``self.name`` is assigned outside
    this method -- presumably the name of the element being parsed; confirm).
    Kept chunks have trailing CR/LF removed and are appended to
    ``self.sent``.

    Args:
        content: chunk of character data delivered by the parser.
    """
    if self.name != 'txm:form':
        return
    # Chunks that are empty or a single blank/newline/CR carry no token.
    if content in (u'', u' ', u'\n', u'\r'):
        return
    # NOTE(review): SAX parsers may deliver one text node in several chunks;
    # each chunk is appended as its own token here, so a form split by the
    # parser would end up as two tokens.
    self.sent.append(content.rstrip('\n\r'))
def text2stars(self, attrs):
    """Write a ``**** *name_value ...`` line built from *attrs*.

    Every (name, value) pair of *attrs* becomes one ``*name_value`` token:
    spaces and apostrophes are turned into underscores, and the characters
    ``/ . ; :`` and the middle dot are removed. The tokens are joined with
    single spaces after a ``'**** '`` prefix; the line is encoded with
    ``self.encodage_out`` and written to ``self.fileout``, followed by a
    newline.

    Args:
        attrs: object exposing ``items()`` that yields (name, value) pairs
            of strings.
    """
    tokens = []
    for key, value in attrs.items():
        label = '_'.join((key, value))
        for char in (u' ', u"'"):
            label = label.replace(char, u'_')
        # u'\xb7' is the middle dot (U+00B7). It is written as an escape so
        # the literal cannot be corrupted when this source file is
        # re-encoded; a corrupted literal silently stops matching.
        for char in (u'/', u'.', u';', u':', u'\xb7'):
            label = label.replace(char, u'')
        tokens.append(u'*' + label)
    line = u'**** ' + u' '.join(tokens)
    self.fileout.write(line.encode(self.encodage_out))
    self.fileout.write('\n')
# NOTE(review): the `def` line owning these statements is not visible in this
# view (embedded numbering jumps 49 -> 53), and indentation has been lost, so
# the method name, any guard around this code, and whether lines 56-57 sit
# after the loop (presumably they do) must be confirmed in the complete file.
# Join the collected tokens in self.sent with single spaces, with one leading
# space in front of the first token.
53 sent = ' ' + ' '.join(self.sent)
# Drop the space that the join placed before each of these punctuation marks
# (val.strip() is the same mark without its leading space).
54 for val in [u' .', u' ,', u' ;', u' :', u' ?', u' !', u' -'] :
55 sent = sent.replace(val, val.strip())
# Drop the space that follows an apostrophe.
56 sent = sent.replace("' ", "'")
# Encode with the configured output encoding and write; no newline is
# written by this statement.
57 self.fileout.write(sent.encode(self.encodage_out))
# Parse the *.xml files found directly in `pathin` with a TXMParser handler
# that writes into the file `fileout`.
#   pathin       -- directory searched (non-recursively) for '*.xml' files.
#   fileout      -- path of the output file, opened in 'w' mode.
#   encodage_in  -- not used in any line visible here.
#   encodage_out -- encoding name handed to TXMParser.
# NOTE(review): original lines 63-64 and 67 are missing from this view, and
# the function may continue past the last visible line. `f` is not bound in
# the visible code; presumably line 67 is a loop over `files` -- confirm.
60 def TXM2IRA(pathin, fileout, encodage_in, encodage_out) :
61 parser = xml.sax.make_parser()
62 files = glob.glob(os.path.join(pathin,'*.xml'))
65 with open(fileout, 'w') as fout :
# One handler instance is installed on the parser before any parse() call.
66 parser.setContentHandler(TXMParser(fout, encodage_out))
# NOTE(review): the input file object opened here is never explicitly closed.
68 parser.parse(open(f, 'r'))