2 # -*- coding: utf-8 -*-
3 #Author: Pierre Ratinaud
4 #Copyright (c) 2013 Pierre Ratinaud
14 #infiledir = '/home/pierre/TXM/corpus/voeux-bin/txm/VOEUX/'
15 #fileout = 'VOEUXExportfromTXM.txt'
18 class TXMParser(xml.sax.ContentHandler) :
def __init__(self, fileout, encodage_out):
    """Prepare the handler that writes the converted corpus.

    fileout -- object exposing ``write()``; receives the converted text.
    encodage_out -- codec name handed to ``encode()`` before writing.
    """
    # Explicit base-class call rather than super(): it sets up the
    # ContentHandler state and does not depend on new-style classes.
    xml.sax.ContentHandler.__init__(self)
    self.fileout = fileout
    self.encodage_out = encodage_out
    # characters() compares self.name and appends to self.sent, so both
    # must exist before the first SAX event is delivered.
    self.name = None
    self.sent = []
24 def startElement(self, name, attrs) :
30 if name == 'taxonomy' :
33 self.text2stars(attrs)
37 def endElement(self, name) :
38 if name == 's' or name == 'w' :
42 self.fileout.write('\n')
def characters(self, content):
    """Collect the text of ``txm:form`` elements into ``self.sent``.

    Text is kept only while ``self.name`` equals ``'txm:form'``. Chunks
    that are empty, a single space, a lone newline or a lone carriage
    return are ignored; any other chunk is stored with its trailing
    newline / carriage-return characters removed.
    """
    if self.name != 'txm:form':
        return
    ignorable = (u'', u' ', u'\n', '\r')
    if content in ignorable:
        return
    form = content.rstrip('\n\r')
    self.sent.append(form)
def text2stars(self, attrs):
    """Write an element's attributes as a ``****`` header line.

    Every ``(name, value)`` pair returned by ``attrs.items()`` is joined
    with an underscore, cleaned with the substitutions below and prefixed
    with ``*``. The tokens are written to ``self.fileout`` after a leading
    ``**** `` marker, encoded with ``self.encodage_out``, followed by a
    newline.

    attrs -- object exposing ``items()`` that yields pairs of strings.
    """
    # Spaces and apostrophes become underscores; the other characters are
    # removed from the token.
    substitutions = (
        (u' ', u'_'),
        (u"'", u'_'),
        (u'/', u''),
        (u'.', u''),
        (u';', u''),
        (u':', u''),
        # MIDDLE DOT (U+00B7). Written as an escape so the literal cannot
        # be corrupted again when the source file is re-encoded.
        (u'\xb7', u''),
    )
    stars = []
    for pair in attrs.items():
        token = u'_'.join(pair)
        for old, new in substitutions:
            token = token.replace(old, new)
        stars.append(u'*' + token)
    header = u'**** ' + u' '.join(stars)
    self.fileout.write(header.encode(self.encodage_out))
    self.fileout.write('\n')
59 sent = ' ' + ' '.join(self.sent)
60 for val in [u' .', u' ,', u' ;', u' :', u' ?', u' !', u' -'] :
61 sent = sent.replace(val, val.strip())
62 sent = sent.replace("' ", "'")
63 self.fileout.write(sent.encode(self.encodage_out))
69 def __init__(self, pathin, fileout, encodage_in, encodage_out) :
70 parser = xml.sax.make_parser()
71 files = glob.glob(os.path.join(pathin,'*.xml'))
72 with open(fileout, 'w') as fout :
73 parser.setContentHandler(TXMParser(fout, encodage_out))
75 parser.parse(open(f, 'r'))
79 #TXM2IRA(infiledir, fileout)