#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Pierre Ratinaud
# Copyright (c) 2013 Pierre Ratinaud
# License: GNU/GPL
#
# Provenance: file import_txm.py, added by commit
# 970a30c06f7399edfb9d0a6823776c611f066af1 ("add txm import",
# Pierre, Wed, 13 Nov 2013 15:21:09 +0100).
"""Export the word forms of TXM XML files to one starred plain-text corpus.

Every ``*.xml`` file found in a directory is parsed with SAX.  For each
``<text>`` element a ``**** *attr_value ...`` line is written, followed by
the sentences rebuilt from the ``<txm:form>`` elements.

Example (paths are illustrative)::

    TXM2IRA('/home/pierre/TXM/corpora/voeux-bin/txm/VOEUX/',
            'VOEUXExportfromTXM.txt', 'utf8', 'utf8')
"""

import glob
import io
import os
import xml.sax


class TXMParser(xml.sax.ContentHandler):
    """SAX handler that writes word forms as sentences to an output stream.

    Parameters:
        fileout: an already opened, writable stream.  Text streams
            (``io.TextIOBase``) receive text; any other stream receives
            bytes encoded with ``encodage_out``.
        encodage_out: name of the encoding used when bytes must be written.
    """

    # Token + preceding space pairs that are collapsed so punctuation sticks
    # to the word on its left.
    _GLUE_LEFT = (u' .', u' ,', u' ;', u' :', u' ?', u' !', u' -')

    def __init__(self, fileout, encodage_out):
        # ContentHandler is an old-style class on Python 2, so no super().
        xml.sax.ContentHandler.__init__(self)
        self.fileout = fileout
        self.encodage_out = encodage_out
        self.sent = []      # word forms of the sentence being built
        self.name = None    # name of the innermost element just opened
        self._form = []     # text chunks of the <txm:form> being read

    def _write(self, text):
        """Write ``text``, encoding it first when the stream is not textual."""
        if isinstance(self.fileout, io.TextIOBase):
            self.fileout.write(text)
        else:
            self.fileout.write(text.encode(self.encodage_out))

    def _flush_form(self):
        """Turn the buffered chunks of one word form into a single token."""
        form = u''.join(self._form).strip()
        self._form = []
        if form:
            self.sent.append(form)

    def startElement(self, name, attrs):
        """Remember the opened element; emit a star line for ``<text>``."""
        self.name = name
        if name == 'text':
            self.text2stars(attrs)
        elif name == 'txm:form':
            self._form = []

    def endElement(self, name):
        """Close a word form, a sentence (``s``) or a paragraph (``p``)."""
        if name == 'txm:form':
            self._flush_form()
        elif name == 's':
            self.printsent()
        elif name == 'p':
            self.printsent()
            self._write(u'\n')
        # Text following a closing tag belongs to no open <txm:form>; without
        # this reset, indentation after </txm:form> would be read as a word.
        self.name = None

    def endDocument(self):
        """Flush pending words so they cannot leak into the next document."""
        self.printsent()

    def characters(self, content):
        """Buffer text found directly inside ``<txm:form>``.

        SAX may deliver one text node in several chunks, so the token is only
        assembled when the element is closed.
        """
        if self.name == 'txm:form':
            self._form.append(content)

    def text2stars(self, attrs):
        """Write the ``**** *name_value ...`` line built from ``attrs``.

        Spaces and apostrophes inside a name/value pair become underscores.
        """
        stars = [
            u'*' + u'_'.join(pair).replace(u' ', u'_').replace(u"'", u'_')
            for pair in attrs.items()
        ]
        self._write(u'**** ' + u' '.join(stars) + u'\n')

    def printsent(self):
        """Write the pending sentence (prefixed by a space) and reset it."""
        if not self.sent:
            return
        sent = u' ' + u' '.join(self.sent)
        for mark in self._GLUE_LEFT:
            sent = sent.replace(mark, mark.strip())
        # Elided forms ("l'", "d'") are joined to the following word.
        sent = sent.replace(u"' ", u"'")
        self._write(sent)
        self.sent = []


class TXM2IRA:
    """Convert every ``*.xml`` file of a directory into one text file.

    The conversion runs when the object is constructed.

    Parameters:
        pathin: directory containing the XML files.
        fileout: path of the text file to create (overwritten if present).
        encodage_in: accepted for interface compatibility and currently
            unused; files are handed to the XML parser as bytes, so the
            encoding comes from each document's XML declaration.
        encodage_out: encoding of the created text file.
    """

    def __init__(self, pathin, fileout, encodage_in, encodage_out):
        parser = xml.sax.make_parser()
        # glob order is arbitrary; sort so the corpus order is reproducible.
        files = sorted(glob.glob(os.path.join(pathin, '*.xml')))
        with io.open(fileout, 'w', encoding=encodage_out) as fout:
            parser.setContentHandler(TXMParser(fout, encodage_out))
            for path in files:
                with open(path, 'rb') as fin:
                    parser.parse(fin)
                fout.write(u'\n\n')