X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=import_txm.py;h=0dfd1ebb82e7d61f50f8b3f4cbcef97f3921317d;hp=d9eea3182c1d1f27ba7afa5f9bb1038d656dc87c;hb=287f9e72c3e3d666b016dff0fa3dc39419adfcc2;hpb=69b6d701d4298a125c51cd0ac8e884359f93a6ad;ds=inline diff --git a/import_txm.py b/import_txm.py index d9eea31..0dfd1eb 100644 --- a/import_txm.py +++ b/import_txm.py @@ -9,12 +9,6 @@ import os import xml.sax import glob - - -#infiledir = '/home/pierre/TXM/corpora/voeux-bin/txm/VOEUX/' -#fileout = 'VOEUXExportfromTXM.txt' - - class TXMParser(xml.sax.ContentHandler) : def __init__(self, fileout, encodage_out) : self.fileout = fileout @@ -35,7 +29,7 @@ class TXMParser(xml.sax.ContentHandler) : pass def endElement(self, name) : - if name == 's' : + if name == 's' or name == 'w' : self.printsent() if name == 'p' : self.printsent() @@ -48,7 +42,7 @@ class TXMParser(xml.sax.ContentHandler) : #self.fileout.write(content.encode('utf8')) def text2stars(self, attrs) : - stars = ['_'.join(val).replace(' ', '_').replace("'", '_') for val in attrs.items()] + stars = ['_'.join(val).replace(' ', '_').replace("'", '_').replace('/','').replace('.','').replace(';', '').replace(':', '').replace(u'·','') for val in attrs.items()] stars = [''.join([u'*', val]) for val in stars] stars = u'**** ' + ' '.join(stars) self.fileout.write(stars.encode(self.encodage_out)) @@ -63,16 +57,14 @@ class TXMParser(xml.sax.ContentHandler) : self.fileout.write(sent.encode(self.encodage_out)) self.sent = [] - - -class TXM2IRA : - def __init__(self, pathin, fileout, encodage_in, encodage_out) : +def TXM2IRA(pathin, fileout, encodage_in, encodage_out) : parser = xml.sax.make_parser() files = glob.glob(os.path.join(pathin,'*.xml')) + if len(files) == 0 : + return 'nofile' with open(fileout, 'w') as fout : parser.setContentHandler(TXMParser(fout, encodage_out)) for f in files : parser.parse(open(f, 'r')) fout.write('\n\n') - -#TXM2IRA(infiledir, fileout) + return None \ No newline at end of file