X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=import_txm.py;h=675a9c919d790ce67f779994e517c4934a3c2df5;hp=6ef0c2b439d824caf03bf65c06c93abe91685362;hb=e84160b7f61eb5b05cc12339e44a61d67b499e15;hpb=970a30c06f7399edfb9d0a6823776c611f066af1 diff --git a/import_txm.py b/import_txm.py index 6ef0c2b..675a9c9 100644 --- a/import_txm.py +++ b/import_txm.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- #Author: Pierre Ratinaud #Copyright (c) 2013 Pierre Ratinaud -#Lisense: GNU/GPL +#License: GNU/GPL import os @@ -11,7 +11,7 @@ import glob -#infiledir = '/home/pierre/TXM/corpora/voeux-bin/txm/VOEUX/' +#infiledir = '/home/pierre/TXM/corpus/voeux-bin/txm/VOEUX/' #fileout = 'VOEUXExportfromTXM.txt' @@ -35,7 +35,7 @@ class TXMParser(xml.sax.ContentHandler) : pass def endElement(self, name) : - if name == 's' : + if name == 's' or name == 'w' : self.printsent() if name == 'p' : self.printsent() @@ -48,7 +48,7 @@ class TXMParser(xml.sax.ContentHandler) : #self.fileout.write(content.encode('utf8')) def text2stars(self, attrs) : - stars = ['_'.join(val).replace(' ', '_').replace("'", '_') for val in attrs.items()] + stars = ['_'.join(val).replace(' ', '_').replace("'", '_').replace('/','').replace('.','').replace(';', '').replace(':', '').replace(u'·','') for val in attrs.items()] stars = [''.join([u'*', val]) for val in stars] stars = u'**** ' + ' '.join(stars) self.fileout.write(stars.encode(self.encodage_out)) @@ -74,5 +74,6 @@ class TXM2IRA : for f in files : parser.parse(open(f, 'r')) fout.write('\n\n') + print 'done' #TXM2IRA(infiledir, fileout)