X-Git-Url: http://iramuteq.org/git?a=blobdiff_plain;ds=sidebyside;f=import_txm.py;h=37ffa2e71185a53aeecc002291ee45be18b3fbb6;hb=refs%2Fheads%2F3.0;hp=0dfd1ebb82e7d61f50f8b3f4cbcef97f3921317d;hpb=46fe1c98d3c43f5bce9d8acb6d0ee24b5299fd85;p=iramuteq diff --git a/import_txm.py b/import_txm.py index 0dfd1eb..be440ce 100644 --- a/import_txm.py +++ b/import_txm.py @@ -1,10 +1,12 @@ -#!/bin/env python # -*- coding: utf-8 -*- #Author: Pierre Ratinaud -#Copyright (c) 2013 Pierre Ratinaud +#Copyright (c) 2008-2020 Pierre Ratinaud +#modification pour python 3 : Laurent Mérat, 6x7 - mai 2020 #License: GNU/GPL - +#------------------------------------ +# import des modules python +#------------------------------------ import os import xml.sax import glob @@ -34,27 +36,27 @@ class TXMParser(xml.sax.ContentHandler) : if name == 'p' : self.printsent() self.fileout.write('\n') - + def characters(self, content) : if self.name == 'txm:form' : - if content not in [u'', u' ', u'\n', '\r'] : + if content not in ['', ' ', '\n', '\r'] : self.sent.append(content.rstrip('\n\r')) #self.fileout.write(content.encode('utf8')) def text2stars(self, attrs) : - stars = ['_'.join(val).replace(' ', '_').replace("'", '_').replace('/','').replace('.','').replace(';', '').replace(':', '').replace(u'·','') for val in attrs.items()] - stars = [''.join([u'*', val]) for val in stars] - stars = u'**** ' + ' '.join(stars) - self.fileout.write(stars.encode(self.encodage_out)) + stars = ['_'.join(val).replace(' ', '_').replace("'", '_').replace('/','').replace('.','').replace(';', '').replace(':', '').replace('·','') for val in list(attrs.items())] + stars = [''.join(['*', val]) for val in stars] + stars = '**** ' + ' '.join(stars) + self.fileout.write(stars) self.fileout.write('\n') def printsent(self) : if self.sent != [] : sent = ' ' + ' '.join(self.sent) - for val in [u' .', u' ,', u' ;', u' :', u' ?', u' !', u' -'] : + for val in [' .', ' ,', ' ;', ' :', ' ?', ' !', ' -'] : sent = sent.replace(val, val.strip()) sent = sent.replace("' ", "'") - self.fileout.write(sent.encode(self.encodage_out)) + self.fileout.write(sent) self.sent = [] def TXM2IRA(pathin, fileout, encodage_in, encodage_out) : @@ -62,9 +64,9 @@ def TXM2IRA(pathin, fileout, encodage_in, encodage_out) : files = glob.glob(os.path.join(pathin,'*.xml')) if len(files) == 0 : return 'nofile' - with open(fileout, 'w') as fout : + with open(fileout, 'w', encoding='utf8') as fout : parser.setContentHandler(TXMParser(fout, encodage_out)) for f in files : - parser.parse(open(f, 'r')) + parser.parse(open(f, 'r', encoding='utf8')) fout.write('\n\n') - return None \ No newline at end of file + return None