X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=parse_factiva_txt.py;h=fd856f4c82812999dfa018e7ee87ad9f63d513ac;hp=18461e06c6f1ec3290f927b235c0c7d5f7befc98;hb=42499e5db4394973bd0761d40ea09e515925ed4b;hpb=6919f2ef8d85c176c7be824b606c4b71142e10fd diff --git a/parse_factiva_txt.py b/parse_factiva_txt.py index 18461e0..fd856f4 100644 --- a/parse_factiva_txt.py +++ b/parse_factiva_txt.py @@ -82,7 +82,7 @@ def parsetxtpaste(txt): else : pass if keepline and line.strip() not in ['LP', 'TD', ''] : - ucis[-1][1] = '\n'.join([ucis[-1][1],line]) + ucis[-1][1] = '\n'.join([ucis[-1][1],line.replace(u'*', ' ')]) return ucis @@ -94,14 +94,15 @@ def print_ucis(ucis, ofile, encodage) : class ParseFactivaPaste : def __init__(self, txtdir, fileout, encodage_in, encodage_out) : - files = os.listdir(txtdir) - files = [f for f in files if f.split('.')[-1] == 'txt'] + files = [] + for root, subfolders, subfiles in os.walk(txtdir) : + nf = [os.path.join(root, f) for f in subfiles if f.split('.')[-1] == 'txt'] + nf.sort() + files += nf tot = 0 with open(fileout,'w') as outf : for f in files : print f - f = os.path.join(txtdir, f) - print f with codecs.open(f, 'rU', encodage_in) as infile : content = infile.read() ucis = parsetxtpaste(content)