X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=parse_factiva_txt.py;h=fd856f4c82812999dfa018e7ee87ad9f63d513ac;hp=9cb2af2dc44be3916df86226543b7fb6b0c51f38;hb=f12da65c1895ecdd1b48109d7b1334181487a25f;hpb=0f8b4705852de1679aba3e91c9809fe2250a715c diff --git a/parse_factiva_txt.py b/parse_factiva_txt.py index 9cb2af2..fd856f4 100644 --- a/parse_factiva_txt.py +++ b/parse_factiva_txt.py @@ -2,10 +2,11 @@ # -*- coding: utf-8 -*- #Author: Pierre Ratinaud #Copyright (c) 2012-2013 Pierre Ratinaud -#Lisense: GNU/GPL +#License: GNU/GPL import os import codecs +import re #txtdir = 'dev/factiva_txt' @@ -13,6 +14,31 @@ import codecs #encodage_in = 'utf8' #encodage_out = 'utf8' +mois = {u'janvier' : '01', + u'février' : '02', + u'mars' : '03', + u'avril' : '04', + u'mai' : '05', + u'juin' : '06', + u'juillet' : '07', + u'août' : '08', + u'septembre' : '09', + u'octobre' : '10', + u'novembre' : '11', + u'décembre' : '12', + u'january' : '01', + u'february': '02', + u'march' : '03', + u'april': '04', + u'may': '05', + u'june' : '06', + u'july': '07', + u'august': '08', + u'september' : '09', + u'october': '10', + u'november': '11', + u'december': '12'} + def parsetxtpaste(txt): """ @@ -25,19 +51,27 @@ def parsetxtpaste(txt): keepline = False ucis = [] for line in txt : - if line.startswith('Article') : + if line.startswith(u'Article') : lp = line.split() if len(lp) > 2 : - if lp[2] == 'Article' : + if lp[2] == u'Article' or lp[2] == u'Next' or lp[2] == u'Previous': ucis.append([[u'****'],'']) keepline = False if line.startswith('SN ') : #source - source = '*source_' + line[4:].replace(' ','').replace('\'','').replace(u'´','').replace(u'’','').replace('-','').lower() + jsource = re.sub(u'[\'" !\.?;,:\+\-°&]', '', line[4:]) + source = u'_'.join([u'*source', jsource]).lower() + #source = '*source_' + line[4:].replace(' ','').replace('\'','').replace(u'´','').replace(u'’','').replace('-','').lower() ucis[-1][0].append(source) elif line.startswith('PD ') : #date - mois_annee = '*ma_' + line[4:].split(' ')[1] + line[4:].split(' ')[2] - ucis[-1][0].append(mois_annee) - annee = u'*annee_' + line[4:].split(' ')[2] + datemois = line[4:].split(' ')[1].lower() + datemois = mois.get(datemois, datemois) + dateannee = line[4:].split(' ')[2] + datejour = '%02d' % int(line[4:].split(' ')[0]) + am = '_'.join([u'*am', dateannee, datemois]) + amj = '_'.join([u'*amj', dateannee, datemois, datejour]) + ucis[-1][0].append(am) + ucis[-1][0].append(amj) + annee = '_'.join([u'*annee', dateannee]) ucis[-1][0].append(annee) elif line.strip() in no : #fin keepline = False @@ -48,7 +82,7 @@ def parsetxtpaste(txt): else : pass if keepline and line.strip() not in ['LP', 'TD', ''] : - ucis[-1][1] = '\n'.join([ucis[-1][1],line]) + ucis[-1][1] = '\n'.join([ucis[-1][1],line.replace(u'*', ' ')]) return ucis @@ -56,18 +90,25 @@ def print_ucis(ucis, ofile, encodage) : #elimination des articles vides ucis = [uci for uci in ucis if uci[1].strip() != ''] toprint = '\n\n'.join(['\n'.join([' '.join(uci[0]),uci[1]]) for uci in ucis]) - ofile.write(toprint.encode(encodage)) + ofile.write(toprint.encode(encodage, errors='replace') + '\n') class ParseFactivaPaste : def __init__(self, txtdir, fileout, encodage_in, encodage_out) : - files = os.listdir(txtdir) + files = [] + for root, subfolders, subfiles in os.walk(txtdir) : + nf = [os.path.join(root, f) for f in subfiles if f.split('.')[-1] == 'txt'] + nf.sort() + files += nf + tot = 0 with open(fileout,'w') as outf : for f in files : - f= os.path.join(txtdir, f) + print f with codecs.open(f, 'rU', encodage_in) as infile : content = infile.read() ucis = parsetxtpaste(content) print_ucis(ucis, outf, encodage_out) + tot += len(ucis) + print 'ok', len(ucis), 'articles', ' - total : ', tot #for dat in ['2001','2002','2003','2004', '2005','2006','2007','2008','2009','2010','2011'] : # path = os.path.join(txtdir,dat)