#!/usr/bin/env python
# -*- coding: utf-8 -*-
#Author: Pierre Ratinaud
#Copyright (c) 2012 Pierre Ratinaud
#License: GNU/GPL
#
# NOTE: this text was recovered from a whitespace-collapsed git diff of
# parse_factiva_txt2.py (http://iramuteq.org/git), where the file is recorded
# as deleted. The layout below is a reconstruction of that script.

"""Turn Factiva articles saved as plain text into a single corpus file.

Every article becomes one block: a header line made of ``****`` followed by
``*source_...`` and ``*date_...`` tags, an empty line, then the article text.
"""

import os
import codecs


# Default settings used when the module is run as a script.
txtdir = 'dev/factiva_txt'
fileout = 'dev/factiva_txt_out.txt'
encodage_in = 'utf8'
encodage_out = 'utf8'

# Field tags that close the text of an article.
_END_TAGS = ('NS', 'RE', 'IPD', 'CO', 'IN')
# Field tags that open (or continue) the text of an article.
_TEXT_TAGS = ('LP', 'TD')
# Characters dropped from a source name to build its tag.
_SOURCE_NOISE = (u' ', u"'", u'´', u'’', u'-')


def parsetxt(txt):
    """Parse Factiva text copied and pasted from the article viewer.

    Thanks to Lucie Loubère for the trick :)

    An article starts on a line beginning with ``Article`` whose third
    whitespace-separated word is also ``Article``. Inside an article:

    * ``SN `` lines give the source, stored as ``*source_<name>`` (lower-cased,
      with spaces, apostrophes, acute accents and hyphens removed);
    * ``PD `` lines give the date, stored as ``*date_<2nd field><3rd field>``
      (month and year for a "day month year" date);
    * ``LP`` / ``TD`` lines switch text capture on;
    * ``NS``, ``RE``, ``IPD``, ``CO``, ``IN`` and ``RF `` lines switch it off.

    Lines met before the first article header are ignored, and a ``PD `` line
    with fewer than three fields adds no date tag (both used to raise
    IndexError).

    :param txt: the whole content of one input file, already decoded.
    :return: a list of ``[tags, text]`` pairs, where ``tags`` is a list
        starting with ``'****'`` and ``text`` holds the captured lines, each
        one preceded by a newline (``''`` when nothing was captured).
    """
    ucis = []
    current = None  # the [tags, text] pair being filled; None before any header
    keepline = False
    for line in txt.splitlines():
        stripped = line.strip()
        if line.startswith('Article'):
            lp = line.split()
            if len(lp) > 2 and lp[2] == 'Article':
                current = [[u'****'], '']
                ucis.append(current)
                keepline = False
        if current is None:
            # Nothing to attach this line to yet.
            continue
        if line.startswith('SN '):  # source
            # NOTE(review): the value is read from column 4, i.e. the tag is
            # presumably followed by two separator characters - confirm
            # against real Factiva exports.
            name = line[4:]
            for char in _SOURCE_NOISE:
                name = name.replace(char, '')
            current[0].append('*source_' + name.lower())
        elif line.startswith('PD '):  # date
            fields = line[4:].split()
            if len(fields) > 2:
                current[0].append('*date_' + fields[1] + fields[2])
        elif stripped in _END_TAGS or line.startswith('RF '):  # end of text
            keepline = False
        elif stripped in _TEXT_TAGS:  # start of text
            keepline = True
        # The tag lines themselves and blank lines never belong to the text.
        if keepline and stripped and stripped not in _TEXT_TAGS:
            current[1] += '\n' + line
    return ucis


def print_ucis(ucis, ofile, encodage):
    """Write the non-empty articles of ``ucis`` to ``ofile``.

    Articles are separated by an empty line. The batch also ends with an empty
    line, so that several batches written to the same file do not run into
    each other (the next ``****`` header used to be glued to the last text
    line of the previous batch).

    :param ucis: list of ``[tags, text]`` pairs as returned by ``parsetxt``.
    :param ofile: object with a ``write`` method accepting encoded bytes.
    :param encodage: name of the encoding used for the output.
    """
    # Drop the articles that have no text.
    kept = [uci for uci in ucis if uci[1] != '']
    if not kept:
        return
    # NOTE(review): in the recovered source this separator literal is broken
    # across a line; a single space is assumed - confirm against upstream.
    toprint = '\n\n'.join(
        '\n'.join([' '.join(uci[0]), uci[1]]) for uci in kept
    )
    ofile.write((toprint + '\n\n').encode(encodage))


def doparse(txtdir, fileout, encodage_in, encodage_out):
    """Parse every file of ``txtdir`` and write the result to ``fileout``.

    Files are processed in sorted name order so the output is reproducible;
    entries of ``txtdir`` that are not regular files are skipped.

    :param txtdir: directory holding the Factiva text files.
    :param fileout: path of the corpus file to create (overwritten).
    :param encodage_in: encoding of the input files.
    :param encodage_out: encoding of the output file.
    """
    names = sorted(os.listdir(txtdir))
    # Binary mode: print_ucis writes already-encoded bytes.
    with open(fileout, 'wb') as outf:
        for name in names:
            path = os.path.join(txtdir, name)
            if not os.path.isfile(path):
                continue
            # codecs.open always works on the raw bytes; line endings are
            # handled by splitlines() in parsetxt.
            with codecs.open(path, 'r', encodage_in) as infile:
                content = infile.read()
            print_ucis(parsetxt(content), outf, encodage_out)


# Example of a per-year batch run (disabled):
#for dat in ['2001','2002','2003','2004', '2005','2006','2007','2008','2009','2010','2011'] :
#    path = os.path.join(txtdir,dat)
#    outfile = os.path.join(txtdir, 'corpus_' + dat + '.txt')
#    doparse(path, outfile, encodage_in, encodage_out)


if __name__ == '__main__':
    doparse(txtdir, fileout, encodage_in, encodage_out)
    print('fini')