iramuteq.org Git - iramuteq/blob - parse_factiva_txt2.py

   1 #!/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #Author: Pierre Ratinaud
   4 #Copyright (c) 2012 Pierre Ratinaud
   5 #License: GNU/GPL
   6
   7 import os
   8 import codecs
   9
  10
  11 txtdir = 'dev/factiva_txt'
  12 fileout = 'dev/factiva_txt_out.txt'
  13 encodage_in = 'utf8'
  14 encodage_out = 'utf8'
  15
  16
  17 def parsetxt(txt):
  18     """
  19     parser de texte pour factiva
  20     à partir d'un copier/coller de la fenêtre de visualisation
  21     merci à Lucie Loubère pour l'astuce :)
  22     """
  23     no = ['NS','RE','IPD','CO','IN']  # les balises qui signalent une fin
  24     txt = txt.splitlines()
  25     keepline = False
  26     ucis = []
  27     for line in txt :
  28         if line.startswith('Article') :
  29             lp = line.split()
  30             if len(lp) > 2  :
  31                 if lp[2] == 'Article' :
  32                     ucis.append([[u'****'],''])
  33                     keepline = False
  34         if line.startswith('SN ') : #source
  35             source = '*source_' + line[4:].replace(' ','').replace('\'','').replace(u'´','').replace(u'’','').replace('-','').lower()
  36             ucis[-1][0].append(source)
  37         elif line.startswith('PD ') : #date
  38             mois_annee = '*date_' + line[4:].split(' ')[1] + line[4:].split(' ')[2]
  39             ucis[-1][0].append(mois_annee)
  40         elif line.strip() in no : #fin
  41             keepline = False
  42         elif line.startswith('RF ') : #fin
  43             keepline = False
  44         elif line.strip() in ['LP', 'TD'] : #debut texte
  45             keepline = True
  46         else :
  47             pass
  48         if keepline and line.strip() not in ['LP', 'TD', ''] :
  49             ucis[-1][1] = '\n'.join([ucis[-1][1],line])
  50     return ucis
  51
  52
  53 def print_ucis(ucis, ofile, encodage) :
  54     #elimination des articles vides
  55     ucis = [uci for uci in ucis if uci[1] != '']
  56     toprint = '\n\n'.join(['\n'.join([' '.join(uci[0]),uci[1]]) for uci in ucis])
  57     ofile.write(toprint.encode(encodage))
  58
  59 def doparse(txtdir, fileout, encodage_in, encodage_out):
  60     files = os.listdir(txtdir)
  61     with open(fileout,'w') as outf :
  62         for f in files :
  63             f= os.path.join(txtdir, f)
  64             with codecs.open(f, 'rU', encodage_in) as infile :
  65                 content = infile.read()
  66             ucis = parsetxt(content)
  67             print_ucis(ucis, outf, encodage_out)
  68
  69 #for dat in ['2001','2002','2003','2004', '2005','2006','2007','2008','2009','2010','2011'] :
  70 #    path = os.path.join(txtdir,dat)
  71 #    outfile = os.path.join(txtdir, 'corpus_' + dat + '.txt')
  72 #    doparse(path, outfile, encodage_in, encodage_out)
  73
  74
  75 if __name__ == '__main__' :
  76     doparse(txtdir, fileout, encodage_in, encodage_out)
  77     print 'fini'