iramuteq.org Git - iramuteq/blob - parse_europress.py

   1 # -*- coding: utf-8 -*-
   2 #Author: Pierre Ratinaud
   3 #Copyright (c) 2014 Pierre Ratinaud
   4 #License: GNU/GPL
   5
   6
   7 #from BeautifulSoup import BeautifulSoup
   8 import codecs
   9 import os
  10 from HTMLParser import HTMLParser
  11
  12
  13 infile = '/home/pierre/workspace/iramuteq/dev/europress/DocActionPrintSave.aspx.html'
  14
  15 with codecs.open(infile, 'r', 'utf8') as f :
  16     html = f.read()
  17
  18 mois = {u'janvier' : '01',
  19         u'février' : '02',
  20         u'mars' : '03',
  21         u'avril' : '04',
  22         u'mai' : '05',
  23         u'juin' : '06',
  24         u'juillet' : '07',
  25         u'août' : '08',
  26         u'septembre' : '09',
  27         u'octobre' : '10',
  28         u'novembre' : '11',
  29         u'décembre' : '12',
  30         u'january' : '01',
  31         u'february': '02',
  32         u'march' : '03',
  33         u'april': '04',
  34         u'may': '05',
  35         u'june' : '06',
  36         u'july': '07',
  37         u'august': '08',
  38         u'september' : '09',
  39         u'october': '10',
  40         u'november': '11',
  41         u'december': '12'}
  42
  43
  44 def finddate(data):
  45     data = data.split()
  46     try :
  47         day = int(data[0])
  48         year = int(data[2])
  49         month = mois[data[1]]
  50     except :
  51         return None
  52     else :
  53         return [`year`, month, '%02d' % day]
  54
  55
  56 # create a subclass and override the handler methods
  57 class MyHTMLParser(HTMLParser):
  58     def handle_starttag(self, tag, attrs):
  59         #print "Encountered a start tag:", tag
  60         if tag == 'span' :
  61             if attrs[0][1] == 'DocPublicationName' :
  62                 self.headercount = 0
  63                 self.currentattr = 'DocPublicationName'
  64             elif attrs[0][1] == 'DocHeader' :
  65                 self.headercount += 1
  66                 self.currentattr = 'DocHeader'
  67             elif attrs[0][1] == 'TitreArticleVisu' :
  68                 self.outfile.write('\n\n')
  69                 self.meta.append('\n')
  70                 self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace'))
  71                 self.meta = [u'****']
  72                 self.nb += 1
  73                 self.currentattr = 'TitreArticleVisu'
  74         elif tag == 'table' :
  75             self.currentattr = None
  76     def handle_endtag(self, tag):
  77         pass
  78         #print "Encountered an end tag :", tag
  79     def handle_data(self, data):
  80         if self.currentattr == 'DocPublicationName' :
  81             PublicationName = data.replace(' ', '_').replace('(','').replace(')','').replace('.','').replace('/','').replace("'",'').lower()
  82             PublicationName = PublicationName.split(',')[0]
  83             self.meta.append(u'*source_' + PublicationName)
  84         elif self.currentattr == 'DocHeader' :
  85             date = finddate(data)
  86             if date is not None :
  87                 self.meta += [u'*date_' + '-'.join(date), u'*am_' + '-'.join(date[0:2]), u'*annee_' + date[0]]
  88         elif self.currentattr == 'TitreArticleVisu' :
  89             if data.startswith(u'©') :
  90                 self.currentattr = None
  91                 return
  92             self.outfile.write(' '.join(data.replace('\n', ' ').split()).encode('utf8', errors='replace') + ' ')
  93
  94     def doinit(self, outfile):
  95         self.currentattr = None
  96         self.meta = [u'****']
  97         self.nb = 0
  98         self.outfile = outfile
  99
 100
 101 class ParseEuropress :
 102     def __init__(self, txtdir, fileout, encodage_in, encodage_out) :
 103         files = []
 104         for root, subfolders, subfiles in os.walk(txtdir) :
 105             nf = [os.path.join(root, f) for f in subfiles if f.split('.')[-1] == 'html']
 106             nf.sort()
 107             files += nf
 108         tot = 0
 109         parser = MyHTMLParser()
 110         with open(fileout,'w') as outf :
 111             for f in files :
 112                 print f
 113                 parser.doinit(outf)
 114                 with codecs.open(f, 'rU', encodage_in) as infile :
 115                     content = infile.read()
 116                 parser.feed(content)
 117                 tot += parser.nb
 118                 print 'ok', parser.nb, 'articles', ' - total : ', tot
 119
 120 #ParseEuropress('/home/pierre/fac/lerass/mariage/press', '/home/pierre/workspace/iramuteq/dev/europress/corpus_out.txt', 'utf8', 'utf8')
 121         #print "Encountered some data  :", data
 122 # instantiate the parser and feed it some HTML
 123 #outfile = '/home/pierre/workspace/iramuteq/dev/europress/corpus_out.txt'
 124 #parser = MyHTMLParser()
 125 #with open(outfile, 'w') as f :
 126 #    parser.doinit(f)
 127 #    parser.feed(html)