1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2014 Pierre Ratinaud
7 #from BeautifulSoup import BeautifulSoup
10 from HTMLParser import HTMLParser
# Path of the HTML file opened by the statement below.
# NOTE(review): this is an absolute, developer-specific path, and the open
# appears to run at import time, so importing this module would fail on any
# machine that lacks the file -- confirm whether it should be removed or
# guarded by `if __name__ == '__main__':`.
13 infile = '/home/pierre/workspace/iramuteq/dev/europress/DocActionPrintSave.aspx.html'
# Open the file through codecs so that reads are decoded as UTF-8.
15 with codecs.open(infile, 'r', 'utf8') as f :
18 mois = {u'janvier' : '01',
53 return [`year`, month, '%02d' % day]
56 # create a subclass and override the handler methods
# HTMLParser subclass driven by the value of each start tag's first attribute.
# It remembers which of three known values ('DocPublicationName', 'DocHeader',
# 'TitreArticleVisu') is current in self.currentattr, accumulates metadata
# tokens in self.meta, and writes UTF-8 encoded text to self.outfile.
57 class MyHTMLParser(HTMLParser):
# Start-tag handler: selects the current mode from attrs[0][1], i.e. the value
# of the tag's first (name, value) attribute pair.
# NOTE(review): attrs[0] needs at least one attribute on the tag -- confirm
# that tags without attributes are guarded against.
58 def handle_starttag(self, tag, attrs):
59 #print "Encountered a start tag:", tag
61 if attrs[0][1] == 'DocPublicationName' :
63 self.currentattr = 'DocPublicationName'
64 elif attrs[0][1] == 'DocHeader' :
66 self.currentattr = 'DocHeader'
67 elif attrs[0][1] == 'TitreArticleVisu' :
# On a 'TitreArticleVisu' tag: write two newlines, then the metadata tokens
# joined by single spaces; the '\n' appended to self.meta terminates that line.
68 self.outfile.write('\n\n')
69 self.meta.append('\n')
70 self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace'))
73 self.currentattr = 'TitreArticleVisu'
# presumably the fallback for any other attribute value -- TODO confirm
75 self.currentattr = None
# End-tag handler (the debug print below is disabled).
76 def handle_endtag(self, tag):
78 #print "Encountered an end tag :", tag
# Text handler: what happens to `data` depends on self.currentattr.
79 def handle_data(self, data):
80 if self.currentattr == 'DocPublicationName' :
# Normalise the publication name: spaces become underscores, the characters
# ( ) . / ' are removed and the result is lower-cased; only the part before
# the first comma is kept and stored as a '*source_<name>' token.
81 PublicationName = data.replace(' ', '_').replace('(','').replace(')','').replace('.','').replace('/','').replace("'",'').lower()
82 PublicationName = PublicationName.split(',')[0]
83 self.meta.append(u'*source_' + PublicationName)
84 elif self.currentattr == 'DocHeader' :
# `date` is used as a sequence of strings: every part joined by '-' for
# '*date_', the first two parts for '*am_', and the first part for '*annee_'.
# presumably [year, month, day] -- verify against where `date` is assigned
87 self.meta += [u'*date_' + '-'.join(date), u'*am_' + '-'.join(date[0:2]), u'*annee_' + date[0]]
88 elif self.currentattr == 'TitreArticleVisu' :
# Data starting with the copyright sign switches the current mode off.
89 if data.startswith(u'©') :
90 self.currentattr = None
# Write the text with newlines and whitespace runs collapsed to single spaces,
# UTF-8 encoded and followed by one trailing space.
# presumably the else-branch of the copyright test -- TODO confirm
92 self.outfile.write(' '.join(data.replace('\n', ' ').split()).encode('utf8', errors='replace') + ' ')
# (Re)initialise parser state: clear the current mode and store the file object
# that the handlers above write to.
94 def doinit(self, outfile):
95 self.currentattr = None
98 self.outfile = outfile
# Reads the .html files found under a directory tree and produces one output
# file with the help of MyHTMLParser. All of the work is done in the constructor.
101 class ParseEuropress :
# txtdir:       directory walked recursively for input files.
# fileout:      path of the output file, opened in 'w' mode.
# encodage_in:  encoding used to decode each input file.
# encodage_out: NOTE(review): appears unused; MyHTMLParser encodes what it
#               writes with a fixed 'utf8' -- confirm whether it should apply.
102 def __init__(self, txtdir, fileout, encodage_in, encodage_out) :
104 for root, subfolders, subfiles in os.walk(txtdir) :
# Keep the full path of every file whose last dot-separated component is
# exactly 'html' (the comparison is case-sensitive).
105 nf = [os.path.join(root, f) for f in subfiles if f.split('.')[-1] == 'html']
109 parser = MyHTMLParser()
110 with open(fileout,'w') as outf :
# Read the whole input file, decoded with encodage_in.
114 with codecs.open(f, 'rU', encodage_in) as infile :
115 content = infile.read()
# Progress report: prints parser.nb labelled 'articles' and tot labelled 'total'.
118 print 'ok', parser.nb, 'articles', ' - total : ', tot
120 #ParseEuropress('/home/pierre/fac/lerass/mariage/press', '/home/pierre/workspace/iramuteq/dev/europress/corpus_out.txt', 'utf8', 'utf8')
121 #print "Encountered some data :", data
122 # instantiate the parser and feed it some HTML
123 #outfile = '/home/pierre/workspace/iramuteq/dev/europress/corpus_out.txt'
124 #parser = MyHTMLParser()
125 #with open(outfile, 'w') as f :