From e4ec2234f0a1224c628c7d6017211cc820913385 Mon Sep 17 00:00:00 2001
From: Pierre Ratinaud
Date: Sun, 11 Dec 2016 17:05:46 +0100
Subject: [PATCH] adaptation to the new europresse format

---
 parse_europress.py | 103 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 70 insertions(+), 33 deletions(-)

diff --git a/parse_europress.py b/parse_europress.py
index 5678aca..9927063 100644
--- a/parse_europress.py
+++ b/parse_europress.py
@@ -10,10 +10,10 @@ import os
 from HTMLParser import HTMLParser
 
-mois = {u'janvier' : '01', 
+mois = {u'janvier' : '01',
         u'février' : '02',
         u'mars' : '03',
-        u'avril' : '04', 
+        u'avril' : '04',
         u'mai' : '05',
         u'juin' : '06',
         u'juillet' : '07',
@@ -21,7 +21,7 @@ mois = {u'janvier' : '01',
         u'septembre' : '09',
         u'octobre' : '10',
         u'novembre' : '11',
-        u'décembre' : '12', 
+        u'décembre' : '12',
         u'january' : '01',
         u'february': '02',
         u'march' : '03',
@@ -46,57 +46,99 @@ def finddate(data):
         return None
     else :
         return [`year`, month, '%02d' % day]
-    
+
+def makedate(date):
+    year = date[0:4]
+    month = date[4:6]
+    day = date[6:]
+    return [year, month, day]
+
 # create a subclass and override the handler methods
 class MyHTMLParser(HTMLParser):
     def handle_starttag(self, tag, attrs):
         #print "Encountered a start tag:", tag
         if tag == 'span' :
-            if attrs[0][1] == 'DocPublicationName' :
-                self.headercount = 0
-                self.currentattr = 'DocPublicationName'
-            elif attrs[0][1] == 'DocHeader' :
-                self.headercount += 1
-                self.currentattr = 'DocHeader'
-            elif attrs[0][1] == 'TitreArticleVisu' :
-                self.outfile.write('\n\n')
-                self.meta.append('\n')
-                self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace'))
-                self.meta = [u'****']
-                self.nb += 1
-                self.currentattr = 'TitreArticleVisu'
+            if len(attrs) > 0 :
+                if attrs[0][1] == 'DocPublicationName' :
+                    #print 'DocPublicationName'
+                    self.headercount = 0
+                    self.currentattr = 'DocPublicationName'
+                elif attrs[0][1] == 'DocHeader' :
+                    self.headercount += 1
+                    self.currentattr = 'DocHeader'
+                elif attrs[0][1] in ['TitreArticleVisu', 'titreArticleVisu'] :
+                    self.outfile.write('\n\n')
+                    self.meta.append('\n')
+                    self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace'))
+                    self.meta = [u'****']
+                    self.nb += 1
+                    self.currentattr = 'TitreArticleVisu'
+                elif attrs[0][1] == 'PubliC_lblNodoc' :
+                    self.currentattr = 'PubliC_lblNodoc'
         elif tag == 'table' :
             self.currentattr = None
+        elif tag == 'div' :
+            if len(attrs)>0 :
+                if attrs[0][1] == 'publiC-lblNodoc' :
+                    self.currentattr = 'PubliC_lblNodoc'
+        elif tag == 'p' :
+            if len(attrs) > 0 :
+                if attrs[0][1] == 'titreArticleVisu' :
+                    # self.outfile.write('\n\n')
+                    # self.meta.append('\n')
+                    # self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace'))
+                    # self.meta = [u'****']
+                    # self.nb += 1
+                    self.currentattr = 'TitreArticleVisu'
+
     def handle_endtag(self, tag):
         pass
         #print "Encountered an end tag :", tag
     def handle_data(self, data):
         if self.currentattr == 'DocPublicationName' :
-            PublicationName = data.replace(' ', '_').replace('(','').replace(')','').replace('.','').replace('/','').replace("'",'').replace(';', '').replace(':', '').replace(u'·','').lower()
+            #print data
+            PublicationName = data.strip().replace(' ', '_').replace('(','').replace(')','').replace('-','').replace('.','').replace('/','').replace("'",'').replace(';', '').replace(':', '').replace(u'·','').lower()
             PublicationName = PublicationName.split(',')[0]
             self.meta.append(u'*source_' + PublicationName)
-        elif self.currentattr == 'DocHeader' :
-            date = finddate(data)
-            if date is not None :
-                self.meta += [u'*date_' + '-'.join(date), u'*am_' + '-'.join(date[0:2]), u'*annee_' + date[0]]
+            self.currentattr = None
+#        elif self.currentattr == 'DocHeader' :
+#            date = finddate(data)
+#            if date is not None :
+#                self.meta += [u'*date_' + '-'.join(date), u'*am_' + '-'.join(date[0:2]), u'*annee_' + date[0]]
         elif self.currentattr == 'TitreArticleVisu' :
+            #print data
            if data.startswith(u'©') :
                 self.currentattr = None
                 return
-            self.outfile.write(' '.join(data.replace('\n', ' ').split()).encode('utf8', errors='replace') + ' ')
+            self.content.append(' '.join(data.replace('\n', ' ').split()) + ' ')
+            #self.outfile.write(' '.join(data.replace('\n', ' ').split()).encode('utf8', errors='replace') + ' ')
+        elif self.currentattr == 'PubliC_lblNodoc' :
+            date = data[5:13]
+            date = makedate(date)
+            self.meta += [u'*date_' + '-'.join(date), u'*am_' + '-'.join(date[0:2]), u'*annee_' + date[0]]
+            self.meta.append('\n')
+            self.outfile.write('\n\n')
+            self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace'))
+            self.outfile.write(' '.join(self.content).encode('utf8'))
+            self.content = []
+            self.meta = [u'****']
+            self.nb += 1
+            self.currentattr = None
     def doinit(self, outfile):
         self.currentattr = None
         self.meta = [u'****']
+        self.content = []
         self.nb = 0
         self.outfile = outfile
+        print 'init ok'
 
 def ParseEuropress(txtdir, fileout, encodage_in, encodage_out) :
     files = []
     for root, subfolders, subfiles in os.walk(txtdir) :
-        nf = [os.path.join(root, f) for f in subfiles if f.split('.')[-1] == 'html']
+        nf = [os.path.join(root, f) for f in subfiles if f.split('.')[-1] in ['html', 'HTML'] ]
         nf.sort()
         files += nf
     if len(files) == 0 :
@@ -109,15 +151,10 @@ def ParseEuropress(txtdir, fileout, encodage_in, encodage_out) :
             parser.doinit(outf)
             with codecs.open(f, 'rU', encodage_in) as infile :
                 content = infile.read()
+            content = HTMLParser().unescape(content)
             parser.feed(content)
             tot += parser.nb
     return tot
-    
-#ParseEuropress('/home/pierre/fac/lerass/mariage/press', '/home/pierre/workspace/iramuteq/dev/europress/corpus_out.txt', 'utf8', 'utf8')
-        #print "Encountered some data :", data
-# instantiate the parser and fed it some HTML
-#outfile = '/home/pierre/workspace/iramuteq/dev/europress/corpus_out.txt'
-#parser = MyHTMLParser()
-#with open(outfile, 'w') as f :
-#    parser.doinit(f)
-#    parser.feed(html)
\ No newline at end of file
+
+#ParseEuropress('/home/pierre/fac/etudiant/DeNadai/corpus_loi_travail',
+#               '/home/pierre/fac/etudiant/DeNadai/corpus_loi_W.txt', 'utf8', 'utf8')
-- 
2.7.4
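The patch above stops reading the article date from the DocHeader span (that branch is now commented out) and instead derives it from the text of the PubliC_lblNodoc / publiC-lblNodoc element, taking data[5:13] as a YYYYMMDD string and splitting it with makedate(). A minimal sketch of that path, assuming a document identifier shaped like u'news·20161211·LM·12345' (the identifier value is hypothetical; only the [5:13] slice and the metadata layout come from the patch):

    # -*- coding: utf-8 -*-
    # Sketch of the date handling added in handle_data for 'PubliC_lblNodoc'.
    def makedate(date):
        year = date[0:4]
        month = date[4:6]
        day = date[6:]
        return [year, month, day]

    data = u'news·20161211·LM·12345'        # hypothetical PubliC_lblNodoc text
    date = makedate(data[5:13])             # ['2016', '12', '11']
    meta = [u'*date_' + '-'.join(date),     # *date_2016-12-11
            u'*am_' + '-'.join(date[0:2]),  # *am_2016-12
            u'*annee_' + date[0]]           # *annee_2016

The commented-out call at the end of the patch shows the intended entry point: ParseEuropress(html_directory, output_file, 'utf8', 'utf8') (argument names here are placeholders) walks the directory for .html/.HTML exports and returns the number of parsed articles, or 'nofile' when no HTML file is found.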