X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=parse_europress.py;h=7d707fa6796cc21f9df8a600426a1f8b5caff2c6;hp=62ed52c730cdfda0fa9ad1f0698ca237b391f9f5;hb=80f4bfad30ece8835cb1f91349b1dda36439e4ca;hpb=be996b139e57dd5015a9c447b11738537a81d5d6 diff --git a/parse_europress.py b/parse_europress.py old mode 100644 new mode 100755 index 62ed52c..7d707fa --- a/parse_europress.py +++ b/parse_europress.py @@ -10,15 +10,10 @@ import os from HTMLParser import HTMLParser -infile = '/home/pierre/workspace/iramuteq/dev/europress/DocActionPrintSave.aspx.html' - -with codecs.open(infile, 'r', 'utf8') as f : - html = f.read() - -mois = {u'janvier' : '01', +mois = {u'janvier' : '01', u'février' : '02', u'mars' : '03', - u'avril' : '04', + u'avril' : '04', u'mai' : '05', u'juin' : '06', u'juillet' : '07', @@ -26,7 +21,7 @@ mois = {u'janvier' : '01', u'septembre' : '09', u'octobre' : '10', u'novembre' : '11', - u'décembre' : '12', + u'décembre' : '12', u'january' : '01', u'february': '02', u'march' : '03', @@ -51,77 +46,124 @@ def finddate(data): return None else : return [`year`, month, '%02d' % day] - + +def makedate(date): + year = date[0:4] + month = date[4:6] + day = date[6:] + return [year, month, day] + # create a subclass and override the handler methods class MyHTMLParser(HTMLParser): def handle_starttag(self, tag, attrs): #print "Encountered a start tag:", tag if tag == 'span' : - if attrs[0][1] == 'DocPublicationName' : - self.headercount = 0 - self.currentattr = 'DocPublicationName' - elif attrs[0][1] == 'DocHeader' : - self.headercount += 1 - self.currentattr = 'DocHeader' - elif attrs[0][1] == 'TitreArticleVisu' : - self.outfile.write('\n\n') - self.meta.append('\n') - self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace')) - self.meta = [u'****'] - self.nb += 1 - self.currentattr = 'TitreArticleVisu' + if len(attrs) > 0 : + if attrs[0][1] == 'DocPublicationName' : + #print 'DocPublicationName' + self.headercount = 0 + self.currentattr = 'DocPublicationName' + elif attrs[0][1] == 'DocHeader' : + self.headercount += 1 + self.currentattr = 'DocHeader' + elif attrs[0][1] in ['TitreArticleVisu', 'titreArticleVisu', 'titreArticle'] : + self.outfile.write('\n\n') + self.meta.append('\n') + self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace')) + self.meta = [u'****'] + self.nb += 1 + self.currentattr = 'TitreArticleVisu' + elif attrs[0][1] == 'PubliC_lblNodoc' : + self.currentattr = 'PubliC_lblNodoc' elif tag == 'table' : self.currentattr = None + elif tag == 'div' : + if len(attrs)>0 : + if attrs[0][1] == 'publiC-lblNodoc' : + self.currentattr = 'PubliC_lblNodoc' + elif attrs[0][1] == 'DocText' : + self.currentattr = 'TitreArticleVisu' + elif attrs[0][1] == 'titreArticle' : + self.currentattr = 'TitreArticleVisu' + elif tag == 'p' : + if len(attrs) > 0 : + if attrs[0][1] == 'titreArticleVisu' : + # self.outfile.write('\n\n') + # self.meta.append('\n') + # self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace')) + # self.meta = [u'****'] + # self.nb += 1 + self.currentattr = 'TitreArticleVisu' + def handle_endtag(self, tag): pass #print "Encountered an end tag :", tag def handle_data(self, data): + #print self.currentattr if self.currentattr == 'DocPublicationName' : - PublicationName = data.replace(' ', '_').replace('(','').replace(')','').replace('.','').replace('/','').replace("'",'').lower() + #print data + PublicationName = data.strip().replace(' ', '_').replace('(','').replace(')','').replace('-','').replace('.','').replace('/','').replace("'",'').replace(';', '').replace(':', '').replace(u'·','').lower() PublicationName = PublicationName.split(',')[0] - self.meta.append(u'*source_' + PublicationName) - elif self.currentattr == 'DocHeader' : - date = finddate(data) - if date is not None : - self.meta += [u'*date_' + '-'.join(date), u'*am_' + '-'.join(date[0:2]), u'*annee_' + date[0]] + if len([val for val in self.meta if val.startswith(u'*source_')]) == 0 : + self.meta.append(u'*source_' + PublicationName) + self.currentattr = None +# elif self.currentattr == 'DocHeader' : +# date = finddate(data) +# if date is not None : +# self.meta += [u'*date_' + '-'.join(date), u'*am_' + '-'.join(date[0:2]), u'*annee_' + date[0]] elif self.currentattr == 'TitreArticleVisu' : + #print data if data.startswith(u'©') : self.currentattr = None return - self.outfile.write(' '.join(data.replace('\n', ' ').split()).encode('utf8', errors='replace') + ' ') - + self.content.append(' '.join(data.replace('\n', ' ').split()) + ' ') + #self.outfile.write(' '.join(data.replace('\n', ' ').split()).encode('utf8', errors='replace') + ' ') + elif self.currentattr == 'PubliC_lblNodoc' : + date = data.split(u'·')[1]#data[5:13] + date = makedate(date) + self.meta += [u'*date_' + '-'.join(date), u'*am_' + '-'.join(date[0:2]), u'*annee_' + date[0]] + self.meta.append('\n') + self.outfile.write('\n\n') + self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace')) + self.outfile.write(' '.join(self.content).encode('utf8')) + self.content = [] + self.meta = [u'****'] + self.nb += 1 + self.currentattr = None + def doinit(self, outfile): self.currentattr = None self.meta = [u'****'] + self.content = [] self.nb = 0 self.outfile = outfile - + print 'init ok' -class ParseEuropress : - def __init__(self, txtdir, fileout, encodage_in, encodage_out) : + +def ParseEuropress(txtdir, fileout, encodage_in, encodage_out) : files = [] - for root, subfolders, subfiles in os.walk(txtdir) : - nf = [os.path.join(root, f) for f in subfiles if f.split('.')[-1] == 'html'] - nf.sort() - files += nf + if os.path.isdir(txtdir) : + for root, subfolders, subfiles in os.walk(txtdir) : + nf = [os.path.join(root, f) for f in subfiles if f.split('.')[-1] in ['html', 'HTML'] ] + nf.sort() + files += nf + if len(files) == 0 : + return 'nofile' + elif os.path.isfile(txtdir) : + files.append(txtdir) tot = 0 parser = MyHTMLParser() - with open(fileout,'w') as outf : - for f in files : + with open(fileout,'w') as outf : + for f in files : print f parser.doinit(outf) - with codecs.open(f, 'rU', encodage_in) as infile : - content = infile.read() + with codecs.open(f, 'rU', encodage_in) as infile : + content = infile.read() + content = HTMLParser().unescape(content) parser.feed(content) tot += parser.nb - print 'ok', parser.nb, 'articles', ' - total : ', tot - -#ParseEuropress('/home/pierre/fac/lerass/mariage/press', '/home/pierre/workspace/iramuteq/dev/europress/corpus_out.txt', 'utf8', 'utf8') - #print "Encountered some data :", data -# instantiate the parser and fed it some HTML -#outfile = '/home/pierre/workspace/iramuteq/dev/europress/corpus_out.txt' -#parser = MyHTMLParser() -#with open(outfile, 'w') as f : -# parser.doinit(f) -# parser.feed(html) \ No newline at end of file + return tot + +#ParseEuropress('/home/pierre/fac/HDR/psychanalyse', +# '/home/pierre/fac/HDR/psycha.txt', 'utf8', 'utf8')