X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=parse_europress.py;h=5678acac0737efaa3365338efe8e57b897ed10a2;hp=62ed52c730cdfda0fa9ad1f0698ca237b391f9f5;hb=148fe710bf14981c45e865e8b4ddb68333e62f7c;hpb=be996b139e57dd5015a9c447b11738537a81d5d6 diff --git a/parse_europress.py b/parse_europress.py index 62ed52c..5678aca 100644 --- a/parse_europress.py +++ b/parse_europress.py @@ -10,11 +10,6 @@ import os from HTMLParser import HTMLParser -infile = '/home/pierre/workspace/iramuteq/dev/europress/DocActionPrintSave.aspx.html' - -with codecs.open(infile, 'r', 'utf8') as f : - html = f.read() - mois = {u'janvier' : '01', u'février' : '02', u'mars' : '03', @@ -78,7 +73,7 @@ class MyHTMLParser(HTMLParser): #print "Encountered an end tag :", tag def handle_data(self, data): if self.currentattr == 'DocPublicationName' : - PublicationName = data.replace(' ', '_').replace('(','').replace(')','').replace('.','').replace('/','').replace("'",'').lower() + PublicationName = data.replace(' ', '_').replace('(','').replace(')','').replace('.','').replace('/','').replace("'",'').replace(';', '').replace(':', '').replace(u'·','').lower() PublicationName = PublicationName.split(',')[0] self.meta.append(u'*source_' + PublicationName) elif self.currentattr == 'DocHeader' : @@ -98,13 +93,14 @@ class MyHTMLParser(HTMLParser): self.outfile = outfile -class ParseEuropress : - def __init__(self, txtdir, fileout, encodage_in, encodage_out) : +def ParseEuropress(txtdir, fileout, encodage_in, encodage_out) : files = [] for root, subfolders, subfiles in os.walk(txtdir) : nf = [os.path.join(root, f) for f in subfiles if f.split('.')[-1] == 'html'] nf.sort() files += nf + if len(files) == 0 : + return 'nofile' tot = 0 parser = MyHTMLParser() with open(fileout,'w') as outf : @@ -115,7 +111,7 @@ class ParseEuropress : content = infile.read() parser.feed(content) tot += parser.nb - print 'ok', parser.nb, 'articles', ' - total : ', tot + return tot #ParseEuropress('/home/pierre/fac/lerass/mariage/press', '/home/pierre/workspace/iramuteq/dev/europress/corpus_out.txt', 'utf8', 'utf8') #print "Encountered some data :", data