X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=parse_europress.py;h=7d707fa6796cc21f9df8a600426a1f8b5caff2c6;hp=9927063624194935dde6ad72a5764379ed0ac4ac;hb=93daeec719b63428df470cd40de60dbefb270b4a;hpb=e4ec2234f0a1224c628c7d6017211cc820913385

diff --git a/parse_europress.py b/parse_europress.py
old mode 100644
new mode 100755
index 9927063..7d707fa
--- a/parse_europress.py
+++ b/parse_europress.py
@@ -67,7 +67,7 @@ class MyHTMLParser(HTMLParser):
                 elif attrs[0][1] == 'DocHeader' :
                     self.headercount += 1
                     self.currentattr = 'DocHeader'
-                elif attrs[0][1] in ['TitreArticleVisu', 'titreArticleVisu'] :
+                elif attrs[0][1] in ['TitreArticleVisu', 'titreArticleVisu', 'titreArticle'] :
                     self.outfile.write('\n\n')
                     self.meta.append('\n')
                     self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace'))
@@ -82,6 +82,10 @@ class MyHTMLParser(HTMLParser):
             if len(attrs)>0 :
                 if attrs[0][1] == 'publiC-lblNodoc' :
                     self.currentattr = 'PubliC_lblNodoc'
+                elif attrs[0][1] == 'DocText' :
+                    self.currentattr = 'TitreArticleVisu'
+                elif attrs[0][1] == 'titreArticle' :
+                    self.currentattr = 'TitreArticleVisu'
         elif tag == 'p' :
             if len(attrs) > 0 :
                 if attrs[0][1] == 'titreArticleVisu' :
@@ -96,11 +100,13 @@ class MyHTMLParser(HTMLParser):
         pass
         #print "Encountered an end tag :", tag
     def handle_data(self, data):
+        #print self.currentattr
         if self.currentattr == 'DocPublicationName' :
             #print data
             PublicationName = data.strip().replace(' ', '_').replace('(','').replace(')','').replace('-','').replace('.','').replace('/','').replace("'",'').replace(';', '').replace(':', '').replace(u'·','').lower()
             PublicationName = PublicationName.split(',')[0]
-            self.meta.append(u'*source_' + PublicationName)
+            if len([val for val in self.meta if val.startswith(u'*source_')]) == 0 :
+                self.meta.append(u'*source_' + PublicationName)
             self.currentattr = None
 #        elif self.currentattr == 'DocHeader' :
 #            date = finddate(data)
@@ -114,7 +120,7 @@ class MyHTMLParser(HTMLParser):
             self.content.append(' '.join(data.replace('\n', ' ').split()) + ' ')
             #self.outfile.write(' '.join(data.replace('\n', ' ').split()).encode('utf8', errors='replace') + ' ')
         elif self.currentattr == 'PubliC_lblNodoc' :
-            date = data[5:13]
+            date = data.split(u'·')[1]#data[5:13]
             date = makedate(date)
             self.meta += [u'*date_' + '-'.join(date), u'*am_' + '-'.join(date[0:2]), u'*annee_' + date[0]]
             self.meta.append('\n')
@@ -125,7 +131,7 @@ class MyHTMLParser(HTMLParser):
             self.meta = [u'****']
             self.nb += 1
         self.currentattr = None
-
+
     def doinit(self, outfile):
         self.currentattr = None
         self.meta = [u'****']
@@ -133,28 +139,31 @@ class MyHTMLParser(HTMLParser):
         self.nb = 0
         self.outfile = outfile
         print 'init ok'
-
+
 def ParseEuropress(txtdir, fileout, encodage_in, encodage_out) :
     files = []
-    for root, subfolders, subfiles in os.walk(txtdir) :
-        nf = [os.path.join(root, f) for f in subfiles if f.split('.')[-1] in ['html', 'HTML'] ]
-        nf.sort()
-        files += nf
-    if len(files) == 0 :
-        return 'nofile'
+    if os.path.isdir(txtdir) :
+        for root, subfolders, subfiles in os.walk(txtdir) :
+            nf = [os.path.join(root, f) for f in subfiles if f.split('.')[-1] in ['html', 'HTML'] ]
+            nf.sort()
+            files += nf
+        if len(files) == 0 :
+            return 'nofile'
+    elif os.path.isfile(txtdir) :
+        files.append(txtdir)
     tot = 0
     parser = MyHTMLParser()
-    with open(fileout,'w') as outf :
-        for f in files :
+    with open(fileout,'w') as outf :
+        for f in files :
             print f
             parser.doinit(outf)
-            with codecs.open(f, 'rU', encodage_in) as infile :
-                content = infile.read()
+            with codecs.open(f, 'rU', encodage_in) as infile :
+                content = infile.read()
             content = HTMLParser().unescape(content)
             parser.feed(content)
             tot += parser.nb
     return tot
 
 
-#ParseEuropress('/home/pierre/fac/etudiant/DeNadai/corpus_loi_travail',
-#               '/home/pierre/fac/etudiant/DeNadai/corpus_loi_W.txt', 'utf8', 'utf8')
+#ParseEuropress('/home/pierre/fac/HDR/psychanalyse',
+#               '/home/pierre/fac/HDR/psycha.txt', 'utf8', 'utf8')
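
Usage note (not part of the patch): a minimal sketch of how the patched ParseEuropress entry point can be called, based only on the signature and return values visible in the hunks above. With this change the first argument may be either a directory that is walked for .html/.HTML exports or a single HTML file; the function returns the string 'nofile' when a directory yields no HTML file, otherwise the number of documents parsed. The code targets Python 2 (bare print statements), and the paths below are placeholders, not paths taken from the repository.

# -*- coding: utf-8 -*-
# Hypothetical call site for the patched parse_europress.py; paths are placeholders.
from parse_europress import ParseEuropress

res = ParseEuropress('/path/to/html_exports',    # directory of .html/.HTML files, or a single HTML file
                     '/path/to/corpus_out.txt',  # corpus file written by the parser
                     'utf8',                     # encodage_in: encoding used to read the HTML files
                     'utf8')                     # encodage_out: output encoding parameter
if res == 'nofile':
    print 'no HTML file found in the input directory'
else:
    print '%i documents parsed' % res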