# -*- coding: utf-8 -*-
#Author: Pierre Ratinaud
#Copyright (c) 2008-2020 Pierre Ratinaud
#modification pour python 3 : Laurent Mérat, 6x7 - mai 2020
#License: GNU/GPL

#------------------------------------
# import des modules python
#------------------------------------
import codecs
import html
import os
from html.parser import HTMLParser

#from BeautifulSoup import BeautifulSoup #???
-mois = {u'janvier' : '01',
- u'février' : '02',
- u'mars' : '03',
- u'avril' : '04',
- u'mai' : '05',
- u'juin' : '06',
- u'juillet' : '07',
- u'août' : '08',
- u'septembre' : '09',
- u'octobre' : '10',
- u'novembre' : '11',
- u'décembre' : '12',
- u'january' : '01',
- u'february': '02',
- u'march' : '03',
- u'april': '04',
- u'may': '05',
- u'june' : '06',
- u'july': '07',
- u'august': '08',
- u'september' : '09',
- u'october': '10',
- u'november': '11',
- u'december': '12'}
+
+mois = {'janvier' : '01',
+ 'février' : '02',
+ 'mars' : '03',
+ 'avril' : '04',
+ 'mai' : '05',
+ 'juin' : '06',
+ 'juillet' : '07',
+ 'août' : '08',
+ 'septembre' : '09',
+ 'octobre' : '10',
+ 'novembre' : '11',
+ 'décembre' : '12',
+ 'january' : '01',
+ 'february': '02',
+ 'march' : '03',
+ 'april': '04',
+ 'may': '05',
+ 'june' : '06',
+ 'july': '07',
+ 'august': '08',
+ 'september' : '09',
+ 'october': '10',
+ 'november': '11',
+ 'december': '12'}
def finddate(data):
except :
return None
else :
- return [`year`, month, '%02d' % day]
+ return [repr(year), month, '%02d' % day]
def makedate(date):
year = date[0:4]
# create a subclass and override the handler methods
class MyHTMLParser(HTMLParser):
def handle_starttag(self, tag, attrs):
#print "Encountered a start tag:", tag
if tag == 'span' :
elif attrs[0][1] in ['TitreArticleVisu', 'titreArticleVisu', 'titreArticle'] :
self.outfile.write('\n\n')
self.meta.append('\n')
- self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace'))
- self.meta = [u'****']
+ self.outfile.write(' '.join(self.meta))
+ self.meta = ['****']
self.nb += 1
self.currentattr = 'TitreArticleVisu'
elif attrs[0][1] == 'PubliC_lblNodoc' :
# self.outfile.write('\n\n')
# self.meta.append('\n')
# self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace'))
- # self.meta = [u'****']
+ # self.meta = ['****']
# self.nb += 1
self.currentattr = 'TitreArticleVisu'
def handle_endtag(self, tag):
pass
#print "Encountered an end tag :", tag
def handle_data(self, data):
#print self.currentattr
if self.currentattr == 'DocPublicationName' :
#print data
- PublicationName = data.strip().replace(' ', '_').replace('(','').replace(')','').replace('-','').replace('.','').replace('/','').replace("'",'').replace(';', '').replace(':', '').replace(u'·','').lower()
+ PublicationName = data.strip().replace(' ', '_').replace('(','').replace(')','').replace('-','').replace('.','').replace('/','').replace("'",'').replace(';', '').replace(':', '').replace('·','').lower()
PublicationName = PublicationName.split(',')[0]
- if len([val for val in self.meta if val.startswith(u'*source_')]) == 0 :
- self.meta.append(u'*source_' + PublicationName)
+ if len([val for val in self.meta if val.startswith('*source_')]) == 0 :
+ self.meta.append('*source_' + PublicationName)
self.currentattr = None
# elif self.currentattr == 'DocHeader' :
# date = finddate(data)
# if date is not None :
-# self.meta += [u'*date_' + '-'.join(date), u'*am_' + '-'.join(date[0:2]), u'*annee_' + date[0]]
+# self.meta += ['*date_' + '-'.join(date), '*am_' + '-'.join(date[0:2]), '*annee_' + date[0]]
elif self.currentattr == 'TitreArticleVisu' :
#print data
- if data.startswith(u'©') :
+ if data.startswith('©') :
self.currentattr = None
return
self.content.append(' '.join(data.replace('\n', ' ').split()) + ' ')
#self.outfile.write(' '.join(data.replace('\n', ' ').split()).encode('utf8', errors='replace') + ' ')
elif self.currentattr == 'PubliC_lblNodoc' :
- date = data.split(u'·')[1]#data[5:13]
+ date = data.split('·')[1]#data[5:13]
date = makedate(date)
- self.meta += [u'*date_' + '-'.join(date), u'*am_' + '-'.join(date[0:2]), u'*annee_' + date[0]]
+ self.meta += ['*date_' + '-'.join(date), '*am_' + '-'.join(date[0:2]), '*annee_' + date[0]]
self.meta.append('\n')
self.outfile.write('\n\n')
- self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace'))
- self.outfile.write(' '.join(self.content).encode('utf8'))
+ self.outfile.write(' '.join(self.meta))
+ self.outfile.write(' '.join(self.content))
self.content = []
- self.meta = [u'****']
+ self.meta = ['****']
self.nb += 1
self.currentattr = None
def doinit(self, outfile):
self.currentattr = None
- self.meta = [u'****']
+ self.meta = ['****']
self.content = []
self.nb = 0
self.outfile = outfile
- print 'init ok'
-
+ print('init ok')
def ParseEuropress(txtdir, fileout, encodage_in, encodage_out) :
- files = []
- if os.path.isdir(txtdir) :
- for root, subfolders, subfiles in os.walk(txtdir) :
- nf = [os.path.join(root, f) for f in subfiles if f.split('.')[-1] in ['html', 'HTML'] ]
- nf.sort()
- files += nf
- if len(files) == 0 :
- return 'nofile'
- elif os.path.isfile(txtdir) :
- files.append(txtdir)
- tot = 0
- parser = MyHTMLParser()
- with open(fileout,'w') as outf :
- for f in files :
- print f
- parser.doinit(outf)
- with codecs.open(f, 'rU', encodage_in) as infile :
- content = infile.read()
- content = HTMLParser().unescape(content)
- parser.feed(content)
- tot += parser.nb
- return tot
-
-#ParseEuropress('/home/pierre/fac/HDR/psychanalyse',
-# '/home/pierre/fac/HDR/psycha.txt', 'utf8', 'utf8')
+ files = []
+ if os.path.isdir(txtdir) :
+ for root, subfolders, subfiles in os.walk(txtdir) :
+ nf = [os.path.join(root, f) for f in subfiles if f.split('.')[-1] in ['html', 'HTML'] ]
+ nf.sort()
+ files += nf
+ if len(files) == 0 :
+ return 'nofile'
+ elif os.path.isfile(txtdir) :
+ files.append(txtdir)
+ tot = 0
+ parser = MyHTMLParser()
+ with open(fileout,'w') as outf :
+ for f in files :
+ print(f)
+ parser.doinit(outf)
+ with codecs.open(f, 'r', encodage_in) as infile :
+ content = infile.read()
+ content = HTMLParser().unescape(content)
+ parser.feed(content)
+ tot += parser.nb
+ return tot