1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2008-2020 Pierre Ratinaud
4 #modification pour python 3 : Laurent Mérat, 6x7 - mai 2020
7 #------------------------------------
8 # import des modules python
9 #------------------------------------
13 #from BeautifulSoup import BeautifulSoup #???
15 #------------------------------------
16 # import des fichiers du projet
17 #------------------------------------
18 from html.parser import HTMLParser
21 mois = {'janvier' : '01',
56 return [repr(year), month, '%02d' % day]
62 return [year, month, day]
65 # create a subclass and override the handler methods
66 class MyHTMLParser(HTMLParser):
# HTMLParser subclass driven by one state variable, self.currentattr.
# handle_starttag sets it from the value of a tag's first attribute, and
# handle_data uses it to decide what the following text is: a publication
# name, text to accumulate in self.content, or the line carrying the date.
# Metadata tokens are collected in self.meta and written, together with the
# accumulated content, to self.outfile.
# NOTE(review): several guards/else lines of this class are not shown here;
# comments below describe only the statements that are visible.
68 def handle_starttag(self, tag, attrs):
# Called by HTMLParser for every opening tag; attrs is a list of
# (name, value) pairs. Only attrs[0][1], the value of the first attribute,
# is tested below.
# NOTE(review): attrs[0] raises IndexError for a tag without attributes;
# confirm a tag/emptiness guard precedes these tests.
69 #print "Encountered a start tag:", tag
72 if attrs[0][1] == 'DocPublicationName' :
73 #print 'DocPublicationName'
75 self.currentattr = 'DocPublicationName'
76 elif attrs[0][1] == 'DocHeader' :
78 self.currentattr = 'DocHeader'
79 elif attrs[0][1] in ['TitreArticleVisu', 'titreArticleVisu', 'titreArticle'] :
# Title marker: emit a blank-line separator, then the metadata collected so
# far as one space-joined line, and start collecting text.
80 self.outfile.write('\n\n')
81 self.meta.append('\n')
82 self.outfile.write(' '.join(self.meta))
85 self.currentattr = 'TitreArticleVisu'
86 elif attrs[0][1] == 'PubliC_lblNodoc' :
87 self.currentattr = 'PubliC_lblNodoc'
89 self.currentattr = None
# Second group of tests: alternative spellings of the same markers are mapped
# onto the state names used by handle_data.
92 if attrs[0][1] == 'publiC-lblNodoc' :
93 self.currentattr = 'PubliC_lblNodoc'
94 elif attrs[0][1] == 'DocText' :
# Body text is collected under the same state as the title.
95 self.currentattr = 'TitreArticleVisu'
96 elif attrs[0][1] == 'titreArticle' :
97 self.currentattr = 'TitreArticleVisu'
100 if attrs[0][1] == 'titreArticleVisu' :
# In this branch the write of the metadata line is disabled (commented out);
# only the state change remains.
101 # self.outfile.write('\n\n')
102 # self.meta.append('\n')
103 # self.outfile.write(' '.join(self.meta).encode('utf8', errors='replace'))
104 # self.meta = ['****']
106 self.currentattr = 'TitreArticleVisu'
108 def handle_endtag(self, tag):
# Closing-tag hook; no active statement is visible for it here.
110 #print "Encountered an end tag :", tag
112 def handle_data(self, data):
# Called by HTMLParser with the text found between tags; what happens to
# `data` depends on the state left by handle_starttag.
113 #print self.currentattr
114 if self.currentattr == 'DocPublicationName' :
# Normalise the publication name into a single token: trim, replace spaces
# with underscores, drop punctuation, lowercase, keep what precedes the
# first comma.
116 PublicationName = data.strip().replace(' ', '_').replace('(','').replace(')','').replace('-','').replace('.','').replace('/','').replace("'",'').replace(';', '').replace(':', '').replace('·','').lower()
117 PublicationName = PublicationName.split(',')[0]
# Add a '*source_' entry only if self.meta does not already hold one.
118 if len([val for val in self.meta if val.startswith('*source_')]) == 0 :
119 self.meta.append('*source_' + PublicationName)
120 self.currentattr = None
121 # elif self.currentattr == 'DocHeader' :
122 # date = finddate(data)
123 # if date is not None :
124 # self.meta += ['*date_' + '-'.join(date), '*am_' + '-'.join(date[0:2]), '*annee_' + date[0]]
125 elif self.currentattr == 'TitreArticleVisu' :
# Text starting with the copyright sign stops the collection.
127 if data.startswith('©') :
128 self.currentattr = None
# Store the text with newlines and runs of whitespace collapsed to single
# spaces, followed by one trailing space (presumably the alternative branch
# of the test above; TODO confirm).
130 self.content.append(' '.join(data.replace('\n', ' ').split()) + ' ')
131 #self.outfile.write(' '.join(data.replace('\n', ' ').split()).encode('utf8', errors='replace') + ' ')
132 elif self.currentattr == 'PubliC_lblNodoc' :
# The date is the second '·'-separated field of the text.
# NOTE(review): [1] raises IndexError when the text contains no '·'.
133 date = data.split('·')[1]#data[5:13]
# makedate is defined elsewhere in this file; the indexing below treats its
# result as a sequence of strings with the year first (presumably
# [year, month, day]; TODO confirm).
134 date = makedate(date)
# Three metadata tokens: full date, year-month, and year alone.
135 self.meta += ['*date_' + '-'.join(date), '*am_' + '-'.join(date[0:2]), '*annee_' + date[0]]
# Flush: blank-line separator, the metadata line, then the collected content.
136 self.meta.append('\n')
137 self.outfile.write('\n\n')
138 self.outfile.write(' '.join(self.meta))
139 self.outfile.write(' '.join(self.content))
143 self.currentattr = None
145 def doinit(self, outfile):
# Resets the state variable and stores the writable file object used by the
# handlers.
# NOTE(review): the handlers also read self.meta and self.content; their
# initialisation is not shown here, so confirm it happens before feed().
146 self.currentattr = None
150 self.outfile = outfile
153 def ParseEuropress(txtdir, fileout, encodage_in, encodage_out) :
# Reads HTML input from `txtdir` (a directory or a single file), decoded with
# `encodage_in`, and opens `fileout` for writing; a MyHTMLParser instance is
# created to process the input.
# NOTE(review): only part of this function is shown here; the comments below
# describe the visible statements only.
155 if os.path.isdir(txtdir) :
# Directory: walk it recursively and keep the files whose extension is
# exactly 'html' or 'HTML' (mixed case such as 'Html' is not matched).
156 for root, subfolders, subfiles in os.walk(txtdir) :
157 nf = [os.path.join(root, f) for f in subfiles if f.split('.')[-1] in ['html', 'HTML'] ]
162 elif os.path.isfile(txtdir) :
165 parser = MyHTMLParser()
# NOTE(review): no encoding is passed to open(), so the output uses the
# platform default encoding; `encodage_out` is not used on this line.
166 with open(fileout,'w') as outf :
170 with codecs.open(f, 'r', encodage_in) as infile :
171 content = infile.read()
# Replace HTML character references (e.g. &eacute;) by the characters they
# stand for.
# NOTE(review): HTMLParser.unescape() was deprecated in Python 3.4 and removed
# in Python 3.9, where this call raises AttributeError; html.unescape(content)
# is the documented replacement.
172 content = HTMLParser().unescape(content)