1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2008-2020 Pierre Ratinaud
6 #------------------------------------
7 # import des modules python
8 #------------------------------------
14 #------------------------------------
15 # import des modules wx
16 #------------------------------------
18 import wx.lib.sized_controls as sc
19 import wx.lib.filebrowsebutton as filebrowse
21 #------------------------------------
22 # import des fichiers du projet
23 #------------------------------------
24 from html.parser import HTMLParser
# Relative path of the directory whose entries are listed with os.listdir()
# and opened one by one by the module-level code at the end of this file.
27 htmldir = 'dev/factiva_html'
# HTMLParser subclass that accumulates text fragments in self.data.
# self.data is used as a list of lists: handle_data() appends each fragment to
# the last inner list, and the module-level code reads each inner list as one
# article record.
# NOTE(review): the constructor header and the initialisation of self.data,
# self.count and self.author are not among the lines shown here - confirm
# their initial values against the full file.
30 class MyHTMLParser(HTMLParser):
# Initialise the base HTMLParser state.
33 HTMLParser.__init__(self)
# Callback invoked by HTMLParser for each opening tag; attrs is the list of
# (name, value) attribute pairs of that tag.
42 def handle_starttag(self, tag, attrs):
# Tags other than <div>, <p> and <b> take this branch.
44 if tag not in ['div', 'p', 'b'] :
# Value of the first attribute, split on whitespace; tagname[0] is its first
# token. NOTE(review): attrs[0] raises IndexError on an empty attribute list
# unless a guard exists on a line not shown here - confirm.
54 tagname = attrs[0][1].split()
# NOTE(review): tagtype is assigned on a line not shown here - presumably
# attrs[0][0], the name of the first attribute; confirm.
# First attribute is class="article ...".
55 if tagtype == 'class' and tagname[0] == 'article' :
# First attribute is class="author ...".
60 elif tagtype == 'class' and tagname[0] == 'author' :
65 tagname = attrs[0][1].split()
# First attribute is class="articleParagraph ...".
66 if tagtype == 'class' and tagname[0] == 'articleParagraph' :
# Callback invoked by HTMLParser for text found between tags.
72 def handle_data(self, data) :
73 #print data.encode('utf-8')
75 #print data.encode('utf-8')
78 #print 'data', data.encode('utf8')
# Append the fragment to the most recent inner list of self.data.
81 self.data[-1].append(data)
# When self.count equals 2 and self.author is falsy, the placeholder string
# 'PAS DAUTEUR' (French for "no author") is appended to the current record.
# NOTE(review): presumably this keeps later fields at fixed positions, since
# the module-level code reads records by index (art[4], art[5], art[10:]) -
# confirm.
86 if self.count == 2 and not self.author :
87 self.data[-1].append('PAS DAUTEUR')
89 self.data[-1].append(data)
92 self.data[-1].append(data)
93 # print "Encountered a start tag:", tag
94 #def handle_endtag(self, tag):
95 # print "Encountered an end tag :", tag
96 #def handle_data(self, data):
97 # print "Encountered some data :", data
99 # direct execution ???
# List the entries of htmldir and create a single parser instance.
100 files = os.listdir(htmldir)
101 parser = MyHTMLParser()
# NOTE(review): the statement that binds f is not shown here - presumably a
# `for f in files` loop; confirm.
# Build the path of the entry inside htmldir.
103 f= os.path.join(htmldir, f)
# Read the whole file as UTF-8 text.
104 with codecs.open(f, 'r', 'utf8') as infile :
105 content = infile.read()
# NOTE(review): the call that passes content to the parser is not shown here.
# For every record in parser.data build a [header, body] pair:
#   header = '**** *date_<art[4]> *s_<art[5]>' with the spaces inside art[4]
#            and art[5] replaced by underscores;
#   body   = elements art[10] up to, but not including, the last element,
#            joined with single spaces.
# A record with fewer than 6 elements raises IndexError on art[4] / art[5].
107 out = [[' '.join(['****','*date_'+art[4].replace(' ','_'),'*s_'+art[5].replace(' ','_')]), ' '.join(art[10:len(art)-1])] for art in parser.data]
# NOTE(review): i is defined on a line not shown here; this prints one raw
# record, apparently for debugging.
109 print(parser.data[i])
# Join header and body of each pair with a space, then print all articles
# separated by three newlines.
110 out = [' '.join(art) for art in out]
111 print('\n\n\n'.join(out))