projects
/
iramuteq
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
| inline |
side by side
(parent:
460fae9
)
europresse parser
author
pierre
<ratinaud@univ-tlse2.fr>
Sat, 9 Nov 2024 10:18:35 +0000
(11:18 +0100)
committer
pierre
<ratinaud@univ-tlse2.fr>
Sat, 9 Nov 2024 10:18:35 +0000
(11:18 +0100)
parse_europress.py
patch
|
blob
|
history
diff --git a/parse_europress.py b/parse_europress.py
index
a4c7e23
..
5159ef2
100755
(executable)
--- a/parse_europress.py
+++ b/parse_europress.py
@@ -16,6 +16,7 @@
import os
# import des fichiers du projet
#------------------------------------
from html.parser import HTMLParser
+from html import unescape
mois = {'janvier' : '01',
@@ -169,7 +170,7 @@
def ParseEuropress(txtdir, fileout, encodage_in, encodage_out) :
parser.doinit(outf)
with codecs.open(f, 'r', encodage_in) as infile :
content = infile.read()
- content = HTMLParser().unescape(content)
+ content = unescape(content)
parser.feed(content)
tot += parser.nb
return tot