From 7a42265190acc197cb76a86ac3d7cd234cfaba4f Mon Sep 17 00:00:00 2001
From: pierre
Date: Sat, 9 Nov 2024 11:18:35 +0100
Subject: [PATCH] europresse parser

---
NOTE(review): line breaks and the leading indentation of the hunk lines were
reconstructed from a whitespace-collapsed copy of this patch; confirm them
against parse_europress.py before applying.

 parse_europress.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/parse_europress.py b/parse_europress.py
index a4c7e23..5159ef2 100755
--- a/parse_europress.py
+++ b/parse_europress.py
@@ -16,6 +16,7 @@ import os
 # import des fichiers du projet
 #------------------------------------
 from html.parser import HTMLParser
+from html import unescape
 
 
 mois = {'janvier' : '01',
@@ -169,7 +170,7 @@ def ParseEuropress(txtdir, fileout, encodage_in, encodage_out) :
         parser.doinit(outf)
         with codecs.open(f, 'r', encodage_in) as infile :
             content = infile.read()
-            content = HTMLParser().unescape(content)
+            content = unescape(content)
             parser.feed(content)
         tot += parser.nb
     return tot
-- 
2.7.4