- tot = 0
- parser = MyHTMLParser()
- with open(fileout,'w') as outf :
- for f in files :
- print f
- parser.doinit(outf)
- with codecs.open(f, 'rU', encodage_in) as infile :
- content = infile.read()
- parser.feed(content)
- tot += parser.nb
- return tot
-
-#ParseEuropress('/home/pierre/fac/lerass/mariage/press', '/home/pierre/workspace/iramuteq/dev/europress/corpus_out.txt', 'utf8', 'utf8')
- #print "Encountered some data :", data
-# instantiate the parser and fed it some HTML
-#outfile = '/home/pierre/workspace/iramuteq/dev/europress/corpus_out.txt'
-#parser = MyHTMLParser()
-#with open(outfile, 'w') as f :
-# parser.doinit(f)
-# parser.feed(html)
\ No newline at end of file
+ elif os.path.isfile(txtdir) :
+ files.append(txtdir)
+ tot = 0
+ parser = MyHTMLParser()
+ with open(fileout,'w') as outf :
+ for f in files :
+ print(f)
+ parser.doinit(outf)
+ with codecs.open(f, 'r', encodage_in) as infile :
+ content = infile.read()
+ content = HTMLParser().unescape(content)
+ parser.feed(content)
+ tot += parser.nb
+ return tot