- files = []
- if os.path.isdir(txtdir) :
- for root, subfolders, subfiles in os.walk(txtdir) :
- nf = [os.path.join(root, f) for f in subfiles if f.split('.')[-1] in ['html', 'HTML'] ]
- nf.sort()
- files += nf
- if len(files) == 0 :
- return 'nofile'
- elif os.path.isfile(txtdir) :
- files.append(txtdir)
- tot = 0
- parser = MyHTMLParser()
- with open(fileout,'w') as outf :
- for f in files :
- print f
- parser.doinit(outf)
- with codecs.open(f, 'rU', encodage_in) as infile :
- content = infile.read()
- content = HTMLParser().unescape(content)
- parser.feed(content)
- tot += parser.nb
- return tot
-
-#ParseEuropress('/home/pierre/fac/HDR/psychanalyse',
-# '/home/pierre/fac/HDR/psycha.txt', 'utf8', 'utf8')
+ files = []
+ if os.path.isdir(txtdir) :
+ for root, subfolders, subfiles in os.walk(txtdir) :
+ nf = [os.path.join(root, f) for f in subfiles if f.split('.')[-1] in ['html', 'HTML'] ]
+ nf.sort()
+ files += nf
+ if len(files) == 0 :
+ return 'nofile'
+ elif os.path.isfile(txtdir) :
+ files.append(txtdir)
+ tot = 0
+ parser = MyHTMLParser()
+ with open(fileout,'w') as outf :
+ for f in files :
+ print(f)
+ parser.doinit(outf)
+ with codecs.open(f, 'r', encodage_in) as infile :
+ content = infile.read()
+ content = HTMLParser().unescape(content)
+ parser.feed(content)
+ tot += parser.nb
+ return tot