#print "Encountered an end tag :", tag
def handle_data(self, data):
if self.currentattr == 'DocPublicationName' :
- PublicationName = data.replace(' ', '_').replace('(','').replace(')','').replace('.','').replace('/','').replace("'",'').lower()
+ PublicationName = data.replace(' ', '_').replace('(','').replace(')','').replace('.','').replace('/','').replace("'",'').replace(';', '').replace(':', '').replace(u'·','').lower()
PublicationName = PublicationName.split(',')[0]
self.meta.append(u'*source_' + PublicationName)
elif self.currentattr == 'DocHeader' :
self.outfile = outfile
-class ParseEuropress :
- def __init__(self, txtdir, fileout, encodage_in, encodage_out) :
+def ParseEuropress(txtdir, fileout, encodage_in, encodage_out) :
files = []
for root, subfolders, subfiles in os.walk(txtdir) :
nf = [os.path.join(root, f) for f in subfiles if f.split('.')[-1] == 'html']
nf.sort()
files += nf
+ if len(files) == 0 :
+ return 'nofile'
tot = 0
parser = MyHTMLParser()
with open(fileout,'w') as outf :
content = infile.read()
parser.feed(content)
tot += parser.nb
- print 'ok', parser.nb, 'articles', ' - total : ', tot
+ return tot
#ParseEuropress('/home/pierre/fac/lerass/mariage/press', '/home/pierre/workspace/iramuteq/dev/europress/corpus_out.txt', 'utf8', 'utf8')
#print "Encountered some data :", data