From: Pierre Date: Thu, 31 Oct 2013 10:03:53 +0000 (+0100) Subject: Merge branch 'master' of http://www.iramuteq.org/git/iramuteq X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=commitdiff_plain;h=1eebb48d56a3fecb1e753073c0a9cdca87cde421;hp=45dd6badc0446adedf6728d3d8f7f28cfec2c6db Merge branch 'master' of iramuteq.org/git/iramuteq merge --- diff --git a/parse_factiva_txt.py b/parse_factiva_txt.py index 9cb2af2..eddbe47 100644 --- a/parse_factiva_txt.py +++ b/parse_factiva_txt.py @@ -6,6 +6,7 @@ import os import codecs +import re #txtdir = 'dev/factiva_txt' @@ -32,7 +33,9 @@ def parsetxtpaste(txt): ucis.append([[u'****'],'']) keepline = False if line.startswith('SN ') : #source - source = '*source_' + line[4:].replace(' ','').replace('\'','').replace(u'´','').replace(u'’','').replace('-','').lower() + jsource = re.sub('[^A-Za-z0-9]', '', line[4:]) + source = u'_'.join([u'*source', jsource]).lower() + #source = '*source_' + line[4:].replace(' ','').replace('\'','').replace(u'´','').replace(u'’','').replace('-','').lower() ucis[-1][0].append(source) elif line.startswith('PD ') : #date mois_annee = '*ma_' + line[4:].split(' ')[1] + line[4:].split(' ')[2] @@ -56,18 +59,22 @@ def print_ucis(ucis, ofile, encodage) : #elimination des articles vides ucis = [uci for uci in ucis if uci[1].strip() != ''] toprint = '\n\n'.join(['\n'.join([' '.join(uci[0]),uci[1]]) for uci in ucis]) - ofile.write(toprint.encode(encodage)) + ofile.write(toprint.encode(encodage) + '\n') class ParseFactivaPaste : def __init__(self, txtdir, fileout, encodage_in, encodage_out) : files = os.listdir(txtdir) + tot = 0 with open(fileout,'w') as outf : for f in files : - f= os.path.join(txtdir, f) + print f + f = os.path.join(txtdir, f) with codecs.open(f, 'rU', encodage_in) as infile : content = infile.read() ucis = parsetxtpaste(content) print_ucis(ucis, outf, encodage_out) + tot += len(ucis) + print 'ok', len(ucis), 'articles', ' - total : ', tot #for dat in ['2001','2002','2003','2004', '2005','2006','2007','2008','2009','2010','2011'] : # path = os.path.join(txtdir,dat) diff --git a/parse_factiva_xml.py b/parse_factiva_xml.py index 1fed159..8d9b8a7 100644 --- a/parse_factiva_xml.py +++ b/parse_factiva_xml.py @@ -81,17 +81,20 @@ def getcorpus_from_xml(xmldir, corpus_out): return 'ok' class PrefImport(wx.Dialog): - def __init__(self, parent, size=wx.DefaultSize, pos=wx.DefaultPosition, style=wx.DEFAULT_DIALOG_STYLE): + def __init__(self, parent, size=wx.DefaultSize, pos=wx.DefaultPosition, style=wx.DEFAULT_DIALOG_STYLE, methode = 'mail'): pre = wx.PreDialog() pre.SetExtraStyle(wx.DIALOG_EX_CONTEXTHELP) pre.Create(parent, -1, '', pos, size, style) self.PostCreate(pre) - + if methode == 'xml' : + txt = _(u'Select a directory of xml files') + else : + txt = _(u'Select a directory of txt files') self.parent = parent - self.txt1 = wx.StaticText(self, -1, u"Répertoire des fichiers xml") + self.txt1 = wx.StaticText(self, -1, txt.encode('utf8')) self.dbb = filebrowse.DirBrowseButton(self, -1, size=(450, -1), changeCallback = self.fbbCallback) self.dbb.SetLabel("") - self.txt2 = wx.StaticText(self, -1, u"Fichier en sortie") + self.txt2 = wx.StaticText(self, -1, _(u'Output file').encode('utf8')) self.fbb = filebrowse.FileBrowseButton(self, -1, size=(450, -1), fileMode = 2) self.fbb.SetLabel("") @@ -156,7 +159,7 @@ class PrefImport(wx.Dialog): class ImportFactiva(): def __init__(self, parent, methode): - self.dial = PrefImport(parent) + self.dial = PrefImport(parent, methode=methode) self.dial.CenterOnParent() val = self.dial.ShowModal() if val == wx.ID_OK :