X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=parse_factiva_xml.py;h=6eff72d8ea8d651bdc92f5f6e636d36c5ea6545b;hp=1fed159cc99f2b485a5872adb732071f0e008d94;hb=238f42801fed31007932d28e2d8e517081c9542d;hpb=0f8b4705852de1679aba3e91c9809fe2250a715c diff --git a/parse_factiva_xml.py b/parse_factiva_xml.py index 1fed159..6eff72d 100644 --- a/parse_factiva_xml.py +++ b/parse_factiva_xml.py @@ -2,28 +2,26 @@ # -*- coding: utf-8 -*- #Author: Pierre Ratinaud #Copyright (c) 2008-2010 Pierre Ratinaud -#Lisense: GNU/GPL +#License: GNU/GPL import xml.dom.minidom -import wx.lib.sized_controls as sc import wx.lib.filebrowsebutton as filebrowse import os import codecs -import sys import re import wx from parse_factiva_mail import ParseFactivaMail from parse_factiva_txt import ParseFactivaPaste +from import_txm import TXM2IRA def ParseDocument(filename) : print filename with codecs.open(filename, 'r', 'utf-8') as f : - content = f.read() + content = f.read() content = content.replace('', ' ').replace('', ' ') dom = xml.dom.minidom.parseString(content.encode("utf-8")) result = [] articles = dom.getElementsByTagName("article") - i = 0 for article in articles : headline = article.getElementsByTagName("headline") if headline != [] : @@ -49,7 +47,7 @@ def ParseDocument(filename) : if sourceName != [] : val_sourceName = sourceName[0].firstChild.nodeValue.replace('\n', ' ') else : - val_sourceName = INCONNU + val_sourceName = 'INCONNU' tailParagraphs = article.getElementsByTagName("tailParagraphs") if tailParagraphs != [] : para_tailParagraphs = tailParagraphs[0].getElementsByTagName("paragraph") @@ -64,13 +62,12 @@ def ParseDocument(filename) : def getcorpus_from_xml(xmldir, corpus_out): files = os.listdir(xmldir) - files = [os.path.join(xmldir,file) for file in files if os.path.splitext(file)[1] == '.xml'] + files = [os.path.join(xmldir,f) for f in files if os.path.splitext(f)[1] == '.xml'] if len(files) == 0 : return 'nofile' - result = [] fileout = codecs.open(corpus_out, 'w', 'utf-8') - for file in files : - rs = ParseDocument(file) + for f in files : + rs = ParseDocument(f) #dates = [row[2].split('-') for row in rs] #dates = [[date[0],date[1],date[2].split('T')[0]] for date in dates] #txt = '\n'.join(['\n'.join([' '.join([u'****', '*%s' % row[1].replace(' ','_').replace('\'','_'), '*%s' % row[2].replace('-','_')]), row[3], row[4]]) for row in rs]) @@ -81,17 +78,20 @@ def getcorpus_from_xml(xmldir, corpus_out): return 'ok' class PrefImport(wx.Dialog): - def __init__(self, parent, size=wx.DefaultSize, pos=wx.DefaultPosition, style=wx.DEFAULT_DIALOG_STYLE): + def __init__(self, parent, size=wx.DefaultSize, pos=wx.DefaultPosition, style=wx.DEFAULT_DIALOG_STYLE, methode = 'mail'): pre = wx.PreDialog() pre.SetExtraStyle(wx.DIALOG_EX_CONTEXTHELP) pre.Create(parent, -1, '', pos, size, style) self.PostCreate(pre) - + if methode in ['xml', 'txm'] : + txt = _(u'Select a directory of xml files').decode('utf8') + else : + txt = _(u'Select a directory of txt files').decode('utf8') self.parent = parent - self.txt1 = wx.StaticText(self, -1, u"Répertoire des fichiers xml") + self.txt1 = wx.StaticText(self, -1, txt.encode('utf8')) self.dbb = filebrowse.DirBrowseButton(self, -1, size=(450, -1), changeCallback = self.fbbCallback) self.dbb.SetLabel("") - self.txt2 = wx.StaticText(self, -1, u"Fichier en sortie") + self.txt2 = wx.StaticText(self, -1, _(u'Output file').decode('utf8')) self.fbb = filebrowse.FileBrowseButton(self, -1, size=(450, -1), fileMode = 2) self.fbb.SetLabel("") @@ -130,7 +130,7 @@ class PrefImport(wx.Dialog): def fbbCallback(self, evt): if self.fbb.GetValue() == "" : - self.fbb.SetValue(os.path.join(self.dbb.GetValue(), 'corpus.txt')) + self.fbb.SetValue(os.path.join(self.dbb.GetValue(), 'corpus.txt')) #self.log.write('FileBrowseButton: %s\n' % evt.GetString()) def checkfile(self, evt) : @@ -156,19 +156,21 @@ class PrefImport(wx.Dialog): class ImportFactiva(): def __init__(self, parent, methode): - self.dial = PrefImport(parent) - self.dial.CenterOnParent() - val = self.dial.ShowModal() - if val == wx.ID_OK : - xmldir = self.dial.dbb.GetValue() - corp_out = self.dial.fbb.GetValue() - if methode == 'xml' : + self.dial = PrefImport(parent, methode=methode) + self.dial.CenterOnParent() + val = self.dial.ShowModal() + if val == wx.ID_OK : + xmldir = self.dial.dbb.GetValue() + corp_out = self.dial.fbb.GetValue() + if methode == 'xml' : res = getcorpus_from_xml(xmldir, corp_out) - elif methode == 'mail' : - res = ParseFactivaMail(xmldir, corp_out, 'utf8', parent.syscoding) - elif methode == 'txt' : - res = ParseFactivaPaste(xmldir, corp_out, 'utf8', parent.syscoding) - if res == 'nofile' : + elif methode == 'mail' : + res = ParseFactivaMail(xmldir, corp_out, 'utf8', parent.syscoding) + elif methode == 'txt' : + res = ParseFactivaPaste(xmldir, corp_out, 'utf8', parent.syscoding) + elif methode == 'txm' : + res = TXM2IRA(xmldir, corp_out, 'utf8', parent.syscoding) + if res == 'nofile' : dlg = wx.MessageDialog(parent, u"Pas de fichier \'.xml\' dans %s" % xmldir, 'ATTENTION', wx.OK | wx.NO_DEFAULT | wx.ICON_WARNING) dlg.CenterOnParent() dlg.ShowModal()