# -*- coding: utf-8 -*-
#Author: Pierre Ratinaud
#Copyright (c) 2008-2010 Pierre Ratinaud
-#Lisense: GNU/GPL
+#License: GNU/GPL
import xml.dom.minidom
-import wx.lib.sized_controls as sc
import wx.lib.filebrowsebutton as filebrowse
import os
import codecs
-import sys
import re
import wx
from parse_factiva_mail import ParseFactivaMail
from parse_factiva_txt import ParseFactivaPaste
+from import_txm import TXM2IRA
def ParseDocument(filename) :
print filename
with codecs.open(filename, 'r', 'utf-8') as f :
- content = f.read()
+ content = f.read()
content = content.replace('<hlt>', ' ').replace('</hlt>', ' ')
dom = xml.dom.minidom.parseString(content.encode("utf-8"))
result = []
articles = dom.getElementsByTagName("article")
- i = 0
for article in articles :
headline = article.getElementsByTagName("headline")
if headline != [] :
if sourceName != [] :
val_sourceName = sourceName[0].firstChild.nodeValue.replace('\n', ' ')
else :
- val_sourceName = INCONNU
+ val_sourceName = 'INCONNU'
tailParagraphs = article.getElementsByTagName("tailParagraphs")
if tailParagraphs != [] :
para_tailParagraphs = tailParagraphs[0].getElementsByTagName("paragraph")
def getcorpus_from_xml(xmldir, corpus_out):
files = os.listdir(xmldir)
- files = [os.path.join(xmldir,file) for file in files if os.path.splitext(file)[1] == '.xml']
+ files = [os.path.join(xmldir,f) for f in files if os.path.splitext(f)[1] == '.xml']
if len(files) == 0 :
return 'nofile'
- result = []
fileout = codecs.open(corpus_out, 'w', 'utf-8')
- for file in files :
- rs = ParseDocument(file)
+ for f in files :
+ rs = ParseDocument(f)
#dates = [row[2].split('-') for row in rs]
#dates = [[date[0],date[1],date[2].split('T')[0]] for date in dates]
#txt = '\n'.join(['\n'.join([' '.join([u'****', '*%s' % row[1].replace(' ','_').replace('\'','_'), '*%s' % row[2].replace('-','_')]), row[3], row[4]]) for row in rs])
pre.SetExtraStyle(wx.DIALOG_EX_CONTEXTHELP)
pre.Create(parent, -1, '', pos, size, style)
self.PostCreate(pre)
- if methode == 'xml' :
- txt = _(u'Select a directory of xml files')
+ if methode in ['xml', 'txm'] :
+ txt = _(u'Select a directory of xml files').decode('utf8')
else :
- txt = _(u'Select a directory of txt files')
+ txt = _(u'Select a directory of txt files').decode('utf8')
self.parent = parent
self.txt1 = wx.StaticText(self, -1, txt.encode('utf8'))
self.dbb = filebrowse.DirBrowseButton(self, -1, size=(450, -1), changeCallback = self.fbbCallback)
self.dbb.SetLabel("")
- self.txt2 = wx.StaticText(self, -1, _(u'Output file').encode('utf8'))
+ self.txt2 = wx.StaticText(self, -1, _(u'Output file').decode('utf8'))
self.fbb = filebrowse.FileBrowseButton(self, -1, size=(450, -1), fileMode = 2)
self.fbb.SetLabel("")
def fbbCallback(self, evt):
    """Pre-fill the output-file field once a source directory is picked.

    Used as the ``changeCallback`` of the directory browse button
    (``self.dbb``).  When the output-file browse button (``self.fbb``)
    is still empty, its value is set to ``corpus.txt`` inside the
    directory currently held by ``self.dbb``.  A value already present
    in ``self.fbb`` is never overwritten.

    ``evt`` is the event object passed by the control; it is not used.
    """
    # Guard clause: leave any existing output path untouched.
    if self.fbb.GetValue() != "":
        return
    default_output = os.path.join(self.dbb.GetValue(), 'corpus.txt')
    self.fbb.SetValue(default_output)
def checkfile(self, evt) :
class ImportFactiva():
def __init__(self, parent, methode):
- self.dial = PrefImport(parent, methode=methode)
- self.dial.CenterOnParent()
- val = self.dial.ShowModal()
- if val == wx.ID_OK :
- xmldir = self.dial.dbb.GetValue()
- corp_out = self.dial.fbb.GetValue()
- if methode == 'xml' :
+ self.dial = PrefImport(parent, methode=methode)
+ self.dial.CenterOnParent()
+ val = self.dial.ShowModal()
+ if val == wx.ID_OK :
+ xmldir = self.dial.dbb.GetValue()
+ corp_out = self.dial.fbb.GetValue()
+ if methode == 'xml' :
res = getcorpus_from_xml(xmldir, corp_out)
- elif methode == 'mail' :
- res = ParseFactivaMail(xmldir, corp_out, 'utf8', parent.syscoding)
- elif methode == 'txt' :
- res = ParseFactivaPaste(xmldir, corp_out, 'utf8', parent.syscoding)
- if res == 'nofile' :
+ elif methode == 'mail' :
+ res = ParseFactivaMail(xmldir, corp_out, 'utf8', parent.syscoding)
+ elif methode == 'txt' :
+ res = ParseFactivaPaste(xmldir, corp_out, 'utf8', parent.syscoding)
+ elif methode == 'txm' :
+ res = TXM2IRA(xmldir, corp_out, 'utf8', parent.syscoding)
+ if res == 'nofile' :
dlg = wx.MessageDialog(parent, u"Pas de fichier \'.xml\' dans %s" % xmldir, 'ATTENTION', wx.OK | wx.NO_DEFAULT | wx.ICON_WARNING)
dlg.CenterOnParent()
dlg.ShowModal()