2 # -*- coding: utf-8 -*-
3 #Author: Pierre Ratinaud
4 #Copyright (c) 2008-2010 Pierre Ratinaud
8 import wx.lib.sized_controls as sc
9 import wx.lib.filebrowsebutton as filebrowse
15 from parse_factiva_mail import ParseFactivaMail
16 from parse_factiva_txt import ParseFactivaPaste
17 from import_txm import TXM2IRA
def _node_text(node):
    """Return the text held by ``node``'s first child, newlines flattened.

    Only the first child is read.  ``None`` is returned when the element is
    empty or when its first child carries no value (for instance a nested
    element), instead of raising ``AttributeError``.
    """
    child = node.firstChild
    value = child.nodeValue if child is not None else None
    if value is None:
        return None
    return value.replace('\n', ' ')


def _section_texts(article, section_tag, item_tags):
    """Collect the item texts found under the first ``section_tag`` element.

    ``item_tags`` lists the candidate child tag names in order of
    preference; the first name that matches at least one element is used.
    A missing section, or a section without any matching item, gives ``[]``.
    """
    sections = article.getElementsByTagName(section_tag)
    if not sections:
        return []
    items = []
    for tag in item_tags:
        items = sections[0].getElementsByTagName(tag)
        if items:
            break
    texts = [_node_text(item) for item in items]
    return [text for text in texts if text is not None]


def ParseDocument(filename) :
    """Parse one Factiva XML export and return one row per ``<article>``.

    The file is read as UTF-8 and ``<hlt>``/``</hlt>`` markers are replaced
    by spaces before parsing, so they do not split the surrounding text.

    Each row is a list of five strings::

        [headline, source name, publication date, lead paragraphs,
         tail paragraphs]

    Multi-valued fields are joined with single spaces.  In every field, each
    run of spaces, double quotes, line feeds and carriage returns is reduced
    to one space.  A missing or empty section yields ``''``; a missing or
    empty ``<sourceName>`` yields ``'INCONNU'``.

    NOTE(review): this function was restored from an excerpt with missing
    lines.  The read of the file, the accumulation of the rows and the final
    return were reconstructed from the visible code and from the way
    getcorpus_from_xml() indexes the rows -- confirm against the full file.
    """
    with codecs.open(filename, 'r', 'utf-8') as f:
        content = f.read()
    content = content.replace('<hlt>', ' ').replace('</hlt>', ' ')
    # minidom is given bytes so that an encoding declaration in the XML
    # prolog stays consistent with the data actually parsed.
    dom = xml.dom.minidom.parseString(content.encode("utf-8"))
    # A plain raw string is valid on both Python 2 and 3 (``ur''`` is not).
    squeeze = re.compile(r'[ "\n\r]+')
    result = []
    for article in dom.getElementsByTagName("article"):
        headline = _section_texts(article, "headline", ["paragraph"])
        lead = _section_texts(article, "leadParagraph", ["paragraph"])
        # Dates are stored either as <date> or, failing that, <dateTime>.
        dates = _section_texts(article, "publicationDate", ["date", "dateTime"])
        tail = _section_texts(article, "tailParagraphs", ["paragraph"])
        sources = article.getElementsByTagName("sourceName")
        source = _node_text(sources[0]) if sources else None
        if source is None:
            # The original fallback was the bare, undefined name INCONNU
            # (French for "unknown"); the string literal is what it needs.
            source = u'INCONNU'
        fields = [' '.join(headline), source, ' '.join(dates),
                  ' '.join(lead), ' '.join(tail)]
        result.append([squeeze.sub(' ', field) for field in fields])
    return result
# Build one corpus text file from the '.xml' files found directly in `xmldir`.
# Each file is parsed with ParseDocument(); every returned row becomes a
# '****' header line followed by row[3] and row[4], written to `corpus_out`
# as UTF-8.
# NOTE(review): this excerpt has gaps -- the loop header over `files`, the
# handling of a directory without '.xml' files, the closing of `fileout` and
# the return value are not visible here; confirm them against the full file.
66 def getcorpus_from_xml(xmldir, corpus_out):
67 files = os.listdir(xmldir)
# Keep only the entries whose extension is exactly '.xml' (the comparison is
# case-sensitive), turned into paths under `xmldir`.
68 files = [os.path.join(xmldir,file) for file in files if os.path.splitext(file)[1] == '.xml']
72 fileout = codecs.open(corpus_out, 'w', 'utf-8')
74 rs = ParseDocument(file)
75 #dates = [row[2].split('-') for row in rs]
76 #dates = [[date[0],date[1],date[2].split('T')[0]] for date in dates]
77 #txt = '\n'.join(['\n'.join([' '.join([u'****', '*%s' % row[1].replace(' ','_').replace('\'','_'), '*%s' % row[2].replace('-','_')]), row[3], row[4]]) for row in rs])
78 # with the date split into its components
# Header line: '****', then '*s_' + row[1] with spaces and apostrophes
# removed, then '*annee_', '*mois_' and '*jour_' built from the three
# '-'-separated parts of row[2] (the day part drops anything from 'T' on).
# NOTE(review): row[2].split('-')[1] raises IndexError when row[2] holds no
# '-' (e.g. an empty date) -- confirm ParseDocument always provides a date.
79 txt = '\n'.join(['\n'.join([' '.join([u'****', '*s_%s' % row[1].replace(' ','').replace('\'',''), '*annee_%s' % row[2].split('-')[0], '*mois_%s' % row[2].split('-')[1], '*jour_%s' % row[2].split('-')[2].split('T')[0]]), row[3], row[4]]) for row in rs])
# The text of each parsed file is followed by an empty line.
80 fileout.write(txt+'\n\n')
# Dialog used to pick an input directory (self.dbb) and an output file
# (self.fbb) before an import, with standard OK / Cancel buttons.
# NOTE(review): this excerpt has gaps (see the jumps in the embedded line
# numbers); statements such as the creation of `pre`, some `else` branches
# and the end of the layout code are not visible here.
84 class PrefImport(wx.Dialog):
# `methode` only selects the label shown next to the directory chooser.
85 def __init__(self, parent, size=wx.DefaultSize, pos=wx.DefaultPosition, style=wx.DEFAULT_DIALOG_STYLE, methode = 'mail'):
# NOTE(review): `pre` is not defined in the visible lines -- presumably a
# wx.PreDialog() created just above (two-stage creation); confirm.
87 pre.SetExtraStyle(wx.DIALOG_EX_CONTEXTHELP)
88 pre.Create(parent, -1, '', pos, size, style)
# 'xml' and 'txm' imports ask for a directory of xml files; the second
# label is presumably used for every other method (the `else` is not shown).
90 if methode in ['xml', 'txm'] :
91 txt = _(u'Select a directory of xml files').decode('utf8')
93 txt = _(u'Select a directory of txt files').decode('utf8')
95 self.txt1 = wx.StaticText(self, -1, txt.encode('utf8'))
# Directory chooser; fbbCallback is called whenever its value changes.
96 self.dbb = filebrowse.DirBrowseButton(self, -1, size=(450, -1), changeCallback = self.fbbCallback)
98 self.txt2 = wx.StaticText(self, -1, _(u'Output file').decode('utf8'))
# Output file chooser.  NOTE(review): fileMode = 2 is presumably the
# "save" file-dialog style -- TODO confirm against the wx constants.
99 self.fbb = filebrowse.FileBrowseButton(self, -1, size=(450, -1), fileMode = 2)
# The control's own label is emptied; self.txt2 is the visible caption.
100 self.fbb.SetLabel("")
# Standard OK / Cancel button row.
102 self.btnsizer = wx.StdDialogButtonSizer()
103 btn_ok = wx.Button(self, wx.ID_OK)
104 btn = wx.Button(self, wx.ID_CANCEL)
105 self.btnsizer.AddButton(btn_ok)
106 self.btnsizer.AddButton(btn)
107 self.btnsizer.Realize()
# checkfile is bound twice: once for the OK button only, and once (below)
# for every button event reaching the dialog.
110 self.Bind(wx.EVT_BUTTON, self.checkfile, btn_ok)
112 #self.SetButtonSizer(self.CreateStdDialogButtonSizer(wx.OK | wx.CANCEL))
113 self.Bind(wx.EVT_BUTTON, self.checkfile)
# The dialog may not be resized below its current size.
117 self.SetMinSize(self.GetSize())
# Arrange the widgets: two horizontal rows (caption + chooser, the chooser
# taking the stretchable space with proportion 2) stacked vertically above
# the button row.
# NOTE(review): attaching `sizer` to the dialog is not visible in this excerpt.
119 def __do_layout(self):
120 sizer = wx.BoxSizer(wx.VERTICAL)
121 grid_sizer_1 = wx.BoxSizer(wx.HORIZONTAL)
122 grid_sizer_2 = wx.BoxSizer(wx.HORIZONTAL)
123 grid_sizer_1.Add(self.txt1, 0, wx.ALIGN_LEFT | wx.ALIGN_CENTER_VERTICAL, 0)
124 grid_sizer_1.Add(self.dbb, 2, wx.ALIGN_LEFT | wx.ALIGN_CENTER_VERTICAL, 0)
125 grid_sizer_2.Add(self.txt2, 0, wx.ALIGN_LEFT | wx.ALIGN_CENTER_VERTICAL, 0)
126 grid_sizer_2.Add(self.fbb, 2, wx.ALIGN_LEFT | wx.ALIGN_CENTER_VERTICAL, 0)
127 sizer.Add(grid_sizer_1, 0, wx.EXPAND, 0)
128 sizer.Add(grid_sizer_2, 0, wx.EXPAND, 0)
129 sizer.Add(self.btnsizer, 0, wx.EXPAND, 0)
# Change callback of the directory chooser: while the output field is still
# empty, propose 'corpus.txt' inside the selected directory.
135 def fbbCallback(self, evt):
136 if self.fbb.GetValue() == "" :
137 self.fbb.SetValue(os.path.join(self.dbb.GetValue(), 'corpus.txt'))
138 #self.log.write('FileBrowseButton: %s\n' % evt.GetString())
# Button handler validating the form before the dialog is closed.
140 def checkfile(self, evt) :
# Validation only applies to the OK button.
141 if evt.GetId() == wx.ID_OK :
# A directory must have been chosen.
142 if self.dbb.GetValue() != "" :
# An existing output file needs an explicit confirmation from the user.
143 if os.path.exists(self.fbb.GetValue()):
144 dlg = wx.MessageDialog(self,
145 u"%s\nCe fichier existe, continuer quand même ?" % self.fbb.GetValue(), 'ATTENTION', wx.NO | wx.YES | wx.ICON_WARNING)
# Close with ID_OK unless the answer is No / Cancel.
147 if dlg.ShowModal() not in [wx.ID_NO, wx.ID_CANCEL]:
148 self.EndModal(wx.ID_OK)
# Presumably the branch for a not-yet-existing output file (the `else` is
# not shown in this excerpt).
150 self.EndModal(wx.ID_OK)
# Presumably the branch for an empty directory field: warn the user.  The
# message always mentions xml files, whatever `methode` was.
152 dlg = wx.MessageDialog(self, u"Vous devez choisir le répertoire contenant le ou les fichier(s) xml", 'ATTENTION', wx.OK | wx.ICON_WARNING)
# Presumably reached for any button other than OK (the `else` is not shown).
157 self.EndModal(wx.ID_CANCEL)
# Runs an import: shows the PrefImport dialog, then calls the converter
# selected by `methode` with the chosen directory and output file.
# NOTE(review): this excerpt has gaps and may stop before the end of the
# class; the tests applied to `val` and `res` are not visible here.
161 class ImportFactiva():
162 def __init__(self, parent, methode):
# The dialog is modal and centred on `parent`; `val` receives its result code.
163 self.dial = PrefImport(parent, methode=methode)
164 self.dial.CenterOnParent()
165 val = self.dial.ShowModal()
# Input directory and output file as entered in the dialog.
167 xmldir = self.dial.dbb.GetValue()
168 corp_out = self.dial.fbb.GetValue()
# Dispatch on the import method.  Every converter except the xml one also
# receives 'utf8' and parent.syscoding.
169 if methode == 'xml' :
170 res = getcorpus_from_xml(xmldir, corp_out)
171 elif methode == 'mail' :
172 res = ParseFactivaMail(xmldir, corp_out, 'utf8', parent.syscoding)
173 elif methode == 'txt' :
174 res = ParseFactivaPaste(xmldir, corp_out, 'utf8', parent.syscoding)
175 elif methode == 'txm' :
176 res = TXM2IRA(xmldir, corp_out, 'utf8', parent.syscoding)
# Warning prepared for the case where the directory holds no input file --
# presumably guarded by a test on `res` that is not shown.  The message
# always says '.xml', whatever `methode` was.
178 dlg = wx.MessageDialog(parent, u"Pas de fichier \'.xml\' dans %s" % xmldir, 'ATTENTION', wx.OK | wx.NO_DEFAULT | wx.ICON_WARNING)
182 # parent.filename = corp_out