for word in uce :
self.last += 1
- #if self.dlg is not None :
- # if self.limitshow > self.count :
- # self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
- # self.count += 1
- # self.limitshow = 0
- # else :
- # self.limitshow = self.last / 100000
log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
if self.last > self.lim :
if douce :
out = []
reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
-# print 'reste'
-# print reste
-# print 'texte_uce'
-# print texte_uce
-# print 'suite'
-# print suite
while reste :
uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
if uce != '' :
reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
-# print 'reste'
-# print reste
-# print 'texte_uce'
-# print texte_uce
-# print 'suite'
-# print suite
uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
if uce != '' :
return out
else :
from checkinstall import CreateIraDirectory, CheckRPath, FindRPAthWin32, FindRPathNix, CheckRPackages, IsNew, UpgradeConf, CopyConf, RLibsAreInstalled
from chemins import ConstructRscriptsPath, ConstructConfigPath, ConstructDicoPath, ConstructGlobalPath, PathOut
from parse_factiva_xml import ImportFactiva
from tree import LeftTree
ID_OpenData = wx.NewId()
- item1 = wx.MenuItem(file_menu, ID_Import, _(u"Import a factiva corpora..."), _("Import a factiva corpora..."))
- item1.SetBitmap(wx.ArtProvider_GetBitmap(wx.ART_TIP))
- file_menu.AppendItem(item1)
+ menuFactiva = wx.Menu()
+ fact_from_xml = wx.MenuItem(menuFactiva, wx.ID_ANY, _(u"from xml"))
+ fact_from_mail = wx.MenuItem(menuFactiva, wx.ID_ANY, _(u"from mail"))
+ fact_from_txt = wx.MenuItem(menuFactiva, wx.ID_ANY, _(u"from copy/paste"))
+ menuFactiva.AppendItem(fact_from_xml)
+ menuFactiva.AppendItem(fact_from_mail)
+ menuFactiva.AppendItem(fact_from_txt)
+ file_menu.AppendMenu(-1, _(u"Import from factiva"), menuFactiva)
item = wx.MenuItem(file_menu, ID_SaveTab, _(u"Save tab as..."), _(u"Save tab as..."))
self.Bind(wx.EVT_MENU, self.OnOpenData, id=ID_OpenData)
self.Bind(wx.EVT_MENU, self.OnOpenText, id=ID_OpenText)
self.Bind(wx.EVT_MENU, self.OnOpenAnalyse, id=ID_OnOpenAnalyse)
- self.Bind(wx.EVT_MENU, self.import_factiva, id= ID_Import)
+ self.Bind(wx.EVT_MENU, self.import_factiva_xml, fact_from_xml)
+ self.Bind(wx.EVT_MENU, self.import_factiva_mail, fact_from_mail)
+ self.Bind(wx.EVT_MENU, self.import_factiva_txt, fact_from_txt)
self.Bind(wx.EVT_MENU, self.OnFreq, id=ID_Freq)
self.Bind(wx.EVT_MENU, self.OnChi2, id=ID_Chi2)
self.Bind(wx.EVT_MENU, self.OnStudent, id=ID_Student)
- def import_factiva(self,event):
+ def import_factiva_xml(self,event):
# Menu handler for the Factiva "from xml" entry: starts ImportFactiva in
# 'xml' mode with this frame as parent. `event` is accepted but not used.
+ try :
+ ImportFactiva(self, 'xml')
# Bare except: whatever is raised during the import is handed to BugReport.
+ except :
+ BugReport(self)
+ def import_factiva_mail(self, evt) :
# Menu handler for the Factiva "from mail" entry: starts ImportFactiva in
# 'mail' mode with this frame as parent. `evt` is accepted but not used.
+ try :
+ ImportFactiva(self, 'mail')
# Bare except: whatever is raised during the import is handed to BugReport.
+ except :
+ BugReport(self)
+ def import_factiva_txt(self, evt) :
try :
- ImportFactiva(self)
+ ImportFactiva(self, 'txt')
except :
#!/bin/env python
# -*- coding: utf-8 -*-
#Author: Pierre Ratinaud
-#Copyright (c) 2012 Pierre Ratinaud
+#Copyright (c) 2012-2013 Pierre Ratinaud
#License: GNU/GPL
import os
import codecs
-txtdir = 'dev/factiva_txt' #repertoire des textes
-#txtdir = 'corpus/jeunesdebanlieues'
-fileout = 'dev/factiva_txt_out.txt'
-encodage_in = 'utf8'
-encodage_out = 'utf8'
+#txtdir = 'dev/factiva_txt'
+#fileout = 'dev/factiva_txt_out.txt'
+#encodage_in = 'utf8'
+#encodage_out = 'utf8'
-def parsetxt(txt):
+def parsetxtpaste(txt):
# Parse Factiva text into a list of records and return it. Each record is a
# two-item list [tag_list, text]; tag_list starts with u'****' and text is
# the kept lines joined with newlines.
# The three lines below are the original French description (its string
# delimiters are not visible in this view). In English: Factiva text parser,
# working from a copy/paste of the viewer window, with thanks to
# Lucie Loubere for the tip.
parser de texte pour factiva
+ à partir d'un copier/coller de la fenêtre de visualisation
+ merci à Lucie Loubère pour l'astuce :)
no = ['NS','RE','IPD','CO','IN'] # tags that signal an end
- txt = txt.splitlines() #puts the text into a list of lines
- txt.pop(0) # the first line is useless
- txt = txt[0:(len(txt)-10)] # the last lines are useless
+ txt = txt.splitlines()
# keepline is True while the current line belongs to an article body
keepline = False
ucis = []
- for line in txt : #for each line of the text...
- if line.startswith('---------------------------------------------------------------') : # if the line starts with...
- ucis.append([['****'],'']) # this is a new uci
- keepline = False
- elif line.startswith('SN ') : #source
+ for line in txt :
# A new record is opened by a line that starts with 'Article' and whose
# third whitespace-separated token is also 'Article'.
+ if line.startswith('Article') :
+ lp = line.split()
+ if len(lp) > 2 :
+ if lp[2] == 'Article' :
+ ucis.append([[u'****'],''])
+ keepline = False
# Source tag: everything after the first four characters, with spaces,
# apostrophe-like characters and hyphens removed, lower-cased.
# NOTE(review): the use of `source` after this assignment is not visible here.
+ if line.startswith('SN ') : #source
source = '*source_' + line[4:].replace(' ','').replace('\'','').replace(u'´','').replace(u'’','').replace('-','').lower()
elif line.startswith('PD ') : #date
# Tags are built from the 2nd and 3rd space-separated fields of line[4:]
# (presumably month and year, going by the variable names).
- mois_annee = '*date_' + line[4:].split(' ')[1] + line[4:].split(' ')[2]
+ mois_annee = '*ma_' + line[4:].split(' ')[1] + line[4:].split(' ')[2]
- elif line in no : #end
+ annee = u'*annee_' + line[4:].split(' ')[2]
# NOTE(review): ucis[-1] raises IndexError if no record has been opened yet.
+ ucis[-1][0].append(annee)
# Marker lines are compared after strip(), so surrounding whitespace is ignored.
+ elif line.strip() in no : #end
keepline = False
- elif line.startswith('RF ') : #end
+ elif line.startswith('RF ') : #end
keepline = False
- elif line in ['LP', 'TD'] : #start of text
+ elif line.strip() in ['LP', 'TD'] : #start of text
keepline = True
else :
# While capturing, append the line to the current record's text;
# 'LP'/'TD' markers and blank lines are not appended.
- if keepline and line not in ['LP', 'TD'] :
+ if keepline and line.strip() not in ['LP', 'TD', ''] :
ucis[-1][1] = '\n'.join([ucis[-1][1],line])
return ucis
def print_ucis(ucis, ofile, encodage) :
# Build the text form of the records: for each record, its tags joined by
# spaces, then a newline, then its text.
# NOTE(review): how `ofile` and `encodage` are used is not visible in this view.
- toprint = '\n'.join(['\n'.join([' '.join(uci[0]),uci[1]]) for uci in ucis])
+ #drop empty articles
# A record counts as empty when its text is blank after strip().
+ ucis = [uci for uci in ucis if uci[1].strip() != '']
# Records are separated by a blank line ('\n\n').
+ toprint = '\n\n'.join(['\n'.join([' '.join(uci[0]),uci[1]]) for uci in ucis])
-def doparse(txtdir, fileout, encodage_in, encodage_out):
- files = os.listdir(txtdir) #liste des fichiers dans txtdir
- with open(fileout,'w') as outf : #ouverture du fichier en sortie
- for f in files : #pour chaque fichier en entree...
- f= os.path.join(txtdir, f) #chemin du fichier
- with, 'r', encodage_in) as infile : #ouverture du fichier
- content = #lecture du fichier
- ucis = parsetxt(content)
- print_ucis(ucis, outf, encodage_out)
+class ParseFactivaPaste :
# Batch converter: constructing the object runs the whole job. Each entry of
# `txtdir` is parsed with parsetxtpaste() and the result is passed to
# print_ucis() together with the single output file `fileout` and
# `encodage_out`.
+ def __init__(self, txtdir, fileout, encodage_in, encodage_out) :
# Every name returned by os.listdir() is processed; no filtering is done here.
+ files = os.listdir(txtdir)
+ with open(fileout,'w') as outf :
+ for f in files :
+ f= os.path.join(txtdir, f)
# NOTE(review): the next two lines are not valid Python as they stand. The
# expression after `with` and the right-hand side of `content =` are missing.
# Presumably `f` is opened in mode 'rU' with encoding `encodage_in` and read
# in full; restore both lines from the upstream file before relying on this.
+ with, 'rU', encodage_in) as infile :
+ content =
+ ucis = parsetxtpaste(content)
+ print_ucis(ucis, outf, encodage_out)
#for dat in ['2001','2002','2003','2004', '2005','2006','2007','2008','2009','2010','2011'] :
# path = os.path.join(txtdir,dat)
+++ /dev/null
-#!/bin/env python
-# -*- coding: utf-8 -*-
-#Author: Pierre Ratinaud
-#Copyright (c) 2012 Pierre Ratinaud
-#Lisense: GNU/GPL
-import os
-import codecs
-txtdir = 'dev/factiva_txt'
-fileout = 'dev/factiva_txt_out.txt'
-encodage_in = 'utf8'
-encodage_out = 'utf8'
-def parsetxt(txt):
- """
- parser de texte pour factiva
- à partir d'un copier/coller de la fenêtre de visualisation
- merci à Lucie Loubère pour l'astuce :)
- """
- no = ['NS','RE','IPD','CO','IN'] # les balises qui signalent une fin
- txt = txt.splitlines()
- keepline = False
- ucis = []
- for line in txt :
- if line.startswith('Article') :
- lp = line.split()
- if len(lp) > 2 :
- if lp[2] == 'Article' :
- ucis.append([[u'****'],''])
- keepline = False
- if line.startswith('SN ') : #source
- source = '*source_' + line[4:].replace(' ','').replace('\'','').replace(u'´','').replace(u'’','').replace('-','').lower()
- ucis[-1][0].append(source)
- elif line.startswith('PD ') : #date
- mois_annee = '*date_' + line[4:].split(' ')[1] + line[4:].split(' ')[2]
- ucis[-1][0].append(mois_annee)
- elif line.strip() in no : #fin
- keepline = False
- elif line.startswith('RF ') : #fin
- keepline = False
- elif line.strip() in ['LP', 'TD'] : #debut texte
- keepline = True
- else :
- pass
- if keepline and line.strip() not in ['LP', 'TD', ''] :
- ucis[-1][1] = '\n'.join([ucis[-1][1],line])
- return ucis
-def print_ucis(ucis, ofile, encodage) :
- #elimination des articles vides
- ucis = [uci for uci in ucis if uci[1] != '']
- toprint = '\n\n'.join(['\n'.join([' '.join(uci[0]),uci[1]]) for uci in ucis])
- ofile.write(toprint.encode(encodage))
-def doparse(txtdir, fileout, encodage_in, encodage_out):
- files = os.listdir(txtdir)
- with open(fileout,'w') as outf :
- for f in files :
- f= os.path.join(txtdir, f)
- with, 'rU', encodage_in) as infile :
- content =
- ucis = parsetxt(content)
- print_ucis(ucis, outf, encodage_out)
-#for dat in ['2001','2002','2003','2004', '2005','2006','2007','2008','2009','2010','2011'] :
-# path = os.path.join(txtdir,dat)
-# outfile = os.path.join(txtdir, 'corpus_' + dat + '.txt')
-# doparse(path, outfile)
-if __name__ == '__main__' :
- doparse(txtdir, fileout, encodage_in, encodage_out)
- print 'fini'
import sys
import re
import wx
+from parse_factiva_mail import ParseFactivaMail
+from parse_factiva_txt import ParseFactivaPaste
def ParseDocument(filename) :
print filename
class ImportFactiva():
# Import driver: shows a PrefImport dialog and, when it is confirmed, runs
# the converter selected by `methode` ('xml', 'mail' or 'txt') on the
# chosen input directory and output file.
- def __init__(self,parent):
+ def __init__(self, parent, methode):
self.dial = PrefImport(parent)
val = self.dial.ShowModal()
if val == wx.ID_OK :
# dbb supplies the input directory and fbb the output corpus path
xmldir = self.dial.dbb.GetValue()
corp_out = self.dial.fbb.GetValue()
- res = getcorpus_from_xml(xmldir, corp_out)
- if res != 'ok' :
# NOTE(review): `res` is only assigned for the three known values of
# `methode`; any other value leaves it unbound when it is compared below.
+ if methode == 'xml' :
+ res = getcorpus_from_xml(xmldir, corp_out)
+ elif methode == 'mail' :
+ res = ParseFactivaMail(xmldir, corp_out, 'utf8', parent.syscoding)
+ elif methode == 'txt' :
+ res = ParseFactivaPaste(xmldir, corp_out, 'utf8', parent.syscoding)
# NOTE(review): for 'mail' and 'txt', `res` is whatever calling
# ParseFactivaMail / ParseFactivaPaste returns; whether that can ever equal
# 'nofile' cannot be told from here -- confirm.
# The warning text mentions '.xml' whichever `methode` was used.
+ if res == 'nofile' :
# NOTE(review): the dialog is created here; no call that shows it is visible in this view.
dlg = wx.MessageDialog(parent, u"Pas de fichier \'.xml\' dans %s" % xmldir, 'ATTENTION', wx.OK | wx.NO_DEFAULT | wx.ICON_WARNING)
- else :
- parent.filename = corp_out
- parent.OpenText()
+ #else :
+ # parent.filename = corp_out
+ # parent.OpenText()
pt = event.GetPosition()
item, flags = self.HitTest(pt)
- pydata = self.GetPyData(item)
- if pydata['uuid'] in self.parent.history.opened :
- for i in range(self.parent.nb.GetPageCount()) :
- page = self.parent.nb.GetPage(i)
- if 'parametres' in dir(page) :
- if page.parametres['uuid'] == pydata['uuid'] :
- self.parent.nb.SetSelection(i)
- break
- else :
- OpenAnalyse(self.parent, pydata)
- self.SetItemBold(item, True)
- self.OnSelChanged(pydata = pydata)
+ if item is not None :
+ pydata = self.GetPyData(item)
+ if pydata['uuid'] in self.parent.history.opened :
+ for i in range(self.parent.nb.GetPageCount()) :
+ page = self.parent.nb.GetPage(i)
+ if 'parametres' in dir(page) :
+ if page.parametres['uuid'] == pydata['uuid'] :
+ self.parent.nb.SetSelection(i)
+ break
+ else :
+ OpenAnalyse(self.parent, pydata)
+ self.SetItemBold(item, True)
+ self.OnSelChanged(pydata = pydata)
#if item and (flags & CT.TREE_HITTEST_ONITEMLABEL):
# if self.GetAGWWindowStyleFlag() & CT.TR_EDIT_LABELS:
#"OnLeftDClick: %s (manually starting label edit)"% self.GetItemText(item) + "\n")