From: Pierre Date: Mon, 7 Jan 2013 11:24:01 +0000 (+0100) Subject: factiva X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=commitdiff_plain;h=0f8b4705852de1679aba3e91c9809fe2250a715c factiva --- diff --git a/corpusNG.py b/corpusNG.py index f6468b3..fa13a8b 100644 --- a/corpusNG.py +++ b/corpusNG.py @@ -1156,13 +1156,6 @@ class BuildFromAlceste(BuildCorpus) : for word in uce : self.last += 1 self.corpus.add_word(word) - #if self.dlg is not None : - # if self.limitshow > self.count : - # self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1)) - # self.count += 1 - # self.limitshow = 0 - # else : - # self.limitshow = self.last / 100000 log.debug(' '.join([`iduci`,`idpara`,`iduce`])) if self.last > self.lim : self.backup_uce() @@ -1174,27 +1167,13 @@ class BuildFromAlceste(BuildCorpus) : if douce : out = [] reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize) -# print 'reste' -# print reste -# print 'texte_uce' -# print texte_uce -# print 'suite' -# print suite while reste : uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace]) if uce != '' : out.append(uce) reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize) -# print 'reste' -# print reste -# print 'texte_uce' -# print texte_uce -# print 'suite' -# print suite - uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace]) if uce != '' : - #print 'RESTEE UUCEEEEEEEEEEEEE', uce out.append(uce) return out else : diff --git a/iramuteq.py b/iramuteq.py index 17d348f..479278d 100644 --- a/iramuteq.py +++ b/iramuteq.py @@ -64,6 +64,7 @@ from sheet import MySheet from checkinstall import CreateIraDirectory, CheckRPath, FindRPAthWin32, FindRPathNix, CheckRPackages, IsNew, UpgradeConf, CopyConf, RLibsAreInstalled from chemins import ConstructRscriptsPath, ConstructConfigPath, ConstructDicoPath, ConstructGlobalPath, PathOut from parse_factiva_xml import ImportFactiva + from tree import LeftTree ########################################################## ID_OpenData = wx.NewId() @@ -187,9 +188,16 @@ class IraFrame(wx.Frame): item.SetBitmap(wx.ArtProvider_GetBitmap(wx.ART_FILE_OPEN)) file_menu.AppendItem(item) - item1 = wx.MenuItem(file_menu, ID_Import, _(u"Import a factiva corpora..."), _("Import a factiva corpora...")) - item1.SetBitmap(wx.ArtProvider_GetBitmap(wx.ART_TIP)) - file_menu.AppendItem(item1) + + menuFactiva = wx.Menu() + fact_from_xml = wx.MenuItem(menuFactiva, wx.ID_ANY, _(u"from xml")) + fact_from_mail = wx.MenuItem(menuFactiva, wx.ID_ANY, _(u"from mail")) + fact_from_txt = wx.MenuItem(menuFactiva, wx.ID_ANY, _(u"from copy/paste")) + menuFactiva.AppendItem(fact_from_xml) + menuFactiva.AppendItem(fact_from_mail) + menuFactiva.AppendItem(fact_from_txt) + + file_menu.AppendMenu(-1, _(u"Import from factiva"), menuFactiva) #item1.Enable(True) item = wx.MenuItem(file_menu, ID_SaveTab, _(u"Save tab as..."), _(u"Save tab as...")) @@ -332,7 +340,9 @@ class IraFrame(wx.Frame): self.Bind(wx.EVT_MENU, self.OnOpenData, id=ID_OpenData) self.Bind(wx.EVT_MENU, self.OnOpenText, id=ID_OpenText) self.Bind(wx.EVT_MENU, self.OnOpenAnalyse, id=ID_OnOpenAnalyse) - self.Bind(wx.EVT_MENU, self.import_factiva, id= ID_Import) + self.Bind(wx.EVT_MENU, self.import_factiva_xml, fact_from_xml) + self.Bind(wx.EVT_MENU, self.import_factiva_mail, fact_from_mail) + self.Bind(wx.EVT_MENU, self.import_factiva_txt, fact_from_txt) self.Bind(wx.EVT_MENU, self.OnFreq, id=ID_Freq) self.Bind(wx.EVT_MENU, self.OnChi2, id=ID_Chi2) self.Bind(wx.EVT_MENU, self.OnStudent, id=ID_Student) @@ -886,9 +896,21 @@ Voulez-vous fermer quand même ?""" except: BugReport(self) - def import_factiva(self,event): + def import_factiva_xml(self,event): + try : + ImportFactiva(self, 'xml') + except : + BugReport(self) + + def import_factiva_mail(self, evt) : + try : + ImportFactiva(self, 'mail') + except : + BugReport(self) + + def import_factiva_txt(self, evt) : try : - ImportFactiva(self) + ImportFactiva(self, 'txt') except : BugReport(self) diff --git a/parse_factiva_txt.py b/parse_factiva_txt.py index 53f6d6d..9cb2af2 100644 --- a/parse_factiva_txt.py +++ b/parse_factiva_txt.py @@ -1,66 +1,73 @@ #!/bin/env python # -*- coding: utf-8 -*- #Author: Pierre Ratinaud -#Copyright (c) 2012 Pierre Ratinaud +#Copyright (c) 2012-2013 Pierre Ratinaud #Lisense: GNU/GPL import os import codecs -txtdir = 'dev/factiva_txt' #repertoire des textes -#txtdir = 'corpus/jeunesdebanlieues' -fileout = 'dev/factiva_txt_out.txt' -encodage_in = 'utf8' -encodage_out = 'utf8' +#txtdir = 'dev/factiva_txt' +#fileout = 'dev/factiva_txt_out.txt' +#encodage_in = 'utf8' +#encodage_out = 'utf8' -def parsetxt(txt): +def parsetxtpaste(txt): """ parser de texte pour factiva + à partir d'un copier/coller de la fenêtre de visualisation + merci à Lucie Loubère pour l'astuce :) """ no = ['NS','RE','IPD','CO','IN'] # les balises qui signalent une fin - txt = txt.splitlines() #met le texte dans une liste de lignes - txt.pop(0) # la premiere ligne sert a rien - txt = txt[0:(len(txt)-10)] # les dernieres lignes ne servent a rien + txt = txt.splitlines() keepline = False ucis = [] - for line in txt : #pour chaque ligne du texte... - if line.startswith('---------------------------------------------------------------') : # si la ligne commence avec... - ucis.append([['****'],'']) # c'est une nouvelle uci - keepline = False - elif line.startswith('SN ') : #source + for line in txt : + if line.startswith('Article') : + lp = line.split() + if len(lp) > 2 : + if lp[2] == 'Article' : + ucis.append([[u'****'],'']) + keepline = False + if line.startswith('SN ') : #source source = '*source_' + line[4:].replace(' ','').replace('\'','').replace(u'´','').replace(u'’','').replace('-','').lower() ucis[-1][0].append(source) elif line.startswith('PD ') : #date - mois_annee = '*date_' + line[4:].split(' ')[1] + line[4:].split(' ')[2] + mois_annee = '*ma_' + line[4:].split(' ')[1] + line[4:].split(' ')[2] ucis[-1][0].append(mois_annee) - elif line in no : #fin + annee = u'*annee_' + line[4:].split(' ')[2] + ucis[-1][0].append(annee) + elif line.strip() in no : #fin keepline = False - elif line.startswith('RF ') : #fin + elif line.startswith('RF ') : #fin keepline = False - elif line in ['LP', 'TD'] : #debut texte + elif line.strip() in ['LP', 'TD'] : #debut texte keepline = True else : pass - if keepline and line not in ['LP', 'TD'] : + if keepline and line.strip() not in ['LP', 'TD', ''] : ucis[-1][1] = '\n'.join([ucis[-1][1],line]) return ucis def print_ucis(ucis, ofile, encodage) : - toprint = '\n'.join(['\n'.join([' '.join(uci[0]),uci[1]]) for uci in ucis]) + #elimination des articles vides + ucis = [uci for uci in ucis if uci[1].strip() != ''] + toprint = '\n\n'.join(['\n'.join([' '.join(uci[0]),uci[1]]) for uci in ucis]) ofile.write(toprint.encode(encodage)) -def doparse(txtdir, fileout, encodage_in, encodage_out): - files = os.listdir(txtdir) #liste des fichiers dans txtdir - with open(fileout,'w') as outf : #ouverture du fichier en sortie - for f in files : #pour chaque fichier en entree... - f= os.path.join(txtdir, f) #chemin du fichier - with codecs.open(f, 'r', encodage_in) as infile : #ouverture du fichier - content = infile.read() #lecture du fichier - ucis = parsetxt(content) - print_ucis(ucis, outf, encodage_out) +class ParseFactivaPaste : + def __init__(self, txtdir, fileout, encodage_in, encodage_out) : + files = os.listdir(txtdir) + with open(fileout,'w') as outf : + for f in files : + f= os.path.join(txtdir, f) + with codecs.open(f, 'rU', encodage_in) as infile : + content = infile.read() + ucis = parsetxtpaste(content) + print_ucis(ucis, outf, encodage_out) #for dat in ['2001','2002','2003','2004', '2005','2006','2007','2008','2009','2010','2011'] : # path = os.path.join(txtdir,dat) diff --git a/parse_factiva_txt2.py b/parse_factiva_txt2.py deleted file mode 100644 index da048ec..0000000 --- a/parse_factiva_txt2.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/bin/env python -# -*- coding: utf-8 -*- -#Author: Pierre Ratinaud -#Copyright (c) 2012 Pierre Ratinaud -#Lisense: GNU/GPL - -import os -import codecs - - -txtdir = 'dev/factiva_txt' -fileout = 'dev/factiva_txt_out.txt' -encodage_in = 'utf8' -encodage_out = 'utf8' - - -def parsetxt(txt): - """ - parser de texte pour factiva - à partir d'un copier/coller de la fenêtre de visualisation - merci à Lucie Loubère pour l'astuce :) - """ - no = ['NS','RE','IPD','CO','IN'] # les balises qui signalent une fin - txt = txt.splitlines() - keepline = False - ucis = [] - for line in txt : - if line.startswith('Article') : - lp = line.split() - if len(lp) > 2 : - if lp[2] == 'Article' : - ucis.append([[u'****'],'']) - keepline = False - if line.startswith('SN ') : #source - source = '*source_' + line[4:].replace(' ','').replace('\'','').replace(u'´','').replace(u'’','').replace('-','').lower() - ucis[-1][0].append(source) - elif line.startswith('PD ') : #date - mois_annee = '*date_' + line[4:].split(' ')[1] + line[4:].split(' ')[2] - ucis[-1][0].append(mois_annee) - elif line.strip() in no : #fin - keepline = False - elif line.startswith('RF ') : #fin - keepline = False - elif line.strip() in ['LP', 'TD'] : #debut texte - keepline = True - else : - pass - if keepline and line.strip() not in ['LP', 'TD', ''] : - ucis[-1][1] = '\n'.join([ucis[-1][1],line]) - return ucis - - -def print_ucis(ucis, ofile, encodage) : - #elimination des articles vides - ucis = [uci for uci in ucis if uci[1] != ''] - toprint = '\n\n'.join(['\n'.join([' '.join(uci[0]),uci[1]]) for uci in ucis]) - ofile.write(toprint.encode(encodage)) - -def doparse(txtdir, fileout, encodage_in, encodage_out): - files = os.listdir(txtdir) - with open(fileout,'w') as outf : - for f in files : - f= os.path.join(txtdir, f) - with codecs.open(f, 'rU', encodage_in) as infile : - content = infile.read() - ucis = parsetxt(content) - print_ucis(ucis, outf, encodage_out) - -#for dat in ['2001','2002','2003','2004', '2005','2006','2007','2008','2009','2010','2011'] : -# path = os.path.join(txtdir,dat) -# outfile = os.path.join(txtdir, 'corpus_' + dat + '.txt') -# doparse(path, outfile) - - -if __name__ == '__main__' : - doparse(txtdir, fileout, encodage_in, encodage_out) - print 'fini' diff --git a/parse_factiva_xml.py b/parse_factiva_xml.py index 6dd2d56..1fed159 100644 --- a/parse_factiva_xml.py +++ b/parse_factiva_xml.py @@ -12,7 +12,8 @@ import codecs import sys import re import wx - +from parse_factiva_mail import ParseFactivaMail +from parse_factiva_txt import ParseFactivaPaste def ParseDocument(filename) : print filename @@ -154,18 +155,23 @@ class PrefImport(wx.Dialog): class ImportFactiva(): - def __init__(self,parent): + def __init__(self, parent, methode): self.dial = PrefImport(parent) self.dial.CenterOnParent() val = self.dial.ShowModal() if val == wx.ID_OK : xmldir = self.dial.dbb.GetValue() corp_out = self.dial.fbb.GetValue() - res = getcorpus_from_xml(xmldir, corp_out) - if res != 'ok' : + if methode == 'xml' : + res = getcorpus_from_xml(xmldir, corp_out) + elif methode == 'mail' : + res = ParseFactivaMail(xmldir, corp_out, 'utf8', parent.syscoding) + elif methode == 'txt' : + res = ParseFactivaPaste(xmldir, corp_out, 'utf8', parent.syscoding) + if res == 'nofile' : dlg = wx.MessageDialog(parent, u"Pas de fichier \'.xml\' dans %s" % xmldir, 'ATTENTION', wx.OK | wx.NO_DEFAULT | wx.ICON_WARNING) dlg.CenterOnParent() dlg.ShowModal() - else : - parent.filename = corp_out - parent.OpenText() + #else : + # parent.filename = corp_out + # parent.OpenText() diff --git a/tree.py b/tree.py index ef324e0..0009136 100644 --- a/tree.py +++ b/tree.py @@ -733,18 +733,19 @@ class LeftTree(CT.CustomTreeCtrl): pt = event.GetPosition() item, flags = self.HitTest(pt) - pydata = self.GetPyData(item) - if pydata['uuid'] in self.parent.history.opened : - for i in range(self.parent.nb.GetPageCount()) : - page = self.parent.nb.GetPage(i) - if 'parametres' in dir(page) : - if page.parametres['uuid'] == pydata['uuid'] : - self.parent.nb.SetSelection(i) - break - else : - OpenAnalyse(self.parent, pydata) - self.SetItemBold(item, True) - self.OnSelChanged(pydata = pydata) + if item is not None : + pydata = self.GetPyData(item) + if pydata['uuid'] in self.parent.history.opened : + for i in range(self.parent.nb.GetPageCount()) : + page = self.parent.nb.GetPage(i) + if 'parametres' in dir(page) : + if page.parametres['uuid'] == pydata['uuid'] : + self.parent.nb.SetSelection(i) + break + else : + OpenAnalyse(self.parent, pydata) + self.SetItemBold(item, True) + self.OnSelChanged(pydata = pydata) #if item and (flags & CT.TREE_HITTEST_ONITEMLABEL): # if self.GetAGWWindowStyleFlag() & CT.TR_EDIT_LABELS: # self.log.info("OnLeftDClick: %s (manually starting label edit)"% self.GetItemText(item) + "\n")