From 83802e662acde01994fe0bd2bf6978fef90b14f8 Mon Sep 17 00:00:00 2001 From: Pierre Ratinaud Date: Tue, 25 Nov 2014 22:33:42 +0100 Subject: [PATCH] ... --- PrintRScript.py | 15 ++++++++++++++- ProfList.py | 10 ++++++++-- Rscripts/chdfunct.R | 26 ++++++++++++++++++++++++++ configuration/global.cfg | 4 ++-- configuration/iramuteq.cfg | 4 ++-- corpus.py | 24 ++++++++++++++++++++++++ dialog.py | 2 +- functions.py | 2 +- iramuteq.py | 4 ++-- layout.py | 13 ++++++++++--- listlex.py | 15 +++++++-------- textdist.py | 2 +- textreinert.py | 42 +++++++++++++++++++++++++++++++++++++----- 13 files changed, 135 insertions(+), 28 deletions(-) diff --git a/PrintRScript.py b/PrintRScript.py index f9d7be3..760c340 100644 --- a/PrintRScript.py +++ b/PrintRScript.py @@ -319,7 +319,7 @@ def RchdQuest(DicoPath, RscriptPath, nbcl = 10, mincl = 10): fileout.write(txt) fileout.close() -def AlcesteTxtProf(DictChdTxtOut, RscriptsPath, clnb, taillecar): +def ReinertTxtProf(DictChdTxtOut, RscriptsPath, clnb, taillecar): txt = "clnb<-%i\n" % clnb txt += """ source("%s") @@ -1213,6 +1213,19 @@ class TgenSpecScript(PrintRScript): """ % ffr(self.pathout['tgenspec.csv']) self.add(txt) +class TgenProfScript(PrintRScript): + def make_script(self): + self.sources([self.analyse.ira.RscriptsPath['chdfunct']]) + txt = """ + tgen <- read.csv2("%s", row.names = 1, sep = '\\t') + """ % ffr(self.parametres['tgeneff']) + txt += """ + res <- build.prof.tgen(tgen) + write.table(res$chi2, file = "%s", sep='\\t', col.names = NA) + write.table(res$pchi2, file = "%s", sep='\\t', col.names = NA) + """ % (ffr(self.pathout['tgenchi2.csv']), ffr(self.pathout['tgenpchi2.csv'])) + self.add(txt) + class FreqMultiScript(PrintRScript): def make_script(self): self.sources([self.analyse.parent.RscriptsPath['Rgraph']]) diff --git a/ProfList.py b/ProfList.py index 4cdffa1..c220171 100644 --- a/ProfList.py +++ b/ProfList.py @@ -278,6 +278,7 @@ class ProfListctrlPanel(wx.ListCtrl, listmix.ListCtrlAutoWidthMixin, listmix.Col self.idexport = wx.NewId() self.idexporttropes = wx.NewId() self.idexportowledge = wx.NewId() + self.onmaketgen = wx.NewId() # self.export_classes = wx.NewId() self.Bind(wx.EVT_MENU, self.OnPopupOne, id=self.popupID1) @@ -302,6 +303,7 @@ class ProfListctrlPanel(wx.ListCtrl, listmix.ListCtrlAutoWidthMixin, listmix.Col self.Bind(wx.EVT_MENU, self.onexport, id = self.idexport) self.Bind(wx.EVT_MENU, self.onexporttropes, id = self.idexporttropes) self.Bind(wx.EVT_MENU, self.onexportowledge, id = self.idexportowledge) + self.Bind(wx.EVT_MENU, self.OnMakeTgen, id=self.onmaketgen) # self.Bind(wx.EVT_MENU, self.on_export_classes, id = self.export_classes) # self.Bind(wx.EVT_MENU, self.OnPopupThree, id=self.popupID3) @@ -320,7 +322,8 @@ class ProfListctrlPanel(wx.ListCtrl, listmix.ListCtrlAutoWidthMixin, listmix.Col menu_conc.Append(self.popupID2, u"dans les segments de texte de la classe") menu_conc.Append(self.popupID3, u"dans les segments de texte classés") menu_conc.Append(self.popupID4, u"dans tous les segments de texte") - menu.AppendMenu(-1, u"Concordancier", menu_conc) + menu.AppendMenu(-1, u"Concordancier", menu_conc) + menu.Append(self.onmaketgen, _(u"Make Tgen").decode('utf8')) menu_cnrtl = wx.Menu() menu_cnrtl.Append(self.popupID5, u"Définition") menu_cnrtl.Append(self.popupID6, u"Etymologie") @@ -355,7 +358,7 @@ class ProfListctrlPanel(wx.ListCtrl, listmix.ListCtrlAutoWidthMixin, listmix.Col menu.Append(self.pop2, u"Chi2 par classe") menu.Append(self.pop3, u"Chi2 modalités de la variable") menu.AppendSeparator() - menu.Append(self.pop1, u"Graph de la classe") + menu.Append(self.pop1, u"Graphe de la classe") self.PopupMenu(menu) menu.Destroy() @@ -772,6 +775,9 @@ class ProfListctrlPanel(wx.ListCtrl, listmix.ListCtrlAutoWidthMixin, listmix.Col #win.html = '\n' + '
'.join([' : '.join([str(val) for val in forme]) for forme in rep]) + '\n' #win.HtmlPage.SetPage(win.html) win.Show(True) + + def OnMakeTgen(self, evt): + self.parent.tree.OnTgenEditor(self.getselectedwords()) class wliste(wx.Frame): diff --git a/Rscripts/chdfunct.R b/Rscripts/chdfunct.R index 5e31442..914278c 100644 --- a/Rscripts/chdfunct.R +++ b/Rscripts/chdfunct.R @@ -321,6 +321,32 @@ BuildProf01<-function(x,classes) { mat } +build.prof.tgen <- function(x) { + nbst <- sum(x[nrow(x),]) + totcl <- x[nrow(x),] + tottgen <- rowSums(x) + nbtgen <- nrow(x) - 1 + chi2 <- x[1:(nrow(x)-1),] + pchi2 <- chi2 + for (classe in 1:ncol(x)) { + for (tg in 1:nbtgen) { + cont <- c(x[tg, classe], tottgen[tg] - x[tg, classe], totcl[classe] - x[tg, classe], (nbst - totcl[classe]) - (tottgen[tg] - x[tg, classe])) + cont <- matrix(unlist(cont), nrow=2) + chiresult<-chisq.test(cont,correct=FALSE) + if (is.na(chiresult$p.value)) { + chiresult$p.value<-1 + chiresult$statistic<-0 + } + if (chiresult$expected[1,1] > cont[1,1]) { + chiresult$statistic <- chiresult$statistic * -1 + } + chi2[tg,classe] <- chiresult$statistic + pchi2[tg,classe] <- chiresult$p.value + } + } + res <- list(chi2 = chi2, pchi2 = pchi2) +} + BuildProf<- function(x,dataclasse,clusternb,lim=2) { #### #r.names<-rownames(x) diff --git a/configuration/global.cfg b/configuration/global.cfg index e4d877b..fa99eef 100644 --- a/configuration/global.cfg +++ b/configuration/global.cfg @@ -6,6 +6,6 @@ copyright = (c) 2008-2014 Pierre Ratinaud author = Pierre Ratinaud gpl-fr = gpl-2.0-fr.txt dev = Pierre Ratinaud (Université de Toulouse - Laboratoire LERASS - ratinaud@univ-tlse2.fr);Sébastien Déjean (Université de Toulouse);David Skalinder (Mash Strategy - davids@mash.uk.com); -version = 0.6 alpha 10 +version = 0.6 alpha 11 licence = GNU GPL (v2) -version_nb = 0.6.a10 \ No newline at end of file +version_nb = 0.6.a11 \ No newline at end of file diff --git a/configuration/iramuteq.cfg b/configuration/iramuteq.cfg index 58fd685..eb15ab1 100644 --- a/configuration/iramuteq.cfg +++ b/configuration/iramuteq.cfg @@ -5,8 +5,8 @@ language=french guilanguage=french R_mem = false R_max_mem = 1535 -version_nb = 0.6.a10 +version_nb = 0.6.a11 rlibs = false libsvdc = false libsvdc_path = /usr/bin/svd -rmirror = http://cran.rstudio.com/ +rmirror = http://cran.rstudio.com/ \ No newline at end of file diff --git a/corpus.py b/corpus.py index 4ba60d1..011206f 100644 --- a/corpus.py +++ b/corpus.py @@ -177,6 +177,10 @@ class Corpus : query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid res = self.cformes.execute(query) return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])))) + + def gettgentxt(self, tgen): + sts = self.gettgenst(tgen) + return list(set([self.getucefromid(val).uci for val in sts])) def getlemucis(self, lem) : uces = self.getlemuces(lem) @@ -397,6 +401,26 @@ class Corpus : tgenoccurrences[t][etoiles[i]] += sum([lemuceeff[uce] for uce in concern]) return tgenoccurrences, totoccurrences + def make_tgen_profile(self, tgen, ucecl, uci = False) : + log.info('tgen/classes') + if uci : + tab = [[lem] + [len(set(self.gettgentxt(tgen[lem])).intersection(classe)) for classe in ucecl] for lem in tgen] + else : + tab = [[lem] + [len(set(self.gettgenst(tgen[lem])).intersection(classe)) for classe in ucecl] for lem in tgen] + tab = [[line[0]] + [val for val in line[1:]] for line in tab if sum(line[1:]) >= 3] + return tab + #i = 0 + #nam = 'total' + #while nam + `i` in tgen : + # i += 1 + #nam = nam + `i` + #last = [nam] + [`len(classe)` for classe in ucecl] + #tab += [last] + #line0 = ['tgen'] + ['_'.join(['cluster', `i+1`]) for i in range(len(ucecl))] + #tab = [line0] + tab + #with open(fileout, 'w') as f : + # f.write('\n'.join(['\t'.join(line) for line in tab]).encode(self.parametres['syscoding'])) + def make_efftype_from_etoiles(self, etoiles) : dtype = {} etuces = [[] for et in etoiles] diff --git a/dialog.py b/dialog.py index 7fd2f4a..14b8a24 100755 --- a/dialog.py +++ b/dialog.py @@ -1564,7 +1564,7 @@ class FindInCluster(wx.Frame): class SearchDial ( wx.Frame ): def __init__( self, parent, listctrl, col, shown): - wx.Frame.__init__ ( self, parent, id = wx.ID_ANY, title = wx.EmptyString, pos = wx.DefaultPosition, size = wx.DefaultSize, style = wx.DEFAULT_FRAME_STYLE ) + wx.Frame.__init__ ( self, parent, id = wx.ID_ANY, title = wx.EmptyString, pos = wx.DefaultPosition, size = wx.DefaultSize, style = wx.DEFAULT_FRAME_STYLE | wx.STAY_ON_TOP ) self.parent = parent self.listctrl = listctrl self.col = col diff --git a/functions.py b/functions.py index 758f94b..6ad088d 100644 --- a/functions.py +++ b/functions.py @@ -35,7 +35,7 @@ def normpath_win32(path) : return path while '\\\\' in path : path = path.replace('\\\\', '\\') - if sys.platform == 'win32' and path.startswith('\\') and not path.startswith('\\\\') : + if path.startswith('\\') and not path.startswith('\\\\') : path = '\\' + path return path diff --git a/iramuteq.py b/iramuteq.py index 2fe1f07..69a42c2 100644 --- a/iramuteq.py +++ b/iramuteq.py @@ -45,7 +45,7 @@ from tabsimi import DoSimi from tabrsimple import InputText from tabverges import Prototypical from tabsplitvar import SplitMatrixFromVar -from textdist import AnalysePam +#from textdist import AnalysePam from textstat import Stat from textaslexico import Lexico from textsimi import SimiTxt, SimiFromCluster @@ -340,7 +340,7 @@ class IraFrame(wx.Frame): # f.write('') self.history = History(os.path.join(UserConfigPath, 'history.db')) self.tree = LeftTree(self) - self._mgr.AddPane(self.tree, aui.AuiPaneInfo().Name("lefttree").Caption(_(u"Navigator").decode('utf8')). + self._mgr.AddPane(self.tree, aui.AuiPaneInfo().Name("lefttree").Caption(_(u"Historic").decode('utf8')). Left().MinSize(wx.Size(200,500)).Layer(1).Position(1).CloseButton(False).MaximizeButton(True). MinimizeButton(True)) diff --git a/layout.py b/layout.py index 8e92228..7b89279 100644 --- a/layout.py +++ b/layout.py @@ -519,6 +519,10 @@ class OpenCHDS(): panel.TabChdSim.AddPage(self.prof_seg_nb, _(u"Repeated segments profiles").decode('utf8')) # panel.Bind(wx.EVT_BUTTON, self.ongetrapport, id = self.ID_rapport) + if os.path.exists(os.path.join(self.parametres['pathout'], 'tgenchi2.csv')) : + self.parametres['tgenspec'] = os.path.join(self.parametres['pathout'], 'tgenchi2.csv') + TgenLayout(panel) + panel.TabChdSim.SetSelection(0) self.parent.nb.AddPage(panel, _(u"Clustering").decode('utf8') + ' - %s' % corpname) self.parent.ShowTab(True) self.parent.nb.SetSelection(self.parent.nb.GetPageCount() - 1) @@ -665,6 +669,8 @@ class TgenLayout : tgen.read() tgentab = False gparent = None + if 'TabChdSim' in dir(page) : + page = page.TabChdSim for i in range(page.GetPageCount()) : tab = page.GetPage(i) if 'gparent' in dir(tab) : @@ -674,16 +680,17 @@ class TgenLayout : if tab.tgen : tgentab = tab break + if tgentab : self.page.tgentab.RefreshData(self.page.tgens) self.page.tgentab.tgens = tgen.tgen - self.page.SetSelection(i) + page.SetSelection(i) else : self.page.tgentab = ListForSpec(ira, gparent, self.page.tgens, etoiles[1:]) self.page.tgentab.tgen = True self.page.tgentab.tgens = tgen.tgen - self.page.AddPage(self.page.tgentab, u'Tgens Specificities') - self.page.SetSelection(self.page.GetPageCount() - 1) + page.AddPage(self.page.tgentab, u'Tgens Specificities') + page.SetSelection(page.GetPageCount() - 1) class dolexlayout : def __init__(self, ira, corpus, parametres): diff --git a/listlex.py b/listlex.py index dea3c98..d4e730f 100644 --- a/listlex.py +++ b/listlex.py @@ -200,15 +200,14 @@ class ListForSpec(wx.ListCtrl, listmix.ListCtrlAutoWidthMixin, listmix.ColumnSor menu.Append(self.popupID3, u"Graphique") menu_stcaract = wx.Menu() self.menuid = {} - for i, et in enumerate(self.etoiles) : - nid = wx.NewId() - self.menuid[nid] = i - menu_stcaract.Append(nid, et) - self.Bind(wx.EVT_MENU, self.onstcaract, id = nid) - menu.AppendMenu(-1, u"Segments de texte caractéristiques", menu_stcaract) - #menu.Append(self.popup_Tgen_glob, "Tgen global") if not self.tgen : - menu.Append(self.onmaketgen, "Make Tgen") + for i, et in enumerate(self.etoiles) : + nid = wx.NewId() + self.menuid[nid] = i + menu_stcaract.Append(nid, et) + self.Bind(wx.EVT_MENU, self.onstcaract, id = nid) + menu.AppendMenu(-1, u"Segments de texte caractéristiques", menu_stcaract) + menu.Append(self.onmaketgen, _(u"Make Tgen").decode('utf8')) self.PopupMenu(menu) menu.Destroy() diff --git a/textdist.py b/textdist.py index 8886a9e..33dd2ce 100644 --- a/textdist.py +++ b/textdist.py @@ -14,7 +14,7 @@ from ConfigParser import * import sys from functions import print_liste, exec_rcode, CreateIraFile, progressbar, check_Rresult, BugDialog from layout import PrintRapport -from PrintRScript import AlcesteTxtProf, RPamTxt +from PrintRScript import ReinertTxtProf, RPamTxt from openanalyse import OpenAnalyse from time import time, sleep diff --git a/textreinert.py b/textreinert.py index b15273f..6300ae5 100644 --- a/textreinert.py +++ b/textreinert.py @@ -7,10 +7,10 @@ import os from time import time from analysetxt import AnalyseText from OptionAlceste import OptionAlc -from PrintRScript import RchdTxt, AlcesteTxtProf +from PrintRScript import RchdTxt, ReinertTxtProf, TgenProfScript from layout import PrintRapport -from chemins import ChdTxtPathOut -from functions import DoConf, print_liste +from chemins import ChdTxtPathOut, PathOut +from functions import DoConf, print_liste, TGen class Reinert(AnalyseText) : @@ -81,7 +81,7 @@ class Reinert(AnalyseText) : return self.pathout['Rchdtxt'] def printRscript2(self) : - AlcesteTxtProf(self.pathout, self.parent.RscriptsPath, self.clnb, 0.9) + ReinertTxtProf(self.pathout, self.parent.RscriptsPath, self.clnb, 0.9) return self.pathout['RTxtProfGraph'] def print_graph_files(self) : @@ -98,4 +98,36 @@ class Reinert(AnalyseText) : chd_graph_list.append([os.path.basename(self.pathout['arbre2']), u'chd2']) print_liste(self.pathout['liste_graph_afc'], afc_graph_list) print_liste(self.pathout['liste_graph_chd'], chd_graph_list) - PrintRapport(self, self.corpus, self.parametres) \ No newline at end of file + PrintRapport(self, self.corpus, self.parametres) + +class TgenProf(AnalyseText): + def __init__(self, ira, corpus, parametres, cluster_size): + self.ira = ira + self.corpus = corpus + self.parametres = parametres + self.pathout = PathOut(dirout = self.parametres['pathout']) + self.cluster_size = [len(classe) for classe in corpus.lc] + print cluster_size + self.doanalyse() + + def doanalyse(self): + self.tgen = TGen(path = self.parametres['tgenpath'], encoding = self.ira.syscoding) + self.tgen.read(self.tgen.path) + #self.parametres['etoiles'].sort() + self.parametres['tgeneff'] = os.path.join(self.parametres['pathout'], 'tgeneff.csv') + tgenst = self.corpus.make_tgen_profile(self.tgen.tgen, self.corpus.lc) + clnames = ['cluster_%03d' % i for i in range(1, len(self.cluster_size) + 1)] + et = dict(zip(clnames, self.cluster_size)) + tgenst = dict([[line[0], dict(zip(clnames, line[1:]))] for line in tgenst]) + self.tgen.writetable(self.parametres['tgeneff'], tgenst, et) + self.parametres['tgenspec'] = os.path.join(self.parametres['pathout'], 'tgenchi2.csv') + self.Rscript = TgenProfScript(self) + self.Rscript.make_script() + self.Rscript.write() + self.doR(self.Rscript.scriptout, dlg = False, message = 'R...') + + + + + + \ No newline at end of file -- 2.7.4