From: Pierre Date: Fri, 25 Jan 2013 10:53:41 +0000 (+0100) Subject: modif chdtxt, a tester, problemes sur double sur rst X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=commitdiff_plain;h=ad8fe486b34f1cee918ea8564bf909e30cd25328 modif chdtxt, a tester, problemes sur double sur rst --- diff --git a/PrintRScript.py b/PrintRScript.py index 4987b21..d0dc4f0 100644 --- a/PrintRScript.py +++ b/PrintRScript.py @@ -193,11 +193,18 @@ def RchdTxt(DicoPath, RscriptPath, mincl, classif_mode, nbt = 9, svdmethod = 'sv rm(data2) """ txt += """ - chd.result <- Rchdtxt("%s",mincl=%i,classif_mode=%i, nbt = nbt) + classif_mode <- %i + mincl <- %i + uceout <- "%s" + if (classif_mode == 0) { + chd.result <- Rchdtxt(uceout, chd1, chd2 = chd2, mincl = mincl,classif_mode = classif_mode, nbt = nbt) + } else { + chd.result <- Rchdtxt(uceout, chd1, chd2 = chd1, mincl = mincl,classif_mode = classif_mode, nbt = nbt) + } n1 <- chd.result$n1 classeuce1 <- chd.result$cuce1 classeuce2 <- chd.result$cuce2 - """ % (DicoPath['uce'], mincl, classif_mode) + """ % (classif_mode, mincl, DicoPath['uce']) txt += """ tree.tot1 <- make_tree_tot(chd1) @@ -704,13 +711,33 @@ class PrintSimiScript(PrintRScript) : cn.path <- "%s" selected.col <- "%s" """ % (self.pathout['mat01.csv'], self.pathout['actives.csv'], self.pathout['selected.csv']) + if 'word' in self.parametres : + txt += """ + word <- TRUE + index <- %i + 1 + """ % self.parametres['word'] + else : + txt += """ + word <- FALSE + """ txt += """ dm <-readMM(dm.path) cn <- read.table(cn.path, sep='\t', quote='"') colnames(dm) <- cn[,1] - sel.col <- read.csv2(selected.col) - dm <- dm[, sel.col[,1] + 1] + sel.col <- read.csv2(selected.col, header = FALSE) + sel.col <- sel.col[,1] + 1 + if (!word) { + dm <- dm[, sel.col] + } else { + forme <- colnames(dm)[index] + if (!index %in% sel.col) { + sel.col <- append(sel.col, index) + } + dm <- dm[, sel.col] + index <- which(colnames(dm) == forme) + } """ + else : txt += """ load("%s") @@ -754,6 +781,16 @@ class PrintSimiScript(PrintRScript) : mat[is.na(mat)] <- 0 mat[is.infinite(mat)] <- 0 """ + if 'word' in self.parametres and not self.parametres['keep_coord'] : + txt += """ + mat <- graph.word(mat, index) + cs <- colSums(mat) + if (length(cs)) mat <- mat[,-which(cs==0)] + rs <- rowSums(mat) + if (length(rs)) mat <- mat[-which(rs==0),] + if (length(cs)) dm <- dm[, -which(cs==0)] + """ + if self.parametres['layout'] == 0 : layout = 'random' if self.parametres['layout'] == 1 : layout = 'circle' if self.parametres['layout'] == 2 : layout = 'frutch' @@ -943,18 +980,15 @@ class PrintSimiScript(PrintRScript) : vertex.size <- NULL """ else : - #FIXME - tmpchi = False - if tmpchi : + if self.parametres['type'] == 'clustersimitxt' : txt += """ lchi <- read.table("%s") lchi <- lchi[,1] - """ % ffr(tmpchi) - if 'selected_col' in dir(self.tableau) : - txt += """ - lchi <- lchi[c%s+1] - """ % datas - if tmpchi and self.parametres.get('cexfromchi', False) : + """ % ffr(self.parametres['tmpchi']) + txt += """ + lchi <- lchi[sel.col] + """ + if self.parametres['type'] == 'clustersimitxt' and self.parametres.get('cexfromchi', False) : txt += """ label.cex <- norm.vec(lchi, vcexminmax[1], vcexminmax[2]) """ @@ -966,7 +1000,7 @@ class PrintSimiScript(PrintRScript) : label.cex <- graph.simi$label.cex } """ - if tmpchi and self.parametres.get('sfromchi', False) : + if self.parametres['type'] == 'clustersimitxt' and self.parametres.get('sfromchi', False) : txt += """ vertex.size <- norm.vec(lchi, minmaxeff[1], minmaxeff[2]) """ diff --git a/ProfList.py b/ProfList.py index 600dbcf..e2a7c84 100644 --- a/ProfList.py +++ b/ProfList.py @@ -74,6 +74,7 @@ class ProfListctrlPanel(wx.ListCtrl, listmix.ListCtrlAutoWidthMixin, listmix.Col self.la = [] self.lchi = [] self.lfreq = [] + self.tmpchi = None #adding some art self.il = wx.ImageList(16, 16) @@ -406,7 +407,7 @@ class ProfListctrlPanel(wx.ListCtrl, listmix.ListCtrlAutoWidthMixin, listmix.Col def quest_simi(self, evt) : tableau = self.Source.tableau tab = tableau.make_table_from_classe(self.cl, self.la) - pathout = ConstructPathOut(self.Source.pathout+'/', 'simi_classe_%i' %self.cl) + pathout = ConstructPathOut(os.path.join(self.Source.pathout, 'simi_classe_%i' %self.cl)) self.filename = os.path.join(pathout,'mat01.csv') tableau.printtable(self.filename, tab) del tab @@ -459,56 +460,32 @@ class ProfListctrlPanel(wx.ListCtrl, listmix.ListCtrlAutoWidthMixin, listmix.Col def onwordgraph(self, evt): word = self.getColumnText(self.GetFirstSelected(), 6) - dlg = progressbar(self, 2) - corpus = self.Source.corpus - uces = corpus.lc[self.cl-1] - dlg.Update(1, u'Tableau...') - #tab = corpus.make_table_with_classe(uces, self.la) - pathout = ConstructPathOut(self.Source.pathout.dirout + '/' , 'simi_%s' % word) - self.filename = os.path.join(pathout,'mat01.csv') - dlg.Update(2, u'Ecriture...') - #corpus.write_tab(tab, self.filename) - #del tab - corpus.make_and_write_sparse_matrix_from_classe(self.la, uces, self.filename) - dlg.Destroy() - paramsimi = {'coeff' : 0, - 'layout' : 2, - 'type' : 1, - 'arbremax' : 0, - 'coeff_tv' : 1, - 'coeff_tv_nb' : 0, - 'tvprop' : 0, - 'tvmin' : 5, - 'tvmax' : 30, - 'coeff_te' : 1, - 'coeff_temin' : 1, - 'coeff_temax' : 10, - 'label_v': 1, - 'label_e': 0, - 'vcex' : 1, - 'vcexmin' : 10, - 'vcexmax' : 25, - 'cex' : 10, - 'seuil_ok' : 1, - 'seuil' : 1, - 'cols' : (255,0,0), - 'cola' : (200,200,200), - 'width' : 600, - 'height' : 600, - 'first' : True, - 'keep_coord' : True, - 'alpha' : 20, - 'film': False, - } - self.tableau = Tableau(self.parent, '') - self.tableau.listactives = self.la - self.tableau.actives = {} - for i, val in enumerate(self.la) : - self.tableau.actives[val] = [self.lfreq[i]] - DoSimi(self, param = paramsimi, fromprof = ffr(self.filename), pathout = pathout, wordgraph = word) + if self.tmpchi is None : + self.tmpchi = tempfile.mktemp(dir=self.Source.parent.TEMPDIR) + with open(self.tmpchi, 'w') as f: + f.write('\n'.join([str(val) for val in self.lchi])) + index = self.la.index(word) + parametres = {'type' : 'clustersimitxt', + 'pathout' : self.Source.parametres['pathout'], + 'word' : index , + 'lem' : self.Source.parametres['lem'], + 'tmpchi' : self.tmpchi} + #try : + self.parent.SimiFromCluster(self.parent, self.Source.corpus, self.la, self.cl - 1, parametres = parametres, dlg = progressbar(self, 4)) + #except : + # print 'not acitve' def on_graph(self, evt): - self.parent.SimiFromCluster(self.parent, self.Source.corpus, self.la, self.cl - 1, parametres = {'type' : 'clustersimitxt', 'pathout' : self.Source.parametres['pathout']}, dlg = progressbar(self, 4)) + if self.tmpchi is None : + self.tmpchi = tempfile.mktemp(dir=self.Source.parent.TEMPDIR) + with open(self.tmpchi, 'w') as f: + f.write('\n'.join([str(val) for val in self.lchi])) + parametres = {'type' : 'clustersimitxt', + 'pathout' : self.Source.parametres['pathout'], + 'lem' : self.Source.parametres['lem'], + 'tmpchi' : self.tmpchi} + + self.parent.SimiFromCluster(self.parent, self.Source.corpus, self.la, self.cl - 1, parametres = parametres, dlg = progressbar(self, 4)) #dlg = progressbar(self, 2) #corpus = self.Source.corpus #uces = corpus.lc[self.cl-1] diff --git a/Rscripts/CHD.R b/Rscripts/CHD.R index 53fb813..974e901 100644 --- a/Rscripts/CHD.R +++ b/Rscripts/CHD.R @@ -41,6 +41,10 @@ find.max <- function(dtable, chitable, compte, rmax, maxinter, sc, TT) { res } + + + + CHD<-function(data.in, x=9, mode.patate = FALSE, svd.method, libsvdc.path=NULL){ # sink('/home/pierre/workspace/iramuteq/dev/findchi2.txt') dataori <- data.in @@ -103,7 +107,7 @@ CHD<-function(data.in, x=9, mode.patate = FALSE, svd.method, libsvdc.path=NULL){ rmax <- NULL inert <- find.max(dtable, chitable, compte, rmax, maxinter, sc, TT) - print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@') + print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@') pp('max inter phase 1', inert$maxinter/TT)#max(listinter)) print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@') ordert <- ordert[order(ordert[,3]),] diff --git a/Rscripts/afc_graph.R b/Rscripts/afc_graph.R index d067c16..8a62d8a 100644 --- a/Rscripts/afc_graph.R +++ b/Rscripts/afc_graph.R @@ -132,7 +132,11 @@ if ( qui == 3 ) { infp <- which(is.infinite(maxchi) & maxchi > 0) if (length(infp)) { maxchi[infp] <- NA - valmax <- max(maxchi, na.rm = TRUE) + if (!length(infp) == length(maxchi)) { + valmax <- max(maxchi, na.rm = TRUE) + } else { + valmax <- 8 + } maxchi[infp] <- valmax + 2 } if (cex.txt) { diff --git a/Rscripts/chdtxt.R b/Rscripts/chdtxt.R index a0a9cdd..436d0ba 100644 --- a/Rscripts/chdtxt.R +++ b/Rscripts/chdtxt.R @@ -6,39 +6,9 @@ #fonction pour la double classification #cette fonction doit etre splitter en 4 ou 5 fonctions -#Rchdtxt<-function(tableuc1,tableuc2,listeuce1,listeuce2,arbre1,arbre2,uceout) { - #source('/home/pierre/workspace/iramuteq/Rscripts/CHD.R') - - #lecture des tableaux -# data1<-read.csv2(tableuc1) -# data2<-read.csv2(tableuc2) - - #analyse des tableaux avec la fonction CHD qui doit etre sourcee avant -# chd1<-CHD(data1) -# chd2<-CHD(data2) - - #lecture des uce -# listuce1<-read.csv2(listeuce1) -# listuce2<-read.csv2(listeuce2) - - #Une fonction pour assigner une classe a chaque UCE -#AssignClasseToUce<-function(listuce,chd) { -# out<-matrix(nrow=nrow(listuce),ncol=ncol(chd)) -# for (i in 1:nrow(listuce)) { -# for (j in 1:ncol(chd)) { -# out[i,j]<-chd[(listuce[i,2]+1),j] -# } -# } -# out -#} - AssignClasseToUce <- function(listuce, chd) { print('assigne classe -> uce') - out<-matrix(nrow=nrow(listuce),ncol=ncol(chd)) - for (j in 1:ncol(chd)) { - out[listuce[,1]+1, j] <- chd[listuce[,2]+1, j] - } - out + chd[listuce[,2]+1,] } fille<-function(classe,classeuce) { @@ -47,8 +17,77 @@ fille<-function(classe,classeuce) { listf<-unique(listf) listf } + + +croiseeff <- function(croise, classeuce1, classeuce2) { + cl1 <- 0 + cl2 <- 1 + for (i in 1:ncol(classeuce1)) { + cl1 <- cl1 + 2 + cl2 <- cl2 + 2 + clj1 <- 0 + clj2 <- 1 + for (j in 1:ncol(classeuce2)) { + clj1 <- clj1 + 2 + clj2 <- clj2 + 2 + croise[cl1 - 1, clj1 -1] <- length(which(classeuce1[,i] == cl1 & classeuce2[,j] == clj1)) + croise[cl1 - 1, clj2 -1] <- length(which(classeuce1[,i] == cl1 & classeuce2[,j] == clj2)) + croise[cl2 - 1, clj1 -1] <- length(which(classeuce1[,i] == cl2 & classeuce2[,j] == clj1)) + croise[cl2 - 1, clj2 -1] <- length(which(classeuce1[,i] == cl2 & classeuce2[,j] == clj2)) + } + } + croise +} + +addallfille <- function(lf) { + nlf <- list() + for (i in 1:length(lf)) { + if (! is.null(lf[[i]])) { + nlf[[i]] <- lf[[i]] + filles <- lf[[i]] + f1 <- filles[1] + f2 <- filles[2] + if (f1 > length(lf)) { + for (j in (length(lf) + 1):f2) { + nlf[[j]] <- 0 + } + } + } else { + nlf[[i]] <- 0 + } + } +nlf +} + +getfille <- function(nlf, classe, pf) { + if (length(nlf[[classe]]) == 1) { + return(pf) + } else { + pf <- c(pf, nlf[[classe]]) + c1 <- nlf[[classe]][1] + c2 <- nlf[[classe]][2] + pf <- getfille(nlf, c1, pf) + pf <- getfille(nlf, c2, pf) + } + return(pf) +} + +getmere <- function(list_mere, classe) { + i <- classe + pf <- NULL + while (i != 1 ) { + pf <- c(pf, list_mere[[i]]) + i <- list_mere[[i]] + } + pf +} + +getfillemere <- function(list_fille, list_mere, classe) { + return(c(getfille(list_fille, classe, NULL), getmere(list_mere, classe))) +} + #nbt nbcl = nbt+1 tcl=((nbt+1) *2) - 2 n1[,ncol(n1)], nchd1[,ncol(nchd1)] -Rchdtxt<-function(uceout,mincl=0,classif_mode=0, nbt = 9) { +Rchdtxt<-function(uceout, chd1, chd2 = NULL, mincl=0, classif_mode=0, nbt = 9) { #FIXME: le nombre de classe peut etre inferieur nbcl <- nbt + 1 tcl <- ((nbt+1) * 2) - 2 @@ -62,16 +101,29 @@ Rchdtxt<-function(uceout,mincl=0,classif_mode=0, nbt = 9) { #calcul des poids (effectifs) - makepoids<-function(classeuce,poids) { - for (classes in 2:(tcl + 1)){ - for (i in 1:ncol(classeuce)) { - if (poids[(classes-1)] 0, arr.ind = TRUE) +# for (i in 1:nrow(tocompute)) { +# chitable <- matrix(ncol=2,nrow=2) +# chitable[1,1] <- croise[tocompute[i,1], tocompute[i,2]] +# chitable[1,2] <- poids1[tocompute[i,1]] - chitable[1,1] +# chitable[2,1] <- poids2[tocompute[i,2]] - chitable[1,1] +# chitable[2,2] <- nr - poids2[tocompute[i,2]] - chitable[1,2] +# chitest<-chisq.test(chitable,correct=FALSE) +# chicroise[tocompute[i,1], tocompute[i,2]] <- ifelse(chitable[1,1] > chitest$expected[1,1], round(chitest$statistic,digits=7), -round(chitest$statistic,digits=7)) +# } +# chicroise +# } +# + + + dochicroise <- function(croise, mincl) { + chicroise <- croise + for (i in 1:nrow(croise)) { + for (j in 1:ncol(croise)) { + if (croise[i,j]==0) { + chicroise[i,j]<-0 + } else if (croise[i,j] 3.84, arr.ind = TRUE) + #print(listxy) + val <- chicroise[which(chicroise > 3.84)] + ord <- order(val, decreasing = TRUE) + listxy <- listxy[ord,] + #for (i in 1:nrow(listxy)) { + # if ((!listxy[,2][i] %in% listx) & (!listxy[,1][i] %in% listy)) { + # listx <- c(listx, listxy[,2][i]) + # listy <- c(listy, listxy[,1][i]) + # } + #} + xy <- list(x = listxy[,2], y = listxy[,1]) + xy + } + xy <- doxy(chicroise) + print(xy) + listx <- xy$x + listy <- xy$y + +# maxi<-vector() +# chimax<-vector() +# for (i in 1:tcl) { +# maxi[i]<-which.max(chicroise) +# chimax[i]<-chicroise[maxi[i]] +# chicroise[maxi[i]]<-0 +# } +# testpres<-function(x,listcoord) { +# for (i in 1:length(listcoord)) { +# if (x==listcoord[i]) { +# return(-1) +# } else { +# a<-1 +# } +# } +# a +# } +# c.len=nrow(chicroise) +# r.len=ncol(chicroise) +# listx<-c(0) +# listy<-c(0) +# rang<-0 +# cons<-list() +# #on garde une valeur par ligne / colonne +# for (i in 1:length(maxi)) { +# #coordonnées de chi2 max +# #coord <- arrayInd(maxi[i], dim(chicroise)) +# #x.co <- coord[1,2] +# #y.co <- coord[1,1] +# x.co<-ceiling(maxi[i]/c.len) +# y.co<-maxi[i]-(x.co-1)*c.len +# #print(x.co) +# #print(y.co) +# #print(arrayInd(maxi[i], dim(chicroise))) +# a<-testpres(x.co,listx) +# b<-testpres(y.co,listy) +# +# if (a==1) { +# if (b==1) { +# rang<-rang+1 +# listx[rang]<-x.co +# listy[rang]<-y.co +# } +# } +# cons[[1]]<-listx +# cons[[2]]<-listy +# } #pour ecrire les resultats for (i in 1:length(listx)) { txt<-paste(listx[i]+1,listy[i]+1,sep=' ') @@ -192,73 +305,172 @@ Rchdtxt<-function(uceout,mincl=0,classif_mode=0, nbt = 9) { unique(unlist(chd[chd[,classe%/%2]==classe,])) } + +#---------------------------------------------------------------------- + findbestcoord <- function(classeuce1, classeuce2) { + #fillemere1 <- NULL + #fillemere2 <- NULL + + #fillemere1 <- unique(classeuce1) + #if (classif_mode == 0) { + # fillemere2 <- unique(classeuce2) + #} else { + # fillemere2 <- fillemere1 + #} + + # + listcoordok <- list() + maxcl <- 0 + nb <- 0 + lf1 <- addallfille(chd1$list_fille) + if (classif_mode == 0) { + lf2 <- addallfille(chd2$list_fille) + } else { + lf2 <- lf1 + } + lme1 <- chd1$list_mere + if (classif_mode == 0) { + lme2 <- chd2$list_mere + } else { + lme2 <- lme1 + } + for (first in 1:length(listx)) { + coordok <- NULL + f1 <- NULL + f2 <- NULL + listxp<-listx + listyp<-listy + + #listxp<-listx[first:length(listx)] + #listxp<-c(listxp,listx[1:(first-1)]) + #listyp<-listy[first:length(listy)] + #listyp<-c(listyp,listy[1:(first-1)]) + listxp <- listxp[order(listx, decreasing = TRUE)] + listyp <- listyp[order(listx, decreasing = TRUE)] + #listxp<-c(listxp[first:length(listx)], listx[1:(first-1)]) + #listyp<-c(listyp[first:length(listy)], listy[1:(first-1)]) + for (i in 1:length(listx)) { + if( (!(listxp[i]+1) %in% f1) & (!(listyp[i]+1) %in% f2) ) { + #print(listyp[i]+1) + #print('not in') + #print(f2) + coordok <- rbind(coordok, c(listyp[i] + 1,listxp[i] + 1)) + #print(c(listyp[i] + 1,listxp[i] + 1)) + un1 <- getfillemere(lf2, chd2$list_mere, listxp[i] + 1) + f1 <- c(f1, un1) + f1 <- c(f1, listxp[i] + 1) + un2 <- getfillemere(lf1, chd1$list_mere, listyp[i] + 1) + f2 <- c(f2, un2) + f2 <- c(f2, listyp[i] + 1) + } + #print(coordok) + } + #if (nrow(coordok) > maxcl) { + nb <- 1 + # listcoordok <- list() + listcoordok[[nb]] <- coordok + # maxcl <- nrow(coordok) + #} else if (nrow(coordok) == maxcl) { + nb <- nb + 1 + # listcoordok[[nb]] <- coordok + #} + } + listcoordok <- unique(listcoordok) + print(listcoordok) + best <- 1 + if (length(listcoordok) > 1) { + maxchi <- 0 + for (i in 1:length(listcoordok)) { + chi <- NULL + uce <- NULL + for (j in 1:nrow(listcoordok[[i]])) { + chi<-c(chi,croise[(listcoordok[[i]][j,1]-1),(listcoordok[[i]][j,2]-1)]) + uce<-c(uce,chicroiseori[(listcoordok[[i]][j,1]-1),(listcoordok[[i]][j,2]-1)]) + } + if (maxchi < sum(chi)) { + maxchi <- sum(chi) + suce <- sum(uce) + best <- i + } + } + print(suce/nrow(classeuce1)) + } + listcoordok[[best]] + } +#--------------------------------------------------------------------------------- #pour trouver une valeur dans une liste #is.element(elem, list) #== elem%in%list - - coordok<-NULL - trouvecoordok<-function(first) { - fillemere1<-NULL - fillemere2<-NULL - listxp<-listx - listyp<-listy - listxp<-listx[first:length(listx)] - listxp<-c(listxp,listx[1:(first-1)]) - listyp<-listy[first:length(listy)] - listyp<-c(listyp,listy[1:(first-1)]) - for (i in 1:length(listxp)) { - if (!(listxp[i]+1)%in%fillemere1) { - if (!(listyp[i]+1)%in%fillemere2) { - coordok<-rbind(coordok,c(listyp[i]+1,listxp[i]+1)) - fillemere1<-c(fillemere1,trouvefillemere(listxp[i]+1,chd2$n1)) - fillemere2<-c(fillemere2,trouvefillemere(listyp[i]+1,chd1$n1)) - } - } - } + oldfindbestcoord <- function(listx, listy) { + coordok<-NULL + trouvecoordok<-function(first) { + fillemere1<-NULL + fillemere2<-NULL + listxp<-listx + listyp<-listy + listxp<-listx[first:length(listx)] + listxp<-c(listxp,listx[1:(first-1)]) + listyp<-listy[first:length(listy)] + listyp<-c(listyp,listy[1:(first-1)]) + for (i in 1:length(listxp)) { + if (!(listxp[i]+1)%in%fillemere1) { + if (!(listyp[i]+1)%in%fillemere2) { + coordok<-rbind(coordok,c(listyp[i]+1,listxp[i]+1)) + fillemere1<-c(fillemere1,trouvefillemere(listxp[i]+1,chd2$n1)) + fillemere2<-c(fillemere2,trouvefillemere(listyp[i]+1,chd1$n1)) + } + } + } + coordok + } + #fonction pour trouver le nombre maximum de classes + findmaxclasse<-function(listx,listy) { + listcoordok<-list() + maxcl<-0 + nb<-1 + for (i in 1:length(listy)) { + coordok<-trouvecoordok(i) + if (maxcl <= nrow(coordok)) { + maxcl<-nrow(coordok) + listcoordok[[nb]]<-coordok + nb<-nb+1 + } + } + listcoordok<-unique(listcoordok) + print(listcoordok) + #si plusieurs ensemble avec le meme nombre de classe, on conserve + #la liste avec le plus fort chi2 + if (length(listcoordok)>1) { + maxchi<-0 + best<-NULL + for (i in 1:length(listcoordok)) { + chi<-NULL + uce<-NULL + if (nrow(listcoordok[[i]])==maxcl) { + for (j in 1:nrow(listcoordok[[i]])) { + chi<-c(chi,croise[(listcoordok[[i]][j,1]-1),(listcoordok[[i]][j,2]-1)]) + uce<-c(uce,chicroiseori[(listcoordok[[i]][j,1]-1),(listcoordok[[i]][j,2]-1)]) + } + if (maxchi < sum(chi)) { + maxchi <- sum(chi) + suce <- sum(uce) + best <- i + } + } + } + } + print((maxchi/nrow(classeuce1)*100)) + listcoordok[[best]] + } + print('cherche max') + coordok<-findmaxclasse(listx,listy) coordok - } -#fonction pour trouver le nombre maximum de classes - findmaxclasse<-function(listx,listy) { - listcoordok<-list() - maxcl<-0 - nb<-1 - for (i in 1:length(listy)) { - coordok<-trouvecoordok(i) - if (maxcl <= nrow(coordok)) { - maxcl<-nrow(coordok) - listcoordok[[nb]]<-coordok - nb<-nb+1 - } - } - listcoordok<-unique(listcoordok) - #si plusieurs ensemble avec le meme nombre de classe, on conserve - #la liste avec le plus fort chi2 - if (length(listcoordok)>1) { - maxchi<-0 - best<-NULL - for (i in 1:length(listcoordok)) { - chi<-NULL - uce<-NULL - if (nrow(listcoordok[[i]])==maxcl) { - for (j in 1:nrow(listcoordok[[i]])) { - chi<-c(chi,croise[(listcoordok[[i]][j,1]-1),(listcoordok[[i]][j,2]-1)]) - uce<-c(uce,chicroiseori[(listcoordok[[i]][j,1]-1),(listcoordok[[i]][j,2]-1)]) - } - if (maxchi < sum(chi)) { - maxchi <- sum(chi) - suce <- sum(uce) - best <- i - } - } - } - } - print((suce/nrow(classeuce1)*100)) - listcoordok[[best]] - } + } #findmaxclasse(listx,listy) #coordok<-trouvecoordok(1) - coordok<-findmaxclasse(listx,listy) - print(coordok) + #coordok <- oldfindbestcoord(listx, listy) + coordok <- findbestcoord(listx, listy) + lfilletot<-function(classeuce,x) { listfille<-NULL @@ -267,7 +479,7 @@ Rchdtxt<-function(uceout,mincl=0,classif_mode=0, nbt = 9) { listfille } } - + print('listfille') listfille1<-lfilletot(classeuce1,1) listfille2<-lfilletot(classeuce2,2) @@ -283,10 +495,11 @@ Rchdtxt<-function(uceout,mincl=0,classif_mode=0, nbt = 9) { } print('commence assigne new classe') nchd1<-Assignclasse(classeuce1,1) - if (classif_mode==0) + if (classif_mode==0) { nchd2<-Assignclasse(classeuce2,2) - else + } else { nchd2<-nchd1 + } print('fini assign new classe') #croisep<-matrix(ncol=nrow(coordok),nrow=nrow(coordok)) nchd2[which(nchd1[,ncol(nchd1)]==0),] <- 0 diff --git a/Rscripts/simi.R b/Rscripts/simi.R index 98bacf7..067eaec 100644 --- a/Rscripts/simi.R +++ b/Rscripts/simi.R @@ -297,4 +297,9 @@ graph.word <- function(mat.simi, index) { nm[,index] <- mat.simi[,index] nm[index,] <- mat.simi[index,] nm +# cs <- colSums(nm) +# if (cs) nm <- nm[,-which(cs==0)] +# rs <- rowSums(nm) +# if (rs) nm <- nm[-which(rs==0),] +# nm } diff --git a/analysetxt.py b/analysetxt.py index ff37c98..2811a7e 100644 --- a/analysetxt.py +++ b/analysetxt.py @@ -19,18 +19,19 @@ from time import time log = logging.getLogger('iramuteq.analyse') class AnalyseText : - def __init__(self, ira, corpus, parametres = None, dlg = False) : + def __init__(self, ira, corpus, parametres = None, dlg = False, lemdial = True) : self.corpus = corpus self.ira = ira self.parent = ira self.dlg = dlg self.dialok = True self.parametres = parametres + self.lemdial = lemdial self.val = False if not 'pathout' in self.parametres : self.pathout = PathOut(corpus.parametres['originalpath'], analyse_type = parametres['type'], dirout = corpus.parametres['pathout']) else : - self.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = self.parametres['pathout'], analyse_type = self.parametres['name']) + self.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = self.parametres['pathout'], analyse_type = self.parametres['type']) self.parametres = self.lemparam() if self.parametres is not None : self.parametres = self.make_config(parametres) @@ -77,7 +78,7 @@ class AnalyseText : pass def lemparam(self) : - if self.dlg : + if self.dlg and self.lemdial: dial = StatDialog(self, self.parent) dial.CenterOnParent() val = dial.ShowModal() @@ -117,7 +118,7 @@ class AnalyseText : log.info('R code...') pid = exec_rcode(self.ira.RPath, Rscript, wait = wait) while pid.poll() is None : - if dlg is not None : + if dlg : self.dlg.Pulse(message) sleep(0.2) else : @@ -143,6 +144,7 @@ class Alceste(AnalyseText) : self.corpus.make_and_write_sparse_matrix_from_uci(self.actives, self.pathout['TableUc1'], self.pathout['listeuce1']) Rscript = self.printRscript() self.doR(Rscript, dlg = self.dlg, message = 'CHD...') + self.corpus.make_ucecl_from_R(self.pathout['uce']) self.corpus.make_and_write_profile(self.actives, self.corpus.lc, self.pathout['Contout']) self.sup, lim = self.corpus.make_actives_nb(self.parametres['max_actives'], 2) diff --git a/corpus.py b/corpus.py index d4357cf..90332c1 100644 --- a/corpus.py +++ b/corpus.py @@ -213,13 +213,23 @@ class Corpus : def getetoileuces(self) : log.info('get uces etoiles') etoileuces = {} + idpara = 0 for uci in self.ucis : - etoiles = uci.etoiles[1:] + uci.paras + etoiles = uci.etoiles[1:] for et in etoiles : if et in etoileuces : etoileuces[et] += [uce.ident for uce in uci.uces] else : etoileuces[et] = [uce.ident for uce in uci.uces] + if uci.paras != [] : + for et in uci.paras : + if et in etoileuces : + etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara] + else : + etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara] + idpara += 1 + else : + idpara += 1 return etoileuces def getucefromid(self, uceid) : @@ -542,7 +552,7 @@ class Corpus : def make_etoiles(self) : etoiles = set([]) for uci in self.ucis : - etoiles.update(uci.etoiles[1:] + uci.paras) + etoiles.update(uci.etoiles[1:]) return list(etoiles) def make_etoiles_dict(self) : diff --git a/iracmd.py b/iracmd.py index cec02e9..8c000cb 100644 --- a/iracmd.py +++ b/iracmd.py @@ -12,7 +12,7 @@ import locale import codecs sys.setdefaultencoding(locale.getpreferredencoding()) from chemins import ConstructConfigPath, ConstructDicoPath, ConstructRscriptsPath -from functions import ReadLexique, DoConf, History +from functions import ReadLexique, DoConf, History, ReadDicoAsDico from ConfigParser import * ####################################### #from textchdalc import AnalyseAlceste diff --git a/layout.py b/layout.py index 2a30f5c..8cfcd96 100644 --- a/layout.py +++ b/layout.py @@ -250,7 +250,7 @@ class GraphPanel(wx.ScrolledWindow): self.Dict = dico self.txt = txt self.parent = parent - self.SetFont(wx.Font(10, wx.DEFAULT, wx.NORMAL, wx.NORMAL, 0, "courier")) + self.SetFont(wx.Font(10, wx.DEFAULT, wx.NORMAL, wx.FONTWEIGHT_BOLD, 0, "courier")) self.labels = [] self.listimg = [] self.dirout = os.path.dirname(self.Dict['ira']) @@ -561,11 +561,10 @@ def PrintRapport(self, corpus, parametres, istxt = True): """ % datetime.datetime.now().ctime() - print istxt if istxt : totocc = corpus.gettotocc() - txt += u'nombre d\'uci: %i%s' % (corpus.getucinb(), sep) - txt += u'nombre d\'uce: %i%s' % (corpus.getucenb(), sep) + txt += u'nombre de textes: %i%s' % (corpus.getucinb(), sep) + txt += u'nombre de segments de textes: %i%s' % (corpus.getucenb(), sep) txt += u'nombre de formes: %i%s' % (len(corpus.formes), sep) txt += u'nombre d\'occurrences: %i%s' % (totocc, sep) txt += u'moyenne d\'occurrences par forme: %f%s' % (float(totocc) / float(len(self.corpus.formes)), sep) @@ -575,10 +574,8 @@ def PrintRapport(self, corpus, parametres, istxt = True): txt += u'nombre de formes actives de fréquence >= %i: %i%s' % (parametres['eff_min_forme'], parametres['nbactives'], sep) txt += u'moyenne d\'occurrences par uce :%f%s' % (float(totocc) / float(corpus.getucenb()), sep) if 'tailleuc1' in parametres : - if parametres['classif_mode'] != 0 : - txt += u'taille uc1 : %i\n' % parametres['tailleuc1'] - else: - txt += u'taille uc1 / uc2: %i / %i - %i / %i%s' % (parametres['tailleuc1'], parametres['tailleuc2'], parametres['lenuc1'], parametres['lenuc2'], sep) + if parametres['classif_mode'] == 0 : + txt += u'taille rst1 / rst2: %i / %i - %i / %i%s' % (parametres['tailleuc1'], parametres['tailleuc2'], parametres['lenuc1'], parametres['lenuc2'], sep) else : self.Ucenb = self.nbind txt += u'nombre d\'individus : %i%s' % (self.nbind, sep) @@ -586,9 +583,9 @@ def PrintRapport(self, corpus, parametres, istxt = True): if istxt : txt += u'nombre de classes : %i%s' % (parametres['clnb'], sep) if parametres['classif_mode'] == 0 or parametres['classif_mode'] == 1 : - txt += u'%i uce classées sur %i (%.2f%%)%s' % (sum([len(cl) for cl in corpus.lc]), corpus.getucenb(), (float(sum([len(cl) for cl in corpus.lc])) / float(corpus.getucenb())) * 100, sep) + txt += u'%i segments classés sur %i (%.2f%%)%s' % (sum([len(cl) for cl in corpus.lc]), corpus.getucenb(), (float(sum([len(cl) for cl in corpus.lc])) / float(corpus.getucenb())) * 100, sep) elif self.parametres['classif_mode'] == 2 : - txt += u'%i uci classées sur %i (%.2f%%)%s' % (sum([len(cl) for cl in corpus.lc]), corpus.getucinb(), (float(sum([len(cl) for cl in corpus.lc]))) / float(corpus.getucinb()) * 100, sep) + txt += u'%i textes classés sur %i (%.2f%%)%s' % (sum([len(cl) for cl in corpus.lc]), corpus.getucinb(), (float(sum([len(cl) for cl in corpus.lc]))) / float(corpus.getucinb()) * 100, sep) else : txt += u'%i uce classées sur %i (%.2f%%)%s' % (self.ucecla, self.Ucenb, (float(self.ucecla) / float(self.Ucenb)) * 100, sep) @@ -692,12 +689,13 @@ class GraphPanelDendro(wx.Panel): self.dirout = os.path.dirname(self.dictpathout['ira']) self.list_graph = list_graph self.parent = self.GetParent()#parent - self.SetFont(wx.Font(10, wx.DEFAULT, wx.NORMAL, wx.NORMAL, 0, "courier")) + self.SetFont(wx.Font(10, wx.DEFAULT, wx.NORMAL, wx.NORMAL, 0, "Arial")) self.labels = [] self.listimg = [] self.tabchd = self.parent.GetParent() self.ira = self.tabchd.GetParent() self.panel_1 = wx.ScrolledWindow(self, -1, style=wx.TAB_TRAVERSAL) + self.panel_1.SetBackgroundColour('white') self.deb = wx.StaticText(self.panel_1, -1, txt) dendro_img = wx.Image(os.path.join(self.ira.images_path,'but_dendro.png'), wx.BITMAP_TYPE_ANY).ConvertToBitmap() self.butdendro = wx.BitmapButton(self, -1, dendro_img) diff --git a/textsimi.py b/textsimi.py index 8650be1..0bebb95 100644 --- a/textsimi.py +++ b/textsimi.py @@ -121,14 +121,12 @@ class SimiTxt(AnalyseText): with open(self.pathout['actives.csv'], 'w') as f : f.write('\n'.join(self.actives).encode(self.ira.syscoding)) - - class SimiFromCluster(SimiTxt) : def __init__(self, ira, corpus, actives, numcluster, parametres = None, dlg = False) : self.actives = actives self.numcluster = numcluster parametres['name'] = 'simi_classe_%i' % (numcluster + 1) - SimiTxt.__init__(self, ira, corpus, parametres, dlg) + SimiTxt.__init__(self, ira, corpus, parametres, dlg, lemdial = False) def preferences(self) : return self.parametres diff --git a/textstat.py b/textstat.py index afc9ff1..2618c51 100644 --- a/textstat.py +++ b/textstat.py @@ -81,12 +81,17 @@ class Stat(AnalyseText) : txt = """ source("%s") tot <- read.csv2("%s", header = FALSE, row.names = 1) - hapax <- read.csv2("%s", header = FALSE, row.names = 1) - tot <- rbind(tot, hapax) + """ % (self.parent.RscriptsPath['Rgraph'], self.pathout['total.csv']) + if len(hapax) : + txt += """ + hapax <- read.csv2("%s", header = FALSE, row.names = 1) + tot <- rbind(tot, hapax) + """ % self.pathout['hapax.csv'] + txt += """ open_file_graph("%s", width = 400, height = 400) plot(log(tot[,1]), log = 'x', xlab='log(rangs)', ylab = 'log(frequences)', col = 'red', pch=16) dev.off() - """ % (self.parent.RscriptsPath['Rgraph'], self.pathout['total.csv'], self.pathout['hapax.csv'], self.pathout['zipf.png']) + """ % (self.pathout['zipf.png']) tmpscript = tempfile.mktemp(dir=self.parent.TEMPDIR) with open(tmpscript, 'w') as f : f.write(txt)