From a08aea7209aa958dee6b6337525b8036b9215100 Mon Sep 17 00:00:00 2001 From: Pierre Ratinaud Date: Sat, 19 Jul 2014 01:43:55 +0200 Subject: [PATCH] rm --- Rlib/textometrieR/R/cooccurrences.R | 168 ----------------------- Rlib/textometrieR/R/pareto.R | 33 ----- Rlib/textometrieR/R/repartition.R | 128 ------------------ Rlib/textometrieR/R/specificites.R | 256 ------------------------------------ configuration/alceste.cfg | 36 ----- 5 files changed, 621 deletions(-) delete mode 100644 Rlib/textometrieR/R/cooccurrences.R delete mode 100644 Rlib/textometrieR/R/pareto.R delete mode 100644 Rlib/textometrieR/R/repartition.R delete mode 100644 Rlib/textometrieR/R/specificites.R delete mode 100644 configuration/alceste.cfg diff --git a/Rlib/textometrieR/R/cooccurrences.R b/Rlib/textometrieR/R/cooccurrences.R deleted file mode 100644 index 4f12544..0000000 --- a/Rlib/textometrieR/R/cooccurrences.R +++ /dev/null @@ -1,168 +0,0 @@ -# #* Textometrie -# #* ANR project ANR-06-CORP-029 -# #* http://textometrie.ens-lsh.fr/ -# #* -# #* 2008 (C) Textometrie project -# #* BSD New and Simplified BSD licenses -# #* http://www.opensource.org/licenses/bsd-license.php -# -# ## Sylvain Loiseau -# -# ## -# ## Ex. : type1 = 7 occ., type2 = 4 occ., separator = 11 -# ## Cf. Lafon, 1981 : 120 sqq. -# ## -# ## phyper(0:4, 4, 7 + 11, 7) -# ## -# -# ## -# ## wrapper for the function "cooccurrences.directed.contexts" given a factor as -# ## corpus and a separator. -# ## - -cooccurrences <- function(corpus) { - stop("not implemented yet"); -} - -# -# `cooccurrences.directed.corpus` <- -# function (corpus, separator) { -# if (! is.factor(corpus)) stop("The corpus must be a factor."); -# if (! is.character(separator)) stop("Separator must be a character vector."); -# if (length(separator) != 1) stop("Separator must be a character of length 1"); -# -# corpus <- corpus[drop=TRUE]; -# frequencies = table(corpus); -# separatorfrequency = frequencies[separator]; -# if (is.na(separatorfrequency)) { -# separatorfrequency = 0; -# } -# frequencies = frequencies[names(frequencies) != separator]; -# types <- levels(corpus); -# types <- types[types != separator]; -# -# contexts = get.contexts(corpus, separator); -# -# return(cooccurrences.directed.contexts(contexts, types, frequencies, separatorfrequency)); -# } -# -# ## -# ## Compute all the cooccurrence indices according to a context list (see the "get.contexts" function"), -# ## the vector of types to be taken into account, their frequency, and the separator frequency. -# ## -# ## Compute the cooccurrency index (based on hypergeometric cumulative -# ## probability) for each directed pair of token and return a matrix with tokens as column names -# ## and row names where each cell give a cooccurrency index -# ## -# -# `cooccurrences.directed.contexts` <- -# function(contexts, types, type.frequencies, separator.frequency) { -# -# all.cooccurrences.index <- matrix(0.0, nrow=length(types), ncol=length(type)); -# rownames(all.cooccurrences.index) <- types; -# colnames(all.cooccurrences.index) <- types; -# -# for (i in 1:length(types)) { -# type = types[i]; -# typefrequency = frequencies[i]; -# for (j in c(1:(types-1), (types + 1):length(types))) { -# othertype = types[j]; -# othertypefrequency = frequencies[j]; -# cooccurrences = cooccurrences.frequency(contexts, type, othertype); -# all.cooccurrences.index[type, othertype] <- -# cooccurrences.directed.cooccurrents(cooccurrences, type, othertype, separatorfrequency); -# } -# } -# -# return(all.cooccurrences.index); -# } -# -# ## -# ## In a list of contexts (see the "get.contexts" function), count the number of contexts -# ## having both at least one occurrence of type1 and at least one occurrence of type2. -# ## -# -# `cooccurrences.frequency` <- -# function(contexts, type1, type2) { -# -# if (! is.character(type1)) stop("Type1 must be a character vector."); -# if (length(type1) != 1) stop("Type1 must be a character of length 1"); -# if (! is.character(type2)) stop("Type2 must be a character vector."); -# if (length(type2) != 1) stop("Type2 must be a character of length 1"); -# -# is.cooccurring <- sapply(contexts, -# function(x) { -# if(type1 %in% x && type2 %in% x) return(1) else return(0) -# } -# ); -# return(sum(is.cooccurring)); -# } -# -# ## -# ## -# ## -# -# `cooccurrences.directed.cooccurrents` <- -# function(cooccurrences, type1, type2, separator) { -# if (! is.integer(cooccurrences)) stop("cooccurrences must be integer."); -# if (! is.integer(type1)) stop("type1 must be integer."); -# if (! is.integer(type2)) stop("type2 must be integer."); -# if (! is.integer(separator)) stop("separator must be integer."); -# ph <- phyper(found, type2, type1 + separator, type1); -# return(ph); -# } -# -# ## -# ## Create a list of character vectors according to a character vector and a separator string. -# ## -# -# `get.contexts` <- -# function(corpus, separator) { -# if (! is.factor(corpus)) stop("The corpus must be a factor."); -# if (! is.character(separator)) stop("Separator must be a character vector."); -# if (length(separator) != 1) stop("Separator must be a vector of length 1"); -# -# index.separator = which(corpus == separator); -# contexts = list(); -# lastindex = 1; -# for(i in 1:(length(index.separator))) { -# contexts[[i]] = as.character(corpus[lastindex:(index.separator[i]-1)]); -# lastindex=index.separator[i] + 1; -# } -# contexts[[length(contexts) + 1]] <- as.character(corpus[lastindex:length(corpus)]); -# return(contexts); -# } -# -# ## -# ## Create a lexical table according to a character vector and a separator string. -# ## -# -# `get.lexical.table` <- -# function(corpus, separator) { -# if (! is.factor(corpus)) stop("The corpus must be a factor."); -# if (! is.character(separator)) stop("Separator must be a character vector."); -# if (length(separator) == 1) stop("Separator must be a character of length 1"); -# -# contexts = get.contexts(corpus, separator); -# -# corpus <- corpus[drop=TRUE]; -# types <- levels(corpus); -# types <- types[types != separator]; -# -# lexical.table <- matrix(0, nrow=length(contexts), ncol=length(types)); -# colnames(lexical.table) <- types; -# -# for (i in 1:length(contexts)) { -# freq.list <- table(contexts[[i]]); -# lexical.table[i, names(freq.list)] <- freq.list; -# } -# -# return(lexical.table); -# } -# -# `cooccurrences` <- -# function(type1, type2, separator) { -# # max = ifelse(f > g, 2*g, i) -# # #choose(T, t); -# } -# diff --git a/Rlib/textometrieR/R/pareto.R b/Rlib/textometrieR/R/pareto.R deleted file mode 100644 index 85a0db4..0000000 --- a/Rlib/textometrieR/R/pareto.R +++ /dev/null @@ -1,33 +0,0 @@ -#* -#* Textometrie -#* ANR project ANR-06-CORP-029 -#* http://textometrie.ens-lsh.fr/ -#* -#* 2008 (C) Textometrie project -#* BSD New and Simplified BSD licenses -#* http://www.opensource.org/licenses/bsd-license.php - -`pareto` <- -function (x) { - op <- par(mar = c(5, 4, 4, 5) + 0.1) - par(las=2) - if( ! inherits(x, "table") ) { - x <- table(x) - } - x <- rev(sort(x)) - plot( x, type='h', axes=F) - # , lwd=16 - axis(2) - points( x, type='h', lwd=6) # , col=heat.colors(length(x)) - - y <- cumsum(x)/sum(x) - par(new=T) - plot(y, type="b", lwd=3, pch=7, axes=F, xlab='', ylab='', main='') - points(y, type='h') - axis(4) - #print(names(x)) - axis(1, at=1:length(x), labels=names(x)) - par(op) - title("Pareto graphic with cumulative frequency"); -} - diff --git a/Rlib/textometrieR/R/repartition.R b/Rlib/textometrieR/R/repartition.R deleted file mode 100644 index c819fbb..0000000 --- a/Rlib/textometrieR/R/repartition.R +++ /dev/null @@ -1,128 +0,0 @@ -#* Copyright © - 2008-2013 ANR Textométrie - http://textometrie.ens-lyon.fr -#* -#* This file is part of the TXM platform. -#* -#* The TXM platform is free software: you can redistribute it and/or modif y -#* it under the terms of the GNU General Public License as published by -#* the Free Software Foundation, either version 3 of the License, or -#* (at your option) any later version. -#* -#* The TXM platform is distributed in the hope that it will be useful, -#* but WITHOUT ANY WARRANTY; without even the implied warranty of -#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -#* General Public License for more details. -#* -#* You should have received a copy of the GNU General Public License -#* along with the TXM platform. If not, see . - -`printrepartition` <-function(positions, names, colors, styles, widths, corpusname, Xmin, T, doCumulative, structurepositions, strutnames, graphtitle, bande) { - options(scipen=1000) - - linestyle = 1 - linewidth = 1 - - if (length(positions) > length(colors)) stop("colors list size too small"); - if (length(positions) > length(names)) stop("names list size too small"); - if (length(positions) > length(styles)) stop("styles list size too small"); - if (length(positions) > length(widths)) stop("widths list size too small"); - - # if (length(structurepositions) > length(strutnames)) stop("structure names list size too small"); - - doCumu <- (doCumulative == "true") - - maxX = T - maxY = 0 - draw = 0 - - # set maxX and maxY the ranges - if(!doCumu) - { - for(i in 1:length(names)) - { - x = positions[[i]] - if(length(x) > 0) - { - d = density(x, bw=bande) - m = max(d[["y"]]) - if(maxY < m) - maxY <- m - } - } - maxY=2*maxY - } - else - { - for(i in 1:length(names)) - { - my <- length(positions[[i]]) - if(maxY < my) - maxY <- my - } - } - - # draw curves - for(i in 1:length(names)) - { - #line styles and width update - linestyle = linestyle + 1 - if(linestyle >= 6) - { - linestyle = 1 - linewidth = linewidth+ 1 - } - x = positions[[i]] - if(length(x) > 0) - { - y = 1:length(x) - - y <- c( c(0), y , c(y[[length(x)]]) ) - x <- c( c(x[[1]]), x , c(maxX) ) - - if(draw == 0)# first draw - { - if(doCumu) - { - -plot(x, y, type="s", xlab=paste("T = ", maxX), main = graphtitle, ylab="Occurrences", ylim=c(0, maxY), xlim=c(Xmin, maxX), pch=15, col=colors[i], lty=styles[i], lwd=widths[i], xaxs="i", yaxs="i") - } - else - { -plot(density(x, bw=bande), type="l", xlab=paste("T = ", maxX), graphtitle, ylab="Density", ylim=c(0, maxY), xlim=c(Xmin, maxX), pch=15, col=colors[i], lty=styles[i], lwd=widths[i], xaxs="i", yaxs="i") - } - } - else #next draws - { - if(doCumu) - { -points(x, y, type="s", pch=15, col=colors[i], lty=styles[i], lwd=widths[i]) - } - else - { -points(density(x, bw=bande), type="l", pch=15, col=colors[i], lty=styles[i], lwd=widths[i]) - } - } - rm(y) - draw <- draw + 1 - } - } - - # draw legend - for(i in 1:length(names)) - names[i] = paste(names[i], length(positions[[i]])) - - if(draw > 0) - legend("topleft", names, inset = .02, col = colors, lty=styles, lwd=widths) - - # draw hist of struct - y = c() - if(length(structurepositions) > 0) - { - for(i in 1:length(structurepositions)) - { - y[i] <- maxY - text(structurepositions[[i]], maxY*0.70, strutnames[[i]], cex = .8, srt=-90, adj = c(0,0)) - } - points(structurepositions, y, type="h", ylim=c(0, maxY), xlim=c(Xmin, maxX)) - } -} - diff --git a/Rlib/textometrieR/R/specificites.R b/Rlib/textometrieR/R/specificites.R deleted file mode 100644 index 7fbd2ba..0000000 --- a/Rlib/textometrieR/R/specificites.R +++ /dev/null @@ -1,256 +0,0 @@ -#* Copyright © - 2008-2013 ANR Textométrie - http://textometrie.ens-lyon.fr -#* -#* This file is part of the TXM platform. -#* -#* The TXM platform is free software: you can redistribute it and/or modif y -#* it under the terms of the GNU General Public License as published by -#* the Free Software Foundation, either version 3 of the License, or -#* (at your option) any later version. -#* -#* The TXM platform is distributed in the hope that it will be useful, -#* but WITHOUT ANY WARRANTY; without even the implied warranty of -#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -#* General Public License for more details. -#* -#* You should have received a copy of the GNU General Public License -#* along with the TXM platform. If not, see . - -## Sylvain Loiseau -## Lise Vaudor - -phyper_bis=function(v_a,v_b,v_c,v_d){ - v_s2=rep(NA,length(v_a)) - for (j in 1:length(v_a)){ - a=v_a[j] - b=v_b[j] - c=v_c[j] - d=v_d - a_tmp=a#+1 - s1=dhyper(a_tmp,b,c,d) - s_tmp=dhyper(a_tmp+1,b,c,d) - s2=s1+s_tmp - a_tmp=a_tmp+1 - while(log(s2)!=log(s1)){ - s1=s2 - a_tmp=a_tmp+1 - s_tmp=dhyper(a_tmp,b,c,d) - s2=s1+s_tmp - } - v_s2[j]=s2 - } - return(v_s2) -} - -`specificites` <- - function(lexicaltable, types=NULL, parts=NULL) { - spe <- specificites.probabilities(lexicaltable, types, parts); - spelog <- matrix(0, nrow=nrow(spe), ncol=ncol(spe)); - spelog[spe < 0] <- log10(-spe[spe < 0]); - spelog[spe > 0] <- abs(log10(spe[spe >0])); - spelog[spe == 0] <- 0; - spelog[is.infinite(spe)] <- 0; - spelog <- round(spelog, digits=4); - rownames(spelog) <- rownames(spe); - colnames(spelog) <- colnames(spe); - #class(spelog) <- "specificites"; - #attr(spelog, "l.t") <- spe; - return(spelog); - } - -`specificites.probabilities` <- - function(lexicaltable, types=NULL, parts=NULL) { - - # if (!is.numeric(lexicaltable)) stop("The lexical table must contain numeric values."); - - rowMargin <- rowSums(lexicaltable); # or "F" (the total frequency of all the types). - colMargin <- colSums(lexicaltable); # or "T" (the size of the parts). - F <- sum(lexicaltable); # The grand total (number of tokens in the corpus). - - if (! is.null(types)) { # Filter on tokens to be considered. - if(is.character(types)) { # convert the name of types given with "types" into row index numbers. - if (is.null(rownames(lexicaltable))) stop("The lexical table has no row names and the \"types\" argument is a character vector."); - if (! all(types %in% rownames(lexicaltable))) stop(paste( - "Some requested types are not known in the lexical table: ", - paste(types[! (types %in% rownames(lexicaltable))], collapse=" ")) - ); - } else { - if (any(types < 1)) stop("The row index must be greater than 0."); - if (max(types) > nrow(lexicaltable)) stop("Row index must be smaller than the number of rows."); - } - lexicaltable <- lexicaltable[types,, drop = FALSE]; - rowMargin <- rowMargin[types]; - } - - if (! is.null(parts)) { # Filter on parts to be considered. - if(is.character(parts)) { # convert the name of parts given with "parts" into col index numbers. - if (is.null(colnames(lexicaltable))) stop("The lexical table has no col names and the \"parts\" argument is a character vector."); - if (! all(parts %in% colnames(lexicaltable))) stop(paste( - "Some requested parts are not known in the lexical table: ", - paste(parts[! (parts %in% colnames(lexicaltable))], collapse=" ")) - ); - } else { - if (max(parts) > ncol(lexicaltable)) stop("Column index must be smaller than the number of cols."); - if (any(parts < 1)) stop("The col index must be greater than 0."); - } - lexicaltable <- lexicaltable[,parts, drop=FALSE]; - colMargin <- colMargin[parts]; - } - - if (nrow(lexicaltable) == 0 | ncol(lexicaltable) == 0) { - stop("The lexical table must contains at least one row and one column."); - } - - specif <- matrix(0.0, nrow=nrow(lexicaltable), ncol=ncol(lexicaltable)); - - for(i in 1:ncol(lexicaltable)) { # We proceed the whole lexical table by column (i.e. by part). - - whiteDrawn <- lexicaltable[,i]; # The frequencies observed in this part for each type. - white <- rowMargin; # The total frequencies in the corpus for each type. - black <- F-white; # The total complement frequency in the corpus for each type. - drawn <- colMargin[i]; # The number of tokens in the part. - - - independance <- (white * drawn) / F; # The theoretic frequency of each type. - specif_negative <- whiteDrawn < independance; # index of observed frequencies below the theoretic frequencies. - specif_positive <- whiteDrawn >= independance; # index of observed frequencies above the theoretic frequencies. - - specif[specif_negative,i] <- -phyper ( - whiteDrawn[specif_negative], white[specif_negative], black[specif_negative], drawn - ); - - specif[specif_positive,i] <- phyper_bis ( - whiteDrawn[specif_positive], white[specif_positive], black[specif_positive], drawn - ); - } - - colnames(specif) <- colnames(lexicaltable); - rownames(specif) <- rownames(lexicaltable); - - return(specif); - } - -`specificites.lexicon` <- - function(lexicon, sublexicon) { - spe <- specificites.lexicon.probabilities(lexicon, sublexicon); - spelog <- matrix(0, nrow=nrow(spe), ncol=ncol(spe)); - spelog[spe < 0] <- log10(-spe[spe < 0]); - spelog[spe > 0] <- abs(log10(spe[spe >=0])); - spelog[spe == 0] <- 0; - spelog[is.infinite(spe)] <- 0; - spelog <- round(spelog, digits=4); - rownames(spelog) <- rownames(spe); - colnames(spelog) <- colnames(spe); - #class(spelog) <- "specificites"; - #attr(spelog, "l.t") <- spe; - return(spelog); - } - -`lexiconsToLexicalTable` <- function(lexicon, sublexicon) { - if (! all(names(sublexicon) %in% names(lexicon))) - stop(paste( - sum(! (names(sublexicon) %in% names(lexicon))), - "types of the sublexicon not found in the lexicon: ", - ) - ); - - sub <- numeric(length(lexicon)); - names(sub) <- names(lexicon); - sub[names(sublexicon)] <- sublexicon; - - complementary.lexicon <- c(lexicon- sub); - if (any(complementary.lexicon < 0)) - stop("type cannot be more frequent in the sublexicon than in the lexicon"); - - lexicaltable <- matrix(c(sub, complementary.lexicon), ncol=2); - rownames(lexicaltable) <- names(lexicon); - colnames(lexicaltable) <- c("sublexicon", "complementary"); - return(lexicaltable) -} - -`specificites.lexicon.new` <- function(lexicon, sublexicon) { - lexicaltable <- lexiconsToLexicalTable(lexicon, sublexicon); - return(specificites(lexicaltable,NULL,NULL)); -} - -`specificites.lexicon.probabilities` <- - function(lexicon, sublexicon) { - - if (!is.numeric(lexicon)) stop("The lexicon must contain numeric values."); - if (!is.numeric(sublexicon)) stop("The sublexicon must contain numeric values."); - if (is.null(names(lexicon))) stop("The lexicon must contain names."); - if (is.null(names(sublexicon))) stop("The sub lexicon must contain names."); - - if (! all(names(sublexicon) %in% names(lexicon))) - stop( - paste( - "Some requested types of the sublexicon are not known in the lexicon: ", - paste(names(sublexicon)[! (names(sublexicon) %in% names(lexicon))], collapse=" ") - ) - ); - - F <- sum(lexicon); - f <- sum(sublexicon); - - # complementary.lexicon <- c(lexicon[names(sublexicon)] - sublexicon, lexicon[!names(lexicon) %in% names(sublexicon)]); - - if (F < f) { - stop("The lexicon cannot be smaller than the sublexicon"); - } - - whiteDrawn <- numeric(length(lexicon)); # The frequencies observed in this part for each type. - names(whiteDrawn) <- names(lexicon); - whiteDrawn[names(sublexicon)] <- sublexicon; - white <- lexicon; # The total frequencies in the corpus for each type. - black <- F-white; # The total complement frequency in the corpus for each type. - drawn <- f; # The number of tokens in the part. - - # print(whiteDrawn); - # print(white); - # print(black); - # print(drawn); - - independance <- (white * drawn) / F; # The theoretic frequency of each type. - - specif_negative <- whiteDrawn < independance; # index of observed frequencies below the theoretic frequencies. - specif_positive <- whiteDrawn >= independance; # index of observed frequencies above the theoretic frequencies. - - specif <- double(length(lexicon)); - - specif[specif_negative] <- phyper ( - whiteDrawn[specif_negative], white[specif_negative], black[specif_negative], drawn - ); - - specif[specif_positive] <- phyper_bis ( - whiteDrawn[specif_positive], white[specif_positive], black[specif_positive], drawn - ); - - names(specif) <- names(lexicon); - - return(specif); - } - -`SpecifTopN` <- function(Specif, N=10, file=NULL) { - symbol = Specif - top <- c() - cols <- colnames(symbol) - - for (i in 1:length(cols)) { - sorted <- sort(symbol[, cols[i]], decreasing=TRUE, index.return= TRUE)$x - top <- union(top, names(sorted[1:N])) - top <- union(top, names(sorted[length(sorted) -N: length(sorted)])) - } - - symbol <- symbol[top,] - if (file != NULL) { - write.table(symbol, file) - } -} - -#print.specificites(x, line=20, part=1, form=NULL, ...) { -# if (all(is.null(line, part))) { -# stop("either a line or a part must be specified"); -# } -# if (all(!is.null(line, part))) { -# stop("only a line or a part must be specified"); -# } -#} diff --git a/configuration/alceste.cfg b/configuration/alceste.cfg deleted file mode 100644 index a3b89fc..0000000 --- a/configuration/alceste.cfg +++ /dev/null @@ -1,36 +0,0 @@ -[ALCESTE] -#nombre minimum d'unités par classe, 0=calcul automatique -mincl = 0 -#nombre de formes par uce, 0=calcul automatique -nbforme_uce = 0 -#nombre maximum de formes actives -max_actives = 3000 -#lemmatisation -lem = True -#taille du premier tableau uc/forme -tailleuc1 = 12 -#nombre de classes terminales de l'analyse, NE PAS EDITER -nbcl = 4 -#taille du second tableau uc/forme -tailleuc2 = 14 -#mode de classification, 0=double sur UC, 1=simple sur UCE, 2=simple sur UCI -classif_mode = 1 -#????? -minforme = 2 -#utilisation du dictionnaire des expressions -expressions = True -#nbre de classe terminale de la phase 1 -nbcl_p1 = 10 -#methode pour svd -svdmethod = irlba -#mode patate (+ rapide et - precis) -mode.patate = 0 - -[IMAGE] -#non utilise -heigth = 400 -width = 400 -reso = 200 -x = 1 -y = 2 -seuilkhi = 2 -- 2.7.4