+++ /dev/null
-# #* Textometrie
-# #* ANR project ANR-06-CORP-029
-# #* http://textometrie.ens-lsh.fr/
-# #*
-# #* 2008 (C) Textometrie project
-# #* BSD New and Simplified BSD licenses
-# #* http://www.opensource.org/licenses/bsd-license.php
-#
-# ## Sylvain Loiseau <sloiseau@u-paris10.fr>
-#
-# ##
-# ## Ex. : type1 = 7 occ., type2 = 4 occ., separator = 11
-# ## Cf. Lafon, 1981 : 120 sqq.
-# ##
-# ## phyper(0:4, 4, 7 + 11, 7)
-# ##
-#
-# ##
-# ## wrapper for the function "cooccurrences.directed.contexts" given a factor as
-# ## corpus and a separator.
-# ##
-
-cooccurrences <- function(corpus) {
- stop("not implemented yet");
-}
-
-#
-# `cooccurrences.directed.corpus` <-
-# function (corpus, separator) {
-# if (! is.factor(corpus)) stop("The corpus must be a factor.");
-# if (! is.character(separator)) stop("Separator must be a character vector.");
-# if (length(separator) != 1) stop("Separator must be a character of length 1");
-#
-# corpus <- corpus[drop=TRUE];
-# frequencies = table(corpus);
-# separatorfrequency = frequencies[separator];
-# if (is.na(separatorfrequency)) {
-# separatorfrequency = 0;
-# }
-# frequencies = frequencies[names(frequencies) != separator];
-# types <- levels(corpus);
-# types <- types[types != separator];
-#
-# contexts = get.contexts(corpus, separator);
-#
-# return(cooccurrences.directed.contexts(contexts, types, frequencies, separatorfrequency));
-# }
-#
-# ##
-# ## Compute all the cooccurrence indices according to a context list (see the "get.contexts" function"),
-# ## the vector of types to be taken into account, their frequency, and the separator frequency.
-# ##
-# ## Compute the cooccurrency index (based on hypergeometric cumulative
-# ## probability) for each directed pair of token and return a matrix with tokens as column names
-# ## and row names where each cell give a cooccurrency index
-# ##
-#
-# `cooccurrences.directed.contexts` <-
-# function(contexts, types, type.frequencies, separator.frequency) {
-#
-# all.cooccurrences.index <- matrix(0.0, nrow=length(types), ncol=length(type));
-# rownames(all.cooccurrences.index) <- types;
-# colnames(all.cooccurrences.index) <- types;
-#
-# for (i in 1:length(types)) {
-# type = types[i];
-# typefrequency = frequencies[i];
-# for (j in c(1:(types-1), (types + 1):length(types))) {
-# othertype = types[j];
-# othertypefrequency = frequencies[j];
-# cooccurrences = cooccurrences.frequency(contexts, type, othertype);
-# all.cooccurrences.index[type, othertype] <-
-# cooccurrences.directed.cooccurrents(cooccurrences, type, othertype, separatorfrequency);
-# }
-# }
-#
-# return(all.cooccurrences.index);
-# }
-#
-# ##
-# ## In a list of contexts (see the "get.contexts" function), count the number of contexts
-# ## having both at least one occurrence of type1 and at least one occurrence of type2.
-# ##
-#
-# `cooccurrences.frequency` <-
-# function(contexts, type1, type2) {
-#
-# if (! is.character(type1)) stop("Type1 must be a character vector.");
-# if (length(type1) != 1) stop("Type1 must be a character of length 1");
-# if (! is.character(type2)) stop("Type2 must be a character vector.");
-# if (length(type2) != 1) stop("Type2 must be a character of length 1");
-#
-# is.cooccurring <- sapply(contexts,
-# function(x) {
-# if(type1 %in% x && type2 %in% x) return(1) else return(0)
-# }
-# );
-# return(sum(is.cooccurring));
-# }
-#
-# ##
-# ##
-# ##
-#
-# `cooccurrences.directed.cooccurrents` <-
-# function(cooccurrences, type1, type2, separator) {
-# if (! is.integer(cooccurrences)) stop("cooccurrences must be integer.");
-# if (! is.integer(type1)) stop("type1 must be integer.");
-# if (! is.integer(type2)) stop("type2 must be integer.");
-# if (! is.integer(separator)) stop("separator must be integer.");
-# ph <- phyper(found, type2, type1 + separator, type1);
-# return(ph);
-# }
-#
-# ##
-# ## Create a list of character vectors according to a character vector and a separator string.
-# ##
-#
-# `get.contexts` <-
-# function(corpus, separator) {
-# if (! is.factor(corpus)) stop("The corpus must be a factor.");
-# if (! is.character(separator)) stop("Separator must be a character vector.");
-# if (length(separator) != 1) stop("Separator must be a vector of length 1");
-#
-# index.separator = which(corpus == separator);
-# contexts = list();
-# lastindex = 1;
-# for(i in 1:(length(index.separator))) {
-# contexts[[i]] = as.character(corpus[lastindex:(index.separator[i]-1)]);
-# lastindex=index.separator[i] + 1;
-# }
-# contexts[[length(contexts) + 1]] <- as.character(corpus[lastindex:length(corpus)]);
-# return(contexts);
-# }
-#
-# ##
-# ## Create a lexical table according to a character vector and a separator string.
-# ##
-#
-# `get.lexical.table` <-
-# function(corpus, separator) {
-# if (! is.factor(corpus)) stop("The corpus must be a factor.");
-# if (! is.character(separator)) stop("Separator must be a character vector.");
-# if (length(separator) == 1) stop("Separator must be a character of length 1");
-#
-# contexts = get.contexts(corpus, separator);
-#
-# corpus <- corpus[drop=TRUE];
-# types <- levels(corpus);
-# types <- types[types != separator];
-#
-# lexical.table <- matrix(0, nrow=length(contexts), ncol=length(types));
-# colnames(lexical.table) <- types;
-#
-# for (i in 1:length(contexts)) {
-# freq.list <- table(contexts[[i]]);
-# lexical.table[i, names(freq.list)] <- freq.list;
-# }
-#
-# return(lexical.table);
-# }
-#
-# `cooccurrences` <-
-# function(type1, type2, separator) {
-# # max = ifelse(f > g, 2*g, i)
-# # #choose(T, t);
-# }
-#
+++ /dev/null
-#*
-#* Textometrie
-#* ANR project ANR-06-CORP-029
-#* http://textometrie.ens-lsh.fr/
-#*
-#* 2008 (C) Textometrie project
-#* BSD New and Simplified BSD licenses
-#* http://www.opensource.org/licenses/bsd-license.php
-
-`pareto` <-
-function (x) {
- op <- par(mar = c(5, 4, 4, 5) + 0.1)
- par(las=2)
- if( ! inherits(x, "table") ) {
- x <- table(x)
- }
- x <- rev(sort(x))
- plot( x, type='h', axes=F)
- # , lwd=16
- axis(2)
- points( x, type='h', lwd=6) # , col=heat.colors(length(x))
-
- y <- cumsum(x)/sum(x)
- par(new=T)
- plot(y, type="b", lwd=3, pch=7, axes=F, xlab='', ylab='', main='')
- points(y, type='h')
- axis(4)
- #print(names(x))
- axis(1, at=1:length(x), labels=names(x))
- par(op)
- title("Pareto graphic with cumulative frequency");
-}
-
+++ /dev/null
-#* Copyright © - 2008-2013 ANR Textométrie - http://textometrie.ens-lyon.fr
-#*
-#* This file is part of the TXM platform.
-#*
-#* The TXM platform is free software: you can redistribute it and/or modif y
-#* it under the terms of the GNU General Public License as published by
-#* the Free Software Foundation, either version 3 of the License, or
-#* (at your option) any later version.
-#*
-#* The TXM platform is distributed in the hope that it will be useful,
-#* but WITHOUT ANY WARRANTY; without even the implied warranty of
-#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-#* General Public License for more details.
-#*
-#* You should have received a copy of the GNU General Public License
-#* along with the TXM platform. If not, see <http://www.gnu.org/licenses/>.
-
-`printrepartition` <-function(positions, names, colors, styles, widths, corpusname, Xmin, T, doCumulative, structurepositions, strutnames, graphtitle, bande) {
- options(scipen=1000)
-
- linestyle = 1
- linewidth = 1
-
- if (length(positions) > length(colors)) stop("colors list size too small");
- if (length(positions) > length(names)) stop("names list size too small");
- if (length(positions) > length(styles)) stop("styles list size too small");
- if (length(positions) > length(widths)) stop("widths list size too small");
-
- # if (length(structurepositions) > length(strutnames)) stop("structure names list size too small");
-
- doCumu <- (doCumulative == "true")
-
- maxX = T
- maxY = 0
- draw = 0
-
- # set maxX and maxY the ranges
- if(!doCumu)
- {
- for(i in 1:length(names))
- {
- x = positions[[i]]
- if(length(x) > 0)
- {
- d = density(x, bw=bande)
- m = max(d[["y"]])
- if(maxY < m)
- maxY <- m
- }
- }
- maxY=2*maxY
- }
- else
- {
- for(i in 1:length(names))
- {
- my <- length(positions[[i]])
- if(maxY < my)
- maxY <- my
- }
- }
-
- # draw curves
- for(i in 1:length(names))
- {
- #line styles and width update
- linestyle = linestyle + 1
- if(linestyle >= 6)
- {
- linestyle = 1
- linewidth = linewidth+ 1
- }
- x = positions[[i]]
- if(length(x) > 0)
- {
- y = 1:length(x)
-
- y <- c( c(0), y , c(y[[length(x)]]) )
- x <- c( c(x[[1]]), x , c(maxX) )
-
- if(draw == 0)# first draw
- {
- if(doCumu)
- {
-
-plot(x, y, type="s", xlab=paste("T = ", maxX), main = graphtitle, ylab="Occurrences", ylim=c(0, maxY), xlim=c(Xmin, maxX), pch=15, col=colors[i], lty=styles[i], lwd=widths[i], xaxs="i", yaxs="i")
- }
- else
- {
-plot(density(x, bw=bande), type="l", xlab=paste("T = ", maxX), graphtitle, ylab="Density", ylim=c(0, maxY), xlim=c(Xmin, maxX), pch=15, col=colors[i], lty=styles[i], lwd=widths[i], xaxs="i", yaxs="i")
- }
- }
- else #next draws
- {
- if(doCumu)
- {
-points(x, y, type="s", pch=15, col=colors[i], lty=styles[i], lwd=widths[i])
- }
- else
- {
-points(density(x, bw=bande), type="l", pch=15, col=colors[i], lty=styles[i], lwd=widths[i])
- }
- }
- rm(y)
- draw <- draw + 1
- }
- }
-
- # draw legend
- for(i in 1:length(names))
- names[i] = paste(names[i], length(positions[[i]]))
-
- if(draw > 0)
- legend("topleft", names, inset = .02, col = colors, lty=styles, lwd=widths)
-
- # draw hist of struct
- y = c()
- if(length(structurepositions) > 0)
- {
- for(i in 1:length(structurepositions))
- {
- y[i] <- maxY
- text(structurepositions[[i]], maxY*0.70, strutnames[[i]], cex = .8, srt=-90, adj = c(0,0))
- }
- points(structurepositions, y, type="h", ylim=c(0, maxY), xlim=c(Xmin, maxX))
- }
-}
-
+++ /dev/null
-#* Copyright © - 2008-2013 ANR Textométrie - http://textometrie.ens-lyon.fr
-#*
-#* This file is part of the TXM platform.
-#*
-#* The TXM platform is free software: you can redistribute it and/or modif y
-#* it under the terms of the GNU General Public License as published by
-#* the Free Software Foundation, either version 3 of the License, or
-#* (at your option) any later version.
-#*
-#* The TXM platform is distributed in the hope that it will be useful,
-#* but WITHOUT ANY WARRANTY; without even the implied warranty of
-#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-#* General Public License for more details.
-#*
-#* You should have received a copy of the GNU General Public License
-#* along with the TXM platform. If not, see <http://www.gnu.org/licenses/>.
-
-## Sylvain Loiseau <sloiseau@u-paris10.fr>
-## Lise Vaudor <lise.vaudor@ens-lyon.fr>
-
-phyper_bis=function(v_a,v_b,v_c,v_d){
- v_s2=rep(NA,length(v_a))
- for (j in 1:length(v_a)){
- a=v_a[j]
- b=v_b[j]
- c=v_c[j]
- d=v_d
- a_tmp=a#+1
- s1=dhyper(a_tmp,b,c,d)
- s_tmp=dhyper(a_tmp+1,b,c,d)
- s2=s1+s_tmp
- a_tmp=a_tmp+1
- while(log(s2)!=log(s1)){
- s1=s2
- a_tmp=a_tmp+1
- s_tmp=dhyper(a_tmp,b,c,d)
- s2=s1+s_tmp
- }
- v_s2[j]=s2
- }
- return(v_s2)
-}
-
-`specificites` <-
- function(lexicaltable, types=NULL, parts=NULL) {
- spe <- specificites.probabilities(lexicaltable, types, parts);
- spelog <- matrix(0, nrow=nrow(spe), ncol=ncol(spe));
- spelog[spe < 0] <- log10(-spe[spe < 0]);
- spelog[spe > 0] <- abs(log10(spe[spe >0]));
- spelog[spe == 0] <- 0;
- spelog[is.infinite(spe)] <- 0;
- spelog <- round(spelog, digits=4);
- rownames(spelog) <- rownames(spe);
- colnames(spelog) <- colnames(spe);
- #class(spelog) <- "specificites";
- #attr(spelog, "l.t") <- spe;
- return(spelog);
- }
-
-`specificites.probabilities` <-
- function(lexicaltable, types=NULL, parts=NULL) {
-
- # if (!is.numeric(lexicaltable)) stop("The lexical table must contain numeric values.");
-
- rowMargin <- rowSums(lexicaltable); # or "F" (the total frequency of all the types).
- colMargin <- colSums(lexicaltable); # or "T" (the size of the parts).
- F <- sum(lexicaltable); # The grand total (number of tokens in the corpus).
-
- if (! is.null(types)) { # Filter on tokens to be considered.
- if(is.character(types)) { # convert the name of types given with "types" into row index numbers.
- if (is.null(rownames(lexicaltable))) stop("The lexical table has no row names and the \"types\" argument is a character vector.");
- if (! all(types %in% rownames(lexicaltable))) stop(paste(
- "Some requested types are not known in the lexical table: ",
- paste(types[! (types %in% rownames(lexicaltable))], collapse=" "))
- );
- } else {
- if (any(types < 1)) stop("The row index must be greater than 0.");
- if (max(types) > nrow(lexicaltable)) stop("Row index must be smaller than the number of rows.");
- }
- lexicaltable <- lexicaltable[types,, drop = FALSE];
- rowMargin <- rowMargin[types];
- }
-
- if (! is.null(parts)) { # Filter on parts to be considered.
- if(is.character(parts)) { # convert the name of parts given with "parts" into col index numbers.
- if (is.null(colnames(lexicaltable))) stop("The lexical table has no col names and the \"parts\" argument is a character vector.");
- if (! all(parts %in% colnames(lexicaltable))) stop(paste(
- "Some requested parts are not known in the lexical table: ",
- paste(parts[! (parts %in% colnames(lexicaltable))], collapse=" "))
- );
- } else {
- if (max(parts) > ncol(lexicaltable)) stop("Column index must be smaller than the number of cols.");
- if (any(parts < 1)) stop("The col index must be greater than 0.");
- }
- lexicaltable <- lexicaltable[,parts, drop=FALSE];
- colMargin <- colMargin[parts];
- }
-
- if (nrow(lexicaltable) == 0 | ncol(lexicaltable) == 0) {
- stop("The lexical table must contains at least one row and one column.");
- }
-
- specif <- matrix(0.0, nrow=nrow(lexicaltable), ncol=ncol(lexicaltable));
-
- for(i in 1:ncol(lexicaltable)) { # We proceed the whole lexical table by column (i.e. by part).
-
- whiteDrawn <- lexicaltable[,i]; # The frequencies observed in this part for each type.
- white <- rowMargin; # The total frequencies in the corpus for each type.
- black <- F-white; # The total complement frequency in the corpus for each type.
- drawn <- colMargin[i]; # The number of tokens in the part.
-
-
- independance <- (white * drawn) / F; # The theoretic frequency of each type.
- specif_negative <- whiteDrawn < independance; # index of observed frequencies below the theoretic frequencies.
- specif_positive <- whiteDrawn >= independance; # index of observed frequencies above the theoretic frequencies.
-
- specif[specif_negative,i] <- -phyper (
- whiteDrawn[specif_negative], white[specif_negative], black[specif_negative], drawn
- );
-
- specif[specif_positive,i] <- phyper_bis (
- whiteDrawn[specif_positive], white[specif_positive], black[specif_positive], drawn
- );
- }
-
- colnames(specif) <- colnames(lexicaltable);
- rownames(specif) <- rownames(lexicaltable);
-
- return(specif);
- }
-
-`specificites.lexicon` <-
- function(lexicon, sublexicon) {
- spe <- specificites.lexicon.probabilities(lexicon, sublexicon);
- spelog <- matrix(0, nrow=nrow(spe), ncol=ncol(spe));
- spelog[spe < 0] <- log10(-spe[spe < 0]);
- spelog[spe > 0] <- abs(log10(spe[spe >=0]));
- spelog[spe == 0] <- 0;
- spelog[is.infinite(spe)] <- 0;
- spelog <- round(spelog, digits=4);
- rownames(spelog) <- rownames(spe);
- colnames(spelog) <- colnames(spe);
- #class(spelog) <- "specificites";
- #attr(spelog, "l.t") <- spe;
- return(spelog);
- }
-
-`lexiconsToLexicalTable` <- function(lexicon, sublexicon) {
- if (! all(names(sublexicon) %in% names(lexicon)))
- stop(paste(
- sum(! (names(sublexicon) %in% names(lexicon))),
- "types of the sublexicon not found in the lexicon: ",
- )
- );
-
- sub <- numeric(length(lexicon));
- names(sub) <- names(lexicon);
- sub[names(sublexicon)] <- sublexicon;
-
- complementary.lexicon <- c(lexicon- sub);
- if (any(complementary.lexicon < 0))
- stop("type cannot be more frequent in the sublexicon than in the lexicon");
-
- lexicaltable <- matrix(c(sub, complementary.lexicon), ncol=2);
- rownames(lexicaltable) <- names(lexicon);
- colnames(lexicaltable) <- c("sublexicon", "complementary");
- return(lexicaltable)
-}
-
-`specificites.lexicon.new` <- function(lexicon, sublexicon) {
- lexicaltable <- lexiconsToLexicalTable(lexicon, sublexicon);
- return(specificites(lexicaltable,NULL,NULL));
-}
-
-`specificites.lexicon.probabilities` <-
- function(lexicon, sublexicon) {
-
- if (!is.numeric(lexicon)) stop("The lexicon must contain numeric values.");
- if (!is.numeric(sublexicon)) stop("The sublexicon must contain numeric values.");
- if (is.null(names(lexicon))) stop("The lexicon must contain names.");
- if (is.null(names(sublexicon))) stop("The sub lexicon must contain names.");
-
- if (! all(names(sublexicon) %in% names(lexicon)))
- stop(
- paste(
- "Some requested types of the sublexicon are not known in the lexicon: ",
- paste(names(sublexicon)[! (names(sublexicon) %in% names(lexicon))], collapse=" ")
- )
- );
-
- F <- sum(lexicon);
- f <- sum(sublexicon);
-
- # complementary.lexicon <- c(lexicon[names(sublexicon)] - sublexicon, lexicon[!names(lexicon) %in% names(sublexicon)]);
-
- if (F < f) {
- stop("The lexicon cannot be smaller than the sublexicon");
- }
-
- whiteDrawn <- numeric(length(lexicon)); # The frequencies observed in this part for each type.
- names(whiteDrawn) <- names(lexicon);
- whiteDrawn[names(sublexicon)] <- sublexicon;
- white <- lexicon; # The total frequencies in the corpus for each type.
- black <- F-white; # The total complement frequency in the corpus for each type.
- drawn <- f; # The number of tokens in the part.
-
- # print(whiteDrawn);
- # print(white);
- # print(black);
- # print(drawn);
-
- independance <- (white * drawn) / F; # The theoretic frequency of each type.
-
- specif_negative <- whiteDrawn < independance; # index of observed frequencies below the theoretic frequencies.
- specif_positive <- whiteDrawn >= independance; # index of observed frequencies above the theoretic frequencies.
-
- specif <- double(length(lexicon));
-
- specif[specif_negative] <- phyper (
- whiteDrawn[specif_negative], white[specif_negative], black[specif_negative], drawn
- );
-
- specif[specif_positive] <- phyper_bis (
- whiteDrawn[specif_positive], white[specif_positive], black[specif_positive], drawn
- );
-
- names(specif) <- names(lexicon);
-
- return(specif);
- }
-
-`SpecifTopN` <- function(Specif, N=10, file=NULL) {
- symbol = Specif
- top <- c()
- cols <- colnames(symbol)
-
- for (i in 1:length(cols)) {
- sorted <- sort(symbol[, cols[i]], decreasing=TRUE, index.return= TRUE)$x
- top <- union(top, names(sorted[1:N]))
- top <- union(top, names(sorted[length(sorted) -N: length(sorted)]))
- }
-
- symbol <- symbol[top,]
- if (file != NULL) {
- write.table(symbol, file)
- }
-}
-
-#print.specificites(x, line=20, part=1, form=NULL, ...) {
-# if (all(is.null(line, part))) {
-# stop("either a line or a part must be specified");
-# }
-# if (all(!is.null(line, part))) {
-# stop("only a line or a part must be specified");
-# }
-#}
+++ /dev/null
-[ALCESTE]\r
-#nombre minimum d'unités par classe, 0=calcul automatique
-mincl = 0
-#nombre de formes par uce, 0=calcul automatique
-nbforme_uce = 0\r
-#nombre maximum de formes actives
-max_actives = 3000
-#lemmatisation
-lem = True\r
-#taille du premier tableau uc/forme
-tailleuc1 = 12\r
-#nombre de classes terminales de l'analyse, NE PAS EDITER
-nbcl = 4\r
-#taille du second tableau uc/forme
-tailleuc2 = 14\r
-#mode de classification, 0=double sur UC, 1=simple sur UCE, 2=simple sur UCI
-classif_mode = 1
-#?????
-minforme = 2
-#utilisation du dictionnaire des expressions
-expressions = True
-#nbre de classe terminale de la phase 1
-nbcl_p1 = 10
-#methode pour svd
-svdmethod = irlba
-#mode patate (+ rapide et - precis)
-mode.patate = 0
-\r
-[IMAGE]\r
-#non utilise
-heigth = 400\r
-width = 400\r
-reso = 200\r
-x = 1\r
-y = 2\r
-seuilkhi = 2