X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=Rlib%2FtextometrieR%2FR%2Fspecificites.R;fp=Rlib%2FtextometrieR%2FR%2Fspecificites.R;h=7fbd2baee92ea2819be0808e78efcb22a80b9cc6;hp=0426549598f131beb6d54912ef58036bf90a55a1;hb=2bd16d105beb32d6706f03e4ea717e4c7af43d03;hpb=ff602f724b8fc80f4fa067dbbc830343c0b4bed4

diff --git a/Rlib/textometrieR/R/specificites.R b/Rlib/textometrieR/R/specificites.R
index 0426549..7fbd2ba 100644
--- a/Rlib/textometrieR/R/specificites.R
+++ b/Rlib/textometrieR/R/specificites.R
@@ -1,194 +1,249 @@
-#* Textometrie
-#* ANR project ANR-06-CORP-029
-#* http://textometrie.ens-lsh.fr/
-#*
-#* 2008 (C) Textometrie project
-#* BSD New and Simplified BSD licenses
-#* http://www.opensource.org/licenses/bsd-license.php
+#* Copyright © - 2008-2013 ANR Textométrie - http://textometrie.ens-lyon.fr
+#*
+#* This file is part of the TXM platform.
+#*
+#* The TXM platform is free software: you can redistribute it and/or modify
+#* it under the terms of the GNU General Public License as published by
+#* the Free Software Foundation, either version 3 of the License, or
+#* (at your option) any later version.
+#*
+#* The TXM platform is distributed in the hope that it will be useful,
+#* but WITHOUT ANY WARRANTY; without even the implied warranty of
+#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+#* General Public License for more details.
+#*
+#* You should have received a copy of the GNU General Public License
+#* along with the TXM platform. If not, see <http://www.gnu.org/licenses/>.
 ## Sylvain Loiseau
+## Lise Vaudor
+
+phyper_bis=function(v_a,v_b,v_c,v_d){
+  v_s2=rep(NA,length(v_a))
+  for (j in 1:length(v_a)){
+    a=v_a[j]
+    b=v_b[j]
+    c=v_c[j]
+    d=v_d
+    a_tmp=a#+1
+    s1=dhyper(a_tmp,b,c,d)
+    s_tmp=dhyper(a_tmp+1,b,c,d)
+    s2=s1+s_tmp
+    a_tmp=a_tmp+1
+    while(log(s2)!=log(s1)){
+      s1=s2
+      a_tmp=a_tmp+1
+      s_tmp=dhyper(a_tmp,b,c,d)
+      s2=s1+s_tmp
+    }
+    v_s2[j]=s2
+  }
+  return(v_s2)
+}
 
 `specificites` <-
-function(lexicaltable, types=NULL, parts=NULL) {
-    spe <- specificites.probabilities(lexicaltable, types, parts);
-    spelog <- matrix(0, nrow=nrow(spe), ncol=ncol(spe));
-    spelog[spe < 0.5] <- log10(spe[spe < 0.5]);
-    spelog[spe > 0.5] <- abs(log10(1 - spe[spe > 0.5]));
-    spelog[spe == 0.5] <- 0;
-    spelog[is.infinite(spe)] <- 0;
-    spelog <- round(spelog, digits=4);
-    rownames(spelog) <- rownames(spe);
-    colnames(spelog) <- colnames(spe);
-    #class(spelog) <- "specificites";
-    #attr(spelog, "l.t") <- spe;
-    return(spelog);
-}
+  function(lexicaltable, types=NULL, parts=NULL) {
+    spe <- specificites.probabilities(lexicaltable, types, parts);
+    spelog <- matrix(0, nrow=nrow(spe), ncol=ncol(spe));
+    spelog[spe < 0] <- log10(-spe[spe < 0]);
+    spelog[spe > 0] <- abs(log10(spe[spe >0]));
+    spelog[spe == 0] <- 0;
+    spelog[is.infinite(spe)] <- 0;
+    spelog <- round(spelog, digits=4);
+    rownames(spelog) <- rownames(spe);
+    colnames(spelog) <- colnames(spe);
+    #class(spelog) <- "specificites";
+    #attr(spelog, "l.t") <- spe;
+    return(spelog);
+  }
 
 `specificites.probabilities` <-
-function(lexicaltable, types=NULL, parts=NULL) {
-
-    # if (!is.numeric(lexicaltable)) stop("The lexical table must contain numeric values.");
-
-    rowMargin <- rowSums(lexicaltable); # or "F" (the total frequency of all the types).
-    colMargin <- colSums(lexicaltable); # or "T" (the size of the parts).
-    F <- sum(lexicaltable); # The grand total (number of tokens in the corpus).
-
-    if (! is.null(types)) {       # Filter on tokens to be considered.
-        if(is.character(types)) {   # convert the name of types given with "types" into row index numbers.
-            if (is.null(rownames(lexicaltable))) stop("The lexical table has no row names and the \"types\" argument is a character vector.");
-            if (! all(types %in% rownames(lexicaltable))) stop(paste(
-                "Some requested types are not known in the lexical table: ",
-                paste(types[! (types %in% rownames(lexicaltable))], collapse=" "))
-                );
-        } else {
-            if (any(types < 1)) stop("The row index must be greater than 0.");
-            if (max(types) > nrow(lexicaltable)) stop("Row index must be smaller than the number of rows.");
+  function(lexicaltable, types=NULL, parts=NULL) {
+
+    # if (!is.numeric(lexicaltable)) stop("The lexical table must contain numeric values.");
+
+    rowMargin <- rowSums(lexicaltable); # or "F" (the total frequency of all the types).
+    colMargin <- colSums(lexicaltable); # or "T" (the size of the parts).
+    F <- sum(lexicaltable); # The grand total (number of tokens in the corpus).
+
+    if (! is.null(types)) { # Filter on tokens to be considered.
+      if(is.character(types)) { # convert the name of types given with "types" into row index numbers.
+        if (is.null(rownames(lexicaltable))) stop("The lexical table has no row names and the \"types\" argument is a character vector.");
+        if (! all(types %in% rownames(lexicaltable))) stop(paste(
+          "Some requested types are not known in the lexical table: ",
+          paste(types[! (types %in% rownames(lexicaltable))], collapse=" "))
+        );
+      } else {
+        if (any(types < 1)) stop("The row index must be greater than 0.");
+        if (max(types) > nrow(lexicaltable)) stop("Row index must be smaller than the number of rows.");
+      }
+      lexicaltable <- lexicaltable[types,, drop = FALSE];
+      rowMargin <- rowMargin[types];
+    }
-        lexicaltable <- lexicaltable[types,, drop = FALSE];
-        rowMargin <- rowMargin[types];
-    }
-
-    if (! is.null(parts)) {       # Filter on parts to be considered.
-        if(is.character(parts)) {   # convert the name of parts given with "parts" into col index numbers.
-            if (is.null(colnames(lexicaltable))) stop("The lexical table has no col names and the \"parts\" argument is a character vector.");
-            if (! all(parts %in% colnames(lexicaltable))) stop(paste(
-                "Some requested parts are not known in the lexical table: ",
-                paste(parts[! (parts %in% colnames(lexicaltable))], collapse=" "))
-                );
-        } else {
-            if (max(parts) > ncol(lexicaltable)) stop("Column index must be smaller than the number of cols.");
-            if (any(parts < 1)) stop("The col index must be greater than 0.");
+
+    if (! is.null(parts)) { # Filter on parts to be considered.
+      if(is.character(parts)) { # convert the name of parts given with "parts" into col index numbers.
+        if (is.null(colnames(lexicaltable))) stop("The lexical table has no col names and the \"parts\" argument is a character vector.");
+        if (! all(parts %in% colnames(lexicaltable))) stop(paste(
+          "Some requested parts are not known in the lexical table: ",
+          paste(parts[! (parts %in% colnames(lexicaltable))], collapse=" "))
+        );
+      } else {
+        if (max(parts) > ncol(lexicaltable)) stop("Column index must be smaller than the number of cols.");
+        if (any(parts < 1)) stop("The col index must be greater than 0.");
+      }
+      lexicaltable <- lexicaltable[,parts, drop=FALSE];
+      colMargin <- colMargin[parts];
+    }
+
+    if (nrow(lexicaltable) == 0 | ncol(lexicaltable) == 0) {
+      stop("The lexical table must contains at least one row and one column.");
+    }
+
+    specif <- matrix(0.0, nrow=nrow(lexicaltable), ncol=ncol(lexicaltable));
+
+    for(i in 1:ncol(lexicaltable)) { # We proceed the whole lexical table by column (i.e. by part).
+
+      whiteDrawn <- lexicaltable[,i]; # The frequencies observed in this part for each type.
+      white <- rowMargin; # The total frequencies in the corpus for each type.
+      black <- F-white; # The total complement frequency in the corpus for each type.
+      drawn <- colMargin[i]; # The number of tokens in the part.
+
+
+      independance <- (white * drawn) / F; # The theoretic frequency of each type.
+      specif_negative <- whiteDrawn < independance; # index of observed frequencies below the theoretic frequencies.
+      specif_positive <- whiteDrawn >= independance; # index of observed frequencies above the theoretic frequencies.
+
+      specif[specif_negative,i] <- -phyper (
+        whiteDrawn[specif_negative], white[specif_negative], black[specif_negative], drawn
+      );
+
+      specif[specif_positive,i] <- phyper_bis (
+        whiteDrawn[specif_positive], white[specif_positive], black[specif_positive], drawn
+      );
    }
-        lexicaltable <- lexicaltable[,parts, drop=FALSE];
-        colMargin <- colMargin[parts];
-    }
-
-    if (nrow(lexicaltable) == 0 | ncol(lexicaltable) == 0) {
-        stop("The lexical table must contains at least one row and one column.");
-    }
-
-    specif <- matrix(0.0, nrow=nrow(lexicaltable), ncol=ncol(lexicaltable));
-
-    for(i in 1:ncol(lexicaltable)) { # We proceed the whole lexical table by column (i.e. by part).
-
-        whiteDrawn <- lexicaltable[,i]; # The frequencies observed in this part for each type.
-        white <- rowMargin; # The total frequencies in the corpus for each type.
-        black <- F-white; # The total complement frequency in the corpus for each type.
-        drawn <- colMargin[i]; # The number of tokens in the part.
-
-        independance <- (white * drawn) / F; # The theoretic frequency of each type.
-        specif_negative <- whiteDrawn < independance; # index of observed frequencies below the theoretic frequencies.
-        specif_positive <- whiteDrawn >= independance; # index of observed frequencies above the theoretic frequencies.
-
-        specif[specif_negative,i] <- phyper (
-            whiteDrawn[specif_negative], white[specif_negative], black[specif_negative], drawn
-        );
-        specif[specif_positive,i] <- phyper (
-            whiteDrawn[specif_positive] - 1, white[specif_positive], black[specif_positive], drawn
-        );
+    colnames(specif) <- colnames(lexicaltable);
+    rownames(specif) <- rownames(lexicaltable);
+
+    return(specif);
  }
-    dimnames(specif) <- dimnames(lexicaltable);
-
-    return(specif);
-}
-
 `specificites.lexicon` <-
-function(lexicon, sublexicon) {
-    spe <- specificites.lexicon.probabilities(lexicon, sublexicon);
-    spelog <- matrix(0, nrow=nrow(spe), ncol=ncol(spe));
-    spelog[spe < 0.5] <- log10(spe[spe < 0.5]);
-    spelog[spe > 0.5] <- abs(log10(1 - spe[spe > 0.5]));
-    spelog[spe == 0.5] <- 0;
-    spelog[is.infinite(spe)] <- 0;
-    spelog <- round(spelog, digits=4);
-    rownames(spelog) <- rownames(spe);
-    colnames(spelog) <- colnames(spe);
-    #class(spelog) <- "specificites";
-    #attr(spelog, "l.t") <- spe;
-    return(spelog);
-}
-
-`specificites.lexicon.new` <-
-function(lexicon, sublexicon) {
+  function(lexicon, sublexicon) {
+    spe <- specificites.lexicon.probabilities(lexicon, sublexicon);
+    spelog <- matrix(0, nrow=nrow(spe), ncol=ncol(spe));
+    spelog[spe < 0] <- log10(-spe[spe < 0]);
+    spelog[spe > 0] <- abs(log10(spe[spe >=0]));
+    spelog[spe == 0] <- 0;
+    spelog[is.infinite(spe)] <- 0;
+    spelog <- round(spelog, digits=4);
+    rownames(spelog) <- rownames(spe);
+    colnames(spelog) <- colnames(spe);
+    #class(spelog) <- "specificites";
+    #attr(spelog, "l.t") <- spe;
+    return(spelog);
+  }
 
-    if (! all(names(sublexicon) %in% names(lexicon))) stop(
-      paste(
+`lexiconsToLexicalTable` <- function(lexicon, sublexicon) {
+    if (! all(names(sublexicon) %in% names(lexicon)))
+        stop(paste(
            sum(! (names(sublexicon) %in% names(lexicon))),
            "types of the sublexicon not found in the lexicon: ",
            )
        );
-sub <- numeric(length(lexicon));
-names(sub) <- names(lexicon);
-sub[names(sublexicon)] <- sublexicon;
+    sub <- numeric(length(lexicon));
+    names(sub) <- names(lexicon);
+    sub[names(sublexicon)] <- sublexicon;
 
-    complementary.lexicon <- c(lexicon- sub);
-    if (any(complementary.lexicon < 0)) stop("type cannot be more frequent in the sublexicon than in the lexicon");
+    complementary.lexicon <- c(lexicon- sub);
+    if (any(complementary.lexicon < 0))
+        stop("type cannot be more frequent in the sublexicon than in the lexicon");
+
+    lexicaltable <- matrix(c(sub, complementary.lexicon), ncol=2);
+    rownames(lexicaltable) <- names(lexicon);
+    colnames(lexicaltable) <- c("sublexicon", "complementary");
+    return(lexicaltable)
+}
 
-    lexicaltable <- matrix(c(sub, complementary.lexicon), ncol=2);
-    rownames(lexicaltable) <- names(lexicon);
-    colnames(lexicaltable) <- c("sublexicon", "complementary");
-    #stop(colnames(lexicaltable));
+`specificites.lexicon.new` <- function(lexicon, sublexicon) {
+    lexicaltable <- lexiconsToLexicalTable(lexicon, sublexicon);
     return(specificites(lexicaltable,NULL,NULL));
 }
 
 `specificites.lexicon.probabilities` <-
-function(lexicon, sublexicon) {
-
-    if (!is.numeric(lexicon)) stop("The lexicon must contain numeric values.");
-    if (!is.numeric(sublexicon)) stop("The sublexicon must contain numeric values.");
-    if (is.null(names(lexicon))) stop("The lexicon must contain names.");
-    if (is.null(names(sublexicon))) stop("The sub lexicon must contain names.");
-
-    if (! all(names(sublexicon) %in% names(lexicon)))
-      stop(
+  function(lexicon, sublexicon) {
+
+    if (!is.numeric(lexicon)) stop("The lexicon must contain numeric values.");
+    if (!is.numeric(sublexicon)) stop("The sublexicon must contain numeric values.");
+    if (is.null(names(lexicon))) stop("The lexicon must contain names.");
+    if (is.null(names(sublexicon))) stop("The sub lexicon must contain names.");
+
+    if (! all(names(sublexicon) %in% names(lexicon)))
+      stop(
        paste(
          "Some requested types of the sublexicon are not known in the lexicon: ",
          paste(names(sublexicon)[! (names(sublexicon) %in% names(lexicon))], collapse=" ")
-          )
-        );
-
-    F <- sum(lexicon);
-    f <- sum(sublexicon);
-
-    # complementary.lexicon <- c(lexicon[names(sublexicon)] - sublexicon, lexicon[!names(lexicon) %in% names(sublexicon)]);
-
-    if (F < f) {
-        stop("The lexicon cannot be smaller than the sublexicon");
-    }
-
-    whiteDrawn <- numeric(length(lexicon)); # The frequencies observed in this part for each type.
-    names(whiteDrawn) <- names(lexicon);
-    whiteDrawn[names(sublexicon)] <- sublexicon;
-    white <- lexicon; # The total frequencies in the corpus for each type.
-    black <- F-white; # The total complement frequency in the corpus for each type.
-    drawn <- f; # The number of tokens in the part.
-
-    # print(whiteDrawn);
-    # print(white);
-    # print(black);
-    # print(drawn);
-
-    independance <- (white * drawn) / F; # The theoretic frequency of each type.
-
-    specif_negative <- whiteDrawn < independance; # index of observed frequencies below the theoretic frequencies.
-    specif_positive <- whiteDrawn >= independance; # index of observed frequencies above the theoretic frequencies.
-
-    specif <- double(length(lexicon));
-
-    specif[specif_negative] <- phyper (
+      )
+    );
+
+    F <- sum(lexicon);
+    f <- sum(sublexicon);
+
+    # complementary.lexicon <- c(lexicon[names(sublexicon)] - sublexicon, lexicon[!names(lexicon) %in% names(sublexicon)]);
+
+    if (F < f) {
+      stop("The lexicon cannot be smaller than the sublexicon");
+    }
+
+    whiteDrawn <- numeric(length(lexicon)); # The frequencies observed in this part for each type.
+    names(whiteDrawn) <- names(lexicon);
+    whiteDrawn[names(sublexicon)] <- sublexicon;
+    white <- lexicon; # The total frequencies in the corpus for each type.
+    black <- F-white; # The total complement frequency in the corpus for each type.
+    drawn <- f; # The number of tokens in the part.
+
+    # print(whiteDrawn);
+    # print(white);
+    # print(black);
+    # print(drawn);
+
+    independance <- (white * drawn) / F; # The theoretic frequency of each type.
+
+    specif_negative <- whiteDrawn < independance; # index of observed frequencies below the theoretic frequencies.
+    specif_positive <- whiteDrawn >= independance; # index of observed frequencies above the theoretic frequencies.
+
+    specif <- double(length(lexicon));
+
+    specif[specif_negative] <- phyper (
        whiteDrawn[specif_negative], white[specif_negative], black[specif_negative], drawn
-      );
-
-    specif[specif_positive] <- phyper (
-      whiteDrawn[specif_positive] - 1, white[specif_positive], black[specif_positive], drawn
-      );
-
-    names(specif) <- names(lexicon);
+    );
+
+    specif[specif_positive] <- phyper_bis (
+      whiteDrawn[specif_positive], white[specif_positive], black[specif_positive], drawn
+    );
+
+    names(specif) <- names(lexicon);
+
+    return(specif);
+  }
-    return(specif);
+`SpecifTopN` <- function(Specif, N=10, file=NULL) {
+    symbol = Specif
+    top <- c()
+    cols <- colnames(symbol)
+
+    for (i in 1:length(cols)) {
+        sorted <- sort(symbol[, cols[i]], decreasing=TRUE, index.return= TRUE)$x
+        top <- union(top, names(sorted[1:N]))
+        top <- union(top, names(sorted[length(sorted) -N: length(sorted)]))
+    }
+
+    symbol <- symbol[top,]
+    if (file != NULL) {
+        write.table(symbol, file)
+    }
 }
 
 #print.specificites(x, line=20, part=1, form=NULL, ...) {
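
What the new phyper_bis helper computes, in short: it keeps adding dhyper() terms from the observed frequency upwards until the running sum stops changing, so it approximates the upper-tail probability P(X >= a) of a hypergeometric distribution. A minimal check sketch, assuming the patched Rlib/textometrieR/R/specificites.R has been sourced; the parameter values below are arbitrary, chosen only for illustration:

# Compare phyper_bis() with the built-in upper tail of the hypergeometric.
a <- 3:8                     # observed frequencies of a type in the part
m <- rep(50, length(a))      # "white": total frequency of the type in the corpus
n <- rep(950, length(a))     # "black": the rest of the corpus
k <- 100                     # "drawn": size of the part
cbind(bis     = phyper_bis(a, m, n, k),
      builtin = phyper(a - 1, m, n, k, lower.tail = FALSE))
# The two columns should agree up to the stopping tolerance of the summation loop.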
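Usage sketch for the patched specificites() entry point, again assuming the patched file has been sourced: it takes a lexical table with one row per word type and one column per part, and returns signed, rounded log10 specificity indices (positive for types over-represented in a part, negative for under-represented ones). The table below is a made-up example; the names and counts are hypothetical.

# Toy lexical table: 3 hypothetical types counted in 3 parts.
lt <- matrix(c(10,  2,  1,
                3,  8,  4,
                1,  1, 20),
             nrow = 3, byrow = TRUE,
             dimnames = list(c("w1", "w2", "w3"), c("part1", "part2", "part3")))
specificites(lt)                 # one signed score per cell
specificites(lt, types = "w1")   # 'types' and 'parts' select rows/columns by name or index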
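The lexicon helpers reorganised by the patch apply the same logic to a corpus/sub-corpus comparison: lexiconsToLexicalTable() turns two named frequency vectors into a two-column table ("sublexicon" vs. "complementary"), and specificites.lexicon.new() runs specificites() on that table. A minimal sketch under the same assumption that the patched file is sourced; all counts are invented:

# Hypothetical corpus-wide counts and counts restricted to a sub-corpus.
lexicon    <- c(le = 120, chat = 15, dort = 30, mange = 35)
sublexicon <- c(le = 30, chat = 10, dort = 2)
lexiconsToLexicalTable(lexicon, sublexicon)    # 4 x 2 table of counts
specificites.lexicon.new(lexicon, sublexicon)  # specificity of each type in the sub-corpus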