iramuteq.org Git - iramuteq/blob - Rlib/textometrieR/R/specificites.R

   1 #* Copyright © - 2008-2013 ANR Textométrie - http://textometrie.ens-lyon.fr
   2 #*
   3 #* This file is part of the TXM platform.
   4 #*
   5 #* The TXM platform is free software: you can redistribute it and/or modif y
   6 #* it under the terms of the GNU General Public License as published by
   7 #* the Free Software Foundation, either version 3 of the License, or
   8 #* (at your option) any later version.
   9 #*
  10 #* The TXM platform is distributed in the hope that it will be useful,
  11 #* but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13 #* General Public License for more details.
  14 #*
  15 #* You should have received a copy of the GNU General Public License
  16 #* along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
  17
  18 ## Sylvain Loiseau <sloiseau@u-paris10.fr>
  19 ## Lise Vaudor <lise.vaudor@ens-lyon.fr>
  20
  21 phyper_bis=function(v_a,v_b,v_c,v_d){
  22   v_s2=rep(NA,length(v_a))
  23   for (j in 1:length(v_a)){
  24       a=v_a[j]
  25       b=v_b[j]
  26       c=v_c[j]
  27       d=v_d
  28       a_tmp=a#+1
  29       s1=dhyper(a_tmp,b,c,d)
  30       s_tmp=dhyper(a_tmp+1,b,c,d)
  31       s2=s1+s_tmp
  32       a_tmp=a_tmp+1
  33       while(log(s2)!=log(s1)){
  34         s1=s2
  35         a_tmp=a_tmp+1
  36         s_tmp=dhyper(a_tmp,b,c,d)
  37         s2=s1+s_tmp
  38       }
  39       v_s2[j]=s2
  40   }
  41   return(v_s2)
  42 }
  43
  44 `specificites` <-
  45   function(lexicaltable, types=NULL, parts=NULL) {
  46     spe <- specificites.probabilities(lexicaltable, types, parts);
  47     spelog <- matrix(0, nrow=nrow(spe), ncol=ncol(spe));
  48     spelog[spe < 0] <- log10(-spe[spe < 0]);
  49     spelog[spe > 0] <- abs(log10(spe[spe >0]));
  50     spelog[spe == 0] <- 0;
  51     spelog[is.infinite(spe)] <- 0;
  52     spelog <- round(spelog, digits=4);
  53     rownames(spelog) <- rownames(spe);
  54     colnames(spelog) <- colnames(spe);
  55     #class(spelog) <- "specificites";
  56     #attr(spelog, "l.t") <- spe;
  57     return(spelog);
  58   }
  59
  60 `specificites.probabilities` <-
  61   function(lexicaltable, types=NULL, parts=NULL) {
  62
  63     # if (!is.numeric(lexicaltable)) stop("The lexical table must contain numeric values.");
  64
  65     rowMargin <- rowSums(lexicaltable); # or "F" (the total frequency of all the types).
  66     colMargin <- colSums(lexicaltable); # or "T" (the size of the parts).
  67     F <- sum(lexicaltable);             # The grand total (number of tokens in the corpus).
  68
  69     if (! is.null(types)) {      # Filter on tokens to be considered.
  70       if(is.character(types)) {  # convert the name of types given with "types" into row index numbers.
  71         if (is.null(rownames(lexicaltable))) stop("The lexical table has no row names and the \"types\" argument is a character vector.");
  72         if (! all(types %in% rownames(lexicaltable))) stop(paste(
  73           "Some requested types are not known in the lexical table: ",
  74           paste(types[! (types %in% rownames(lexicaltable))], collapse=" "))
  75         );
  76       } else {
  77         if (any(types < 1)) stop("The row index must be greater than 0.");
  78         if (max(types) > nrow(lexicaltable)) stop("Row index must be smaller than the number of rows.");
  79       }
  80       lexicaltable <- lexicaltable[types,, drop = FALSE];
  81       rowMargin <- rowMargin[types];
  82     }
  83
  84     if (! is.null(parts)) {      # Filter on parts to be considered.
  85       if(is.character(parts)) {  # convert the name of parts given with "parts" into col index numbers.
  86         if (is.null(colnames(lexicaltable))) stop("The lexical table has no col names and the \"parts\" argument is a character vector.");
  87         if (! all(parts %in% colnames(lexicaltable))) stop(paste(
  88           "Some requested parts are not known in the lexical table: ",
  89           paste(parts[! (parts %in% colnames(lexicaltable))], collapse=" "))
  90         );
  91       } else {
  92         if (max(parts) > ncol(lexicaltable)) stop("Column index must be smaller than the number of cols.");
  93         if (any(parts < 1)) stop("The col index must be greater than 0.");
  94       }
  95       lexicaltable <- lexicaltable[,parts, drop=FALSE];
  96       colMargin <- colMargin[parts];
  97     }
  98
  99     if (nrow(lexicaltable) == 0 | ncol(lexicaltable) == 0) {
 100       stop("The lexical table must contains at least one row and one column.");
 101     }
 102
 103     specif <- matrix(0.0, nrow=nrow(lexicaltable), ncol=ncol(lexicaltable));
 104
 105     for(i in 1:ncol(lexicaltable)) {    # We proceed the whole lexical table by column (i.e. by part).
 106
 107       whiteDrawn <- lexicaltable[,i];  # The frequencies observed in this part for each type.
 108       white <- rowMargin;     # The total frequencies in the corpus for each type.
 109       black <- F-white;       # The total complement frequency in the corpus for each type.
 110       drawn <- colMargin[i];  # The number of tokens in the part.
 111
 112
 113       independance    <- (white * drawn) / F;         # The theoretic frequency of each type.
 114       specif_negative <- whiteDrawn <  independance;  # index of observed frequencies below the theoretic frequencies.
 115       specif_positive <- whiteDrawn >= independance;  # index of observed frequencies above the theoretic frequencies.
 116
 117       specif[specif_negative,i] <- -phyper (
 118         whiteDrawn[specif_negative], white[specif_negative], black[specif_negative], drawn
 119       );
 120
 121       specif[specif_positive,i] <- phyper_bis (
 122         whiteDrawn[specif_positive], white[specif_positive], black[specif_positive], drawn
 123       );
 124     }
 125
 126     colnames(specif) <- colnames(lexicaltable);
 127     rownames(specif) <- rownames(lexicaltable);
 128
 129     return(specif);
 130   }
 131
 132 `specificites.lexicon` <-
 133   function(lexicon, sublexicon) {
 134     spe <- specificites.lexicon.probabilities(lexicon, sublexicon);
 135     spelog <- matrix(0, nrow=nrow(spe), ncol=ncol(spe));
 136     spelog[spe < 0] <- log10(-spe[spe < 0]);
 137     spelog[spe > 0] <- abs(log10(spe[spe >=0]));
 138     spelog[spe == 0] <- 0;
 139     spelog[is.infinite(spe)] <- 0;
 140     spelog <- round(spelog, digits=4);
 141     rownames(spelog) <- rownames(spe);
 142     colnames(spelog) <- colnames(spe);
 143     #class(spelog) <- "specificites";
 144     #attr(spelog, "l.t") <- spe;
 145     return(spelog);
 146   }
 147
 148 `lexiconsToLexicalTable` <- function(lexicon, sublexicon) {
 149         if (! all(names(sublexicon) %in% names(lexicon)))
 150         stop(paste(
 151           sum(! (names(sublexicon) %in% names(lexicon))),
 152           "types of the sublexicon not found in the lexicon: ",
 153           )
 154       );
 155
 156         sub <- numeric(length(lexicon));
 157         names(sub) <- names(lexicon);
 158         sub[names(sublexicon)] <- sublexicon;
 159
 160         complementary.lexicon <- c(lexicon- sub);
 161         if (any(complementary.lexicon < 0))
 162                 stop("type cannot be more frequent in the sublexicon than in the lexicon");
 163
 164         lexicaltable <- matrix(c(sub, complementary.lexicon), ncol=2);
 165         rownames(lexicaltable) <- names(lexicon);
 166         colnames(lexicaltable) <- c("sublexicon", "complementary");
 167         return(lexicaltable)
 168 }
 169
 170 `specificites.lexicon.new` <- function(lexicon, sublexicon) {
 171   lexicaltable <- lexiconsToLexicalTable(lexicon, sublexicon);
 172   return(specificites(lexicaltable,NULL,NULL));
 173 }
 174
 175 `specificites.lexicon.probabilities` <-
 176   function(lexicon, sublexicon) {
 177
 178     if (!is.numeric(lexicon)) stop("The lexicon must contain numeric values.");
 179     if (!is.numeric(sublexicon)) stop("The sublexicon must contain numeric values.");
 180     if (is.null(names(lexicon))) stop("The lexicon must contain names.");
 181     if (is.null(names(sublexicon))) stop("The sub lexicon must contain names.");
 182
 183     if (! all(names(sublexicon) %in% names(lexicon)))
 184       stop(
 185         paste(
 186           "Some requested types of the sublexicon are not known in the lexicon: ",
 187           paste(names(sublexicon)[! (names(sublexicon) %in% names(lexicon))], collapse=" ")
 188         )
 189       );
 190
 191     F <- sum(lexicon);
 192     f <- sum(sublexicon);
 193
 194     # complementary.lexicon <- c(lexicon[names(sublexicon)] - sublexicon, lexicon[!names(lexicon) %in% names(sublexicon)]);
 195
 196     if (F < f) {
 197       stop("The lexicon cannot be smaller than the sublexicon");
 198     }
 199
 200     whiteDrawn <- numeric(length(lexicon)); # The frequencies observed in this part for each type.
 201     names(whiteDrawn) <- names(lexicon);
 202     whiteDrawn[names(sublexicon)] <- sublexicon;
 203     white <- lexicon; # The total frequencies in the corpus for each type.
 204     black <- F-white;          # The total complement frequency in the corpus for each type.
 205     drawn <- f;     # The number of tokens in the part.
 206
 207     # print(whiteDrawn);
 208     # print(white);
 209     # print(black);
 210     # print(drawn);
 211
 212     independance    <- (white * drawn) / F;         # The theoretic frequency of each type.
 213
 214     specif_negative <- whiteDrawn <  independance;  # index of observed frequencies below the theoretic frequencies.
 215     specif_positive <- whiteDrawn >= independance;  # index of observed frequencies above the theoretic frequencies.
 216
 217     specif <- double(length(lexicon));
 218
 219     specif[specif_negative] <-     phyper (
 220       whiteDrawn[specif_negative],     white[specif_negative], black[specif_negative], drawn
 221     );
 222
 223     specif[specif_positive] <- phyper_bis (
 224       whiteDrawn[specif_positive], white[specif_positive], black[specif_positive], drawn
 225     );
 226
 227     names(specif) <- names(lexicon);
 228
 229     return(specif);
 230   }
 231
 232 `SpecifTopN` <- function(Specif, N=10, file=NULL) {
 233         symbol = Specif
 234         top <- c()
 235         cols <- colnames(symbol)
 236
 237         for (i in 1:length(cols)) {
 238                 sorted <- sort(symbol[, cols[i]], decreasing=TRUE, index.return= TRUE)$x
 239                 top <- union(top, names(sorted[1:N]))
 240                 top <- union(top, names(sorted[length(sorted) -N: length(sorted)]))
 241         }
 242
 243         symbol <- symbol[top,]
 244         if (file != NULL) {
 245                 write.table(symbol, file)
 246         }
 247 }
 248
 249 #print.specificites(x, line=20, part=1, form=NULL, ...) {
 250 #  if (all(is.null(line, part))) {
 251 #    stop("either a line or a part must be specified");
 252 #  }
 253 #  if (all(!is.null(line, part))) {
 254 #    stop("only a line or a part must be specified");
 255 #  }
 256 #}