iramuteq.org Git - iramuteq/blob - Rlib/textometrieR/R/specificites.R

   1 #* Textometrie
   2 #* ANR project ANR-06-CORP-029
   3 #* http://textometrie.ens-lsh.fr/
   4 #*
   5 #* 2008 (C) Textometrie project
   6 #* BSD New and Simplified BSD licenses
   7 #* http://www.opensource.org/licenses/bsd-license.php
   8
   9 ## Sylvain Loiseau <sloiseau@u-paris10.fr>
  10
  11 `specificites` <-
  12 function(lexicaltable, types=NULL, parts=NULL) {
  13   spe <- specificites.probabilities(lexicaltable, types, parts);
  14   spelog <- matrix(0, nrow=nrow(spe), ncol=ncol(spe));
  15   spelog[spe < 0.5] <- log10(spe[spe < 0.5]);
  16   spelog[spe > 0.5] <- abs(log10(1 - spe[spe > 0.5]));
  17   spelog[spe == 0.5] <- 0;
  18   spelog[is.infinite(spe)] <- 0;
  19   spelog <- round(spelog, digits=4);
  20   rownames(spelog) <- rownames(spe);
  21   colnames(spelog) <- colnames(spe);
  22   #class(spelog) <- "specificites";
  23   #attr(spelog, "l.t") <- spe;
  24   return(spelog);
  25 }
  26
  27 `specificites.probabilities` <-
  28 function(lexicaltable, types=NULL, parts=NULL) {
  29
  30  # if (!is.numeric(lexicaltable)) stop("The lexical table must contain numeric values.");
  31
  32   rowMargin <- rowSums(lexicaltable); # or "F" (the total frequency of all the types).
  33   colMargin <- colSums(lexicaltable); # or "T" (the size of the parts).
  34   F <- sum(lexicaltable);             # The grand total (number of tokens in the corpus).
  35
  36   if (! is.null(types)) {      # Filter on tokens to be considered.
  37     if(is.character(types)) {  # convert the name of types given with "types" into row index numbers.
  38       if (is.null(rownames(lexicaltable))) stop("The lexical table has no row names and the \"types\" argument is a character vector.");
  39       if (! all(types %in% rownames(lexicaltable))) stop(paste(
  40             "Some requested types are not known in the lexical table: ",
  41             paste(types[! (types %in% rownames(lexicaltable))], collapse=" "))
  42           );
  43     } else {
  44       if (any(types < 1)) stop("The row index must be greater than 0.");
  45       if (max(types) > nrow(lexicaltable)) stop("Row index must be smaller than the number of rows.");
  46     }
  47     lexicaltable <- lexicaltable[types,, drop = FALSE];
  48     rowMargin <- rowMargin[types];
  49   }
  50
  51   if (! is.null(parts)) {      # Filter on parts to be considered.
  52     if(is.character(parts)) {  # convert the name of parts given with "parts" into col index numbers.
  53       if (is.null(colnames(lexicaltable))) stop("The lexical table has no col names and the \"parts\" argument is a character vector.");
  54       if (! all(parts %in% colnames(lexicaltable))) stop(paste(
  55             "Some requested parts are not known in the lexical table: ",
  56             paste(parts[! (parts %in% colnames(lexicaltable))], collapse=" "))
  57          );
  58     } else {
  59       if (max(parts) > ncol(lexicaltable)) stop("Column index must be smaller than the number of cols.");
  60       if (any(parts < 1)) stop("The col index must be greater than 0.");
  61     }
  62     lexicaltable <- lexicaltable[,parts, drop=FALSE];
  63     colMargin <- colMargin[parts];
  64   }
  65
  66   if (nrow(lexicaltable) == 0 | ncol(lexicaltable) == 0) {
  67     stop("The lexical table must contains at least one row and one column.");
  68   }
  69
  70   specif <- matrix(0.0, nrow=nrow(lexicaltable), ncol=ncol(lexicaltable));
  71
  72   for(i in 1:ncol(lexicaltable)) {    # We proceed the whole lexical table by column (i.e. by part).
  73
  74      whiteDrawn <- lexicaltable[,i];  # The frequencies observed in this part for each type.
  75      white <- rowMargin;     # The total frequencies in the corpus for each type.
  76      black <- F-white;       # The total complement frequency in the corpus for each type.
  77      drawn <- colMargin[i];  # The number of tokens in the part.
  78
  79      independance    <- (white * drawn) / F;         # The theoretic frequency of each type.
  80      specif_negative <- whiteDrawn <  independance;  # index of observed frequencies below the theoretic frequencies.
  81      specif_positive <- whiteDrawn >= independance;  # index of observed frequencies above the theoretic frequencies.
  82
  83      specif[specif_negative,i] <- phyper (
  84          whiteDrawn[specif_negative], white[specif_negative], black[specif_negative], drawn
  85          );
  86
  87      specif[specif_positive,i] <- phyper (
  88          whiteDrawn[specif_positive] - 1, white[specif_positive], black[specif_positive], drawn
  89          );
  90   }
  91
  92   dimnames(specif) <- dimnames(lexicaltable);
  93
  94   return(specif);
  95 }
  96
  97 `specificites.lexicon` <-
  98 function(lexicon, sublexicon) {
  99   spe <- specificites.lexicon.probabilities(lexicon, sublexicon);
 100   spelog <- matrix(0, nrow=nrow(spe), ncol=ncol(spe));
 101   spelog[spe < 0.5] <- log10(spe[spe < 0.5]);
 102   spelog[spe > 0.5] <- abs(log10(1 - spe[spe > 0.5]));
 103   spelog[spe == 0.5] <- 0;
 104   spelog[is.infinite(spe)] <- 0;
 105   spelog <- round(spelog, digits=4);
 106   rownames(spelog) <- rownames(spe);
 107   colnames(spelog) <- colnames(spe);
 108   #class(spelog) <- "specificites";
 109   #attr(spelog, "l.t") <- spe;
 110   return(spelog);
 111 }
 112
 113 `specificites.lexicon.new` <-
 114 function(lexicon, sublexicon) {
 115
 116   if (! all(names(sublexicon) %in% names(lexicon))) stop(
 117         paste(
 118           sum(! (names(sublexicon) %in% names(lexicon))),
 119           "types of the sublexicon not found in the lexicon: ",
 120           )
 121       );
 122
 123 sub <- numeric(length(lexicon));
 124 names(sub) <- names(lexicon);
 125 sub[names(sublexicon)] <- sublexicon;
 126
 127   complementary.lexicon <- c(lexicon- sub);
 128   if (any(complementary.lexicon < 0)) stop("type cannot be more frequent in the sublexicon than in the lexicon");
 129
 130   lexicaltable <- matrix(c(sub, complementary.lexicon), ncol=2);
 131   rownames(lexicaltable) <- names(lexicon);
 132   colnames(lexicaltable) <- c("sublexicon", "complementary");
 133  #stop(colnames(lexicaltable));
 134   return(specificites(lexicaltable,NULL,NULL));
 135 }
 136
 137 `specificites.lexicon.probabilities` <-
 138 function(lexicon, sublexicon) {
 139
 140   if (!is.numeric(lexicon)) stop("The lexicon must contain numeric values.");
 141   if (!is.numeric(sublexicon)) stop("The sublexicon must contain numeric values.");
 142   if (is.null(names(lexicon))) stop("The lexicon must contain names.");
 143   if (is.null(names(sublexicon))) stop("The sub lexicon must contain names.");
 144
 145   if (! all(names(sublexicon) %in% names(lexicon)))
 146     stop(
 147         paste(
 148           "Some requested types of the sublexicon are not known in the lexicon: ",
 149           paste(names(sublexicon)[! (names(sublexicon) %in% names(lexicon))], collapse=" ")
 150           )
 151         );
 152
 153   F <- sum(lexicon);
 154   f <- sum(sublexicon);
 155
 156   # complementary.lexicon <- c(lexicon[names(sublexicon)] - sublexicon, lexicon[!names(lexicon) %in% names(sublexicon)]);
 157
 158   if (F < f) {
 159     stop("The lexicon cannot be smaller than the sublexicon");
 160   }
 161
 162   whiteDrawn <- numeric(length(lexicon)); # The frequencies observed in this part for each type.
 163   names(whiteDrawn) <- names(lexicon);
 164   whiteDrawn[names(sublexicon)] <- sublexicon;
 165   white <- lexicon; # The total frequencies in the corpus for each type.
 166   black <- F-white;          # The total complement frequency in the corpus for each type.
 167   drawn <- f;     # The number of tokens in the part.
 168
 169   # print(whiteDrawn);
 170   # print(white);
 171   # print(black);
 172   # print(drawn);
 173
 174   independance    <- (white * drawn) / F;         # The theoretic frequency of each type.
 175
 176   specif_negative <- whiteDrawn <  independance;  # index of observed frequencies below the theoretic frequencies.
 177   specif_positive <- whiteDrawn >= independance;  # index of observed frequencies above the theoretic frequencies.
 178
 179   specif <- double(length(lexicon));
 180
 181   specif[specif_negative] <-     phyper (
 182       whiteDrawn[specif_negative],     white[specif_negative], black[specif_negative], drawn
 183       );
 184
 185   specif[specif_positive] <- phyper (
 186       whiteDrawn[specif_positive] - 1, white[specif_positive], black[specif_positive], drawn
 187       );
 188
 189   names(specif) <- names(lexicon);
 190
 191   return(specif);
 192 }
 193
 194 #print.specificites(x, line=20, part=1, form=NULL, ...) {
 195 #  if (all(is.null(line, part))) {
 196 #    stop("either a line or a part must be specified");
 197 #  }
 198 #  if (all(!is.null(line, part))) {
 199 #    stop("only a line or a part must be specified");
 200 #  }
 201 #}