iramuteq.org Git - iramuteq/blob - Rlib/textometrieR/R/cooccurrences.R

   1 # #* Textometrie
   2 # #* ANR project ANR-06-CORP-029
   3 # #* http://textometrie.ens-lsh.fr/
   4 # #*
   5 # #* 2008 (C) Textometrie project
   6 # #* BSD New and Simplified BSD licenses
   7 # #* http://www.opensource.org/licenses/bsd-license.php
   8 #
   9 # ## Sylvain Loiseau <sloiseau@u-paris10.fr>
  10 #
  11 # ##
  12 # ## Ex. : type1 = 7 occ., type2 = 4 occ., separator = 11
  13 # ## Cf. Lafon, 1981 : 120 sqq.
  14 # ##
  15 # ## phyper(0:4, 4, 7 + 11, 7)
  16 # ##
  17 #
  18 # ##
  19 # ## wrapper for the function "cooccurrences.directed.contexts" given a factor as
  20 # ## corpus and a separator.
  21 # ##
  22
  23 cooccurrences <- function(corpus) {
  24  stop("not implemented yet");
  25 }
  26
  27 #
  28 # `cooccurrences.directed.corpus` <-
  29 # function (corpus, separator) {
  30 #   if (! is.factor(corpus)) stop("The corpus must be a factor.");
  31 #   if (! is.character(separator)) stop("Separator must be a character vector.");
  32 #   if (length(separator) != 1) stop("Separator must be a character of length 1");
  33 #
  34 #   corpus <- corpus[drop=TRUE];
  35 #   frequencies = table(corpus);
  36 #   separatorfrequency = frequencies[separator];
  37 #   if (is.na(separatorfrequency)) {
  38 #     separatorfrequency = 0;
  39 #   }
  40 #   frequencies = frequencies[names(frequencies) != separator];
  41 #   types <- levels(corpus);
  42 #   types <- types[types != separator];
  43 #
  44 #   contexts = get.contexts(corpus, separator);
  45 #
  46 #   return(cooccurrences.directed.contexts(contexts, types, frequencies, separatorfrequency));
  47 # }
  48 #
  49 # ##
  50 # ## Compute all the cooccurrence indices according to a context list (see the "get.contexts" function"),
  51 # ## the vector of types to be taken into account, their frequency, and the separator frequency.
  52 # ##
  53 # ## Compute the cooccurrency index (based on hypergeometric cumulative
  54 # ## probability) for each directed pair of token and return a matrix with tokens as column names
  55 # ## and row names where each cell give a cooccurrency index
  56 # ##
  57 #
  58 # `cooccurrences.directed.contexts` <-
  59 # function(contexts, types, type.frequencies, separator.frequency) {
  60 #
  61 #   all.cooccurrences.index <- matrix(0.0, nrow=length(types), ncol=length(type));
  62 #   rownames(all.cooccurrences.index) <- types;
  63 #   colnames(all.cooccurrences.index) <- types;
  64 #
  65 #   for (i in 1:length(types)) {
  66 #     type = types[i];
  67 #     typefrequency = frequencies[i];
  68 #     for (j in c(1:(types-1), (types + 1):length(types))) {
  69 #       othertype = types[j];
  70 #       othertypefrequency = frequencies[j];
  71 #       cooccurrences = cooccurrences.frequency(contexts, type, othertype);
  72 #       all.cooccurrences.index[type, othertype] <-
  73 #         cooccurrences.directed.cooccurrents(cooccurrences, type, othertype, separatorfrequency);
  74 #     }
  75 #   }
  76 #
  77 #   return(all.cooccurrences.index);
  78 # }
  79 #
  80 # ##
  81 # ## In a list of contexts (see the "get.contexts" function), count the number of contexts
  82 # ## having both at least one occurrence of type1 and at least one occurrence of type2.
  83 # ##
  84 #
  85 # `cooccurrences.frequency` <-
  86 # function(contexts, type1, type2) {
  87 #
  88 #   if (! is.character(type1)) stop("Type1 must be a character vector.");
  89 #   if (length(type1) != 1) stop("Type1 must be a character of length 1");
  90 #   if (! is.character(type2)) stop("Type2 must be a character vector.");
  91 #   if (length(type2) != 1) stop("Type2 must be a character of length 1");
  92 #
  93 #   is.cooccurring <- sapply(contexts,
  94 #       function(x) {
  95 #       if(type1 %in% x && type2 %in% x) return(1) else return(0)
  96 #       }
  97 #       );
  98 #   return(sum(is.cooccurring));
  99 # }
 100 #
 101 # ##
 102 # ##
 103 # ##
 104 #
 105 # `cooccurrences.directed.cooccurrents` <-
 106 # function(cooccurrences, type1, type2, separator) {
 107 #   if (! is.integer(cooccurrences)) stop("cooccurrences must be integer.");
 108 #   if (! is.integer(type1)) stop("type1 must be integer.");
 109 #   if (! is.integer(type2)) stop("type2 must be integer.");
 110 #   if (! is.integer(separator)) stop("separator must be integer.");
 111 #   ph <- phyper(found, type2, type1 + separator, type1);
 112 #   return(ph);
 113 # }
 114 #
 115 # ##
 116 # ## Create a list of character vectors according to a character vector and a separator string.
 117 # ##
 118 #
 119 # `get.contexts` <-
 120 # function(corpus, separator) {
 121 #   if (! is.factor(corpus)) stop("The corpus must be a factor.");
 122 #   if (! is.character(separator)) stop("Separator must be a character vector.");
 123 #   if (length(separator) != 1) stop("Separator must be a vector of length 1");
 124 #
 125 #   index.separator = which(corpus == separator);
 126 #   contexts = list();
 127 #   lastindex = 1;
 128 #   for(i in 1:(length(index.separator))) {
 129 #     contexts[[i]] = as.character(corpus[lastindex:(index.separator[i]-1)]);
 130 #     lastindex=index.separator[i] + 1;
 131 #   }
 132 #   contexts[[length(contexts) + 1]] <- as.character(corpus[lastindex:length(corpus)]);
 133 #   return(contexts);
 134 # }
 135 #
 136 # ##
 137 # ## Create a lexical table according to a character vector and a separator string.
 138 # ##
 139 #
 140 # `get.lexical.table` <-
 141 # function(corpus, separator) {
 142 #   if (! is.factor(corpus)) stop("The corpus must be a factor.");
 143 #   if (! is.character(separator)) stop("Separator must be a character vector.");
 144 #   if (length(separator) == 1) stop("Separator must be a character of length 1");
 145 #
 146 #   contexts = get.contexts(corpus, separator);
 147 #
 148 #   corpus <- corpus[drop=TRUE];
 149 #   types <- levels(corpus);
 150 #   types <- types[types != separator];
 151 #
 152 #   lexical.table <- matrix(0, nrow=length(contexts), ncol=length(types));
 153 #   colnames(lexical.table) <- types;
 154 #
 155 #   for (i in 1:length(contexts)) {
 156 #     freq.list <- table(contexts[[i]]);
 157 #     lexical.table[i, names(freq.list)] <- freq.list;
 158 #   }
 159 #
 160 #   return(lexical.table);
 161 # }
 162 #
 163 # `cooccurrences` <-
 164 # function(type1, type2, separator) {
 165 # #    max = ifelse(f > g, 2*g, i)
 166 # #    #choose(T, t);
 167 # }
 168 #