2 # #* ANR project ANR-06-CORP-029
3 # #* http://textometrie.ens-lsh.fr/
5 # #* 2008 (C) Textometrie project
6 # #* BSD New and Simplified BSD licenses
7 # #* http://www.opensource.org/licenses/bsd-license.php
9 # ## Sylvain Loiseau <sloiseau@u-paris10.fr>
12 # ## Ex. : type1 = 7 occ., type2 = 4 occ., separator = 11
13 # ## Cf. Lafon, 1981 : 120 sqq.
15 # ## phyper(0:4, 4, 7 + 11, 7)
19 # ## wrapper for the function "cooccurrences.directed.contexts" given a factor as
20 # ## corpus and a separator.
23 cooccurrences <- function(corpus) {
24 stop("not implemented yet");
28 # `cooccurrences.directed.corpus` <-
29 # function (corpus, separator) {
30 # if (! is.factor(corpus)) stop("The corpus must be a factor.");
31 # if (! is.character(separator)) stop("Separator must be a character vector.");
32 # if (length(separator) != 1) stop("Separator must be a character of length 1");
34 # corpus <- corpus[drop=TRUE];
35 # frequencies = table(corpus);
36 # separatorfrequency = frequencies[separator];
37 # if (is.na(separatorfrequency)) {
38 # separatorfrequency = 0;
40 # frequencies = frequencies[names(frequencies) != separator];
41 # types <- levels(corpus);
42 # types <- types[types != separator];
44 # contexts = get.contexts(corpus, separator);
46 # return(cooccurrences.directed.contexts(contexts, types, frequencies, separatorfrequency));
50 # ## Compute all the cooccurrence indices according to a context list (see the "get.contexts" function"),
51 # ## the vector of types to be taken into account, their frequency, and the separator frequency.
53 # ## Compute the cooccurrency index (based on hypergeometric cumulative
54 # ## probability) for each directed pair of token and return a matrix with tokens as column names
55 # ## and row names where each cell give a cooccurrency index
58 # `cooccurrences.directed.contexts` <-
59 # function(contexts, types, type.frequencies, separator.frequency) {
61 # all.cooccurrences.index <- matrix(0.0, nrow=length(types), ncol=length(type));
62 # rownames(all.cooccurrences.index) <- types;
63 # colnames(all.cooccurrences.index) <- types;
65 # for (i in 1:length(types)) {
67 # typefrequency = frequencies[i];
68 # for (j in c(1:(types-1), (types + 1):length(types))) {
69 # othertype = types[j];
70 # othertypefrequency = frequencies[j];
71 # cooccurrences = cooccurrences.frequency(contexts, type, othertype);
72 # all.cooccurrences.index[type, othertype] <-
73 # cooccurrences.directed.cooccurrents(cooccurrences, type, othertype, separatorfrequency);
77 # return(all.cooccurrences.index);
81 # ## In a list of contexts (see the "get.contexts" function), count the number of contexts
82 # ## having both at least one occurrence of type1 and at least one occurrence of type2.
85 # `cooccurrences.frequency` <-
86 # function(contexts, type1, type2) {
88 # if (! is.character(type1)) stop("Type1 must be a character vector.");
89 # if (length(type1) != 1) stop("Type1 must be a character of length 1");
90 # if (! is.character(type2)) stop("Type2 must be a character vector.");
91 # if (length(type2) != 1) stop("Type2 must be a character of length 1");
93 # is.cooccurring <- sapply(contexts,
95 # if(type1 %in% x && type2 %in% x) return(1) else return(0)
98 # return(sum(is.cooccurring));
105 # `cooccurrences.directed.cooccurrents` <-
106 # function(cooccurrences, type1, type2, separator) {
107 # if (! is.integer(cooccurrences)) stop("cooccurrences must be integer.");
108 # if (! is.integer(type1)) stop("type1 must be integer.");
109 # if (! is.integer(type2)) stop("type2 must be integer.");
110 # if (! is.integer(separator)) stop("separator must be integer.");
111 # ph <- phyper(found, type2, type1 + separator, type1);
116 # ## Create a list of character vectors according to a character vector and a separator string.
120 # function(corpus, separator) {
121 # if (! is.factor(corpus)) stop("The corpus must be a factor.");
122 # if (! is.character(separator)) stop("Separator must be a character vector.");
123 # if (length(separator) != 1) stop("Separator must be a vector of length 1");
125 # index.separator = which(corpus == separator);
128 # for(i in 1:(length(index.separator))) {
129 # contexts[[i]] = as.character(corpus[lastindex:(index.separator[i]-1)]);
130 # lastindex=index.separator[i] + 1;
132 # contexts[[length(contexts) + 1]] <- as.character(corpus[lastindex:length(corpus)]);
137 # ## Create a lexical table according to a character vector and a separator string.
140 # `get.lexical.table` <-
141 # function(corpus, separator) {
142 # if (! is.factor(corpus)) stop("The corpus must be a factor.");
143 # if (! is.character(separator)) stop("Separator must be a character vector.");
144 # if (length(separator) == 1) stop("Separator must be a character of length 1");
146 # contexts = get.contexts(corpus, separator);
148 # corpus <- corpus[drop=TRUE];
149 # types <- levels(corpus);
150 # types <- types[types != separator];
152 # lexical.table <- matrix(0, nrow=length(contexts), ncol=length(types));
153 # colnames(lexical.table) <- types;
155 # for (i in 1:length(contexts)) {
156 # freq.list <- table(contexts[[i]]);
157 # lexical.table[i, names(freq.list)] <- freq.list;
160 # return(lexical.table);
164 # function(type1, type2, separator) {
165 # # max = ifelse(f > g, 2*g, i)