1 #* Copyright © - 2008-2013 ANR Textométrie - http://textometrie.ens-lyon.fr
3 #* This file is part of the TXM platform.
5 #* The TXM platform is free software: you can redistribute it and/or modif y
6 #* it under the terms of the GNU General Public License as published by
7 #* the Free Software Foundation, either version 3 of the License, or
8 #* (at your option) any later version.
10 #* The TXM platform is distributed in the hope that it will be useful,
11 #* but WITHOUT ANY WARRANTY; without even the implied warranty of
12 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 #* General Public License for more details.
15 #* You should have received a copy of the GNU General Public License
16 #* along with the TXM platform. If not, see <http://www.gnu.org/licenses/>.
18 ## Sylvain Loiseau <sloiseau@u-paris10.fr>
19 ## Lise Vaudor <lise.vaudor@ens-lyon.fr>
21 phyper_bis=function(v_a,v_b,v_c,v_d){
22 v_s2=rep(NA,length(v_a))
23 for (j in 1:length(v_a)){
29 s1=dhyper(a_tmp,b,c,d)
30 s_tmp=dhyper(a_tmp+1,b,c,d)
33 while(log(s2)!=log(s1)){
36 s_tmp=dhyper(a_tmp,b,c,d)
45 function(lexicaltable, types=NULL, parts=NULL) {
46 spe <- specificites.probabilities(lexicaltable, types, parts);
47 spelog <- matrix(0, nrow=nrow(spe), ncol=ncol(spe));
48 spelog[spe < 0] <- log10(-spe[spe < 0]);
49 spelog[spe > 0] <- abs(log10(spe[spe >0]));
50 spelog[spe == 0] <- 0;
51 spelog[is.infinite(spe)] <- 0;
52 spelog <- round(spelog, digits=4);
53 rownames(spelog) <- rownames(spe);
54 colnames(spelog) <- colnames(spe);
55 #class(spelog) <- "specificites";
56 #attr(spelog, "l.t") <- spe;
60 `specificites.probabilities` <-
61 function(lexicaltable, types=NULL, parts=NULL) {
63 # if (!is.numeric(lexicaltable)) stop("The lexical table must contain numeric values.");
65 rowMargin <- rowSums(lexicaltable); # or "F" (the total frequency of all the types).
66 colMargin <- colSums(lexicaltable); # or "T" (the size of the parts).
67 F <- sum(lexicaltable); # The grand total (number of tokens in the corpus).
69 if (! is.null(types)) { # Filter on tokens to be considered.
70 if(is.character(types)) { # convert the name of types given with "types" into row index numbers.
71 if (is.null(rownames(lexicaltable))) stop("The lexical table has no row names and the \"types\" argument is a character vector.");
72 if (! all(types %in% rownames(lexicaltable))) stop(paste(
73 "Some requested types are not known in the lexical table: ",
74 paste(types[! (types %in% rownames(lexicaltable))], collapse=" "))
77 if (any(types < 1)) stop("The row index must be greater than 0.");
78 if (max(types) > nrow(lexicaltable)) stop("Row index must be smaller than the number of rows.");
80 lexicaltable <- lexicaltable[types,, drop = FALSE];
81 rowMargin <- rowMargin[types];
84 if (! is.null(parts)) { # Filter on parts to be considered.
85 if(is.character(parts)) { # convert the name of parts given with "parts" into col index numbers.
86 if (is.null(colnames(lexicaltable))) stop("The lexical table has no col names and the \"parts\" argument is a character vector.");
87 if (! all(parts %in% colnames(lexicaltable))) stop(paste(
88 "Some requested parts are not known in the lexical table: ",
89 paste(parts[! (parts %in% colnames(lexicaltable))], collapse=" "))
92 if (max(parts) > ncol(lexicaltable)) stop("Column index must be smaller than the number of cols.");
93 if (any(parts < 1)) stop("The col index must be greater than 0.");
95 lexicaltable <- lexicaltable[,parts, drop=FALSE];
96 colMargin <- colMargin[parts];
99 if (nrow(lexicaltable) == 0 | ncol(lexicaltable) == 0) {
100 stop("The lexical table must contains at least one row and one column.");
103 specif <- matrix(0.0, nrow=nrow(lexicaltable), ncol=ncol(lexicaltable));
105 for(i in 1:ncol(lexicaltable)) { # We proceed the whole lexical table by column (i.e. by part).
107 whiteDrawn <- lexicaltable[,i]; # The frequencies observed in this part for each type.
108 white <- rowMargin; # The total frequencies in the corpus for each type.
109 black <- F-white; # The total complement frequency in the corpus for each type.
110 drawn <- colMargin[i]; # The number of tokens in the part.
113 independance <- (white * drawn) / F; # The theoretic frequency of each type.
114 specif_negative <- whiteDrawn < independance; # index of observed frequencies below the theoretic frequencies.
115 specif_positive <- whiteDrawn >= independance; # index of observed frequencies above the theoretic frequencies.
117 specif[specif_negative,i] <- -phyper (
118 whiteDrawn[specif_negative], white[specif_negative], black[specif_negative], drawn
121 specif[specif_positive,i] <- phyper_bis (
122 whiteDrawn[specif_positive], white[specif_positive], black[specif_positive], drawn
126 colnames(specif) <- colnames(lexicaltable);
127 rownames(specif) <- rownames(lexicaltable);
132 `specificites.lexicon` <-
133 function(lexicon, sublexicon) {
134 spe <- specificites.lexicon.probabilities(lexicon, sublexicon);
135 spelog <- matrix(0, nrow=nrow(spe), ncol=ncol(spe));
136 spelog[spe < 0] <- log10(-spe[spe < 0]);
137 spelog[spe > 0] <- abs(log10(spe[spe >=0]));
138 spelog[spe == 0] <- 0;
139 spelog[is.infinite(spe)] <- 0;
140 spelog <- round(spelog, digits=4);
141 rownames(spelog) <- rownames(spe);
142 colnames(spelog) <- colnames(spe);
143 #class(spelog) <- "specificites";
144 #attr(spelog, "l.t") <- spe;
148 `lexiconsToLexicalTable` <- function(lexicon, sublexicon) {
149 if (! all(names(sublexicon) %in% names(lexicon)))
151 sum(! (names(sublexicon) %in% names(lexicon))),
152 "types of the sublexicon not found in the lexicon: ",
156 sub <- numeric(length(lexicon));
157 names(sub) <- names(lexicon);
158 sub[names(sublexicon)] <- sublexicon;
160 complementary.lexicon <- c(lexicon- sub);
161 if (any(complementary.lexicon < 0))
162 stop("type cannot be more frequent in the sublexicon than in the lexicon");
164 lexicaltable <- matrix(c(sub, complementary.lexicon), ncol=2);
165 rownames(lexicaltable) <- names(lexicon);
166 colnames(lexicaltable) <- c("sublexicon", "complementary");
170 `specificites.lexicon.new` <- function(lexicon, sublexicon) {
171 lexicaltable <- lexiconsToLexicalTable(lexicon, sublexicon);
172 return(specificites(lexicaltable,NULL,NULL));
175 `specificites.lexicon.probabilities` <-
176 function(lexicon, sublexicon) {
178 if (!is.numeric(lexicon)) stop("The lexicon must contain numeric values.");
179 if (!is.numeric(sublexicon)) stop("The sublexicon must contain numeric values.");
180 if (is.null(names(lexicon))) stop("The lexicon must contain names.");
181 if (is.null(names(sublexicon))) stop("The sub lexicon must contain names.");
183 if (! all(names(sublexicon) %in% names(lexicon)))
186 "Some requested types of the sublexicon are not known in the lexicon: ",
187 paste(names(sublexicon)[! (names(sublexicon) %in% names(lexicon))], collapse=" ")
192 f <- sum(sublexicon);
194 # complementary.lexicon <- c(lexicon[names(sublexicon)] - sublexicon, lexicon[!names(lexicon) %in% names(sublexicon)]);
197 stop("The lexicon cannot be smaller than the sublexicon");
200 whiteDrawn <- numeric(length(lexicon)); # The frequencies observed in this part for each type.
201 names(whiteDrawn) <- names(lexicon);
202 whiteDrawn[names(sublexicon)] <- sublexicon;
203 white <- lexicon; # The total frequencies in the corpus for each type.
204 black <- F-white; # The total complement frequency in the corpus for each type.
205 drawn <- f; # The number of tokens in the part.
212 independance <- (white * drawn) / F; # The theoretic frequency of each type.
214 specif_negative <- whiteDrawn < independance; # index of observed frequencies below the theoretic frequencies.
215 specif_positive <- whiteDrawn >= independance; # index of observed frequencies above the theoretic frequencies.
217 specif <- double(length(lexicon));
219 specif[specif_negative] <- phyper (
220 whiteDrawn[specif_negative], white[specif_negative], black[specif_negative], drawn
223 specif[specif_positive] <- phyper_bis (
224 whiteDrawn[specif_positive], white[specif_positive], black[specif_positive], drawn
227 names(specif) <- names(lexicon);
232 `SpecifTopN` <- function(Specif, N=10, file=NULL) {
235 cols <- colnames(symbol)
237 for (i in 1:length(cols)) {
238 sorted <- sort(symbol[, cols[i]], decreasing=TRUE, index.return= TRUE)$x
239 top <- union(top, names(sorted[1:N]))
240 top <- union(top, names(sorted[length(sorted) -N: length(sorted)]))
243 symbol <- symbol[top,]
245 write.table(symbol, file)
249 #print.specificites(x, line=20, part=1, form=NULL, ...) {
250 # if (all(is.null(line, part))) {
251 # stop("either a line or a part must be specified");
253 # if (all(!is.null(line, part))) {
254 # stop("only a line or a part must be specified");