- lexicaltable <- lexicaltable[types,, drop = FALSE];
- rowMargin <- rowMargin[types];
- }
-
- if (! is.null(parts)) { # Filter on parts to be considered.
- if(is.character(parts)) { # convert the name of parts given with "parts" into col index numbers.
- if (is.null(colnames(lexicaltable))) stop("The lexical table has no col names and the \"parts\" argument is a character vector.");
- if (! all(parts %in% colnames(lexicaltable))) stop(paste(
- "Some requested parts are not known in the lexical table: ",
- paste(parts[! (parts %in% colnames(lexicaltable))], collapse=" "))
- );
- } else {
- if (max(parts) > ncol(lexicaltable)) stop("Column index must be smaller than the number of cols.");
- if (any(parts < 1)) stop("The col index must be greater than 0.");
+
+ if (! is.null(parts)) { # Optional filter: keep only the requested parts (columns) of the lexical table.
+ if(is.character(parts)) { # Parts given by name: they are only validated here, then used as names for the subsetting below (no conversion to indices).
+ if (is.null(colnames(lexicaltable))) stop("The lexical table has no col names and the \"parts\" argument is a character vector.");
+ if (! all(parts %in% colnames(lexicaltable))) stop(paste(
+ "Some requested parts are not known in the lexical table: ",
+ paste(parts[! (parts %in% colnames(lexicaltable))], collapse=" "))
+ );
+ } else { # Non-character parts are range-checked as column positions.
+ if (max(parts) > ncol(lexicaltable)) stop("Column index must be smaller than the number of cols."); # Note: an index equal to ncol(lexicaltable) passes this check.
+ if (any(parts < 1)) stop("The col index must be greater than 0.");
+ }
+ lexicaltable <- lexicaltable[,parts, drop=FALSE]; # drop=FALSE keeps the two-dimensional structure even when a single part is selected.
+ colMargin <- colMargin[parts]; # Same selection applied to the column margins. NOTE(review): with character parts this relies on colMargin carrying names - confirm where it is built.
+ }
+
+ if (nrow(lexicaltable) == 0 | ncol(lexicaltable) == 0) { # Guard: stop when the table has no row or no column.
+ stop("The lexical table must contains at least one row and one column.");
+ }
+
+ specif <- matrix(0.0, nrow=nrow(lexicaltable), ncol=ncol(lexicaltable)); # Zero-filled result matrix with the same dimensions as the lexical table; the loop below fills it one column (part) at a time.
+
+ for(i in 1:ncol(lexicaltable)) { # We proceed the whole lexical table by column (i.e. by part).
+
+ whiteDrawn <- lexicaltable[,i]; # The frequencies observed in this part for each type.
+ white <- rowMargin; # The total frequencies in the corpus for each type.
+ black <- F-white; # The total complement frequency in the corpus for each type.
+ drawn <- colMargin[i]; # The number of tokens in the part.
+
+
+ independance <- (white * drawn) / F; # The theoretic frequency of each type.
+ specif_negative <- whiteDrawn < independance; # index of observed frequencies below the theoretic frequencies.
+ specif_positive <- whiteDrawn >= independance; # index of observed frequencies above the theoretic frequencies.
+
+ specif[specif_negative,i] <- -phyper (
+ whiteDrawn[specif_negative], white[specif_negative], black[specif_negative], drawn
+ );
+
+ specif[specif_positive,i] <- phyper_bis (
+ whiteDrawn[specif_positive], white[specif_positive], black[specif_positive], drawn
+ );