remove numpy + matrix
[iramuteq] / Rscripts / CHD.R
1 #Author: Pierre Ratinaud
2 #Copyright (c) 2008-2011 Pierre Ratinaud
3 #Lisense: GNU/GPL
4
5 pp<-function(txt,val) {
6         d<-paste(txt,' : ')
7         print(paste(d,val))
8 }
9 MyChiSq<-function(x,sc,n){
10         sr<-rowSums(x)
11         E <- outer(sr, sc, "*")/n
12         STAT<-sum(((x - E)^2)/E)
13         STAT
14 }
15
16 MySpeedChi <- function(x,sc) {
17     sr <-rowSums(x)
18     E <- outer(sr, sc, "*")
19         STAT<-sum((x - E)^2/E)
20         STAT
21 }
22
23 find.max <- function(dtable, chitable, compte, rmax, maxinter, sc, TT) {
24     ln <- which(dtable==1, arr.ind=TRUE)
25     lo <- list()
26     lo[1:nrow(dtable)] <- 0
27     for (k in 1:nrow(ln)) {lo[[ln[k,1]]]<-append(lo[[ln[k,1]]],ln[k,2])}
28     for (k in 1:nrow(dtable)) {lo[[k]] <- lo[[k]][-1]}
29     lo<-lo[-c(1,length(lo))]
30     for (l in lo) {
31         compte <- compte + 1 
32         chitable[1,l]<-chitable[1,l]+1
33         chitable[2,l]<-chitable[2,l]-1
34         chi<-MyChiSq(chitable,sc,TT)
35         if (chi>maxinter) {
36             maxinter<-chi
37             rmax<-compte
38         }   
39     }
40     res <- list(maxinter=maxinter, rmax=rmax)
41     res
42 }  
43
44 CHD<-function(data.in, x=9, mode.patate = FALSE, libsvdc=FALSE, libsvdc.path=NULL){
45 #       sink('/home/pierre/workspace/iramuteq/dev/findchi2.txt')
46         dataori <- data.in
47     row.names(dataori) <- rownames(data.in)
48         dtable <- data.in
49         colnames(dtable) <- 1:ncol(dtable)
50     dout <- NULL
51         rowelim<-NULL
52         pp('ncol entree : ',ncol(dtable))
53         pp('nrow entree',nrow(dtable))
54         listcol <- list()
55         listmere <- list()
56         list_fille <- list()
57         print('vire colonnes vides en entree')#FIXME : il ne doit pas y avoir de colonnes vides en entree !!
58         sdt<-colSums(dtable)
59         if (min(sdt)==0)
60                 dtable<-dtable[,-which(sdt==0)]
61     print('vire lignes vides en entree')
62     sdt<-rowSums(dtable)
63         if (min(sdt)==0) {
64         rowelim<-as.integer(rownames(dtable)[which(sdt==0)])
65         print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
66         print(rowelim)
67         print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
68                 dtable<-dtable[-which(sdt==0),]
69         }
70         mere<-1
71         for (i in 1:x) {
72                 clnb<-(i*2)
73                 listmere[[clnb]]<-mere
74                 listmere[[clnb+1]]<-mere
75                 list_fille[[mere]] <- c(clnb,clnb+1)
76                 listcol[[clnb]]<-vector()
77                 listcol[[clnb+1]]<-vector()
78                 #extraction du premier facteur de l'afc
79                 print('afc')
80                 pp('taille dtable dans boucle (col/row)',c(ncol(dtable),nrow(dtable)))
81                 afc<-boostana(dtable, nd=1, libsvdc=libsvdc, libsvdc.path=libsvdc.path)
82                 pp('SV',afc$singular.values)
83                 pp('V.P.', afc$eigen.values)
84                 coordrow <- as.matrix(afc$row.scores[,1])
85                 coordrowori<-coordrow
86                 row.names(coordrow)<-rownames(dtable)
87         coordrow <- cbind(coordrow,1:nrow(dtable))
88                 print('deb recherche meilleur partition')
89         ordert <- as.matrix(coordrow[order(coordrow[,1]),])
90         ordert <- cbind(ordert, 1:nrow(ordert))
91         ordert <- ordert[order(ordert[,2]),]
92
93                 listinter<-vector()
94                 listlim<-vector()
95         dtable <- dtable[order(ordert[,3]),]
96         sc <- colSums(dtable)
97         TT <- sum(sc)
98         sc1 <- dtable[1,]
99         sc2 <- colSums(dtable) - sc1 
100         chitable <- rbind(sc1, sc2)
101         compte <- 1
102         maxinter <- 0
103         rmax <- NULL
104
105         inert <- find.max(dtable, chitable, compte, rmax, maxinter, sc, TT)
106                 print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
107                 pp('max inter phase 1', inert$maxinter/TT)#max(listinter))
108                 print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
109         ordert <- ordert[order(ordert[,3]),]
110                 listclasse<-ifelse(coordrowori<=ordert[(inert$rmax),1],clnb,clnb+1)
111             dtable <- dtable[order(ordert[,2]),]
112                 cl<-listclasse
113                 pp('TT',TT)
114                 #dtable<-cbind(dtable,'cl'= as.vector(cl))
115
116                 N1<-length(listclasse[listclasse==clnb])
117                 N2<-length(listclasse[listclasse==clnb+1])
118                 pp('N1',N1)
119                 pp('N2',N2)
120 ###################################################################
121 #                  reclassement des individus                     #
122 ###################################################################
123         if (!mode.patate) {
124                 malcl<-1000000000000
125                 it<-0
126                 listsub<-list()
127                 #in boucle
128             ln <- which(dtable==1, arr.ind=TRUE)
129             lnz <- list()
130             lnz[1:nrow(dtable)] <- 0
131     
132             for (k in 1:nrow(ln)) {lnz[[ln[k,1]]]<-append(lnz[[ln[k,1]]],ln[k,2])}
133             for (k in 1:nrow(dtable)) {lnz[[k]] <- lnz[[k]][-1]}
134                 TT<-sum(dtable)
135     
136                 while (malcl!=0 & N1>=5 & N2>=5) {
137                         it<-it+1
138                         listsub[[it]]<-vector()
139                 txt <- paste('nombre iteration', it)
140                         #pp('nombre iteration',it)
141                         vdelta<-vector()
142                         #dtable[,'cl']<-cl
143                         t1<-dtable[which(cl[,1]==clnb),]#[,-ncol(dtable)]
144                         t2<-dtable[which(cl[,1]==clnb+1),]#[,-ncol(dtable)]
145                         ncolt<-ncol(t1)
146                         #pp('ncolt',ncolt)
147     
148                 if (N1 != 1) {
149                     sc1<-colSums(t1)
150                 } else {
151                     sc1 <- t1
152                 }
153                 if (N2 != 1) {
154                             sc2<-colSums(t2)
155                 } else {
156                     sc2 <- t2
157                 }
158                         
159                 sc<-sc1+sc2
160                         chtableori<-rbind(sc1,sc2)
161                         chtable<-chtableori
162                         interori<-MyChiSq(chtableori,sc,TT)/TT#chisq.test(chtableori)$statistic#/TT
163                         txt <- paste(txt, ' - interori : ',interori)
164                 #pp('interori',interori)
165     
166                         N1<-nrow(t1)
167                         N2<-nrow(t2)
168     
169                         #pp('N1',N1)
170                         #pp('N2',N2)
171                         txt <- paste(txt, 'N1:', N1,'-N2:',N2)
172                 print(txt)
173                 compte <- 0
174                         for (l in lnz){
175                         chi.in<-chtable
176                         compte <- compte + 1
177                                         if(cl[compte]==clnb){
178                                                 chtable[1,l]<-chtable[1,l]-1
179                                                 chtable[2,l]<-chtable[2,l]+1
180                                         }else{
181                                                 chtable[1,l]<-chtable[1,l]+1
182                                                 chtable[2,l]<-chtable[2,l]-1
183                                         }
184                                         interswitch<-MyChiSq(chtable,sc,TT)/TT#chisq.test(chtable)$statistic/TT
185                                         ws<-interori-interswitch
186     
187                                         if (ws<0){
188                                                 interori<-interswitch
189                                                 if(cl[compte]==clnb){
190                                                         #sc1<-chtable[1,]
191                                                         #sc2<-chtable[2,]
192                                                         cl[compte]<-clnb+1
193                                                         listsub[[it]]<-append(listsub[[it]],compte)
194                                                 } else {
195                                                         #sc1<-chtable[1,]
196                                                         #sc2<-chtable[2,]
197                                                         cl[compte]<-clnb
198                                                         listsub[[it]]<-append(listsub[[it]],compte)
199                                                 }
200                                                 vdelta<-append(vdelta,compte)
201                         } else {
202                             chtable<-chi.in
203                                         }
204                    }
205     #                   for (val in vdelta) {
206     #                           if (cl[val]==clnb) {
207     #                                   cl[val]<-clnb+1
208     #                                   listsub[[it]]<-append(listsub[[it]],val)
209     #                                   }else {
210     #                                   cl[val]<-clnb
211     #                                   listsub[[it]]<-append(listsub[[it]],val)
212     #                           }
213     #                   }
214                         print('###################################')
215                         print('longueur < 0')
216                         malcl<-length(vdelta)
217                         if ((it>1)&&(!is.logical(listsub[[it]]))&&(!is.logical(listsub[[it-1]]))){
218                                 if (listsub[[it]]==listsub[[(it-1)]]){
219                                         malcl<-0
220                                 }
221                         }
222                         print(malcl)
223                         print('###################################')
224                 }
225         }
226                 #dtable<-cbind(dtable,'cl'=as.vector(cl))
227         #dtable[,'cl'] <-as.vector(cl)
228 #######################################################################
229 #                 Fin reclassement des individus                      #
230 #######################################################################
231 #               if (!(length(cl[cl==clnb])==1 || length(cl[cl==clnb+1])==1)) {
232                         #t1<-dtable[dtable[,'cl']==clnb,][,-ncol(dtable)]
233                         #t2<-dtable[dtable[,'cl']==clnb+1,][,-ncol(dtable)]
234                     t1<-dtable[which(cl[,1]==clnb),]#[,-ncol(dtable)]
235                         t2<-dtable[which(cl[,1]==clnb+1),]#[,-ncol(dtable)]
236             if (class(t1)=='numeric') {
237                 sc1 <- as.vector(t1)
238                 nrowt1 <- 1
239             } else {
240                 sc1 <- colSums(t1)
241                 nrowt1 <- nrow(t1)
242             }
243             if (class(t2)=='numeric') {
244                 sc2 <- as.vector(t2)
245                 nrowt2 <- 1
246             } else {
247                 sc2 <- colSums(t2)
248                 nrowt2 <- nrow(t2)
249             }
250             chtable<-rbind(sc1,sc2)
251                         inter<-chisq.test(chtable)$statistic/TT
252                         pp('last inter',inter)
253                         print('=====================')
254                         #calcul de la specificite des colonnes
255                         mint<-min(nrowt1,nrowt2)
256                         maxt<-max(nrowt1,nrowt2)
257                         seuil<-round((1.9*(maxt/mint))+1.9,digit=6)
258                         #sink('/home/pierre/workspace/iramuteq/dev/findchi2.txt')
259 #                       print('ATTENTION SEUIL 3,84')
260 #                       seuil<-3.84
261                         pp('seuil',seuil)
262                         sominf<-0
263                         nv<-0
264                         nz<-0
265                         ncclnb<-0
266                         ncclnbp<-0
267             NN1<-0
268             NN2<-0
269             maxchip<-0
270             nbzeroun<-0
271             res1<-0
272             res2<-0
273             nbseuil<-0
274             nbexe<-0
275             nbcontrib<-0
276                         cn<-colnames(dtable)
277             #another try#########################################
278             one <- cbind(sc1,sc2)
279             cols <- c(length(which(cl==clnb)), length(which(cl==clnb+1))) 
280             print(cols)
281             colss <- matrix(rep(cols,ncol(dtable)), ncol=2, byrow=TRUE)
282             zero <- colss - one
283             rows <- cbind(rowSums(zero), rowSums(one))
284             n <- sum(cols)
285             for (m in 1:nrow(rows)) {
286                 obs <- t(matrix(c(zero[m,],one[m,]),2,2))
287                 E <- outer(rows[m,],cols,'*')/n
288                 if ((min(obs[2,])==0) & (min(obs[1,])!=0)) {
289                     chi <- seuil + 1
290                 } else if ((min(obs[1,])==0) & (min(obs[2,])!=0)) {
291                     chi <- seuil - 1
292                 } else if (any(obs < 10)) {
293                     chi <- sum((abs(obs - E) - 0.5)^2 / E)
294                 } else {
295                     chi <- sum(((obs - E)^2)/E)
296                 }
297                 if (is.na(chi)) {
298                     chi <- 0
299                 }
300                 if (chi > seuil) {
301                     if (obs[2,1] < E[2,1]) {
302                         listcol[[clnb]]<-append(listcol[[clnb]],cn[m])
303                         ncclnb<-ncclnb+1
304                     } else if (obs[2,2] < E[2,2]) {
305                                                 listcol[[clnb+1]]<-append(listcol[[clnb+1]],cn[m])
306                                                 ncclnbp<-ncclnbp+1
307                     }
308                 }
309             }
310             ######################################################
311                         print('resultats elim item')
312                         pp(clnb+1,length(listcol[[clnb+1]]))
313                         pp(clnb,length(listcol[[clnb]]))
314                         pp('ncclnb',ncclnb)
315                         pp('ncclnbp',ncclnbp)
316                         listrownamedtable<-rownames(dtable)
317                         listrownamedtable<-as.integer(listrownamedtable)
318                         newcol<-vector(length=nrow(dataori))
319                         #remplissage de la nouvelle colonne avec les nouvelles classes
320                         print('remplissage')
321 #                       num<-0
322             newcol[listrownamedtable] <- cl[,1]
323                         #recuperation de la classe precedante pour les cases vides
324                         print('recuperation classes precedentes')
325             if (i!=1) {
326                 newcol[which(newcol==0)] <- dout[,ncol(dout)][which(newcol==0)]
327             }
328             if(!is.null(rowelim)) {
329                 newcol[rowelim] <- 0
330             }
331                         tailleclasse<-as.matrix(summary(as.factor(as.character(newcol))))
332                         print('tailleclasse')
333                         print(tailleclasse)
334                         tailleclasse<-as.matrix(tailleclasse[!(rownames(tailleclasse)==0),])
335                         plusgrand<-which.max(tailleclasse)
336                         #???????????????????????????????????
337                         #Si 2 classes ont des effectifs egaux, on prend la premiere de la liste...
338                         if (length(plusgrand)>1) {
339                                 plusgrand<-plusgrand[1]
340                         }
341                         #????????????????????????????????????
342                         
343                         #constuction du prochain tableau a analyser
344                         print('construction tableau suivant')
345             dout<-cbind(dout,newcol)
346                         classe<-as.integer(rownames(tailleclasse)[plusgrand])
347                         dtable<-dataori[which(newcol==classe),]
348                         row.names(dtable)<-rownames(dataori)[which(newcol==classe)]
349             colnames(dtable) <- 1:ncol(dtable)
350                         mere<-classe
351                         listcolelim<-listcol[[as.integer(classe)]]
352                         mother<-listmere[[as.integer(classe)]]
353                         while (mother!=1) {
354                                 listcolelim<-append(listcolelim,listcol[[mother]])
355                                 mother<-listmere[[mother]]
356                         }
357                         
358                         listcolelim<-sort(unique(listcolelim))
359                         pp('avant',ncol(dtable))
360                         if (!is.logical(listcolelim)){
361                                 print('elimination colonne')
362                                 #dtable<-dtable[,-listcolelim]
363                 dtable<-dtable[,!(colnames(dtable) %in% listcolelim)]
364                         }
365                         pp('apres',ncol(dtable))
366                         #elimination des colonnes ne contenant que des 0
367                         print('vire colonne inf 3 dans boucle')
368                         sdt<-colSums(dtable)
369                         if (min(sdt)<=3)
370                                 dtable<-dtable[,-which(sdt<=3)]
371         
372                         #elimination des lignes ne contenant que des 0
373                         print('vire ligne vide dans boucle')
374                         if (ncol(dtable)==1) {
375                                 sdt<-dtable[,1]
376                         } else {
377                                 sdt<-rowSums(dtable)
378                         }
379                         if (min(sdt)==0) {
380                                 rowelim<-as.integer(rownames(dtable)[which(sdt==0)])
381                                 print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
382                                 print(rowelim)
383                                 print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
384                                 dtable<-dtable[-which(sdt==0),]
385                         }
386 #               }
387         }
388 #       sink()
389         res <- list(n1 = dout, list_mere = listmere, list_fille = list_fille)
390         res
391 }