speed find clusters
[iramuteq] / Rscripts / CHD.R
1 #Author: Pierre Ratinaud
2 #Copyright (c) 2008-2011 Pierre Ratinaud
3 #License: GNU/GPL
4
5 pp<-function(txt,val) {
6         d<-paste(txt,' : ')
7         print(paste(d,val))
8 }
9 MyChiSq<-function(x,sc,n){
10         sr<-rowSums(x)
11         E <- outer(sr, sc, "*")/n
12         STAT<-sum(((x - E)^2)/E)
13         STAT
14 }
15
16 MySpeedChi <- function(x,sc) {
17     sr <-rowSums(x)
18     E <- outer(sr, sc, "*")
19         STAT<-sum((x - E)^2/E)
20         STAT
21 }
22
23 find.max <- function(dtable, chitable, compte, rmax, maxinter, sc, TT) {
24     ln <- which(dtable==1, arr.ind=TRUE)
25     lo <- list()
26     lo[1:nrow(dtable)] <- 0
27     for (k in 1:nrow(ln)) {lo[[ln[k,1]]]<-append(lo[[ln[k,1]]],ln[k,2])}
28     for (k in 1:nrow(dtable)) {lo[[k]] <- lo[[k]][-1]}
29         ## lo<-lo[-c(1,length(lo))]
30         ## for (l in lo) {
31         ##     compte <- compte + 1 
32         ##     chitable[1,l]<-chitable[1,l]+1
33         ##     chitable[2,l]<-chitable[2,l]-1
34         ##     chi<-MyChiSq(chitable,sc,TT)
35                 ## if (chi>maxinter) {
36                 ##     maxinter<-chi
37                 ##     rmax<-compte
38                 ## }   
39     #}
40         lo<-lo[-c(1)]
41         for (l in lo) {
42                 chi<-MyChiSq(chitable,sc,TT)
43                 if (chi>maxinter) {
44                         maxinter<-chi
45                         rmax<-compte
46                 }
47                 compte <- compte + 1
48                 chitable[1,l]<-chitable[1,l]+1
49                 chitable[2,l]<-chitable[2,l]-1
50         }       
51     res <- list(maxinter=maxinter, rmax=rmax)
52     res
53 }  
54
55
56
57
58
59 CHD<-function(data.in, x=9, mode.patate = FALSE, svd.method, libsvdc.path=NULL){
60 #       sink('/home/pierre/workspace/iramuteq/dev/findchi2.txt')
61         dataori <- data.in
62     row.names(dataori) <- rownames(data.in)
63         dtable <- data.in
64         colnames(dtable) <- 1:ncol(dtable)
65     dout <- NULL
66         rowelim<-NULL
67         pp('ncol entree : ',ncol(dtable))
68         pp('nrow entree',nrow(dtable))
69         listcol <- list()
70         listmere <- list()
71         list_fille <- list()
72         print('vire colonnes vides en entree')#FIXME : il ne doit pas y avoir de colonnes vides en entree !!
73         sdt<-colSums(dtable)
74         if (min(sdt)==0)
75                 dtable<-dtable[,-which(sdt==0)]
76     print('vire lignes vides en entree')
77     sdt<-rowSums(dtable)
78         if (min(sdt)==0) {
79         rowelim<-as.integer(rownames(dtable)[which(sdt==0)])
80         print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
81         print(rowelim)
82         print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
83                 dtable<-dtable[-which(sdt==0),]
84         }
85         mere<-1
86         for (i in 1:x) {
87                 clnb<-(i*2)
88                 listmere[[clnb]]<-mere
89                 listmere[[clnb+1]]<-mere
90                 list_fille[[mere]] <- c(clnb,clnb+1)
91                 listcol[[clnb]]<-vector()
92                 listcol[[clnb+1]]<-vector()
93                 #extraction du premier facteur de l'afc
94                 print('afc')
95                 pp('taille dtable dans boucle (col/row)',c(ncol(dtable),nrow(dtable)))
96                 afc<-boostana(dtable, nd=1, svd.method = svd.method, libsvdc.path=libsvdc.path)
97                 pp('SV',afc$singular.values)
98                 pp('V.P.', afc$eigen.values)
99                 coordrow <- as.matrix(afc$row.scores[,1])
100                 coordrowori<-coordrow
101                 row.names(coordrow)<-rownames(dtable)
102         coordrow <- cbind(coordrow,1:nrow(dtable))
103                 print('deb recherche meilleur partition')
104         ordert <- as.matrix(coordrow[order(coordrow[,1]),])
105         ordert <- cbind(ordert, 1:nrow(ordert))
106         ordert <- ordert[order(ordert[,2]),]
107
108                 listinter<-vector()
109                 listlim<-vector()
110         dtable <- dtable[order(ordert[,3]),]
111         sc <- colSums(dtable)
112         TT <- sum(sc)
113         sc1 <- dtable[1,]
114         sc2 <- colSums(dtable) - sc1 
115         chitable <- rbind(sc1, sc2)
116         compte <- 1
117         maxinter <- 0
118         rmax <- NULL
119
120         inert <- find.max(dtable, chitable, compte, rmax, maxinter, sc, TT)
121         print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
122                 pp('max inter phase 1', inert$maxinter/TT)#max(listinter))
123                 print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
124         ordert <- ordert[order(ordert[,3]),]
125                 listclasse<-ifelse(coordrowori<=ordert[(inert$rmax),1],clnb,clnb+1)
126             dtable <- dtable[order(ordert[,2]),]
127                 cl<-listclasse
128                 pp('TT',TT)
129                 #dtable<-cbind(dtable,'cl'= as.vector(cl))
130
131                 N1<-length(listclasse[listclasse==clnb])
132                 N2<-length(listclasse[listclasse==clnb+1])
133                 pp('N1',N1)
134                 pp('N2',N2)
135 ###################################################################
136 #                  reclassement des individus                     #
137 ###################################################################
138         if (!mode.patate) {
139                 malcl<-1000000000000
140                 it<-0
141                 listsub<-list()
142                 #in boucle
143             ln <- which(dtable==1, arr.ind=TRUE)
144             lnz <- list()
145             lnz[1:nrow(dtable)] <- 0
146     
147             for (k in 1:nrow(ln)) {lnz[[ln[k,1]]]<-append(lnz[[ln[k,1]]],ln[k,2])}
148             for (k in 1:nrow(dtable)) {lnz[[k]] <- lnz[[k]][-1]}
149                 TT<-sum(dtable)
150     
151                 while (malcl!=0 & N1>=5 & N2>=5) {
152                         it<-it+1
153                         listsub[[it]]<-vector()
154                 txt <- paste('nombre iteration', it)
155                         #pp('nombre iteration',it)
156                         vdelta<-vector()
157                         #dtable[,'cl']<-cl
158                         t1<-dtable[which(cl[,1]==clnb),]#[,-ncol(dtable)]
159                         t2<-dtable[which(cl[,1]==clnb+1),]#[,-ncol(dtable)]
160                         ncolt<-ncol(t1)
161                         #pp('ncolt',ncolt)
162     
163                 if (N1 != 1) {
164                     sc1<-colSums(t1)
165                 } else {
166                     sc1 <- t1
167                 }
168                 if (N2 != 1) {
169                             sc2<-colSums(t2)
170                 } else {
171                     sc2 <- t2
172                 }
173                         
174                 sc<-sc1+sc2
175                         chtableori<-rbind(sc1,sc2)
176                         chtable<-chtableori
177                         interori<-MyChiSq(chtableori,sc,TT)/TT#chisq.test(chtableori)$statistic#/TT
178                         txt <- paste(txt, ' - interori : ',interori)
179                 #pp('interori',interori)
180     
181                         N1<-nrow(t1)
182                         N2<-nrow(t2)
183     
184                         #pp('N1',N1)
185                         #pp('N2',N2)
186                         txt <- paste(txt, 'N1:', N1,'-N2:',N2)
187                 print(txt)
188                 compte <- 0
189                         for (l in lnz){
190                         chi.in<-chtable
191                         compte <- compte + 1
192                                         if(cl[compte]==clnb){
193                                                 chtable[1,l]<-chtable[1,l]-1
194                                                 chtable[2,l]<-chtable[2,l]+1
195                                         }else{
196                                                 chtable[1,l]<-chtable[1,l]+1
197                                                 chtable[2,l]<-chtable[2,l]-1
198                                         }
199                                         interswitch<-MyChiSq(chtable,sc,TT)/TT#chisq.test(chtable)$statistic/TT
200                                         ws<-interori-interswitch
201     
202                                         if (ws<0){
203                                                 interori<-interswitch
204                                                 if(cl[compte]==clnb){
205                                                         #sc1<-chtable[1,]
206                                                         #sc2<-chtable[2,]
207                                                         cl[compte]<-clnb+1
208                                                         listsub[[it]]<-append(listsub[[it]],compte)
209                                                 } else {
210                                                         #sc1<-chtable[1,]
211                                                         #sc2<-chtable[2,]
212                                                         cl[compte]<-clnb
213                                                         listsub[[it]]<-append(listsub[[it]],compte)
214                                                 }
215                                                 vdelta<-append(vdelta,compte)
216                         } else {
217                             chtable<-chi.in
218                                         }
219                    }
220     #                   for (val in vdelta) {
221     #                           if (cl[val]==clnb) {
222     #                                   cl[val]<-clnb+1
223     #                                   listsub[[it]]<-append(listsub[[it]],val)
224     #                                   }else {
225     #                                   cl[val]<-clnb
226     #                                   listsub[[it]]<-append(listsub[[it]],val)
227     #                           }
228     #                   }
229                         print('###################################')
230                         print('longueur < 0')
231                         malcl<-length(vdelta)
232                         if ((it>1)&&(!is.logical(listsub[[it]]))&&(!is.logical(listsub[[it-1]]))){
233                                 if (listsub[[it]]==listsub[[(it-1)]]){
234                                         malcl<-0
235                                 }
236                         }
237                         print(malcl)
238                         print('###################################')
239                 }
240         }
241                 #dtable<-cbind(dtable,'cl'=as.vector(cl))
242         #dtable[,'cl'] <-as.vector(cl)
243 #######################################################################
244 #                 Fin reclassement des individus                      #
245 #######################################################################
246 #               if (!(length(cl[cl==clnb])==1 || length(cl[cl==clnb+1])==1)) {
247                         #t1<-dtable[dtable[,'cl']==clnb,][,-ncol(dtable)]
248                         #t2<-dtable[dtable[,'cl']==clnb+1,][,-ncol(dtable)]
249                     t1<-dtable[which(cl[,1]==clnb),]#[,-ncol(dtable)]
250                         t2<-dtable[which(cl[,1]==clnb+1),]#[,-ncol(dtable)]
251             if (class(t1)=='numeric') {
252                 sc1 <- as.vector(t1)
253                 nrowt1 <- 1
254             } else {
255                 sc1 <- colSums(t1)
256                 nrowt1 <- nrow(t1)
257             }
258             if (class(t2)=='numeric') {
259                 sc2 <- as.vector(t2)
260                 nrowt2 <- 1
261             } else {
262                 sc2 <- colSums(t2)
263                 nrowt2 <- nrow(t2)
264             }
265             chtable<-rbind(sc1,sc2)
266                         inter<-chisq.test(chtable)$statistic/TT
267                         pp('last inter',inter)
268                         print('=====================')
269                         #calcul de la specificite des colonnes
270                         mint<-min(nrowt1,nrowt2)
271                         maxt<-max(nrowt1,nrowt2)
272                         seuil<-round((1.9*(maxt/mint))+1.9,digit=6)
273                         #sink('/home/pierre/workspace/iramuteq/dev/findchi2.txt')
274 #                       print('ATTENTION SEUIL 3,84')
275 #                       seuil<-3.84
276                         pp('seuil',seuil)
277                         sominf<-0
278                         nv<-0
279                         nz<-0
280                         ncclnb<-0
281                         ncclnbp<-0
282             NN1<-0
283             NN2<-0
284             maxchip<-0
285             nbzeroun<-0
286             res1<-0
287             res2<-0
288             nbseuil<-0
289             nbexe<-0
290             nbcontrib<-0
291                         cn<-colnames(dtable)
292             #another try#########################################
293             one <- cbind(sc1,sc2)
294             cols <- c(length(which(cl==clnb)), length(which(cl==clnb+1))) 
295             print(cols)
296             colss <- matrix(rep(cols,ncol(dtable)), ncol=2, byrow=TRUE)
297             zero <- colss - one
298             rows <- cbind(rowSums(zero), rowSums(one))
299             n <- sum(cols)
300             for (m in 1:nrow(rows)) {
301                 obs <- t(matrix(c(zero[m,],one[m,]),2,2))
302                 E <- outer(rows[m,],cols,'*')/n
303                 if ((min(obs[2,])==0) & (min(obs[1,])!=0)) {
304                     chi <- seuil + 1
305                 } else if ((min(obs[1,])==0) & (min(obs[2,])!=0)) {
306                     chi <- seuil - 1
307                 } else if (any(obs < 10)) {
308                     chi <- sum((abs(obs - E) - 0.5)^2 / E)
309                 } else {
310                     chi <- sum(((obs - E)^2)/E)
311                 }
312                 if (is.na(chi)) {
313                     chi <- 0
314                 }
315                 if (chi > seuil) {
316                     if (obs[2,1] < E[2,1]) {
317                         listcol[[clnb]]<-append(listcol[[clnb]],cn[m])
318                         ncclnb<-ncclnb+1
319                     } else if (obs[2,2] < E[2,2]) {
320                                                 listcol[[clnb+1]]<-append(listcol[[clnb+1]],cn[m])
321                                                 ncclnbp<-ncclnbp+1
322                     }
323                 }
324             }
325             ######################################################
326                         print('resultats elim item')
327                         pp(clnb+1,length(listcol[[clnb+1]]))
328                         pp(clnb,length(listcol[[clnb]]))
329                         pp('ncclnb',ncclnb)
330                         pp('ncclnbp',ncclnbp)
331                         listrownamedtable<-rownames(dtable)
332                         listrownamedtable<-as.integer(listrownamedtable)
333                         newcol<-vector(length=nrow(dataori))
334                         #remplissage de la nouvelle colonne avec les nouvelles classes
335                         print('remplissage')
336 #                       num<-0
337             newcol[listrownamedtable] <- cl[,1]
338                         #recuperation de la classe precedante pour les cases vides
339                         print('recuperation classes precedentes')
340             if (i!=1) {
341                 newcol[which(newcol==0)] <- dout[,ncol(dout)][which(newcol==0)]
342             }
343             if(!is.null(rowelim)) {
344                 newcol[rowelim] <- 0
345             }
346                         tailleclasse<-as.matrix(summary(as.factor(as.character(newcol))))
347                         print('tailleclasse')
348                         print(tailleclasse)
349                         tailleclasse<-as.matrix(tailleclasse[!(rownames(tailleclasse)==0),])
350                         plusgrand<-which.max(tailleclasse)
351                         #???????????????????????????????????
352                         #Si 2 classes ont des effectifs egaux, on prend la premiere de la liste...
353                         if (length(plusgrand)>1) {
354                                 plusgrand<-plusgrand[1]
355                         }
356                         #????????????????????????????????????
357                         
358                         #constuction du prochain tableau a analyser
359                         print('construction tableau suivant')
360             dout<-cbind(dout,newcol)
361                         classe<-as.integer(rownames(tailleclasse)[plusgrand])
362                         dtable<-dataori[which(newcol==classe),]
363                         row.names(dtable)<-rownames(dataori)[which(newcol==classe)]
364             colnames(dtable) <- 1:ncol(dtable)
365                         mere<-classe
366                         listcolelim<-listcol[[as.integer(classe)]]
367                         mother<-listmere[[as.integer(classe)]]
368                         while (mother!=1) {
369                                 listcolelim<-append(listcolelim,listcol[[mother]])
370                                 mother<-listmere[[mother]]
371                         }
372                         
373                         listcolelim<-sort(unique(listcolelim))
374                         pp('avant',ncol(dtable))
375                         if (!is.logical(listcolelim)){
376                                 print('elimination colonne')
377                                 #dtable<-dtable[,-listcolelim]
378                 dtable<-dtable[,!(colnames(dtable) %in% listcolelim)]
379                         }
380                         pp('apres',ncol(dtable))
381                         #elimination des colonnes ne contenant que des 0
382                         print('vire colonne inf 3 dans boucle')
383                         sdt<-colSums(dtable)
384                         if (min(sdt)<=3)
385                                 dtable<-dtable[,-which(sdt<=3)]
386         
387                         #elimination des lignes ne contenant que des 0
388                         print('vire ligne vide dans boucle')
389                         if (ncol(dtable)==1) {
390                                 sdt<-dtable[,1]
391                         } else {
392                                 sdt<-rowSums(dtable)
393                         }
394                         if (min(sdt)==0) {
395                                 rowelim<-as.integer(rownames(dtable)[which(sdt==0)])
396                                 print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
397                                 print(rowelim)
398                                 print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
399                                 dtable<-dtable[-which(sdt==0),]
400                         }
401 #               }
402         }
403 #       sink()
404         res <- list(n1 = dout, list_mere = listmere, list_fille = list_fille)
405         res
406 }