first import
[iramuteq] / Rscripts / CHD.R
1 #Author: Pierre Ratinaud
2 #Copyright (c) 2008-2011 Pierre Ratinaud
3 #Lisense: GNU/GPL
4
5 pp<-function(txt,val) {
6         d<-paste(txt,' : ')
7         print(paste(d,val))
8 }
9 MyChiSq<-function(x,sc,n){
10         sr<-rowSums(x)
11         E <- outer(sr, sc, "*")/n
12         STAT<-sum(((x - E)^2)/E)
13         STAT
14 }
15
16 MySpeedChi <- function(x,sc) {
17     sr <-rowSums(x)
18     E <- outer(sr, sc, "*")
19         STAT<-sum((x - E)^2/E)
20         STAT
21 }
22
23 find.max <- function(dtable, chitable, compte, rmax, maxinter, sc, TT) {
24     ln <- which(dtable==1, arr.ind=TRUE)
25     lo <- list()
26     lo[1:nrow(dtable)] <- 0
27     for (k in 1:nrow(ln)) {lo[[ln[k,1]]]<-append(lo[[ln[k,1]]],ln[k,2])}
28     for (k in 1:nrow(dtable)) {lo[[k]] <- lo[[k]][-1]}
29     lo<-lo[-c(1,length(lo))]
30     for (l in lo) {
31         compte <- compte + 1 
32         chitable[1,l]<-chitable[1,l]+1
33         chitable[2,l]<-chitable[2,l]-1
34         chi<-MyChiSq(chitable,sc,TT)
35         if (chi>maxinter) {
36             maxinter<-chi
37             rmax<-compte
38         }   
39     }
40     res <- list(maxinter=maxinter, rmax=rmax)
41     res
42 }  
43
44 CHD<-function(data.in, x=9, libsvdc=FALSE, libsvdc.path=NULL){
45 #       sink('/home/pierre/workspace/iramuteq/dev/findchi2.txt')
46         dataori <- data.in
47     row.names(dataori) <- rownames(data.in)
48         dtable <- data.in
49         colnames(dtable) <- 1:ncol(dtable)
50     dout <- NULL
51         rowelim<-NULL
52         pp('ncol entree : ',ncol(dtable))
53         pp('nrow entree',nrow(dtable))
54         listcol <- list()
55         listmere <- list()
56         list_fille <- list()
57         print('vire colonnes vides en entree')#FIXME : il ne doit pas y avoir de colonnes vides en entree !!
58         sdt<-colSums(dtable)
59         if (min(sdt)==0)
60                 dtable<-dtable[,-which(sdt==0)]
61     print('vire lignes vides en entree')
62     sdt<-rowSums(dtable)
63         if (min(sdt)==0) {
64         rowelim<-as.integer(rownames(dtable)[which(sdt==0)])
65         print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
66         print(rowelim)
67         print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
68                 dtable<-dtable[-which(sdt==0),]
69         }
70         mere<-1
71         for (i in 1:x) {
72                 clnb<-(i*2)
73                 listmere[[clnb]]<-mere
74                 listmere[[clnb+1]]<-mere
75                 list_fille[[mere]] <- c(clnb,clnb+1)
76                 listcol[[clnb]]<-vector()
77                 listcol[[clnb+1]]<-vector()
78                 #extraction du premier facteur de l'afc
79                 print('afc')
80                 pp('taille dtable dans boucle (col/row)',c(ncol(dtable),nrow(dtable)))
81                 afc<-boostana(dtable, nd=1, libsvdc=libsvdc, libsvdc.path=libsvdc.path)
82                 pp('SV',afc$singular.values)
83                 pp('V.P.', afc$eigen.values)
84                 coordrow <- as.matrix(afc$row.scores[,1])
85                 coordrowori<-coordrow
86                 row.names(coordrow)<-rownames(dtable)
87         coordrow <- cbind(coordrow,1:nrow(dtable))
88                 print('deb recherche meilleur partition')
89         ordert <- as.matrix(coordrow[order(coordrow[,1]),])
90         ordert <- cbind(ordert, 1:nrow(ordert))
91         ordert <- ordert[order(ordert[,2]),]
92
93                 listinter<-vector()
94                 listlim<-vector()
95         dtable <- dtable[order(ordert[,3]),]
96         sc <- colSums(dtable)
97         TT <- sum(sc)
98         sc1 <- dtable[1,]
99         sc2 <- colSums(dtable) - sc1 
100         chitable <- rbind(sc1, sc2)
101         compte <- 1
102         maxinter <- 0
103         rmax <- NULL
104
105         inert <- find.max(dtable, chitable, compte, rmax, maxinter, sc, TT)
106                 print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
107                 pp('max inter phase 1', inert$maxinter/TT)#max(listinter))
108                 print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
109         ordert <- ordert[order(ordert[,3]),]
110                 listclasse<-ifelse(coordrowori<=ordert[(inert$rmax),1],clnb,clnb+1)
111             dtable <- dtable[order(ordert[,2]),]
112                 cl<-listclasse
113                 pp('TT',TT)
114                 #dtable<-cbind(dtable,'cl'= as.vector(cl))
115
116                 N1<-length(listclasse[listclasse==clnb])
117                 N2<-length(listclasse[listclasse==clnb+1])
118                 pp('N1',N1)
119                 pp('N2',N2)
120 ###################################################################
121 #                  reclassement des individus                     #
122 ###################################################################
123                 malcl<-1000000000000
124                 it<-0
125                 listsub<-list()
126                 #in boucle
127         ln <- which(dtable==1, arr.ind=TRUE)
128         lnz <- list()
129         lnz[1:nrow(dtable)] <- 0
130
131         for (k in 1:nrow(ln)) {lnz[[ln[k,1]]]<-append(lnz[[ln[k,1]]],ln[k,2])}
132         for (k in 1:nrow(dtable)) {lnz[[k]] <- lnz[[k]][-1]}
133                 TT<-sum(dtable)
134
135                 while (malcl!=0 & N1>=5 & N2>=5) {
136                         it<-it+1
137                         listsub[[it]]<-vector()
138             txt <- paste('nombre iteration', it)
139                         #pp('nombre iteration',it)
140                         vdelta<-vector()
141                         #dtable[,'cl']<-cl
142                         t1<-dtable[which(cl[,1]==clnb),]#[,-ncol(dtable)]
143                         t2<-dtable[which(cl[,1]==clnb+1),]#[,-ncol(dtable)]
144                         ncolt<-ncol(t1)
145                         #pp('ncolt',ncolt)
146
147             if (N1 != 1) {
148                 sc1<-colSums(t1)
149             } else {
150                 sc1 <- t1
151             }
152             if (N2 != 1) {
153                             sc2<-colSums(t2)
154             } else {
155                 sc2 <- t2
156             }
157                         
158             sc<-sc1+sc2
159                         chtableori<-rbind(sc1,sc2)
160                         chtable<-chtableori
161                         interori<-MyChiSq(chtableori,sc,TT)/TT#chisq.test(chtableori)$statistic#/TT
162                         txt <- paste(txt, ' - interori : ',interori)
163             #pp('interori',interori)
164
165                         N1<-nrow(t1)
166                         N2<-nrow(t2)
167
168                         #pp('N1',N1)
169                         #pp('N2',N2)
170                         txt <- paste(txt, 'N1:', N1,'-N2:',N2)
171             print(txt)
172             compte <- 0
173                         for (l in lnz){
174                     chi.in<-chtable
175                     compte <- compte + 1
176                                         if(cl[compte]==clnb){
177                                                 chtable[1,l]<-chtable[1,l]-1
178                                                 chtable[2,l]<-chtable[2,l]+1
179                                         }else{
180                                                 chtable[1,l]<-chtable[1,l]+1
181                                                 chtable[2,l]<-chtable[2,l]-1
182                                         }
183                                         interswitch<-MyChiSq(chtable,sc,TT)/TT#chisq.test(chtable)$statistic/TT
184                                         ws<-interori-interswitch
185
186                                         if (ws<0){
187                                                 interori<-interswitch
188                                                 if(cl[compte]==clnb){
189                                                         #sc1<-chtable[1,]
190                                                         #sc2<-chtable[2,]
191                                                         cl[compte]<-clnb+1
192                                                         listsub[[it]]<-append(listsub[[it]],compte)
193                                                 } else {
194                                                         #sc1<-chtable[1,]
195                                                         #sc2<-chtable[2,]
196                                                         cl[compte]<-clnb
197                                                         listsub[[it]]<-append(listsub[[it]],compte)
198                                                 }
199                                                 vdelta<-append(vdelta,compte)
200                     } else {
201                         chtable<-chi.in
202                                         }
203                }
204 #                       for (val in vdelta) {
205 #                               if (cl[val]==clnb) {
206 #                                       cl[val]<-clnb+1
207 #                                       listsub[[it]]<-append(listsub[[it]],val)
208 #                                       }else {
209 #                                       cl[val]<-clnb
210 #                                       listsub[[it]]<-append(listsub[[it]],val)
211 #                               }
212 #                       }
213                         print('###################################')
214                         print('longueur < 0')
215                         malcl<-length(vdelta)
216                         if ((it>1)&&(!is.logical(listsub[[it]]))&&(!is.logical(listsub[[it-1]]))){
217                                 if (listsub[[it]]==listsub[[(it-1)]]){
218                                         malcl<-0
219                                 }
220                         }
221                         print(malcl)
222                         print('###################################')
223                 }
224                 #dtable<-cbind(dtable,'cl'=as.vector(cl))
225         #dtable[,'cl'] <-as.vector(cl)
226 #######################################################################
227 #                 Fin reclassement des individus                      #
228 #######################################################################
229 #               if (!(length(cl[cl==clnb])==1 || length(cl[cl==clnb+1])==1)) {
230                         #t1<-dtable[dtable[,'cl']==clnb,][,-ncol(dtable)]
231                         #t2<-dtable[dtable[,'cl']==clnb+1,][,-ncol(dtable)]
232                     t1<-dtable[which(cl[,1]==clnb),]#[,-ncol(dtable)]
233                         t2<-dtable[which(cl[,1]==clnb+1),]#[,-ncol(dtable)]
234             if (class(t1)=='numeric') {
235                 sc1 <- as.vector(t1)
236                 nrowt1 <- 1
237             } else {
238                 sc1 <- colSums(t1)
239                 nrowt1 <- nrow(t1)
240             }
241             if (class(t2)=='numeric') {
242                 sc2 <- as.vector(t2)
243                 nrowt2 <- 1
244             } else {
245                 sc2 <- colSums(t2)
246                 nrowt2 <- nrow(t2)
247             }
248             chtable<-rbind(sc1,sc2)
249                         inter<-chisq.test(chtable)$statistic/TT
250                         pp('last inter',inter)
251                         print('=====================')
252                         #calcul de la specificite des colonnes
253                         mint<-min(nrowt1,nrowt2)
254                         maxt<-max(nrowt1,nrowt2)
255                         seuil<-round((1.9*(maxt/mint))+1.9,digit=6)
256                         #sink('/home/pierre/workspace/iramuteq/dev/findchi2.txt')
257 #                       print('ATTENTION SEUIL 3,84')
258 #                       seuil<-3.84
259                         pp('seuil',seuil)
260                         sominf<-0
261                         nv<-0
262                         nz<-0
263                         ncclnb<-0
264                         ncclnbp<-0
265             NN1<-0
266             NN2<-0
267             maxchip<-0
268             nbzeroun<-0
269             res1<-0
270             res2<-0
271             nbseuil<-0
272             nbexe<-0
273             nbcontrib<-0
274                         cn<-colnames(dtable)
275             #another try#########################################
276             one <- cbind(sc1,sc2)
277             cols <- c(length(which(cl==clnb)), length(which(cl==clnb+1))) 
278             print(cols)
279             colss <- matrix(rep(cols,ncol(dtable)), ncol=2, byrow=TRUE)
280             zero <- colss - one
281             rows <- cbind(rowSums(zero), rowSums(one))
282             n <- sum(cols)
283             for (m in 1:nrow(rows)) {
284                 obs <- t(matrix(c(zero[m,],one[m,]),2,2))
285                 E <- outer(rows[m,],cols,'*')/n
286                 if ((min(obs[2,])==0) & (min(obs[1,])!=0)) {
287                     chi <- seuil + 1
288                 } else if ((min(obs[1,])==0) & (min(obs[2,])!=0)) {
289                     chi <- seuil - 1
290                 } else if (any(obs < 10)) {
291                     chi <- sum((abs(obs - E) - 0.5)^2 / E)
292                 } else {
293                     chi <- sum(((obs - E)^2)/E)
294                 }
295                 if (is.na(chi)) {
296                     chi <- 0
297                 }
298                 if (chi > seuil) {
299                     if (obs[2,1] < E[2,1]) {
300                         listcol[[clnb]]<-append(listcol[[clnb]],cn[m])
301                         ncclnb<-ncclnb+1
302                     } else if (obs[2,2] < E[2,2]) {
303                                                 listcol[[clnb+1]]<-append(listcol[[clnb+1]],cn[m])
304                                                 ncclnbp<-ncclnbp+1
305                     }
306                 }
307             }
308             ######################################################
309                         print('resultats elim item')
310                         pp(clnb+1,length(listcol[[clnb+1]]))
311                         pp(clnb,length(listcol[[clnb]]))
312                         pp('ncclnb',ncclnb)
313                         pp('ncclnbp',ncclnbp)
314                         listrownamedtable<-rownames(dtable)
315                         listrownamedtable<-as.integer(listrownamedtable)
316                         newcol<-vector(length=nrow(dataori))
317                         #remplissage de la nouvelle colonne avec les nouvelles classes
318                         print('remplissage')
319 #                       num<-0
320             newcol[listrownamedtable] <- cl[,1]
321                         #recuperation de la classe precedante pour les cases vides
322                         print('recuperation classes precedentes')
323             if (i!=1) {
324                 newcol[which(newcol==0)] <- dout[,ncol(dout)][which(newcol==0)]
325             }
326             if(!is.null(rowelim)) {
327                 newcol[rowelim] <- 0
328             }
329                         tailleclasse<-as.matrix(summary(as.factor(as.character(newcol))))
330                         print('tailleclasse')
331                         print(tailleclasse)
332                         tailleclasse<-as.matrix(tailleclasse[!(rownames(tailleclasse)==0),])
333                         plusgrand<-which.max(tailleclasse)
334                         #???????????????????????????????????
335                         #Si 2 classes ont des effectifs egaux, on prend la premiere de la liste...
336                         if (length(plusgrand)>1) {
337                                 plusgrand<-plusgrand[1]
338                         }
339                         #????????????????????????????????????
340                         
341                         #constuction du prochain tableau a analyser
342                         print('construction tableau suivant')
343             dout<-cbind(dout,newcol)
344                         classe<-as.integer(rownames(tailleclasse)[plusgrand])
345                         dtable<-dataori[which(newcol==classe),]
346                         row.names(dtable)<-rownames(dataori)[which(newcol==classe)]
347             colnames(dtable) <- 1:ncol(dtable)
348                         mere<-classe
349                         listcolelim<-listcol[[as.integer(classe)]]
350                         mother<-listmere[[as.integer(classe)]]
351                         while (mother!=1) {
352                                 listcolelim<-append(listcolelim,listcol[[mother]])
353                                 mother<-listmere[[mother]]
354                         }
355                         
356                         listcolelim<-sort(unique(listcolelim))
357                         pp('avant',ncol(dtable))
358                         if (!is.logical(listcolelim)){
359                                 print('elimination colonne')
360                                 #dtable<-dtable[,-listcolelim]
361                 dtable<-dtable[,!(colnames(dtable) %in% listcolelim)]
362                         }
363                         pp('apres',ncol(dtable))
364                         #elimination des colonnes ne contenant que des 0
365                         print('vire colonne inf 3 dans boucle')
366                         sdt<-colSums(dtable)
367                         if (min(sdt)<=3)
368                                 dtable<-dtable[,-which(sdt<=3)]
369         
370                         #elimination des lignes ne contenant que des 0
371                         print('vire ligne vide dans boucle')
372                         if (ncol(dtable)==1) {
373                                 sdt<-dtable[,1]
374                         } else {
375                                 sdt<-rowSums(dtable)
376                         }
377                         if (min(sdt)==0) {
378                                 rowelim<-as.integer(rownames(dtable)[which(sdt==0)])
379                                 print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
380                                 print(rowelim)
381                                 print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
382                                 dtable<-dtable[-which(sdt==0),]
383                         }
384 #               }
385         }
386 #       sink()
387         res <- list(n1 = dout, list_mere = listmere, list_fille = list_fille)
388         res
389 }