974e90107aa8ee1404e760cedb5835a9738cee47
[iramuteq] / Rscripts / CHD.R
1 #Author: Pierre Ratinaud
2 #Copyright (c) 2008-2011 Pierre Ratinaud
3 #Lisense: GNU/GPL
4
5 pp<-function(txt,val) {
6         d<-paste(txt,' : ')
7         print(paste(d,val))
8 }
9 MyChiSq<-function(x,sc,n){
10         sr<-rowSums(x)
11         E <- outer(sr, sc, "*")/n
12         STAT<-sum(((x - E)^2)/E)
13         STAT
14 }
15
16 MySpeedChi <- function(x,sc) {
17     sr <-rowSums(x)
18     E <- outer(sr, sc, "*")
19         STAT<-sum((x - E)^2/E)
20         STAT
21 }
22
23 find.max <- function(dtable, chitable, compte, rmax, maxinter, sc, TT) {
24     ln <- which(dtable==1, arr.ind=TRUE)
25     lo <- list()
26     lo[1:nrow(dtable)] <- 0
27     for (k in 1:nrow(ln)) {lo[[ln[k,1]]]<-append(lo[[ln[k,1]]],ln[k,2])}
28     for (k in 1:nrow(dtable)) {lo[[k]] <- lo[[k]][-1]}
29     lo<-lo[-c(1,length(lo))]
30     for (l in lo) {
31         compte <- compte + 1 
32         chitable[1,l]<-chitable[1,l]+1
33         chitable[2,l]<-chitable[2,l]-1
34         chi<-MyChiSq(chitable,sc,TT)
35         if (chi>maxinter) {
36             maxinter<-chi
37             rmax<-compte
38         }   
39     }
40     res <- list(maxinter=maxinter, rmax=rmax)
41     res
42 }  
43
44
45
46
47
48 CHD<-function(data.in, x=9, mode.patate = FALSE, svd.method, libsvdc.path=NULL){
49 #       sink('/home/pierre/workspace/iramuteq/dev/findchi2.txt')
50         dataori <- data.in
51     row.names(dataori) <- rownames(data.in)
52         dtable <- data.in
53         colnames(dtable) <- 1:ncol(dtable)
54     dout <- NULL
55         rowelim<-NULL
56         pp('ncol entree : ',ncol(dtable))
57         pp('nrow entree',nrow(dtable))
58         listcol <- list()
59         listmere <- list()
60         list_fille <- list()
61         print('vire colonnes vides en entree')#FIXME : il ne doit pas y avoir de colonnes vides en entree !!
62         sdt<-colSums(dtable)
63         if (min(sdt)==0)
64                 dtable<-dtable[,-which(sdt==0)]
65     print('vire lignes vides en entree')
66     sdt<-rowSums(dtable)
67         if (min(sdt)==0) {
68         rowelim<-as.integer(rownames(dtable)[which(sdt==0)])
69         print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
70         print(rowelim)
71         print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
72                 dtable<-dtable[-which(sdt==0),]
73         }
74         mere<-1
75         for (i in 1:x) {
76                 clnb<-(i*2)
77                 listmere[[clnb]]<-mere
78                 listmere[[clnb+1]]<-mere
79                 list_fille[[mere]] <- c(clnb,clnb+1)
80                 listcol[[clnb]]<-vector()
81                 listcol[[clnb+1]]<-vector()
82                 #extraction du premier facteur de l'afc
83                 print('afc')
84                 pp('taille dtable dans boucle (col/row)',c(ncol(dtable),nrow(dtable)))
85                 afc<-boostana(dtable, nd=1, svd.method = svd.method, libsvdc.path=libsvdc.path)
86                 pp('SV',afc$singular.values)
87                 pp('V.P.', afc$eigen.values)
88                 coordrow <- as.matrix(afc$row.scores[,1])
89                 coordrowori<-coordrow
90                 row.names(coordrow)<-rownames(dtable)
91         coordrow <- cbind(coordrow,1:nrow(dtable))
92                 print('deb recherche meilleur partition')
93         ordert <- as.matrix(coordrow[order(coordrow[,1]),])
94         ordert <- cbind(ordert, 1:nrow(ordert))
95         ordert <- ordert[order(ordert[,2]),]
96
97                 listinter<-vector()
98                 listlim<-vector()
99         dtable <- dtable[order(ordert[,3]),]
100         sc <- colSums(dtable)
101         TT <- sum(sc)
102         sc1 <- dtable[1,]
103         sc2 <- colSums(dtable) - sc1 
104         chitable <- rbind(sc1, sc2)
105         compte <- 1
106         maxinter <- 0
107         rmax <- NULL
108
109         inert <- find.max(dtable, chitable, compte, rmax, maxinter, sc, TT)
110         print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
111                 pp('max inter phase 1', inert$maxinter/TT)#max(listinter))
112                 print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
113         ordert <- ordert[order(ordert[,3]),]
114                 listclasse<-ifelse(coordrowori<=ordert[(inert$rmax),1],clnb,clnb+1)
115             dtable <- dtable[order(ordert[,2]),]
116                 cl<-listclasse
117                 pp('TT',TT)
118                 #dtable<-cbind(dtable,'cl'= as.vector(cl))
119
120                 N1<-length(listclasse[listclasse==clnb])
121                 N2<-length(listclasse[listclasse==clnb+1])
122                 pp('N1',N1)
123                 pp('N2',N2)
124 ###################################################################
125 #                  reclassement des individus                     #
126 ###################################################################
127         if (!mode.patate) {
128                 malcl<-1000000000000
129                 it<-0
130                 listsub<-list()
131                 #in boucle
132             ln <- which(dtable==1, arr.ind=TRUE)
133             lnz <- list()
134             lnz[1:nrow(dtable)] <- 0
135     
136             for (k in 1:nrow(ln)) {lnz[[ln[k,1]]]<-append(lnz[[ln[k,1]]],ln[k,2])}
137             for (k in 1:nrow(dtable)) {lnz[[k]] <- lnz[[k]][-1]}
138                 TT<-sum(dtable)
139     
140                 while (malcl!=0 & N1>=5 & N2>=5) {
141                         it<-it+1
142                         listsub[[it]]<-vector()
143                 txt <- paste('nombre iteration', it)
144                         #pp('nombre iteration',it)
145                         vdelta<-vector()
146                         #dtable[,'cl']<-cl
147                         t1<-dtable[which(cl[,1]==clnb),]#[,-ncol(dtable)]
148                         t2<-dtable[which(cl[,1]==clnb+1),]#[,-ncol(dtable)]
149                         ncolt<-ncol(t1)
150                         #pp('ncolt',ncolt)
151     
152                 if (N1 != 1) {
153                     sc1<-colSums(t1)
154                 } else {
155                     sc1 <- t1
156                 }
157                 if (N2 != 1) {
158                             sc2<-colSums(t2)
159                 } else {
160                     sc2 <- t2
161                 }
162                         
163                 sc<-sc1+sc2
164                         chtableori<-rbind(sc1,sc2)
165                         chtable<-chtableori
166                         interori<-MyChiSq(chtableori,sc,TT)/TT#chisq.test(chtableori)$statistic#/TT
167                         txt <- paste(txt, ' - interori : ',interori)
168                 #pp('interori',interori)
169     
170                         N1<-nrow(t1)
171                         N2<-nrow(t2)
172     
173                         #pp('N1',N1)
174                         #pp('N2',N2)
175                         txt <- paste(txt, 'N1:', N1,'-N2:',N2)
176                 print(txt)
177                 compte <- 0
178                         for (l in lnz){
179                         chi.in<-chtable
180                         compte <- compte + 1
181                                         if(cl[compte]==clnb){
182                                                 chtable[1,l]<-chtable[1,l]-1
183                                                 chtable[2,l]<-chtable[2,l]+1
184                                         }else{
185                                                 chtable[1,l]<-chtable[1,l]+1
186                                                 chtable[2,l]<-chtable[2,l]-1
187                                         }
188                                         interswitch<-MyChiSq(chtable,sc,TT)/TT#chisq.test(chtable)$statistic/TT
189                                         ws<-interori-interswitch
190     
191                                         if (ws<0){
192                                                 interori<-interswitch
193                                                 if(cl[compte]==clnb){
194                                                         #sc1<-chtable[1,]
195                                                         #sc2<-chtable[2,]
196                                                         cl[compte]<-clnb+1
197                                                         listsub[[it]]<-append(listsub[[it]],compte)
198                                                 } else {
199                                                         #sc1<-chtable[1,]
200                                                         #sc2<-chtable[2,]
201                                                         cl[compte]<-clnb
202                                                         listsub[[it]]<-append(listsub[[it]],compte)
203                                                 }
204                                                 vdelta<-append(vdelta,compte)
205                         } else {
206                             chtable<-chi.in
207                                         }
208                    }
209     #                   for (val in vdelta) {
210     #                           if (cl[val]==clnb) {
211     #                                   cl[val]<-clnb+1
212     #                                   listsub[[it]]<-append(listsub[[it]],val)
213     #                                   }else {
214     #                                   cl[val]<-clnb
215     #                                   listsub[[it]]<-append(listsub[[it]],val)
216     #                           }
217     #                   }
218                         print('###################################')
219                         print('longueur < 0')
220                         malcl<-length(vdelta)
221                         if ((it>1)&&(!is.logical(listsub[[it]]))&&(!is.logical(listsub[[it-1]]))){
222                                 if (listsub[[it]]==listsub[[(it-1)]]){
223                                         malcl<-0
224                                 }
225                         }
226                         print(malcl)
227                         print('###################################')
228                 }
229         }
230                 #dtable<-cbind(dtable,'cl'=as.vector(cl))
231         #dtable[,'cl'] <-as.vector(cl)
232 #######################################################################
233 #                 Fin reclassement des individus                      #
234 #######################################################################
235 #               if (!(length(cl[cl==clnb])==1 || length(cl[cl==clnb+1])==1)) {
236                         #t1<-dtable[dtable[,'cl']==clnb,][,-ncol(dtable)]
237                         #t2<-dtable[dtable[,'cl']==clnb+1,][,-ncol(dtable)]
238                     t1<-dtable[which(cl[,1]==clnb),]#[,-ncol(dtable)]
239                         t2<-dtable[which(cl[,1]==clnb+1),]#[,-ncol(dtable)]
240             if (class(t1)=='numeric') {
241                 sc1 <- as.vector(t1)
242                 nrowt1 <- 1
243             } else {
244                 sc1 <- colSums(t1)
245                 nrowt1 <- nrow(t1)
246             }
247             if (class(t2)=='numeric') {
248                 sc2 <- as.vector(t2)
249                 nrowt2 <- 1
250             } else {
251                 sc2 <- colSums(t2)
252                 nrowt2 <- nrow(t2)
253             }
254             chtable<-rbind(sc1,sc2)
255                         inter<-chisq.test(chtable)$statistic/TT
256                         pp('last inter',inter)
257                         print('=====================')
258                         #calcul de la specificite des colonnes
259                         mint<-min(nrowt1,nrowt2)
260                         maxt<-max(nrowt1,nrowt2)
261                         seuil<-round((1.9*(maxt/mint))+1.9,digit=6)
262                         #sink('/home/pierre/workspace/iramuteq/dev/findchi2.txt')
263 #                       print('ATTENTION SEUIL 3,84')
264 #                       seuil<-3.84
265                         pp('seuil',seuil)
266                         sominf<-0
267                         nv<-0
268                         nz<-0
269                         ncclnb<-0
270                         ncclnbp<-0
271             NN1<-0
272             NN2<-0
273             maxchip<-0
274             nbzeroun<-0
275             res1<-0
276             res2<-0
277             nbseuil<-0
278             nbexe<-0
279             nbcontrib<-0
280                         cn<-colnames(dtable)
281             #another try#########################################
282             one <- cbind(sc1,sc2)
283             cols <- c(length(which(cl==clnb)), length(which(cl==clnb+1))) 
284             print(cols)
285             colss <- matrix(rep(cols,ncol(dtable)), ncol=2, byrow=TRUE)
286             zero <- colss - one
287             rows <- cbind(rowSums(zero), rowSums(one))
288             n <- sum(cols)
289             for (m in 1:nrow(rows)) {
290                 obs <- t(matrix(c(zero[m,],one[m,]),2,2))
291                 E <- outer(rows[m,],cols,'*')/n
292                 if ((min(obs[2,])==0) & (min(obs[1,])!=0)) {
293                     chi <- seuil + 1
294                 } else if ((min(obs[1,])==0) & (min(obs[2,])!=0)) {
295                     chi <- seuil - 1
296                 } else if (any(obs < 10)) {
297                     chi <- sum((abs(obs - E) - 0.5)^2 / E)
298                 } else {
299                     chi <- sum(((obs - E)^2)/E)
300                 }
301                 if (is.na(chi)) {
302                     chi <- 0
303                 }
304                 if (chi > seuil) {
305                     if (obs[2,1] < E[2,1]) {
306                         listcol[[clnb]]<-append(listcol[[clnb]],cn[m])
307                         ncclnb<-ncclnb+1
308                     } else if (obs[2,2] < E[2,2]) {
309                                                 listcol[[clnb+1]]<-append(listcol[[clnb+1]],cn[m])
310                                                 ncclnbp<-ncclnbp+1
311                     }
312                 }
313             }
314             ######################################################
315                         print('resultats elim item')
316                         pp(clnb+1,length(listcol[[clnb+1]]))
317                         pp(clnb,length(listcol[[clnb]]))
318                         pp('ncclnb',ncclnb)
319                         pp('ncclnbp',ncclnbp)
320                         listrownamedtable<-rownames(dtable)
321                         listrownamedtable<-as.integer(listrownamedtable)
322                         newcol<-vector(length=nrow(dataori))
323                         #remplissage de la nouvelle colonne avec les nouvelles classes
324                         print('remplissage')
325 #                       num<-0
326             newcol[listrownamedtable] <- cl[,1]
327                         #recuperation de la classe precedante pour les cases vides
328                         print('recuperation classes precedentes')
329             if (i!=1) {
330                 newcol[which(newcol==0)] <- dout[,ncol(dout)][which(newcol==0)]
331             }
332             if(!is.null(rowelim)) {
333                 newcol[rowelim] <- 0
334             }
335                         tailleclasse<-as.matrix(summary(as.factor(as.character(newcol))))
336                         print('tailleclasse')
337                         print(tailleclasse)
338                         tailleclasse<-as.matrix(tailleclasse[!(rownames(tailleclasse)==0),])
339                         plusgrand<-which.max(tailleclasse)
340                         #???????????????????????????????????
341                         #Si 2 classes ont des effectifs egaux, on prend la premiere de la liste...
342                         if (length(plusgrand)>1) {
343                                 plusgrand<-plusgrand[1]
344                         }
345                         #????????????????????????????????????
346                         
347                         #constuction du prochain tableau a analyser
348                         print('construction tableau suivant')
349             dout<-cbind(dout,newcol)
350                         classe<-as.integer(rownames(tailleclasse)[plusgrand])
351                         dtable<-dataori[which(newcol==classe),]
352                         row.names(dtable)<-rownames(dataori)[which(newcol==classe)]
353             colnames(dtable) <- 1:ncol(dtable)
354                         mere<-classe
355                         listcolelim<-listcol[[as.integer(classe)]]
356                         mother<-listmere[[as.integer(classe)]]
357                         while (mother!=1) {
358                                 listcolelim<-append(listcolelim,listcol[[mother]])
359                                 mother<-listmere[[mother]]
360                         }
361                         
362                         listcolelim<-sort(unique(listcolelim))
363                         pp('avant',ncol(dtable))
364                         if (!is.logical(listcolelim)){
365                                 print('elimination colonne')
366                                 #dtable<-dtable[,-listcolelim]
367                 dtable<-dtable[,!(colnames(dtable) %in% listcolelim)]
368                         }
369                         pp('apres',ncol(dtable))
370                         #elimination des colonnes ne contenant que des 0
371                         print('vire colonne inf 3 dans boucle')
372                         sdt<-colSums(dtable)
373                         if (min(sdt)<=3)
374                                 dtable<-dtable[,-which(sdt<=3)]
375         
376                         #elimination des lignes ne contenant que des 0
377                         print('vire ligne vide dans boucle')
378                         if (ncol(dtable)==1) {
379                                 sdt<-dtable[,1]
380                         } else {
381                                 sdt<-rowSums(dtable)
382                         }
383                         if (min(sdt)==0) {
384                                 rowelim<-as.integer(rownames(dtable)[which(sdt==0)])
385                                 print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
386                                 print(rowelim)
387                                 print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
388                                 dtable<-dtable[-which(sdt==0),]
389                         }
390 #               }
391         }
392 #       sink()
393         res <- list(n1 = dout, list_mere = listmere, list_fille = list_fille)
394         res
395 }