...
[iramuteq] / Rscripts / CHDPOND.R
1 #Author: Pierre Ratinaud
2 #Copyright (c) 2008 Pierre Ratinaud
3 #Lisense: GNU/GPL
4
5 #library(ca)
6 #library(MASS)
7 #source('/home/pierre/workspace/iramuteq/Rscripts/afc.R')
8 #data<-read.table('output/corpus_bin.csv',header=TRUE,sep='\t')
9 #source('/home/pierre/workspace/iramuteq/Rscripts/anacor.R')
10 #library(ade4)
11 pp<-function(txt,val) {
12         d<-paste(txt,' : ')
13         print(paste(d,val))
14 }
15 MyChiSq<-function(x,sc,n){
16         sr<-rowSums(x)
17         E <- outer(sr, sc, "*")/n
18         STAT<-sum((abs(x - E))^2/E)
19         STAT
20 }
21
22 CHD<-function(data,x=9){
23 #       sink('/home/pierre/workspace/iramuteq/dev/findchi2.txt')
24         dataori=as.matrix(data)
25         row.names(dataori)=rownames(data)
26         dtable=as.matrix(data)
27         row.names(dtable)=rownames(data)
28         rowelim<-NULL
29         pp('ncol entree : ',ncol(dtable))
30         pp('nrow entree',nrow(dtable))
31         listcol <- list()
32         listmere <- list()
33         list_fille <- list()
34         print('vire colonnes vides en entree')#FIXME : il ne doit pas y avoir de colonnes vides en entree !!
35         sdt<-colSums(dtable)
36         if (min(sdt)==0)
37                 dtable<-dtable[,-which(sdt==0)]
38         mere<-1
39         for (i in 1:x) {
40                 clnb<-(i*2)
41                 listmere[[clnb]]<-mere
42                 listmere[[clnb+1]]<-mere
43                 list_fille[[mere]] <- c(clnb,clnb+1)
44                 listcol[[clnb]]<-vector()
45                 listcol[[clnb+1]]<-vector()
46                 #extraction du premier facteur de l'afc
47                 print('afc')
48                 #afc<-ca(dtable,nd=1)
49                 #afc<-corresp(dtable,nd=1)
50                 #afc<-fca(dtable)
51                 pp('taille dtable dans boucle (col/row)',c(ncol(dtable),nrow(dtable)))
52                 afc<-boostana(dtable,nd=1)
53                 #afc<-dudi.coa(dtable,scannf=FALSE,nf=1)
54                 pp('SV',afc$singular.values)
55                 pp('V.P.', afc$eigen.values)
56                 #pp('V.P.',afc$eig[1])
57                 #pp('V.P.',afc$factexpl[1])
58                 #print(afc$chisq)
59                 afcchi<-chisq.test(dtable)
60                 Tinert<-afcchi$statistic/sum(dtable)
61                 pp('inertie totale',Tinert)
62                 #coordonnees des colonnes sur le premier facteur
63                 #coordrow=afc$rowcoord
64                 coordrow=as.matrix(afc$row.scores)
65                 #coordrow<-as.matrix(afc$rproj[,1])
66                 #coordrow<-as.matrix(afc$li)
67                 coordrowori<-coordrow
68                 #row.names(coordrow)<-afc$rownames
69                 row.names(coordrow)<-rownames(dtable)
70                 #classement en fonction de la position sur le premier facteur
71                 print('deb recherche meilleur partition')
72                 ordertable<-cbind(dtable,coordrow)
73                 coordrow<-as.matrix(coordrow[order(coordrow[,1]),])
74                 ordertable<-ordertable[order(ordertable[,ncol(ordertable)]),][,-ncol(ordertable)]
75                 listinter<-vector()
76                 listlim<-vector()
77                 TT=sum(dtable)
78                 sc<-colSums(dtable)
79                 sc1<-ordertable[1,]
80                 sc2<-colSums(dtable)-ordertable[1,]
81                 chitable<-rbind(sc1,sc2)
82                 #listinter<-vector()
83                 maxinter<-0
84                 rmax<-NULL
85 #################################################################
86                 for (l in 2:(nrow(dtable)-2)){
87                         chitable[1,]<-chitable[1,]+ordertable[l,]
88                         chitable[2,]<-chitable[2,]-ordertable[l,]
89 #                       sc1<-sc1+ordertable[l,]
90 #                       sc2<-sc2-ordertable[l,]
91 #                       chitable<-rbind(sc1,sc2)
92                         chi<-MyChiSq(chitable,sc,TT)
93                         if (chi>maxinter) {
94                                 maxinter<-chi
95                                 rmax<-l
96                         }
97                 #       listinter<-append(listinter,MyChiSq(chitable,sc,TT))#,chisq.test(chitable)$statistic/TT)
98                 }
99                 #plot(listinter)
100                 print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
101                 pp('max inter phase 1', maxinter/TT)#max(listinter))
102                 print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
103                 #maxinter<-which.max(listinter)+1
104
105                 listclasse<-ifelse(coordrowori<=coordrow[(rmax),1],clnb,clnb+1)
106                 
107                 cl<-listclasse
108
109                 pp('TT',TT)
110                 #dtable<-cbind(dtable,'cl'= as.vector(cl))
111
112                 N1<-length(listclasse[listclasse==clnb])
113                 N2<-length(listclasse[listclasse==clnb+1])
114                 pp('N1',N1)
115                 pp('N2',N2)
116 ###################################################################
117 #                  reclassement des individus                     #
118 ###################################################################
119                 malcl<-1000000000000
120                 it<-0
121                 listsub<-list()
122                 #in boucle
123
124                 while (malcl!=0 & N1>=5 & N2>=5) {
125                         it<-it+1
126                         listsub[[it]]<-vector()
127                         pp('nombre iteration',it)
128                         vdelta<-vector()
129                         #dtable[,'cl']<-cl
130                         t1<-dtable[cl==clnb,]#[,-ncol(dtable)]
131                         t2<-dtable[cl==clnb+1,]#[,-ncol(dtable)]
132                         ncolt<-ncol(t1)
133                         pp('ncolt',ncolt)
134                         sc1<-colSums(t1)
135                         sc2<-colSums(t2)
136                         sc<-sc1+sc2
137                         TT<-sum(dtable)
138                         chtableori<-rbind(sc1,sc2)
139                         interori<-MyChiSq(chtableori,sc,TT)/TT#chisq.test(chtableori)$statistic#/TT
140                         chtable<-chtableori
141                         pp('interori',interori)
142                         N1<-nrow(t1)
143                         N2<-nrow(t2)
144                         pp('N1',N1)
145                         pp('N2',N2)
146                         
147                         for (l in 1:nrow(dtable)){
148                                         if(cl[l]==clnb){
149                                                 chtable[1,]<-sc1-dtable[l,]
150                                                 chtable[2,]<-sc2+dtable[l,]
151                                         }else{
152                                                 chtable[1,]<-sc1+dtable[l,]
153                                                 chtable[2,]<-sc2-dtable[l,]
154                                         }
155                                         interswitch<-MyChiSq(chtable,sc,TT)/TT#chisq.test(chtable)$statistic/TT
156                                         ws<-interori-interswitch
157
158                                         if (ws<0){
159                                                 interori<-interswitch
160                                                 if(cl[l]==clnb){
161                                                         sc1<-chtable[1,]
162                                                         sc2<-chtable[2,]
163                                                         cl[l]<-clnb+1
164                                                         listsub[[it]]<-append(listsub[[it]],l)
165                                                 }else{
166                                                         sc1<-chtable[1,]
167                                                         sc2<-chtable[2,]
168                                                         cl[l]<-clnb
169                                                         listsub[[it]]<-append(listsub[[it]],l)
170                                                 }
171                                                 vdelta<-append(vdelta,l)
172                                         }
173                                 }
174 #                       for (val in vdelta) {
175 #                               if (cl[val]==clnb) {
176 #                                       cl[val]<-clnb+1
177 #                                       listsub[[it]]<-append(listsub[[it]],val)
178 #                                       }else {
179 #                                       cl[val]<-clnb
180 #                                       listsub[[it]]<-append(listsub[[it]],val)
181 #                               }
182 #                       }
183                         print('###################################')
184                         print('longueur < 0')
185                         malcl<-length(vdelta)
186                         if ((it>1)&&(!is.logical(listsub[[it]]))){
187                                 if (listsub[[it]]==listsub[[(it-1)]]){
188                                         malcl<-0
189                                 }
190                         }
191                         print(malcl)
192                         print('###################################')
193                 }
194                 dtable<-cbind(dtable,'cl'=as.vector(cl))
195 #######################################################################
196 #                 Fin reclassement des individus                      #
197 #######################################################################
198 #               if (!(length(cl[cl==clnb])==1 || length(cl[cl==clnb+1])==1)) {
199                         t1<-dtable[dtable[,'cl']==clnb,][,-ncol(dtable)]
200                         t2<-dtable[dtable[,'cl']==clnb+1,][,-ncol(dtable)]
201                 
202                         chtable<-rbind(colSums(t1),colSums(t2))
203                         inter<-chisq.test(chtable)$statistic/TT
204                         pp('last inter',inter)
205                         print('=====================')
206                         
207                         #calcul de la specificite des colonnes
208                         mint<-min(nrow(t1),nrow(t2))
209                         maxt<-max(nrow(t1),nrow(t2))
210                         seuil<-round((1.9*(maxt/mint))+1.9,digit=6)
211                         #sink('/home/pierre/workspace/iramuteq/dev/findchi2.txt')
212 #                       print('ATTENTION SEUIL 3,84')
213 #                       seuil<-3.84
214                         pp('seuil',seuil)
215                         sominf<-0
216                         nv<-0
217                         nz<-0
218                         ncclnb<-0
219                         ncclnbp<-0
220             NN1<-0
221             NN2<-0
222             maxchip<-0
223             nbzeroun<-0
224             res1<-0
225             res2<-0
226             nbseuil<-0
227             nbexe<-0
228             nbcontrib<-0
229                         cn<-colnames(dtable)[1:(ncol(dtable)-1)]
230 #               for (k in 1:(ncol(dtable)-1)) {
231 #                               #print(k)
232 #                               tab<-table(dtable[,k],dtable[,ncol(dtable)])
233 #                               #print(cn[k])
234 #                               #print (tab)
235 #                               #chidemoi<-(sum(t)*(((t[1,1]*t[2,2])-(t[1,2]*t[2,1]))^2))/((rowSums(t)[1])*(rowSums(t)[2])*(colSums(t)[1])*(colSums(t)[2]))
236 #                               if (rownames(tab)[1]==1 & nrow(tab)==1) {
237 #                                       tab<-rbind(c(0,0),tab[1])
238 #                                       print('tableau vide dessus')
239 #                               }
240 #                               if (min(tab)<=10){
241 #                                       chi<-chisq.test(tab,correct=TRUE)
242 #                               }else{
243 #                                       chi<-chisq.test(tab,correct=FALSE)
244 #                               }
245 #                               if ((min(tab[2,])==0) & (min(tab[1,])!=0)){
246 #                                       chi$statistic<-seuil+1
247 #                               #    print('min tab 2 == 0')    
248 #                               }
249 ##                if(((tab[2,1]>tab[1,1]) & (tab[2,2]>tab[1,2]))){
250 ##                                      nz<-nz+1
251 ##                                      chi$statistic<-seuil-1
252 ##                                      sominf<-sominf-1
253 ##                              }
254 #                if ((min(tab[1,])==0) & (min(tab[2,])!=0)){
255 #                                       chi$statistic<-seuil-1
256 #                #    print('min tab 1 == 0')
257 #                               }
258 #                               if (is.na(chi$statistic)) {
259 #                                       chi$statistic<-0
260 #                                       print('NA dans chi$statistic')
261 #                               }
262 ##                              print('#####################')
263 #                               if (chi$statistic>seuil){
264 #                               i#      print('sup seuil')
265 #                                       if (tab[2,1]<chi$expected[2,1]){
266 #                                               listcol[[clnb]]<-append(listcol[[clnb]],cn[k])
267 #                                               ncclnb<-ncclnb+1
268 ##                                              print('@@@@@@@@specifique de la classe@@@@@@')
269 ##                                              print(clnb)
270 #                                       } else if (tab[2,2]<chi$expected[2,2]){
271 #                                               listcol[[clnb+1]]<-append(listcol[[clnb+1]],cn[k])
272 #                                               ncclnbp<-ncclnbp+1
273 ##                                              print('@@@@@@@@specifique de la classe@@@@@@')
274 ##                                              print(clnb+1)
275 #                                       }
276 #                               }
277 ##                              if (chi$statistic<seuil){
278 ##                                      print('inf seuil')
279 ##                                      print(tab)
280 ##                                      print(chi$expected)
281 ##                                      print(chi$statistic)
282 ##                                      sominf<-sominf+1
283 ##                              }
284 #                               
285 ##                              print('tableau')
286 ##                              print(tab)
287 ##                              chi<-chisq.test(tab,correct=FALSE)
288 ##                              print('sans correction')
289 ##                              print(chi$statistic)
290 ##                              print(chi$expected)
291 ##                              print(chi$residuals)
292 ##                chit<-((tab-chi$expected)^2)/chi$expected
293 ##                print(mean(chi$residuals))
294 ##                print(sd(as.vector(chi$residuals)))
295 #                #print(chi$residuals/sd(as.vector(chi$residuals)))
296 #                #print(abs(chi$residuals/sd(as.vector(chi$residuals))))
297 #                #print(qnorm(abs(chi$residuals/sd(as.vector(chi$residuals)))))
298 ##                difftab<-tab-chi$expected
299 # #               print('contribution')
300 #                #print((difftab-mean(difftab))/sd(as.vector(difftab)))
301 #                #print(abs((chit-mean(chit))/sd(as.vector(chit))))
302 ##                tabin1<-matrix(1,2,2)
303 ##                tabin1<-tabin1-(colSums(tab)/sum(tab))
304 ##                tabin2<-matrix(1,2,2)
305 ##                tabin2<-(tabin2-(rowSums(tab)/sum(tab)))
306 ##                tabinv<-tabin1*tabin2
307 ##                CS<-colSums(tab)
308 ##                RS<-rowSums(tab)
309 ##                GT<-sum(tab)
310 ##                print(tabinv)
311 ##                contrib<-(tab-chi$expected)/sqrt(chi$expected * ((1 - RS/GT) %*% t(1 - CS/GT)))
312 ##                print(chit)
313 ##                print('TTTTTTTTTTTTTTTTTTTTTTTTTTTTTT')
314 ##                print(contrib)
315 ##                if (((which.max(chit)==2) || (which.max(chit)==4)) & chi$statistic>=seuil) {
316 ##                    if (which.max(chit)==2) NN1<-NN1+1
317 ##                    if (which.max(chit)==4) NN2<-NN2+1
318 ##                }
319 ##                if (max(abs(contrib[2,])>=1.96)) {
320 ##                    nbcontrib<-nbcontrib+1
321 ##                }
322 ##                if (max(chit[2,]>=seuil)) maxchip<-maxchip+1
323 ##                if (chi$statistic >=seuil) nbseuil<-nbseuil+1
324 ##                if ((min(tab[2,])==0) & (chi$statistic<seuil)) nbzeroun<-nbzeroun+1
325 ##                if ((((which.max(chi$residual)==2) || (which.max(chi$residual)==4) || (which.min(chi$residual)==2) || (which.min(chi$residual)==4))) & chi$statistic>=seuil) {
326 ##                    if ((which.max(chi$residual)==2) || (which.min(chi$residual)==4)) res1<-res1+1
327 ##                    if ((which.max(chi$residual)==4) || (which.min(chi$residual)==2)) res2<-res2+1
328 ##                } else if (!((which.max(chi$residual)==2) || (which.max(chi$residual)==4) || (which.min(chi$residual)==2) || (which.min(chi$residual)==4)) & chi$statistic>=seuil) {
329 ##                    print('AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA')
330 ##                    print('AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA')
331 ##                }
332 ##                if (length(unique(as.vector(chi$residual)))==2) {
333 ##                    nbexe<-nbexe+1
334 ##                }
335 #
336 ##                              print('avec correction')
337 ##                if (min(tab)<5) chi<-chisq.test(tab,correct=TRUE)
338 ##                              print(chi$statistic)
339 ##                #if (chi$statistic >=seuil) nbseuil<-nbseuil+1
340 ##                              print(chi$expected)
341 ##                              print(chi$residuals)
342 ##                chit<-((abs(tab-chi$expected)-0.5)^2)/chi$expected
343 ##                print(chit)
344 ##                              print('#####################')
345 #                       }
346 ##            pp('NN1',NN1)
347 ##            pp('NN2',NN2)
348 ##            pp('maxchip',maxchip)
349 ##            pp('nbzeroun',nbzeroun)
350 ##            pp('res1',res1)
351 ##            pp('res2',res2)
352 ##            pp('nbseuil',nbseuil)
353 ##            pp('nbexe',nbexe)
354 ##            pp('nbcontrib', nbcontrib)
355                         print('resultats elim item')
356                         pp(clnb+1,length(listcol[[clnb+1]]))
357                         pp(clnb,length(listcol[[clnb]]))
358 #                       pp('inf seuil',sominf)
359 #                       pp('nv',nv)
360                         pp('ncclnb',ncclnb)
361                         pp('ncclnbp',ncclnbp)
362 #                       pp('nz',nz)
363                         #sink()
364                         #lignes concernees
365                         listrownamedtable<-rownames(dtable)
366                         listrownamedtable<-as.integer(listrownamedtable)
367                         newcol<-vector(length=nrow(dataori))
368                         #remplissage de la nouvelle colonne avec les nouvelles classes
369                         print('remplissage')
370                         num<-0
371                         for (ligne in listrownamedtable) {
372                                 num<-num+1
373                                 newcol[ligne]<-as.integer(as.vector(dtable[,'cl'][num])[1])
374                         }
375                         #recuperation de la classe precedante pour les cases vides
376                         print('recuperation classes precedentes')
377                         matori<-as.matrix(dataori)
378                         if (i!=1) {
379                                 #    options(warn=-1)
380                                 for (ligne in 1:length(newcol)) {
381                                         #       print(newcol[ligne])
382                                         if (newcol[ligne]==0) { # ce test renvoie un warning
383                                                 if (ligne %in% rowelim){
384                                                         newcol[ligne]<-0
385                                                 } else {
386                                                         newcol[ligne]<-matori[ligne,length(matori[1,])]
387                                                 }
388                                                 
389                                         }
390                                 }
391                                 #   options(warn=0)
392                         }
393                         dataori<-cbind(dataori,newcol)
394                         
395                         tailleclasse<-as.matrix(summary(as.factor(as.character(dataori[,ncol(dataori)]))))
396                         #tailleclasse<-as.integer(tailleclasse)
397                         print('tailleclasse')
398                         print(tailleclasse)
399                         tailleclasse<-as.matrix(tailleclasse[!(rownames(tailleclasse)==0),])
400                         plusgrand<-which.max(tailleclasse)
401                         #???????????????????????????????????
402                         #Si 2 classes ont des effectifs egaux, on prend la premiere de la liste...
403                         if (length(plusgrand)>1) {
404                                 plusgrand<-plusgrand[1]
405                         }
406                         #????????????????????????????????????
407                         
408                         #constuction du prochain tableau a analyser
409                         print('construction tableau suivant')
410                         classe<-as.integer(rownames(tailleclasse)[plusgrand])
411                         dtable<-dataori[dataori[,ncol(dataori)]==classe,]
412                         row.names(dtable)<-rownames(dataori[dataori[,ncol(dataori)]==classe,])
413                         dtable<-dtable[,1:(ncol(dtable)-i)]
414                         mere<-classe
415                         listcolelim<-listcol[[as.integer(classe)]]
416                         mother<-listmere[[as.integer(classe)]]
417                         while (mother!=1) {
418                                 listcolelim<-append(listcolelim,listcol[[mother]])
419                                 mother<-listmere[[mother]]
420                         }
421                         
422                         listcolelim<-sort(unique(listcolelim))
423                         pp('avant',ncol(dtable))
424                         if (!is.logical(listcolelim)){
425                                 print('elimination colonne')
426                                 #dtable<-dtable[,-listcolelim]
427                                 dtable<-dtable[,!(colnames(dtable) %in% listcolelim)]
428                         }
429                         pp('apres',ncol(dtable))
430                         #elimination des colonnes ne contenant que des 0
431                         print('vire colonne inf 3 dans boucle')
432                         sdt<-colSums(dtable)
433                         if (min(sdt)<=3)
434                                 dtable<-dtable[,-which(sdt<=3)]
435         
436                         #elimination des lignes ne contenant que des 0
437                         print('vire ligne vide dans boucle')
438                         if (ncol(dtable)==1) {
439                                 sdt<-dtable[,1]
440                         } else {
441                                 sdt<-rowSums(dtable)
442                         }
443                         if (min(sdt)==0) {
444                                 rowelim<-as.integer(rownames(dtable)[which(sdt==0)])
445                                 print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
446                                 print(rowelim)
447                                 print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
448                                 dtable<-dtable[-which(sdt==0),]
449                         }
450 #               }
451         }
452 #       sink()
453         res <- list(n1 = dataori[,(ncol(dataori)-x+1):ncol(dataori)], list_mere = listmere, list_fille = list_fille)
454         res
455 }