...
[iramuteq] / Rscripts / CHDKmeans.R
1 #Author: Pierre Ratinaud
2 #Copyright (c) 2008 Pierre Ratinaud
3 #Lisense: GNU/GPL
4
5
6
7 #library(ca)
8 #library(MASS)
9 #source('/home/pierre/workspace/iramuteq/Rscripts/afc.R')
10 #data<-read.table('output/corpus_bin.csv',header=TRUE,sep='\t')
11 #source('/home/pierre/workspace/iramuteq/Rscripts/anacor.R')
12 #library(ade4)
13 pp<-function(txt,val) {
14         d<-paste(txt,' : ')
15         print(paste(d,val))
16 }
17 MyChiSq<-function(x,sc,n){
18         sr<-rowSums(x)
19         E <- outer(sr, sc, "*")/n
20         STAT<-sum((abs(x - E))^2/E)
21         STAT
22 }
23
24 CHD<-function(data,x=9){
25 #       sink('/home/pierre/workspace/iramuteq/dev/findchi2.txt')
26         dataori=as.matrix(data)
27         row.names(dataori)=rownames(data)
28         dtable=as.matrix(data)
29         row.names(dtable)=rownames(data)
30         rowelim<-NULL
31         pp('ncol entree : ',ncol(dtable))
32         pp('nrow entree',nrow(dtable))
33         listcol <- list()
34         listmere <- list()
35         list_fille <- list()
36         print('vire colonnes vides en entree')#FIXME : il ne doit pas y avoir de colonnes vides en entree !!
37         sdt<-colSums(dtable)
38         if (min(sdt)==0)
39                 dtable<-dtable[,-which(sdt==0)]
40         mere<-1
41         for (i in 1:x) {
42                 clnb<-(i*2)
43                 listmere[[clnb]]<-mere
44                 listmere[[clnb+1]]<-mere
45                 list_fille[[mere]] <- c(clnb,clnb+1)
46                 listcol[[clnb]]<-vector()
47                 listcol[[clnb+1]]<-vector()
48                 #extraction du premier facteur de l'afc
49                 print('afc')
50                 #afc<-ca(dtable,nd=1)
51                 #afc<-corresp(dtable,nd=1)
52                 #afc<-fca(dtable)
53                 pp('taille dtable dans boucle (col/row)',c(ncol(dtable),nrow(dtable)))
54 #               afc<-boostana(dtable,nd=1)
55 #               #afc<-dudi.coa(dtable,scannf=FALSE,nf=1)
56 #               pp('SV',afc$singular.values)
57 #               pp('V.P.', afc$eigen.values)
58 #               #pp('V.P.',afc$eig[1])
59 #               #pp('V.P.',afc$factexpl[1])
60 #               #print(afc$chisq)
61 #               afcchi<-chisq.test(dtable)
62 #               Tinert<-afcchi$statistic/sum(dtable)
63 #               pp('inertie totale',Tinert)
64 #               #coordonnees des colonnes sur le premier facteur
65 #               #coordrow=afc$rowcoord
66 #               coordrow=as.matrix(afc$row.scores)
67 #               #coordrow<-as.matrix(afc$rproj[,1])
68 #               #coordrow<-as.matrix(afc$li)
69 #               coordrowori<-coordrow
70 #               #row.names(coordrow)<-afc$rownames
71 #               row.names(coordrow)<-rownames(dtable)
72 #               #classement en fonction de la position sur le premier facteur
73 #               print('deb recherche meilleur partition')
74 #               ordertable<-cbind(dtable,coordrow)
75 #               coordrow<-as.matrix(coordrow[order(coordrow[,1]),])
76 #               ordertable<-ordertable[order(ordertable[,ncol(ordertable)]),][,-ncol(ordertable)]
77 #               listinter<-vector()
78 #               listlim<-vector()
79                 TT=sum(dtable)
80 #               sc<-colSums(dtable)
81 #               sc1<-ordertable[1,]
82 #               sc2<-colSums(dtable)-ordertable[1,]
83 #               chitable<-rbind(sc1,sc2)
84 #               #listinter<-vector()
85 #               maxinter<-0
86 #               rmax<-NULL
87 ##################################################################
88 #               for (l in 2:(nrow(dtable)-2)){
89 #                       chitable[1,]<-chitable[1,]+ordertable[l,]
90 #                       chitable[2,]<-chitable[2,]-ordertable[l,]
91 ##                      sc1<-sc1+ordertable[l,]
92 ##                      sc2<-sc2-ordertable[l,]
93 ##                      chitable<-rbind(sc1,sc2)
94 #                       chi<-MyChiSq(chitable,sc,TT)
95 #                       if (chi>maxinter) {
96 #                               maxinter<-chi
97 #                               rmax<-l
98 #                       }
99 #               #       listinter<-append(listinter,MyChiSq(chitable,sc,TT))#,chisq.test(chitable)$statistic/TT)
100 #               }
101 #               #plot(listinter)
102 #               print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
103 #               pp('max inter phase 1', maxinter/TT)#max(listinter))
104 #               print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
105 #               #maxinter<-which.max(listinter)+1
106 #
107 #               listclasse<-ifelse(coordrowori<=coordrow[(rmax),1],clnb,clnb+1)
108 #               
109 #               cl<-listclasse
110 #
111 #               pp('TT',TT)
112 #               #dtable<-cbind(dtable,'cl'= as.vector(cl))
113 #
114 #               N1<-length(listclasse[listclasse==clnb])
115 #               N2<-length(listclasse[listclasse==clnb+1])
116 #               pp('N1',N1)
117 #               pp('N2',N2)
118 ####################################################################
119 ##                  reclassement des individus                     #
120 ####################################################################
121 #               malcl<-1000000000000
122 #               it<-0
123 #               listsub<-list()
124 #               #in boucle
125 #
126 #               while (malcl!=0 & N1>=5 & N2>=5) {
127 #                       it<-it+1
128 #                       listsub[[it]]<-vector()
129 #                       pp('nombre iteration',it)
130 #                       vdelta<-vector()
131 #                       #dtable[,'cl']<-cl
132 #                       t1<-dtable[cl==clnb,]#[,-ncol(dtable)]
133 #                       t2<-dtable[cl==clnb+1,]#[,-ncol(dtable)]
134 #                       ncolt<-ncol(t1)
135 #                       pp('ncolt',ncolt)
136 #                       sc1<-colSums(t1)
137 #                       sc2<-colSums(t2)
138 #                       sc<-sc1+sc2
139 #                       TT<-sum(dtable)
140 #                       chtableori<-rbind(sc1,sc2)
141 #                       interori<-MyChiSq(chtableori,sc,TT)/TT#chisq.test(chtableori)$statistic#/TT
142 #                       chtable<-chtableori
143 #                       pp('interori',interori)
144 #                       N1<-nrow(t1)
145 #                       N2<-nrow(t2)
146 #                       pp('N1',N1)
147 #                       pp('N2',N2)
148 #                       
149 #                       for (l in 1:nrow(dtable)){
150 #                                       if(cl[l]==clnb){
151 #                                               chtable[1,]<-sc1-dtable[l,]
152 #                                               chtable[2,]<-sc2+dtable[l,]
153 #                                       }else{
154 #                                               chtable[1,]<-sc1+dtable[l,]
155 #                                               chtable[2,]<-sc2-dtable[l,]
156 #                                       }
157 #                                       interswitch<-MyChiSq(chtable,sc,TT)/TT#chisq.test(chtable)$statistic/TT
158 #                                       ws<-interori-interswitch
159 #
160 #                                       if (ws<0){
161 #                                               interori<-interswitch
162 #                                               if(cl[l]==clnb){
163 #                                                       sc1<-chtable[1,]
164 #                                                       sc2<-chtable[2,]
165 #                                                       cl[l]<-clnb+1
166 #                                                       listsub[[it]]<-append(listsub[[it]],l)
167 #                                               }else{
168 #                                                       sc1<-chtable[1,]
169 #                                                       sc2<-chtable[2,]
170 #                                                       cl[l]<-clnb
171 #                                                       listsub[[it]]<-append(listsub[[it]],l)
172 #                                               }
173 #                                               vdelta<-append(vdelta,l)
174 #                                       }
175 #                               }
176 ##                      for (val in vdelta) {
177 ##                              if (cl[val]==clnb) {
178 ##                                      cl[val]<-clnb+1
179 ##                                      listsub[[it]]<-append(listsub[[it]],val)
180 ##                                      }else {
181 ##                                      cl[val]<-clnb
182 ##                                      listsub[[it]]<-append(listsub[[it]],val)
183 ##                              }
184 ##                      }
185 #                       print('###################################')
186 #                       print('longueur < 0')
187 #                       malcl<-length(vdelta)
188 #                       if ((it>1)&&(!is.logical(listsub[[it]]))){
189 #                               if (listsub[[it]]==listsub[[(it-1)]]){
190 #                                       malcl<-0
191 #                               }
192 #                       }
193 #                       print(malcl)
194 #                       print('###################################')
195 #               }
196 ##########################################################################
197 # kmeans
198 #        clori<-kmeans(dtable,2)$cluster
199 #        cl<-ifelse(clori==1,clnb,clnb+1)
200 #pam
201         library(cluster)
202         clori<-pam(dist(dtable,method='binary'),2,diss=TRUE)$cluster
203         cl<-ifelse(clori==1,clnb,clnb+1)
204         
205             dtable<-cbind(dtable,'cl'=as.vector(cl))
206
207 #######################################################################
208 #                 Fin reclassement des individus                      #
209 #######################################################################
210 #               if (!(length(cl[cl==clnb])==1 || length(cl[cl==clnb+1])==1)) {
211                         t1<-dtable[dtable[,'cl']==clnb,][,-ncol(dtable)]
212                         t2<-dtable[dtable[,'cl']==clnb+1,][,-ncol(dtable)]
213                 
214                         chtable<-rbind(colSums(t1),colSums(t2))
215                         inter<-chisq.test(chtable)$statistic/TT
216                         pp('last inter',inter)
217                         print('=====================')
218                         
219                         #calcul de la specificite des colonnes
220                         mint<-min(nrow(t1),nrow(t2))
221                         maxt<-max(nrow(t1),nrow(t2))
222                         seuil<-round((1.9*(maxt/mint))+1.9,digit=6)
223                         #sink('/home/pierre/workspace/iramuteq/dev/findchi2.txt')
224 #                       print('ATTENTION SEUIL 3,84')
225 #                       seuil<-3.84
226                         pp('seuil',seuil)
227                         sominf<-0
228                         nv<-0
229                         nz<-0
230                         ncclnb<-0
231                         ncclnbp<-0
232             NN1<-0
233             NN2<-0
234             maxchip<-0
235             nbzeroun<-0
236             res1<-0
237             res2<-0
238             nbseuil<-0
239             nbexe<-0
240             nbcontrib<-0
241                         cn<-colnames(dtable)[1:(ncol(dtable)-1)]
242                         for (k in 1:(ncol(dtable)-1)) {
243                                 #print(k)
244                                 tab<-table(dtable[,k],dtable[,ncol(dtable)])
245                                 #print(cn[k])
246                                 #print (tab)
247                                 #chidemoi<-(sum(t)*(((t[1,1]*t[2,2])-(t[1,2]*t[2,1]))^2))/((rowSums(t)[1])*(rowSums(t)[2])*(colSums(t)[1])*(colSums(t)[2]))
248                                 if (rownames(tab)[1]==1 & nrow(tab)==1) {
249                                         tab<-rbind(c(0,0),tab[1])
250                                         print('tableau vide dessus')
251                                 }
252                                 if (min(tab)<=10){
253                                         chi<-chisq.test(tab,correct=TRUE)
254                                 }else{
255                                         chi<-chisq.test(tab,correct=FALSE)
256                                 }
257                                 if ((min(tab[2,])==1) & (min(tab[1,])!=0)){
258                                         chi$statistic<-seuil+1
259                                 #    print('min tab 2 == 0')    
260                                 }
261 #                if(((tab[2,1]>tab[1,1]) & (tab[2,2]>tab[1,2]))){
262 #                                       nz<-nz+1
263 #                                       chi$statistic<-seuil-1
264 #                                       sominf<-sominf-1
265 #                               }
266                 if ((min(tab[1,])==0) & (min(tab[2,])!=0)){
267                                         chi$statistic<-seuil-1
268                 #    print('min tab 1 == 0')
269                                 }
270                                 if (is.na(chi$statistic)) {
271                                         chi$statistic<-0
272                                         print('NA dans chi$statistic')
273                                 }
274 #                               print('#####################')
275                                 if (chi$statistic>seuil){
276                                 i#      print('sup seuil')
277                                         if (tab[2,1]<chi$expected[2,1]){
278                                                 listcol[[clnb]]<-append(listcol[[clnb]],cn[k])
279                                                 ncclnb<-ncclnb+1
280 #                                               print('@@@@@@@@specifique de la classe@@@@@@')
281 #                                               print(clnb)
282                                         } else if (tab[2,2]<chi$expected[2,2]){
283                                                 listcol[[clnb+1]]<-append(listcol[[clnb+1]],cn[k])
284                                                 ncclnbp<-ncclnbp+1
285 #                                               print('@@@@@@@@specifique de la classe@@@@@@')
286 #                                               print(clnb+1)
287                                         }
288                                 }
289 #                               if (chi$statistic<seuil){
290 #                                       print('inf seuil')
291 #                                       print(tab)
292 #                                       print(chi$expected)
293 #                                       print(chi$statistic)
294 #                                       sominf<-sominf+1
295 #                               }
296                                 
297 #                               print('tableau')
298 #                               print(tab)
299 #                               chi<-chisq.test(tab,correct=FALSE)
300 #                               print('sans correction')
301 #                               print(chi$statistic)
302 #                               print(chi$expected)
303 #                               print(chi$residuals)
304 #                chit<-((tab-chi$expected)^2)/chi$expected
305 #                print(mean(chi$residuals))
306 #                print(sd(as.vector(chi$residuals)))
307                 #print(chi$residuals/sd(as.vector(chi$residuals)))
308                 #print(abs(chi$residuals/sd(as.vector(chi$residuals))))
309                 #print(qnorm(abs(chi$residuals/sd(as.vector(chi$residuals)))))
310 #                difftab<-tab-chi$expected
311  #               print('contribution')
312                 #print((difftab-mean(difftab))/sd(as.vector(difftab)))
313                 #print(abs((chit-mean(chit))/sd(as.vector(chit))))
314 #                tabin1<-matrix(1,2,2)
315 #                tabin1<-tabin1-(colSums(tab)/sum(tab))
316 #                tabin2<-matrix(1,2,2)
317 #                tabin2<-(tabin2-(rowSums(tab)/sum(tab)))
318 #                tabinv<-tabin1*tabin2
319 #                CS<-colSums(tab)
320 #                RS<-rowSums(tab)
321 #                GT<-sum(tab)
322 #                print(tabinv)
323 #                contrib<-(tab-chi$expected)/sqrt(chi$expected * ((1 - RS/GT) %*% t(1 - CS/GT)))
324 #                print(chit)
325 #                print('TTTTTTTTTTTTTTTTTTTTTTTTTTTTTT')
326 #                print(contrib)
327 #                if (((which.max(chit)==2) || (which.max(chit)==4)) & chi$statistic>=seuil) {
328 #                    if (which.max(chit)==2) NN1<-NN1+1
329 #                    if (which.max(chit)==4) NN2<-NN2+1
330 #                }
331 #                if (max(abs(contrib[2,])>=1.96)) {
332 #                    nbcontrib<-nbcontrib+1
333 #                }
334 #                if (max(chit[2,]>=seuil)) maxchip<-maxchip+1
335 #                if (chi$statistic >=seuil) nbseuil<-nbseuil+1
336 #                if ((min(tab[2,])==0) & (chi$statistic<seuil)) nbzeroun<-nbzeroun+1
337 #                if ((((which.max(chi$residual)==2) || (which.max(chi$residual)==4) || (which.min(chi$residual)==2) || (which.min(chi$residual)==4))) & chi$statistic>=seuil) {
338 #                    if ((which.max(chi$residual)==2) || (which.min(chi$residual)==4)) res1<-res1+1
339 #                    if ((which.max(chi$residual)==4) || (which.min(chi$residual)==2)) res2<-res2+1
340 #                } else if (!((which.max(chi$residual)==2) || (which.max(chi$residual)==4) || (which.min(chi$residual)==2) || (which.min(chi$residual)==4)) & chi$statistic>=seuil) {
341 #                    print('AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA')
342 #                    print('AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA')
343 #                }
344 #                if (length(unique(as.vector(chi$residual)))==2) {
345 #                    nbexe<-nbexe+1
346 #                }
347
348 #                               print('avec correction')
349 #                if (min(tab)<5) chi<-chisq.test(tab,correct=TRUE)
350 #                               print(chi$statistic)
351 #                #if (chi$statistic >=seuil) nbseuil<-nbseuil+1
352 #                               print(chi$expected)
353 #                               print(chi$residuals)
354 #                chit<-((abs(tab-chi$expected)-0.5)^2)/chi$expected
355 #                print(chit)
356 #                               print('#####################')
357                         }
358 #            pp('NN1',NN1)
359 #            pp('NN2',NN2)
360 #            pp('maxchip',maxchip)
361 #            pp('nbzeroun',nbzeroun)
362 #            pp('res1',res1)
363 #            pp('res2',res2)
364 #            pp('nbseuil',nbseuil)
365 #            pp('nbexe',nbexe)
366 #            pp('nbcontrib', nbcontrib)
367                         print('resultats elim item')
368                         pp(clnb+1,length(listcol[[clnb+1]]))
369                         pp(clnb,length(listcol[[clnb]]))
370 #                       pp('inf seuil',sominf)
371 #                       pp('nv',nv)
372                         pp('ncclnb',ncclnb)
373                         pp('ncclnbp',ncclnbp)
374 #                       pp('nz',nz)
375                         #sink()
376                         #lignes concernees
377                         listrownamedtable<-rownames(dtable)
378                         listrownamedtable<-as.integer(listrownamedtable)
379                         newcol<-vector(length=nrow(dataori))
380                         #remplissage de la nouvelle colonne avec les nouvelles classes
381                         print('remplissage')
382                         num<-0
383                         for (ligne in listrownamedtable) {
384                                 num<-num+1
385                                 newcol[ligne]<-as.integer(as.vector(dtable[,'cl'][num])[1])
386                         }
387                         #recuperation de la classe precedante pour les cases vides
388                         print('recuperation classes precedentes')
389                         matori<-as.matrix(dataori)
390                         if (i!=1) {
391                                 #    options(warn=-1)
392                                 for (ligne in 1:length(newcol)) {
393                                         #       print(newcol[ligne])
394                                         if (newcol[ligne]==0) { # ce test renvoie un warning
395                                                 if (ligne %in% rowelim){
396                                                         newcol[ligne]<-0
397                                                 } else {
398                                                         newcol[ligne]<-matori[ligne,length(matori[1,])]
399                                                 }
400                                                 
401                                         }
402                                 }
403                                 #   options(warn=0)
404                         }
405                         dataori<-cbind(dataori,newcol)
406                         
407                         tailleclasse<-as.matrix(summary(as.factor(as.character(dataori[,ncol(dataori)]))))
408                         #tailleclasse<-as.integer(tailleclasse)
409                         print('tailleclasse')
410                         print(tailleclasse)
411                         tailleclasse<-as.matrix(tailleclasse[!(rownames(tailleclasse)==0),])
412                         plusgrand<-which.max(tailleclasse)
413                         #???????????????????????????????????
414                         #Si 2 classes ont des effectifs egaux, on prend la premiere de la liste...
415                         if (length(plusgrand)>1) {
416                                 plusgrand<-plusgrand[1]
417                         }
418                         #????????????????????????????????????
419                         
420                         #constuction du prochain tableau a analyser
421                         print('construction tableau suivant')
422                         classe<-as.integer(rownames(tailleclasse)[plusgrand])
423                         dtable<-dataori[dataori[,ncol(dataori)]==classe,]
424                         row.names(dtable)<-rownames(dataori[dataori[,ncol(dataori)]==classe,])
425                         dtable<-dtable[,1:(ncol(dtable)-i)]
426                         mere<-classe
427                         listcolelim<-listcol[[as.integer(classe)]]
428                         mother<-listmere[[as.integer(classe)]]
429                         while (mother!=1) {
430                                 listcolelim<-append(listcolelim,listcol[[mother]])
431                                 mother<-listmere[[mother]]
432                         }
433                         
434                         listcolelim<-sort(unique(listcolelim))
435                         pp('avant',ncol(dtable))
436                         if (!is.logical(listcolelim)){
437                                 print('elimination colonne')
438                                 #dtable<-dtable[,-listcolelim]
439                                 dtable<-dtable[,!(colnames(dtable) %in% listcolelim)]
440                         }
441                         pp('apres',ncol(dtable))
442                         #elimination des colonnes ne contenant que des 0
443                         print('vire colonne inf 3 dans boucle')
444                         sdt<-colSums(dtable)
445                         if (min(sdt)<=3)
446                                 dtable<-dtable[,-which(sdt<=3)]
447         
448                         #elimination des lignes ne contenant que des 0
449                         print('vire ligne vide dans boucle')
450                         if (ncol(dtable)==1) {
451                                 sdt<-dtable[,1]
452                         } else {
453                                 sdt<-rowSums(dtable)
454                         }
455                         if (min(sdt)==0) {
456                                 rowelim<-as.integer(rownames(dtable)[which(sdt==0)])
457                                 print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
458                                 print(rowelim)
459                                 print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
460                                 dtable<-dtable[-which(sdt==0),]
461                         }
462 #               }
463         }
464 #       sink()
465         res <- list(n1 = dataori[,(ncol(dataori)-x+1):ncol(dataori)], list_mere = listmere, list_fille = list_fille)
466         res
467 }
468
469 #dataout<-CHD(data,9)
470
471 #igraph pour graph de relation