multisplit
[iramuteq] / Rscripts / CHD.R
1 #Author: Pierre Ratinaud
2 #Copyright (c) 2008-2020 Pierre Ratinaud
3 #License: GNU/GPL
4
5 pp<-function(txt,val) {
6         d<-paste(txt,' : ')
7         print(paste(d,val))
8 }
9 MyChiSq<-function(x,sc,n){
10         sr<-rowSums(x)
11         E <- outer(sr, sc, "*")/n
12         STAT<-sum(((x - E)^2)/E)
13         STAT
14 }
15
16 MySpeedChi <- function(x,sc) {
17     sr <-rowSums(x)
18     E <- outer(sr, sc, "*")
19         STAT<-sum((x - E)^2/E)
20         STAT
21 }
22
23 find.max <- function(dtable, chitable, compte, rmax, maxinter, sc, TT) {
24     ln <- which(dtable==1, arr.ind=TRUE)
25     lo <- list()
26     lo[1:nrow(dtable)] <- 0
27     for (k in 1:nrow(ln)) {lo[[ln[k,1]]]<-append(lo[[ln[k,1]]],ln[k,2])}
28     for (k in 1:nrow(dtable)) {lo[[k]] <- lo[[k]][-1]}
29         ## lo<-lo[-c(1,length(lo))]
30         ## for (l in lo) {
31         ##     compte <- compte + 1 
32         ##     chitable[1,l]<-chitable[1,l]+1
33         ##     chitable[2,l]<-chitable[2,l]-1
34         ##     chi<-MyChiSq(chitable,sc,TT)
35                 ## if (chi>maxinter) {
36                 ##     maxinter<-chi
37                 ##     rmax<-compte
38                 ## }   
39     #}
40         lo<-lo[-c(1)]
41         for (l in lo) {
42                 chi<-MyChiSq(chitable,sc,TT)
43                 if (chi>maxinter) {
44                         maxinter<-chi
45                         rmax<-compte
46                 }
47                 compte <- compte + 1
48                 chitable[1,l]<-chitable[1,l]+1
49                 chitable[2,l]<-chitable[2,l]-1
50         }       
51     res <- list(maxinter=maxinter, rmax=rmax)
52     res
53 }  
54
55
56
57
58
59 CHD<-function(data.in, x=9, mode.patate = FALSE, svd.method, libsvdc.path=NULL){
60 #       sink('/home/pierre/workspace/iramuteq/dev/findchi2.txt')
61         dataori <- data.in
62     row.names(dataori) <- rownames(data.in)
63         dtable <- data.in
64         colnames(dtable) <- 1:ncol(dtable)
65     dout <- NULL
66         rowelim<-NULL
67         pp('ncol entree : ',ncol(dtable))
68         pp('nrow entree',nrow(dtable))
69         listcol <- list()
70         listmere <- list()
71         list_fille <- list()
72         print('vire colonnes vides en entree')#FIXME : il ne doit pas y avoir de colonnes vides en entree !!
73         sdt<-colSums(dtable)
74         if (min(sdt)==0)
75                 dtable<-dtable[,-which(sdt==0)]
76     print('vire lignes vides en entree')
77     sdt<-rowSums(dtable)
78         if (min(sdt)==0) {
79         rowelim<-as.integer(rownames(dtable)[which(sdt==0)])
80         print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
81         print(rowelim)
82         print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
83                 dtable<-dtable[-which(sdt==0),]
84         }
85         mere<-1
86         for (i in 1:x) {
87                 clnb<-(i*2)
88                 listmere[[clnb]]<-mere
89                 listmere[[clnb+1]]<-mere
90                 list_fille[[mere]] <- c(clnb,clnb+1)
91                 listcol[[clnb]]<-vector()
92                 listcol[[clnb+1]]<-vector()
93                 #extraction du premier facteur de l'afc
94                 print('afc')
95                 pp('taille dtable dans boucle (col/row)',c(ncol(dtable),nrow(dtable)))
96                 afc<-boostana(dtable, nd=1, svd.method = svd.method, libsvdc.path=libsvdc.path)
97                 pp('SV',afc$singular.values)
98                 pp('V.P.', afc$eigen.values)
99                 coordrow <- as.matrix(afc$row.scores[,1])
100                 coordrowori<-coordrow
101                 row.names(coordrow)<-rownames(dtable)
102         coordrow <- cbind(coordrow,1:nrow(dtable))
103                 print('deb recherche meilleur partition')
104         ordert <- as.matrix(coordrow[order(coordrow[,1]),])
105         ordert <- cbind(ordert, 1:nrow(ordert))
106         ordert <- ordert[order(ordert[,2]),]
107
108                 listinter<-vector()
109                 listlim<-vector()
110         dtable <- dtable[order(ordert[,3]),]
111         sc <- colSums(dtable)
112         TT <- sum(sc)
113         sc1 <- dtable[1,]
114         sc2 <- colSums(dtable) - sc1 
115         chitable <- rbind(sc1, sc2)
116         compte <- 1
117         maxinter <- 0
118         rmax <- NULL
119
120         inert <- find.max(dtable, chitable, compte, rmax, maxinter, sc, TT)
121         print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
122                 pp('max inter phase 1', inert$maxinter/TT)#max(listinter))
123                 print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
124         ordert <- ordert[order(ordert[,3]),]
125                 listclasse<-ifelse(coordrowori<=ordert[(inert$rmax),1],clnb,clnb+1)
126             dtable <- dtable[order(ordert[,2]),]
127                 cl<-listclasse
128                 pp('TT',TT)
129                 #dtable<-cbind(dtable,'cl'= as.vector(cl))
130
131                 N1<-length(listclasse[listclasse==clnb])
132                 N2<-length(listclasse[listclasse==clnb+1])
133                 pp('N1',N1)
134                 pp('N2',N2)
135 ###################################################################
136 #                  reclassement des individus                     #
137 ###################################################################
138         if (!mode.patate) {
139                 malcl<-1000000000000
140                 it<-0
141                 listsub<-list()
142                 #in boucle
143             ln <- which(dtable==1, arr.ind=TRUE)
144             lnz <- list()
145             lnz[1:nrow(dtable)] <- 0
146     
147             for (k in 1:nrow(ln)) {lnz[[ln[k,1]]]<-append(lnz[[ln[k,1]]],ln[k,2])}
148             for (k in 1:nrow(dtable)) {lnz[[k]] <- lnz[[k]][-1]}
149                 TT<-sum(dtable)
150     
151                 while (malcl!=0 & N1>=5 & N2>=5) {
152                         it<-it+1
153                         listsub[[it]]<-vector()
154                 txt <- paste('nombre iteration', it)
155                         #pp('nombre iteration',it)
156                         vdelta<-vector()
157                         #dtable[,'cl']<-cl
158                         t1<-dtable[which(cl[,1]==clnb),]#[,-ncol(dtable)]
159                         t2<-dtable[which(cl[,1]==clnb+1),]#[,-ncol(dtable)]
160                         ncolt<-ncol(t1)
161                         #pp('ncolt',ncolt)
162     
163                 if (N1 != 1) {
164                     sc1<-colSums(t1)
165                 } else {
166                     sc1 <- t1
167                 }
168                 if (N2 != 1) {
169                             sc2<-colSums(t2)
170                 } else {
171                     sc2 <- t2
172                 }
173                         
174                 sc<-sc1+sc2
175                         chtableori<-rbind(sc1,sc2)
176                         chtable<-chtableori
177                         interori<-MyChiSq(chtableori,sc,TT)/TT#chisq.test(chtableori)$statistic#/TT
178                         txt <- paste(txt, ' - interori : ',interori)
179                 #pp('interori',interori)
180     
181                         N1<-nrow(t1)
182                         N2<-nrow(t2)
183     
184                         #pp('N1',N1)
185                         #pp('N2',N2)
186                         txt <- paste(txt, 'N1:', N1,'-N2:',N2)
187                 print(txt)
188                 compte <- 0
189                         for (l in lnz){
190                         chi.in<-chtable
191                         compte <- compte + 1
192                                         if(cl[compte]==clnb){
193                                                 chtable[1,l]<-chtable[1,l]-1
194                                                 chtable[2,l]<-chtable[2,l]+1
195                                         }else{
196                                                 chtable[1,l]<-chtable[1,l]+1
197                                                 chtable[2,l]<-chtable[2,l]-1
198                                         }
199                                         interswitch<-MyChiSq(chtable,sc,TT)/TT#chisq.test(chtable)$statistic/TT
200                                         ws<-interori-interswitch
201     
202                                         if (ws<0){
203                                                 interori<-interswitch
204                                                 if(cl[compte]==clnb){
205                                                         #sc1<-chtable[1,]
206                                                         #sc2<-chtable[2,]
207                                                         cl[compte]<-clnb+1
208                                                         listsub[[it]]<-append(listsub[[it]],compte)
209                                                 } else {
210                                                         #sc1<-chtable[1,]
211                                                         #sc2<-chtable[2,]
212                                                         cl[compte]<-clnb
213                                                         listsub[[it]]<-append(listsub[[it]],compte)
214                                                 }
215                                                 vdelta<-append(vdelta,compte)
216                         } else {
217                             chtable<-chi.in
218                                         }
219                    }
220     #                   for (val in vdelta) {
221     #                           if (cl[val]==clnb) {
222     #                                   cl[val]<-clnb+1
223     #                                   listsub[[it]]<-append(listsub[[it]],val)
224     #                                   }else {
225     #                                   cl[val]<-clnb
226     #                                   listsub[[it]]<-append(listsub[[it]],val)
227     #                           }
228     #                   }
229                         print('###################################')
230                         print('longueur < 0')
231                         malcl<-length(vdelta)
232
233                         if ((it>1)&&(!is.logical(listsub[[it]]))&&(!is.logical(listsub[[it-1]]))){
234                                 if (all(listsub[[it]]==listsub[[(it-1)]])){
235                                         malcl<-0
236                                 }
237                         }
238                         print(malcl)
239                         print('###################################')
240                 }
241         }
242                 #dtable<-cbind(dtable,'cl'=as.vector(cl))
243         #dtable[,'cl'] <-as.vector(cl)
244 #######################################################################
245 #                 Fin reclassement des individus                      #
246 #######################################################################
247 #               if (!(length(cl[cl==clnb])==1 || length(cl[cl==clnb+1])==1)) {
248                         #t1<-dtable[dtable[,'cl']==clnb,][,-ncol(dtable)]
249                         #t2<-dtable[dtable[,'cl']==clnb+1,][,-ncol(dtable)]
250                     t1<-dtable[which(cl[,1]==clnb),]#[,-ncol(dtable)]
251                         t2<-dtable[which(cl[,1]==clnb+1),]#[,-ncol(dtable)]
252             if (inherits(t1, "numeric")) {
253                 sc1 <- as.vector(t1)
254                 nrowt1 <- 1
255             } else {
256                 sc1 <- colSums(t1)
257                 nrowt1 <- nrow(t1)
258             }
259             if  (inherits(t2, "numeric")) {
260                 sc2 <- as.vector(t2)
261                 nrowt2 <- 1
262             } else {
263                 sc2 <- colSums(t2)
264                 nrowt2 <- nrow(t2)
265             }
266             chtable<-rbind(sc1,sc2)
267                         inter<-chisq.test(chtable)$statistic/TT
268                         pp('last inter',inter)
269                         print('=====================')
270                         #calcul de la specificite des colonnes
271                         mint<-min(nrowt1,nrowt2)
272                         maxt<-max(nrowt1,nrowt2)
273                         seuil<-round((1.9*(maxt/mint))+1.9,digit=6)
274                         #sink('/home/pierre/workspace/iramuteq/dev/findchi2.txt')
275 #                       print('ATTENTION SEUIL 3,84')
276 #                       seuil<-3.84
277                         pp('seuil',seuil)
278                         sominf<-0
279                         nv<-0
280                         nz<-0
281                         ncclnb<-0
282                         ncclnbp<-0
283             NN1<-0
284             NN2<-0
285             maxchip<-0
286             nbzeroun<-0
287             res1<-0
288             res2<-0
289             nbseuil<-0
290             nbexe<-0
291             nbcontrib<-0
292                         cn<-colnames(dtable)
293             #another try#########################################
294             one <- cbind(sc1,sc2)
295             cols <- c(length(which(cl==clnb)), length(which(cl==clnb+1))) 
296             print(cols)
297             colss <- matrix(rep(cols,ncol(dtable)), ncol=2, byrow=TRUE)
298             zero <- colss - one
299             rows <- cbind(rowSums(zero), rowSums(one))
300             n <- sum(cols)
301             for (m in 1:nrow(rows)) {
302                 obs <- t(matrix(c(zero[m,],one[m,]),2,2))
303                 E <- outer(rows[m,],cols,'*')/n
304                 if ((min(obs[2,])==0) & (min(obs[1,])!=0)) {
305                     chi <- seuil + 1
306                 } else if ((min(obs[1,])==0) & (min(obs[2,])!=0)) {
307                     chi <- seuil - 1
308                 } else if (any(obs < 10)) {
309                     chi <- sum((abs(obs - E) - 0.5)^2 / E)
310                 } else {
311                     chi <- sum(((obs - E)^2)/E)
312                 }
313                 if (is.na(chi)) {
314                     chi <- 0
315                 }
316                 if (chi > seuil) {
317                     if (obs[2,1] < E[2,1]) {
318                         listcol[[clnb]]<-append(listcol[[clnb]],cn[m])
319                         ncclnb<-ncclnb+1
320                     } else if (obs[2,2] < E[2,2]) {
321                                                 listcol[[clnb+1]]<-append(listcol[[clnb+1]],cn[m])
322                                                 ncclnbp<-ncclnbp+1
323                     }
324                 }
325             }
326             ######################################################
327                         print('resultats elim item')
328                         pp(clnb+1,length(listcol[[clnb+1]]))
329                         pp(clnb,length(listcol[[clnb]]))
330                         pp('ncclnb',ncclnb)
331                         pp('ncclnbp',ncclnbp)
332                         listrownamedtable<-rownames(dtable)
333                         listrownamedtable<-as.integer(listrownamedtable)
334                         newcol<-vector(length=nrow(dataori))
335                         #remplissage de la nouvelle colonne avec les nouvelles classes
336                         print('remplissage')
337 #                       num<-0
338             newcol[listrownamedtable] <- cl[,1]
339                         #recuperation de la classe precedante pour les cases vides
340                         print('recuperation classes precedentes')
341             if (i!=1) {
342                 newcol[which(newcol==0)] <- dout[,ncol(dout)][which(newcol==0)]
343             }
344             if(!is.null(rowelim)) {
345                 newcol[rowelim] <- 0
346             }
347                         tailleclasse<-as.matrix(summary(as.factor(as.character(newcol))))
348                         print('tailleclasse')
349                         print(tailleclasse)
350                         tailleclasse<-as.matrix(tailleclasse[!(rownames(tailleclasse)==0),])
351                         plusgrand<-which.max(tailleclasse)
352                         #???????????????????????????????????
353                         #Si 2 classes ont des effectifs egaux, on prend la premiere de la liste...
354                         if (length(plusgrand)>1) {
355                                 plusgrand<-plusgrand[1]
356                         }
357                         #????????????????????????????????????
358                         
359                         #constuction du prochain tableau a analyser
360                         print('construction tableau suivant')
361             dout<-cbind(dout,newcol)
362                         classe<-as.integer(rownames(tailleclasse)[plusgrand])
363                         dtable<-dataori[which(newcol==classe),]
364                         row.names(dtable)<-rownames(dataori)[which(newcol==classe)]
365             colnames(dtable) <- 1:ncol(dtable)
366                         mere<-classe
367                         listcolelim<-listcol[[as.integer(classe)]]
368                         mother<-listmere[[as.integer(classe)]]
369                         while (mother!=1) {
370                                 listcolelim<-append(listcolelim,listcol[[mother]])
371                                 mother<-listmere[[mother]]
372                         }
373                         
374                         listcolelim<-sort(unique(listcolelim))
375                         pp('avant',ncol(dtable))
376                         if (!is.logical(listcolelim)){
377                                 print('elimination colonne')
378                                 #dtable<-dtable[,-listcolelim]
379                 dtable<-dtable[,!(colnames(dtable) %in% listcolelim)]
380                         }
381                         pp('apres',ncol(dtable))
382                         #elimination des colonnes ne contenant que des 0
383                         print('vire colonne inf 3 dans boucle')
384                         sdt<-colSums(dtable)
385                         if (min(sdt)<=3)
386                                 dtable<-dtable[,-which(sdt<=3)]
387         
388                         #elimination des lignes ne contenant que des 0
389                         print('vire ligne vide dans boucle')
390                         if (ncol(dtable)==1) {
391                                 sdt<-dtable[,1]
392                         } else {
393                                 sdt<-rowSums(dtable)
394                         }
395                         if (min(sdt)==0) {
396                                 rowelim<-as.integer(rownames(dtable)[which(sdt==0)])
397                                 print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
398                                 print(rowelim)
399                                 print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
400                                 dtable<-dtable[-which(sdt==0),]
401                         }
402 #               }
403         }
404 #       sink()
405         res <- list(n1 = dout, list_mere = listmere, list_fille = list_fille)
406         res
407 }