1 #Author: Pierre Ratinaud
2 #Copyright (c) 2008 Pierre Ratinaud
9 #source('/home/pierre/workspace/iramuteq/Rscripts/afc.R')
10 #data<-read.table('output/corpus_bin.csv',header=TRUE,sep='\t')
11 #source('/home/pierre/workspace/iramuteq/Rscripts/anacor.R')
13 pp<-function(txt,val) {
17 MyChiSq<-function(x,sc,n){
19 E <- outer(sr, sc, "*")/n
20 STAT<-sum((abs(x - E))^2/E)
24 CHD<-function(data,x=9){
25 # sink('/home/pierre/workspace/iramuteq/dev/findchi2.txt')
26 dataori=as.matrix(data)
27 row.names(dataori)=rownames(data)
28 dtable=as.matrix(data)
29 row.names(dtable)=rownames(data)
31 pp('ncol entree : ',ncol(dtable))
32 pp('nrow entree',nrow(dtable))
36 print('vire colonnes vides en entree')#FIXME : il ne doit pas y avoir de colonnes vides en entree !!
39 dtable<-dtable[,-which(sdt==0)]
43 listmere[[clnb]]<-mere
44 listmere[[clnb+1]]<-mere
45 list_fille[[mere]] <- c(clnb,clnb+1)
46 listcol[[clnb]]<-vector()
47 listcol[[clnb+1]]<-vector()
48 #extraction du premier facteur de l'afc
51 #afc<-corresp(dtable,nd=1)
53 pp('taille dtable dans boucle (col/row)',c(ncol(dtable),nrow(dtable)))
54 # afc<-boostana(dtable,nd=1)
55 # #afc<-dudi.coa(dtable,scannf=FALSE,nf=1)
56 # pp('SV',afc$singular.values)
57 # pp('V.P.', afc$eigen.values)
58 # #pp('V.P.',afc$eig[1])
59 # #pp('V.P.',afc$factexpl[1])
61 # afcchi<-chisq.test(dtable)
62 # Tinert<-afcchi$statistic/sum(dtable)
63 # pp('inertie totale',Tinert)
64 # #coordonnees des colonnes sur le premier facteur
65 # #coordrow=afc$rowcoord
66 # coordrow=as.matrix(afc$row.scores)
67 # #coordrow<-as.matrix(afc$rproj[,1])
68 # #coordrow<-as.matrix(afc$li)
69 # coordrowori<-coordrow
70 # #row.names(coordrow)<-afc$rownames
71 # row.names(coordrow)<-rownames(dtable)
72 # #classement en fonction de la position sur le premier facteur
73 # print('deb recherche meilleur partition')
74 # ordertable<-cbind(dtable,coordrow)
75 # coordrow<-as.matrix(coordrow[order(coordrow[,1]),])
76 # ordertable<-ordertable[order(ordertable[,ncol(ordertable)]),][,-ncol(ordertable)]
82 # sc2<-colSums(dtable)-ordertable[1,]
83 # chitable<-rbind(sc1,sc2)
84 # #listinter<-vector()
87 ##################################################################
88 # for (l in 2:(nrow(dtable)-2)){
89 # chitable[1,]<-chitable[1,]+ordertable[l,]
90 # chitable[2,]<-chitable[2,]-ordertable[l,]
91 ## sc1<-sc1+ordertable[l,]
92 ## sc2<-sc2-ordertable[l,]
93 ## chitable<-rbind(sc1,sc2)
94 # chi<-MyChiSq(chitable,sc,TT)
99 # # listinter<-append(listinter,MyChiSq(chitable,sc,TT))#,chisq.test(chitable)$statistic/TT)
102 # print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
103 # pp('max inter phase 1', maxinter/TT)#max(listinter))
104 # print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
105 # #maxinter<-which.max(listinter)+1
107 # listclasse<-ifelse(coordrowori<=coordrow[(rmax),1],clnb,clnb+1)
112 # #dtable<-cbind(dtable,'cl'= as.vector(cl))
114 # N1<-length(listclasse[listclasse==clnb])
115 # N2<-length(listclasse[listclasse==clnb+1])
118 ####################################################################
119 ## reclassement des individus #
120 ####################################################################
121 # malcl<-1000000000000
126 # while (malcl!=0 & N1>=5 & N2>=5) {
128 # listsub[[it]]<-vector()
129 # pp('nombre iteration',it)
132 # t1<-dtable[cl==clnb,]#[,-ncol(dtable)]
133 # t2<-dtable[cl==clnb+1,]#[,-ncol(dtable)]
140 # chtableori<-rbind(sc1,sc2)
141 # interori<-MyChiSq(chtableori,sc,TT)/TT#chisq.test(chtableori)$statistic#/TT
142 # chtable<-chtableori
143 # pp('interori',interori)
149 # for (l in 1:nrow(dtable)){
151 # chtable[1,]<-sc1-dtable[l,]
152 # chtable[2,]<-sc2+dtable[l,]
154 # chtable[1,]<-sc1+dtable[l,]
155 # chtable[2,]<-sc2-dtable[l,]
157 # interswitch<-MyChiSq(chtable,sc,TT)/TT#chisq.test(chtable)$statistic/TT
158 # ws<-interori-interswitch
161 # interori<-interswitch
166 # listsub[[it]]<-append(listsub[[it]],l)
171 # listsub[[it]]<-append(listsub[[it]],l)
173 # vdelta<-append(vdelta,l)
176 ## for (val in vdelta) {
177 ## if (cl[val]==clnb) {
179 ## listsub[[it]]<-append(listsub[[it]],val)
182 ## listsub[[it]]<-append(listsub[[it]],val)
185 # print('###################################')
186 # print('longueur < 0')
187 # malcl<-length(vdelta)
188 # if ((it>1)&&(!is.logical(listsub[[it]]))){
189 # if (listsub[[it]]==listsub[[(it-1)]]){
194 # print('###################################')
196 ##########################################################################
198 # clori<-kmeans(dtable,2)$cluster
199 # cl<-ifelse(clori==1,clnb,clnb+1)
202 clori<-pam(dist(dtable,method='binary'),2,diss=TRUE)$cluster
203 cl<-ifelse(clori==1,clnb,clnb+1)
205 dtable<-cbind(dtable,'cl'=as.vector(cl))
207 #######################################################################
208 # Fin reclassement des individus #
209 #######################################################################
210 # if (!(length(cl[cl==clnb])==1 || length(cl[cl==clnb+1])==1)) {
211 t1<-dtable[dtable[,'cl']==clnb,][,-ncol(dtable)]
212 t2<-dtable[dtable[,'cl']==clnb+1,][,-ncol(dtable)]
214 chtable<-rbind(colSums(t1),colSums(t2))
215 inter<-chisq.test(chtable)$statistic/TT
216 pp('last inter',inter)
217 print('=====================')
219 #calcul de la specificite des colonnes
220 mint<-min(nrow(t1),nrow(t2))
221 maxt<-max(nrow(t1),nrow(t2))
222 seuil<-round((1.9*(maxt/mint))+1.9,digit=6)
223 #sink('/home/pierre/workspace/iramuteq/dev/findchi2.txt')
224 # print('ATTENTION SEUIL 3,84')
241 cn<-colnames(dtable)[1:(ncol(dtable)-1)]
242 for (k in 1:(ncol(dtable)-1)) {
244 tab<-table(dtable[,k],dtable[,ncol(dtable)])
247 #chidemoi<-(sum(t)*(((t[1,1]*t[2,2])-(t[1,2]*t[2,1]))^2))/((rowSums(t)[1])*(rowSums(t)[2])*(colSums(t)[1])*(colSums(t)[2]))
248 if (rownames(tab)[1]==1 & nrow(tab)==1) {
249 tab<-rbind(c(0,0),tab[1])
250 print('tableau vide dessus')
253 chi<-chisq.test(tab,correct=TRUE)
255 chi<-chisq.test(tab,correct=FALSE)
257 if ((min(tab[2,])==1) & (min(tab[1,])!=0)){
258 chi$statistic<-seuil+1
259 # print('min tab 2 == 0')
261 # if(((tab[2,1]>tab[1,1]) & (tab[2,2]>tab[1,2]))){
263 # chi$statistic<-seuil-1
266 if ((min(tab[1,])==0) & (min(tab[2,])!=0)){
267 chi$statistic<-seuil-1
268 # print('min tab 1 == 0')
270 if (is.na(chi$statistic)) {
272 print('NA dans chi$statistic')
274 # print('#####################')
275 if (chi$statistic>seuil){
276 i# print('sup seuil')
277 if (tab[2,1]<chi$expected[2,1]){
278 listcol[[clnb]]<-append(listcol[[clnb]],cn[k])
280 # print('@@@@@@@@specifique de la classe@@@@@@')
282 } else if (tab[2,2]<chi$expected[2,2]){
283 listcol[[clnb+1]]<-append(listcol[[clnb+1]],cn[k])
285 # print('@@@@@@@@specifique de la classe@@@@@@')
289 # if (chi$statistic<seuil){
292 # print(chi$expected)
293 # print(chi$statistic)
299 # chi<-chisq.test(tab,correct=FALSE)
300 # print('sans correction')
301 # print(chi$statistic)
302 # print(chi$expected)
303 # print(chi$residuals)
304 # chit<-((tab-chi$expected)^2)/chi$expected
305 # print(mean(chi$residuals))
306 # print(sd(as.vector(chi$residuals)))
307 #print(chi$residuals/sd(as.vector(chi$residuals)))
308 #print(abs(chi$residuals/sd(as.vector(chi$residuals))))
309 #print(qnorm(abs(chi$residuals/sd(as.vector(chi$residuals)))))
310 # difftab<-tab-chi$expected
311 # print('contribution')
312 #print((difftab-mean(difftab))/sd(as.vector(difftab)))
313 #print(abs((chit-mean(chit))/sd(as.vector(chit))))
314 # tabin1<-matrix(1,2,2)
315 # tabin1<-tabin1-(colSums(tab)/sum(tab))
316 # tabin2<-matrix(1,2,2)
317 # tabin2<-(tabin2-(rowSums(tab)/sum(tab)))
318 # tabinv<-tabin1*tabin2
323 # contrib<-(tab-chi$expected)/sqrt(chi$expected * ((1 - RS/GT) %*% t(1 - CS/GT)))
325 # print('TTTTTTTTTTTTTTTTTTTTTTTTTTTTTT')
327 # if (((which.max(chit)==2) || (which.max(chit)==4)) & chi$statistic>=seuil) {
328 # if (which.max(chit)==2) NN1<-NN1+1
329 # if (which.max(chit)==4) NN2<-NN2+1
331 # if (max(abs(contrib[2,])>=1.96)) {
332 # nbcontrib<-nbcontrib+1
334 # if (max(chit[2,]>=seuil)) maxchip<-maxchip+1
335 # if (chi$statistic >=seuil) nbseuil<-nbseuil+1
336 # if ((min(tab[2,])==0) & (chi$statistic<seuil)) nbzeroun<-nbzeroun+1
337 # if ((((which.max(chi$residual)==2) || (which.max(chi$residual)==4) || (which.min(chi$residual)==2) || (which.min(chi$residual)==4))) & chi$statistic>=seuil) {
338 # if ((which.max(chi$residual)==2) || (which.min(chi$residual)==4)) res1<-res1+1
339 # if ((which.max(chi$residual)==4) || (which.min(chi$residual)==2)) res2<-res2+1
340 # } else if (!((which.max(chi$residual)==2) || (which.max(chi$residual)==4) || (which.min(chi$residual)==2) || (which.min(chi$residual)==4)) & chi$statistic>=seuil) {
341 # print('AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA')
342 # print('AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA')
344 # if (length(unique(as.vector(chi$residual)))==2) {
348 # print('avec correction')
349 # if (min(tab)<5) chi<-chisq.test(tab,correct=TRUE)
350 # print(chi$statistic)
351 # #if (chi$statistic >=seuil) nbseuil<-nbseuil+1
352 # print(chi$expected)
353 # print(chi$residuals)
354 # chit<-((abs(tab-chi$expected)-0.5)^2)/chi$expected
356 # print('#####################')
360 # pp('maxchip',maxchip)
361 # pp('nbzeroun',nbzeroun)
364 # pp('nbseuil',nbseuil)
366 # pp('nbcontrib', nbcontrib)
367 print('resultats elim item')
368 pp(clnb+1,length(listcol[[clnb+1]]))
369 pp(clnb,length(listcol[[clnb]]))
370 # pp('inf seuil',sominf)
373 pp('ncclnbp',ncclnbp)
377 listrownamedtable<-rownames(dtable)
378 listrownamedtable<-as.integer(listrownamedtable)
379 newcol<-vector(length=nrow(dataori))
380 #remplissage de la nouvelle colonne avec les nouvelles classes
383 for (ligne in listrownamedtable) {
385 newcol[ligne]<-as.integer(as.vector(dtable[,'cl'][num])[1])
387 #recuperation de la classe precedante pour les cases vides
388 print('recuperation classes precedentes')
389 matori<-as.matrix(dataori)
392 for (ligne in 1:length(newcol)) {
393 # print(newcol[ligne])
394 if (newcol[ligne]==0) { # ce test renvoie un warning
395 if (ligne %in% rowelim){
398 newcol[ligne]<-matori[ligne,length(matori[1,])]
405 dataori<-cbind(dataori,newcol)
407 tailleclasse<-as.matrix(summary(as.factor(as.character(dataori[,ncol(dataori)]))))
408 #tailleclasse<-as.integer(tailleclasse)
409 print('tailleclasse')
411 tailleclasse<-as.matrix(tailleclasse[!(rownames(tailleclasse)==0),])
412 plusgrand<-which.max(tailleclasse)
413 #???????????????????????????????????
414 #Si 2 classes ont des effectifs egaux, on prend la premiere de la liste...
415 if (length(plusgrand)>1) {
416 plusgrand<-plusgrand[1]
418 #????????????????????????????????????
420 #constuction du prochain tableau a analyser
421 print('construction tableau suivant')
422 classe<-as.integer(rownames(tailleclasse)[plusgrand])
423 dtable<-dataori[dataori[,ncol(dataori)]==classe,]
424 row.names(dtable)<-rownames(dataori[dataori[,ncol(dataori)]==classe,])
425 dtable<-dtable[,1:(ncol(dtable)-i)]
427 listcolelim<-listcol[[as.integer(classe)]]
428 mother<-listmere[[as.integer(classe)]]
430 listcolelim<-append(listcolelim,listcol[[mother]])
431 mother<-listmere[[mother]]
434 listcolelim<-sort(unique(listcolelim))
435 pp('avant',ncol(dtable))
436 if (!is.logical(listcolelim)){
437 print('elimination colonne')
438 #dtable<-dtable[,-listcolelim]
439 dtable<-dtable[,!(colnames(dtable) %in% listcolelim)]
441 pp('apres',ncol(dtable))
442 #elimination des colonnes ne contenant que des 0
443 print('vire colonne inf 3 dans boucle')
446 dtable<-dtable[,-which(sdt<=3)]
448 #elimination des lignes ne contenant que des 0
449 print('vire ligne vide dans boucle')
450 if (ncol(dtable)==1) {
456 rowelim<-as.integer(rownames(dtable)[which(sdt==0)])
457 print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
459 print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
460 dtable<-dtable[-which(sdt==0),]
465 res <- list(n1 = dataori[,(ncol(dataori)-x+1):ncol(dataori)], list_mere = listmere, list_fille = list_fille)
469 #dataout<-CHD(data,9)
471 #igraph pour graph de relation