1 #Author: Pierre Ratinaud
2 #Copyright (c) 2008 Pierre Ratinaud
7 #source('/home/pierre/workspace/iramuteq/Rscripts/afc.R')
8 #data<-read.table('output/corpus_bin.csv',header=TRUE,sep='\t')
9 #source('/home/pierre/workspace/iramuteq/Rscripts/anacor.R')
11 pp<-function(txt,val) {
15 MyChiSq<-function(x,sc,n){
17 E <- outer(sr, sc, "*")/n
18 STAT<-sum((abs(x - E))^2/E)
22 CHD<-function(data,x=9){
23 # sink('/home/pierre/workspace/iramuteq/dev/findchi2.txt')
24 dataori=as.matrix(data)
25 row.names(dataori)=rownames(data)
26 dtable=as.matrix(data)
27 row.names(dtable)=rownames(data)
29 pp('ncol entree : ',ncol(dtable))
30 pp('nrow entree',nrow(dtable))
34 print('vire colonnes vides en entree')#FIXME : il ne doit pas y avoir de colonnes vides en entree !!
37 dtable<-dtable[,-which(sdt==0)]
41 listmere[[clnb]]<-mere
42 listmere[[clnb+1]]<-mere
43 list_fille[[mere]] <- c(clnb,clnb+1)
44 listcol[[clnb]]<-vector()
45 listcol[[clnb+1]]<-vector()
46 #extraction du premier facteur de l'afc
49 #afc<-corresp(dtable,nd=1)
51 pp('taille dtable dans boucle (col/row)',c(ncol(dtable),nrow(dtable)))
52 afc<-boostana(dtable,nd=1)
53 #afc<-dudi.coa(dtable,scannf=FALSE,nf=1)
54 pp('SV',afc$singular.values)
55 pp('V.P.', afc$eigen.values)
56 #pp('V.P.',afc$eig[1])
57 #pp('V.P.',afc$factexpl[1])
59 afcchi<-chisq.test(dtable)
60 Tinert<-afcchi$statistic/sum(dtable)
61 pp('inertie totale',Tinert)
62 #coordonnees des colonnes sur le premier facteur
63 #coordrow=afc$rowcoord
64 coordrow=as.matrix(afc$row.scores)
65 #coordrow<-as.matrix(afc$rproj[,1])
66 #coordrow<-as.matrix(afc$li)
68 #row.names(coordrow)<-afc$rownames
69 row.names(coordrow)<-rownames(dtable)
70 #classement en fonction de la position sur le premier facteur
71 print('deb recherche meilleur partition')
72 ordertable<-cbind(dtable,coordrow)
73 coordrow<-as.matrix(coordrow[order(coordrow[,1]),])
74 ordertable<-ordertable[order(ordertable[,ncol(ordertable)]),][,-ncol(ordertable)]
80 sc2<-colSums(dtable)-ordertable[1,]
81 chitable<-rbind(sc1,sc2)
85 #################################################################
86 for (l in 2:(nrow(dtable)-2)){
87 chitable[1,]<-chitable[1,]+ordertable[l,]
88 chitable[2,]<-chitable[2,]-ordertable[l,]
89 # sc1<-sc1+ordertable[l,]
90 # sc2<-sc2-ordertable[l,]
91 # chitable<-rbind(sc1,sc2)
92 chi<-MyChiSq(chitable,sc,TT)
97 # listinter<-append(listinter,MyChiSq(chitable,sc,TT))#,chisq.test(chitable)$statistic/TT)
100 print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
101 pp('max inter phase 1', maxinter/TT)#max(listinter))
102 print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
103 #maxinter<-which.max(listinter)+1
105 listclasse<-ifelse(coordrowori<=coordrow[(rmax),1],clnb,clnb+1)
110 #dtable<-cbind(dtable,'cl'= as.vector(cl))
112 N1<-length(listclasse[listclasse==clnb])
113 N2<-length(listclasse[listclasse==clnb+1])
116 ###################################################################
117 # reclassement des individus #
118 ###################################################################
124 while (malcl!=0 & N1>=5 & N2>=5) {
126 listsub[[it]]<-vector()
127 pp('nombre iteration',it)
130 t1<-dtable[cl==clnb,]#[,-ncol(dtable)]
131 t2<-dtable[cl==clnb+1,]#[,-ncol(dtable)]
138 chtableori<-rbind(sc1,sc2)
139 interori<-MyChiSq(chtableori,sc,TT)/TT#chisq.test(chtableori)$statistic#/TT
141 pp('interori',interori)
147 for (l in 1:nrow(dtable)){
149 chtable[1,]<-sc1-dtable[l,]
150 chtable[2,]<-sc2+dtable[l,]
152 chtable[1,]<-sc1+dtable[l,]
153 chtable[2,]<-sc2-dtable[l,]
155 interswitch<-MyChiSq(chtable,sc,TT)/TT#chisq.test(chtable)$statistic/TT
156 ws<-interori-interswitch
159 interori<-interswitch
164 listsub[[it]]<-append(listsub[[it]],l)
169 listsub[[it]]<-append(listsub[[it]],l)
171 vdelta<-append(vdelta,l)
174 # for (val in vdelta) {
175 # if (cl[val]==clnb) {
177 # listsub[[it]]<-append(listsub[[it]],val)
180 # listsub[[it]]<-append(listsub[[it]],val)
183 print('###################################')
184 print('longueur < 0')
185 malcl<-length(vdelta)
186 if ((it>1)&&(!is.logical(listsub[[it]]))){
187 if (listsub[[it]]==listsub[[(it-1)]]){
192 print('###################################')
194 dtable<-cbind(dtable,'cl'=as.vector(cl))
195 #######################################################################
196 # Fin reclassement des individus #
197 #######################################################################
198 # if (!(length(cl[cl==clnb])==1 || length(cl[cl==clnb+1])==1)) {
199 t1<-dtable[dtable[,'cl']==clnb,][,-ncol(dtable)]
200 t2<-dtable[dtable[,'cl']==clnb+1,][,-ncol(dtable)]
202 chtable<-rbind(colSums(t1),colSums(t2))
203 inter<-chisq.test(chtable)$statistic/TT
204 pp('last inter',inter)
205 print('=====================')
207 #calcul de la specificite des colonnes
208 mint<-min(nrow(t1),nrow(t2))
209 maxt<-max(nrow(t1),nrow(t2))
210 seuil<-round((1.9*(maxt/mint))+1.9,digit=6)
211 #sink('/home/pierre/workspace/iramuteq/dev/findchi2.txt')
212 # print('ATTENTION SEUIL 3,84')
229 cn<-colnames(dtable)[1:(ncol(dtable)-1)]
230 # for (k in 1:(ncol(dtable)-1)) {
232 # tab<-table(dtable[,k],dtable[,ncol(dtable)])
235 # #chidemoi<-(sum(t)*(((t[1,1]*t[2,2])-(t[1,2]*t[2,1]))^2))/((rowSums(t)[1])*(rowSums(t)[2])*(colSums(t)[1])*(colSums(t)[2]))
236 # if (rownames(tab)[1]==1 & nrow(tab)==1) {
237 # tab<-rbind(c(0,0),tab[1])
238 # print('tableau vide dessus')
241 # chi<-chisq.test(tab,correct=TRUE)
243 # chi<-chisq.test(tab,correct=FALSE)
245 # if ((min(tab[2,])==0) & (min(tab[1,])!=0)){
246 # chi$statistic<-seuil+1
247 # # print('min tab 2 == 0')
249 ## if(((tab[2,1]>tab[1,1]) & (tab[2,2]>tab[1,2]))){
251 ## chi$statistic<-seuil-1
254 # if ((min(tab[1,])==0) & (min(tab[2,])!=0)){
255 # chi$statistic<-seuil-1
256 # # print('min tab 1 == 0')
258 # if (is.na(chi$statistic)) {
260 # print('NA dans chi$statistic')
262 ## print('#####################')
263 # if (chi$statistic>seuil){
264 # i# print('sup seuil')
265 # if (tab[2,1]<chi$expected[2,1]){
266 # listcol[[clnb]]<-append(listcol[[clnb]],cn[k])
268 ## print('@@@@@@@@specifique de la classe@@@@@@')
270 # } else if (tab[2,2]<chi$expected[2,2]){
271 # listcol[[clnb+1]]<-append(listcol[[clnb+1]],cn[k])
273 ## print('@@@@@@@@specifique de la classe@@@@@@')
277 ## if (chi$statistic<seuil){
278 ## print('inf seuil')
280 ## print(chi$expected)
281 ## print(chi$statistic)
287 ## chi<-chisq.test(tab,correct=FALSE)
288 ## print('sans correction')
289 ## print(chi$statistic)
290 ## print(chi$expected)
291 ## print(chi$residuals)
292 ## chit<-((tab-chi$expected)^2)/chi$expected
293 ## print(mean(chi$residuals))
294 ## print(sd(as.vector(chi$residuals)))
295 # #print(chi$residuals/sd(as.vector(chi$residuals)))
296 # #print(abs(chi$residuals/sd(as.vector(chi$residuals))))
297 # #print(qnorm(abs(chi$residuals/sd(as.vector(chi$residuals)))))
298 ## difftab<-tab-chi$expected
299 # # print('contribution')
300 # #print((difftab-mean(difftab))/sd(as.vector(difftab)))
301 # #print(abs((chit-mean(chit))/sd(as.vector(chit))))
302 ## tabin1<-matrix(1,2,2)
303 ## tabin1<-tabin1-(colSums(tab)/sum(tab))
304 ## tabin2<-matrix(1,2,2)
305 ## tabin2<-(tabin2-(rowSums(tab)/sum(tab)))
306 ## tabinv<-tabin1*tabin2
311 ## contrib<-(tab-chi$expected)/sqrt(chi$expected * ((1 - RS/GT) %*% t(1 - CS/GT)))
313 ## print('TTTTTTTTTTTTTTTTTTTTTTTTTTTTTT')
315 ## if (((which.max(chit)==2) || (which.max(chit)==4)) & chi$statistic>=seuil) {
316 ## if (which.max(chit)==2) NN1<-NN1+1
317 ## if (which.max(chit)==4) NN2<-NN2+1
319 ## if (max(abs(contrib[2,])>=1.96)) {
320 ## nbcontrib<-nbcontrib+1
322 ## if (max(chit[2,]>=seuil)) maxchip<-maxchip+1
323 ## if (chi$statistic >=seuil) nbseuil<-nbseuil+1
324 ## if ((min(tab[2,])==0) & (chi$statistic<seuil)) nbzeroun<-nbzeroun+1
325 ## if ((((which.max(chi$residual)==2) || (which.max(chi$residual)==4) || (which.min(chi$residual)==2) || (which.min(chi$residual)==4))) & chi$statistic>=seuil) {
326 ## if ((which.max(chi$residual)==2) || (which.min(chi$residual)==4)) res1<-res1+1
327 ## if ((which.max(chi$residual)==4) || (which.min(chi$residual)==2)) res2<-res2+1
328 ## } else if (!((which.max(chi$residual)==2) || (which.max(chi$residual)==4) || (which.min(chi$residual)==2) || (which.min(chi$residual)==4)) & chi$statistic>=seuil) {
329 ## print('AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA')
330 ## print('AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA')
332 ## if (length(unique(as.vector(chi$residual)))==2) {
336 ## print('avec correction')
337 ## if (min(tab)<5) chi<-chisq.test(tab,correct=TRUE)
338 ## print(chi$statistic)
339 ## #if (chi$statistic >=seuil) nbseuil<-nbseuil+1
340 ## print(chi$expected)
341 ## print(chi$residuals)
342 ## chit<-((abs(tab-chi$expected)-0.5)^2)/chi$expected
344 ## print('#####################')
348 ## pp('maxchip',maxchip)
349 ## pp('nbzeroun',nbzeroun)
352 ## pp('nbseuil',nbseuil)
354 ## pp('nbcontrib', nbcontrib)
355 print('resultats elim item')
356 pp(clnb+1,length(listcol[[clnb+1]]))
357 pp(clnb,length(listcol[[clnb]]))
358 # pp('inf seuil',sominf)
361 pp('ncclnbp',ncclnbp)
365 listrownamedtable<-rownames(dtable)
366 listrownamedtable<-as.integer(listrownamedtable)
367 newcol<-vector(length=nrow(dataori))
368 #remplissage de la nouvelle colonne avec les nouvelles classes
371 for (ligne in listrownamedtable) {
373 newcol[ligne]<-as.integer(as.vector(dtable[,'cl'][num])[1])
375 #recuperation de la classe precedante pour les cases vides
376 print('recuperation classes precedentes')
377 matori<-as.matrix(dataori)
380 for (ligne in 1:length(newcol)) {
381 # print(newcol[ligne])
382 if (newcol[ligne]==0) { # ce test renvoie un warning
383 if (ligne %in% rowelim){
386 newcol[ligne]<-matori[ligne,length(matori[1,])]
393 dataori<-cbind(dataori,newcol)
395 tailleclasse<-as.matrix(summary(as.factor(as.character(dataori[,ncol(dataori)]))))
396 #tailleclasse<-as.integer(tailleclasse)
397 print('tailleclasse')
399 tailleclasse<-as.matrix(tailleclasse[!(rownames(tailleclasse)==0),])
400 plusgrand<-which.max(tailleclasse)
401 #???????????????????????????????????
402 #Si 2 classes ont des effectifs egaux, on prend la premiere de la liste...
403 if (length(plusgrand)>1) {
404 plusgrand<-plusgrand[1]
406 #????????????????????????????????????
408 #constuction du prochain tableau a analyser
409 print('construction tableau suivant')
410 classe<-as.integer(rownames(tailleclasse)[plusgrand])
411 dtable<-dataori[dataori[,ncol(dataori)]==classe,]
412 row.names(dtable)<-rownames(dataori[dataori[,ncol(dataori)]==classe,])
413 dtable<-dtable[,1:(ncol(dtable)-i)]
415 listcolelim<-listcol[[as.integer(classe)]]
416 mother<-listmere[[as.integer(classe)]]
418 listcolelim<-append(listcolelim,listcol[[mother]])
419 mother<-listmere[[mother]]
422 listcolelim<-sort(unique(listcolelim))
423 pp('avant',ncol(dtable))
424 if (!is.logical(listcolelim)){
425 print('elimination colonne')
426 #dtable<-dtable[,-listcolelim]
427 dtable<-dtable[,!(colnames(dtable) %in% listcolelim)]
429 pp('apres',ncol(dtable))
430 #elimination des colonnes ne contenant que des 0
431 print('vire colonne inf 3 dans boucle')
434 dtable<-dtable[,-which(sdt<=3)]
436 #elimination des lignes ne contenant que des 0
437 print('vire ligne vide dans boucle')
438 if (ncol(dtable)==1) {
444 rowelim<-as.integer(rownames(dtable)[which(sdt==0)])
445 print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
447 print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
448 dtable<-dtable[-which(sdt==0),]
453 res <- list(n1 = dataori[,(ncol(dataori)-x+1):ncol(dataori)], list_mere = listmere, list_fille = list_fille)