1 #datadm<-read.table('/home/pierre/.hippasos/corpus_agir_CHDS_16/fileACTtemp.csv', header=TRUE,sep=';', quote='\"',row.names = 1, na.strings = 'NA')
3 #dissmat<-daisy(dataact, metric = 'gower', stand = FALSE)
4 #chd<-diana(dissmat,diss=TRUE,)
6 #sortheight<-sort(height,decreasing=TRUE)
7 FindBestCluster<-function (x,Max=15) {
23 ListClasseOk[[j]]<-i+1
30 #BestCLusterNb<-FindBestCluster(sortheight)
31 #classes<-as.data.frame(cutree(as.hclust(chd), k=6))[,1]
32 #datadm<-cbind(datadm,classes)
33 #clusplot(datadm,classes,shade=TRUE,color=TRUE,labels=4)
35 BuildContTable<- function (x) {
37 for (i in 1:(ncol(x)-1)) {
38 coltable<-table(x[,i],x$classes)
39 afctable<-rbind(afctable,coltable)
44 PrintProfile<- function(dataclasse,profileactlist,profileetlist,antiproact,antiproet,clusternb,profileout,antiproout,profilesuplist=NULL,antiprosup=NULL) {
48 cltot<-as.data.frame(dataclasse[dataclasse[,ncol(dataclasse)]!=0,])
49 cltot<-as.data.frame(as.character(cltot[,ncol(cltot)]))
51 classes<-as.data.frame(as.character(dataclasse[,ncol(dataclasse)]))
52 classes.s<-as.data.frame(summary(cltot[,1],maxsum=500))
53 profile<-rbind(profile,c('***','nb classes',clusternb,'***','',''))
54 antipro<-rbind(antipro,c('***','nb classes',clusternb,'***','',''))
55 for(i in 1:clusternb) {
56 profile<-rbind(profile,c('**','classe',i,'**','',''))
57 nbind<-classes.s[which(rownames(classes.s)==i),1]
58 pr<-round((nbind/tot)*100,digits=2)
59 profile<-rbind(profile,c('****',nbind,tot,pr,'****',''))
60 if (length(profileactlist[[1]][[i]])!=0){
61 profile<-rbind(profile,as.matrix(profileactlist[[1]][[i]]))
63 if (!is.null(profilesuplist)) {
64 profile<-rbind(profile,c('*****','*','*','*','*','*'))
65 if (length(profilesuplist[[1]][[i]])!=0) {
66 profile<-rbind(profile,as.matrix(profilesuplist[[1]][[i]]))
69 if (!is.null(profileetlist)) {
70 profile<-rbind(profile,c('*','*','*','*','*','*'))
71 if (length(profileetlist[[1]][[i]])!=0) {
72 profile<-rbind(profile,as.matrix(profileetlist[[1]][[i]]))
75 antipro<-rbind(antipro,c('**','classe',i,'**','',''))
76 antipro<-rbind(antipro,c('****',nbind,tot,pr,'****',''))
77 if (length(antiproact[[1]][[i]])!=0) {
78 antipro<-rbind(antipro,as.matrix(antiproact[[1]][[i]]))
80 if (!is.null(profilesuplist)) {
81 antipro<-rbind(antipro,c('*****','*','*','*','*','*'))
82 if (length(antiprosup[[1]][[i]])!=0) {
83 antipro<-rbind(antipro,as.matrix(antiprosup[[1]][[i]]))
86 if (!is.null(profileetlist)) {
87 antipro<-rbind(antipro,c('*','*','*','*','*','*'))
88 if (length(antiproet[[1]][[i]])!=0) {
89 antipro<-rbind(antipro,as.matrix(antiproet[[1]][[i]]))
93 write.csv2(profile,file=profileout,row.names=FALSE)
94 write.csv2(antipro,file=antiproout,row.names=FALSE)
97 AddCorrelationOk<-function(afc) {
98 rowcoord<-afc$rowcoord
99 colcoord<-afc$colcoord
100 factor <- ncol(rowcoord)
101 hypo<-function(rowcoord,ligne) {
103 for (i in 1:factor) {
104 somme<-somme+(rowcoord[ligne,i])^2
108 cor<-function(d,hypo) {
111 CompCrl<-function(rowcol) {
113 for (i in 1:factor) {
114 for(ligne in 1:nrow(rowcol)) {
115 out[ligne,i]<-cor(rowcol[ligne,i],hypo(rowcol,ligne))
120 afc$rowcrl<-CompCrl(rowcoord)
121 afc$colcrl<-CompCrl(colcoord)
125 AsLexico<- function(x) {
132 mod.names<-rownames(x)
133 #problem exemple aurelia
134 for (classe in 1:ncol(x)) {
136 for (ligne in 1:nrow(x)) {
137 conttable<-matrix(0,2,2)
138 conttable[1,1]<-as.numeric(x[ligne,classe])
139 conttable[1,2]<-sumrow[ligne]-conttable[1,1]
140 conttable[2,1]<-sumcol[classe]-conttable[1,1]
141 conttable[2,2]<-tot-sumrow[ligne]-conttable[2,1]
142 chiresult<-chisq.test(conttable,correct=TRUE)
143 if (is.na(chiresult$p.value)) {
145 chiresult$statistic<-0
147 obsv<-chiresult$expected
148 pval<-as.character(format(chiresult$p.value,scientific=TRUE))
149 spval<-strsplit(pval,'e')
153 if (conttable[1,1]>obsv[1,1]) {
154 tablep[ligne,classe]<-as.numeric(spval[[1]][2])*(-1)
155 tablesqr[ligne,classe]<-round(chiresult$statistic,digits=3)
157 else if (conttable[1,1]<obsv[1,1]){
158 tablep[ligne,classe]<-as.numeric(spval[[1]][2])
159 tablesqr[ligne,classe]<--round(chiresult$statistic,digits=3)
164 eff_relatif<-(x/sumcol)*1000
166 output[[2]]<-tablesqr
167 output[[3]]<-eff_relatif
171 MyChiSq<-function(x){
175 E <- outer(sr, sc, "*")/n
176 STAT<-sum((abs(x - E))^2/E)
177 PVAL <- pchisq(STAT, 1, lower.tail = FALSE)
178 chi<-list(statistic = STAT, expected = E, p.value = PVAL)
182 AsLexico2<- function(mat, chip = FALSE) {
190 for (i in 1:nrow(contcs)) {
194 contrs[,1:ncol(contrs)] <- sumrow
195 conttot <- matrix(tot, nrow = nrow(mat), ncol = ncol(mat))
196 cont12 <- contrs - mat
197 cont21 <- contcs - mat
198 cont22 <- conttot - contrs - cont21
199 mod.names<-rownames(mat)
200 make_chi_lex <- function(x) {
206 chiresult<-MyChiSq(tb)
208 if (is.na(chiresult$p.value)) {
210 chiresult$statistic<-0
212 obsv<-chiresult$expected
213 pval<-as.character(format(chiresult$p.value,scientific=TRUE))
214 spval<-strsplit(pval,'e')
218 if (tb[1,1]>obsv[1,1]) {
219 as.numeric(spval[[1]][2])*(-1)
221 else if (tb[1,1]<obsv[1,1]){
222 as.numeric(spval[[1]][2])
227 make_chi_p <- function(x) {
233 chiresult<-MyChiSq(tb)
235 if (is.na(chiresult$p.value)) {
237 chiresult$statistic<-0
239 obsv<-chiresult$expected
240 if (tb[1,1]>obsv[1,1]) {
243 else if (tb[1,1]<obsv[1,1]){
249 make_chi <- function(x) {
255 chiresult<-MyChiSq(tb)
257 if (is.na(chiresult$p.value)) {
259 chiresult$statistic<-0
261 obsv<-chiresult$expected
262 if (tb[1,1]>obsv[1,1]) {
265 else if (tb[1,1]<obsv[1,1]){
272 res <- matrix(sapply(1:length(mat), make_chi_lex), ncol = ncol(mat))
273 rownames(res)<-mod.names
274 colnames(res) <- colnames(mat)
275 eff_relatif<-round(t(apply(mat,1,function(x) {(x/t(as.matrix(sumcol))*1000)})),2)
276 rownames(eff_relatif)<-mod.names
277 colnames(eff_relatif) <- colnames(mat)
279 reschip <- matrix(sapply(1:length(mat), make_chi_p), ncol = ncol(mat))
280 rownames(reschip)<- mod.names
281 colnames(reschip) <- colnames(mat)
282 reschi <- matrix(sapply(1:length(mat), make_chi), ncol = ncol(mat))
283 rownames(reschip)<- mod.names
284 colnames(reschip) <- colnames(mat)
288 out[[3]]<-eff_relatif
298 #http://txm.sourceforge.net/doc/R/textometrieR-package.html
300 specificites.probabilities <- function (lexicaltable, types = NULL, parts = NULL)
302 rowMargin <- rowSums(lexicaltable)
303 colMargin <- colSums(lexicaltable)
304 F <- sum(lexicaltable)
305 if (!is.null(types)) {
306 if (is.character(types)) {
307 if (is.null(rownames(lexicaltable)))
308 stop("The lexical table has no row names and the \"types\" argument is a character vector.")
309 if (!all(types %in% rownames(lexicaltable)))
310 stop(paste("Some requested types are not known in the lexical table: ",
311 paste(types[!(types %in% rownames(lexicaltable))],
316 stop("The row index must be greater than 0.")
317 if (max(types) > nrow(lexicaltable))
318 stop("Row index must be smaller than the number of rows.")
320 lexicaltable <- lexicaltable[types, , drop = FALSE]
321 rowMargin <- rowMargin[types]
323 if (!is.null(parts)) {
324 if (is.character(parts)) {
325 if (is.null(colnames(lexicaltable)))
326 stop("The lexical table has no col names and the \"parts\" argument is a character vector.")
327 if (!all(parts %in% colnames(lexicaltable)))
328 stop(paste("Some requested parts are not known in the lexical table: ",
329 paste(parts[!(parts %in% colnames(lexicaltable))],
333 if (max(parts) > ncol(lexicaltable))
334 stop("Column index must be smaller than the number of cols.")
336 stop("The col index must be greater than 0.")
338 lexicaltable <- lexicaltable[, parts, drop = FALSE]
339 colMargin <- colMargin[parts]
341 if (nrow(lexicaltable) == 0 | ncol(lexicaltable) == 0) {
342 stop("The lexical table must contains at least one row and one column.")
344 specif <- matrix(0, nrow = nrow(lexicaltable), ncol = ncol(lexicaltable))
345 for (i in 1:ncol(lexicaltable)) {
346 whiteDrawn <- lexicaltable[, i]
349 drawn <- colMargin[i]
350 independance <- (white * drawn)/F
351 specif_negative <- whiteDrawn < independance
352 specif_positive <- whiteDrawn >= independance
353 specif[specif_negative, i] <- phyper(whiteDrawn[specif_negative],
354 white[specif_negative], black[specif_negative], drawn)
355 specif[specif_positive, i] <- phyper(whiteDrawn[specif_positive] -
356 1, white[specif_positive], black[specif_positive],
359 dimnames(specif) <- dimnames(lexicaltable)
364 #http://txm.sourceforge.net/doc/R/textometrieR-package.html
366 specificites <- function (lexicaltable, types = NULL, parts = NULL)
368 spe <- specificites.probabilities(lexicaltable, types, parts)
369 spelog <- matrix(0, nrow = nrow(spe), ncol = ncol(spe))
370 spelog[spe < 0.5] <- log10(spe[spe < 0.5])
371 spelog[spe > 0.5] <- abs(log10(1 - spe[spe > 0.5]))
372 spelog[spe == 0.5] <- 0
373 spelog[is.infinite(spe)] <- 0
374 spelog <- round(spelog, digits = 4)
375 rownames(spelog) <- rownames(spe)
376 colnames(spelog) <- colnames(spe)
380 make.spec.hypergeo <- function(mat) {
381 #library(textometrieR)
382 spec <- specificites(mat)
384 eff_relatif<-round(t(apply(mat,1,function(x) {(x/t(as.matrix(sumcol))*1000)})),2)
387 out[[3]]<-eff_relatif
391 BuildProf01<-function(x,classes) {
393 #classes : classes de chaque lignes de x
394 dm<-cbind(x,cl=classes)
395 clnb=length(summary(as.data.frame(as.character(classes)),max=100))
396 mat<-matrix(0,ncol(x),clnb)
397 rownames(mat)<-colnames(x)
399 dtmp<-dm[which(dm$cl==i),]
400 for (j in 1:(ncol(dtmp)-1)) {
401 mat[j,i]<-sum(dtmp[,j])
407 BuildProf<- function(x,dataclasse,clusternb,lim=2) {
409 #r.names<-rownames(x)
411 #rownames(x)<-r.names
413 #nuce<-nrow(dataclasse)
414 sumcol<-paste(NULL,1:nrow(x))
415 colclasse<-dataclasse[,ncol(dataclasse)]
416 nuce <- length(which(colclasse != 0))
417 # for (i in 1:nrow(x)) {
418 # sumcol[i]<-sum(x[i,])
420 # afctablesum<-cbind(x,sumcol)
421 afctablesum <- cbind(x, rowSums(x))
422 #dataclasse<-as.data.frame(dataclasse[dataclasse[,ncol(dataclasse)]!=0,])
423 dataclasse<-as.matrix(dataclasse[dataclasse[,ncol(dataclasse)]!=0,])
429 mod.names<-rownames(x)
433 for (classe in 1:clusternb) {
434 lnbligne[classe]<-length(colclasse[colclasse==classe])
435 prof[[classe]]<-data.frame()
436 aprof[[classe]]<-data.frame()
439 for (ligne in 1:nrow(x)) {
440 for (classe in 1:clusternb) {
441 nbligneclasse<-lnbligne[classe]
442 conttable<-data.frame()
443 conttable[1,1]<-as.numeric(x[ligne,classe])
444 conttable[1,2]<-as.numeric(as.vector(x[ligne,ncol(x)]))-as.numeric(x[ligne,classe])
445 conttable[2,1]<-nbligneclasse-as.numeric(x[ligne,classe])
446 conttable[2,2]<-nrow(dataclasse)-as.numeric(as.vector(x[ligne,ncol(x)]))-conttable[2,1]
447 chiresult<-chisq.test(conttable,correct=FALSE)
448 if (is.na(chiresult$p.value)) {
450 chiresult$statistic<-0
453 obsv<-chiresult$expected
454 tablep[ligne,classe]<-round(chiresult$p.value,digits=3)
455 #tablep[ligne,classe]<-format(chiresult$p.value, scientific=TRUE)
456 if (chiresult$statistic>=lim) {
457 if (conttable[1,1]>obsv[1,1]) {
458 tablesqr[ligne,classe]<-round(chiresult$statistic,digits=3)
459 prof[[classe]][nrow(prof[[classe]])+1,1]<-as.numeric(x[ligne,classe])
460 prof[[classe]][nrow(prof[[classe]]),2]<-as.numeric(as.vector(x[ligne,ncol(x)]))
461 prof[[classe]][nrow(prof[[classe]]),3]<-round((as.numeric(as.vector(x[ligne,classe]))/as.numeric(as.vector(x[ligne,ncol(x)])))*100,digits=2)
462 prof[[classe]][nrow(prof[[classe]]),4]<-round(chiresult$statistic,digits=2)
463 prof[[classe]][nrow(prof[[classe]]),5]<-mod.names[ligne]
464 prof[[classe]][nrow(prof[[classe]]),6]<-chiresult$p.value
466 else if (conttable[1,1]<obsv[1,1]){
467 tablesqr[ligne,classe]<--round(chiresult$statistic,digits=3)
468 aprof[[classe]][nrow(aprof[[classe]])+1,1]<-as.numeric(x[ligne,classe])
469 aprof[[classe]][nrow(aprof[[classe]]),2]<-as.numeric(as.vector(x[ligne,ncol(x)]))
470 aprof[[classe]][nrow(aprof[[classe]]),3]<-round((as.numeric(x[ligne,classe])/as.numeric(as.vector(x[ligne,ncol(x)])))*100,digits=2)
471 aprof[[classe]][nrow(aprof[[classe]]),4]<--round(chiresult$statistic,digits=2)
472 aprof[[classe]][nrow(aprof[[classe]]),5]<-mod.names[ligne]
473 aprof[[classe]][nrow(aprof[[classe]]),6]<-chiresult$p.value
475 #pour gerer le cas avec une seule v et par exemple
476 else if (conttable[1,1]==obsv[1,1]) {
477 tablesqr[ligne,classe]<-round(chiresult$statistic,digits=3)
478 prof[[classe]][nrow(prof[[classe]])+1,1]<-as.numeric(x[ligne,classe])
479 prof[[classe]][nrow(prof[[classe]]),2]<-as.numeric(as.vector(x[ligne,ncol(x)]))
480 prof[[classe]][nrow(prof[[classe]]),3]<-round((as.numeric(as.vector(x[ligne,classe]))/as.numeric(as.vector(x[ligne,ncol(x)])))*100,digits=2)
481 prof[[classe]][nrow(prof[[classe]]),4]<-round(chiresult$statistic,digits=2)
482 prof[[classe]][nrow(prof[[classe]]),5]<-mod.names[ligne]
483 prof[[classe]][nrow(prof[[classe]]),6]<-chiresult$p.value
484 aprof[[classe]][nrow(aprof[[classe]])+1,1]<-as.numeric(x[ligne,classe])
485 aprof[[classe]][nrow(aprof[[classe]]),2]<-as.numeric(as.vector(x[ligne,ncol(x)]))
486 aprof[[classe]][nrow(aprof[[classe]]),3]<-round((as.numeric(x[ligne,classe])/as.numeric(as.vector(x[ligne,ncol(x)])))*100,digits=2)
487 aprof[[classe]][nrow(aprof[[classe]]),4]<--round(chiresult$statistic,digits=2)
488 aprof[[classe]][nrow(aprof[[classe]]),5]<-mod.names[ligne]
489 aprof[[classe]][nrow(aprof[[classe]]),6]<-chiresult$p.value
492 if (conttable[1,1]>obsv[1,1]) {
493 tablesqr[ligne,classe]<-round(chiresult$statistic,digits=3)
494 } else if (conttable[1,1]<obsv[1,1]){
495 tablesqr[ligne,classe]<--round(chiresult$statistic,digits=3)
497 #pour gerer le cas avec une seule v et par exemple
498 else if (conttable[1,1]==obsv[1,1]) {
499 tablesqr[ligne,classe]<-round(chiresult$statistic,digits=3)
505 for (classe in 1:clusternb) {
506 if (length(prof[[classe]])!=0) {
507 prof[[classe]]<-prof[[classe]][order(prof[[classe]][,4],decreasing=TRUE),]
509 if (length(aprof[[classe]])!=0) {
510 aprof[[classe]]<-aprof[[classe]][order(aprof[[classe]][,4]),]
515 output[[2]]<-tablesqr
516 output[[3]]<-afctablesum
523 build.pond.prof <- function(mat, lim = 2) {
525 mod.names<-rownames(mat)
533 clusternb <- ncol(mat)
534 x <- cbind(mat, rowSums(mat))
535 for (classe in 1:clusternb) {
536 prof[[classe]]<-data.frame()
537 aprof[[classe]]<-data.frame()
540 for (ligne in 1:nrow(mat)) {
541 for(classe in 1:ncol(mat)) {
543 tb[1,1] <- mat[ligne,classe]
544 tb[1,2] <- srow[ligne] - tb[1,1]
545 tb[2,1] <- scol[classe] - tb[1,1]
546 tb[2,2] <- tot - srow[ligne] - tb[2,1]
547 chiresult <- MyChiSq(tb)
548 if (is.na(chiresult$p.value)) {
550 chiresult$statistic<-0
552 tablep[ligne,classe]<-round(chiresult$p.value,digits=3)
554 obsv <- chiresult$expected
555 if (chiresult$statistic>=lim) {
556 if (conttable[1,1]>obsv[1,1]) {
557 tablesqr[ligne,classe]<-round(chiresult$statistic,digits=3)
558 prof[[classe]][nrow(prof[[classe]])+1,1]<-as.numeric(x[ligne,classe])
559 prof[[classe]][nrow(prof[[classe]]),2]<-as.numeric(as.vector(x[ligne,ncol(x)]))
560 prof[[classe]][nrow(prof[[classe]]),3]<-round((as.numeric(as.vector(x[ligne,classe]))/as.numeric(as.vector(x[ligne,ncol(x)])))*100,digits=2)
561 prof[[classe]][nrow(prof[[classe]]),4]<-round(chiresult$statistic,digits=2)
562 prof[[classe]][nrow(prof[[classe]]),5]<-mod.names[ligne]
563 prof[[classe]][nrow(prof[[classe]]),6]<-chiresult$p.value
565 else if (conttable[1,1]<obsv[1,1]){
566 tablesqr[ligne,classe]<--round(chiresult$statistic,digits=3)
567 aprof[[classe]][nrow(aprof[[classe]])+1,1]<-as.numeric(x[ligne,classe])
568 aprof[[classe]][nrow(aprof[[classe]]),2]<-as.numeric(as.vector(x[ligne,ncol(x)]))
569 aprof[[classe]][nrow(aprof[[classe]]),3]<-round((as.numeric(x[ligne,classe])/as.numeric(as.vector(x[ligne,ncol(x)])))*100,digits=2)
570 aprof[[classe]][nrow(aprof[[classe]]),4]<--round(chiresult$statistic,digits=2)
571 aprof[[classe]][nrow(aprof[[classe]]),5]<-mod.names[ligne]
572 aprof[[classe]][nrow(aprof[[classe]]),6]<-chiresult$p.value
574 #pour gerer le cas avec une seule v et par exemple
575 else if (conttable[1,1]==obsv[1,1]) {
576 tablesqr[ligne,classe]<-round(chiresult$statistic,digits=3)
577 prof[[classe]][nrow(prof[[classe]])+1,1]<-as.numeric(x[ligne,classe])
578 prof[[classe]][nrow(prof[[classe]]),2]<-as.numeric(as.vector(x[ligne,ncol(x)]))
579 prof[[classe]][nrow(prof[[classe]]),3]<-round((as.numeric(as.vector(x[ligne,classe]))/as.numeric(as.vector(x[ligne,ncol(x)])))*100,digits=2)
580 prof[[classe]][nrow(prof[[classe]]),4]<-round(chiresult$statistic,digits=2)
581 prof[[classe]][nrow(prof[[classe]]),5]<-mod.names[ligne]
582 prof[[classe]][nrow(prof[[classe]]),6]<-chiresult$p.value
583 aprof[[classe]][nrow(aprof[[classe]])+1,1]<-as.numeric(x[ligne,classe])
584 aprof[[classe]][nrow(aprof[[classe]]),2]<-as.numeric(as.vector(x[ligne,ncol(x)]))
585 aprof[[classe]][nrow(aprof[[classe]]),3]<-round((as.numeric(x[ligne,classe])/as.numeric(as.vector(x[ligne,ncol(x)])))*100,digits=2)
586 aprof[[classe]][nrow(aprof[[classe]]),4]<--round(chiresult$statistic,digits=2)
587 aprof[[classe]][nrow(aprof[[classe]]),5]<-mod.names[ligne]
588 aprof[[classe]][nrow(aprof[[classe]]),6]<-chiresult$p.value
591 if (conttable[1,1]>obsv[1,1]) {
592 tablesqr[ligne,classe]<-round(chiresult$statistic,digits=3)
593 } else if (conttable[1,1]<obsv[1,1]){
594 tablesqr[ligne,classe]<--round(chiresult$statistic,digits=3)
596 #pour gerer le cas avec une seule v et par exemple
597 else if (conttable[1,1]==obsv[1,1]) {
598 tablesqr[ligne,classe]<-round(chiresult$statistic,digits=3)
604 for (classe in 1:clusternb) {
605 if (length(prof[[classe]])!=0) {
606 prof[[classe]]<-prof[[classe]][order(prof[[classe]][,4],decreasing=TRUE),]
608 if (length(aprof[[classe]])!=0) {
609 aprof[[classe]]<-aprof[[classe]][order(aprof[[classe]][,4]),]
614 output[[2]]<-tablesqr