### IRIS ----
# Per-variable boxplots and one-way ANOVAs of the iris measurements against
# Species, then pairwise / 3D scatter plots and a global MANOVA (Wilks).
# attach() is avoided throughout: columns are referenced explicitly.
data(iris)
par(mfrow = c(2, 2))
for (k in 1:4) {
  boxplot(iris[, k] ~ iris$Species, main = names(iris)[k])
}
for (k in 1:4) {
  print(names(iris)[k])
  print(anova(lm(iris[, k] ~ iris$Species)))
}
plot(iris[, 1:4], col = c("blue", "red", "green")[iris$Species])
library(scatterplot3d)
scatterplot3d(iris[, 1], iris[, 2], iris[, 3],
              color = c("blue", "red", "green")[iris$Species])
library(rgl)
plot3d(iris[, 1], iris[, 2], iris[, 3],
       col = c("blue", "red", "green")[iris$Species], type = "s")
plot3d(iris[, 1], iris[, 2], iris[, 4],
       col = c("blue", "red", "green")[iris$Species], type = "s")
plot3d(iris[, 4], iris[, 2], iris[, 3],
       col = c("blue", "red", "green")[iris$Species], type = "s")
# Global multivariate test of the Species effect.
summary(manova(as.matrix(iris[, 1:4]) ~ iris$Species), test = "Wilks")

### Bivariate normal distribution (two overlapping groups) ----
library(MASS)
set.seed(1)                                # reproducible simulation
s <- matrix(c(1, 0.8, 0.8, 1), 2)          # common covariance matrix
x1 <- mvrnorm(50, c(0.5, -0.5), s)
x2 <- mvrnorm(50, c(-0.5, 0.5), s)
x <- rbind.data.frame(x1, x2)
x <- scale(x, scale = FALSE)               # centre the pooled sample
fac <- factor(rep(1:2, rep(50, 2)))
plot(x, col = as.integer(fac))
library(car)
ellipse(c(0.5, -0.5), s, 1)                # theoretical concentration ellipses
ellipse(c(-0.5, 0.5), s, 1)
abline(h = 0)
abline(v = 0)
# Univariate tests on each coordinate, then the multivariate test.
anova(lm(x[, 1] ~ fac))
anova(lm(x[, 2] ~ fac))
summary(manova(as.matrix(x) ~ fac), test = "Wilks")
# Four candidate projection axes A-D drawn on the scatter plot.
arrows(0, 0, 2, 0, lwd = 2)
text(2, 0, "A", pos = 1, cex = 2)
arrows(0, 0, sqrt(2), sqrt(2), lwd = 2)
text(sqrt(2), sqrt(2), "B", pos = 4, cex = 2)
arrows(0, 0, 0, 2, lwd = 2)
text(0, 2, "C", pos = 2, cex = 2)
arrows(0, 0, -sqrt(2), sqrt(2), lwd = 2)
text(-sqrt(2), sqrt(2), "D", pos = 2, cex = 2)
par(mfrow = c(2, 2))

# f1: histogram of the projection z = a*x1 + b*x2, with the overall normal
# density and the two within-group densities (each weighted 1/2) overlaid.
#   a, b : coordinates of the projection axis; cha : plot title.
f1 <- function(a, b, cha) {
  z <- a * x[, 1] + b * x[, 2]
  z0 <- seq(-4, 4, le = 50)
  z1 <- z[fac == 1]
  z2 <- z[fac == 2]
  hist(z, proba = TRUE, col = grey(0.9), xlim = c(-4, 4), main = cha)
  lines(z0, dnorm(z0, mean(z), sd(z)), lwd = 2)
  lines(z0, 0.5 * dnorm(z0, mean(z1), sd(z1)), col = "red", lwd = 2)
  lines(z0, 0.5 * dnorm(z0, mean(z2), sd(z2)), col = "blue", lwd = 2)
}
f1(1, 0, "A")
f1(1/sqrt(2), 1/sqrt(2), "B")
f1(0, 1, "C")
f1(-1/sqrt(2), 1/sqrt(2), "D")

# f2: total variance, between-group variance and discriminating power
# (their ratio) of the projection on axis (a, b).  The 99/100 factor
# converts the unbiased variance to the 1/n population variance (n = 100).
# Returns c(total, between, ratio) rounded to 2 decimals.
f2 <- function(a, b) {
  z <- a * x[, 1] + b * x[, 2]
  a1 <- var(z) * 99 / 100
  a2 <- var(predict(lm(as.numeric(z) ~ fac))) * 99 / 100
  a3 <- a2 / a1
  round(c(a1, a2, a3), 2)
}
f2(1, 0)
f2(1/sqrt(2), 1/sqrt(2))   # exact value instead of the 1/1.414 approximation
f2(0, 1)
f2(-1/sqrt(2), 1/sqrt(2))

### Tomassone lecture example ----
# Hand computation of the discriminant analysis: within (W), between (B)
# and total scatter matrices, then the eigen-decomposition of W^-1 B.
M <- matrix(c(0, 2, 4, 6, 8, 5, 7, 9, 11, 13,
              3, 1, 5, 9, 7, 2, 0, 4, 8, 6,
              1, 1, 1, 1, 1, 2, 2, 2, 2, 2),
            byrow = FALSE, ncol = 3)
colnames(M) <- c("x1", "x2", "pop")
M <- as.data.frame(M)
M
M[, 3] <- as.factor(M[, 3])
pop <- M$pop                     # explicit copy instead of attach(M)
tapply(M[, 1], pop, mean)
tapply(M[, 2], pop, mean)
g <- cbind(tapply(M[, 1], pop, mean), tapply(M[, 2], pop, mean))
g
plot(M[, 1:2], col = as.numeric(pop))
points(g, col = 1:2, pch = 3)
# Centred data and centred group means.
Mc <- M
Mc[, 1:2] <- scale(M[, 1:2], scale = FALSE)
Mc
gc <- cbind(tapply(Mc[, 1], pop, mean), tapply(Mc[, 2], pop, mean))
gc
plot(Mc[, 1:2], col = as.numeric(pop))
abline(h = 0)
abline(v = 0)
points(gc, col = 1:2, pch = 3)
# Within-group scatter (rows 1-5 = group 1, rows 6-10 = group 2).
T1 <- as.matrix(scale(M[1:5, 1:2], scale = FALSE))
T1
W1 <- t(T1) %*% T1
W1
T2 <- as.matrix(scale(M[6:10, 1:2], scale = FALSE))
T2
W2 <- t(T2) %*% T2
W2
W <- W1 + W2
W
# Total scatter matrix ('Ttot' rather than 'T', which shadows TRUE).
Ttot <- t(as.matrix(scale(M[, 1:2], scale = FALSE))) %*%
  as.matrix(scale(M[, 1:2], scale = FALSE))
Ttot
B <- Ttot - W
B
gc <- as.matrix(scale(g, scale = FALSE))
gc
5 * t(gc) %*% gc                 # check: B = n_k * gc' gc (5 points per group)
W_1 <- solve(W)
W_1
X <- W_1 %*% B
X
propre <- eigen(X)
propre$values                    # full name (was propre$value, via $ partial match)
propre$vectors
# Normalise the eigenvectors so the within variance of the scores is 1
# (W divided by the within degrees of freedom, 10 - 2 = 8).
fondis <- propre$vectors %*%
  diag(diag(t(propre$vectors) %*% W %*% propre$vectors / 8)^(-0.5))
fondis
t(fondis) %*% W %*% fondis
library(MASS)
lda(pop ~ ., M)                  # same axes (up to sign) as the hand version
library(ade4)
xydis <- as.matrix(Mc[, 1:2]) %*% fondis
xydis
plot(xydis, col = as.numeric(pop), pch = 5)
abline(h = 0)
abline(v = 0)
points(gc %*% fondis, col = 1:2, pch = 3)

### Tutorial (TD) worked example ----
# Same hand computation on a smaller data set (2 groups of 4 points).
M <- matrix(c(0, 2, 2, 4, 5, 7, 7, 9,
              3, 5, 7, 9, 0, 2, 4, 6,
              1, 1, 1, 1, 2, 2, 2, 2),
            byrow = FALSE, ncol = 3)
colnames(M) <- c("x1", "x2", "pop")
M <- as.data.frame(M)
M[, 3] <- as.factor(M[, 3])
M
pop <- M$pop                     # explicit copy instead of attach(M)
tapply(M[, 1], pop, mean)
tapply(M[, 2], pop, mean)
g <- cbind(tapply(M[, 1], pop, mean), tapply(M[, 2], pop, mean))
plot(M[, 1:2], col = as.numeric(pop))
points(g, col = 1:2, pch = 3)
Mc <- M
Mc[, 1:2] <- scale(M[, 1:2], scale = FALSE)
Mc
gc <- cbind(tapply(Mc[, 1], pop, mean), tapply(Mc[, 2], pop, mean))
gc
plot(Mc[, 1:2], col = as.numeric(pop))
abline(h = 0)
abline(v = 0)
points(gc, col = 1:2, pch = 3)
T1 <- as.matrix(scale(M[1:4, 1:2], scale = FALSE))
T1
W1 <- t(T1) %*% T1
W1
T2 <- as.matrix(scale(M[5:8, 1:2], scale = FALSE))
T2
W2 <- t(T2) %*% T2
W2
W <- W1 + W2
W
# Total scatter matrix ('Ttot' rather than 'T', which shadows TRUE).
Ttot <- t(as.matrix(scale(M[, 1:2], scale = FALSE))) %*%
  as.matrix(scale(M[, 1:2], scale = FALSE))
Ttot
B <- Ttot - W
B
gc <- as.matrix(scale(g, scale = FALSE))
gc
4 * t(gc) %*% gc                 # check: B = n_k * gc' gc (4 points per group)
W_1 <- solve(W)
W_1
X <- W_1 %*% B
X
propre <- eigen(X)
propre$values                    # full name (was propre$value, via $ partial match)
propre$vectors[, 1]
# Normalise so the within variance of the scores is 1 (df = 8 - 2 = 6).
fondis <- propre$vectors %*%
  diag(diag(t(propre$vectors) %*% W %*% propre$vectors / 6)^(-0.5))
fondis[, 1]
t(fondis) %*% W %*% fondis
library(MASS)
lda(pop ~ ., M)
library(ade4)
xydis <- as.matrix(Mc[, 1:2]) %*% fondis[, 1]
xydis
plot(xydis, col = as.numeric(pop), pch = 5)
abline(h = 0)
abline(v = 0)
points(gc %*% fondis, col = 1:2, pch = 3)

#### Zebu example (ade4::chazeb) ----
# Real data: compare MASS::lda and ade4::discrimin on the same table.
library(ade4)
data(chazeb)
chazeb
tab <- chazeb$tab
cla <- chazeb$cla
par(mfrow = c(3, 2))
for (k in 1:6) {
  boxplot(tab[, k] ~ cla, main = names(tab)[k])
}
tapply(tab[, 1], cla, mean)
lda(cla ~ ., tab)$means
for (k in 1:6) {
  print(names(tab)[k])
  print(anova(lm(tab[, k] ~ cla)))
}
summary(manova(as.matrix(tab) ~ cla), test = "Wilks")
afd1 <- lda(cla ~ ., tab)
round(afd1$scaling, 2)
round(afd1$svd^2, 2)
anova(lm(as.matrix(tab) %*% afd1$scaling ~ cla))
library(ade4)
# 'scannf' spelled out (the original relied on partial matching via 'scan=').
afd2 <- discrimin(dudi.pca(tab, scannf = FALSE), cla, scannf = FALSE)
anova(lm(as.matrix(afd2$li) ~ cla))
afd2$eig
1 - afd2$eig
afd2$eig / (1 - afd2$eig)
plot(as.matrix(afd2$li), as.matrix(as.matrix(tab) %*% afd1$scaling))
par(mfrow = c(1, 2))
boxplot(as.matrix(tab) %*% afd1$scaling ~ cla)
title("LD1 : lda")
boxplot(as.matrix(afd2$li) ~ cla)
title("LD1 : discrimin")

# wilks: Bartlett's chi-square test of the significance of the discriminant
# axes; dropping the k leading axes tests the significance of the remaining
# ones.
#   afd : a discrimin result (only afd$eig is used)
#   n   : sample size;  p : number of variables;  g : number of groups
#   k   : number of leading axes to drop (0 = test all axes)
# Prints (and returns) c(Wilks, chi2, df, p-value) rounded to 2 decimals.
wilks <- function(afd, n, p, g, k) {
  eig <- afd$eig
  if (k >= 1) eig <- afd$eig[-(1:k)]
  Wilks <- prod(1 - eig)
  khi2 <- -(n - 0.5 * (p + g) - 1) * log(Wilks)
  ddl <- (p - k) * (g - k - 1)
  prob <- 1 - pchisq(khi2, ddl)
  print(round(c(Wilks, khi2, ddl, prob), 2))
}
# Axis significance, then confusion matrix of the LDA reclassification.
wilks(afd2, 23, 6, 2, 0)
predict(afd1, tab)$class
table(cla, predict(afd1, tab)$class)
# (a stray, unterminated 'table(afd2$' fragment was removed here)
afd2$fa
afd2$va
cor(tab[, 1], afd2$li)
afd2$cp
cor(dudi.pca(tab, scannf = FALSE)$li[, 1], afd2$li)
plot(afd2)

#### Apple-tree (pommier) example ----
# Same analysis on an external data set with 4 groups and 3 kept axes.
# 'header = TRUE' spelled out (T is a reassignable alias, not a keyword).
pommier <- read.table("pommier.txt", header = TRUE)
pommier
tab <- pommier[, 1:6]
cla <- as.factor(pommier[, 7])
par(mfrow = c(3, 2))
for (k in 1:6) {
  boxplot(tab[, k] ~ cla, main = names(tab)[k])
}
tapply(tab[, 1], cla, mean)
round(lda(cla ~ ., tab)$means, 2)
for (k in 1:6) {
  print(names(tab)[k])
  print(anova(lm(tab[, k] ~ cla)))
}
summary(manova(as.matrix(tab) ~ cla), test = "Wilks")
afd1 <- lda(cla ~ ., tab)
round(afd1$scaling, 2)
round(afd1$svd^2, 2)
anova(lm(as.matrix(tab) %*% afd1$scaling ~ cla))
library(ade4)
# The original passed an *empty* argument for 'scannf'
# (discrimin(..., cla, , nf = 3)), which leaves the interactive default
# scannf = TRUE; we want the 3 axes without prompting.
afd2 <- discrimin(dudi.pca(tab, scannf = FALSE), cla, scannf = FALSE, nf = 3)
afd2$li
summary(manova(as.matrix(afd2$li) ~ cla), test = "Wilks")
afd2$eig
1 - afd2$eig
afd2$eig / (1 - afd2$eig)
prod(1 - afd2$eig)
plot(as.matrix(afd2$li), as.matrix(as.matrix(tab) %*% afd1$scaling))
# Compare the three discriminant scores produced by lda and discrimin.
par(mfrow = c(3, 2))
boxplot(as.matrix(tab) %*% afd1$scaling[, 1] ~ cla)
title("LD1 : lda")
boxplot(as.matrix(afd2$li[, 1]) ~ cla)
title("LD1 : discrimin")
boxplot(as.matrix(tab) %*% afd1$scaling[, 2] ~ cla)
title("LD2 : lda")
boxplot(as.matrix(afd2$li[, 2]) ~ cla)
title("LD2 : discrimin")
boxplot(as.matrix(tab) %*% afd1$scaling[, 3] ~ cla)
title("LD3 : lda")
boxplot(as.matrix(afd2$li[, 3]) ~ cla)
title("LD3 : discrimin")

# wilks: Bartlett's chi-square test of the significance of the discriminant
# axes (same helper as in the zebu section, redefined so this section can
# be run on its own).  Dropping the k leading axes tests the remaining ones.
wilks <- function(afd, n, p, g, k) {
  eig <- afd$eig
  if (k >= 1) eig <- afd$eig[-(1:k)]
  Wilks <- prod(1 - eig)
  khi2 <- -(n - 0.5 * (p + g) - 1) * log(Wilks)
  ddl <- (p - k) * (g - k - 1)
  prob <- 1 - pchisq(khi2, ddl)
  print(round(c(Wilks, khi2, ddl, prob), 2))
}
# Significance of the discriminant axes for pommier (n = 32, p = 6, g = 4),
# dropping 0, 1 then 2 leading axes.
wilks(afd2, 32, 6, 4, 0)
wilks(afd2, 32, 6, 4, 1)
wilks(afd2, 32, 6, 4, 2)
predict(afd1, tab)$class
table(cla, predict(afd1, tab)$class)
round(afd2$fa, 2)
round(afd2$va, 2)
cor(tab[, 1], afd2$li)
round(afd2$cp, 2)
cor(dudi.pca(tab, scannf = FALSE)$li[, 1], afd2$li)
plot(afd2)
library(rgl)
plot3d(as.data.frame(as.matrix(tab) %*% afd1$scaling),
       type = "s", col = as.numeric(cla))
plot(as.data.frame(as.matrix(tab) %*% afd1$scaling),
     col = as.numeric(cla), pch = as.numeric(cla))

#### CLASSIFICATION ----
# Simulate a 3-class Gaussian mixture (diagonal covariances) and draw the
# optimal (Bayes) decision boundaries, then those estimated by LDA.
p1 <- 0.2
p2 <- 0.3
p3 <- 0.5
n <- 100
s1 <- c(1, 2)
s2 <- c(3, 1)
s3 <- c(1.5, 2)
s <- rbind(s1, s2, s3)                 # per-class standard deviations
m1 <- c(1, 2)
m2 <- c(6, 6)
m3 <- c(6, -2)
m <- rbind(m1, m2, m3)                 # per-class means
# 'grp' holds the class labels (the original used 'c', shadowing base::c).
grp <- sample(c(1, 2, 3), size = n, prob = c(p1, p2, p3), replace = TRUE)
x <- cbind(rnorm(n, m[grp, 1], s[grp, 1]), rnorm(n, m[grp, 2], s[grp, 2]))
couleur <- rep("red", n)
couleur[grp == 2] <- "blue"
couleur[grp == 3] <- "green"
plot(x, col = couleur, pch = grp)
# Evaluation grid covering the sample.
len <- 50
xp <- seq(min(x[, 1]), max(x[, 1]), length = len)
yp <- seq(min(x[, 2]), max(x[, 2]), length = len)
grille <- expand.grid(z1 = xp, z2 = yp)
# Posterior-proportional scores pi_k * f_k(z) for each class.
Z <- p1 * dnorm(grille[, 1], m[1, 1], s[1, 1]) * dnorm(grille[, 2], m[1, 2], s[1, 2])
Z <- cbind(Z, p2 * dnorm(grille[, 1], m[2, 1], s[2, 1]) * dnorm(grille[, 2], m[2, 2], s[2, 2]))
Z <- cbind(Z, p3 * dnorm(grille[, 1], m[3, 1], s[3, 1]) * dnorm(grille[, 2], m[3, 2], s[3, 2]))
# Bayes frontier of class 3 vs the rest, then class 1 vs the rest.
zp <- Z[, 3] - pmax(Z[, 2], Z[, 1])
contour(xp, yp, matrix(zp, len), add = TRUE, levels = 0, drawlabels = FALSE)
zp <- Z[, 1] - pmax(Z[, 2], Z[, 3])
contour(xp, yp, matrix(zp, len), add = TRUE, levels = 0, drawlabels = FALSE)
library(MASS)
# LDA boundaries on the same grid ('classe' replaces the original 'T',
# which shadows TRUE).
classe <- as.factor(couleur)
x.lda <- lda(x, classe)
len <- 50
xp <- seq(min(x[, 1]), max(x[, 1]), length = len)
yp <- seq(min(x[, 2]), max(x[, 2]), length = len)
grille <- expand.grid(z1 = xp, z2 = yp)
Z <- predict(x.lda, grille)
# (a dead 'zp <- as.numeric(Z$class)' assignment was removed here)
zp <- Z$post[, 3] - pmax(Z$post[, 2], Z$post[, 1])
contour(xp, yp, matrix(zp, len), add = TRUE, levels = 0,
        drawlabels = FALSE, col = "magenta")
zp <- Z$post[, 1] - pmax(Z$post[, 2], Z$post[, 3])
contour(xp, yp, matrix(zp, len), add = TRUE, levels = 0,
        drawlabels = FALSE, col = "magenta")

# front.lda: redo the whole simulation and overlay three sets of decision
# boundaries: Bayes (black), LDA (magenta, dotted) and QDA (blue, dashed).
#   sigma = "diff" -> each class keeps its own standard deviations;
#   anything else  -> all classes share s1 (the homoscedastic LDA setting).
front.lda <- function(sigma = "diff") {
  p1 <- 0.2
  p2 <- 0.3
  p3 <- 0.5
  n <- 100
  s1 <- c(1, 2)
  s2 <- c(3, 1)
  s3 <- c(1.5, 2)
  if (sigma == "diff") s <- rbind(s1, s2, s3) else s <- rbind(s1, s1, s1)
  m1 <- c(1, 2)
  m2 <- c(6, 6)
  m3 <- c(6, -2)
  m <- rbind(m1, m2, m3)
  grp <- sample(c(1, 2, 3), size = n, prob = c(p1, p2, p3), replace = TRUE)
  x <- cbind(rnorm(n, m[grp, 1], s[grp, 1]), rnorm(n, m[grp, 2], s[grp, 2]))
  couleur <- rep("red", n)
  couleur[grp == 2] <- "blue"
  couleur[grp == 3] <- "green"
  plot(x, col = couleur, pch = grp)    # stray empty argument removed
  # The grid is identical for the three methods: build it once.
  len <- 100
  xp <- seq(min(x[, 1]), max(x[, 1]), length = len)
  yp <- seq(min(x[, 2]), max(x[, 2]), length = len)
  grille <- expand.grid(z1 = xp, z2 = yp)
  # Bayes boundaries from the true mixture densities.
  Z <- p1 * dnorm(grille[, 1], m[1, 1], s[1, 1]) * dnorm(grille[, 2], m[1, 2], s[1, 2])
  Z <- cbind(Z, p2 * dnorm(grille[, 1], m[2, 1], s[2, 1]) * dnorm(grille[, 2], m[2, 2], s[2, 2]))
  Z <- cbind(Z, p3 * dnorm(grille[, 1], m[3, 1], s[3, 1]) * dnorm(grille[, 2], m[3, 2], s[3, 2]))
  zp <- Z[, 3] - pmax(Z[, 2], Z[, 1])
  contour(xp, yp, matrix(zp, len), add = TRUE, levels = 0, drawlabels = FALSE)
  zp <- Z[, 1] - pmax(Z[, 2], Z[, 3])
  contour(xp, yp, matrix(zp, len), add = TRUE, levels = 0, drawlabels = FALSE)
  # LDA boundaries (dotted magenta).
  classe <- as.factor(couleur)
  x.lda <- lda(x, classe)
  Z <- predict(x.lda, grille)
  zp <- Z$post[, 3] - pmax(Z$post[, 2], Z$post[, 1])
  contour(xp, yp, matrix(zp, len), add = TRUE, levels = 0,
          drawlabels = FALSE, col = "magenta", lty = 3)
  zp <- Z$post[, 1] - pmax(Z$post[, 2], Z$post[, 3])
  contour(xp, yp, matrix(zp, len), add = TRUE, levels = 0,
          drawlabels = FALSE, col = "magenta", lty = 3)
  # QDA boundaries (dashed blue).
  x.qda <- qda(x, classe)
  Z <- predict(x.qda, grille)
  zp <- Z$post[, 3] - pmax(Z$post[, 2], Z$post[, 1])
  contour(xp, yp, matrix(zp, len), add = TRUE, levels = 0,
          drawlabels = FALSE, col = "blue", lty = 2)
  zp <- Z$post[, 1] - pmax(Z$post[, 2], Z$post[, 3])
  contour(xp, yp, matrix(zp, len), add = TRUE, levels = 0,
          drawlabels = FALSE, col = "blue", lty = 2)
}
library(MASS)
# Crabs data (MASS): discriminate the two colour forms from log-scaled
# FL / RW measurements; estimate the misclassification rate by
# resubstitution, by a random train/test split, and by leave-one-out
# cross-validation.
data(crabs)
# See ?crabs for the description of the variables.

## Separation by colour form
x <- log(cbind(crabs$FL, crabs$RW))
n <- dim(x)[1]
couleur <- rep("black", n)
couleur[crabs$sp == "O"] <- "magenta"
plot(x, col = couleur)
# 'classe' replaces the original 'T', which shadows TRUE.
classe <- as.factor(couleur)
x.lda <- lda(x, classe)

## Decision boundary drawn on a grid
len <- 50
xp <- seq(min(x[, 1]), max(x[, 1]), length = len)
yp <- seq(min(x[, 2]), max(x[, 2]), length = len)
grille <- expand.grid(z1 = xp, z2 = yp)
Z <- predict(x.lda, grille)
Z <- Z$post
zp <- Z[, 1] - Z[, 2]
contour(xp, yp, matrix(zp, len), add = TRUE, levels = 0,
        drawlabels = FALSE, col = 'green')

## Resubstitution error rate (training sample reused as test sample;
## optimistic by construction).  The unused 'C' recoding was removed.
test <- x
vrai.col <- couleur
M <- dim(test)[1]
Z <- predict(x.lda, test)
erreur.lda <- sum(Z$class != vrai.col) / M
erreur.lda

## Random 50/50 train / test split
choix <- sample(c(1, 2), size = n, prob = c(0.5, 0.5), replace = TRUE)
appr <- x[choix == 1, ]
test <- x[choix == 2, ]
appr.col <- couleur[choix == 1]
test.col <- couleur[choix == 2]
M <- dim(test)[1]
classe <- as.factor(appr.col)
appr.lda <- lda(appr, classe)
Z <- predict(appr.lda, test)
erreur.lda <- sum(Z$class != test.col) / M
appr.qda <- qda(appr, classe)
Z <- predict(appr.qda, test)
erreur.qda <- sum(Z$class != test.col) / M
list(erreur.lda, erreur.qda)

## Leave-one-out cross-validation (refits the model n times)
x <- log(cbind(crabs$FL, crabs$RW))
n <- dim(x)[1]
couleur <- rep("black", n)
couleur[crabs$sp == "O"] <- "magenta"
erreur.lda <- 0
erreur.qda <- 0
for (i in seq_len(n)) {
  appr <- x[-i, ]
  test <- x[i, , drop = FALSE]         # keep a 1-row matrix
  appr.col <- couleur[-i]
  test.col <- couleur[i]
  classe <- as.factor(appr.col)
  appr.lda <- lda(appr, classe)
  Z <- predict(appr.lda, test)
  erreur.lda <- erreur.lda + sum(Z$class != test.col)
  appr.qda <- qda(appr, classe)
  Z <- predict(appr.qda, test)
  erreur.qda <- erreur.qda + sum(Z$class != test.col)
}
erreur.lda <- erreur.lda / n
erreur.qda <- erreur.qda / n
list(erreur.lda, erreur.qda)