library(rmf) data(studenti) # Tabelle 2 x 2 table(df$sex) table(df$estero) table(df$sex,df$estero) Table(df$sex,df$estero) tbl <- table(df$sex,df$estero) prop.table(tbl) round(prop.table(tbl,margin=1),3) round(prop.table(tbl,margin=1)*100,1) round(prop.table(tbl,margin=2),3) tc(df$sex,df$estero) chisq.test(tbl)$expected cbind(tbl,round(chisq.test(tbl)$expected,1)) # tabelle 2 x c table(df$sex,df$pulizie) tbl <- table(df$sex,df$pulizie) round(prop.table(tbl,margin=2),3) tc(df$sex,df$pulizie) round(chisq.test(tbl)$expected,2) options(warn=-1) rbind(tbl,round(chisq.test(tbl)$expected,1)) # Tabelle r x c tc(df$etacat,df$musica_rock) (tbl <- table(df$etacat,df$musica_rock)) options(warn=-1) round(chisq.test(tbl)$expected,2) # Tabelle di contingenza quadrate data(occupationalStatus) str(occupationalStatus) occupationalStatus diag(occupationalStatus) sum(diag(occupationalStatus)) sum(diag(occupationalStatus))/sum(occupationalStatus) # La statistica chi quadrato (tbl <- table(df$sex,df$pulizie)) options(warn=-1); att <- chisq.test(tbl)$expected round(att,2) round((tbl-att),2) Table(tbl=att) sum(tbl-att) round((tbl-att)^2,2) sum((tbl-att)^2) round((tbl-att)^2/att,2) sum((tbl-att)^2/att) options(warn=-1) chisq.test(tbl)$stat # L'odds ratio (p <- seq(0,1,by=0.1)) round(p/(1-p),3) # Figura 4.1 p <- seq(0,0.9,by=0.01) plot(p,p/(1-p),xlab="Probabilità",ylab="Odds",type="l", cex.lab=1.25,cex.axis=1.25) abline(v=0.5,h=1,lty=3,lwd=2) Table(df$sex,df$estero) Table(df$sex,df$spesa) tc(df$sex,df$spesa) # Il coefficiente k di Cohen data(kappa) str(kappa) (tbl <- table(kappa$Rev1,kappa$Rev2)) (oag <- sum(diag(tbl))/sum(tbl)) (att <- chisq.test(tbl)$exp) (cag <- sum(diag(att))/sum(att)) (oag-cag)/(1-cag) # Il coefficiente gamma x <- df$cucinare y <- df$pulizie ok <- complete.cases(x,y) x <- x[ok]; y <- y[ok] Table(x,y) outer(c(1:15),c(1:10),"*") outer(c("a","e","i","o","u"), c("A","E","I","O","U"),paste) Rx <- outer(x,x,"-") table(Rx) Rx <- outer(x,x,function(u,v) sign(u-v)) Ry <- outer(y,y,function(u,v) sign(u-v)) Table(Rx,Ry) S <- Rx*Ry table(S) sum(S)/sum(abs(S)) # Tabelle di contingenza multidimensionali table(df$sex,df$etac) (tbl <- table(df$sex,df$etac,df$anno)) str(tbl) (tbl <- ftable(df$anno,df$sex,df$etac)) str(tbl) pivot(data.frame(df$anno,df$sex,df$etac)) data(Titanic) str(Titanic) data(berkeley) str(berkeley) tc(berkeley$Admit,berkeley$Sex) tc(berkeley$Sex,berkeley$Dep) tbl <- Table(berkeley$Sex,berkeley$Dep) round(tbl[1,]/tbl[2,],3) tc(berkeley$Admit,berkeley$Dep) tbl <- table(berkeley$Admit,berkeley$Dep) round(sort(tbl[2,]/tbl[1,]),3) ftable(berkeley$Dep,berkeley$Sex,berkeley$Admit) tbl <- pivot(data.frame(berkeley$Dep,berkeley$Sex, berkeley$Admit)) tbl[seq(2,24,2),] # Metodi grafici per distribuzioni doppie (tbl <- table(df$sex,df$cucinare)) round(prop.table(tbl,margin=1),3) round(prop.table(tbl,margin=2),3) # Figura 4.2 tbl <- table(df$sex,df$cucinare) barplot(tbl,ylim=c(0,100), ylab="Frequenze assolute",main="(A)") # altezza delle colonne sempre 100 barplot(prop.table(tbl,margin=2),ylim=c(0,1.05), ylab="Frequenze relative",main="(B)") # colonne affiancate barplot(prop.table(tbl,margin=2), ylim=c(0,0.8),beside=TRUE, ylab="Frequenze relative",main="(C)") # profili di riga affiancati barplot(prop.table(tbl,margin=1), ylim=c(0,0.5),beside=TRUE, ylab="Frequenze relative",main="(D)") # Figura 4.3 tbl <- table(df$sex,df$cucinare) prRig <- prop.table(tbl,margin=1) colori <- c("#D4D4D4","#EAEAEA") midpts <- barplot(tbl,ylim=c(0,100),ylab="Frequenze assolute",col=colori) ascissa <- rep(midpts,each=2) ordinata <- apply(tbl,2,cumsum) - tbl/2 valori <- round(prRig,2) colori <- rep("black",times=2) text(ascissa,ordinata,valori,col=colori,cex=1.1) # Figura 4.4 data(berkeley) tbl <- table(berkeley$Sex) mosaicplot(tbl,main="(A)") tbl <- table(berkeley$Sex,berkeley$Admit) mosaicplot(tbl,main="(B)") tbl <- table(berkeley$Dep) mosaicplot(tbl,main="(C)") tbl <- table(berkeley$Dep,berkeley$Admit) mosaicplot(tbl,main="(D)") # Figura 4.5 tbl <- table(berkeley$Dep,berkeley$Sex,berkeley$Admit) mosaicplot(tbl,main="") # Il rapporto di correlazione x <- df$musica_rock y <- df$eta ok <- complete.cases(x,y) x <- x[ok]; y <- y[ok] mean(y) (devt <- sum((y - mean(y))^2)) tapply(y,x,mean) (tmp <- tapply(y,x,function(z) sum((z-mean(z))^2))) (deve <- sum(tmp)) meta <- tapply(y,x,mean) z <- numeric(length(y)) k <- which(x==1); z[k] <- meta[1] k <- which(x==2); z[k] <- meta[2] k <- which(x==3); z[k] <- meta[3] k <- which(x==4); z[k] <- meta[4] k <- which(x==5); z[k] <- meta[5] table(round(z,2)) (devf <- sum((z - mean(z))^2)) devt devf+deve mean(y) mean(z) sqrt(devf/devt) eta <- function(x,y) { ok <- complete.cases(x,y) x <- x[ok]; y <- y[ok] devt <- sum((y - mean(y))^2) tmp <- tapply(y,x,function(z) sum((z-mean(z))^2)) deve <- sum(tmp) devf <- devt - deve return(sqrt(devf/devt)) } eta(df$musica_rock,df$eta) x <- cutpoints(df$altezza,c(145,163,168,172,176,194), destra=TRUE) eta(x,df$peso) eta(x,df$peso)^2 x <- cutpoints(df$altezza,c(145,163,168,172,176,194),destra=TRUE) (ym <- tapply(df$peso,x,mean,na.rm=TRUE)) # Figura 4.6 x <- cutpoints(df$altezza,c(145,163,168,172,176,194),destra=TRUE) ym <- tapply(df$peso,x,mean,na.rm=TRUE) barplot(ym,ylab="Peso medio (kg)",main="(A)",cex.names=0.8) yd <- tapply(df$peso,x,sd,na.rm=TRUE) yl <- as.character(levels(x)) errorBars(ym,yd,yl,ylim=c(0,90),las=2,cex.names=0.8, ylab="Peso medio (kg)",main="(B)") # Correlazione sum(is.na(df$altezza)) sum(is.na(df$peso)) ok <- complete.cases(df$altezza,df$peso) tmp <- df[ok,c(15,16)] dim(df) dim(tmp) str(tmp) # Figura 4.7 plot(tmp$altezza,tmp$peso,xlab="Altezza (cm)",ylab="Peso (kg)",main="(A)") plot(tmp$altezza,tmp$peso,xlab="Altezza (cm)",ylab="Peso (kg)",main="(B)") abline(v=mean(tmp$altezza),h=mean(tmp$peso)) mean(tmp) x <- tmp$altezza > mean(tmp$altezza) y <- tmp$peso > mean(tmp$peso) table(x,y) tmp$x <- tmp$altezza - mean(tmp$altezza) tmp$y <- tmp$peso - mean(tmp$peso) head(tmp) tail(tmp) tmp$xy <- tmp$x * tmp$y sum(tmp$xy > 0) sum(tmp$xy < 0) sum(tmp$xy) mean(tmp$xy) cov(tmp$altezza,tmp$peso) sum(tmp$xy)/(nrow(tmp)-1) var(tmp$altezza) mean(tmp$x^2) sum(tmp$x^2)/(nrow(tmp)-1) var(tmp$peso) mean(tmp$y^2) sum(tmp$y^2)/(nrow(tmp)-1) sd(tmp$altezza) sd(tmp$peso) sd(tmp$altezza)*sd(tmp$peso) cov(tmp$altezza,tmp$peso)/ (sd(tmp$altezza)*sd(tmp$peso)) cor(tmp$altezza,tmp$peso) cor(df$altezza,df$peso) cor(df$altezza,df$peso,use="complete.obs") # Figura 4.8 library(MASS) n <- 1000 rv <- c(0.99,0.9,0.5,0.2) rv <- c(rv,0,rev(-rv)) set.seed(123456) par(mfrow=c(3,3)) for (r in rv) { sigma <- matrix(c(1,r,r,1),nrow=2) rand <- mvrnorm(n=n,mu=c(0,0),Sigma=sigma) ccl <- cor(rand[,1],rand[,2]) plot(rand,main=round(ccl,3)) } par(mfrow=c(1,1)) # Figura 4.9 x <- c(-4:8); y <- 2*x**2-8*x+10 cbind(x,y) cor(x,y) plot(x,y,main=paste("r =",round(cor(x,y),3),"\n")) x <- x[-c(1,2)]; y <- y[-c(1,2)] cor(x,y) set.seed(123456) x <- rnorm(100000) y <- rnorm(100000) cor(x,y) x <- rnorm(100); y <- rnorm(100) cor(x,y) x <- c(x,15); y <- c(y,20) cor(x,y) plot(x,y,main=paste("r =",round(cor(x,y),3),"\n")) cor(df$scarpe,df$capelli,use="complete.obs") tapply(df$capelli,df$sex,mean,na.rm=TRUE) tapply(df$scarpe,df$sex,mean,na.rm=TRUE) mas <- which(df$sex == "M") tmp <- df[mas,] cor(tmp$scarpe,tmp$capelli,use="complete.obs") fem <- which(df$sex == "F") tmp <- df[fem,] cor(tmp$scarpe,tmp$capelli,use="complete.obs") x <- c(0:5); y <- c(1,2,4,8,16,32) cor(x,y) set.seed(123456) x <- runif(10) round(x,3) rank(x) round(sort(x),3) df$peso[1:10] rank(df$peso[1:10]) sort(df$peso[1:10]) x <- c(0:5); y <- c(1,2,4,8,16,32) cor(rank(x),rank(y)) cor(x,y,method="spearman") x <- c(0:5); y <- 1/c(1,2,4,8,16,32) cor(x,y,method="spearman") ok <- complete.cases(df$altezza,df$peso) tmp <- df[ok,c(15,16)] cor(rank(tmp$altezza),rank(tmp$peso)) cor(tmp$altezza,tmp$peso,method="spearman") # Regressione ok <- complete.cases(df$altezza,df$peso) tmp <- df[ok,c(15,16)] (b1 <- cov(tmp$altezza,tmp$peso)/var(tmp$altezza)) (b0 <- mean(tmp$peso) - b1*mean(tmp$altezza)) # Figura 4.10 plot(tmp$altezza,tmp$peso, xlab="Altezza (cm)",ylab="Peso (kg)") abline(b0,b1,lwd=2) (x <- seq(150,180,by=5)) y <- b0 + b1*x round(y,2) tmp$yhat <- b0 + b1*tmp$altezza cor(tmp$yhat,tmp$peso) cor(tmp$altezza,tmp$peso) lm(peso ~ altezza, data=tmp) fit <- lm(peso ~ altezza, data=tmp) tmp$res <- tmp$peso - tmp$yhat head(tmp) table(sign(tmp$res)) sum(tmp$res^2) yyy <- -90 + 0.9*tmp$altezza sum((tmp$peso - yyy)^2) (devRes <- sum(tmp$res^2)) (devTot <- sum((tmp$peso - mean(tmp$peso))^2)) devTot - devRes (devSpi <- sum((tmp$yhat - mean(tmp$yhat))^2)) mean(tmp$peso) mean(tmp$yhat) sum(tmp$res) devSpi/devTot cor(tmp$altezza,tmp$peso)^2 reglin(df$altezza,df$peso) x <- tmp$altezza - mean(tmp$altezza) lm(peso ~ x, data=tmp) mean(tmp$peso) mean(x) b0 + b1*mean(tmp$altezza) mean(tmp$peso) # Figura 4.11 ok <- complete.cases(df$sex,df$altezza,df$peso) tmp <- df[ok,c(3,15,16)] (fit.f <- lm(peso ~ altezza, data=tmp, subset=(sex=="F"))) (fit.m <- lm(peso ~ altezza, data=tmp, subset=(sex=="M"))) plot(tmp$altezza,tmp$peso,type="n", xlab="Altezza (cm)",ylab="Peso (kg)") points(tmp$altezza[tmp$sex=="F"], tmp$peso[tmp$sex=="F"],pch=16) points(tmp$altezza[tmp$sex=="M"], tmp$peso[tmp$sex=="M"],pch=0) abline(fit.f,lty=2,lwd=2) abline(fit.m,lty=3,lwd=2) legend(145,110,legend=c("F","M"),pch=c(16,0), lty=c(2,3)) x <- seq(150,190,by=10) b.f <- coef(fit.f) b.m <- coef(fit.m) cbind(x,b.f[1]+b.f[2]*x,b.m[1]+b.m[2]*x) # data(galton) data(galton) str(galton) # Figura 4.12 plot(galton$mph,galton$cah, xlab="Mid-parent height (Inches)", ylab="Child adult height (Inches)") plot(jitter(galton$mph),jitter(galton$cah), xlab="Mid-parent height (Inches)", ylab="Child adult height (Inches)") table(galton$cah,galton$mph) (fit <- lm(cah ~ mph, data=galton)) abline(fit,lwd=2) abline(0,1,lty=2,lwd=2) round(tapply(galton$cah,galton$mph,mean),1) cor(galton$cah,galton$mph) cor(galton$cah,galton$mph)^2 (devt <- sum((galton$cah - mean(galton$cah))^2)) tmp <- tapply(galton$cah,galton$mph, function(z) sum((z-mean(z))^2)) (deve <- sum(tmp)) (devf <- devt - deve) devf/devt sqrt(devf/devt) (fit <- lm(mph ~ cah, data=galton)) round(tapply(galton$mph,galton$cah,mean),1) apply(galton,2,sd) sd(galton$cah)/sd(galton$mph) set.seed(123456) x <- rbinom(200,100,0.5) mean(x) y <- sort(x) (y <- y[1:20]) mean(y) x <- rbinom(20,100,0.5) mean(x)