4.1.随机数模拟

library(ggplot2)
library(gridExtra)
## 模拟生成正态分布数据
nor1<- rnorm(300,mean = 0,sd = 1)
## Poisson Distribution泊松分布
pois <- rpois(300,lambda = 2)
## Uniform Distribution 均匀分布
unif1 <- runif(300,min = 0, max = 1)
## F分布
f1 <- rf(300,10,10)

## 数据可视化
p1<- ggplot()+theme_bw(base_family = "STKaiti")+
  geom_density(aes(nor1),bw = 0.4,fill="red",alpha=0.4)+
  labs(x="",title = "正态分布")+
  theme(plot.title = element_text(hjust = 0.5))


p2 <- ggplot()+theme_bw(base_family = "STKaiti")+
  geom_density(aes(pois),bw = 0.8,fill = "red",alpha = 0.4)+
  labs(x="",title = "泊松分布")+
  theme(plot.title = element_text(hjust = 0.5))

p3<- ggplot()+theme_bw(base_family = "STKaiti")+
  geom_histogram(aes(unif1),bins = 15,fill = "red",alpha = 0.4)+
  labs(x="",title = "均匀分布")+
  theme(plot.title = element_text(hjust = 0.5))


p4<- ggplot()+theme_bw(base_family = "STKaiti")+
  geom_density(aes(f1),bw = 0.8,fill = "red",alpha = 0.4)+
  labs(x="",title = "F分布")+
  theme(plot.title = element_text(hjust = 0.5))

grid.arrange(p1,p2,p3,p4,nrow=2)

## 根据随机数,估计数据分布的参数
library(MASS)
## Maximum-likelihood fitting of univariate distributions
fitdistr(nor1,densfun = "normal")
##       mean           sd     
##   -0.02931103    1.03759018 
##  ( 0.05990530) ( 0.04235944)
fitdistr(pois,densfun = "Poisson")
##     lambda  
##   1.9733333 
##  (0.0811035)

多元正态分布

library(MASS)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## 生成Multivariate Normal Distribution多变量分布样本
set.seed(123)
sigma2 = matrix(c(10,3,3,4),2,2)
norm2d <- mvrnorm(n=800,mu=c(0,4),Sigma = sigma2)
norm2df <- as.data.frame(norm2d)
colnames(norm2df) <- c("x","y")
## 可视化2维正态分布数据
ggplot(norm2df,aes(x=x,y=y))+
  theme_bw(base_family = "STKaiti")+
  geom_point(colour="red")+
  geom_density2d()+
  labs(title = "二元正态分布")+
  theme(plot.title = element_text(hjust = 0.5))

## 可视化2元正态分布的3维密度曲面图
## Two-Dimensional Kernel Density Estimation 二维核密度估计
kde <- kde2d(norm2df$x,norm2df$y,n = 50)
plot_ly(x = kde$x, y = kde$y, z = kde$z,type = "surface")
## 计算数据的方差矩阵
cov(norm2df)
##          x        y
## x 9.717163 2.786767
## y 2.786767 3.808918
## 计算均值
colMeans(norm2d)
## [1] -0.01450823  3.93987072
apply(norm2d,2,mean)
## [1] -0.01450823  3.93987072
## 估计二元正态分布的参数
library(tmvtnorm)
## Loading required package: mvtnorm
## Loading required package: Matrix
## Loading required package: stats4
## Loading required package: gmm
## Loading required package: sandwich
## 截断多元正态分布参数估计
mlefit1 <- mle.tmvnorm(norm2d,lower = c(-Inf,-Inf), upper = c(+Inf,+Inf))
summary(mlefit1)
## Maximum likelihood estimation
## 
## Call:
## mle(minuslogl = function (mu_1 = 0, mu_2 = 0, sigma_1.1 = 1, 
##     sigma_1.2 = 0, sigma_2.2 = 1) 
## {
##     nf <- names(formals())
##     theta <- sapply(nf, function(x) {
##         eval(parse(text = x))
##     })
##     mean <- theta[1:n]
##     if (cholesky) {
##         L <- inv_vech(theta[-(1:n)])
##         L[lower.tri(L, diag = FALSE)] <- 0
##         sigma <- t(L) %*% L
##     }
##     else {
##         sigma <- inv_vech(theta[-(1:n)])
##     }
##     if (det(sigma) <= 0 || any(diag(sigma) < 0)) {
##         return(.Machine$integer.max)
##     }
##     f <- -(sum(dmvnorm(X, mean, sigma, log = TRUE)) - nrow(X) * 
##         log(pmvnorm(lower = lower, upper = upper, mean = mean, 
##             sigma = sigma)))
##     if (is.infinite(f) || is.na(f)) {
##         return(.Machine$integer.max)
##     }
##     f
## }, start = as.list(c(mu_1 = 0, mu_2 = 0, sigma_1.1 = 1, sigma_1.2 = 0, 
## sigma_2.2 = 1)), method = "BFGS", fixed = list())
## 
## Coefficients:
##              Estimate Std. Error
## mu_1      -0.01450775 0.11014235
## mu_2       3.93986816 0.06895799
## sigma_1.1  9.70506982 0.48525615
## sigma_1.2  2.78330080 0.23629107
## sigma_2.2  3.80416316 0.19020841
## 
## -2 log L: 7239.191
mlefit1@coef
##        mu_1        mu_2   sigma_1.1   sigma_1.2   sigma_2.2 
## -0.01450775  3.93986816  9.70506982  2.78330080  3.80416316

4.2.假设检验

数据分布检验

数据相关性检验

t检验(均值检验等)

方差齐性检验

library(energy)
## 检验数据是否正态分布
## 模拟生成正态分布数据
set.seed(12)
nordata<- rnorm(500,mean = 2,sd = 5)
ggplot()+theme_bw(base_family = "STKaiti")+
  geom_histogram(aes(nordata),stat = "density",bins = 40)+
  labs(x="",title = "正态分布")+
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Ignoring unknown parameters: binwidth, bins, pad

## 1:QQ图经验法
#qqnorm(): 产生qq分布图
#qqline(): 添加一个参考线
par(pty="s")
qqnorm(nordata, pch = 1, frame = FALSE)
qqline(nordata, col = "steelblue", lwd = 2)

## 使用car包中的qqPlot函数
library(car)
## Loading required package: carData
par(pty="s")
qqPlot(nordata,distribution="norm")

## [1] 201 292
## 2 ks检验
ks.test(x = nordata,"pnorm")
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  nordata
## D = 0.46952, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(x = nordata,"pnorm",mean = 2,sd=5)
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  nordata
## D = 0.031436, p-value = 0.7063
## alternative hypothesis: two-sided
## ks检验还可以检验两组数据是否具有相同的分布
## Poisson Distribution泊松分布
pois <- rpois(100,lambda = 2)
ks.test(x = nordata,y = pois)
## Warning in ks.test(x = nordata, y = pois): p-value will be approximate in the
## presence of ties
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  nordata and pois
## D = 0.334, p-value = 1.684e-08
## alternative hypothesis: two-sided
## 2元正态分布数据的数据正态性检验
set.seed(1234)
sigma2 = matrix(c(1,0,0,1),2,2)
norm2d <- mvrnorm(n=100,mu=c(0,0),Sigma = sigma2)


## energy 包里的mvnorm.etest()基于E统计量做多元正态性检验
library(energy)
## H0:是多元正态分布
mvnorm.etest(norm2d,R = 199)
## 
##  Energy test of multivariate normality: estimated parameters
## 
## data:  x, sample size 100, dimension 2, replicates 199
## E-statistic = 1.022, p-value = 0.05528
library(MVN)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## sROC 0.1-2 loaded
##  For multivariate normality, both p-values of skewness and kurtosis statistics should be greater than 0.05.
par(pty="s")
result <- mvn(norm2d, mvnTest = "mardia",multivariatePlot = "qq")

## 多个变量正态性检验
result$multivariateNormality
##              Test         Statistic           p value Result
## 1 Mardia Skewness  7.25791970026999 0.122870089870682    YES
## 2 Mardia Kurtosis 0.431195685323416 0.666326090926829    YES
## 3             MVN              <NA>              <NA>    YES
## 每个变量的一维正态性检验
## Shapiro–Wilk test tests the null hypothesis that a sample x1, ..., xn 
## came from a normally distributed population.
result$univariateNormality
##           Test  Variable Statistic   p value Normality
## 1 Shapiro-Wilk  Column1     0.9886    0.5507    YES   
## 2 Shapiro-Wilk  Column2     0.9659    0.0108    NO
##   "mardia" 根据偏度和峰度的显著性来确定时否维多元正态分布,
## 如果是多元正态分布,则偏度核峰度的p值都应大于0.05


## 检验鸢尾花数据的4个特征是否为4元正态分布数据
Iris <- read.csv("data/chap4/Iris.csv",header = TRUE)
Iris <- scale(Iris[,2:5],center = T,scale = T)
par(pty="s")
irismvn <- mvn(Iris, mvnTest = "mardia",multivariatePlot = "qq")

irismvn$multivariateNormality
##              Test          Statistic              p value Result
## 1 Mardia Skewness    66.497952439495 6.72212639729695e-07     NO
## 2 Mardia Kurtosis -0.196550178829866    0.844179561490265    YES
## 3             MVN               <NA>                 <NA>     NO
irismvn$univariateNormality
##           Test      Variable Statistic   p value Normality
## 1 Shapiro-Wilk SepalLengthCm    0.9761  0.0102      NO    
## 2 Shapiro-Wilk SepalWidthCm     0.9838  0.0752      YES   
## 3 Shapiro-Wilk PetalLengthCm    0.8764  <0.001      NO    
## 4 Shapiro-Wilk PetalWidthCm     0.9026  <0.001      NO
## 结果显示,4个变量不是4元正态分布

t检验和方差齐性检验

## t检验,数据的均值检验
## 单样本t检验,检验样本的均值是否为指定值
t1<- rnorm(100,mean = 0,sd = 4)
t.test(t1,mu = 0)
## 
##  One Sample t-test
## 
## data:  t1
## t = -0.038834, df = 99, p-value = 0.9691
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.7710970  0.7414932
## sample estimates:
##   mean of x 
## -0.01480188
## 双样本t检验,检验样本的均值是否相等
t2<- rnorm(100,mean = 4,sd = 4)
t.test(t1,t2,mu = 0)
## 
##  Welch Two Sample t-test
## 
## data:  t1 and t2
## t = -6.2346, df = 193.31, p-value = 2.786e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.814834 -2.500610
## sample estimates:
##   mean of x   mean of y 
## -0.01480188  3.64291986
## 可以通过mu = num来检验两样本的均值差
t.test(t2,t1,mu = 4)
## 
##  Welch Two Sample t-test
## 
## data:  t2 and t1
## t = -0.58342, df = 193.31, p-value = 0.5603
## alternative hypothesis: true difference in means is not equal to 4
## 95 percent confidence interval:
##  2.500610 4.814834
## sample estimates:
##   mean of x   mean of y 
##  3.64291986 -0.01480188
## 方差齐性检验
## F对于两个总体;数据服从正态分布。
## F Test to Compare Two Variances
var.test(t1,t2)
## 
##  F test to compare two variances
## 
## data:  t1 and t2
## F = 0.73037, num df = 99, denom df = 99, p-value = 0.1197
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.4914251 1.0855043
## sample estimates:
## ratio of variances 
##          0.7303725
## Bartlett检验条件:对于多个总体;数据服从正态分布。
t3 <- rnorm(100,mean = 4,sd = 8)
bartlett.test(list(t1,t2,t3))
## 
##  Bartlett test of homogeneity of variances
## 
## data:  list(t1, t2, t3)
## Bartlett's K-squared = 81.3, df = 2, p-value < 2.2e-16
## Levene检验这一方法更为稳健,且不依赖总体分布,是方差齐性检验的首选方法。
## 它既可用于对两个总体方差进行齐性检验,也可用于对多个总体方差进行齐性检验
## 在car包中可以使用
library(car)
library(ggplot2)
testdata <- data.frame(x = c(t1,t2,t3),
                   group1 = c(rep(c("A","B","C"),c(100,100,100))))
leveneTest(x~group1,data = testdata)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value    Pr(>F)    
## group   2  30.831 6.814e-13 ***
##       297                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(testdata,aes(x = group1,y = x))+theme_bw()+
  geom_violin(aes(fill = group1),alpha = 0.2)+
  geom_jitter(aes(colour = group1))+
  theme(legend.position = "none")+
  labs(y = "values")

数据相关性检验

Iris <- read.csv("data/chap4/Iris.csv",header = TRUE)
Iris <- Iris[,2:5]
## 检验鸢尾花数据的两个特征
cor.test(Iris$SepalLengthCm,Iris$SepalWidthCm)
## 
##  Pearson's product-moment correlation
## 
## data:  Iris$SepalLengthCm and Iris$SepalWidthCm
## t = -1.3386, df = 148, p-value = 0.1828
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.26498618  0.05180021
## sample estimates:
##        cor 
## -0.1093692
## 如何快速检验4个变量两两之间的相关性是否显著
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:car':
## 
##     logit
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
result <- corr.test(Iris,method="pearson")
## 相关系数
result$r
##               SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## SepalLengthCm     1.0000000   -0.1093692     0.8717542    0.8179536
## SepalWidthCm     -0.1093692    1.0000000    -0.4205161   -0.3565441
## PetalLengthCm     0.8717542   -0.4205161     1.0000000    0.9627571
## PetalWidthCm      0.8179536   -0.3565441     0.9627571    1.0000000
## 相关性检验的P值
result$p
##               SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## SepalLengthCm  0.000000e+00 1.827652e-01  5.192270e-47 9.259397e-37
## SepalWidthCm   1.827652e-01 0.000000e+00  2.528810e-07 1.504778e-05
## PetalLengthCm  1.038454e-47 8.429366e-08  0.000000e+00 3.465997e-85
## PetalWidthCm   2.314849e-37 7.523891e-06  5.776661e-86 0.000000e+00
## P值小于0.05,说明可以拒绝原假设,说明相关性不显著。


## Hmisc包的rcorr()也能完成相关性检验
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
## 
##     describe
## The following object is masked from 'package:plotly':
## 
##     subplot
## The following objects are masked from 'package:base':
## 
##     format.pval, units
sper <- rcorr(as.matrix(Iris),type = "pearson")
## 相关系数
sper$r
##               SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## SepalLengthCm     1.0000000   -0.1093692     0.8717542    0.8179536
## SepalWidthCm     -0.1093692    1.0000000    -0.4205161   -0.3565441
## PetalLengthCm     0.8717542   -0.4205161     1.0000000    0.9627571
## PetalWidthCm      0.8179536   -0.3565441     0.9627571    1.0000000
## 相关性检验的P值
sper$P
##               SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## SepalLengthCm            NA 1.827652e-01  0.000000e+00 0.000000e+00
## SepalWidthCm      0.1827652           NA  8.429366e-08 7.523891e-06
## PetalLengthCm     0.0000000 8.429366e-08            NA 0.000000e+00
## PetalWidthCm      0.0000000 7.523891e-06  0.000000e+00           NA
## 为了方便观察相关性检验的结果,编写如下函数

comcor <- function(data,minre1 = 0.8,minre2 = -0.8,maxp = 0.05,type = "pearson"){
  ## 该函数用来计算相关系数,并且的高符合要求的组合
  ## data : 数据矩阵,每列为一个特征
  ## minre = 0.8 最小的相关系数
  ## maxp = 0.05 最大的相关系数显著性值
  library(Hmisc)
  n <- ncol(data)
  ## 计算相关系数
  sper <- rcorr(as.matrix(data),type = type)
  ## 生成相应的变量组合
  hang <- matrix(rownames(sper$r), nrow = n, ncol = n,byrow = FALSE)
  lie <- matrix(colnames(sper$r),nrow = n,ncol = n,byrow = TRUE)
  zuhe <- matrix(paste(hang,lie,sep = "~"),nrow = n)
  ## 对相应的数据取下三角形的数值
  lowrel <- sper$r[lower.tri(sper$r)]
  lowp <- sper$P[lower.tri(sper$P)]
  lowzuhe <- zuhe[lower.tri(zuhe)]
  result<-data.frame(zuhe=lowzuhe,r = lowrel,p = lowp)
  ## 找到相关系数中 r>=0.8,p<=0.05) 的组合
  index <- which((lowrel >=minre1 | lowrel <=minre2) & lowzuhe !=1 & lowp <= maxp)
  return(result[index,])
}
## 找到相关性显著的变量
comcor(Iris,minre1 = 0.8,minre2 = -0.8,maxp = 0.05,type = "pearson")
##                          zuhe         r p
## 2 PetalLengthCm~SepalLengthCm 0.8717542 0
## 3  PetalWidthCm~SepalLengthCm 0.8179536 0
## 6  PetalWidthCm~PetalLengthCm 0.9627571 0

4.3.方差分析

单因素方差分析

双因素方差分析

## 单因素方差分析
Iris <- read.csv("data/chap4/Iris.csv",header = TRUE)
colnames(Iris)
## [1] "Id"            "SepalLengthCm" "SepalWidthCm"  "PetalLengthCm"
## [5] "PetalWidthCm"  "Species"
## 比较鸢尾花数据集在SepalWidthCm上的均值差异
boxplot(SepalWidthCm~Species,Iris)

library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
## 可视化不同类数据的均值
par(family="STKaiti")
plotmeans(SepalWidthCm~Species,Iris,col = "red",
          main = "")

## 进行单因素方差分析
# bartlett.test(SepalWidthCm~Species,Iris)
irisaov <- aov(SepalWidthCm~Species,Iris)
summary(irisaov)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## Species       2  10.98   5.489   47.36 <2e-16 ***
## Residuals   147  17.04   0.116                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 检验结果说明了各组均值不等,可以进一步进行两两比较

## TukeyHSD()函数提供了对各组均值差异的成对检验
tky <- TukeyHSD(irisaov)
tky = as.data.frame(tky$Species)
tky$pair = rownames(tky)


# Plot pairwise TukeyHSD comparisons and color by significance level
ggplot(tky, aes(colour=cut(`p adj`, c(0, 0.01, 0.05, 1), 
                           label=c("p<0.01","p<0.05","Non-Sig")))) +
  theme_bw(base_family = "STKaiti",base_size = 16)+
  geom_hline(yintercept=0, lty="11", colour="grey30",size = 1) +
  geom_errorbar(aes(pair, ymin=lwr, ymax=upr), width=0.2,size = 1) +
  geom_point(aes(pair, diff),size = 2) +
  labs(colour="")+
  theme(axis.text.x = element_text(size = 14))

## 双因素方差分析,不考虑交互作用

##  examines the effects of Vitamin C on tooth growth in Guinea Pigs (ToothGrowth)
## 每种动物通过两种递送方法之一,橙汁或抗坏血酸(一种维生素C形式,编码为VC)
## 接受三种剂量水平的维生素C(0.5,1和2mg /天)中的一种。
data(ToothGrowth)
## 将浓度数据转化为因子变量
ToothGrowth$dose <- factor(ToothGrowth$dose,levels = c(0.5, 1, 2),
                           labels = c("D0.5", "D1", "D2"))
str(ToothGrowth)
## 'data.frame':    60 obs. of  3 variables:
##  $ len : num  4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
##  $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
##  $ dose: Factor w/ 3 levels "D0.5","D1","D2": 1 1 1 1 1 1 1 1 1 1 ...
## 可视化数据
# 绘制多个分组的小提琴图
# Add error bars: mean_se
# (other values include: mean_sd, mean_ci, median_iqr, ....)
library("ggpubr")
## Loading required package: magrittr
ggviolin(ToothGrowth, x = "dose", y = "len", color = "supp",
         add = "dotplot",palette = c("red", "blue"))
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

## 双因素方差分析,不考虑交互作用
aov1 <- aov(len~dose+supp,data = ToothGrowth)
summary(aov1)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## dose         2 2426.4  1213.2   82.81  < 2e-16 ***
## supp         1  205.4   205.4   14.02 0.000429 ***
## Residuals   56  820.4    14.7                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 可视化方差分析交互图
ggline(ToothGrowth, x = "dose", y = "len", color = "supp",
       shape = "supp",add = "mean",palette = c("red", "blue"))

## 双因素方差分析,考虑交互作用
aov1 <- aov(len~dose*supp,data = ToothGrowth)
summary(aov1)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## dose         2 2426.4  1213.2  92.000  < 2e-16 ***
## supp         1  205.4   205.4  15.572 0.000231 ***
## dose:supp    2  108.3    54.2   4.107 0.021860 *  
## Residuals   54  712.1    13.2                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4.4 列联表分析

testdata<- read.csv("data/chap4/原料.csv")
rownames(testdata) <- testdata$X
testdata$X <- NULL
testdata
##     A  B  C
## 甲 52 64 24
## 已 60 59 52
## 丙 70 65 74
(result <- chisq.test(testdata))
## 
##  Pearson's Chi-squared test
## 
## data:  testdata
## X-squared = 15.375, df = 4, p-value = 0.003983
## 计算出期望值
result$expected
##        A        B        C
## 甲 49.00 50.61538 40.38462
## 已 59.85 61.82308 49.32692
## 丙 73.15 75.56154 60.28846
## 使用马赛克图进行可视化
par(family = "STKaiti")
mosaicplot(testdata,main = "",color = TRUE)

library(vcd)
## Loading required package: grid
assocstats(as.matrix(testdata))
##                     X^2 df  P(> X^2)
## Likelihood Ratio 16.101  4 0.0028871
## Pearson          15.375  4 0.0039834
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.169 
## Cramer's V        : 0.122

高维列联表分析

library(vcd)
library(MASS)

## 1973年按入学和性别划分的伯克利大学研究生院申请人数最多的6个学院的汇总数据。
data("UCBAdmissions")

UCBAdmissions
## , , Dept = A
## 
##           Gender
## Admit      Male Female
##   Admitted  512     89
##   Rejected  313     19
## 
## , , Dept = B
## 
##           Gender
## Admit      Male Female
##   Admitted  353     17
##   Rejected  207      8
## 
## , , Dept = C
## 
##           Gender
## Admit      Male Female
##   Admitted  120    202
##   Rejected  205    391
## 
## , , Dept = D
## 
##           Gender
## Admit      Male Female
##   Admitted  138    131
##   Rejected  279    244
## 
## , , Dept = E
## 
##           Gender
## Admit      Male Female
##   Admitted   53     94
##   Rejected  138    299
## 
## , , Dept = F
## 
##           Gender
## Admit      Male Female
##   Admitted   22     24
##   Rejected  351    317
## A,B,C,D,E,F为6个学院
## 性别为男女
## Admit  为入学和被拒绝

## 使用马赛克图可视化数据
mosaic(~Admit+Dept+Gender,data=UCBAdmissions,shade = TRUE, legend = TRUE)

## 相互独立:判断Admit,Dept,Gender是否成对独立。
loglm(~Admit+Dept+Gender, data=UCBAdmissions)
## Call:
## loglm(formula = ~Admit + Dept + Gender, data = UCBAdmissions)
## 
## Statistics:
##                       X^2 df P(> X^2)
## Likelihood Ratio 2097.671 16        0
## Pearson          2000.328 16        0
## P值小于0说明,三者之间不独立


## 在给定Dept的情况下判断是否入学和性别是否独立
## 条件独立:Admit与Gender无关,给定Dept.
loglm(~Admit+Dept+Gender+Admit*Dept+Gender*Dept, data=UCBAdmissions)
## Call:
## loglm(formula = ~Admit + Dept + Gender + Admit * Dept + Gender * 
##     Dept, data = UCBAdmissions)
## 
## Statistics:
##                       X^2 df    P(> X^2)
## Likelihood Ratio 21.73551  6 0.001351993
## Pearson          19.93841  6 0.002840164
## P值小于0说明,两者之间不独立

mosaic(~Admit+Gender,data=UCBAdmissions,shade = TRUE, legend = TRUE)

## 从马赛克图上可以看出入学的男性数量更高,而被拒绝的那女数量相当