library(ggplot2)
library(gridExtra)
## 模拟生成正态分布数据
nor1<- rnorm(300,mean = 0,sd = 1)
## Poisson Distribution泊松分布
pois <- rpois(300,lambda = 2)
## Uniform Distribution 均匀分布
unif1 <- runif(300,min = 0, max = 1)
## F分布
f1 <- rf(300,10,10)
## 数据可视化
p1<- ggplot()+theme_bw(base_family = "STKaiti")+
geom_density(aes(nor1),bw = 0.4,fill="red",alpha=0.4)+
labs(x="",title = "正态分布")+
theme(plot.title = element_text(hjust = 0.5))
p2 <- ggplot()+theme_bw(base_family = "STKaiti")+
geom_density(aes(pois),bw = 0.8,fill = "red",alpha = 0.4)+
labs(x="",title = "泊松分布")+
theme(plot.title = element_text(hjust = 0.5))
p3<- ggplot()+theme_bw(base_family = "STKaiti")+
geom_histogram(aes(unif1),bins = 15,fill = "red",alpha = 0.4)+
labs(x="",title = "均匀分布")+
theme(plot.title = element_text(hjust = 0.5))
p4<- ggplot()+theme_bw(base_family = "STKaiti")+
geom_density(aes(f1),bw = 0.8,fill = "red",alpha = 0.4)+
labs(x="",title = "F分布")+
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(p1,p2,p3,p4,nrow=2)
## 根据随机数,估计数据分布的参数
library(MASS)
## Maximum-likelihood fitting of univariate distributions
fitdistr(nor1,densfun = "normal")
## mean sd
## -0.02931103 1.03759018
## ( 0.05990530) ( 0.04235944)
fitdistr(pois,densfun = "Poisson")
## lambda
## 1.9733333
## (0.0811035)
library(MASS)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## 生成Multivariate Normal Distribution多变量分布样本
set.seed(123)
sigma2 = matrix(c(10,3,3,4),2,2)
norm2d <- mvrnorm(n=800,mu=c(0,4),Sigma = sigma2)
norm2df <- as.data.frame(norm2d)
colnames(norm2df) <- c("x","y")
## 可视化2维正态分布数据
ggplot(norm2df,aes(x=x,y=y))+
theme_bw(base_family = "STKaiti")+
geom_point(colour="red")+
geom_density2d()+
labs(title = "二元正态分布")+
theme(plot.title = element_text(hjust = 0.5))
## 可视化2元正态分布的3维密度曲面图
## Two-Dimensional Kernel Density Estimation 二维核密度估计
kde <- kde2d(norm2df$x,norm2df$y,n = 50)
plot_ly(x = kde$x, y = kde$y, z = kde$z,type = "surface")
## 计算数据的方差矩阵
cov(norm2df)
## x y
## x 9.717163 2.786767
## y 2.786767 3.808918
## 计算均值
colMeans(norm2d)
## [1] -0.01450823 3.93987072
apply(norm2d,2,mean)
## [1] -0.01450823 3.93987072
## 估计二元正态分布的参数
library(tmvtnorm)
## Loading required package: mvtnorm
## Loading required package: Matrix
## Loading required package: stats4
## Loading required package: gmm
## Loading required package: sandwich
## 截断多元正态分布参数估计
mlefit1 <- mle.tmvnorm(norm2d,lower = c(-Inf,-Inf), upper = c(+Inf,+Inf))
summary(mlefit1)
## Maximum likelihood estimation
##
## Call:
## mle(minuslogl = function (mu_1 = 0, mu_2 = 0, sigma_1.1 = 1,
## sigma_1.2 = 0, sigma_2.2 = 1)
## {
## nf <- names(formals())
## theta <- sapply(nf, function(x) {
## eval(parse(text = x))
## })
## mean <- theta[1:n]
## if (cholesky) {
## L <- inv_vech(theta[-(1:n)])
## L[lower.tri(L, diag = FALSE)] <- 0
## sigma <- t(L) %*% L
## }
## else {
## sigma <- inv_vech(theta[-(1:n)])
## }
## if (det(sigma) <= 0 || any(diag(sigma) < 0)) {
## return(.Machine$integer.max)
## }
## f <- -(sum(dmvnorm(X, mean, sigma, log = TRUE)) - nrow(X) *
## log(pmvnorm(lower = lower, upper = upper, mean = mean,
## sigma = sigma)))
## if (is.infinite(f) || is.na(f)) {
## return(.Machine$integer.max)
## }
## f
## }, start = as.list(c(mu_1 = 0, mu_2 = 0, sigma_1.1 = 1, sigma_1.2 = 0,
## sigma_2.2 = 1)), method = "BFGS", fixed = list())
##
## Coefficients:
## Estimate Std. Error
## mu_1 -0.01450775 0.11014235
## mu_2 3.93986816 0.06895799
## sigma_1.1 9.70506982 0.48525615
## sigma_1.2 2.78330080 0.23629107
## sigma_2.2 3.80416316 0.19020841
##
## -2 log L: 7239.191
mlefit1@coef
## mu_1 mu_2 sigma_1.1 sigma_1.2 sigma_2.2
## -0.01450775 3.93986816 9.70506982 2.78330080 3.80416316
数据分布检验
数据相关性检验
t检验(均值检验等)
方差齐性检验
library(energy)
## 检验数据是否正态分布
## 模拟生成正态分布数据
set.seed(12)
nordata<- rnorm(500,mean = 2,sd = 5)
ggplot()+theme_bw(base_family = "STKaiti")+
geom_histogram(aes(nordata),stat = "density",bins = 40)+
labs(x="",title = "正态分布")+
theme(plot.title = element_text(hjust = 0.5))
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## 1:QQ图经验法
#qqnorm(): 产生qq分布图
#qqline(): 添加一个参考线
par(pty="s")
qqnorm(nordata, pch = 1, frame = FALSE)
qqline(nordata, col = "steelblue", lwd = 2)
## 使用car包中的qqPlot函数
library(car)
## Loading required package: carData
par(pty="s")
qqPlot(nordata,distribution="norm")
## [1] 201 292
## 2 ks检验
ks.test(x = nordata,"pnorm")
##
## One-sample Kolmogorov-Smirnov test
##
## data: nordata
## D = 0.46952, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(x = nordata,"pnorm",mean = 2,sd=5)
##
## One-sample Kolmogorov-Smirnov test
##
## data: nordata
## D = 0.031436, p-value = 0.7063
## alternative hypothesis: two-sided
## ks检验还可以检验两组数据是否具有相同的分布
## Poisson Distribution泊松分布
pois <- rpois(100,lambda = 2)
ks.test(x = nordata,y = pois)
## Warning in ks.test(x = nordata, y = pois): p-value will be approximate in the
## presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: nordata and pois
## D = 0.334, p-value = 1.684e-08
## alternative hypothesis: two-sided
## 2元正态分布数据的数据正态性检验
set.seed(1234)
sigma2 = matrix(c(1,0,0,1),2,2)
norm2d <- mvrnorm(n=100,mu=c(0,0),Sigma = sigma2)
## energy 包里的mvnorm.etest()基于E统计量做多元正态性检验
library(energy)
## H0:是多元正态分布
mvnorm.etest(norm2d,R = 199)
##
## Energy test of multivariate normality: estimated parameters
##
## data: x, sample size 100, dimension 2, replicates 199
## E-statistic = 1.022, p-value = 0.05528
library(MVN)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## sROC 0.1-2 loaded
## For multivariate normality, both p-values of skewness and kurtosis statistics should be greater than 0.05.
par(pty="s")
result <- mvn(norm2d, mvnTest = "mardia",multivariatePlot = "qq")
## 多个变量正态性检验
result$multivariateNormality
## Test Statistic p value Result
## 1 Mardia Skewness 7.25791970026999 0.122870089870682 YES
## 2 Mardia Kurtosis 0.431195685323416 0.666326090926829 YES
## 3 MVN <NA> <NA> YES
## 每个变量的一维正态性检验
## Shapiro–Wilk test tests the null hypothesis that a sample x1, ..., xn
## came from a normally distributed population.
result$univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk Column1 0.9886 0.5507 YES
## 2 Shapiro-Wilk Column2 0.9659 0.0108 NO
## "mardia" 根据偏度和峰度的显著性来确定时否维多元正态分布,
## 如果是多元正态分布,则偏度核峰度的p值都应大于0.05
## 检验鸢尾花数据的4个特征是否为4元正态分布数据
Iris <- read.csv("data/chap4/Iris.csv",header = TRUE)
Iris <- scale(Iris[,2:5],center = T,scale = T)
par(pty="s")
irismvn <- mvn(Iris, mvnTest = "mardia",multivariatePlot = "qq")
irismvn$multivariateNormality
## Test Statistic p value Result
## 1 Mardia Skewness 66.497952439495 6.72212639729695e-07 NO
## 2 Mardia Kurtosis -0.196550178829866 0.844179561490265 YES
## 3 MVN <NA> <NA> NO
irismvn$univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk SepalLengthCm 0.9761 0.0102 NO
## 2 Shapiro-Wilk SepalWidthCm 0.9838 0.0752 YES
## 3 Shapiro-Wilk PetalLengthCm 0.8764 <0.001 NO
## 4 Shapiro-Wilk PetalWidthCm 0.9026 <0.001 NO
## 结果显示,4个变量不是4元正态分布
## t检验,数据的均值检验
## 单样本t检验,检验样本的均值是否为指定值
t1<- rnorm(100,mean = 0,sd = 4)
t.test(t1,mu = 0)
##
## One Sample t-test
##
## data: t1
## t = -0.038834, df = 99, p-value = 0.9691
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.7710970 0.7414932
## sample estimates:
## mean of x
## -0.01480188
## 双样本t检验,检验样本的均值是否相等
t2<- rnorm(100,mean = 4,sd = 4)
t.test(t1,t2,mu = 0)
##
## Welch Two Sample t-test
##
## data: t1 and t2
## t = -6.2346, df = 193.31, p-value = 2.786e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.814834 -2.500610
## sample estimates:
## mean of x mean of y
## -0.01480188 3.64291986
## 可以通过mu = num来检验两样本的均值差
t.test(t2,t1,mu = 4)
##
## Welch Two Sample t-test
##
## data: t2 and t1
## t = -0.58342, df = 193.31, p-value = 0.5603
## alternative hypothesis: true difference in means is not equal to 4
## 95 percent confidence interval:
## 2.500610 4.814834
## sample estimates:
## mean of x mean of y
## 3.64291986 -0.01480188
## 方差齐性检验
## F对于两个总体;数据服从正态分布。
## F Test to Compare Two Variances
var.test(t1,t2)
##
## F test to compare two variances
##
## data: t1 and t2
## F = 0.73037, num df = 99, denom df = 99, p-value = 0.1197
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.4914251 1.0855043
## sample estimates:
## ratio of variances
## 0.7303725
## Bartlett检验条件:对于多个总体;数据服从正态分布。
t3 <- rnorm(100,mean = 4,sd = 8)
bartlett.test(list(t1,t2,t3))
##
## Bartlett test of homogeneity of variances
##
## data: list(t1, t2, t3)
## Bartlett's K-squared = 81.3, df = 2, p-value < 2.2e-16
## Levene检验这一方法更为稳健,且不依赖总体分布,是方差齐性检验的首选方法。
## 它既可用于对两个总体方差进行齐性检验,也可用于对多个总体方差进行齐性检验
## 在car包中可以使用
library(car)
library(ggplot2)
testdata <- data.frame(x = c(t1,t2,t3),
group1 = c(rep(c("A","B","C"),c(100,100,100))))
leveneTest(x~group1,data = testdata)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 30.831 6.814e-13 ***
## 297
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(testdata,aes(x = group1,y = x))+theme_bw()+
geom_violin(aes(fill = group1),alpha = 0.2)+
geom_jitter(aes(colour = group1))+
theme(legend.position = "none")+
labs(y = "values")
Iris <- read.csv("data/chap4/Iris.csv",header = TRUE)
Iris <- Iris[,2:5]
## 检验鸢尾花数据的两个特征
cor.test(Iris$SepalLengthCm,Iris$SepalWidthCm)
##
## Pearson's product-moment correlation
##
## data: Iris$SepalLengthCm and Iris$SepalWidthCm
## t = -1.3386, df = 148, p-value = 0.1828
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.26498618 0.05180021
## sample estimates:
## cor
## -0.1093692
## 如何快速检验4个变量两两之间的相关性是否显著
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
result <- corr.test(Iris,method="pearson")
## 相关系数
result$r
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## SepalLengthCm 1.0000000 -0.1093692 0.8717542 0.8179536
## SepalWidthCm -0.1093692 1.0000000 -0.4205161 -0.3565441
## PetalLengthCm 0.8717542 -0.4205161 1.0000000 0.9627571
## PetalWidthCm 0.8179536 -0.3565441 0.9627571 1.0000000
## 相关性检验的P值
result$p
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## SepalLengthCm 0.000000e+00 1.827652e-01 5.192270e-47 9.259397e-37
## SepalWidthCm 1.827652e-01 0.000000e+00 2.528810e-07 1.504778e-05
## PetalLengthCm 1.038454e-47 8.429366e-08 0.000000e+00 3.465997e-85
## PetalWidthCm 2.314849e-37 7.523891e-06 5.776661e-86 0.000000e+00
## P值小于0.05,说明可以拒绝原假设,说明相关性不显著。
## Hmisc包的rcorr()也能完成相关性检验
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
##
## describe
## The following object is masked from 'package:plotly':
##
## subplot
## The following objects are masked from 'package:base':
##
## format.pval, units
sper <- rcorr(as.matrix(Iris),type = "pearson")
## 相关系数
sper$r
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## SepalLengthCm 1.0000000 -0.1093692 0.8717542 0.8179536
## SepalWidthCm -0.1093692 1.0000000 -0.4205161 -0.3565441
## PetalLengthCm 0.8717542 -0.4205161 1.0000000 0.9627571
## PetalWidthCm 0.8179536 -0.3565441 0.9627571 1.0000000
## 相关性检验的P值
sper$P
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## SepalLengthCm NA 1.827652e-01 0.000000e+00 0.000000e+00
## SepalWidthCm 0.1827652 NA 8.429366e-08 7.523891e-06
## PetalLengthCm 0.0000000 8.429366e-08 NA 0.000000e+00
## PetalWidthCm 0.0000000 7.523891e-06 0.000000e+00 NA
## 为了方便观察相关性检验的结果,编写如下函数
comcor <- function(data,minre1 = 0.8,minre2 = -0.8,maxp = 0.05,type = "pearson"){
## 该函数用来计算相关系数,并且的高符合要求的组合
## data : 数据矩阵,每列为一个特征
## minre = 0.8 最小的相关系数
## maxp = 0.05 最大的相关系数显著性值
library(Hmisc)
n <- ncol(data)
## 计算相关系数
sper <- rcorr(as.matrix(data),type = type)
## 生成相应的变量组合
hang <- matrix(rownames(sper$r), nrow = n, ncol = n,byrow = FALSE)
lie <- matrix(colnames(sper$r),nrow = n,ncol = n,byrow = TRUE)
zuhe <- matrix(paste(hang,lie,sep = "~"),nrow = n)
## 对相应的数据取下三角形的数值
lowrel <- sper$r[lower.tri(sper$r)]
lowp <- sper$P[lower.tri(sper$P)]
lowzuhe <- zuhe[lower.tri(zuhe)]
result<-data.frame(zuhe=lowzuhe,r = lowrel,p = lowp)
## 找到相关系数中 r>=0.8,p<=0.05) 的组合
index <- which((lowrel >=minre1 | lowrel <=minre2) & lowzuhe !=1 & lowp <= maxp)
return(result[index,])
}
## 找到相关性显著的变量
comcor(Iris,minre1 = 0.8,minre2 = -0.8,maxp = 0.05,type = "pearson")
## zuhe r p
## 2 PetalLengthCm~SepalLengthCm 0.8717542 0
## 3 PetalWidthCm~SepalLengthCm 0.8179536 0
## 6 PetalWidthCm~PetalLengthCm 0.9627571 0
单因素方差分析
双因素方差分析
## 单因素方差分析
Iris <- read.csv("data/chap4/Iris.csv",header = TRUE)
colnames(Iris)
## [1] "Id" "SepalLengthCm" "SepalWidthCm" "PetalLengthCm"
## [5] "PetalWidthCm" "Species"
## 比较鸢尾花数据集在SepalWidthCm上的均值差异
boxplot(SepalWidthCm~Species,Iris)
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
## 可视化不同类数据的均值
par(family="STKaiti")
plotmeans(SepalWidthCm~Species,Iris,col = "red",
main = "")
## 进行单因素方差分析
# bartlett.test(SepalWidthCm~Species,Iris)
irisaov <- aov(SepalWidthCm~Species,Iris)
summary(irisaov)
## Df Sum Sq Mean Sq F value Pr(>F)
## Species 2 10.98 5.489 47.36 <2e-16 ***
## Residuals 147 17.04 0.116
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 检验结果说明了各组均值不等,可以进一步进行两两比较
## TukeyHSD()函数提供了对各组均值差异的成对检验
tky <- TukeyHSD(irisaov)
tky = as.data.frame(tky$Species)
tky$pair = rownames(tky)
# Plot pairwise TukeyHSD comparisons and color by significance level
ggplot(tky, aes(colour=cut(`p adj`, c(0, 0.01, 0.05, 1),
label=c("p<0.01","p<0.05","Non-Sig")))) +
theme_bw(base_family = "STKaiti",base_size = 16)+
geom_hline(yintercept=0, lty="11", colour="grey30",size = 1) +
geom_errorbar(aes(pair, ymin=lwr, ymax=upr), width=0.2,size = 1) +
geom_point(aes(pair, diff),size = 2) +
labs(colour="")+
theme(axis.text.x = element_text(size = 14))
## 双因素方差分析,不考虑交互作用
## examines the effects of Vitamin C on tooth growth in Guinea Pigs (ToothGrowth)
## 每种动物通过两种递送方法之一,橙汁或抗坏血酸(一种维生素C形式,编码为VC)
## 接受三种剂量水平的维生素C(0.5,1和2mg /天)中的一种。
data(ToothGrowth)
## 将浓度数据转化为因子变量
ToothGrowth$dose <- factor(ToothGrowth$dose,levels = c(0.5, 1, 2),
labels = c("D0.5", "D1", "D2"))
str(ToothGrowth)
## 'data.frame': 60 obs. of 3 variables:
## $ len : num 4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
## $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
## $ dose: Factor w/ 3 levels "D0.5","D1","D2": 1 1 1 1 1 1 1 1 1 1 ...
## 可视化数据
# 绘制多个分组的小提琴图
# Add error bars: mean_se
# (other values include: mean_sd, mean_ci, median_iqr, ....)
library("ggpubr")
## Loading required package: magrittr
ggviolin(ToothGrowth, x = "dose", y = "len", color = "supp",
add = "dotplot",palette = c("red", "blue"))
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## 双因素方差分析,不考虑交互作用
aov1 <- aov(len~dose+supp,data = ToothGrowth)
summary(aov1)
## Df Sum Sq Mean Sq F value Pr(>F)
## dose 2 2426.4 1213.2 82.81 < 2e-16 ***
## supp 1 205.4 205.4 14.02 0.000429 ***
## Residuals 56 820.4 14.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 可视化方差分析交互图
ggline(ToothGrowth, x = "dose", y = "len", color = "supp",
shape = "supp",add = "mean",palette = c("red", "blue"))
## 双因素方差分析,考虑交互作用
aov1 <- aov(len~dose*supp,data = ToothGrowth)
summary(aov1)
## Df Sum Sq Mean Sq F value Pr(>F)
## dose 2 2426.4 1213.2 92.000 < 2e-16 ***
## supp 1 205.4 205.4 15.572 0.000231 ***
## dose:supp 2 108.3 54.2 4.107 0.021860 *
## Residuals 54 712.1 13.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
testdata<- read.csv("data/chap4/原料.csv")
rownames(testdata) <- testdata$X
testdata$X <- NULL
testdata
## A B C
## 甲 52 64 24
## 已 60 59 52
## 丙 70 65 74
(result <- chisq.test(testdata))
##
## Pearson's Chi-squared test
##
## data: testdata
## X-squared = 15.375, df = 4, p-value = 0.003983
## 计算出期望值
result$expected
## A B C
## 甲 49.00 50.61538 40.38462
## 已 59.85 61.82308 49.32692
## 丙 73.15 75.56154 60.28846
## 使用马赛克图进行可视化
par(family = "STKaiti")
mosaicplot(testdata,main = "",color = TRUE)
library(vcd)
## Loading required package: grid
assocstats(as.matrix(testdata))
## X^2 df P(> X^2)
## Likelihood Ratio 16.101 4 0.0028871
## Pearson 15.375 4 0.0039834
##
## Phi-Coefficient : NA
## Contingency Coeff.: 0.169
## Cramer's V : 0.122
library(vcd)
library(MASS)
## 1973年按入学和性别划分的伯克利大学研究生院申请人数最多的6个学院的汇总数据。
data("UCBAdmissions")
UCBAdmissions
## , , Dept = A
##
## Gender
## Admit Male Female
## Admitted 512 89
## Rejected 313 19
##
## , , Dept = B
##
## Gender
## Admit Male Female
## Admitted 353 17
## Rejected 207 8
##
## , , Dept = C
##
## Gender
## Admit Male Female
## Admitted 120 202
## Rejected 205 391
##
## , , Dept = D
##
## Gender
## Admit Male Female
## Admitted 138 131
## Rejected 279 244
##
## , , Dept = E
##
## Gender
## Admit Male Female
## Admitted 53 94
## Rejected 138 299
##
## , , Dept = F
##
## Gender
## Admit Male Female
## Admitted 22 24
## Rejected 351 317
## A,B,C,D,E,F为6个学院
## 性别为男女
## Admit 为入学和被拒绝
## 使用马赛克图可视化数据
mosaic(~Admit+Dept+Gender,data=UCBAdmissions,shade = TRUE, legend = TRUE)
## 相互独立:判断Admit,Dept,Gender是否成对独立。
loglm(~Admit+Dept+Gender, data=UCBAdmissions)
## Call:
## loglm(formula = ~Admit + Dept + Gender, data = UCBAdmissions)
##
## Statistics:
## X^2 df P(> X^2)
## Likelihood Ratio 2097.671 16 0
## Pearson 2000.328 16 0
## P值小于0说明,三者之间不独立
## 在给定Dept的情况下判断是否入学和性别是否独立
## 条件独立:Admit与Gender无关,给定Dept.
loglm(~Admit+Dept+Gender+Admit*Dept+Gender*Dept, data=UCBAdmissions)
## Call:
## loglm(formula = ~Admit + Dept + Gender + Admit * Dept + Gender *
## Dept, data = UCBAdmissions)
##
## Statistics:
## X^2 df P(> X^2)
## Likelihood Ratio 21.73551 6 0.001351993
## Pearson 19.93841 6 0.002840164
## P值小于0说明,两者之间不独立
mosaic(~Admit+Gender,data=UCBAdmissions,shade = TRUE, legend = TRUE)
## 从马赛克图上可以看出入学的男性数量更高,而被拒绝的那女数量相当