Set a unified plotting style for ggplot2

library(ggplot2)
theme_set(theme_bw(base_family = "STKaiti"))

11.1: Support Vector Machine Classification

Breast Cancer Wisconsin (Diagnostic) Data Set

Predict whether the cancer is benign or malignant

https://www.kaggle.com/uciml/breast-cancer-wisconsin-data

library(e1071)
library(Metrics)
library(readr)
library(gmodels)
## Read the data
Cancer <- read_csv("data/chap11/Breast Cancer Wisconsin.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   diagnosis = col_character()
## )
## See spec(...) for full column specifications.
Cancer$diagnosis <- as.factor(Cancer$diagnosis)
head(Cancer)
## # A tibble: 6 x 32
##       id diagnosis radius_mean texture_mean perimeter_mean area_mean
##    <dbl> <fct>           <dbl>        <dbl>          <dbl>     <dbl>
## 1 8.42e5 M                18.0         10.4          123.      1001 
## 2 8.43e5 M                20.6         17.8          133.      1326 
## 3 8.43e7 M                19.7         21.2          130       1203 
## 4 8.43e7 M                11.4         20.4           77.6      386.
## 5 8.44e7 M                20.3         14.3          135.      1297 
## 6 8.44e5 M                12.4         15.7           82.6      477.
## # … with 26 more variables: smoothness_mean <dbl>, compactness_mean <dbl>,
## #   concavity_mean <dbl>, `concave points_mean` <dbl>, symmetry_mean <dbl>,
## #   fractal_dimension_mean <dbl>, radius_se <dbl>, texture_se <dbl>,
## #   perimeter_se <dbl>, area_se <dbl>, smoothness_se <dbl>,
## #   compactness_se <dbl>, concavity_se <dbl>, `concave points_se` <dbl>,
## #   symmetry_se <dbl>, fractal_dimension_se <dbl>, radius_worst <dbl>,
## #   texture_worst <dbl>, perimeter_worst <dbl>, area_worst <dbl>,
## #   smoothness_worst <dbl>, compactness_worst <dbl>, concavity_worst <dbl>,
## #   `concave points_worst` <dbl>, symmetry_worst <dbl>,
## #   fractal_dimension_worst <dbl>
summary(Cancer)
##        id            diagnosis  radius_mean      texture_mean  
##  Min.   :     8670   B:357     Min.   : 6.981   Min.   : 9.71  
##  1st Qu.:   869218   M:212     1st Qu.:11.700   1st Qu.:16.17  
##  Median :   906024             Median :13.370   Median :18.84  
##  Mean   : 30371831             Mean   :14.127   Mean   :19.29  
##  3rd Qu.:  8813129             3rd Qu.:15.780   3rd Qu.:21.80  
##  Max.   :911320502             Max.   :28.110   Max.   :39.28  
##  perimeter_mean     area_mean      smoothness_mean   compactness_mean 
##  Min.   : 43.79   Min.   : 143.5   Min.   :0.05263   Min.   :0.01938  
##  1st Qu.: 75.17   1st Qu.: 420.3   1st Qu.:0.08637   1st Qu.:0.06492  
##  Median : 86.24   Median : 551.1   Median :0.09587   Median :0.09263  
##  Mean   : 91.97   Mean   : 654.9   Mean   :0.09636   Mean   :0.10434  
##  3rd Qu.:104.10   3rd Qu.: 782.7   3rd Qu.:0.10530   3rd Qu.:0.13040  
##  Max.   :188.50   Max.   :2501.0   Max.   :0.16340   Max.   :0.34540  
##  concavity_mean    concave points_mean symmetry_mean    fractal_dimension_mean
##  Min.   :0.00000   Min.   :0.00000     Min.   :0.1060   Min.   :0.04996       
##  1st Qu.:0.02956   1st Qu.:0.02031     1st Qu.:0.1619   1st Qu.:0.05770       
##  Median :0.06154   Median :0.03350     Median :0.1792   Median :0.06154       
##  Mean   :0.08880   Mean   :0.04892     Mean   :0.1812   Mean   :0.06280       
##  3rd Qu.:0.13070   3rd Qu.:0.07400     3rd Qu.:0.1957   3rd Qu.:0.06612       
##  Max.   :0.42680   Max.   :0.20120     Max.   :0.3040   Max.   :0.09744       
##    radius_se        texture_se      perimeter_se       area_se       
##  Min.   :0.1115   Min.   :0.3602   Min.   : 0.757   Min.   :  6.802  
##  1st Qu.:0.2324   1st Qu.:0.8339   1st Qu.: 1.606   1st Qu.: 17.850  
##  Median :0.3242   Median :1.1080   Median : 2.287   Median : 24.530  
##  Mean   :0.4052   Mean   :1.2169   Mean   : 2.866   Mean   : 40.337  
##  3rd Qu.:0.4789   3rd Qu.:1.4740   3rd Qu.: 3.357   3rd Qu.: 45.190  
##  Max.   :2.8730   Max.   :4.8850   Max.   :21.980   Max.   :542.200  
##  smoothness_se      compactness_se      concavity_se     concave points_se 
##  Min.   :0.001713   Min.   :0.002252   Min.   :0.00000   Min.   :0.000000  
##  1st Qu.:0.005169   1st Qu.:0.013080   1st Qu.:0.01509   1st Qu.:0.007638  
##  Median :0.006380   Median :0.020450   Median :0.02589   Median :0.010930  
##  Mean   :0.007041   Mean   :0.025478   Mean   :0.03189   Mean   :0.011796  
##  3rd Qu.:0.008146   3rd Qu.:0.032450   3rd Qu.:0.04205   3rd Qu.:0.014710  
##  Max.   :0.031130   Max.   :0.135400   Max.   :0.39600   Max.   :0.052790  
##   symmetry_se       fractal_dimension_se  radius_worst   texture_worst  
##  Min.   :0.007882   Min.   :0.0008948    Min.   : 7.93   Min.   :12.02  
##  1st Qu.:0.015160   1st Qu.:0.0022480    1st Qu.:13.01   1st Qu.:21.08  
##  Median :0.018730   Median :0.0031870    Median :14.97   Median :25.41  
##  Mean   :0.020542   Mean   :0.0037949    Mean   :16.27   Mean   :25.68  
##  3rd Qu.:0.023480   3rd Qu.:0.0045580    3rd Qu.:18.79   3rd Qu.:29.72  
##  Max.   :0.078950   Max.   :0.0298400    Max.   :36.04   Max.   :49.54  
##  perimeter_worst    area_worst     smoothness_worst  compactness_worst
##  Min.   : 50.41   Min.   : 185.2   Min.   :0.07117   Min.   :0.02729  
##  1st Qu.: 84.11   1st Qu.: 515.3   1st Qu.:0.11660   1st Qu.:0.14720  
##  Median : 97.66   Median : 686.5   Median :0.13130   Median :0.21190  
##  Mean   :107.26   Mean   : 880.6   Mean   :0.13237   Mean   :0.25427  
##  3rd Qu.:125.40   3rd Qu.:1084.0   3rd Qu.:0.14600   3rd Qu.:0.33910  
##  Max.   :251.20   Max.   :4254.0   Max.   :0.22260   Max.   :1.05800  
##  concavity_worst  concave points_worst symmetry_worst   fractal_dimension_worst
##  Min.   :0.0000   Min.   :0.00000      Min.   :0.1565   Min.   :0.05504        
##  1st Qu.:0.1145   1st Qu.:0.06493      1st Qu.:0.2504   1st Qu.:0.07146        
##  Median :0.2267   Median :0.09993      Median :0.2822   Median :0.08004        
##  Mean   :0.2722   Mean   :0.11461      Mean   :0.2901   Mean   :0.08395        
##  3rd Qu.:0.3829   3rd Qu.:0.16140      3rd Qu.:0.3179   3rd Qu.:0.09208        
##  Max.   :1.2520   Max.   :0.29100      Max.   :0.6638   Max.   :0.20750
table(Cancer$diagnosis)
## 
##   B   M 
## 357 212

Dimensionality reduction with principal component analysis

library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(ggplot2)
library(pheatmap)
## Scree plot to choose an appropriate number of principal components
parpca <- fa.parallel(Cancer[,3:32],fa = "pc")
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor extraction method.

## Parallel analysis suggests that the number of factors =  NA  and the number of components =  5
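## A quick cross-check on the suggested component count (a sketch, not part of
## the original analysis): inspect the cumulative variance explained by the
## raw principal components of the same 30 features
pca_raw <- prcomp(Cancer[,3:32], center = TRUE, scale. = TRUE)
round(cumsum(pca_raw$sdev^2/sum(pca_raw$sdev^2))[1:5], 3)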
## Principal component analysis, extracting the first 5 components
can_cor <- cor(Cancer[,3:32])
dim(can_cor)
## [1] 30 30
canpca <- principal(can_cor,nfactors = 5,rotate = "cluster")
## Visualize the correlation between each variable and the components
pheatmap(canpca$loadings,cluster_rows = F,cluster_cols = F)

## Use the PCA model to obtain the 5 component scores for the data set
can_score <- predict.psych(canpca,Cancer[,3:32])
dim(can_score)
## [1] 569   5
head(can_score)
##              RC1        RC5        RC2        RC3          RC4
## [1,]  2.14398826  2.6686188  1.2409385  0.1051650 -1.424708623
## [2,]  1.26818755 -0.4886897 -0.3731027 -1.0945307 -0.047042941
## [3,]  1.63678624  0.9139543  0.5164043 -0.4680318  0.164123014
## [4,] -0.07931035  5.1876659  2.0412375  0.9794555 -0.009439497
## [5,]  1.53865585 -0.1956127  0.4809143  0.1801097 -1.403098202
## [6,] -0.15095910  2.3711514  0.5010229 -0.4343232 -0.401721316
## Visualize the data distribution on the first two components
plotdata <- data.frame(PC1 = can_score[,1],PC2 = can_score[,2],
                       diagnosis = Cancer$diagnosis)

ggplot(plotdata,aes(x=PC1,y= PC2,colour = diagnosis,shape = diagnosis))+
  geom_point()+theme(legend.position = "top")

Support vector machine classification

## Use 70% of the data for training and 30% for testing
## To simplify visualization, only the first two principal components are used
set.seed(123)
index <- sample(nrow(can_score),round(0.7*nrow(can_score)))
cancerdata <- data.frame(can_score[,1:2])
cancerdata$diagnosis <- as.factor(Cancer$diagnosis)
colnames(cancerdata) <- c("PC1","PC2","y")
train_data <- cancerdata[index,]
test_data <- cancerdata[-index,]


## SVM classifier with a linear kernel
can_svm <- svm(y~.,data = train_data,kernel ="linear")
summary(can_svm)
## 
## Call:
## svm(formula = y ~ ., data = train_data, kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  63
## 
##  ( 31 32 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  B M
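## Because the kernel is linear, the separating line can be recovered in
## closed form from the fitted object; a small sketch using the coefs, SV and
## rho fields that e1071 stores on the model
w <- t(can_svm$coefs) %*% can_svm$SV  ## weight vector in the PC1/PC2 plane
b <- -can_svm$rho                     ## intercept: boundary is w.x + b = 0
w; b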
## Visualize the support vectors and decision boundary on the training set
plot(can_svm,train_data)

## Evaluate the model's predictions on the test set
test_pre <- as.character(predict(can_svm,test_data))
sprintf("base SVM model acc: %f",accuracy(test_data$y,test_pre))
## [1] "base SVM model acc: 0.959064"
table(test_data$y,test_pre)
##    test_pre
##      B  M
##   B 97  1
##   M  6 67
CrossTable(test_data$y,test_pre,prop.r=F, prop.c=F,prop.t=F, prop.chisq=F)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |-------------------------|
## 
##  
## Total Observations in Table:  171 
## 
##  
##              | test_pre 
##  test_data$y |         B |         M | Row Total | 
## -------------|-----------|-----------|-----------|
##            B |        97 |         1 |        98 | 
## -------------|-----------|-----------|-----------|
##            M |         6 |        67 |        73 | 
## -------------|-----------|-----------|-----------|
## Column Total |       103 |        68 |       171 | 
## -------------|-----------|-----------|-----------|
## 
## 
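## Accuracy alone can hide class-specific errors; a quick sketch deriving
## sensitivity and specificity from the confusion table above
cm <- table(test_data$y,test_pre)
cm["M","M"]/sum(cm["M",])  ## sensitivity for malignant: 67/73
cm["B","B"]/sum(cm["B",])  ## specificity: 97/98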

Parameter search

## Can the model's accuracy be improved?
## Tune the cost parameter to improve accuracy
set.seed(1)
svm_opt <- tune.svm(y~.,data = train_data,kernel ="linear",cost = seq(1,20,1))

## Visualize the parameter-search results
plot(svm_opt,main = "SVM best cost parameter")

svm_opt$best.parameters
##    cost
## 14   14
svm_opt$best.performance
## [1] 0.05794872
svmbest <- svm_opt$best.model
summary(svmbest)
## 
## Call:
## best.svm(x = y ~ ., data = train_data, cost = seq(1, 20, 1), kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  14 
## 
## Number of Support Vectors:  54
## 
##  ( 27 27 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  B M
## Evaluate the tuned model's predictions on the test set
test_pre <- as.character(predict(svmbest,test_data))
sprintf("best SVM model acc: %f",accuracy(test_data$y,test_pre))
## [1] "best SVM model acc: 0.964912"
table(test_data$y,test_pre)
##    test_pre
##      B  M
##   B 97  1
##   M  5 68
CrossTable(test_data$y,test_pre,prop.r=F, prop.c=F,prop.t=F, prop.chisq=F)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |-------------------------|
## 
##  
## Total Observations in Table:  171 
## 
##  
##              | test_pre 
##  test_data$y |         B |         M | Row Total | 
## -------------|-----------|-----------|-----------|
##            B |        97 |         1 |        98 | 
## -------------|-----------|-----------|-----------|
##            M |         5 |        68 |        73 | 
## -------------|-----------|-----------|-----------|
## Column Total |       102 |        69 |       171 | 
## -------------|-----------|-----------|-----------|
## 
## 
## For the linear kernel, different cost values change the number of support vectors; here tuning cost raised test accuracy slightly, from 0.959 to 0.965
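## A minimal sketch (not part of the original run) tracing test accuracy over
## a few cost values for the linear kernel; each value refits the model
sapply(c(1,5,10,14,20), function(cc){
  m <- svm(y~.,data = train_data,kernel = "linear",cost = cc)
  accuracy(test_data$y,as.character(predict(m,test_data)))
})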

Change the SVM classification kernel and examine its effect

## To improve accuracy further, search over more parameters
set.seed(1)
Optvm <- tune.svm(y~.,data = train_data,kernel ="radial",
                    cost = seq(1,20,1),gamma = seq(0.1,1,0.1))
plot(Optvm)

Optvm$best.parameters
##    gamma cost
## 15   0.5    2
Optvm$best.performance
## [1] 0.05794872
svmbest2 <- Optvm$best.model
summary(svmbest2)
## 
## Call:
## best.svm(x = y ~ ., data = train_data, gamma = seq(0.1, 1, 0.1), 
##     cost = seq(1, 20, 1), kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  2 
## 
## Number of Support Vectors:  71
## 
##  ( 35 36 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  B M
## Visualize the new SVM decision surface
plot(svmbest2,data = train_data)

## Evaluate the model's predictions on the test set
test_pre <- as.character(predict(svmbest2,test_data))
sprintf("best SVM model acc: %f",accuracy(test_data$y,test_pre))
## [1] "best SVM model acc: 0.947368"
table(test_data$y,test_pre)
##    test_pre
##      B  M
##   B 97  1
##   M  8 65
CrossTable(test_data$y,test_pre,prop.r=F, prop.c=F,prop.t=F, prop.chisq=F)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |-------------------------|
## 
##  
## Total Observations in Table:  171 
## 
##  
##              | test_pre 
##  test_data$y |         B |         M | Row Total | 
## -------------|-----------|-----------|-----------|
##            B |        97 |         1 |        98 | 
## -------------|-----------|-----------|-----------|
##            M |         8 |        65 |        73 | 
## -------------|-----------|-----------|-----------|
## Column Total |       105 |        66 |       171 | 
## -------------|-----------|-----------|-----------|
## 
## 
## The radial kernel yields a curved decision surface; note, however, that its test accuracy here (0.947) is slightly lower than the tuned linear kernel's (0.965)
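## The radial kernel computes K(x,x') = exp(-gamma*||x-x'||^2); a quick manual
## check of the formula at the tuned gamma = 0.5 for two training points
x1 <- unlist(train_data[1,c("PC1","PC2")])
x2 <- unlist(train_data[2,c("PC1","PC2")])
exp(-0.5*sum((x1-x2)^2))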

11.2: Outlier Detection with SVM

Detecting outliers in two-dimensional data

Detecting spam via outlier detection

## Outliers in two-dimensional data
library(ggplot2)
library(DMwR)
## Loading required package: lattice
## Loading required package: grid
## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(e1071)
library(Rlof)
## Loading required package: doParallel
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
# ## Generate random data (uses mvrnorm, so MASS must be loaded)
# set.seed(1234)
# samp1 <- mvrnorm(n = 200,mu=c(5,5),
#                  Sigma = matrix(c(1,0,0,1),nrow = 2,ncol = 2))
# samp1 <- samp1[samp1[,1] < 7,]
# samp2 <- mvrnorm(n = 3,mu=c(0,0),
#                  Sigma = matrix(c(5,0,0,5),nrow = 2,ncol = 2))
# samp3 <- mvrnorm(n = 2,mu=c(6,12),
#                  Sigma = matrix(c(1,0,0,2),nrow = 2,ncol = 2))
# samp4 <- mvrnorm(n = 2,mu=c(12,3),
#                  Sigma = matrix(c(3,0,0,4),nrow = 2,ncol = 2))
# samp5 <- mvrnorm(n = 1,mu=c(12,6),
#                  Sigma = matrix(c(1,0,0,1.5),nrow = 2,ncol = 2))
# samp6 <- mvrnorm(n = 2,mu=c(11,10),
#                  Sigma = matrix(c(2,0,0,2),nrow = 2,ncol = 2))
# samp7 <- mvrnorm(n = 3,mu=c(-1,10),
#                  Sigma = matrix(c(6,0,0,6),nrow = 2,ncol = 2))
# samp <- rbind(samp1,samp2,samp3,samp4,samp5,samp6,samp7)
# par(pty = c("s"))
# plot(samp[,1],samp[,2])
# ## Shuffle the samples into random order
# scatter2dim <- data.frame(x = samp[,1],y = samp[,2])
# index <- sample(nrow(scatter2dim),nrow(scatter2dim))
# scatter2dim <- scatter2dim[index,]
# plot(scatter2dim$x,scatter2dim$y)
# write.csv(scatter2dim,"data/chap11/outlierdata.csv",row.names = F)

## Read the outlier data
outlierdata <- read.csv("data/chap11/outlierdata.csv")
## Visualize the data distribution
qplot(outlierdata$x,outlierdata$y,xlab = "X",ylab = "Y",main = "Scatter plot")

## Identify outliers with the LOF method
lof_score <- lofactor(outlierdata,k=6)
outlierdata$lof_score <- lof_score
outlierdata$scorecut <- cut_width(lof_score,2)
## Visualize the LOF scores
ggplot(outlierdata,aes(x=x,y=y,colour=lof_score))+
  geom_point(aes(shape = scorecut))+
  ggtitle("LOF score")

## LOF scores can also be computed in parallel across cores, with a configurable distance metric
lof_score2 <- Rlof::lof(outlierdata[,c("x","y")],k=6,cores = 2,
                        method = "maximum")
outlierdata$lof_score2 <- lof_score2
outlierdata$scorecut2 <- cut_width(lof_score2,2)
## Visualize the LOF scores
ggplot(outlierdata,aes(x=x,y=y,colour=lof_score2))+
  geom_point(aes(shape = scorecut2))+
  ggtitle("LOF score")

## Use a one-class SVM to identify outliers
out_svm <- svm(x = outlierdata[,c("x","y")],y = NULL,
               type = "one-classification",nu = 0.08)
out_svm
## 
## Call:
## svm.default(x = outlierdata[, c("x", "y")], y = NULL, type = "one-classification", 
##     nu = 0.08)
## 
## 
## Parameters:
##    SVM-Type:  one-classification 
##  SVM-Kernel:  radial 
##       gamma:  0.5 
##          nu:  0.08 
## 
## Number of Support Vectors:  22
## Predict whether each point is an outlier
svmpre <- predict(out_svm,outlierdata[,c("x","y")])
outlierdata$isoutlier <- !svmpre
## Visualize the SVM detection results
ggplot(outlierdata,aes(x=x,y=y))+
  geom_point(aes(shape = isoutlier))+
  ggtitle("Outliers identified by one-class SVM")

Detecting spam with the one-class SVM outlier approach

library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(gmodels)
load("data/chap8/spam_cp.RData")
load("data/chap8/spam.RData")
## Build the TF-IDF matrix
controls = list(weighting = function(x) weightTfIdf(x,normalize = FALSE))
spam_dtm <- DocumentTermMatrix(spam_cp,control = controls)
## The document-term TF-IDF matrix is typically very sparse; removing unimportant terms eases the sparsity and speeds up computation
spam_dtm <- removeSparseTerms(spam_dtm,0.999)
spam_dtm
## <<DocumentTermMatrix (documents: 5572, terms: 1252)>>
## Non-/sparse entries: 36634/6939510
## Sparsity           : 99%
## Maximal term length: 19
## Weighting          : term frequency - inverse document frequency (tf-idf)
## Randomly split the data into a 75% training set and a 25% test set
set.seed(123)
index <- sample(nrow(spam),nrow(spam)*0.75)
spam_dtm2mat <- as.matrix(spam_dtm)
train_x <- spam_dtm2mat[index,]
train_y <- spam$label[index]
test_x <- spam_dtm2mat[-index,]
test_y <- spam$label[-index]

# train_x <- apply(train_x, 2, function(x) ifelse(x>0,1,0))
# test_x <- apply(test_x, 2, function(x) ifelse(x>0,1,0))
## Use a one-class SVM to flag outliers (treating spam as the outlier class)
out_svm <- svm(x = train_x,y = NULL,
               type = "one-classification",nu = 0.1)
## Predict whether each message is an outlier
svmpre <- predict(out_svm,test_x)
table(svmpre)
## svmpre
## FALSE  TRUE 
##   282  1111
table(test_y)
## test_y
##  ham spam 
## 1206  187
svmpre2 <- ifelse(svmpre == TRUE, "ham","spam")

CrossTable(test_y,svmpre2,prop.r = F,prop.t = F,prop.chisq = F)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1393 
## 
##  
##              | svmpre2 
##       test_y |       ham |      spam | Row Total | 
## -------------|-----------|-----------|-----------|
##          ham |      1005 |       201 |      1206 | 
##              |     0.905 |     0.713 |           | 
## -------------|-----------|-----------|-----------|
##         spam |       106 |        81 |       187 | 
##              |     0.095 |     0.287 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |      1111 |       282 |      1393 | 
##              |     0.798 |     0.202 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
table(test_y,svmpre2)
##       svmpre2
## test_y  ham spam
##   ham  1005  201
##   spam  106   81
sprintf("异常值检测识别精度为%.4f",accuracy(test_y,svmpre2))
## [1] "异常值检测识别精度为0.7796"

11.3: Support Vector Regression

Read the data, process the features, and split the data

library(e1071)
library(caret)
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
## 
##     precision, recall
library(Metrics)
library(readr)
library(gmodels)
library(outliers)
## 
## Attaching package: 'outliers'
## The following object is masked from 'package:psych':
## 
##     outlier
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(corrplot)
## corrplot 0.84 loaded
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
kc_house <- read_csv("data/chap11/kc_house_data.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   id = col_character(),
##   date = col_datetime(format = "")
## )
## See spec(...) for full column specifications.
kc_house$date <- year(kc_house$date)
kc_house <- kc_house[,2:21]
## Remove rows with outlying prices
outlierval <- boxplot.stats(kc_house$price)$out
kc_house <- kc_house[!(kc_house$price %in% outlierval),]
dim(kc_house)
## [1] 20467    20
## Remove rows with anomalous bedrooms values (== 0 or > 6)
index <- c(which(kc_house$bedrooms == 0),which(kc_house$bedrooms > 6))
kc_house <- kc_house[-index,]
dim(kc_house)
## [1] 20408    20
## Remove rows with outlying bathrooms values
outlierval <- boxplot.stats(kc_house$bathrooms)$out
kc_house <- kc_house[!(kc_house$bathrooms %in% outlierval),]
dim(kc_house)
## [1] 20316    20
## Remove rows with outlying sqft_lot values
outlierval <- boxplot.stats(kc_house$sqft_lot)$out
kc_house <- kc_house[!(kc_house$sqft_lot %in% outlierval),]
dim(kc_house)
## [1] 18112    20
## Drop the waterfront and view features
kc_house$waterfront <- NULL
kc_house$view <- NULL
## Remove rows with outlying condition values
outlierval <- boxplot.stats(kc_house$condition)$out
kc_house <- kc_house[!(kc_house$condition %in% outlierval),]
dim(kc_house)
## [1] 18091    18
## Remove rows with outlying sqft_basement values
outlierval <- boxplot.stats(kc_house$sqft_basement)$out
kc_house <- kc_house[!(kc_house$sqft_basement %in% outlierval),]
dim(kc_house)
## [1] 17687    18
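## The same boxplot-based filter is applied to several columns above; a
## hypothetical helper (not in the original code) wrapping the repeated pattern
drop_boxplot_outliers <- function(df,col){
  out <- boxplot.stats(df[[col]])$out
  df[!(df[[col]] %in% out),]
}
## e.g. kc_house <- drop_boxplot_outliers(kc_house,"sqft_lot")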
## Convert yr_built to the age of the house and drop negative values
kc_house$yr_built <- kc_house$date - kc_house$yr_built
index <- which(kc_house$yr_built < 0)
kc_house <- kc_house[-index,]
dim(kc_house)
## [1] 17675    18
## Recode yr_renovated as a binary flag: 1 if renovated, 0 otherwise
index <- which(kc_house$yr_renovated != 0)
kc_house$yr_renovated[index] <- 1
dim(kc_house)
## [1] 17675    18
kc_house$zipcode <- as.factor(kc_house$zipcode)
## Drop unneeded features
kc_house <- kc_house[,2:14]
## Visualize the distribution of sale prices
ggplot(kc_house)+geom_histogram(aes(price),fill = "lightblue",bins = 60)

## Visualize the correlations between features
kc_cor <- cor(kc_house[1:11])
corrplot.mixed(kc_cor,tl.col="black",tl.pos = "lt",
               tl.cex = 0.8,number.cex = 0.7)

head(kc_house)
## # A tibble: 6 x 13
##    price bedrooms bathrooms sqft_living sqft_lot floors condition grade
##    <dbl>    <dbl>     <dbl>       <dbl>    <dbl>  <dbl>     <dbl> <dbl>
## 1 221900        3      1           1180     5650      1         3     7
## 2 538000        3      2.25        2570     7242      2         3     7
## 3 180000        2      1            770    10000      1         3     6
## 4 604000        4      3           1960     5000      1         5     7
## 5 510000        3      2           1680     8080      1         3     8
## 6 257500        3      2.25        1715     6819      2         3     7
## # … with 5 more variables: sqft_above <dbl>, sqft_basement <dbl>,
## #   yr_built <dbl>, yr_renovated <dbl>, zipcode <fct>
summary(kc_house)
##      price            bedrooms       bathrooms      sqft_living  
##  Min.   :  80000   Min.   :1.000   Min.   :0.500   Min.   : 370  
##  1st Qu.: 305000   1st Qu.:3.000   1st Qu.:1.500   1st Qu.:1360  
##  Median : 422500   Median :3.000   Median :2.000   Median :1780  
##  Mean   : 461056   Mean   :3.276   Mean   :2.004   Mean   :1871  
##  3rd Qu.: 579000   3rd Qu.:4.000   3rd Qu.:2.500   3rd Qu.:2290  
##  Max.   :1127000   Max.   :6.000   Max.   :4.000   Max.   :5050  
##                                                                  
##     sqft_lot         floors        condition         grade       
##  Min.   :  520   Min.   :1.000   Min.   :2.000   Min.   : 3.000  
##  1st Qu.: 4800   1st Qu.:1.000   1st Qu.:3.000   1st Qu.: 7.000  
##  Median : 7110   Median :1.000   Median :3.000   Median : 7.000  
##  Mean   : 7139   Mean   :1.482   Mean   :3.405   Mean   : 7.451  
##  3rd Qu.: 9122   3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.: 8.000  
##  Max.   :18224   Max.   :3.500   Max.   :5.000   Max.   :12.000  
##                                                                  
##    sqft_above   sqft_basement       yr_built       yr_renovated    
##  Min.   : 370   Min.   :   0.0   Min.   :  0.00   Min.   :0.00000  
##  1st Qu.:1140   1st Qu.:   0.0   1st Qu.: 16.00   1st Qu.:0.00000  
##  Median :1460   Median :   0.0   Median : 42.00   Median :0.00000  
##  Mean   :1637   Mean   : 233.5   Mean   : 44.09   Mean   :0.03525  
##  3rd Qu.:1990   3rd Qu.: 470.0   3rd Qu.: 64.00   3rd Qu.:0.00000  
##  Max.   :5050   Max.   :1250.0   Max.   :115.00   Max.   :1.00000  
##                                                                    
##     zipcode     
##  98103  :  579  
##  98115  :  544  
##  98117  :  530  
##  98052  :  498  
##  98034  :  496  
##  98038  :  494  
##  (Other):14534
dim(kc_house)
## [1] 17675    13
anyNA(kc_house)
## [1] FALSE
write.csv(kc_house,"data/chap11/kc_house_clean.csv",row.names = FALSE)
## Split the data
set.seed(123)
index <- sample(nrow(kc_house),round(0.7*nrow(kc_house)))
train_data <- kc_house[index,]
test_data <- kc_house[-index,]
dim(train_data)
## [1] 12372    13
dim(test_data)
## [1] 5303   13

Support vector regression model

## Support vector regression model
system.time(
  svmreg <- svm(price~.,data = train_data,kernel = "radial")
)
##    user  system elapsed 
##  11.085   0.081  11.174
summary(svmreg)
## 
## Call:
## svm(formula = price ~ ., data = train_data, kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.01234568 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  8917
## Error on the training set
train_pre <- predict(svmreg,train_data)
train_mae <- mae(train_data$price,train_pre)
sprintf("训练集上的绝对值误差: %f",train_mae)
## [1] "训练集上的绝对值误差: 56347.919309"
test_pre <- predict(svmreg,test_data)
test_mae <- mae(test_data$price,test_pre)
sprintf("测试集上的绝对值误差: %f",test_mae)
## [1] "测试集上的绝对值误差: 58359.507538"

Using parameter search to find a better model

## This block takes a long time to run, so it is commented out here
#############################################
# set.seed(234)
# tunecontrols <- tune.control(sampling = "bootstrap",boot.size = 0.8)
# system.time(

#   svmregopt <- tune(svm,price~.,data = train_data,kernel = "radial",
#                     ranges = list(gamma = seq(0.01,0.1,0.02),
#                                   cost= seq(11,20,2)),
#                     tunecontrol = tunecontrols)
# )
## Parameters found by the search
# plot(svmregopt,main = "Performance of svm regression")
# svmregopt$best.parameters
# svmregopt$best.performance
#############################################

## The best parameters found were cost = 19 and gamma = 0.01

## Refit the SVM regression with the best parameters found
system.time(
  svmregbest <- svm(price~.,data = train_data,kernel = "radial",
                    cost=19,gamma=0.01)
)
##    user  system elapsed 
##  17.398   0.102  17.512
## Error on the training set
train_pre <- predict(svmregbest,train_data)
train_mae <- mae(train_data$price,train_pre)
sprintf("训练集上的绝对值误差: %f",train_mae)
## [1] "训练集上的绝对值误差: 48616.775325"
test_pre <- predict(svmregbest,test_data)
test_mae <- mae(test_data$price,test_pre)
sprintf("测试集上的绝对值误差: %f",test_mae)
## [1] "测试集上的绝对值误差: 52979.473453"
## Compared with the default SVR model, the tuned model reduces test MAE by more than 5000
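## MAE can be complemented with other error measures from the Metrics package;
## a quick sketch for the tuned model
rmse(test_data$price,test_pre)  ## root mean squared error
mape(test_data$price,test_pre)  ## mean absolute percentage error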

11.4: MLP Neural Network Classification

A single-hidden-layer neural network for classifying the breast cancer data

library(neuralnet)
library(NeuralNetTools)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:neuralnet':
## 
##     compute
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:GGally':
## 
##     nasa
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(psych)
library(Metrics)
library(ggplot2)
## Read the data
Cancer <- read_csv("data/chap11/Breast Cancer Wisconsin.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   diagnosis = col_character()
## )
## See spec(...) for full column specifications.
Cancer$diagnosis <- as.integer(as.factor(Cancer$diagnosis))-1
## Principal component analysis, extracting the first 5 components
can_cor <- cor(Cancer[,3:32])
canpca <- principal(can_cor,nfactors = 5,rotate = "cluster")
can_score <- predict.psych(canpca,Cancer[,3:32])
## Use 70% of the data for training and 30% for testing
set.seed(123)
index <- sample(nrow(can_score),round(0.7*nrow(can_score)))
train_data <- as.data.frame(can_score[index,])
train_data$y <- Cancer$diagnosis[index]
test_data <- as.data.frame(can_score[-index,])
test_data$y <- Cancer$diagnosis[-index]
head(train_data)
##          RC1        RC5         RC2        RC3         RC4 y
## 1  0.1011215 -0.8996009 -0.86740003  0.7000380  1.68584164 1
## 2 -0.3603427 -1.3415099 -0.53912539 -0.6659501  1.29527571 0
## 3 -0.7493287 -2.0504789 -1.11433658 -0.6384352  0.76219966 0
## 4 -1.2799680  0.6470246 -0.34020304 -0.1953687 -1.26199913 0
## 5  0.2618498  0.6918794  1.22911675 -0.2793941  0.51394461 1
## 6  0.4541706  1.8289565  0.05629867 -0.6531238  0.03885923 1
colnames(train_data)
## [1] "RC1" "RC5" "RC2" "RC3" "RC4" "y"
## Single-hidden-layer MLP model
sigmlp <- neuralnet(y~RC1+RC5+RC2+RC3+RC4,data = train_data,
                    hidden = c(10),linear.output = FALSE,
                    act.fct = "logistic",algorithm = "rprop+")

par(cex = 0.6)
plotnet(sigmlp,pos_col = "red", neg_col = "grey")

summary(sigmlp)
##                     Length Class      Mode    
## call                   7   -none-     call    
## response             398   -none-     numeric 
## covariate           1990   -none-     numeric 
## model.list             2   -none-     list    
## err.fct                1   -none-     function
## act.fct                1   -none-     function
## linear.output          1   -none-     logical 
## data                   6   data.frame list    
## exclude                0   -none-     NULL    
## net.result             1   -none-     list    
## weights                1   -none-     list    
## generalized.weights    1   -none-     list    
## startweights           1   -none-     list    
## result.matrix         74   -none-     numeric
## Accuracy on the test set
test_pre <- neuralnet::compute(sigmlp,test_data[,1:5])
test_pre <- as.vector(ifelse(test_pre$net.result >0.5,1,0))
accuracy(test_data$y,test_pre)
## [1] 0.9824561
## Train single-hidden-layer networks of varying size and compare their test accuracy
set.seed(123)
size <- seq(2,20,1)
netacc <- size
for (ii in 1:length(size)) {
  signet <- neuralnet(y~RC1+RC5+RC2+RC3+RC4,data = train_data,
                    hidden = c(size[ii]),linear.output = FALSE,
                    act.fct = "logistic",algorithm = "rprop+")
  test_pre <- neuralnet::compute(signet,test_data[,1:5])
  test_pre <- as.vector(ifelse(test_pre$net.result >0.5,1,0)) 
  netacc[ii] <- accuracy(test_data$y,test_pre)
}

data.frame(size = size,netacc = netacc) %>%
  ggplot(aes(x=size,y = netacc))+
  geom_line()+geom_point(colour="red")
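## A one-line sketch reading off the best-performing hidden-layer size
size[which.max(netacc)]; max(netacc)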

A multi-hidden-layer neural network for Fashion-MNIST classification

Using h2o.deeplearning: a feed-forward multilayer artificial neural network

https://www.kaggle.com/zalando-research/fashionmnist

library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
library(caret)
library(ggcorrplot)
library(RColorBrewer)
## Start and initialize an h2o instance with 4 threads and 8 GB of memory
h2o.init(nthreads=4,max_mem_size='8G')
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     /var/folders/dv/qd8nc5bx6xx3pxq_xx_j47240000gn/T//RtmpKr0MNP/h2o_sunlanxin_started_from_r.out
##     /var/folders/dv/qd8nc5bx6xx3pxq_xx_j47240000gn/T//RtmpKr0MNP/h2o_sunlanxin_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: . Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         1 seconds 576 milliseconds 
##     H2O cluster timezone:       Asia/Shanghai 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.26.0.2 
##     H2O cluster version age:    4 months and 28 days !!! 
##     H2O cluster name:           H2O_started_from_R_sunlanxin_hzc570 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   7.11 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.6.2 (2019-12-12)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is too old (4 months and 28 days)!
## Please download and install the latest version from http://h2o.ai/download/
## 因为数据量很大,为了节省计算时间,这里只使用部分数据集,完成训练和测试的工作
fashion <- read_csv("data/chap11/fashion-mnist_train.csv")
## Parsed with column specification:
## cols(
##   .default = col_double()
## )
## See spec(...) for full column specifications.
fashion_test <- read_csv("data/chap11/fashion-mnist_test.csv")
## Parsed with column specification:
## cols(
##   .default = col_double()
## )
## See spec(...) for full column specifications.
fashion$label <- as.factor(fashion$label)
fashion[2:785] <- fashion[2:785] / 255.0
fashion_test[2:785] <- fashion_test[2:785] / 255.0
dim(fashion)
## [1] 60000   785
dim(fashion_test)
## [1] 10000   785
table(fashion$label)
## 
##    0    1    2    3    4    5    6    7    8    9 
## 6000 6000 6000 6000 6000 6000 6000 6000 6000 6000
table(fashion_test$label)
## 
##    0    1    2    3    4    5    6    7    8    9 
## 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000
## Visualize a sample of the images
set.seed(123)
index <- sample(nrow(fashion),50)
par(mfrow = c(5,10),mai=c(0.05,0.05,0.05,0.05))
for(ii in seq_along(index)){
  im <- as.numeric(fashion[index[ii],2:785])
  im <- matrix(im,nrow=28,ncol = 28,byrow = F)
  im <- apply(apply(im, 1, rev),1,rev)
  image(im,col = gray(seq(0, 1, length = 256)),xaxt= "n", yaxt= "n")
}

## Split the data into an 80% training set and a 20% validation set
fashion_h2o <- as.h2o(fashion)
splits <- h2o.splitFrame(data = fashion_h2o, ratios = 0.8,seed = 1234)
train_data <- splits[[1]]
val_data <- splits[[2]]

dim(train_data)
## [1] 48009   785
dim(val_data)
## [1] 11991   785
## MLP classification model
mlpclf <- h2o.deeplearning(x = 2:785, y = 1, train_data, activation = "Tanh",
                           hidden=c(100,100,100,100),epochs = 5,
                           loss = "CrossEntropy",score_each_iteration = TRUE,
                           mini_batch_size = 500,validation_frame = val_data,
                           stopping_rounds=0,seed = 123)
summary(mlpclf)
## Model Details:
## ==============
## 
## H2OMultinomialModel: deeplearning
## Model Key:  DeepLearning_model_R_1577262350391_1 
## Status of Neuron Layers: predicting label, 10-class classification, multinomial distribution, CrossEntropy loss, 109,810 weights/biases, 1.4 MB, 245,594 training samples, mini-batch size 1
##   layer units    type dropout       l1       l2 mean_rate rate_rms momentum
## 1     1   784   Input  0.00 %       NA       NA        NA       NA       NA
## 2     2   100    Tanh  0.00 % 0.000000 0.000000  0.018767 0.049689 0.000000
## 3     3   100    Tanh  0.00 % 0.000000 0.000000  0.005727 0.001688 0.000000
## 4     4   100    Tanh  0.00 % 0.000000 0.000000  0.006707 0.002303 0.000000
## 5     5   100    Tanh  0.00 % 0.000000 0.000000  0.005533 0.004053 0.000000
## 6     6    10 Softmax      NA 0.000000 0.000000  0.002659 0.001812 0.000000
##   mean_weight weight_rms mean_bias bias_rms
## 1          NA         NA        NA       NA
## 2    0.001021   0.080721  0.002030 0.077328
## 3   -0.001409   0.128151 -0.003879 0.129459
## 4   -0.000937   0.117577 -0.004774 0.177014
## 5   -0.001043   0.117010  0.021648 0.165734
## 6   -0.040423   0.479030 -0.265369 0.126030
## 
## H2OMultinomialMetrics: deeplearning
## ** Reported on training data. **
## ** Metrics reported on temporary training frame with 9971 samples **
## 
## Training Set Metrics: 
## =====================
## 
## MSE: (Extract with `h2o.mse`) 0.0761862
## RMSE: (Extract with `h2o.rmse`) 0.2760185
## Logloss: (Extract with `h2o.logloss`) 0.2599514
## Mean Per-Class Error: 0.09293306
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##           0   1   2    3    4   5   6    7   8   9  Error          Rate
## 0       913   1   4   24    5   1  50    0   0   0 0.0852 =    85 / 998
## 1         4 993   0   17    1   0   0    0   0   0 0.0217 =  22 / 1,015
## 2        10   0 749    7  119   1  67    1   3   0 0.2173 =   208 / 957
## 3        26   2   4  921   19   0  20    0   1   0 0.0725 =    72 / 993
## 4         3   1  53   30  929   0  21    0   1   0 0.1050 = 109 / 1,038
## 5         0   0   0    0    0 924   0   42   3   4 0.0504 =    49 / 973
## 6       150   0  37   20   80   1 729    0   5   0 0.2867 = 293 / 1,022
## 7         0   0   0    0    0   4   0 1040   0  11 0.0142 =  15 / 1,055
## 8         5   0   3    3    3   2   7    3 947   0 0.0267 =    26 / 973
## 9         1   0   0    0    0   4   0   42   0 900 0.0496 =    47 / 947
## Totals 1112 997 850 1022 1156 937 894 1128 960 915 0.0929 = 926 / 9,971
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-10 Hit Ratios: 
##     k hit_ratio
## 1   1  0.907131
## 2   2  0.978939
## 3   3  0.993782
## 4   4  0.997192
## 5   5  0.998997
## 6   6  0.999498
## 7   7  1.000000
## 8   8  1.000000
## 9   9  1.000000
## 10 10  1.000000
## 
## 
## H2OMultinomialMetrics: deeplearning
## ** Reported on validation data. **
## ** Metrics reported on full validation frame **
## 
## Validation Set Metrics: 
## =====================
## 
## Extract validation frame with `h2o.getFrame("RTMP_sid_af88_5")`
## MSE: (Extract with `h2o.mse`) 0.09976002
## RMSE: (Extract with `h2o.rmse`) 0.3158481
## Logloss: (Extract with `h2o.logloss`) 0.3595032
## Mean Per-Class Error: 0.1201635
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,valid = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##           0    1    2    3    4    5    6    7    8    9  Error
## 0      1020    0   14   27    3    1   86    0    3    0 0.1161
## 1         4 1165    4   27    2    0    7    0    1    0 0.0372
## 2        33    0  929   10  147    0   95    0    4    0 0.2373
## 3        39    5    7 1113   37    1   16    0    1    0 0.0870
## 4         7    1   71   39 1012    1   58    0    6    0 0.1531
## 5         1    0    0    0    1 1133    0   67    2   16 0.0713
## 6       196    2   77   28  135    2  758    0   11    0 0.3730
## 7         0    0    0    0    0   10    0 1146    0   15 0.0213
## 8        10    0    8   11    7    6   12    8 1152    0 0.0511
## 9         0    0    0    0    0    7    0   57    0 1117 0.0542
## Totals 1310 1173 1110 1255 1344 1161 1032 1278 1180 1148 0.1206
##                    Rate
## 0      =    134 / 1,154
## 1      =     45 / 1,210
## 2      =    289 / 1,218
## 3      =    106 / 1,219
## 4      =    183 / 1,195
## 5      =     87 / 1,220
## 6      =    451 / 1,209
## 7      =     25 / 1,171
## 8      =     62 / 1,214
## 9      =     64 / 1,181
## Totals = 1,446 / 11,991
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,valid = TRUE)`
## =======================================================================
## Top-10 Hit Ratios: 
##     k hit_ratio
## 1   1  0.879410
## 2   2  0.966391
## 3   3  0.987657
## 4   4  0.993245
## 5   5  0.996331
## 6   6  0.998332
## 7   7  0.999083
## 8   8  0.999500
## 9   9  0.999917
## 10 10  1.000000
## 
## 
## 
## 
## Scoring History: 
##             timestamp   duration training_speed  epochs iterations      samples
## 1 2019-12-25 16:26:43  0.000 sec             NA 0.00000          0     0.000000
## 2 2019-12-25 16:26:54 12.073 sec   1246 obs/sec 0.25824          1 12398.000000
## 3 2019-12-25 16:27:05 22.503 sec   1250 obs/sec 0.51290          2 24624.000000
## 4 2019-12-25 16:27:15 32.844 sec   1257 obs/sec 0.76786          3 36864.000000
## 5 2019-12-25 16:27:25 43.263 sec   1262 obs/sec 1.02654          4 49283.000000
##   training_rmse training_logloss training_r2 training_classification_error
## 1            NA               NA          NA                            NA
## 2       0.38479          0.51683     0.98185                       0.18283
## 3       0.37234          0.50024     0.98301                       0.16809
## 4       0.34642          0.41711     0.98529                       0.14703
## 5       0.34626          0.43092     0.98530                       0.14783
##   validation_rmse validation_logloss validation_r2
## 1              NA                 NA            NA
## 2         0.38976            0.53994       0.98141
## 3         0.37900            0.52415       0.98242
## 4         0.35785            0.45120       0.98433
## 5         0.36240            0.47978       0.98393
##   validation_classification_error
## 1                              NA
## 2                         0.18297
## 3                         0.16963
## 4                         0.15537
## 5                         0.16004
## 
## ---
##              timestamp          duration training_speed  epochs iterations
## 17 2019-12-25 16:29:29  2 min 46.949 sec   1273 obs/sec 4.09321         16
## 18 2019-12-25 16:29:39  2 min 57.197 sec   1274 obs/sec 4.34971         17
## 19 2019-12-25 16:29:50  3 min  7.455 sec   1275 obs/sec 4.60509         18
## 20 2019-12-25 16:30:00  3 min 17.598 sec   1275 obs/sec 4.85842         19
## 21 2019-12-25 16:30:10  3 min 28.234 sec   1273 obs/sec 5.11558         20
## 22 2019-12-25 16:30:11  3 min 28.960 sec   1273 obs/sec 5.11558         20
##          samples training_rmse training_logloss training_r2
## 17 196511.000000       0.29082          0.29054     0.98963
## 18 208825.000000       0.27920          0.26486     0.99045
## 19 221086.000000       0.27384          0.25659     0.99081
## 20 233248.000000       0.27602          0.25995     0.99066
## 21 245594.000000       0.28806          0.27811     0.98983
## 22 245594.000000       0.27602          0.25995     0.99066
##    training_classification_error validation_rmse validation_logloss
## 17                       0.10200         0.32603            0.38912
## 18                       0.09618         0.31754            0.36290
## 19                       0.09116         0.32042            0.37049
## 20                       0.09287         0.31585            0.35950
## 21                       0.10260         0.32884            0.38735
## 22                       0.09287         0.31585            0.35950
##    validation_r2 validation_classification_error
## 17       0.98699                         0.12993
## 18       0.98766                         0.12167
## 19       0.98743                         0.12676
## 20       0.98779                         0.12059
## 21       0.98677                         0.13160
## 22       0.98779                         0.12059
## 
## Variable Importances: (Extract with `h2o.varimp`) 
## =================================================
## 
## Variable Importances: 
##   variable relative_importance scaled_importance percentage
## 1 pixel771            1.000000          1.000000   0.002565
## 2  pixel15            0.945291          0.945291   0.002425
## 3  pixel47            0.926730          0.926730   0.002377
## 4  pixel14            0.820528          0.820528   0.002105
## 5  pixel39            0.819039          0.819039   0.002101
## 
## ---
##     variable relative_importance scaled_importance percentage
## 779 pixel493            0.362333          0.362333   0.000929
## 780 pixel702            0.360451          0.360451   0.000924
## 781  pixel34            0.352331          0.352331   0.000904
## 782 pixel701            0.338989          0.338989   0.000869
## 783  pixel86            0.332298          0.332298   0.000852
## 784 pixel464            0.327093          0.327093   0.000839
h2o.mse(mlpclf)
## [1] 0.0761862
## Visualize how the model error converges
plot(mlpclf)

## Examine how the features of 5 images change as they pass through the MLP layers
index <- seq(200,nrow(fashion),6000)[1:5]
par(mfrow = c(5,5),mai=c(0.1,0.1,0.1,0.1))
for(ii in seq_along(index)){
  imone <- fashion_h2o[index[ii],2:785]
  imlay1 <- h2o.deepfeatures(mlpclf,imone,layer = 1)
  imlay2 <- h2o.deepfeatures(mlpclf,imone,layer = 2)
  imlay3 <- h2o.deepfeatures(mlpclf,imone,layer = 3)
  imlay4 <- h2o.deepfeatures(mlpclf,imone,layer = 4)
  imone <- matrix(imone,nrow = 28,ncol = 28)
  imone <- apply(apply(imone, 1, rev),1,rev) 
  image(imone,axes = FALSE,main = "image")
  image(matrix(imlay1,nrow = 10,ncol = 10),axes = FALSE,main = "layer1")
  image(matrix(imlay2,nrow = 10,ncol = 10),axes = FALSE,main = "layer2")
  image(matrix(imlay3,nrow = 10,ncol = 10),axes = FALSE,main = "layer3")
  image(matrix(imlay4,nrow = 10,ncol = 10),axes = FALSE,main = "layer4")
}

## Compute the model's predictions and performance on the test set
test_data <- as.h2o(fashion_test)
mlp_pre <- as.data.frame(h2o.predict(mlpclf, newdata = test_data))
head(mlp_pre)
##   predict           p0           p1           p2           p3           p4
## 1       0 8.825573e-01 1.203678e-06 1.778921e-03 1.775553e-03 2.529055e-04
## 2       1 1.449403e-05 9.998722e-01 1.012056e-06 9.725374e-05 2.594125e-06
## 3       2 4.532433e-02 1.355734e-04 8.328083e-01 4.993226e-04 1.903358e-03
## 4       0 5.127520e-01 1.256277e-05 1.503413e-01 1.447441e-02 1.189241e-03
## 5       4 2.124521e-02 3.261013e-04 1.334515e-01 1.978805e-01 3.295533e-01
## 6       6 8.384536e-02 5.816908e-05 1.659194e-02 8.954610e-03 2.609597e-03
##             p5           p6           p7           p8           p9
## 1 6.663631e-06 1.133134e-01 3.670872e-05 2.442098e-05 2.529408e-04
## 2 5.980163e-07 1.143874e-05 2.479186e-07 5.611547e-08 1.547157e-07
## 3 1.324675e-04 1.188200e-01 2.710366e-05 1.739044e-04 1.756738e-04
## 4 2.937390e-05 3.202861e-01 4.385079e-04 1.792104e-04 2.972488e-04
## 5 2.358185e-04 3.158153e-01 1.299353e-04 8.065782e-04 5.557758e-04
## 6 1.850168e-04 8.867538e-01 1.081465e-04 8.766160e-04 1.678942e-05
acc <- accuracy(fashion_test$label,mlp_pre$predict)
sprintf("MLP model acc: %f",acc)
## [1] "MLP model acc: 0.880400"
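
Metrics::accuracy is just the fraction of elementwise matches, so the value above can be reproduced directly; a sanity-check sketch (converting both sides to character avoids factor-level pitfalls when comparing an integer label vector to a factor):

## Should match the accuracy reported above
mean(as.character(fashion_test$label) == as.character(mlp_pre$predict))
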
## Confusion matrix heat map on the test set
confum <- caret::confusionMatrix(as.factor(fashion_test$label),mlp_pre$predict)
confumat <- as.data.frame(confum$table)
confumat[,1:2] <- apply(confumat[,1:2],2,as.integer)

ggplot(confumat,aes(x=Reference,y = Prediction))+
  geom_tile(aes(fill = Freq))+
  geom_text(aes(label = Freq))+
  scale_x_continuous(breaks = c(0:9))+
  scale_y_continuous(breaks = unique(confumat$Prediction),
                     trans = "reverse")+
  scale_fill_gradient2(low="darkblue", high="lightgreen", 
                       guide="colorbar")+
  ggtitle("MLP分类器在测试集结果")

11.5: MLP Neural Network Regression

library(RSNNS)
## Loading required package: Rcpp
## 
## Attaching package: 'RSNNS'
## The following objects are masked from 'package:caret':
## 
##     confusionMatrix, train
kc_house <- read_csv("data/chap11/kc_house_clean.csv")
## Parsed with column specification:
## cols(
##   price = col_double(),
##   bedrooms = col_double(),
##   bathrooms = col_double(),
##   sqft_living = col_double(),
##   sqft_lot = col_double(),
##   floors = col_double(),
##   condition = col_double(),
##   grade = col_double(),
##   sqft_above = col_double(),
##   sqft_basement = col_double(),
##   yr_built = col_double(),
##   yr_renovated = col_double(),
##   zipcode = col_double()
## )
## One-hot encode zipcode (decodeClassLabels returns a 0/1 indicator matrix with one column per zip code)
zipcode <- decodeClassLabels(kc_house$zipcode)
kc_house <- cbind(kc_house[1:12],zipcode)
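
decodeClassLabels expands a label vector into a binary indicator matrix with one column per distinct label, which is how the 70 zipcode dummy columns below arise; a minimal toy sketch (one row per input element):

## Toy example: each row gets a single 1 in its label's column
decodeClassLabels(c("a", "b", "a", "c"))
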
## Min-max normalize the feature columns to the 0-1 range
kc_house[,2:11] <- normalizeData(kc_house[,2:11],"0_1")
## Normalize price to 0-1 as well, and keep the normalization parameters for inverting the transform later
price <- normalizeData(kc_house$price,type = "0_1")
NormParameters <- getNormParameters(price)
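
The "0_1" transform is x' = (x - min(x)) / (max(x) - min(x)), and denormalizeData inverts it using the stored parameters; a minimal round trip on toy data:

## Toy round trip: the saved parameters recover the original scale
x  <- c(2, 4, 6)
xn <- normalizeData(x, type = "0_1")        ## 0, 0.5, 1
denormalizeData(xn, getNormParameters(xn))  ## recovers 2, 4, 6
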
head(kc_house)
##    price bedrooms bathrooms sqft_living  sqft_lot floors condition     grade
## 1 221900      0.4 0.1428571  0.17307692 0.2897650    0.0 0.3333333 0.4444444
## 2 538000      0.4 0.5000000  0.47008547 0.3796882    0.4 0.3333333 0.4444444
## 3 180000      0.2 0.1428571  0.08547009 0.5354722    0.0 0.3333333 0.3333333
## 4 604000      0.6 0.7142857  0.33974359 0.2530502    0.0 1.0000000 0.4444444
## 5 510000      0.4 0.4285714  0.27991453 0.4270221    0.0 0.3333333 0.5555556
## 6 257500      0.4 0.5000000  0.28739316 0.3557953    0.4 0.3333333 0.4444444
##   sqft_above sqft_basement  yr_built yr_renovated 98001 98002 98003 98004 98005
## 1 0.17307692         0.000 0.5130435            0     0     0     0     0     0
## 2 0.38461538         0.320 0.5478261            1     0     0     0     0     0
## 3 0.08547009         0.000 0.7130435            0     0     0     0     0     0
## 4 0.14529915         0.728 0.4260870            0     0     0     0     0     0
## 5 0.27991453         0.000 0.2434783            0     0     0     0     0     0
## 6 0.28739316         0.000 0.1652174            0     0     0     1     0     0
##   98006 98007 98008 98010 98011 98014 98019 98022 98023 98024 98027 98028 98029
## 1     0     0     0     0     0     0     0     0     0     0     0     0     0
## 2     0     0     0     0     0     0     0     0     0     0     0     0     0
## 3     0     0     0     0     0     0     0     0     0     0     0     1     0
## 4     0     0     0     0     0     0     0     0     0     0     0     0     0
## 5     0     0     0     0     0     0     0     0     0     0     0     0     0
## 6     0     0     0     0     0     0     0     0     0     0     0     0     0
##   98030 98031 98032 98033 98034 98038 98039 98040 98042 98045 98052 98053 98055
## 1     0     0     0     0     0     0     0     0     0     0     0     0     0
## 2     0     0     0     0     0     0     0     0     0     0     0     0     0
## 3     0     0     0     0     0     0     0     0     0     0     0     0     0
## 4     0     0     0     0     0     0     0     0     0     0     0     0     0
## 5     0     0     0     0     0     0     0     0     0     0     0     0     0
## 6     0     0     0     0     0     0     0     0     0     0     0     0     0
##   98056 98058 98059 98065 98070 98072 98074 98075 98077 98092 98102 98103 98105
## 1     0     0     0     0     0     0     0     0     0     0     0     0     0
## 2     0     0     0     0     0     0     0     0     0     0     0     0     0
## 3     0     0     0     0     0     0     0     0     0     0     0     0     0
## 4     0     0     0     0     0     0     0     0     0     0     0     0     0
## 5     0     0     0     0     0     0     1     0     0     0     0     0     0
## 6     0     0     0     0     0     0     0     0     0     0     0     0     0
##   98106 98107 98108 98109 98112 98115 98116 98117 98118 98119 98122 98125 98126
## 1     0     0     0     0     0     0     0     0     0     0     0     0     0
## 2     0     0     0     0     0     0     0     0     0     0     0     1     0
## 3     0     0     0     0     0     0     0     0     0     0     0     0     0
## 4     0     0     0     0     0     0     0     0     0     0     0     0     0
## 5     0     0     0     0     0     0     0     0     0     0     0     0     0
## 6     0     0     0     0     0     0     0     0     0     0     0     0     0
##   98133 98136 98144 98146 98148 98155 98166 98168 98177 98178 98188 98198 98199
## 1     0     0     0     0     0     0     0     0     0     1     0     0     0
## 2     0     0     0     0     0     0     0     0     0     0     0     0     0
## 3     0     0     0     0     0     0     0     0     0     0     0     0     0
## 4     0     1     0     0     0     0     0     0     0     0     0     0     0
## 5     0     0     0     0     0     0     0     0     0     0     0     0     0
## 6     0     0     0     0     0     0     0     0     0     0     0     0     0
dim(kc_house)
## [1] 17675    82
hist(price)

## Split the data into training and test sets
set.seed(123)
datasplist <- splitForTrainingAndTest(kc_house[,2:82],price,
                                      ratio = 0.3)
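
splitForTrainingAndTest returns a list with inputsTrain, targetsTrain, inputsTest and targetsTest; with ratio = 0.3, roughly 30% of rows are held out for testing. A quick size check (a sketch; the element names follow RSNNS's documented convention):

## Expect roughly a 70/30 split of the 17675 rows
nrow(datasplist$inputsTrain)
nrow(datasplist$inputsTest)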

## MLP regression model: four hidden layers (100, 50, 100, 50 units), trained with Rprop
system.time(
mlpreg <- mlp(datasplist$inputsTrain,datasplist$targetsTrain,maxit = 200,
              size = c(100,50,100,50),learnFunc = "Rprop",
              inputsTest=datasplist$inputsTest,
              targetsTest = datasplist$targetsTest,
              metric = "RSME")
)
##    user  system elapsed 
## 419.957   2.279 448.088
## Visualize the model's fit
plotIterativeError(mlpreg,main = "MLP Iterative Error")

plotRegressionError(datasplist$targetsTrain,mlpreg$fitted.values,
                    main="MLP train fit")

plotRegressionError(datasplist$targetsTest,mlpreg$fittedTestValues,
                    main="MLP test fit")

## Mean absolute error on the training set (prices mapped back to the original scale)
mlppre_train <- denormalizeData(mlpreg$fitted.values,NormParameters)
mlp_train <- denormalizeData(datasplist$targetsTrain,NormParameters)
train_mae <- mae(mlp_train,mlppre_train)
sprintf("训练集上的绝对值误差: %f",train_mae)
## [1] "训练集上的绝对值误差: 47971.117964"
## Mean absolute error on the test set
mlppre_test <- denormalizeData(mlpreg$fittedTestValues,NormParameters)
mlp_test <- denormalizeData(datasplist$targetsTest,NormParameters)
test_mae <- mae(mlp_test,mlppre_test)
sprintf("测试集上的绝对值误差: %f",test_mae)
## [1] "测试集上的绝对值误差: 55811.877215"
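
MAE treats all errors equally; RMSE, which penalizes large misses more strongly, is a useful companion figure and is available from the already-loaded Metrics package. A minimal sketch on the same denormalized vectors:

## RMSE on the denormalized test predictions
test_rmse <- rmse(mlp_test, mlppre_test)
sprintf("Test-set RMSE: %f",test_rmse)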