## 读取数据
iris <- read.csv("data/chap3/Iris.csv")
## 可视化散点图
par(family = "STKaiti",pch = 17)
plot(iris$SepalLengthCm,iris$SepalWidthCm,
type = "p",col = "red",main = "散点图",
xlab = "SepalLengthCm",ylab = "SepalWidthCm")
generateRPointShapes<-function(){
oldPar<-par()
par(font=2, mar=c(0.5,0,0,0))
y=rev(c(rep(0.5,9),rep(1,9), rep(1.5,9)))
x=c(rep(1:9,3))
x = x[1:26]
y = y[1:26]
plot(x, y, pch = 0:25, cex=1.5, ylim=c(0,3), xlim=c(1,9.5),
axes=FALSE, xlab="", ylab="", bg="blue")
text(x, y, labels=0:25, pos=3)
par(mar=oldPar$mar,font=oldPar$font )
}
generateRPointShapes()
cl <- colors()
length(cl); cl[1:20]
## [1] 657
## [1] "white" "aliceblue" "antiquewhite" "antiquewhite1"
## [5] "antiquewhite2" "antiquewhite3" "antiquewhite4" "aquamarine"
## [9] "aquamarine1" "aquamarine2" "aquamarine3" "aquamarine4"
## [13] "azure" "azure1" "azure2" "azure3"
## [17] "azure4" "beige" "bisque" "bisque1"
## 可视化多个图像窗口
par(family = "STKaiti",mfrow=c(2,2))
layout(matrix(c(1,2,3,3),2,2,byrow = TRUE))
hist(iris$SepalLengthCm,breaks = 20,col = "lightblue",main = "直方图",xlab = "SepalLengthCm")
smoothScatter(iris$PetalLengthCm,iris$PetalWidthCm, nbin = 64,main = "散点图",xlab = "PetalLengthCm",ylab = "PetalWidthCm")
## 添加第3个图像
boxplot(SepalLengthCm~Species,data = iris,main = "箱线图",ylab = "SepalLengthCm")
ggplot2系列的包可以使用+来绘制图像
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(gridExtra)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:GGally':
##
## nasa
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## 可视化简单的图像
## 散点图
p1 <- ggplot(iris,aes(x = PetalLengthCm,y = PetalWidthCm))+
theme_bw(base_family = "STKaiti",base_size = 9)+
geom_point(aes(colour = Species))+
labs(title = "散点图")
p1
## 小提琴图
p2 <- ggplot(iris,aes(x = Species,y = SepalLengthCm))+
theme_gray(base_family = "STKaiti",base_size = 9)+
geom_violin(aes(fill = Species),show.legend = F)+
labs(title = "小提琴图")+
theme(plot.title = element_text(hjust = 0.5))
p2
p3 <- ggplot(iris,aes(SepalWidthCm))+
theme_minimal(base_family = "STKaiti",base_size = 9)+
geom_density(aes(colour = Species,fill = Species),alpha = 0.5)+
labs(title = "密度曲线")+
theme(plot.title = element_text(hjust = 0.5),
legend.position = c(0.8,0.8))
p3
p4 <- ggplot(iris,aes(x = SepalLengthCm,y = SepalWidthCm))+
theme_classic(base_family = "STKaiti",base_size = 9)+
geom_point(shape = 17)+
geom_density_2d(linemitre = 5)+
theme(plot.title = element_text(hjust = 0.5))+
ggtitle("二维密度曲线")
p4
## 将4副图放进一个图像中
grid.arrange(p1,p2,p3,p4,nrow = 2)
ggscatmat(data = iris[,2:6],columns = 1:4,color = "Species",alpha = 0.8)+
theme_bw(base_family = "STKaiti",base_size = 10)+
theme(plot.title = element_text(hjust = 0.5))+
ggtitle("矩阵散点图")
ggparcoord(data = iris[,2:6],columns = 1:4,
groupColumn = "Species",scale = "center")+
theme_bw(base_family = "STKaiti",base_size = 10)+
theme(plot.title = element_text(hjust = 0.5),
legend.position = "bottom")+
ggtitle("平行坐标图")+labs(x = "")
## 平滑的平行坐标图
ggparcoord(data = iris[,2:6],columns = 1:4,
groupColumn = "Species",scale = "globalminmax",
splineFactor = 50,order = c(4,1,2,3))+
theme_bw(base_family = "STKaiti",base_size = 10)+
theme(plot.title = element_text(hjust = 0.5),
legend.position = "bottom")+
ggtitle("平滑的平行坐标图")+labs(x = "")
热力图,直方图等
## 读取数据,数据融合
library(readr)
athlete_events <- read_csv("data/chap3/athlete_events.csv")
## Parsed with column specification:
## cols(
## ID = col_double(),
## Name = col_character(),
## Sex = col_character(),
## Age = col_double(),
## Height = col_double(),
## Weight = col_double(),
## Team = col_character(),
## NOC = col_character(),
## Games = col_character(),
## Year = col_double(),
## Season = col_character(),
## City = col_character(),
## Sport = col_character(),
## Event = col_character(),
## Medal = col_character()
## )
noc_regions <- read_csv("data/chap3/noc_regions.csv")
## Parsed with column specification:
## cols(
## NOC = col_character(),
## region = col_character(),
## notes = col_character()
## )
## 数据连接
athletedata <- inner_join(athlete_events,noc_regions[,1:2],by=c("NOC"="NOC"))
## 查看数据
summary(athletedata)
## ID Name Sex Age
## Min. : 1 Length:270767 Length:270767 Min. :10.00
## 1st Qu.: 34630 Class :character Class :character 1st Qu.:21.00
## Median : 68187 Mode :character Mode :character Median :24.00
## Mean : 68229 Mean :25.56
## 3rd Qu.:102066 3rd Qu.:28.00
## Max. :135571 Max. :97.00
## NA's :9462
## Height Weight Team NOC
## Min. :127.0 Min. : 25.00 Length:270767 Length:270767
## 1st Qu.:168.0 1st Qu.: 60.00 Class :character Class :character
## Median :175.0 Median : 70.00 Mode :character Mode :character
## Mean :175.3 Mean : 70.71
## 3rd Qu.:183.0 3rd Qu.: 79.00
## Max. :226.0 Max. :214.00
## NA's :60083 NA's :62785
## Games Year Season City
## Length:270767 Min. :1896 Length:270767 Length:270767
## Class :character 1st Qu.:1960 Class :character Class :character
## Mode :character Median :1988 Mode :character Mode :character
## Mean :1978
## 3rd Qu.:2002
## Max. :2016
##
## Sport Event Medal region
## Length:270767 Length:270767 Length:270767 Length:270767
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
head(athletedata)
## # A tibble: 6 x 16
## ID Name Sex Age Height Weight Team NOC Games Year Season City
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 1 A Di… M 24 180 80 China CHN 1992… 1992 Summer Barc…
## 2 2 A La… M 23 170 60 China CHN 2012… 2012 Summer Lond…
## 3 3 Gunn… M 24 NA NA Denm… DEN 1920… 1920 Summer Antw…
## 4 4 Edga… M 34 NA NA Denm… DEN 1900… 1900 Summer Paris
## 5 5 Chri… F 21 185 82 Neth… NED 1988… 1988 Winter Calg…
## 6 5 Chri… F 21 185 82 Neth… NED 1988… 1988 Winter Calg…
## # … with 4 more variables: Sport <chr>, Event <chr>, Medal <chr>, region <chr>
str(athletedata)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 270767 obs. of 16 variables:
## $ ID : num 1 2 3 4 5 5 5 5 5 5 ...
## $ Name : chr "A Dijiang" "A Lamusi" "Gunnar Nielsen Aaby" "Edgar Lindenau Aabye" ...
## $ Sex : chr "M" "M" "M" "M" ...
## $ Age : num 24 23 24 34 21 21 25 25 27 27 ...
## $ Height: num 180 170 NA NA 185 185 185 185 185 185 ...
## $ Weight: num 80 60 NA NA 82 82 82 82 82 82 ...
## $ Team : chr "China" "China" "Denmark" "Denmark/Sweden" ...
## $ NOC : chr "CHN" "CHN" "DEN" "DEN" ...
## $ Games : chr "1992 Summer" "2012 Summer" "1920 Summer" "1900 Summer" ...
## $ Year : num 1992 2012 1920 1900 1988 ...
## $ Season: chr "Summer" "Summer" "Summer" "Summer" ...
## $ City : chr "Barcelona" "London" "Antwerpen" "Paris" ...
## $ Sport : chr "Basketball" "Judo" "Football" "Tug-Of-War" ...
## $ Event : chr "Basketball Men's Basketball" "Judo Men's Extra-Lightweight" "Football Men's Football" "Tug-Of-War Men's Tug-Of-War" ...
## $ Medal : chr NA NA NA "Gold" ...
## $ region: chr "China" "China" "Denmark" "Denmark" ...
## 查看每个国家参与奥运会运动员人数
plotdata <- athletedata%>%group_by(region)%>%
summarise(number=n())%>%
arrange(desc(number))
## 可视化前40个人数多的国家的参与人数
ggplot(plotdata[1:30,],aes(x=reorder(region,number),y=number))+
theme_bw(base_family = "STKaiti")+
geom_bar(aes(fill=number),stat = "identity",show.legend = F)+
coord_flip()+
scale_fill_gradient(low = "#56B1F7", high = "#132B43")+
labs(x="地区",y="运动员人数",title="不同地区奥运会运动员人数")+
theme(axis.text.x = element_text(vjust = 0.5),
plot.title = element_text(hjust = 0.5))
library(RColorBrewer)
## 人数最多的30个地区,不同年份运动员人数变化
region30 <- athletedata%>%group_by(region)%>%
summarise(number=n())%>%
arrange(desc(number))
region30 <- region30$region[1:30]
## 不同性别下的,可视化人数最多的15个地区,不同年份运动员人数变化
plotdata <- athletedata[athletedata$region %in%region30[1:15],]%>%
group_by(region,Year,Sex)%>%
summarise(number=n())
ggplot(data=plotdata, aes(x=Year,y=region)) +
theme_bw(base_family = "STKaiti") +
geom_tile(aes(fill = number),colour = "white")+
scale_fill_gradientn(colours=rev(brewer.pal(10,"RdYlGn")))+
scale_x_continuous(breaks=unique( plotdata$Year)) +
theme(axis.text.x = element_text(angle = 90,vjust = 0.5))+
facet_wrap(~Sex,nrow = 2)
library(ggChernoff)
## 查看不同季节举办的的奥运会运动员人数变化
region6 <- c("USA","Germany","France" ,"UK","Russia","China")
index <- ((athletedata$region %in% region6)&(!is.na(athletedata$Medal))&(athletedata$Season=="Summer"))
plotdata <- athletedata[index,]
plotdata2 <- plotdata%>%group_by(Year,region)%>%
summarise(Medalnum=n())
ggplot(plotdata2,aes(x=Year,y=Medalnum))+
theme_bw(base_family = "STKaiti")+
geom_line()+
geom_chernoff(fill = 'goldenrod1')+
facet_wrap(~region,ncol = 2)+
labs(x="举办时间",y="奖牌数")
可视化动画
library(gganimate)
library(gifski)
## 可视化每个地区每年奖牌的获取情况
index <- (athletedata$region %in% region30[1:20]&(!is.na(athletedata$Medal)))
plotdata <- athletedata[index,]
plotdata2 <- plotdata%>%group_by(Year,region,Medal)%>%
summarise(Medalnum = n())
head(plotdata2)
## # A tibble: 6 x 4
## # Groups: Year, region [3]
## Year region Medal Medalnum
## <dbl> <chr> <chr> <int>
## 1 1896 Australia Bronze 1
## 2 1896 Australia Gold 2
## 3 1896 Austria Bronze 2
## 4 1896 Austria Gold 2
## 5 1896 Austria Silver 1
## 6 1896 France Bronze 2
plotdata2$Year <- as.integer(plotdata2$Year)
ggplot(plotdata2,aes(x=region,y=Medalnum,fill=Medal))+
theme_bw()+
geom_bar(stat = "identity",position = "stack")+
theme(axis.text.x = element_text(angle = 90,vjust = 0.5))+
scale_fill_brewer(palette="RdYlGn")+
transition_time(Year) +
labs(title = 'Year: {frame_time}')
##截取其中两个图片
p2 <- ggplot(plotdata2[plotdata2$Year == 2000,],aes(x=region,y=Medalnum,fill=Medal))+
theme_bw()+
geom_bar(stat = "identity",position = "stack")+
theme(axis.text.x = element_text(angle = 90,vjust = 0.5))+
scale_fill_brewer(palette="RdYlGn")+
labs(title = 'Year: 2000')
p2
p3 <- ggplot(plotdata2[plotdata2$Year == 1996,],aes(x=region,y=Medalnum,fill=Medal))+
theme_bw()+
geom_bar(stat = "identity",position = "stack")+
theme(axis.text.x = element_text(angle = 90,vjust = 0.5))+
scale_fill_brewer(palette="RdYlGn")+
labs(title = 'Year: 1996')
p3
library(treemap)
## 使用treemap 可视化数据
plotdata <- athletedata%>%
group_by(region,Sex)%>%
summarise(number=n())
## 计算奖牌数量
plotdata2 <- athletedata[!is.na(athletedata$Medal),]%>%
group_by(region,Sex)%>%
summarise(Medalnum=n())
## 合并数据
plotdata3 <- inner_join(plotdata2,plotdata,by=c("region", "Sex"))
treemap(plotdata3,index = c("Sex","region"),vSize = "number",
vColor = "Medalnum",type="value",palette="RdYlGn",
title = "不同性别下每个国家的运动员人数",fontfamily.title = "STKaiti",
title.legend = "奖牌数量",fontfamily.legend="STKaiti")
可视化美国机场之间的联系
library(maps)
library(geosphere)
## 读取飞机航线的数据
usaairline <- read.csv("data/chap3/usaairline.csv")
airportusa <- read.csv("data/chap3/airportusa.csv")
head(airportusa)
## AirportID Name City Country IATA ICAO
## 1 3411 Barter Island LRRS Airport Barter Island United States BTI PABA
## 2 3413 Cape Lisburne LRRS Airport Cape Lisburne United States LUR PALU
## 3 3414 Point Lay LRRS Airport Point Lay United States PIZ PPIZ
## 4 3415 Hilo International Airport Hilo United States ITO PHTO
## 5 3416 Orlando Executive Airport Orlando United States ORL KORL
## 6 3417 Bettles Airport Bettles United States BTT PABT
## Latitude Longitude Altitude Timezone DST Tzbatabase Type
## 1 70.1340 -143.5820 2 -9 A America/Anchorage airport
## 2 68.8751 -166.1100 16 -9 A America/Anchorage airport
## 3 69.7329 -163.0050 22 -9 A America/Anchorage airport
## 4 19.7214 -155.0480 38 -10 N Pacific/Honolulu airport
## 5 28.5455 -81.3329 113 -5 A America/New_York airport
## 6 66.9139 -151.5290 647 -9 A America/Anchorage airport
## Source
## 1 OurAirports
## 2 OurAirports
## 3 OurAirports
## 4 OurAirports
## 5 OurAirports
## 6 OurAirports
head(usaairline)
## destination.apirport source.airport Latitude.x Longitude.x Latitude.y
## 1 ABE MYR 33.6797 -78.9283 40.6521
## 2 ABE CLT 35.2140 -80.9431 40.6521
## 3 ABE DTW 42.2124 -83.3534 40.6521
## 4 ABE PIE 27.9102 -82.6874 40.6521
## 5 ABE SFB 28.7776 -81.2375 40.6521
## 6 ABE PHL 39.8719 -75.2411 40.6521
## Longitude.y
## 1 -75.4408
## 2 -75.4408
## 3 -75.4408
## 4 -75.4408
## 5 -75.4408
## 6 -75.4408
map("state",col="palegreen", fill=TRUE, bg="lightblue", lwd=0.1)
# 添加起点的位置
points(x=airportusa$Longitude, y=airportusa$Latitude, pch=19, cex=0.4,col="tomato")
col.1 <- adjustcolor("orange", alpha=0.4)
## 添加边
for(i in 1:nrow(usaairline)) {
node1 <- usaairline[i,c("Latitude.x","Longitude.x")]
node2 <- usaairline[i,c("Latitude.y","Longitude.y")]
arc <- gcIntermediate( c(node1$Longitude.x, node1$Latitude.x),
c(node2$Longitude.y, node2$Latitude.y),
n=1000, addStartEnd=TRUE )
lines(arc, col=col.1, lwd=0.2)
}
igraph包可视化社交网络图
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
## 读取顶点和边的数据
vertexdata <- read.csv("data/chap3/vertex.csv")
edgedata <- read.csv("data/chap3/edge.csv")
## Country:国家,airportnumber:机场数量,vtype:节点的类型
head(vertexdata)
## Country airportnumber vtype
## 1 Algeria 43 2
## 2 Australia 296 3
## 3 Austria 19 1
## 4 Bahamas 33 2
## 5 Belgium 24 2
## 6 Brazil 234 3
## Country.x,Country.y :连线的两个点,connectnumber:连接的数量, etype:边的类型
head(edgedata)
## Country.y Country.x connectnumber etype
## 1 Algeria France 67 1
## 2 Australia New Zealand 64 1
## 3 Austria Germany 67 1
## 4 Bahamas United States 72 1
## 5 Belgium Spain 60 1
## 6 Brazil United States 57 1
## 定义网络图
g <- graph_from_data_frame(edgedata,vertices = vertexdata,directed = TRUE)
## 添加边的宽度
E(g)$width <- log10(E(g)$connectnumber)
# 生成节点和边的颜色
colrs <- c("gray50", "tomato", "gold")
V(g)$color <- colrs[V(g)$vtype]
E(g)$color <- colrs[E(g)$etype]
# plot 4个图,2 rows, 2 columns,每个图使用不同的图像样式
par(mfrow=c(2,2), mar=c(0,0,0,0),cex = 1)
plot(g, layout = layout_in_circle(g),
edge.arrow.size=0.4,
vertex.size = 10*log10(V(g)$airportnumber),
vertex.label.cex = 0.6)
plot(g, layout = layout_with_fr(g),
edge.arrow.size=0.4,
vertex.size = 10*log10(V(g)$airportnumber),
vertex.label.cex = 0.6)
plot(g, layout = layout_on_sphere(g),
edge.arrow.size=0.4,
vertex.size = 10*log10(V(g)$airportnumber),
vertex.label.cex = 0.6)
plot(g, layout = layout_randomly(g),
edge.arrow.size=0.4,
vertex.size = 10*log10(V(g)$airportnumber),
vertex.label.cex = 0.6)
## 可视化集合之间的关系——韦恩图
library(VennDiagram)
## Loading required package: grid
## Loading required package: futile.logger
## VennDiagram包最多可以绘制5个集合的韦恩图,这里绘制4个数组的韦恩图
vcol <- c("red","blue","green","DeepPink")
T<-venn.diagram(list(First =c(1:30),
Second=seq(1,50,by = 2),
Third =seq(2,50,by = 2),
Four = c(20,70)),
filename = NULL,lwd = 0.5,
fill = vcol,alpha = 0.5,margin = 0.1)
grid.draw(T)
使用UpSetR包可视化多个集合的交集情况
library(UpSetR)
## 数据准备
one <- 1:100
two <- seq(1,200,by = 2)
three <- seq(10,300,by = 5)
four <- seq(2,400,by = 4)
five <- seq(10,500,by = 10)
six <- seq(3,400,by = 3)
## 1: 将6个集合的并集计算出来,
all <- unique(c(one,two,three,four,five,six))
## 建立一个数据表格
plotdata <- data.frame(matrix(nrow = length(all),ncol = 7))
colnames(plotdata) <-c("element","one","two","three","four",
"five","six")
## 2:数据表第一列是6个集合的并集的所有元素
plotdata[,1] <- all
## 3:其它列中的对应行,如果包含那一行的元素,则取值为1,否则取值为0
for (i in 1:length(all)) {
plotdata[i,2] <- ifelse(all[i] %in% one,1,0)
plotdata[i,3] <- ifelse(all[i] %in% two,1,0)
plotdata[i,4] <- ifelse(all[i] %in% three,1,0)
plotdata[i,5] <- ifelse(all[i] %in% four,1,0)
plotdata[i,6] <- ifelse(all[i] %in% five,1,0)
plotdata[i,7] <- ifelse(all[i] %in% six,1,0)
}
## 查看数据
head(plotdata)
## element one two three four five six
## 1 1 1 1 0 0 0 0
## 2 2 1 0 0 1 0 0
## 3 3 1 1 0 0 0 1
## 4 4 1 0 0 0 0 0
## 5 5 1 1 0 0 0 0
## 6 6 1 0 0 1 0 1
## 使用该数据表进行可视化
## 可视化6个集合的交并情况
upset(plotdata,
sets = c("one","two","three","four","five","six"),
nintersects = 40, ## 默认显示前40个交集
order.by = "freq", ## 根据频数排序
## 设置主条形图
matrix.color = "black", ## 数据矩阵的颜色
main.bar.color = "red",# 主要条形图的颜色
##设置集合条形图
sets.bar.color = "tomato",
## 设置矩阵点图
point.size = 2.5,line.size = 0.5,
## 矩阵点图和条形图的比例
mb.ratio = c(0.65, 0.35))
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:igraph':
##
## groups
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(plot3D)
## 使用plot3D包绘制3D图像
x <- y <- seq(0,10,by = 0.5)
## 生成网格数据并计算Z
xy <- mesh(x,y)
z <- sin(xy$x) + cos(xy$y) + sin(xy$x) * cos(xy$y)
par(mfrow = c(1,2))
hist3D(x,y,z,phi = 45, theta = 45,space = 0.1,colkey = F,bty = "g")
surf3D(xy$x,xy$y,z,colkey = F,border = "black",bty = "b2")
## 使用plotly包绘制3D图像
plot_ly(x = xy$x, y = xy$y, z = z,showscale = FALSE)%>%
add_surface()