今天给大家分享如何使用R语言从百度百科深层抓取成龙的朋友圈,以及如何绘制成龙的社交图.
加载R包
library(rvest)
library(stringr)
一级关系网抓取
url='https://baike.baidu.com/item/%E6%88%90%E9%BE%99/71648?fr=aladdin'
web=read_html(url)
name=web%>%html_nodes('#slider_relations>ul>li>a>.name')%>%html_attr('title')
relation=web%>%html_nodes('#slider_relations>ul>li>a>.name')%>%html_text()
relation=(str_split(relation,name)%>%unlist)[(str_split(relation,name)%>%unlist
%>%str_length()) >0]
href=web%>%html_nodes('#slider_relations>ul>li> a')%>%
html_attr('href')
re=data.frame(sorce='成龙',name=name,
relation=relation,href=href,stringsAsFactors =F)
构建爬虫及清理函数
为减少代码冗余,构建两个函数.
fun = function(x){
web=read_html(x)
name=web%>%html_nodes('#slider_relations>ul>li>a>.name')%>%html_attr('title')
relation=web%>%html_nodes('#slider_relations>ul>li>a>.name')%>%html_text()
relation=(str_split(relation,name)%>%unlist)[(str_split(relation,name)%>%unlist
%>%str_length()) >0]
href=web%>%html_nodes('#slider_relations>ul>li> a')%>%html_attr('href')
re=data.frame(name=name,relation=relation,href=href,stringsAsFactors =F)
return(re)
}
fun1=function(x){
re=x
res=NULL
for(i in 1:length(re$href)){
if(
nrow(fun(re$href[i]))>0){
res=rbind(res,data.frame(sorce=re$name[i],fun(re$href[i]),stringsAsFactors =F))
}
}
re1=res[(res$name %in% unique(re$sorce))==0,]
return(re1)
}
数据读取及保存
#二级关系网
re1=fun1(re)
#三级关系网
re2=fun1(re1)
#四级关系网
re3=fun1(re2)
#五级关系网
re4=fun1(re3)
##数据保存
saveRDS(list(re=re,re1=re1,re2=re2,re3=re3,re4=re4),'D:\\chenglong.rds')
数据可视化
#加载数据
ldt=readRDS('D:\\chenglong.rds')
str(ldt)
str(ldt$re)
#一级关系网络图
library(GGally)
library(network)
library
(ggnetwork)
library(geomnet)
##点
nodes=data.frame(label=unique(c(ldt$re$sorce,ldt$re$name)),
grade=c(1,rep(2,9)),stringsAsFactors =F)
##边
links=ldt$re[,1:2]
mm.net network(links, directed = FALSE)
rownames(nodes) nodes$label
mm.net%v%'grade'= ifelse(network.
vertex.names(mm.net)=="周杰伦" ,1,2)
set.edge.attribute(mm.net,'labb',ldt$re$relation)
set.seed(10052016)
ggplot(data = ggnetwork(mm.net, layout = "kamadakawai"),
aes(x, y, xend = xend, yend = yend)) +
geom_edges(color = "grey50",linetype=3) +
geom_nodes(aes(colour = as.factor(grade)),
size = 5) +
geom_nodetext_repel(aes(colour = as.factor(grade),label = vertex.names),
size = 4, vjust = -0.6) +
geom_edgetext(aes(label = labb),color = "white", fill = "grey25")+
scale_colour_manual(values = c("pink", "gold")) +
xlim(c(-0.05, 1.05)) +
theme_blank() +
theme(legend
.position = "n")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
panel.background = element_rect(fill = "grey25"),
panel.grid = element_blank())
ldt$re$grade=2
ldt$re1$grade=3
ldt$re2$grade=4
ldt$re3$grade=5
ldt$re4$grade=6
dt4=rbind(ldt$re[,-4],ldt$re1[,-4],ldt$re2[,-4],ldt$re3[,-4],ldt$re4[,-
4])
#点
nodes4=data.frame(label=c('成龙',dt4[!duplicated(dt4$name),]$name) ,
grade=c(1,dt4[!duplicated(dt4$name),]$grade),stringsAsFactors =F)
nodes4=nodes4[-27,]
#边
links4=dt4[,1:2]
mm.net4 network(links4, directed = FALSE)
rownames(nodes4) nodes4$label
mm.net4%v
%'grade'= nodes4[network.vertex.names(mm.net4) ,'grade']
set.edge.attribute(mm.net4,'labb',dt4$relation)
set.seed(10052016)
p=ggplot(data = ggnetwork(mm.net4, layout = "kamadakawai"),
aes(x, y, xend = xend, yend = yend)) +
geom_edges(color = "grey80",linetype=3) +
geom_nodes(aes(colour =
as.factor(grade)), size = 1) +
geom_nodetext_repel(aes(colour = as.factor(grade),label = vertex.names),
size = 3, vjust = -0.6) +
#geom_edgetext(aes(label = labb),color = "white", fill = "grey25")+
scale_colour_brewer(palette = "Oranges") +
xlim(c(-0.05, 1.05)) +
theme_blank() +
theme(legend.position = "n")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
panel.background = element_rect(fill = "grey25"),
panel.grid = element_blank())
p
windows(height = 15,width = 20)
p+scale_colour_brewer(palette = "Oranges")
如需转载请联系EasyCharts团队!
微信后台回复“转载”即可!
【书籍推荐】
《Excel 数据之美--科学图表与商业图表的绘制》
【必备插件】
EasyCharts --
Excel图表插件
【网易云课堂】
Excel 商业图表修炼秘笈之基础篇
【手册获取】
国内首款-数据可视化参考手册:专业绘图必备