前言 :距离第一次听说生信已经十几年了,现在是邋遢大叔重新开始学代码,精力确实已不像从前,各位入坑还是要乘早。后来约莫在5年前,课题组当时有个RNA-Seq数据,lab meeting时听瑞典小哥在汇报DEGs筛选,当时感觉好是神奇。其实陆陆续续也 有过学习的念头,但在对自己的各种纵容下,想法又逐渐隐没。直到2月前,机缘巧合参加了生信技能树培训,才进一步强化了自己学习生信技术的信念。
几天前,曾老师在群里给我布置了一份学徒作业,比较不同流程(limma/voom,edgeR,DESeq2 )差异分析的区别,拟使用的数据集是TCGA-BRCA的counts值矩阵。作为非肿瘤口的生信新人,秉着无知者无畏的态度试了一试。以下是具体过程。
代码主要来源于小洁老师(不是我吹,听了小洁老师的课,傻子也能学会R代码)。
R包安装# R包太多,这里略了。 for (pkg in cran_packages){ if (! require (pkg,character.only=T ) ) { install.packages(pkg,ask = F ,update = F ) require (pkg,character.only=T ) } }# first prepare BioManager on CRAN if (!require ("BiocManager" )) install.packages("BiocManager" ,update = F ,ask = F )# use BiocManager to install for (pkg in Biocductor_packages){ if (! require (pkg,character.only=T ) ) { BiocManager::install(pkg,ask = F ,update = F ) require (pkg,character.only=T ) } }#前面的报错都先不要管。主要看这里 for (pkg in c(Biocductor_packages,cran_packages)){ require (pkg,character.only=T ) }#哪个报错,就回去安装哪个。如果你没有安装xx包,却提示你xx包不存在,这也正常,是因为复杂的依赖关系,缺啥补啥。 if (!require (tinyarray))devtools::install_local("tinyarray-master.zip" ,upgrade = F )
数据下载1.从网页选择数据,下载manifest文件
数据存放网站:https://portal.gdc.cancer.gov/
在Repository勾选自己需要的case和file类型。
2.使用gdc-client工具下载
因使用的是Rstudio-Server Rstudio_3.6.3_CentOS7,gdc-client的安装有点波折,解决方法参考https://my.oschina.net/shenweiyan/blog/4538119
#step1 options(stringsAsFactors = F )library (stringr) cancer_type="TCGA-BRCA" if (!dir.exists("clinical" ))dir.create("clinical" )if (!dir.exists("expdata" ))dir.create("expdata" ) dir()
#此代码块是在Terminal中使用 conda create -n gdc python=3.7source activate gdc git clone https://github.com/NCI-GDC/gdc-clientcd gdc-client pip install -r requirements.txt python setup.py install 2>&1 | tee -a install.log###安装完毕后开始下载数据 gdc-client download -m gdc_manifest.2020-12-17_clinic.txt -d clinical gdc-client download -m gdc_manifest.2020-12-17_exp.txt -d expdata
#下载结束后 length(dir("./clinical/" )) length(dir("./expdata/" ))
表达矩阵获得 1.整理临床信息library (XML) result "./clinical/00049989-fa21-48fb-8dda-710c0dd5932e/nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml") #单个运行 rootnode rootsize print(rootnode[1 ])#print(rootnode[2]) xmldataframe 2]) head(t(xmlToDataFrame(rootnode[2 ]))) xmls = dir("clinical/" ,pattern = "*.xml$" ,recursive = T ) #可层级 td = function (x){ result "clinical/",x)) rootnode xmldataframe 2]) return (t(xmldataframe)) } cl = lapply(xmls,td) cl_df cl_df[1 :3 ,1 :3 ] clinical = data.frame(cl_df) clinical[1 :4 ,1 :4 ]
2.整理表达矩阵批量读取所有的counts.gz文件。
count_files = dir("expdata/" ,pattern = "*.htseq.counts.gz$" ,recursive = T ) ex = function (x){ result "expdata/",x),row.names = 1 ,sep = "\t" ) return (result) }# head(ex("03aee74e-4e37-4a58-a720-c90e807d2f40/be5bc6a0-9720-47ac-953e-fa8d0c32cd82.htseq.counts.gz")) exp = lapply(count_files,ex) exp dim(exp) exp[1 :4 ,1 :4 ]
下载cart-json文件已将文件名与样本ID一一对应。
meta "metadata.cart.2020-12-17.json") colnames(meta) ids ids[[1 ]] class(ids[[1 ]][,1 ]) ID = sapply(ids,function (x){x[,2 ]}) file2id = data.frame(file_name = meta$file_name, ID = ID) head(file2id$file_name) head(count_files) count_files2 = stringr::str_split(count_files,"/" ,simplify = T )[,2 ] count_files2[1 ] %in % file2id$file_name file2id = file2id[match(count_files2,file2id$file_name),] colnames(exp) = file2id$ID exp[1 :4 ,1 :4 ]
3.过滤表达矩阵dim(exp) exp = exp[apply(exp, 1 , function (x) sum(x > 1 ) > 9 ), ] dim(exp) exp[1 :4 ,1 :4 ]# 过滤在至少在75%的样本中都有表达的基因 keep_exp 0 ) >= floor(0.75 *ncol(exp)) table(keep_exp) exp exp[1 :4 ,1 :4 ] dim(exp)
4.添加分组信息根据样本ID的第14-15位,给样本分组(tumor和normal)
table(str_sub(colnames(exp),14 ,15 )) group_list = ifelse(as.numeric(str_sub(colnames(exp),14 ,15 )) 10,'tumor' ,'normal' ) group_list = factor(group_list,levels = c("normal" ,"tumor" )) table(group_list) save(exp,clinical,group_list,cancer_type,file = paste0(cancer_type,"gdc.Rdata" ))
DEGs分析及对比#三大R包差异分析 if (!require (stringr))install.packages('stringr' )if (!require (ggplotify))install.packages("ggplotify" )if (!require (patchwork))install.packages("patchwork" )if (!require (cowplot))install.packages("cowplot" )if (!require (DESeq2))BiocManager::install('DESeq2' )if (!require (edgeR))BiocManager::install('edgeR' )if (!require (limma))BiocManager::install('limma' ) rm(list = ls()) load("TCGA-BRCAgdc.Rdata" ) table(group_list)#deseq2---- library (DESeq2) colData condition=group_list)if (!file.exists(paste0(cancer_type,"dd.Rdata" ))){ dds countData = exp, colData = colData, design = ~ condition) dds save(dds,file = paste0(cancer_type,"dd.Rdata" )) } load(paste0(cancer_type,"dd.Rdata" ))# 两两比较 res "condition",rev(levels(group_list)))) resOrdered # 按照P值排序 DEG head(DEG)#添加change列标记基因上调下调 logFC_cutoff 2*sd(abs(log2FoldChange)) )#logFC_cutoff k1 = (DEG$pvalue 0.05 )&(DEG$log2FoldChange k2 = (DEG$pvalue 0.05)&(DEG$log2FoldChange > logFC_cutoff) DEG$change = ifelse(k1,"DOWN" ,ifelse(k2,"UP" ,"NOT" )) table(DEG$change) head(DEG) DESeq2_DEG #edgeR---- library (edgeR) dge dge$samples$lib.size dge design 0+group_list) rownames(design)colnames(design) dge dge dge fit fit2 1,1 )) DEG=topTags(fit2, n=nrow(exp)) DEG=as.data.frame(DEG) logFC_cutoff 2*sd(abs(logFC)) )#logFC_cutoff k1 = (DEG$PValue 0.05 )&(DEG$logFC k2 = (DEG$PValue 0.05)&(DEG$logFC > logFC_cutoff) DEG$change = ifelse(k1,"DOWN" ,ifelse(k2,"UP" ,"NOT" )) head(DEG) table(DEG$change) edgeR_DEG ###limma---- library (limma) design 0+group_list) colnames(design)=levels(group_list) rownames(design)=colnames(exp) dge dge v "quantile") fit constrasts = paste(rev(levels(group_list)),collapse = "-" ) cont.matrix fit2=contrasts.fit(fit,cont.matrix) fit2=eBayes(fit2) DEG = topTable(fit2, coef=constrasts, n=Inf ) DEG = na.omit(DEG) logFC_cutoff 2*sd(abs(logFC)) )#logFC_cutoff k1 = (DEG$P.Value 0.05 )&(DEG$logFC k2 = (DEG$P.Value 0.05)&(DEG$logFC > logFC_cutoff) DEG$change = ifelse(k1,"DOWN" ,ifelse(k2,"UP" ,"NOT" )) table(DEG$change) head(DEG) limma_voom_DEG tj = data.frame(deseq2 = as.integer(table(DESeq2_DEG$change)), edgeR = as.integer(table(edgeR_DEG$change)), limma_voom = as.integer(table(limma_voom_DEG$change)), row.names = c("down" ,"not" ,"up" ) );tj save(DESeq2_DEG,edgeR_DEG,limma_voom_DEG,group_list,tj,file = paste0(cancer_type,"DEG.Rdata" ))#可视化 rm(list = ls()) load("TCGA-BRCAgdc.Rdata" ) load("TCGA-BRCADEG.Rdata" )if (!require (tinyarray))devtools::install_local("tinyarray-master.zip" ,upgrade = F )library (ggplot2)library (tinyarray) exp[1 :4 ,1 :4 ] dat = log2(exp+1 ) pca.plot = draw_pca(dat,group_list);pca.plot save(pca.plot,file = paste0(cancer_type,"pcaplot.Rdata" )) cg1 = rownames(DESeq2_DEG)[DESeq2_DEG$change !="NOT" ] cg2 = rownames(edgeR_DEG)[edgeR_DEG$change !="NOT" ] cg3 = rownames(limma_voom_DEG)[limma_voom_DEG$change !="NOT" ] h1 = draw_heatmap(dat[cg1,],group_list,scale_before = T ) h2 = draw_heatmap(dat[cg2,],group_list,scale_before = T ) h3 = draw_heatmap(dat[cg3,],group_list,scale_before = T ) m2d = function (x){ mean(abs(x))+2 *sd(abs(x)) } v1 = draw_volcano(DESeq2_DEG,pkg = 1 ,logFC_cutoff = m2d(DESeq2_DEG$log2FoldChange)) v2 = draw_volcano(edgeR_DEG,pkg = 2 ,logFC_cutoff = m2d(edgeR_DEG$logFC)) v3 = draw_volcano(limma_voom_DEG,pkg = 3 ,logFC_cutoff = m2d(limma_voom_DEG$logFC))library (patchwork) (h1 + h2 + h3) / (v1 + v2 + v3) +plot_layout(guides = 'collect' ) &theme(legend.position = "none" ) ggsave(paste0(cancer_type,"heat_vo.png" ),width = 15 ,height = 10 )#三大R包差异基因对比之相关性 #for test correlation plot between the logFC for the top 500 DE genes head(edgeR_DEG) head(limma_voom_DEG) head(DESeq2_DEG) td1=edgeR_DEG[,1 :2 ] head(td1) colnames(td1)=c("logFC_edgeR" ,"logCPM_edgeR" ) td1$ID=rownames(td1) td2=limma_voom_DEG[,1 :2 ] head(td2) colnames(td2)=c("logFC_limma" ,"logCPM_limma" ) td2$ID=rownames(td2) td3=DESeq2_DEG[,1 :2 ] head(td3) colnames(td3)=c("baseMean" ,"log2FC_DESeq2" ) td3$ID=rownames(td3) td_edgeR_limma=inner_join(td1,td2,by="ID" ) td_edgeR_limma=td_edgeR_limma[,c(3 ,1 ,4 )] head(td_edgeR_limma) td_edgeR_DESeq2=inner_join(td1,td3,by="ID" ) td_edgeR_DESeq2=td_edgeR_DESeq2[,c(3 ,1 ,5 )] head(td_edgeR_DESeq2) td_DESeq2_limma=inner_join(td3,td2,by="ID" ) td_DESeq2_limma=td_DESeq2_limma[,c(3 ,2 ,4 )] head(td_DESeq2_limma) #选出500个logFC绝对值最大的 # for edgeR vs limma a1=head(td_edgeR_limma[order(td_edgeR_limma$logFC_edgeR),],250 ) a2=tail(td_edgeR_limma[order(td_edgeR_limma$logFC_edgeR),],250 ) a=rbind(a1,a2) head(a) ggplot(a,aes(y=logFC_limma,x=logFC_edgeR))+geom_point() p1=ggplot(data=a, aes(y=logFC_limma, x=logFC_edgeR))+geom_point(color="red" )+stat_smooth(method="lm" ,se=FALSE )+stat_cor(data=a, method = "pearson" ) png("cor_edgeR_limma.png" ) p1 dev.off()# for edgeR vs DESeq2 a1=head(td_edgeR_DESeq2[order(td_edgeR_DESeq2$logFC_edgeR),],250 ) a2=tail(td_edgeR_DESeq2[order(td_edgeR_DESeq2$logFC_edgeR),],250 ) a=rbind(a1,a2) head(a) ggplot(a,aes(y=log2FC_DESeq2,x=logFC_edgeR))+geom_point() p2=ggplot(data=a, aes(y=log2FC_DESeq2, x=logFC_edgeR))+geom_point(color="red" )+stat_smooth(method="lm" ,se=FALSE )+stat_cor(data=a, method = "pearson" ) png("cor_edgeR_DESeq2.png" ) p2 dev.off()# for limma vs DESeq2 a1=head(td_DESeq2_limma[order(td_DESeq2_limma$logFC_limma),],250 ) a2=tail(td_DESeq2_limma[order(td_DESeq2_limma$logFC_limma),],250 ) a=rbind(a1,a2) head(a) ggplot(a,aes(y=log2FC_DESeq2,x=logFC_limma))+geom_point() p3=ggplot(data=a, aes(y=log2FC_DESeq2, x=logFC_limma))+geom_point(color="red" )+stat_smooth(method="lm" ,se=FALSE )+stat_cor(data=a, method = "pearson" ) png("cor_limma_DESeq2.png" ) p3 dev.off()library (patchwork) p1+p2+p3+ plot_layout(guides = "collect" ) png("cor_limma_DESeq2_edgeR.png" ) p1+p2+p3+ plot_layout(guides = "collect" ) dev.off()#三大R包差异基因对比之韦恩图 rm(list = ls()) load("TCGA-BRCAgdc.Rdata" ) load("TCGA-BRCADEG.Rdata" ) load("TCGA-BRCApcaplot.Rdata" ) UP=function (df){ rownames(df)[df$change=="UP" ] } DOWN=function (df){ rownames(df)[df$change=="DOWN" ] } up = intersect(intersect(UP(DESeq2_DEG),UP(edgeR_DEG)),UP(limma_voom_DEG)) down = intersect(intersect(DOWN(DESeq2_DEG),DOWN(edgeR_DEG)),DOWN(limma_voom_DEG)) dat = log2(exp+1 ) hp = draw_heatmap(dat[c(up,down),],group_list,scale_before = T )#上调、下调基因分别画维恩图 up_genes = list(Deseq2 = UP(DESeq2_DEG), edgeR = UP(edgeR_DEG), limma = UP(limma_voom_DEG)) down_genes = list(Deseq2 = DOWN(DESeq2_DEG), edgeR = DOWN(edgeR_DEG), limma = DOWN(limma_voom_DEG)) up.plot "UPgene") down.plot "DOWNgene")#维恩图拼图,终于搞定 library (patchwork)#up.plot + down.plot # 拼图 pca.plot + hp+up.plot +down.plot+ plot_layout(guides = "collect" ) ggsave(paste0(cancer_type,"heat_ve_pca.png" ),width = 15 ,height = 10 )#以下又为test #for upset图 参考https://www.jianshu.com/p/b702f644cad3 #install.packages("UpSetR") library (UpSetR)library (dplyr)library (tidyr) up_DESeq2=UP(DESeq2_DEG) up_edgeR=UP(edgeR_DEG) up_limma=UP(limma_voom_DEG) down_DESeq2=DOWN(DESeq2_DEG) down_edgeR=DOWN(edgeR_DEG) down_limma=DOWN(limma_voom_DEG) input 'DESeq2_up' = length(up_DESeq2), 'edgeR_up' = length(up_edgeR), 'limma_up' = length(up_limma), 'DESeq2_down' =length(down_DESeq2), 'edgeR_down' = length(down_edgeR), 'limma_down' =length(down_limma),'DESeq2_up&edgeR_up' =length(intersect(UP(DESeq2_DEG),UP(edgeR_DEG))),'DESeq2_up&limma_up' = length(intersect(UP(DESeq2_DEG),UP(limma_voom_DEG))),'edgeR_up&limma_up' =length(intersect(UP(limma_voom_DEG),UP(edgeR_DEG))),'DESeq2_up&edgeR_up&limma_up' =length(intersect(intersect(UP(DESeq2_DEG),UP(edgeR_DEG)),UP(limma_voom_DEG))),'DESeq2_down&edgeR_down' =length(intersect(DOWN(DESeq2_DEG),DOWN(edgeR_DEG))),'DESeq2_down&limma_down' = length(intersect(DOWN(DESeq2_DEG),DOWN(limma_voom_DEG))),'edgeR_down&limma_down' =length(intersect(DOWN(limma_voom_DEG),DOWN(edgeR_DEG))),'DESeq2_down&edgeR_down&limma_down' =length(intersect(intersect(DOWN(DESeq2_DEG),DOWN(edgeR_DEG)),DOWN(limma_voom_DEG))) ) data p1 6, sets = c('DESeq2_up' , 'edgeR_up' , 'limma_up' , 'DESeq2_down' , 'edgeR_down' , 'limma_down' ), keep.order = TRUE , # number.angles = 30, point.size = 5 , line.size = 1.3 , mainbar.y.label = "DEGs Intersection" , sets.x.label = "" , mb.ratio = c(0.60 , 0.40 ), text.scale = c(2 , 3 , 4 , 1.5 ,1.5 , 2 )) png("upset.png" ) p1 dev.off()
第一个是3大R包的火山图和如图:
TCGA-BRCAheat_vo 然后是3大R包的各自的上下调基因的韦恩图:
TCGA-BRCAheat_ve_pca 跟韦恩图一个意思的upset图
upset 最后是3个R包各自计算的logFC的相关性:
cor_limma_DESeq2_edgeR