文章复现学习 | ROS（9）药敏预测、免疫治疗

生信菜鸟团 · 公众号 · 生物 · 2024-11-11 17:57

正文

学习笔记总结于『生信技能树』马拉松课程

本文学习复现《Oxidative Stress Response Biomarkers of Ovarian Cancer Based on Single-Cell and Bulk RNA Sequencing》（基于单细胞和Bulk转录组的卵巢癌氧化应激反应生物标志物）一文中的图，其中Oxidative Stress Response缩写为ROS。文章包含了芯片、转录组、单细胞等技术，适合用来复现及学习

九、药敏预测、免疫治疗

0.前情提要

关于药敏预测

预测哪些药物会对疾病表现好的治疗效果，通过IC50来看。IC50，半数抑制浓度，指在体外试验中在特定暴露时间后，药物抑制50%（细胞/靶点/特定蛋白等）所需的药物浓度

关于免疫治疗

TCGA数据没有提供免疫治疗结果，所以需要外部的数据来验证高/低风险组之间的免疫治疗结果是否有差别。根据免疫治疗的有效性，分为完全缓解（CR）、部分缓解（PR）、疾病稳定（SD）和疾病进展（PD）

CR和PR是好的结果，SD和PD是坏的结果，分别放在一起。不过图101（右）应该按照CR、PR、SD、PD的顺序来放

1.药物敏感性预测

文章使用了一个2014年的R包来计算IC50值，我们使用同一个作者但比较新的R包

1.1载入数据并完成预测

补充一点背景知识： oncoPredict 是根据基因表达量来预测药物敏感性的R包。也就是说它可以根据你的样本基因表达量来告诉你每个药物的IC50值，这个值越低就说明药物越管用

提到药物预测，还有一个 pRRophetic 包，建议不用看了，因为 oncoPredict 是它的plus版本；还有一个cellMiner包，之前写过，可以翻翻看

#### 1.载入数据
# 代码参考自：https://mp.weixin.qq.com/s/QRaTd-fIsqq6sPsLmOPvIw，一些背景知识也可以补充下.
# 数据来源于：https://osf.io/c6tfx/  在Training Data文件夹下存放着R包作者准备好的数据，用作药物预测的训练集

# install.packages("oncoPredict")
# BiocManager::install("TxDb.Hsapiens.UCSC.hg19.knownGene")
rm(list = ls())
library(oncoPredict)
library(data.table)
library(gtools)
library(reshape2)
library(ggpubr)
dir='./DataFiles/DataFiles/Training Data/'
dir(dir)

# 可以看到其中包括了Cancer Therapeutics Response Portal (CTRP)和Genomics of Drug Sensitivity in Cancer (GDSC)，相当于两个训练集
# 两个数据库的数据，都是提供了基因表达矩阵和药物IC50表格

# rds格式的数据，需要用readRDS()函数读取
exp = readRDS(file=file.path(dir,'GDSC2_Expr (RMA Normalized and Log Transformed).rds'))
exp[1:4,1:4]
dim(exp) # 17419个基因   805个样本
range(exp) # 2.094251 13.929729 经过了log，不过不用逆log

drug = readRDS(file = file.path(dir,"GDSC2_Res.rds"))
drug #下载到的数据是被log转换过的，用这句代码逆转回去
drug[1:4,1:4]# 行是样本，列是药物，数字是药物在这个样本中的IC50
dim(drug)
identical(rownames(drug),colnames(exp))
# drug是药物IC50值，exp是对应细胞系基因的表达矩阵。可以看到二者的样本名称是对应的。


#### 2.拿自己的数据来完成预测
load("../6.model/TCGA-OV_sur_model.Rdata")
test = exprSet#我们自己的表达矩阵

# 运行时间很长，大概2h，所以if(F)注释掉
if(F){
  calcPhenotype(trainingExprData = exp,
                trainingPtype = drug,
                testExprData = test,
                batchCorrect = 'standardize',  #   "eb" for array,standardize  for rnaseq
                powerTransformPhenotype = TRUE,
                removeLowVaryingGenes = 0.2,
                minNumSamples = 10, 
                printOutput = TRUE, 
                removeLowVaringGenesFrom = 'rawData' )
}

# R包Vignette里关于batchCorrect参数的说明
# batchCorrect options: "eb" for ComBat, "qn" for quantiles normalization, "standardize", or "none"
# "eb" is good to use when you use microarray training data to build models on microarray testing data.
# "standardize is good to use when you use microarray training data to build models on RNA-seq testing data (this is what Paul used in the 2017 IDWAS paper that used GDSC microarray to impute in TCGA RNA-Seq data, see methods section of that paper for rationale)

# R包Vignette里关于removeLowVaringGenesFrom参数的说明
#Determine method to remove low varying genes.
#Options are 'homogenizeData' and 'rawData'
#homogenizeData is likely better if there is ComBat batch correction, raw data was used in the 2017 IDWAS paper that used GDSC microarray to impute in TCGA RNA-Seq data.

# 也就是说，芯片数据就用上面代码里的参数，转录组数据的话，就将batchCorrect改为standardize
# removeLowVaringGenesFrom，作者说的也模糊，就随便了


#### 3.看看结果
# 这是运行之后的结果，被存在固定文件夹calcPhenotype_Output下。文件名也是固定的DrugPredictions.csv。因此一个工作目录只能计算一个数据，不要混着用了
testPtype './calcPhenotype_Output/DrugPredictions.csv', row.names = 1,check.names = F)
testPtype[1:4, 1:4]
dim(testPtype)
identical(colnames(testPtype),colnames(drug))
# IC50越小，药效越好
# 198种药物IC50的预测结果就在这个表格里了

1.2结合高低风险画图

load("../6.model/rsurv.Rdata")
identical(rownames(testPtype),rownames(rsurv))
a = apply(testPtype, 2, function(x){
  #x = testPtype[,1]，每个药物的IC50值
   data.frame(p = wilcox.test(x~rsurv$group)$p.value, # wilcox.test组间非参数检验，和高低风险的计算，计算p值和方差
              i = var(x)
   )
})
a = do.call(rbind,a)
k1 = a$p<0.01;table(k1)
k2 = a$i>25;table(k2)
table(k1&k2)
dg = rownames(a)[k1&k2]
library(tinyarray)
draw_boxplot(t(testPtype[,dg]),rsurv$group)
# 比较希望能看到高低风险组的差别

1.3相关性热图

load("../6.model/lassogene.Rdata")
nn = names(head(sort(apply(testPtype, 2, sum)),30))
testPtype = testPtype[,nn]
nc = cbind(testPtype,t(exprSet[lassoGene,])) %>% as.matrix()


library(Hmisc)
m = rcorr(nc)$r[1:ncol(testPtype),(ncol(nc)-length(lassoGene)+1):ncol(nc)]
p = rcorr(nc)$P[1:ncol(testPtype),(ncol(nc)-length(lassoGene)+1):ncol(nc)]
p[1:4,1:4]
library(dplyr)
tmp = matrix(case_when(as.logical(p<0.01)~"**",
                       as.logical(p<0.05)~"*",
                       T~""),nrow = nrow(p))
library(pheatmap)
pheatmap(t(m),
         display_numbers =t(tmp),
         angle_col =45,
         color = colorRampPalette(c("#2fa1dd", "white", "#f87669"))(100),
border_color = "white",
width = 7, 
height=9.1,
treeheight_col = 0,
treeheight_row = 0)

2.免疫治疗

### 1.输入数据
# 细胞丰度矩阵；risk结果；表达矩阵
rm(list = ls())
load("../6.model/rsurv.Rdata")#临床信息、分组

### 2.免疫数据验证
# IMvigor210CoreBiologies这个包比较过时了，更新时间停留在2018年，很难安装。喜欢挑战可以点进去 http://research-pub.gene.com/IMvigor210CoreBiologies/ 这个网页下载本地安装包，折腾一下，不喜欢就直接用存好的数据
# 这个时间点也是R语言版本从3.4切换到3.5的时候，那次换了bioconductor安装方式，导致这个网页中的安装方式早已不能用了

library(BiocGenerics)
library(Biobase)
f_cds = "cds.Rdata"
if(!file.exists(f_cds)){
  library(IMvigor210CoreBiologies)
  data(cds)
  counts = counts(cds) #表达矩阵
  an = fData(cds) #feature，基因的信息
  pd = pData(cds) #列的信息
  save(counts,an,pd,file = "cds.Rdata")
}
load(f_cds)

# 3.表达矩阵和临床信息表格的一系列整理
counts = counts[,match(rownames(pd),colnames(counts))]
an = an[!duplicated(rownames(an)),]
counts = counts[!duplicated(rownames(counts)),]

g = intersect(an$entrez_id,rownames(counts))
an = an[g,]
counts = counts[g,]

k = (!duplicated(an$symbol))&(!is.na(an$symbol));table(k)

counts = counts[k,]
an = an[k,]
rownames(counts) = an$symbol
meta = pd[,c("os","censOS","Best Confirmed Overall Response")]
colnames(meta)[3] = "Response"
meta$Response[meta$Response=="NE"]=NA

str(meta)
colnames(meta)[1:2] = c("time","event")
exp = log2(edgeR::cpm(counts)+1)
load("../6.model/lassogene.Rdata")
exp = exp[lassoGene,]
identical(rownames(meta),colnames(exp))
head(meta)#行是基因，列是样本

library(survminer)
lassoGene
load("../6.model/lasso_model.Rdata")
library(glmnet)
library(survival)
coef = coef(fit,s = cvfit$lambda.min)
dat = data.frame(gene = rownames(coef),
                 coefficient = as.numeric(coef[,1]))
head(dat)
dat = dat[dat$coefficient!=0,]
identical(dat$gene,colnames(exp))