专栏名称: 生信菜鸟团
生信菜鸟团荣誉归来,让所有想分析生物信息学数据的小伙伴找到归属,你值得拥有!
目录
相关文章推荐
生物探索  ·  Immunity | ... ·  2 天前  
生物学霸  ·  浙江中医药大学公共卫生学院招聘博士后 ·  3 天前  
生物学霸  ·  填补空白,C9 高校,发 Cell ·  3 天前  
生物学霸  ·  Nature:上海交通大学团队代谢学领域再创 ... ·  6 天前  
51好读  ›  专栏  ›  生信菜鸟团

你还缺scRNA-seq的workflow吗(二)?

生信菜鸟团  · 公众号  · 生物  · 2024-09-17 13:14

正文

前言

前面我们演示了生信工程师Roman Hillje分享的scRNA-seq workflow的部分(你还缺scRNA-seq的workflow吗?), 今天继续后面的学习。

Roman Hillje的github主页:

  • https://github.com/romanhaa

workflow:

  • https://romanhaa.github.io/projects/scrnaseq_workflow/

12 Perform analysis for Cerebro

为了进一步研究Cerebro https://github.com/romanhaa/Cerebro 中的数据集,我们现在添加了更多的meta data,计算标记基因(marker genes),标记基因的通路富集,并导出数据集。这个过程在很大程度上遵循了cerebroApp网站 https://romanhaa.github.io/cerebroApp/articles/cerebroApp_workflow_Seurat.html 上描述的工作流程。

12.1 Add meta data

load("labels_assinged_seurat.Rdata")
load("thresholds.RData")
seurat@misc$experiment <- list(
experiment_name = 'bone_marrow_donor_cells',
organism = 'hg',
date_of_analysis = Sys.Date()
)

seurat@misc$parameters <- list(
gene_nomenclature = 'gene_name',
discard_genes_expressed_in_fewer_cells_than = 6,
keep_mitochondrial_genes = TRUE,
variables_to_regress_out = 'nCount',
number_PCs = 15,
cluster_resolution = 0.8
)

seurat@misc$parameters$filtering <- list(
UMI_min = thresholds_nCount[1],
UMI_max = thresholds_nCount[2],
genes_min = thresholds_nFeature[1],
genes_max = thresholds_nFeature[2],
pct_mito_min = thresholds_percent_MT[1],
pct_mito_max = thresholds_percent_MT[2]
)

#github上的,自己科学上网下吧
if(!require("cerebroApp")){
BiocManager::install('romanhaa/cerebroApp')
}
library(cerebroApp)

seurat@misc$technical_info$cerebroApp_version <- utils::packageVersion('cerebroApp')
seurat@misc$technical_info$Seurat <- utils::packageVersion('Seurat')
seurat@misc$technical_info <- list(
'R' = capture.output(devtools::session_info())
)

12.2 Calculate relationship trees

这里,我们为所有三个分组变量:样本、聚类和细胞类型计算关系树——就像我们之前为聚类所做的那样。它们中的每一个都被添加到Seurat对象的@misc槽中,这样它就可以被提取到Cerebro中。

#Samples:

Idents(seurat) <- "sample"
seurat <- BuildClusterTree(
seurat,
dims = 1:15,
reorder = FALSE,
reorder.numeric = FALSE
)
seurat@misc$trees$sample <- seurat@tools$BuildClusterTree
#Clusters:

Idents(seurat) <- "seurat_clusters"
seurat <- BuildClusterTree(
seurat,
dims = 1:15,
reorder = FALSE,
reorder.numeric = FALSE
)
seurat@misc$trees$seurat_clusters <- seurat@tools$BuildClusterTree
#Cell types:

Idents(seurat) <- "cell_type_singler_blueprintencode_main"
seurat <- BuildClusterTree(
seurat,
dims = 1:15,
reorder = FALSE,
reorder.numeric = FALSE
)
seurat@misc$trees$cell_type_singler_blueprintencode_main <- seurat@tools$BuildClusterTree

12.3 cerebroApp steps

在这里,我们计算线粒体和核糖体转录本的百分比,得到最多表达的基因和标记基因,使用富集过的通路进行测试,并进行基因集富集分析GSEA。因为cereBroAPP作者提桶跑路,已经不支持seurat5了,所以没办法只能自己改函数,改过的函数存在cereBroFunction.R脚本里面

#感觉是cerebroApp不能兼容Seurat5,找不到原来位置的counts(因为v5在layers下面了)
#检查源代码发现确实是这样的,没办法想继续用就得自己修改源代码再定义一个函数
#修改过的函数在cereBroFunction_forSeuratV5.R脚本里面,source一下
source("cereBroFunction_forSeuratV5.R")
seurat <- addPercentMtRibo(
seurat,
assay = "RNA",
organism = 'hg',
gene_nomenclature = 'name'
)

seurat <- getMostExpressedGenes(
seurat,
assay = 'RNA',
groups = c('sample','seurat_clusters','cell_type_singler_blueprintencode_main')
)

#Marker Gene结果保存在object@misc[["marker_genes"]][["cerebro_seurat"]][[groups[1]]]
seurat <- getMarkerGenes(
seurat,
organism = 'hg',
groups = c('sample','seurat_clusters','cell_type_singler_blueprintencode_main'),
name = 'cerebro_seurat',
only_pos = TRUE
)

#富集分析结果存在seurat@misc[["enriched_pathways"]]中
seurat <- cerebroApp::getEnrichedPathways(
seurat,
marker_genes_input = 'cerebro_seurat',
adj_p_cutoff = 0.01,
max_terms = 100
)

#GSEA富集分析,做不了,有坑,自己把基因提出来做好了
seurat <- performGeneSetEnrichmentAnalysis(
seurat,
assay = 'RNA',
GMT_file = 'h.all.v2023.2.Hs.symbols.gmt',
groups = c('sample','seurat_clusters','cell_type_singler_blueprintencode_main'),
thresh_p_val = 0.05,
thresh_q_val = 0.1,
parallel.sz = 1,
verbose = FALSE
)

cereBroFunction.R:

#函数 addPercentMtRibo
addPercentMtRibo <- function(object, assay = "RNA", organism ="hg" ,gene_nomenclature = 'name'){
  
  calculatePercentGenes<-function (object, assay = "RNA", genes ,message) {
    result <- pbapply::pblapply(genes, function(x) {
      genes_here <- intersect(x, rownames(object@assays[[assay]]$counts))
      if (length(genes_here) == 1) {
        object@assays[[assay]]$counts[genes_here, ]/Matrix::colSums(object@assays[[assay]]$counts)
        print(paste0("Only one ", message,"gene was found"))
      }
      else {
        Matrix::colSums(object@assays[[assay]]$counts[genes_here, 
        ])/Matrix::colSums(object@assays[[assay]]$counts)
      }
    })
  }
  
  #配置文件中记录了线粒体和核糖体基因
  genes_mt <- readr::read_tsv(system.file(paste0("extdata/genes_mt_"
                                                 organism, "_", gene_nomenclature, ".tsv.gz"), package = "cerebroApp"), 
                              col_types = readr::cols(), col_names = FALSE) %>% dplyr::select(1) %>% 
    t() %>% as.vector()
  genes_ribo <- readr::read_tsv(system.file(paste0("extdata/genes_ribo_"
                                                   organism, "_", gene_nomenclature, ".tsv.gz"), package = "cerebroApp"), 
                                col_types = readr::cols(), col_names = FALSE) %>% dplyr::select(1) %>% 
    t() %>% as.vector()
  
  genes_mt_here <- intersect(genes_mt, rownames(object@assays[[assay]]$counts))
  genes_ribo_here <- intersect(genes_ribo, rownames(object@assays[[assay]]$counts))
  
  values_mt <- calculatePercentGenes(object, assay = assay, message="mitochondrial", list(genes_mt = genes_mt_here))
  values_ribo <- calculatePercentGenes(object, assay = assay, message="ribosomal", list(genes_ribo = genes_ribo_here))
  
  [email protected]$mt_percent <- as.vector(values_mt[[1]])
  [email protected]$ribo_percent <- as.vector(values_ribo[[1]])
  
  return(object)
}

#函数 getMostExpressedGenes
getMostExpressedGenes<-function (object, assay = "RNA", groups = NULL) {
  for (i in seq_along(groups)) {
    current_group <- groups[i]
    if (is.factor([email protected][[current_group]])) {
      group_levels <- levels([email protected][[current_group]])
    }
    else if (is.character([email protected][[current_group]])) {
      group_levels <- unique([email protected][[current_group]])
      if (any(is.na(group_levels))) {
        number_of_cells_without_group_assignment <- [email protected][[current_group]] %>% 
          is.na() %>% which(. == TRUE) %>% length()
        group_levels <- stats::na.omit(group_levels)
        warning(paste0("Found ", number_of_cells_without_group_assignment, 
                       " cell(s) without group assignment  (NA) for `"
                       current_group, "`. These cells will be ignored during the analysis."), 
                call. = FALSE)
      }
    }
    if (length(group_levels) == 1) {
      message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                     "] Group `", current_group, "` contains only one subgroup. Will calculate most expressed genes "
                     "across all cells of this group..."))
      transcripts_counts_per_gene <- Matrix::rowSums(object@assays[[assay]]$counts)
      total_transcript_count <- sum(transcripts_counts_per_gene)
      transcripts_percent_per_gene <- transcripts_counts_per_gene/total_transcript_count
      transcripts_percent_per_gene <- sort(transcripts_percent_per_gene, 
                                           decreasing = TRUE)
      table <- tibble::tibble(group = group_levels, gene = names(transcripts_percent_per_gene)[1:100], 
                              pct = transcripts_percent_per_gene[1:100])
      most_expressed_genes <- table %>% dplyr::mutate(group = factor(group, 
                                                                     levels = group_levels)) %>% dplyr::rename(`:=`(!!current_group, 
                                                                                                                    group))
      object@misc[["most_expressed_genes"]][[current_group]] <- most_expressed_genes
    }
    else if (length(group_levels) > 1) {
      message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                     "] Get most expressed genes for ", length(group_levels), 
                     " groups in `", current_group, "`..."))
      results <- pbapply::pblapply(group_levels, function(x) {
        cells_of_current_group_level <- rownames([email protected])[which([email protected][[current_group]] == 
                                                                           x)]
        transcript_count_matrix <- object@assays[[assay]]$counts[,cells_of_current_group_level]
        if (is.vector(transcript_count_matrix) == TRUE) {
          transcripts_counts_per_gene <- transcript_count_matrix
        }
        else {
          transcripts_counts_per_gene <- Matrix::rowSums(transcript_count_matrix)
        }
        total_transcript_count <- sum(transcripts_counts_per_gene)
        transcripts_percent_per_gene <- transcripts_counts_per_gene/total_transcript_count
        transcripts_percent_per_gene <- sort(transcripts_percent_per_gene, 
                                             decreasing = TRUE)
        table <- tibble::tibble(group = x, gene = names(transcripts_percent_per_gene)[1:100], 
                                pct = transcripts_percent_per_gene[1:100])
      })
      most_expressed_genes <- do.call(rbind, results) %>% 
        dplyr::mutate(group = factor(group, levels = group_levels)) %>% 
        dplyr::rename(`:=`(!!current_group, group))
      object@misc[["most_expressed_genes"]][[current_group]] <- most_expressed_genes
    }
  }
  return(object)
}

#函数 getMarkerGenes
getMarkerGenes <- function (object, organism = NULL, groups = NULL
          name = "cerebro_seurat", only_pos = TRUE, min_pct = 0.7
          thresh_logFC = 0.25, thresh_p_val = 0.01, test = "wilcox"
          verbose = TRUE...) {
  if (is.null(object@misc$marker_genes)) {
    object@misc$marker_genes <- list()
  }
  if (organism %in% c("hg""mm") == FALSE) {
    message(paste0("[", format(Sys.time(), "%H:%M:%S"), "] No information about genes on cell surface because organism is "
                   "either not specified or not human/mouse."))
  }
  else {
    if (organism == "hg" || organism == "human") {
      temp_attributes <- "hgnc_symbol"
      temp_dataset <- "hsapiens_gene_ensembl"
    }
    else if (organism == "mm" || organism == "mouse") {
      temp_attributes <- "external_gene_name"
      temp_dataset <- "mmusculus_gene_ensembl"
    }
    attempt <- 1
    while (!exists("genes_on_cell_surface") && attempt <= 
           3) {
      try(genes_on_cell_surface <- biomaRt::getBM(attributes = temp_attributes, 
                                                  filters = "go", values = "GO:0009986", mart = biomaRt::useMart("ensembl"
                                                                                                                 dataset = temp_dataset))[, 1])
    }
    if (!exists("genes_on_cell_surface")) {
      message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                     "] Genes in GO term \"cell surface\" (GO:0009986) could not be "
                     "retrieved, possibly due to the server not being reachable at the "
                     "moment."))
    }
  }
  object@misc$marker_genes[[name]] <- list()
  for (i in seq_along(groups)) {
    current_group <- groups[i]
    if (is.factor([email protected][[current_group]])) {
      group_levels <- levels([email protected][[current_group]])
    }
    else if (is.character([email protected][[current_group]])) {
      group_levels <- unique([email protected][[current_group]])
      if (any(is.na(group_levels))) {
        number_of_cells_without_group_assignment <- [email protected][[current_group]] %>% 
          is.na() %>% which(. == TRUE) %>% length()
        group_levels <- stats::na.omit(group_levels)
        warning(paste0("Found ", number_of_cells_without_group_assignment, 
                       " cell(s) without group assignment (NA) for `"
                       current_group, "`. These cells will be ignored during the analysis."), 
                call. = FALSE)
      }
    }
    if (length(group_levels) == 1) {
      warning(paste0("Only one group level found for group `"
                     current_group, "`. Will skip this group and proceed to next."), 
              call. = FALSE)
    }
    else if (length(group_levels) > 1) {
      message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                     "] Get marker genes for ", length(group_levels), 
                     " groups in `", current_group, "`..."))
      Seurat::Idents(object) <- current_group
      results <- Seurat::FindAllMarkers(object, 
                                        only.pos = only_pos, min.pct = min_pct, logfc.threshold = thresh_logFC, 
                                        return.thresh = thresh_p_val, test.use = test, 
                                        verbose = verbose, ...)
      if (nrow(results) == 0) {
        message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                       "] No marker genes found for any of the level of `"
                       current_group, "`."))
        results <- "no_markers_found"
      }
      else if (nrow(results) > 0) {
        if (exists("genes_on_cell_surface") && "gene" %in
            colnames(results)) {
          results <- results %>% dplyr::mutate(on_cell_surface = .data$gene %in
                                                 genes_on_cell_surface)
        }
      }
      if ("cluster" %in% colnames(results)) {
        results <- results %>% dplyr::rename(`:=`(!!current_group, 
                                                  .data$cluster)) %>% dplyr::select(tidyselect::all_of(current_group), 
                                                                                    tidyselect::any_of("gene"), dplyr::everything())
      }
      object@misc[["marker_genes"]][[name]][[current_group]] <- results
    }
  }
  return(object)
}

#函数 performGeneSetEnrichmentAnalysis
performGeneSetEnrichmentAnalysis<-function (object, assay = "RNA", GMT_file, 
                                            groups = NULL, name = "cerebro_GSVA"
                                            thresh_p_val = 0.05, thresh_q_val = 0.1...) {
  
  message(paste0("[", format(Sys.time(), "%H:%M:%S"), "] Loading gene sets..."))
  read_GMT_file<-function (file) 
  {
    gmt <- readr::read_delim(file, delim = ";", col_names = c("X1"), 
                             col_types = readr::cols())
    gene_set_genes <- list()
    for (i in seq_len(nrow(gmt))) {
      temp_genes <- strsplit(gmt$X1[i], split = "\t")[[1]] %>% 
        unlist()
      temp_genes <- temp_genes[3:length(temp_genes)]
      gene_set_genes[[i]] <- temp_genes
    }
    gene_set_loaded <- list(genesets = gene_set_genes, geneset.names = lapply(strsplit(gmt$X1, 
                                                                                       split = "\t"), "["1) %>% unlist(), geneset.description = lapply(strsplit(gmt$X1, 
                                                                                                                                                                  split = "\t"), "["2) %>% unlist())
    return(gene_set_loaded)
  }
  gene_sets <- read_GMT_file(GMT_file)
  names(gene_sets$genesets) <- gene_sets$geneset.names
  gene_sets_tibble <- tibble::tibble(name = gene_sets$geneset.names, 
                                     description = gene_sets$geneset.description, length = NA
                                     genes = NA)
  for (i in seq_along(gene_sets$genesets)) {
    gene_sets_tibble$length[i] <- gene_sets$genesets[[i]] %>% 
      length()
    gene_sets_tibble$genes[i] <- gene_sets$genesets[[i]] %>% 
      unlist() %>% paste(.data, collapse = ",")
  }
  message(paste0("[", format(Sys.time(), "%H:%M:%S"), "] Loaded "
                 length(gene_sets$genesets), " gene sets from GMT file."))
  expressed_genes <- Matrix::rowSums(object@assays[[assay]]$counts)
  expressed_genes <- which(expressed_genes != 0)
  message(paste0("[", format(Sys.time(), "%H:%M:%S"), "] Extracting transcript counts "
                 "from `data` slot of `", assay, "` assay..."))
  matrix_full <- object@assays[[assay]]$counts[expressed_genes, 
  ]
  object@misc$enriched_pathways[[name]] <- list()
  for (i in seq_along(groups)) {
    current_group <- groups[i]
    if (is.factor([email protected][[current_group]])) {
      group_levels <- levels([email protected][[current_group]])
    }
    else if (is.character([email protected][[current_group]])) {
      group_levels <- unique([email protected][[current_group]])
      if (any(is.na(group_levels))) {
        number_of_cells_without_group_assignment <- [email protected][[current_group]] %>% 
          is.na() %>% which(. == TRUE) %>% length()
        group_levels <- stats::na.omit(group_levels)
        warning(paste0("Found ", number_of_cells_without_group_assignment, 
                       " cell(s) ""without group assignment (NA) for `"
                       current_group, "`. These cells will be ignored during the analysis."), 
                call. = FALSE)
      }
    }
    message(glue::glue("[", format(Sys.time(), "%H:%M:%S"), 
                       "] Performing analysis for ""{length(group_levels)} subgroups of group `{current_group}`..."))
    if (length(group_levels) == 1) {
      message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                     "] Only one group level found ""for group `"
                     current_group, "`. Will skip this group and proceed "
                     "to next."), call. = FALSE)
      results <- NULL
    }
    else if (length(group_levels) > 1) {
      matrix_mean_by_group <- future.apply::future_sapply(group_levels, 
                                                          USE.NAMES = TRUE, simplify = TRUE, future.globals = FALSE
                                                          function(x) {
                                                            cells_in_this_group <- which([email protected][[current_group]] == 
                                                                                           x)
                                                            if (length(cells_in_this_group) == 1) {
                                                              matrix_full[, cells_in_this_group]
                                                            }
                                                            else {
                                                              Matrix::rowMeans(matrix_full[, cells_in_this_group])
                                                            }
                                                          })
      enrichment_scores <- GSVA::gsva(expr = matrix_mean_by_group, 
                                      gset.idx.list = gene_sets$genesets, ...)
      results <- tibble::tibble(group = character(), name = character(), 
                                enrichment_score = numeric(), p_value = numeric(), 
                                q_value = numeric())
      for (i in seq_len(ncol(enrichment_scores))) {
        temp_results <- tibble::tibble(group = colnames(matrix_mean_by_group)[i], 
                                       name = rownames(enrichment_scores), enrichment_score = enrichment_scores[, 
                                                                                                                i])
        if (nrow(temp_results) < 2) {
          message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                         "] Only 1 gene set ""available, therefore p- and q-values cannot be calculated."))
          temp_results <- temp_results %>% dplyr::mutate(p_value = NA
                                                         q_value = NA)
        }
        else if (nrow(temp_results) >= 2) {
          temp_p_values <- stats::pnorm(-abs(scale(temp_results$enrichment_score)[, 
                                                                                  1]))
          temp_q_values <- suppressWarnings(qvalue::qvalue(temp_p_values, 
                                                           pi0 = 1, ties = min)$lfdr)
          temp_results <- temp_results %>% dplyr::mutate(p_value = temp_p_values, 
                                                         q_value = temp_q_values) %>% dplyr::filter(.data$p_value <= 
                                                                                                      thresh_p_val, .data$q_value <= thresh_q_val)
        }
        results <- dplyr::bind_rows(results, temp_results) %>% 
          dplyr::arrange(.data$q_value)
      }
      results <- dplyr::left_join(results, gene_sets_tibble, 
                                  by = "name") %>% dplyr::select(c("group""name"
                                                                   "description""length""genes""enrichment_score"
                                                                   "p_value""q_value")) %>% dplyr::rename(`:=`(!!current_group, 
                                                                                                                 .data$group))
      group_levels_with_results <- group_levels[group_levels %in
                                                  results[[current_group]]]
      results[[current_group]] <- factor(results[[current_group]], 
                                         levels = group_levels_with_results)
      message(glue::glue("[", format(Sys.time(), "%H:%M:%S"), 
                         "] {nrow(results)} gene sets ""passed the thresholds across all subgroups of group `{current_group}`."))
      if (nrow(results) == 0) {
        results <- "no_gene_sets_enriched"
      }
      object@misc[["enriched_pathways"]][[name]][[current_group]] <- results
    }
  }
  return(object)
}

13 单基因的表达

13.1单基因的小提琴图

  • 首先,我们发现用宽度而不是面积来衡量小提琴是很有用的。
  • 其次,我们建议对小提琴进行修剪,以避免小提琴延伸到负尺度,这是没有意义的,因为没有任何负的表达值。
  • 第三,在数据中添加一些噪声可能是有意义的,以避免突出显示刻度底部的不同值,例如log(1), log(2)等。在使用Seurat对表达量进行可视化时,这是默认情况下完成的(甚至没有选项)。

在这种情况下,差异非常细微,如下图所示(红细胞和单核细胞)。

p1 <- [email protected] %>%
mutate(expression = seurat@assays$SCT@data['CD69',]) %>%
ggplot(aes(x = cell_type_singler_blueprintencode_main, y = expression, fill = cell_type_singler_blueprintencode_main)) +
geom_violin(draw_quantiles = c(0.5), scale = 'width', trim = TRUE) +
theme_bw() +
scale_fill_manual(values = custom_colors$discrete) +
scale_x_discrete() +
scale_y_continuous(name = 'Log-normalized expression', labels = scales::comma) +
labs(title = "CD69 expression (without noise)") +
theme(
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
legend.position = 'none'
)

p2 <- [email protected] %>%
mutate(
expression = seurat@assays$SCT@data['CD69',],
expression = expression + rnorm(nrow(.))/200
) %>%
ggplot(aes(x = cell_type_singler_blueprintencode_main, y = expression, fill = cell_type_singler_blueprintencode_main)) +
geom_violin(draw_quantiles = c(0.5), scale = 'width', trim = TRUE) +
theme_bw() +
scale_fill_manual(values = custom_colors$discrete) +
scale_x_discrete() +
scale_y_continuous(name = 'Log-normalized expression', labels = scales::comma) +
labs(title = "CD69 expression (with noise)") +
theme(
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
legend.position = 'none'
)

p1 + p2 + plot_layout(ncol = 1)

ggsave(
'plots/expression_cd69_violin_plot.png',
p1 + p2 + plot_layout(ncol = 1),
height = 6, width = 9
)

13.2 多基因点图

点图可以展示多个基因的表达,通过一个特定的分组变量。接下来的例子中,我们将使用检测到的细胞类型和聚类的标记基因。

13.2.1 根据细胞类型划分

# cells will be grouped by samples that they have been assigned to
group_ids <- unique([email protected]$cell_type_singler_blueprintencode_main)

# select a set of genes for which we want to show expression
genes_to_show <- seurat@misc$marker_genes$cerebro_seurat$cell_type_singler_blueprintencode_main %>%
group_by(cell_type_singler_blueprintencode_main) %>%
arrange(p_val_adj) %>%
filter(row_number() == 1) %>%
arrange(cell_type_singler_blueprintencode_main) %>%
pull(gene)


#这里若没有DC种类的细胞,则
group_ids = group_ids[group_ids!="DC"]
# for every sample-gene combination, calculate the average expression across
# all cells and then transform the data into a data frame
expression_levels_per_group <- vapply(
group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) {
cells_in_current_group <- which([email protected]$cell_type_singler_blueprintencode_main == x)
Matrix::rowMeans(seurat@assays$SCT@data[genes_to_show,cells_in_current_group])
}
) %>%
t() %>%
as.data.frame() %>%
mutate(cell_type_singler_blueprintencode_main = rownames(.)) %>%
select(cell_type_singler_blueprintencode_main, everything()) %>%
pivot_longer(
cols = c(2:ncol(.)),
names_to = 'gene'
) %>%
dplyr::rename(expression = value) %>%
mutate(id_to_merge = paste0(cell_type_singler_blueprintencode_main, '_', gene))

# for every sample-gene combination, calculate the percentage of cells in the
# respective group that has at least 1 transcript (this means we consider it
# as expressing the gene) and then transform the data into a data frame
percentage_of_cells_expressing_gene <- vapply(
group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) {
cells_in_current_group <- which([email protected]$cell_type_singler_blueprintencode_main == x)
Matrix::rowSums(seurat@assays$SCT@data[genes_to_show,cells_in_current_group] != 0)
}
) %>%
t() %>%
as.data.frame() %>%
mutate(cell_type_singler_blueprintencode_main = rownames(.)) %>%
select(cell_type_singler_blueprintencode_main, everything()) %>%
pivot_longer(
cols = c(2:ncol(.)),
names_to = 'gene'
) %>%
dplyr::rename(cell_count = value) %>%
left_join(
.,
[email protected] %>%
group_by(cell_type_singler_blueprintencode_main) %>%
tally(),
by = 'cell_type_singler_blueprintencode_main') %>%
mutate(
id_to_merge = paste0(cell_type_singler_blueprintencode_main, '_', gene),
percent_cells = cell_count / n
)

# merge the two data frames created before and plot the data
p <- left_join(
expression_levels_per_group,
percentage_of_cells_expressing_gene %>% select(id_to_merge, percent_cells),
by = 'id_to_merge'
) %>%
mutate(
cell_type_singler_blueprintencode_main = factor(cell_type_singler_blueprintencode_main, levels = rev(group_ids)),
gene = factor(gene, levels = genes_to_show)
) %>%
arrange(gene) %>%
ggplot(aes(gene, cell_type_singler_blueprintencode_main)) +
geom_point(aes(color = expression, size = percent_cells)) +
scale_color_distiller(
palette = 'Reds',
direction = 1,
name = 'Log-normalised\nexpression',
guide = guide_colorbar(frame.colour = "black", ticks.colour = "black")
) +
scale_size(name = 'Percent\nof cells', labels = scales::percent) +
labs(y = 'Cell type', color = 'Expression') +
coord_fixed() +
theme_bw() +
theme(
axis.title = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1)
)

p

ggsave('plots/marker_genes_by_cell_type_as_dot_plot.png', p, height = 5, width = 6)

13.2.2 根据聚类结果进行划分

group_ids <- levels([email protected]$seurat_clusters)

genes_to_show <- seurat@misc$marker_genes$cerebro_seurat$seurat_clusters %>%
group_by(seurat_clusters) %>%
arrange(p_val_adj) %>%
filter(row_number() == 1) %>%
arrange(seurat_clusters) %>%
pull(gene)
genes_to_show <- genes_to_show[!duplicated(genes_to_show)]

expression_levels_per_group <- vapply(
group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) {
cells_in_current_group <- which([email protected]$seurat_clusters == x)
Matrix::rowMeans(seurat@assays$SCT@data[genes_to_show,cells_in_current_group])
}
) %>%
t() %>%
as.data.frame()
expression_levels_per_group <- expression_levels_per_group[,!duplicated(colnames(expression_levels_per_group))] %>%
mutate(cluster = rownames(.)) %>%
select(cluster, everything()) %>%
pivot_longer(
cols = c(2:ncol(.)),
names_to = 'gene'
) %>%
dplyr::rename(expression = value) %>%
mutate(id_to_merge = paste0(cluster, '_', gene))

percentage_of_cells_expressing_gene <- vapply(
group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) {
cells_in_current_group <- which([email protected]$seurat_cluster == x)
Matrix::rowSums(seurat@assays$SCT@data[genes_to_show,cells_in_current_group] != 0)
}
) %>%
t() %>%
as.data.frame()
percentage_of_cells_expressing_gene <- percentage_of_cells_expressing_gene[,!duplicated(colnames(percentage_of_cells_expressing_gene))] %>%
mutate(cluster = rownames(.)) %>%
select(cluster, everything()) %>%
pivot_longer(
cols = c(2:ncol(.)),
names_to = 'gene'
) %>%
dplyr::rename(cell_count = value) %>%
left_join(
.,
[email protected] %>%
group_by(seurat_clusters) %>%
tally() %>%
dplyr::rename(cluster = seurat_clusters),
by = 'cluster') %>%
mutate(
id_to_merge = paste0(cluster, '_', gene),
percent_cells = cell_count / n
)

p <- left_join(
expression_levels_per_group,
percentage_of_cells_expressing_gene %>% select(id_to_merge, percent_cells),
by = 'id_to_merge'
) %>%
mutate(
cluster = factor(cluster, levels = rev(group_ids)),
gene = factor(gene, levels = genes_to_show)
) %>%
arrange(gene) %>%
ggplot(aes(gene, cluster)) +
geom_point(aes(color = expression, size = percent_cells)) +
scale_color_distiller(
palette = 'Reds',
direction = 1,
name = 'Log-normalised\nexpression',
guide = guide_colorbar(frame.colour = "black", ticks.colour = "black")
) +
scale_size(name = 'Percent\nof cells', labels = scales::percent) +
labs(y = 'Cluster', color = 'Expression') +
coord_fixed() +
theme_bw() +
theme(
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1)
)

p

ggsave('plots/marker_genes_by_cluster_as_dot_plot.png', p, height = 7, width = 8)

14 基因集中的表达量

我们根据细胞类型划分检查GO通路B_CELL_ACTIVATION中基因的表达量

library(msigdbr)

gene_set <- msigdbr(species = 'Homo sapiens', category = 'C5', subcategory = 'BP') %>%
filter(gs_name == 'GOBP_B_CELL_ACTIVATION') %>%
pull(gene_symbol) %>%
unique()

gene_set_present <- gene_set[which(gene_set %in% rownames(seurat@assays$SCT@data))]

head(gene_set_present)
# [1] "ABL1" "ADA" "ADAM17" "ADGRG3" "AHR" "AKAP17A"

14.1 Mean log-expression

temp_labels <- [email protected] %>%
group_by(cell_type_singler_blueprintencode_main) %>%
tally() %>%
dplyr::rename('cell_type' = cell_type_singler_blueprintencode_main)

p <- tibble(
cell_type = [email protected]$cell_type_singler_blueprintencode_main,
mean_expression = Matrix::colMeans(seurat@assays$SCT@data[gene_set_present,])
) %>%
ggplot(aes(x = cell_type, y = mean_expression, fill = cell_type)) +
geom_violin(draw_quantiles = c(0.5), scale = 'area', trim = FALSE) +
geom_text(
data = temp_labels,
aes(
x = cell_type,
y = -Inf,
label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)),
vjust = -1
),
color = 'black', size = 2.8
) +
theme_bw() +
scale_fill_manual(values = custom_colors$discrete) +
scale_x_discrete() +
scale_y_continuous(name = 'Mean log expression', labels = scales::comma, expand = c(0.08, 0)) +
labs(title = "GO term 'B_CELL_ACTIVATION'") +
theme(
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
legend.position = 'none'
)

p

ggsave('plots/gene_set_expression_b_cell_activation_mean.png', p, height = 4, width = 7)

14.2 Module score

seurat <- AddModuleScore(
seurat,
features = list(gene_set_present),
name = 'GO_B_CELL_ACTIVATION',
assay = 'SCT'
)

temp_labels <- [email protected] %>%
group_by(cell_type_singler_blueprintencode_main) %>%
tally()

p <- [email protected] %>%
ggplot(
aes(
x = cell_type_singler_blueprintencode_main,
y = GO_B_CELL_ACTIVATION1,
fill = cell_type_singler_blueprintencode_main
)
) +
geom_violin(draw_quantiles = c(0.5), scale = 'area', trim = FALSE) +
geom_text(
data = temp_labels,
aes(
x = cell_type_singler_blueprintencode_main,
y = -Inf,
label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)),
vjust = -1
),
color = 'black', size = 2.8
) +
theme_bw() +
scale_fill_manual(values = custom_colors$discrete) +
scale_x_discrete() +
scale_y_continuous(name = 'Module score', labels = scales::comma, expand = c(0.08, 0)) +
labs(title = "GO term 'B_CELL_ACTIVATION'") +
theme(
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
legend.position = 'none'
)

p

ggsave('plots/gene_set_expression_b_cell_activation_module_score.png', p, height = 4, width = 7)

15 GSEA富集分析

15.1 GSVA

为了说明原因,我们将使用来自MSigDB的50个Hallmark基因集,并将单核细胞与造血干细胞(HSC;控制)进行比较。感谢Alessandro Raveane的代码。

# function to read GMT file
read_GMT_file <- function(file) {
gmt <- readr::read_delim(
file,
delim = ';',
col_names = c('X1'),
col_types = readr::cols()
)

gene_set_genes <- list()
for ( i in seq_len(nrow(gmt)) )
{
temp_genes <- strsplit(gmt$X1[i], split = '\t')[[1]] %>% unlist()
temp_genes <- temp_genes[3:length(temp_genes)]
gene_set_genes[[i]] <- temp_genes
}
gene_set_loaded <- list(
genesets = gene_set_genes,
geneset.names = lapply(strsplit(gmt$X1, split = '\t'), '[', 1) %>% unlist(),
geneset.description = lapply(
strsplit(gmt$X1, split = '\t'), '[', 2
) %>% unlist()
)

return(gene_set_loaded)
}

# load gene sets from GMT file
gene_sets <- read_GMT_file('h.all.v2023.2.Hs.symbols.gmt')

# set gene set names
names(gene_sets$genesets) <- gene_sets$geneset.names

# get indices of cells which are either HSC or monocytes
cells_to_analyze <- [email protected] %>%
mutate(row_number = row_number()) %>%
filter(grepl(cell_type_singler_blueprintencode_main, pattern = 'HSC|Monocytes')) %>%
arrange(cell_type_singler_blueprintencode_main) %>%
pull(row_number)

# get list of genes unique genes across all gene sets
genes_to_analyze <- gene_sets$genesets %>% unlist() %>% unique()
# filter gene list for those which are present in the data set
genes_to_analyze <- genes_to_analyze[which(genes_to_analyze %in% rownames(seurat@assays$SCT@counts))]

# get expression matrix and reduce it to cells and genes of interest
expression_matrix <- seurat@assays$SCT@counts[ genes_to_analyze , cells_to_analyze] %>% as.matrix()

# perform GSVA
gsva <- GSVA::gsva(
expression_matrix,
gset.idx.list = gene_sets$genesets,
parallel.sz = 1
)

# load limma library for statistical testing
library(limma)

#HSC细胞和Monocytes细胞的差异表达分析
# generate design matrix
design_matrix <- tibble(
control = 1,
test = c(
rep(0, [email protected] %>% filter(cell_type_singler_blueprintencode_main == 'HSC') %>% nrow()),
rep(1, [email protected] %>% filter(cell_type_singler_blueprintencode_main == 'Monocytes') %>% nrow())
)
)

head(design_matrix)
# A tibble: 6 x 2
# control test
#
# 1 1 0
# 2 1 0
# 3 1 0
# 4 1 0
# 5 1 0
# 6 1 0

# fit linear model, followed by empirical Bayes statistics for differential
# enrichment analysis
fit <- lmFit(gsva, design_matrix)
fit <- eBayes(fit)

# prepare data for plotting
data <- topTable(fit, coef = 'test', number = 50) %>%
mutate(gene_set = rownames(fit$t)) %>%
arrange(t) %>%
mutate(
gene_set = factor(gene_set, levels = gene_set),
just = ifelse(t < 0, 0, 1),
nudge_y = ifelse(t < 0, 1, -1),
color = ifelse(t < -5 | t > 5, 'black', 'grey')
)

DataFrame(data)
# DataFrame with 50 rows and 10 columns
# logFC AveExpr t P.Value adj.P.Val B gene_set just nudge_y color
#
# 1 -0.369443 -0.1569690 -35.1619 1.26066e-186 1.05055e-185 415.980 HALLMARK_TGF_BETA_SIGNALING 0 1 black
# 2 -0.251413 -0.0820012 -27.2261 1.96152e-127 8.91598e-127 279.761 HALLMARK_NOTCH_SIGNALING 0 1 black
# 3 -0.288169 -0.1202293 -26.1464 1.40712e-119 5.41201e-119 261.689 HALLMARK_ESTROGEN_RESPONSE_EARLY 0 1 black
# 4 -0.135034 -0.1511146 -22.3565 8.50194e-93 2.36165e-92 200.094 HALLMARK_INTERFERON_ALPHA_RESPONSE 0 1 black
# 5 -0.203651 -0.1049498 -22.0728 7.44006e-91 1.95791e-90 195.628 HALLMARK_INTERFERON_GAMMA_RESPONSE 0 1 black
# ... ... ... ... ... ... ... ... ... ... ...
# 46 0.200259 0.25341594 35.9250 2.31901e-192 2.31901e-191 429.182 HALLMARK_WNT_BETA_CATENIN_SIGNALING 1 -1 black
# 47 0.293113 -0.08115610 36.1995 2.00784e-194 2.50980e-193 433.929 HALLMARK_MITOTIC_SPINDLE 1 -1 black
# 48 0.338449 0.11583774 36.2560 7.56196e-195 1.26033e-193 434.906 HALLMARK_CHOLESTEROL_HOMEOSTASIS 1 -1 black
# 49 0.251678 0.00262136 37.5117 2.86772e-204 7.16930e-203 456.592 HALLMARK_HYPOXIA 1 -1 black
# 50 0.282907 -0.05021404 48.5971 1.72523e-285 8.62616e-284 643.571 HALLMARK_TNFA_SIGNALING_VIA_NFKB 1 -1 black

# plot t-value
p <- ggplot(data = data, aes(x = gene_set, y = t, fill = t)) +
geom_col() +
geom_hline(yintercept = c(-5,5), linetype = 'dashed', color = 'grey80') +
geom_text(
aes(
x = gene_set,
y = 0,
label = gene_set,
hjust = just,
color = color
),
nudge_y = data$nudge_y, size = 3
) +
scale_x_discrete(name = '', labels = NULL) +
scale_y_continuous(name = 't-value', limits = c(-55,55)) +
scale_fill_distiller(palette = 'Spectral', limits = c(-max(data$t), max(data$t))) +
scale_color_manual(values = c('black' = 'black', 'grey' = 'grey')) +
coord_flip() +
theme_bw() +
theme(
panel.grid = element_blank(),
axis.ticks.y = element_blank(),
legend.position = 'none'
)

p

ggsave('plots/gsva_hallmark_gene_sets_monocytes_vs_hsc.png', height = 7.5, width = 7.5)

16 保存结果和Session info

最后我们将seurat对象保存,可以用于后续分析

saveRDS(seurat, 'data/seurat.rds')

保存我们使用的R和R包版本信息,以便后续进行可重复性分析。

devtools::session_info()






文末友情宣传

强烈建议你推荐给身边的博士后以及年轻生物学PI,多一点数据认知,让他们的科研上一个台阶: