你还缺scRNA-seq的workflow吗（二）？

生信菜鸟团 · 公众号 · 生物 · 2024-09-17 13:14

正文

前言

前面我们演示了生信工程师Roman Hillje分享的scRNA-seq workflow的部分（你还缺scRNA-seq的workflow吗？）, 今天继续后面的学习。

Roman Hillje的github主页：

https://github.com/romanhaa

workflow：

https://romanhaa.github.io/projects/scrnaseq_workflow/

12 Perform analysis for Cerebro

为了进一步研究Cerebro https://github.com/romanhaa/Cerebro 中的数据集，我们现在添加了更多的meta data，计算标记基因(marker genes)，标记基因的通路富集，并导出数据集。这个过程在很大程度上遵循了cerebroApp网站 https://romanhaa.github.io/cerebroApp/articles/cerebroApp_workflow_Seurat.html 上描述的工作流程。

12.1 Add meta data

load("labels_assinged_seurat.Rdata")
load("thresholds.RData")
seurat@misc$experiment <- list(
  experiment_name = 'bone_marrow_donor_cells',
  organism = 'hg',
  date_of_analysis = Sys.Date()
)

seurat@misc$parameters <- list(
  gene_nomenclature = 'gene_name',
  discard_genes_expressed_in_fewer_cells_than = 6,
  keep_mitochondrial_genes = TRUE,
  variables_to_regress_out = 'nCount',
  number_PCs = 15,
  cluster_resolution = 0.8
)

seurat@misc$parameters$filtering <- list(
  UMI_min = thresholds_nCount[1],
  UMI_max = thresholds_nCount[2],
  genes_min = thresholds_nFeature[1],
  genes_max = thresholds_nFeature[2],
  pct_mito_min = thresholds_percent_MT[1],
  pct_mito_max = thresholds_percent_MT[2]
)

#github上的，自己科学上网下吧
if(!require("cerebroApp")){
  BiocManager::install('romanhaa/cerebroApp')
}
library(cerebroApp)

seurat@misc$technical_info$cerebroApp_version <- utils::packageVersion('cerebroApp')
seurat@misc$technical_info$Seurat <- utils::packageVersion('Seurat')
seurat@misc$technical_info <- list(
  'R' = capture.output(devtools::session_info())
)

12.2 Calculate relationship trees

这里，我们为所有三个分组变量:样本、聚类和细胞类型计算关系树——就像我们之前为聚类所做的那样。它们中的每一个都被添加到Seurat对象的 @misc 槽中，这样它就可以被提取到Cerebro中。

#Samples:

Idents(seurat) <- "sample"
seurat <- BuildClusterTree(
  seurat,
  dims = 1:15,
  reorder = FALSE,
  reorder.numeric = FALSE
)
seurat@misc$trees$sample <- seurat@tools$BuildClusterTree
#Clusters:

Idents(seurat) <- "seurat_clusters"
seurat <- BuildClusterTree(
  seurat,
  dims = 1:15,
  reorder = FALSE,
  reorder.numeric = FALSE
)
seurat@misc$trees$seurat_clusters <- seurat@tools$BuildClusterTree
#Cell types:

Idents(seurat) <- "cell_type_singler_blueprintencode_main"
seurat <- BuildClusterTree(
  seurat,
  dims = 1:15,
  reorder = FALSE,
  reorder.numeric = FALSE
)
seurat@misc$trees$cell_type_singler_blueprintencode_main <- seurat@tools$BuildClusterTree

12.3 cerebroApp steps

在这里，我们计算线粒体和核糖体转录本的百分比，得到最多表达的基因和标记基因，使用富集过的通路进行测试，并进行基因集富集分析GSEA。因为cereBroAPP作者提桶跑路，已经不支持seurat5了，所以没办法只能自己改函数，改过的函数存在 cereBroFunction.R 脚本里面

#感觉是cerebroApp不能兼容Seurat5，找不到原来位置的counts(因为v5在layers下面了)
#检查源代码发现确实是这样的，没办法想继续用就得自己修改源代码再定义一个函数
#修改过的函数在cereBroFunction_forSeuratV5.R脚本里面，source一下
source("cereBroFunction_forSeuratV5.R")
seurat <- addPercentMtRibo(
  seurat,
  assay = "RNA",
  organism = 'hg',
  gene_nomenclature = 'name'
)

seurat <- getMostExpressedGenes(
  seurat,
  assay = 'RNA',
  groups = c('sample','seurat_clusters','cell_type_singler_blueprintencode_main')
)

#Marker Gene结果保存在object@misc[["marker_genes"]][["cerebro_seurat"]][[groups[1]]]
seurat <- getMarkerGenes(
  seurat,
  organism = 'hg',
  groups = c('sample','seurat_clusters','cell_type_singler_blueprintencode_main'),
  name = 'cerebro_seurat',
  only_pos = TRUE
)

#富集分析结果存在seurat@misc[["enriched_pathways"]]中
seurat <- cerebroApp::getEnrichedPathways(
  seurat,
  marker_genes_input = 'cerebro_seurat',
  adj_p_cutoff = 0.01,
  max_terms = 100
)

#GSEA富集分析,做不了，有坑，自己把基因提出来做好了
 seurat <- performGeneSetEnrichmentAnalysis(
   seurat,
   assay = 'RNA',
   GMT_file = 'h.all.v2023.2.Hs.symbols.gmt',
   groups = c('sample','seurat_clusters','cell_type_singler_blueprintencode_main'),
   thresh_p_val = 0.05,
   thresh_q_val = 0.1,
   parallel.sz = 1,
   verbose = FALSE
 )

cereBroFunction.R :

#函数 addPercentMtRibo
addPercentMtRibo <- function(object, assay = "RNA", organism ="hg" ,gene_nomenclature = 'name'




    
){
  
  calculatePercentGenes<-function (object, assay = "RNA", genes ,message) {
    result <- pbapply::pblapply(genes, function(x) {
      genes_here <- intersect(x, rownames(object@assays[[assay]]$counts))
      if (length(genes_here) == 1) {
        object@assays[[assay]]$counts[genes_here, ]/Matrix::colSums(object@assays[[assay]]$counts)
        print(paste0("Only one ", message,"gene was found"))
      }
      else {
        Matrix::colSums(object@assays[[assay]]$counts[genes_here, 
        ])/Matrix::colSums(object@assays[[assay]]$counts)
      }
    })
  }
  
  #配置文件中记录了线粒体和核糖体基因
  genes_mt <- readr::read_tsv(system.file(paste0("extdata/genes_mt_", 
                                                 organism, "_", gene_nomenclature, ".tsv.gz"), package = "cerebroApp"), 
                              col_types = readr::cols(), col_names = FALSE) %>% dplyr::select(1) %>% 
    t() %>% as.vector()
  genes_ribo <- readr::read_tsv(system.file(paste0("extdata/genes_ribo_", 
                                                   organism, "_", gene_nomenclature, ".tsv.gz"), package = "cerebroApp"), 
                                col_types = readr::cols(), col_names = FALSE) %>% dplyr::select(1) %>% 
    t() %>% as.vector()
  
  genes_mt_here <- intersect(genes_mt, rownames(object@assays[[assay]]$counts))
  genes_ribo_here <- intersect(genes_ribo, rownames(object@assays[[assay]]$counts))
  
  values_mt <- calculatePercentGenes(object, assay = assay, message="mitochondrial", list(genes_mt = genes_mt_here))
  values_ribo <- calculatePercentGenes(object, assay = assay, message="ribosomal", list(genes_ribo = genes_ribo_here))
  
  [email protected]$mt_percent <- as.vector(values_mt[[1]])
  [email protected]$ribo_percent <- as.vector(values_ribo[[1]])
  
  return(object)
}

#函数 getMostExpressedGenes
getMostExpressedGenes<-function (object, assay = "RNA", groups = NULL) {
  for (i in seq_along(groups)) {
    current_group <- groups[i]
    if (is.factor([email protected][[current_group]])) {
      group_levels <- levels([email protected][[current_group]])
    }
    else if (is.character([email protected][[current_group]])) {
      group_levels <- unique([email protected][[current_group]])
      if (any(is.na(group_levels))) {
        number_of_cells_without_group_assignment <- [email protected][[current_group]] %>% 
          is.na() %>% which(. == TRUE) %>% length()
        group_levels <- stats::na.omit(group_levels)
        warning(paste0("Found ", number_of_cells_without_group_assignment, 
                       " cell(s) without group assignment  (NA) for `", 
                       current_group, "`. These cells will be ignored during the analysis."), 
                call. = FALSE)
      }
    }
    if (length(group_levels) == 1) {
      message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                     "] Group `", current_group, "` contains only one subgroup. Will calculate most expressed genes ", 
                     "across all cells of this group..."))
      transcripts_counts_per_gene <- Matrix::rowSums(object@assays[[assay]]$counts)
      total_transcript_count <- sum(transcripts_counts_per_gene)
      transcripts_percent_per_gene <- transcripts_counts_per_gene/total_transcript_count
      transcripts_percent_per_gene <- sort(transcripts_percent_per_gene, 
                                           decreasing = TRUE)
      table <- tibble::tibble(group = group_levels, gene = names(transcripts_percent_per_gene)[1:100], 
                              pct = transcripts_percent_per_gene[1:100])
      most_expressed_genes <- table %>% dplyr::mutate(group = factor(group, 
                                                                     levels = group_levels)) %>% dplyr::rename(`:=`(!!current_group, 
                                                                                                                    group))
      object@misc[["most_expressed_genes"]][[current_group]] <- most_expressed_genes
    }
    else if (length(group_levels) > 1) {
      message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                     "] Get most expressed genes for ", length(group_levels), 
                     " groups in `", current_group, "`..."))
      results <- pbapply::pblapply(group_levels, function(x) {
        cells_of_current_group_level <- rownames([email protected])[which([email protected][[current_group]] == 
                                                                           x)]
        transcript_count_matrix <- object@assays[[assay]]$counts[,cells_of_current_group_level]
        if (is.vector(transcript_count_matrix) == TRUE) {
          transcripts_counts_per_gene <- transcript_count_matrix
        }
        else {
          transcripts_counts_per_gene <- Matrix::rowSums(transcript_count_matrix)
        }
        total_transcript_count <- sum(transcripts_counts_per_gene)
        transcripts_percent_per_gene <- transcripts_counts_per_gene/total_transcript_count
        transcripts_percent_per_gene <- sort(transcripts_percent_per_gene, 
                                             decreasing = TRUE)
        table <- tibble::tibble(group = x, gene = names(transcripts_percent_per_gene)[1:100], 
                                pct = transcripts_percent_per_gene[1:100])
      })
      most_expressed_genes <- do.call(rbind, results) %>% 
        dplyr::mutate(group = factor(group, levels = group_levels)) %>% 
        dplyr::rename(`:=`(!!current_group, group))
      object@misc[["most_expressed_genes"]][[current_group]] <- most_expressed_genes
    }
  }
  return(object)
}

#函数 getMarkerGenes
getMarkerGenes <- function (object, organism = NULL, groups = NULL, 
          name = "cerebro_seurat", only_pos = TRUE, min_pct = 0.7, 
          thresh_logFC = 0.25, thresh_p_val = 0.01, test = "wilcox", 
          verbose = TRUE, ...) {
  if (is.null(object@misc$marker_genes)) {
    object@misc$marker_genes <- list()
  }
  if (organism %in% c("hg", "mm") == FALSE) {
    message(paste0("[", format(Sys.time(), "%H:%M:%S"), "] No information about genes on cell surface because organism is ", 
                   "either not specified or not human/mouse."))
  }
  else {
    if (organism == "hg" || organism == "human") {
      temp_attributes <- "hgnc_symbol"
      temp_dataset <- "hsapiens_gene_ensembl"
    }
    else if (organism == "mm" || organism == "mouse") {
      temp_attributes <- "external_gene_name"
      temp_dataset <- "mmusculus_gene_ensembl"
    }
    attempt <- 1
    while (!exists("genes_on_cell_surface") && attempt <= 
           3) {
      try(genes_on_cell_surface <- biomaRt::getBM(attributes = temp_attributes, 
                                                  filters = "go", values = "GO:0009986", mart = biomaRt::useMart("ensembl", 
                                                                                                                 dataset = temp_dataset))[, 1])
    }
    if (!exists("genes_on_cell_surface")) {
      message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                     "] Genes in GO term \"cell surface\" (GO:0009986) could not be ", 
                     "retrieved, possibly due to the server not being reachable at the ", 
                     "moment."))
    }
  }
  object@misc$marker_genes[[name]] <- list()
  for (i in seq_along(groups)) {
    current_group <- groups[i]
    if (is.factor([email protected][[current_group]])) {
      group_levels <- levels([email protected][[current_group]])
    }
    else if (is.character([email protected][[current_group]])) {
      group_levels <- unique([email protected][[current_group]])
      if (any(is.na(group_levels))) {
        number_of_cells_without_group_assignment <- [email protected][[current_group]] %>% 
          is.na() %>% which(. == TRUE) %>% length()
        group_levels <- stats::na.omit(group_levels)
        warning(paste0("Found ", number_of_cells_without_group_assignment, 
                       " cell(s) without group assignment (NA) for `", 
                       current_group, "`. These cells will be ignored during the analysis."), 
                call. = FALSE)
      }
    }
    if (length(group_levels) == 1) {
      warning(paste0("Only one group level found for group `", 
                     current_group, "`. Will skip this group and proceed to next."), 
              call. = FALSE)
    }
    else if (length(group_levels) > 1) {
      message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                     "] Get marker genes for ", length(group_levels), 
                     " groups in `", current_group, "`..."))
      Seurat::Idents(object) <- current_group
      results <- Seurat::FindAllMarkers(object, 
                                        only.pos = only_pos, min.pct = min_pct, logfc.threshold = thresh_logFC, 
                                        return.thresh = thresh_p_val, test.use = test, 
                                        verbose = verbose, ...)
      if (nrow(results) == 0) {
        message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                       "] No marker genes found for any of the level of `", 
                       current_group, "`."))
        results <- "no_markers_found"




    

      }
      else if (nrow(results) > 0) {
        if (exists("genes_on_cell_surface") && "gene" %in% 
            colnames(results)) {
          results <- results %>% dplyr::mutate(on_cell_surface = .data$gene %in% 
                                                 genes_on_cell_surface)
        }
      }
      if ("cluster" %in% colnames(results)) {
        results <- results %>% dplyr::rename(`:=`(!!current_group, 
                                                  .data$cluster)) %>% dplyr::select(tidyselect::all_of(current_group), 
                                                                                    tidyselect::any_of("gene"), dplyr::everything())
      }
      object@misc[["marker_genes"]][[name]][[current_group]] <- results
    }
  }
  return(object)
}

#函数 performGeneSetEnrichmentAnalysis
performGeneSetEnrichmentAnalysis<-function (object, assay = "RNA", GMT_file, 
                                            groups = NULL, name = "cerebro_GSVA", 
                                            thresh_p_val = 0.05, thresh_q_val = 0.1, ...) {
  
  message(paste0("[", format(Sys.time(), "%H:%M:%S"), "] Loading gene sets..."))
  read_GMT_file<-function (file) 
  {
    gmt <- readr::read_delim(file, delim = ";", col_names = c("X1"), 
                             col_types = readr::cols())
    gene_set_genes <- list()
    for (i in seq_len(nrow(gmt))) {
      temp_genes <- strsplit(gmt$X1[i], split = "\t")[[1]] %>% 
        unlist()
      temp_genes <- temp_genes[3:length(temp_genes)]
      gene_set_genes[[i]] <- temp_genes
    }
    gene_set_loaded <- list(genesets = gene_set_genes, geneset.names = lapply(strsplit(gmt$X1, 
                                                                                       split = "\t"), "[", 1) %>% unlist(), geneset.description = lapply(strsplit(gmt$X1, 
                                                                                                                                                                  split = "\t"), "[", 2) %>% unlist())
    return(gene_set_loaded)
  }
  gene_sets <- read_GMT_file(GMT_file)
  names(gene_sets$genesets) <- gene_sets$geneset.names
  gene_sets_tibble <- tibble::tibble(name = gene_sets$geneset.names, 
                                     description = gene_sets$geneset.description, length = NA, 
                                     genes = NA)
  for (i in seq_along(gene_sets$genesets)) {
    gene_sets_tibble$length[i] <- gene_sets$genesets[[i]] %>% 
      length()
    gene_sets_tibble$genes[i] <- gene_sets$genesets[[i]] %>% 
      unlist() %>% paste(.data, collapse = ",")
  }
  message(paste0("[", format(Sys.time(), "%H:%M:%S"), "] Loaded ", 
                 length(gene_sets$genesets), " gene sets from GMT file."))
  expressed_genes <- Matrix::rowSums(object@assays[[assay]]$counts)
  expressed_genes <- which(expressed_genes != 0)
  message(paste0("[", format(Sys.time(), "%H:%M:%S"), "] Extracting transcript counts ", 
                 "from `data` slot of `", assay, "` assay..."))
  matrix_full <- object@assays[[assay]]$counts[expressed_genes, 
  ]
  object@misc$enriched_pathways[[name]] <- list()
  for (i in seq_along(groups)) {
    current_group <- groups[i]
    if (is.factor([email protected][[current_group]])) {
      group_levels <- levels([email protected][[current_group]])
    }
    else if (is.character([email protected][[current_group]])) {
      group_levels <- unique([email protected][[current_group]])
      if (any(is.na(group_levels))) {
        number_of_cells_without_group_assignment <- [email protected][[current_group]] %>% 
          is.na() %>% which(. == TRUE) %>% length()
        group_levels <- stats::na.omit(group_levels)
        warning(paste0("Found ", number_of_cells_without_group_assignment, 
                       " cell(s) ", "without group assignment (NA) for `", 
                       current_group, "`. These cells will be ignored during the analysis."), 
                call. = FALSE)
      }
    }
    message(glue::glue("[", format(Sys.time(), "%H:%M:%S"), 
                       "] Performing analysis for ", "{length(group_levels)} subgroups of group `{current_group}`..."))
    if (length(group_levels) == 1) {
      message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                     "] Only one group level found ", "for group `", 
                     current_group, "`. Will skip this group and proceed ", 
                     "to next."), call. = FALSE)
      results <- NULL
    }
    else if (length(group_levels) > 1) {
      matrix_mean_by_group <- future.apply::future_sapply(group_levels, 
                                                          USE.NAMES = TRUE, simplify = TRUE, future.globals = FALSE, 
                                                          function(x) {
                                                            cells_in_this_group <- which([email protected][[current_group]] == 
                                                                                           x)
                                                            if (length(cells_in_this_group) == 1) {
                                                              matrix_full[, cells_in_this_group]
                                                            }
                                                            else {
                                                              Matrix::rowMeans(matrix_full[, cells_in_this_group])
                                                            }
                                                          })
      enrichment_scores <- GSVA::gsva(expr = matrix_mean_by_group, 
                                      gset.idx.list = gene_sets$genesets, ...)
      results <- tibble::tibble(group = character(), name = character(), 
                                enrichment_score = numeric(), p_value = numeric(), 
                                q_value = numeric())
      for (i in seq_len(ncol(enrichment_scores))) {
        temp_results <- tibble::tibble(group = colnames(matrix_mean_by_group)[i], 
                                       name = rownames(enrichment_scores), enrichment_score = enrichment_scores[, 
                                                                                                                i])
        if (nrow(temp_results) < 2) {
          message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                         "] Only 1 gene set ", "available, therefore p- and q-values cannot be calculated."))
          temp_results <- temp_results %>% dplyr::mutate(p_value = NA, 
                                                         q_value = NA)
        }
        else if (nrow(temp_results) >= 2) {
          temp_p_values <- stats::pnorm(-abs(scale(temp_results$enrichment_score)[, 
                                                                                  1]))
          temp_q_values <- suppressWarnings(qvalue::qvalue(temp_p_values, 
                                                           pi0 = 1, ties = min)$lfdr)
          temp_results <- temp_results %>% dplyr::mutate(p_value = temp_p_values, 
                                                         q_value = temp_q_values) %>% dplyr::filter(.data$p_value <= 
                                                                                                      thresh_p_val, .data$q_value <= thresh_q_val)
        }
        results <- dplyr::bind_rows(results, temp_results) %>% 
          dplyr::arrange(.data$q_value)
      }
      results <- dplyr::left_join(results, gene_sets_tibble, 
                                  by = "name") %>% dplyr::select(c("group", "name", 
                                                                   "description", "length", "genes", "enrichment_score", 
                                                                   "p_value", "q_value")) %>% dplyr::rename(`:=`(!!current_group, 
                                                                                                                 .data$group))
      group_levels_with_results <- group_levels[group_levels %in% 
                                                  results[[current_group]]]
      results[[current_group]] <- factor(results[[current_group]], 
                                         levels = group_levels_with_results)
      message(glue::glue("[", format(Sys.time(), "%H:%M:%S"), 
                         "] {nrow(results)} gene sets ", "passed the thresholds across all subgroups of group `{current_group}`."))
      if (nrow(results) == 0) {
        results <- "no_gene_sets_enriched"
      }
      object@misc[["enriched_pathways"]][[name]][[current_group]] <- results
    }
  }
  return(object)
}

13 单基因的表达

13.1单基因的小提琴图

首先，我们发现用宽度而不是面积来衡量小提琴是很有用的。
其次，我们建议对小提琴进行修剪，以避免小提琴延伸到负尺度，这是没有意义的，因为没有任何负的表达值。
第三，在数据中添加一些噪声可能是有意义的，以避免突出显示刻度底部的不同值，例如log(1)， log(2)等。在使用Seurat对表达量进行可视化时，这是默认情况下完成的(甚至没有选项)。

在这种情况下，差异非常细微，如下图所示(红细胞和单核细胞)。

p1 <- [email protected] %>%
  mutate(expression = seurat@assays$SCT@data['CD69',]) %>%
  ggplot(aes(x = cell_type_singler_blueprintencode_main, y = expression, fill = cell_type_singler_blueprintencode_main)) +
  geom_violin(draw_quantiles = c(0.5), scale = 'width', trim = TRUE) +
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_discrete() +
  scale_y_continuous(name = 'Log-normalized expression', labels = scales::comma) +
  labs(title = "CD69 expression (without noise)") +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = 'none'
  )

p2 <- [email protected] %>%
  mutate(
    expression = seurat@assays$SCT@data['CD69',],
    expression = expression + rnorm(nrow(.))/200
  ) %>%
  ggplot(aes(x = cell_type_singler_blueprintencode_main, y = expression, fill = cell_type_singler_blueprintencode_main)) +
  geom_violin(draw_quantiles = c(0.5), scale = 'width', trim = TRUE) +
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_discrete() +
  scale_y_continuous(name = 'Log-normalized expression', labels = scales::comma) +
  labs(title = "CD69 expression (with noise)") +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = 'none'
  )

p1 + p2 + plot_layout(ncol = 1)

ggsave(
  'plots/expression_cd69_violin_plot.png',
  p1 + p2 + plot_layout(ncol = 1),
  height = 6, width = 9
)

13.2 多基因点图

点图可以展示多个基因的表达，通过一个特定的分组变量。接下来的例子中，我们将使用检测到的细胞类型和聚类的标记基因。

13.2.1 根据细胞类型划分

# cells will be grouped by samples that they have been assigned to
group_ids <- unique([email protected]$cell_type_singler_blueprintencode_main)

# select a set of genes for which we want to show expression
genes_to_show <- seurat@misc$marker_genes$cerebro_seurat$cell_type_singler_blueprintencode_main %>%
  group_by(cell_type_singler_blueprintencode_main) %>%
  arrange(p_val_adj) %>%
  filter(row_number() == 1) %>%
  arrange(cell_type_singler_blueprintencode_main) %>%
  pull(gene)


#这里若没有DC种类的细胞，则
group_ids = group_ids[group_ids!="DC"]
# for every sample-gene combination, calculate the average expression across
# all cells and then transform the data into a data frame
expression_levels_per_group <- vapply(
    group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) {
      cells_in_current_group <- which([email protected]$cell_type_singler_blueprintencode_main == x)
      Matrix::rowMeans(seurat@assays$SCT@data[genes_to_show,cells_in_current_group])
    }
  ) %>%
  t() %>%
  as.data.frame() %>%
  mutate(cell_type_singler_blueprintencode_main = rownames(.)) %>%
  select(cell_type_singler_blueprintencode_main, everything()) %>%
  pivot_longer(
    cols = c(2:ncol(.)),
    names_to = 'gene'
  ) %>%
  dplyr::rename(expression = value) %>%
  mutate(id_to_merge = paste0(cell_type_singler_blueprintencode_main, '_', gene))

# for every sample-gene combination, calculate the percentage of cells in the
# respective group that has at least 1 transcript (this means we consider it
# as expressing the gene) and then transform the data into a data frame
percentage_of_cells_expressing_gene <- vapply(
    group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) {
      cells_in_current_group <- which([email protected]$cell_type_singler_blueprintencode_main == x)
      Matrix::rowSums(seurat@assays$SCT@data[genes_to_show,cells_in_current_group] != 0)
    }
  ) %>%
  t() %>%
  as.data.frame() %>%
  mutate(cell_type_singler_blueprintencode_main = rownames(.)) %>%
  select(cell_type_singler_blueprintencode_main, everything()) %>%
  pivot_longer(
    cols = c(2:ncol(.)),
    names_to = 'gene'
  ) %>%
  dplyr::rename(cell_count = value) %>%
  left_join(
    .,
    [email protected] %>%
      group_by(cell_type_singler_blueprintencode_main) %>%
      tally(),
    by = 'cell_type_singler_blueprintencode_main') %>%
  mutate(
    id_to_merge = paste0(cell_type_singler_blueprintencode_main, '_', gene),
    percent_cells = cell_count / n
  )

# merge the two data frames created before and plot the data
p <- left_join(
    expression_levels_per_group,
    percentage_of_cells_expressing_gene %>% select(id_to_merge, percent_cells),
    by = 'id_to_merge'
  ) %>%
  mutate(
    cell_type_singler_blueprintencode_main = factor(cell_type_singler_blueprintencode_main, levels = rev(group_ids)),
    gene = factor(gene, levels = genes_to_show)
  ) %>%
  arrange(gene) %>%
  ggplot(aes(gene, cell_type_singler_blueprintencode_main)) +
  geom_point(aes(color = expression, size = percent_cells)) +
  scale_color_distiller(
    palette = 'Reds',
    direction = 1,
    name = 'Log-normalised\nexpression',
    guide = guide_colorbar(frame.colour = "black", ticks.colour = "black")
  ) +
  scale_size(name = 'Percent\nof cells', labels = scales::percent) +
  labs(y = 'Cell type', color = 'Expression') +
  coord_fixed() +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

p

ggsave('plots/marker_genes_by_cell_type_as_dot_plot.png', p, height = 5, width = 6)

13.2.2 根据聚类结果进行划分

group_ids <- levels([email protected]$seurat_clusters)

genes_to_show <- seurat@misc$marker_genes$cerebro_seurat$seurat_clusters %>%
  group_by(seurat_clusters) %>%
  arrange(p_val_adj) %>%
  filter(row_number() == 1) %>%
  arrange(seurat_clusters) %>%
  pull(gene)
genes_to_show <- genes_to_show[!duplicated(genes_to_show)]

expression_levels_per_group <- vapply(
    group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) {
      cells_in_current_group <- which([email protected]$seurat_clusters == x)
      Matrix::rowMeans(seurat@assays$SCT@data[genes_to_show,cells_in_current_group])
    }
  ) %>%
  t() %>%
  as.data.frame()
expression_levels_per_group <- expression_levels_per_group[,!duplicated(colnames(expression_levels_per_group))] %>%
  mutate(cluster = rownames(.)) %>%
  select(cluster, everything()) %>%
  pivot_longer(
    cols = c(2:ncol(.)),
    names_to = 'gene'
  ) %>%
  dplyr::rename(expression = value) %>%
  mutate(id_to_merge = paste0(cluster, '_', gene))

percentage_of_cells_expressing_gene <- vapply(
    group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) {
      cells_in_current_group <- which([email protected]$seurat_cluster == x)
      Matrix::rowSums(seurat@assays$SCT@data[genes_to_show,cells_in_current_group] != 0)
    }
  ) %>%
  t() %>%
  as.data.frame() 
percentage_of_cells_expressing_gene <- percentage_of_cells_expressing_gene[,!duplicated(colnames(percentage_of_cells_expressing_gene))] %>%
  mutate(cluster = rownames(.)) %>%
  select(cluster, everything()) %>%
  pivot_longer(
    cols = c(2:ncol(.)),
    names_to = 'gene'
  ) %>%
  dplyr::rename(cell_count = value) %>%
  left_join(
    .,
    [email protected] %>%
      group_by(seurat_clusters) %>%
      tally() %>%
      dplyr::rename(cluster = seurat_clusters),
    by = 'cluster') %>%
  mutate(
    id_to_merge = paste0(cluster, '_', gene),
    percent_cells = cell_count / n
  )

p <- left_join(
    expression_levels_per_group,
    percentage_of_cells_expressing_gene %>% select(id_to_merge, percent_cells),
    by = 'id_to_merge'
  ) %>%
  mutate(
    cluster = factor(cluster, levels = rev(group_ids)),
    gene = factor(gene, levels = genes_to_show)
  ) %>%
  arrange(gene) %>%
  ggplot(aes(gene, cluster)) +
  geom_point(aes(color = expression, size = percent_cells)) +
  scale_color_distiller(
    palette = 'Reds',
    direction = 1,
    name = 'Log-normalised\nexpression',
    guide = guide_colorbar(frame.colour = "black", ticks.colour = "black")
  ) +
  scale_size(name = 'Percent\nof cells', labels = scales::percent) +
  labs(y = 'Cluster', color = 'Expression') +
  coord_fixed() +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

p

ggsave('plots/marker_genes_by_cluster_as_dot_plot.png', p, height = 7, width = 8)

14 基因集中的表达量

我们根据细胞类型划分检查GO通路B_CELL_ACTIVATION中基因的表达量

library(msigdbr)

gene_set <- msigdbr(species = 'Homo sapiens', category = 'C5', subcategory = 'BP') %>%
  filter(gs_name == 'GOBP_B_CELL_ACTIVATION') %>%
  pull(gene_symbol) %>%
  unique()

gene_set_present <- gene_set[which(gene_set %in% rownames(seurat@assays$SCT@data))]