前言
前面我们演示了生信工程师Roman Hillje分享的scRNA-seq workflow的部分(
你还缺scRNA-seq的workflow吗?
), 今天继续后面的学习。
Roman Hillje的github主页:
https://github.com/romanhaa
workflow:
https://romanhaa.github.io/projects/scrnaseq_workflow/
12 Perform analysis for Cerebro
为了进一步研究Cerebro https://github.com/romanhaa/Cerebro 中的数据集,我们现在添加了更多的meta data,计算标记基因(marker genes),标记基因的通路富集,并导出数据集。这个过程在很大程度上遵循了cerebroApp网站 https://romanhaa.github.io/cerebroApp/articles/cerebroApp_workflow_Seurat.html 上描述的工作流程。
12.1 Add meta data
load("labels_assinged_seurat.Rdata") load("thresholds.RData") seurat@misc$experiment <- list( experiment_name = 'bone_marrow_donor_cells', organism = 'hg', date_of_analysis = Sys.Date() ) seurat@misc$parameters <- list( gene_nomenclature = 'gene_name', discard_genes_expressed_in_fewer_cells_than = 6, keep_mitochondrial_genes = TRUE, variables_to_regress_out = 'nCount', number_PCs = 15, cluster_resolution = 0.8 ) seurat@misc$parameters$filtering <- list( UMI_min = thresholds_nCount[1], UMI_max = thresholds_nCount[2], genes_min = thresholds_nFeature[1], genes_max = thresholds_nFeature[2], pct_mito_min = thresholds_percent_MT[1], pct_mito_max = thresholds_percent_MT[2] ) #github上的,自己科学上网下吧 if(!require("cerebroApp")){ BiocManager::install('romanhaa/cerebroApp') } library(cerebroApp) seurat@misc$technical_info$cerebroApp_version <- utils::packageVersion('cerebroApp') seurat@misc$technical_info$Seurat <- utils::packageVersion('Seurat') seurat@misc$technical_info <- list( 'R' = capture.output(devtools::session_info()) )
12.2 Calculate relationship trees
这里,我们为所有三个分组变量:样本、聚类和细胞类型计算关系树——就像我们之前为聚类所做的那样。它们中的每一个都被添加到Seurat对象的
@misc
槽中,这样它就可以被提取到Cerebro中。
#Samples: Idents(seurat) <- "sample" seurat <- BuildClusterTree( seurat, dims = 1:15, reorder = FALSE, reorder.numeric = FALSE ) seurat@misc$trees$sample <- seurat@tools$BuildClusterTree #Clusters: Idents(seurat) <- "seurat_clusters" seurat <- BuildClusterTree( seurat, dims = 1:15, reorder = FALSE, reorder.numeric = FALSE ) seurat@misc$trees$seurat_clusters <- seurat@tools$BuildClusterTree #Cell types: Idents(seurat) <- "cell_type_singler_blueprintencode_main" seurat <- BuildClusterTree( seurat, dims = 1:15, reorder = FALSE, reorder.numeric = FALSE ) seurat@misc$trees$cell_type_singler_blueprintencode_main <- seurat@tools$BuildClusterTree
12.3 cerebroApp steps
在这里,我们计算线粒体和核糖体转录本的百分比,得到最多表达的基因和标记基因,使用富集过的通路进行测试,并进行基因集富集分析GSEA。因为cereBroAPP作者提桶跑路,已经不支持seurat5了,所以没办法只能自己改函数,改过的函数存在
cereBroFunction.R
脚本里面
#感觉是cerebroApp不能兼容Seurat5,找不到原来位置的counts(因为v5在layers下面了) #检查源代码发现确实是这样的,没办法想继续用就得自己修改源代码再定义一个函数 #修改过的函数在cereBroFunction_forSeuratV5.R脚本里面,source一下 source("cereBroFunction_forSeuratV5.R") seurat <- addPercentMtRibo( seurat, assay = "RNA", organism = 'hg', gene_nomenclature = 'name' ) seurat <- getMostExpressedGenes( seurat, assay = 'RNA', groups = c('sample','seurat_clusters','cell_type_singler_blueprintencode_main') ) #Marker Gene结果保存在object@misc[["marker_genes"]][["cerebro_seurat"]][[groups[1]]] seurat <- getMarkerGenes( seurat, organism = 'hg', groups = c('sample','seurat_clusters','cell_type_singler_blueprintencode_main'), name = 'cerebro_seurat', only_pos = TRUE ) #富集分析结果存在seurat@misc[["enriched_pathways"]]中 seurat <- cerebroApp::getEnrichedPathways( seurat, marker_genes_input = 'cerebro_seurat', adj_p_cutoff = 0.01, max_terms = 100 ) #GSEA富集分析,做不了,有坑,自己把基因提出来做好了 seurat <- performGeneSetEnrichmentAnalysis( seurat, assay = 'RNA', GMT_file = 'h.all.v2023.2.Hs.symbols.gmt', groups = c('sample','seurat_clusters','cell_type_singler_blueprintencode_main'), thresh_p_val = 0.05, thresh_q_val = 0.1, parallel.sz = 1, verbose = FALSE )
cereBroFunction.R
:
#函数 addPercentMtRibo addPercentMtRibo <- function (object, assay = "RNA" , organism ="hg" ,gene_nomenclature = 'name'
){ calculatePercentGenes<-function (object, assay = "RNA" , genes ,message) { result <- pbapply::pblapply(genes, function (x) { genes_here <- intersect(x, rownames(object@assays[[assay]]$counts)) if (length(genes_here) == 1 ) { object@assays[[assay]]$counts[genes_here, ]/Matrix::colSums(object@assays[[assay]]$counts) print(paste0("Only one " , message,"gene was found" )) } else { Matrix::colSums(object@assays[[assay]]$counts[genes_here, ])/Matrix::colSums(object@assays[[assay]]$counts) } }) } #配置文件中记录了线粒体和核糖体基因 genes_mt <- readr::read_tsv(system.file(paste0("extdata/genes_mt_" , organism, "_" , gene_nomenclature, ".tsv.gz" ), package = "cerebroApp" ), col_types = readr::cols(), col_names = FALSE ) %>% dplyr::select(1 ) %>% t() %>% as.vector() genes_ribo <- readr::read_tsv(system.file(paste0("extdata/genes_ribo_" , organism, "_" , gene_nomenclature, ".tsv.gz" ), package = "cerebroApp" ), col_types = readr::cols(), col_names = FALSE ) %>% dplyr::select(1 ) %>% t() %>% as.vector() genes_mt_here <- intersect(genes_mt, rownames(object@assays[[assay]]$counts)) genes_ribo_here <- intersect(genes_ribo, rownames(object@assays[[assay]]$counts)) values_mt <- calculatePercentGenes(object, assay = assay, message="mitochondrial" , list(genes_mt = genes_mt_here)) values_ribo <- calculatePercentGenes(object, assay = assay, message="ribosomal" , list(genes_ribo = genes_ribo_here)) [email protected] $mt_percent <- as.vector(values_mt[[1 ]]) [email protected] $ribo_percent <- as.vector(values_ribo[[1 ]]) return (object) }#函数 getMostExpressedGenes getMostExpressedGenes<-function (object, assay = "RNA" , groups = NULL ) { for (i in seq_along(groups)) { current_group <- groups[i] if (is.factor([email protected] [[current_group]])) { group_levels <- levels([email protected] [[current_group]]) } else if (is.character([email protected] [[current_group]])) { group_levels <- unique([email protected] [[current_group]]) if (any(is.na(group_levels))) { number_of_cells_without_group_assignment <- [email protected] [[current_group]] %>% is.na() %>% which(. == TRUE ) %>% length() group_levels <- stats::na.omit(group_levels) warning (paste0("Found " , number_of_cells_without_group_assignment, " cell(s) without group assignment (NA) for `" , current_group, "`. These cells will be ignored during the analysis." ), call. = FALSE ) } } if (length(group_levels) == 1 ) { message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] Group `" , current_group, "` contains only one subgroup. Will calculate most expressed genes " , "across all cells of this group..." )) transcripts_counts_per_gene <- Matrix::rowSums(object@assays[[assay]]$counts) total_transcript_count <- sum(transcripts_counts_per_gene) transcripts_percent_per_gene <- transcripts_counts_per_gene/total_transcript_count transcripts_percent_per_gene <- sort(transcripts_percent_per_gene, decreasing = TRUE ) table <- tibble::tibble(group = group_levels, gene = names(transcripts_percent_per_gene)[1 :100 ], pct = transcripts_percent_per_gene[1 :100 ]) most_expressed_genes <- table %>% dplyr::mutate(group = factor(group, levels = group_levels)) %>% dplyr::rename(`:=`(!!current_group, group)) object@misc[["most_expressed_genes" ]][[current_group]] <- most_expressed_genes } else if (length(group_levels) > 1 ) { message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] Get most expressed genes for " , length(group_levels), " groups in `" , current_group, "`..." )) results <- pbapply::pblapply(group_levels, function (x) { cells_of_current_group_level <- rownames([email protected] )[which([email protected] [[current_group]] == x)] transcript_count_matrix <- object@assays[[assay]]$counts[,cells_of_current_group_level] if (is.vector(transcript_count_matrix) == TRUE ) { transcripts_counts_per_gene <- transcript_count_matrix } else { transcripts_counts_per_gene <- Matrix::rowSums(transcript_count_matrix) } total_transcript_count <- sum(transcripts_counts_per_gene) transcripts_percent_per_gene <- transcripts_counts_per_gene/total_transcript_count transcripts_percent_per_gene <- sort(transcripts_percent_per_gene, decreasing = TRUE ) table <- tibble::tibble(group = x, gene = names(transcripts_percent_per_gene)[1 :100 ], pct = transcripts_percent_per_gene[1 :100 ]) }) most_expressed_genes <- do.call(rbind, results) %>% dplyr::mutate(group = factor(group, levels = group_levels)) %>% dplyr::rename(`:=`(!!current_group, group)) object@misc[["most_expressed_genes" ]][[current_group]] <- most_expressed_genes } } return (object) }#函数 getMarkerGenes getMarkerGenes <- function (object, organism = NULL , groups = NULL , name = "cerebro_seurat" , only_pos = TRUE , min_pct = 0.7 , thresh_logFC = 0.25 , thresh_p_val = 0.01 , test = "wilcox" , verbose = TRUE , ... ) { if (is.null(object@misc$marker_genes)) { object@misc$marker_genes <- list() } if (organism %in % c("hg" , "mm" ) == FALSE ) { message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] No information about genes on cell surface because organism is " , "either not specified or not human/mouse." )) } else { if (organism == "hg" || organism == "human" ) { temp_attributes <- "hgnc_symbol" temp_dataset <- "hsapiens_gene_ensembl" } else if (organism == "mm" || organism == "mouse" ) { temp_attributes <- "external_gene_name" temp_dataset <- "mmusculus_gene_ensembl" } attempt <- 1 while (!exists("genes_on_cell_surface" ) && attempt <= 3 ) { try (genes_on_cell_surface <- biomaRt::getBM(attributes = temp_attributes, filters = "go" , values = "GO:0009986" , mart = biomaRt::useMart("ensembl" , dataset = temp_dataset))[, 1 ]) } if (!exists("genes_on_cell_surface" )) { message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] Genes in GO term \"cell surface\" (GO:0009986) could not be " , "retrieved, possibly due to the server not being reachable at the " , "moment." )) } } object@misc$marker_genes[[name]] <- list() for (i in seq_along(groups)) { current_group <- groups[i] if (is.factor([email protected] [[current_group]])) { group_levels <- levels([email protected] [[current_group]]) } else if (is.character([email protected] [[current_group]])) { group_levels <- unique([email protected] [[current_group]]) if (any(is.na(group_levels))) { number_of_cells_without_group_assignment <- [email protected] [[current_group]] %>% is.na() %>% which(. == TRUE ) %>% length() group_levels <- stats::na.omit(group_levels) warning (paste0("Found " , number_of_cells_without_group_assignment, " cell(s) without group assignment (NA) for `" , current_group, "`. These cells will be ignored during the analysis." ), call. = FALSE ) } } if (length(group_levels) == 1 ) { warning (paste0("Only one group level found for group `" , current_group, "`. Will skip this group and proceed to next." ), call. = FALSE ) } else if (length(group_levels) > 1 ) { message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] Get marker genes for " , length(group_levels), " groups in `" , current_group, "`..." )) Seurat::Idents(object) <- current_group results <- Seurat::FindAllMarkers(object, only.pos = only_pos, min.pct = min_pct, logfc.threshold = thresh_logFC, return.thresh = thresh_p_val, test.use = test, verbose = verbose, ... ) if (nrow(results) == 0 ) { message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] No marker genes found for any of the level of `" , current_group, "`." )) results <- "no_markers_found"
} else if (nrow(results) > 0 ) { if (exists("genes_on_cell_surface" ) && "gene" %in % colnames(results)) { results <- results %>% dplyr::mutate(on_cell_surface = .data$gene %in % genes_on_cell_surface) } } if ("cluster" %in % colnames(results)) { results <- results %>% dplyr::rename(`:=`(!!current_group, .data$cluster)) %>% dplyr::select(tidyselect::all_of(current_group), tidyselect::any_of("gene" ), dplyr::everything()) } object@misc[["marker_genes" ]][[name]][[current_group]] <- results } } return (object) }#函数 performGeneSetEnrichmentAnalysis performGeneSetEnrichmentAnalysis<-function (object, assay = "RNA" , GMT_file, groups = NULL , name = "cerebro_GSVA" , thresh_p_val = 0.05 , thresh_q_val = 0.1 , ... ) { message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] Loading gene sets..." )) read_GMT_file<-function (file) { gmt <- readr::read_delim(file, delim = ";" , col_names = c("X1" ), col_types = readr::cols()) gene_set_genes <- list() for (i in seq_len(nrow(gmt))) { temp_genes <- strsplit(gmt$X1[i], split = "\t" )[[1 ]] %>% unlist() temp_genes <- temp_genes[3 :length(temp_genes)] gene_set_genes[[i]] <- temp_genes } gene_set_loaded <- list(genesets = gene_set_genes, geneset.names = lapply(strsplit(gmt$X1, split = "\t" ), "[" , 1 ) %>% unlist(), geneset.description = lapply(strsplit(gmt$X1, split = "\t" ), "[" , 2 ) %>% unlist()) return (gene_set_loaded) } gene_sets <- read_GMT_file(GMT_file) names(gene_sets$genesets) <- gene_sets$geneset.names gene_sets_tibble <- tibble::tibble(name = gene_sets$geneset.names, description = gene_sets$geneset.description, length = NA , genes = NA ) for (i in seq_along(gene_sets$genesets)) { gene_sets_tibble$length[i] <- gene_sets$genesets[[i]] %>% length() gene_sets_tibble$genes[i] <- gene_sets$genesets[[i]] %>% unlist() %>% paste(.data, collapse = "," ) } message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] Loaded " , length(gene_sets$genesets), " gene sets from GMT file." )) expressed_genes <- Matrix::rowSums(object@assays[[assay]]$counts) expressed_genes <- which(expressed_genes != 0 ) message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] Extracting transcript counts " , "from `data` slot of `" , assay, "` assay..." )) matrix_full <- object@assays[[assay]]$counts[expressed_genes, ] object@misc$enriched_pathways[[name]] <- list() for (i in seq_along(groups)) { current_group <- groups[i] if (is.factor([email protected] [[current_group]])) { group_levels <- levels([email protected] [[current_group]]) } else if (is.character([email protected] [[current_group]])) { group_levels <- unique([email protected] [[current_group]]) if (any(is.na(group_levels))) { number_of_cells_without_group_assignment <- [email protected] [[current_group]] %>% is.na() %>% which(. == TRUE ) %>% length() group_levels <- stats::na.omit(group_levels) warning (paste0("Found " , number_of_cells_without_group_assignment, " cell(s) " , "without group assignment (NA) for `" , current_group, "`. These cells will be ignored during the analysis." ), call. = FALSE ) } } message(glue::glue("[" , format(Sys.time(), "%H:%M:%S" ), "] Performing analysis for " , "{length(group_levels)} subgroups of group `{current_group}`..." )) if (length(group_levels) == 1 ) { message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] Only one group level found " , "for group `" , current_group, "`. Will skip this group and proceed " , "to next." ), call. = FALSE ) results <- NULL } else if (length(group_levels) > 1 ) { matrix_mean_by_group <- future.apply::future_sapply(group_levels, USE.NAMES = TRUE , simplify = TRUE , future.globals = FALSE , function (x) { cells_in_this_group <- which([email protected] [[current_group]] == x) if (length(cells_in_this_group) == 1 ) { matrix_full[, cells_in_this_group] } else { Matrix::rowMeans(matrix_full[, cells_in_this_group]) } }) enrichment_scores <- GSVA::gsva(expr = matrix_mean_by_group, gset.idx.list = gene_sets$genesets, ... ) results <- tibble::tibble(group = character(), name = character(), enrichment_score = numeric(), p_value = numeric(), q_value = numeric()) for (i in seq_len(ncol(enrichment_scores))) { temp_results <- tibble::tibble(group = colnames(matrix_mean_by_group)[i], name = rownames(enrichment_scores), enrichment_score = enrichment_scores[, i]) if (nrow(temp_results) < 2 ) { message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] Only 1 gene set " , "available, therefore p- and q-values cannot be calculated." )) temp_results <- temp_results %>% dplyr::mutate(p_value = NA , q_value = NA ) } else if (nrow(temp_results) >= 2 ) { temp_p_values <- stats::pnorm(-abs(scale(temp_results$enrichment_score)[, 1 ])) temp_q_values <- suppressWarnings(qvalue::qvalue(temp_p_values, pi0 = 1 , ties = min)$lfdr) temp_results <- temp_results %>% dplyr::mutate(p_value = temp_p_values, q_value = temp_q_values) %>% dplyr::filter(.data$p_value <= thresh_p_val, .data$q_value <= thresh_q_val) } results <- dplyr::bind_rows(results, temp_results) %>% dplyr::arrange(.data$q_value) } results <- dplyr::left_join(results, gene_sets_tibble, by = "name" ) %>% dplyr::select(c("group" , "name" , "description" , "length" , "genes" , "enrichment_score" , "p_value" , "q_value" )) %>% dplyr::rename(`:=`(!!current_group, .data$group)) group_levels_with_results <- group_levels[group_levels %in % results[[current_group]]] results[[current_group]] <- factor(results[[current_group]], levels = group_levels_with_results) message(glue::glue("[" , format(Sys.time(), "%H:%M:%S" ), "] {nrow(results)} gene sets " , "passed the thresholds across all subgroups of group `{current_group}`." )) if (nrow(results) == 0 ) { results <- "no_gene_sets_enriched" } object@misc[["enriched_pathways" ]][[name]][[current_group]] <- results } } return (object) }
13 单基因的表达
13.1单基因的小提琴图
首先,我们发现用宽度而不是面积来衡量小提琴是很有用的。
其次,我们建议对小提琴进行修剪,以避免小提琴延伸到负尺度,这是没有意义的,因为没有任何负的表达值。
第三,在数据中添加一些噪声可能是有意义的,以避免突出显示刻度底部的不同值,例如log(1), log(2)等。在使用Seurat对表达量进行可视化时,这是默认情况下完成的(甚至没有选项)。
在这种情况下,差异非常细微,如下图所示(红细胞和单核细胞)。
p1 <- [email protected] %>% mutate(expression = seurat@assays$SCT@data['CD69',]) %>% ggplot(aes(x = cell_type_singler_blueprintencode_main, y = expression, fill = cell_type_singler_blueprintencode_main)) + geom_violin(draw_quantiles = c(0.5), scale = 'width', trim = TRUE) + theme_bw() + scale_fill_manual(values = custom_colors$discrete) + scale_x_discrete() + scale_y_continuous(name = 'Log-normalized expression', labels = scales::comma) + labs(title = "CD69 expression (without noise)") + theme( axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), legend.position = 'none' ) p2 <- [email protected] %>% mutate( expression = seurat@assays$SCT@data['CD69',], expression = expression + rnorm(nrow(.))/200 ) %>% ggplot(aes(x = cell_type_singler_blueprintencode_main, y = expression, fill = cell_type_singler_blueprintencode_main)) + geom_violin(draw_quantiles = c(0.5), scale = 'width', trim = TRUE) + theme_bw() + scale_fill_manual(values = custom_colors$discrete) + scale_x_discrete() + scale_y_continuous(name = 'Log-normalized expression', labels = scales::comma) + labs(title = "CD69 expression (with noise)") + theme( axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), legend.position = 'none' ) p1 + p2 + plot_layout(ncol = 1) ggsave( 'plots/expression_cd69_violin_plot.png', p1 + p2 + plot_layout(ncol = 1), height = 6, width = 9 )
13.2 多基因点图
点图可以展示多个基因的表达,通过一个特定的分组变量。接下来的例子中,我们将使用检测到的细胞类型和聚类的标记基因。
13.2.1 根据细胞类型划分
# cells will be grouped by samples that they have been assigned to group_ids <- unique([email protected] $cell_type_singler_blueprintencode_main) # select a set of genes for which we want to show expression genes_to_show <- seurat@misc$marker_genes$cerebro_seurat$cell_type_singler_blueprintencode_main %>% group_by(cell_type_singler_blueprintencode_main) %>% arrange(p_val_adj) %>% filter(row_number() == 1) %>% arrange(cell_type_singler_blueprintencode_main) %>% pull(gene) #这里若没有DC种类的细胞,则 group_ids = group_ids[group_ids!="DC"] # for every sample-gene combination, calculate the average expression across # all cells and then transform the data into a data frame expression_levels_per_group <- vapply( group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) { cells_in_current_group <- which([email protected] $cell_type_singler_blueprintencode_main == x) Matrix::rowMeans(seurat@assays$SCT@data[genes_to_show,cells_in_current_group]) } ) %>% t() %>% as.data.frame() %>% mutate(cell_type_singler_blueprintencode_main = rownames(.)) %>% select(cell_type_singler_blueprintencode_main, everything()) %>% pivot_longer( cols = c(2:ncol(.)), names_to = 'gene' ) %>% dplyr::rename(expression = value) %>% mutate(id_to_merge = paste0(cell_type_singler_blueprintencode_main, '_', gene)) # for every sample-gene combination, calculate the percentage of cells in the # respective group that has at least 1 transcript (this means we consider it # as expressing the gene) and then transform the data into a data frame percentage_of_cells_expressing_gene <- vapply( group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) { cells_in_current_group <- which([email protected] $cell_type_singler_blueprintencode_main == x) Matrix::rowSums(seurat@assays$SCT@data[genes_to_show,cells_in_current_group] != 0) } ) %>% t() %>% as.data.frame() %>% mutate(cell_type_singler_blueprintencode_main = rownames(.)) %>% select(cell_type_singler_blueprintencode_main, everything()) %>% pivot_longer( cols = c(2:ncol(.)), names_to = 'gene' ) %>% dplyr::rename(cell_count = value) %>% left_join( ., [email protected] %>% group_by(cell_type_singler_blueprintencode_main) %>% tally(), by = 'cell_type_singler_blueprintencode_main') %>% mutate( id_to_merge = paste0(cell_type_singler_blueprintencode_main, '_', gene), percent_cells = cell_count / n ) # merge the two data frames created before and plot the data p <- left_join( expression_levels_per_group, percentage_of_cells_expressing_gene %>% select(id_to_merge, percent_cells), by = 'id_to_merge' ) %>% mutate( cell_type_singler_blueprintencode_main = factor(cell_type_singler_blueprintencode_main, levels = rev(group_ids)), gene = factor(gene, levels = genes_to_show) ) %>% arrange(gene) %>% ggplot(aes(gene, cell_type_singler_blueprintencode_main)) + geom_point(aes(color = expression, size = percent_cells)) + scale_color_distiller( palette = 'Reds', direction = 1, name = 'Log-normalised\nexpression', guide = guide_colorbar(frame.colour = "black", ticks.colour = "black") ) + scale_size(name = 'Percent\nof cells', labels = scales::percent) + labs(y = 'Cell type', color = 'Expression') + coord_fixed() + theme_bw() + theme( axis.title = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1) ) p ggsave('plots/marker_genes_by_cell_type_as_dot_plot.png', p, height = 5, width = 6)
13.2.2 根据聚类结果进行划分
group_ids <- levels([email protected] $seurat_clusters) genes_to_show <- seurat@misc$marker_genes$cerebro_seurat$seurat_clusters %>% group_by(seurat_clusters) %>% arrange(p_val_adj) %>% filter(row_number() == 1) %>% arrange(seurat_clusters) %>% pull(gene) genes_to_show <- genes_to_show[!duplicated(genes_to_show)] expression_levels_per_group <- vapply( group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) { cells_in_current_group <- which([email protected] $seurat_clusters == x) Matrix::rowMeans(seurat@assays$SCT@data[genes_to_show,cells_in_current_group]) } ) %>% t() %>% as.data.frame() expression_levels_per_group <- expression_levels_per_group[,!duplicated(colnames(expression_levels_per_group))] %>% mutate(cluster = rownames(.)) %>% select(cluster, everything()) %>% pivot_longer( cols = c(2:ncol(.)), names_to = 'gene' ) %>% dplyr::rename(expression = value) %>% mutate(id_to_merge = paste0(cluster, '_', gene)) percentage_of_cells_expressing_gene <- vapply( group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) { cells_in_current_group <- which([email protected] $seurat_cluster == x) Matrix::rowSums(seurat@assays$SCT@data[genes_to_show,cells_in_current_group] != 0) } ) %>% t() %>% as.data.frame() percentage_of_cells_expressing_gene <- percentage_of_cells_expressing_gene[,!duplicated(colnames(percentage_of_cells_expressing_gene))] %>% mutate(cluster = rownames(.)) %>% select(cluster, everything()) %>% pivot_longer( cols = c(2:ncol(.)), names_to = 'gene' ) %>% dplyr::rename(cell_count = value) %>% left_join( ., [email protected] %>% group_by(seurat_clusters) %>% tally() %>% dplyr::rename(cluster = seurat_clusters), by = 'cluster') %>% mutate( id_to_merge = paste0(cluster, '_', gene), percent_cells = cell_count / n ) p <- left_join( expression_levels_per_group, percentage_of_cells_expressing_gene %>% select(id_to_merge, percent_cells), by = 'id_to_merge' ) %>% mutate( cluster = factor(cluster, levels = rev(group_ids)), gene = factor(gene, levels = genes_to_show) ) %>% arrange(gene) %>% ggplot(aes(gene, cluster)) + geom_point(aes(color = expression, size = percent_cells)) + scale_color_distiller( palette = 'Reds', direction = 1, name = 'Log-normalised\nexpression', guide = guide_colorbar(frame.colour = "black", ticks.colour = "black") ) + scale_size(name = 'Percent\nof cells', labels = scales::percent) + labs(y = 'Cluster', color = 'Expression') + coord_fixed() + theme_bw() + theme( axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1) ) p ggsave('plots/marker_genes_by_cluster_as_dot_plot.png', p, height = 7, width = 8)
14 基因集中的表达量
我们根据细胞类型划分检查GO通路B_CELL_ACTIVATION中基因的表达量
library(msigdbr) gene_set <- msigdbr(species = 'Homo sapiens', category = 'C5', subcategory = 'BP') %>% filter(gs_name == 'GOBP_B_CELL_ACTIVATION') %>% pull(gene_symbol) %>% unique() gene_set_present <- gene_set[which(gene_set %in% rownames(seurat@assays$SCT@data))]