作者用了SCT来整合24个数据集,建议大家换成harmony,否则电脑根本承受不来~
因为斑马鱼的基因格式和我们常用的人或鼠不太一样,所以过滤这一步显得麻烦一些:
list_mitochondrial = c("ENSDARG00000063895" , "ENSDARG00000063899" , "ENSDARG00000063905" , "ENSDARG00000063908" , "ENSDARG00000063910" , "ENSDARG00000063911" , "ENSDARG00000063912" , "ENSDARG00000063914" , "ENSDARG00000063916" , "ENSDARG00000063917" , "ENSDARG00000063921" , "ENSDARG00000063921" , "ENSDARG00000063922" , "ENSDARG00000063924" )for (i in 1 :length(sc_all_datasets)) { try (sc_all_datasets[[i]][["percent.mt" ]] }for (i in 1 :length(sc_all_datasets)) { print(sum(sc_all_datasets[[i]]@meta.data[["percent.mt" ]])) }
ids_list for (i in 1 :length(sc_all_datasets)) { ids ids_list } length(Reduce(intersect,ids_list))#common genes - 6904 sum_cells = 0 for (i in 1 :length(sc_all_datasets)) { cell_num sum_cells = sum_cells + cell_num }#251442 cells before filtering for (i in 1 :length(sc_all_datasets)) { sc_all_datasets[[i]] 200 ) } #250206 cells after filtering
接下来就是harmony整合:
rm(list = ls())source ("step0_lib.R" )library (future)# options(future.globals.maxSize = 1024 * 1024 * 1024) # 设置为1GB # 或者设置为2GB options(future.globals.maxSize = 16 * 1024 ^3 ) # 将最大内存限制增加到5GB #harmony load("./tidydata/backup-before-seurat-integr-11-datasets.RData" ) sc_gsm_merge 1 ]], c(sc_all_datasets[2 :length(sc_all_datasets)]))
如果为了方便,接下来当然直接可以和Jimmy老师的常规单细胞代码衔接起来:
rm(sc_all_datasets)# SCTransform 的内存占用过高,尝试其他内存较低的整合方法 # sc_gsm_merge # DefaultAssay(sc_gsm_merge) # from jimmy sc_gsm_merge normalization.method = "LogNormalize" , scale.factor = 1e4 ) sc_gsm_merge sc_gsm_merge sc_gsm_merge p1 "pca" , group.by = "orig.ident" , pt.size = 0.1 ) p2 "PC_1" , group.by = "orig.ident" , pt.size = 0.1 ) dir.create("./Figures_out" ) plot_grid(p1, p2, ncol = 2 , rel_widths = c(1 , 1.5 )) %>% ggsave(filename = paste("./Figures_out/" , "2_sc_gsm_merge_check_PCA.png" , sep="" ), width = 500 , height = 150 , units = "mm" ) sc_gsm_merge "orig.ident" ) p1 "harmony", group.by = "orig.ident" , pt.size = 0.1 ) p2 "harmony_1", group.by = "orig.ident" , pt.size = 0.1 ) plot_grid(p1, p2, ncol = 2 , rel_widths = c(1 , 1.5 )) %>% ggsave(filename = paste("./Figures_out/" , "2_sc_gsm_integrated_check_harmony.png" , sep="" ), width = 500 , height = 150 , units = "mm" ) figure_dir "./Figures_out/" ElbowPlot(sc_gsm_merge, ndims = 30 , reduction = "harmony" ) %>% ggsave(filename = paste(figure_dir, "2_sc_gsm_integrated_check_harmony_dims_ElbowPlot.png" , sep="" ), width = 150 , height = 150 , units = "mm" ) sc_gsm_merge "harmony", dims = 1 :20 ) p1 "umap", group.by = "orig.ident" , label = F , pt.size = 0.1 ) p1[[1 ]]$layers[[1 ]]$aes_params$alpha = 0.2 p1 %>% ggsave(filename = paste(figure_dir, "2_sc_gsm_integrated_check_UMAP_harmony.png" , sep="" ), width = 150 , height = 130 , units = "mm" ) # display the profile of the integrated dataset # Save pheno.ident, stage.ident and sex.ident p1 "umap", group.by = "orig.ident" , label = F , pt.size = 0.1 ) p1[[1 ]]$layers[[1 ]]$aes_params$alpha = 0.2 p1 %>% ggsave(filename = paste(figure_dir, "2_sc_gsm_integrated_check_harmony_OrigIdent.png" , sep="" ), width = 150 , height = 130 , units = "mm" ) p1 "umap", group.by = "batch.ident" , label = F , pt.size = 0.1 ) p1[[1 ]]$layers[[1 ]]$aes_params$alpha = 0.2 p1 %>% ggsave(filename = paste(figure_dir, "2_sc_gsm_integrated_check_harmony_BatchIdent.png" , sep="" ), width = 150 , height = 130 , units = "mm" ) saveRDS(sc_gsm_merge,file = "./tidydata/harmonied_sc_gsm_merge.rds" )
细胞聚类
rm(list = ls())source ("step0_lib.R" ) sc_gsm_merge "./tidydata/harmonied_sc_gsm_merge.rds" ) sc_gsm_merge "pca", dims = 1 :30 ) sc_gsm_merge 1, verbose = T ) p1 "umap", group.by = "seurat_clusters" , label = T , repel = T , pt.size = 0.1 , raster=FALSE ) p2 "umap", group.by = "orig.ident" , pt.size = 0.1 , raster=FALSE ) plot_grid(p1, p2, ncol = 2 , rel_widths = c(1 , 1.15 )) %>% ggsave(filename = paste("./Figures_out/" , "3_gsm_integrated_harmonyClusters.png" , sep="" ), width = 15 , height = 13 , dpi = 300 ) saveRDS(sc_gsm_merge, paste("./tidydata/" , "sc_gse_after_harmony.rds" , sep = "" ))# CHECK CLUSTER COMPOSITION sc_gse_merge # Check how much samples are represented in each of the clusters cluster_count 0))@meta.data$orig.ident)) # set the structure names(cluster_count)[1 ] "GSM_ID" names(cluster_count)[2 ] "ToRemove" cluster_count$ToRemove 0for (i in levels(sc_gse_merge$seurat_clusters)) { print(paste("cluster " , i, sep = "" )) tmp_df names(tmp_df)[1 ] "GSM_ID" names(tmp_df)[2 ] "Cluster_", i, sep ="" )) cluster_count T, all.y = T ) } rownames(cluster_count) cluster_count 3:length(cluster_count)]for (i in 1 :length(cluster_count)) { for (j in 1 :length(cluster_count[[i]])) { if (is.na(cluster_count[j, i]) == T ) { cluster_count[j, i] 0 } } }library (pheatmap)library (RColorBrewer)# Create the annotations needed for the heatmap (which needs dataframe structures for annotations) tmp_df lines_to_manage for (i in 1 :length(rownames(tmp_df))) { if (tmp_df$Freq[i] == 0 ) { lines_to_manage } } tmp_df rownames(tmp_df) 2] tmp_df 1] colnames(tmp_df) "Batch", "Cell.Count" )# Then draw the pheatmap # You should save the heatmap from RStudio | 1000 x 535 p1 cluster_rows = F , cluster_cols = F , show_rownames = TRUE , color = c("white" , "yellow" , "green" ), breaks = c(0 , 100 , 1000 , 10000 ), # distances 0 to 3 are red, 3 to 9 black main = 'cells distribution' ) p1 write.table(cluster_count,file = "./tidydata/cluster_count.csv" ) # export "cluster_count" as a table
到了这一步,后续的细胞注释也就简单多了,但是鉴于斑马鱼物种的特殊性,还是会在接下来的几周里陆续更完代码,感兴趣的小伙伴可以一起交流~