4 - Classifying cell types from original data using metacell

Oct 15, 2020
In this section we will perform the cell type classification using metacell package:
## Loading R packages
library(Seurat)
library(future)
library(metacell)
library(ggpubr)
library(DropletUtils)
library(pheatmap)

## Convert an Seurat object to 10x format to input in metacell
data <- readRDS("/Users/biagi/PhD/Adipocyte/output/10x/10x_Processed.rds")
write10xCounts(x = data@assays$RNA@counts, path = "/Users/biagi/PhD/Adipocyte/output/10x/metacell/data")


## Initializing a database
if(!dir.exists("/Users/biagi/PhD/Adipocyte/output/10x/metacell/db")) dir.create("/Users/biagi/PhD/Adipocyte/output/10x/metacell/db")
scdb_init("/Users/biagi/PhD/Adipocyte/output/10x/metacell/db", force_reinit = TRUE)

## Loading the data generated by the 10x conversion above
mcell_import_scmat_10x("test1", base_dir = "/Users/biagi/PhD/Adipocyte/output/10x/metacell/data")
mat <- scdb_mat("test1")

## Linking to a figure directory
if(!dir.exists("/Users/biagi/PhD/Adipocyte/output/10x/metacell/figs")) dir.create("/Users/biagi/PhD/Adipocyte/output/10x/metacell/figs")
scfigs_init("/Users/biagi/PhD/Adipocyte/output/10x/metacell/figs")

## Plotting the distribution of UMI count per cell
mcell_plot_umis_per_cell("test1")

## Cleaning the data using a list of mitochondrial genes that typically mark cells as being stressed or dying, as well as immunoglobulin genes that may represent strong clonal signatures in plasma cells
mat <- scdb_mat("test1")
nms <- c(rownames(mat@mat), rownames(mat@ignore_gmat))
ig_genes <- c(grep("^Igj", nms, v = TRUE), 
             grep("^Igh",nms,v = TRUE),
             grep("^Igk", nms, v = TRUE), 
             grep("^Igl", nms, v = TRUE))
bad_genes <- unique(c(grep("^mt-", nms, v = TRUE), grep("^mtmr", nms, v = TRUE), grep("^Mtnd", nms, v = TRUE),"NEAT1","TMSB4X", "TMSB10", ig_genes))
mcell_mat_ignore_genes(new_mat_id = "test1", mat_id = "test1", bad_genes, reverse = FALSE)

## Eliminating cells with less than 700 UMIs
mcell_mat_ignore_small_cells("test1", "test1", 700)

## Computing statistics on the distributions of each gene in the data, which are going to be our main tool for selecting feature genes
mcell_add_gene_stat(gstat_id = "test1", mat_id = "test1", force = TRUE)

## We create a new object of type gset, to which all genes whose scaled variance (variance divided by mean) exceeds a given threshold are added:
mcell_gset_filter_varmean(gset_id = "test_feats", gstat_id = "test1", T_vm = 0.08, force_new = TRUE)
mcell_gset_filter_cov(gset_id = "test_feats", gstat_id = "test1", T_tot = 100, T_top3 = 2)

## Plotting all genes and our selected gene set given the mean and variance statistics
mcell_plot_gstats(gstat_id = "test1", gset_id = "test_feats")

## Creating a similarity graph, using a construction called balanced K-nn graph
mcell_add_cgraph_from_mat_bknn(mat_id = "test1", 
                               gset_id = "test_feats", 
                               graph_id = "test_graph",
                               K = 150,
                               dsamp = FALSE)

## Use the cgraph to sample 1000 metacell partitions, each covering 75% of the cells and organizing them in dense subgraphs
mcell_coclust_from_graph_resamp(
  coc_id = "test_coc500",  
  graph_id = "test_graph",
  min_mc_size = 20, 
  p_resamp = 0.75, n_resamp = 1000)

## The co-clustering statistics are used to generate a new similarity graph, based on which accurate calling of the final set of metacells
mcell_mc_from_coclust_balanced(
  coc_id = "test_coc500", 
  mat_id = "test1",
  mc_id = "test_mc", 
  K = 20, min_mc_size = 20, alpha = 2)

## Splitting metacells whose underlying similarity structure supports the existence of multiple sub-clusters, and removes outlier cells that strongly deviate from their metacell's expression profile
mcell_plot_outlier_heatmap(mc_id = "test_mc", mat_id = "test1", T_lfc = 3)
mcell_mc_split_filt(new_mc_id = "test_mc_f", 
                    mc_id = "test_mc", 
                    mat_id = "test1",
                    T_lfc = 3, plot_mats = FALSE)

## The filtered metacell object test_mc_f can now be visualized. In order to do this effectively, we usually go through one or two iterations of selecting informative marker genes. The package can select markers for you automatically - by simply looking for genes that are strongly enriched in any of the metacells
mcell_gset_from_mc_markers(gset_id = "test_markers", mc_id = "test_mc_f")


## Running 1st Round with known markers
load("/Users/biagi/PhD/Adipocyte/output/10x/metacell/db/mc.test_mc_f.Rda")
lfp <- log2(object@mc_fp)

marks_colors <- NULL
marks_colors <- rbind(marks_colors, c("Adipocyte", "Adrb3", "blue", 1, 2))
marks_colors <- rbind(marks_colors, c("Endothelial", "Pecam1", "green", 1, 1))
marks_colors <- rbind(marks_colors, c("Immune_1", "Ptprc", "#ff748c", 1, 0.5))
marks_colors <- rbind(marks_colors, c("Immune_2", "Cd19", "#ff8fa3", 1, 0.5))
marks_colors <- rbind(marks_colors, c("Progenitor_1", "Cd34", "#ffa500", 1, 2))
marks_colors <- rbind(marks_colors, c("Progenitor_2", "Pdgfra", "#ffb732", 1, 2))
marks_colors <- as.data.frame(marks_colors)
colnames(marks_colors) <- c("group", "gene", "color", "priority", "T_fold")
marks_colors$priority <- as.integer(marks_colors$priority)
marks_colors$T_fold <- as.numeric(marks_colors$T_fold)

mc <- scdb_mc("test_mc_f")
gene_folds <- mc@mc_fp

load("/Users/biagi/PhD/Adipocyte/output/10x/metacell/db/gset.test_markers.Rda")
gset <- object
good_marks <- intersect(names(gset@gene_set), rownames(mc@mc_fp))
mc_ord <- 1:ncol(mc@mc_fp)

mat <- log2(gene_folds[good_marks, mc_ord])
mat <- pmax(pmin(mat, 3), -3)

## Selecting best markers for adipocyte cells
mat_A <- mat[, which(mc@colors == "blue")]
mat_A <- mat_A[rowSums(mat_A) > quantile(rowSums(mat_A), 0.9), ]
rowMeans(mat_A[names(head(sort(rowSums(mat_A), decreasing = TRUE), 5)), ])

## Selecting best markers for endothelial cells
mat_E <- mat[, which(mc@colors == "green")]
mat_E <- mat_E[rowSums(mat_E) > quantile(rowSums(mat_E), 0.9), ]
rowMeans(mat_E[names(head(sort(rowSums(mat_E), decreasing = TRUE), 5)), ])

## Selecting best markers for immune cells
mat_I <- mat[, which(mc@colors %in% c("#ff748c", "#ff8fa3"))]
mat_I <- mat_I[rowSums(mat_I) > quantile(rowSums(mat_I), 0.9), ]
rowMeans(mat_I[names(head(sort(rowSums(mat_I), decreasing = TRUE), 5)), ])

## Selecting best markers for progenitor cells
mat_P <- mat[, which(mc@colors %in% c("#ffa500", "#ffb732"))]
mat_P <- mat_P[rowSums(mat_P) > quantile(rowSums(mat_P), 0.9), ]
rowMeans(mat_P[names(head(sort(rowSums(mat_P), decreasing = TRUE), 5)), ])

## Plotting heatmat to check the found markers
items <- list(Adipocytes = mat_A, Endothelials = mat_E, Immunes = mat_I, Progenitors = mat_P)
plot_list <- list()
for (i in 1:length(items)){
  x <- pheatmap(items[[i]], fontsize = 6, main = names(items)[i], legend = FALSE, treeheight_row = 0, treeheight_col = 0)
  plot_list[[i]] <- x[[4]]
}
gggpubr(plotlist = plot_list, ncol = 2)


## Running 2nd round with unsupervised markers provided by metacell
marks_colors <- NULL
marks_colors <- rbind(marks_colors, c("Adipocyte_1", "Acsl1", "#0000b3", 1, 2.5))
marks_colors <- rbind(marks_colors, c("Adipocyte_2", "Plin4", "#0000cc", 1, 2.5))
marks_colors <- rbind(marks_colors, c("Adipocyte_3", "Mlxipl", "#0000e6", 1, 2.5))
marks_colors <- rbind(marks_colors, c("Adipocyte_4", "Pck1", "#0000ff", 1, 2.5))
marks_colors <- rbind(marks_colors, c("Adipocyte_5", "Adrb3", "#1a1aff", 1, 2.5))
marks_colors <- rbind(marks_colors, c("Endothelial_1", "Btnl9", "#00cd00", 1, 1))
marks_colors <- rbind(marks_colors, c("Endothelial_2", "Flt1", "#00b300", 1, 1))
marks_colors <- rbind(marks_colors, c("Endothelial_3", "Kdr", "#009a00", 1, 1))
marks_colors <- rbind(marks_colors, c("Endothelial_4", "Cdh13", "#008000", 1, 1))
marks_colors <- rbind(marks_colors, c("Endothelial_5", "Cyyr1", "#006700", 1, 1))
marks_colors <- rbind(marks_colors, c("Immune_1", "Zeb2", "#ff7f7f", 1, 0.85))
marks_colors <- rbind(marks_colors, c("Immune_2", "Trps1", "#ff6666", 1, 0.85))
marks_colors <- rbind(marks_colors, c("Immune_3", "Runx1", "#ff4c4c", 1, 0.85))
marks_colors <- rbind(marks_colors, c("Immune_4", "Ptprc", "#ff3232", 1, 0.85))
marks_colors <- rbind(marks_colors, c("Immune_5", "Adap2", "#ff1919", 1, 0.85))
marks_colors <- rbind(marks_colors, c("Progenitor_1", "Dcn", "#ffff4d", 1, 2.5))
marks_colors <- rbind(marks_colors, c("Progenitor_2", "Celf2", "#ffff33", 1, 2.5))
marks_colors <- rbind(marks_colors, c("Progenitor_3", "Meg3", "#ffff1a", 1, 2.5))
marks_colors <- rbind(marks_colors, c("Progenitor_4", "Col1a2", "#ffff00", 1, 2.5))
marks_colors <- rbind(marks_colors, c("Progenitor_5", "Col3a1", "#e6e600", 1, 2.5))
marks_colors <- as.data.frame(marks_colors)
colnames(marks_colors) <- c("group", "gene", "color", "priority", "T_fold")
marks_colors$priority <- as.integer(marks_colors$priority)
marks_colors$T_fold <- as.numeric(marks_colors$T_fold)

mc_colorize("test_mc_f", marker_colors=marks_colors)
mc <- scdb_mc("test_mc_f")

## We can use the colors to produce a labeled heat map, showing selected genes and their distributions over metacells, with the colored annotation shown at the bottom
mcell_mc_plot_marks(mc_id = "test_mc_f", gset_id = "test_markers", mat_id = "test1", plot_cells = FALSE)

lfp <- log2(mc@mc_fp)

## Projecting metacells and cells in 2D
mcell_mc2d_force_knn(mc2d_id = "test_2dproj",mc_id = "test_mc_f", graph_id = "test_graph")
tgconfig::set_param("mcell_mc2d_height", 1000, "metacell")
tgconfig::set_param("mcell_mc2d_width", 1000, "metacell")
mcell_mc2d_plot(mc2d_id="test_2dproj")