# Load required libraries
library(SingleCellExperiment)
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(Seurat)
## Attaching SeuratObject
## Attaching sp
##
## Attaching package: 'Seurat'
## The following object is masked from 'package:SummarizedExperiment':
##
## Assays
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse() masks IRanges::collapse()
## ✖ dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count() masks matrixStats::count()
## ✖ dplyr::desc() masks IRanges::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks S4Vectors::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename() masks S4Vectors::rename()
## ✖ dplyr::slice() masks IRanges::slice()
library(MetaNeighbor)
# Load the Seurat objects
hSeurat <- readRDS("hSeurat_annotated.rds")
vtSeurat <- readRDS("vtSeurat2_annotated.rds")
lSeurat <- readRDS("lSeurat_annotated.rds")
# Collapse the cell types in Han dataset
hmeta_copy <- hSeurat@meta.data %>%
select(Annotation)
dim(hSeurat@meta.data)
# read the cell types data frame
h_cell_types <- read.csv("Han Cell Types.csv") %>%
rename(Annotation = Annotations)
# use inner_join() on hmeta_copy and h_cell_types
df <- inner_join(hmeta_copy, h_cell_types)
# Check if the order is maintained
summary(df$Annotation == hmeta_copy$Annotation)
# add to meta.data
hSeurat$merged_annotation <- df$Merged
# Collapse the cell types in Han dataset
vtmeta_copy <- vtSeurat@meta.data %>%
select(Annotation)
# Do the same thing for the vtSeurat
vt_cell_types <- read.csv("VT Cell Types.csv") %>%
rename(Annotation = Annotations)
# use inner_join() on vtmeta_copy and vt_cell_types
df_vt <- inner_join(vtmeta_copy, vt_cell_types)
# Check if the order is maintained
summary(df_vt$Annotation == vtmeta_copy$Annotation)
# add to meta.data
vtSeurat$merged_annotation <- df_vt$Merged
# Collapse the cell types in Han dataset
lmeta_copy <- lSeurat@meta.data %>%
select(Annotation)
dim(lSeurat@meta.data)
# read the cell types data frame
l_cell_types <- read.csv("Liu Cell Types.csv") %>%
rename(Annotation = Annotations)
# use inner_join() on hmeta_copy and h_cell_types
df_l <- inner_join(lmeta_copy, l_cell_types)
# Check if the order is maintained
summary(df_l$Annotation == lmeta_copy$Annotation)
# add to meta.data
lSeurat$merged_annotation <- df_l$Merged
# Change the Idents of the Seurat Objects
Idents(object = vtSeurat) <- "merged_annotation"
Idents(object = hSeurat) <- "merged_annotation"
Idents(object = lSeurat) <- "merged_annotation"
# Convert to SCE objects
hSCE <- as.SingleCellExperiment(hSeurat)
vtSCE <- as.SingleCellExperiment(vtSeurat)
lSCE <- as.SingleCellExperiment(lSeurat)
# Create my_data
mydata <- list(hSCE = hSCE,
vtSCE = vtSCE,
lSCE = lSCE)
# Check whether the gene information align between the two datasets
lapply(mydata, function(x) head(rownames(x), 3))
# cell-type information is labeled identically in all datasets
lapply(mydata, function(x) colnames(colData(x)))
# add the cell type information onto a new column name called "cell type"
mydata$hSCE$"cell type" <- mydata$hSCE$ident
mydata$vtSCE$"cell type" <- mydata$vtSCE$ident
mydata$lSCE$"cell type" <- mydata$lSCE$ident
# Check that count matrices stored in the assays slot, have identical names
lapply(mydata, function(x) names(assays(x)))
# Create a merged dataset by using mergeSCE, which takes a list of SCE objects
# as an input and outputs a single SCE object
fused_data <- mergeSCE(mydata)
dim(fused_data)
dim(hSCE)
dim(vtSCE)
dim(lSCE)
head(colData(fused_data))
# To avoid having to recreate the merged object, save the R object to a file
# by using R's RDS format.
saveRDS(fused_data, "merged_3_datasets.rds")
We notice that the fused data contains the cells from both datasets, as the # of columns is the sum of the two datasets.
# Load RDS file
three_datasets <- readRDS("merged_3_datasets.rds")
# To obtain a cursory overview of cell-type composition by study, we cross-tabulate cell-type annotations by study IDs
table(three_datasets$"cell type", three_datasets$study_id)
##
## hSCE lSCE vtSCE
## Dendritic cell 526 0 11
## Extravillous trophoblast 584 437 3037
## Stromal cell 1500 587 1
## Smooth muscle cell 805 0 0
## Erythroid cell 507 0 0
## Hofbauer cell 181 0 1200
## Proliferating cell 47 0 0
## Syncytiotrophoblast 19 63 1023
## Epithelial cell 12 0 37
## Cytotrophoblast 31 245 8346
## Macrophage 102 0 829
## Endothelial cell 8 0 143
## Neutrophil 61 0 0
## Natural killer cell 0 0 9
## T cells 0 0 4
## Fibroblast 0 0 2118
## Monocyte 0 0 16
## Plasma 0 0 6
## Granulocyte 0 0 37
## ILC3 0 0 1
# It is okay to have identical cell type names at this stage.
Since there were some problems in the legends for the heatmap, we use this function.
my_heatmap <- function (aurocs, cex = 1, margins = c(8, 8), ...)
{
auroc_cols <- rev((grDevices::colorRampPalette(RColorBrewer::brewer.pal(11,
"RdYlBu")))(100))
breaks <- seq(0, 1, length = 101)
ordering <- stats::as.dendrogram(orderCellTypes(aurocs))
arg_list <- list(x = aurocs, margins = margins, key = TRUE,
keysize = 1, key.xlab = "AUROC", key.title = "",
offsetRow = 0.1, offsetCol = 0.1, trace = "none", density.info = "none",
Rowv = ordering, Colv = ordering, col = auroc_cols, breaks = breaks,
na.color = grDevices::gray(0.95), cexRow = cex, cexCol = cex)
additional_args <- list(...)
arg_list[names(additional_args)] <- additional_args
do.call(gplots::heatmap.2, arg_list)
}
<Procedure 1>: Hierarchical cell-type replicability analysis
# Picking the highly variable genes
global_hvgs <- variableGenes(dat = three_datasets,
exp_labels = three_datasets$study_id)
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 1.3 GiB
length(global_hvgs)
## [1] 880
# MetaNeighborUS returns a cell-type-by-cell-type matrix containing cell-type similarities.
# This fast version should always be used on large datasets but can also be run on smaller datasets.
aurocs <- MetaNeighborUS(var_genes = global_hvgs,
dat = three_datasets,
study_id = three_datasets$study_id,
cell_type = three_datasets$"cell type",
fast_version = TRUE)
# Apparently, we can use this line of code as well.
my_heatmap(aurocs, label_size = 10, margins = c(12,12))
## Warning in plot.window(...): "label_size" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "label_size" is not a graphical parameter
## Warning in title(...): "label_size" is not a graphical parameter
# To identify pairs of replicable cell types, run the following function
top_hits <- topHitsByStudy(aurocs, threshold = 0.9)
# Using the topHitsByStudy function is more desirable.
top_hits_before <- topHits(aurocs,
dat = three_datasets,
study_id = three_datasets$study_id,
cell_type = three_datasets$"cell type", threshold = 0.9)
## Warning in topHits(aurocs, dat = three_datasets, study_id =
## three_datasets$study_id, : The topHits function only looks for the best overall
## hit for each reference cell type. We strongly recommend looking for best hits in
## each target dataset by using the topHitsByStudy function instead.
top_hits %>%
filter(Match_type != "Reciprocal_top_hit")
## # A tibble: 13 × 4
## `Study_ID|Celltype_1` `Study_ID|Celltype_2` AUROC Match_type
## <chr> <chr> <dbl> <chr>
## 1 vtSCE|Monocyte hSCE|Dendritic cell 0.98 Above_0.9
## 2 vtSCE|Dendritic cell hSCE|Dendritic cell 0.97 Above_0.9
## 3 hSCE|Macrophage vtSCE|Hofbauer cell 0.95 Above_0.9
## 4 vtSCE|Plasma hSCE|Dendritic cell 0.95 Above_0.9
## 5 vtSCE|Stromal cell lSCE|Stromal cell 0.94 Above_0.9
## 6 vtSCE|T cells hSCE|Dendritic cell 0.93 Above_0.9
## 7 vtSCE|Hofbauer cell lSCE|Stromal cell 0.93 Above_0.9
## 8 vtSCE|Natural killer cell hSCE|Dendritic cell 0.93 Above_0.9
## 9 vtSCE|Stromal cell hSCE|Dendritic cell 0.91 Above_0.9
## 10 vtSCE|Monocyte lSCE|Stromal cell 0.91 Above_0.9
## 11 hSCE|Neutrophil vtSCE|Monocyte 0.91 Above_0.9
## 12 vtSCE|ILC3 hSCE|Dendritic cell 0.9 Above_0.9
## 13 vtSCE|Granulocyte hSCE|Dendritic cell 0.9 Above_0.9
top_hits_before %>%
filter(Match_type != "Reciprocal_top_hit")
## Study_ID|Celltype_1 Study_ID|Celltype_2 Mean_AUROC
## 1 vtSCE|Extravillous trophoblast lSCE|Extravillous trophoblast 0.98
## 2 hSCE|Dendritic cell vtSCE|Monocyte 0.98
## 3 hSCE|Dendritic cell vtSCE|Dendritic cell 0.97
## 4 lSCE|Syncytiotrophoblast vtSCE|Syncytiotrophoblast 0.97
## 5 vtSCE|Hofbauer cell hSCE|Macrophage 0.95
## 6 vtSCE|Macrophage lSCE|Stromal cell 0.95
## 7 hSCE|Dendritic cell vtSCE|Plasma 0.95
## 8 lSCE|Stromal cell vtSCE|Stromal cell 0.94
## 9 hSCE|Dendritic cell vtSCE|T cells 0.93
## 10 hSCE|Dendritic cell vtSCE|Natural killer cell 0.93
## 11 vtSCE|Monocyte hSCE|Neutrophil 0.91
## 12 hSCE|Dendritic cell vtSCE|ILC3 0.90
## 13 hSCE|Dendritic cell vtSCE|Granulocyte 0.90
## Match_type
## 1 Above_0.9
## 2 Above_0.9
## 3 Above_0.9
## 4 Above_0.9
## 5 Above_0.9
## 6 Above_0.9
## 7 Above_0.9
## 8 Above_0.9
## 9 Above_0.9
## 10 Above_0.9
## 11 Above_0.9
## 12 Above_0.9
## 13 Above_0.9
# When there is a clear structure in the data, we can refine AUROCs by
# splitting the data.
level1_split <- splitClusters(aurocs, k = 2)
level1_split
## $`1`
## [1] "hSCE|Cytotrophoblast" "hSCE|Epithelial cell"
## [3] "hSCE|Extravillous trophoblast" "hSCE|Syncytiotrophoblast"
## [5] "vtSCE|Cytotrophoblast" "vtSCE|Extravillous trophoblast"
## [7] "vtSCE|Syncytiotrophoblast" "lSCE|Cytotrophoblast"
## [9] "lSCE|Extravillous trophoblast" "lSCE|Syncytiotrophoblast"
##
## $`2`
## [1] "hSCE|Dendritic cell" "hSCE|Endothelial cell"
## [3] "hSCE|Erythroid cell" "hSCE|Hofbauer cell"
## [5] "hSCE|Macrophage" "hSCE|Neutrophil"
## [7] "hSCE|Proliferating cell" "hSCE|Smooth muscle cell"
## [9] "hSCE|Stromal cell" "vtSCE|Dendritic cell"
## [11] "vtSCE|Endothelial cell" "vtSCE|Epithelial cell"
## [13] "vtSCE|Fibroblast" "vtSCE|Granulocyte"
## [15] "vtSCE|Hofbauer cell" "vtSCE|ILC3"
## [17] "vtSCE|Macrophage" "vtSCE|Monocyte"
## [19] "vtSCE|Natural killer cell" "vtSCE|Plasma"
## [21] "vtSCE|Stromal cell" "vtSCE|T cells"
## [23] "lSCE|Stromal cell"
first_split <- level1_split[[1]]
first_split
## [1] "hSCE|Cytotrophoblast" "hSCE|Epithelial cell"
## [3] "hSCE|Extravillous trophoblast" "hSCE|Syncytiotrophoblast"
## [5] "vtSCE|Cytotrophoblast" "vtSCE|Extravillous trophoblast"
## [7] "vtSCE|Syncytiotrophoblast" "lSCE|Cytotrophoblast"
## [9] "lSCE|Extravillous trophoblast" "lSCE|Syncytiotrophoblast"
# Repeat the MetaNeighbor analysis on immune cells only
full_labels <- makeClusterName(three_datasets$study_id,
three_datasets$`cell type`)
subdata <- three_datasets[, full_labels %in% first_split]
dim(subdata)
## [1] 16876 13797
# To focus on variability specific to endocrine cells, re-pick highly variable genes
var_genes <- variableGenes(dat = subdata, exp_labels = subdata$study_id)
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 1.3 GiB
# recompute cell-type similarities and visualize AUROCs
aurocs <- MetaNeighborUS(var_genes = var_genes,
dat = subdata, fast_version = TRUE,
study_id = subdata$study_id,
cell_type = subdata$"cell type")
my_heatmap(aurocs, label_size = 10, margins = c(12,12))
## Warning in plot.window(...): "label_size" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "label_size" is not a graphical parameter
## Warning in title(...): "label_size" is not a graphical parameter
# Continue to zoom in as long as there are at least two cell types per dataset.
# Repeat steps 13-16
level2_split <- splitClusters(aurocs, k = 2)
level2_split
## $`1`
## [1] "hSCE|Cytotrophoblast" "hSCE|Epithelial cell"
## [3] "hSCE|Syncytiotrophoblast" "vtSCE|Cytotrophoblast"
## [5] "vtSCE|Syncytiotrophoblast" "lSCE|Cytotrophoblast"
## [7] "lSCE|Syncytiotrophoblast"
##
## $`2`
## [1] "hSCE|Extravillous trophoblast" "vtSCE|Extravillous trophoblast"
## [3] "lSCE|Extravillous trophoblast"
my_split <- level2_split[[1]]
my_split
## [1] "hSCE|Cytotrophoblast" "hSCE|Epithelial cell"
## [3] "hSCE|Syncytiotrophoblast" "vtSCE|Cytotrophoblast"
## [5] "vtSCE|Syncytiotrophoblast" "lSCE|Cytotrophoblast"
## [7] "lSCE|Syncytiotrophoblast"
subdata <- three_datasets[, full_labels %in% my_split]
var_genes <- variableGenes(dat = subdata, exp_labels = subdata$study_id)
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 1.2 GiB
length(var_genes)
## [1] 582
aurocs <- MetaNeighborUS(var_genes = var_genes, dat = subdata,
fast_version = TRUE,
study_id = subdata$study_id,
cell_type = subdata$"cell type")
my_heatmap(aurocs, label_size = 10, margins = c(12,12))
## Warning in plot.window(...): "label_size" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "label_size" is not a graphical parameter
## Warning in title(...): "label_size" is not a graphical parameter
However, if we use one-vs-best AUROCs, MetaNeighbor can automatically identify the two closest matching cell types in each target dataset.
best_hits <- MetaNeighborUS(var_genes = global_hvgs,
dat = three_datasets,
study_id = three_datasets$study_id,
cell_type = three_datasets$"cell type",
fast_version = TRUE,
one_vs_best = TRUE, symmetric_output = FALSE)
my_heatmap(best_hits, label_size = 10, margins = c(12,12))
## Warning in plot.window(...): "label_size" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "label_size" is not a graphical parameter
## Warning in title(...): "label_size" is not a graphical parameter
# Extracting cell types as meta-clusters
mclusters <- extractMetaClusters(best_hits, threshold = 0.7)
mcsummary <- scoreMetaClusters(mclusters, best_hits)
# scoreMetaClusters() function provides a good summary of meta-clusters
# Visualize the number of robust cell types, the replicability structure
# can be summarized as an Upset plot with plotUpset()
plotUpset(mclusters)
# Save the output as pdf.
pdf("meta_clusters.pdf")
plotMetaClusters(mclusters, best_hits)
while (!is.null(dev.list())) dev.off()
cluster_graph <- makeClusterGraph(best_hits, low_threshold = 0.3)
plotClusterGraph(cluster_graph, three_datasets$study_id,
three_datasets$"cell type", size_factor = 3)
# We can focus on a cluster of interest (coi).
# Take a closer look at 'hSCE|Dendritic cell', query its closest neighbors in the graph with extendClusterSet() and then zoom in on its subgraph with subsetClusterGraph()
coi <- "hSCE|Extravillous trophoblast"
coi <- extendClusterSet(cluster_graph, initial_set = coi,
max_neighbor_distance = 2)
subgraph <- subsetClusterGraph(cluster_graph, coi)
plotClusterGraph(subgraph, three_datasets$study_id,
three_datasets$"cell type", size_factor = 5)