# Load required libraries
library(SingleCellExperiment)
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(Seurat)
## Attaching SeuratObject
## Attaching sp
## 
## Attaching package: 'Seurat'
## The following object is masked from 'package:SummarizedExperiment':
## 
##     Assays
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse()   masks IRanges::collapse()
## ✖ dplyr::combine()    masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count()      masks matrixStats::count()
## ✖ dplyr::desc()       masks IRanges::desc()
## ✖ tidyr::expand()     masks S4Vectors::expand()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::first()      masks S4Vectors::first()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()     masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()     masks S4Vectors::rename()
## ✖ dplyr::slice()      masks IRanges::slice()
library(MetaNeighbor)
# Load the Seurat objects
hSeurat <- readRDS("hSeurat_annotated.rds")
vtSeurat <- readRDS("vtSeurat2_annotated.rds")
lSeurat <- readRDS("lSeurat_annotated.rds")

# Collapse the cell types in Han dataset
hmeta_copy <- hSeurat@meta.data %>% 
  select(Annotation)

dim(hSeurat@meta.data)

# read the cell types data frame
h_cell_types <- read.csv("Han Cell Types.csv") %>% 
  rename(Annotation = Annotations)

# use inner_join() on hmeta_copy and h_cell_types
df <- inner_join(hmeta_copy, h_cell_types)

# Check if the order is maintained
summary(df$Annotation == hmeta_copy$Annotation)

# add to meta.data
hSeurat$merged_annotation <- df$Merged

# Collapse the cell types in Han dataset
vtmeta_copy <- vtSeurat@meta.data %>% 
  select(Annotation)

# Do the same thing for the vtSeurat
vt_cell_types <- read.csv("VT Cell Types.csv") %>% 
  rename(Annotation = Annotations)

# use inner_join() on vtmeta_copy and vt_cell_types
df_vt <- inner_join(vtmeta_copy, vt_cell_types)

# Check if the order is maintained
summary(df_vt$Annotation == vtmeta_copy$Annotation)

# add to meta.data
vtSeurat$merged_annotation <- df_vt$Merged

# Collapse the cell types in Han dataset
lmeta_copy <- lSeurat@meta.data %>% 
  select(Annotation)

dim(lSeurat@meta.data)

# read the cell types data frame
l_cell_types <- read.csv("Liu Cell Types.csv") %>% 
  rename(Annotation = Annotations)

# use inner_join() on hmeta_copy and h_cell_types
df_l <- inner_join(lmeta_copy, l_cell_types)

# Check if the order is maintained
summary(df_l$Annotation == lmeta_copy$Annotation)

# add to meta.data
lSeurat$merged_annotation <- df_l$Merged


# Change the Idents of the Seurat Objects
Idents(object = vtSeurat) <- "merged_annotation"
Idents(object = hSeurat) <- "merged_annotation"
Idents(object = lSeurat) <- "merged_annotation"
# Convert to SCE objects
hSCE <- as.SingleCellExperiment(hSeurat)
vtSCE <- as.SingleCellExperiment(vtSeurat)
lSCE <- as.SingleCellExperiment(lSeurat)

# Create my_data
mydata <- list(hSCE = hSCE, 
               vtSCE = vtSCE,
               lSCE = lSCE)

# Check whether the gene information align between the two datasets
lapply(mydata, function(x) head(rownames(x), 3))

# cell-type information is labeled identically in all datasets
lapply(mydata, function(x) colnames(colData(x)))

# add the cell type information onto a new column name called "cell type"
mydata$hSCE$"cell type" <- mydata$hSCE$ident
mydata$vtSCE$"cell type" <- mydata$vtSCE$ident
mydata$lSCE$"cell type" <- mydata$lSCE$ident

# Check that count matrices stored in the assays slot, have identical names
lapply(mydata, function(x) names(assays(x)))

# Create a merged dataset by using mergeSCE, which takes a list of SCE objects
# as an input and outputs a single SCE object
fused_data <- mergeSCE(mydata)
dim(fused_data)
dim(hSCE)
dim(vtSCE)
dim(lSCE)

head(colData(fused_data))

# To avoid having to recreate the merged object, save the R object to a file
# by using R's RDS format.

saveRDS(fused_data, "merged_3_datasets.rds")

We notice that the fused data contains the cells from both datasets, as the # of columns is the sum of the two datasets.

# Load RDS file
three_datasets <- readRDS("merged_3_datasets.rds")

# To obtain a cursory overview of cell-type composition by study, we cross-tabulate cell-type annotations by study IDs

table(three_datasets$"cell type", three_datasets$study_id)
##                           
##                            hSCE lSCE vtSCE
##   Dendritic cell            526    0    11
##   Extravillous trophoblast  584  437  3037
##   Stromal cell             1500  587     1
##   Smooth muscle cell        805    0     0
##   Erythroid cell            507    0     0
##   Hofbauer cell             181    0  1200
##   Proliferating cell         47    0     0
##   Syncytiotrophoblast        19   63  1023
##   Epithelial cell            12    0    37
##   Cytotrophoblast            31  245  8346
##   Macrophage                102    0   829
##   Endothelial cell            8    0   143
##   Neutrophil                 61    0     0
##   Natural killer cell         0    0     9
##   T cells                     0    0     4
##   Fibroblast                  0    0  2118
##   Monocyte                    0    0    16
##   Plasma                      0    0     6
##   Granulocyte                 0    0    37
##   ILC3                        0    0     1
# It is okay to have identical cell type names at this stage.

Since there were some problems in the legends for the heatmap, we use this function.

my_heatmap <- function (aurocs, cex = 1, margins = c(8, 8), ...) 
{
  auroc_cols <- rev((grDevices::colorRampPalette(RColorBrewer::brewer.pal(11, 
                                                                          "RdYlBu")))(100))
  breaks <- seq(0, 1, length = 101)
  ordering <- stats::as.dendrogram(orderCellTypes(aurocs))
  arg_list <- list(x = aurocs, margins = margins, key = TRUE, 
                   keysize = 1, key.xlab = "AUROC", key.title = "", 
                   offsetRow = 0.1, offsetCol = 0.1, trace = "none", density.info = "none", 
                   Rowv = ordering, Colv = ordering, col = auroc_cols, breaks = breaks, 
                   na.color = grDevices::gray(0.95), cexRow = cex, cexCol = cex)
  additional_args <- list(...)
  arg_list[names(additional_args)] <- additional_args
  do.call(gplots::heatmap.2, arg_list)
}

<Procedure 1>: Hierarchical cell-type replicability analysis

# Picking the highly variable genes
global_hvgs <- variableGenes(dat = three_datasets,
                            exp_labels = three_datasets$study_id)
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 1.3 GiB
length(global_hvgs)
## [1] 880
# MetaNeighborUS returns a cell-type-by-cell-type matrix containing cell-type similarities.
# This fast version should always be used on large datasets but can also be run on smaller datasets.

aurocs <- MetaNeighborUS(var_genes = global_hvgs,
  dat = three_datasets,
  study_id = three_datasets$study_id,
  cell_type = three_datasets$"cell type",
  fast_version = TRUE)

# Apparently, we can use this line of code as well.
my_heatmap(aurocs, label_size = 10, margins = c(12,12))
## Warning in plot.window(...): "label_size" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "label_size" is not a graphical parameter
## Warning in title(...): "label_size" is not a graphical parameter

# To identify pairs of replicable cell types, run the following function
top_hits <- topHitsByStudy(aurocs, threshold = 0.9)

# Using the topHitsByStudy function is more desirable.
top_hits_before <- topHits(aurocs, 
                    dat = three_datasets, 
                    study_id = three_datasets$study_id,
        cell_type = three_datasets$"cell type", threshold = 0.9)
## Warning in topHits(aurocs, dat = three_datasets, study_id =
## three_datasets$study_id, : The topHits function only looks for the best overall
## hit for each reference cell type. We strongly recommend looking for best hits in
## each target dataset by using the topHitsByStudy function instead.
top_hits %>% 
  filter(Match_type != "Reciprocal_top_hit")
## # A tibble: 13 × 4
##    `Study_ID|Celltype_1`     `Study_ID|Celltype_2` AUROC Match_type
##    <chr>                     <chr>                 <dbl> <chr>     
##  1 vtSCE|Monocyte            hSCE|Dendritic cell    0.98 Above_0.9 
##  2 vtSCE|Dendritic cell      hSCE|Dendritic cell    0.97 Above_0.9 
##  3 hSCE|Macrophage           vtSCE|Hofbauer cell    0.95 Above_0.9 
##  4 vtSCE|Plasma              hSCE|Dendritic cell    0.95 Above_0.9 
##  5 vtSCE|Stromal cell        lSCE|Stromal cell      0.94 Above_0.9 
##  6 vtSCE|T cells             hSCE|Dendritic cell    0.93 Above_0.9 
##  7 vtSCE|Hofbauer cell       lSCE|Stromal cell      0.93 Above_0.9 
##  8 vtSCE|Natural killer cell hSCE|Dendritic cell    0.93 Above_0.9 
##  9 vtSCE|Stromal cell        hSCE|Dendritic cell    0.91 Above_0.9 
## 10 vtSCE|Monocyte            lSCE|Stromal cell      0.91 Above_0.9 
## 11 hSCE|Neutrophil           vtSCE|Monocyte         0.91 Above_0.9 
## 12 vtSCE|ILC3                hSCE|Dendritic cell    0.9  Above_0.9 
## 13 vtSCE|Granulocyte         hSCE|Dendritic cell    0.9  Above_0.9
top_hits_before %>% 
  filter(Match_type != "Reciprocal_top_hit")
##               Study_ID|Celltype_1           Study_ID|Celltype_2 Mean_AUROC
## 1  vtSCE|Extravillous trophoblast lSCE|Extravillous trophoblast       0.98
## 2             hSCE|Dendritic cell                vtSCE|Monocyte       0.98
## 3             hSCE|Dendritic cell          vtSCE|Dendritic cell       0.97
## 4        lSCE|Syncytiotrophoblast     vtSCE|Syncytiotrophoblast       0.97
## 5             vtSCE|Hofbauer cell               hSCE|Macrophage       0.95
## 6                vtSCE|Macrophage             lSCE|Stromal cell       0.95
## 7             hSCE|Dendritic cell                  vtSCE|Plasma       0.95
## 8               lSCE|Stromal cell            vtSCE|Stromal cell       0.94
## 9             hSCE|Dendritic cell                 vtSCE|T cells       0.93
## 10            hSCE|Dendritic cell     vtSCE|Natural killer cell       0.93
## 11                 vtSCE|Monocyte               hSCE|Neutrophil       0.91
## 12            hSCE|Dendritic cell                    vtSCE|ILC3       0.90
## 13            hSCE|Dendritic cell             vtSCE|Granulocyte       0.90
##    Match_type
## 1   Above_0.9
## 2   Above_0.9
## 3   Above_0.9
## 4   Above_0.9
## 5   Above_0.9
## 6   Above_0.9
## 7   Above_0.9
## 8   Above_0.9
## 9   Above_0.9
## 10  Above_0.9
## 11  Above_0.9
## 12  Above_0.9
## 13  Above_0.9
# When there is a clear structure in the data, we can refine AUROCs by
# splitting the data. 
level1_split <- splitClusters(aurocs, k = 2)
level1_split
## $`1`
##  [1] "hSCE|Cytotrophoblast"           "hSCE|Epithelial cell"          
##  [3] "hSCE|Extravillous trophoblast"  "hSCE|Syncytiotrophoblast"      
##  [5] "vtSCE|Cytotrophoblast"          "vtSCE|Extravillous trophoblast"
##  [7] "vtSCE|Syncytiotrophoblast"      "lSCE|Cytotrophoblast"          
##  [9] "lSCE|Extravillous trophoblast"  "lSCE|Syncytiotrophoblast"      
## 
## $`2`
##  [1] "hSCE|Dendritic cell"       "hSCE|Endothelial cell"    
##  [3] "hSCE|Erythroid cell"       "hSCE|Hofbauer cell"       
##  [5] "hSCE|Macrophage"           "hSCE|Neutrophil"          
##  [7] "hSCE|Proliferating cell"   "hSCE|Smooth muscle cell"  
##  [9] "hSCE|Stromal cell"         "vtSCE|Dendritic cell"     
## [11] "vtSCE|Endothelial cell"    "vtSCE|Epithelial cell"    
## [13] "vtSCE|Fibroblast"          "vtSCE|Granulocyte"        
## [15] "vtSCE|Hofbauer cell"       "vtSCE|ILC3"               
## [17] "vtSCE|Macrophage"          "vtSCE|Monocyte"           
## [19] "vtSCE|Natural killer cell" "vtSCE|Plasma"             
## [21] "vtSCE|Stromal cell"        "vtSCE|T cells"            
## [23] "lSCE|Stromal cell"
first_split <- level1_split[[1]]
first_split
##  [1] "hSCE|Cytotrophoblast"           "hSCE|Epithelial cell"          
##  [3] "hSCE|Extravillous trophoblast"  "hSCE|Syncytiotrophoblast"      
##  [5] "vtSCE|Cytotrophoblast"          "vtSCE|Extravillous trophoblast"
##  [7] "vtSCE|Syncytiotrophoblast"      "lSCE|Cytotrophoblast"          
##  [9] "lSCE|Extravillous trophoblast"  "lSCE|Syncytiotrophoblast"
# Repeat the MetaNeighbor analysis on immune cells only
full_labels <- makeClusterName(three_datasets$study_id,
                               three_datasets$`cell type`)
subdata <- three_datasets[, full_labels %in% first_split]
dim(subdata)
## [1] 16876 13797
# To focus on variability specific to endocrine cells, re-pick highly variable genes
var_genes <- variableGenes(dat = subdata, exp_labels = subdata$study_id)
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 1.3 GiB
# recompute cell-type similarities and visualize AUROCs
aurocs <- MetaNeighborUS(var_genes = var_genes,
                         dat = subdata, fast_version = TRUE,
                         study_id = subdata$study_id,
                         cell_type = subdata$"cell type")
my_heatmap(aurocs, label_size = 10, margins = c(12,12))
## Warning in plot.window(...): "label_size" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "label_size" is not a graphical parameter
## Warning in title(...): "label_size" is not a graphical parameter

# Continue to zoom in as long as there are at least two cell types per dataset.
# Repeat steps 13-16

level2_split <- splitClusters(aurocs, k = 2)
level2_split
## $`1`
## [1] "hSCE|Cytotrophoblast"      "hSCE|Epithelial cell"     
## [3] "hSCE|Syncytiotrophoblast"  "vtSCE|Cytotrophoblast"    
## [5] "vtSCE|Syncytiotrophoblast" "lSCE|Cytotrophoblast"     
## [7] "lSCE|Syncytiotrophoblast" 
## 
## $`2`
## [1] "hSCE|Extravillous trophoblast"  "vtSCE|Extravillous trophoblast"
## [3] "lSCE|Extravillous trophoblast"
my_split <- level2_split[[1]]
my_split
## [1] "hSCE|Cytotrophoblast"      "hSCE|Epithelial cell"     
## [3] "hSCE|Syncytiotrophoblast"  "vtSCE|Cytotrophoblast"    
## [5] "vtSCE|Syncytiotrophoblast" "lSCE|Cytotrophoblast"     
## [7] "lSCE|Syncytiotrophoblast"
subdata <- three_datasets[, full_labels %in% my_split]
var_genes <- variableGenes(dat = subdata, exp_labels = subdata$study_id)
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 1.2 GiB
length(var_genes)
## [1] 582
aurocs <- MetaNeighborUS(var_genes = var_genes, dat = subdata,
                         fast_version = TRUE,
                         study_id = subdata$study_id,
                         cell_type = subdata$"cell type")
my_heatmap(aurocs, label_size = 10, margins = c(12,12))
## Warning in plot.window(...): "label_size" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "label_size" is not a graphical parameter
## Warning in title(...): "label_size" is not a graphical parameter

However, if we use one-vs-best AUROCs, MetaNeighbor can automatically identify the two closest matching cell types in each target dataset.

best_hits <- MetaNeighborUS(var_genes = global_hvgs,
                            dat = three_datasets,
                            study_id = three_datasets$study_id,
                            cell_type = three_datasets$"cell type",
                            fast_version = TRUE,
                            one_vs_best = TRUE, symmetric_output = FALSE)
my_heatmap(best_hits, label_size = 10, margins = c(12,12)) 
## Warning in plot.window(...): "label_size" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "label_size" is not a graphical parameter
## Warning in title(...): "label_size" is not a graphical parameter

# Extracting cell types as meta-clusters
mclusters <- extractMetaClusters(best_hits, threshold = 0.7)
mcsummary <- scoreMetaClusters(mclusters, best_hits)

# scoreMetaClusters() function provides a good summary of meta-clusters

# Visualize the number of robust cell types, the replicability structure
# can be summarized as an Upset plot with plotUpset()

plotUpset(mclusters)

# Save the output as pdf.
pdf("meta_clusters.pdf")
plotMetaClusters(mclusters, best_hits)
while (!is.null(dev.list()))  dev.off()
cluster_graph <- makeClusterGraph(best_hits, low_threshold = 0.3)
plotClusterGraph(cluster_graph, three_datasets$study_id, 
                 three_datasets$"cell type", size_factor = 3)

# We can focus on a cluster of interest (coi).
# Take a closer look at 'hSCE|Dendritic cell', query its closest neighbors in the graph with extendClusterSet() and then zoom in on its subgraph with subsetClusterGraph()

coi <- "hSCE|Extravillous trophoblast"
coi <- extendClusterSet(cluster_graph, initial_set = coi,
                        max_neighbor_distance = 2)
subgraph <- subsetClusterGraph(cluster_graph, coi)
plotClusterGraph(subgraph, three_datasets$study_id,
                 three_datasets$"cell type", size_factor = 5)