---
title: "Meta-analysis Testing"
output: html_document
---
```{r}
# Load required libraries
library(SingleCellExperiment)
library(Seurat)
library(tidyverse)
library(MetaNeighbor)
```


```{r eval = FALSE}
# Load the Seurat objects
hSeurat <- readRDS("hSeurat_annotated.rds")
vtSeurat <- readRDS("vtSeurat2_annotated.rds")
lSeurat <- readRDS("lSeurat_annotated.rds")

# Collapse the cell types in Han dataset
hmeta_copy <- hSeurat@meta.data %>% 
  select(Annotation)

dim(hSeurat@meta.data)

# read the cell types data frame
h_cell_types <- read.csv("Han Cell Types.csv") %>% 
  rename(Annotation = Annotations)

# use inner_join() on hmeta_copy and h_cell_types
df <- inner_join(hmeta_copy, h_cell_types)

# Check if the order is maintained
summary(df$Annotation == hmeta_copy$Annotation)

# add to meta.data
hSeurat$merged_annotation <- df$Merged

# Collapse the cell types in Han dataset
vtmeta_copy <- vtSeurat@meta.data %>% 
  select(Annotation)

# Do the same thing for the vtSeurat
vt_cell_types <- read.csv("VT Cell Types.csv") %>% 
  rename(Annotation = Annotations)

# use inner_join() on vtmeta_copy and vt_cell_types
df_vt <- inner_join(vtmeta_copy, vt_cell_types)

# Check if the order is maintained
summary(df_vt$Annotation == vtmeta_copy$Annotation)

# add to meta.data
vtSeurat$merged_annotation <- df_vt$Merged

# Collapse the cell types in Han dataset
lmeta_copy <- lSeurat@meta.data %>% 
  select(Annotation)

dim(lSeurat@meta.data)

# read the cell types data frame
l_cell_types <- read.csv("Liu Cell Types.csv") %>% 
  rename(Annotation = Annotations)

# use inner_join() on hmeta_copy and h_cell_types
df_l <- inner_join(lmeta_copy, l_cell_types)

# Check if the order is maintained
summary(df_l$Annotation == lmeta_copy$Annotation)

# add to meta.data
lSeurat$merged_annotation <- df_l$Merged


# Change the Idents of the Seurat Objects
Idents(object = vtSeurat) <- "merged_annotation"
Idents(object = hSeurat) <- "merged_annotation"
Idents(object = lSeurat) <- "merged_annotation"
```


```{r eval = FALSE}
# Convert to SCE objects
hSCE <- as.SingleCellExperiment(hSeurat)
vtSCE <- as.SingleCellExperiment(vtSeurat)
lSCE <- as.SingleCellExperiment(lSeurat)

# Create my_data
mydata <- list(hSCE = hSCE, 
               vtSCE = vtSCE,
               lSCE = lSCE)

# Check whether the gene information align between the two datasets
lapply(mydata, function(x) head(rownames(x), 3))

# cell-type information is labeled identically in all datasets
lapply(mydata, function(x) colnames(colData(x)))

# add the cell type information onto a new column name called "cell type"
mydata$hSCE$"cell type" <- mydata$hSCE$ident
mydata$vtSCE$"cell type" <- mydata$vtSCE$ident
mydata$lSCE$"cell type" <- mydata$lSCE$ident

# Check that count matrices stored in the assays slot, have identical names
lapply(mydata, function(x) names(assays(x)))

# Create a merged dataset by using mergeSCE, which takes a list of SCE objects
# as an input and outputs a single SCE object
fused_data <- mergeSCE(mydata)
dim(fused_data)
dim(hSCE)
dim(vtSCE)
dim(lSCE)

head(colData(fused_data))

# To avoid having to recreate the merged object, save the R object to a file
# by using R's RDS format.

saveRDS(fused_data, "merged_3_datasets.rds")
```

We notice that the fused data contains the cells from both datasets, as the # of columns is the sum of the two datasets.

```{r}
# Load RDS file
three_datasets <- readRDS("merged_3_datasets.rds")

# To obtain a cursory overview of cell-type composition by study, we cross-tabulate cell-type annotations by study IDs

table(three_datasets$"cell type", three_datasets$study_id)
# It is okay to have identical cell type names at this stage.
```

Since there were some problems in the legends for the heatmap, we use this function.
```{r}
my_heatmap <- function (aurocs, cex = 1, margins = c(8, 8), ...) 
{
  auroc_cols <- rev((grDevices::colorRampPalette(RColorBrewer::brewer.pal(11, 
                                                                          "RdYlBu")))(100))
  breaks <- seq(0, 1, length = 101)
  ordering <- stats::as.dendrogram(orderCellTypes(aurocs))
  arg_list <- list(x = aurocs, margins = margins, key = TRUE, 
                   keysize = 1, key.xlab = "AUROC", key.title = "", 
                   offsetRow = 0.1, offsetCol = 0.1, trace = "none", density.info = "none", 
                   Rowv = ordering, Colv = ordering, col = auroc_cols, breaks = breaks, 
                   na.color = grDevices::gray(0.95), cexRow = cex, cexCol = cex)
  additional_args <- list(...)
  arg_list[names(additional_args)] <- additional_args
  do.call(gplots::heatmap.2, arg_list)
}
```


<Procedure 1>: Hierarchical cell-type replicability analysis
```{r, fig.height = 8, fig.width = 10}
# Picking the highly variable genes
global_hvgs <- variableGenes(dat = three_datasets,
                            exp_labels = three_datasets$study_id)

length(global_hvgs)

# MetaNeighborUS returns a cell-type-by-cell-type matrix containing cell-type similarities.
# This fast version should always be used on large datasets but can also be run on smaller datasets.

aurocs <- MetaNeighborUS(var_genes = global_hvgs,
  dat = three_datasets,
  study_id = three_datasets$study_id,
  cell_type = three_datasets$"cell type",
  fast_version = TRUE)

# Apparently, we can use this line of code as well.
my_heatmap(aurocs, label_size = 10, margins = c(12,12))

# To identify pairs of replicable cell types, run the following function
top_hits <- topHitsByStudy(aurocs, threshold = 0.9)

# Using the topHitsByStudy function is more desirable.
top_hits_before <- topHits(aurocs, 
                    dat = three_datasets, 
                    study_id = three_datasets$study_id,
        cell_type = three_datasets$"cell type", threshold = 0.9)

top_hits %>% 
  filter(Match_type != "Reciprocal_top_hit")
top_hits_before %>% 
  filter(Match_type != "Reciprocal_top_hit")
```

```{r fig.height = 8, fig.width = 10}
# When there is a clear structure in the data, we can refine AUROCs by
# splitting the data. 
level1_split <- splitClusters(aurocs, k = 2)
level1_split

first_split <- level1_split[[1]]
first_split

# Repeat the MetaNeighbor analysis on immune cells only
full_labels <- makeClusterName(three_datasets$study_id,
                               three_datasets$`cell type`)
subdata <- three_datasets[, full_labels %in% first_split]
dim(subdata)

# To focus on variability specific to endocrine cells, re-pick highly variable genes
var_genes <- variableGenes(dat = subdata, exp_labels = subdata$study_id)

# recompute cell-type similarities and visualize AUROCs
aurocs <- MetaNeighborUS(var_genes = var_genes,
                         dat = subdata, fast_version = TRUE,
                         study_id = subdata$study_id,
                         cell_type = subdata$"cell type")
my_heatmap(aurocs, label_size = 10, margins = c(12,12))

# Continue to zoom in as long as there are at least two cell types per dataset.
# Repeat steps 13-16

level2_split <- splitClusters(aurocs, k = 2)
level2_split

my_split <- level2_split[[1]]
my_split

subdata <- three_datasets[, full_labels %in% my_split]
var_genes <- variableGenes(dat = subdata, exp_labels = subdata$study_id)

length(var_genes)
aurocs <- MetaNeighborUS(var_genes = var_genes, dat = subdata,
                         fast_version = TRUE,
                         study_id = subdata$study_id,
                         cell_type = subdata$"cell type")
my_heatmap(aurocs, label_size = 10, margins = c(12,12))
```

However, if we use one-vs-best AUROCs, MetaNeighbor can automatically identify the two closest matching cell types in each target dataset.

```{r fig.height = 10, fig.width = 10}
best_hits <- MetaNeighborUS(var_genes = global_hvgs,
                            dat = three_datasets,
                            study_id = three_datasets$study_id,
                            cell_type = three_datasets$"cell type",
                            fast_version = TRUE,
                            one_vs_best = TRUE, symmetric_output = FALSE)
my_heatmap(best_hits, label_size = 10, margins = c(12,12)) 

```


```{r}
# Extracting cell types as meta-clusters
mclusters <- extractMetaClusters(best_hits, threshold = 0.7)
mcsummary <- scoreMetaClusters(mclusters, best_hits)

# scoreMetaClusters() function provides a good summary of meta-clusters

# Visualize the number of robust cell types, the replicability structure
# can be summarized as an Upset plot with plotUpset()

plotUpset(mclusters)

# Save the output as pdf.
pdf("meta_clusters.pdf")
plotMetaClusters(mclusters, best_hits)
while (!is.null(dev.list()))  dev.off()
```

```{r fig.height = 6, fig.width = 8}
cluster_graph <- makeClusterGraph(best_hits, low_threshold = 0.3)
plotClusterGraph(cluster_graph, three_datasets$study_id, 
                 three_datasets$"cell type", size_factor = 3)

# We can focus on a cluster of interest (coi).
# Take a closer look at 'hSCE|Dendritic cell', query its closest neighbors in the graph with extendClusterSet() and then zoom in on its subgraph with subsetClusterGraph()

coi <- "hSCE|Extravillous trophoblast"
coi <- extendClusterSet(cluster_graph, initial_set = coi,
                        max_neighbor_distance = 2)
subgraph <- subsetClusterGraph(cluster_graph, coi)
plotClusterGraph(subgraph, three_datasets$study_id,
                 three_datasets$"cell type", size_factor = 5)
```