---
title: "Cao et al. (2022) Analysis"
author: "Young-June Kim"
date: "2023-02-07"
output: html_document
---
```{r}
# Load libraries
library(SingleCellExperiment)
library(tidyverse)
library(Seurat)
library(RColorBrewer)
library(scDblFinder)
library(magrittr)
library(pheatmap)
library(GEOquery)
library(SeuratDisk)
library(loomR)

# Define color palettes
WtOrRd_pal <- colorRampPalette(c("#FFFFFF", brewer.pal(9, "OrRd")))(100)
```

1. Creating the Seurat Object
```{r eval = FALSE}
# Data Wrangling Processes
library(R.utils)
metadata.genes <- read.csv("GSE156793_S2_Metadata_genes.txt")
metadata.placentacells <- readRDS("Placenta cells.rds")
tissue <- read.csv("GSE156793_S4_gene_expression_tissue.txt")
fraction_tissue <- read.csv("GSE156793_S5_gene_fraction_tissue.txt")
expression_celltype <- read.csv("GSE156793_S6_gene_expression_celltype.txt")
fraction_celltype <- read.csv("GSE156793_S7_gene_fraction_celltype.txt")

# Let's call for the loom file
# gunzip("GSE156793_S3_gene_count.loom.gz", remove=FALSE)

loom_obj <- connect(filename = "GSE156793_S3_gene_count.loom", mode = 'r+')

# loom_obj$add.row.attribute(list(Gene = loom_obj$row.attrs$gene_id[]))
# loom_obj$add.col.attribute(list(CellID = loom_obj$col.attrs$sample[]))
# loom_obj$add.col.attribute(list(tokeep = loom_obj$col.attrs$sample[] %in% metadata.placentacells$sample))

filtered <- loom_obj[["matrix"]][which(loom_obj$col.attrs$tokeep[]),]
counts <- t(filtered)

loom_obj[["col_attrs"]][,]

coldf <- loom_obj$get.attribute.df(MARGIN = 2)
coldf <- coldf %>% 
  filter(tokeep)
rowdf <- loom_obj$get.attribute.df(MARGIN = 1)

dim(loom_obj$col.attrs)

# loom_obj$row.attrs$Gene = loom_obj$row.attrs$gene_id
# loom_obj[["row_attrs"]]$Gene = loom_obj[["row_attrs"]]$gene_id
# 
# loom_obj$col.attrs$sample[] %>%
#   unique() %>% 
#   length()

# loom_obj$close_all()

colnames(counts) <- coldf$CellID
rownames(counts) <- rowdf$gene_short_name

# the whole thing is too big to convert to Seurat - filter the loom first!
# seurat_loom <- as.Seurat(loom_obj)

cSeurat <- CreateSeuratObject(
  counts,
  project = "Placenta_Seurat",
  assay = "RNA",
  names.field = 1,
  names.delim = "_",
  meta.data = coldf,
)

# Check if the metadata looks okay
cSeurat@meta.data

# loom_obj$version()
# Even running the following lines of code yields the same type of error.
# loom_obj$version <- function(){
#   return("0.2.1.9000")
# }
# detach("package:SeuratDisk", unload = TRUE)
# detach("package:Seurat", unload = TRUE)
# 
# library(Seurat)

saveRDS(cSeurat, "cSeurat.rds")
```

For convenience, we can start running from this chunk of code, since we have already saved the Seurat object as a file in our directory.

```{r}
# Start running from this line of code
cSeurat <- readRDS("cSeurat.rds")

# Add in the Mitochondrial PCT% information
cSeurat$percent.mt <- PercentageFeatureSet(cSeurat, pattern = "^MT-")

# nCount_RNA is the number of UMI counts in a cell
hist(cSeurat$nCount_RNA)
summary(cSeurat@meta.data$nCount_RNA)

# nFeature_RNA is the number of different genes that had any reads
hist(cSeurat$nFeature_RNA)
summary(cSeurat@meta.data$nFeature_RNA)

# percent.mt is the percent mitochondrial reads
summary(cSeurat$percent.mt)

# Later, we will subset the Seurat object based on a percent.mt cutoff.

# Check for the percentage of ribosomal genes
cSeurat[["percent.rb"]] <- PercentageFeatureSet(cSeurat, pattern = "^RP[SL]")
summary(cSeurat$percent.rb)
```

Another important part for QC is removing doublets.
```{r removing_doublets}
# Find for doublets
cSeurat <- as.SingleCellExperiment(cSeurat)
cSeurat <- scDblFinder(cSeurat)
# Convert back to Seurat object.
cSeurat <- as.Seurat(cSeurat)

# Notice that our identifiers have changed.
levels(Idents(object = cSeurat))

# We want to specify the identifier whether the cell is a singlet or doublet.
table(cSeurat@meta.data$scDblFinder.class)

Idents(object = cSeurat) <- "scDblFinder.class"
cSeurat@meta.data$scDblFinder.class <- as.character(cSeurat@meta.data$scDblFinder.class)

# Subset the Seurat object to only include singlets.
cSeurat <- subset(cSeurat, idents = "singlet")
levels(as.factor(cSeurat@meta.data$scDblFinder.class))
```

```{r}
# convert column names to make the protocol coherent
colnames(cSeurat@meta.data)[2] <- "nCount_RNA"
colnames(cSeurat@meta.data)[3] <- "nFeature_RNA"

# We draw a violin plot to explore the values in meta data
VlnPlot(cSeurat, 
        features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rb"),
        ncol = 4,
        pt.size = 0.1) & 
  theme(plot.title = element_text(size=10))
```

```{r}
# The following four lines of code are for scatterplots addressing the correlations of every 2 pairs
FeatureScatter(cSeurat, feature1 = "nCount_RNA", feature2 = "percent.mt")

FeatureScatter(cSeurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

FeatureScatter(cSeurat, feature1 = "nCount_RNA", feature2 = "percent.rb")

FeatureScatter(cSeurat, feature1 = "percent.rb", feature2 = "percent.mt")

# We also check if there is correlation between number of features and the doublet score
FeatureScatter(cSeurat, feature1 = "nFeature_RNA", feature2 = "scDblFinder.score")

# Since the typical cutoff for percent.mt is 5%, we will try out < 5. 
# Last week we used 10%, and couldn't see much difference.
# Also, let't filter out cells that have less than 5% ribosomal content.
# (11/29 Update) Removed the ribosomal filter due to insufficient evidence
# cSeurat <- subset(cSeurat, subset = percent.mt < 5)

dim(cSeurat)
```

Now, we have a reduced version of the Seurat object, where the cells contain no genes.

```{r}
# Next, we normalize the data
cSeurat <- NormalizeData(cSeurat)

# Explore the most variable features, let's use the default version first.
cSeurat <- FindVariableFeatures(cSeurat)
# cSeurat <- FindVariableFeatures(cSeurat, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(cSeurat), 10) 
top10

# plot variable features with and without labels
plot2 <- VariableFeaturePlot(cSeurat)
plot2 <- LabelPoints(plot = plot2, points = top10, repel = TRUE)
plot2

# Convert normalized gene expression to Z-score
all.genes <- rownames(cSeurat)
cSeurat <- ScaleData(cSeurat, features = all.genes)
```


```{r}
# Run PCA analysis
cSeurat <- RunPCA(cSeurat, features = VariableFeatures(object = cSeurat))

# Plot the PCA
DimPlot(cSeurat, reduction = "pca")

# Plot the loadings
VizDimLoadings(cSeurat, dims = 1:2, reduction = "pca") & 
  theme(axis.text=element_text(size=8), axis.title=element_text(size = 8,face="bold"))

# Check the elbow plot to see how much variation is covered for each PC
ElbowPlot(cSeurat)

# FeaturePlot of PC1 top features
FeaturePlot(cSeurat, features = c("EBI3", 
                                   "KRT7"))

# FeaturePlot of PC2 top features
FeaturePlot(cSeurat, features = c("B2M", 
                                   "TYROBP"))
```


```{r}
# Now perform clustering
cSeurat <- FindNeighbors(cSeurat, dims = 1:10)

cSeurat <- FindClusters(cSeurat, resolution = 0.5)

# For visualization purposes, we need to generate UMAP reduced dimensionality representation
cSeurat <- RunUMAP(cSeurat, dims = 1:10, verbose = F)

# Plot the UMAP with clustering
DimPlot(cSeurat, reduction = "umap")

# Get the TSNE embedding
cSeurat <- RunTSNE(cSeurat, dims = 1:10, check_duplicates = FALSE)

# Plot the t-SNE with clustering
DimPlot(cSeurat, reduction = "tsne")

# Let's look at cluster sizes
table(cSeurat@meta.data$seurat_clusters)

# Let's plot those clusters!
DimPlot(cSeurat,label.size = 4,repel = T,label = T)

FeaturePlot(cSeurat, features = c("MMP12",
                                   "SERPINE2",
                                   "CSH1",
                                   "ERVW-1"))

```


Let's calculate the cell cycle scores at this point.

```{r fig.height = 4, fig.width = 6.5}
cSeurat <- SetIdent(cSeurat, value = "Main_cluster_name")

# Calculate the cell cycle scores
s.genes <- cc.genes.updated.2019$s.genes
g2m.genes <- cc.genes.updated.2019$g2m.genes

# Somehow this code does not work.
cSeurat <- CellCycleScoring(cSeurat,
                            search = TRUE,
                            s.features = s.genes, 
                            g2m.features = g2m.genes)
table(cSeurat[[]]$Phase)
```

Around 1/3 of the cells are in the S phase.

```{r}
# Check if mitochondrial genes show dependency on cell type
VlnPlot(cSeurat,
        features = "percent.mt") & 
  theme(plot.title = element_text(size = 10),
        legend.position = 'none')

# Check if ribosomal genes show dependency on cell type
VlnPlot(cSeurat,
        features = "percent.rb") & 
  theme(plot.title = element_text(size=10),
        legend.position = 'none')

# Check # of genes and # of transcripts per cluster
VlnPlot(cSeurat,features = 
          c("nCount_RNA","nFeature_RNA")) & 
  theme(plot.title = element_text(size=10))

# Cell cycle score does not seem to depend on the cell type much
FeaturePlot(cSeurat,
            features = c("S.Score","G2M.Score"),
            label.size = 4,
            repel = T,
            label = T) & 
  theme(plot.title = element_text(size = 10))

VlnPlot(cSeurat,features = c("S.Score","G2M.Score")) & 
  theme(plot.title = element_text(size = 10),
        legend.position = 'none')
```

In this case, it looks like most of the cell types show low cell cycle scores, except for the proliferating cells.

```{r}
# Plot in UMAP
DimPlot(cSeurat, 
        group.by = "Phase")
# Most of the clusters contain all three cell cycle states.

# Plot with clusters
compTable.int <- table(cSeurat$Annotation, cSeurat$Phase)
compTable.int <- (compTable.int / rowSums(compTable.int)) * 100
pheatmap(compTable.int, color = WtOrRd_pal, breaks = 1:100)

# FeaturePlot for showing mod score
FeaturePlot(cSeurat, 
            features = c("S.Score", "G2M.Score"))
```

This heatmap is interesting, as most syncytiotrophoblasts are in the G1 phase, and almost all endothelial cells are in the S phase. 

<Differential Expression and Marker Selection>
```{r}
# This part is replaced when using the SCTransform
DefaultAssay(cSeurat) <- "RNA"

# Perform log-normalization
cSeurat <- NormalizeData(cSeurat)

# Find variable features
cSeurat <- FindVariableFeatures(cSeurat, 
                                 selection.method = "vst", 
                                 nfeatures = 2000)
all.genes <- rownames(cSeurat)
cSeurat <- ScaleData(cSeurat, features = all.genes)

# Designate the markers
all.markers <- FindAllMarkers(cSeurat,
                              only.pos = T, 
                              min.pct = 0.5, 
                              logfc.threshold = 0.5)
dim(all.markers)
table(all.markers$cluster)

top3_markers <- as.data.frame(all.markers %>%
                              group_by(cluster) %>% 
                              top_n(n = 3, wt = avg_log2FC))
# We can check the dataframe briefly 
head(top3_markers)
```


Clustering based on Cell Type annotation
```{r fig.height = 4, fig.width = 6}
options(ggrepel.max.overlaps = Inf)
pdf("cSeuratUMAP.pdf")
DimPlot(cSeurat, label = T , repel = T, label.size = 3) + 
  NoLegend()
dev.off()
# Syncytin-1 is a gene that allows the differentiation from cytotrophoblasts to SCTs.
FeaturePlot(cSeurat, features = "ERVW-1")
DimPlot(cSeurat, reduction = "pca")
DimPlot(cSeurat, reduction = "tsne")
DimPlot(cSeurat, reduction = "umap")

# ggsave(filename = "Check.png", plot = p1, width = 30, height = 10)
saveRDS(cSeurat, "cSeurat_annotated.rds")
```
Luckily, it seems that the cell types are preserved even after changing the mitochondrial gene percentage to 5%. Another difference is that the plots show more clustering compared to the 10% threshold, which could be promising in terms of clarity.

## Still Work in Progress

```{r eval = FALSE}
# Pseudotime
# Convert to monocle3 cell_data_set
hSeurat.cds <- as.cell_data_set(hSeurat)
class(hSeurat.cds)

# Re-cluster (required for trajectories in monocle3)
hSeurat.cds <- cluster_cells(hSeurat.cds)

# Learn the trajectory
hSeurat.cds <- learn_graph(hSeurat.cds)

# Plot the trajectory
plot_cells(hSeurat.cds, color_cells_by = "seurat_clusters", show_trajectory_graph = TRUE)

# Find the pseudotime order
DimPlot(hSeurat, group.by = "Phase")
hSeurat.cds <- order_cells(hSeurat.cds)

# Plot pseudotime
plot_cells(hSeurat.cds, color_cells_by = "pseudotime")

# Add pseudotime score back into the dataset
hSeurat$pseudotime_score <- hSeurat.cds@principal_graph_aux@listData$UMAP$pseudotime
hSeurat$pseudotime_score[hSeurat$pseudotime_score == Inf] <- NA

# FeaturePlot with Pseudotime
FeaturePlot(hSeurat, features = c("pseudotime_score")) 
DimPlot(hSeurat, group.by = "Annotation")
FeaturePlot(hSeurat, features = c("pseudotime_score"))
```