---
title: "Han et al."
author: "Young-June Kim"
date: "2022-10-05"
output: html_document
---
```{r}
# Load libraries
library(SingleCellExperiment)
library(tidyverse)
library(Seurat)
library(RColorBrewer)
library(scDblFinder)
library(magrittr)
library(pheatmap)

# Define color palettes
WtOrRd_pal <- colorRampPalette(c("#FFFFFF", brewer.pal(9, "OrRd")))(100)
```

1. Creating the Seurat Object
```{r eval = FALSE}
# Data Wrangling Processes
Hcounts <- read.delim("GSM4008722_Placenta1_dge.txt")
Hcounts[1:10, 1:10]

# set the row names
Hcounts %<>% 
  column_to_rownames("GENE")

head(Hcounts)

# We have our prepared annotations ready as well
# Load the cell annotation data
annotations <- read_csv(file = "Placenta1_rmbatchAnno.csv")

# Notice that the number of samples (cells) in shSeurat is not identical to the number of rows of this annotation file.
dim(annotations)[1] == dim(shSeurat)[2]

sweep <- (colnames(shSeurat) %in% (annotations$Cell_barcode))

# Filter out the count data
Hcounts <- Hcounts[, sweep]

# Create the Seurat object
shSeurat <- CreateSeuratObject(counts = Hcounts)
```

Let's continue adding the annotations to the metadata.
```{r eval = FALSE}
# make the annotations object compatible for data analysis.
anno_revised <- annotations %>% 
  select(Cell_barcode, ident, Celltype)

# Extract the metadata
meta_data.copy <- shSeurat@meta.data

# make new column that copies the rownames
meta_data.copy %<>% 
  mutate(Cell_barcode = rownames(meta_data.copy))

# add the annotations!!!
meta_data.copy <- 
  inner_join(meta_data.copy, anno_revised, by = 'Cell_barcode')

# Reassign the metadata
shSeurat@meta.data$Cluster <- meta_data.copy$ident
shSeurat@meta.data$Annotation <- meta_data.copy$Celltype

# The Seurat object became smaller.
dim(shSeurat)

# Use head() to inspect the metadata
head(shSeurat@meta.data)

# Let's save this object to overwrite RDS file
saveRDS(shSeurat, "shSeurat.rds")

```

For convenience, we can start running from this chunk of code, since we have already saved the Seurat object as a file in our directory.

```{r}
# Start running from this line of code
shSeurat <- readRDS("shSeurat.rds")

# Add in the Mitochondrial PCT% information
shSeurat$percent.mt <- PercentageFeatureSet(shSeurat, pattern = "^MT-")

# nCount_RNA is the number of UMI counts in a cell
hist(shSeurat$nCount_RNA)
summary(shSeurat@meta.data$nCount_RNA)

# nFeature_RNA is the number of different genes that had any reads
hist(shSeurat$nFeature_RNA)
summary(shSeurat@meta.data$nFeature_RNA)

# percent.mt is the percent mitochondrial reads
summary(shSeurat$percent.mt)

# Later, we will subset the Seurat object based on a percent.mt cutoff.

# Check for the percentage of ribosomal genes
shSeurat[["percent.rb"]] <- PercentageFeatureSet(shSeurat, pattern = "^RP[SL]")
summary(shSeurat$percent.rb)
```

Another important part for QC is removing doublets.
```{r removing_doublets}
# Find for doublets
shSeurat <- as.SingleCellExperiment(shSeurat)
shSeurat <- scDblFinder(shSeurat)
# Convert back to Seurat object.
shSeurat <- as.Seurat(shSeurat)

# Notice that our identifiers have changed.
levels(Idents(object = shSeurat))

# We want to specify the identifier whether the cell is a singlet or doublet.
table(shSeurat@meta.data$scDblFinder.class)

Idents(object = shSeurat) <- "scDblFinder.class"
shSeurat@meta.data$scDblFinder.class <- as.character(shSeurat@meta.data$scDblFinder.class)

# Subset the Seurat object to only include singlets.
shSeurat <- subset(shSeurat, idents = "singlet")
levels(as.factor(shSeurat@meta.data$scDblFinder.class))
```

```{r}
# convert column names to make the protocol coherent
colnames(shSeurat@meta.data)[2] <- "nCount_RNA"
colnames(shSeurat@meta.data)[3] <- "nFeature_RNA"

# We draw a violin plot to explore the values in meta data
VlnPlot(shSeurat, 
        features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rb"),
        ncol = 4,
        pt.size = 0.1) & 
  theme(plot.title = element_text(size=10))
```

```{r}
# The following four lines of code are for scatterplots addressing the correlations of every 2 pairs
FeatureScatter(shSeurat, feature1 = "nCount_RNA", feature2 = "percent.mt")

FeatureScatter(shSeurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

FeatureScatter(shSeurat, feature1 = "nCount_RNA", feature2 = "percent.rb")

FeatureScatter(shSeurat, feature1 = "percent.rb", feature2 = "percent.mt")

# We also check if there is correlation between number of features and the doublet score
FeatureScatter(shSeurat, feature1 = "nFeature_RNA", feature2 = "scDblFinder.score")

# Since the typical cutoff for percent.mt is 5%, we will try out < 5. 
# Last week we used 10%, and couldn't see much difference.
# Also, let't filter out cells that have less than 5% ribosomal content.
shSeurat <- subset(shSeurat, subset = percent.mt < 5 & percent.rb > 5)

dim(shSeurat)
```
I tried to subset the dataset based on nCount_RNA > 500 as written in the publication, but the dataset seems to be already filtered.
Now, we have a reduced version of the Seurat object, where the cells contain less than 5% mitochondrial genes.

```{r}
# Next, we normalize the data
shSeurat <- NormalizeData(shSeurat)

# Explore the most variable features, let's use the default version first.
shSeurat <- FindVariableFeatures(shSeurat)
# shSeurat <- FindVariableFeatures(shSeurat, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(shSeurat), 10) 
top10

# plot variable features with and without labels
plot2 <- VariableFeaturePlot(shSeurat)
plot2 <- LabelPoints(plot = plot2, points = top10, repel = TRUE)
plot2

# Convert normalized gene expression to Z-score
all.genes <- rownames(shSeurat)
shSeurat <- ScaleData(shSeurat, features = all.genes)
```


```{r}
# Run PCA analysis
shSeurat <- RunPCA(shSeurat, features = VariableFeatures(object = shSeurat))

# Plot the PCA
DimPlot(shSeurat, reduction = "pca")

# Plot the loadings
VizDimLoadings(shSeurat, dims = 1:2, reduction = "pca") & 
  theme(axis.text=element_text(size=8), axis.title=element_text(size = 8,face="bold"))

# Check the elbow plot to see how much variation is covered for each PC
ElbowPlot(shSeurat)

# FeaturePlot of PC1 top features
FeaturePlot(shSeurat, features = c("EBI3", 
                                   "KRT7"))

# FeaturePlot of PC2 top features
FeaturePlot(shSeurat, features = c("B2M", 
                                   "TYROBP"))
```


```{r}
# Now perform clustering
shSeurat <- FindNeighbors(shSeurat, dims = 1:10)

shSeurat <- FindClusters(shSeurat, resolution = 0.5)

# For visualization purposes, we need to generate UMAP reduced dimensionality representation
shSeurat <- RunUMAP(shSeurat, dims = 1:10, verbose = F)

# Plot the UMAP with clustering
DimPlot(shSeurat, reduction = "umap")

# Get the TSNE embedding
shSeurat <- RunTSNE(shSeurat, dims = 1:10)

# Plot the t-SNE with clustering
DimPlot(shSeurat, reduction = "tsne")

# Let's look at cluster sizes
table(shSeurat@meta.data$seurat_clusters)

# Let's plot those clusters!
DimPlot(shSeurat,label.size = 4,repel = T,label = T)

FeaturePlot(shSeurat, features = c("MMP12",
                                   "SERPINE2",
                                   "CSH1",
                                   "CSH2"))

```


Let's calculate the cell cycle scores at this point.

```{r fig.height = 4, fig.width = 6.5}
shSeurat <- SetIdent(shSeurat, value = "Annotation")

# Calculate the cell cycle scores
s.genes <- cc.genes.updated.2019$s.genes
g2m.genes <- cc.genes.updated.2019$g2m.genes

# Somehow this code does not work.
shSeurat <- CellCycleScoring(shSeurat,
                            search = TRUE,
                            s.features = s.genes, 
                            g2m.features = g2m.genes)
table(shSeurat[[]]$Phase)
```

Around 1/3 of the cells are in the S phase.

```{r}
# Check if mitochondrial genes show dependency on cell type
VlnPlot(shSeurat,
        features = "percent.mt") & 
  theme(plot.title = element_text(size = 10),
        legend.position = 'none')

# Check if ribosomal genes show dependency on cell type
VlnPlot(shSeurat,
        features = "percent.rb") & 
  theme(plot.title = element_text(size=10),
        legend.position = 'none')

# Check # of genes and # of transcripts per cluster
VlnPlot(shSeurat,features = 
          c("nCount_RNA","nFeature_RNA")) & 
  theme(plot.title = element_text(size=10))

# Cell cycle score does not seem to depend on the cell type much
FeaturePlot(shSeurat,
            features = c("S.Score","G2M.Score"),
            label.size = 4,
            repel = T,
            label = T) & 
  theme(plot.title = element_text(size = 10))

VlnPlot(shSeurat,features = c("S.Score","G2M.Score")) & 
  theme(plot.title = element_text(size = 10),
        legend.position = 'none')
```

In this case, it looks like most of the cell types show low cell cycle scores, except for the proliferating cells.

```{r}
# Plot in UMAP
DimPlot(shSeurat, 
        group.by = "Phase")
# Most of the clusters contain all three cell cycle states.

# Plot with clusters
compTable.int <- table(shSeurat$Annotation, shSeurat$Phase)
compTable.int <- (compTable.int / rowSums(compTable.int)) * 100
pheatmap(compTable.int, color = WtOrRd_pal, breaks = 1:100)

# FeaturePlot for showing mod score
FeaturePlot(shSeurat, 
            features = c("S.Score", "G2M.Score"))
```

This heatmap is interesting, as most syncytiotrophoblasts are in the G1 phase, and almost all endothelial cells are in the S phase. 

<Differential Expression and Marker Selection>
```{r}
# This part is replaced when using the SCTransform
DefaultAssay(shSeurat) <- "RNA"

# Perform log-normalization
shSeurat <- NormalizeData(shSeurat)

# Find variable features
shSeurat <- FindVariableFeatures(shSeurat, 
                                 selection.method = "vst", 
                                 nfeatures = 2000)
all.genes <- rownames(shSeurat)
shSeurat <- ScaleData(shSeurat, features = all.genes)

# Designate the markers
all.markers <- FindAllMarkers(shSeurat,
                              only.pos = T, 
                              min.pct = 0.5, 
                              logfc.threshold = 0.5)
dim(all.markers)
table(all.markers$cluster)

top3_markers <- as.data.frame(all.markers %>%
                              group_by(cluster) %>% 
                              top_n(n = 3, wt = avg_log2FC))
# We can check the dataframe briefly 
head(top3_markers)
```


Clustering based on Cell Type annotation
```{r fig.height = 4, fig.width = 6}
DimPlot(shSeurat, label = T , repel = T, label.size = 3) + NoLegend()
DimPlot(shSeurat, reduction = "pca")
DimPlot(shSeurat, reduction = "tsne")
DimPlot(shSeurat, reduction = "umap")

# ggsave(filename = "Check.png", plot = p1, width = 30, height = 10)
saveRDS(shSeurat, "shSeurat_annotated.rds")
```
Luckily, it seems that the cell types are preserved even after changing the mitochondrial gene percentage to 5%. Another difference is that the plots show more clustering compared to the 10% threshold, which could be promising in terms of clarity.

## Still Work in Progress

```{r eval = FALSE}
# Pseudotime
# Convert to monocle3 cell_data_set
shSeurat.cds <- as.cell_data_set(shSeurat)
class(shSeurat.cds)

# Re-cluster (required for trajectories in monocle3)
shSeurat.cds <- cluster_cells(shSeurat.cds)

# Learn the trajectory
shSeurat.cds <- learn_graph(shSeurat.cds)

# Plot the trajectory
plot_cells(shSeurat.cds, color_cells_by = "seurat_clusters", show_trajectory_graph = TRUE)

# Find the pseudotime order
DimPlot(shSeurat, group.by = "Phase")
shSeurat.cds <- order_cells(shSeurat.cds)

# Plot pseudotime
plot_cells(shSeurat.cds, color_cells_by = "pseudotime")

# Add pseudotime score back into the dataset
shSeurat$pseudotime_score <- shSeurat.cds@principal_graph_aux@listData$UMAP$pseudotime
shSeurat$pseudotime_score[shSeurat$pseudotime_score == Inf] <- NA

# FeaturePlot with Pseudotime
FeaturePlot(shSeurat, features = c("pseudotime_score")) 
DimPlot(shSeurat, group.by = "Annotation")
FeaturePlot(shSeurat, features = c("pseudotime_score"))
```