---
title: "Han et al."
author: "Young-June Kim"
date: "2022-10-05"
output: html_document
---
```{r}
# Load libraries
library(SingleCellExperiment)
library(tidyverse)
library(Seurat)
library(RColorBrewer)
library(scDblFinder)
library(magrittr)
library(pheatmap)

# Define color palettes
WtOrRd_pal <- colorRampPalette(c("#FFFFFF", brewer.pal(9, "OrRd")))(100)
```

1. Creating the Seurat Object
```{r eval = FALSE}
# Data Wrangling Processes
Hcounts <- read.delim("GSM4008722_Placenta1_dge.txt")
Hcounts[1:10, 1:10]

# set the row names
Hcounts %<>% 
  column_to_rownames("GENE")

head(Hcounts)

# We have our prepared annotations ready as well
# Load the cell annotation data
annotations <- read_csv(file = "Placenta1_rmbatchAnno.csv")

# Notice that the number of samples (cells) in hSeurat is not identical to the number of rows of this annotation file.
dim(annotations)[1] == dim(hSeurat)[2]

sweep <- (colnames(hSeurat) %in% (annotations$Cell_barcode))

# Filter out the count data
Hcounts <- Hcounts[, sweep]

# Create the Seurat object
hSeurat <- CreateSeuratObject(counts = Hcounts)
```

Let's continue adding the annotations to the metadata.
```{r eval = FALSE}
# make the annotations object compatible for data analysis.
anno_revised <- annotations %>% 
  select(Cell_barcode, ident, Celltype)

# Extract the metadata
meta_data.copy <- hSeurat@meta.data

# make new column that copies the rownames
meta_data.copy %<>% 
  mutate(Cell_barcode = rownames(meta_data.copy))

# add the annotations!!!
meta_data.copy <- 
  inner_join(meta_data.copy, anno_revised, by = 'Cell_barcode')

# Reassign the metadata
hSeurat@meta.data$Cluster <- meta_data.copy$ident
hSeurat@meta.data$Annotation <- meta_data.copy$Celltype

# The Seurat object became smaller.
dim(hSeurat)

# Use head() to inspect the metadata
head(hSeurat@meta.data)

# Let's save this object to overwrite RDS file
saveRDS(hSeurat, "hSeurat.rds")

```

For convenience, we can start running from this chunk of code, since we have already saved the Seurat object as a file in our directory.

```{r}
# Start running from this line of code
hSeurat <- readRDS("hSeurat.rds")

# Add in the Mitochondrial PCT% information
hSeurat$percent.mt <- PercentageFeatureSet(hSeurat, pattern = "^MT-")

# nCount_RNA is the number of UMI counts in a cell
hist(hSeurat$nCount_RNA)
summary(hSeurat@meta.data$nCount_RNA)

# nFeature_RNA is the number of different genes that had any reads
hist(hSeurat$nFeature_RNA)
summary(hSeurat@meta.data$nFeature_RNA)

# percent.mt is the percent mitochondrial reads
summary(hSeurat$percent.mt)

# Later, we will subset the Seurat object based on a percent.mt cutoff.

# Check for the percentage of ribosomal genes
hSeurat[["percent.rb"]] <- PercentageFeatureSet(hSeurat, pattern = "^RP[SL]")
summary(hSeurat$percent.rb)
```

Another important part for QC is removing doublets.
```{r removing_doublets}
# Find for doublets
hSeurat <- as.SingleCellExperiment(hSeurat)
hSeurat <- scDblFinder(hSeurat)
# Convert back to Seurat object.
hSeurat <- as.Seurat(hSeurat)

# Notice that our identifiers have changed.
levels(Idents(object = hSeurat))

# We want to specify the identifier whether the cell is a singlet or doublet.
table(hSeurat@meta.data$scDblFinder.class)

Idents(object = hSeurat) <- "scDblFinder.class"
hSeurat@meta.data$scDblFinder.class <- as.character(hSeurat@meta.data$scDblFinder.class)

# Subset the Seurat object to only include singlets.
hSeurat <- subset(hSeurat, idents = "singlet")
levels(as.factor(hSeurat@meta.data$scDblFinder.class))
```

```{r}
# convert column names to make the protocol coherent
colnames(hSeurat@meta.data)[2] <- "nCount_RNA"
colnames(hSeurat@meta.data)[3] <- "nFeature_RNA"

# We draw a violin plot to explore the values in meta data
VlnPlot(hSeurat, 
        features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rb"),
        ncol = 4,
        pt.size = 0.1) & 
  theme(plot.title = element_text(size=10))
```

```{r}
# The following four lines of code are for scatterplots addressing the correlations of every 2 pairs
FeatureScatter(hSeurat, feature1 = "nCount_RNA", feature2 = "percent.mt")

FeatureScatter(hSeurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

FeatureScatter(hSeurat, feature1 = "nCount_RNA", feature2 = "percent.rb")

FeatureScatter(hSeurat, feature1 = "percent.rb", feature2 = "percent.mt")

# We also check if there is correlation between number of features and the doublet score
FeatureScatter(hSeurat, feature1 = "nFeature_RNA", feature2 = "scDblFinder.score")

# Since the typical cutoff for percent.mt is 5%, we will try out < 5. 
# Last week we used 10%, and couldn't see much difference.
# Also, let't filter out cells that have less than 5% ribosomal content.
hSeurat <- subset(hSeurat, subset = percent.mt < 5 & percent.rb > 5)

dim(hSeurat)
```
I tried to subset the dataset based on nCount_RNA > 500 as written in the publication, but the dataset seems to be already filtered.
Now, we have a reduced version of the Seurat object, where the cells contain less than 5% mitochondrial genes.

```{r}
# Next, we normalize the data
hSeurat <- NormalizeData(hSeurat)

# Explore the most variable features, let's use the default version first.
hSeurat <- FindVariableFeatures(hSeurat)
# hSeurat <- FindVariableFeatures(hSeurat, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(hSeurat), 10) 
top10

# plot variable features with and without labels
plot2 <- VariableFeaturePlot(hSeurat)
plot2 <- LabelPoints(plot = plot2, points = top10, repel = TRUE)
plot2

# Convert normalized gene expression to Z-score
all.genes <- rownames(hSeurat)
hSeurat <- ScaleData(hSeurat, features = all.genes)
```


```{r}
# Run PCA analysis
hSeurat <- RunPCA(hSeurat, features = VariableFeatures(object = hSeurat))

# Plot the PCA
DimPlot(hSeurat, reduction = "pca")

# Plot the loadings
VizDimLoadings(hSeurat, dims = 1:2, reduction = "pca") & 
  theme(axis.text=element_text(size=8), axis.title=element_text(size = 8,face="bold"))

# Check the elbow plot to see how much variation is covered for each PC
ElbowPlot(hSeurat)

# FeaturePlot of PC1 top features
FeaturePlot(hSeurat, features = c("EBI3", 
                                   "KRT7"))

# FeaturePlot of PC2 top features
FeaturePlot(hSeurat, features = c("B2M", 
                                   "TYROBP"))
```


```{r}
# Now perform clustering
hSeurat <- FindNeighbors(hSeurat, dims = 1:10)

hSeurat <- FindClusters(hSeurat, resolution = 0.5)

# For visualization purposes, we need to generate UMAP reduced dimensionality representation
hSeurat <- RunUMAP(hSeurat, dims = 1:10, verbose = F)

# Plot the UMAP with clustering
DimPlot(hSeurat, reduction = "umap")

# Get the TSNE embedding
hSeurat <- RunTSNE(hSeurat, dims = 1:10)

# Plot the t-SNE with clustering
DimPlot(hSeurat, reduction = "tsne")

# Let's look at cluster sizes
table(hSeurat@meta.data$seurat_clusters)

# Let's plot those clusters!
DimPlot(hSeurat,label.size = 4,repel = T,label = T)

FeaturePlot(hSeurat, features = c("MMP12",
                                   "SERPINE2",
                                   "CSH1",
                                   "CSH2"))

```


Let's calculate the cell cycle scores at this point.

```{r fig.height = 4, fig.width = 6.5}
hSeurat <- SetIdent(hSeurat, value = "Annotation")

# Calculate the cell cycle scores
s.genes <- cc.genes.updated.2019$s.genes
g2m.genes <- cc.genes.updated.2019$g2m.genes

# Somehow this code does not work.
hSeurat <- CellCycleScoring(hSeurat,
                            search = TRUE,
                            s.features = s.genes, 
                            g2m.features = g2m.genes)
table(hSeurat[[]]$Phase)
```

Around 1/3 of the cells are in the S phase.

```{r}
# Check if mitochondrial genes show dependency on cell type
VlnPlot(hSeurat,
        features = "percent.mt") & 
  theme(plot.title = element_text(size = 10),
        legend.position = 'none')

# Check if ribosomal genes show dependency on cell type
VlnPlot(hSeurat,
        features = "percent.rb") & 
  theme(plot.title = element_text(size=10),
        legend.position = 'none')

# Check # of genes and # of transcripts per cluster
VlnPlot(hSeurat,features = 
          c("nCount_RNA","nFeature_RNA")) & 
  theme(plot.title = element_text(size=10))

# Cell cycle score does not seem to depend on the cell type much
FeaturePlot(hSeurat,
            features = c("S.Score","G2M.Score"),
            label.size = 4,
            repel = T,
            label = T) & 
  theme(plot.title = element_text(size = 10))

VlnPlot(hSeurat,features = c("S.Score","G2M.Score")) & 
  theme(plot.title = element_text(size = 10),
        legend.position = 'none')
```

In this case, it looks like most of the cell types show low cell cycle scores, except for the proliferating cells.

```{r}
# Plot in UMAP
DimPlot(hSeurat, 
        group.by = "Phase")
# Most of the clusters contain all three cell cycle states.

# Plot with clusters
compTable.int <- table(hSeurat$Annotation, hSeurat$Phase)
compTable.int <- (compTable.int / rowSums(compTable.int)) * 100
pheatmap(compTable.int, color = WtOrRd_pal, breaks = 1:100)

# FeaturePlot for showing mod score
FeaturePlot(hSeurat, 
            features = c("S.Score", "G2M.Score"))
```

This heatmap is interesting, as most syncytiotrophoblasts are in the G1 phase, and almost all endothelial cells are in the S phase. 

<Differential Expression and Marker Selection>
```{r}
# This part is replaced when using the SCTransform
DefaultAssay(hSeurat) <- "RNA"

# Perform log-normalization
hSeurat <- NormalizeData(hSeurat)

# Find variable features
hSeurat <- FindVariableFeatures(hSeurat, 
                                 selection.method = "vst", 
                                 nfeatures = 2000)
all.genes <- rownames(hSeurat)
hSeurat <- ScaleData(hSeurat, features = all.genes)

# Designate the markers
all.markers <- FindAllMarkers(hSeurat,
                              only.pos = T, 
                              min.pct = 0.5, 
                              logfc.threshold = 0.5)
dim(all.markers)
table(all.markers$cluster)

top3_markers <- as.data.frame(all.markers %>%
                              group_by(cluster) %>% 
                              top_n(n = 3, wt = avg_log2FC))
# We can check the dataframe briefly 
head(top3_markers)
```


Clustering based on Cell Type annotation
```{r fig.height = 4, fig.width = 6}
DimPlot(hSeurat, label = T , repel = T, label.size = 3) + NoLegend()
DimPlot(hSeurat, reduction = "pca")
DimPlot(hSeurat, reduction = "tsne")
DimPlot(hSeurat, reduction = "umap")

# ggsave(filename = "Check.png", plot = p1, width = 30, height = 10)
saveRDS(hSeurat, "hSeurat_annotated.rds")

hSeurat <- readRDS("/home/youkim/MetaMarkers/4-dataset-test/hSeurat_annotated.rds")
pdf("hSeuratUMAP2.pdf")
DimPlot(hSeurat, label = T , repel = T, label.size = 5) + 
  NoLegend()
dev.off()
```
Luckily, it seems that the cell types are preserved even after changing the mitochondrial gene percentage to 5%. Another difference is that the plots show more clustering compared to the 10% threshold, which could be promising in terms of clarity.

## Still Work in Progress

```{r eval = FALSE}
# Pseudotime
# Convert to monocle3 cell_data_set
hSeurat.cds <- as.cell_data_set(hSeurat)
class(hSeurat.cds)

# Re-cluster (required for trajectories in monocle3)
hSeurat.cds <- cluster_cells(hSeurat.cds)

# Learn the trajectory
hSeurat.cds <- learn_graph(hSeurat.cds)

# Plot the trajectory
plot_cells(hSeurat.cds, color_cells_by = "seurat_clusters", show_trajectory_graph = TRUE)

# Find the pseudotime order
DimPlot(hSeurat, group.by = "Phase")
hSeurat.cds <- order_cells(hSeurat.cds)

# Plot pseudotime
plot_cells(hSeurat.cds, color_cells_by = "pseudotime")

# Add pseudotime score back into the dataset
hSeurat$pseudotime_score <- hSeurat.cds@principal_graph_aux@listData$UMAP$pseudotime
hSeurat$pseudotime_score[hSeurat$pseudotime_score == Inf] <- NA

# FeaturePlot with Pseudotime
FeaturePlot(hSeurat, features = c("pseudotime_score")) 
DimPlot(hSeurat, group.by = "Annotation")
FeaturePlot(hSeurat, features = c("pseudotime_score"))
```