---
title: "Liu et al. 2018"
author: "Young-June Kim"
date: "2022-10-05"
output: html_document
---

```{r}
# Load libraries
library(SingleCellExperiment)
library(tidyverse)
library(Seurat)
library(RColorBrewer)
library(scDblFinder)
library(magrittr)
library(Matrix)
library(GEOquery)
library(pheatmap)
library(monocle3)
library(SeuratWrappers)
```

1. Creating the Seurat Object
```{r eval = FALSE}
# Data Wrangling Processes
Lcounts <- read.delim("GSE89497_Human_Placenta_TMP_V2.txt")

Lcounts[1:10, 1:10]


# check for duplicated gene names before setting row names
gene_dups <- rownames(Lcounts) %>% 
  duplicated()

# check the dimensions of the count matrix
dim(Lcounts)

# filter out the duplicated genes
Lcounts <- Lcounts[!gene_dups,]
dim(Lcounts)

head(Lcounts)

# Create the Seurat object
lSeurat <- CreateSeuratObject(counts = Lcounts)

# Let's save the Seurat object as an .rds file to facilitate analysis.
saveRDS(lSeurat, "lSeurat2.rds")
```

For convenience, we can start running from this chunk of code, since we have already saved the Seurat object as a file in our directory.

Since we have the metadata for this dataset, let's add the cell annotations first, and filter the dataset to only include cells from the placenta. This will prevent the Seurat object becoming too large and time-costly to process.

```{r}
lSeurat <- readRDS("lSeurat.rds")
# View(lSeurat@meta.data)

# Check the dimensions of this object
dim(lSeurat)

# First remove the 24 week cells. We want to focus on the first trimester.
first_trimester_cells <- lSeurat@meta.data %>% 
  filter(orig.ident == "HE8W") %>% 
  rownames()

lSeurat <- lSeurat[, first_trimester_cells]

# Check dim() again to confirm the cells are now 8 weeks.
dim(lSeurat)

# Extract the metadata and copy it.
meta_data.copy <- lSeurat@meta.data

meta_data.copy <- meta_data.copy %>% 
  mutate(annotation = 
           gsub("_","", str_extract(rownames(meta_data.copy), "_.+_")))

meta_data.copy %>% 
  group_by(annotation) %>% 
  count()

# Reassign the metadata
lSeurat@meta.data$Annotation <- meta_data.copy$annotation

# check the metadata
head(lSeurat@meta.data)
```


```{r}
# Add in the Mitochondrial PCT% information
lSeurat$percent.mt <- PercentageFeatureSet(lSeurat, pattern = "^MT-")

# nCount_RNA is the number of UMI counts in a cell
hist(lSeurat$nCount_RNA)
summary(lSeurat@meta.data$nCount_RNA)

# nFeature_RNA is the number of different genes that had any reads
hist(lSeurat$nFeature_RNA)
summary(lSeurat@meta.data$nFeature_RNA)

# percent.mt is the percent mitochondrial reads
summary(lSeurat$percent.mt)
# We see that the mitochondrial genes were removed.

# Check for the percentage of ribosomal genes
lSeurat[["percent.rb"]] <- PercentageFeatureSet(lSeurat, pattern = "^RP[SL]")
summary(lSeurat$percent.rb)

dim(lSeurat)
```

We now have a lSeurat object that has 1567 cells, which would be sufficient for downstream analysis.
Another important part for QC is removing doublets.

```{r}
# Find for doublets
lSeurat <- as.SingleCellExperiment(lSeurat)
lSeurat <- scDblFinder(lSeurat)
# Convert back to Seurat object.
lSeurat <- as.Seurat(lSeurat)

# Notice that our identifiers have changed.
levels(Idents(object = lSeurat))

# We want to specify the identifier whether the cell is a singlet or doublet.
table(lSeurat@meta.data$scDblFinder.class)

Idents(object = lSeurat) <- "scDblFinder.class"
lSeurat@meta.data$scDblFinder.class <- as.character(lSeurat@meta.data$scDblFinder.class)

# Subset the Seurat object to only include singlets.
lSeurat <- subset(lSeurat, idents = "singlet")
levels(as.factor(lSeurat@meta.data$scDblFinder.class))

# Check the dimensions of the dataset again.
dim(lSeurat)
```

Now, note that we have 1332 cells 

```{r}
colnames(lSeurat@meta.data)

# convert column names to make the protocol coherent
colnames(lSeurat@meta.data)[2] <- "nCount_RNA"
colnames(lSeurat@meta.data)[3] <- "nFeature_RNA"

# Make a violin plot of the QC columns
plt <- VlnPlot(lSeurat, 
               features = c("nFeature_RNA", "nCount_RNA"),
               ncol = 2,
               pt.size = 0.1) & 
  theme(plot.title = element_text(size=10))

plt
# We can say that most of the QC was already performed.
ggsave(filename = "QC.png", plot = plt, width = 7, height = 3.5)
```

Looking at the violin plot, we do see some extreme values that should be filtered out.
However, let's put the subsetting part for later and proceed to next steps and come back later.
```{r}
# Make scatter plots to compare
plot1 <- FeatureScatter(lSeurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1

# All mitochondrial genes have been removed!
# All of the cells have a million counts!
```

Since we want to compare the difference between subsetted data and the control (no subsetting), we make a copy for the lSeurat.
```{r eval = F}
subset_lSeurat <- lSeurat
subset_lSeurat <- subset(subset_lSeurat, subset = nFeature_RNA > 3000)
dim(subset_lSeurat)
```
Note that this number of cells is approximately identical to that of the publication (they used the same threshold)

Next, we need to normalize the data.

```{r}
# Next, we normalize the data
lSeurat <- NormalizeData(lSeurat)

# Explore the most variable features, let's use the default version first.
lSeurat <- FindVariableFeatures(lSeurat)
# lSeurat <- FindVariableFeatures(lSeurat, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(lSeurat), 10) 
top10

# plot variable features with and without labels
plot2 <- VariableFeaturePlot(lSeurat)
plot2 <- LabelPoints(plot = plot2, points = top10, repel = TRUE)
plot2

# Convert normalized gene expression to Z-score
all.genes <- rownames(lSeurat)
lSeurat <- ScaleData(lSeurat, features = all.genes)

```

Before moving on, let's compare the above with the subsetted Seurat object.
```{r eval = F}
# Next, we normalize the data
subset_lSeurat <- NormalizeData(subset_lSeurat)

# Explore the most variable features, let's use the default version first.
subset_lSeurat <- FindVariableFeatures(subset_lSeurat)
# subset_lSeurat <- FindVariableFeatures(subset_lSeurat, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10_subset <- head(VariableFeatures(subset_lSeurat), 10) 
top10_subset

# plot variable features with and without labels
plot2_subset <- VariableFeaturePlot(subset_lSeurat)
plot2_subset <- LabelPoints(plot = plot2, points = top10, repel = TRUE)
plot2_subset

# Convert normalized gene expression to Z-score
all.genes_subset <- rownames(subset_lSeurat)
subset_lSeurat <- ScaleData(subset_lSeurat, features = all.genes)

# Check if the top 10 hvg are the same
top10_subset %in% top10
# There are two genes that are different after subsetting!
```

Now, we are ready to run PCA.
```{r}
# Run PCA analysis
lSeurat <- RunPCA(lSeurat, features = VariableFeatures(object = lSeurat))

# Plot the PCA
DimPlot(lSeurat, reduction = "pca")

# Plot the loadings
VizDimLoadings(lSeurat, dims = 1:2, reduction = "pca") & 
  theme(axis.text=element_text(size=8), axis.title=element_text(size = 8,face="bold"))

# Check the elbow plot to see how much variation is covered for each PC
ElbowPlot(lSeurat)

# FeaturePlot of PC1 top features
FeaturePlot(lSeurat, features = c("LAPTM5", 
                                   "TYROBP"))

# FeaturePlot of PC2 top features
FeaturePlot(lSeurat, features = c("COL6A1", 
                                   "COL6A2"))
```

Next, we move on to derive clusters.
```{r}
# Now perform clustering
lSeurat <- FindNeighbors(lSeurat, dims = 1:10)

lSeurat <- FindClusters(lSeurat, resolution = 0.5)

# For visualization purposes, we need to generate UMAP reduced dimensionality representation
lSeurat <- RunUMAP(lSeurat, dims = 1:10, verbose = F)

# Plot the UMAP with clustering
DimPlot(lSeurat, reduction = "umap")

lSeurat

# Get the TSNE embedding
lSeurat <- RunTSNE(lSeurat, dims = 1:10, check_duplicates = FALSE)

# Plot the t-SNE with clustering
DimPlot(lSeurat, reduction = "tsne")

# Let's look at cluster sizes
table(lSeurat@meta.data$seurat_clusters)

# Let's plot those clusters!
DimPlot(lSeurat,label.size = 4,repel = T,label = T)


# Check expression patterns for some genes in top10
FeaturePlot(lSeurat, features = c("FN1",
                                   "SERPINE2",
                                   "IGF2",
                                   "RGS5"))

```
Let's calculate the cell cycle scores at this point.

```{r}
lSeurat <- SetIdent(lSeurat, value = "Annotation")

# Calculate the cell cycle scores
s.genes <- cc.genes.updated.2019$s.genes
g2m.genes <- cc.genes.updated.2019$g2m.genes

lSeurat <- CellCycleScoring(lSeurat,
                            search = TRUE,
                            s.features = s.genes, 
                            g2m.features = g2m.genes)

table(lSeurat[[]]$Phase)

# Check if ribosomal genes show dependency on cluster
FeaturePlot(lSeurat,features = "percent.rb",
            label.size = 4,
            repel = T,
            label = F) & 
  theme(plot.title = element_text(size=10))

# Check using the violin plot
VlnPlot(lSeurat,
        features = "percent.rb") & 
  theme(plot.title = element_text(size=10),
        legend.position = 'none')

# Check # of genes per cluster
VlnPlot(lSeurat,features = "nFeature_RNA") & 
  theme(plot.title = element_text(size=10),
        legend.position = 'none')

# Cell cycle score does not seem to depend on the cell type much
FeaturePlot(lSeurat,
            features = c("S.Score","G2M.Score"),
            label.size = 4,
            repel = T,
            label = T) & 
  theme(plot.title = element_text(size = 10))

VlnPlot(lSeurat, features = c("S.Score","G2M.Score")) & 
  theme(plot.title = element_text(size = 10),
        legend.position = 'none')

# Plot in UMAP
DimPlot(lSeurat, 
        group.by = "Phase")

# Plot with clusters
compTable.int <- table(lSeurat$Annotation, lSeurat$Phase)
compTable.int <- (compTable.int / rowSums(compTable.int)) * 100
WtOrRd_pal <- colorRampPalette(c("#FFFFFF", brewer.pal(9, "OrRd")))(100)
pheatmap(compTable.int, color = WtOrRd_pal, breaks = 1:100)


# FeaturePlot for showing mod score
FeaturePlot(lSeurat, 
            features = c("S.Score", "G2M.Score"))

```

Note that most of the cells are in the G1 phase.

<Differential Expression and Marker Selection>
```{r}
DefaultAssay(lSeurat) <- "RNA"

# Perform log-normalization
lSeurat <- NormalizeData(lSeurat)

# Find variable features
lSeurat <- FindVariableFeatures(lSeurat, 
                                 selection.method = "vst", 
                                 nfeatures = 2000)
all.genes <- rownames(lSeurat)
lSeurat <- ScaleData(lSeurat, features = all.genes)

# Designate the markers
all.markers <- FindAllMarkers(lSeurat,
                              only.pos = T, 
                              min.pct = 0.5, 
                              logfc.threshold = 0.5)
dim(all.markers)
table(all.markers$cluster)

top3_markers <- as.data.frame(all.markers %>%
                              group_by(cluster) %>% 
                              top_n(n = 3, wt = avg_log2FC))
head(top3_markers)
dim(lSeurat)
```

Clustering based on Cell Type annotation
```{r}
lSeurat <- SetIdent(lSeurat, value = "Annotation")

p1 <- DimPlot(lSeurat, label = T , repel = T, label.size = 3) + NoLegend()
p2 <- DimPlot(lSeurat, reduction = "pca")
p3 <- DimPlot(lSeurat, reduction = "tsne")
p4 <- DimPlot(lSeurat, reduction = "umap")

pdf("lSeuratUMAP.pdf")
DimPlot(lSeurat, label = T , repel = T, label.size = 5) + 
  NoLegend()
dev.off()

ggsave(filename = "UMAP.png", plot = p1, width = 7, height = 3.5)
ggsave(filename = "tsne.png", plot = p3, width = 7, height = 3.5)
ggsave(filename = "umap_legend.png", plot = p4, width = 6, height = 3.5)

# ggsave(filename = "Check.png", plot = p1, width = 30, height = 10)
saveRDS(lSeurat, "lSeurat_annotated.rds")
lSeurat <- readRDS("/home/youkim/MetaMarkers/4-dataset-test/lSeurat_annotated.rds")
```

# Still Work in Progress
```{r}
# Pseudotime
# Convert to monocle3 cell_data_set
lSeurat.cds <- as.cell_data_set(lSeurat)
class(lSeurat.cds)

# Re-cluster (required for trajectories in monocle3)
lSeurat.cds <- cluster_cells(lSeurat.cds)

# Learn the trajectory
lSeurat.cds <- learn_graph(lSeurat.cds)

# Plot the trajectory
plot_cells(lSeurat.cds, color_cells_by = "seurat_clusters", show_trajectory_graph = TRUE)

# Find the pseudotime order
DimPlot(lSeurat, group.by = "Phase")
lSeurat.cds <- order_cells(lSeurat.cds)

# Plot pseudotime
plot_cells(lSeurat.cds, color_cells_by = "pseudotime")

# Add pseudotime score back into the dataset
lSeurat$pseudotime_score <- lSeurat.cds@principal_graph_aux@listData$UMAP$pseudotime
lSeurat$pseudotime_score[lSeurat$pseudotime_score == Inf] <- NA

# FeaturePlot with Pseudotime
FeaturePlot(lSeurat, features = c("pseudotime_score")) 
DimPlot(lSeurat, group.by = "Annotation")
FeaturePlot(lSeurat, features = c("pseudotime_score"))
```