---
title: "Vento-Tormo et al.(2018)"
author: "Young-June Kim"
date: "2022-10-15"
output: html_document
---
```{r}
# Load libraries
library(SingleCellExperiment)
library(tidyverse)
library(Seurat)
library(RColorBrewer)
library(scDblFinder)
library(magrittr)
```

1. Creating the Seurat Object
```{r eval = FALSE}
# Data Wrangling Processes
VTcounts2 <- read.delim("raw_data_10x.txt")
VTcounts2[1:10, 1:10]

# We notice that the gene names are too lengthy
VTcounts2[,1] <- sub("_.*", "", VTcounts2[,1])

# check for duplicated gene names before setting row names
gene_dups <- VTcounts2[,1] %>% 
  duplicated()

# check the dimensions of the count matrix
dim(VTcounts2)

# filter out the duplicated genes
VTcounts2 <- VTcounts2[!gene_dups,]
dim(VTcounts2)

rownames(VTcounts2) <- NULL

# set the row names
VTcounts2 %<>% 
  column_to_rownames("Gene")

head(VTcounts2)

# Create the Seurat object
vtSeurat2 <- CreateSeuratObject(counts = VTcounts2)

# Let's save the Seurat object as an .rds file to facilitate analysis.
saveRDS(vtSeurat2, "vtSeurat2.rds")
```

For convenience, we can start running from this chunk of code, since we have already saved the Seurat object as a file in our directory.

Since we have the metadata for this dataset, let's add the cell annotations first, and filter the dataset to only include cells from the placenta. This will prevent the Seurat object becoming too large and time-costly to process.

```{r}
vtSeurat2 <- readRDS("vtSeurat2.rds")
# View(vtSeurat2@meta.data)

# Check the dimensions of this object
dim(vtSeurat2)

# Load the cell annotation data
annotations <- read_delim(file = "meta_10x.txt")

# Check if the number of samples (cells) in vtSeurat2 is identical to the number of rows of this annotation file.

dim(annotations)[1] == dim(vtSeurat2)[2]

# make the annotations object compatible for data analysis.
anno_revised <- annotations %>% 
  separate(annotation, 
           into = c("Cluster", "Annotation"),
           sep = "\t") %>% 
  mutate(Location = final_cluster,
         Sample = location) %>% 
  select(-location, -final_cluster)

# Extract the metadata
meta_data.copy <- vtSeurat2@meta.data

# make new column that copies the rownames
meta_data.copy %<>% 
  mutate(Fetus = rownames(meta_data.copy))

# add the annotations!!!
meta_data.copy <- 
  inner_join(meta_data.copy, anno_revised, by = 'Fetus')

# Reassign the metadata
vtSeurat2@meta.data$Cluster <- meta_data.copy$Cluster
vtSeurat2@meta.data$Annotation <- meta_data.copy$Annotation
vtSeurat2@meta.data$Location <- meta_data.copy$Location
vtSeurat2@meta.data$Sample <- meta_data.copy$Sample

# Set the identity (classifier) of the cells based on the location

# Set the column "Location" as characters
vtSeurat2@meta.data$Location <- as.character(vtSeurat2@meta.data$Location)

Idents(object = vtSeurat2) <- "Location"
levels(Idents(vtSeurat2))


vtSeurat2@meta.data %>% 
  filter(Location == "Placenta")
# Subset the vtSeurat2 object to a smaller one.
vtSeurat2 <- subset(vtSeurat2, idents = "Placenta")

# The Seurat object became smaller.
dim(vtSeurat2)

# Let's save this object as another RDS file
saveRDS(vtSeurat2, "vtSeurat2_placenta.rds")
```


```{r}
vtSeurat2 <- readRDS("vtSeurat2_placenta.rds")

# Add in the Mitochondrial PCT% information
vtSeurat2$percent.mt <- PercentageFeatureSet(vtSeurat2, pattern = "^MT-")

# nCount_RNA is the number of UMI counts in a cell
hist(vtSeurat2$nCount_RNA)
summary(vtSeurat2@meta.data$nCount_RNA)

# nFeature_RNA is the number of different genes that had any reads
hist(vtSeurat2$nFeature_RNA)
summary(vtSeurat2@meta.data$nFeature_RNA)

# percent.mt is the percent mitochondrial reads
summary(vtSeurat2$percent.mt)
# We see that the mitochondrial genes were removed.

# Check for the percentage of ribosomal genes
vtSeurat2[["percent.rb"]] <- PercentageFeatureSet(vtSeurat2, pattern = "^RP[SL]")
summary(vtSeurat2$percent.rb)

```

We now have a nice vtSeurat object that contains only cells from the placenta. Luckily, we still have 18547 cells, which would be sufficient for downstream analysis.
Another important part for QC is removing doublets.

```{r}
# Find for doublets
vtSeurat2 <- as.SingleCellExperiment(vtSeurat2)
vtSeurat2 <- scDblFinder(vtSeurat2)
# Convert back to Seurat object.
vtSeurat2 <- as.Seurat(vtSeurat2)

# Notice that our identifiers have changed.
levels(Idents(object = vtSeurat2))

# We want to specify the identifier whether the cell is a singlet or doublet.
table(vtSeurat2@meta.data$scDblFinder.class)

Idents(object = vtSeurat2) <- "scDblFinder.class"
vtSeurat2@meta.data$scDblFinder.class <- as.character(vtSeurat2@meta.data$scDblFinder.class)

# Subset the Seurat object to only include singlets.
vtSeurat2 <- subset(vtSeurat2, idents = "singlet")
levels(as.factor(vtSeurat2@meta.data$scDblFinder.class))

# Check the dimensions of the dataset again.
dim(vtSeurat2)
```

Now, note that we have 16779 cells that are derived from the placenta.

```{r}
colnames(vtSeurat2@meta.data)

# convert column names to make the protocol coherent
colnames(vtSeurat2@meta.data)[2] <- "nCount_RNA"
colnames(vtSeurat2@meta.data)[3] <- "nFeature_RNA"

# Make a violin plot of the QC columns
plt <- VlnPlot(vtSeurat2, 
               features = c("nFeature_RNA", "nCount_RNA" ,"percent.rb"),
               ncol = 3,
               pt.size = 0.1) & 
  theme(plot.title = element_text(size=10))

plt
ggsave(filename = "QC.png", plot = plt, width = 7, height = 3.5)
```

Looking at the violin plot, we do see some extreme values that should be filtered out.
However, let's put the subsetting part for later and proceed to next steps and come back later.
```{r}
# Make scatter plots to compare
plot1 <- FeatureScatter(vtSeurat2, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1

# Looking at the scatterplot, we should subset to remove some outliers
# This is because further data analysis steps requires the assumption that the data has relatively moderate, homogeneous technical variants.
dim(vtSeurat2)
# vtSeurat2 <- subset(vtSeurat2, subset = nFeature_RNA > 200 &
#                  nFeature_RNA < 2500 & percent.mt < 5)
```

Since we want to compare the difference between subsetted data and the control (no subsetting), we make a copy for the vtSeurat2.
```{r}
subset_vtSeurat2 <- vtSeurat2
subset_vtSeurat2 <- subset(subset_vtSeurat2, subset = nFeature_RNA > 2000 & nFeature_RNA < 6000 & nCount_RNA < 750000)
```


Next, we need to normalize the data.

```{r}
# Next, we normalize the data
vtSeurat2 <- NormalizeData(vtSeurat2)

# Explore the most variable features, let's use the default version first.
vtSeurat2 <- FindVariableFeatures(vtSeurat2)
# vtSeurat2 <- FindVariableFeatures(vtSeurat2, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(vtSeurat2), 10) 
top10

# plot variable features with and without labels
plot2 <- VariableFeaturePlot(vtSeurat2)
plot2 <- LabelPoints(plot = plot2, points = top10, repel = TRUE)
plot2

# Convert normalized gene expression to Z-score
all.genes <- rownames(vtSeurat2)
vtSeurat2 <- ScaleData(vtSeurat2, features = all.genes)

```

Before moving on, let's compare the above with the subsetted Seurat object.
```{r}
# Next, we normalize the data
subset_vtSeurat2 <- NormalizeData(subset_vtSeurat2)

# Explore the most variable features, let's use the default version first.
subset_vtSeurat2 <- FindVariableFeatures(subset_vtSeurat2)
# subset_vtSeurat2 <- FindVariableFeatures(subset_vtSeurat2, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10_subset <- head(VariableFeatures(subset_vtSeurat2), 10) 
top10_subset

# plot variable features with and without labels
plot2_subset <- VariableFeaturePlot(subset_vtSeurat2)
plot2_subset <- LabelPoints(plot = plot2, points = top10, repel = TRUE)
plot2_subset

# Convert normalized gene expression to Z-score
all.genes_subset <- rownames(subset_vtSeurat2)
subset_vtSeurat2 <- ScaleData(subset_vtSeurat2, features = all.genes)

# Check if the top 10 hvg are the same
top10_subset %in% top10
# There are two genes that are different after subsetting!
```

Now, we are ready to run PCA.
```{r}
# Run PCA analysis
vtSeurat2 <- RunPCA(vtSeurat2, features = VariableFeatures(object = vtSeurat2))

# Plot the PCA
DimPlot(vtSeurat2, reduction = "pca")

# Plot the loadings
VizDimLoadings(vtSeurat2, dims = 1:2, reduction = "pca") & 
  theme(axis.text=element_text(size=8), axis.title=element_text(size = 8,face="bold"))

# Check the elbow plot to see how much variation is covered for each PC
ElbowPlot(vtSeurat2)

# FeaturePlot of PC1 top features
FeaturePlot(vtSeurat2, features = c("TMSB4X", 
                                   "FTL"))

# FeaturePlot of PC2 top features
FeaturePlot(vtSeurat2, features = c("ENPP1", 
                                   "MGP"))
```

Next, we move on to derive clusters.
```{r}
# Now perform clustering
vtSeurat2 <- FindNeighbors(vtSeurat2, dims = 1:10)

vtSeurat2 <- FindClusters(vtSeurat2, resolution = 0.5)

# For visualization purposes, we need to generate UMAP reduced dimensionality representation
vtSeurat2 <- RunUMAP(vtSeurat2, dims = 1:10, verbose = F)

# Plot the UMAP with clustering
DimPlot(vtSeurat2, reduction = "umap")

# Get the TSNE embedding
vtSeurat2 <- RunTSNE(vtSeurat2, dims = 1:10)

# Plot the t-SNE with clustering
DimPlot(vtSeurat2, reduction = "tsne")

# Let's look at cluster sizes
table(vtSeurat2@meta.data$seurat_clusters)

# Let's plot those clusters!
DimPlot(vtSeurat2,label.size = 4,repel = T,label = T)


# Check expression patterns for some genes in top10
FeaturePlot(vtSeurat2, features = c("FN1",
                                   "SERPINE2",
                                   "IGF2",
                                   "RGS5"))

```

<Differential Expression and Marker Selection>
```{r}
DefaultAssay(vtSeurat2) <- "RNA"

# Perform log-normalization
vtSeurat2 <- NormalizeData(vtSeurat2)

# Find variable features
vtSeurat2 <- FindVariableFeatures(vtSeurat2, 
                                 selection.method = "vst", 
                                 nfeatures = 2000)
all.genes <- rownames(vtSeurat2)
vtSeurat2 <- ScaleData(vtSeurat2, features = all.genes)

# Designate the markers
all.markers <- FindAllMarkers(vtSeurat2,
                              only.pos = T, 
                              min.pct = 0.5, 
                              logfc.threshold = 0.5)
dim(all.markers)
table(all.markers$cluster)

top3_markers <- as.data.frame(all.markers %>%
                              group_by(cluster) %>% 
                              top_n(n = 3, wt = avg_log2FC))
head(top3_markers)
```

Clustering based on Cell Type annotation
```{r}
vtSeurat2 <- SetIdent(vtSeurat2, value = "Annotation")

DimPlot(vtSeurat2, label = T , repel = T, label.size = 3) + NoLegend()
DimPlot(vtSeurat2, reduction = "pca")
DimPlot(vtSeurat2, reduction = "tsne")
DimPlot(vtSeurat2, reduction = "umap")

# ggsave(filename = "Check.png", plot = p1, width = 30, height = 10)
saveRDS(vtSeurat2, "vtSeurat2_p_unfiltered.rds")
```


```{r eval = FALSE}
# Calculate the cell cycle scores
s.genes <- cc.genes.updated.2019$s.genes

g2m.genes <- cc.genes.updated.2019$g2m.genes

# Somehow this code does not work.
vtSeurat <- CellCycleScoring(vtSeurat, s.features = s.genes, g2m.features = g2m.genes)
table(vtSeurat[[]]$Phase)

FeaturePlot(vtSeurat, features = "percent.mt",label.size = 4,repel = T,label = F) & 
  theme(plot.title = element_text(size=10))

VlnPlot(vtSeurat,features = "percent.mt") & theme(plot.title = element_text(size=10))

FeaturePlot(vtSeurat,features = "percent.rb",label.size = 4,repel = T,label = F) & theme(plot.title = element_text(size=10))

VlnPlot(vtSeurat,features = "percent.rb") & theme(plot.title = element_text(size=10))


VlnPlot(vtSeurat,features = c("nCount_originalexp","nFeature_originalexp")) & 
  theme(plot.title = element_text(size=10))

# Cell cycle score does not seem to depend on the cell type much
FeaturePlot(vtSeurat,features = c("S.Score","G2M.Score"),label.size = 4,repel = T,label = T) & 
  theme(plot.title = element_text(size=10))
VlnPlot(vtSeurat,features = c("S.Score","G2M.Score")) & 
  theme(plot.title = element_text(size=10))
```

# Still Work in Progress

8.3 SCTransform normalization and clustering
```{r eval = FALSE}
# Proceed to SCTransform
vtSeurat <- SCTransform(vtSeurat)

# View the object
vtSeurat
GetAssay(vtSeurat)

# Standard PCA, UMAP, and clustering
vtSeurat <- RunPCA(vtSeurat, verbose = F)
DimPlot(vtSeurat, reduction = "pca")
vtSeurat <- RunUMAP(vtSeurat, dims = 1:30, verbose = F)
DimPlot(vtSeurat, reduction = "umap")
vtSeurat <- FindNeighbors(vtSeurat, dims = 1:30, verbose = F)
vtSeurat <- FindClusters(vtSeurat, verbose = F)
table(vtSeurat[[]]$seurat_clusters)

DimPlot(vtSeurat, label = T)

FeaturePlot(vtSeurat,"SPP1") & 
  scale_colour_gradientn(colours = rev(brewer.pal(n = 11, name = "Spectral")))

FeaturePlot(vtSeurat,"CSH1") & 
  scale_colour_gradientn(colours = rev(brewer.pal(n = 11, name = "Spectral")))

vtSeurat %>% 
  SCTransform(return.only.var.genes = FALSE) %>%
  RunPCA(features = VariableFeatures(object = .)) %>%
  FindNeighbors(dims = 1:40) %>%
  FindClusters(resolution = c(0.5, 0.6, 0.8, 1, 1.2, 1.5, 1.8,2,2.5,3)) %>%
  RunUMAP(dims = 1:40) %>%
  RunTSNE(dims = 1:40)

# Using scanpy
install.packages("renv")
```