---
title: "Han et al."
author: "Young-June Kim"
date: "2022-10-05"
output: html_document
---
```{r}
# Load libraries
library(SingleCellExperiment)
library(tidyverse)
library(Seurat)
library(RColorBrewer)
#BiocManager::install("scDblFinder")
library(scDblFinder)
library(magrittr)
```

1. Creating the Seurat Object
```{r eval = FALSE}
# Data Wrangling Processes
Hcounts <- read.delim("GSM4008722_Placenta1_dge.txt")
Hcounts[1:10, 1:10]

# set the row names
Hcounts %<>% 
  column_to_rownames("GENE")

head(Hcounts)

# Create the Seurat object
hSeurat <- CreateSeuratObject(counts = Hcounts)

# Let's save the Seurat object as an .rds file to facilitate analysis.
saveRDS(hSeurat, "hSeurat.rds")
```

For convenience, we can start running from this chunk of code, since we have already saved the Seurat object as a file in our directory.

```{r}
hSeurat <- readRDS("hSeurat.rds")

# Add in the Mitochondrial PCT% information
hSeurat$percent.mt <- PercentageFeatureSet(hSeurat, pattern = "^MT-")

# nCount_RNA is the number of UMI counts in a cell
hist(hSeurat$nCount_RNA)
summary(hSeurat@meta.data$nCount_RNA)

# nFeature_RNA is the number of different genes that had any reads
hist(hSeurat$nFeature_RNA)
summary(hSeurat@meta.data$nFeature_RNA)

# percent.mt is the percent mitochondrial reads
summary(hSeurat$percent.mt)
# We see that the mitochondrial genes were removed.

# Check for the percentage of ribosomal genes
hSeurat[["percent.rb"]] <- PercentageFeatureSet(hSeurat, pattern = "^RP[SL]")
summary(hSeurat$percent.rb)
```

Another important part for QC is removing doublets.
```{r}
# Find for doublets
hSeurat <- as.SingleCellExperiment(hSeurat)
hSeurat <- scDblFinder(hSeurat)
# Convert back to Seurat object.
hSeurat <- as.Seurat(hSeurat)

# Notice that our identifiers have changed.
levels(Idents(object = hSeurat))

# We want to specify the identifier whether the cell is a singlet or doublet.
table(hSeurat@meta.data$scDblFinder.class)

Idents(object = hSeurat) <- "scDblFinder.class"
hSeurat@meta.data$scDblFinder.class <- as.character(hSeurat@meta.data$scDblFinder.class)

# Subset the Seurat object to only include singlets.
hSeurat <- subset(hSeurat, idents = "singlet")
levels(as.factor(hSeurat@meta.data$scDblFinder.class))
```

```{r}
# convert column names to make the protocol coherent
colnames(hSeurat@meta.data)[2] <- "nCount_RNA"
colnames(hSeurat@meta.data)[3] <- "nFeature_RNA"

# We draw a violin plot to explore the values in meta data
VlnPlot(hSeurat, 
        features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rb"),
        ncol = 4,
        pt.size = 0.1) & 
  theme(plot.title = element_text(size=10))
```

```{r}
# The following four lines of code are for scatterplots addressing the correlations of every 2 pairs
FeatureScatter(hSeurat, feature1 = "nCount_RNA", feature2 = "percent.mt")

FeatureScatter(hSeurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

FeatureScatter(hSeurat, feature1 = "nCount_RNA", feature2 = "percent.rb")

FeatureScatter(hSeurat, feature1 = "percent.rb", feature2 = "percent.mt")

# We also check if there is correlation between number of features and the doublet score
FeatureScatter(hSeurat, feature1 = "nFeature_RNA", feature2 = "scDblFinder.score")
```


```{r eval = FALSE}
# Looking at the scatterplots above, we can subset the data
subset_hSeurat <- hSeurat
max(hSeurat$nFeature_RNA)
subset_hSeurat <- subset(subset_hSeurat, subset = nFeature_RNA > 500 & nFeature_RNA < 2000 & percent.mt < 5)
dim(hSeurat)
```

```{r}
# Next, we normalize the data
hSeurat <- NormalizeData(hSeurat)

# Explore the most variable features, let's use the default version first.
hSeurat <- FindVariableFeatures(hSeurat)
# hSeurat <- FindVariableFeatures(hSeurat, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(hSeurat), 10) 
top10

# plot variable features with and without labels
plot2 <- VariableFeaturePlot(hSeurat)
plot2 <- LabelPoints(plot = plot2, points = top10, repel = TRUE)
plot2

# Convert normalized gene expression to Z-score
all.genes <- rownames(hSeurat)
hSeurat <- ScaleData(hSeurat, features = all.genes)
```


```{r}
# Run PCA analysis
hSeurat <- RunPCA(hSeurat, features = VariableFeatures(object = hSeurat))

# Plot the PCA
DimPlot(hSeurat, reduction = "pca")

# Plot the loadings
VizDimLoadings(hSeurat, dims = 1:2, reduction = "pca") & 
  theme(axis.text=element_text(size=8), axis.title=element_text(size = 8,face="bold"))

# Check the elbow plot to see how much variation is covered for each PC
ElbowPlot(hSeurat)

# FeaturePlot of PC1 top features
FeaturePlot(hSeurat, features = c("COL3A1", 
                                   "COL1A1"))

# FeaturePlot of PC2 top features
FeaturePlot(hSeurat, features = c("FTL", 
                                   "TMSB4X"))
```


```{r}
# Now perform clustering
hSeurat <- FindNeighbors(hSeurat, dims = 1:10)

hSeurat <- FindClusters(hSeurat, resolution = 0.5)

# For visualization purposes, we need to generate UMAP reduced dimensionality representation
hSeurat <- RunUMAP(hSeurat, dims = 1:10, verbose = F)

# Plot the UMAP with clustering
DimPlot(hSeurat, reduction = "umap")

# Get the TSNE embedding
hSeurat <- RunTSNE(hSeurat, dims = 1:10)

# Plot the t-SNE with clustering
DimPlot(hSeurat, reduction = "tsne")

# Let's look at cluster sizes
table(hSeurat@meta.data$seurat_clusters)

# Let's plot those clusters!
DimPlot(hSeurat,label.size = 4,repel = T,label = T)

FeaturePlot(hSeurat, features = c("FN1",
                                   "SERPINE2",
                                   "IGF2",
                                   "RGS5"))

```
<Differential Expression and Marker Selection>
```{r}
DefaultAssay(hSeurat) <- "RNA"

# Perform log-normalization
hSeurat <- NormalizeData(hSeurat)

# Find variable features
hSeurat <- FindVariableFeatures(hSeurat, 
                                 selection.method = "vst", 
                                 nfeatures = 2000)
all.genes <- rownames(hSeurat)
hSeurat <- ScaleData(hSeurat, features = all.genes)

# Designate the markers
all.markers <- FindAllMarkers(hSeurat,
                              only.pos = T, 
                              min.pct = 0.5, 
                              logfc.threshold = 0.5)
dim(all.markers)
table(all.markers$cluster)

top3_markers <- as.data.frame(all.markers %>%
                              group_by(cluster) %>% 
                              top_n(n = 3, wt = avg_log2FC))
head(top3_markers)
```


<Cell Type annotation using SingleR>
```{r}
#install libraries
# BiocManager::install("celldex")
# BiocManager::install("SingleR")

# Load the library
library(SingleR)

hpca.ref <- celldex::HumanPrimaryCellAtlasData()

hSCE <- as.SingleCellExperiment(DietSeurat(hSeurat))
hSCE

hpca.main <- SingleR(test = hSCE, 
                     assay.type.test = 1,
                     ref = hpca.ref,
                     labels = hpca.ref$label.main)

hpca.fine <- SingleR(test = hSCE, 
                     assay.type.test = 1,
                     ref = hpca.ref,
                     labels = hpca.ref$label.fine)

# table(hpca.main$pruned.labels)
# table(hpca.fine$pruned.labels)

hSeurat@meta.data$hpca.main   <- hpca.main$pruned.labels
hSeurat@meta.data$hpca.fine   <- hpca.fine$pruned.labels


hSeurat <- SetIdent(hSeurat, value = "hpca.main")
DimPlot(hSeurat, label = T , repel = T, label.size = 3) + NoLegend()

DimPlot(hSeurat, reduction = "pca")

hSeurat
saveRDS(hSeurat, "hSeurat_annotated.rds")
```

# Still Work in Progress
```{r eval = FALSE}
# Calculate the cell cycle scores
s.genes <- cc.genes.updated.2019$s.genes

g2m.genes <- cc.genes.updated.2019$g2m.genes

placenta <- CellCycleScoring(placenta, s.features = s.genes, g2m.features = g2m.genes)
table(placenta[[]]$Phase)

FeaturePlot(placenta,features = "percent.mt",label.size = 4,repel = T,label = F) & 
  theme(plot.title = element_text(size=10))

VlnPlot(placenta,features = "percent.mt") & theme(plot.title = element_text(size=10))

FeaturePlot(placenta,features = "percent.rb",label.size = 4,repel = T,label = F) & theme(plot.title = element_text(size=10))

VlnPlot(placenta,features = "percent.rb") & theme(plot.title = element_text(size=10))


VlnPlot(placenta,features = c("nCount_originalexp","nFeature_originalexp")) & 
  theme(plot.title = element_text(size=10))

# Cell cycle score does not seem to depend on the cell type much
FeaturePlot(placenta,features = c("S.Score","G2M.Score"),label.size = 4,repel = T,label = T) & 
  theme(plot.title = element_text(size=10))
VlnPlot(placenta,features = c("S.Score","G2M.Score")) & 
  theme(plot.title = element_text(size=10))
```


8.3 SCTransform normalization and clustering
```{r eval = FALSE}
# Proceed to SCTransform
placenta <- SCTransform(placenta, assay = "originalexp", method = "glmGamPoi", ncells = 8824, 
                    vars.to.regress = c("percent.mt","S.Score","G2M.Score"), verbose = F)

# View the object
placenta
GetAssay(placenta)

# Standard PCA, UMAP, and clustering
placenta <- RunPCA(placenta, verbose = F)
placenta <- RunUMAP(placenta, dims = 1:30, verbose = F)
placenta <- FindNeighbors(placenta, dims = 1:30, verbose = F)
placenta <- FindClusters(placenta, verbose = F)
table(placenta[[]]$seurat_clusters)

DimPlot(placenta, label = T)

FeaturePlot(placenta,"SPP1") & 
  scale_colour_gradientn(colours = rev(brewer.pal(n = 11, name = "Spectral")))

FeaturePlot(placenta,"CSH1") & 
  scale_colour_gradientn(colours = rev(brewer.pal(n = 11, name = "Spectral")))
```

For later uses, examine how scanpy works
```{r eval = FALSE}

placenta %>% 
  SCTransform(return.only.var.genes = FALSE) %>%
  RunPCA(features = VariableFeatures(object = .)) %>%
  FindNeighbors(dims = 1:40) %>%
  FindClusters(resolution = c(0.5, 0.6, 0.8, 1, 1.2, 1.5, 1.8,2,2.5,3)) %>%
  RunUMAP(dims = 1:40) %>%
  RunTSNE(dims = 1:40)

# Using scanpy
install.packages("renv")
```