---
title: "Arutyanyan et al. (2023)"
author: "Young-June Kim"
date: "2023-06-14"
output: html_document
---

```{r}
# Load libraries
#install.packages('Seurat')
#remotes::install_github("mojaveazure/seurat-disk")
#BiocManager::install("zellkonverter")
library(Seurat)
library(SeuratDisk)
library(zellkonverter)
library(SingleCellExperiment)
library(tidyverse)
library(Seurat)
library(RColorBrewer)
BiocManager::install("scDblFinder")
library(scDblFinder)
library(magrittr)
library(Matrix)
library(GEOquery)
library(pheatmap)
```


```{r}
# Let's load the h5ad file first
# my_data <- readH5AD(file = "/home/youkim/adata_all_donors_all_cell_states_raw_counts_in_raw_normlog_counts_in_X_for_download_UPD_20230307.h5ad", X_name = "X")

# Quick glance on the matrix
# my_data@assays@data$X[1:5, 1:5]

# Convert the SCE object into a Seurat object
# This step keeps failing, using dactyl is recommended
# my_data <- as.Seurat(my_data, counts = my_data@assays@data$X)
setwd("~/")
aSeurat <- readRDS("Seurat_obj.rds")

# copy metadata
meta_data <- aSeurat@meta.data
# filter out placental cells
placental_cells <- row.names(meta_data[meta_data$tissue == "placenta", ])
# Subset the Seurat object
aSeurat_placenta <- subset(aSeurat, cells = placental_cells)

# Remove original Seurat object and write new RDS file
rm(aSeurat)
rm(my_data)

# check the datasets that the placental cells originate
aSeurat_placenta@meta.data %>% 
  group_by(dataset) %>% 
  count() 

# perform filteration to include only the new dataset
meta_data <- aSeurat_placenta@meta.data

# filter out the new dataset cells
newdata_cells <- row.names(meta_data[meta_data$dataset == "samples_MFI_new_sc_batch_2", ])
# Subset the Seurat object
aSeurat_placenta_new <- subset(aSeurat_placenta, cells = newdata_cells)

# Remove Seurat_placenta object and write new RDS file
rm(aSeurat_placenta)
rm(meta_data)
#saveRDS(aSeurat_placenta_new, "aSeurat_placenta_new.rds")
rm(aSeurat_placenta_new)
```

Now we have the Seurat object ready for analysis.

```{r}
# Read in the .rds file
aSeurat2 <- readRDS("aSeurat_placenta_new.rds")

# Add in the Mitochondrial PCT% information
aSeurat2$percent.mt <- PercentageFeatureSet(aSeurat2, pattern = "^MT-")

# nCount_RNA is the number of UMI counts in a cell
hist(aSeurat2$nCount_RNA)
summary(aSeurat2@meta.data$nCount_RNA)

# nFeature_RNA is the number of different genes that had any reads
hist(aSeurat2$nFeature_RNA)
summary(aSeurat2@meta.data$nFeature_RNA)

# percent.mt is the percent mitochondrial reads
summary(aSeurat2$percent.mt)
# We see that the mitochondrial genes were removed.

# Check for the percentage of ribosomal genes
aSeurat2[["percent.rb"]] <- PercentageFeatureSet(aSeurat2, pattern = "^RP[SL]")
hist(aSeurat2$percent.rb)
summary(aSeurat2$percent.rb)

dim(aSeurat2)
```

We now have a aSeurat2 object that has 33439 cells, which would be sufficient for downstream analysis.
Another important part for QC is removing doublets.

```{r}
# Find for doublets
aSeurat2 <- as.SingleCellExperiment(aSeurat2)
aSeurat2 <- scDblFinder(aSeurat2)
# Convert back to Seurat object.
aSeurat2 <- as.Seurat(aSeurat2)

# Notice that our identifiers have changed.
levels(Idents(object = aSeurat2))

# We want to specify the identifier whether the cell is a singlet or doublet.
table(aSeurat2@meta.data$scDblFinder.class)

Idents(object = aSeurat2) <- "scDblFinder.class"
aSeurat2@meta.data$scDblFinder.class <- as.character(aSeurat2@meta.data$scDblFinder.class)

# Subset the Seurat object to only include singlets.
aSeurat2 <- subset(aSeurat2, idents = "singlet")
levels(as.factor(aSeurat2@meta.data$scDblFinder.class))

# Check the dimensions of the dataset again.
dim(aSeurat2)
```

Now, note that we have 31521 cells 

```{r}
colnames(aSeurat2@meta.data)

# Make a violin plot of the QC columns
plt <- VlnPlot(aSeurat2, 
               features = c("nFeature_RNA", "nCount_RNA"),
               ncol = 2,
               pt.size = 0.1) & 
  theme(plot.title = element_text(size=10))

plt
# We can say that most of the QC was already performed.
ggsave(filename = "QC_(2023-06-14).png", plot = plt, width = 7, height = 3.5)
```

Looking at the violin plot, we do see some extreme values that should be filtered out.
However, let's put the subsetting part for later and proceed to next steps and come back later.

```{r}
# Make scatter plots to compare
plot1 <- FeatureScatter(aSeurat2, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1

# remove the cells with high mitochondrial gene percentage
aSeurat2 <- subset(aSeurat2, subset = percent.mt < 5) 
dim(aSeurat2)
```

Next, we need to normalize the data.

```{r}
# Next, we normalize the data
aSeurat2 <- NormalizeData(aSeurat2)

# Explore the most variable features, let's use the default version first.
aSeurat2 <- FindVariableFeatures(aSeurat2)
# aSeurat2 <- FindVariableFeatures(aSeurat2, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(aSeurat2), 10) 
top10

# plot variable features with and without labels
plot2 <- VariableFeaturePlot(aSeurat2)
plot2 <- LabelPoints(plot = plot2, points = top10, repel = TRUE)
plot2

# Convert normalized gene expression to Z-score
all.genes <- rownames(aSeurat2)
aSeurat2 <- ScaleData(aSeurat2, features = all.genes)

```

Now, we are ready to run PCA.
```{r}
# Run PCA analysis
aSeurat2 <- RunPCA(aSeurat2, features = VariableFeatures(object = aSeurat2))

# Plot the PCA
DimPlot(aSeurat2, reduction = "pca")

# Plot the loadings
VizDimLoadings(aSeurat2, dims = 1:2, reduction = "pca") & 
  theme(axis.text=element_text(size=8), axis.title=element_text(size = 8,face="bold"))

# Check the elbow plot to see how much variation is covered for each PC
ElbowPlot(aSeurat2)

# FeaturePlot of PC1 top features
FeaturePlot(aSeurat2, features = c("KRT7", 
                                   "KRT18"))

# FeaturePlot of PC2 top features
FeaturePlot(aSeurat2, features = c("HLA-G", 
                                   "VIM"))
```

Next, we move on to derive clusters.
```{r}
# Now perform clustering
aSeurat2 <- FindNeighbors(aSeurat2, dims = 1:10)

aSeurat2 <- FindClusters(aSeurat2, resolution = 0.5)

# For visualization purposes, we need to generate UMAP reduced dimensionality representation
aSeurat2 <- RunUMAP(aSeurat2, dims = 1:10, verbose = F)

# Plot the UMAP with clustering
DimPlot(aSeurat2, reduction = "umap")

# Get the TSNE embedding
aSeurat2 <- RunTSNE(aSeurat2, dims = 1:10, check_duplicates = FALSE)

# Plot the t-SNE with clustering
DimPlot(aSeurat2, reduction = "tsne")

# Let's look at cluster sizes
table(aSeurat2@meta.data$seurat_clusters)

# Let's plot those clusters!
DimPlot(aSeurat2,label.size = 4,repel = T,label = T)


# Check expression patterns for some genes in top10
FeaturePlot(aSeurat2, features = c("KRT7",
                                   "EGFR"))

```
Let's calculate the cell cycle scores at this point.

```{r}
aSeurat2 <- SetIdent(aSeurat2, value = "cell_type")
Idents(aSeurat2)

# Calculate the cell cycle scores
s.genes <- cc.genes.updated.2019$s.genes
g2m.genes <- cc.genes.updated.2019$g2m.genes

aSeurat2 <- CellCycleScoring(aSeurat2,
                            search = TRUE,
                            s.features = s.genes, 
                            g2m.features = g2m.genes)

table(aSeurat2[[]]$Phase)

# Check if ribosomal genes show dependency on cluster
FeaturePlot(aSeurat2,features = "percent.rb",
            label.size = 4,
            repel = T,
            label = F) & 
  theme(plot.title = element_text(size=10))

# Check using the violin plot
VlnPlot(aSeurat2,
        features = "percent.rb") & 
  theme(plot.title = element_text(size=10),
        legend.position = 'none')

# Check # of genes per cluster
VlnPlot(aSeurat2,features = "nFeature_RNA") & 
  theme(plot.title = element_text(size=10),
        legend.position = 'none')

# Cell cycle score does not seem to depend on the cell type much
FeaturePlot(aSeurat2,
            features = c("S.Score","G2M.Score"),
            label.size = 4,
            repel = T,
            label = T) & 
  theme(plot.title = element_text(size = 10))

VlnPlot(aSeurat2, features = c("S.Score","G2M.Score")) & 
  theme(plot.title = element_text(size = 10),
        legend.position = 'none')

# Plot in UMAP
DimPlot(aSeurat2, 
        group.by = "Phase")

# Plot with clusters
compTable.int <- table(aSeurat2$coarse_annot, aSeurat2$Phase)
compTable.int <- (compTable.int / rowSums(compTable.int)) * 100
WtOrRd_pal <- colorRampPalette(c("#FFFFFF", brewer.pal(9, "OrRd")))(100)
pheatmap(compTable.int, color = WtOrRd_pal, breaks = 1:100)

# FeaturePlot for showing mod score
FeaturePlot(aSeurat2, 
            features = c("S.Score", "G2M.Score"))

```

<Differential Expression and Marker Selection>
```{r}
DefaultAssay(aSeurat2) <- "RNA"

# Perform log-normalization
aSeurat2 <- NormalizeData(aSeurat2)

# Find variable features
aSeurat2 <- FindVariableFeatures(aSeurat2, 
                                 selection.method = "vst", 
                                 nfeatures = 2000)
all.genes <- rownames(aSeurat2)
aSeurat2 <- ScaleData(aSeurat2, features = all.genes)

# Designate the markers
all.markers <- FindAllMarkers(aSeurat2,
                              only.pos = T, 
                              min.pct = 0.5, 
                              logfc.threshold = 0.5)
dim(all.markers)
table(all.markers$cluster)

top3_markers <- as.data.frame(all.markers %>%
                              group_by(cluster) %>% 
                              top_n(n = 3, wt = avg_log2FC))
head(top3_markers)
dim(aSeurat2)

p1 <- DimPlot(aSeurat2, label = T , repel = T, label.size = 3) + NoLegend()
p2 <- DimPlot(aSeurat2, reduction = "pca")
p3 <- DimPlot(aSeurat2, reduction = "tsne")
p4 <- DimPlot(aSeurat2, reduction = "umap")

p1
p2
p3
p4

ggsave(filename = "UMAP(2023-06-14).pdf", plot = p1, width = 9, height = 7)
ggsave(filename = "tsne(2023-05-13).png", plot = p3, width = 7, height = 3.5)
ggsave(filename = "umap_legend(2023-05-13).png", plot = p4, width = 6, height = 3.5)

# ggsave(filename = "Check.png", plot = p1, width = 30, height = 10)
saveRDS(aSeurat2, "aSeurat_analyzed.rds")
```