---
title: "Analysis (2023-01-19)"
output: html_document
---

```{r}
# Load libraries
library(SingleCellExperiment)
library(tidyverse)
library(Seurat)
library(RColorBrewer)
library(scDblFinder)
library(magrittr)
library(GEOquery)
# BiocManager::install("GEOquery")
devtools::install_github('satijalab/seurat-data')
library(SeuratData)
```

1. Creating the Seurat Object
```{r eval = FALSE}
zcounts <- ReadMtx(
  mtx = "GSM5293860_D1.matrix.mtx.gz", 
  features = "GSM5293860_D1.features.tsv.gz",
  cells = "GSM5293860_D1.barcodes.tsv.gz"
)

# check for duplicated gene names before setting row names
gene_dups <- rownames(zcounts) %>% 
  duplicated()

# check the dimensions of the count matrix
dim(zcounts)

# filter out the duplicated genes
zcounts <- zcounts[!gene_dups,]
dim(zcounts)

head(zcounts)

# Create the Seurat object
zSeurat <- CreateSeuratObject(counts = zcounts)

# Let's save the Seurat object as an .rds file to facilitate analysis.
saveRDS(zSeurat, "zSeurat.rds")
```

```{r}
# Examine the Seurat object metadata
zSeurat@meta.data
dim(zSeurat)

rownames(zSeurat)

```

```{r}
setwd("~/Meta-analysis/Zheng et al. (2022)/D8")
zcounts2 <- ReadMtx(
  mtx = "GSM5293861_D8.matrix.mtx.gz", 
  features = "GSM5293861_D8.features.tsv.gz",
  cells = "GSM5293861_D8.barcodes.tsv.gz"
)

# check for duplicated gene names before setting row names
gene_dups <- rownames(zcounts2) %>% 
  duplicated()

# check the dimensions of the count matrix
dim(zcounts2)

# filter out the duplicated genes
zcounts2 <- zcounts2[!gene_dups,]
dim(zcounts2)

head(zcounts2)

# Create the Seurat object
zSeurat2 <- CreateSeuratObject(counts = zcounts2)

# Let's save the Seurat object as an .rds file to facilitate analysis.
# saveRDS(zSeurat, "zSeurat.rds")
```

```{r}
setwd("~/Meta-analysis/Zheng et al. (2022)/D9")
zcounts3 <- ReadMtx(
  mtx = "GSM5293862_D9.matrix.mtx.gz", 
  features = "GSM5293862_D9.features.tsv.gz",
  cells = "GSM5293862_D9.barcodes.tsv.gz"
)

# check for duplicated gene names before setting row names
gene_dups <- rownames(zcounts3) %>% 
  duplicated()

# check the dimensions of the count matrix
dim(zcounts3)

# filter out the duplicated genes
zcounts3 <- zcounts3[!gene_dups,]
dim(zcounts3)

head(zcounts3)

# Create the Seurat object
zSeurat3 <- CreateSeuratObject(counts = zcounts3)

# Let's save the Seurat object as an .rds file to facilitate analysis.
# saveRDS(zSeurat, "zSeurat.rds")
```

```{r}
setwd("~/Meta-analysis/Zheng et al. (2022)/D10")
zcounts4 <- ReadMtx(
  mtx = "GSM5293863_D10.matrix.mtx.gz", 
  features = "GSM5293863_D10.features.tsv.gz",
  cells = "GSM5293863_D10.barcodes.tsv.gz"
)

# check for duplicated gene names before setting row names
gene_dups <- rownames(zcounts4) %>% 
  duplicated()

# check the dimensions of the count matrix
dim(zcounts4)

# filter out the duplicated genes
zcounts4 <- zcounts4[!gene_dups,]
dim(zcounts4)

head(zcounts4)

# Create the Seurat object
zSeurat4 <- CreateSeuratObject(counts = zcounts4)

# Let's save the Seurat object as an .rds file to facilitate analysis.
# saveRDS(zSeurat, "zSeurat.rds")
```

```{r}
setwd("~/Meta-analysis/Zheng et al. (2022)/D13")
zcounts5 <- ReadMtx(
  mtx = "GSM5293864_D13.matrix.mtx.gz", 
  features = "GSM5293864_D13.features.tsv.gz",
  cells = "GSM5293864_D13.barcodes.tsv.gz"
)

# check for duplicated gene names before setting row names
gene_dups <- rownames(zcounts5) %>% 
  duplicated()

# check the dimensions of the count matrix
dim(zcounts5)

# filter out the duplicated genes
zcounts5 <- zcounts5[!gene_dups,]
dim(zcounts5)

head(zcounts5)

# Create the Seurat object
zSeurat5 <- CreateSeuratObject(counts = zcounts5)

# Let's save the Seurat object as an .rds file to facilitate analysis.
# saveRDS(zSeurat, "zSeurat.rds")
```

```{r}
dim(zSeurat)
dim(zSeurat2)
dim(zSeurat3)
dim(zSeurat4)
dim(zSeurat5)
```

```{r}
# Let's try merging the five Seurat objects
merged_Seurat <- merge(zSeurat, 
                       y = c(zSeurat2,
                             zSeurat3,
                             zSeurat4,
                             zSeurat5), 
                       add.cell.ids = c("1", "2", "3", "4", "5"),
                       project = "Zheng et al. 2022")

dim(merged_Seurat)
dim(zSeurat)
dim(zSeurat2)
dim(zSeurat3)
dim(zSeurat4)
dim(zSeurat5)
```

The next line of code will be necessary for adding the cell type annotations to our data

```{r}
# Email the authors for this data
```

Next, we proceed the standard pipeline of scRNA-seq analysis.

```{r}
# Add in the Mitochondrial PCT% information
merged_Seurat$percent.mt <- PercentageFeatureSet(merged_Seurat, pattern = "^MT-")

# nCount_RNA is the number of UMI counts in a cell
hist(merged_Seurat$nCount_RNA)
summary(merged_Seurat@meta.data$nCount_RNA)

# nFeature_RNA is the number of different genes that had any reads
hist(merged_Seurat$nFeature_RNA)
summary(merged_Seurat@meta.data$nFeature_RNA)

# percent.mt is the percent mitochondrial reads
summary(merged_Seurat$percent.mt)

# Later, we will subset the Seurat object based on a percent.mt cutoff.

# Check for the percentage of ribosomal genes
merged_Seurat[["percent.rb"]] <- PercentageFeatureSet(merged_Seurat, pattern = "^RP[SL]")
summary(merged_Seurat$percent.rb)
```

Another important part for QC is removing doublets.
```{r removing_doublets}
# Find for doublets
merged_Seurat <- as.SingleCellExperiment(merged_Seurat)
merged_Seurat <- scDblFinder(merged_Seurat)
# Convert back to Seurat object.
merged_Seurat <- as.Seurat(merged_Seurat)

# Notice that our identifiers have changed.
levels(Idents(object = merged_Seurat))

# We want to specify the identifier whether the cell is a singlet or doublet.
table(merged_Seurat@meta.data$scDblFinder.class)

Idents(object = merged_Seurat) <- "scDblFinder.class"
merged_Seurat@meta.data$scDblFinder.class <- as.character(merged_Seurat@meta.data$scDblFinder.class)

# Subset the Seurat object to only include singlets.
merged_Seurat <- subset(merged_Seurat, idents = "singlet")
levels(as.factor(merged_Seurat@meta.data$scDblFinder.class))
```

```{r}
# We draw a violin plot to explore the values in meta data
VlnPlot(merged_Seurat, 
        features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rb"),
        ncol = 4,
        pt.size = 0.1) & 
  theme(plot.title = element_text(size=10))
```
```{r}
# The following four lines of code are for scatterplots addressing the correlations of every 2 pairs
FeatureScatter(merged_Seurat, feature1 = "nCount_RNA", feature2 = "percent.mt")

FeatureScatter(merged_Seurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

FeatureScatter(merged_Seurat, feature1 = "nCount_RNA", feature2 = "percent.rb")

FeatureScatter(merged_Seurat, feature1 = "percent.rb", feature2 = "percent.mt")

# We also check if there is correlation between number of features and the doublet score
FeatureScatter(merged_Seurat, feature1 = "nFeature_RNA", feature2 = "scDblFinder.score")

# Since the typical cutoff for percent.mt is 5%, we will try out < 5. 
# Last week we used 10%, and couldn't see much difference.
# Also, let's filter out cells that have less than 5% ribosomal content.
merged_Seurat <- subset(merged_Seurat, subset = percent.mt < 5 & percent.rb > 5)

dim(merged_Seurat)
```


```{r}
# Next, we normalize the data
merged_Seurat <- NormalizeData(merged_Seurat)

# Explore the most variable features, let's use the default version first.
merged_Seurat <- FindVariableFeatures(merged_Seurat)
# merged_Seurat <- FindVariableFeatures(merged_Seurat, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(merged_Seurat), 10) 
top10

# plot variable features with and without labels
plot2 <- VariableFeaturePlot(merged_Seurat)
plot2 <- LabelPoints(plot = plot2, points = top10, repel = TRUE)
plot2

# Convert normalized gene expression to Z-score
all.genes <- rownames(merged_Seurat)
merged_Seurat <- ScaleData(merged_Seurat, features = all.genes)
```

```{r}
# Run PCA analysis
hSeurat <- RunPCA(hSeurat, features = VariableFeatures(object = hSeurat))

# Plot the PCA
DimPlot(hSeurat, reduction = "pca")

# Plot the loadings
VizDimLoadings(hSeurat, dims = 1:2, reduction = "pca") & 
  theme(axis.text=element_text(size=8), axis.title=element_text(size = 8,face="bold"))

# Check the elbow plot to see how much variation is covered for each PC
ElbowPlot(hSeurat)
```