---
title: "Analysis (2023-01-19)"
output: html_document
---
```{r}
# Load libraries
library(SingleCellExperiment)
library(tidyverse)
library(Seurat)
library(RColorBrewer)
library(scDblFinder)
library(magrittr)
library(GEOquery)
# BiocManager::install("GEOquery")
devtools::install_github('satijalab/seurat-data')
library(SeuratData)
```
1. Creating the Seurat Object
```{r eval = FALSE}
zcounts <- ReadMtx(
mtx = "GSM5293860_D1.matrix.mtx.gz",
features = "GSM5293860_D1.features.tsv.gz",
cells = "GSM5293860_D1.barcodes.tsv.gz"
)
# check for duplicated gene names before setting row names
gene_dups <- rownames(zcounts) %>%
duplicated()
# check the dimensions of the count matrix
dim(zcounts)
# filter out the duplicated genes
zcounts <- zcounts[!gene_dups,]
dim(zcounts)
head(zcounts)
# Create the Seurat object
zSeurat <- CreateSeuratObject(counts = zcounts)
# Let's save the Seurat object as an .rds file to facilitate analysis.
saveRDS(zSeurat, "zSeurat.rds")
```
```{r}
# Examine the Seurat object metadata
zSeurat@meta.data
dim(zSeurat)
rownames(zSeurat)
```
```{r}
setwd("~/Meta-analysis/Zheng et al. (2022)/D8")
zcounts2 <- ReadMtx(
mtx = "GSM5293861_D8.matrix.mtx.gz",
features = "GSM5293861_D8.features.tsv.gz",
cells = "GSM5293861_D8.barcodes.tsv.gz"
)
# check for duplicated gene names before setting row names
gene_dups <- rownames(zcounts2) %>%
duplicated()
# check the dimensions of the count matrix
dim(zcounts2)
# filter out the duplicated genes
zcounts2 <- zcounts2[!gene_dups,]
dim(zcounts2)
head(zcounts2)
# Create the Seurat object
zSeurat2 <- CreateSeuratObject(counts = zcounts2)
# Let's save the Seurat object as an .rds file to facilitate analysis.
# saveRDS(zSeurat, "zSeurat.rds")
```
```{r}
setwd("~/Meta-analysis/Zheng et al. (2022)/D9")
zcounts3 <- ReadMtx(
mtx = "GSM5293862_D9.matrix.mtx.gz",
features = "GSM5293862_D9.features.tsv.gz",
cells = "GSM5293862_D9.barcodes.tsv.gz"
)
# check for duplicated gene names before setting row names
gene_dups <- rownames(zcounts3) %>%
duplicated()
# check the dimensions of the count matrix
dim(zcounts3)
# filter out the duplicated genes
zcounts3 <- zcounts3[!gene_dups,]
dim(zcounts3)
head(zcounts3)
# Create the Seurat object
zSeurat3 <- CreateSeuratObject(counts = zcounts3)
# Let's save the Seurat object as an .rds file to facilitate analysis.
# saveRDS(zSeurat, "zSeurat.rds")
```
```{r}
setwd("~/Meta-analysis/Zheng et al. (2022)/D10")
zcounts4 <- ReadMtx(
mtx = "GSM5293863_D10.matrix.mtx.gz",
features = "GSM5293863_D10.features.tsv.gz",
cells = "GSM5293863_D10.barcodes.tsv.gz"
)
# check for duplicated gene names before setting row names
gene_dups <- rownames(zcounts4) %>%
duplicated()
# check the dimensions of the count matrix
dim(zcounts4)
# filter out the duplicated genes
zcounts4 <- zcounts4[!gene_dups,]
dim(zcounts4)
head(zcounts4)
# Create the Seurat object
zSeurat4 <- CreateSeuratObject(counts = zcounts4)
# Let's save the Seurat object as an .rds file to facilitate analysis.
# saveRDS(zSeurat, "zSeurat.rds")
```
```{r}
setwd("~/Meta-analysis/Zheng et al. (2022)/D13")
zcounts5 <- ReadMtx(
mtx = "GSM5293864_D13.matrix.mtx.gz",
features = "GSM5293864_D13.features.tsv.gz",
cells = "GSM5293864_D13.barcodes.tsv.gz"
)
# check for duplicated gene names before setting row names
gene_dups <- rownames(zcounts5) %>%
duplicated()
# check the dimensions of the count matrix
dim(zcounts5)
# filter out the duplicated genes
zcounts5 <- zcounts5[!gene_dups,]
dim(zcounts5)
head(zcounts5)
# Create the Seurat object
zSeurat5 <- CreateSeuratObject(counts = zcounts5)
# Let's save the Seurat object as an .rds file to facilitate analysis.
# saveRDS(zSeurat, "zSeurat.rds")
```
```{r}
dim(zSeurat)
dim(zSeurat2)
dim(zSeurat3)
dim(zSeurat4)
dim(zSeurat5)
```
```{r}
# Let's try merging the five Seurat objects
merged_Seurat <- merge(zSeurat,
y = c(zSeurat2,
zSeurat3,
zSeurat4,
zSeurat5),
add.cell.ids = c("1", "2", "3", "4", "5"),
project = "Zheng et al. 2022")
dim(merged_Seurat)
dim(zSeurat)
dim(zSeurat2)
dim(zSeurat3)
dim(zSeurat4)
dim(zSeurat5)
```
The next line of code will be necessary for adding the cell type annotations to our data
```{r}
# Email the authors for this data
```
Next, we proceed the standard pipeline of scRNA-seq analysis.
```{r}
# Add in the Mitochondrial PCT% information
merged_Seurat$percent.mt <- PercentageFeatureSet(merged_Seurat, pattern = "^MT-")
# nCount_RNA is the number of UMI counts in a cell
hist(merged_Seurat$nCount_RNA)
summary(merged_Seurat@meta.data$nCount_RNA)
# nFeature_RNA is the number of different genes that had any reads
hist(merged_Seurat$nFeature_RNA)
summary(merged_Seurat@meta.data$nFeature_RNA)
# percent.mt is the percent mitochondrial reads
summary(merged_Seurat$percent.mt)
# Later, we will subset the Seurat object based on a percent.mt cutoff.
# Check for the percentage of ribosomal genes
merged_Seurat[["percent.rb"]] <- PercentageFeatureSet(merged_Seurat, pattern = "^RP[SL]")
summary(merged_Seurat$percent.rb)
```
Another important part for QC is removing doublets.
```{r removing_doublets}
# Find for doublets
merged_Seurat <- as.SingleCellExperiment(merged_Seurat)
merged_Seurat <- scDblFinder(merged_Seurat)
# Convert back to Seurat object.
merged_Seurat <- as.Seurat(merged_Seurat)
# Notice that our identifiers have changed.
levels(Idents(object = merged_Seurat))
# We want to specify the identifier whether the cell is a singlet or doublet.
table(merged_Seurat@meta.data$scDblFinder.class)
Idents(object = merged_Seurat) <- "scDblFinder.class"
merged_Seurat@meta.data$scDblFinder.class <- as.character(merged_Seurat@meta.data$scDblFinder.class)
# Subset the Seurat object to only include singlets.
merged_Seurat <- subset(merged_Seurat, idents = "singlet")
levels(as.factor(merged_Seurat@meta.data$scDblFinder.class))
```
```{r}
# We draw a violin plot to explore the values in meta data
VlnPlot(merged_Seurat,
features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rb"),
ncol = 4,
pt.size = 0.1) &
theme(plot.title = element_text(size=10))
```
```{r}
# The following four lines of code are for scatterplots addressing the correlations of every 2 pairs
FeatureScatter(merged_Seurat, feature1 = "nCount_RNA", feature2 = "percent.mt")
FeatureScatter(merged_Seurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
FeatureScatter(merged_Seurat, feature1 = "nCount_RNA", feature2 = "percent.rb")
FeatureScatter(merged_Seurat, feature1 = "percent.rb", feature2 = "percent.mt")
# We also check if there is correlation between number of features and the doublet score
FeatureScatter(merged_Seurat, feature1 = "nFeature_RNA", feature2 = "scDblFinder.score")
# Since the typical cutoff for percent.mt is 5%, we will try out < 5.
# Last week we used 10%, and couldn't see much difference.
# Also, let's filter out cells that have less than 5% ribosomal content.
merged_Seurat <- subset(merged_Seurat, subset = percent.mt < 5 & percent.rb > 5)
dim(merged_Seurat)
```
```{r}
# Next, we normalize the data
merged_Seurat <- NormalizeData(merged_Seurat)
# Explore the most variable features, let's use the default version first.
merged_Seurat <- FindVariableFeatures(merged_Seurat)
# merged_Seurat <- FindVariableFeatures(merged_Seurat, selection.method = "vst", nfeatures = 2000)
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(merged_Seurat), 10)
top10
# plot variable features with and without labels
plot2 <- VariableFeaturePlot(merged_Seurat)
plot2 <- LabelPoints(plot = plot2, points = top10, repel = TRUE)
plot2
# Convert normalized gene expression to Z-score
all.genes <- rownames(merged_Seurat)
merged_Seurat <- ScaleData(merged_Seurat, features = all.genes)
```
```{r}
# Run PCA analysis
hSeurat <- RunPCA(hSeurat, features = VariableFeatures(object = hSeurat))
# Plot the PCA
DimPlot(hSeurat, reduction = "pca")
# Plot the loadings
VizDimLoadings(hSeurat, dims = 1:2, reduction = "pca") &
theme(axis.text=element_text(size=8), axis.title=element_text(size = 8,face="bold"))
# Check the elbow plot to see how much variation is covered for each PC
ElbowPlot(hSeurat)
```