---
title: "Pseudobulk (2023-05-02)"
output: html_document
---
``` {r}
# Load libraries
library(SingleCellExperiment)
library(tidyverse)
library(ggplot2)
library(Seurat)
library(RColorBrewer)
library(scDblFinder)
library(magrittr)
library(Matrix)
# BiocManager::install("DESeq2")
library(DESeq2)
library(pheatmap)
```
Let's start by loading our Seurat objects, which we have already done analysis.
```{r}
vtdataset <- readRDS("~/MetaMarkers/4-dataset-test/vtSeurat2_annotated.rds")
sdataset <- readRDS("~/MetaMarkers/4-dataset-test/sSeurat2_annotated.rds")
ldataset <- readRDS("~/MetaMarkers/4-dataset-test/lSeurat_annotated.rds")
hdataset <- readRDS("~/MetaMarkers/4-dataset-test/hSeurat_annotated.rds")
# We notice that the metadata does not explicitly display whether the cell is stromal or non-stromal. This information is contained in the Metamarkers folder.
# As we did back in Metamarker generation, let's start with the Liu dataset
# Collapse the cell types in Liu dataset
lmeta_copy <- ldataset@meta.data %>%
dplyr::select(Annotation)
dim(ldataset@meta.data)
# read the cell types data frame
l_cell_types <- read.csv("~/MetaMarkers/4-dataset-test/Stromal_Markers/Liu Cell Types (S vs NS).csv") %>%
dplyr::rename(Annotation = Annotations)
# use inner_join() on hmeta_copy and h_cell_types
df <- inner_join(lmeta_copy, l_cell_types)
# Check if the order is maintained
summary(df$Annotation == lmeta_copy$Annotation)
# add to meta.data
ldataset$merged_annotation <- df$Merged
# Verify the metadata
ldataset@meta.data$merged_annotation
```
We repeat the process for the other three datasets.
```{r}
# Collapse the cell types in VT dataset
vtmeta_copy <- vtdataset@meta.data %>%
dplyr::select(Annotation)
# Do the same thing for the vtSeurat
vt_cell_types <- read.csv("~/MetaMarkers/4-dataset-test/Stromal_Markers/VT Cell Types (S vs NS)_(noFB).csv") %>%
dplyr::rename(Annotation = Annotations)
# use inner_join() on vtmeta_copy and vt_cell_types
df_vt <- inner_join(vtmeta_copy, vt_cell_types)
# Check if the order is maintained
summary(df_vt$Annotation == vtmeta_copy$Annotation)
# add to meta.data
vtdataset$merged_annotation <- df_vt$Merged
```
```{r}
# Collapse the cell types in Suryawanshi dataset
smeta_copy <- sdataset@meta.data %>%
dplyr::select(CellType) %>%
mutate(Annotation = CellType)
sdataset@meta.data$CellType %>%
table()
# read the cell types data frame
s_cell_types <- read.csv("~/MetaMarkers/4-dataset-test/Stromal_Markers/Suryawanshi Cell Types (S vs NS)_(noFB).csv") %>%
dplyr::rename(Annotation = Annotations)
# use inner_join() on smeta_copy and s_cell_types
df_s <- inner_join(smeta_copy, s_cell_types)
# Check if the order is maintained
summary(df_s$Annotation == smeta_copy$Annotation)
# add to meta.data
sdataset$merged_annotation <- df_s$Merged
sdataset@meta.data
```
Let's repeat the process one last time for the Han dataset
```{r}
# Collapse the cell types in Han dataset
hmeta_copy <- hdataset@meta.data %>%
dplyr::select(Annotation)
dim(hdataset@meta.data)
# read the cell types data frame
h_cell_types <- read.csv("~/MetaMarkers/4-dataset-test/Stromal_Markers/Han Cell Types (S vs NS).csv") %>%
dplyr::rename(Annotation = Annotations)
# use inner_join() on hmeta_copy and h_cell_types
df <- inner_join(hmeta_copy, h_cell_types)
# Check if the order is maintained
summary(df$Annotation == hmeta_copy$Annotation)
# add to meta.data
hdataset$merged_annotation <- df$Merged
# Change the Idents of the Seurat Objects
Idents(object = ldataset) <- "merged_annotation"
Idents(object = vtdataset) <- "merged_annotation"
Idents(object = sdataset) <- "merged_annotation"
Idents(object = hdataset) <- "merged_annotation"
```
To perform pseudo-bulk analysis, we need to check if we have the necessary metrics for aggregation across cells in a sample.
### Counts matrix - Stromal-Nonstromal level
```{r}
counts1 <- AggregateExpression(vtdataset,
group.by = "merged_annotation",
assays = "RNA",
slot = "counts",
return.seurat = FALSE)
vtdataset@assays$RNA
counts2 <- AggregateExpression(ldataset,
group.by = "merged_annotation",
assays = "RNA",
slot = "counts",
return.seurat = FALSE)
counts3 <- AggregateExpression(sdataset,
group.by = "merged_annotation",
assays = "RNA",
slot = "counts",
return.seurat = FALSE)
counts4 <- AggregateExpression(hdataset,
group.by = "merged_annotation",
assays = "RNA",
slot = "counts",
return.seurat = FALSE)
df1 <- data.frame(counts1$RNA) %>%
mutate(genes = rownames(data.frame(counts1$RNA))) %>%
mutate(Stromal.vt = Stromal) %>%
mutate(NonStromal.vt = Non.Stromal) %>%
dplyr::select(genes, Stromal.vt, NonStromal.vt)
df2 <- data.frame(counts2$RNA) %>%
mutate(genes = rownames(data.frame(counts2$RNA)))%>%
mutate(Stromal.l = Stromal) %>%
mutate(NonStromal.l = Non.Stromal) %>%
dplyr::select(genes, Stromal.l, NonStromal.l)
df3 <- data.frame(counts3$RNA) %>%
mutate(genes = rownames(data.frame(counts3$RNA)))%>%
mutate(Stromal.s = Stromal) %>%
mutate(NonStromal.s = Non.Stromal) %>%
dplyr::select(genes, Stromal.s, NonStromal.s)
df4 <- data.frame(counts4$RNA) %>%
mutate(genes = rownames(data.frame(counts4$RNA)))%>%
mutate(Stromal.h = Stromal) %>%
mutate(NonStromal.h = Non.Stromal) %>%
dplyr::select(genes, Stromal.h, NonStromal.h)
all_dfs <- Reduce(function(...){merge(..., all = FALSE)},
list(df1, df2, df3, df4))
```
Before we run the DESeq2 pipeline, we need to check whether our samples cluster together.
Let's use the prcomp() function as we used before in our bulk RNA-seq data analysis.
```{r}
x <- DGEList(counts = all_dfs2,
samples = colnames(all_dfs2),
genes = rownames(all_dfs2))
x_normalized <- calcNormFactors(x, method = "TMM")
summary(x_normalized$counts == all_dfs2)
cpm(x_normalized, log = T, normalized.lib.sizes = T) %>%
colSums()
# Check using this plot whether normalization went well.
edgeR::cpm(x_normalized, log = T, normalized.lib.sizes = T) %>%
data.frame() %>%
pivot_longer(cols = everything(),
names_to = "samples",
values_to = "exprs") %>%
ggplot(aes(x = samples, y = exprs, fill = samples)) +
geom_boxplot() +
labs(title = "Normalized") +
# modify the legend size
theme(legend.position = "bottom",
legend.key.size = unit(0.5, 'cm'),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
colSums(x_normalized$counts)
x_normalized$samples
pca1 <- edgeR::cpm(x_normalized, log = T, normalized.lib.sizes = T) %>%
t() %>%
prcomp()
colData %<>% mutate(samples = rownames(colData))
colData %<>% mutate(study = gsub(".*\\.","", samples))
# Using the pca1 object, plot the PCA data categorized by Stromal vs Non-stromal
# Save the plot as an object
p1 <- ggplot2::autoplot(pca1, data = x_normalized$samples) +
labs(title = "PCA Plot for Pseudo-bulk scRNA-seq data") +
geom_point(aes(colour = colData$study)) +
labs(colour = "Category") +
theme(legend.position="bottom")
```
Then, we will run the DESeq2 pipeline, with the aim to generate a heatmap
```{r}
# First start with making the rownames of the summed matrix
all_dfs2 <- column_to_rownames(all_dfs, var = "genes")
colData <- data_frame(sample = colnames(all_dfs2),
annotation = gsub('\\..*','', colnames(all_dfs2))) %>%
column_to_rownames(var = "sample")
all(rownames(colData) == colnames(all_dfs2))
test <- DESeqDataSetFromMatrix(countData = round(all_dfs2),
colData = colData,
design = ~ annotation)
plotPCA(test)
# set the factor level
relevel(test$annotation, ref = "NonStromal")
# Run DESeq
test <- DESeq(test)
# NOTE: Collapse the technical replicates as necessary
plotPCA(DESeqTransform(test))
# Extract the results
res <- results(test)
res0.01 <- results(test, alpha = 0.01)
summary(res0.01)
# contrasts
resultsNames(test)
# Visualization
plotMA(res0.01)
```