---
title: "Pseudobulk (2023-05-02)"
output: html_document
---
``` {r}
# Load libraries
library(SingleCellExperiment)
library(tidyverse)
library(ggplot2)
library(Seurat)
library(RColorBrewer)
library(scDblFinder)
library(magrittr)
library(edgeR)
library(dplyr)
library(cowplot)
library(Matrix)
library(ggfortify)
# BiocManager::install("DESeq2")
library(DESeq2)
library(pheatmap)
```
Let's start by loading our Seurat objects, which we have already done analysis.
```{r}
vtdataset <- readRDS("~/MetaMarkers/4-dataset-test/vtSeurat2_annotated.rds")
sdataset <- readRDS("~/MetaMarkers/4-dataset-test/sSeurat_20230513.rds")
ldataset <- readRDS("~/MetaMarkers/4-dataset-test/lSeurat_annotated.rds")
hdataset <- readRDS("~/MetaMarkers/4-dataset-test/hSeurat_annotated.rds")
# We notice that the metadata does not explicitly display whether the cell is stromal or non-stromal. This information is contained in the Metamarkers folder.
# As we did back in Metamarker generation, let's start with the Liu dataset
# Collapse the cell types in Liu dataset
lmeta_copy <- ldataset@meta.data %>%
dplyr::select(Annotation)
dim(ldataset@meta.data)
# read the cell types data frame
l_cell_types <- read.csv("~/MetaMarkers/4-dataset-test/Liu Cell Types (T vs NT).csv") %>%
dplyr::rename(Annotation = Annotations)
# use inner_join() on hmeta_copy and h_cell_types
df <- inner_join(lmeta_copy, l_cell_types)
# Check if the order is maintained
summary(df$Annotation == lmeta_copy$Annotation)
# add to meta.data
ldataset$merged_annotation <- df$Merged
# Verify the metadata
head(ldataset@meta.data$merged_annotation)
```
We repeat the process for the other three datasets.
```{r}
# Collapse the cell types in VT dataset
vtmeta_copy <- vtdataset@meta.data %>%
dplyr::select(Annotation)
# Do the same thing for the vtSeurat
vt_cell_types <- read.csv("~/MetaMarkers/4-dataset-test/VT Cell Types (T vs NT).csv") %>%
dplyr::rename(Annotation = Annotations)
# use inner_join() on vtmeta_copy and vt_cell_types
df_vt <- inner_join(vtmeta_copy, vt_cell_types)
# Check if the order is maintained
summary(df_vt$Annotation == vtmeta_copy$Annotation)
# add to meta.data
head(vtdataset$merged_annotation <- df_vt$Merged)
```
```{r}
# Collapse the cell types in Suryawanshi dataset
smeta_copy <- sdataset@meta.data %>%
dplyr::select(CellType) %>%
mutate(Annotation = CellType)
sdataset@meta.data$CellType %>%
table()
# read the cell types data frame
s_cell_types <- read.csv("~/MetaMarkers/4-dataset-test/Suryawanshi Cell Types (T vs NT).csv") %>%
dplyr::rename(Annotation = Annotations)
# use inner_join() on smeta_copy and s_cell_types
df_s <- inner_join(smeta_copy, s_cell_types)
# Check if the order is maintained
summary(df_s$Annotation == smeta_copy$Annotation)
# add to meta.data
sdataset$merged_annotation <- df_s$Merged
head(sdataset@meta.data)
```
Let's repeat the process one last time for the Han dataset
```{r}
# Collapse the cell types in Han dataset
hmeta_copy <- hdataset@meta.data %>%
dplyr::select(Annotation)
dim(hdataset@meta.data)
# read the cell types data frame
h_cell_types <- read.csv("~/MetaMarkers/4-dataset-test/Han Cell Types (T vs NT).csv") %>%
dplyr::rename(Annotation = Annotations)
# use inner_join() on hmeta_copy and h_cell_types
df <- inner_join(hmeta_copy, h_cell_types)
# Check if the order is maintained
summary(df$Annotation == hmeta_copy$Annotation)
# add to meta.data
hdataset$merged_annotation <- df$Merged
# Change the Idents of the Seurat Objects
Idents(object = ldataset) <- "merged_annotation"
Idents(object = vtdataset) <- "merged_annotation"
Idents(object = sdataset) <- "merged_annotation"
Idents(object = hdataset) <- "merged_annotation"
```
To perform pseudo-bulk analysis, we need to check if we have the necessary metrics for aggregation across cells in a sample.
### Counts matrix - Stromal-Nonstromal level
```{r}
counts1 <- AggregateExpression(vtdataset,
group.by = "merged_annotation",
assays = "RNA",
slot = "counts",
return.seurat = FALSE)
counts2 <- AggregateExpression(ldataset,
group.by = "merged_annotation",
assays = "RNA",
slot = "counts",
return.seurat = FALSE)
counts3 <- AggregateExpression(sdataset,
group.by = "merged_annotation",
assays = "RNA",
slot = "counts",
return.seurat = FALSE)
counts4 <- AggregateExpression(hdataset,
group.by = "merged_annotation",
assays = "RNA",
slot = "counts",
return.seurat = FALSE)
df1 <- data.frame(counts1$RNA) %>%
mutate(genes = rownames(data.frame(counts1$RNA))) %>%
mutate(Troph.vt = Trophoblast) %>%
mutate(NonTroph.vt = Non.Trophoblast) %>%
dplyr::select(genes, Troph.vt, NonTroph.vt)
df2 <- data.frame(counts2$RNA) %>%
mutate(genes = rownames(data.frame(counts2$RNA)))%>%
mutate(Troph.l = Trophoblast) %>%
mutate(NonTroph.l = Non.Trophoblast) %>%
dplyr::select(genes, Troph.l, NonTroph.l)
df3 <- data.frame(counts3$RNA) %>%
mutate(genes = rownames(data.frame(counts3$RNA)))%>%
mutate(Troph.s = Trophoblast) %>%
mutate(NonTroph.s = Non.Trophoblast) %>%
dplyr::select(genes, Troph.s, NonTroph.s)
df4 <- data.frame(counts4$RNA) %>%
mutate(genes = rownames(data.frame(counts4$RNA)))%>%
mutate(Troph.h = Trophoblast) %>%
mutate(NonTroph.h = Non.Trophoblast) %>%
dplyr::select(genes, Troph.h, NonTroph.h)
all_dfs <- Reduce(function(...){merge(..., all = FALSE)},
list(df1, df2, df3, df4))
all_dfs2 <- column_to_rownames(all_dfs, var = "genes")
```
Before we run the DESeq2 pipeline, we need to check whether our samples cluster together.
Let's use the prcomp() function as we used before in our bulk RNA-seq data analysis.
```{r}
x <- DGEList(counts = all_dfs2,
samples = colnames(all_dfs2),
genes = rownames(all_dfs2))
x_normalized <- calcNormFactors(x, method = "TMM")
# Notice that the counts matrix for x_normalizedna and all_dfs2 is identical.
summary(x_normalized$counts == all_dfs2)
# The following now reflects the adjusted differences in total reads.
edgeR::cpm(x_normalized, log = T, normalized.lib.sizes = T) %>%
colSums()
colData <- data_frame(sample = colnames(all_dfs2),
annotation = gsub('\\..*','', colnames(all_dfs2))) %>%
column_to_rownames(var = "sample")
colData %<>% mutate(samples = rownames(colData))
colData %<>% mutate(study = gsub(".*\\.","", samples))
# Check using this plot whether normalization went well.
edgeR::cpm(x_normalized, log = T, normalized.lib.sizes = T) %>%
data.frame() %>%
pivot_longer(cols = everything(),
names_to = "samples",
values_to = "exprs") %>%
ggplot(aes(x = samples, y = exprs, fill = samples)) +
geom_boxplot() +
labs(title = "Normalized") +
# modify the legend size
theme(legend.position = "bottom",
legend.key.size = unit(0.5, 'cm'),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# create the pca object based on the normalized values.
pca1 <- edgeR::cpm(x_normalized, log = T, normalized.lib.sizes = T) %>%
t() %>%
prcomp()
# Using the pca1 object, plot the PCA data categorized by Stromal vs Non-stromal
# Save the plot as an object
p1 <- ggplot2::autoplot(pca1, data = x_normalized$samples) +
labs(title = "PCA Plot for Pseudo-bulk scRNA-seq data") +
geom_point(aes(colour = colData$annotation)) +
labs(colour = "Category") +
theme(legend.position="bottom")
p2 <- ggplot2::autoplot(pca1, data = x_normalized$samples) +
labs(title = "PCA Plot for Pseudo-bulk scRNA-seq data by study") +
geom_point(aes(colour = colData$study)) +
labs(colour = "Category") +
theme(legend.position="bottom")
png(file= "PCA_pseudobulk_TvsNT_20230513.png")
p1
dev.off()
png(file="PCA_pseudobulk_by_study(TvsNT)_20230513.png")
p2
dev.off()
```
Let's also generate the PCA plots where we focus only on the 1000 most variable genes.
```{r}
# Retrieve the counts matrix to plot
norm <- edgeR::cpm(x_normalized, log = T, normalized.lib.sizes = T)
rownames(norm)
# Apply var() function to every row
row_variances <- apply(norm, 1, var)
# Print row variances and change name of columns
row_variances %<>% as.data.frame()
colnames(row_variances) <- c("variance")
row_variances %<>% mutate(genes = rownames(norm))
row_variances
# sort the variances
sorted_rv <- row_variances[order(-row_variances$variance),]
top1000 <- rownames(sorted_rv[1:1000,])
top100 <- rownames(sorted_rv[1:100,])
# Boxplot visualizing the 8 samples' expression levels
norm[top1000,] %>%
data.frame() %>%
pivot_longer(cols = everything(),
names_to = "samples",
values_to = "exprs") %>%
ggplot(aes(x = samples, y = exprs, fill = samples)) +
geom_boxplot() +
labs(title = "Normalized") +
# modify the legend size
theme(legend.position = "bottom",
legend.key.size = unit(0.5, 'cm'),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Generate the pca object
pca2 <- norm[top1000,] %>%
t() %>%
prcomp()
p3 <- ggplot2::autoplot(pca2, data = x_normalized$samples) +
labs(title = "PCA Plot for Pseudo-bulk scRNA-seq data") +
geom_point(aes(colour = colData$annotation)) +
labs(colour = "Category") +
theme(legend.position="bottom")
p4 <- ggplot2::autoplot(pca2, data = x_normalized$samples) +
labs(title = "PCA Plot for Pseudo-bulk scRNA-seq data by study") +
geom_point(aes(colour = colData$study)) +
labs(colour = "Category") +
theme(legend.position="bottom")
png(file="PCA_pseudobulk_TvsNT1000var.png")
p3
dev.off()
png(file="PCA_pseudobulk_TvsNT1000var_by_study.png")
p4
dev.off()
pca3 <- norm[top100,] %>%
t() %>%
prcomp()
p5 <- ggplot2::autoplot(pca3, data = x_normalized$samples) +
labs(title = "PCA Plot for Pseudo-bulk scRNA-seq data") +
geom_point(aes(colour = colData$annotation)) +
labs(colour = "Category") +
theme(legend.position="bottom")
p6 <- ggplot2::autoplot(pca3, data = x_normalized$samples) +
labs(title = "PCA Plot for Pseudo-bulk scRNA-seq data by study") +
geom_point(aes(colour = colData$study)) +
labs(colour = "Category") +
theme(legend.position="bottom")
png(file="PCA_pseudobulk_TvsNT100var.png")
p3
dev.off()
png(file="PCA_pseudobulk_TvsNT100var_by_study.png")
p4
dev.off()
```
```{r}
hm1 <- cor(norm[top1000,]) %>%
pheatmap(display_numbers = T)
hm2 <- cor(norm[top100,]) %>%
pheatmap(display_numbers = T)
hm3 <- cor(norm) %>%
pheatmap(display_numbers = T)
png(file="Heatmap_TvsNT100vargenes_20230513.png")
hm2
dev.off()
png(file="Heatmap_TvsNTallgenes_20230513.png")
hm3
dev.off()
```
Then, we will run the DESeq2 pipeline, with the aim to generate a heatmap
```{r}
# First start with making the rownames of the summed matrix
colData <- data_frame(sample = colnames(all_dfs2),
annotation = gsub('\\..*','', colnames(all_dfs2))) %>%
column_to_rownames(var = "sample")
all(rownames(colData) == colnames(all_dfs2))
test <- DESeqDataSetFromMatrix(countData = round(all_dfs2),
colData = colData,
design = ~ annotation)
plotPCA(test)
# set the factor level
relevel(test$annotation, ref = "NonStromal")
# Run DESeq
test <- DESeq(test)
# NOTE: Collapse the technical replicates as necessary
plotPCA(DESeqTransform(test))
# Extract the results
res <- results(test)
res0.01 <- results(test, alpha = 0.01)
summary(res0.01)
# contrasts
resultsNames(test)
# Visualization
plotMA(res0.01)
```