--- title: "Pseudobulk (2023-05-02)" output: html_document --- ``` {r} # Load libraries library(SingleCellExperiment) library(tidyverse) library(ggplot2) library(Seurat) library(RColorBrewer) library(scDblFinder) library(magrittr) library(Matrix) # BiocManager::install("DESeq2") library(DESeq2) library(pheatmap) ``` Let's start by loading our Seurat objects, which we have already done analysis. ```{r} vtdataset <- readRDS("~/MetaMarkers/4-dataset-test/vtSeurat2_annotated.rds") sdataset <- readRDS("~/MetaMarkers/4-dataset-test/sSeurat2_annotated.rds") ldataset <- readRDS("~/MetaMarkers/4-dataset-test/lSeurat_annotated.rds") hdataset <- readRDS("~/MetaMarkers/4-dataset-test/hSeurat_annotated.rds") # We notice that the metadata does not explicitly display whether the cell is stromal or non-stromal. This information is contained in the Metamarkers folder. # As we did back in Metamarker generation, let's start with the Liu dataset # Collapse the cell types in Liu dataset lmeta_copy <- ldataset@meta.data %>% dplyr::select(Annotation) dim(ldataset@meta.data) # read the cell types data frame l_cell_types <- read.csv("~/MetaMarkers/4-dataset-test/Stromal_Markers/Liu Cell Types (S vs NS).csv") %>% dplyr::rename(Annotation = Annotations) # use inner_join() on hmeta_copy and h_cell_types df <- inner_join(lmeta_copy, l_cell_types) # Check if the order is maintained summary(df$Annotation == lmeta_copy$Annotation) # add to meta.data ldataset$merged_annotation <- df$Merged # Verify the metadata ldataset@meta.data$merged_annotation ``` We repeat the process for the other three datasets. ```{r} # Collapse the cell types in VT dataset vtmeta_copy <- vtdataset@meta.data %>% dplyr::select(Annotation) # Do the same thing for the vtSeurat vt_cell_types <- read.csv("~/MetaMarkers/4-dataset-test/Stromal_Markers/VT Cell Types (S vs NS)_(noFB).csv") %>% dplyr::rename(Annotation = Annotations) # use inner_join() on vtmeta_copy and vt_cell_types df_vt <- inner_join(vtmeta_copy, vt_cell_types) # Check if the order is maintained summary(df_vt$Annotation == vtmeta_copy$Annotation) # add to meta.data vtdataset$merged_annotation <- df_vt$Merged ``` ```{r} # Collapse the cell types in Suryawanshi dataset smeta_copy <- sdataset@meta.data %>% dplyr::select(CellType) %>% mutate(Annotation = CellType) sdataset@meta.data$CellType %>% table() # read the cell types data frame s_cell_types <- read.csv("~/MetaMarkers/4-dataset-test/Stromal_Markers/Suryawanshi Cell Types (S vs NS)_(noFB).csv") %>% dplyr::rename(Annotation = Annotations) # use inner_join() on smeta_copy and s_cell_types df_s <- inner_join(smeta_copy, s_cell_types) # Check if the order is maintained summary(df_s$Annotation == smeta_copy$Annotation) # add to meta.data sdataset$merged_annotation <- df_s$Merged sdataset@meta.data ``` Let's repeat the process one last time for the Han dataset ```{r} # Collapse the cell types in Han dataset hmeta_copy <- hdataset@meta.data %>% dplyr::select(Annotation) dim(hdataset@meta.data) # read the cell types data frame h_cell_types <- read.csv("~/MetaMarkers/4-dataset-test/Stromal_Markers/Han Cell Types (S vs NS).csv") %>% dplyr::rename(Annotation = Annotations) # use inner_join() on hmeta_copy and h_cell_types df <- inner_join(hmeta_copy, h_cell_types) # Check if the order is maintained summary(df$Annotation == hmeta_copy$Annotation) # add to meta.data hdataset$merged_annotation <- df$Merged # Change the Idents of the Seurat Objects Idents(object = ldataset) <- "merged_annotation" Idents(object = vtdataset) <- "merged_annotation" Idents(object = sdataset) <- "merged_annotation" Idents(object = hdataset) <- "merged_annotation" ``` To perform pseudo-bulk analysis, we need to check if we have the necessary metrics for aggregation across cells in a sample. ### Counts matrix - Stromal-Nonstromal level ```{r} counts1 <- AggregateExpression(vtdataset, group.by = "merged_annotation", assays = "RNA", slot = "counts", return.seurat = FALSE) counts2 <- AggregateExpression(ldataset, group.by = "merged_annotation", assays = "RNA", slot = "counts", return.seurat = FALSE) counts3 <- AggregateExpression(sdataset, group.by = "merged_annotation", assays = "RNA", slot = "counts", return.seurat = FALSE) counts4 <- AggregateExpression(hdataset, group.by = "merged_annotation", assays = "RNA", slot = "counts", return.seurat = FALSE) df1 <- data.frame(counts1$RNA) %>% mutate(genes = rownames(data.frame(counts1$RNA))) %>% mutate(Stromal.vt = Stromal) %>% mutate(NonStromal.vt = Non.Stromal) %>% dplyr::select(genes, Stromal.vt, NonStromal.vt) df2 <- data.frame(counts2$RNA) %>% mutate(genes = rownames(data.frame(counts2$RNA)))%>% mutate(Stromal.l = Stromal) %>% mutate(NonStromal.l = Non.Stromal) %>% dplyr::select(genes, Stromal.l, NonStromal.l) df3 <- data.frame(counts3$RNA) %>% mutate(genes = rownames(data.frame(counts3$RNA)))%>% mutate(Stromal.s = Stromal) %>% mutate(NonStromal.s = Non.Stromal) %>% dplyr::select(genes, Stromal.s, NonStromal.s) df4 <- data.frame(counts4$RNA) %>% mutate(genes = rownames(data.frame(counts4$RNA)))%>% mutate(Stromal.h = Stromal) %>% mutate(NonStromal.h = Non.Stromal) %>% dplyr::select(genes, Stromal.h, NonStromal.h) all_dfs <- Reduce(function(...){merge(..., all = FALSE)}, list(df1, df2, df3, df4)) ``` Before we run the DESeq2 pipeline, we need to check whether our samples cluster together. Let's use the prcomp() function as we used before in our bulk RNA-seq data analysis. ```{r} x <- DGEList(counts = all_dfs2, samples = colnames(all_dfs2), genes = rownames(all_dfs2)) x_normalized <- calcNormFactors(x, method = "TMM") # Notice that the counts matrix for x_normalizedna and all_dfs2 is identical. summary(x_normalized$counts == all_dfs2) # The following now reflects the adjusted differences in total reads. cpm(x_normalized, log = T, normalized.lib.sizes = T) %>% colSums() # Check using this plot whether normalization went well. edgeR::cpm(x_normalized, log = T, normalized.lib.sizes = T) %>% data.frame() %>% pivot_longer(cols = everything(), names_to = "samples", values_to = "exprs") %>% ggplot(aes(x = samples, y = exprs, fill = samples)) + geom_boxplot() + labs(title = "Normalized") + # modify the legend size theme(legend.position = "bottom", legend.key.size = unit(0.5, 'cm'), axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) # create the pca object based on the normalized values. pca1 <- edgeR::cpm(x_normalized, log = T, normalized.lib.sizes = T) %>% t() %>% prcomp() colData %<>% mutate(samples = rownames(colData)) colData %<>% mutate(study = gsub(".*\\.","", samples)) # Using the pca1 object, plot the PCA data categorized by Stromal vs Non-stromal # Save the plot as an object p1 <- ggplot2::autoplot(pca1, data = x_normalized$samples) + labs(title = "PCA Plot for Pseudo-bulk scRNA-seq data") + geom_point(aes(colour = colData$annotation)) + labs(colour = "Category") + theme(legend.position="bottom") p2 <- ggplot2::autoplot(pca1, data = x_normalized$samples) + labs(title = "PCA Plot for Pseudo-bulk scRNA-seq data by study") + geom_point(aes(colour = colData$study)) + labs(colour = "Category") + theme(legend.position="bottom") png(file="PCA_pseudobulk_SvsNS.png") p1 dev.off() png(file="PCA_pseudobulk_by_study.png") p2 dev.off() ``` Let's also generate the PCA plots where we focus only on the 1000 most variable genes. ```{r} # Retrieve the counts matrix to plot norm <- cpm(x_normalized, log = T, normalized.lib.sizes = T) rownames(norm) # Apply var() function to every row row_variances <- apply(norm, 1, var) # Print row variances and change name of columns row_variances %<>% as.data.frame() colnames(row_variances) <- c("variance") row_variances %<>% mutate(genes = rownames(norm)) row_variances # sort the variances sorted_rv <- row_variances[order(-row_variances$variance),] top1000 <- rownames(sorted_rv[1:1000,]) top100 <- rownames(sorted_rv[1:100,]) # Boxplot visualizing the 8 samples' expression levels norm[top1000,] %>% data.frame() %>% pivot_longer(cols = everything(), names_to = "samples", values_to = "exprs") %>% ggplot(aes(x = samples, y = exprs, fill = samples)) + geom_boxplot() + labs(title = "Normalized") + # modify the legend size theme(legend.position = "bottom", legend.key.size = unit(0.5, 'cm'), axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) # Generate the pca object pca2 <- norm[top1000,] %>% t() %>% prcomp() p3 <- ggplot2::autoplot(pca2, data = x_normalized$samples) + labs(title = "PCA Plot for Pseudo-bulk scRNA-seq data") + geom_point(aes(colour = colData$annotation)) + labs(colour = "Category") + theme(legend.position="bottom") p4 <- ggplot2::autoplot(pca2, data = x_normalized$samples) + labs(title = "PCA Plot for Pseudo-bulk scRNA-seq data by study") + geom_point(aes(colour = colData$study)) + labs(colour = "Category") + theme(legend.position="bottom") png(file="PCA_pseudobulk_1000var.png") p3 dev.off() png(file="PCA_pseudobulk_1000var_by_study.png") p4 dev.off() pca3 <- norm[top100,] %>% t() %>% prcomp() p5 <- ggplot2::autoplot(pca3, data = x_normalized$samples) + labs(title = "PCA Plot for Pseudo-bulk scRNA-seq data") + geom_point(aes(colour = colData$annotation)) + labs(colour = "Category") + theme(legend.position="bottom") p6 <- ggplot2::autoplot(pca3, data = x_normalized$samples) + labs(title = "PCA Plot for Pseudo-bulk scRNA-seq data by study") + geom_point(aes(colour = colData$study)) + labs(colour = "Category") + theme(legend.position="bottom") ``` ```{r} hm1 <- cor(norm[top1000,]) %>% pheatmap(display_numbers = T) hm2 <- cor(norm[top100,]) %>% pheatmap(display_numbers = T) hm3 <- cor(norm) %>% pheatmap(display_numbers = T) png(file="Heatmap_100vargenes.png") hm2 dev.off() png(file="Heatmap_allgenes.png") hm3 dev.off() ``` Then, we will run the DESeq2 pipeline, with the aim to generate a heatmap ```{r} # First start with making the rownames of the summed matrix all_dfs2 <- column_to_rownames(all_dfs, var = "genes") colData <- data_frame(sample = colnames(all_dfs2), annotation = gsub('\\..*','', colnames(all_dfs2))) %>% column_to_rownames(var = "sample") all(rownames(colData) == colnames(all_dfs2)) test <- DESeqDataSetFromMatrix(countData = round(all_dfs2), colData = colData, design = ~ annotation) plotPCA(test) # set the factor level relevel(test$annotation, ref = "NonStromal") # Run DESeq test <- DESeq(test) # NOTE: Collapse the technical replicates as necessary plotPCA(DESeqTransform(test)) # Extract the results res <- results(test) res0.01 <- results(test, alpha = 0.01) summary(res0.01) # contrasts resultsNames(test) # Visualization plotMA(res0.01) ```