---
title: "Pseudobulk (2023-05-02)"
output: html_document
---

``` {r}
# Load libraries
library(SingleCellExperiment)
library(tidyverse)
library(ggplot2)
library(Seurat)
library(RColorBrewer)
library(scDblFinder)
library(magrittr)
library(edgeR)
library(dplyr)
library(cowplot)
library(Matrix)
library(ggfortify)
# BiocManager::install("DESeq2")
library(DESeq2)
library(pheatmap)
```

Let's start by loading our Seurat objects, which we have already done analysis.
```{r}
vtdataset <- readRDS("~/MetaMarkers/4-dataset-test/vtSeurat2_annotated.rds")
sdataset <- readRDS("~/MetaMarkers/4-dataset-test/sSeurat_20230513.rds")
ldataset <- readRDS("~/MetaMarkers/4-dataset-test/lSeurat_annotated.rds")
hdataset <- readRDS("~/MetaMarkers/4-dataset-test/hSeurat_annotated.rds")

# We notice that the metadata does not explicitly display whether the cell is stromal or non-stromal. This information is contained in the Metamarkers folder.

# As we did back in Metamarker generation, let's start with the Liu dataset
# Collapse the cell types in Liu dataset
lmeta_copy <- ldataset@meta.data %>% 
  dplyr::select(Annotation)

dim(ldataset@meta.data)


# read the cell types data frame
l_cell_types <- read.csv("~/MetaMarkers/4-dataset-test/Liu Cell Types (T vs NT).csv") %>% 
  dplyr::rename(Annotation = Annotations)

# use inner_join() on hmeta_copy and h_cell_types
df <- inner_join(lmeta_copy, l_cell_types)

# Check if the order is maintained
summary(df$Annotation == lmeta_copy$Annotation)

# add to meta.data
ldataset$merged_annotation <- df$Merged

# Verify the metadata
head(ldataset@meta.data$merged_annotation)
```

We repeat the process for the other three datasets.

```{r}
# Collapse the cell types in VT dataset
vtmeta_copy <- vtdataset@meta.data %>% 
  dplyr::select(Annotation)

# Do the same thing for the vtSeurat
vt_cell_types <- read.csv("~/MetaMarkers/4-dataset-test/VT Cell Types (T vs NT).csv") %>% 
  dplyr::rename(Annotation = Annotations)

# use inner_join() on vtmeta_copy and vt_cell_types
df_vt <- inner_join(vtmeta_copy, vt_cell_types)

# Check if the order is maintained
summary(df_vt$Annotation == vtmeta_copy$Annotation)

# add to meta.data
head(vtdataset$merged_annotation <- df_vt$Merged)

```

```{r}
# Collapse the cell types in Suryawanshi dataset
smeta_copy <- sdataset@meta.data %>% 
  dplyr::select(CellType) %>% 
  mutate(Annotation = CellType)

sdataset@meta.data$CellType %>% 
  table()

# read the cell types data frame
s_cell_types <- read.csv("~/MetaMarkers/4-dataset-test/Suryawanshi Cell Types (T vs NT).csv") %>% 
  dplyr::rename(Annotation = Annotations)

# use inner_join() on smeta_copy and s_cell_types
df_s <- inner_join(smeta_copy, s_cell_types)

# Check if the order is maintained
summary(df_s$Annotation == smeta_copy$Annotation)

# add to meta.data
sdataset$merged_annotation <- df_s$Merged
head(sdataset@meta.data)
```

Let's repeat the process one last time for the Han dataset
```{r}
# Collapse the cell types in Han dataset
hmeta_copy <- hdataset@meta.data %>% 
  dplyr::select(Annotation)

dim(hdataset@meta.data)

# read the cell types data frame
h_cell_types <- read.csv("~/MetaMarkers/4-dataset-test/Han Cell Types (T vs NT).csv") %>% 
  dplyr::rename(Annotation = Annotations)

# use inner_join() on hmeta_copy and h_cell_types
df <- inner_join(hmeta_copy, h_cell_types)

# Check if the order is maintained
summary(df$Annotation == hmeta_copy$Annotation)

# add to meta.data
hdataset$merged_annotation <- df$Merged

# Change the Idents of the Seurat Objects
Idents(object = ldataset) <- "merged_annotation"
Idents(object = vtdataset) <- "merged_annotation"
Idents(object = sdataset) <- "merged_annotation"
Idents(object = hdataset) <- "merged_annotation"
```

To perform pseudo-bulk analysis, we need to check if we have the necessary metrics for aggregation across cells in a sample.

### Counts matrix - Stromal-Nonstromal level

```{r}
counts1 <- AggregateExpression(vtdataset,
                    group.by = "merged_annotation",
                    assays = "RNA",
                    slot = "counts",
                    return.seurat = FALSE)

counts2 <- AggregateExpression(ldataset,
                    group.by = "merged_annotation",
                    assays = "RNA",
                    slot = "counts",
                    return.seurat = FALSE)

counts3 <- AggregateExpression(sdataset,
                    group.by = "merged_annotation",
                    assays = "RNA",
                    slot = "counts",
                    return.seurat = FALSE)

counts4 <- AggregateExpression(hdataset,
                    group.by = "merged_annotation",
                    assays = "RNA",
                    slot = "counts",
                    return.seurat = FALSE)

df1 <- data.frame(counts1$RNA) %>% 
  mutate(genes = rownames(data.frame(counts1$RNA))) %>% 
  mutate(Troph.vt = Trophoblast) %>% 
  mutate(NonTroph.vt = Non.Trophoblast) %>% 
  dplyr::select(genes, Troph.vt, NonTroph.vt)
df2 <- data.frame(counts2$RNA) %>% 
  mutate(genes = rownames(data.frame(counts2$RNA)))%>% 
  mutate(Troph.l = Trophoblast) %>% 
  mutate(NonTroph.l = Non.Trophoblast) %>% 
  dplyr::select(genes, Troph.l, NonTroph.l)
df3 <- data.frame(counts3$RNA) %>% 
  mutate(genes = rownames(data.frame(counts3$RNA)))%>% 
  mutate(Troph.s = Trophoblast) %>% 
  mutate(NonTroph.s = Non.Trophoblast) %>% 
  dplyr::select(genes, Troph.s, NonTroph.s)
df4 <- data.frame(counts4$RNA) %>% 
  mutate(genes = rownames(data.frame(counts4$RNA)))%>% 
  mutate(Troph.h = Trophoblast) %>% 
  mutate(NonTroph.h = Non.Trophoblast) %>% 
  dplyr::select(genes, Troph.h, NonTroph.h)

all_dfs <- Reduce(function(...){merge(..., all = FALSE)},
                  list(df1, df2, df3, df4))

all_dfs2 <- column_to_rownames(all_dfs, var = "genes")

```

Before we run the DESeq2 pipeline, we need to check whether our samples cluster together.
Let's use the prcomp() function as we used before in our bulk RNA-seq data analysis.

```{r}
x <- DGEList(counts = all_dfs2, 
             samples = colnames(all_dfs2),
             genes = rownames(all_dfs2))

x_normalized <- calcNormFactors(x, method = "TMM")
# Notice that the counts matrix for x_normalizedna and all_dfs2 is identical.
summary(x_normalized$counts == all_dfs2)

# The following now reflects the adjusted differences in total reads.
edgeR::cpm(x_normalized, log = T, normalized.lib.sizes = T) %>% 
  colSums()

colData <- data_frame(sample = colnames(all_dfs2),
                      annotation = gsub('\\..*','', colnames(all_dfs2))) %>% 
  column_to_rownames(var = "sample")

colData %<>% mutate(samples = rownames(colData))
colData %<>% mutate(study = gsub(".*\\.","", samples))

# Check using this plot whether normalization went well.
edgeR::cpm(x_normalized, log = T, normalized.lib.sizes = T) %>% 
  data.frame() %>% 
  pivot_longer(cols = everything(), 
               names_to = "samples",
               values_to = "exprs") %>% 
  ggplot(aes(x = samples, y = exprs, fill = samples)) +
  geom_boxplot() +
  labs(title = "Normalized") +
  # modify the legend size
  theme(legend.position = "bottom",
        legend.key.size = unit(0.5, 'cm'),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

# create the pca object based on the normalized values.
pca1 <- edgeR::cpm(x_normalized, log = T, normalized.lib.sizes = T)  %>% 
  t() %>% 
  prcomp()

# Using the pca1 object, plot the PCA data categorized by Stromal vs Non-stromal
# Save the plot as an object
p1 <- ggplot2::autoplot(pca1, data = x_normalized$samples) +
	labs(title = "PCA Plot for Pseudo-bulk scRNA-seq data") +
	geom_point(aes(colour = colData$annotation)) +
	labs(colour = "Category") +
	theme(legend.position="bottom")

p2 <- ggplot2::autoplot(pca1, data = x_normalized$samples) +
	labs(title = "PCA Plot for Pseudo-bulk scRNA-seq data by study") +
	geom_point(aes(colour = colData$study)) +
	labs(colour = "Category") +
	theme(legend.position="bottom")

png(file= "PCA_pseudobulk_TvsNT_20230513.png")
p1
dev.off()

png(file="PCA_pseudobulk_by_study(TvsNT)_20230513.png")
p2
dev.off()
```

Let's also generate the PCA plots where we focus only on the 1000 most variable genes.

```{r}
# Retrieve the counts matrix to plot
norm <- edgeR::cpm(x_normalized, log = T, normalized.lib.sizes = T)
rownames(norm)
# Apply var() function to every row
row_variances <- apply(norm, 1, var)

# Print row variances and change name of columns
row_variances %<>% as.data.frame()
colnames(row_variances) <- c("variance")
row_variances %<>% mutate(genes = rownames(norm))
row_variances

# sort the variances 
sorted_rv <- row_variances[order(-row_variances$variance),]
top1000 <- rownames(sorted_rv[1:1000,])
top100 <- rownames(sorted_rv[1:100,])

# Boxplot visualizing the 8 samples' expression levels
norm[top1000,] %>% 
  data.frame() %>% 
  pivot_longer(cols = everything(), 
               names_to = "samples",
               values_to = "exprs") %>% 
  ggplot(aes(x = samples, y = exprs, fill = samples)) +
  geom_boxplot() +
  labs(title = "Normalized") +
  # modify the legend size
  theme(legend.position = "bottom",
        legend.key.size = unit(0.5, 'cm'),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

# Generate the pca object
pca2 <- norm[top1000,] %>% 
  t() %>% 
  prcomp()

p3 <- ggplot2::autoplot(pca2, data = x_normalized$samples) +
	labs(title = "PCA Plot for Pseudo-bulk scRNA-seq data") +
	geom_point(aes(colour = colData$annotation)) +
	labs(colour = "Category") +
	theme(legend.position="bottom")

p4 <- ggplot2::autoplot(pca2, data = x_normalized$samples) +
	labs(title = "PCA Plot for Pseudo-bulk scRNA-seq data by study") +
	geom_point(aes(colour = colData$study)) +
	labs(colour = "Category") +
	theme(legend.position="bottom")

png(file="PCA_pseudobulk_TvsNT1000var.png")
p3
dev.off()

png(file="PCA_pseudobulk_TvsNT1000var_by_study.png")
p4
dev.off()

pca3 <- norm[top100,] %>% 
  t() %>% 
  prcomp()

p5 <- ggplot2::autoplot(pca3, data = x_normalized$samples) +
	labs(title = "PCA Plot for Pseudo-bulk scRNA-seq data") +
	geom_point(aes(colour = colData$annotation)) +
	labs(colour = "Category") +
	theme(legend.position="bottom")

p6 <- ggplot2::autoplot(pca3, data = x_normalized$samples) +
	labs(title = "PCA Plot for Pseudo-bulk scRNA-seq data by study") +
	geom_point(aes(colour = colData$study)) +
	labs(colour = "Category") +
	theme(legend.position="bottom")

png(file="PCA_pseudobulk_TvsNT100var.png")
p3
dev.off()

png(file="PCA_pseudobulk_TvsNT100var_by_study.png")
p4
dev.off()

```


```{r}
hm1 <- cor(norm[top1000,]) %>% 
  pheatmap(display_numbers = T)

hm2 <- cor(norm[top100,]) %>% 
  pheatmap(display_numbers = T)

hm3 <- cor(norm) %>% 
  pheatmap(display_numbers = T)

png(file="Heatmap_TvsNT100vargenes_20230513.png")
hm2
dev.off()

png(file="Heatmap_TvsNTallgenes_20230513.png")
hm3
dev.off()
  
```

Then, we will run the DESeq2 pipeline, with the aim to generate a heatmap

```{r}
# First start with making the rownames of the summed matrix

colData <- data_frame(sample = colnames(all_dfs2),
                      annotation = gsub('\\..*','', colnames(all_dfs2))) %>% 
  column_to_rownames(var = "sample")

all(rownames(colData) == colnames(all_dfs2))

test <- DESeqDataSetFromMatrix(countData = round(all_dfs2),
                               colData = colData,
                               design = ~ annotation)

plotPCA(test)

# set the factor level
relevel(test$annotation, ref = "NonStromal")

# Run DESeq
test <- DESeq(test)
# NOTE: Collapse the technical replicates as necessary

plotPCA(DESeqTransform(test))

# Extract the results
res <- results(test)
res0.01 <- results(test, alpha = 0.01)
summary(res0.01)

# contrasts
resultsNames(test)

# Visualization
plotMA(res0.01)
```