---
title: "(20231027) Circadian_Rhythms_DE"
output: pdf_document
---

```{r}
# load libraries
library(DESeq2)
library(apeglm)
library(EnhancedVolcano)
library(org.Mm.eg.db)
library(magrittr)
library(GenomicFeatures)
library(tidyverse)
# install.packages("ggbreak")
library(ggbreak)

# BiocManager::install("DESeq2")
# BiocManager::install("apeglm")
# BiocManager::install("EnhancedVolcano")
```

Load the count matrix, and perform some data wrangling

```{r Prepare count matrix}
counts <- readRDS("~/Circadian_Rhythms/count_matrix_20230925.rds")

# Check dimensions of the matrix and use head()
dim(counts)
colnames(counts)

# Taking a glance of the matrix
counts[1:10, 1:6]

# we should remove the first four rows and make the gene column as the row names
counts <- data.frame(counts)
rownames(counts) <- counts$gene
counts %<>% dplyr::select(-gene)
counts <- counts[5:nrow(counts),]

# Let's add the gene symbol at this point, and filter out those that don't have gene symbols from the counts matrix
ensembl_ids <- rownames(counts) 

# This function checks for the gene symbols that match the particular ENSEMBL ID.
gene_symbols <- mapIds(org.Mm.eg.db, 
                       keys = ensembl_ids, 
                       column = "SYMBOL",
                       keytype = "ENSEMBL")

# how many gene symbols?
length(gene_symbols)

gene_symbols %<>% 
  as_tibble(rownames = "ENSEMBL") %>% 
  dplyr::rename(gene = value)

gene_symbols %>% 
  group_by(gene) %>% 
  dplyr::count() %>% 
  filter(n != 1)

# Filter out the genes where it doesn't have a gene symbol
gene_symbols %<>% dplyr::filter(!is.na(gene))

# Save the genes that are NAs (invalid since we already filtered them out)
# gene_symbols_NA <- gene_symbols %>% dplyr::filter(is.na(gene))

#check for duplicate ENSEMBL IDs
gene_symbols %>% group_by(ENSEMBL) %>% dplyr::count() %>% filter(n != 1)

#check for duplicate gene symbols
gene_symbols %>% group_by(gene) %>% dplyr::count() %>% filter(n != 1)

# Filter the counts matrix (these have a registered gene symbol)
counts <- counts[gene_symbols$ENSEMBL,]

# Check order for gene symbols
summary(gene_symbols$ENSEMBL == rownames(counts))

# We'll select out the first gene ENSEMBL ID among the duplicates.
counts <- counts[!duplicated(gene_symbols$gene),]
```


```{r colData Wrangling}
# Load the mouse gtf file, containing information on each transcript
gtf_file <- "/home/youkim/Genome_indexing/Mus_musculus.GRCm39.110.gtf"
txdb <- makeTxDbFromGFF(gtf_file, format = "gtf")

# Select out the Y chromosome
chromosome_name <- "Y"

#Extract the Y chromosome Ensembl IDs
genes_on_Y <- genes(txdb, filter = list(tx_chrom = "Y"))
ensembl_gene_ids <- genes_on_Y@ranges@NAMES


#Check for all Y chromosome genes
Y_chr_expression <- counts[(rownames(counts) %in% ensembl_gene_ids),] %>% 
  colSums() %>% 
  data.frame(Y_exp = .)

# We should also set the colData for DESeq2
coldata <- data.frame(sample = colnames(counts), Y_chr_expression)
  
#check if the order of the rows are identical
rownames(coldata) == rownames(Y_chr_expression)

# Add biological sex information in metadata
coldata <- coldata %>% 
  mutate(sex = ifelse(coldata$Y_exp > 100, "male", "female"))
```


```{r}
# Now we can change the rownames of the count matrix to gene symbols!
counts <- counts %>% 
  data.frame() %>% 
  rownames_to_column() %>% 
  rename(ENSEMBL = rowname) %>% 
  inner_join(gene_symbols)

counts <- counts %>% 
  column_to_rownames("gene") %>% 
  dplyr::select(-ENSEMBL)

colnames(counts) %>% head()
```

Let's also modify the column names and sample information, before running DESeq2

```{r}
# Let's also add columns addressing their circadian status (time of the day)

# Load the .csv file including the timepoint data
tpdata <- read_csv("~/Circadian_Rhythms/Sample_Info(Sept.2023).csv", col_names = F)

# Modify the second column
extracted_part <- str_extract(tpdata$X2, "(?<=_).*?(?=_)") %>% 
  rev()

# Attach the timepoint data to our bulk RNA-seq metadata.
coldata <- coldata %>% 
  mutate(time_point = extracted_part)

# We want to modify our colnames of the count matrix, so that it can be compatible for NiteCap
sample_numbers <- str_extract(coldata$sample, "(?<=_)[^_]+(?=_)")
coldata <- coldata %>%
  mutate(sample_numbers = sample_numbers)

new_colnames <- paste0(coldata$sex, 
                       "_", 
                       coldata$time_point,
                       "_",
                       coldata$sample_numbers)

# Let's add the new column names
colnames(counts) <- new_colnames
rownames(coldata) <- new_colnames

# Take an overview of the counts and coldata dataframes
colnames(counts) %>% 
  grepl("female")
head(coldata)

male_count <- counts[, !grepl("female", colnames(counts))]
male_contrast_ZT11vs23 <- bind_cols(male_count[,grepl("ZT11", colnames(male_count))], male_count[,grepl("ZT23", colnames(male_count))])
male_contrast_ZT7vs19 <- bind_cols(male_count[,grepl("ZT7", colnames(male_count))], male_count[,grepl("ZT19", colnames(male_count))])

female_count <- counts[, grepl("female", colnames(counts))]
female_contrast_ZT11vs23 <- bind_cols(female_count[,grepl("ZT11", colnames(female_count))], female_count[,grepl("ZT23", colnames(female_count))])
female_contrast_ZT7vs19 <- bind_cols(female_count[,grepl("ZT7", colnames(female_count))], female_count[,grepl("ZT19", colnames(female_count))])
# Let's also arrange the columns by time frames
# tibble_colnames_1 <- tibble(name = colnames(male_count)) %>% 
#   filter(grepl("male_", name)) %>% 
#   mutate(timepoint = as.numeric(gsub(".*_ZT(.*)_.*", "\\1", name))) %>% 
#   arrange(timepoint)
# male_count_modified <- male_count[,tibble_colnames_1$name]
# 
# tibble_colnames_2 <- tibble(name = colnames(female_count)) %>% 
#   filter(grepl("female_", name)) %>% 
#   mutate(timepoint = as.numeric(gsub(".*_ZT(.*)_.*", "\\1", name))) %>% 
#   arrange(timepoint)
# female_count_modified <- female_count[,tibble_colnames_2$name]

# average_per_gene_M <- sapply(seq(1, ncol(male_count_modified), by = 6), function(start_col) {
#   end_col <- min(start_col + 5, ncol(male_count_modified))  # Determine the end column for the group
#   avg_exprs <- rowSums(male_count_modified[, start_col:end_col])/6
#   return(avg_exprs)
# }) %>% data.frame()
# 
# colnames(average_per_gene_M) <- c("M_ZT3", "M_ZT7", "M_ZT11", "M_ZT15", "M_ZT19", "M_ZT23")
# 
# average_per_gene_F <- sapply(seq(1, ncol(female_count_modified), by = 6), function(start_col) {
#   end_col <- min(start_col + 5, ncol(female_count_modified))  # Determine the end column for the group
#   avg_exprs <- rowSums(female_count_modified[, start_col:end_col])/6
#   return(avg_exprs)
# }) %>% data.frame()
# 
# colnames(average_per_gene_F) <- c("F_ZT3", "F_ZT7", "F_ZT11", "F_ZT15", "F_ZT19", "F_ZT23")

# counts_revised <- bind_cols(average_per_gene_M, average_per_gene_F)
```

Contrary to what we did for the timepoint-based analysis, we only need two count matrices.

```{r}
# Use a for loop to generate the 6 matrices
# write_csv(male_contrast_ZT11vs23, "male_contrast_(ZT11vsZT23)_20240419.csv")
# write_csv(male_contrast_ZT7vs19, "male_contrast_(ZT11vsZT23)_20240419.csv")
# write_csv(female_contrast_ZT11vs23, "female_contrast_(ZT11vsZT23)_20240419.csv")
# write_csv(female_contrast_ZT7vs19, "female_contrast_(ZT11vsZT23)_20240419.csv")
```

Let's revise the coldata

```{r}
# Load relevant library
library(stringr)

# Regular expression pattern to match text between underscores
pattern <- "_([^_]+)_"

male_coldata_revised_ZT11vs23 <- data.frame(sample = colnames(male_contrast_ZT11vs23)) %>% 
  mutate(timepoint = str_match(colnames(male_contrast_ZT11vs23) ,pattern)[,2])
rownames(male_coldata_revised_ZT11vs23) <- male_coldata_revised_ZT11vs23$sample
male_coldata_revised_ZT7vs19 <- data.frame(sample = colnames(male_contrast_ZT7vs19)) %>% 
  mutate(timepoint = str_match(colnames(male_contrast_ZT7vs19) ,pattern)[,2])
rownames(male_coldata_revised_ZT7vs19) <- male_coldata_revised_ZT7vs19$sample
# write_csv(male_coldata_revised, "male_coldata_revised.csv")

female_coldata_revised_ZT11vs23 <- data.frame(sample = colnames(female_contrast_ZT11vs23)) %>% 
  mutate(timepoint = str_match(colnames(female_contrast_ZT11vs23) ,pattern)[,2])
rownames(female_coldata_revised_ZT11vs23) <- female_coldata_revised_ZT11vs23$sample
female_coldata_revised_ZT7vs19 <- data.frame(sample = colnames(female_contrast_ZT7vs19)) %>% 
  mutate(timepoint = str_match(colnames(female_contrast_ZT7vs19) ,pattern)[,2])
rownames(female_coldata_revised_ZT7vs19) <- female_coldata_revised_ZT7vs19$sample
# write_csv(female_coldata_revised, "female_coldata_revised.csv")
```

```{R}
# At this point, we can generate the DESeqDataSetFromMatrix
smallestGroupSize <- 6


male_contrast_ZT11vs23 <- male_contrast_ZT11vs23 %>%
  mutate_at(colnames(male_contrast_ZT11vs23), as.integer)
male_dds_revised_ZT11vs23 <- DESeqDataSetFromMatrix(countData = male_contrast_ZT11vs23,
                              colData = male_coldata_revised_ZT11vs23,
                              design= ~ timepoint)
male_contrast_ZT7vs19 <- male_contrast_ZT7vs19 %>%
  mutate_at(colnames(male_contrast_ZT7vs19), as.integer)
male_dds_revised_ZT7vs19 <- DESeqDataSetFromMatrix(countData = male_contrast_ZT7vs19,
                              colData = male_coldata_revised_ZT7vs19,
                              design= ~ timepoint)

female_contrast_ZT11vs23 <- female_contrast_ZT11vs23 %>%
  mutate_at(colnames(female_contrast_ZT11vs23), as.integer)
female_dds_revised_ZT11vs23 <- DESeqDataSetFromMatrix(countData = female_contrast_ZT11vs23,
                              colData = female_coldata_revised_ZT11vs23,
                              design= ~ timepoint)
female_contrast_ZT7vs19 <- female_contrast_ZT7vs19 %>%
  mutate_at(colnames(female_contrast_ZT7vs19), as.integer)
female_dds_revised_ZT7vs19 <- DESeqDataSetFromMatrix(countData = female_contrast_ZT7vs19,
                              colData = female_coldata_revised_ZT7vs19,
                              design= ~ timepoint)

summary(rownames(male_dds_revised_ZT11vs23@colData) == colnames(male_dds_revised_ZT11vs23@assays@data$counts))
summary(rownames(male_dds_revised_ZT7vs19@colData) == colnames(male_dds_revised_ZT7vs19@assays@data$counts))
summary(rownames(female_dds_revised_ZT11vs23@colData) == colnames(female_dds_revised_ZT11vs23@assays@data$counts))
summary(rownames(female_dds_revised_ZT7vs19@colData) == colnames(female_dds_revised_ZT7vs19@assays@data$counts))

# We will filter out genes that have less than 10 reads.
keep <- rowSums(counts(male_dds_revised_ZT11vs23) >= 10) >= smallestGroupSize
male_dds_ZT11vs23 <- male_dds_revised_ZT11vs23[keep,]
keep <- rowSums(counts(male_dds_revised_ZT7vs19) >= 10) >= smallestGroupSize
male_dds_ZT7vs19 <- male_dds_revised_ZT7vs19[keep,]

keep <- rowSums(counts(female_dds_revised_ZT11vs23) >= 10) >= smallestGroupSize
female_dds_ZT11vs23 <- female_dds_revised_ZT11vs23[keep,]
keep <- rowSums(counts(female_dds_revised_ZT7vs19) >= 10) >= smallestGroupSize
female_dds_ZT7vs19 <- female_dds_revised_ZT7vs19[keep,]
# Check the dimensions of the count matrix
dim(male_dds_ZT11vs23) %>% print()
dim(male_dds_ZT7vs19) %>% print()
dim(female_dds_ZT7vs19) %>% print()
dim(female_dds_ZT11vs23) %>% print()

# At this point, I run DESeq. Compare before and after running DESeq
male_dds_ZT11vs23 <- DESeq(male_dds_ZT11vs23)
male_dds_ZT7vs19 <- DESeq(male_dds_ZT7vs19)
female_dds_ZT11vs23 <- DESeq(female_dds_ZT11vs23)
female_dds_ZT7vs19 <- DESeq(female_dds_ZT7vs19)
# Save the dds R object as a RDS file
# saveRDS(male_dds, "male_dds_20240302.rds")
# saveRDS(female_dds, "female_dds_20240304.rds")

# saveRDS(male_dds, "male_dds_20240408_ZT11vsZT23.rds")
# saveRDS(female_dds, "female_dds_20240408_ZT11vsZT23.rds")
```

(20240408 Added) Let's be careful that the "female_dds_20240304.rds" file is for ZT7vsZT19

I think we can write the prefiltering step here too, where we follow the original vignette!

```{r}
not_normalized_counts <- counts(female_dds, normalized = FALSE)
normalized_counts <- counts(female_dds, normalized = TRUE)
head(normalized_counts)
```

(20240508 Added) We can start at this step.

```{r}
# Let's recall our dds object
male_res_ZT11vs23 <- readRDS("male_res_ZT11vs23_DDS.rds")
male_res_ZT7vs19 <- readRDS("male_res_ZT7vs19_DDS.rds")
female_res_ZT11vs23 <- readRDS("female_res_ZT11vs23_DDS.rds")
female_res_ZT7vs19 <- readRDS("female_res_ZT7vs19_DDS.rds")

# male_res_ZT11vs23 <- results(male_dds_ZT11vs23, name = "timepoint ZT23_vs_ZT11")
# male_res_ZT7vs19 <- results(male_dds_ZT7vs19, name = "timepoint_ZT7_vs_ZT19")
# female_res_ZT11vs23 <- results(female_dds_ZT11vs23, name = "timepoint_ZT23_vs_ZT11")
# female_res_ZT7vs19 <- results(female_dds_ZT7vs19, name = "timepoint_ZT7_vs_ZT19")

# Check the number of genes that pass the adj. p-value of less than 0.05
male_res_ZT11vs23 %>% 
  data.frame() %>% 
  dplyr::filter(padj < 0.05) %>%
  pull(log2FoldChange) %>% 
  hist()
male_res_ZT7vs19 %>% 
  data.frame() %>% 
  dplyr::filter(padj < 0.05) %>%
  pull(log2FoldChange) %>% 
  hist()
female_res_ZT11vs23 %>% 
  data.frame() %>% 
  dplyr::filter(padj < 0.05) %>%
  pull(log2FoldChange) %>% 
  hist()
female_res_ZT7vs19 %>% 
  data.frame() %>% 
  dplyr::filter(padj < 0.05) %>%
  pull(log2FoldChange) %>% 
  hist()
# saveRDS(male_res_ZT11vs23, "male_res_ZT11vs23_DDS.rds")
# saveRDS(male_res_ZT7vs19, "male_res_ZT7vs19_DDS.rds")
# saveRDS(female_res_ZT11vs23, "female_res_ZT11vs23_DDS.rds")
# saveRDS(female_res_ZT7vs19, "female_res_ZT7vs19_DDS.rds")

# male_res_ZT11vs23 %>% 
#   data.frame() %>% 
#   write.csv("/home/youkim/Circadian_Rhythms/DE_each_sex/Share_with_Alex/male_DESeq2_res_ZT11vs23.csv")
# male_res_ZT7vs19 %>% 
#   data.frame() %>% 
#   write.csv("/home/youkim/Circadian_Rhythms/DE_each_sex/Share_with_Alex/male_DESeq2_res_ZT7vs19.csv")
# female_res_ZT11vs23 %>% 
#   data.frame() %>% 
#   write.csv("/home/youkim/Circadian_Rhythms/DE_each_sex/Share_with_Alex/female_DESeq2_res_ZT11vs23.csv")
# female_res_ZT11vs23 %>% 
#   data.frame() %>% 
#   write.csv("/home/youkim/Circadian_Rhythms/DE_each_sex/Share_with_Alex/female_DESeq2_res_ZT7vs19.csv")
```


```{r}
# Revisit the volcano plots if necessary
p1 <- EnhancedVolcano(male_res_ZT11vs23,
                lab = rownames(male_res_ZT11vs23),
                x = "log2FoldChange",
                pCutoff = 0.05,
                FCcutoff = NA,
                y= "padj",
                drawConnectors = FALSE,
                # selectLab = (male_res_ZT11vs23 %>% 
                               # data.frame() %>% 
                               # filter(-log10(padj) > 50) %>% 
                               # rownames()),
                # labSize = 4.0,
                col=c('grey', 'grey', 'red1', 'red1'),
                max.overlaps = 100,
                title = "Male ZT11 vs ZT23 Volcano Plot (adjusted p-value)",
                subtitle = NULL)
p2 <- EnhancedVolcano(male_res_ZT7vs19,
                lab = rownames(male_res_ZT7vs19),
                x = "log2FoldChange",
                pCutoff = 0.05,
                FCcutoff = NA,
                y= "padj",
                drawConnectors = FALSE,
                # selectLab = (male_res_ZT7vs19 %>% 
                               # data.frame() %>% 
                               # filter(-log10(padj) > 50) %>% 
                               # rownames()),
                # labSize = 4.0,
                col=c('grey', 'grey', 'red1', 'red1'),
                max.overlaps = 100,
                title = "Male ZT7 vs ZT19 Volcano Plot (adjusted p-value)",
                subtitle = NULL)
p3 <- EnhancedVolcano(female_res_ZT11vs23,
                lab = rownames(female_res_ZT11vs23),
                x = "log2FoldChange",
                pCutoff = 0.05,
                FCcutoff = NA,
                y= "padj",
                drawConnectors = FALSE,
                # selectLab = (female_res_ZT11vs23 %>% 
                               # data.frame() %>% 
                               # filter(-log10(padj) > 50) %>% 
                               # rownames()),
                # labSize = 4.0,
                col=c('grey', 'grey', 'red1', 'red1'),
                max.overlaps = 100,
                title = "Female ZT11 vs ZT23 Volcano Plot (adjusted p-value)",
                subtitle = NULL)
p4 <- EnhancedVolcano(female_res_ZT7vs19,
                lab = rownames(female_res_ZT7vs19),
                x = "log2FoldChange",
                pCutoff = 0.05,
                FCcutoff = NA,
                y= "padj",
                drawConnectors = FALSE,
                # selectLab = (female_res_ZT7vs19 %>% 
                               # data.frame() %>% 
                               # filter(-log10(padj) > 50) %>% 
                               # rownames()),
                # labSize = 4.0,
                col=c('grey', 'grey', 'red1', 'red1'),
                max.overlaps = 100,
                title = "Female ZT7 vs ZT19 Volcano Plot (adjusted p-value)",
                subtitle = NULL)

pdf(file = "/home/youkim/Circadian_Rhythms/DE_each_sex/Share_with_Alex/Volcanoplots_20240508.pdf",
    height = 11,
    width = 9)
p1
p2
p3
p4
p1 + scale_y_break(breaks = c(65, 125),
                   expand = F,
                   scales = 0.2)
p2 + scale_y_break(breaks = c(40, 125),
                   expand = F,
                   scales = 0.2)
p3 + scale_y_break(breaks = c(50, 50),
                   expand = F,
                   scales = 0.2,
                   space = 0.125)
p4 + scale_y_break(breaks = c(45, 100),
                   expand = F,
                   scales = 0.2,
                   space = 0.125)
dev.off()

# italic <- function(x){
#   NULL
# }
# 
# italic <- NULL
?scale_y_break
```