--- title: "(20231027) Circadian_Rhythms_DE" output: pdf_document --- ```{r} # load libraries library(DESeq2) library(apeglm) library(EnhancedVolcano) library(org.Mm.eg.db) library(magrittr) library(GenomicFeatures) library(tidyverse) # install.packages("ggbreak") library(ggbreak) # BiocManager::install("DESeq2") # BiocManager::install("apeglm") # BiocManager::install("EnhancedVolcano") ``` Load the count matrix, and perform some data wrangling ```{r Prepare count matrix} counts <- readRDS("~/Circadian_Rhythms/count_matrix_20230925.rds") # Check dimensions of the matrix and use head() dim(counts) colnames(counts) # Taking a glance of the matrix counts[1:10, 1:6] # we should remove the first four rows and make the gene column as the row names counts <- data.frame(counts) rownames(counts) <- counts$gene counts %<>% dplyr::select(-gene) counts <- counts[5:nrow(counts),] # Let's add the gene symbol at this point, and filter out those that don't have gene symbols from the counts matrix ensembl_ids <- rownames(counts) # This function checks for the gene symbols that match the particular ENSEMBL ID. gene_symbols <- mapIds(org.Mm.eg.db, keys = ensembl_ids, column = "SYMBOL", keytype = "ENSEMBL") # how many gene symbols? length(gene_symbols) gene_symbols %<>% as_tibble(rownames = "ENSEMBL") %>% dplyr::rename(gene = value) gene_symbols %>% group_by(gene) %>% dplyr::count() %>% filter(n != 1) # Filter out the genes where it doesn't have a gene symbol gene_symbols %<>% dplyr::filter(!is.na(gene)) # Save the genes that are NAs (invalid since we already filtered them out) # gene_symbols_NA <- gene_symbols %>% dplyr::filter(is.na(gene)) #check for duplicate ENSEMBL IDs gene_symbols %>% group_by(ENSEMBL) %>% dplyr::count() %>% filter(n != 1) #check for duplicate gene symbols gene_symbols %>% group_by(gene) %>% dplyr::count() %>% filter(n != 1) # Filter the counts matrix (these have a registered gene symbol) counts <- counts[gene_symbols$ENSEMBL,] # Check order for gene symbols summary(gene_symbols$ENSEMBL == rownames(counts)) # We'll select out the first gene ENSEMBL ID among the duplicates. counts <- counts[!duplicated(gene_symbols$gene),] ``` ```{r colData Wrangling} # Load the mouse gtf file, containing information on each transcript gtf_file <- "/home/youkim/Genome_indexing/Mus_musculus.GRCm39.110.gtf" txdb <- makeTxDbFromGFF(gtf_file, format = "gtf") # Select out the Y chromosome chromosome_name <- "Y" #Extract the Y chromosome Ensembl IDs genes_on_Y <- genes(txdb, filter = list(tx_chrom = "Y")) ensembl_gene_ids <- genes_on_Y@ranges@NAMES #Check for all Y chromosome genes Y_chr_expression <- counts[(rownames(counts) %in% ensembl_gene_ids),] %>% colSums() %>% data.frame(Y_exp = .) # We should also set the colData for DESeq2 coldata <- data.frame(sample = colnames(counts), Y_chr_expression) #check if the order of the rows are identical rownames(coldata) == rownames(Y_chr_expression) # Add biological sex information in metadata coldata <- coldata %>% mutate(sex = ifelse(coldata$Y_exp > 100, "male", "female")) ``` ```{r} # Now we can change the rownames of the count matrix to gene symbols! counts <- counts %>% data.frame() %>% rownames_to_column() %>% rename(ENSEMBL = rowname) %>% inner_join(gene_symbols) counts <- counts %>% column_to_rownames("gene") %>% dplyr::select(-ENSEMBL) colnames(counts) %>% head() ``` Let's also modify the column names and sample information, before running DESeq2 ```{r} # Let's also add columns addressing their circadian status (time of the day) # Load the .csv file including the timepoint data tpdata <- read_csv("~/Circadian_Rhythms/Sample_Info(Sept.2023).csv", col_names = F) # Modify the second column extracted_part <- str_extract(tpdata$X2, "(?<=_).*?(?=_)") %>% rev() # Attach the timepoint data to our bulk RNA-seq metadata. coldata <- coldata %>% mutate(time_point = extracted_part) # We want to modify our colnames of the count matrix, so that it can be compatible for NiteCap sample_numbers <- str_extract(coldata$sample, "(?<=_)[^_]+(?=_)") coldata <- coldata %>% mutate(sample_numbers = sample_numbers) new_colnames <- paste0(coldata$sex, "_", coldata$time_point, "_", coldata$sample_numbers) # Let's add the new column names colnames(counts) <- new_colnames rownames(coldata) <- new_colnames # Take an overview of the counts and coldata dataframes colnames(counts) %>% grepl("female") head(coldata) male_count <- counts[, !grepl("female", colnames(counts))] male_contrast_ZT11vs23 <- bind_cols(male_count[,grepl("ZT11", colnames(male_count))], male_count[,grepl("ZT23", colnames(male_count))]) male_contrast_ZT7vs19 <- bind_cols(male_count[,grepl("ZT7", colnames(male_count))], male_count[,grepl("ZT19", colnames(male_count))]) female_count <- counts[, grepl("female", colnames(counts))] female_contrast_ZT11vs23 <- bind_cols(female_count[,grepl("ZT11", colnames(female_count))], female_count[,grepl("ZT23", colnames(female_count))]) female_contrast_ZT7vs19 <- bind_cols(female_count[,grepl("ZT7", colnames(female_count))], female_count[,grepl("ZT19", colnames(female_count))]) # Let's also arrange the columns by time frames # tibble_colnames_1 <- tibble(name = colnames(male_count)) %>% # filter(grepl("male_", name)) %>% # mutate(timepoint = as.numeric(gsub(".*_ZT(.*)_.*", "\\1", name))) %>% # arrange(timepoint) # male_count_modified <- male_count[,tibble_colnames_1$name] # # tibble_colnames_2 <- tibble(name = colnames(female_count)) %>% # filter(grepl("female_", name)) %>% # mutate(timepoint = as.numeric(gsub(".*_ZT(.*)_.*", "\\1", name))) %>% # arrange(timepoint) # female_count_modified <- female_count[,tibble_colnames_2$name] # average_per_gene_M <- sapply(seq(1, ncol(male_count_modified), by = 6), function(start_col) { # end_col <- min(start_col + 5, ncol(male_count_modified)) # Determine the end column for the group # avg_exprs <- rowSums(male_count_modified[, start_col:end_col])/6 # return(avg_exprs) # }) %>% data.frame() # # colnames(average_per_gene_M) <- c("M_ZT3", "M_ZT7", "M_ZT11", "M_ZT15", "M_ZT19", "M_ZT23") # # average_per_gene_F <- sapply(seq(1, ncol(female_count_modified), by = 6), function(start_col) { # end_col <- min(start_col + 5, ncol(female_count_modified)) # Determine the end column for the group # avg_exprs <- rowSums(female_count_modified[, start_col:end_col])/6 # return(avg_exprs) # }) %>% data.frame() # # colnames(average_per_gene_F) <- c("F_ZT3", "F_ZT7", "F_ZT11", "F_ZT15", "F_ZT19", "F_ZT23") # counts_revised <- bind_cols(average_per_gene_M, average_per_gene_F) ``` Contrary to what we did for the timepoint-based analysis, we only need two count matrices. ```{r} # Use a for loop to generate the 6 matrices # write_csv(male_contrast_ZT11vs23, "male_contrast_(ZT11vsZT23)_20240419.csv") # write_csv(male_contrast_ZT7vs19, "male_contrast_(ZT11vsZT23)_20240419.csv") # write_csv(female_contrast_ZT11vs23, "female_contrast_(ZT11vsZT23)_20240419.csv") # write_csv(female_contrast_ZT7vs19, "female_contrast_(ZT11vsZT23)_20240419.csv") ``` Let's revise the coldata ```{r} # Load relevant library library(stringr) # Regular expression pattern to match text between underscores pattern <- "_([^_]+)_" male_coldata_revised_ZT11vs23 <- data.frame(sample = colnames(male_contrast_ZT11vs23)) %>% mutate(timepoint = str_match(colnames(male_contrast_ZT11vs23) ,pattern)[,2]) rownames(male_coldata_revised_ZT11vs23) <- male_coldata_revised_ZT11vs23$sample male_coldata_revised_ZT7vs19 <- data.frame(sample = colnames(male_contrast_ZT7vs19)) %>% mutate(timepoint = str_match(colnames(male_contrast_ZT7vs19) ,pattern)[,2]) rownames(male_coldata_revised_ZT7vs19) <- male_coldata_revised_ZT7vs19$sample # write_csv(male_coldata_revised, "male_coldata_revised.csv") female_coldata_revised_ZT11vs23 <- data.frame(sample = colnames(female_contrast_ZT11vs23)) %>% mutate(timepoint = str_match(colnames(female_contrast_ZT11vs23) ,pattern)[,2]) rownames(female_coldata_revised_ZT11vs23) <- female_coldata_revised_ZT11vs23$sample female_coldata_revised_ZT7vs19 <- data.frame(sample = colnames(female_contrast_ZT7vs19)) %>% mutate(timepoint = str_match(colnames(female_contrast_ZT7vs19) ,pattern)[,2]) rownames(female_coldata_revised_ZT7vs19) <- female_coldata_revised_ZT7vs19$sample # write_csv(female_coldata_revised, "female_coldata_revised.csv") ``` ```{R} # At this point, we can generate the DESeqDataSetFromMatrix smallestGroupSize <- 6 male_contrast_ZT11vs23 <- male_contrast_ZT11vs23 %>% mutate_at(colnames(male_contrast_ZT11vs23), as.integer) male_dds_revised_ZT11vs23 <- DESeqDataSetFromMatrix(countData = male_contrast_ZT11vs23, colData = male_coldata_revised_ZT11vs23, design= ~ timepoint) male_contrast_ZT7vs19 <- male_contrast_ZT7vs19 %>% mutate_at(colnames(male_contrast_ZT7vs19), as.integer) male_dds_revised_ZT7vs19 <- DESeqDataSetFromMatrix(countData = male_contrast_ZT7vs19, colData = male_coldata_revised_ZT7vs19, design= ~ timepoint) female_contrast_ZT11vs23 <- female_contrast_ZT11vs23 %>% mutate_at(colnames(female_contrast_ZT11vs23), as.integer) female_dds_revised_ZT11vs23 <- DESeqDataSetFromMatrix(countData = female_contrast_ZT11vs23, colData = female_coldata_revised_ZT11vs23, design= ~ timepoint) female_contrast_ZT7vs19 <- female_contrast_ZT7vs19 %>% mutate_at(colnames(female_contrast_ZT7vs19), as.integer) female_dds_revised_ZT7vs19 <- DESeqDataSetFromMatrix(countData = female_contrast_ZT7vs19, colData = female_coldata_revised_ZT7vs19, design= ~ timepoint) summary(rownames(male_dds_revised_ZT11vs23@colData) == colnames(male_dds_revised_ZT11vs23@assays@data$counts)) summary(rownames(male_dds_revised_ZT7vs19@colData) == colnames(male_dds_revised_ZT7vs19@assays@data$counts)) summary(rownames(female_dds_revised_ZT11vs23@colData) == colnames(female_dds_revised_ZT11vs23@assays@data$counts)) summary(rownames(female_dds_revised_ZT7vs19@colData) == colnames(female_dds_revised_ZT7vs19@assays@data$counts)) # We will filter out genes that have less than 10 reads. keep <- rowSums(counts(male_dds_revised_ZT11vs23) >= 10) >= smallestGroupSize male_dds_ZT11vs23 <- male_dds_revised_ZT11vs23[keep,] keep <- rowSums(counts(male_dds_revised_ZT7vs19) >= 10) >= smallestGroupSize male_dds_ZT7vs19 <- male_dds_revised_ZT7vs19[keep,] keep <- rowSums(counts(female_dds_revised_ZT11vs23) >= 10) >= smallestGroupSize female_dds_ZT11vs23 <- female_dds_revised_ZT11vs23[keep,] keep <- rowSums(counts(female_dds_revised_ZT7vs19) >= 10) >= smallestGroupSize female_dds_ZT7vs19 <- female_dds_revised_ZT7vs19[keep,] # Check the dimensions of the count matrix dim(male_dds_ZT11vs23) %>% print() dim(male_dds_ZT7vs19) %>% print() dim(female_dds_ZT7vs19) %>% print() dim(female_dds_ZT11vs23) %>% print() # At this point, I run DESeq. Compare before and after running DESeq male_dds_ZT11vs23 <- DESeq(male_dds_ZT11vs23) male_dds_ZT7vs19 <- DESeq(male_dds_ZT7vs19) female_dds_ZT11vs23 <- DESeq(female_dds_ZT11vs23) female_dds_ZT7vs19 <- DESeq(female_dds_ZT7vs19) # Save the dds R object as a RDS file # saveRDS(male_dds, "male_dds_20240302.rds") # saveRDS(female_dds, "female_dds_20240304.rds") # saveRDS(male_dds, "male_dds_20240408_ZT11vsZT23.rds") # saveRDS(female_dds, "female_dds_20240408_ZT11vsZT23.rds") ``` (20240408 Added) Let's be careful that the "female_dds_20240304.rds" file is for ZT7vsZT19 I think we can write the prefiltering step here too, where we follow the original vignette! ```{r} not_normalized_counts <- counts(female_dds, normalized = FALSE) normalized_counts <- counts(female_dds, normalized = TRUE) head(normalized_counts) ``` (20240508 Added) We can start at this step. ```{r} # Let's recall our dds object male_res_ZT11vs23 <- readRDS("male_res_ZT11vs23_DDS.rds") male_res_ZT7vs19 <- readRDS("male_res_ZT7vs19_DDS.rds") female_res_ZT11vs23 <- readRDS("female_res_ZT11vs23_DDS.rds") female_res_ZT7vs19 <- readRDS("female_res_ZT7vs19_DDS.rds") # male_res_ZT11vs23 <- results(male_dds_ZT11vs23, name = "timepoint ZT23_vs_ZT11") # male_res_ZT7vs19 <- results(male_dds_ZT7vs19, name = "timepoint_ZT7_vs_ZT19") # female_res_ZT11vs23 <- results(female_dds_ZT11vs23, name = "timepoint_ZT23_vs_ZT11") # female_res_ZT7vs19 <- results(female_dds_ZT7vs19, name = "timepoint_ZT7_vs_ZT19") # Check the number of genes that pass the adj. p-value of less than 0.05 male_res_ZT11vs23 %>% data.frame() %>% dplyr::filter(padj < 0.05) %>% pull(log2FoldChange) %>% hist() male_res_ZT7vs19 %>% data.frame() %>% dplyr::filter(padj < 0.05) %>% pull(log2FoldChange) %>% hist() female_res_ZT11vs23 %>% data.frame() %>% dplyr::filter(padj < 0.05) %>% pull(log2FoldChange) %>% hist() female_res_ZT7vs19 %>% data.frame() %>% dplyr::filter(padj < 0.05) %>% pull(log2FoldChange) %>% hist() # saveRDS(male_res_ZT11vs23, "male_res_ZT11vs23_DDS.rds") # saveRDS(male_res_ZT7vs19, "male_res_ZT7vs19_DDS.rds") # saveRDS(female_res_ZT11vs23, "female_res_ZT11vs23_DDS.rds") # saveRDS(female_res_ZT7vs19, "female_res_ZT7vs19_DDS.rds") # male_res_ZT11vs23 %>% # data.frame() %>% # write.csv("/home/youkim/Circadian_Rhythms/DE_each_sex/Share_with_Alex/male_DESeq2_res_ZT11vs23.csv") # male_res_ZT7vs19 %>% # data.frame() %>% # write.csv("/home/youkim/Circadian_Rhythms/DE_each_sex/Share_with_Alex/male_DESeq2_res_ZT7vs19.csv") # female_res_ZT11vs23 %>% # data.frame() %>% # write.csv("/home/youkim/Circadian_Rhythms/DE_each_sex/Share_with_Alex/female_DESeq2_res_ZT11vs23.csv") # female_res_ZT11vs23 %>% # data.frame() %>% # write.csv("/home/youkim/Circadian_Rhythms/DE_each_sex/Share_with_Alex/female_DESeq2_res_ZT7vs19.csv") ``` ```{r} # Revisit the volcano plots if necessary p1 <- EnhancedVolcano(male_res_ZT11vs23, lab = rownames(male_res_ZT11vs23), x = "log2FoldChange", pCutoff = 0.05, FCcutoff = NA, y= "padj", drawConnectors = FALSE, # selectLab = (male_res_ZT11vs23 %>% # data.frame() %>% # filter(-log10(padj) > 50) %>% # rownames()), # labSize = 4.0, col=c('grey', 'grey', 'red1', 'red1'), max.overlaps = 100, title = "Male ZT11 vs ZT23 Volcano Plot (adjusted p-value)", subtitle = NULL) p2 <- EnhancedVolcano(male_res_ZT7vs19, lab = rownames(male_res_ZT7vs19), x = "log2FoldChange", pCutoff = 0.05, FCcutoff = NA, y= "padj", drawConnectors = FALSE, # selectLab = (male_res_ZT7vs19 %>% # data.frame() %>% # filter(-log10(padj) > 50) %>% # rownames()), # labSize = 4.0, col=c('grey', 'grey', 'red1', 'red1'), max.overlaps = 100, title = "Male ZT7 vs ZT19 Volcano Plot (adjusted p-value)", subtitle = NULL) p3 <- EnhancedVolcano(female_res_ZT11vs23, lab = rownames(female_res_ZT11vs23), x = "log2FoldChange", pCutoff = 0.05, FCcutoff = NA, y= "padj", drawConnectors = FALSE, # selectLab = (female_res_ZT11vs23 %>% # data.frame() %>% # filter(-log10(padj) > 50) %>% # rownames()), # labSize = 4.0, col=c('grey', 'grey', 'red1', 'red1'), max.overlaps = 100, title = "Female ZT11 vs ZT23 Volcano Plot (adjusted p-value)", subtitle = NULL) p4 <- EnhancedVolcano(female_res_ZT7vs19, lab = rownames(female_res_ZT7vs19), x = "log2FoldChange", pCutoff = 0.05, FCcutoff = NA, y= "padj", drawConnectors = FALSE, # selectLab = (female_res_ZT7vs19 %>% # data.frame() %>% # filter(-log10(padj) > 50) %>% # rownames()), # labSize = 4.0, col=c('grey', 'grey', 'red1', 'red1'), max.overlaps = 100, title = "Female ZT7 vs ZT19 Volcano Plot (adjusted p-value)", subtitle = NULL) pdf(file = "/home/youkim/Circadian_Rhythms/DE_each_sex/Share_with_Alex/Volcanoplots_20240508.pdf", height = 11, width = 9) p1 p2 p3 p4 p1 + scale_y_break(breaks = c(65, 125), expand = F, scales = 0.2) p2 + scale_y_break(breaks = c(40, 125), expand = F, scales = 0.2) p3 + scale_y_break(breaks = c(50, 50), expand = F, scales = 0.2, space = 0.125) p4 + scale_y_break(breaks = c(45, 100), expand = F, scales = 0.2, space = 0.125) dev.off() # italic <- function(x){ # NULL # } # # italic <- NULL ?scale_y_break ```