--- title: "(20231027) Circadian_Rhythms_DE" output: pdf_document --- ```{r} # load libraries library(DESeq2) library(apeglm) library(EnhancedVolcano) library(tidyverse) # BiocManager::install("DESeq2") # BiocManager::install("apeglm") # BiocManager::install("EnhancedVolcano") ``` Also, try to get the correlation between normalized gene exprs data and the myogenic tone data. ```{r} # Load the filtered, normalized counts male_counts <- read.csv("DESeq2_normalized_male_counts_20231106.csv") female_counts <- read.csv("DESeq2_normalized_female_counts_20231106.csv") head(male_counts) # Remember that these are normalized counts! # Continue to load the myogenic data male_myogenic_data <- read.csv("~/Circadian_Rhythms/Myogenic_Data/Final_data_males.csv") female_myogenic_data <- read.csv("~/Circadian_Rhythms/Myogenic_Data/Final_data_females.csv") ``` We will focus our analysis first on the common rhythmic genes for both males and females, from the MetaCycle results ```{r} male_metaCycle <- read.csv("~/Circadian_Rhythms/metaout_20231106/meta2d_DESeq2_normalized_male_counts_20231106.csv") female_metaCycle <- read.csv("~/Circadian_Rhythms/metaout_20231106/meta2d_DESeq2_normalized_female_counts_20231106.csv") male_cycling <- male_metaCycle %>% filter(meta2d_pvalue < 0.05) %>% pull(CycID) female_cycling <- female_metaCycle %>% filter(meta2d_pvalue < 0.05) %>% pull(CycID) # Find the common genes that cycle between males and females common_cycling <- intersect(male_cycling, female_cycling) # Filter out only the common_cycling genes in males cycling_m <- male_counts %>% dplyr::filter(gene %in% common_cycling) %>% column_to_rownames("gene") average_per_gene_M <- sapply(seq(1, ncol(cycling_m), by = 6), function(start_col) { end_col <- min(start_col + 5, ncol(cycling_m)) # Determine the end column for the group avg_exprs <- rowSums(cycling_m[, start_col:end_col])/6 return(avg_exprs) }) %>% data.frame() colnames(average_per_gene_M) <- c("ZT3", "ZT7", "ZT11", "ZT15", "ZT19", "ZT23") # Filter out only the common_cycling genes in females cycling_f <- female_counts %>% dplyr::filter(gene %in% common_cycling) %>% column_to_rownames("gene") average_per_gene_F <- sapply(seq(1, ncol(cycling_f), by = 6), function(start_col) { end_col <- min(start_col + 5, ncol(cycling_f)) # Determine the end column for the group avg_exprs <- rowSums(cycling_f[, start_col:end_col])/6 return(avg_exprs) }) %>% data.frame() colnames(average_per_gene_F) <- c("ZT3", "ZT7", "ZT11", "ZT15", "ZT19", "ZT23") ``` Now we also need to do some data wrangling for the myogenic data too. ```{r} # Overview of the data, and filter out the Myo60 data male_myogenic_data %>% head() female_myogenic_data %>% head() M_Myo60 <- male_myogenic_data %>% filter(ID == "M Myo 60") %>% dplyr::select(-ID) # Assuming df is your data frame # Define the list of group sizes group_sizes <- c(4, 7, 6, 4, 5, 5) # Initialize an empty list to store group means group_means_list <- list() # Track the starting column index start_col <- 1 # Loop through each group size for (group_size in group_sizes) { end_col <- min(start_col + group_size - 1, ncol(M_Myo60)) # Determine the end column for the group averages <- rowSums(M_Myo60[, start_col:end_col])/group_size # Calculate the mean for the current group group_means_list[[length(group_means_list) + 1]] <- averages # Store the mean in the list start_col <- end_col + 1 # Move to the next group } # Convert the list of group means to a data frame averages_df_m <- data.frame(do.call(cbind, group_means_list)) colnames(averages_df_m) <- c("ZT3", "ZT7", "ZT11", "ZT15", "ZT19", "ZT23") F_Myo60 <- female_myogenic_data %>% filter(ID == "F Myo 60") %>% dplyr::select(-ID) # Assuming df is your data frame # Define the list of group sizes group_sizes <- c(4, 7, 5, 6, 8, 6) # Initialize an empty list to store group means group_means_list <- list() # Track the starting column index start_col <- 1 # Loop through each group size for (group_size in group_sizes) { end_col <- min(start_col + group_size - 1, ncol(F_Myo60)) # Determine the end column for the group averages <- rowSums(F_Myo60[, start_col:end_col])/group_size # Calculate the mean for the current group group_means_list[[length(group_means_list) + 1]] <- averages # Store the mean in the list start_col <- end_col + 1 # Move to the next group } # Convert the list of group means to a data frame averages_df_f <- data.frame(do.call(cbind, group_means_list)) colnames(averages_df_f) <- c("ZT3", "ZT7", "ZT11", "ZT15", "ZT19", "ZT23") v1 <- averages_df_m %>% t() v2 <- averages_df_f %>% t() cor(v1,v2) ``` At this point, we can calculate the correlations between male or female myogenic data and the expression data for each rhythmic gene. ```{r} # Take an example of one gene for males example <- average_per_gene_M[6,] %>% t() cor(example, v1) # Let's make a for loop to get the correlations male_tibble <- tibble() for (i in 1:dim(average_per_gene_M)[1]){ example <- average_per_gene_M[i,] %>% t() gene <- rownames(average_per_gene_M)[i] correlation <- data.frame(gene, abs(cor(example, v1))) male_tibble <- bind_rows(male_tibble, correlation) } male_tibble %>% dplyr::rename(abs_corr = abs.cor.example..v1..) %>% pull(abs_corr) %>% mean() female_tibble <- tibble() for (i in 1:dim(average_per_gene_F)[1]){ example <- average_per_gene_F[i,] %>% t() gene <- rownames(average_per_gene_F)[i] correlation <- data.frame(abs(cor(example, v2))) female_tibble <- bind_rows(female_tibble, correlation) } female_tibble %>% dplyr::rename(abs_corr = abs.cor.example..v2..) %>% pull(abs_corr) %>% mean() pdf(file = "Correlations_p0.05_20231107.pdf") male_tibble %>% dplyr::rename(abs_corr = abs.cor.example..v1..) %>% pull(abs_corr) %>% hist(col = "lightblue", main = "Distributions of Correlations (Male)", breaks = 10) female_tibble %>% dplyr::rename(abs_corr = abs.cor.example..v2..) %>% pull(abs_corr) %>% hist(col = "lightblue", main = "Distributions of Correlations (Female)", breaks = 10) dev.off() ``` ```{r} # Test for spearman correlation male_tibble <- tibble() for (i in 1:dim(average_per_gene_M)[1]){ example <- average_per_gene_M[i,] %>% t() gene <- rownames(average_per_gene_M)[i] correlation <- data.frame(gene, abs(cor(example, v1, method = "spearman"))) male_tibble <- bind_rows(male_tibble, correlation) } male_tibble %>% dplyr::rename(abs_corr = abs.cor.example..v1..method....spearman...) %>% pull(abs_corr) %>% hist() female_tibble <- tibble() for (i in 1:dim(average_per_gene_F)[1]){ example <- average_per_gene_F[i,] %>% t() gene <- rownames(average_per_gene_M)[i] correlation <- data.frame(gene, abs(cor(example, v2, method = "spearman"))) female_tibble <- bind_rows(female_tibble, correlation) } female_tibble %>% dplyr::rename(abs_corr = abs.cor.example..v2..method....spearman...) %>% pull(abs_corr) %>% hist() ``` Let's use a more stringent p-value (the adjusted version instead). ```{r} male_cycling_BH.Q <- male_metaCycle %>% filter(meta2d_BH.Q < 0.05) %>% pull(CycID) female_cycling_BH.Q <- female_metaCycle %>% filter(meta2d_BH.Q < 0.05) %>% pull(CycID) # Find the common genes that cycle between males and females common_cycling_BH.Q <- intersect(male_cycling_BH.Q, female_cycling_BH.Q) # Filter out only the common_cycling genes in males cycling_m_BH.Q <- male_counts %>% dplyr::filter(gene %in% common_cycling_BH.Q) %>% column_to_rownames("gene") average_per_gene_M_BH.Q <- sapply(seq(1, ncol(cycling_m_BH.Q), by = 6), function(start_col) { end_col <- min(start_col + 5, ncol(cycling_m_BH.Q)) # Determine the end column for the group avg_exprs <- rowSums(cycling_m_BH.Q[, start_col:end_col])/6 return(avg_exprs) }) %>% data.frame() colnames(average_per_gene_M_BH.Q) <- c("ZT3", "ZT7", "ZT11", "ZT15", "ZT19", "ZT23") # Filter out only the common_cycling genes in females cycling_f_BH.Q <- female_counts %>% dplyr::filter(gene %in% common_cycling_BH.Q) %>% column_to_rownames("gene") average_per_gene_F_BH.Q <- sapply(seq(1, ncol(cycling_f_BH.Q), by = 6), function(start_col) { end_col <- min(start_col + 5, ncol(cycling_f_BH.Q)) # Determine the end column for the group avg_exprs <- rowSums(cycling_f_BH.Q[, start_col:end_col])/6 return(avg_exprs) }) %>% data.frame() colnames(average_per_gene_F_BH.Q) <- c("ZT3", "ZT7", "ZT11", "ZT15", "ZT19", "ZT23") a <- male_metaCycle %>% filter(CycID %in% common_cycling_BH.Q) %>% dplyr::select(CycID, meta2d_BH.Q, meta2d_period, meta2d_phase, meta2d_rAMP) %>% dplyr::rename(m.p_value = meta2d_BH.Q, m.period = meta2d_period, m.phase = meta2d_phase, m.amplitude = meta2d_rAMP) b <- female_metaCycle %>% filter(CycID %in% common_cycling_BH.Q) %>% dplyr::select(CycID, meta2d_BH.Q, meta2d_period, meta2d_phase, meta2d_rAMP) %>% dplyr::rename(f.p_value = meta2d_BH.Q, f.period = meta2d_period, f.phase = meta2d_phase, f.amplitude = meta2d_rAMP) common_genes_BH <- inner_join(a, b, by = "CycID") ``` Before we run dryR, let's select out the genes that have similar rhythmicity between males and females ```{r} common_genes_BH <- common_genes_BH %>% mutate(diff_amp = abs(m.amplitude - f.amplitude)) %>% # I converted the phase back to polar coordinates since the period of the same gene can be different between males and females. mutate(m.polar = 2*pi*m.phase/m.period) %>% mutate(f.polar = 2*pi*f.phase/f.period) %>% mutate(diff_polar = abs(m.polar - f.polar) / (2 * pi)) # Generate the numbers for the Venn diagram # Left circle will be genes that have difference in amplitude > 0.1 # Right circle will be genes that have difference in polar coordinates > 0.1 common_genes_BH %>% filter(diff_amp > 0.1) %>% pull(CycID) %>% length() common_genes_BH %>% filter(diff_polar > 0.1 & diff_polar < 0.9) common_genes_BH %>% filter(diff_polar > 0.1 & diff_polar < 0.9) %>% filter(diff_amp > 0.1) # Let's save common_genes_BH as another .csv file to make the analysis easier. # write.csv(common_genes_BH, "common_genes_BH_corrected_20231213.csv") common_genes_BH <- read.csv("common_genes_BH_corrected_20231213.csv") a <- common_genes_BH %>% filter(diff_amp > 0.1) b <- common_genes_BH %>% filter(diff_polar > 0.1 & diff_polar < 0.9) dplyr::union(a,b) %>% write_csv("common_genes_BH_model5_20231214.csv") write.csv(a, "diff_amp_20231214.csv") write.csv(b, "diff_polar_20231214.csv") 42 + 79 - 13 intersect(a$CycID, b$CycID) pdf(file = "Histograms_MetaCycle_20231210.pdf") common_genes_BH %>% pull(diff_amp) %>% hist(col = "lightblue", main = "Histogram of diff_rAMP of Common Cycling Genes", breaks = 50) common_genes_BH %>% pull(diff_polar) %>% hist(col = "lightblue", main = "Histogram of diff_polar of Common Cycling Genes", breaks = 100) dev.off() ``` ```{r} # Let's implement dryR and check if we get the same results # devtools::install_github("naef-lab/dryR") library(dryR) ?dryseq # The following is simulated data countData = simData[["countData"]] group = simData[["group"]] time = simData[["time"]] # Let's generate the same formatted datasets # Recall that we have saved count matrix with the important sample info as colnames seq_countData <- read.csv("Labelled_counts.csv") gene_symbols <- read.csv("mm_gene_symbols.csv") seq_countData <- seq_countData %>% column_to_rownames("X") dim(seq_countData) # The code below is for data wrangling! sample_info <- colnames(seq_countData) group_mydata <- c() time_mydata <- c() for(i in 1:72){ a <- strsplit(sample_info, "_")[[i]] a_1 <- a[1] a_2 <- a[2] %>% substr(start = 3, stop = 4) group_mydata <- append(group_mydata, a_1) time_mydata <- append(time_mydata, a_2) } length(group) class(time) length(group_mydata) time_mydata <- as.numeric(time_mydata) class(time_mydata) # prefilter keep <- rowSums(countData) > 0 countData <- countData[keep,] # Do the same for our data keep_mydata <- rowSums(seq_countData) > 0 seq_countData <- seq_countData[keep_mydata,] ``` ```{r} # Let's manually revise dryseq() function and debug it! # The following is simulated data countData = simData[["countData"]] group = simData[["group"]] time = simData[["time"]] library("BiocParallel") register(MulticoreParam(4)) ``` ```{r} # We inspect the dryR's dryseq() function as it doesn't work well. dryseq <- function(countData, group, time, period=24, sample_name=colnames(countData), batch=rep("A",length(sample_name)), n.cores=round(parallel::detectCores()*.6,0) ) { #update doParallel::registerDoParallel(cores=n.cores) sel = order(group,time) time = time[sel] group = group[sel] countData = countData[,sel] batch = batch[sel] sample_name = sample_name[sel] countData = countData[rowSums(countData)!=0,] s1 <- sin(2*pi*time/period) c1 <- cos(2*pi*time/period) conds = cbind(group,s1,c1,batch) colnames(conds) = c("group","s1","c1","batch") colData <- data.frame(row.names=colnames(countData), conds) N = length(unique(group)) ############################ # FIT RHYTHMS message("fitting rhythmic models") models = create_matrix_list(time,group, N,period) #Reorder u, a, b models = lapply(models, function(l) l[,c(grep("u",colnames(l)),grep("a|b",colnames(l)))] ) for (i in 1:length(models)){ rownames(models[[i]]) = rownames(colData)} if (length(unique(batch))>1) { # add the batch effect model_b = as.matrix(model.matrix(~ batch),contrasts.arg=NULL)[,2:length(unique(batch)),drop=F] colnames(model_b)=paste0("BATCH_",unique(batch)[-1]) models = lapply(models, function(l) cbind(model_b,l)) models = lapply(models, function(l) l[,c(grep("u",colnames(l)),grep("BATCH",colnames(l)),grep("a|b",colnames(l)))] ) } dds = DESeq2::DESeqDataSetFromMatrix(countData = countData, colData = colData, design=~1) dds.full = DESeq2::DESeq(dds, full=models[[length(models)]], betaPrior = F, fitType = "parametric", test = "Wald", parallel =T, quiet = T) deviances = sapply(models[-length(models)], function(m){ dds.x = DESeq2::nbinomWaldTest(dds.full, modelMatrix= m, betaPrior = F, quiet = T) return(mcols(dds.x)$deviance) }) deviances = cbind(deviances,mcols(dds.full)$deviance) message("computing BICW (rhythm)") # calculate the BIC BIC = as.data.frame(sapply(1:ncol(deviances), function(i) {deviances[,i] + log(ncol(countData)) * ncol(models[[i]] )})) #calculate the BICW BICW = t(apply(BIC,1,compute_BICW)) chosen_model = apply(BIC,1,which.min) chosen_model_BICW = apply(BICW,1,max) ############################ # FIT BASELINE message("fitting mean models") model_mean_cond=create_matrix_list_mean(N,group) model_mean_cond=lapply(model_mean_cond,annotate_matrix,group) for (i in 1:length(model_mean_cond)){ rownames(model_mean_cond[[i]]) = rownames(colData)} # choose the best model for rhthmicity and then run the mean on the samples DDS_dev = foreach (i = 1:length(models)) %dopar% { sel = which(chosen_model==i) gene = rownames(dds.full)[sel] if(length(gene)>0){ M=models[[i]] #build the gene specific model from the rhythmic point of view gene_specific_mean_models = lapply(model_mean_cond, function(x) cbind(x,M[,-grep("u",colnames(M))])) dev <- lapply(gene_specific_mean_models,function(m){ dds.m <- dds.full # Copying the full model dds.m <- DESeq2::nbinomWaldTest(dds.m[gene], modelMatrix= as.matrix(m), betaPrior = F) # Re-run wald test return(list(dds.m, mcols(dds.m)$deviance)) # Returning deviances (-2 * log likelihood) // https://support.bioconductor.org/p/107472/ }) } if(length(gene)==0){dev = list (NA, NA)} return(dev) } deviance_mean = NULL for (cm_r in 1:length(models)){ if(!is.na(DDS_dev[[cm_r]][1])){ deviance_mean.x = rbind(sapply(1:length(model_mean_cond),function(x) {DDS_dev[[cm_r]][[x]][[2]]})) rownames(deviance_mean.x) = rownames(DDS_dev[[cm_r]][[1]][[1]]) deviance_mean = rbind(deviance_mean, deviance_mean.x)} } deviance_mean = deviance_mean[rownames(countData),] message("computing BICW (mean)") # calculate the BIC BIC_mean = as.data.frame(sapply(1:ncol(deviance_mean), function(i) { deviance_mean[,i] + log(ncol(countData)) * ncol(model_mean_cond[[i]] )} )) #calculate the BICW BICW_mean = t(apply(BIC_mean,1,compute_BICW)) chosen_model_mean = apply(BIC_mean,1,which.min) chosen_model_mean_BICW = apply(BICW_mean,1,max) ################ # coefficients / mean, amplitude and phase ############### message("extracting rhythmic parameters") parameters=NULL parameters = foreach (i = 1:nrow(deviance_mean)) %dopar% { gene = rownames(deviance_mean)[i] cm_r = chosen_model[i] cm_m = chosen_model_mean[i] dds= DDS_dev[[cm_r]][[cm_m]][[1]] out = compute_param(dds, gene ,period,N) return(data.frame(row.names= gene, t(matrix(out))) ) } parameters = data.frame(do.call(rbind.data.frame, parameters)) colnames(parameters) = c(paste(c('mean','a','b','amp','relamp','phase'),rep(unique(group),each =6), sep = "_")) parameters = parameters[rownames(countData),] # Generate all the count and expression data # raw counts counts_RF = counts(dds.full, normalized = FALSE) #vst stabilized counts vsd <- DESeq2::varianceStabilizingTransformation(dds.full) vsd <- assay(vsd) #normalized counts ncounts_RF = counts(dds.full, normalized = TRUE) # generate a table summarizing the analysis complete_parameters = cbind(parameters,chosen_model,chosen_model_BICW, chosen_model_mean, chosen_model_mean_BICW) global_table = merge(ncounts_RF,complete_parameters, by="row.names") rownames(global_table) = global_table$Row.names global_table_df = global_table[,-grep("Row.names",colnames(global_table))] global_table_df = global_table_df[rownames(countData),] out = list() out[["time"]] = time out[["group"]] = group out[["results"]] = global_table_df out[["BICW_rhythm"]] = BICW out[["BICW_mean"]] = BICW_mean out[["vsd"]] = vsd out[["ncounts"]] = ncounts_RF out[["counts"]] = counts_RF out[["parameters"]] = complete_parameters out[["cook"]] = assays(dds.full)[["cooks"]] out[["dds.full"]] = dds.full message("finished!") return(out) # to add flags for low expression (counts), high cook's distance # to be added error messages when only one group is given etc. } dryseq(countData,group,time,n.cores = round(parallel::detectCores() * 0.6, 0)) ``` ```{R} # run the analysis for count data (e.g. RNA-Seq data) dryList <- dryR::dryseq(countData, group, time) # library("BiocParallel") # MulticoreParam(4) dryList_mydata <- dryseq(seq_countData,group_mydata,time_mydata, n.cores = round(parallel::detectCores() * 0.6, 0)) # explore the results dryList[["results"]] # data frame summarizing results dryList[["parameters"]] # coefficients: phase, amplitude and mean for each group dryList[["ncounts"]] # normalized counts dryList[["counts"]] # raw counts dryList[["cook"]] # cook's distance for outlier detection dryList[["BICW_rhythm"]] # BICW for each rhythmic model dryList[["BICW_mean"]] # BICW for each mean model # generate a pdf with a global summary of all models plot_models_rhythm(dryList, "./") # plot a feature of interest dry_plot(dryList, "feature_113") ``` ```{r} # Try to recalculate the correlations # Let's make a for loop to get the correlations male_tibble_BH.Q <- tibble() for (i in 1:dim(average_per_gene_M_BH.Q)[1]){ example <- average_per_gene_M_BH.Q[i,] %>% t() gene <- rownames(average_per_gene_M_BH.Q)[i] correlation <- data.frame(gene, abs(cor(example, v1))) male_tibble_BH.Q <- bind_rows(male_tibble_BH.Q, correlation) } male_tibble_BH.Q <- male_tibble_BH.Q %>% dplyr::rename(abs_corr = abs.cor.example..v1..) female_tibble_BH.Q <- tibble() for (i in 1:dim(average_per_gene_F_BH.Q)[1]){ example <- average_per_gene_F_BH.Q[i,] %>% t() gene <- rownames(average_per_gene_F_BH.Q)[i] correlation <- data.frame(gene, abs(cor(example, v2))) female_tibble_BH.Q <- bind_rows(female_tibble_BH.Q, correlation) } female_tibble_BH.Q <- female_tibble_BH.Q %>% dplyr::rename(abs_corr = abs.cor.example..v2..) pdf(file = "Correlations_Q0.05_20231107.pdf") male_tibble_BH.Q %>% dplyr::rename(abs_corr = abs_corr) %>% pull(abs_corr) %>% hist(col = "lightblue", main = "Distributions of Correlations (Male)", breaks = 10) female_tibble_BH.Q %>% dplyr::rename(abs_corr = abs_corr) %>% pull(abs_corr) %>% hist(col = "lightblue", main = "Distributions of Correlations (Female)", breaks = 10) dev.off() # Test out for spearman correlations male_tibble_BH.Q_sp <- tibble() for (i in 1:dim(average_per_gene_M_BH.Q)[1]){ example <- average_per_gene_M_BH.Q[i,] %>% t() correlation <- data.frame(abs(cor(example, v1, method = "spearman"))) male_tibble_BH.Q_sp <- bind_rows(male_tibble_BH.Q_sp, correlation) } male_tibble_BH.Q_sp %>% dplyr::rename(abs_corr_sp = abs.cor.example..v1..method....spearman...) %>% pull(abs_corr_sp) %>% mean() female_tibble_BH.Q_sp <- tibble() for (i in 1:dim(average_per_gene_F_BH.Q)[1]){ example <- average_per_gene_F_BH.Q[i,] %>% t() correlation <- data.frame(abs(cor(example, v2, method = "spearman"))) female_tibble_BH.Q_sp <- bind_rows(female_tibble_BH.Q_sp, correlation) } female_tibble_BH.Q_sp %>% dplyr::rename(abs_corr = abs.cor.example..v2..method....spearman...) %>% pull(abs_corr) %>% mean() # For curiosity, let's test MvsF malesVSfemales <- tibble() for (i in 1:length(common_cycling)){ males <- average_per_gene_M[i,] %>% t() females <- average_per_gene_F[i,] %>% t() correlation_value <- data.frame(cor(males, females)) %>% pull() correlation <- data.frame(gene = colnames(males), corr = correlation_value) malesVSfemales <- bind_rows(malesVSfemales, correlation) } malesVSfemales$corr %>% hist(breaks = length(common_cycling)) malesVSfemales_BH.Q <- tibble() for (i in 1:length(common_cycling_BH.Q)){ males <- average_per_gene_M_BH.Q[i,] %>% t() females <- average_per_gene_F_BH.Q[i,] %>% t() correlation_value <- data.frame(cor(males, females)) %>% pull() correlation <- data.frame(gene = colnames(males), corr = correlation_value) malesVSfemales_BH.Q <- bind_rows(malesVSfemales_BH.Q, correlation) } malesVSfemales_BH.Q$corr %>% mean() ``` (20231114) At this point, let's check for the genes that have correlation > 0.7 for both males and females. ```{r} # Get a newer version where we get the raw values, not the absolute values male_tibble_BH.Q <- tibble() for (i in 1:dim(average_per_gene_M_BH.Q)[1]){ example <- average_per_gene_M_BH.Q[i,] %>% t() gene <- rownames(average_per_gene_M_BH.Q)[i] correlation <- data.frame(gene, cor(example, v1)) male_tibble_BH.Q <- bind_rows(male_tibble_BH.Q, correlation) } male_tibble_BH.Q <- male_tibble_BH.Q %>% dplyr::rename(male_corr = cor.example..v1.) male_tibble_BH.Q$male_corr %>% hist() female_tibble_BH.Q <- tibble() for (i in 1:dim(average_per_gene_F_BH.Q)[1]){ example <- average_per_gene_F_BH.Q[i,] %>% t() gene <- rownames(average_per_gene_F_BH.Q)[i] correlation <- data.frame(gene, cor(example, v2)) female_tibble_BH.Q <- bind_rows(female_tibble_BH.Q, correlation) } female_tibble_BH.Q <- female_tibble_BH.Q %>% dplyr::rename(female_corr = cor.example..v2.) female_tibble_BH.Q$female_corr %>% hist() male_tibble_BH.Q %>% data.frame() %>% write.csv("male_raw_correlations.csv") female_tibble_BH.Q %>% data.frame() %>% write.csv("female_raw_correlations.csv") dim(female_tibble_BH.Q) dim(female_tibble_BH.Q) highCorr_M %>% data.frame() %>% write.csv("Correlations>0.7_Male.csv") highCorr_F %>% data.frame() %>% write.csv("Correlations>0.7_Female.csv") ``` (20231121) Let's try to get the correlations between the expression data and myogenic data for all genes ```{r} # Check the columns of male_counts colnames(male_counts) rownames(male_counts) <- NULL male_counts <- male_counts %>% column_to_rownames("gene") average_per_gene_M_all <- sapply(seq(1, ncol(male_counts), by = 6), function(start_col) { end_col <- min(start_col + 5, ncol(male_counts)) # Determine the end column for the group avg_exprs <- rowSums(male_counts[, start_col:end_col])/6 return(avg_exprs) }) %>% data.frame() dim(average_per_gene_M_all) colnames(average_per_gene_M_all) <- c("ZT3", "ZT7", "ZT11", "ZT15", "ZT19", "ZT23") # Perform the same thing for the females colnames(female_counts) rownames(female_counts) <- NULL female_counts <- female_counts %>% column_to_rownames("gene") average_per_gene_F_all <- sapply(seq(1, ncol(female_counts), by = 6), function(start_col) { end_col <- min(start_col + 5, ncol(female_counts)) # Determine the end column for the group avg_exprs <- rowSums(female_counts[, start_col:end_col])/6 return(avg_exprs) }) %>% data.frame() dim(average_per_gene_F_all) colnames(average_per_gene_F_all) <- c("ZT3", "ZT7", "ZT11", "ZT15", "ZT19", "ZT23") ``` Similarly, we create the for-loop to get the correlations between the two types of data for each gene. ```{r} # Let's make a for loop to get the correlations male_tibble <- tibble() for (i in 1:dim(average_per_gene_M_all)[1]){ example <- average_per_gene_M_all[i,] %>% t() gene <- rownames(average_per_gene_M_all)[i] correlation <- data.frame(gene, cor(example, v1)) male_tibble <- bind_rows(male_tibble, correlation) } male_tibble <- male_tibble %>% dplyr::rename(male_corr = cor.example..v1.) female_tibble <- tibble() for (i in 1:dim(average_per_gene_F_all)[1]){ example <- average_per_gene_F_all[i,] %>% t() gene <- rownames(average_per_gene_F_all)[i] correlation <- data.frame(gene, cor(example, v2)) female_tibble <- bind_rows(female_tibble, correlation) } female_tibble <- female_tibble %>% dplyr::rename(female_corr = cor.example..v2.) # Save the correlations histograms pdf(file = "Correlations_all_genes_20231121.pdf") male_tibble %>% dplyr::rename(male_corr = male_corr) %>% pull(male_corr) %>% hist(col = "lightblue", main = "Distributions of Correlations (Male)", breaks = 10) female_tibble %>% dplyr::rename(female_corr = female_corr) %>% pull(female_corr) %>% hist(col = "lightblue", main = "Distributions of Correlations (Female)", breaks = 10) dev.off() ``` Let's also perform Fisher's transformation at this step ```{r} #install.packages("DescTools") library(DescTools) fisherZ_male <- FisherZ(male_tibble$male_corr) male_tibble <- male_tibble %>% mutate(After_FT = FisherZ(male_tibble$male_corr)) %>% mutate(p_value = 2 * (1 - pnorm(abs(After_FT)))) %>% mutate(adjusted_p_fdr = p.adjust(p_value, method = "fdr")) # compare results with another library that also has fisher's transformation function # install.packages("psych") library(psych) male_tibble <- male_tibble %>% mutate(After_FT_2 = psych::fisherz(male_tibble$male_corr)) # The Z-scores are the same from both methods all(male_tibble$After_FT == male_tibble$After_FT_2) # remove one method's results for simplicity. male_tibble <- male_tibble %>% dplyr::select(-After_FT_2) female_tibble <- female_tibble %>% mutate(After_FT = FisherZ(female_tibble$female_corr)) %>% mutate(p_value = 2 * (1 - pnorm(abs(After_FT)))) %>% mutate(adjusted_p_fdr = p.adjust(p_value, method = "fdr")) # No genes remain after adjusting p-values for multiple hypothesis correction afterFT_male_adj <- male_tibble[male_tibble$adjusted_p_fdr < 0.05,] %>% pull(gene) afterFT_female_adj <- female_tibble[female_tibble$adjusted_p_fdr < 0.05,] %>% pull(gene) # Some genes have p-values lower than 0.05. afterFT_male <- male_tibble[male_tibble$p_value < 0.05,] %>% nrow() afterFT_female <- female_tibble[female_tibble$p_value < 0.05,] %>% nrow() # Check for the distributions of adjusted_p_fdr, and p-values pdf(file = "Correlations_p-values_20240105.pdf") male_tibble$p_value %>% hist(col = "lightblue", main = "Distributions of p-values (Male)", breaks = 20) female_tibble$p_value %>% hist(col = "lightblue", main = "Distributions of p-values (Female)", breaks = 20) dev.off() # (20240625 Added) # Write the files that contain the genes' correlations and p-values for males and females write.csv(male_tibble, "male_correlations_p-values_20240625.csv") write.csv(female_tibble, "female_correlations_p-values_20240625.csv") ``` (20240626 Added) We believe that the multiple hypothesis correction should be performed on genes that have MetaCycle results with adj_p-value < 0.05 ```{r} # Check the number of results from the MetaCycle length(male_cycling_BH.Q) length(female_cycling_BH.Q) male_tibble_metaCycle <- male_tibble %>% filter(gene %in% male_cycling_BH.Q) %>% select(gene, male_corr) male_tibble_metaCycle <- male_tibble_metaCycle %>% mutate(After_FT = FisherZ(male_tibble_metaCycle$male_corr)) %>% mutate(p_value = 2 * (1 - pnorm(abs(After_FT)))) %>% mutate(adjusted_p_fdr = p.adjust(p_value, method = "fdr")) female_tibble_metaCycle <- female_tibble %>% filter(gene %in% female_cycling_BH.Q) %>% select(gene, female_corr) female_tibble_metaCycle <- female_tibble_metaCycle %>% mutate(After_FT = FisherZ(female_tibble_metaCycle$female_corr)) %>% mutate(p_value = 2 * (1 - pnorm(abs(After_FT)))) %>% mutate(adjusted_p_fdr = p.adjust(p_value, method = "fdr")) hist(male_tibble_metaCycle$p_value) hist(female_tibble_metaCycle$p_value) male_tibble_metaCycle[male_tibble_metaCycle$adjusted_p_fdr < 0.05,] %>% pull(gene) ``` ```{r} all_corr <- inner_join(male_tibble, female_tibble, by = "gene") all_corr %>% filter(male_corr == 0 | female_corr == 0) all_corr <- all_corr %>% mutate(male_corr_pos = as.numeric(all_corr$male_corr > 0)) %>% mutate(female_corr_pos = as.numeric(all_corr$female_corr > 0)) %>% mutate(sign_check = male_corr_pos + female_corr_pos) %>% mutate(male_abs_corr = abs(male_corr), female_abs_corr = abs(female_corr)) contrast <- all_corr %>% filter(sign_check == 1) %>% filter(male_abs_corr > 0.6 & female_abs_corr > 0.6) contrast2 <- all_corr %>% filter(sign_check == 1) %>% filter(male_abs_corr > 0.7 & female_abs_corr > 0.7) contrast3 <- all_corr %>% filter(sign_check == 1) %>% filter(male_abs_corr > 0.8 & female_abs_corr > 0.8) dim(contrast3) # contrast %>% # write.csv("all_genes_corr_thres_0.6_20231121.csv") pdf(file = "Gene_correlations_0.6_genes_20231121.pdf", width = 15, height = 15) average_per_gene_M_all[contrast$gene,] %>% t() %>% cor() %>% pheatmap(main = "Gene Correlation Heatmap (Male)", fontsize_row = 2, fontsize_col = 2) average_per_gene_F_all[contrast$gene,] %>% t() %>% cor() %>% pheatmap(main = "Gene Correlation Heatmap (Female)", fontsize_row = 2, fontsize_col = 2) dev.off() pdf(file = "Gene_correlations_0.7_genes_20231121.pdf", width = 15, height = 15) average_per_gene_M_all[contrast2$gene,] %>% t() %>% cor() %>% pheatmap(main = "Gene Correlation Heatmap (Male)", fontsize_row = 2, fontsize_col = 2) average_per_gene_F_all[contrast2$gene,] %>% t() %>% cor() %>% pheatmap(main = "Gene Correlation Heatmap (Female)", fontsize_row = 2, fontsize_col = 2) dev.off() pdf(file = "Gene_correlations_0.8_genes_20231121.pdf", width = 15, height = 15) average_per_gene_M_all[contrast3$gene,] %>% t() %>% cor() %>% pheatmap(main = "Gene Correlation Heatmap (Male)", fontsize_row = 4, fontsize_col = 4) average_per_gene_F_all[contrast3$gene,] %>% t() %>% cor() %>% pheatmap(main = "Gene Correlation Heatmap (Female)", fontsize_row = 4, fontsize_col = 4) dev.off() ``` Let's revisit the volcano plots again. This time, we will remove the Y chromosome genes to start off. ```{r} ```