---
title: "(20231027) Circadian_Rhythms_DE"
output: pdf_document
---

```{r}
# load libraries
library(DESeq2)
library(apeglm)
library(EnhancedVolcano)
library(tidyverse)
# BiocManager::install("DESeq2")
# BiocManager::install("apeglm")
# BiocManager::install("EnhancedVolcano")
```

Also, try to get the correlation between normalized gene exprs data and the myogenic tone data.

```{r}
# Load the filtered, normalized counts
male_counts <- read.csv("DESeq2_normalized_male_counts_20231106.csv")
female_counts <- read.csv("DESeq2_normalized_female_counts_20231106.csv")
  
head(male_counts)
# Remember that these are normalized counts!
# Continue to load the myogenic data
male_myogenic_data <- read.csv("~/Circadian_Rhythms/Myogenic_Data/Final_data_males.csv")
female_myogenic_data <- read.csv("~/Circadian_Rhythms/Myogenic_Data/Final_data_females.csv")
```

We will focus our analysis first on the common rhythmic genes for both males and females, from the MetaCycle results

```{r}
male_metaCycle <- read.csv("~/Circadian_Rhythms/metaout_20231106/meta2d_DESeq2_normalized_male_counts_20231106.csv")
female_metaCycle <- read.csv("~/Circadian_Rhythms/metaout_20231106/meta2d_DESeq2_normalized_female_counts_20231106.csv")

male_cycling <- male_metaCycle %>% 
  filter(meta2d_pvalue < 0.05) %>% pull(CycID)
female_cycling <- female_metaCycle %>% 
  filter(meta2d_pvalue < 0.05) %>% pull(CycID)
# Find the common genes that cycle between males and females
common_cycling <- intersect(male_cycling, female_cycling)

# Filter out only the common_cycling genes in males
cycling_m <- male_counts %>% 
  dplyr::filter(gene %in% common_cycling) %>% 
  column_to_rownames("gene")

average_per_gene_M <- sapply(seq(1, ncol(cycling_m), by = 6), function(start_col) {
  end_col <- min(start_col + 5, ncol(cycling_m))  # Determine the end column for the group
  avg_exprs <- rowSums(cycling_m[, start_col:end_col])/6
  return(avg_exprs)
}) %>% data.frame()

colnames(average_per_gene_M) <- c("ZT3", "ZT7", "ZT11", "ZT15", "ZT19", "ZT23")

# Filter out only the common_cycling genes in females
cycling_f <- female_counts %>% 
  dplyr::filter(gene %in% common_cycling) %>% 
  column_to_rownames("gene")

average_per_gene_F <- sapply(seq(1, ncol(cycling_f), by = 6), function(start_col) {
  end_col <- min(start_col + 5, ncol(cycling_f))  # Determine the end column for the group
  avg_exprs <- rowSums(cycling_f[, start_col:end_col])/6
  return(avg_exprs)
}) %>% data.frame()

colnames(average_per_gene_F) <- c("ZT3", "ZT7", "ZT11", "ZT15", "ZT19", "ZT23")


```

Now we also need to do some data wrangling for the myogenic data too.

```{r}
# Overview of the data, and filter out the Myo60 data
male_myogenic_data %>% head()
female_myogenic_data %>% head()

M_Myo60 <- male_myogenic_data %>% 
  filter(ID == "M Myo 60") %>% 
  dplyr::select(-ID)

# Assuming df is your data frame
# Define the list of group sizes
group_sizes <- c(4, 7, 6, 4, 5, 5)

# Initialize an empty list to store group means
group_means_list <- list()

# Track the starting column index
start_col <- 1

# Loop through each group size
for (group_size in group_sizes) {
  end_col <- min(start_col + group_size - 1, ncol(M_Myo60))  # Determine the end column for the group
  averages <- rowSums(M_Myo60[, start_col:end_col])/group_size  # Calculate the mean for the current group
  group_means_list[[length(group_means_list) + 1]] <- averages  # Store the mean in the list
  start_col <- end_col + 1  # Move to the next group
}

# Convert the list of group means to a data frame
averages_df_m <- data.frame(do.call(cbind, group_means_list))
colnames(averages_df_m) <- c("ZT3", "ZT7", "ZT11", "ZT15", "ZT19", "ZT23")


F_Myo60 <- female_myogenic_data %>% 
  filter(ID == "F Myo 60") %>% 
  dplyr::select(-ID)

# Assuming df is your data frame
# Define the list of group sizes
group_sizes <- c(4, 7, 5, 6, 8, 6)

# Initialize an empty list to store group means
group_means_list <- list()

# Track the starting column index
start_col <- 1

# Loop through each group size
for (group_size in group_sizes) {
  end_col <- min(start_col + group_size - 1, ncol(F_Myo60))  # Determine the end column for the group
  averages <- rowSums(F_Myo60[, start_col:end_col])/group_size  # Calculate the mean for the current group
  group_means_list[[length(group_means_list) + 1]] <- averages  # Store the mean in the list
  start_col <- end_col + 1  # Move to the next group
}

# Convert the list of group means to a data frame
averages_df_f <- data.frame(do.call(cbind, group_means_list))
colnames(averages_df_f) <- c("ZT3", "ZT7", "ZT11", "ZT15", "ZT19", "ZT23")

v1 <- averages_df_m %>% t()
v2 <- averages_df_f %>% t()
cor(v1,v2)
```

At this point, we can calculate the correlations between male or female myogenic data and the expression data for each rhythmic gene.

```{r}
# Take an example of one gene for males
example <- average_per_gene_M[6,] %>% 
  t()
cor(example, v1)

# Let's make a for loop to get the correlations
male_tibble <- tibble()
for (i in 1:dim(average_per_gene_M)[1]){
  example <- average_per_gene_M[i,] %>% t()
  gene <- rownames(average_per_gene_M)[i]
  correlation <- data.frame(gene, abs(cor(example, v1)))
  male_tibble <- bind_rows(male_tibble, correlation)
}
male_tibble %>% 
  dplyr::rename(abs_corr = abs.cor.example..v1..) %>% 
  pull(abs_corr) %>% 
  mean()

female_tibble <- tibble()
for (i in 1:dim(average_per_gene_F)[1]){
  example <- average_per_gene_F[i,] %>% t()
  gene <- rownames(average_per_gene_F)[i]
  correlation <- data.frame(abs(cor(example, v2)))
  female_tibble <- bind_rows(female_tibble, correlation)
}
female_tibble %>% 
  dplyr::rename(abs_corr = abs.cor.example..v2..) %>% 
  pull(abs_corr) %>% 
  mean()

pdf(file = "Correlations_p0.05_20231107.pdf")
male_tibble %>% 
  dplyr::rename(abs_corr = abs.cor.example..v1..) %>% 
  pull(abs_corr) %>% 
  hist(col = "lightblue",
       main = "Distributions of Correlations (Male)",
       breaks = 10)
 female_tibble %>% 
  dplyr::rename(abs_corr = abs.cor.example..v2..) %>% 
  pull(abs_corr) %>% 
  hist(col = "lightblue",
       main = "Distributions of Correlations (Female)",
       breaks = 10)
dev.off()
```

```{r}

# Test for spearman correlation
male_tibble <- tibble()
for (i in 1:dim(average_per_gene_M)[1]){
  example <- average_per_gene_M[i,] %>% t()
  gene <- rownames(average_per_gene_M)[i]
  correlation <- data.frame(gene, abs(cor(example, v1, method = "spearman")))
  male_tibble <- bind_rows(male_tibble, correlation)
}
male_tibble %>% 
  dplyr::rename(abs_corr = abs.cor.example..v1..method....spearman...) %>% 
  pull(abs_corr) %>% 
  hist()

female_tibble <- tibble()
for (i in 1:dim(average_per_gene_F)[1]){
  example <- average_per_gene_F[i,] %>% t()
  gene <- rownames(average_per_gene_M)[i]
  correlation <- data.frame(gene, abs(cor(example, v2, method = "spearman")))
  female_tibble <- bind_rows(female_tibble, correlation)
}
female_tibble %>% 
  dplyr::rename(abs_corr = abs.cor.example..v2..method....spearman...) %>% 
  pull(abs_corr) %>% 
  hist()
```

Let's use a more stringent p-value (the adjusted version instead).

```{r}
male_cycling_BH.Q <- male_metaCycle %>% 
  filter(meta2d_BH.Q < 0.05) %>% pull(CycID)
female_cycling_BH.Q <- female_metaCycle %>% 
  filter(meta2d_BH.Q < 0.05) %>% pull(CycID)
# Find the common genes that cycle between males and females
common_cycling_BH.Q <- intersect(male_cycling_BH.Q, female_cycling_BH.Q)

# Filter out only the common_cycling genes in males
cycling_m_BH.Q <- male_counts %>% 
  dplyr::filter(gene %in% common_cycling_BH.Q) %>% 
  column_to_rownames("gene")

average_per_gene_M_BH.Q <- sapply(seq(1, ncol(cycling_m_BH.Q), by = 6), function(start_col) {
  end_col <- min(start_col + 5, ncol(cycling_m_BH.Q))  # Determine the end column for the group
  avg_exprs <- rowSums(cycling_m_BH.Q[, start_col:end_col])/6
  return(avg_exprs)
}) %>% data.frame()

colnames(average_per_gene_M_BH.Q) <- c("ZT3", "ZT7", "ZT11", "ZT15", "ZT19", "ZT23")

# Filter out only the common_cycling genes in females
cycling_f_BH.Q <- female_counts %>% 
  dplyr::filter(gene %in% common_cycling_BH.Q) %>% 
  column_to_rownames("gene")

average_per_gene_F_BH.Q <- sapply(seq(1, ncol(cycling_f_BH.Q), by = 6), function(start_col) {
  end_col <- min(start_col + 5, ncol(cycling_f_BH.Q))  # Determine the end column for the group
  avg_exprs <- rowSums(cycling_f_BH.Q[, start_col:end_col])/6
  return(avg_exprs)
}) %>% data.frame()

colnames(average_per_gene_F_BH.Q) <- c("ZT3", "ZT7", "ZT11", "ZT15", "ZT19", "ZT23")

a <- male_metaCycle %>% 
  filter(CycID %in% common_cycling_BH.Q) %>% 
  dplyr::select(CycID, meta2d_BH.Q, meta2d_period, meta2d_phase, meta2d_rAMP) %>% 
  dplyr::rename(m.p_value = meta2d_BH.Q, m.period = meta2d_period, m.phase = meta2d_phase, m.amplitude = meta2d_rAMP)
b <- female_metaCycle %>% 
  filter(CycID %in% common_cycling_BH.Q) %>% 
  dplyr::select(CycID, meta2d_BH.Q, meta2d_period, meta2d_phase, meta2d_rAMP) %>% 
  dplyr::rename(f.p_value = meta2d_BH.Q, f.period = meta2d_period, f.phase = meta2d_phase, f.amplitude = meta2d_rAMP)
common_genes_BH <- inner_join(a, b, by = "CycID")
```

Before we run dryR, let's select out the genes that have similar rhythmicity between males and females

```{r}
common_genes_BH <- common_genes_BH %>% 
  mutate(diff_amp = abs(m.amplitude - f.amplitude)) %>% 
  # I converted the phase back to polar coordinates since the period of the same gene can be different between males and females. 
  mutate(m.polar = 2*pi*m.phase/m.period) %>% 
  mutate(f.polar = 2*pi*f.phase/f.period) %>% 
  mutate(diff_polar = abs(m.polar - f.polar) / (2 * pi))

# Generate the numbers for the Venn diagram
# Left circle will be genes that have difference in amplitude > 0.1
# Right circle will be genes that have difference in polar coordinates > 0.1
common_genes_BH %>% 
  filter(diff_amp > 0.1) %>% 
  pull(CycID) %>% 
  length()

common_genes_BH %>% 
  filter(diff_polar > 0.1 & diff_polar < 0.9)

common_genes_BH %>% 
  filter(diff_polar > 0.1 & diff_polar < 0.9) %>% 
  filter(diff_amp > 0.1)

# Let's save common_genes_BH as another .csv file to make the analysis easier.
# write.csv(common_genes_BH, "common_genes_BH_corrected_20231213.csv")

common_genes_BH <- read.csv("common_genes_BH_corrected_20231213.csv")
a <- common_genes_BH %>% 
  filter(diff_amp > 0.1)
b <- common_genes_BH %>% 
  filter(diff_polar > 0.1 & diff_polar < 0.9) 

dplyr::union(a,b) %>% 
  write_csv("common_genes_BH_model5_20231214.csv")
write.csv(a, "diff_amp_20231214.csv")
write.csv(b, "diff_polar_20231214.csv")

42 + 79 - 13
intersect(a$CycID, b$CycID)

pdf(file = "Histograms_MetaCycle_20231210.pdf")
common_genes_BH %>% 
  pull(diff_amp) %>% 
  hist(col = "lightblue",
       main = "Histogram of diff_rAMP of Common Cycling Genes",
       breaks = 50)

common_genes_BH %>% 
  pull(diff_polar) %>% 
  hist(col = "lightblue",
       main = "Histogram of diff_polar of Common Cycling Genes",
       breaks = 100)
dev.off()
```


```{r}
# Let's implement dryR and check if we get the same results

# devtools::install_github("naef-lab/dryR")
library(dryR)
?dryseq
# The following is simulated data
countData = simData[["countData"]]
group     = simData[["group"]]
time      = simData[["time"]]

# Let's generate the same formatted datasets
# Recall that we have saved count matrix with the important sample info as colnames
seq_countData <- read.csv("Labelled_counts.csv")
gene_symbols <- read.csv("mm_gene_symbols.csv")
seq_countData <- seq_countData %>% 
  column_to_rownames("X")
dim(seq_countData)

# The code below is for data wrangling!
sample_info <- colnames(seq_countData)
group_mydata <- c()
time_mydata <- c()
for(i in 1:72){
  a <- strsplit(sample_info, "_")[[i]]
  a_1 <- a[1]
  a_2 <- a[2] %>% 
    substr(start = 3, stop = 4)
  group_mydata <- append(group_mydata, a_1)
  time_mydata <- append(time_mydata, a_2)
}

length(group)
class(time)
length(group_mydata)

time_mydata <- as.numeric(time_mydata)
class(time_mydata)

# prefilter
keep <- rowSums(countData) > 0
countData <- countData[keep,]

# Do the same for our data
keep_mydata <- rowSums(seq_countData) > 0
seq_countData <- seq_countData[keep_mydata,]

```

```{r}

# Let's manually revise dryseq() function and debug it!
# The following is simulated data
countData = simData[["countData"]]
group     = simData[["group"]]
time      = simData[["time"]]

library("BiocParallel")
register(MulticoreParam(4))

```

```{r}
# We inspect the dryR's dryseq() function as it doesn't work well.
dryseq <- function(countData,
                   group,
                   time,
                   period=24,
                   sample_name=colnames(countData),
                   batch=rep("A",length(sample_name)),
                   n.cores=round(parallel::detectCores()*.6,0) )
  {
#update
  doParallel::registerDoParallel(cores=n.cores)
  sel       = order(group,time)
  time      = time[sel]
  group     = group[sel]
  countData = countData[,sel]
  batch = batch[sel]
  sample_name = sample_name[sel]

  countData = countData[rowSums(countData)!=0,]

  s1 <- sin(2*pi*time/period)
  c1 <- cos(2*pi*time/period)

  conds  = cbind(group,s1,c1,batch)
  colnames(conds) = c("group","s1","c1","batch")

  colData <- data.frame(row.names=colnames(countData), conds)
  N = length(unique(group))

  ############################
  # FIT RHYTHMS

  message("fitting rhythmic models")

  models = create_matrix_list(time,group, N,period)
  #Reorder u, a, b
  models = lapply(models, function(l) l[,c(grep("u",colnames(l)),grep("a|b",colnames(l)))]   )

  for (i in 1:length(models)){
    rownames(models[[i]]) = rownames(colData)}

  if (length(unique(batch))>1) {
    # add the batch effect
    model_b = as.matrix(model.matrix(~  batch),contrasts.arg=NULL)[,2:length(unique(batch)),drop=F]
    colnames(model_b)=paste0("BATCH_",unique(batch)[-1])
    models = lapply(models, function(l) cbind(model_b,l))
    models = lapply(models, function(l) l[,c(grep("u",colnames(l)),grep("BATCH",colnames(l)),grep("a|b",colnames(l)))]   )
  }

  dds = DESeq2::DESeqDataSetFromMatrix(countData = countData, colData = colData,  design=~1)
  dds.full = DESeq2::DESeq(dds, full=models[[length(models)]], betaPrior = F, fitType = "parametric", test = "Wald", parallel =T, quiet = T)

  deviances = sapply(models[-length(models)], function(m){
    dds.x = DESeq2::nbinomWaldTest(dds.full, modelMatrix= m, betaPrior = F, quiet = T)
    return(mcols(dds.x)$deviance)
  })

  deviances = cbind(deviances,mcols(dds.full)$deviance)

  message("computing BICW (rhythm)")

  # calculate the BIC
  BIC = as.data.frame(sapply(1:ncol(deviances), function(i) {deviances[,i] + log(ncol(countData)) * ncol(models[[i]] )}))

  #calculate the BICW
  BICW              = t(apply(BIC,1,compute_BICW))
  chosen_model      = apply(BIC,1,which.min)
  chosen_model_BICW = apply(BICW,1,max)

  ############################
  # FIT BASELINE

  message("fitting mean models")

  model_mean_cond=create_matrix_list_mean(N,group)
  model_mean_cond=lapply(model_mean_cond,annotate_matrix,group)

  for (i in 1:length(model_mean_cond)){
    rownames(model_mean_cond[[i]]) = rownames(colData)}

  # choose the best model for rhthmicity and then run the mean on the samples

  DDS_dev =  foreach (i = 1:length(models)) %dopar% {
    sel = which(chosen_model==i)
    gene = rownames(dds.full)[sel]

    if(length(gene)>0){
      M=models[[i]]
      #build the gene specific model from the rhythmic point of view
      gene_specific_mean_models = lapply(model_mean_cond,
                                         function(x) cbind(x,M[,-grep("u",colnames(M))]))

      dev <- lapply(gene_specific_mean_models,function(m){
        dds.m <- dds.full # Copying the full model
        dds.m <- DESeq2::nbinomWaldTest(dds.m[gene], modelMatrix= as.matrix(m), betaPrior = F) # Re-run wald test
        return(list(dds.m, mcols(dds.m)$deviance)) # Returning deviances (-2 * log likelihood) // https://support.bioconductor.org/p/107472/
      })
    }

    if(length(gene)==0){dev = list (NA, NA)}

    return(dev)
  }

  deviance_mean = NULL
  for (cm_r in 1:length(models)){

    if(!is.na(DDS_dev[[cm_r]][1])){
      deviance_mean.x            = rbind(sapply(1:length(model_mean_cond),function(x) {DDS_dev[[cm_r]][[x]][[2]]}))
      rownames(deviance_mean.x)  = rownames(DDS_dev[[cm_r]][[1]][[1]])
      deviance_mean              = rbind(deviance_mean, deviance_mean.x)}
  }

  deviance_mean = deviance_mean[rownames(countData),]

  message("computing BICW (mean)")

  # calculate the BIC
  BIC_mean = as.data.frame(sapply(1:ncol(deviance_mean), function(i) { deviance_mean[,i] + log(ncol(countData)) * ncol(model_mean_cond[[i]] )}   ))

  #calculate the BICW
  BICW_mean = t(apply(BIC_mean,1,compute_BICW))

  chosen_model_mean = apply(BIC_mean,1,which.min)
  chosen_model_mean_BICW = apply(BICW_mean,1,max)

  ################
  # coefficients / mean, amplitude and phase
  ###############

  message("extracting rhythmic parameters")

  parameters=NULL

  parameters =  foreach (i = 1:nrow(deviance_mean)) %dopar% {
    gene = rownames(deviance_mean)[i]
    cm_r = chosen_model[i]
    cm_m = chosen_model_mean[i]
    dds= DDS_dev[[cm_r]][[cm_m]][[1]]
    out = compute_param(dds, gene ,period,N)
    return(data.frame(row.names= gene, t(matrix(out)))           )
  }

  parameters            = data.frame(do.call(rbind.data.frame, parameters))
  colnames(parameters)  = c(paste(c('mean','a','b','amp','relamp','phase'),rep(unique(group),each =6), sep = "_"))
  parameters            = parameters[rownames(countData),]

  # Generate all the count and expression data
  # raw counts
  counts_RF        =  counts(dds.full, normalized = FALSE)

  #vst stabilized counts
  vsd <- DESeq2::varianceStabilizingTransformation(dds.full)
  vsd <- assay(vsd)

  #normalized counts
  ncounts_RF       = counts(dds.full, normalized = TRUE)

  # generate a table summarizing the analysis
  complete_parameters = cbind(parameters,chosen_model,chosen_model_BICW, chosen_model_mean, chosen_model_mean_BICW)
  global_table = merge(ncounts_RF,complete_parameters, by="row.names")
  rownames(global_table) = global_table$Row.names
  global_table_df  = global_table[,-grep("Row.names",colnames(global_table))]

  global_table_df = global_table_df[rownames(countData),]

  out = list()

  out[["time"]]        = time
  out[["group"]]       = group
  out[["results"]]     = global_table_df
  out[["BICW_rhythm"]] = BICW
  out[["BICW_mean"]]   = BICW_mean
  out[["vsd"]]         = vsd
  out[["ncounts"]]     = ncounts_RF
  out[["counts"]]      = counts_RF
  out[["parameters"]]  = complete_parameters
  out[["cook"]]        = assays(dds.full)[["cooks"]]
  out[["dds.full"]]    = dds.full

  message("finished!")
  return(out)

  # to add flags for low expression (counts), high cook's distance
  # to be added error messages when only one group is given etc.
}

dryseq(countData,group,time,n.cores = round(parallel::detectCores() * 0.6, 0))

```

```{R}

# run the analysis for count data (e.g. RNA-Seq data)
dryList <-  dryR::dryseq(countData,
                   group,
                   time)
# library("BiocParallel")
# MulticoreParam(4)
dryList_mydata <- dryseq(seq_countData,group_mydata,time_mydata,
                         n.cores = round(parallel::detectCores() * 0.6, 0))

# explore the results
dryList[["results"]]     # data frame summarizing results
dryList[["parameters"]]  # coefficients: phase, amplitude and mean for each group
dryList[["ncounts"]]     # normalized counts
dryList[["counts"]]      # raw counts
dryList[["cook"]]        # cook's distance for outlier detection
dryList[["BICW_rhythm"]] # BICW for each rhythmic model
dryList[["BICW_mean"]]   # BICW for each mean model

# generate a pdf with a global summary of all models
plot_models_rhythm(dryList, "./")

# plot a feature of interest
dry_plot(dryList, "feature_113")
```


```{r}
# Try to recalculate the correlations
# Let's make a for loop to get the correlations
male_tibble_BH.Q <- tibble()
for (i in 1:dim(average_per_gene_M_BH.Q)[1]){
  example <- average_per_gene_M_BH.Q[i,] %>% t()
  gene <- rownames(average_per_gene_M_BH.Q)[i]
  correlation <- data.frame(gene, abs(cor(example, v1)))
  male_tibble_BH.Q <- bind_rows(male_tibble_BH.Q, correlation)
}
male_tibble_BH.Q <- male_tibble_BH.Q %>% 
  dplyr::rename(abs_corr = abs.cor.example..v1..)

female_tibble_BH.Q <- tibble()
for (i in 1:dim(average_per_gene_F_BH.Q)[1]){
  example <- average_per_gene_F_BH.Q[i,] %>% t()
  gene <- rownames(average_per_gene_F_BH.Q)[i]
  correlation <- data.frame(gene, abs(cor(example, v2)))
  female_tibble_BH.Q <- bind_rows(female_tibble_BH.Q, correlation)
}
female_tibble_BH.Q <- female_tibble_BH.Q %>% 
  dplyr::rename(abs_corr = abs.cor.example..v2..)

pdf(file = "Correlations_Q0.05_20231107.pdf")
male_tibble_BH.Q %>% 
  dplyr::rename(abs_corr = abs_corr) %>% 
  pull(abs_corr) %>% 
  hist(col = "lightblue",
       main = "Distributions of Correlations (Male)",
       breaks = 10)
female_tibble_BH.Q %>% 
  dplyr::rename(abs_corr = abs_corr) %>% 
  pull(abs_corr) %>% 
  hist(col = "lightblue",
       main = "Distributions of Correlations (Female)",
       breaks = 10)
dev.off()

# Test out for spearman correlations
male_tibble_BH.Q_sp <- tibble()
for (i in 1:dim(average_per_gene_M_BH.Q)[1]){
  example <- average_per_gene_M_BH.Q[i,] %>% t()
  correlation <- data.frame(abs(cor(example, v1, method = "spearman")))
  male_tibble_BH.Q_sp <- bind_rows(male_tibble_BH.Q_sp, correlation)
}
male_tibble_BH.Q_sp %>% 
  dplyr::rename(abs_corr_sp = abs.cor.example..v1..method....spearman...) %>% 
  pull(abs_corr_sp) %>% 
  mean()

female_tibble_BH.Q_sp <- tibble()
for (i in 1:dim(average_per_gene_F_BH.Q)[1]){
  example <- average_per_gene_F_BH.Q[i,] %>% t()
  correlation <- data.frame(abs(cor(example, v2, method = "spearman")))
  female_tibble_BH.Q_sp <- bind_rows(female_tibble_BH.Q_sp, correlation)
}
female_tibble_BH.Q_sp %>% 
  dplyr::rename(abs_corr = abs.cor.example..v2..method....spearman...) %>% 
  pull(abs_corr) %>% 
  mean()


# For curiosity, let's test MvsF
malesVSfemales <- tibble()
for (i in 1:length(common_cycling)){
  males <- average_per_gene_M[i,] %>% t()
  females <- average_per_gene_F[i,] %>% t()
  correlation_value <- data.frame(cor(males, females)) %>% pull()
  correlation <- data.frame(gene = colnames(males),
                            corr = correlation_value)
  malesVSfemales <- bind_rows(malesVSfemales, correlation)
}

malesVSfemales$corr %>% hist(breaks = length(common_cycling))

malesVSfemales_BH.Q <- tibble()
for (i in 1:length(common_cycling_BH.Q)){
  males <- average_per_gene_M_BH.Q[i,] %>% t()
  females <- average_per_gene_F_BH.Q[i,] %>% t()
  correlation_value <- data.frame(cor(males, females)) %>% pull()
  correlation <- data.frame(gene = colnames(males),
                            corr = correlation_value)
  malesVSfemales_BH.Q <- bind_rows(malesVSfemales_BH.Q, correlation)
}

malesVSfemales_BH.Q$corr %>% mean()

```

(20231114) At this point, let's check for the genes that have correlation > 0.7 for both males and females.

```{r}

# Get a newer version where we get the raw values, not the absolute values

male_tibble_BH.Q <- tibble()
for (i in 1:dim(average_per_gene_M_BH.Q)[1]){
  example <- average_per_gene_M_BH.Q[i,] %>% t()
  gene <- rownames(average_per_gene_M_BH.Q)[i]
  correlation <- data.frame(gene, cor(example, v1))
  male_tibble_BH.Q <- bind_rows(male_tibble_BH.Q, correlation)
}
male_tibble_BH.Q <- male_tibble_BH.Q %>% 
  dplyr::rename(male_corr = cor.example..v1.)

male_tibble_BH.Q$male_corr %>% 
  hist()

female_tibble_BH.Q <- tibble()
for (i in 1:dim(average_per_gene_F_BH.Q)[1]){
  example <- average_per_gene_F_BH.Q[i,] %>% t()
  gene <- rownames(average_per_gene_F_BH.Q)[i]
  correlation <- data.frame(gene, cor(example, v2))
  female_tibble_BH.Q <- bind_rows(female_tibble_BH.Q, correlation)
}
female_tibble_BH.Q <- female_tibble_BH.Q %>% 
  dplyr::rename(female_corr = cor.example..v2.)

female_tibble_BH.Q$female_corr %>% 
  hist()

male_tibble_BH.Q %>% 
  data.frame() %>% 
  write.csv("male_raw_correlations.csv")
female_tibble_BH.Q %>%
  data.frame() %>% 
  write.csv("female_raw_correlations.csv")

dim(female_tibble_BH.Q)
dim(female_tibble_BH.Q)

highCorr_M %>%
  data.frame() %>% 
  write.csv("Correlations>0.7_Male.csv")
highCorr_F %>%
  data.frame() %>% 
  write.csv("Correlations>0.7_Female.csv")


```

(20231121) Let's try to get the correlations between the expression data and myogenic data for all genes

```{r}
# Check the columns of male_counts
colnames(male_counts)

rownames(male_counts) <- NULL
male_counts <- male_counts %>% 
  column_to_rownames("gene")

average_per_gene_M_all <- sapply(seq(1, ncol(male_counts), by = 6), function(start_col) {
  end_col <- min(start_col + 5, ncol(male_counts))  # Determine the end column for the group
  avg_exprs <- rowSums(male_counts[, start_col:end_col])/6
  return(avg_exprs)
}) %>% data.frame()

dim(average_per_gene_M_all)

colnames(average_per_gene_M_all) <- c("ZT3", "ZT7", "ZT11", "ZT15", "ZT19", "ZT23")

# Perform the same thing for the females
colnames(female_counts)

rownames(female_counts) <- NULL
female_counts <- female_counts %>% 
  column_to_rownames("gene")
average_per_gene_F_all <- sapply(seq(1, ncol(female_counts), by = 6), function(start_col) {
  end_col <- min(start_col + 5, ncol(female_counts))  # Determine the end column for the group
  avg_exprs <- rowSums(female_counts[, start_col:end_col])/6
  return(avg_exprs)
}) %>% data.frame()

dim(average_per_gene_F_all)

colnames(average_per_gene_F_all) <- c("ZT3", "ZT7", "ZT11", "ZT15", "ZT19", "ZT23")
```

Similarly, we create the for-loop to get the correlations between the two types of data for each gene.

```{r}
# Let's make a for loop to get the correlations
male_tibble <- tibble()
for (i in 1:dim(average_per_gene_M_all)[1]){
  example <- average_per_gene_M_all[i,] %>% t()
  gene <- rownames(average_per_gene_M_all)[i]
  correlation <- data.frame(gene, cor(example, v1))
  male_tibble <- bind_rows(male_tibble, correlation)
}
male_tibble <- male_tibble %>% 
  dplyr::rename(male_corr = cor.example..v1.)

female_tibble <- tibble()
for (i in 1:dim(average_per_gene_F_all)[1]){
  example <- average_per_gene_F_all[i,] %>% t()
  gene <- rownames(average_per_gene_F_all)[i]
  correlation <- data.frame(gene, cor(example, v2))
  female_tibble <- bind_rows(female_tibble, correlation)
}
female_tibble <- female_tibble %>% 
  dplyr::rename(female_corr = cor.example..v2.) 

# Save the correlations histograms
pdf(file = "Correlations_all_genes_20231121.pdf")
male_tibble %>% 
  dplyr::rename(male_corr = male_corr) %>% 
  pull(male_corr) %>% 
  hist(col = "lightblue",
       main = "Distributions of Correlations (Male)",
       breaks = 10)
female_tibble %>% 
  dplyr::rename(female_corr = female_corr) %>% 
  pull(female_corr) %>% 
  hist(col = "lightblue",
       main = "Distributions of Correlations (Female)",
       breaks = 10)
dev.off()
```

Let's also perform Fisher's transformation at this step

```{r}
#install.packages("DescTools")
library(DescTools)
fisherZ_male <- FisherZ(male_tibble$male_corr)

male_tibble <- male_tibble %>% 
  mutate(After_FT = FisherZ(male_tibble$male_corr)) %>% 
  mutate(p_value = 2 * (1 - pnorm(abs(After_FT)))) %>% 
  mutate(adjusted_p_fdr = p.adjust(p_value, method = "fdr"))

# compare results with another library that also has fisher's transformation function
# install.packages("psych")
library(psych)

male_tibble <- male_tibble %>% 
  mutate(After_FT_2 = psych::fisherz(male_tibble$male_corr))

# The Z-scores are the same from both methods
all(male_tibble$After_FT == male_tibble$After_FT_2)

# remove one method's results for simplicity.
male_tibble <- male_tibble %>% 
  dplyr::select(-After_FT_2)

female_tibble <- female_tibble %>% 
  mutate(After_FT = FisherZ(female_tibble$female_corr)) %>% 
  mutate(p_value = 2 * (1 - pnorm(abs(After_FT)))) %>% 
  mutate(adjusted_p_fdr = p.adjust(p_value, method = "fdr"))

# No genes remain after adjusting p-values for multiple hypothesis correction
afterFT_male_adj <- male_tibble[male_tibble$adjusted_p_fdr < 0.05,] %>% pull(gene)
afterFT_female_adj <- female_tibble[female_tibble$adjusted_p_fdr < 0.05,] %>% pull(gene)

# Some genes have p-values lower than 0.05.
afterFT_male <- male_tibble[male_tibble$p_value < 0.05,] %>% nrow()
afterFT_female <- female_tibble[female_tibble$p_value < 0.05,] %>% nrow()

# Check for the distributions of adjusted_p_fdr, and p-values
pdf(file = "Correlations_p-values_20240105.pdf")
male_tibble$p_value %>% 
  hist(col = "lightblue",
       main = "Distributions of p-values (Male)",
       breaks = 20)
female_tibble$p_value %>% 
  hist(col = "lightblue",
       main = "Distributions of p-values (Female)",
       breaks = 20)
dev.off()

# (20240625 Added)
# Write the files that contain the genes' correlations and p-values for males and females
write.csv(male_tibble, "male_correlations_p-values_20240625.csv")
write.csv(female_tibble, "female_correlations_p-values_20240625.csv")
```

(20240626 Added) We believe that the multiple hypothesis correction should be performed on genes that have MetaCycle results with adj_p-value < 0.05

```{r}
# Check the number of results from the MetaCycle
length(male_cycling_BH.Q)
length(female_cycling_BH.Q)

male_tibble_metaCycle <- male_tibble %>% 
  filter(gene %in% male_cycling_BH.Q) %>% 
  select(gene, male_corr)

male_tibble_metaCycle <- male_tibble_metaCycle %>% 
  mutate(After_FT = FisherZ(male_tibble_metaCycle$male_corr)) %>% 
  mutate(p_value = 2 * (1 - pnorm(abs(After_FT)))) %>% 
  mutate(adjusted_p_fdr = p.adjust(p_value, method = "fdr"))

female_tibble_metaCycle <- female_tibble %>% 
  filter(gene %in% female_cycling_BH.Q) %>% 
  select(gene, female_corr)

female_tibble_metaCycle <- female_tibble_metaCycle %>% 
  mutate(After_FT = FisherZ(female_tibble_metaCycle$female_corr)) %>% 
  mutate(p_value = 2 * (1 - pnorm(abs(After_FT)))) %>% 
  mutate(adjusted_p_fdr = p.adjust(p_value, method = "fdr"))

hist(male_tibble_metaCycle$p_value)
hist(female_tibble_metaCycle$p_value)
male_tibble_metaCycle[male_tibble_metaCycle$adjusted_p_fdr < 0.05,] %>% pull(gene)
```


```{r}

all_corr <- inner_join(male_tibble, female_tibble, by = "gene")
all_corr %>% 
  filter(male_corr == 0 | female_corr == 0)
all_corr <- all_corr %>% 
  mutate(male_corr_pos = as.numeric(all_corr$male_corr > 0)) %>% 
  mutate(female_corr_pos = as.numeric(all_corr$female_corr > 0)) %>% 
  mutate(sign_check = male_corr_pos + female_corr_pos) %>% 
  mutate(male_abs_corr = abs(male_corr), female_abs_corr = abs(female_corr))


contrast <- all_corr %>% 
  filter(sign_check == 1) %>% 
  filter(male_abs_corr > 0.6 & female_abs_corr > 0.6)

contrast2 <- all_corr %>% 
  filter(sign_check == 1) %>% 
  filter(male_abs_corr > 0.7 & female_abs_corr > 0.7)

contrast3 <- all_corr %>% 
  filter(sign_check == 1) %>% 
  filter(male_abs_corr > 0.8 & female_abs_corr > 0.8)

dim(contrast3)
# contrast %>% 
#   write.csv("all_genes_corr_thres_0.6_20231121.csv")

pdf(file = "Gene_correlations_0.6_genes_20231121.pdf",
    width = 15,
    height = 15)
average_per_gene_M_all[contrast$gene,] %>% 
  t() %>% 
  cor() %>% 
  pheatmap(main = "Gene Correlation Heatmap (Male)",
           fontsize_row = 2,
           fontsize_col = 2)

average_per_gene_F_all[contrast$gene,] %>% 
  t() %>% 
  cor() %>% 
  pheatmap(main = "Gene Correlation Heatmap (Female)",
           fontsize_row = 2,
           fontsize_col = 2)
dev.off()


pdf(file = "Gene_correlations_0.7_genes_20231121.pdf",
    width = 15,
    height = 15)
average_per_gene_M_all[contrast2$gene,] %>% 
  t() %>% 
  cor() %>% 
  pheatmap(main = "Gene Correlation Heatmap (Male)",
           fontsize_row = 2,
           fontsize_col = 2)

average_per_gene_F_all[contrast2$gene,] %>% 
  t() %>% 
  cor() %>% 
  pheatmap(main = "Gene Correlation Heatmap (Female)",
           fontsize_row = 2,
           fontsize_col = 2)
dev.off()

pdf(file = "Gene_correlations_0.8_genes_20231121.pdf",
    width = 15,
    height = 15)
average_per_gene_M_all[contrast3$gene,] %>% 
  t() %>% 
  cor() %>% 
  pheatmap(main = "Gene Correlation Heatmap (Male)",
           fontsize_row = 4,
           fontsize_col = 4)

average_per_gene_F_all[contrast3$gene,] %>% 
  t() %>% 
  cor() %>% 
  pheatmap(main = "Gene Correlation Heatmap (Female)",
           fontsize_row = 4,
           fontsize_col = 4)
dev.off()
```

Let's revisit the volcano plots again.
This time, we will remove the Y chromosome genes to start off.

```{r}

```