---
title: "XCI_Escape_Removal (20231018).Rmd"
output: html_notebook
---

For the first step for removing the XCI escape genes, we need to annotate our variants.
To make things easier, Jonathan's code (as seen in the "Plotting VCF" file) filters out the SNPs that match our criteria.
These SNPs are biallelic - which serves our purpose.
Now we start with loading required libraries.

```{r}
# Load libraries
library(magrittr)
library(readr)
library(ggplot2)
library(tibble)
library(tidyr)
library(dplyr)
library(readr)
library(rtracklayer)
library(stringr)
library(plyranges)
library(GenomicFeatures)
library(plyranges)
library(cowplot)
library(Seurat)
library(SeuratDisk)
library(pheatmap)
library(reshape2)
library(RColorBrewer)
library(rhdf5)
library(matrixStats)
library(EGAD)
library(GO.db)
library(org.Hs.eg.db)

# installation
# BiocManager::install("plyranges")
```

Let's regenerate the combined dataframe that includes the XCI skew information.

```{r}
# Let's do this analysis for both Pilot and Second data
merged_df_1 <- read.csv("/home/youkim/Bulk_RNA_seq/XCI_analysis/Pilot_Data/Merged_analysis/20230920_XCI_genes(Pilot_Data).csv")

merged_df_2 <- read.csv("/home/youkim/Bulk_RNA_seq/XCI_analysis/Second_Data/Merged_analysis_After_QC/20230921_XCI_genes(Second_Data).csv")

merged_df <- read.csv("/home/youkim/Bulk_RNA_seq/XCI_analysis/Third_Data/XCI_genes(Third_Data).csv")
merged_df <- merged_df %>% 
  dplyr::select(-X.1)

combined <- rbind(merged_df_1, merged_df_2) %>% dplyr::select(-X.1)

# merge it with the third data
combined <- rbind(combined, merged_df)

```

Now let's see if we could generate a heatmap that could encompass all 22 samples based on their genes

```{r}
# Pivot the dataframe into a genes-by-samples matrix
heatmap_data <- dcast(combined, gene_name ~ sample_id, value.var = "mean_XCI_ratio")
rownames(heatmap_data) <- heatmap_data$gene_name  # Set row names to gene names
heatmap_data <- heatmap_data[, -1]  # Remove the gene_name column

# Let's remove B5_12764 and see what happens
heatmap_data <- heatmap_data %>% 
  dplyr::select(-B5_12764)

# heatmap_data[is.na(heatmap_data)] <- "grey"

drows_1 <- dist(heatmap_data, method = "euclidean")

drows_1[is.na(drows_1)] <- max(drows_1,na.rm=T)
which(is.na(drows_1), arr.ind = T)
pheatmap(drows_1)
# heatmap_data_noNA <- heatmap_data %>%
#   filter(rowSums(is.na(heatmap_data)) == 0)

# heatmap_matrix <- as.matrix(heatmap_data)
# 
# giveNAs = which(is.na(as.matrix(dist(heatmap_data))),arr.ind=TRUE)
# giveNAs
# 
# tab = sort(table(c(giveNAs)),decreasing=TRUE)
# checkNA = sapply(1:length(tab),function(i){
# sum(is.na(as.matrix(dist(heatmap_data[-as.numeric(names(tab[1:i])),]))))
# })
# rmv = names(tab)[1:min(which(checkNA==0))]
# 
# heatmap_data = heatmap_data[-as.numeric(rmv),]

# Define a color palette for the heatmap
colors <- colorRampPalette(rev(brewer.pal(11, "RdYlBu")))(100)

annotation_helper_1 <- combined %>% 
  filter(gene_name %in% rownames(heatmap_data)) %>% 
  arrange(gene_name) %>% 
  dplyr::select(gene_name, mean_exprs_ratio) 

rownames(annotation_helper_1) <- NULL
interim <- annotation_helper_1$gene_name %>% duplicated()
annotation_helper_1 <- annotation_helper_1[!interim,]

# double check the order
summary(rownames(heatmap_data) == annotation_helper_1$gene_name)

# Color for annotations
expression_ratio_palette <- colorRampPalette(c("blue", "white", "red"))(100)

# Annotation dataframe
annotation_df_1 <- data.frame(Expression_Ratio = log10(annotation_helper_1$mean_exprs_ratio + 1))
rownames(annotation_df_1) <- rownames(heatmap_data)

# Create the heatmap using pheatmap
pdf(file = "Heatmap_22_samples(20231018).pdf",
    height = 12,
    width = 15)
p <- pheatmap(
  heatmap_data,
  clustering_distance_rows = drows_1,  # You can choose a different clustering method if needed
  clustering_distance_cols = "euclidean",  # You can choose a different clustering method if needed
  annotation_row = annotation_df_1,  # Add the expression ratio annotation
  annotation_colors = list(Expression_Ratio = expression_ratio_palette),
  col = colorRampPalette(rev(brewer.pal(11, "RdYlBu")))(100),  # Color palette
  na_col = "grey",  # Set the color for NA values
  main = "Gene Expression Heatmap",
  clustering_method = "complete",
  fontsize_row = 3,  # Adjust row label size
  fontsize_col = 9  # Adjust column label size
)
p
dev.off()

row_ordering_1 <- p$tree_row
gene_list_1 <- row_ordering_1$labels[row_ordering_1$order]
annotation_df_1 <- rownames_to_column(annotation_df_1, "genes")
tibble_1 <- tibble(genes = gene_list_1)
tibble_1 <- inner_join(tibble_1, annotation_df_1)

pdf(file = "Heatmap_Annotation_filter5(20230929).pdf",
    height = 15,
    width = 3)
tibble_1 %>%
  mutate(genes = factor(genes, levels = rev(genes))) %>% 
  ggplot(aes(x = 1, y = genes, fill = Expression_Ratio)) + 
  geom_tile() +
  scale_fill_gradient(low = "white", high = "purple") +  # Adjust the color scale
  coord_cartesian(expand = FALSE) +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.text.y = element_blank())
dev.off()

```

Since there were too many NAs in the first heatmap, let's generate a new one that has a specific NA count filter.

```{r}
heatmap_data <- dcast(combined, gene_name ~ sample_id, value.var = "mean_XCI_ratio")
rownames(heatmap_data) <- heatmap_data$gene_name  # Set row names to gene names
heatmap_data <- heatmap_data[, -1]  # Remove the gene_name column

# Let's remove B5_12764 and see what happens
heatmap_data <- heatmap_data %>% 
  dplyr::select(-B5_12764)

# Count the number of NAs in each row
na_count <- rowSums(is.na(heatmap_data))

# Create a list to store the heatmaps
heatmap_list <- list()

# Create a function to generate a heatmap for a given threshold
generate_heatmap <- function(threshold) {
    
    # Filter rows with a number of NAs less than or equal to the threshold
    filtered_heatmap_data <- heatmap_data[na_count <= threshold, ]
    
    drows_3 <- dist(filtered_heatmap_data, method = "euclidean")
    # Check that we have no NAs in our distance matrix now!
    which(is.na(drows_3), arr.ind = T)
    
    colors <- colorRampPalette(rev(brewer.pal(11, "RdYlBu")))(100)
    
    annotation_helper <- combined %>% 
      filter(gene_name %in% rownames(filtered_heatmap_data)) %>% 
      arrange(gene_name) %>% 
      dplyr::select(gene_name, mean_exprs_ratio) 
    
    rownames(annotation_helper) <- NULL
    interim <- annotation_helper$gene_name %>% duplicated()
    annotation_helper <- annotation_helper[!interim,]
    
    # double check the order
    summary(rownames(filtered_heatmap_data) == annotation_helper$gene_name)
    
    # Color for annotations
    expression_ratio_palette <- colorRampPalette(c("blue", "white", "red"))(100)
    
    # Annotation dataframe
    annotation_df <- data.frame(Expression_Ratio = log10(annotation_helper$mean_exprs_ratio + 1))
    rownames(annotation_df) <- rownames(filtered_heatmap_data)
    
    p <- pheatmap(
      filtered_heatmap_data,
      clustering_distance_rows = drows_3,  # You can choose a different clustering method if needed
      clustering_distance_cols = "euclidean",  # You can choose a different clustering method if needed
      annotation_row = annotation_df,  # Add the expression ratio annotation
      annotation_colors = list(Expression_Ratio = expression_ratio_palette),
      col = colorRampPalette(rev(brewer.pal(11, "RdYlBu")))(100),  # Color palette
      na_col = "grey",  # Set the color for NA values
      main = "Gene Expression Heatmap",
      clustering_method = "complete",
      fontsize_row = 6,  # Adjust row label size
      fontsize_col = 9,  # Adjust column label size
      cutree_rows = 2
    )
  return(p)
}

# Use lapply to generate the heatmaps for different thresholds
heatmaps <- lapply(thresholds, generate_heatmap)


pdf(file = "Heatmap_22_samples_NA_thresholds(20231019).pdf",
    height = 14,
    width = 11)
generate_heatmap(11)
generate_heatmap(10)
generate_heatmap(9)
generate_heatmap(8)
generate_heatmap(7)
generate_heatmap(6)
generate_heatmap(5)
generate_heatmap(4)
generate_heatmap(3)
generate_heatmap(2)
generate_heatmap(1)
generate_heatmap(0)
  dev.off()

# Create the heatmap using pheatmap
pdf(file = "Heatmap_22_samples_fitered_NA(20231019).pdf",
    height = 15,
    width = 12)
p <- pheatmap(
  filtered_heatmap_data,
  clustering_distance_rows = drows_3,  # You can choose a different clustering method if needed
  clustering_distance_cols = "euclidean",  # You can choose a different clustering method if needed
  annotation_row = annotation_df,  # Add the expression ratio annotation
  annotation_colors = list(Expression_Ratio = expression_ratio_palette),
  col = colorRampPalette(rev(brewer.pal(11, "RdYlBu")))(100),  # Color palette
  na_col = "grey",  # Set the color for NA values
  main = "Gene Expression Heatmap",
  clustering_method = "complete",
  fontsize_row = 4,  # Adjust row label size
  fontsize_col = 9,  # Adjust column label size
  cutree_rows = 2
)
p
dev.off()

object <- generate_heatmap(11)

```


Some major notes from the Supplementary Material of the EGAD publication:
- The neighbor voting method takes a gene network and a set of annotations (gene label vectors) as input, and outputs prediction scores for each of the annotation sets, as well as the performance metric of the network.

```{r}
#list structure
getwd()
h5ls("~/EGAD/human_HC_AggNet.hdf5")

coconet <- h5read("~/EGAD/human_HC_AggNet.hdf5", "/agg")
coconet <- as.data.frame(coconet)

# Check the dimension of the human gene co-expression matrix
dim(coconet)

colnames(coconet) <- h5read("~/EGAD/human_HC_AggNet.hdf5", "/col")
rownames(coconet) <- h5read("~/EGAD/human_HC_AggNet.hdf5", "/col")

# The following is a template of how to use this tool
mapping <- mapIds(org.Hs.eg.db, 
                  colnames(coconet), 
                  'SYMBOL',
                  'ENSEMBL')

mapping %<>% 
  as_tibble(rownames = "ENSEMBL") %>% 
  dplyr::rename(ncbi = value) 

# check for column names
colnames(mapping)

# Remove the NAs
mapping %<>% filter(!is.na(ncbi))

# remove 1 ncbi to many genes (remove the duplicates)
detach(package:plyranges,unload=TRUE)

dup_ids <- mapping %>% 
  group_by(ncbi) %>% 
  dplyr::summarize(counts = n()) %>% 
  dplyr::filter(counts != 1) %>% 
  dplyr::pull(ncbi)

mapping %<>% 
  filter(!(ncbi %in% dup_ids))

#check for 1-many or many-1
mapping %>% 
  dplyr::group_by(ncbi) %>% 
  dplyr::count() %>% 
  filter(n != 1)

mapping %>% 
  group_by(ENSEMBL) %>% 
  dplyr::count() %>% 
  filter(n != 1)

dim(mapping)
```

Let's try to bring out the EGAD RMarkdown file and try to run it in this file!

```{r}
# Call all the XCI skew genes
XCI_genes <- rownames(heatmap_data)

mapping_XCI <- mapIds(org.Hs.eg.db, 
                  XCI_genes, 
                  'ENSEMBL',
                  'SYMBOL')

mapping_XCI %<>% 
  as_tibble(rownames = "gene") %>% 
  dplyr::rename(ENSEMBL = value)

# Remove the NAs
mapping_XCI %<>% filter(!is.na(ENSEMBL))

# Remove duplicates
dup_ids <- mapping_XCI %>% 
  group_by(gene) %>% 
  dplyr::summarize(counts = n()) %>% 
  dplyr::filter(counts != 1) %>% 
  dplyr::pull(gene)

# No duplicates
mapping %<>% 
  filter(!(ncbi %in% dup_ids))

#check for 1-many or many-1
mapping_XCI %>% 
  group_by(ENSEMBL) %>% 
  dplyr::count() %>% 
  filter(n != 1)
mapping_XCI %>% 
  group_by(gene) %>% 
  dplyr::count() %>% 
  filter(n != 1)

```

```{r}
annotations_1 <- tibble(gene = rownames(coconet))
annotations_1 <- annotations_1 %>% 
  mutate(XCI_skew = gene %in% (mapping_XCI %>% pull(ENSEMBL))) %>%
  mutate(XCI_skew = as.numeric(XCI_skew)) 
head(annotations_1)

annotations_1 <- annotations_1 %>% 
  as.data.frame()

rownames(annotations_1) <- annotations_1$gene
annotations_1 %<>% 
  dplyr::select(-gene)

# 2.3.5 Building networks
# hist <- plot_distribution(node_degree(coconet), 
#                           b=10,
#                           xlim=c(0,14), 
#                           ylim=c(0,2), 
#                           xlab="Node degree")

# Let's run this on EGAD
testing <- run_GBA(as.matrix(coconet), 
                   as.matrix(annotations_1))
testing[[1]]
```

Let's also test it for the gene list derived based on the NA count filter!

```{r}
NA_filter11_genes <- read.csv("NA_filter11_genes.csv",header = 0) %>% pull(V1)
mapping_XCI_2 <- mapIds(org.Hs.eg.db, 
                  NA_filter11_genes, 
                  'ENSEMBL',
                  'SYMBOL')

mapping_XCI_2 %<>% 
  as_tibble(rownames = "gene") %>% 
  dplyr::rename(ENSEMBL = value)

# Remove the NAs
mapping_XCI_2 %<>% filter(!is.na(ENSEMBL))

#check for 1-many or many-1
mapping_XCI_2 %>% 
  group_by(ENSEMBL) %>% 
  dplyr::count() %>% 
  filter(n != 1)
mapping_XCI_2 %>% 
  group_by(gene) %>% 
  dplyr::count() %>% 
  filter(n != 1)

annotations_2 <- tibble(gene = rownames(coconet))
annotations_2 <- annotations_2 %>% 
  mutate(XCI_skew = gene %in% (mapping_XCI_2 %>% 
                          pull(ENSEMBL))) %>%
  mutate(XCI_skew = as.numeric(XCI_skew)) 
head(annotations_2)

annotations_2 <- annotations_2 %>% 
  as.data.frame()

rownames(annotations_2) <- annotations_2$gene
annotations_2 %<>% 
  dplyr::select(-gene)

# 2.3.5 Building networks
# hist <- plot_distribution(node_degree(coconet), 
#                           b=10,
#                           xlim=c(0,14), 
#                           ylim=c(0,2), 
#                           xlab="Node degree")

# Let's run this on EGAD
testing_2 <- run_GBA(as.matrix(coconet), 
                   as.matrix(annotations_2))

testing_2[[1]]
```

```{r}
noNA_genes <- read.csv("noNA_genes.csv",header = 0) %>% pull(V1)
mapping_XCI_3 <- mapIds(org.Hs.eg.db, 
                  noNA_genes, 
                  'ENSEMBL',
                  'SYMBOL')

mapping_XCI_3 %<>% 
  as_tibble(rownames = "gene") %>% 
  dplyr::rename(ENSEMBL = value)

# Remove the NAs
mapping_XCI_3 %<>% filter(!is.na(ENSEMBL))

#check for 1-many or many-1
mapping_XCI_3 %>% 
  group_by(ENSEMBL) %>% 
  dplyr::count() %>% 
  filter(n != 1)
mapping_XCI_3 %>% 
  group_by(gene) %>% 
  dplyr::count() %>% 
  filter(n != 1)

annotations_3 <- tibble(gene = rownames(coconet))
annotations_3 <- annotations_3 %>% 
  mutate(XCI_skew = gene %in% (mapping_XCI_3 %>% 
                          pull(ENSEMBL))) %>%
  mutate(XCI_skew = as.numeric(XCI_skew)) 
dim(annotations_3)

annotations_3 <- annotations_3 %>% 
  as.data.frame()

rownames(annotations_3) <- annotations_3$gene
annotations_3 %<>% 
  dplyr::select(-gene)

# 2.3.5 Building networks
# hist <- plot_distribution(node_degree(coconet), 
#                           b=10,
#                           xlim=c(0,14), 
#                           ylim=c(0,2), 
#                           xlab="Node degree")

# Let's run this on EGAD
testing_3 <- run_GBA(as.matrix(coconet), 
                   as.matrix(annotations_3), min = 5)


testing_3[[1]]
```

Let's check the co-expression for genes showing high XCI skew from the heatmap when NA count filter = 11

```{r}
NA_filter11_XCI_skew_genes <- read.csv("NA_filter11_XCI_skew_genes.csv",header = 0) %>% pull(V1)
mapping_XCI_4 <- mapIds(org.Hs.eg.db, 
                  NA_filter11_XCI_skew_genes, 
                  'ENSEMBL',
                  'SYMBOL')

mapping_XCI_4 %<>% 
  as_tibble(rownames = "gene") %>% 
  dplyr::rename(ENSEMBL = value)

# Remove the NAs
mapping_XCI_4 %<>% filter(!is.na(ENSEMBL))

#check for 1-many or many-1
mapping_XCI_4 %>% 
  group_by(ENSEMBL) %>% 
  dplyr::count() %>% 
  filter(n != 1)
mapping_XCI_4 %>% 
  group_by(gene) %>% 
  dplyr::count() %>% 
  filter(n != 1)

annotations_4 <- tibble(gene = rownames(coconet))
annotations_4 <- annotations_4 %>% 
  mutate(XCI_skew = gene %in% (mapping_XCI_4 %>% 
                          pull(ENSEMBL))) %>%
  mutate(XCI_skew = as.numeric(XCI_skew)) 
dim(annotations_4)

annotations_4 <- annotations_4 %>% 
  as.data.frame()

rownames(annotations_4) <- annotations_4$gene
annotations_4 %<>% 
  dplyr::select(-gene)

# 2.3.5 Building networks
# hist <- plot_distribution(node_degree(coconet), 
#                           b=10,
#                           xlim=c(0,14), 
#                           ylim=c(0,2), 
#                           xlab="Node degree")

# Let's run this on EGAD
testing_4 <- run_GBA(as.matrix(coconet), 
                   as.matrix(annotations_4), min = 5)

testing_4[[1]]
```

Let's also compare when we set the read count filter as 5 (NA counts <= 6)

```{r}
readfilter5_NA_filter6_XCI_skew_genes <- read.csv("readfilter5_NA_filter6_XCI_skew_genes.csv",header = 0) %>% pull(V1)
mapping_XCI_5 <- mapIds(org.Hs.eg.db, 
                  readfilter5_NA_filter6_XCI_skew_genes, 
                  'ENSEMBL',
                  'SYMBOL')

mapping_XCI_5 %<>% 
  as_tibble(rownames = "gene") %>% 
  dplyr::rename(ENSEMBL = value)

# Remove the NAs
mapping_XCI_5 %<>% filter(!is.na(ENSEMBL))

#check for 1-many or many-1
mapping_XCI_5 %>% 
  group_by(ENSEMBL) %>% 
  dplyr::count() %>% 
  filter(n != 1)
mapping_XCI_5 %>% 
  group_by(gene) %>% 
  dplyr::count() %>% 
  filter(n != 1)

annotations_5 <- tibble(gene = rownames(coconet))
annotations_5 <- annotations_5 %>% 
  mutate(XCI_skew = gene %in% (mapping_XCI_5 %>% 
                          pull(ENSEMBL))) %>%
  mutate(XCI_skew = as.numeric(XCI_skew)) 
dim(annotations_5)

annotations_5 <- annotations_5 %>% 
  as.data.frame()

rownames(annotations_5) <- annotations_5$gene
annotations_5 %<>% 
  dplyr::select(-gene)

# 2.3.5 Building networks
# hist <- plot_distribution(node_degree(coconet), 
#                           b=10,
#                           xlim=c(0,14), 
#                           ylim=c(0,2), 
#                           xlab="Node degree")

# Let's run this on EGAD
testing_5 <- run_GBA(as.matrix(coconet), 
                   as.matrix(annotations_5), min = 5)
?run_GBA
testing_5[[1]]
```

As a last point of reference, let's perform EGAD on all genes in the heatmap for read count filter = 5.

```{r}
readfilter5_noNA_all_genes <- read.csv("readfilter5_noNA_all_genes.csv",header = 0) %>% pull(V1)
mapping_XCI_6 <- mapIds(org.Hs.eg.db, 
                  readfilter5_noNA_all_genes, 
                  'ENSEMBL',
                  'SYMBOL')

mapping_XCI_6 %<>% 
  as_tibble(rownames = "gene") %>% 
  dplyr::rename(ENSEMBL = value)

# Remove the NAs
mapping_XCI_6 %<>% filter(!is.na(ENSEMBL))

#check for 1-many or many-1
mapping_XCI_6 %>% 
  group_by(ENSEMBL) %>% 
  dplyr::count() %>% 
  filter(n != 1)
mapping_XCI_6 %>% 
  group_by(gene) %>% 
  dplyr::count() %>% 
  filter(n != 1)

annotations_6 <- tibble(gene = rownames(coconet))
annotations_6 <- annotations_6 %>% 
  mutate(XCI_skew = gene %in% (mapping_XCI_6 %>% 
                          pull(ENSEMBL))) %>%
  mutate(XCI_skew = as.numeric(XCI_skew)) 
dim(annotations_6)

annotations_6 <- annotations_6 %>% 
  as.data.frame()

rownames(annotations_6) <- annotations_6$gene
annotations_6 %<>% 
  dplyr::select(-gene)

# 2.3.5 Building networks
# hist <- plot_distribution(node_degree(coconet), 
#                           b=10,
#                           xlim=c(0,14), 
#                           ylim=c(0,2), 
#                           xlab="Node degree")

# Let's run this on EGAD
testing_6 <- run_GBA(as.matrix(coconet), 
                   as.matrix(annotations_6), min = 5)

testing_6[[1]]
```

Compare the expression values of these genes from the count matrix!

```{r}
d1 <- readRDS("/home/youkim/Bulk_RNA_seq/STAR_output_analysis/After_QC/Pilot_Data/count_matrix_Pilot_Data_AfterQC.rds")
d2 <- readRDS("/home/youkim/Bulk_RNA_seq/STAR_output_analysis/After_QC/Second_Data/count_matrix_Second_Data_AfterQC.rds")
d3 <- readRDS("/home/youkim/Bulk_RNA_seq/STAR_output_analysis/Third_data/count_matrix_20231016.rds")

merged_exprs <- merge(d1, d2, key = gene)
merged_exprs <- merge(merged_exprs, d3, key = gene)
merged_exprs <- merged_exprs %>% 
  dplyr::select(-contains("12788")) %>% 
  dplyr::select(-contains("12919")) %>% 
  dplyr::select(-contains("12980")) %>% 
  dplyr::select(-contains("12944")) %>% 
  # the following code is to get only the EGFR+/EGFR- samples (like we did for the Third data)
  dplyr::select(-contains("L")) %>% 
  dplyr::select(-contains("ns"))
merged_exprs <- merged_exprs %>% 
  column_to_rownames("gene")

merged_exprs %>% 
  rowSums() %>% 
  mean()

# Let's check for the expression from mapping_XCI_4
merged_exprs[mapping_XCI_4$ENSEMBL,] %>% 
  rowSums() %>% 
  mean()

# Let's check for the expression from mapping_XCI_5
merged_exprs[mapping_XCI_5$ENSEMBL,] %>% 
  rowSums() %>% 
  mean()
```