---
title: "Co-alleli= Network Analysis (2024-03-04)"
author: "Young June Kim"
date: "3/4/2024"
output: html_document
---

## Loading Relevant Data

Dr. Jonathan Werner has already prepared the required data in separate files. These 2 are the starting data for making the co-allelic network.
Our first step will be to examine the files and plan how to use them.

```{r co-allelic network ingredients}
# Load the list of all filtered vcfs for the GTEx data.
# Contains the XCI ratios of genes per sample
load("/data/jwerner/for_youngjune/all_v8_GATK_1_20_2021_update_filt_escape.skew.est.max.genes.Rdata")

# Check one entry in both lists
list.skew.max[[1]]
ratios.max[[1]]

# This is a dataframe with metadata for all the GTEx samples, contains the computed XCI ratio, tissue name, donor name, and a bunch of other information. 
load("/data/jwerner/for_youngjune/human_skew_and_stats_autoFilt_df_7_12_23.Rdata")

# Check the number of tissues!
# human_skew_and_stats_df %>% 
#   filter(grepl("Brain", .$tissue)) %>% 
#   pull(sample_index) %>% 
#   head()
  # group_by(tissue) %>% 
  # summarize(count = dplyr::n()) %>% 

human_skew_and_stats_df %>% 
  group_by(tissue) %>% 
  # filter(grepl("Heart", .$tissue)) %>% 
  summarize(count = dplyr::n()) %>% 
  pull(tissue)

donor_mean_xci <- human_skew_and_stats_df %>% 
  group_by(donor) %>% 
  # filter(grepl("Heart", .$tissue)) %>% 
  summarize(count = dplyr::n(), mean_xci = mean(skew))

at_least_ten <- human_skew_and_stats_df %>% 
  group_by(donor) %>% 
  summarize(count = dplyr::n(), mean_xci = mean(skew)) %>% 
  filter(count > 10) %>% 
  pull(donor)
  
# Read the meta-markers file
germ_layer_markers <- read.csv("/home/youkim/XCI/Co-allelic_network_analysis/Germ_layer_level_analysis/germ_layer_specific_markers.csv")
germ_layer_markers %>% 
  group_by(germlayer_marker) %>% 
  summarize(count = dplyr::n())
```

Now we check for the networks that Jonathan generated.

```{r eval = F}
# This was made by first computing the deviation of each gene's XCI ratio from the estimated sample XCI ratio using the two files above. 
# I only used samples with at least 10 SNPs and genes detected in at least 10 samples
load("/data/jwerner/for_youngjune/allelic_correlation_network.Rdata")
allelic_corrs[1:10,1:10]

# expression matrices for the GTEx data
load("/data/jwerner/for_youngjune/raw_exp_matrix.Rdata")
dim(all_exp_data)

# This is the rank co-expression network for GTEx. 
load("/data/jwerner/for_youngjune/coexp_rank_network.Rdata")
gtex_coexp_all[1:10,1:10]
dim(gtex_coexp_all)
```

I think the first step would be trying to apply WGCNA!
Not really. We would first like to see how many of these germ layer markers are seen in the X chromosome.

```{r}
# Load libraries
# install.packages("kader")
library(tidyverse)
library(tibble)
library(ggsignif)
library(corrplot)
library(kader)
# install.packages("ggcorrplot")
library(ggcorrplot)

# Check for XCI escape genes
XCI_chromosome_file <- read.csv("~/Bulk_RNA_seq/XCI_analysis/Pilot_Data/XCI_escape_genes.csv")
XCI_chromosome_genes <- XCI_chromosome_file[,1:7] 
colnames(XCI_chromosome_genes) <- XCI_chromosome_genes[1, ]

# Remove the XCI_chromosome file object at this point
rm(XCI_chromosome_file)

XCI_chromosome_genes <- XCI_chromosome_genes[2:nrow(XCI_chromosome_genes),]

XCI_chromosome_genes$`Gene name` %>% head()

XCI_genes <- XCI_chromosome_genes %>% 
  filter(`Combined XCI status` == "inactive")
XCI_escape_genes <- XCI_chromosome_genes %>% 
  filter(`Combined XCI status` != "inactive")

mesoderm_genes <- germ_layer_markers[(germ_layer_markers$name %in% XCI_genes$`Gene name`),] %>% 
  filter(germlayer_marker == "Mesoderm") %>% 
  pull(name) %>% 
  unique()
endoderm_genes <- germ_layer_markers[(germ_layer_markers$name %in% XCI_genes$`Gene name`),] %>% 
  filter(germlayer_marker == "Endoderm") %>% 
  pull(name) %>% 
  unique()
ectoderm_genes <- germ_layer_markers[(germ_layer_markers$name %in% XCI_genes$`Gene name`),] %>% 
  filter(germlayer_marker == "Ectoderm") %>% 
  pull(name) %>% 
  unique()

intersect(mesoderm_genes, ectoderm_genes)
intersect(endoderm_genes, ectoderm_genes)
intersect(mesoderm_genes, endoderm_genes)
```

# Co-expression network analysis

```{r}
without_decimals <- rownames(all_exp_data) %>% substr(1,15) %>% unique()
human_skew_and_stats_df
all_exp_data[1:5, 1:10]
library(GenomicFeatures)

gtf_file <- "/home/youkim/Genome_indexing/Homo_sapiens.GRCh38.109.gtf"
txdb <- makeTxDbFromGFF(gtf_file, format = "gtf")

#BiocManager::install("BSgenome.Hsapiens.UCSC.hg38")
library(BSgenome.Hsapiens.UCSC.hg38)
chromosome_name <- "X"

#Extract the Y chromosome Ensembl IDs
genes_on_X <- genes(txdb, filter = list(tx_chrom = "X"))
ensembl_gene_ids <- genes_on_X@ranges@NAMES

Xchr_genes <- ensembl_gene_ids[(ensembl_gene_ids %in% without_decimals)]
exp_subdata <- all_exp_data[Xchr_genes,1:ncol(all_exp_data)]

# Still running into some problems with the rownames
```


# Generate the network specific for the adrenal gland

The adrenal gland is unique since it has two distinct developmental germ layers involved in its developmental process. The cortex of the adrenal gland is derived from mesoderm, whereas the medulla is derived from the neural crest, which is of ectodermal origin.

That is, it seems that this structure is quite analagous to the CGE vs MGE distinction.
I also previously generated correlations for the lung, checking for mesoderm genes.
That also reported very weak correlations, but I suspect that the non-germ layer genes are not robust. This analysis using a 2-germ layer origin organ would be more informative.

```{r}
# Check the indices of the adrenal gland
ac_indices <- human_skew_and_stats_df %>% 
  filter(grepl("Adrenal Gland", .$tissue)) %>% 
  pull(sample_index)
```

```{r}
# Define the folded() function
folded <- function(x) { apply( cbind(x,1-x), 1, max)} 

# Derive a tibble showing the number of SNPs for each sample
tibble <- data_frame()
for (i in ac_indices){
  a <- list.skew.max[[i]]
  b <- nrow(a)
  col_name <- paste0("sample_", i)
  ifelse(nrow(tibble) == 0, 
         tibble <- tibble(!!col_name := b),
         tibble <- tibble %>%
           mutate(!!col_name := b))
}

# Vector of sample indices with insufficient SNPs
samples_to_remove <- t(tibble) %>% 
  data.frame(SNP_counts = .) %>% 
  filter(SNP_counts < 10) %>% 
  rownames() %>% 
  substr(8,11)

# Filter the samples to include for this analysis
samples_to_include <- ac_indices[!(ac_indices %in% samples_to_remove)]

tibble_2 <- data_frame()
for(i in samples_to_include){
  a <- list.skew.max[[i]]
  b <- human_skew_and_stats_df %>% 
    filter(sample_index == i) %>% 
    pull(donor)
  c <- donor_mean_xci %>% 
    filter(donor == b) %>% 
    pull(mean_xci)
  folded_XCI_ratio_of_sample <- a %>% 
    mutate(folded_XCI_ratio = (a$G %>% folded())) %>% 
    mutate(dev = folded_XCI_ratio - c) %>% 
    dplyr::select(name, folded_XCI_ratio, dev, `test3.mask[b3.mask == 2, 1][allele.f]`)
  col_name <- paste0("sample_", i)
  df <- tibble(name = folded_XCI_ratio_of_sample$name,
                   !!col_name := folded_XCI_ratio_of_sample$dev)
  df <- data.frame(df)
  ifelse(nrow(tibble_2) == 0, 
         tibble_2 <- df,
         tibble_2 <- full_join(tibble_2, df, by = "name"))
}

dim(tibble_2)

# Remove the genes that show up in less than 10 samples
for_sp <- tibble_2 %>% 
  column_to_rownames("name")
for_sp <- for_sp[((80 - ((for_sp %>% is.na()) %>% rowSums())) >= 10),]

# for_sp <- for_sp[!(rownames(for_sp) %in% XCI_escape_genes$`Gene name`),]

# Now generate the correlation matrix by setting the NAs as zero
for_sp[is.na(for_sp)] <- 0
cors <- for_sp %>% 
  t() %>% 
  cor(method = "spearman")

pdf(file = "Adrenal_Cortex_subtract_donor_mean_xci_20240328.pdf", 
    width = 10, 
    height = 10)
cors %>% pheatmap(fontsize_row = 2,
                  fontsize_col = 2)
diag(cors) <- 0
cors %>% pheatmap(fontsize_row = 2,
                  fontsize_col = 2)
dev.off()

new_names <- rownames(cors)[!(rownames(cors) %in% XCI_escape_genes$`Gene name`)]
escape_removed_cors <- cors[new_names,new_names]

escape_removed_cors %>% 
  pheatmap(fontsize_row = 2,
           fontsize_col = 2)
# write_csv(as.data.frame(cors), "brain_GTEX_gene_sample_df.csv")
```