--- title: "XCI_Escape_Removal (20231018).Rmd" output: html_notebook --- For the first step for removing the XCI escape genes, we need to annotate our variants. To make things easier, Jonathan's code (as seen in the "Plotting VCF" file) filters out the SNPs that match our criteria. These SNPs are biallelic - which serves our purpose. Now we start with loading required libraries. ```{r} # Load libraries library(magrittr) library(readr) library(ggplot2) library(tibble) library(tidyr) library(dplyr) library(readr) library(rtracklayer) library(stringr) library(plyranges) library(GenomicFeatures) library(plyranges) library(cowplot) library(Seurat) library(SeuratDisk) library(pheatmap) library(reshape2) library(RColorBrewer) library(rhdf5) library(matrixStats) library(EGAD) library(GO.db) library(org.Hs.eg.db) # installation # BiocManager::install("plyranges") ``` Let's regenerate the combined dataframe that includes the XCI skew information. ```{r} # Let's do this analysis for both Pilot and Second data merged_df_1 <- read.csv("/home/youkim/Bulk_RNA_seq/XCI_analysis/Pilot_Data/Merged_analysis/20230920_XCI_genes(Pilot_Data).csv") merged_df_2 <- read.csv("/home/youkim/Bulk_RNA_seq/XCI_analysis/Second_Data/Merged_analysis_After_QC/20230921_XCI_genes(Second_Data).csv") merged_df <- read.csv("/home/youkim/Bulk_RNA_seq/XCI_analysis/Third_Data/XCI_genes(Third_Data).csv") merged_df <- merged_df %>% dplyr::select(-X.1) combined <- rbind(merged_df_1, merged_df_2) %>% dplyr::select(-X.1) # merge it with the third data combined <- rbind(combined, merged_df) ``` Now let's see if we could generate a heatmap that could encompass all 22 samples based on their genes ```{r} # Pivot the dataframe into a genes-by-samples matrix heatmap_data <- dcast(combined, gene_name ~ sample_id, value.var = "mean_XCI_ratio") rownames(heatmap_data) <- heatmap_data$gene_name # Set row names to gene names heatmap_data <- heatmap_data[, -1] # Remove the gene_name column # Let's remove B5_12764 and see what happens heatmap_data <- heatmap_data %>% dplyr::select(-B5_12764) # heatmap_data[is.na(heatmap_data)] <- "grey" drows_1 <- dist(heatmap_data, method = "euclidean") drows_1[is.na(drows_1)] <- max(drows_1,na.rm=T) which(is.na(drows_1), arr.ind = T) pheatmap(drows_1) # heatmap_data_noNA <- heatmap_data %>% # filter(rowSums(is.na(heatmap_data)) == 0) # heatmap_matrix <- as.matrix(heatmap_data) # # giveNAs = which(is.na(as.matrix(dist(heatmap_data))),arr.ind=TRUE) # giveNAs # # tab = sort(table(c(giveNAs)),decreasing=TRUE) # checkNA = sapply(1:length(tab),function(i){ # sum(is.na(as.matrix(dist(heatmap_data[-as.numeric(names(tab[1:i])),])))) # }) # rmv = names(tab)[1:min(which(checkNA==0))] # # heatmap_data = heatmap_data[-as.numeric(rmv),] # Define a color palette for the heatmap colors <- colorRampPalette(rev(brewer.pal(11, "RdYlBu")))(100) annotation_helper_1 <- combined %>% filter(gene_name %in% rownames(heatmap_data)) %>% arrange(gene_name) %>% dplyr::select(gene_name, mean_exprs_ratio) rownames(annotation_helper_1) <- NULL interim <- annotation_helper_1$gene_name %>% duplicated() annotation_helper_1 <- annotation_helper_1[!interim,] # double check the order summary(rownames(heatmap_data) == annotation_helper_1$gene_name) # Color for annotations expression_ratio_palette <- colorRampPalette(c("blue", "white", "red"))(100) # Annotation dataframe annotation_df_1 <- data.frame(Expression_Ratio = log10(annotation_helper_1$mean_exprs_ratio + 1)) rownames(annotation_df_1) <- rownames(heatmap_data) # Create the heatmap using pheatmap pdf(file = "Heatmap_22_samples(20231018).pdf", height = 12, width = 15) p <- pheatmap( heatmap_data, clustering_distance_rows = drows_1, # You can choose a different clustering method if needed clustering_distance_cols = "euclidean", # You can choose a different clustering method if needed annotation_row = annotation_df_1, # Add the expression ratio annotation annotation_colors = list(Expression_Ratio = expression_ratio_palette), col = colorRampPalette(rev(brewer.pal(11, "RdYlBu")))(100), # Color palette na_col = "grey", # Set the color for NA values main = "Gene Expression Heatmap", clustering_method = "complete", fontsize_row = 3, # Adjust row label size fontsize_col = 9 # Adjust column label size ) p dev.off() row_ordering_1 <- p$tree_row gene_list_1 <- row_ordering_1$labels[row_ordering_1$order] annotation_df_1 <- rownames_to_column(annotation_df_1, "genes") tibble_1 <- tibble(genes = gene_list_1) tibble_1 <- inner_join(tibble_1, annotation_df_1) pdf(file = "Heatmap_Annotation_filter5(20230929).pdf", height = 15, width = 3) tibble_1 %>% mutate(genes = factor(genes, levels = rev(genes))) %>% ggplot(aes(x = 1, y = genes, fill = Expression_Ratio)) + geom_tile() + scale_fill_gradient(low = "white", high = "purple") + # Adjust the color scale coord_cartesian(expand = FALSE) + theme_bw() + theme(axis.text.x = element_blank(), axis.text.y = element_blank()) dev.off() ``` Since there were too many NAs in the first heatmap, let's generate a new one that has a specific NA count filter. ```{r} heatmap_data <- dcast(combined, gene_name ~ sample_id, value.var = "mean_XCI_ratio") rownames(heatmap_data) <- heatmap_data$gene_name # Set row names to gene names heatmap_data <- heatmap_data[, -1] # Remove the gene_name column # Let's remove B5_12764 and see what happens heatmap_data <- heatmap_data %>% dplyr::select(-B5_12764) # Count the number of NAs in each row na_count <- rowSums(is.na(heatmap_data)) # Create a list to store the heatmaps heatmap_list <- list() # Create a function to generate a heatmap for a given threshold generate_heatmap <- function(threshold) { # Filter rows with a number of NAs less than or equal to the threshold filtered_heatmap_data <- heatmap_data[na_count <= threshold, ] drows_3 <- dist(filtered_heatmap_data, method = "euclidean") # Check that we have no NAs in our distance matrix now! which(is.na(drows_3), arr.ind = T) colors <- colorRampPalette(rev(brewer.pal(11, "RdYlBu")))(100) annotation_helper <- combined %>% filter(gene_name %in% rownames(filtered_heatmap_data)) %>% arrange(gene_name) %>% dplyr::select(gene_name, mean_exprs_ratio) rownames(annotation_helper) <- NULL interim <- annotation_helper$gene_name %>% duplicated() annotation_helper <- annotation_helper[!interim,] # double check the order summary(rownames(filtered_heatmap_data) == annotation_helper$gene_name) # Color for annotations expression_ratio_palette <- colorRampPalette(c("blue", "white", "red"))(100) # Annotation dataframe annotation_df <- data.frame(Expression_Ratio = log10(annotation_helper$mean_exprs_ratio + 1)) rownames(annotation_df) <- rownames(filtered_heatmap_data) p <- pheatmap( filtered_heatmap_data, clustering_distance_rows = drows_3, # You can choose a different clustering method if needed clustering_distance_cols = "euclidean", # You can choose a different clustering method if needed annotation_row = annotation_df, # Add the expression ratio annotation annotation_colors = list(Expression_Ratio = expression_ratio_palette), col = colorRampPalette(rev(brewer.pal(11, "RdYlBu")))(100), # Color palette na_col = "grey", # Set the color for NA values main = "Gene Expression Heatmap", clustering_method = "complete", fontsize_row = 6, # Adjust row label size fontsize_col = 9, # Adjust column label size cutree_rows = 2 ) return(p) } # Use lapply to generate the heatmaps for different thresholds heatmaps <- lapply(thresholds, generate_heatmap) pdf(file = "Heatmap_22_samples_NA_thresholds(20231019).pdf", height = 14, width = 11) generate_heatmap(11) generate_heatmap(10) generate_heatmap(9) generate_heatmap(8) generate_heatmap(7) generate_heatmap(6) generate_heatmap(5) generate_heatmap(4) generate_heatmap(3) generate_heatmap(2) generate_heatmap(1) generate_heatmap(0) dev.off() # Create the heatmap using pheatmap pdf(file = "Heatmap_22_samples_fitered_NA(20231019).pdf", height = 15, width = 12) p <- pheatmap( filtered_heatmap_data, clustering_distance_rows = drows_3, # You can choose a different clustering method if needed clustering_distance_cols = "euclidean", # You can choose a different clustering method if needed annotation_row = annotation_df, # Add the expression ratio annotation annotation_colors = list(Expression_Ratio = expression_ratio_palette), col = colorRampPalette(rev(brewer.pal(11, "RdYlBu")))(100), # Color palette na_col = "grey", # Set the color for NA values main = "Gene Expression Heatmap", clustering_method = "complete", fontsize_row = 4, # Adjust row label size fontsize_col = 9, # Adjust column label size cutree_rows = 2 ) p dev.off() object <- generate_heatmap(11) ``` Some major notes from the Supplementary Material of the EGAD publication: - The neighbor voting method takes a gene network and a set of annotations (gene label vectors) as input, and outputs prediction scores for each of the annotation sets, as well as the performance metric of the network. ```{r} #list structure getwd() h5ls("~/EGAD/human_HC_AggNet.hdf5") coconet <- h5read("~/EGAD/human_HC_AggNet.hdf5", "/agg") coconet <- as.data.frame(coconet) # Check the dimension of the human gene co-expression matrix dim(coconet) colnames(coconet) <- h5read("~/EGAD/human_HC_AggNet.hdf5", "/col") rownames(coconet) <- h5read("~/EGAD/human_HC_AggNet.hdf5", "/col") # The following is a template of how to use this tool mapping <- mapIds(org.Hs.eg.db, colnames(coconet), 'SYMBOL', 'ENSEMBL') mapping %<>% as_tibble(rownames = "ENSEMBL") %>% dplyr::rename(ncbi = value) # check for column names colnames(mapping) # Remove the NAs mapping %<>% filter(!is.na(ncbi)) # remove 1 ncbi to many genes (remove the duplicates) detach(package:plyranges,unload=TRUE) dup_ids <- mapping %>% group_by(ncbi) %>% dplyr::summarize(counts = n()) %>% dplyr::filter(counts != 1) %>% dplyr::pull(ncbi) mapping %<>% filter(!(ncbi %in% dup_ids)) #check for 1-many or many-1 mapping %>% dplyr::group_by(ncbi) %>% dplyr::count() %>% filter(n != 1) mapping %>% group_by(ENSEMBL) %>% dplyr::count() %>% filter(n != 1) dim(mapping) ``` Let's try to bring out the EGAD RMarkdown file and try to run it in this file! ```{r} # Call all the XCI skew genes XCI_genes <- rownames(heatmap_data) mapping_XCI <- mapIds(org.Hs.eg.db, XCI_genes, 'ENSEMBL', 'SYMBOL') mapping_XCI %<>% as_tibble(rownames = "gene") %>% dplyr::rename(ENSEMBL = value) # Remove the NAs mapping_XCI %<>% filter(!is.na(ENSEMBL)) # Remove duplicates dup_ids <- mapping_XCI %>% group_by(gene) %>% dplyr::summarize(counts = n()) %>% dplyr::filter(counts != 1) %>% dplyr::pull(gene) # No duplicates mapping %<>% filter(!(ncbi %in% dup_ids)) #check for 1-many or many-1 mapping_XCI %>% group_by(ENSEMBL) %>% dplyr::count() %>% filter(n != 1) mapping_XCI %>% group_by(gene) %>% dplyr::count() %>% filter(n != 1) ``` ```{r} annotations_1 <- tibble(gene = rownames(coconet)) annotations_1 <- annotations_1 %>% mutate(XCI_skew = gene %in% (mapping_XCI %>% pull(ENSEMBL))) %>% mutate(XCI_skew = as.numeric(XCI_skew)) head(annotations_1) annotations_1 <- annotations_1 %>% as.data.frame() rownames(annotations_1) <- annotations_1$gene annotations_1 %<>% dplyr::select(-gene) # 2.3.5 Building networks # hist <- plot_distribution(node_degree(coconet), # b=10, # xlim=c(0,14), # ylim=c(0,2), # xlab="Node degree") # Let's run this on EGAD testing <- run_GBA(as.matrix(coconet), as.matrix(annotations_1)) testing[[1]] ``` Let's also test it for the gene list derived based on the NA count filter! ```{r} NA_filter11_genes <- read.csv("NA_filter11_genes.csv",header = 0) %>% pull(V1) mapping_XCI_2 <- mapIds(org.Hs.eg.db, NA_filter11_genes, 'ENSEMBL', 'SYMBOL') mapping_XCI_2 %<>% as_tibble(rownames = "gene") %>% dplyr::rename(ENSEMBL = value) # Remove the NAs mapping_XCI_2 %<>% filter(!is.na(ENSEMBL)) #check for 1-many or many-1 mapping_XCI_2 %>% group_by(ENSEMBL) %>% dplyr::count() %>% filter(n != 1) mapping_XCI_2 %>% group_by(gene) %>% dplyr::count() %>% filter(n != 1) annotations_2 <- tibble(gene = rownames(coconet)) annotations_2 <- annotations_2 %>% mutate(XCI_skew = gene %in% (mapping_XCI_2 %>% pull(ENSEMBL))) %>% mutate(XCI_skew = as.numeric(XCI_skew)) head(annotations_2) annotations_2 <- annotations_2 %>% as.data.frame() rownames(annotations_2) <- annotations_2$gene annotations_2 %<>% dplyr::select(-gene) # 2.3.5 Building networks # hist <- plot_distribution(node_degree(coconet), # b=10, # xlim=c(0,14), # ylim=c(0,2), # xlab="Node degree") # Let's run this on EGAD testing_2 <- run_GBA(as.matrix(coconet), as.matrix(annotations_2)) testing_2[[1]] ``` ```{r} noNA_genes <- read.csv("noNA_genes.csv",header = 0) %>% pull(V1) mapping_XCI_3 <- mapIds(org.Hs.eg.db, noNA_genes, 'ENSEMBL', 'SYMBOL') mapping_XCI_3 %<>% as_tibble(rownames = "gene") %>% dplyr::rename(ENSEMBL = value) # Remove the NAs mapping_XCI_3 %<>% filter(!is.na(ENSEMBL)) #check for 1-many or many-1 mapping_XCI_3 %>% group_by(ENSEMBL) %>% dplyr::count() %>% filter(n != 1) mapping_XCI_3 %>% group_by(gene) %>% dplyr::count() %>% filter(n != 1) annotations_3 <- tibble(gene = rownames(coconet)) annotations_3 <- annotations_3 %>% mutate(XCI_skew = gene %in% (mapping_XCI_3 %>% pull(ENSEMBL))) %>% mutate(XCI_skew = as.numeric(XCI_skew)) dim(annotations_3) annotations_3 <- annotations_3 %>% as.data.frame() rownames(annotations_3) <- annotations_3$gene annotations_3 %<>% dplyr::select(-gene) # 2.3.5 Building networks # hist <- plot_distribution(node_degree(coconet), # b=10, # xlim=c(0,14), # ylim=c(0,2), # xlab="Node degree") # Let's run this on EGAD testing_3 <- run_GBA(as.matrix(coconet), as.matrix(annotations_3), min = 5) testing_3[[1]] ``` Let's check the co-expression for genes showing high XCI skew from the heatmap when NA count filter = 11 ```{r} NA_filter11_XCI_skew_genes <- read.csv("NA_filter11_XCI_skew_genes.csv",header = 0) %>% pull(V1) mapping_XCI_4 <- mapIds(org.Hs.eg.db, NA_filter11_XCI_skew_genes, 'ENSEMBL', 'SYMBOL') mapping_XCI_4 %<>% as_tibble(rownames = "gene") %>% dplyr::rename(ENSEMBL = value) # Remove the NAs mapping_XCI_4 %<>% filter(!is.na(ENSEMBL)) #check for 1-many or many-1 mapping_XCI_4 %>% group_by(ENSEMBL) %>% dplyr::count() %>% filter(n != 1) mapping_XCI_4 %>% group_by(gene) %>% dplyr::count() %>% filter(n != 1) annotations_4 <- tibble(gene = rownames(coconet)) annotations_4 <- annotations_4 %>% mutate(XCI_skew = gene %in% (mapping_XCI_4 %>% pull(ENSEMBL))) %>% mutate(XCI_skew = as.numeric(XCI_skew)) dim(annotations_4) annotations_4 <- annotations_4 %>% as.data.frame() rownames(annotations_4) <- annotations_4$gene annotations_4 %<>% dplyr::select(-gene) # 2.3.5 Building networks # hist <- plot_distribution(node_degree(coconet), # b=10, # xlim=c(0,14), # ylim=c(0,2), # xlab="Node degree") # Let's run this on EGAD testing_4 <- run_GBA(as.matrix(coconet), as.matrix(annotations_4), min = 5) testing_4[[1]] ``` Let's also compare when we set the read count filter as 5 (NA counts <= 6) ```{r} readfilter5_NA_filter6_XCI_skew_genes <- read.csv("readfilter5_NA_filter6_XCI_skew_genes.csv",header = 0) %>% pull(V1) mapping_XCI_5 <- mapIds(org.Hs.eg.db, readfilter5_NA_filter6_XCI_skew_genes, 'ENSEMBL', 'SYMBOL') mapping_XCI_5 %<>% as_tibble(rownames = "gene") %>% dplyr::rename(ENSEMBL = value) # Remove the NAs mapping_XCI_5 %<>% filter(!is.na(ENSEMBL)) #check for 1-many or many-1 mapping_XCI_5 %>% group_by(ENSEMBL) %>% dplyr::count() %>% filter(n != 1) mapping_XCI_5 %>% group_by(gene) %>% dplyr::count() %>% filter(n != 1) annotations_5 <- tibble(gene = rownames(coconet)) annotations_5 <- annotations_5 %>% mutate(XCI_skew = gene %in% (mapping_XCI_5 %>% pull(ENSEMBL))) %>% mutate(XCI_skew = as.numeric(XCI_skew)) dim(annotations_5) annotations_5 <- annotations_5 %>% as.data.frame() rownames(annotations_5) <- annotations_5$gene annotations_5 %<>% dplyr::select(-gene) # 2.3.5 Building networks # hist <- plot_distribution(node_degree(coconet), # b=10, # xlim=c(0,14), # ylim=c(0,2), # xlab="Node degree") # Let's run this on EGAD testing_5 <- run_GBA(as.matrix(coconet), as.matrix(annotations_5), min = 5) ?run_GBA testing_5[[1]] ``` As a last point of reference, let's perform EGAD on all genes in the heatmap for read count filter = 5. ```{r} readfilter5_noNA_all_genes <- read.csv("readfilter5_noNA_all_genes.csv",header = 0) %>% pull(V1) mapping_XCI_6 <- mapIds(org.Hs.eg.db, readfilter5_noNA_all_genes, 'ENSEMBL', 'SYMBOL') mapping_XCI_6 %<>% as_tibble(rownames = "gene") %>% dplyr::rename(ENSEMBL = value) # Remove the NAs mapping_XCI_6 %<>% filter(!is.na(ENSEMBL)) #check for 1-many or many-1 mapping_XCI_6 %>% group_by(ENSEMBL) %>% dplyr::count() %>% filter(n != 1) mapping_XCI_6 %>% group_by(gene) %>% dplyr::count() %>% filter(n != 1) annotations_6 <- tibble(gene = rownames(coconet)) annotations_6 <- annotations_6 %>% mutate(XCI_skew = gene %in% (mapping_XCI_6 %>% pull(ENSEMBL))) %>% mutate(XCI_skew = as.numeric(XCI_skew)) dim(annotations_6) annotations_6 <- annotations_6 %>% as.data.frame() rownames(annotations_6) <- annotations_6$gene annotations_6 %<>% dplyr::select(-gene) # 2.3.5 Building networks # hist <- plot_distribution(node_degree(coconet), # b=10, # xlim=c(0,14), # ylim=c(0,2), # xlab="Node degree") # Let's run this on EGAD testing_6 <- run_GBA(as.matrix(coconet), as.matrix(annotations_6), min = 5) testing_6[[1]] ``` Compare the expression values of these genes from the count matrix! ```{r} d1 <- readRDS("/home/youkim/Bulk_RNA_seq/STAR_output_analysis/After_QC/Pilot_Data/count_matrix_Pilot_Data_AfterQC.rds") d2 <- readRDS("/home/youkim/Bulk_RNA_seq/STAR_output_analysis/After_QC/Second_Data/count_matrix_Second_Data_AfterQC.rds") d3 <- readRDS("/home/youkim/Bulk_RNA_seq/STAR_output_analysis/Third_data/count_matrix_20231016.rds") merged_exprs <- merge(d1, d2, key = gene) merged_exprs <- merge(merged_exprs, d3, key = gene) merged_exprs <- merged_exprs %>% dplyr::select(-contains("12788")) %>% dplyr::select(-contains("12919")) %>% dplyr::select(-contains("12980")) %>% dplyr::select(-contains("12944")) %>% # the following code is to get only the EGFR+/EGFR- samples (like we did for the Third data) dplyr::select(-contains("L")) %>% dplyr::select(-contains("ns")) merged_exprs <- merged_exprs %>% column_to_rownames("gene") merged_exprs %>% rowSums() %>% mean() # Let's check for the expression from mapping_XCI_4 merged_exprs[mapping_XCI_4$ENSEMBL,] %>% rowSums() %>% mean() # Let's check for the expression from mapping_XCI_5 merged_exprs[mapping_XCI_5$ENSEMBL,] %>% rowSums() %>% mean() ```