--- title: "EGAD (4-dataset-metamarkers)" author: "Young-June Kim" date: "2023-02-21" output: html_document --- Load required libraries: ```{r} library(tidyverse) library(rhdf5) library(matrixStats) library('org.Hs.eg.db') library(dplyr) library(magrittr) library(readr) # BiocManager::install("EGAD") library(EGAD) library(MetaMarkers) # BiocManager::install("GO.db") library(GO.db) # install.packages("conflicted") library(conflicted) conflict_prefer("filter", "dplyr") conflict_prefer("count", "dplyr") conflict_prefer("rename", "dplyr") conflict_prefer("select", "dplyr") conflict_prefer("slice", "dplyr") conflict_prefer("desc", "dplyr") ``` Some major notes from the Supplementary Material of the EGAD publication: - The neighbor voting method takes a gene network and a set of annotations (gene label vectors) as input, and outputs prediction scores for each of the annotation sets, as well as the performance metric of the network. ```{r} #list structure getwd() h5ls("human_HC_AggNet.hdf5") coconet <- h5read("human_HC_AggNet.hdf5", "/agg") coconet <- as.data.frame(coconet) dim(coconet) colnames(coconet) <- h5read("human_HC_AggNet.hdf5", "/col") rownames(coconet) <- h5read("human_HC_AggNet.hdf5", "/col") mapping <- mapIds(org.Hs.eg.db, colnames(coconet), 'ENTREZID', 'ENSEMBL') mapping %<>% as_tibble(rownames = "ENSEMBL") %>% dplyr::rename(ncbi = value) # check for column names colnames(mapping) # Remove the NAs mapping %<>% filter(!is.na(ncbi)) # remove 1 ncbi to many genes (remove the duplicates) dup_ids <- mapping %>% dplyr::group_by(ncbi) %>% summarize(n=n()) %>% filter(n != 1) %>% pull(ncbi) mapping %<>% filter(!(ncbi %in% dup_ids)) #check for 1-many or many-1 mapping %>% group_by(ncbi) %>% count() %>% filter(n != 1) mapping %>% group_by(ENSEMBL) %>% count() %>% filter(n != 1) head(mapping) ``` ```{r} # 2.3.3 Getting gene annotation and functional data # 2.3.4 Parsing gene data into gene annotations # 2.3.4.2 Constructing gene annotations meta_markers <- read_meta_markers("meta_markers_(4_datasets).csv.gz") meta_markers_filtered <- meta_markers %>% # Top N highest values by group arrange(desc(auroc)) %>% group_by(cell_type) %>% slice(1:30) meta_markers_VCT <- meta_markers_filtered %>% filter(cell_type == "Cytotrophoblast") dim(meta_markers_VCT) meta_markers_SCT <- meta_markers_filtered %>% filter(cell_type == "Syncytiotrophoblast") dim(meta_markers_SCT) meta_markers_EVT <- meta_markers_filtered %>% filter(cell_type == "Extravillous trophoblast") dim(meta_markers_EVT) ``` ```{r} # write.csv(meta_markers_EVT$gene, "EVT Gene List") mapping_EVT <- mapIds(org.Hs.eg.db, meta_markers_EVT$gene, 'ENSEMBL', 'SYMBOL') head(mapping_EVT) mapping_EVT %<>% as_tibble(rownames = "gene") %>% dplyr::rename(ENSEMBL = value) mapping_EVT %>% group_by(ENSEMBL) %>% count() %>% filter(n != 1) mapping_EVT %<>% dplyr::filter(!is.na(ENSEMBL)) #check for 1-many or many-1 mapping_EVT %>% group_by(ENSEMBL) %>% count() %>% filter(n != 1) mapping_EVT %>% group_by(gene) %>% count() %>% filter(n != 1) meta_markers_EVT %<>% inner_join(mapping_EVT) df1 <- meta_markers_EVT %>% select(ENSEMBL, cell_type) # Let's save the metamarkers ENSEMBL IDs to a file write.csv(df1, "EVT_ENSEMBL_IDs.csv", row.names=FALSE) ``` ```{r} # Repeat the same process for VCT mapping_VCT <- mapIds(org.Hs.eg.db, meta_markers_VCT$gene, 'ENSEMBL', 'SYMBOL') head(mapping_VCT) mapping_VCT %<>% as_tibble(rownames = "gene") %>% dplyr::rename(ENSEMBL = value) mapping_VCT %>% group_by(ENSEMBL) %>% count() %>% filter(n != 1) mapping_VCT %<>% dplyr::filter(!is.na(ENSEMBL)) #check for 1-many or many-1 mapping_VCT %>% group_by(ENSEMBL) %>% count() %>% filter(n != 1) mapping_VCT %>% group_by(gene) %>% count() %>% filter(n != 1) meta_markers_VCT %<>% inner_join(mapping_VCT) df2 <- meta_markers_VCT %>% select(ENSEMBL, cell_type) # Let's save the metamarkers ENSEMBL IDs to a file write.csv(df2, "VCT_ENSEMBL_IDs.csv", row.names=FALSE) ``` ```{r} # Repeat the same for SCT mapping_SCT <- mapIds(org.Hs.eg.db, meta_markers_SCT$gene, 'ENSEMBL', 'SYMBOL') head(mapping_SCT) mapping_SCT %<>% as_tibble(rownames = "gene") %>% dplyr::rename(ENSEMBL = value) mapping_SCT %>% group_by(ENSEMBL) %>% count() %>% filter(n != 1) mapping_SCT %<>% dplyr::filter(!is.na(ENSEMBL)) #check for 1-many or many-1 mapping_SCT %>% group_by(ENSEMBL) %>% count() %>% filter(n != 1) mapping_SCT %>% group_by(gene) %>% count() %>% filter(n != 1) meta_markers_SCT %<>% inner_join(mapping_SCT) df3 <- meta_markers_SCT %>% select(ENSEMBL, cell_type) # Let's save the metamarkers ENSEMBL IDs to a file write.csv(df3, "SCT_ENSEMBL_IDs.csv", row.names=FALSE) ``` ```{r} annotationsPlac <- tibble(gene = rownames(coconet)) annotationsPlac %<>% mutate(EVT = gene %in% (df1 %>% dplyr::filter(cell_type == "Extravillous trophoblast") %>% pull(ENSEMBL))) %>% mutate(EVT = as.numeric(EVT)) %>% mutate(VCT = gene %in% (df2 %>% dplyr::filter(cell_type == "Cytotrophoblast") %>% pull(ENSEMBL))) %>% mutate(VCT = as.numeric(VCT)) %>% mutate(SCT = gene %in% (df3 %>% dplyr::filter(cell_type == "Syncytiotrophoblast") %>% pull(ENSEMBL))) %>% mutate(SCT = as.numeric(SCT)) annotationsPlac <- annotationsPlac %>% as.data.frame() rownames(annotationsPlac) <- annotationsPlac$gene annotationsPlac %<>% select(-gene) # # data("example_annotations") # annotations <- make_annotations(example_annotations$gene2Annot, # example_annotations$genes, # example_annotations$annotationlist) # head(rownames(annotations)) # 2.3.5 Building networks hist <- plot_distribution(node_degree(coconet), b=10, xlim=c(0,14), ylim=c(0,2), xlab="Node degree") # Let's run this on EGAD testing <- run_GBA(as.matrix(coconet), as.matrix(annotationsPlac)) colSums(annotationsPlac) class(annotationsPlac) ``` Extract the AUROCs and compare it with the node degree AUCs ```{r} auc_nv_coexp <- testing[[1]][,1] auc_nv_coexp sum(is.na(auc_nv_coexp)) mean(auc_nv_coexp,na.rm=T) # node degree AUCs auc_nd_coexp <- testing[[1]][,3] mean(auc_nd_coexp,na.rm=T) ``` Recall from our PSL1040 content that most genes (nodes) have a small node degree, and that hubs exist. Hubs are the small handful of genes which have a very large node degree, being connected to many other nodes. ```{r eval = F} # 2.3.5.2 Extending binary networks extended_net <- extend_network(extended_net) class(extended_net) hist <- plot_distribution(node_degree(extended_net), b = 10, xlab = "Node degree",) # 2.3.5.3 Semantic similarity networks using Jaccard semantic_net <- build_semantic_similarity_network(annotations, example_binary_network$genelist) hist <- plot_distribution(node_degree(semantic_net), b = 20, xlab = "Node degree", bars = TRUE) ``` ```{r} # 2.3.6 Running the neighbor voting algorithm (similar code as above) # 2.3.6.1 Neighbor voting gba_auc_nv <- neighbor_voting(as.matrix(annotationsPlac), as.matrix(coconet), nFold = 3, output = "AUROC") head(gba_auc_nv) gba_pr_nv <- neighbor_voting(as.matrix(annotationsPlac), as.matrix(coconet), nFold = 3, output = "PR") head(gba_pr_nv) ``` ```{r} # 2.3.7 Running multifunctionality assessment analytically multifunc_assessment <- calculate_multifunc(annotationsPlac) auc_mf <- auc_multifunc(annotationsPlac, multifunc_assessment[,4]) hist <- plot_distribution(auc_mf, xlab = "AUROC", med = F, avg = F) ``` ```{r eval = F} # 2.3.8 Plotting Results # 2.3.8.1 Scatterplots and smoothed plots X <- runif(1000) Y <- runif(1000) + X*0.3 window <- 50 smoothedxy <- conv_smoother(X, Y, window, xlab = "X", ylab = "Y") # 2.3.8.2 Distributions filt <- !is.na(gba_auc_nv[,1]) aucA <- gba_auc_nv[filt,1] aucB <- gba_auc_nv[filt,3] hist <- plot_distribution(aucA, xlab="AUROCs") avg <- plot_density_compare(aucA, aucB, xlab = "AUROCs") plot_value_compare(aucA, aucB) # 2.3.8.3 ROC plots and overlay plots scores <- predictions(labels, network) roc <- plot_roc(scores, labels) plot_roc_overlay(scores, labels) # Refer back to the guide for specific ROC / PR curves ``` ```{r eval = F} # 3. User Guide # 3.1. Using EGAD to evaluate protein-protein interactions with GO # 3.1.1 Prepare data data("biogrid") data("GO.human") # 3.1.2 Build interaction networks (e.g., gene-by-gene matrix) genelist <- make_genelist(biogrid) gene_network <- make_gene_network(biogrid, genelist) # 3.1.3 Build annotation matrix goterms <- unique(GO.human[,3]) annotations <- make_annotations(GO.human[,c(2,3)], genelist, goterms) # 3.1.4 Run neighbor voting GO_groups_voted <- run_GBA(gene_network,annotations) head(GO_groups_voted) # 3.1.5 Run multifunctionality assessment for the ontology used GO_multifunc_assessment <- calculate_multifunc(annotations) # 3.1.6 Create optimal lists # For genes ord <- order(as.numeric(GO_multifunc_assessment[,2]), decreasing=TRUE) GO_multifunc_assessment_s <- GO_multifunc_assessment[ord,c(1,2)] optimallist_genes <- as.vector(unlist(GO_multifunc_assessment_s[,1])) # and for GO groups ord <- order(as.numeric(GO_groups_voted[[1]][,2]), decreasing=TRUE) GO_groups_voted_s <- GO_groups_voted[[1]][ord,] optimallist_GO <- cbind(GO.term=rownames(GO_groups_voted_s), node.degree=GO_groups_voted_s[,2], roc=GO_groups_voted_s[,1]) # 3.1.7 Use optimal lists to determine multifunctionality AUROC auc_GO_mf <- auc_multifunc(annotations, optimallist_genes) # 3.1.8 Visualize Results # 3.1.8.1 Gene Ontology auc_GO_nv <- GO_groups_voted[,1] plot_distribution(auc_GO_nv, xlab="Neighbor voting AUROC ", ylab="Number of functional terms", b=30, xlim=c(0.4,1), ylim=c(0, 440), col="gray64", density=FALSE, avg=FALSE, bars=TRUE) # Compare with the distribution of multifunctionality AUROCs plot_distribution(auc_GO_mf, xlab="Optimal GO Ranking AUROC", ylab="Number of functional terms", b = 20, xlim = c(0.2,1), ylim = c(0,4400), col="gray64", density = FALSE, avg = FALSE, bars = TRUE) ``` ```{r eval = F} # 3.2. Using EGAD to generate and analyze dense co-expression networks # 3.2.1 Prepare data data("attr.human") genelist <- unique(attr.human$entrezID[attr.human$type == "protein_coding" & !is.na(attr.human$entrezID)]) exprs <- get_expression_matrix_from_GEO("GSE34308") # 3.2.2 Build co-expression network network <- build_coexp_network(exprs, gene_list) network <- build_coexp_expressionset(exprsSet, genelist) # 3.2.3 Calculate and visualize network topological properties assort <- assortativity(network) nd <- node_degrees(network) plot_density(nd) # 3.2.4 Run neighbor voting data("GO.human") annotations <- make_annotations(GO.human[,c(2,3)], genelist, goterms) annotations_sub <- filter_network_cols(annotations, min = 20, max = 300) GO_groups_voted <- run_GBA(network, annotations_sub) ```