---
title: "EGAD (4-dataset-metamarkers)"
author: "Young-June Kim"
date: "2023-02-21"
output: html_document
---
Load required libraries:
```{r}
library(tidyverse)
library(rhdf5)
library(matrixStats)
library('org.Hs.eg.db')
library(dplyr)
library(magrittr)
library(readr)
# BiocManager::install("EGAD")
library(EGAD)
library(MetaMarkers)
# BiocManager::install("GO.db")
library(GO.db)
# install.packages("conflicted")
library(conflicted)
conflict_prefer("filter", "dplyr")
conflict_prefer("count", "dplyr")
conflict_prefer("rename", "dplyr")
conflict_prefer("select", "dplyr")
conflict_prefer("slice", "dplyr")
conflict_prefer("desc", "dplyr")
```

Some major notes from the Supplementary Material of the EGAD publication:
- The neighbor voting method takes a gene network and a set of annotations (gene label vectors) as input, and outputs prediction scores for each of the annotation sets, as well as the performance metric of the network.

```{r}
#list structure
getwd()
h5ls("human_HC_AggNet.hdf5")

coconet <- h5read("human_HC_AggNet.hdf5", "/agg")
coconet <- as.data.frame(coconet)

dim(coconet)

colnames(coconet) <- h5read("human_HC_AggNet.hdf5", "/col")
rownames(coconet) <- h5read("human_HC_AggNet.hdf5", "/col")

mapping <- mapIds(org.Hs.eg.db, 
                  colnames(coconet), 
                  'ENTREZID',
                  'ENSEMBL')

mapping %<>% 
  as_tibble(rownames = "ENSEMBL") %>% 
  dplyr::rename(ncbi = value) 

# check for column names
colnames(mapping)

# Remove the NAs
mapping %<>% filter(!is.na(ncbi))

# remove 1 ncbi to many genes (remove the duplicates)
dup_ids <- mapping %>% 
  dplyr::group_by(ncbi) %>% 
  summarize(n=n()) %>% 
  filter(n != 1) %>% 
  pull(ncbi)

mapping %<>% 
  filter(!(ncbi %in% dup_ids))

#check for 1-many or many-1
mapping %>% 
  group_by(ncbi) %>% 
  count() %>% 
  filter(n != 1)

mapping %>% 
  group_by(ENSEMBL) %>% 
  count() %>% 
  filter(n != 1)

head(mapping)
```

```{r}
# 2.3.3 Getting gene annotation and functional data
# 2.3.4 Parsing gene data into gene annotations
# 2.3.4.2 Constructing gene annotations
meta_markers <- read_meta_markers("meta_markers_(4_datasets).csv.gz")

meta_markers_filtered <- meta_markers %>% # Top N highest values by group
  arrange(desc(auroc)) %>% 
  group_by(cell_type) %>%
  slice(1:30)

meta_markers_VCT <- meta_markers_filtered %>%  
  filter(cell_type == "Cytotrophoblast") 
dim(meta_markers_VCT)

meta_markers_SCT <- meta_markers_filtered %>%  
  filter(cell_type == "Syncytiotrophoblast")
dim(meta_markers_SCT)

meta_markers_EVT <- meta_markers_filtered %>% 
  filter(cell_type == "Extravillous trophoblast")
dim(meta_markers_EVT)
```

```{r}
# write.csv(meta_markers_EVT$gene, "EVT Gene List")
mapping_EVT <- mapIds(org.Hs.eg.db, 
                  meta_markers_EVT$gene, 
                  'ENSEMBL',
                  'SYMBOL')
head(mapping_EVT)
mapping_EVT %<>% 
  as_tibble(rownames = "gene") %>% 
  dplyr::rename(ENSEMBL = value)

mapping_EVT %>% 
  group_by(ENSEMBL) %>% 
  count() %>% 
  filter(n != 1)

mapping_EVT %<>% dplyr::filter(!is.na(ENSEMBL))

#check for 1-many or many-1
mapping_EVT %>% group_by(ENSEMBL) %>% count() %>% filter(n != 1)
mapping_EVT %>% group_by(gene) %>% count() %>% filter(n != 1)

meta_markers_EVT %<>% inner_join(mapping_EVT)
df1 <- meta_markers_EVT %>%
  select(ENSEMBL, cell_type)

# Let's save the metamarkers ENSEMBL IDs to a file
write.csv(df1, "EVT_ENSEMBL_IDs.csv", row.names=FALSE)
```

```{r}
# Repeat the same process for VCT
mapping_VCT <- mapIds(org.Hs.eg.db, 
                  meta_markers_VCT$gene, 
                  'ENSEMBL',
                  'SYMBOL')
head(mapping_VCT)
mapping_VCT %<>% 
  as_tibble(rownames = "gene") %>% 
  dplyr::rename(ENSEMBL = value)

mapping_VCT %>% 
  group_by(ENSEMBL) %>% 
  count() %>% 
  filter(n != 1)

mapping_VCT %<>% dplyr::filter(!is.na(ENSEMBL))

#check for 1-many or many-1
mapping_VCT %>% group_by(ENSEMBL) %>% count() %>% filter(n != 1)
mapping_VCT %>% group_by(gene) %>% count() %>% filter(n != 1)

meta_markers_VCT %<>% inner_join(mapping_VCT)
df2 <- meta_markers_VCT %>%
  select(ENSEMBL, cell_type)

# Let's save the metamarkers ENSEMBL IDs to a file
write.csv(df2, "VCT_ENSEMBL_IDs.csv", row.names=FALSE)
```

```{r}
# Repeat the same for SCT
mapping_SCT <- mapIds(org.Hs.eg.db, 
                  meta_markers_SCT$gene, 
                  'ENSEMBL',
                  'SYMBOL')
head(mapping_SCT)
mapping_SCT %<>% 
  as_tibble(rownames = "gene") %>% 
  dplyr::rename(ENSEMBL = value)

mapping_SCT %>% 
  group_by(ENSEMBL) %>% 
  count() %>% 
  filter(n != 1)

mapping_SCT %<>% dplyr::filter(!is.na(ENSEMBL))

#check for 1-many or many-1
mapping_SCT %>% group_by(ENSEMBL) %>% count() %>% filter(n != 1)
mapping_SCT %>% group_by(gene) %>% count() %>% filter(n != 1)

meta_markers_SCT %<>% inner_join(mapping_SCT)

df3 <- meta_markers_SCT %>%
  select(ENSEMBL, cell_type)

# Let's save the metamarkers ENSEMBL IDs to a file
write.csv(df3, "SCT_ENSEMBL_IDs.csv", row.names=FALSE)
```

```{r}
annotationsPlac <- tibble(gene = rownames(coconet))
annotationsPlac %<>% 
  mutate(EVT = gene %in% (df1 %>% 
                          dplyr::filter(cell_type == "Extravillous trophoblast") %>% 
                          pull(ENSEMBL))) %>% 
  mutate(EVT = as.numeric(EVT)) %>% 
  mutate(VCT = gene %in% (df2 %>% 
                          dplyr::filter(cell_type == "Cytotrophoblast") %>% 
                          pull(ENSEMBL))) %>% 
  mutate(VCT = as.numeric(VCT)) %>% 
  mutate(SCT = gene %in% (df3 %>% 
                          dplyr::filter(cell_type == "Syncytiotrophoblast") %>% 
                          pull(ENSEMBL))) %>% 
  mutate(SCT = as.numeric(SCT))

annotationsPlac <- annotationsPlac %>% 
  as.data.frame()

rownames(annotationsPlac) <- annotationsPlac$gene
annotationsPlac %<>% 
  select(-gene)
# 
# data("example_annotations")
# annotations <- make_annotations(example_annotations$gene2Annot,
#                                 example_annotations$genes,
#                                 example_annotations$annotationlist)
# head(rownames(annotations))

# 2.3.5 Building networks
hist <- plot_distribution(node_degree(coconet), 
                          b=10,
                          xlim=c(0,14), 
                          ylim=c(0,2), 
                          xlab="Node degree")

# Let's run this on EGAD
testing <- run_GBA(as.matrix(coconet), 
                   as.matrix(annotationsPlac))

colSums(annotationsPlac)

class(annotationsPlac)
```

Extract the AUROCs and compare it with the node degree AUCs
```{r}
auc_nv_coexp <- testing[[1]][,1]
auc_nv_coexp
sum(is.na(auc_nv_coexp))
mean(auc_nv_coexp,na.rm=T)

# node degree AUCs
auc_nd_coexp <- testing[[1]][,3]
mean(auc_nd_coexp,na.rm=T)
```

Recall from our PSL1040 content that most genes (nodes) have a small node degree, and that hubs exist. Hubs are the small handful of genes which have a very large node degree, being connected to many other nodes.

```{r eval = F}
# 2.3.5.2 Extending binary networks
extended_net <- extend_network(extended_net)
class(extended_net)
hist <- plot_distribution(node_degree(extended_net), 
                          b = 10,
                          xlab = "Node degree",)

# 2.3.5.3 Semantic similarity networks using Jaccard
semantic_net <- build_semantic_similarity_network(annotations, 
                                                  example_binary_network$genelist)
hist <- plot_distribution(node_degree(semantic_net),
                          b = 20,
                          xlab = "Node degree",
                          bars = TRUE)
```

```{r}
# 2.3.6 Running the neighbor voting algorithm (similar code as above)
# 2.3.6.1 Neighbor voting
gba_auc_nv <- neighbor_voting(as.matrix(annotationsPlac), 
                              as.matrix(coconet),
                              nFold = 3,
                              output = "AUROC")
head(gba_auc_nv)

gba_pr_nv <- neighbor_voting(as.matrix(annotationsPlac), 
                             as.matrix(coconet),
                              nFold = 3,
                              output = "PR")
head(gba_pr_nv)
```

```{r}
# 2.3.7 Running multifunctionality assessment analytically
multifunc_assessment <- calculate_multifunc(annotationsPlac)
auc_mf <- auc_multifunc(annotationsPlac, multifunc_assessment[,4])
hist <- plot_distribution(auc_mf, 
                          xlab = "AUROC",
                          med = F,
                          avg = F)

```

```{r eval = F}

# 2.3.8 Plotting Results
# 2.3.8.1 Scatterplots and smoothed plots
X <- runif(1000)
Y <- runif(1000) + X*0.3
window <- 50
smoothedxy <- conv_smoother(X, 
                            Y,
                            window,
                            xlab = "X",
                            ylab = "Y")

# 2.3.8.2 Distributions
filt <- !is.na(gba_auc_nv[,1])
aucA <- gba_auc_nv[filt,1]
aucB <- gba_auc_nv[filt,3]
hist <- plot_distribution(aucA, xlab="AUROCs")

avg <- plot_density_compare(aucA, aucB, xlab = "AUROCs")

plot_value_compare(aucA, aucB)

# 2.3.8.3 ROC plots and overlay plots
scores <- predictions(labels, network)
roc <- plot_roc(scores, labels)
plot_roc_overlay(scores, labels)

# Refer back to the guide for specific ROC / PR curves
```


```{r eval = F}
# 3. User Guide
# 3.1. Using EGAD to evaluate protein-protein interactions with GO
# 3.1.1 Prepare data
data("biogrid")
data("GO.human")

# 3.1.2 Build interaction networks (e.g., gene-by-gene matrix)
genelist <- make_genelist(biogrid)
gene_network <- make_gene_network(biogrid, genelist)

# 3.1.3 Build annotation matrix
goterms <- unique(GO.human[,3])
annotations <- make_annotations(GO.human[,c(2,3)], genelist, goterms)

# 3.1.4 Run neighbor voting
GO_groups_voted <- run_GBA(gene_network,annotations)
head(GO_groups_voted)

# 3.1.5 Run multifunctionality assessment for the ontology used
GO_multifunc_assessment <- calculate_multifunc(annotations)

# 3.1.6 Create optimal lists
# For genes
ord <- order(as.numeric(GO_multifunc_assessment[,2]), 
             decreasing=TRUE)
GO_multifunc_assessment_s <- GO_multifunc_assessment[ord,c(1,2)]
optimallist_genes <- as.vector(unlist(GO_multifunc_assessment_s[,1]))

# and for GO groups
ord <- order(as.numeric(GO_groups_voted[[1]][,2]),
             decreasing=TRUE)
GO_groups_voted_s <- GO_groups_voted[[1]][ord,]
optimallist_GO <- cbind(GO.term=rownames(GO_groups_voted_s),
                        node.degree=GO_groups_voted_s[,2],
                        roc=GO_groups_voted_s[,1])

# 3.1.7 Use optimal lists to determine multifunctionality AUROC
auc_GO_mf <- auc_multifunc(annotations, optimallist_genes)

# 3.1.8 Visualize Results
# 3.1.8.1 Gene Ontology
auc_GO_nv <- GO_groups_voted[,1]
plot_distribution(auc_GO_nv, 
                  xlab="Neighbor voting AUROC ",
                  ylab="Number of functional terms",
                  b=30, 
                  xlim=c(0.4,1),
                  ylim=c(0, 440),
                  col="gray64",
                  density=FALSE,
                  avg=FALSE, 
                  bars=TRUE)

# Compare with the distribution of multifunctionality AUROCs
plot_distribution(auc_GO_mf,
                  xlab="Optimal GO Ranking AUROC",
                  ylab="Number of functional terms",
                  b = 20, 
                  xlim = c(0.2,1), 
                  ylim = c(0,4400), 
                  col="gray64",
                  density = FALSE, 
                  avg = FALSE, 
                  bars = TRUE)
```

```{r eval = F}
# 3.2. Using EGAD to generate and analyze dense co-expression networks
# 3.2.1 Prepare data
data("attr.human")
genelist <- unique(attr.human$entrezID[attr.human$type == "protein_coding"
                                       & !is.na(attr.human$entrezID)])
exprs <- get_expression_matrix_from_GEO("GSE34308")

# 3.2.2 Build co-expression network
network <- build_coexp_network(exprs, gene_list)
network <- build_coexp_expressionset(exprsSet, genelist)

# 3.2.3 Calculate and visualize network topological properties
assort <- assortativity(network)
nd <- node_degrees(network)
plot_density(nd)

# 3.2.4 Run neighbor voting
data("GO.human")
annotations <- make_annotations(GO.human[,c(2,3)], genelist, goterms)
annotations_sub <- filter_network_cols(annotations, 
                                       min = 20,
                                       max = 300)
GO_groups_voted <- run_GBA(network, annotations_sub)
```