EGAD (4-dataset-metamarkers)

Load required libraries:

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(rhdf5)
library(matrixStats)

## 
## Attaching package: 'matrixStats'
## 
## The following object is masked from 'package:dplyr':
## 
##     count

library('org.Hs.eg.db')

## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## 
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## 
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## 
## Attaching package: 'Biobase'
## 
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## 
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## 
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## 
## Attaching package: 'IRanges'
## 
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## 
## The following object is masked from 'package:purrr':
## 
##     reduce
## 
## 
## Attaching package: 'AnnotationDbi'
## 
## The following object is masked from 'package:dplyr':
## 
##     select

library(dplyr)
library(magrittr)

## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract

library(readr)
# BiocManager::install("EGAD")
library(EGAD)

## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)

library(MetaMarkers)
# BiocManager::install("GO.db")
library(GO.db)

##

# install.packages("conflicted")
library(conflicted)
conflict_prefer("filter", "dplyr")

## [conflicted] Will prefer dplyr::filter over any other package.

conflict_prefer("count", "dplyr")

## [conflicted] Will prefer dplyr::count over any other package.

conflict_prefer("rename", "dplyr")

## [conflicted] Will prefer dplyr::rename over any other package.

conflict_prefer("select", "dplyr")

## [conflicted] Will prefer dplyr::select over any other package.

conflict_prefer("slice", "dplyr")

## [conflicted] Will prefer dplyr::slice over any other package.

conflict_prefer("desc", "dplyr")

## [conflicted] Will prefer dplyr::desc over any other package.

Some major notes from the Supplementary Material of the EGAD publication: - The neighbor voting method takes a gene network and a set of annotations (gene label vectors) as input, and outputs prediction scores for each of the annotation sets, as well as the performance metric of the network.

#list structure
getwd()

## [1] "/home/youkim/EGAD"

h5ls("human_HC_AggNet.hdf5")

##   group      name       otype dclass           dim
## 0     /       agg H5I_DATASET  FLOAT 24631 x 24631
## 1     / chunkSize H5I_DATASET  FLOAT             1
## 2     /       col H5I_DATASET STRING         24631
## 3     /       row H5I_DATASET STRING         24631

coconet <- h5read("human_HC_AggNet.hdf5", "/agg")
coconet <- as.data.frame(coconet)

dim(coconet)

## [1] 24631 24631

colnames(coconet) <- h5read("human_HC_AggNet.hdf5", "/col")
rownames(coconet) <- h5read("human_HC_AggNet.hdf5", "/col")

mapping <- mapIds(org.Hs.eg.db, 
                  colnames(coconet), 
                  'ENTREZID',
                  'ENSEMBL')

## 'select()' returned 1:many mapping between keys and columns

mapping %<>% 
  as_tibble(rownames = "ENSEMBL") %>% 
  dplyr::rename(ncbi = value) 

# check for column names
colnames(mapping)

## [1] "ENSEMBL" "ncbi"

# Remove the NAs
mapping %<>% filter(!is.na(ncbi))

# remove 1 ncbi to many genes (remove the duplicates)
dup_ids <- mapping %>% 
  dplyr::group_by(ncbi) %>% 
  summarize(n=n()) %>% 
  filter(n != 1) %>% 
  pull(ncbi)

mapping %<>% 
  filter(!(ncbi %in% dup_ids))

#check for 1-many or many-1
mapping %>% 
  group_by(ncbi) %>% 
  count() %>% 
  filter(n != 1)

## # A tibble: 0 × 2
## # Groups:   ncbi [0]
## # … with 2 variables: ncbi <chr>, n <int>

mapping %>% 
  group_by(ENSEMBL) %>% 
  count() %>% 
  filter(n != 1)

## # A tibble: 0 × 2
## # Groups:   ENSEMBL [0]
## # … with 2 variables: ENSEMBL <chr>, n <int>

# 2.3.3 Getting gene annotation and functional data
# 2.3.4 Parsing gene data into gene annotations
# 2.3.4.2 Constructing gene annotations
meta_markers <- read_meta_markers("meta_markers_(4_datasets).csv.gz")

meta_markers_filtered <- meta_markers %>% # Top N highest values by group
  arrange(desc(auroc)) %>% 
  group_by(cell_type) %>%
  slice(1:30)

meta_markers_VCT <- meta_markers_filtered %>%  
  filter(cell_type == "Cytotrophoblast") 
dim(meta_markers_VCT)

## [1] 30 17

meta_markers_SCT <- meta_markers_filtered %>%  
  filter(cell_type == "Syncytiotrophoblast")
dim(meta_markers_SCT)

## [1] 30 17

meta_markers_EVT <- meta_markers_filtered %>% 
  filter(cell_type == "Extravillous trophoblast")
dim(meta_markers_EVT)

## [1] 30 17

# write.csv(meta_markers_EVT$gene, "EVT Gene List")
mapping_EVT <- mapIds(org.Hs.eg.db, 
                  meta_markers_EVT$gene, 
                  'ENSEMBL',
                  'SYMBOL')

## 'select()' returned 1:many mapping between keys and columns

head(mapping_EVT)

##               FN1              TPM1             PLAC8             QSOX1 
## "ENSG00000115414" "ENSG00000140416" "ENSG00000145287" "ENSG00000116260" 
##             HLA-G             KRT19 
## "ENSG00000204632" "ENSG00000171345"

mapping_EVT %<>% 
  as_tibble(rownames = "gene") %>% 
  dplyr::rename(ENSEMBL = value)

mapping_EVT %>% 
  group_by(ENSEMBL) %>% 
  count() %>% 
  filter(n != 1)

## # A tibble: 0 × 2
## # Groups:   ENSEMBL [0]
## # … with 2 variables: ENSEMBL <chr>, n <int>

mapping_EVT %<>% dplyr::filter(!is.na(ENSEMBL))
#check for 1-many or many-1
mapping_EVT %>% group_by(ENSEMBL) %>% count() %>% filter(n != 1)

## # A tibble: 0 × 2
## # Groups:   ENSEMBL [0]
## # … with 2 variables: ENSEMBL <chr>, n <int>

mapping_EVT %>% group_by(gene) %>% count() %>% filter(n != 1)

## # A tibble: 0 × 2
## # Groups:   gene [0]
## # … with 2 variables: gene <chr>, n <int>

meta_markers_EVT %<>% inner_join(mapping_EVT)

## Joining, by = "gene"

df1 <- meta_markers_EVT %>%
  select(ENSEMBL, cell_type)

# Repeat the same process for VCT
mapping_VCT <- mapIds(org.Hs.eg.db, 
                  meta_markers_VCT$gene, 
                  'ENSEMBL',
                  'SYMBOL')

## 'select()' returned 1:many mapping between keys and columns

head(mapping_VCT)

##             PAGE4              LDHB              MEST             COX7C 
## "ENSG00000101951" "ENSG00000111716" "ENSG00000106484" "ENSG00000127184" 
##             RPS25             SMAGP 
## "ENSG00000118181" "ENSG00000170545"

mapping_VCT %<>% 
  as_tibble(rownames = "gene") %>% 
  dplyr::rename(ENSEMBL = value)

mapping_VCT %>% 
  group_by(ENSEMBL) %>% 
  count() %>% 
  filter(n != 1)

## # A tibble: 0 × 2
## # Groups:   ENSEMBL [0]
## # … with 2 variables: ENSEMBL <chr>, n <int>

mapping_VCT %<>% dplyr::filter(!is.na(ENSEMBL))
#check for 1-many or many-1
mapping_VCT %>% group_by(ENSEMBL) %>% count() %>% filter(n != 1)

## # A tibble: 0 × 2
## # Groups:   ENSEMBL [0]
## # … with 2 variables: ENSEMBL <chr>, n <int>

mapping_VCT %>% group_by(gene) %>% count() %>% filter(n != 1)

## # A tibble: 0 × 2
## # Groups:   gene [0]
## # … with 2 variables: gene <chr>, n <int>

meta_markers_VCT %<>% inner_join(mapping_VCT)

## Joining, by = "gene"

df2 <- meta_markers_VCT %>%
  select(ENSEMBL, cell_type)

# Repeat the same for SCT
mapping_SCT <- mapIds(org.Hs.eg.db, 
                  meta_markers_SCT$gene, 
                  'ENSEMBL',
                  'SYMBOL')

## 'select()' returned 1:many mapping between keys and columns

head(mapping_SCT)

##               CGA           CYP19A1             GDF15             S100P 
## "ENSG00000135346" "ENSG00000137869" "ENSG00000130513" "ENSG00000163993" 
##           CYP11A1             PLIN2 
## "ENSG00000140459" "ENSG00000147872"

mapping_SCT %<>% 
  as_tibble(rownames = "gene") %>% 
  dplyr::rename(ENSEMBL = value)

mapping_SCT %>% 
  group_by(ENSEMBL) %>% 
  count() %>% 
  filter(n != 1)

## # A tibble: 0 × 2
## # Groups:   ENSEMBL [0]
## # … with 2 variables: ENSEMBL <chr>, n <int>

mapping_SCT %<>% dplyr::filter(!is.na(ENSEMBL))
#check for 1-many or many-1
mapping_SCT %>% group_by(ENSEMBL) %>% count() %>% filter(n != 1)

## # A tibble: 0 × 2
## # Groups:   ENSEMBL [0]
## # … with 2 variables: ENSEMBL <chr>, n <int>

mapping_SCT %>% group_by(gene) %>% count() %>% filter(n != 1)

## # A tibble: 0 × 2
## # Groups:   gene [0]
## # … with 2 variables: gene <chr>, n <int>

meta_markers_SCT %<>% inner_join(mapping_SCT)

## Joining, by = "gene"

df3 <- meta_markers_SCT %>%
  select(ENSEMBL, cell_type)

annotationsPlac <- tibble(gene = rownames(coconet))
annotationsPlac %<>% 
  mutate(EVT = gene %in% (df1 %>% 
                          dplyr::filter(cell_type == "Extravillous trophoblast") %>% 
                          pull(ENSEMBL))) %>% 
  mutate(EVT = as.numeric(EVT)) %>% 
  mutate(VCT = gene %in% (df2 %>% 
                          dplyr::filter(cell_type == "Cytotrophoblast") %>% 
                          pull(ENSEMBL))) %>% 
  mutate(VCT = as.numeric(VCT)) %>% 
  mutate(SCT = gene %in% (df3 %>% 
                          dplyr::filter(cell_type == "Syncytiotrophoblast") %>% 
                          pull(ENSEMBL))) %>% 
  mutate(SCT = as.numeric(SCT))

annotationsPlac <- annotationsPlac %>% 
  as.data.frame()

rownames(annotationsPlac) <- annotationsPlac$gene
annotationsPlac %<>% 
  select(-gene)
# 
# data("example_annotations")
# annotations <- make_annotations(example_annotations$gene2Annot,
#                                 example_annotations$genes,
#                                 example_annotations$annotationlist)
# head(rownames(annotations))

# 2.3.5 Building networks
hist <- plot_distribution(node_degree(coconet), 
                          b=10,
                          xlim=c(0,14), 
                          ylim=c(0,2), 
                          xlab="Node degree")

# Let's run this on EGAD
testing <- run_GBA(as.matrix(coconet), 
                   as.matrix(annotationsPlac))

colSums(annotationsPlac)

## EVT VCT SCT 
##  28  26  20

class(annotationsPlac)

## [1] "data.frame"

Extract the AUROCs and compare it with the node degree AUCs

auc_nv_coexp <- testing[[1]][,1]
auc_nv_coexp

##       EVT       VCT 
## 0.8751281 0.8502540

sum(is.na(auc_nv_coexp))

## [1] 0

mean(auc_nv_coexp,na.rm=T)

## [1] 0.8626911

# node degree AUCs
auc_nd_coexp <- testing[[1]][,3]
mean(auc_nd_coexp,na.rm=T)

## [1] 0.5018878

Recall from our PSL1040 content that most genes (nodes) have a small node degree, and that hubs exist. Hubs are the small handful of genes which have a very large node degree, being connected to many other nodes.

# 2.3.5.2 Extending binary networks
extended_net <- extend_network(extended_net)
class(extended_net)
hist <- plot_distribution(node_degree(extended_net), 
                          b = 10,
                          xlab = "Node degree",)

# 2.3.5.3 Semantic similarity networks using Jaccard
semantic_net <- build_semantic_similarity_network(annotations, 
                                                  example_binary_network$genelist)
hist <- plot_distribution(node_degree(semantic_net),
                          b = 20,
                          xlab = "Node degree",
                          bars = TRUE)

# 2.3.6 Running the neighbor voting algorithm (similar code as above)
# 2.3.6.1 Neighbor voting
gba_auc_nv <- neighbor_voting(as.matrix(annotationsPlac), 
                              as.matrix(coconet),
                              nFold = 3,
                              output = "AUROC")
head(gba_auc_nv)

##           auc avg_node_degree degree_null_auc
## EVT 0.8766411        11664.10       0.4318405
## VCT 0.8626058        14006.55       0.5719350
## SCT 0.8653673        10878.24       0.4034090

gba_pr_nv <- neighbor_voting(as.matrix(annotationsPlac), 
                             as.matrix(coconet),
                              nFold = 3,
                              output = "PR")
head(gba_pr_nv)

##          avgprc avg_node_degree degree_null_auc
## EVT 0.003230716        11664.10    0.0007713857
## VCT 0.001523380        14006.55    0.0007307864
## SCT 0.003125033        10878.24    0.0005683894

# 2.3.7 Running multifunctionality assessment analytically
multifunc_assessment <- calculate_multifunc(annotationsPlac)
auc_mf <- auc_multifunc(annotationsPlac, multifunc_assessment[,4])
hist <- plot_distribution(auc_mf, 
                          xlab = "AUROC",
                          med = F,
                          avg = F)

# 2.3.8 Plotting Results
# 2.3.8.1 Scatterplots and smoothed plots
X <- runif(1000)
Y <- runif(1000) + X*0.3
window <- 50
smoothedxy <- conv_smoother(X, 
                            Y,
                            window,
                            xlab = "X",
                            ylab = "Y")

# 2.3.8.2 Distributions
filt <- !is.na(gba_auc_nv[,1])
aucA <- gba_auc_nv[filt,1]
aucB <- gba_auc_nv[filt,3]
hist <- plot_distribution(aucA, xlab="AUROCs")

avg <- plot_density_compare(aucA, aucB, xlab = "AUROCs")

plot_value_compare(aucA, aucB)

# 2.3.8.3 ROC plots and overlay plots
scores <- predictions(labels, network)
roc <- plot_roc(scores, labels)
plot_roc_overlay(scores, labels)

# Refer back to the guide for specific ROC / PR curves

# 3. User Guide
# 3.1. Using EGAD to evaluate protein-protein interactions with GO
# 3.1.1 Prepare data
data("biogrid")
data("GO.human")

# 3.1.2 Build interaction networks (e.g., gene-by-gene matrix)
genelist <- make_genelist(biogrid)
gene_network <- make_gene_network(biogrid, genelist)

# 3.1.3 Build annotation matrix
goterms <- unique(GO.human[,3])
annotations <- make_annotations(GO.human[,c(2,3)], genelist, goterms)

# 3.1.4 Run neighbor voting
GO_groups_voted <- run_GBA(gene_network,annotations)
head(GO_groups_voted)

# 3.1.5 Run multifunctionality assessment for the ontology used
GO_multifunc_assessment <- calculate_multifunc(annotations)

# 3.1.6 Create optimal lists
# For genes
ord <- order(as.numeric(GO_multifunc_assessment[,2]), 
             decreasing=TRUE)
GO_multifunc_assessment_s <- GO_multifunc_assessment[ord,c(1,2)]
optimallist_genes <- as.vector(unlist(GO_multifunc_assessment_s[,1]))

# and for GO groups
ord <- order(as.numeric(GO_groups_voted[[1]][,2]),
             decreasing=TRUE)
GO_groups_voted_s <- GO_groups_voted[[1]][ord,]
optimallist_GO <- cbind(GO.term=rownames(GO_groups_voted_s),
                        node.degree=GO_groups_voted_s[,2],
                        roc=GO_groups_voted_s[,1])

# 3.1.7 Use optimal lists to determine multifunctionality AUROC
auc_GO_mf <- auc_multifunc(annotations, optimallist_genes)

# 3.1.8 Visualize Results
# 3.1.8.1 Gene Ontology
auc_GO_nv <- GO_groups_voted[,1]
plot_distribution(auc_GO_nv, 
                  xlab="Neighbor voting AUROC ",
                  ylab="Number of functional terms",
                  b=30, 
                  xlim=c(0.4,1),
                  ylim=c(0, 440),
                  col="gray64",
                  density=FALSE,
                  avg=FALSE, 
                  bars=TRUE)

# Compare with the distribution of multifunctionality AUROCs
plot_distribution(auc_GO_mf,
                  xlab="Optimal GO Ranking AUROC",
                  ylab="Number of functional terms",
                  b = 20, 
                  xlim = c(0.2,1), 
                  ylim = c(0,4400), 
                  col="gray64",
                  density = FALSE, 
                  avg = FALSE, 
                  bars = TRUE)

# 3.2. Using EGAD to generate and analyze dense co-expression networks
# 3.2.1 Prepare data
data("attr.human")
genelist <- unique(attr.human$entrezID[attr.human$type == "protein_coding"
                                       & !is.na(attr.human$entrezID)])
exprs <- get_expression_matrix_from_GEO("GSE34308")

# 3.2.2 Build co-expression network
network <- build_coexp_network(exprs, gene_list)
network <- build_coexp_expressionset(exprsSet, genelist)

# 3.2.3 Calculate and visualize network topological properties
assort <- assortativity(network)
nd <- node_degrees(network)
plot_density(nd)

# 3.2.4 Run neighbor voting
data("GO.human")
annotations <- make_annotations(GO.human[,c(2,3)], genelist, goterms)
annotations_sub <- filter_network_cols(annotations, 
                                       min = 20,
                                       max = 300)
GO_groups_voted <- run_GBA(network, annotations_sub)

EGAD (4-dataset-metamarkers)

Young-June Kim

2023-02-21