Load required libraries:
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(rhdf5)
library(matrixStats)
##
## Attaching package: 'matrixStats'
##
## The following object is masked from 'package:dplyr':
##
## count
library('org.Hs.eg.db')
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
##
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
##
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
##
## Attaching package: 'Biobase'
##
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
##
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
##
## The following objects are masked from 'package:dplyr':
##
## first, rename
##
## The following object is masked from 'package:tidyr':
##
## expand
##
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
##
## Attaching package: 'IRanges'
##
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
##
## The following object is masked from 'package:purrr':
##
## reduce
##
##
## Attaching package: 'AnnotationDbi'
##
## The following object is masked from 'package:dplyr':
##
## select
library(dplyr)
library(magrittr)
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
library(readr)
# BiocManager::install("EGAD")
library(EGAD)
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
library(MetaMarkers)
# BiocManager::install("GO.db")
library(GO.db)
##
# install.packages("conflicted")
library(conflicted)
conflict_prefer("filter", "dplyr")
## [conflicted] Will prefer dplyr::filter over any other package.
conflict_prefer("count", "dplyr")
## [conflicted] Will prefer dplyr::count over any other package.
conflict_prefer("rename", "dplyr")
## [conflicted] Will prefer dplyr::rename over any other package.
conflict_prefer("select", "dplyr")
## [conflicted] Will prefer dplyr::select over any other package.
conflict_prefer("slice", "dplyr")
## [conflicted] Will prefer dplyr::slice over any other package.
conflict_prefer("desc", "dplyr")
## [conflicted] Will prefer dplyr::desc over any other package.
Some major notes from the Supplementary Material of the EGAD publication: - The neighbor voting method takes a gene network and a set of annotations (gene label vectors) as input, and outputs prediction scores for each of the annotation sets, as well as the performance metric of the network.
#list structure
getwd()
## [1] "/home/youkim/EGAD"
h5ls("human_HC_AggNet.hdf5")
## group name otype dclass dim
## 0 / agg H5I_DATASET FLOAT 24631 x 24631
## 1 / chunkSize H5I_DATASET FLOAT 1
## 2 / col H5I_DATASET STRING 24631
## 3 / row H5I_DATASET STRING 24631
coconet <- h5read("human_HC_AggNet.hdf5", "/agg")
coconet <- as.data.frame(coconet)
dim(coconet)
## [1] 24631 24631
colnames(coconet) <- h5read("human_HC_AggNet.hdf5", "/col")
rownames(coconet) <- h5read("human_HC_AggNet.hdf5", "/col")
mapping <- mapIds(org.Hs.eg.db,
colnames(coconet),
'ENTREZID',
'ENSEMBL')
## 'select()' returned 1:many mapping between keys and columns
mapping %<>%
as_tibble(rownames = "ENSEMBL") %>%
dplyr::rename(ncbi = value)
# check for column names
colnames(mapping)
## [1] "ENSEMBL" "ncbi"
# Remove the NAs
mapping %<>% filter(!is.na(ncbi))
# remove 1 ncbi to many genes (remove the duplicates)
dup_ids <- mapping %>%
dplyr::group_by(ncbi) %>%
summarize(n=n()) %>%
filter(n != 1) %>%
pull(ncbi)
mapping %<>%
filter(!(ncbi %in% dup_ids))
#check for 1-many or many-1
mapping %>%
group_by(ncbi) %>%
count() %>%
filter(n != 1)
## # A tibble: 0 × 2
## # Groups: ncbi [0]
## # … with 2 variables: ncbi <chr>, n <int>
mapping %>%
group_by(ENSEMBL) %>%
count() %>%
filter(n != 1)
## # A tibble: 0 × 2
## # Groups: ENSEMBL [0]
## # … with 2 variables: ENSEMBL <chr>, n <int>
# 2.3.3 Getting gene annotation and functional data
# 2.3.4 Parsing gene data into gene annotations
# 2.3.4.2 Constructing gene annotations
meta_markers <- read_meta_markers("meta_markers_(4_datasets).csv.gz")
meta_markers_filtered <- meta_markers %>% # Top N highest values by group
arrange(desc(auroc)) %>%
group_by(cell_type) %>%
slice(1:30)
meta_markers_VCT <- meta_markers_filtered %>%
filter(cell_type == "Cytotrophoblast")
dim(meta_markers_VCT)
## [1] 30 17
meta_markers_SCT <- meta_markers_filtered %>%
filter(cell_type == "Syncytiotrophoblast")
dim(meta_markers_SCT)
## [1] 30 17
meta_markers_EVT <- meta_markers_filtered %>%
filter(cell_type == "Extravillous trophoblast")
dim(meta_markers_EVT)
## [1] 30 17
# write.csv(meta_markers_EVT$gene, "EVT Gene List")
mapping_EVT <- mapIds(org.Hs.eg.db,
meta_markers_EVT$gene,
'ENSEMBL',
'SYMBOL')
## 'select()' returned 1:many mapping between keys and columns
head(mapping_EVT)
## FN1 TPM1 PLAC8 QSOX1
## "ENSG00000115414" "ENSG00000140416" "ENSG00000145287" "ENSG00000116260"
## HLA-G KRT19
## "ENSG00000204632" "ENSG00000171345"
mapping_EVT %<>%
as_tibble(rownames = "gene") %>%
dplyr::rename(ENSEMBL = value)
mapping_EVT %>%
group_by(ENSEMBL) %>%
count() %>%
filter(n != 1)
## # A tibble: 0 × 2
## # Groups: ENSEMBL [0]
## # … with 2 variables: ENSEMBL <chr>, n <int>
mapping_EVT %<>% dplyr::filter(!is.na(ENSEMBL))
#check for 1-many or many-1
mapping_EVT %>% group_by(ENSEMBL) %>% count() %>% filter(n != 1)
## # A tibble: 0 × 2
## # Groups: ENSEMBL [0]
## # … with 2 variables: ENSEMBL <chr>, n <int>
mapping_EVT %>% group_by(gene) %>% count() %>% filter(n != 1)
## # A tibble: 0 × 2
## # Groups: gene [0]
## # … with 2 variables: gene <chr>, n <int>
meta_markers_EVT %<>% inner_join(mapping_EVT)
## Joining, by = "gene"
df1 <- meta_markers_EVT %>%
select(ENSEMBL, cell_type)
# Repeat the same process for VCT
mapping_VCT <- mapIds(org.Hs.eg.db,
meta_markers_VCT$gene,
'ENSEMBL',
'SYMBOL')
## 'select()' returned 1:many mapping between keys and columns
head(mapping_VCT)
## PAGE4 LDHB MEST COX7C
## "ENSG00000101951" "ENSG00000111716" "ENSG00000106484" "ENSG00000127184"
## RPS25 SMAGP
## "ENSG00000118181" "ENSG00000170545"
mapping_VCT %<>%
as_tibble(rownames = "gene") %>%
dplyr::rename(ENSEMBL = value)
mapping_VCT %>%
group_by(ENSEMBL) %>%
count() %>%
filter(n != 1)
## # A tibble: 0 × 2
## # Groups: ENSEMBL [0]
## # … with 2 variables: ENSEMBL <chr>, n <int>
mapping_VCT %<>% dplyr::filter(!is.na(ENSEMBL))
#check for 1-many or many-1
mapping_VCT %>% group_by(ENSEMBL) %>% count() %>% filter(n != 1)
## # A tibble: 0 × 2
## # Groups: ENSEMBL [0]
## # … with 2 variables: ENSEMBL <chr>, n <int>
mapping_VCT %>% group_by(gene) %>% count() %>% filter(n != 1)
## # A tibble: 0 × 2
## # Groups: gene [0]
## # … with 2 variables: gene <chr>, n <int>
meta_markers_VCT %<>% inner_join(mapping_VCT)
## Joining, by = "gene"
df2 <- meta_markers_VCT %>%
select(ENSEMBL, cell_type)
# Repeat the same for SCT
mapping_SCT <- mapIds(org.Hs.eg.db,
meta_markers_SCT$gene,
'ENSEMBL',
'SYMBOL')
## 'select()' returned 1:many mapping between keys and columns
head(mapping_SCT)
## CGA CYP19A1 GDF15 S100P
## "ENSG00000135346" "ENSG00000137869" "ENSG00000130513" "ENSG00000163993"
## CYP11A1 PLIN2
## "ENSG00000140459" "ENSG00000147872"
mapping_SCT %<>%
as_tibble(rownames = "gene") %>%
dplyr::rename(ENSEMBL = value)
mapping_SCT %>%
group_by(ENSEMBL) %>%
count() %>%
filter(n != 1)
## # A tibble: 0 × 2
## # Groups: ENSEMBL [0]
## # … with 2 variables: ENSEMBL <chr>, n <int>
mapping_SCT %<>% dplyr::filter(!is.na(ENSEMBL))
#check for 1-many or many-1
mapping_SCT %>% group_by(ENSEMBL) %>% count() %>% filter(n != 1)
## # A tibble: 0 × 2
## # Groups: ENSEMBL [0]
## # … with 2 variables: ENSEMBL <chr>, n <int>
mapping_SCT %>% group_by(gene) %>% count() %>% filter(n != 1)
## # A tibble: 0 × 2
## # Groups: gene [0]
## # … with 2 variables: gene <chr>, n <int>
meta_markers_SCT %<>% inner_join(mapping_SCT)
## Joining, by = "gene"
df3 <- meta_markers_SCT %>%
select(ENSEMBL, cell_type)
annotationsPlac <- tibble(gene = rownames(coconet))
annotationsPlac %<>%
mutate(EVT = gene %in% (df1 %>%
dplyr::filter(cell_type == "Extravillous trophoblast") %>%
pull(ENSEMBL))) %>%
mutate(EVT = as.numeric(EVT)) %>%
mutate(VCT = gene %in% (df2 %>%
dplyr::filter(cell_type == "Cytotrophoblast") %>%
pull(ENSEMBL))) %>%
mutate(VCT = as.numeric(VCT)) %>%
mutate(SCT = gene %in% (df3 %>%
dplyr::filter(cell_type == "Syncytiotrophoblast") %>%
pull(ENSEMBL))) %>%
mutate(SCT = as.numeric(SCT))
annotationsPlac <- annotationsPlac %>%
as.data.frame()
rownames(annotationsPlac) <- annotationsPlac$gene
annotationsPlac %<>%
select(-gene)
#
# data("example_annotations")
# annotations <- make_annotations(example_annotations$gene2Annot,
# example_annotations$genes,
# example_annotations$annotationlist)
# head(rownames(annotations))
# 2.3.5 Building networks
hist <- plot_distribution(node_degree(coconet),
b=10,
xlim=c(0,14),
ylim=c(0,2),
xlab="Node degree")
# Let's run this on EGAD
testing <- run_GBA(as.matrix(coconet),
as.matrix(annotationsPlac))
colSums(annotationsPlac)
## EVT VCT SCT
## 28 26 20
class(annotationsPlac)
## [1] "data.frame"
Extract the AUROCs and compare it with the node degree AUCs
auc_nv_coexp <- testing[[1]][,1]
auc_nv_coexp
## EVT VCT
## 0.8751281 0.8502540
sum(is.na(auc_nv_coexp))
## [1] 0
mean(auc_nv_coexp,na.rm=T)
## [1] 0.8626911
# node degree AUCs
auc_nd_coexp <- testing[[1]][,3]
mean(auc_nd_coexp,na.rm=T)
## [1] 0.5018878
Recall from our PSL1040 content that most genes (nodes) have a small node degree, and that hubs exist. Hubs are the small handful of genes which have a very large node degree, being connected to many other nodes.
# 2.3.5.2 Extending binary networks
extended_net <- extend_network(extended_net)
class(extended_net)
hist <- plot_distribution(node_degree(extended_net),
b = 10,
xlab = "Node degree",)
# 2.3.5.3 Semantic similarity networks using Jaccard
semantic_net <- build_semantic_similarity_network(annotations,
example_binary_network$genelist)
hist <- plot_distribution(node_degree(semantic_net),
b = 20,
xlab = "Node degree",
bars = TRUE)
# 2.3.6 Running the neighbor voting algorithm (similar code as above)
# 2.3.6.1 Neighbor voting
gba_auc_nv <- neighbor_voting(as.matrix(annotationsPlac),
as.matrix(coconet),
nFold = 3,
output = "AUROC")
head(gba_auc_nv)
## auc avg_node_degree degree_null_auc
## EVT 0.8766411 11664.10 0.4318405
## VCT 0.8626058 14006.55 0.5719350
## SCT 0.8653673 10878.24 0.4034090
gba_pr_nv <- neighbor_voting(as.matrix(annotationsPlac),
as.matrix(coconet),
nFold = 3,
output = "PR")
head(gba_pr_nv)
## avgprc avg_node_degree degree_null_auc
## EVT 0.003230716 11664.10 0.0007713857
## VCT 0.001523380 14006.55 0.0007307864
## SCT 0.003125033 10878.24 0.0005683894
# 2.3.7 Running multifunctionality assessment analytically
multifunc_assessment <- calculate_multifunc(annotationsPlac)
auc_mf <- auc_multifunc(annotationsPlac, multifunc_assessment[,4])
hist <- plot_distribution(auc_mf,
xlab = "AUROC",
med = F,
avg = F)
# 2.3.8 Plotting Results
# 2.3.8.1 Scatterplots and smoothed plots
X <- runif(1000)
Y <- runif(1000) + X*0.3
window <- 50
smoothedxy <- conv_smoother(X,
Y,
window,
xlab = "X",
ylab = "Y")
# 2.3.8.2 Distributions
filt <- !is.na(gba_auc_nv[,1])
aucA <- gba_auc_nv[filt,1]
aucB <- gba_auc_nv[filt,3]
hist <- plot_distribution(aucA, xlab="AUROCs")
avg <- plot_density_compare(aucA, aucB, xlab = "AUROCs")
plot_value_compare(aucA, aucB)
# 2.3.8.3 ROC plots and overlay plots
scores <- predictions(labels, network)
roc <- plot_roc(scores, labels)
plot_roc_overlay(scores, labels)
# Refer back to the guide for specific ROC / PR curves
# 3. User Guide
# 3.1. Using EGAD to evaluate protein-protein interactions with GO
# 3.1.1 Prepare data
data("biogrid")
data("GO.human")
# 3.1.2 Build interaction networks (e.g., gene-by-gene matrix)
genelist <- make_genelist(biogrid)
gene_network <- make_gene_network(biogrid, genelist)
# 3.1.3 Build annotation matrix
goterms <- unique(GO.human[,3])
annotations <- make_annotations(GO.human[,c(2,3)], genelist, goterms)
# 3.1.4 Run neighbor voting
GO_groups_voted <- run_GBA(gene_network,annotations)
head(GO_groups_voted)
# 3.1.5 Run multifunctionality assessment for the ontology used
GO_multifunc_assessment <- calculate_multifunc(annotations)
# 3.1.6 Create optimal lists
# For genes
ord <- order(as.numeric(GO_multifunc_assessment[,2]),
decreasing=TRUE)
GO_multifunc_assessment_s <- GO_multifunc_assessment[ord,c(1,2)]
optimallist_genes <- as.vector(unlist(GO_multifunc_assessment_s[,1]))
# and for GO groups
ord <- order(as.numeric(GO_groups_voted[[1]][,2]),
decreasing=TRUE)
GO_groups_voted_s <- GO_groups_voted[[1]][ord,]
optimallist_GO <- cbind(GO.term=rownames(GO_groups_voted_s),
node.degree=GO_groups_voted_s[,2],
roc=GO_groups_voted_s[,1])
# 3.1.7 Use optimal lists to determine multifunctionality AUROC
auc_GO_mf <- auc_multifunc(annotations, optimallist_genes)
# 3.1.8 Visualize Results
# 3.1.8.1 Gene Ontology
auc_GO_nv <- GO_groups_voted[,1]
plot_distribution(auc_GO_nv,
xlab="Neighbor voting AUROC ",
ylab="Number of functional terms",
b=30,
xlim=c(0.4,1),
ylim=c(0, 440),
col="gray64",
density=FALSE,
avg=FALSE,
bars=TRUE)
# Compare with the distribution of multifunctionality AUROCs
plot_distribution(auc_GO_mf,
xlab="Optimal GO Ranking AUROC",
ylab="Number of functional terms",
b = 20,
xlim = c(0.2,1),
ylim = c(0,4400),
col="gray64",
density = FALSE,
avg = FALSE,
bars = TRUE)
# 3.2. Using EGAD to generate and analyze dense co-expression networks
# 3.2.1 Prepare data
data("attr.human")
genelist <- unique(attr.human$entrezID[attr.human$type == "protein_coding"
& !is.na(attr.human$entrezID)])
exprs <- get_expression_matrix_from_GEO("GSE34308")
# 3.2.2 Build co-expression network
network <- build_coexp_network(exprs, gene_list)
network <- build_coexp_expressionset(exprsSet, genelist)
# 3.2.3 Calculate and visualize network topological properties
assort <- assortativity(network)
nd <- node_degrees(network)
plot_density(nd)
# 3.2.4 Run neighbor voting
data("GO.human")
annotations <- make_annotations(GO.human[,c(2,3)], genelist, goterms)
annotations_sub <- filter_network_cols(annotations,
min = 20,
max = 300)
GO_groups_voted <- run_GBA(network, annotations_sub)