# first define some functions library(EGAD,quietly=T) library(rhdf5,quietly=T) library(Matrix,quietly=T) library(taRifx,quietly=T) load("/ftp/data/gene2go/voc.Rdata") getEnsemblID <- function(species, gene_list){ # Input gene_list is allowed to be anything. EnsemblID , entrezID, genesymbol, or any synonyms # input should be an array/vector # GeneInfo file is read from the ftp - change to read from where you store it. # outputs # [[1]] ensemble genes # [[2]] input genes not found # [[3]] original gene set # get gene info # adjust file location as needed #################################################################################################################################### geneInfo = read.delim(paste0("/ftp/data/geneInfo/",species,"_geneInfo.tab"), stringsAsFactors=F, sep="\t") #################################################################################################################################### gene_list = as.character(gene_list) # search for genes in EnsemblID m = match(toupper(gene_list) , toupper(geneInfo$EnsemblID) ) ind_found = m[!is.na(m)] ind_found = ind_found[! duplicated(ind_found )] # remove duplicates. ensemblGenes = geneInfo$EnsemblID[ind_found] genes_found = gene_list [ !is.na(m) ] genes_not_found = gene_list [ is.na(m) ] # search for genes in symbols m = match(toupper(genes_not_found), toupper(geneInfo$GeneSymbol) ) ind_found = m[!is.na(m)] ind_found = ind_found[! duplicated(ind_found )] # remove duplicates. ensemblGenes = c(ensemblGenes , geneInfo$EnsemblID[ind_found]) genes_found = c(genes_found , genes_not_found [ !is.na(m) ] ) genes_not_found = genes_not_found [ is.na(m) ] # search for genes in EntrezID m = match(genes_not_found, geneInfo$EntrezID) ind_found = m[!is.na(m)] ind_found = ind_found[! duplicated(ind_found )] # remove duplicates. ensemblGenes = c(ensemblGenes , geneInfo$EnsemblID[ind_found]) genes_found = c(genes_found , genes_not_found [ !is.na(m) ] ) genes_not_found = genes_not_found [ is.na(m) ] # search in synonyms temp = paste0("\\|", genes_not_found ,"\\|") ind_found = lapply(temp , grep , geneInfo$Synonyms,ignore.case=TRUE ) lens = lengths(ind_found) ind_found = unlist(ind_found) ind_found = ind_found[! duplicated(ind_found )] # remove duplicates. ensemblGenes = c(ensemblGenes , geneInfo$EnsemblID[ind_found]) genes_found = c(genes_found , rep(genes_not_found, lens ) ) genes_not_found = genes_not_found[ lens==0] # tidy up genes_found = genes_found[!is.na(genes_found)] noENS = genes_found[ is.na(ensemblGenes) ] genes_not_found = c(genes_not_found,noENS) ensemblGenes = ensemblGenes[! duplicated(ensemblGenes)] N = length(ensemblGenes) if (N == 0 ){ print("None of your input genes were found or cannot be converted to the correct format!") } count = length(genes_not_found) if (count > 1 ) { if (count > 5) { temp = paste(genes_not_found[1:5],collapse=", ") temp = paste(temp ,"\n and", count -5 , "others." ) print(paste("The following input genes were not found! \n ",temp) ) } else { temp = paste(genes_not_found,collapse=", ") print(paste("The following input genes were not found! \n ",temp) ) } } if (N > 1000 ){ ind = sample(1:length(ensemblGenes) , 1000 ) ensemblGenes = ensemblGenes[ind] genes_found = genes_found[ind] print(paste("We found",N,"genes. Requesting too many genes will slow our server down for other users - so we took a random sample of 1000 genes instead." ) ) } # ensemblGenes should be used to match to the data, # genes_not_found should be reported to the user # genes_found should be used as display names return(list(ensemblGenes, genes_not_found, make.unique(genes_found) ) ) } getGOA <- function(species){ # reads in the hdf5 file related genes to GO terms and stores as a sparse matrix # adjust file location as needed ######################################################################################## filename = paste0("/ftp/data/gene2go/",species,"_gene2go.hdf5" ) ######################################################################################## # read in gene2go matrices (hdf5) in COO format go = h5read(filename, name ="GO") genes = h5read(filename, name ="genes") ind = h5read(filename, name ="ind") h5closeAll() # store as a sparsematrix GO_annots = sparseMatrix(i=ind[,1] ,j= ind[,2] ,dimnames=list(toupper(genes),toupper(go))) csums = colSums( GO_annots) GO_annots = GO_annots[, (csums <=1000)&(csums>=10) ] return(GO_annots) } geneSetScore <- function( species, gene_list ,flag1= "prio",op = "AUROC"){ # get score of geneset using EGAD # apply egad to all genes # takes in the ensemblIDs filename = paste0("/ftp/data/networks/" ,species,"_",flag1, "AggNet.hdf5") row = h5read(filename, name= "row" ) col = h5read(filename, name= "col" ) # just in case, make sure it's symmetric genes = intersect(toupper(row) , toupper(col)) ind = match(gene_list, genes) ind = ind[!is.na(ind)] network = h5read(filename, name= "agg" ) rownames(network) = toupper(row) colnames(network) = toupper(col) h5closeAll() network=network[genes,genes] labels = matrix(0, length(genes) , 1) rownames(labels ) = genes colnames(labels ) = "Gene Set" labels[ rownames(labels) %in% toupper(gene_list),1] =1 roc <- neighbor_voting(labels, network, 3, output =op) # preds <- predictions(labels, network.sub) return(roc) } # networkMaker <- function(species,gene_list,flag1="prio"){ # genes from file are listed as ENSEMBL type - need to make sure to convert input genes to this format first. # get indices of genes in hdf5 file filename = paste0("/ftp/data/networks/",species,"_",flag1,"AggNet.hdf5") allGenes = h5read(filename, name= "row") genesInDB = intersect(gene_list , allGenes ) genesNotInDB = setdiff(gene_list, allGenes) ind = match(genesInDB , allGenes) net = h5read(filename, name = "agg" ,index=list(ind,ind)) rownames(net) = genesInDB colnames(net) = genesInDB # symmetric part removed - along with diagonal which is always 1.0 net[ lower.tri(net,diag=TRUE)] = 0 ind = which(net > 0, arr.ind = TRUE ) df = data.frame( from = rownames(net)[ind[,1]], to = colnames(net)[ind[,2]] , value = net[ind], stringsAsFactors=F ) h5closeAll() return ( list(df,net )) } gene_set_enrichment <- function(gene_list, species){ ## Perform gene set enrichment to look for over represented GO terms in the sample (gene_list) ## Sample should be preprocessed in function # Input: gene_list - User defined gene list to consider # GOA - ngCMatrices of species gene x GO # voc - Data frame containing global variable # Output: results - Data frame containing GOA = getGOA(species) genes.names = rownames(GOA) labels.names = colnames(GOA) genes.counts = rowSums(GOA) labels.counts = colSums(GOA) # p - gives number of genes in a GO group # look only at genes with known annotations m = match ( toupper(gene_list), toupper(genes.names) ) filt.genes = !is.na(m) filt.labels = m[filt.genes] labels.counts.set = rep( sum(filt.genes), length(labels.counts) ) # q - for broadcasting later. # of genes involved m = match (labels.names, voc[,1] ) v.f = !is.na(m) v.g = m[v.f] if( length(filt.labels) == 1 ) { genes.counts.set = GOA[filt.labels,] } else { genes.counts.set = colSums(GOA[filt.labels,]) } # Fisher's Exact test test = cbind( genes.counts.set , labels.counts, dim(GOA)[1]-labels.counts, labels.counts.set) # computes P[X > x] . Probabilities are given as log(p) pvals = phyper(test[,1]-1, test[,2], test[,3], test[,4], lower.tail=F) # FDR Correction pvals.adj = p.adjust( pvals, method="BH") sigs = pvals.adj < ( 0.05/length(pvals.adj) ) # Downloadable results = data.frame( GO_term = voc[v.g,1], description = as.character(voc[v.g,2]), N_sample = test[v.f,1], N_univ = test[v.f,2], pvals = pvals[v.f], adj_pvals = pvals.adj[v.f], sig = sigs[v.f], stringsAsFactors = FALSE ) # sort by pvals results=results[order(results$pvals), ] return ( remove.factors(results) ) } getMoreGenes <- function(gene_list, species , N=10, flag1="prio") { # gene list needs to be in the proper format. # Find genes not in current set of genes with coexpression greater than thrs for at least one gene in set # Input: # Output: new_genes temp = toupper(gene_list) # load the data filename = paste0("/ftp/data/networks/",species,"_", flag1, "AggNet.hdf5") # read in gene names allGenes = toupper(h5read(filename,name = "col") ) genesInSet = intersect(temp, allGenes) genesNotInSet = setdiff( allGenes,temp ) # if (length(genesInSet) == 0) { shinyalert("No input genes were found - unable to add any genes. ",type = "error",closeOnClickOutside= TRUE ) ; return(gene_list ) } ind1 = match(genesInSet , allGenes) # use these as the rows ind2 = match(genesNotInSet , allGenes) # use these as the cols # Get total node degree of genes not in set coexpMatrix = h5read(filename, name="agg" ,index = list(ind1 , 1:length(allGenes) ) ) coexpMatrix = coexpMatrix[,ind2] coexpMatrix = matrix(coexpMatrix,nrow = length(ind1), ncol=length(ind2) ) rownames(coexpMatrix ) = allGenes[ind1] colnames(coexpMatrix ) = allGenes[ind2] internalDeg = colSums(coexpMatrix) # totalDeg = read.delim(paste0("/ftp/data/nodeDegree/", species,"_", flag1,"Degree.csv") , sep = ",", stringsAsFactors=F) # rownames(totalDeg) = toupper(rownames(totalDeg)) # totalDeg = totalDeg[names(internalDeg),] ind = rank(-internalDeg ,ties.method="min") # low is good # internalDeg/totalDeg new_genes = allGenes[ind2][ind <= N] h5closeAll() return(toupper(c(gene_list, new_genes ))) } orthologMapper <- function(genes,speciesA, speciesB){ ## Maps genes in speciesA to orthologous genes in speciesB # Input: genes - genes of SpeciesA # speciesA - name of reference species # speciesB - name of target species # Output: temp - orthologous genes in speciesB # get gene info tables geneInfoA = read.delim(paste0("/ftp/data/geneInfo/",speciesA,"_geneInfo.tab"), stringsAsFactors=F, sep="\t") geneInfoB = read.delim(paste0("/ftp/data/geneInfo/",speciesB,"_geneInfo.tab"), stringsAsFactors=F, sep="\t") # look up the OrthogeneID for the first species reforthogenes = geneInfoA[geneInfoA$EnsemblID %in% intersect(genes , geneInfoA$EnsemblID ) , 4 ] reforthogenes = reforthogenes[! is.na(reforthogenes)] reforthogenes = reforthogenes[! duplicated(reforthogenes)] filename1 = paste0('/ftp/data/orthologData/',speciesA,'_',speciesB,'_ortho.csv') filename2 = paste0('/ftp/data/orthologData/',speciesB,'_',speciesA,'_ortho.csv') # find target ortholog genes matching to reforthogenes from ortholog matrix if(file.exists(filename1) ){ odata = read.delim(filename1,sep=',',stringsAsFactors=F) } else if(file.exists(filename2 )){ odata = read.delim(filename2,sep=',',stringsAsFactors=F) } if (length(intersect(as.vector(odata[,1]),reforthogenes))>0){ ind = match(reforthogenes,odata[,1]) tarorthogenes <- as.vector(odata[ ind,2]) } else if (length(intersect(as.vector(odata[,2]),reforthogenes))>0) { tarorthogenes <- as.vector(odata[match(reforthogenes,odata[,2]),1]) } else{ shinyalert('No ortholog genes not found',type="error" ,closeOnClickOutside=TRUE)} tarorthogenes = tarorthogenes[!is.na(tarorthogenes)] ind = match(tarorthogenes,geneInfoB$OrthoID) tarorthogenes = geneInfoB$EnsemblID[ind] tarorthogenes = tarorthogenes[!is.na( tarorthogenes)] tarorthogenes = tarorthogenes[!duplicated(tarorthogenes)] return(tarorthogenes) } GBA_driver <- function(network, labels , op = "AUROC" ,nfold = 3,label_min=10,label_max=1000) { # takes in network as full matrix # which input genes have an annotaton? rownames(network ) = toupper(rownames( network)) colnames(network ) = toupper(colnames( network)) rownames(labels) = toupper(rownames(labels)) colnames(labels) = toupper(colnames(labels)) m <- match(rownames(network),rownames(labels) ) f <- !is.na(m) g <- m[f] if (network[1,2] != network[2,1]){ network.sub <- network[f, f] + t(network[f,f]) diag(network.sub) = 1 } else {network.sub <- network[f, f]} # genes.labels = labels[g, sig_GOs] genes.labels <- filter_network_cols(labels, label_min, label_max) # filters the GO terms # genes.labels= genes.labels[g,] genes.labels = filter_network_cols(genes.labels[g,], 1, label_max ) roc.sub <- neighbor_voting(genes.labels, network.sub, nfold, output =op) return(roc.sub) } getCompleteNetwork <- function(species, flag1="prio"){ # store all of the co-expression data in a single matrix # WARNING: RAM intensive for some species filename = paste0("/ftp/data/networks/" ,species,"_",flag1, "AggNet.hdf5") agg = h5read(filename, name= "agg" ) col = h5read(filename, name= "col" ) row = h5read(filename, name= "row" ) rownames(agg) = row colnames(agg) = col return(agg) } ################################################################################################################################ ########################################################### Example ############################################################ ################################################################################################################################ # Example requires about 10 GB of RAM # possible species include # speciesList = c("human", "mouse", "arabidopsis", "boar", "chicken", "cow", "fruitfly", "maize", "rat", "rice", "roundworm", "soybean", "yeast", "zebrafish") speciesA = "yeast" speciesB = "roundworm" gene_listA = read.delim("/ftp/data/sample_input/top_yeastGenes.csv",stringsAsFactors=F, header=F)[,1] thresholdA = 0.99 thresholdB = 0.99 # convert these genes to ensembl gene_listA = getEnsemblID(speciesA, gene_listA )[[1]] # get more genes if desired # gene_listA = getMoreGenes(gene_listA, speciesA , N=100) # get corresponding ortholog genes. gene_listB = orthologMapper(gene_listA,speciesA, speciesB) # construct full networks for both ( no threshold ) netdatA = networkMaker(speciesA, gene_listA) netdatB = networkMaker(speciesB, gene_listB) # network in coordinate format - i , j , x netA_coo = netdatA[[1]] netB_coo = netdatB[[1]] # dense network as adjacency matrix (lower tri + diag removed) netA_full = netdatA[[2]] netB_full = netdatB[[2]] # apply threshold to get subnetworks subnetA = netA_coo[netA_coo[,3] > thresholdA ,] subnetB = netB_coo[netB_coo[,3] > thresholdB ,] # apply gene set enrichment on subnetwork subGeneListA = unique(c(subnetA[,1 ] , subnetA[,2] )) subGeneListB = unique(c(subnetB[,1 ] , subnetB[,2] )) gseA = gene_set_enrichment(subGeneListA, speciesA) gseB = gene_set_enrichment(subGeneListB, speciesB) # assess network quality using EGAD labelA = getGOA(speciesA) labelB = getGOA(speciesB) rocsA = GBA_driver(netA_full, labelA , op = "AUROC",nfold=3) # op can be "AUROC" or "PR" print(paste0("Average AUROC for ",speciesA, " is ", round(mean(rocsA[,1], na.rm = T),4 ) )) rocsB = GBA_driver(netB_full, labelB , op = "AUROC",nfold=3) print(paste0("Average AUROC for ",speciesB, " is ", round(mean(rocsB[,1], na.rm = T),4 ) )) # get gene set score using EGAD ############################## WARNING: RAM INTENSIVE FOR SOME SPECIES ############################## aggA = getCompleteNetwork(speciesA) aggB = getCompleteNetwork(speciesB) setScoreA = geneSetScore( speciesA, gene_listA ,op = "AUROC") setScoreB = geneSetScore( speciesB, gene_listB ,op = "AUROC") print(paste0("Gene set score for ",speciesA ," is ",round(setScoreA[,1],4 ))) print(paste0("Gene set score for ",speciesB ," is ",round(setScoreB[,1],4 ))) ######################################################################################################