load("/home/werner/tools/GO_enrichment/voc.Rdata") library(rhdf5) library('Matrix') #library(taRifx) getGOA <- function(species){ filename = paste0("/home/werner/tools/GO_enrichment/",species,"_gene2go.hdf5" ) # read in gene2go matrices (hdf5) in COO format go = h5read(filename, name ="GO") genes = h5read(filename, name ="genes") ind = h5read(filename, name ="ind") h5closeAll() # store as a sparsematrix GO_annots = sparseMatrix(i=ind[,1] ,j= ind[,2] ,dimnames=list(toupper(genes),toupper(go))) csums = colSums( GO_annots) GO_annots = GO_annots[, (csums <=1000)&(csums>=10) ] return(GO_annots) } gene_set_enrichment <- function(gene_list, species){ ## Perform gene set enrichment to look for over represented GO terms in the sample (gene_list) ## Sample should be preprocessed in function # Input: gene_list - User defined gene list to consider # GOA - ngCMatrices of species gene x GO # voc - Data frame containing global variable # Output: results - Data frame containing GOA = getGOA(species) genes.names = rownames(GOA) labels.names = colnames(GOA) genes.counts = rowSums(GOA) labels.counts = colSums(GOA) # p - gives number of genes in a GO group # look only at genes with known annotations m = match ( toupper(gene_list), toupper(genes.names) ) filt.genes = !is.na(m) filt.labels = m[filt.genes] labels.counts.set = rep( sum(filt.genes), length(labels.counts) ) # q - for broadcasting later. # of genes involved m = match (labels.names, voc[,1] ) v.f = !is.na(m) v.g = m[v.f] if( length(filt.labels) == 1 ) { genes.counts.set = GOA[filt.labels,] } else { genes.counts.set = colSums(GOA[filt.labels,]) } # Fisher's Exact test test = cbind( genes.counts.set , labels.counts, dim(GOA)[1]-labels.counts, labels.counts.set) # computes P[X > x]. pvals = phyper(test[,1]-1, test[,2], test[,3], test[,4], lower.tail=F) # FDR Correction pvals.adj = p.adjust( pvals, method="BH") sigs = pvals.adj < ( 0.05/length(pvals.adj) ) results = data.frame( GO_term = as.character(voc[v.g,1]), description = as.character(voc[v.g,2]), N_sample = test[v.f,1], N_univ = test[v.f,2], pvals = pvals[v.f], adj_pvals = pvals.adj[v.f], sig = sigs[v.f], stringsAsFactors = FALSE ) # sort by pvals results=results[order(results$pvals), ] return ( results ) }