# go enrichment on top terms in each group # what cross-species patterns? # upset plots, distributions of scores?? library(Matrix) getGOA <- function(species){ filename = paste0("/data/CoCoCoNet/gene2go/",species,"_gene2go.csv") golist = read.delim(filename, sep = ' ', header = T, stringsAsFactors = F) # read in gene2go list go = unique(golist$GO_term) genes = unique(golist$NetworkIDs) ind = cbind(match(golist$NetworkIDs, genes), match(golist$GO_term, go)) # store as a sparsematrix GO_annots = sparseMatrix(i=ind[,1] ,j=ind[,2] ,dimnames=list(toupper(genes),toupper(go))) csums = colSums(GO_annots) GO_annots = GO_annots[, (csums <=1000)&(csums>=10) ] return(GO_annots) } gene_set_enrichment <- function(gene_list, species){ ## Perform gene set enrichment to look for over represented GO terms in the sample (gene_list) ## Sample should be preprocessed in function # Input: gene_list - User defined gene list to consider # GOA - ngCMatrices of species gene x GO # voc - Data frame containing global variable # Output: results - Data frame containing GOA = getGOA(species) load('/data/johlee/CoCoCoNet/voc.Rdata') genes.names = rownames(GOA) labels.names = colnames(GOA) genes.counts = rowSums(GOA) labels.counts = colSums(GOA) # p - gives number of genes in a GO group # look only at genes with known annotations m = match ( toupper(gene_list), toupper(genes.names) ) filt.genes = !is.na(m) filt.labels = m[filt.genes] labels.counts.set = rep( sum(filt.genes), length(labels.counts) ) # q - for broadcasting later. # of genes involved m = match (labels.names, voc[,1] ) v.f = !is.na(m) v.g = m[v.f] if( length(filt.labels) == 1 ) { genes.counts.set = GOA[filt.labels,] } else { genes.counts.set = colSums(GOA[filt.labels,]) } # Fisher's Exact test test = cbind( genes.counts.set , labels.counts, dim(GOA)[1]-labels.counts, labels.counts.set) # computes P[X > x] . Probabilities are given as log(p) pvals = phyper(test[,1]-1, test[,2], test[,3], test[,4], lower.tail=F) # FDR Correction pvals.adj = p.adjust( pvals, method="BH") sigs = pvals.adj < ( 0.05/length(pvals.adj) ) # Downloadable results = data.frame( GO_term = voc[v.g,1], description = as.character(voc[v.g,2]), N_sample = test[v.f,1], N_univ = test[v.f,2], pvals = pvals[v.f], adj_pvals = pvals.adj[v.f], sig = sigs[v.f], stringsAsFactors = FALSE ) results = results[order(results$pvals), ] return(results) }