library(org.Hs.eg.db) library(AnnotationDbi) library(GO.db) library(Matrix) #Run function to generate GO ontology tables. Versions are tied to current version of R generate_go_tables = function(){ go_table <- as.data.frame(org.Hs.egGO2ALLEGS) #gene mapping to the direct GO term and that term's children go_table$gene_symbol = mapIds(org.Hs.eg.db, go_table$gene_id, "SYMBOL","ENTREZID") #Add gene symbol go_genes = unique(go_table$gene_symbol) GO_descriptions = as.list(GOTERM) ordered_go = go_table[order(go_table$go_id), ] unique_go = ordered_go[!duplicated(ordered_go$go_id), ] unique_go = unique_go[ ,c('go_id','Evidence','Ontology')] go_descp = sapply(1:nrow(unique_go), function(i) Term(GO_descriptions[[unique_go$go_id[i]]])) unique_go$Evidence = go_descp colnames(unique_go) = c('go_id','description','Ontology') BP_GO = unique(go_table$go_id[go_table$Ontology== 'BP']) CC_GO = unique(go_table$go_id[go_table$Ontology== 'CC']) MF_GO = unique(go_table$go_id[go_table$Ontology== 'MF']) #Make 3 binary matrices for the GO ontologies, BP, CC, MF BP_GO_mat = sapply(1:length(BP_GO), function(i) as.numeric(go_genes %in% go_table$gene_symbol[go_table$go_id == BP_GO[i]] )) colnames(BP_GO_mat) = BP_GO rownames(BP_GO_mat) = go_genes per_term = colSums(BP_GO_mat) col_index = per_term >= 10 & per_term <= 1000 #Filter for terms between 10 and 1000 genes BP_GO_mat = BP_GO_mat[ ,col_index] CC_GO_mat = sapply(1:length(CC_GO), function(i) as.numeric(go_genes %in% go_table$gene_symbol[go_table$go_id == CC_GO[i]] )) colnames(CC_GO_mat) = CC_GO rownames(CC_GO_mat) = go_genes per_term = colSums(CC_GO_mat) col_index = per_term >= 10 & per_term <= 1000 CC_GO_mat = CC_GO_mat[ ,col_index] MF_GO_mat = sapply(1:length(MF_GO), function(i) as.numeric(go_genes %in% go_table$gene_symbol[go_table$go_id == MF_GO[i]] )) colnames(MF_GO_mat) = MF_GO rownames(MF_GO_mat) = go_genes per_term = colSums(MF_GO_mat) col_index = per_term >= 10 & per_term <= 1000 MF_GO_mat = MF_GO_mat[ ,col_index] save(unique_go, BP_GO_mat, CC_GO_mat, MF_GO_mat, file = '/home/werner/tools/GO_enrichment_human/go_tables.Rdata') } #Input a gene list and a GO term matrix, one of BP_GO_mat, CC_GO_mat, MF_GO_mat gene_set_enrichment = function(gene_list, GOA){ genes.names = rownames(GOA) labels.names = colnames(GOA) genes.counts = rowSums(GOA) labels.counts = colSums(GOA) # p - gives number of genes in a GO group # look only at genes with known annotations m = match ( toupper(gene_list), toupper(genes.names) ) filt.genes = !is.na(m) filt.labels = m[filt.genes] labels.counts.set = rep( sum(filt.genes), length(labels.counts) ) # q - for broadcasting later. # of genes involved m = match (labels.names, unique_go[,1] ) v.f = !is.na(m) v.g = m[v.f] if( length(filt.labels) == 1 ) { genes.counts.set = GOA[filt.labels,] }else { genes.counts.set = colSums(GOA[filt.labels,]) } # Fisher's Exact test test = cbind( genes.counts.set , labels.counts, dim(GOA)[1]-labels.counts, labels.counts.set) # computes P[X > x]. pvals = phyper(test[,1]-1, test[,2], test[,3], test[,4], lower.tail=F) # FDR Correction pvals.adj = p.adjust( pvals, method="BH") sigs = pvals.adj < ( 0.05/length(pvals.adj) ) results = data.frame( GO_term = as.character(unique_go[v.g,1]), description = as.character(unique_go[v.g,2]), N_sample = test[v.f,1], N_univ = test[v.f,2], pvals = pvals[v.f], adj_pvals = pvals.adj[v.f], sig = sigs[v.f], stringsAsFactors = FALSE ) # sort by pvals results=results[order(results$pvals), ] return(results) }