library(org.Hs.eg.db)
library(AnnotationDbi)
library(GO.db)
library(Matrix)

#Run function to generate GO ontology tables. Versions are tied to current version of R
generate_go_tables = function(){
  
  go_table <- as.data.frame(org.Hs.egGO2ALLEGS) #gene mapping to the direct GO term and that term's children
  go_table$gene_symbol = mapIds(org.Hs.eg.db, go_table$gene_id, "SYMBOL","ENTREZID") #Add gene symbol 
  
  go_genes = unique(go_table$gene_symbol)
  
  GO_descriptions = as.list(GOTERM)
  
  ordered_go = go_table[order(go_table$go_id), ]
  unique_go = ordered_go[!duplicated(ordered_go$go_id), ]
  unique_go = unique_go[ ,c('go_id','Evidence','Ontology')]
  
  go_descp = sapply(1:nrow(unique_go), function(i) Term(GO_descriptions[[unique_go$go_id[i]]]))
  unique_go$Evidence = go_descp
  colnames(unique_go) = c('go_id','description','Ontology')
  
  BP_GO = unique(go_table$go_id[go_table$Ontology== 'BP'])
  CC_GO = unique(go_table$go_id[go_table$Ontology== 'CC'])
  MF_GO = unique(go_table$go_id[go_table$Ontology== 'MF'])
  #Make 3 binary matrices for the GO ontologies, BP, CC, MF
  
  BP_GO_mat = sapply(1:length(BP_GO), function(i) as.numeric(go_genes %in% go_table$gene_symbol[go_table$go_id == BP_GO[i]] ))
  colnames(BP_GO_mat) = BP_GO
  rownames(BP_GO_mat) = go_genes
  per_term = colSums(BP_GO_mat) 
  col_index = per_term >= 10 & per_term <= 1000  #Filter for terms between 10 and 1000 genes
  BP_GO_mat = BP_GO_mat[ ,col_index]
  
  CC_GO_mat = sapply(1:length(CC_GO), function(i) as.numeric(go_genes %in% go_table$gene_symbol[go_table$go_id == CC_GO[i]] ))
  colnames(CC_GO_mat) = CC_GO
  rownames(CC_GO_mat) = go_genes
  per_term = colSums(CC_GO_mat) 
  col_index = per_term >= 10 & per_term <= 1000
  CC_GO_mat = CC_GO_mat[ ,col_index]
  
  MF_GO_mat = sapply(1:length(MF_GO), function(i) as.numeric(go_genes %in% go_table$gene_symbol[go_table$go_id == MF_GO[i]] ))
  colnames(MF_GO_mat) = MF_GO
  rownames(MF_GO_mat) = go_genes
  per_term = colSums(MF_GO_mat) 
  col_index = per_term >= 10 & per_term <= 1000
  MF_GO_mat = MF_GO_mat[ ,col_index]
  
  save(unique_go, BP_GO_mat, CC_GO_mat, MF_GO_mat, file = '/home/werner/tools/GO_enrichment_human/go_tables.Rdata')
  
}


#Input a gene list and a GO term matrix, one of BP_GO_mat, CC_GO_mat, MF_GO_mat
gene_set_enrichment = function(gene_list, GOA){
  
  genes.names     = rownames(GOA)
  labels.names    = colnames(GOA)
  genes.counts    = rowSums(GOA)
  labels.counts   = colSums(GOA)      # p - gives number of genes in a GO group
  
  # look only at genes with known annotations
  m               = match ( toupper(gene_list), toupper(genes.names) )
  filt.genes      = !is.na(m)
  filt.labels     = m[filt.genes]
  
  
  labels.counts.set   = rep( sum(filt.genes), length(labels.counts) )  # q - for broadcasting later.  # of genes involved 
  m               = match (labels.names,  unique_go[,1] )
  v.f             = !is.na(m)
  v.g             = m[v.f]
  
  if(  length(filt.labels) == 1 )  { genes.counts.set = GOA[filt.labels,] 
  }else { genes.counts.set = colSums(GOA[filt.labels,]) }
  # Fisher's Exact test
  test =  cbind( genes.counts.set  , labels.counts, dim(GOA)[1]-labels.counts, labels.counts.set)
  # computes P[X > x].
  pvals = phyper(test[,1]-1, test[,2], test[,3], test[,4], lower.tail=F)
  # FDR Correction
  pvals.adj = p.adjust( pvals, method="BH")
  sigs = pvals.adj < ( 0.05/length(pvals.adj) )
  
  results = data.frame(   GO_term     = as.character(unique_go[v.g,1]),
                          description = as.character(unique_go[v.g,2]),
                          N_sample    = test[v.f,1],
                          N_univ      = test[v.f,2],
                          pvals       = pvals[v.f],
                          adj_pvals   = pvals.adj[v.f],
                          sig         = sigs[v.f],
                          stringsAsFactors = FALSE
  )
  
  # sort by pvals 
  results=results[order(results$pvals), ]
  return(results)
}