def generate_contigency_tables(go_annotations, gene_list, use_background_genes=False, num_background_genes=0):
    """Build one 2x2 contingency table per GO term for a Fisher exact test.

    Each table is laid out as::

        [[annotated, not in gene_list,   annotated, in gene_list],
         [unannotated, not in gene_list, unannotated, in gene_list]]

    Args:
        go_annotations (pandas.DataFrame): Binary matrix with genes as the
            index and GO terms as the columns.
        gene_list (list): Genes of interest. Genes that are absent from
            ``go_annotations.index`` are silently dropped.
        use_background_genes (bool): If true, ``num_background_genes`` is used
            as the size of the background instead of the number of rows in
            ``go_annotations``.
        num_background_genes (int): Background size; only read when
            ``use_background_genes`` is true.

    Returns:
        list: One ``[[a, b], [c, d]]`` table per column of ``go_annotations``,
        in column order.
    """
    # Only genes that actually have a row in the matrix can be looked up.
    genes_in_matrix = list(set(go_annotations.index) & set(gene_list))

    if use_background_genes:
        background_size = num_background_genes
    else:
        background_size = go_annotations.shape[0]

    # Work on plain arrays in column order: integer keys on a label-indexed
    # Series are ambiguous (label vs. position) and fail for integer column
    # labels or on recent pandas versions.
    background_annotated = go_annotations.sum(axis=0).to_numpy()
    background_unannotated = background_size - background_annotated

    subset = go_annotations.loc[genes_in_matrix]
    subset_annotated = subset.sum(axis=0).to_numpy()
    subset_unannotated = subset.shape[0] - subset_annotated

    return [
        [[bg_with - sub_with, sub_with], [bg_without - sub_without, sub_without]]
        for bg_with, bg_without, sub_with, sub_without in zip(
            background_annotated,
            background_unannotated,
            subset_annotated,
            subset_unannotated,
        )
    ]


def fishers_exact_on_contigency_tables(all_go_contigency_tables, original_GO_term_panda, return_odds_ratio=False):
    """Run an uncorrected Fisher exact test on every contingency table.

    The test is one-sided with ``alternative='less'``. With the table layout
    produced by ``generate_contigency_tables`` (annotated genes inside the gene
    list sit in the top-right cell), an odds ratio below 1 corresponds to the
    GO term being over-represented in the gene list, so 'less' is the
    enrichment test. For the same reason the odds ratios returned here are
    below 1 for enriched terms.

    Args:
        all_go_contigency_tables (list): Tables as returned by
            ``generate_contigency_tables``.
        original_GO_term_panda (pandas.DataFrame): The annotation matrix the
            tables were built from; its columns become the result index.
        return_odds_ratio (bool): If true, also return the odds ratios.

    Returns:
        pandas.DataFrame: Indexed by GO term, with a ``'P_value'`` column and,
        when ``return_odds_ratio`` is true, an ``'Odds Ratios'`` column.
    """
    import scipy.stats as stats
    import pandas as pd

    p_values = []
    odds_ratios = []
    for table in all_go_contigency_tables:
        odds, p_value = stats.fisher_exact(table=table, alternative='less')
        p_values.append(p_value)
        odds_ratios.append(odds)

    go_terms = original_GO_term_panda.columns
    if return_odds_ratio:
        return pd.DataFrame(
            data=list(zip(p_values, odds_ratios)),
            index=go_terms,
            columns=['P_value', 'Odds Ratios'],
        )
    return pd.DataFrame(data=p_values, index=go_terms, columns=['P_value'])


def _fdr_correct_p_values(go_terms_with_P_value, include_rejected, **fdr_kwargs):
    """Apply statsmodels' fdrcorrection to the 'P_value' column of a frame."""
    import statsmodels.stats.multitest
    import pandas as pd

    rejected, corrected_pvalues = statsmodels.stats.multitest.fdrcorrection(
        pvals=go_terms_with_P_value['P_value'].to_list(), **fdr_kwargs
    )
    go_enrichment_results = pd.DataFrame(
        corrected_pvalues.tolist(),
        index=go_terms_with_P_value.index,
        columns=['P_value'],
    )
    if include_rejected:
        return rejected, go_enrichment_results
    return go_enrichment_results


def multi_hypothesis_correct_fishers_exact(go_terms_with_P_value, alpha=0.05, method='indep', is_sorted=False, panda_has_odds_ratio=False, Include_rejected_boolean_array=False):
    """Correct Fisher exact p-values for multiple testing (FDR).

    Thin wrapper around ``statsmodels.stats.multitest.fdrcorrection``.

    Args:
        go_terms_with_P_value (pandas.DataFrame): Frame with a ``'P_value'``
            column, e.g. from ``fishers_exact_on_contigency_tables``.
        alpha (float): False discovery rate level used for the rejection
            decision. Defaults to 0.05.
        method (str): ``'i'``, ``'indep'``, ``'p'``, ``'poscorr'`` select
            Benjamini/Hochberg; ``'n'``, ``'negcorr'`` select
            Benjamini/Yekutieli. Defaults to ``'indep'``.
        is_sorted (bool): If False (default), the p-values are sorted
            internally and the corrected values are returned in the original
            order. If True, the p-values are assumed to already be in
            ascending order.
        panda_has_odds_ratio (bool): Kept for backward compatibility. Only the
            ``'P_value'`` column is ever read, so an ``'Odds Ratios'`` column
            is ignored whether or not this is set.
        Include_rejected_boolean_array (bool): If true, return
            ``(rejected, frame)`` instead of just the frame.

    Returns:
        pandas.DataFrame or tuple: Frame of corrected p-values in a
        ``'P_value'`` column, indexed like the input; optionally preceded by
        the boolean rejection array.
    """
    return _fdr_correct_p_values(
        go_terms_with_P_value,
        Include_rejected_boolean_array,
        alpha=alpha,
        method=method,
        is_sorted=is_sorted,
    )


def test_using_kwargs(go_terms_with_P_value, panda_has_odds_ratio=False, Include_rejected_boolean_array=False, **kwargs):
    """Correct Fisher exact p-values for multiple testing, forwarding kwargs.

    Same as ``multi_hypothesis_correct_fishers_exact`` but every extra keyword
    argument is passed straight to
    ``statsmodels.stats.multitest.fdrcorrection`` (``alpha``, ``method``,
    ``is_sorted``).

    Args:
        go_terms_with_P_value (pandas.DataFrame): Frame with a ``'P_value'``
            column, e.g. from ``fishers_exact_on_contigency_tables``.
        panda_has_odds_ratio (bool): Kept for backward compatibility. Only the
            ``'P_value'`` column is ever read, so an ``'Odds Ratios'`` column
            is ignored whether or not this is set.
        Include_rejected_boolean_array (bool): If true, return
            ``(rejected, frame)`` instead of just the frame.
        **kwargs: Forwarded to ``fdrcorrection``.

    Returns:
        pandas.DataFrame or tuple: Frame of corrected p-values in a
        ``'P_value'`` column, indexed like the input; optionally preceded by
        the boolean rejection array.
    """
    return _fdr_correct_p_values(
        go_terms_with_P_value, Include_rejected_boolean_array, **kwargs
    )


# The name starts with "test_", so pytest would otherwise try to collect this
# library function as a test case and fail looking for fixtures.
test_using_kwargs.__test__ = False