import bottleneck import scipy.stats as stats import numpy as np import pandas as pd from calculate_auc import calc_auroc from sklearn.model_selection import KFold from sklearn.neural_network import MLPClassifier def ml_hic_multi(resolution_list = [], hi_c_percentile = 90, chrm = 10, network_type=""): resolution = resolution_list[0] df_2_or = pd.read_hdf('/data/lohia/gene_distance_expresseion/dist_files/norm_dist_files/combined_dist_%s_rao.h5' %(resolution)) exp_upper = np.percentile(df_2_or[['exp']], 90) exp_lower = np.percentile(df_2_or[['exp']], 50) adj_tab = pd.read_hdf('/data/lohia/gene_distance_expresseion/dist_files/norm_dist_files/network_files_v2/network_%s_chr%s_resolution%s_hic_percentile%s_rao.h5' %(network_type, chrm, resolution, hi_c_percentile)) adj_tab['exp_label'] = [1 if x>exp_upper else 0 if x < exp_lower else np.nan for x in adj_tab['exp']] adj_tab = adj_tab[adj_tab['Gene stable ID_x'] != adj_tab['Gene stable ID_y']] # remove all the self pairs from each set adj_tab.dropna(inplace=True) adj_tab['pair'] = [sorted((x,y))[0] + sorted((x,y))[1] for x,y in zip(adj_tab['Gene stable ID_y'], adj_tab['Gene stable ID_x'])] #adj_tab = adj_tab[adj_tab['Gene stable ID_x'] != adj_tab['Gene stable ID_y']] # remove all the self pairs from each set adj_tab.set_index('pair', inplace=True) adj_tab = adj_tab[['exp_label']] adj_tab.replace([np.inf], 100000000000, inplace=True) adj_tab.replace([-np.inf], -100000000000, inplace=True) #print (adj_tab.isna().sum()) adj_tab.replace([np.nan], 0, inplace=True) for resolution in resolution_list: adj_tab_2 = pd.read_hdf('/data/lohia/gene_distance_expresseion/dist_files/norm_dist_files/network_files_v2/network_%s_chr%s_resolution%s_hic_percentile%s_rao.h5' %(network_type, chrm, resolution, hi_c_percentile)) adj_tab_2['pair'] = [sorted((x,y))[0] + sorted((x,y))[1] for x,y in zip(adj_tab_2['Gene stable ID_y'], adj_tab_2['Gene stable ID_x'])] adj_tab_2.set_index('pair', inplace=True) adj_tab_2.replace([np.inf], 100000000000, inplace=True) adj_tab_2.replace([-np.inf], -100000000000, inplace=True) adj_tab_2.replace([np.nan], 0, inplace=True) adj_tab = adj_tab.join(adj_tab_2[['jaccard_coefficient%s' %resolution , 'shortest_path%s' %resolution , 'clustering_avg_%s' %resolution , 'clustering_diff_%s' %resolution , 'closeness_centrality_avg_%s' %resolution , 'closeness_centrality_diff_%s' %resolution , 'betweenness_centrality_avg_%s' %resolution , 'betweenness_centrality_diff_%s' %resolution ]], how='inner') adj_tab_x = adj_tab.drop(columns=['exp_label']) X = adj_tab_x.to_numpy() Y = adj_tab['exp_label'].to_numpy() kf = KFold(n_splits=10, shuffle=True,random_state=1) hidden_layer_sizes = 8 * len(resolution_list) clf = MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1, activation='logistic', hidden_layer_sizes=(hidden_layer_sizes,)) predicted = [] label = [] accuracy = [] for train_indices, test_indices in kf.split(X): print ("here") clf.fit(X[train_indices], Y[train_indices].ravel()) predicted.append(clf.predict_proba(X[test_indices])[:,1].tolist()) label.append(Y[test_indices].ravel().tolist()) accuracy.append(clf.score(X[test_indices], Y[test_indices].ravel())) auroc = [] for predicted_score, true_pos in zip(predicted, label): df = pd.DataFrame(predicted_score,columns=['predicted_score']) df['true_pos'] = true_pos df['true_neg'] = [0 if x==1 else 1 for x in df['true_pos']] auroc.append(calc_auroc(df,predicted_score='predicted_score')) #mean = sum(test_list) / len(test_list) #variance = sum([((x - mean) ** 2) for x in test_list]) / len(test_list) #res = variance ** 0.5 p_val = sum(auroc) / len(auroc) l_g1 = sum(accuracy) / len(accuracy) return (p_val, l_g1) if __name__ == '__main__': file1 = open("/data/lohia/gene_distance_expresseion/dist_files/RNN_multi_resolution_rao.txt","w") chr_list = list(range(1,21)) + ['X'] #chr_list = list(range(3,5)) #chr_list = [11,1,'X'] #for resolution in [50]: #for resolution in [50, 100, 5, 10, 250, 500,1000]: #resolution_list = [50, 100, 5, 10, 250, 500,1000, 1] #resolution_list = [50, 100, 5, 10, 250, 500,1000, 1] resolution_list = [10, 50, 500] #resolution_list = [50, 100, 5, 10] for network_type in ['VC_lib_tss', 'VC_tss']: for chrm in chr_list: p_val, l_g1, l_g2 = ml_hic_multi(resolution_list = resolution_list, hi_c_percentile = 90, chrm = chrm, network_type="") file1.write("%s\t" %chrm) file1.write("%s\t" %p_val) file1.write("%s\t" %l_g1) file1.write("%s\n" %network_type) file1.close()