# Standard library
import gc
import os
import os.path

# Third-party
from hicmatrix import HiCMatrix as hm
from hicmatrix.lib import MatrixFileHandler
import numpy as np
import pandas as pd
from scipy import stats, sparse
import bottleneck
from scipy.stats import mannwhitneyu
import h5py
import matplotlib.pyplot as plt
from pyranges import read_gtf
from umap import UMAP
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import scipy.spatial as sp
import scipy.cluster.hierarchy as hc
import seaborn as sns
import anndata as ad
import scanpy as sc

# Data layout (NumPy arrays wrapped in pandas DataFrames):
#   gene_observation_matrix: genes x observations, may contain NaN
#   gene_category_matrix:    genes x categories, 0/1 membership indicators
# Both DataFrames are indexed by gene identifier.


def calculate_category_means_ignore_nan(gene_observation_matrix_df, gene_category_matrix_df):
    """Mean of each observation over the member genes of each category, ignoring NaNs.

    Rows of the two DataFrames are first aligned on their intersection of
    gene identifiers, so each input may contain genes the other lacks.

    Parameters
    ----------
    gene_observation_matrix_df : pd.DataFrame
        genes x observations values; NaN marks a missing measurement.
    gene_category_matrix_df : pd.DataFrame
        genes x categories 0/1 membership matrix.

    Returns
    -------
    (category_means, category_counts) : tuple of np.ndarray
        Both are categories x observations. ``category_counts`` holds the
        number of non-NaN member-gene values behind each mean.
        Entries of ``category_means`` are NaN where a category has no
        non-NaN member genes for that observation (0/0).
    """
    genes_intersect = gene_observation_matrix_df.index.intersection(gene_category_matrix_df.index)
    gene_observation_matrix_df = gene_observation_matrix_df.loc[genes_intersect, :]
    gene_category_matrix_df = gene_category_matrix_df.loc[genes_intersect, :]

    gene_observation_matrix = gene_observation_matrix_df.values
    gene_category_matrix = gene_category_matrix_df.values

    # Per (category, observation) count of non-NaN member-gene values.
    category_counts = np.dot(gene_category_matrix.T, ~np.isnan(gene_observation_matrix))
    # Zero-fill NaNs so they contribute nothing to the sums.
    gene_observation_matrix_cleaned = np.nan_to_num(gene_observation_matrix, nan=0.0)
    category_sums = np.dot(gene_category_matrix.T, gene_observation_matrix_cleaned)

    # Mean over the non-NaN values only. Empty categories divide 0/0 and
    # deliberately stay NaN (so "no data" is distinguishable from a true
    # zero mean); suppress the otherwise noisy RuntimeWarning.
    with np.errstate(invalid='ignore', divide='ignore'):
        category_means = category_sums / category_counts

    return category_means, category_counts
def calc_marker_matrix(marker_list, gene_list, no_of_markers=10, drop_duplicates=True, shuffle_labels=False):
    """Build a genes x cell_types 0/1 marker-membership table.

    Parameters
    ----------
    marker_list : pd.DataFrame
        Marker table with columns 'Ensembl_gene_identifier', 'gene',
        'cell_type', 'rank', 'recurrence', 'n_datasets', 'auroc'.
    gene_list : list
        Ensembl gene identifiers present in the expression data; markers
        outside this list are discarded.
    no_of_markers : int
        Number of top-ranked markers kept per cell type (re-ranked after
        missing genes are dropped).
    drop_duplicates : bool or "one_copy"
        True  -> drop every gene claimed by more than one cell type;
        "one_copy" -> keep the single best-ranked copy of such genes;
        anything else -> keep all copies.
    shuffle_labels : bool
        If True, permute gene identifiers (seeded) as a negative control.

    Returns
    -------
    pd.DataFrame
        Pivot table indexed by gene identifier with one 0/1 column per
        cell type. Genes removed by deduplication remain as all-zero rows.
    """
    # Keep only markers whose gene exists in the expression data, then
    # collapse duplicate (cell_type, gene) pairs keeping the best rank.
    marker_list = marker_list[marker_list['Ensembl_gene_identifier'].isin(gene_list)]
    marker_list = marker_list.sort_values(by=['rank']).drop_duplicates(subset=['cell_type', 'gene'])
    all_cell_types = marker_list.drop_duplicates(['cell_type'])['cell_type'].tolist()

    # Re-rank within each cell type (1..n) now that missing genes are gone,
    # then keep the top no_of_markers per cell type.
    reranked = []
    for cell_type in all_cell_types:
        m_tmp = marker_list[marker_list['cell_type'] == cell_type].sort_values(by=['rank'])
        m_tmp['rank_missing_gene'] = [i + 1 for i in range(m_tmp.shape[0])]
        reranked.append(m_tmp)
    marker_list = pd.concat(reranked)
    marker_list = marker_list[marker_list['rank_missing_gene'] <= no_of_markers]

    # Inverse recurrence fraction: lower means more recurrent, so it sorts
    # in the same direction as 'rank'.
    # NOTE(review): recurrence == 0 would produce inf here — assumed not to
    # occur in the marker tables; confirm against the input CSVs.
    marker_list['%recurrence'] = marker_list['recurrence'] / marker_list['n_datasets']
    marker_list['%recurrence'] = 1 / marker_list['%recurrence']

    # Snapshot before deduplication; re-appended with rank 0 below so the
    # final pivot keeps every selected gene in its index.
    marker_list_copy = marker_list.copy()

    if drop_duplicates == True:
        # Remove every gene that appears under more than one cell type.
        marker_list = marker_list.sort_values(
            by=['rank', '%recurrence', 'auroc', 'rank_missing_gene']
        ).drop_duplicates(subset=['Ensembl_gene_identifier'], keep=False)
    elif drop_duplicates == "one_copy":
        # Keep only the best-sorted copy of each duplicated gene.
        marker_list = marker_list.sort_values(
            by=['rank', '%recurrence', 'auroc', 'rank_missing_gene']
        ).drop_duplicates(subset=['Ensembl_gene_identifier'])

    if shuffle_labels == True:
        # Seeded permutation of gene identifiers (negative control).
        shuffled_ids = marker_list['Ensembl_gene_identifier'].sample(
            frac=1, replace=False, random_state=42).reset_index(drop=True)
        marker_list['Ensembl_gene_identifier'] = shuffled_ids.tolist()

    # Regroup rows by cell type (stable order of all_cell_types).
    marker_list = pd.concat(
        [marker_list[marker_list['cell_type'] == cell_type] for cell_type in all_cell_types])

    # Append the pre-dedup snapshot with rank 0: genes removed by the
    # deduplication step survive as all-zero rows in the pivot, keeping the
    # gene index identical across drop_duplicates settings.
    marker_list_copy['rank'] = 0
    marker_list = pd.concat([marker_list, marker_list_copy])

    marker_table = marker_list.pivot_table(index='Ensembl_gene_identifier',
                                           columns='cell_type', values='rank', aggfunc='sum')
    marker_table.fillna(0, inplace=True)
    marker_table[marker_table != 0] = 1
    return marker_table


# For each marker list and input h5ad file, one output h5ad file is produced.
def calc_mean_profile(marker_name, base_folder, species, input_file_h5ad, outputfile_h5ad):
    """Compute per-cell-type mean profiles for a sweep of marker-set sizes.

    Reads ``{base_folder}/hi_c_data_processing/{species}_{marker_name}.csv``
    and the input AnnData (genes as obs, observations as vars — its
    obs_names are matched against the marker genes). For every combination
    of marker count, deduplication mode, label shuffling and input layer,
    stores the category means/counts and their column-wise z-scores as
    layers of a cell_types x observations AnnData written to
    ``outputfile_h5ad``.
    """
    marker_list = pd.read_csv(f'{base_folder}/hi_c_data_processing/{species}_{marker_name}.csv')
    cell_type_order = marker_list['cell_type'].drop_duplicates().tolist()

    adata = ad.read_h5ad(f'{input_file_h5ad}')
    no_of_cols = adata.X.shape[1]
    no_of_rows = len(cell_type_order)
    adata_l = ad.AnnData(np.zeros((no_of_rows, no_of_cols)))
    layer_names = adata.layers.keys()

    # Roughly geometric grid of marker-set sizes from 1 to 500.
    no_of_markers_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 19,
                          21, 23, 26, 29, 33, 37, 41, 46, 52, 58, 65, 73, 82, 92,
                          103, 115, 129, 144, 162, 181, 202, 227, 254, 284, 318,
                          356, 399, 446, 500]
    for no_of_markers in no_of_markers_list:
        for drop_duplicates in [False, True]:
            for shuf in [True, False]:
                for given_layer in layer_names:
                    gene_list = adata.obs_names.tolist()
                    gene_observation_matrix_dataframe = adata.to_df(layer=given_layer)
                    gene_category_dataframe = calc_marker_matrix(
                        marker_list, gene_list,
                        no_of_markers=no_of_markers,
                        drop_duplicates=drop_duplicates,
                        shuffle_labels=shuf)
                    # Fix the column order so every stored layer is aligned.
                    gene_category_dataframe = gene_category_dataframe.loc[:, cell_type_order]
                    category_means, category_counts = calculate_category_means_ignore_nan(
                        gene_observation_matrix_dataframe, gene_category_dataframe)

                    prefix = f'{given_layer}_{no_of_markers}_{drop_duplicates}_{shuf}'
                    adata_l.layers[f'{prefix}_mean'] = category_means
                    adata_l.layers[f'{prefix}_counts'] = category_counts
                    # z-score across cell types (axis 0), NaN-tolerant.
                    adata_l.layers[f'{prefix}_mean_scaled'] = stats.zscore(
                        category_means, axis=0, nan_policy='omit')
                    adata_l.layers[f'{prefix}_counts_scaled'] = stats.zscore(
                        category_counts, axis=0, nan_policy='omit')

    # Label axes once and write a single compressed output file.
    adata_l.obs_names = cell_type_order
    adata_l.var_names = adata.var_names.tolist()
    adata_l.write(outputfile_h5ad, compression="gzip")


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--marker_name', default=False, help='path or false')
    parser.add_argument('--base_folder', default=False, help='path or false')
    parser.add_argument('--species', default=False, help='path or false')
    parser.add_argument('--input_file_h5ad', default=False, help='path or false')
    parser.add_argument('--outputfile_h5ad', default=False, help='path or false')
    args = parser.parse_args()
    calc_mean_profile(args.marker_name, args.base_folder, args.species,
                      args.input_file_h5ad, args.outputfile_h5ad)