#!/grid/gillis/home/nfox/software/miniconda3/bin/python
"""Edit Hi-C database digests.csv and samples.csv metadata files.

The Hi-C database pipeline stores restriction enzyme and source tissue
data in the "digests.csv" and "samples.csv" files respectively. Mostly
these are manually edited, but occasionally a complex series of edits
is necessary that is too large to do by hand. This is an editable
skeleton program for making those programmatic edits.
"""

import os

import pandas as pd

# Globals
METADATA_PATH = '/grid/gillis/data/nfox/hi_c_data_processing/metadata/manual_accession_files'
PATH = '/grid/gillis/data/nfox/hi_c_data_processing/metadata'
# Enzyme name -> digest file; populated by get_digest_files().
DIGEST_FILES = {}


def get_digest_files(species):
    """Fill the global DIGEST_FILES with digest file information.

    Args:
        species: str. Either "human" or "mouse".

    Raises:
        ValueError: If species is not one of the supported values.
    """
    global DIGEST_FILES
    if species == 'human':
        # NOTE(review): the BamHI and MspI entries are bare file names,
        # unlike every other entry, which is an absolute path. Confirm
        # whether the directory prefix is missing.
        DIGEST_FILES = {
            'AluI': '/grid/gillis/data/nfox/genomes/homo_sapiens_38/hicup_digests/Digest_hg38_UCSC_AluI_None_15-47-24_01-10-2020.txt',
            'BamHI': 'Digest_hg38_UCSC_BamHI_None_14-08-09_25-03-2021.txt',
            'BglII': '/grid/gillis/data/nfox/genomes/homo_sapiens_38/hicup_digests/Digest_hg38_UCSC_BglII_None_15-46-47_01-10-2020.txt',
            'DpnII_Arima': '/grid/gillis/data/nfox/genomes/homo_sapiens_38/hicup_digests/Digest_hg38_UCSC_DpnII_Arima_None_15-53-10_01-10-2020.txt',
            'DpnII': '/grid/gillis/data/nfox/genomes/homo_sapiens_38/hicup_digests/Digest_hg38_UCSC_DpnII_None_15-49-33_01-10-2020.txt',
            'HindIII': '/grid/gillis/data/nfox/genomes/homo_sapiens_38/hicup_digests/Digest_hg38_UCSC_HindIII_None_15-48-57_01-10-2020.txt',
            'MboI': '/grid/gillis/data/nfox/genomes/homo_sapiens_38/hicup_digests/Digest_hg38_UCSC_MboI_None_15-50-34_01-10-2020.txt',
            'MspI': 'Digest_hg38_UCSC_MspI_None_14-26-52_15-06-2021.txt',
            'NcoI': '/grid/gillis/data/nfox/genomes/homo_sapiens_38/hicup_digests/Digest_hg38_UCSC_NcoI_None_15-46-09_01-10-2020.txt',
            'NlaIII': '/grid/gillis/data/nfox/genomes/homo_sapiens_38/hicup_digests/Digest_hg38_UCSC_NlaIII_None_15-51-36_01-10-2020.txt'
        }
    elif species == 'mouse':
        DIGEST_FILES = {
            'AluI': '/grid/gillis/data/nfox/genomes/mus_musculus_10/hicup_digests/Digest_mm10_AluI_None_10-49-06_12-02-2021.txt',
            'BglII': '/grid/gillis/data/nfox/genomes/mus_musculus_10/hicup_digests/Digest_mm10_BglII_None_10-48-34_12-02-2021.txt',
            'DpnII_Arima': '/grid/gillis/data/nfox/genomes/mus_musculus_10/hicup_digests/Digest_mm10_DpnII_Arima_None_12-43-28_12-02-2021.txt',
            'DpnII': '/grid/gillis/data/nfox/genomes/mus_musculus_10/hicup_digests/Digest_mm10_DpnII_None_10-51-03_12-02-2021.txt',
            'HindIII': '/grid/gillis/data/nfox/genomes/mus_musculus_10/hicup_digests/Digest_mm10_HindIII_None_10-50-29_12-02-2021.txt',
            'MboI': '/grid/gillis/data/nfox/genomes/mus_musculus_10/hicup_digests/Digest_mm10_MboI_None_10-51-59_12-02-2021.txt',
            'NcoI': '/grid/gillis/data/nfox/genomes/mus_musculus_10/hicup_digests/Digest_mm10_NcoI_None_10-48-03_12-02-2021.txt',
            'NlaIII': '/grid/gillis/data/nfox/genomes/mus_musculus_10/hicup_digests/Digest_mm10_NlaIII_None_10-52-52_12-02-2021.txt'
        }
    else:
        raise ValueError(f'species "{species}" is not a valid species')


def get_runs(metadata):
    """Get the Runs for which to edit digests or samples.

    The region between the EDIT HERE markers is meant to be customized
    for each use case.

    Args:
        metadata: str. "digests" or "samples". Only the first letter is
            inspected (case-insensitively) to choose which mapping to
            build.

    Returns:
        A dict mapping each Run to its digest file (for "digests") or to
        its sample description (for "samples"), or None when the
        requested kind of edit is disabled in the EDIT HERE region.

    Raises:
        ValueError: If metadata isn't one of the two valid inputs.
    """
    # Slice rather than index so an empty string reaches the ValueError
    # below instead of raising IndexError.
    kind = metadata[:1].lower()
    if kind not in {'d', 's'}:
        raise ValueError('metadata arg must be digests or samples')

    # EDIT HERE >>>
    df = pd.read_csv(f'{METADATA_PATH}/SRP292639_MboI_Runs.csv', header=0)
    runs = df.Run
    # Enzyme whose digest file is assigned to every Run. Set to None to
    # skip digest edits.
    enzyme = 'MboI'
    # Series of sample descriptions indexed by Run. Leave as None to
    # skip sample edits.
    sample_texts = None
    # sample_texts = df.apply(lambda x: f'{x.Tissue} cells from {x.Strain} mice '
    #                                   f'with a {x.Genotype} genotype.'
    #                         , axis=1)
    # sample_texts.index = df.Run
    # <<< EDIT HERE

    if kind == 'd':
        if enzyme is None:
            return None
        digest_file = DIGEST_FILES[enzyme]
        return {run: digest_file for run in runs}

    if sample_texts is None:
        return None
    return {run: sample_texts[run] for run in runs}


def _fill_metadata_column(species, kind, label, values):
    """Back up a metadata CSV and rewrite it with missing values filled.

    The current file is moved to the backups directory (replacing any
    previous backup) and then re-read line by line to write the new
    file. Rows whose first field is a key of `values` receive the value
    as their second field, unless that field is already non-empty.

    Args:
        species: str. Species suffix of the metadata file name.
        kind: str. File name stem, "digests" or "samples".
        label: str. Noun used in the "skipped" message.
        values: dict mapping Run to the string to store for that Run.
    """
    backup_dir = os.path.join(PATH, 'backups')
    os.makedirs(backup_dir, exist_ok=True)
    current_file = f'{PATH}/{kind}_{species}.csv'
    backup_file = f'{backup_dir}/{kind}_{species}.csv.bak'
    # os.replace overwrites an existing backup in a single step.
    os.replace(current_file, backup_file)

    with open(backup_file, 'r') as old, open(current_file, 'w') as new:
        for line in old:
            line = line.strip()
            if not line:
                new.write('\n')
                continue
            fields = line.split(',')
            run = fields[0]
            if run in values:
                if len(fields) > 1 and fields[1] != '':
                    print(f'{run} skipped due to existing {label}!')
                elif len(fields) > 1:
                    # Fill the empty second field in place rather than
                    # appending after it ("RUN," -> "RUN,value").
                    fields[1] = values[run]
                    line = ','.join(fields)
                else:
                    line = ','.join([line, values[run]])
            new.write(f'{line}\n')


def edit_digests(species):
    """Edit the digests file for the given species.

    Does nothing when get_runs() returns None for digests.
    """
    get_digest_files(species)
    digests = get_runs('digests')
    if digests is None:
        return
    _fill_metadata_column(species, 'digests', 'digest', digests)


def edit_samples(species):
    """Edit the samples file for the given species.

    Does nothing when get_runs() returns None for samples.
    """
    samples = get_runs('samples')
    if samples is None:
        return
    _fill_metadata_column(species, 'samples', 'sample', samples)


def main():
    """Create the backups directory and run both edits for one species."""
    backup_path = os.path.join(PATH, 'backups')
    os.makedirs(backup_path, exist_ok=True)

    # species = 'human'
    species = 'mouse'
    edit_digests(species)
    edit_samples(species)


if __name__ == '__main__':
    main()