{ "cells": [ { "cell_type": "markdown", "id": "3e4adad2-04fa-4503-ab29-38b3e8301fcd", "metadata": {}, "source": [ "This notebook gets the TSS and TES for each gene.\n", "It also gets the file of regulatory elements." ] }, { "cell_type": "code", "execution_count": 3, "id": "8ba98e15-686a-4c28-a4dd-3cc525fa758e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2021-07-20 23:15:46-- http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bigBedToBed\n", "Resolving hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)... 128.114.119.163\n", "Connecting to hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)|128.114.119.163|:80... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 9385520 (9.0M)\n", "Saving to: ‘/sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/bigBedToBed’\n", "\n", "100%[======================================>] 9,385,520 6.63MB/s in 1.4s \n", "\n", "2021-07-20 23:15:47 (6.63 MB/s) - ‘/sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/bigBedToBed’ saved [9385520/9385520]\n", "\n" ] } ], "source": [ "! wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bigBedToBed -P /sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "cd2e89a9-3a87-42a9-af4f-f20be6304fe2", "metadata": {}, "outputs": [], "source": [ "!chmod +x /sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/bigBedToBed" ] }, { "cell_type": "code", "execution_count": 5, "id": "ffd23f0a-a10c-4ec6-b438-085cc7b634d4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2021-07-20 23:16:47-- http://hgdownload.soe.ucsc.edu/gbdb/hg38/gencode/gencodeV36.bb\n", "Resolving hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)... 128.114.119.163\n", "Connecting to hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)|128.114.119.163|:80... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 23207803 (22M)\n", "Saving to: ‘/sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/gencodeV36.bb’\n", "\n", "100%[======================================>] 23,207,803 9.77MB/s in 2.3s \n", "\n", "2021-07-20 23:16:50 (9.77 MB/s) - ‘/sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/gencodeV36.bb’ saved [23207803/23207803]\n", "\n" ] } ], "source": [ "# This downloads the GENCODE V36 known-gene track\n", "#https://genome.ucsc.edu/cgi-bin/hgTables?hgsid=1133902899_IYtEfxvteU99lmytOCjaacOaQcnL&clade=mammal&org=Human&db=hg38&hgta_group=genes&hgta_track=encodeCcreCombined&hgta_table=0&hgta_regionType=genome&position=chrX%3A15%2C560%2C138-15%2C602%2C945&hgta_outputType=primaryTable&hgta_outFileName=gencodev32knowngene\n", "! wget http://hgdownload.soe.ucsc.edu/gbdb/hg38/gencode/gencodeV36.bb -P /sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/" ] }, { "cell_type": "code", "execution_count": 12, "id": "05e2783e-ad2a-4b6a-8c3b-7bdbbd0088e1", "metadata": {}, "outputs": [], "source": [ "! /sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/bigBedToBed /sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/gencodeV36.bb /sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/hsapiens_gencodeV36.bed\n" ] }, { "cell_type": "code", "execution_count": null, "id": "b46462eb-37dc-4966-b0ea-e1148ca35377", "metadata": {}, "outputs": [], "source": [ "# ENST-to-ENSG mappings are downloaded from All GENCODE V36\n" ] }, { "cell_type": "code", "execution_count": 8, "id": "84c0f538-0abf-46bc-8dd4-56606067a790", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2021-07-21 12:51:12-- http://hgdownload.soe.ucsc.edu/gbdb/hg38/encode3/ccre/encodeCcreCombined.bb\n", "Resolving hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)... 128.114.119.163\n", "Connecting to hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)|128.114.119.163|:80... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 49809776 (48M)\n", "Saving to: ‘/sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/encodeCcreCombined.bb’\n", "\n", "100%[======================================>] 49,809,776 12.7MB/s in 4.2s \n", "\n", "2021-07-21 12:51:17 (11.4 MB/s) - ‘/sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/encodeCcreCombined.bb’ saved [49809776/49809776]\n", "\n" ] } ], "source": [ "# Schema for ENCODE cCREs - ENCODE Candidate Cis-Regulatory Elements (cCREs) combined from all cell types\n", "#https://genome.ucsc.edu/cgi-bin/hgTables?hgsid=1133902899_IYtEfxvteU99lmytOCjaacOaQcnL&clade=mammal&org=Human&db=hg38&hgta_group=regulation&hgta_track=encodeCcreCombined&hgta_table=0&hgta_regionType=genome&position=chrX%3A15%2C560%2C138-15%2C602%2C945&hgta_outputType=primaryTable&hgta_outFileName=gencodev32knowngene\n", "! wget http://hgdownload.soe.ucsc.edu/gbdb/hg38/encode3/ccre/encodeCcreCombined.bb -P /sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/" ] }, { "cell_type": "code", "execution_count": 11, "id": "57b92b87-ceb5-424c-8834-01376f12433c", "metadata": {}, "outputs": [], "source": [ "! 
/sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/bigBedToBed /sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/encodeCcreCombined.bb /sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/hsapiens_encodeCcreCombined.bed\n" ] }, { "cell_type": "code", "execution_count": null, "id": "44f1f702-91e3-46e2-8702-8c744b9f623e", "metadata": {}, "outputs": [], "source": [ "# getting the regulatory elements from Ensembl\n", "\n", "\n", "\n", "\t\t\t\n", "\t\n", "\t\t\n", "\t\t\n", "\t\t\n", "\t\t\n", "\t\n", "" ] }, { "cell_type": "code", "execution_count": null, "id": "85ce800c-1b58-42c8-924e-fbf048aed25f", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 10, "id": "e6eaf1ea-15a8-4637-bbc8-76df2b19619b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2021-07-21 12:57:59-- http://may2021.archive.ensembl.org/biomart/martservice?query=%3C?xml%20version=%221.0%22%20encoding=%22UTF-8%22?%3E%3C!DOCTYPE%20Query%3E%3CQuery%20%20virtualSchemaName%20=%20%22default%22%20formatter%20=%20%22TSV%22%20header%20=%20%220%22%20uniqueRows%20=%20%220%22%20count%20=%20%22%22%20datasetConfigVersion%20=%20%220.6%22%20%3E%3CDataset%20name%20=%20%22hsapiens_regulatory_feature%22%20interface%20=%20%22default%22%20%3E%3CAttribute%20name%20=%20%22chromosome_name%22%20/%3E%3CAttribute%20name%20=%20%22chromosome_start%22%20/%3E%3CAttribute%20name%20=%20%22chromosome_end%22%20/%3E%3CAttribute%20name%20=%20%22feature_type_name%22%20/%3E%3C/Dataset%3E%3C/Query%3E\n", "Resolving may2021.archive.ensembl.org (may2021.archive.ensembl.org)... 193.62.193.83\n", "Connecting to may2021.archive.ensembl.org (may2021.archive.ensembl.org)|193.62.193.83|:80... connected.\n", "HTTP request sent, awaiting response... 
301 Moved Permanently\n", "Location: http://www.ensembl.org/biomart/martservice?query=%3C?xml%20version=%221.0%22%20encoding=%22UTF-8%22?%3E%3C!DOCTYPE%20Query%3E%3CQuery%20%20virtualSchemaName%20=%20%22default%22%20formatter%20=%20%22TSV%22%20header%20=%20%220%22%20uniqueRows%20=%20%220%22%20count%20=%20%22%22%20datasetConfigVersion%20=%20%220.6%22%20%3E%3CDataset%20name%20=%20%22hsapiens_regulatory_feature%22%20interface%20=%20%22default%22%20%3E%3CAttribute%20name%20=%20%22chromosome_name%22%20/%3E%3CAttribute%20name%20=%20%22chromosome_start%22%20/%3E%3CAttribute%20name%20=%20%22chromosome_end%22%20/%3E%3CAttribute%20name%20=%20%22feature_type_name%22%20/%3E%3C/Dataset%3E%3C/Query%3E [following]\n", "--2021-07-21 12:57:59-- http://www.ensembl.org/biomart/martservice?query=%3C?xml%20version=%221.0%22%20encoding=%22UTF-8%22?%3E%3C!DOCTYPE%20Query%3E%3CQuery%20%20virtualSchemaName%20=%20%22default%22%20formatter%20=%20%22TSV%22%20header%20=%20%220%22%20uniqueRows%20=%20%220%22%20count%20=%20%22%22%20datasetConfigVersion%20=%20%220.6%22%20%3E%3CDataset%20name%20=%20%22hsapiens_regulatory_feature%22%20interface%20=%20%22default%22%20%3E%3CAttribute%20name%20=%20%22chromosome_name%22%20/%3E%3CAttribute%20name%20=%20%22chromosome_start%22%20/%3E%3CAttribute%20name%20=%20%22chromosome_end%22%20/%3E%3CAttribute%20name%20=%20%22feature_type_name%22%20/%3E%3C/Dataset%3E%3C/Query%3E\n", "Resolving www.ensembl.org (www.ensembl.org)... 193.62.193.83\n", "Reusing existing connection to may2021.archive.ensembl.org:80.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: unspecified [text/plain]\n", "Saving to: ‘/sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/hsapiens_regulatory_feature.txt’\n", "\n", " [ <=> ] 23,326,355 850KB/s in 23s \n", "\n", "2021-07-21 12:58:23 (971 KB/s) - ‘/sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/hsapiens_regulatory_feature.txt’ saved [23326355]\n", "\n" ] } ], "source": [ "# BioMart query (hsapiens_regulatory_feature: chromosome_name, chromosome_start, chromosome_end, feature_type_name),\n", "# restored in percent-encoded form from the URL recorded in this cell's wget log.\n", "!wget -O /sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/hsapiens_regulatory_feature.txt 'http://may2021.archive.ensembl.org/biomart/martservice?query=%3C?xml%20version=%221.0%22%20encoding=%22UTF-8%22?%3E%3C!DOCTYPE%20Query%3E%3CQuery%20%20virtualSchemaName%20=%20%22default%22%20formatter%20=%20%22TSV%22%20header%20=%20%220%22%20uniqueRows%20=%20%220%22%20count%20=%20%22%22%20datasetConfigVersion%20=%20%220.6%22%20%3E%3CDataset%20name%20=%20%22hsapiens_regulatory_feature%22%20interface%20=%20%22default%22%20%3E%3CAttribute%20name%20=%20%22chromosome_name%22%20/%3E%3CAttribute%20name%20=%20%22chromosome_start%22%20/%3E%3CAttribute%20name%20=%20%22chromosome_end%22%20/%3E%3CAttribute%20name%20=%20%22feature_type_name%22%20/%3E%3C/Dataset%3E%3C/Query%3E'\n" ] }, { "cell_type": "code", "execution_count": null, "id": "bb709bdf-925b-49a8-887b-f20cebf14c49", "metadata": {}, "outputs": [], "source": [ "\n", "\n", "\n", "\t\t\t\n", "\t\n", "\t\t\n", "\t\t\n", "\t\n", "" ] }, { "cell_type": "code", "execution_count": 3, "id": "3ad7e4f4-feff-4e29-8d81-bcadfb9335e1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2022-02-23 22:27:24-- http://aug2020.archive.ensembl.org/biomart/martservice?query=%3C?xml%20version=%221.0%22%20encoding=%22UTF-8%22?%3E%3C!DOCTYPE%20Query%3E%3CQuery%20%20virtualSchemaName%20=%20%22default%22%20formatter%20=%20%22TSV%22%20header%20=%20%220%22%20uniqueRows%20=%20%220%22%20count%20=%20%22%22%20datasetConfigVersion%20=%20%220.6%22%20%3E%3CDataset%20name%20=%20%22mmusculus_gene_ensembl%22%20interface%20=%20%22default%22%20%3E%3CAttribute%20name%20=%20%22ensembl_gene_id%22%20/%3E%3CAttribute%20name%20=%20%22external_gene_name%22%20/%3E%3C/Dataset%3E%3C/Query%3E\n", "Resolving aug2020.archive.ensembl.org (aug2020.archive.ensembl.org)... 193.62.193.83\n", "Connecting to aug2020.archive.ensembl.org (aug2020.archive.ensembl.org)|193.62.193.83|:80... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: unspecified [text/plain]\n", "Saving to: ‘/grid/gillis/data/lohia/hi_c_data_processing/genomes_jlee/mouse_geneid_symbol.txt’\n", "\n", "/grid/gillis/data/l [ <=> ] 1.44M 677KB/s in 2.2s \n", "\n", "2022-02-23 22:27:26 (677 KB/s) - ‘/grid/gillis/data/lohia/hi_c_data_processing/genomes_jlee/mouse_geneid_symbol.txt’ saved [1510569]\n", "\n" ] } ], "source": [ "# BioMart query (mmusculus_gene_ensembl: ensembl_gene_id, external_gene_name),\n", "# restored in percent-encoded form from the URL recorded in this cell's wget log.\n", "!wget -O /grid/gillis/data/lohia/hi_c_data_processing/genomes_jlee/mouse_geneid_symbol.txt 'http://aug2020.archive.ensembl.org/biomart/martservice?query=%3C?xml%20version=%221.0%22%20encoding=%22UTF-8%22?%3E%3C!DOCTYPE%20Query%3E%3CQuery%20%20virtualSchemaName%20=%20%22default%22%20formatter%20=%20%22TSV%22%20header%20=%20%220%22%20uniqueRows%20=%20%220%22%20count%20=%20%22%22%20datasetConfigVersion%20=%20%220.6%22%20%3E%3CDataset%20name%20=%20%22mmusculus_gene_ensembl%22%20interface%20=%20%22default%22%20%3E%3CAttribute%20name%20=%20%22ensembl_gene_id%22%20/%3E%3CAttribute%20name%20=%20%22external_gene_name%22%20/%3E%3C/Dataset%3E%3C/Query%3E'\n", "\n" ] }, { "cell_type": "code", "execution_count": 13, "id": "0a157a0c-fbd7-429c-b464-87c8cb962dfa", "metadata": {}, "outputs": [], "source": [ "files_col_names = {}\n", "files_col_names['hsapiens_regulatory_feature.txt'] = ['chromosome_name', \"chromosome_start\", \"chromosome_end\", 'feature_type_name']\n", "files_col_names['hsapiens_encodeCcreCombined.bed'] = ['chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'reserved', 'ccre', 'encodeLabel', 'zScore', 'ucscLabel', 'accessionLabel', 'description']\n", "files_col_names['hsapiens_gencodeV36.bed'] = ['chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'reserved', 'blockCount', 'blockSizes', 'chromStarts', 'name2', 'cdsStartStat', 'cdsEndStat', 'exonFrames', 'type', 'geneName', 'geneName2', 'geneType', 'transcriptClass', 'source', 'transcriptType', 'tag', 'level', 'tier']\n", "\n" ] }, { "cell_type": "code", "execution_count": 14, "id": "42d77b32-58b2-42b2-9321-57e85457ecba", "metadata": {}, "outputs": [], "source": [ "genome_dir = '/sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/'" ] }, { "cell_type": "code", "execution_count": 16, "id": "37400d9b-bfe3-48ed-a337-48bf8a6fb17b", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 70, "id": "81aa8416-209e-4a3c-8c81-ff346156c8ad", "metadata": {}, "outputs": [], "source": [ "gene_id_enst_df = 
pd.read_csv(f'{genome_dir}/hsapiens_wgEncodeGencodeAttrsV36_allgencode', sep='\\t')" ] }, { "cell_type": "code", "execution_count": null, "id": "0966b9c2-3fc2-4c4b-86c4-2b708627429f", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 71, "id": "62d69f11-abe0-42b9-b8e0-c8619470ec93", "metadata": {}, "outputs": [], "source": [ "gene_id_enst_df = gene_id_enst_df[['#geneId' ,'geneName' ,'geneType' ,'geneStatus' ,'transcriptId' ,'transcriptName' ,'transcriptType', 'transcriptClass', 'proteinId']]" ] }, { "cell_type": "code", "execution_count": 72, "id": "687e945a-3008-4191-a38b-d0533665d826", "metadata": {}, "outputs": [], "source": [ "#get tss and tes for each gene\n", "f_name = 'hsapiens_gencodeV36.bed'\n", "df = pd.read_csv(f'{genome_dir}/{f_name}', sep='\\t', names=files_col_names[f_name])\n", "df = df[['chrom', 'chromStart', 'chromEnd', 'name', 'strand']]\n" ] }, { "cell_type": "code", "execution_count": 73, "id": "5529cd9c-7647-4993-ac1e-ed242f32cdb1", "metadata": {}, "outputs": [], "source": [ "#merging gene names with tss\n", "gene_id_enst_tss_df = gene_id_enst_df.merge(df, left_on='transcriptId', right_on='name')" ] }, { "cell_type": "code", "execution_count": null, "id": "f7bbb835-6404-4e64-943a-76f6c56a0834", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 117, "id": "627a4d5b-47d0-492b-9522-4eac7f9a8359", "metadata": {}, "outputs": [], "source": [ "#getting only the list of genes\n", "gene_type_df = gene_id_enst_tss_df[['#geneId', 'geneName', 'geneType', 'chrom', 'strand']].drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 116, "id": "7f2cddf4-0a61-494c-87ff-3f9d4c375ba8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#geneIdgeneNamegeneTypegeneStatustranscriptIdtranscriptNametranscriptTypetranscriptClassproteinIdchromchromStartchromEndnamestrand
0ENSG00000223972.5DDX11L1transcribed_unprocessed_pseudogeneNaNENST00000456328.2DDX11L1-202processed_transcriptpseudoNaNchr11186814409ENST00000456328.2+
1ENSG00000223972.5DDX11L1transcribed_unprocessed_pseudogeneNaNENST00000450305.2DDX11L1-201transcribed_unprocessed_pseudogenepseudoNaNchr11200913670ENST00000450305.2+
2ENSG00000227232.5WASH7Punprocessed_pseudogeneNaNENST00000488147.1WASH7P-201unprocessed_pseudogenepseudoNaNchr11440329570ENST00000488147.1-
3ENSG00000278267.1MIR6859-1miRNANaNENST00000619216.1MIR6859-1-201miRNAnonCodingNaNchr11736817436ENST00000619216.1-
4ENSG00000243485.5MIR1302-2HGlncRNANaNENST00000473358.1MIR1302-2HG-202lncRNAnonCodingNaNchr12955331097ENST00000473358.1+
.............................................
232179ENSG00000182484.15WASH6Pprotein_codingNaNENST00000483286.6WASH6P-211retained_intronproblemNaNchrY5721034357212074ENST00000483286.6+
232180ENSG00000182484.15WASH6Pprotein_codingNaNENST00000464205.6WASH6P-205processed_transcriptnonCodingNaNchrX156024070156025554ENST00000464205.6+
232181ENSG00000182484.15WASH6Pprotein_codingNaNENST00000464205.6WASH6P-205processed_transcriptnonCodingNaNchrY5721059057212074ENST00000464205.6+
232182ENSG00000227159.8DDX11L16unprocessed_pseudogeneNaNENST00000507418.6DDX11L16-201unprocessed_pseudogenepseudoNaNchrX156025663156027877ENST00000507418.6-
232183ENSG00000227159.8DDX11L16unprocessed_pseudogeneNaNENST00000507418.6DDX11L16-201unprocessed_pseudogenepseudoNaNchrY5721218357214397ENST00000507418.6-
\n", "

232184 rows × 14 columns

\n", "
" ], "text/plain": [ " #geneId geneName geneType \\\n", "0 ENSG00000223972.5 DDX11L1 transcribed_unprocessed_pseudogene \n", "1 ENSG00000223972.5 DDX11L1 transcribed_unprocessed_pseudogene \n", "2 ENSG00000227232.5 WASH7P unprocessed_pseudogene \n", "3 ENSG00000278267.1 MIR6859-1 miRNA \n", "4 ENSG00000243485.5 MIR1302-2HG lncRNA \n", "... ... ... ... \n", "232179 ENSG00000182484.15 WASH6P protein_coding \n", "232180 ENSG00000182484.15 WASH6P protein_coding \n", "232181 ENSG00000182484.15 WASH6P protein_coding \n", "232182 ENSG00000227159.8 DDX11L16 unprocessed_pseudogene \n", "232183 ENSG00000227159.8 DDX11L16 unprocessed_pseudogene \n", "\n", " geneStatus transcriptId transcriptName \\\n", "0 NaN ENST00000456328.2 DDX11L1-202 \n", "1 NaN ENST00000450305.2 DDX11L1-201 \n", "2 NaN ENST00000488147.1 WASH7P-201 \n", "3 NaN ENST00000619216.1 MIR6859-1-201 \n", "4 NaN ENST00000473358.1 MIR1302-2HG-202 \n", "... ... ... ... \n", "232179 NaN ENST00000483286.6 WASH6P-211 \n", "232180 NaN ENST00000464205.6 WASH6P-205 \n", "232181 NaN ENST00000464205.6 WASH6P-205 \n", "232182 NaN ENST00000507418.6 DDX11L16-201 \n", "232183 NaN ENST00000507418.6 DDX11L16-201 \n", "\n", " transcriptType transcriptClass proteinId chrom \\\n", "0 processed_transcript pseudo NaN chr1 \n", "1 transcribed_unprocessed_pseudogene pseudo NaN chr1 \n", "2 unprocessed_pseudogene pseudo NaN chr1 \n", "3 miRNA nonCoding NaN chr1 \n", "4 lncRNA nonCoding NaN chr1 \n", "... ... ... ... ... \n", "232179 retained_intron problem NaN chrY \n", "232180 processed_transcript nonCoding NaN chrX \n", "232181 processed_transcript nonCoding NaN chrY \n", "232182 unprocessed_pseudogene pseudo NaN chrX \n", "232183 unprocessed_pseudogene pseudo NaN chrY \n", "\n", " chromStart chromEnd name strand \n", "0 11868 14409 ENST00000456328.2 + \n", "1 12009 13670 ENST00000450305.2 + \n", "2 14403 29570 ENST00000488147.1 - \n", "3 17368 17436 ENST00000619216.1 - \n", "4 29553 31097 ENST00000473358.1 + \n", "... ... ... ... ... 
\n", "232179 57210343 57212074 ENST00000483286.6 + \n", "232180 156024070 156025554 ENST00000464205.6 + \n", "232181 57210590 57212074 ENST00000464205.6 + \n", "232182 156025663 156027877 ENST00000507418.6 - \n", "232183 57212183 57214397 ENST00000507418.6 - \n", "\n", "[232184 rows x 14 columns]" ] }, "execution_count": 116, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gene_id_enst_tss_df" ] }, { "cell_type": "code", "execution_count": 118, "id": "fc572b1b-9bf8-4038-9e84-05b7020d5506", "metadata": {}, "outputs": [], "source": [ "# Outermost TSS and TES for each (gene, chromosome) pair.\n", "# '+' strand: TSS = smallest chromStart, TES = largest chromEnd;\n", "# any other strand value: TSS = largest chromEnd, TES = smallest chromStart.\n", "# min/max/first replace the per-group list(set(...)) lambdas: the coordinates are the same, but the\n", "# strand pick no longer depends on set iteration order and no Python lambda runs per group.\n", "iso_longest = gene_id_enst_tss_df.groupby(['#geneId', 'chrom']).agg(\n", "    chromStart=('chromStart', 'min'),\n", "    chromEnd=('chromEnd', 'max'),\n", "    strand=('strand', 'first'),\n", ")\n", "on_plus_strand = iso_longest['strand'] == '+'\n", "iso_longest['txStart_outer'] = iso_longest['chromStart'].where(on_plus_strand, iso_longest['chromEnd'])\n", "iso_longest['txEnd_outer'] = iso_longest['chromEnd'].where(on_plus_strand, iso_longest['chromStart'])\n", "\n", "# Ranks run over all rows at once, so chromosomes are interleaved by coordinate (unchanged behaviour).\n", "# NOTE(review): confirm whether a per-chromosome gene order was intended.\n", "# mergesort is stable, so rows with equal coordinates keep a reproducible relative order.\n", "iso_longest = iso_longest.sort_values(by='txStart_outer', kind='mergesort')\n", "iso_longest['gene_order_tss'] = list(range(1, len(iso_longest) + 1))\n", "iso_longest = iso_longest.sort_values(by='txEnd_outer', kind='mergesort')\n", "iso_longest['gene_order_tes'] = list(range(1, len(iso_longest) + 1))\n", "\n", "gene_tss_outer = iso_longest.reset_index()[['#geneId', 'chrom', 'txStart_outer', 'txEnd_outer', 'gene_order_tss', 'gene_order_tes']].merge(gene_type_df, on=['#geneId', 'chrom'])\n" ] }, { "cell_type": "code", "execution_count": 119, "id": "1bfd8f7c-e634-4636-98c6-b18d6fbbea39", "metadata": {}, "outputs": [], "source": [ "gene_tss_outer['#geneId'] = [x.split('.')[0] for x in gene_tss_outer['#geneId']]" ] }, { "cell_type": "code", "execution_count": 121, "id": "8e4b5153-f0e8-4a2e-893f-f64022f85525", "metadata": 
{}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#geneIdchromtxStart_outertxEnd_outergene_order_tssgene_order_tesgeneNamegeneTypestrand
0ENSG00000210049chrM57664711MT-TFMt_tRNA+
1ENSG00000211459chrM647160122MT-RNR1Mt_rRNA+
2ENSG00000210077chrM1601167033MT-TVMt_tRNA+
3ENSG00000210082chrM1670322944MT-RNR2Mt_rRNA+
4ENSG00000209082chrM3229330465MT-TL1Mt_tRNA+
..............................
60714ENSG00000171163chr12488591442488500056071560715ZNF692protein_coding-
60715ENSG00000227237chr12488591632488647966071660716AL672291.1lncRNA+
60716ENSG00000200495chr12489127952489126896071860717RNU6-1205PsnRNA-
60717ENSG00000185220chr12489061952489199466071760718PGBD2protein_coding+
60718ENSG00000233084chr12489365802489370436071960719RPL23AP25processed_pseudogene+
\n", "

60719 rows × 9 columns

\n", "
" ], "text/plain": [ " #geneId chrom txStart_outer txEnd_outer gene_order_tss \\\n", "0 ENSG00000210049 chrM 576 647 1 \n", "1 ENSG00000211459 chrM 647 1601 2 \n", "2 ENSG00000210077 chrM 1601 1670 3 \n", "3 ENSG00000210082 chrM 1670 3229 4 \n", "4 ENSG00000209082 chrM 3229 3304 6 \n", "... ... ... ... ... ... \n", "60714 ENSG00000171163 chr1 248859144 248850005 60715 \n", "60715 ENSG00000227237 chr1 248859163 248864796 60716 \n", "60716 ENSG00000200495 chr1 248912795 248912689 60718 \n", "60717 ENSG00000185220 chr1 248906195 248919946 60717 \n", "60718 ENSG00000233084 chr1 248936580 248937043 60719 \n", "\n", " gene_order_tes geneName geneType strand \n", "0 1 MT-TF Mt_tRNA + \n", "1 2 MT-RNR1 Mt_rRNA + \n", "2 3 MT-TV Mt_tRNA + \n", "3 4 MT-RNR2 Mt_rRNA + \n", "4 5 MT-TL1 Mt_tRNA + \n", "... ... ... ... ... \n", "60714 60715 ZNF692 protein_coding - \n", "60715 60716 AL672291.1 lncRNA + \n", "60716 60717 RNU6-1205P snRNA - \n", "60717 60718 PGBD2 protein_coding + \n", "60718 60719 RPL23AP25 processed_pseudogene + \n", "\n", "[60719 rows x 9 columns]" ] }, "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gene_tss_outer" ] }, { "cell_type": "code", "execution_count": 114, "id": "53a88f61-5ba0-4dd4-ae13-04e2bcf77fa5", "metadata": {}, "outputs": [], "source": [ "gene_tss_outer.to_csv(f'{genome_dir}/hsapiens_gene_tss_mapping.csv', index=False, sep='\\t')" ] }, { "cell_type": "code", "execution_count": 112, "id": "d8082607-8191-49c0-9f2d-86c6f8755059", "metadata": {}, "outputs": [], "source": [ "gr = gene_tss_outer.groupby('chrom')" ] }, { "cell_type": "code", "execution_count": 113, "id": "99f3e97c-c001-4061-be15-9463ae321b14", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#geneIdchromtxStart_outertxEnd_outergene_order_tssgene_order_tesgeneNamegeneType
5729ENSG00000277248chr22107362831073617057315730U2snRNA
5822ENSG00000283047chr22109613381093938758265823FRG1FPunprocessed_pseudogene
5866ENSG00000279973chr22110664171106817458645867CU104787.1lncRNA
5891ENSG00000226444chr22111243361112570558885892ACTR3BP7processed_pseudogene
5944ENSG00000276871chr221124995911249808594459455_8S_rRNArRNA
...........................
26195ENSG00000254499chr2250743520507405922620226196AC002056.2lncRNA
26199ENSG00000100312chr2250738195507453392619526200ACRprotein_coding
26202ENSG00000213683chr2250755434507546742620526203AC002056.1processed_pseudogene
26209ENSG00000079974chr2250783662507675002621726210RABL2Bprotein_coding
26228ENSG00000184319chr2250756947508013092620726229RPL23AP82transcribed_unprocessed_pseudogene
\n", "

1386 rows × 8 columns

\n", "
" ], "text/plain": [ " #geneId chrom txStart_outer txEnd_outer gene_order_tss \\\n", "5729 ENSG00000277248 chr22 10736283 10736170 5731 \n", "5822 ENSG00000283047 chr22 10961338 10939387 5826 \n", "5866 ENSG00000279973 chr22 11066417 11068174 5864 \n", "5891 ENSG00000226444 chr22 11124336 11125705 5888 \n", "5944 ENSG00000276871 chr22 11249959 11249808 5944 \n", "... ... ... ... ... ... \n", "26195 ENSG00000254499 chr22 50743520 50740592 26202 \n", "26199 ENSG00000100312 chr22 50738195 50745339 26195 \n", "26202 ENSG00000213683 chr22 50755434 50754674 26205 \n", "26209 ENSG00000079974 chr22 50783662 50767500 26217 \n", "26228 ENSG00000184319 chr22 50756947 50801309 26207 \n", "\n", " gene_order_tes geneName geneType \n", "5729 5730 U2 snRNA \n", "5822 5823 FRG1FP unprocessed_pseudogene \n", "5866 5867 CU104787.1 lncRNA \n", "5891 5892 ACTR3BP7 processed_pseudogene \n", "5944 5945 5_8S_rRNA rRNA \n", "... ... ... ... \n", "26195 26196 AC002056.2 lncRNA \n", "26199 26200 ACR protein_coding \n", "26202 26203 AC002056.1 processed_pseudogene \n", "26209 26210 RABL2B protein_coding \n", "26228 26229 RPL23AP82 transcribed_unprocessed_pseudogene \n", "\n", "[1386 rows x 8 columns]" ] }, "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gr.get_group('chr22')" ] }, { "cell_type": "code", "execution_count": 52, "id": "578fba54-3c94-46d2-939b-caa56e90b743", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "geneType\n", "IG_C_gene 24\n", "IG_C_pseudogene 9\n", "IG_D_gene 37\n", "IG_J_gene 18\n", "IG_J_pseudogene 3\n", "IG_V_gene 154\n", "IG_V_pseudogene 187\n", "IG_pseudogene 1\n", "Mt_rRNA 2\n", "Mt_tRNA 22\n", "TEC 1083\n", "TR_C_gene 6\n", "TR_D_gene 4\n", "TR_J_gene 79\n", "TR_J_pseudogene 4\n", "TR_V_gene 107\n", "TR_V_pseudogene 33\n", "lncRNA 47651\n", "miRNA 1881\n", "misc_RNA 2221\n", "polymorphic_pseudogene 119\n", "processed_pseudogene 10171\n", "protein_coding 157259\n", "pseudogene 22\n", "rRNA 53\n", "rRNA_pseudogene 
497\n", "ribozyme 8\n", "sRNA 5\n", "scRNA 1\n", "scaRNA 49\n", "snRNA 1910\n", "snoRNA 943\n", "transcribed_processed_pseudogene 1138\n", "transcribed_unitary_pseudogene 712\n", "transcribed_unprocessed_pseudogene 3043\n", "translated_processed_pseudogene 3\n", "translated_unprocessed_pseudogene 1\n", "unitary_pseudogene 103\n", "unprocessed_pseudogene 2620\n", "vault_RNA 1\n", "Name: #geneId, dtype: int64" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gene_id_enst_tss_df.groupby(['geneType'])['#geneId'].count()" ] }, { "cell_type": "code", "execution_count": null, "id": "297e4394-2270-4b55-8318-708d7b706756", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 122, "id": "d8e32014-5d7c-4b9f-8b1b-3d78dba8f949", "metadata": {}, "outputs": [], "source": [ "f_name = 'hsapiens_regulatory_feature.txt'\n", "df = pd.read_csv(f'{genome_dir}/{f_name}', sep='\\t', names=files_col_names[f_name])\n", "#df = df[['chrom', 'chromStart', 'chromEnd', 'name', 'strand']]" ] }, { "cell_type": "code", "execution_count": 123, "id": "82f0426b-1946-42ff-9db8-b43e5c3d90e8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
chromosome_namechromosome_startchromosome_endfeature_type_name
0183511680135120999Promoter Flanking Region
183796711537967453TF binding site
269024920290257999Promoter Flanking Region
335768940157689600CTCF Binding Site
4165564320155643800CTCF Binding Site
...............
622456114714880147150199Promoter Flanking Region
622457121203300112034400CTCF Binding Site
622458122603100126031600Enhancer
62245915493700154938600Enhancer
622460678480857848609Open chromatin
\n", "

622461 rows × 4 columns

\n", "
" ], "text/plain": [ " chromosome_name chromosome_start chromosome_end \\\n", "0 18 35116801 35120999 \n", "1 8 37967115 37967453 \n", "2 6 90249202 90257999 \n", "3 3 57689401 57689600 \n", "4 16 55643201 55643800 \n", "... ... ... ... \n", "622456 11 47148801 47150199 \n", "622457 12 12033001 12034400 \n", "622458 12 26031001 26031600 \n", "622459 1 54937001 54938600 \n", "622460 6 7848085 7848609 \n", "\n", " feature_type_name \n", "0 Promoter Flanking Region \n", "1 TF binding site \n", "2 Promoter Flanking Region \n", "3 CTCF Binding Site \n", "4 CTCF Binding Site \n", "... ... \n", "622456 Promoter Flanking Region \n", "622457 CTCF Binding Site \n", "622458 Enhancer \n", "622459 Enhancer \n", "622460 Open chromatin \n", "\n", "[622461 rows x 4 columns]" ] }, "execution_count": 123, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "id": "91cfb873-8cd0-4643-8298-6bae7a5ceb04", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 }