{
"cells": [
{
"cell_type": "markdown",
"id": "3e4adad2-04fa-4503-ab29-38b3e8301fcd",
"metadata": {},
"source": [
"This notebook gets the TSS (transcription start site) and TES (transcription end site) for each gene.\n",
"It also gets the file of regulatory elements."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8ba98e15-686a-4c28-a4dd-3cc525fa758e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2021-07-20 23:15:46-- http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bigBedToBed\n",
"Resolving hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)... 128.114.119.163\n",
"Connecting to hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)|128.114.119.163|:80... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 9385520 (9.0M)\n",
"Saving to: ‘/sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/bigBedToBed’\n",
"\n",
"100%[======================================>] 9,385,520 6.63MB/s in 1.4s \n",
"\n",
"2021-07-20 23:15:47 (6.63 MB/s) - ‘/sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/bigBedToBed’ saved [9385520/9385520]\n",
"\n"
]
}
],
"source": [
"! wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bigBedToBed -P /sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "cd2e89a9-3a87-42a9-af4f-f20be6304fe2",
"metadata": {},
"outputs": [],
"source": [
"!chmod +x /sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/bigBedToBed"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ffd23f0a-a10c-4ec6-b438-085cc7b634d4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2021-07-20 23:16:47-- http://hgdownload.soe.ucsc.edu/gbdb/hg38/gencode/gencodeV36.bb\n",
"Resolving hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)... 128.114.119.163\n",
"Connecting to hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)|128.114.119.163|:80... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 23207803 (22M)\n",
"Saving to: ‘/sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/gencodeV36.bb’\n",
"\n",
"100%[======================================>] 23,207,803 9.77MB/s in 2.3s \n",
"\n",
"2021-07-20 23:16:50 (9.77 MB/s) - ‘/sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/gencodeV36.bb’ saved [23207803/23207803]\n",
"\n"
]
}
],
"source": [
"# This downloads the GENCODE V36 known genes\n",
"#https://genome.ucsc.edu/cgi-bin/hgTables?hgsid=1133902899_IYtEfxvteU99lmytOCjaacOaQcnL&clade=mammal&org=Human&db=hg38&hgta_group=genes&hgta_track=encodeCcreCombined&hgta_table=0&hgta_regionType=genome&position=chrX%3A15%2C560%2C138-15%2C602%2C945&hgta_outputType=primaryTable&hgta_outFileName=gencodev32knowngene\n",
"! wget http://hgdownload.soe.ucsc.edu/gbdb/hg38/gencode/gencodeV36.bb -P /sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "05e2783e-ad2a-4b6a-8c3b-7bdbbd0088e1",
"metadata": {},
"outputs": [],
"source": [
"! /sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/bigBedToBed /sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/gencodeV36.bb /sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/hsapiens_gencodeV36.bed\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b46462eb-37dc-4966-b0ea-e1148ca35377",
"metadata": {},
"outputs": [],
"source": [
"# ENST-to-ENSG mappings are downloaded from All GENCODE V36\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "84c0f538-0abf-46bc-8dd4-56606067a790",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2021-07-21 12:51:12-- http://hgdownload.soe.ucsc.edu/gbdb/hg38/encode3/ccre/encodeCcreCombined.bb\n",
"Resolving hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)... 128.114.119.163\n",
"Connecting to hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)|128.114.119.163|:80... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 49809776 (48M)\n",
"Saving to: ‘/sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/encodeCcreCombined.bb’\n",
"\n",
"100%[======================================>] 49,809,776 12.7MB/s in 4.2s \n",
"\n",
"2021-07-21 12:51:17 (11.4 MB/s) - ‘/sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/encodeCcreCombined.bb’ saved [49809776/49809776]\n",
"\n"
]
}
],
"source": [
"# Schema for ENCODE cCREs - ENCODE Candidate Cis-Regulatory Elements (cCREs) combined from all cell types\n",
"#https://genome.ucsc.edu/cgi-bin/hgTables?hgsid=1133902899_IYtEfxvteU99lmytOCjaacOaQcnL&clade=mammal&org=Human&db=hg38&hgta_group=regulation&hgta_track=encodeCcreCombined&hgta_table=0&hgta_regionType=genome&position=chrX%3A15%2C560%2C138-15%2C602%2C945&hgta_outputType=primaryTable&hgta_outFileName=gencodev32knowngene\n",
"! wget http://hgdownload.soe.ucsc.edu/gbdb/hg38/encode3/ccre/encodeCcreCombined.bb -P /sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "57b92b87-ceb5-424c-8834-01376f12433c",
"metadata": {},
"outputs": [],
"source": [
"! /sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/bigBedToBed /sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/encodeCcreCombined.bb /sonas-hs/gillis/hpc/data/lohia/hi_c_data_processing/genomes/hsapiens_encodeCcreCombined.bed\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "44f1f702-91e3-46e2-8702-8c744b9f623e",
"metadata": {},
"outputs": [],
"source": [
"# Getting the regulatory elements from Ensembl\n",
"\n",
"\n",
"
\n", " | #geneId | \n", "geneName | \n", "geneType | \n", "geneStatus | \n", "transcriptId | \n", "transcriptName | \n", "transcriptType | \n", "transcriptClass | \n", "proteinId | \n", "chrom | \n", "chromStart | \n", "chromEnd | \n", "name | \n", "strand | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "ENSG00000223972.5 | \n", "DDX11L1 | \n", "transcribed_unprocessed_pseudogene | \n", "NaN | \n", "ENST00000456328.2 | \n", "DDX11L1-202 | \n", "processed_transcript | \n", "pseudo | \n", "NaN | \n", "chr1 | \n", "11868 | \n", "14409 | \n", "ENST00000456328.2 | \n", "+ | \n", "
1 | \n", "ENSG00000223972.5 | \n", "DDX11L1 | \n", "transcribed_unprocessed_pseudogene | \n", "NaN | \n", "ENST00000450305.2 | \n", "DDX11L1-201 | \n", "transcribed_unprocessed_pseudogene | \n", "pseudo | \n", "NaN | \n", "chr1 | \n", "12009 | \n", "13670 | \n", "ENST00000450305.2 | \n", "+ | \n", "
2 | \n", "ENSG00000227232.5 | \n", "WASH7P | \n", "unprocessed_pseudogene | \n", "NaN | \n", "ENST00000488147.1 | \n", "WASH7P-201 | \n", "unprocessed_pseudogene | \n", "pseudo | \n", "NaN | \n", "chr1 | \n", "14403 | \n", "29570 | \n", "ENST00000488147.1 | \n", "- | \n", "
3 | \n", "ENSG00000278267.1 | \n", "MIR6859-1 | \n", "miRNA | \n", "NaN | \n", "ENST00000619216.1 | \n", "MIR6859-1-201 | \n", "miRNA | \n", "nonCoding | \n", "NaN | \n", "chr1 | \n", "17368 | \n", "17436 | \n", "ENST00000619216.1 | \n", "- | \n", "
4 | \n", "ENSG00000243485.5 | \n", "MIR1302-2HG | \n", "lncRNA | \n", "NaN | \n", "ENST00000473358.1 | \n", "MIR1302-2HG-202 | \n", "lncRNA | \n", "nonCoding | \n", "NaN | \n", "chr1 | \n", "29553 | \n", "31097 | \n", "ENST00000473358.1 | \n", "+ | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
232179 | \n", "ENSG00000182484.15 | \n", "WASH6P | \n", "protein_coding | \n", "NaN | \n", "ENST00000483286.6 | \n", "WASH6P-211 | \n", "retained_intron | \n", "problem | \n", "NaN | \n", "chrY | \n", "57210343 | \n", "57212074 | \n", "ENST00000483286.6 | \n", "+ | \n", "
232180 | \n", "ENSG00000182484.15 | \n", "WASH6P | \n", "protein_coding | \n", "NaN | \n", "ENST00000464205.6 | \n", "WASH6P-205 | \n", "processed_transcript | \n", "nonCoding | \n", "NaN | \n", "chrX | \n", "156024070 | \n", "156025554 | \n", "ENST00000464205.6 | \n", "+ | \n", "
232181 | \n", "ENSG00000182484.15 | \n", "WASH6P | \n", "protein_coding | \n", "NaN | \n", "ENST00000464205.6 | \n", "WASH6P-205 | \n", "processed_transcript | \n", "nonCoding | \n", "NaN | \n", "chrY | \n", "57210590 | \n", "57212074 | \n", "ENST00000464205.6 | \n", "+ | \n", "
232182 | \n", "ENSG00000227159.8 | \n", "DDX11L16 | \n", "unprocessed_pseudogene | \n", "NaN | \n", "ENST00000507418.6 | \n", "DDX11L16-201 | \n", "unprocessed_pseudogene | \n", "pseudo | \n", "NaN | \n", "chrX | \n", "156025663 | \n", "156027877 | \n", "ENST00000507418.6 | \n", "- | \n", "
232183 | \n", "ENSG00000227159.8 | \n", "DDX11L16 | \n", "unprocessed_pseudogene | \n", "NaN | \n", "ENST00000507418.6 | \n", "DDX11L16-201 | \n", "unprocessed_pseudogene | \n", "pseudo | \n", "NaN | \n", "chrY | \n", "57212183 | \n", "57214397 | \n", "ENST00000507418.6 | \n", "- | \n", "
232184 rows × 14 columns
\n", "\n", " | #geneId | \n", "chrom | \n", "txStart_outer | \n", "txEnd_outer | \n", "gene_order_tss | \n", "gene_order_tes | \n", "geneName | \n", "geneType | \n", "strand | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "ENSG00000210049 | \n", "chrM | \n", "576 | \n", "647 | \n", "1 | \n", "1 | \n", "MT-TF | \n", "Mt_tRNA | \n", "+ | \n", "
1 | \n", "ENSG00000211459 | \n", "chrM | \n", "647 | \n", "1601 | \n", "2 | \n", "2 | \n", "MT-RNR1 | \n", "Mt_rRNA | \n", "+ | \n", "
2 | \n", "ENSG00000210077 | \n", "chrM | \n", "1601 | \n", "1670 | \n", "3 | \n", "3 | \n", "MT-TV | \n", "Mt_tRNA | \n", "+ | \n", "
3 | \n", "ENSG00000210082 | \n", "chrM | \n", "1670 | \n", "3229 | \n", "4 | \n", "4 | \n", "MT-RNR2 | \n", "Mt_rRNA | \n", "+ | \n", "
4 | \n", "ENSG00000209082 | \n", "chrM | \n", "3229 | \n", "3304 | \n", "6 | \n", "5 | \n", "MT-TL1 | \n", "Mt_tRNA | \n", "+ | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
60714 | \n", "ENSG00000171163 | \n", "chr1 | \n", "248859144 | \n", "248850005 | \n", "60715 | \n", "60715 | \n", "ZNF692 | \n", "protein_coding | \n", "- | \n", "
60715 | \n", "ENSG00000227237 | \n", "chr1 | \n", "248859163 | \n", "248864796 | \n", "60716 | \n", "60716 | \n", "AL672291.1 | \n", "lncRNA | \n", "+ | \n", "
60716 | \n", "ENSG00000200495 | \n", "chr1 | \n", "248912795 | \n", "248912689 | \n", "60718 | \n", "60717 | \n", "RNU6-1205P | \n", "snRNA | \n", "- | \n", "
60717 | \n", "ENSG00000185220 | \n", "chr1 | \n", "248906195 | \n", "248919946 | \n", "60717 | \n", "60718 | \n", "PGBD2 | \n", "protein_coding | \n", "+ | \n", "
60718 | \n", "ENSG00000233084 | \n", "chr1 | \n", "248936580 | \n", "248937043 | \n", "60719 | \n", "60719 | \n", "RPL23AP25 | \n", "processed_pseudogene | \n", "+ | \n", "
60719 rows × 9 columns
\n", "\n", " | #geneId | \n", "chrom | \n", "txStart_outer | \n", "txEnd_outer | \n", "gene_order_tss | \n", "gene_order_tes | \n", "geneName | \n", "geneType | \n", "
---|---|---|---|---|---|---|---|---|
5729 | \n", "ENSG00000277248 | \n", "chr22 | \n", "10736283 | \n", "10736170 | \n", "5731 | \n", "5730 | \n", "U2 | \n", "snRNA | \n", "
5822 | \n", "ENSG00000283047 | \n", "chr22 | \n", "10961338 | \n", "10939387 | \n", "5826 | \n", "5823 | \n", "FRG1FP | \n", "unprocessed_pseudogene | \n", "
5866 | \n", "ENSG00000279973 | \n", "chr22 | \n", "11066417 | \n", "11068174 | \n", "5864 | \n", "5867 | \n", "CU104787.1 | \n", "lncRNA | \n", "
5891 | \n", "ENSG00000226444 | \n", "chr22 | \n", "11124336 | \n", "11125705 | \n", "5888 | \n", "5892 | \n", "ACTR3BP7 | \n", "processed_pseudogene | \n", "
5944 | \n", "ENSG00000276871 | \n", "chr22 | \n", "11249959 | \n", "11249808 | \n", "5944 | \n", "5945 | \n", "5_8S_rRNA | \n", "rRNA | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
26195 | \n", "ENSG00000254499 | \n", "chr22 | \n", "50743520 | \n", "50740592 | \n", "26202 | \n", "26196 | \n", "AC002056.2 | \n", "lncRNA | \n", "
26199 | \n", "ENSG00000100312 | \n", "chr22 | \n", "50738195 | \n", "50745339 | \n", "26195 | \n", "26200 | \n", "ACR | \n", "protein_coding | \n", "
26202 | \n", "ENSG00000213683 | \n", "chr22 | \n", "50755434 | \n", "50754674 | \n", "26205 | \n", "26203 | \n", "AC002056.1 | \n", "processed_pseudogene | \n", "
26209 | \n", "ENSG00000079974 | \n", "chr22 | \n", "50783662 | \n", "50767500 | \n", "26217 | \n", "26210 | \n", "RABL2B | \n", "protein_coding | \n", "
26228 | \n", "ENSG00000184319 | \n", "chr22 | \n", "50756947 | \n", "50801309 | \n", "26207 | \n", "26229 | \n", "RPL23AP82 | \n", "transcribed_unprocessed_pseudogene | \n", "
1386 rows × 8 columns
\n", "\n", " | chromosome_name | \n", "chromosome_start | \n", "chromosome_end | \n", "feature_type_name | \n", "
---|---|---|---|---|
0 | \n", "18 | \n", "35116801 | \n", "35120999 | \n", "Promoter Flanking Region | \n", "
1 | \n", "8 | \n", "37967115 | \n", "37967453 | \n", "TF binding site | \n", "
2 | \n", "6 | \n", "90249202 | \n", "90257999 | \n", "Promoter Flanking Region | \n", "
3 | \n", "3 | \n", "57689401 | \n", "57689600 | \n", "CTCF Binding Site | \n", "
4 | \n", "16 | \n", "55643201 | \n", "55643800 | \n", "CTCF Binding Site | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
622456 | \n", "11 | \n", "47148801 | \n", "47150199 | \n", "Promoter Flanking Region | \n", "
622457 | \n", "12 | \n", "12033001 | \n", "12034400 | \n", "CTCF Binding Site | \n", "
622458 | \n", "12 | \n", "26031001 | \n", "26031600 | \n", "Enhancer | \n", "
622459 | \n", "1 | \n", "54937001 | \n", "54938600 | \n", "Enhancer | \n", "
622460 | \n", "6 | \n", "7848085 | \n", "7848609 | \n", "Open chromatin | \n", "
622461 rows × 4 columns
\n", "