# Answers to exercises: 2_datatypes.ipynb

In [None]:
# Import the usual stuff first
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
%matplotlib inline

# We'll need this too
import logomaker
import os.path 
from scipy.signal import convolve

This course will include a variety of exercises to increase your Python skills. Note that the knowledge needed to complete each exercise will NOT necessarily have been presented or discussed. If you find yourself at sea, **the first thing you should do is Google your question.** 

Here is the DNA sequence of the multiple cloning site (MCS) on the plasmid [pcDNA5](https://www.addgene.org/vector-database/2132/), a popular vector for mammalian gene expression.

In [None]:
# A long string can be spread over several lines: adjacent string literals
# inside parentheses are joined into a single string
mcs_seq = (
    'GAGACCCAAGCTGGCTAGCGTTTAAACTTAAGCTTGGTACCGAGCTCGGATCCACTA'
    'GTCCAGTGTGGTGGAATTCTGCAGATATCCAGCACAGTGGCGGCCGCTCGAGTCTAG'
    'AGGGCCCGTTTAAACCCGCTGATCAGCCT'
)
print(mcs_seq)

**E2.1**: Does this MCS contain a restriction site for NheI (GCTAGC)? How about for MscI (TGGCCA)? 

In [None]:
# Answer: the `in` operator tests whether one string occurs inside another

site_NheI, site_MscI = 'GCTAGC', 'TGGCCA'

print(f'NheI:  {site_NheI in mcs_seq}')
print(f'MscI:  {site_MscI in mcs_seq}')

**E2.2**: Using the string method `.find()`, find the location(s) of the above restriction sites within the MCS.

In [None]:
def find_all_sites(seq, site):
    """Return the 0-based start index of every occurrence of `site` in `seq`.

    Overlapping occurrences are included. The list is empty when `site`
    does not occur in `seq`, or when `site` is the empty string.
    """
    starts = []
    if not site:
        return starts
    start = seq.find(site)
    # str.find() returns -1 once there is no further match
    while start != -1:
        starts.append(start)
        # resume one character later so overlapping matches are not skipped
        start = seq.find(site, start + 1)
    return starts


# find the location(s) of both restriction sites
for enzyme, site in (('NheI', site_NheI), ('MscI', site_MscI)):
    starts = find_all_sites(mcs_seq, site)
    if not starts:
        # without this branch a miss (-1) would be sliced as a real index
        print(f'{enzyme}: site {site} not found in the MCS')
        continue
    for start in starts:
        stop = start + len(site)
        print(f'{enzyme}: site starts at position {start}')
        # check: the slice at the reported position must equal the site
        print('found site: ', mcs_seq[start:stop])
        print(f'{enzyme} site : ', site)

# First NheI hit; site_start is -1 if the site is absent
site_start = mcs_seq.find(site_NheI)
site_stop = site_start + len(site_NheI)

**E2.3**: Using the string method `.replace()`, compute the RNA sequence transcribed from the GFP gene sequence (given below). 

In [None]:
# GFP gene sequence (DNA), written as adjacent string literals that are
# joined into one string inside the parentheses
gfp_seq = (
    'ATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATG'
    'TTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTAC'
    'CCTTAAATTTATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTC'
    'GCGTATGGTCTTCAATGCTTTGCGAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGA'
    'GTGCCATGCCCGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAA'
    'GACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATT'
    'GATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATG'
    'TATACATCATGGCAGACAAACAAAAGAATGGAATCAAAGTTAACTTCAAAATTAGACACAACAT'
    'TGAAGATGGAAGCGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCT'
    'GTCCTTTTACCAGACAACCATTACCTGTCCACACAATCTGCCCTTTCGAAAGATCCCAACGAAA'
    'AGAGAGACCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGA'
    'ACTATACAAATAA'
)

# Answer: the RNA sequence has U wherever the DNA sequence has T
gfp_rna = gfp_seq.replace('T', 'U')
gfp_rna

**E2.4**: Create a dictionary called `rc_dict` that maps DNA bases to their complementary bases. I.e., A -> T, C -> G, etc. 

In [None]:
# Answer: pair each base with its complement (A <-> T, C <-> G)
rc_dict = dict(zip('ACGT', 'TGCA'))

# For example, looking up the complement of A:
rc_dict['A']

In [None]:
# To compute the reverse complement we need a 'translation table' for
# str.translate(): also a dictionary, but keyed by each character's integer
# code point (as given by ord()) rather than by the character itself.
# This builds the same mapping that str.maketrans(rc_dict) returns.
rc_table = {ord(base): complement for base, complement in rc_dict.items()}
rc_table

**E2.5**: By passing `rc_table` to the string method `.translate()`, then using indexing with a step of -1, compute the reverse complement of the MCS sequence given above.

In [None]:
# Reverse complement: reverse the sequence with a step of -1, then swap
# each base for its complement (the translation is per-character, so the
# order of the two steps does not affect the result)
mcs_seq_rc = mcs_seq[::-1].translate(rc_table)

# Print the forward sequence and its reverse complement
print(f'FW: {mcs_seq}')
print(f'RC: {mcs_seq_rc}')

**E2.6**: We have not yet discussed sets. Using Google, figure out what `set` objects are and explain what they represent. In particular, explain why Python evaluates `{2,3,3} < {1,2,3}` as `True`.

In [None]:
# A set is a collection of distinct elements: unlike a list, its elements
# have no specific order, and each element can occur at most once.
# So {2,3,3} and {2,3} are the same set. To see this:
print('{2,3,3} == {2,3} is ', {2,3,3} == {2,3})

# For sets, Python interprets '<' as 'is a proper subset of': every element
# of the left set is also in the right set, and the two sets are not equal.
# Because {2,3} is a proper subset of {1,2,3}, this evaluates to True
print('{2,3} < {1,2,3} is ', {2,3} < {1,2,3})

# The expression from the exercise is therefore True as well, since the
# duplicate 3 is dropped and {2,3,3} is just {2,3}
print('{2,3,3} < {1,2,3} is ', {2,3,3} < {1,2,3})

# 'Proper' matters: a set is not a proper subset of itself
# (use '<=' for the non-strict subset test)
print('{1,2,3} < {1,2,3} is ', {1,2,3} < {1,2,3})
print('{1,2,3} <= {1,2,3} is ', {1,2,3} <= {1,2,3})