#include #include #include "memutil.h" #include "nuc.h" #include "util.h" #include "err.h" static const char NUC_SYMBOL[NUM_NUCS] = {'A', 'C', 'G', 'T', 'M', 'K', 'R', 'Y', 'W', 'S', '-', 'N'}; /** * Converts a nucleotide ID into a character. */ char nuc_id_to_char(const unsigned char id) { if(id > NUM_NUCS) { my_err("Invalid nucleotide identifier %d", id); } return NUC_SYMBOL[id]; } /** * returns TRUE if this is an id for an unambiguous nucleotide * (one of A, C, T, G) */ int nuc_is_real(const unsigned char id) { return (id < NUC_M); } /* * returns TRUE if this is an id for a nucleotide ambiguity code * (one of M, K, R, Y, W, S). */ int nuc_is_ambi(const unsigned char id) { return (id > NUC_T) && (id < NUC_GAP); } /** * Converts a nucleotide to a unique nucleotide integer identifier. */ unsigned char nuc_char_to_id(const char nuc) { switch(nuc) { case('A'): case('a'): return NUC_A; case('C'): case('c'): return NUC_C; case('T'): case('t'): return NUC_T; case('G'): case('g'): return NUC_G; case('M'): case('m'): return NUC_M; case('K'): case('k'): return NUC_K; case('R'): case('r'): return NUC_R; case('Y'): case('y'): return NUC_Y; case('W'): case('w'): return NUC_W; case('S'): case('s'): return NUC_S; case('.'): case('-'): case('*'): return NUC_GAP; case('N'): case('n'): return NUC_N; } my_warn("%s:%d: unknown nucleotide character %c", __FILE__, __LINE__, nuc); return NUC_N; } /** * returns the complement of the provided nucleotide ID */ unsigned char nuc_comp(const unsigned char nuc_id) { switch(nuc_id) { case(NUC_A): return NUC_T; case(NUC_C): return NUC_G; case(NUC_G): return NUC_C; case(NUC_T): return NUC_A; case(NUC_M): return NUC_K; case(NUC_K): return NUC_M; case(NUC_R): return NUC_Y; case(NUC_Y): return NUC_R; case(NUC_W): return NUC_W; case(NUC_S): return NUC_S; } /* for N, GAP, complement is same */ return nuc_id; } /** * Fills the provided buffer with a null-terminated string representation * of the provided array of nucleotide arrays. The provided buffer * must be at least len+1 bytes long, and the provided nucleotide * id array must be at least len bytes long. * * If the provided buffer is NULL, a new buffer of length len+1 is * allocated and returned. */ char *nuc_ids_to_str(char *buf, const unsigned char *ids, const long len) { long i; if(buf == NULL) { buf = my_new(char, len+1); } for(i = 0; i < len; i++) { buf[i] = nuc_id_to_char(ids[i]); } buf[len] = '\0'; return buf; } /** * Fills the provided buffer with nucleotide ids corresponding to the * string representation of DNA sequence provided. The returned array * is NOT null terminated. The len argument should correspond to the * length of the provided DNA character string. The provided buf must * be at least len bytes long. If the provided buf is NULL a new buffer * of length len is allocated and returned. * * The return value is just the ptr to the provided buf. */ unsigned char *nuc_str_to_ids(unsigned char *buf, const char *str, const long len) { long i; if(buf == NULL) { buf = my_new(unsigned char, len); } for(i = 0; i < len; i++) { buf[i] = nuc_char_to_id(str[i]); } return buf; } /** * Does an in-place reverse complement of an array of * nucleotide ids */ void nuc_ids_revcomp(unsigned char *nuc_ids, long len) { long i; util_breverse(nuc_ids, len); for(i = 0; i < len; i++) { nuc_ids[i] = nuc_comp(nuc_ids[i]); } } /** * Returns true if any nucleotides in provided array are NUC_N */ int nuc_ids_have_n(unsigned char *nucs, const long len) { long i; for(i = 0; i < len; i++) { if(nucs[i] == NUC_N) { return TRUE; } } return FALSE; }