Source code for deepcpg.data.dna
"""Functions for representing DNA sequences."""
from __future__ import division
from __future__ import print_function
from collections import OrderedDict
import numpy as np
from six.moves import range
# Integer code of each nucleotide character; the special 'N' character
# comes last.
CHAR_TO_INT = OrderedDict(
    (base, code) for code, base in enumerate('ATGCN'))
# Inverse lookup: integer code back to nucleotide character.
INT_TO_CHAR = dict(zip(CHAR_TO_INT.values(), CHAR_TO_INT.keys()))
def get_alphabet(special=False, reverse=False):
    """Return char->int alphabet.

    Parameters
    ----------
    special: bool
        If `True`, keep the special 'N' character in the alphabet.
        If `False` (default), remove it.
    reverse: bool
        If `True`, return int->char instead of char->int alphabet.

    Returns
    -------
    OrderedDict
        DNA alphabet, in the order defined by `CHAR_TO_INT`.
    """
    # Work on a copy so the module-level mapping is never mutated.
    alpha = OrderedDict(CHAR_TO_INT)
    if not special:
        del alpha['N']
    if reverse:
        # Build an OrderedDict (not a plain dict) so the reversed alphabet
        # has the documented return type and a guaranteed ordering.
        alpha = OrderedDict((v, k) for k, v in alpha.items())
    return alpha
def char_to_int(seq):
    """Encode the characters of a single DNA sequence as integers.

    Parameters
    ----------
    seq: str
        DNA sequence. It is upper-cased before the lookup, so lower-case
        nucleotides are accepted.

    Returns
    -------
    list
        Integer code of every character of `seq`, in order.

    Raises
    ------
    KeyError
        If `seq` contains a character that is not in `CHAR_TO_INT`.
    """
    encode = CHAR_TO_INT.__getitem__
    return list(map(encode, seq.upper()))
def int_to_char(seq, join=True):
    """Decode the integers of a single sequence back to nucleotide chars.

    Parameters
    ----------
    seq: list
        Integer-encoded sequence.
    join: bool
        If `True`, join the characters into a single `str`.

    Returns
    -------
    `str` if `join=True`, otherwise list of chars.

    Raises
    ------
    KeyError
        If `seq` contains an integer that is not in `INT_TO_CHAR`.
    """
    decode = INT_TO_CHAR.__getitem__
    chars = list(map(decode, seq))
    return ''.join(chars) if join else chars
def int_to_onehot(seqs, dim=4):
    """One-hot encode an array of integer sequences.

    Takes an array of integer sequences, typically of shape
    [nb_seq, seq_len], and one-hot encodes it along a new last axis.
    Integers outside ``[0, dim)`` are encoded as all zeros; with the default
    ``dim=4`` this includes the special nucleotide 'N' (integer 4), which
    becomes [0, 0, 0, 0].

    Parameters
    ----------
    seqs: :class:`numpy.ndarray`
        [nb_seq, seq_len] :class:`numpy.ndarray` (or array-like) of integer
        sequences. Inputs with fewer than two dimensions are promoted to 2D;
        inputs with more dimensions keep their shape.
    dim: int
        Number of nucleotides, i.e. size of the one-hot axis.

    Returns
    -------
    :class:`numpy.ndarray`
        [nb_seq, seq_len, dim] :class:`numpy.ndarray` of one-hot encoded
        sequences with dtype int8. In general, the shape is the (at least
        2D) input shape with `dim` appended.
    """
    seqs = np.atleast_2d(np.asarray(seqs))
    # Compare every position against the codes 0..dim-1 via broadcasting:
    # position p gets a 1 in channel i exactly when seqs[p] == i.
    return (seqs[..., np.newaxis] == np.arange(dim)).astype('int8')
def onehot_to_int(seqs, axis=-1):
    """Translate one-hot sequences to integer sequences.

    Parameters
    ----------
    seqs: :class:`numpy.ndarray`
        Array (or array-like, e.g. nested lists) of one-hot encoded
        sequences.
    axis: int
        Axis that holds the one-hot encoding; defaults to the last axis.

    Returns
    -------
    :class:`numpy.ndarray`
        Index of the maximum along `axis` for every position. An all-zero
        vector is translated to 0, since argmax returns the first index
        when all entries are equal.
    """
    # np.argmax also accepts plain sequences, unlike the ndarray method.
    return np.argmax(seqs, axis=axis)