Source code for deepcpg.data.fasta
"""Functions reading FASTA files."""
from __future__ import division
from __future__ import print_function
import os
from glob import glob
import gzip as gz
from six.moves import range
from ..utils import to_list
[docs]class FastaSeq(object):
"""FASTA sequence."""
def __init__(self, head, seq):
self.head = head
self.seq = seq
[docs]def parse_lines(lines):
"""Parse FASTA sequences from list of strings.
Parameters
----------
lines: list
List of lines from FASTA file.
Returns
-------
list
List of :class:`FastaSeq` objects.
"""
seqs = []
seq = None
start = None
lines = [line.strip() for line in lines]
lines = [line for line in lines if len(line) > 0]
for i in range(len(lines)):
if lines[i][0] == '>':
if start is not None:
head = lines[start]
seq = ''.join(lines[start + 1: i])
seqs.append(FastaSeq(head, seq))
start = i
if start is not None:
head = lines[start]
seq = ''.join(lines[start + 1:])
seqs.append(FastaSeq(head, seq))
return seqs
[docs]def read_file(filename, gzip=None):
"""Read FASTA file and return sequences.
Parameters
----------
filename: str
File name.
gzip: bool
If `True`, file is gzip compressed. If `None`, suffix is used to
determine if file is compressed.
Returns
-------
List of :class:`FastaSeq` objects.
"""
list
if gzip is None:
gzip = filename.endswith('.gz')
if gzip:
lines = gz.open(filename, 'r').read().decode()
else:
lines = open(filename, 'r').read()
lines = lines.splitlines()
return parse_lines(lines)
[docs]def select_file_by_chromo(filenames, chromo):
"""Select file of chromosome `chromo`.
Parameters
----------
filenames: list
List of file names or directory with FASTA files.
chromo: str
Chromosome that is selected.
Returns
-------
str
Filename in `filenames` that contains chromosome `chromo`.
"""
filenames = to_list(filenames)
if len(filenames) == 1 and os.path.isdir(filenames[0]):
filenames = glob(os.path.join(filenames[0],
'*.dna.chromosome.%s.fa*' % chromo))
for filename in filenames:
if filename.find('chromosome.%s.fa' % chromo) >= 0:
return filename
[docs]def read_chromo(filenames, chromo):
"""Read DNA sequence of chromosome `chromo`.
Parameters
----------
filenames: list
List of FASTA files.
chromo: str
Chromosome that is read.
Returns
-------
str
DNA sequence of chromosome `chromo`.
"""
filename = select_file_by_chromo(filenames, chromo)
if not filename:
raise ValueError('DNA file for chromosome "%s" not found!' % chromo)
fasta_seqs = read_file(filename)
if len(fasta_seqs) != 1:
raise ValueError('Single sequence expected in file "%s"!' % filename)
return fasta_seqs[0].seq