Source code for deepcpg.data.utils

"""General purpose IO functions."""

from __future__ import division
from __future__ import print_function

import gzip
import threading
import re

import h5py as h5
import numpy as np
import pandas as pd
import six
from six.moves import range

from . import hdf

# Constant for missing labels.
CPG_NAN = -1
# Constant for separating output names, e.g. 'cpg/cell'.
OUTPUT_SEP = '/'


[docs]class threadsafe_iter: """Takes an iterator/generator and makes it thread-safe by serializing call to the `next` method of given iterator/generator. """ def __init__(self, it): self.it = it self.lock = threading.Lock() def __iter__(self): return self def __next__(self): with self.lock: return next(self.it) def next(self): return self.__next__()
[docs]def threadsafe_generator(f): """A decorator that takes a generator function and makes it thread-safe.""" def g(*a, **kw): return threadsafe_iter(f(*a, **kw)) return g
[docs]def add_to_dict(src, dst): """Add `dict `src` to `dict` `dst` Adds values in `dict` `src` to `dict` `dst` with same keys but values are lists of added values. lists of values in `dst` can be stacked with :func:`stack_dict`. Used for example in `dpcg_eval.py` to stack dicts from different batches. Example ------- src = dict() src['a'] = 1 src['b'] = {'b1': 10} dst = dict() add_to_dict(src, dst) add_to_dict(src, dst) -> dst['a'] = [1, 1] -> dst['b'] = {'b1': [10, 10]} """ for key, value in six.iteritems(src): if isinstance(value, dict): if key not in dst: dst[key] = dict() add_to_dict(value, dst[key]) else: if key not in dst: dst[key] = [] dst[key].append(value)
[docs]def stack_dict(data): """Stacks lists of numpy arrays in `dict` `data`.""" sdata = dict() for key, value in six.iteritems(data): if isinstance(value, dict): sdata[key] = stack_dict(value) else: fun = np.vstack if value[0].ndim > 1 else np.hstack sdata[key] = fun(value) return sdata
[docs]def get_nb_sample(data_files, nb_max=None, batch_size=None): """Count number of samples in all `data_files`. Parameters ---------- data_files: list `list` with file name of DeepCpG data files. nb_max: int If defined, stop counting if that number is reached. batch_size: int If defined, return the largest multiple of `batch_size` that is smaller or equal than the actual number of samples. Returns ------- int Number of samples in `data_files`. """ nb_sample = 0 for data_file in data_files: data_file = h5.File(data_file, 'r') nb_sample += len(data_file['pos']) data_file.close() if nb_max and nb_sample > nb_max: nb_sample = nb_max break if batch_size: nb_sample = (nb_sample // batch_size) * batch_size return nb_sample
[docs]def get_dna_wlen(data_file, max_len=None): """Return length of DNA sequence windows stored in `data_file`.""" data_file = h5.File(data_file, 'r') wlen = data_file['/inputs/dna'].shape[1] if max_len: wlen = min(max_len, wlen) return wlen
[docs]def get_cpg_wlen(data_file, max_len=None): """Return number of CpG neighbors stored in `data_file`.""" data_file = h5.File(data_file, 'r') group = data_file['/inputs/cpg'] wlen = group['%s/dist' % list(group.keys())[0]].shape[1] if max_len: wlen = min(max_len, wlen) return wlen
[docs]def get_output_names(data_file, *args, **kwargs): """Return name of outputs stored in `data_file`.""" return hdf.ls(data_file, 'outputs', recursive=True, groups=False, *args, **kwargs)
[docs]def get_replicate_names(data_file, *args, **kwargs): """Return name of replicates stored in `data_file`.""" return hdf.ls(data_file, 'inputs/cpg', recursive=False, groups=True, must_exist=False, *args, **kwargs)
[docs]def get_anno_names(data_file, *args, **kwargs): """Return name of annotations stored in `data_file`.""" return hdf.ls(data_file, 'inputs/annos', recursive=False, *args, **kwargs)
[docs]def is_bedgraph(filename): """Test if `filename` is a bedGraph file. bedGraph files are assumed to start with 'track type=bedGraph' """ if isinstance(filename, str): with open(filename) as f: line = f.readline() else: pos = filename.tell() line = filename.readline() if isinstance(line, bytes): line = line.decode() filename.seek(pos) return re.match(r'track\s+type=bedGraph', line) is not None
[docs]def format_chromo(chromo): """Format chromosome name. Makes name upper case, e.g. 'mt' -> 'MT' and removes 'chr', e.g. 'chr1' -> '1'. """ return chromo.str.upper().str.replace('^CHR', '')
[docs]def sample_from_chromo(frame, nb_sample): """Randomly sample `nb_sample` samples from each chromosome. Samples `nb_sample` records from :class:`pandas.DataFrame` which must contain a column with name 'chromo'. """ def sample_frame(frame): if len(frame) <= nb_sample: return frame idx = np.random.choice(len(frame), nb_sample, replace=False) return frame.iloc[idx] frame = frame.groupby('chromo', as_index=False).apply(sample_frame) frame.index = range(len(frame)) return frame
[docs]def is_binary(values): """Check if values in array `values` are binary, i.e. zero or one.""" return ~np.any((values > 0) & (values < 1))
[docs]def read_cpg_profile(filename, chromos=None, nb_sample=None, round=False, sort=True, nb_sample_chromo=None): """Read CpG profile from TSV or bedGraph file. Reads CpG profile from either tab delimited file with columns `chromo`, `pos`, `value`. `value` or bedGraph file. `value` columns contains methylation states, which can be binary or continuous. Parameters ---------- filenamne: str Path of file. chromos: list List of formatted chromosomes to be read, e.g. ['1', 'X']. nb_sample: int Maximum number of sample in total. round: bool If `True`, round methylation states in column 'value' to zero or one. sort: bool If `True`, sort by rows by chromosome and position. nb_sample_chromo: int Maximum number of sample per chromosome. Returns ------- :class:`pandas.DataFrame` :class:`pandas.DataFrame` with columns `chromo`, `pos`, `value`. """ if is_bedgraph(filename): usecols = [0, 1, 3] skiprows = 1 else: usecols = [0, 1, 2] skiprows = 0 dtype = {usecols[0]: np.str, usecols[1]: np.int32, usecols[2]: np.float32} nrows = None if chromos is None and nb_sample_chromo is None: nrows = nb_sample d = pd.read_table(filename, header=None, comment='#', nrows=nrows, usecols=usecols, dtype=dtype, skiprows=skiprows) d.columns = ['chromo', 'pos', 'value'] if np.any((d['value'] < 0) | (d['value'] > 1)): raise ValueError('Methylation values must be between 0 and 1!') d['chromo'] = format_chromo(d['chromo']) if chromos is not None: if not isinstance(chromos, list): chromos = [str(chromos)] d = d.loc[d.chromo.isin(chromos)] if len(d) == 0: raise ValueError('No data available for selected chromosomes!') if nb_sample_chromo is not None: d = sample_from_chromo(d, nb_sample_chromo) if nb_sample is not None: d = d.iloc[:nb_sample] if sort: d.sort_values(['chromo', 'pos'], inplace=True) if round: d['value'] = np.round(d.value) if is_binary(d['value']): d['value'] = d['value'].astype(np.int8) return d
[docs]class GzipFile(object): """Wrapper to read and write gzip-compressed files. If `filename` ends with `gz`, opens file with gzip package, otherwise builtin `open` function. Parameters ---------- filename: str Path of file mode: str File access mode *args: list Unnamed arguments passed to open function. **kwargs: dict Named arguments passed to open function. """ def __init__(self, filename, mode='r', *args, **kwargs): self.is_gzip = filename.endswith('.gz') if self.is_gzip: self.fh = gzip.open(filename, mode, *args, **kwargs) else: self.fh = open(filename, mode, *args, **kwargs) def __iter__(self): return self.fh.__iter__() def __next__(self): return self.fh.__next__() def read(self, *args, **kwargs): return self.fh.read(*args, **kwargs) def readline(self, *args, **kwargs): return self.fh.readline(*args, **kwargs) def readlines(self, *args, **kwargs): return self.fh.readlines(*args, **kwargs) def write(self, data): if self.is_gzip and isinstance(data, str): data = data.encode() self.fh.write(data) def writelines(self, *args, **kwargs): self.fh.writelines(*args, **kwargs) def tell(self, *args, **kwargs): return self.fh.tell(*args, **kwargs) def seek(self, *args, **kwargs): self.fh.seek(*args, **kwargs) def closed(self): return self.fh.closed() def close(self): self.fh.close() def __iter__(self): self.fh.__iter__() def iter(self): self.fh.iter()