# Source code for deepcpg.motifs
"""Motif analysis."""
from __future__ import division
from __future__ import print_function
from collections import OrderedDict
import re
import pandas as pd
def read_tomtom(path):
    """Read Tomtom output file.

    Parameters
    ----------
    path: str
        Path of the Tomtom output file, as accepted by
        :func:`pandas.read_table`.

    Returns
    -------
    :class:`pandas.DataFrame`
        Table with lower-cased column names and an additional integer
        column 'idx', obtained by removing the text 'filter' from the
        'query id' column.
    """
    # Drop the leading '#' of the first header field before lower-casing.
    table = pd.read_table(path).rename(columns={'#Query ID': 'Query ID'})
    table.columns = [name.lower() for name in table.columns]
    filter_numbers = table['query id'].str.replace('filter', '')
    table['idx'] = list(map(int, filter_numbers))
    return table
def read_meme_db(meme_db_file):
    """Read MEME database as Pandas DataFrame.

    Only ``MOTIF`` lines that carry both an identifier and a name are
    kept. The protein is the motif name with parentheses removed, cut at
    the first underscore. A ``URL`` line following a kept motif sets its
    'url'; motifs without one get an empty string.

    Parameters
    ----------
    meme_db_file: str
        File name of MEME database.

    Returns
    -------
    :class:`pandas.DataFrame`
        :class:`pandas.DataFrame` with columns 'id', 'protein', 'url'.
        Empty (but with these columns) if the file contains no named motif.
    """
    columns = ['id', 'protein', 'url']
    motifs = []
    motif = None
    # The context manager closes the file even if parsing raises.
    with open(meme_db_file) as meme_db:
        for line in meme_db:
            if line.startswith('MOTIF'):
                if motif:
                    motifs.append(motif)
                    motif = None
                fields = line.split()[1:]
                if len(fields) < 2:
                    # Motif without a name: skip it; since `motif` stays
                    # None, its URL line is ignored as well.
                    continue
                name = re.sub(r'\(([^)]+)\)', r'\1', fields[1])
                motif = {
                    'id': fields[0],
                    'protein': name.split('_')[0],
                    'url': '',
                }
            elif motif and line.startswith('URL'):
                fields = line.split()
                # A bare 'URL' line without a value keeps the empty default.
                if len(fields) > 1:
                    motif['url'] = fields[1]
    if motif:
        motifs.append(motif)
    # Every row keeps index label 0, as when each motif was built as its own
    # one-row frame and concatenated; retained for backward compatibility.
    return pd.DataFrame(motifs, columns=columns, index=[0] * len(motifs))
def get_report(filter_stats_file, tomtom_file, meme_motifs):
    """Read and join `filter_stats_file` and `tomtom_file`.

    Used by `dcpg_filter_motifs.py` to read and join output files.

    Parameters
    ----------
    filter_stats_file: str
        Path of stats file created with `dcpg_filter_motifs.py`.
    tomtom_file: str
        Path of Tomtom output file.
    meme_motifs: :class:`pandas.DataFrame`
        :class:`pandas.DataFrame` from `read_meme_db`.

    Returns
    -------
    :class:`pandas.DataFrame`
        :class:`pandas.DataFrame` with columns from Tomtom and statistic file.
    """
    stats = pd.read_table(filter_stats_file)

    # Order Tomtom matches per filter, then drop the columns that are not
    # carried into the report.
    matches = read_tomtom(tomtom_file)
    matches = matches.sort_values(['idx', 'q-value', 'e-value'])
    is_dropped = matches.columns.isin(['query id', 'optimal offset'])
    matches = matches.loc[:, ~is_dropped]

    # Outer join on the filter index keeps rows present in only one table.
    report = stats.merge(matches, on='idx', how='outer')

    # Attach the motif annotation to every matched target.
    annotation = meme_motifs.rename(columns={'id': 'target id'})
    report = report.merge(annotation, on='target id', how='left')
    report.index.name = None
    return report