# Source code for perseuspy.dependent_peptides

"""
Dependent peptides can be extracted from the `allPeptides.txt` table
and are annotated using the `experimentalDesign.txt`.

This code forms the basis for the corresponding Perseus plugin PluginDependentPeptides.
"""
import pandas as pd
from perseuspy.io.perseus.matrix import read_perseus
pd.read_perseus = read_perseus
from perseuspy.io.maxquant import read_rawFilesTable
from perseuspy.parameters import fileParam, parse_parameters
import numpy as np

# Columns that together identify one dependent peptide. They serve as the
# pivot index and as the grouping key when localizations are counted.
_index_columns = [
    'DP Proteins',
    'DP Base Sequence',
    'DP Cluster Index',
    'DP Modification',
]

# Columns requested from the input table: the ratio, the raw file it was
# measured in, the modified amino acid(s), followed by the index columns.
_cols = ['DP Ratio mod/base', 'Raw file', 'DP AA']
_cols += _index_columns
def read_dependent_peptides(filename):
    """ read the dependent peptides table and extract localization information

    Rows without a 'DP Ratio mod/base' value are dropped. The remaining
    ratios are converted to float and pivoted into one row per dependent
    peptide (see ``_index_columns``) and one column per raw file. Where
    several rows fall into the same cell their median is taken.

    :param filename: path to the 'allPeptides.txt' table.
    :returns dep, localization: the dependent peptide table, localization information.
    """
    df = (pd.read_perseus(filename, usecols=_cols)
            .dropna(subset=['DP Ratio mod/base']))
    df['DP Ratio mod/base'] = df['DP Ratio mod/base'].astype(float)
    # Use the string alias rather than np.median: pandas resolves the numpy
    # callable to its own group-wise median anyway and recent versions emit
    # a FutureWarning when the callable is passed.
    dep = df.pivot_table('DP Ratio mod/base', index=_index_columns,
                columns='Raw file', aggfunc='median')
    localization = _count_localizations(df)
    return dep, localization

def _set_column_names(dep, exp):
    """ relabel the raw-file columns of the dependent peptides table
    as {experiment}_{fraction}.

    The input table is not modified; a renamed copy is returned whose
    column axis is named 'Column Name'.

    :param dep: dependent peptides table with one column per raw file.
    :param exp: table providing the 'Raw file', 'Experiment' and
        'Fraction' columns.
    :returns: the renamed dependent peptides table.
    """
    fractions = exp['Fraction'].astype(str)
    labels = exp['Experiment'] + '_' + fractions
    # raw file name -> new column label
    mapping = {raw_file: label for raw_file, label in zip(exp['Raw file'], labels)}
    renamed = dep.rename(columns=mapping)
    renamed.columns.name = 'Column Name'
    return renamed
    
from collections import defaultdict
def count(args):
    """ count occurrences in a list of lists

    :param args: iterable of iterables holding hashable items.
    :returns: a ``defaultdict(int)`` mapping each item to the number of
        times it occurs across all inner iterables.

    >>> dict(count([['a', 'b'], ['a']]))
    {'a': 2, 'b': 1}
    """
    counts = defaultdict(int)
    for arg in args:
        for item in arg:
            counts[item] += 1
    return counts

def _count_localizations(df):
    """ determine the most frequent localization for each dependent peptide.

    The rows are grouped by ``_index_columns``. Within each group the
    ';'-separated 'DP AA' entries are split and tallied with ``count``.
    The tallies are then reduced per peptide by ``_frequent_localizations``.

    :param df: allPeptides.txt table.
    :returns: the per-peptide result of ``_frequent_localizations``.
    """
    def _tally(aa_entries):
        # one list of amino acids per row, counted over the whole group
        return count(aa_entries.str.split(';').values)

    tallies = df.groupby(_index_columns)['DP AA'].apply(_tally)
    # the tallied amino acids end up in index level 4, behind the four
    # grouping columns; give that level a name so reset_index yields a column
    tallies.index = tallies.index.set_names('DP AA', level=4)
    tallies.name = 'DP AA count'
    long_table = tallies.reset_index()
    return long_table.groupby(_index_columns).apply(_frequent_localizations)

def _frequent_localizations(df):
    """ pick the most frequent localization(s) from a table of counts.

    If 'nterm' is among the localizations sharing the highest count it is
    chosen on its own, n-terminal modifications being considered
    biologically more likely. Otherwise all tied localizations are
    returned, sorted and joined by ';'.

    :param df: table with a 'DP AA' and a 'DP AA count' column.
    :returns: Series with the entries 'DP AA max count' and 'DP AA'.
    """
    top_count = int(df['DP AA count'].max())
    is_top = df['DP AA count'] == top_count
    candidates = set(df.loc[is_top, 'DP AA'].unique())
    if 'nterm' in candidates:
        label = 'nterm'
    else:
        label = ';'.join(sorted(candidates))
    return pd.Series({'DP AA max count': top_count, 'DP AA': label})

def run_dependent_peptides_from_parameters(paramfile, outfile):
    """ transform an allPeptides.txt and a raw files table
    into the dependentPeptides.txt table written in outfile.

    The two input paths are looked up in the parsed parameters and
    handed to ``run_dependent_peptides``.

    :param paramfile: Perseus parameters.xml including at least two FileParam
    entries named 'allPeptides.txt' and 'Raw files table'.
    :param outfile: Path to the output file.
    """
    parameters = parse_parameters(paramfile)
    allPeptides_file = fileParam(parameters, 'allPeptides.txt')
    rawFilesTable_file = fileParam(parameters, 'Raw files table')
    run_dependent_peptides(allPeptides_file, rawFilesTable_file, outfile)

def run_dependent_peptides(allPeptides_file, rawFilesTable_file, outfile):
    """ transform an allPeptides.txt and a raw files table
    into the dependentPeptides.txt table written in outfile.

    The dependent peptide ratios are read, their raw-file columns are
    renamed via ``_set_column_names``, the localization information is
    joined on and the result is written with ``to_perseus``.

    :param allPeptides_file: MaxQuant 'allPeptides.txt' output table.
    :param rawFilesTable_file: MaxQuant 'Raw files'-tab table.
    :param outfile: Path to the output file.
    """
    ratios, localization = read_dependent_peptides(allPeptides_file)
    exp = read_rawFilesTable(rawFilesTable_file)
    renamed = _set_column_names(ratios, exp)
    # capture the ratio columns before the localization columns are joined;
    # only these are passed on as main columns
    main_columns = list(renamed.columns)
    dep = renamed.join(localization).reset_index()
    dep.to_perseus(outfile, main_columns=main_columns)