Source code for perseuspy.io.perseus.matrix

import os
from collections import OrderedDict

import numpy as np
import pandas as pd

# Field delimiter shared by the reader and writer functions in this module.
separator = '\t'

# One-letter codes from a `#!{Type}` annotation row -> dtype handed to pandas
# when the matrix body is parsed.
perseus_to_dtype = {
    'E': float,
    'T': str,
    'C': 'category',
    'M': str,
    'N': float,
}

# Reverse direction: dtype of a DataFrame column -> Perseus type code written
# into the `#!{Type}` row on export.
# NOTE(review): `pd.Categorical.dtype` is the property object defined on the
# class rather than a dtype instance, so a column's real categorical dtype
# presumably never matches this key -- confirm before relying on it.
dtype_to_perseus = {
    np.dtype('float'): 'N',
    np.dtype('str'): 'T',
    np.dtype('object'): 'T',
    np.dtype('int64'): 'N',
    pd.Categorical.dtype: 'C',
}

def read_annotations(path_or_file, separator, type_map=perseus_to_dtype, reset=True):
    """
    Read all annotations from the specified file.

    The first line is taken as the column names. Every following line that
    starts with ``#!{`` is parsed as an annotation row of the form
    ``#!{<name>}<value><sep><value>...``.

    >>> annotations = read_annotations(path_or_file, separator)
    >>> colnames = annotations['Column Name']
    >>> types = annotations['Type']
    >>> annot_row = annotations['Annot. row name']

    :param path_or_file: Path or file-like object
    :param separator: Column separator
    :param type_map: Mapping Perseus types to numpy.dtype
    :param reset: Reset the file after reading. Useful for file-like, no-op for paths.
    :returns: Ordered dictionary of annotations.
    :raises ValueError: If an annotation row has no closing ``}`` after its name.
    :raises KeyError: If the ``Type`` row holds a code that is not in ``type_map``.
    """
    annotations = OrderedDict()
    with PathOrFile(path_or_file, 'r', reset=reset) as f:
        annotations['Column Name'] = f.readline().strip().split(separator)
        for line in f:
            if not line.startswith('#!{'):
                continue
            tokens = line.strip().split(separator)
            # Split on the FIRST '}' only: the first value of the row may
            # itself contain a closing brace, which must stay in the value.
            head, brace, first_value = tokens[0].partition('}')
            if not brace:
                raise ValueError(
                    'Malformed annotation row, missing "}}": {!r}'.format(tokens[0]))
            name = head.replace('#!{', '')
            values = [first_value] + tokens[1:]
            if name == 'Type':
                # Translate Perseus type codes into dtypes right away.
                values = [type_map[x] for x in values]
            annotations[name] = values
    return annotations
def create_column_index(annotations):
    """
    Build the column pd.MultiIndex from the column names plus every
    categorical (``C:``-prefixed) annotation row.

    Categorical rows shorter than the number of columns are padded with the
    default category ``''``, so non-main columns get a category as well.

    :param annotations: Mapping with a 'Column Name' entry and optional 'C:<name>' rows
    :returns: pd.MultiIndex whose first level is 'Column Name'
    """
    names = annotations['Column Name']
    width = len(names)

    levels = OrderedDict()
    levels['Column Name'] = names
    for key, row in annotations.items():
        if not key.startswith('C:'):
            continue
        # Right-pad short rows so every level spans all columns.
        padding = [''] * (width - len(row))
        levels[key.replace('C:', '', 1)] = row + padding

    # One tuple per column, holding its label on each level.
    per_column = list(zip(*levels.values()))
    return pd.MultiIndex.from_tuples(per_column, names=list(levels.keys()))
def read_perseus(path_or_file, type_map=perseus_to_dtype, **kwargs):
    """
    Read a Perseus-formatted matrix into a pd.DataFrame.
    Annotation rows will be converted into a multi-index.

    NOTE(review): earlier documentation mentioned a monkey-patched
    `to_perseus` method on the returned frame; nothing in this function
    attaches one -- confirm whether that happens elsewhere.

    :param path_or_file: File path or file-like object
    :param type_map: How to map Perseus types to numpy.dtype
    :param kwargs: Keyword arguments passed as-is to pandas.read_csv. `usecols`
        may be a list of column positions or of column names. A `dtype` dict
        overrides the dtypes derived from the `Type` annotation row.
    :returns: The parsed data frame
    :raises ValueError: If a name in `usecols` is not a column name of the file.
    """
    annotations = read_annotations(path_or_file, separator, type_map)
    column_index = create_column_index(annotations)

    usecols = kwargs.get('usecols')
    if usecols is not None:
        positions = list(usecols)
        if positions and isinstance(positions[0], str):
            # Resolve names against the header row. The entries of the
            # MultiIndex are tuples and would never compare equal to a name.
            header = list(annotations['Column Name'])
            positions = [header.index(name) for name in positions]
        # pandas ignores the element order of `usecols` and returns columns in
        # file order, so the labels must be selected in sorted order too.
        column_index = column_index[sorted(positions)]

    if 'Type' in annotations:
        dtype = dict(zip(annotations['Column Name'], annotations['Type']))
        user_dtype = kwargs.get('dtype')
        if user_dtype is None:
            kwargs['dtype'] = dtype
        elif isinstance(user_dtype, dict):
            # Per-column overrides from the caller win over the Type row.
            dtype.update(user_dtype)
            kwargs['dtype'] = dtype
        # Any other `dtype` (a single dtype for all columns) is passed
        # through to pandas untouched.

    # Annotation rows start with '#' and are skipped as comments by pandas.
    df = pd.read_csv(path_or_file, sep=separator, comment='#', **kwargs)
    df.columns = column_index
    return df
import numpy as np
def to_perseus(df, path_or_file, main_columns=None, separator=separator,
               type_map=dtype_to_perseus, numerical_annotation_rows=set([])):
    """
    Save pd.DataFrame to Perseus text format.

    Writes the column names, then one ``#!{...}`` annotation row for the
    column types and one per additional column-index level, then the data.

    Side effect: if ``df.columns`` has no name it is set to 'Column Name'.

    :param df: pd.DataFrame
    :param path_or_file: File name or file-like object
    :param main_columns: Main columns. Will be infered if set to None. All
        numeric columns up-until the first non-numeric column are considered
        main columns.
    :param separator: For separating fields, default '\t'
    :param type_map: Mapping from column dtype to Perseus type code. Categorical
        columns missing from the mapping are written as 'C'.
    :param numerical_annotation_rows: Names of column-index levels to write as
        numerical ('N:') instead of categorical ('C:') annotation rows.
    :raises KeyError: If a non-main, non-categorical column has a dtype that is
        not in ``type_map``.
    """
    if not df.columns.name:
        df.columns.name = 'Column Name'
    column_names = df.columns.get_level_values('Column Name')
    if main_columns is None:
        main_columns = _infer_main_columns(df)

    def _type_code(name, dtype):
        # Main columns are always tagged 'E', whatever their dtype.
        if name in main_columns:
            return 'E'
        # Each categorical column carries its own CategoricalDtype instance,
        # which a plain dict lookup cannot match, so detect it by type.
        if isinstance(dtype, pd.api.types.CategoricalDtype):
            return type_map.get(dtype, 'C')
        return type_map[dtype]

    annotations = OrderedDict()
    annotations['Type'] = [_type_code(name, dtype)
                           for name, dtype in zip(column_names, df.dtypes)]
    # Walk the levels in index order so the file layout is deterministic.
    for name in df.columns.names:
        if name == 'Column Name':
            continue
        annotation_type = 'N' if name in numerical_annotation_rows else 'C'
        annotations['{}:{}'.format(annotation_type, name)] = df.columns.get_level_values(name)

    with PathOrFile(path_or_file, 'w') as f:
        f.write(separator.join(str(x) for x in column_names) + '\n')
        for name, values in annotations.items():
            f.write('#!{{{name}}}{values}\n'.format(
                name=name, values=separator.join([str(x) for x in values])))
        # The header line was written above, so pandas must not repeat it.
        df.to_csv(f, header=False, index=False, sep=separator)
class PathOrFile():
    """Small context manager for file paths or file-like objects

    Given a path (``str`` or ``os.PathLike``) the file is opened on entry and
    closed on exit. Any other object is treated as an already-open file-like
    object: it is yielded as-is and never closed here.

    :param path_or_file: Path to a file or file-like object
    :param mode: Set reading/writing mode. ``None`` means ``'r'``.
    :param reset: Reset file-like to initial position. Has no effect on path."""

    def __init__(self, path_or_file, mode=None, reset=False):
        self.path_or_file = path_or_file
        self.mode = mode
        self.isPath = isinstance(path_or_file, (str, os.PathLike))
        self.reset = reset and not self.isPath
        # Defined up front so __exit__ is safe even if __enter__ never ran.
        self.open_file = None
        if self.reset:
            # A zero-offset relative seek reports the current position.
            self.position = self.path_or_file.seek(0, 1)

    def __enter__(self):
        if self.isPath:
            # open() rejects mode=None, so fall back to read mode.
            mode = 'r' if self.mode is None else self.mode
            self.open_file = open(self.path_or_file, mode)
            return self.open_file
        self.open_file = None
        return self.path_or_file

    def __exit__(self, *args):
        # Only close what this object opened itself.
        if self.open_file is not None:
            self.open_file.close()
        if self.reset:
            self.path_or_file.seek(self.position)
_numeric_dtypes = {np.dtype('float32'), np.dtype('float64'), np.dtype('int32'), np.dtype('int64')} def _infer_main_columns(df, index_level='Column Name', numeric_dtypes=_numeric_dtypes): """ All numeric columns up-until the first non-numeric column are considered main columns. :param df: The pd.DataFrame :param index_level: Name of the index level of the column names. Default 'Column Name' :param numeric_dtypes: Set of numpy.dtype containing all numeric types. Default int/float. :returns: The names of the infered main columns """ columns = df.columns.get_level_values(index_level) main_columns = [] for i,dtype in enumerate(df.dtypes): if dtype in numeric_dtypes: main_columns.append(columns[i]) else: break return main_columns