# Source code for skchem.io.smiles

#! /usr/bin/env python
#
# Copyright (C) 2015-2016 Rich Lewis <rl403@cam.ac.uk>
# License: 3-clause BSD

"""
# skchem.io.smiles

Defining input and output operations for smiles files.
"""

import warnings
from functools import wraps

import pandas as pd

from ..utils import Suppressor, squeeze
from ..core import Mol


def read_smiles(smiles_file, smiles_column=0, name_column=None, delimiter='\t',
                title_line=False, error_bad_mol=False, warn_bad_mol=True,
                drop_bad_mol=True, *args, **kwargs):
    """Read a smiles file into a pandas data frame.

    The function wraps the pandas read_csv function.

    Args:
        smiles_file (str, file-like):
            Location of data to load, specified as a string or passed
            directly as a file-like object.  URLs may also be used, see the
            pandas.read_csv documentation.
        smiles_column (int):
            The column index at which SMILES are provided.
            Defaults to `0`.
        name_column (int):
            The column index at which compound names are provided, for use as
            the index in the DataFrame.  The column is renamed to `batch`.
            If None, use the default index.
            Defaults to `None`.
        delimiter (str):
            The delimiter used.
            Defaults to `\\t`.
        title_line (bool):
            Whether a title line is provided, to use as column titles.  When
            True this takes precedence over any `header` keyword argument.
            Defaults to `False`.
        error_bad_mol (bool):
            Whether an error should be raised when a molecule fails to parse.
            Defaults to `False`.
        warn_bad_mol (bool):
            Whether a warning should be raised when a molecule fails to parse.
            Only consulted when `error_bad_mol` is false.
            Defaults to `True`.
        drop_bad_mol (bool):
            If true, drop any row with smiles that failed to parse.
            Otherwise, the field is None.
            Defaults to `True`.
        args, kwargs:
            Arguments will be passed to pandas read_csv arguments.

    Returns:
        pandas.DataFrame:
            The loaded data frame, with Mols supplied in the `structure`
            field, which is always the first column.  The result is passed
            through `squeeze(..., axis=1)` before being returned, so the
            exact type for single column data depends on that helper.

    Raises:
        ValueError:
            If `smiles_column` and `name_column` are the same index, or if a
            molecule fails to parse and `error_bad_mol` is true.

    See Also:
        pandas.read_csv
        skchem.Mol.from_smiles
        skchem.io.sdf
    """

    # Using one column as both structure and index can never succeed (the
    # structure column would be consumed by the index), so fail early with a
    # clear message instead of an opaque error further down.
    if name_column is not None and name_column == smiles_column:
        raise ValueError('smiles_column and name_column must refer to '
                         'different columns.')

    with Suppressor():

        # A user supplied `header` is removed from kwargs so that it is not
        # passed to pandas twice.  `title_line=True` means the first line
        # holds the column titles, as is usual for smiles files, and wins
        # over any explicit `header`.
        header = kwargs.pop('header', None)
        if title_line is True:
            header = 0

        # read the smiles file
        data = pd.read_csv(smiles_file, *args, delimiter=delimiter,
                           header=header, **kwargs)

        # rename the smiles column to `structure` and the name column to
        # `batch`.  The `is not None` test matters: column index 0 is a
        # legitimate name column but is falsy.
        labels = list(data.columns)
        labels[smiles_column] = 'structure'
        if name_column is not None:
            labels[name_column] = 'batch'
        data.columns = labels

        def parse(row):
            """ Parse smiles for row, returning None if it cannot be decoded. """
            try:
                return Mol.from_smiles(row.structure)
            except ValueError:
                msg = 'Molecule {} could not be decoded.'.format(row.name)
                if error_bad_mol:
                    raise ValueError(msg)
                elif warn_bad_mol:
                    warnings.warn(msg)
                return None

        # force the raw field to text before it is handed to the parser
        data['structure'] = data['structure'].apply(str)
        data['structure'] = data.apply(parse, axis=1)

        if drop_bad_mol:
            data = data[data['structure'].notnull()]

        # set index if passed
        if name_column is not None:
            data = data.set_index(data.columns[name_column])

        # move the structure column to the front
        cols = data.columns.tolist()
        cols.remove('structure')
        data = data[['structure'] + cols]

        return squeeze(data, axis=1)

def write_smiles(data, smiles_path):
    """Write molecules, and any data accompanying them, to a smiles file.

    The file is tab separated and has no header line.  Each line starts with
    the SMILES string, followed by the index value(s) and then any remaining
    columns in their original order.

    Args:
        data (pd.Series or pd.DataFrame):
            The data to write.  A data frame must hold its molecules in a
            `structure` column; a series is treated as that column.  Every
            entry must provide a `to_smiles` method.
        smiles_path (str):
            The path to write the data to.
    """

    # a series is promoted to a single column frame, and a copy is taken so
    # that the object owned by the caller is never modified
    if isinstance(data, pd.Series):
        frame = data.to_frame(name='structure')
    else:
        frame = data
    frame = frame.copy()

    def as_smiles(mol):
        return mol.to_smiles()

    frame['structure'] = frame.structure.apply(as_smiles)

    # the index becomes ordinary column(s) so that it is written as data
    frame = frame.reset_index()

    # bring the structure column to the front, keeping the rest in order
    labels = list(frame.columns)
    position = labels.index('structure')
    ordered = [labels[position]] + labels[:position] + labels[position + 1:]

    frame.reindex(columns=ordered).to_csv(smiles_path, sep='\t',
                                          header=False, index=False)

# The helpers below expose the module level reader and writer as pandas
# methods.  `wraps` copies the metadata (name, docstring, ...) of the wrapped
# function onto each helper, so the pandas methods present the documentation
# of `read_smiles` / `write_smiles`.

@classmethod
@wraps(read_smiles)
def _from_smiles_df(cls, *args, **kwargs):
    # the class itself is not used; everything is delegated to read_smiles
    loaded = read_smiles(*args, **kwargs)
    return loaded


@classmethod
@wraps(read_smiles)
def _from_smiles_series(cls, *args, **kwargs):
    loaded = read_smiles(*args, **kwargs)
    # NOTE(review): relies on the result of read_smiles exposing a
    # `structure` attribute - confirm this holds when the file has a single
    # column and the result has been through `squeeze`.
    return loaded.structure


@wraps(write_smiles)
def _to_smiles_df(self, *args, **kwargs):
    # the pandas object being written is passed through as `data`
    outcome = write_smiles(self, *args, **kwargs)
    return outcome


# attach the reader as an alternate constructor on both pandas containers
pd.Series.from_smiles = _from_smiles_series
pd.DataFrame.from_smiles = _from_smiles_df

# the same writer serves both containers, as write_smiles accepts either
pd.DataFrame.to_smiles = _to_smiles_df
pd.Series.to_smiles = _to_smiles_df