# Source code for skchem.io.smiles
#! /usr/bin/env python
#
# Copyright (C) 2015-2016 Rich Lewis <rl403@cam.ac.uk>
# License: 3-clause BSD
"""
# skchem.io.smiles
Defining input and output operations for smiles files.
"""
import warnings
from functools import wraps
import pandas as pd
from ..utils import Suppressor, squeeze
from ..core import Mol
def read_smiles(smiles_file, smiles_column=0, name_column=None, delimiter='\t',
                title_line=False, error_bad_mol=False, warn_bad_mol=True,
                drop_bad_mol=True, *args, **kwargs):
    """Read a smiles file into a pandas dataframe.

    The function wraps the pandas read_csv function.

    Args:
        smiles_file (str, file-like):
            Location of data to load, specified as a string or passed
            directly as a file-like object.  URLs may also be used, see the
            pandas.read_csv documentation.
        smiles_column (int):
            The column index at which SMILES are provided.
            Defaults to `0`.
        name_column (int):
            The column index at which compound names are provided, for use
            as the index in the DataFrame.  If None, use the default index.
            Defaults to `None`.
        delimiter (str):
            The delimiter used.
            Defaults to `\\t`.
        title_line (bool):
            Whether a title line is provided, to use as column titles.
            Defaults to `False`.
        error_bad_mol (bool):
            Whether an error should be raised when a molecule fails to parse.
            Defaults to `False`.
        warn_bad_mol (bool):
            Whether a warning should be raised when a molecule fails to
            parse.  Defaults to `True`.
        drop_bad_mol (bool):
            If true, drop any row with smiles that failed to parse.
            Otherwise, the field is None.  Defaults to `True`.
        args, kwargs:
            Arguments will be passed to pandas read_csv arguments.  A
            `header` keyword is honoured unless `title_line` is True.

    Returns:
        pandas.DataFrame:
            The loaded data frame, with Mols supplied in the `structure`
            field, which is placed first.  The result is passed through
            `squeeze(..., axis=1)` before being returned.

    Raises:
        ValueError:
            If a molecule fails to parse and `error_bad_mol` is set.

    See Also:
        pandas.read_csv
        skchem.Mol.from_smiles
        skchem.io.sdf
    """
    # NOTE(review): the original indentation was lost; the whole body is kept
    # inside the Suppressor context.  Presumably this silences toolkit output
    # while parsing - confirm that warnings emitted below remain visible.
    with Suppressor():
        # Pop any user-supplied header so it is not passed to pandas twice.
        # `title_line=True` takes precedence and means "line zero", as is
        # usual for smiles files; otherwise the user's value (or None) is used.
        header = kwargs.pop('header', None)
        if title_line is True:
            header = 0

        # read the smiles file
        data = pd.read_csv(smiles_file, *args, delimiter=delimiter,
                           header=header, **kwargs)

        # replace the smiles column with the structure column
        labels = list(data.columns)
        labels[smiles_column] = 'structure'
        # Compare against None explicitly: column index 0 is a valid choice
        # and must not be treated as "no name column".
        if name_column is not None:
            labels[name_column] = 'batch'
        data.columns = labels

        def parse(row):
            """ Parse smiles for row """
            try:
                return Mol.from_smiles(row.structure)
            except ValueError:
                msg = 'Molecule {} could not be decoded.'.format(row.name)
                if error_bad_mol:
                    raise ValueError(msg)
                elif warn_bad_mol:
                    warnings.warn(msg)
                return None

        # Coerce to text first so every entry is handed to the parser as str.
        data['structure'] = data['structure'].apply(str)
        data['structure'] = data.apply(parse, axis=1)

        if drop_bad_mol:
            data = data[data['structure'].notnull()]

        # set index if passed
        if name_column is not None:
            data = data.set_index(data.columns[name_column])

        # Move the structure column to the front, keeping the others in order.
        cols = data.columns.tolist()
        cols.remove('structure')
        data = data[['structure'] + cols]

        return squeeze(data, axis=1)
def write_smiles(data, smiles_path):
    """ Write a dataframe to a smiles file.

    The `structure` column is converted by calling `to_smiles()` on each
    entry and written first, followed by the index and then the remaining
    columns.  The file is tab separated, with no header line.

    Args:
        data (pd.Series or pd.DataFrame):
            The dataframe to write.  A series is treated as a single
            `structure` column.
        smiles_path (str):
            The path to write the dataframe to.
    """
    if isinstance(data, pd.Series):
        data = data.to_frame(name='structure')

    # Work on a copy so that the caller's object is left untouched.
    data = data.copy()
    data['structure'] = data['structure'].apply(lambda m: m.to_smiles())

    # Turn the index into an ordinary column so it is written as data.
    data = data.reset_index()

    # SMILES must be the first field on each line.
    cols = list(data.columns)
    cols.insert(0, cols.pop(cols.index('structure')))

    data.reindex(columns=cols).to_csv(smiles_path, sep='\t', header=False,
                                      index=False)
@classmethod
@wraps(read_smiles)
def _from_smiles_df(_, *args, **kwargs):
    # Classmethod adapter around read_smiles: the class argument is ignored
    # and every other argument is forwarded unchanged.  `wraps` copies the
    # metadata (name, docstring) of read_smiles onto this function.
    loaded = read_smiles(*args, **kwargs)
    return loaded
@classmethod
@wraps(read_smiles)
def _from_smiles_series(_, *args, **kwargs):
    # Classmethod adapter around read_smiles that returns only the
    # `structure` column.  The class argument is ignored and every other
    # argument is forwarded unchanged.
    loaded = read_smiles(*args, **kwargs)
    # read_smiles returns `squeeze(data, axis=1)`; presumably that is already
    # a Series when only the structure column remains (TODO confirm against
    # the squeeze helper).  In that case there is no `structure` column to
    # select, so hand the series back as is.
    if isinstance(loaded, pd.Series):
        return loaded
    return loaded.structure
@wraps(write_smiles)
def _to_smiles_df(self, *args, **kwargs):
    # Method adapter around write_smiles: the object the method is called on
    # is passed as the data to write, followed by the caller's arguments.
    outcome = write_smiles(self, *args, **kwargs)
    return outcome
# Attach the smiles readers and writer to the pandas types at import time.

# DataFrame: reader returning the full result of read_smiles, plus the writer.
pd.DataFrame.from_smiles = _from_smiles_df
pd.DataFrame.to_smiles = _to_smiles_df

# Series: reader returning only the structures, plus the same writer.
pd.Series.from_smiles = _from_smiles_series
pd.Series.to_smiles = _to_smiles_df