#! /usr/bin/env python
#
# Copyright (C) 2015-2016 Rich Lewis <rl403@cam.ac.uk>
# License: 3-clause BSD
"""
# skchem.io.sdf
Defining input and output operations for sdf files.
"""
from functools import wraps
import warnings
from rdkit import Chem
import pandas as pd
from ..core import Mol
from ..utils import Suppressor, squeeze
def _drop_props(row):
    """Clear every property stored on the molecule of a row.

    Args:
        row:
            An object exposing the molecule as its `structure` attribute
            (e.g. a row handed over by `DataFrame.apply(..., axis=1)`).
    """
    mol = row.structure
    # Snapshot the keys before clearing: if `props.keys()` is a live view of
    # the molecule's properties, removing entries while iterating over it
    # would change the collection mid-iteration.
    for prop in list(mol.props.keys()):
        mol.ClearProp(prop)
def _set_props(row, cols):
    """Store the values of the given columns of a row as molecule props.

    Args:
        row:
            An object exposing the molecule as `structure` and the column
            values through item access.
        cols (iterable):
            The column labels to copy onto the molecule.
    """
    for label in cols:
        # both the key and the value are stringified before being stored
        key, value = str(label), str(row[label])
        row.structure.SetProp(key, value)
def _set_name(row):
    """Name the molecule of a row after the row's own `name`.

    Args:
        row:
            An object exposing the molecule as `structure` and a label as
            `name`.
    """
    # rdkit props can only be strs
    label = str(row.name)
    row.structure.name = label
def read_sdf(sdf, error_bad_mol=False, warn_bad_mol=True, nmols=None,
             skipmols=None, skipfooter=None, read_props=True, mol_props=False,
             *args, **kwargs):
    """Read an sdf file into a `pd.DataFrame`.

    The function wraps the RDKit `ForwardSDMolSupplier` object.

    Args:
        sdf (str or file-like):
            The location of data to load as a file path, or a file-like
            object.  A path is opened (in binary mode) and closed by this
            function; a file-like object is left open for the caller.
        error_bad_mol (bool):
            Whether an error should be raised if a molecule fails to parse.
            Default is False.
        warn_bad_mol (bool):
            Whether a warning should be output if a molecule fails to parse.
            Only consulted when `error_bad_mol` is False.
            Default is True.
        nmols (int):
            The number of molecules to read. If `None`, read all molecules.
            Default is `None`.
        skipmols (int):
            The number of molecules to skip at start.
            Default is `None` (skip none).
        skipfooter (int):
            The number of molecules to skip from the end.
            Default is `None` (skip none).
        read_props (bool):
            Whether to read the properties into the data frame.
            Default is `True`.
        mol_props (bool):
            Whether to keep properties in the molecule dictionary after they
            are extracted to the DataFrame.
            Default is `False`.
        args, kwargs:
            Arguments will be passed to RDKit ForwardSDMolSupplier.

    Returns:
        pandas.DataFrame:
            The loaded data frame, with Mols supplied in the `structure`
            field and indexed by molecule name (index name `batch`).  The
            frame is passed through `squeeze(data, axis=1)` before being
            returned.

    Raises:
        ValueError:
            If a molecule fails to parse and `error_bad_mol` is `True`.

    See also:
        rdkit.Chem.ForwardSDMolSupplier
        skchem.read_smiles
    """
    # nmols is actually the index to cut off at. If we skip some at start, we
    # need to add this number.  Only do so when a limit was actually given:
    # adding to the default `None` would raise a TypeError.
    if skipmols and nmols is not None:
        nmols += skipmols

    # only a handle opened here is ours to close
    opened_here = isinstance(sdf, str)
    if opened_here:
        sdf = open(sdf, 'rb')  # use read bytes for python 3 compatibility

    mol_supp = None
    try:
        # use the suppression context manager to not pollute our stdout with
        # rdkit errors and warnings.
        # perhaps this should be captured better by Mol etc.
        with Suppressor():
            mol_supp = Chem.ForwardSDMolSupplier(sdf, *args, **kwargs)
            mols = []

            # single loop through sdf
            for i, mol in enumerate(mol_supp):
                if skipmols and i < skipmols:
                    continue
                if nmols and i >= nmols:
                    break
                if mol is None:
                    msg = 'Molecule {} could not be decoded.'.format(i + 1)
                    if error_bad_mol:
                        raise ValueError(msg)
                    elif warn_bad_mol:
                        warnings.warn(msg)
                    continue
                mols.append(Mol(mol))

            if skipfooter:
                mols = mols[:-skipfooter]
    finally:
        # Release the supplier before closing the handle it wraps, so that
        # nothing is left able to touch the file once it has been closed.
        mol_supp = None
        if opened_here:
            sdf.close()

    idx = pd.Index([m.name for m in mols], name='batch')
    data = pd.DataFrame(mols, columns=['structure'])

    if read_props:
        props = pd.DataFrame([dict(mol.props.items()) for mol in mols])
        data = pd.concat([data, props], axis=1)

    # now we have extracted the props, we can delete if required
    if not mol_props:
        data.apply(_drop_props, axis=1)

    data.index = idx
    return squeeze(data, axis=1)
def write_sdf(data, sdf, write_cols=True, index_as_name=True, mol_props=False,
              *args, **kwargs):
    """Write an sdf file from a dataframe.

    Args:
        data (pandas.Series or pandas.DataFrame):
            Pandas data structure with a `structure` column containing
            compounds to serialize.  A Series is treated as the `structure`
            column itself.
        sdf (str or file-like):
            A file path or file-like object specifying where to write the
            compound data.
        write_cols (bool):
            Whether columns should be written as props. Default `True`.
        index_as_name (bool):
            Whether to use index as the header, or the molecule's name.
            Default is `True`.
        mol_props (bool):
            Whether to write properties in the Mol dictionary in addition to
            fields in the frame. Default is `False`.
        args, kwargs:
            Arguments will be passed to RDKit SDWriter.

    Warn:
        This function mutates the molecules it is given: it will delete all
        properties in the molecule dictionary if `mol_props` is `False`, and
        will set the column values as properties if `write_cols` is `True`.
        If `index_as_name` is `True` the compounds are renamed while they are
        written; the names they had on entry are put back afterwards.
    """
    if isinstance(data, pd.Series):
        data = data.to_frame(name='structure')

    # remember the names on entry so they can be put back after writing
    names = [m.name for m in data.structure]
    cols = list(data.columns.drop('structure'))

    writer = Chem.SDWriter(sdf, *args, **kwargs)
    try:
        if not mol_props:
            data.apply(_drop_props, axis=1)
        if write_cols:
            data.apply(_set_props, cols=cols, axis=1)
        if index_as_name:
            data.apply(_set_name, axis=1)

        for mol in data.structure:
            writer.write(mol)
    finally:
        # rdkit writer changes names sometimes; restore them even if writing
        # failed part way through
        for mol, name in zip(data.structure, names):
            mol.name = name
        # flush what has been written and release the writer, rather than
        # leaving that to garbage collection
        writer.close()
@wraps(write_sdf)
def _to_sdf_series(self, *args, **kwargs):
    # Method form of `write_sdf` for a Series.  A Series only supplies the
    # structures, so writing columns as props is always switched off.
    # (`wraps` copies the docstring of `write_sdf` onto this wrapper.)
    return write_sdf(self, *args, write_cols=False, **kwargs)
@wraps(write_sdf)
def _to_sdf_df(self, *args, **kwargs):
    # Method form of `write_sdf`: the frame itself is the data to write and
    # every other argument is forwarded untouched.
    # (`wraps` copies the docstring of `write_sdf` onto this wrapper.)
    outcome = write_sdf(self, *args, **kwargs)
    return outcome
# Attach the writers to the pandas containers as `to_sdf` methods.
pd.DataFrame.to_sdf = _to_sdf_df
pd.Series.to_sdf = _to_sdf_series
@classmethod
@wraps(read_sdf)
def _from_sdf_df(_, *args, **kwargs):
    # The class argument is ignored: whatever `read_sdf` builds is returned.
    # (`wraps` copies the docstring of `read_sdf` onto this wrapper.)
    loaded = read_sdf(*args, **kwargs)
    return loaded


# make the reader available as `pd.DataFrame.from_sdf`
pd.DataFrame.from_sdf = _from_sdf_df
@classmethod
@wraps(read_sdf)
def _from_sdf_series(_, *args, **kwargs):
    # The class argument is ignored; all other arguments go to `read_sdf`.
    # (`wraps` copies the docstring of `read_sdf` onto this wrapper.)
    loaded = read_sdf(*args, **kwargs)
    # `read_sdf` returns `squeeze(data, axis=1)`.  If that has already
    # collapsed the frame to a Series there is no `structure` column left to
    # select, so the result is handed back as it is.
    # NOTE(review): assumes such a Series is the structure column - confirm
    # against `squeeze`.
    if isinstance(loaded, pd.DataFrame):
        return loaded.structure
    return loaded


# make the reader available as `pd.Series.from_sdf`
pd.Series.from_sdf = _from_sdf_series