# Source code for skchem.io.sdf

#! /usr/bin/env python
#
# Copyright (C) 2015-2016 Rich Lewis <rl403@cam.ac.uk>
# License: 3-clause BSD

"""
# skchem.io.sdf

Defining input and output operations for sdf files.
"""

from functools import wraps
import warnings

from rdkit import Chem
import pandas as pd

from ..core import Mol
from ..utils import Suppressor, squeeze


def _drop_props(row):
    """Clear every property held by the molecule in ``row.structure``.

    Args:
        row: An object exposing the molecule as ``row.structure``; the
            molecule's property names are taken from ``props.keys()`` and
            each one is removed with ``ClearProp``.
    """
    mol = row.structure
    for key in mol.props.keys():
        mol.ClearProp(key)


def _set_props(row, cols):
    """Store the values of ``cols`` as properties on ``row.structure``.

    Args:
        row: An object exposing the molecule as ``row.structure`` and the
            values to store through ``row[col]``.
        cols: Iterable of the keys to copy.  Both the key and its value are
            converted with ``str`` before being handed to ``SetProp``.
    """
    for col in cols:
        store = row.structure.SetProp
        store(str(col), str(row[col]))


def _set_name(row):
    """Name the molecule in ``row.structure`` after ``row.name``.

    Args:
        row: An object exposing the molecule as ``row.structure`` and a
            ``name`` attribute whose string form becomes the molecule name.
    """
    label = str(row.name)  # rdkit props can only be strs
    row.structure.name = label


def read_sdf(sdf, error_bad_mol=False, warn_bad_mol=True, nmols=None,
             skipmols=None, skipfooter=None, read_props=True, mol_props=False,
             *args, **kwargs):
    """Read an sdf file into a `pd.DataFrame`.

    The function wraps the RDKit `ForwardSDMolSupplier` object.

    Args:
        sdf (str or file-like):
            The location of data to load as a file path, or a file-like
            object.  A path is opened (and closed again) by this function; a
            file-like object is left open for the caller to manage.
        error_bad_mol (bool):
            Whether an error should be raised if a molecule fails to parse.
            Default is False.
        warn_bad_mol (bool):
            Whether a warning should be output if a molecule fails to parse.
            Only consulted when `error_bad_mol` is False.  Default is True.
        nmols (int):
            The number of records to read after the skipped ones.  Records
            that fail to parse still count towards this number.  If `None`,
            read all molecules.  Default is `None`.
        skipmols (int):
            The number of records to skip at start.  Default is `None`
            (skip nothing).
        skipfooter (int):
            The number of successfully parsed molecules to drop from the
            end.  Default is `None` (drop nothing).
        read_props (bool):
            Whether to read the properties into the data frame.
            Default is `True`.
        mol_props (bool):
            Whether to keep properties in the molecule dictionary after they
            are extracted to the DataFrame.  Default is `False`.
        args, kwargs:
            Arguments will be passed to RDKit ForwardSDMolSupplier.

    Returns:
        pandas.DataFrame:
            The loaded data frame, with Mols supplied in the `structure`
            field and indexed by molecule name (index name `batch`).  The
            frame is passed through `squeeze(..., axis=1)` before being
            returned, so presumably a result holding only the `structure`
            column comes back as a `pandas.Series` -- confirm against
            `skchem.utils.squeeze`.

    Raises:
        ValueError: If a molecule fails to parse and `error_bad_mol` is True.

    See also:
        rdkit.Chem.ForwardSDMolSupplier
        skchem.read_smiles
    """

    # nmols is actually the index to cutoff. If we skip some at start, we need
    # to add this number.  Only do so when a limit was actually given:
    # `None + int` would raise a TypeError.
    stop = nmols
    if skipmols and nmols is not None:
        stop = nmols + skipmols

    # Only a handle opened here is closed here.
    owns_file = isinstance(sdf, str)
    if owns_file:
        sdf = open(sdf, 'rb')  # use read bytes for python 3 compatibility

    mols = []
    try:
        # use the suppression context manager to not pollute our stdout with
        # rdkit errors and warnings.
        # perhaps this should be captured better by Mol etc.
        with Suppressor():
            mol_supp = Chem.ForwardSDMolSupplier(sdf, *args, **kwargs)
            try:
                # single loop through sdf
                for i, mol in enumerate(mol_supp):
                    if skipmols and i < skipmols:
                        continue
                    if stop and i >= stop:
                        break
                    if mol is None:
                        msg = 'Molecule {} could not be decoded.'.format(i + 1)
                        if error_bad_mol:
                            raise ValueError(msg)
                        elif warn_bad_mol:
                            warnings.warn(msg)
                        continue
                    mols.append(Mol(mol))
            finally:
                # Drop our reference to the supplier before the handle is
                # closed, so it is not left holding a closed file.
                del mol_supp
    finally:
        if owns_file:
            sdf.close()

    if skipfooter:
        mols = mols[:-skipfooter]

    idx = pd.Index([m.name for m in mols], name='batch')
    data = pd.DataFrame(mols, columns=['structure'])

    if read_props:
        props = pd.DataFrame([dict(mol.props.items()) for mol in mols])
        data = pd.concat([data, props], axis=1)

    # now we have extracted the props, we can delete if required
    if not mol_props:
        data.apply(_drop_props, axis=1)

    data.index = idx
    return squeeze(data, axis=1)
def write_sdf(data, sdf, write_cols=True, index_as_name=True, mol_props=False,
              *args, **kwargs):
    """ Write an sdf file from a dataframe.

    Args:
        data (pandas.Series or pandas.DataFrame):
            Pandas data structure with a `structure` column containing
            compounds to serialize.  A Series is treated as that column.
        sdf (str or file-like):
            A file path or file-like object specifying where to write the
            compound data.
        write_cols (bool):
            Whether columns should be written as props. Default `True`.
        index_as_name (bool):
            Whether to use index as the header, or the molecule's name.
            Default is `True`.
        mol_props (bool):
            Whether to write properties in the Mol dictionary in addition to
            fields in the frame.  Default is `False`.
        args, kwargs:
            Arguments will be passed to RDKit SDWriter.

    Warn:
        This function modifies the molecules it is given: all properties in
        the molecule dictionary are deleted if `mol_props` is `False`, and
        the columns are stored on the molecules as properties if
        `write_cols` is `True`.  Molecule names are only replaced while
        writing; the names the molecules had on entry are put back before
        the function returns, also when writing fails.
    """
    if isinstance(data, pd.Series):
        data = data.to_frame(name='structure')

    # Remember the names on entry so they can be put back afterwards.
    names = [m.name for m in data.structure]

    writer = Chem.SDWriter(sdf, *args, **kwargs)
    try:
        cols = list(data.columns.drop('structure'))

        if not mol_props:
            data.apply(_drop_props, axis=1)

        if write_cols:
            data.apply(_set_props, cols=cols, axis=1)

        if index_as_name:
            data.apply(_set_name, axis=1)

        data.structure.apply(writer.write)
    finally:
        try:
            # Flush the output now instead of relying on the writer being
            # garbage collected.
            writer.close()
        finally:
            # rdkit writer changes names sometimes
            for mol, name in zip(data.structure, names):
                mol.name = name
# Expose the sdf reader and writer as methods on the pandas containers.


@wraps(write_sdf)
def _to_sdf_series(self, *args, **kwargs):
    # Default to not writing columns as props, but let an explicit
    # `write_cols` keyword through instead of failing on a duplicate keyword.
    kwargs.setdefault('write_cols', False)
    return write_sdf(self, *args, **kwargs)


@wraps(write_sdf)
def _to_sdf_df(self, *args, **kwargs):
    return write_sdf(self, *args, **kwargs)


pd.Series.to_sdf = _to_sdf_series
pd.DataFrame.to_sdf = _to_sdf_df


@classmethod
@wraps(read_sdf)
def _from_sdf_df(_, *args, **kwargs):
    return read_sdf(*args, **kwargs)


pd.DataFrame.from_sdf = _from_sdf_df


@classmethod
@wraps(read_sdf)
def _from_sdf_series(_, *args, **kwargs):
    result = read_sdf(*args, **kwargs)
    # read_sdf returns `squeeze(data, axis=1)`; presumably that is already a
    # Series when only the structure column is present (TODO confirm against
    # skchem.utils.squeeze).  In that case there is no `structure` column
    # left to select.
    if isinstance(result, pd.Series):
        return result
    return result.structure


pd.Series.from_sdf = _from_sdf_series