Source code for skchem.base

#! /usr/bin/env python
#
# Copyright (C) 2016 Rich Lewis <rl403@cam.ac.uk>
# License: 3-clause BSD


"""
# skchem.base

Base classes for scikit-chem objects.
"""
import subprocess
from abc import ABCMeta, abstractmethod
from tempfile import NamedTemporaryFile
import time
import logging

import pandas as pd

from .utils import NamedProgressBar
from . import core
from .utils import iterable_to_series, optional_second_method, nanarray, squeeze
from . import io

LOGGER = logging.getLogger(__name__)


[docs]class BaseTransformer(object): """ Transformer Base Class. Specific Base Transformer classes inherit from this class and implement `transform` and `axis_names`. """ __metaclass__ = ABCMeta # To share some functionality betweeen Transformer and AtomTransformer def __init__(self, verbose=True): self.verbose = verbose
[docs] def optional_bar(self, **kwargs): if self.verbose: bar = NamedProgressBar(name=self.__class__.__name__, **kwargs) else: def bar(x): return x return bar
@property @abstractmethod def axes_names(self): """ tuple: The names of the axes. """ pass @abstractmethod
[docs] def transform(self, mols): """ Transform objects according to the objects transform protocol. Args: mols (skchem.Mol or pd.Series or iterable): The mol objects to transform. Returns: pd.Series or pd.DataFrame """ pass
[docs]class Transformer(BaseTransformer): """ Molecular based Transformer Base class. Concrete Transformers inherit from this class and must implement `_transform_mol` and `_columns`. See Also: AtomTransformer.""" @property @abstractmethod def columns(self): """ pd.Index: The column index to use. """ return pd.Index(None) @abstractmethod def _transform_mol(self, mol): """ Transform a molecule. """ pass def _transform_series(self, ser): """ Transform a series of molecules to an np.ndarray. """ bar = self.optional_bar() return [self._transform_mol(mol) for mol in bar(ser)] @optional_second_method
[docs] def transform(self, mols, **kwargs): """ Transform objects according to the objects transform protocol. Args: mols (skchem.Mol or pd.Series or iterable): The mol objects to transform. Returns: pd.Series or pd.DataFrame """ if isinstance(mols, core.Mol): # just squeeze works on series return pd.Series(self._transform_mol(mols), index=self.columns, name=self.__class__.__name__).squeeze() elif not isinstance(mols, pd.Series): mols = iterable_to_series(mols) res = pd.DataFrame(self._transform_series(mols), index=mols.index, columns=self.columns) return squeeze(res, axis=1)
@property def axes_names(self): """ tuple: The names of the axes. """ return 'batch', self.columns.name
[docs]class BatchTransformer(BaseTransformer): """ Transformer Mixin in which transforms on multiple molecules save overhead. Implement `_transform_series` with the transformation rather than `_transform_mol`. Must occur before `Transformer` or `AtomTransformer` in method resolution order. See Also: Transformer, AtomTransformer. """ def _transform_mol(self, mol): """ Transform a molecule. """ v = self.verbose self.verbose = False res = self.transform([mol]).iloc[0] self.verbose = v return res @abstractmethod def _transform_series(self, ser): """ Transform a series of molecules to an np.ndarray. """ pass
[docs]class AtomTransformer(BaseTransformer): """ Transformer that will produce a Panel. Concrete classes inheriting from this should implement `_transform_atom`, `_transform_mol` and `minor_axis`. See Also: Transformer """ def __init__(self, max_atoms=100, **kwargs): self.max_atoms = max_atoms self.major_axis = pd.RangeIndex(self.max_atoms, name='atom_idx') super(AtomTransformer, self).__init__(**kwargs) @property @abstractmethod def minor_axis(self): """ pd.Index: Minor axis of transformed values. """ return pd.Index(None) # expects a length @property def axes_names(self): """ tuple: The names of the axes. """ return 'batch', 'atom_idx', self.minor_axis.name @optional_second_method
[docs] def transform(self, mols): """ Transform objects according to the objects transform protocol. Args: mols (skchem.Mol or pd.Series or iterable): The mol objects to transform. Returns: pd.Series or pd.DataFrame """ if isinstance(mols, core.Atom): # just squeeze works on series return pd.Series(self._transform_atom(mols), index=self.minor_axis).squeeze() elif isinstance(mols, core.Mol): res = pd.DataFrame(self._transform_mol(mols), index=self.major_axis[:len(mols.atoms)], columns=self.minor_axis) return squeeze(res, axis=1) elif not isinstance(mols, pd.Series): mols = iterable_to_series(mols) res = pd.Panel(self._transform_series(mols), items=mols.index, major_axis=self.major_axis, minor_axis=self.minor_axis) return squeeze(res, axis=(1, 2))
@abstractmethod def _transform_atom(self, atom): """ Transform an atom to a 1D array of length `len(self.columns)`. """ pass def _transform_mol(self, mol): """ Transform a Mol to a 2D array. """ res = nanarray((len(mol.atoms), len(self.minor_axis))) for i, atom in enumerate(mol.atoms): res[i] = self._transform_atom(atom) return res def _transform_series(self, ser): """ Transform a Series<Mol> to a 3D array. """ if self.verbose: bar = NamedProgressBar(name=self.__class__.__name__) else: # use identity. def bar(obj): return obj res = nanarray((len(ser), self.max_atoms, len(self.minor_axis))) for i, mol in enumerate(bar(ser)): res[i, :len(mol.atoms), :len(self.minor_axis)] = self._transform_mol(mol) return res
[docs]class External(object): """ Mixin for wrappers of external CLI tools. Concrete classes must implement `validate_install`.""" __metaclass__ = ABCMeta install_hint = "" # give an explanation of how to install external tool here. def __init__(self, **kwargs): assert self.validated, 'External tool not installed. ' + self.install_hint super(External, self).__init__(**kwargs) @property def validated(self): """ bool: whether the external tool is installed and active. """ if not hasattr(self.__class__, '_validated'): self.__class__._validated = self.validate_install() return self.__class__._validated @staticmethod @abstractmethod
[docs] def validate_install(): """ Determine if the external tool is available. """ pass
[docs]class CLIWrapper(External, BaseTransformer): """ CLI wrapper. Concrete classes inheriting from this must implement `_cli_args`, `monitor_progress`, `_parse_outfile`, `_parse_errors`.""" def __init__(self, error_on_fail=False, warn_on_fail=True, **kwargs): super(CLIWrapper, self).__init__(**kwargs) self.error_on_fail = error_on_fail self.warn_on_fail = warn_on_fail def _transform_series(self, ser): """ Transform a series. """ with NamedTemporaryFile(suffix='.sdf') as infile, NamedTemporaryFile() as outfile: io.write_sdf(ser, infile.name) args = self._cli_args(infile.name, outfile.name) p = subprocess.Popen(args, stderr=subprocess.PIPE) if self.verbose: bar = self.optional_bar(max_value=len(ser)) while p.poll() is None: time.sleep(0.5) bar.update(self.monitor_progress(outfile.name)) bar.finish() p.wait() res = self._parse_outfile(outfile.name) errs = p.stderr.read().decode() errs = self._parse_errors(errs) # set the index of results to that of the input, with the failed indices removed if isinstance(res, (pd.Series, pd.DataFrame)): res.index = ser.index.delete(errs) elif isinstance(res, pd.Panel): res.items = ser.index.delete(errs) else: raise ValueError('Parsed datatype ({}) not supported.'.format(type(res))) # go through the errors and put them back in (transform doesn't lose instances) if len(errs): for err in errs: err = ser.index[err] if self.error_on_fail: raise ValueError('Failed to transform {}.'.format(err)) if self.warn_on_fail: LOGGER.warn('Failed to transform %s', err) res.ix[err] = None return res.loc[ser.index].values @abstractmethod def _cli_args(self, infile, outfile): """ list: The cli arguments. """ return [] @abstractmethod
[docs] def monitor_progress(self, filename): """ Report the progress. """ pass
@abstractmethod def _parse_outfile(self, outfile): """ Parse the file written and return a series. """ pass @abstractmethod def _parse_errors(self, errs): """ Parse stderr and return error indices. """ pass
[docs]class Featurizer(object): """ Base class for m -> data transforms, such as Fingerprinting etc. Concrete subclasses should implement `name`, returning a string uniquely identifying the featurizer. """ __metaclass__ = ABCMeta