Source code for skchem.filters.simple

#! /usr/bin/env python
#
# Copyright (C) 2016 Rich Lewis <rl403@cam.ac.uk>
# License: 3-clause BSD

"""

# skchem.filters.simple

Simple filters for compounds.

"""

from collections import Counter

import numpy as np
import pandas as pd

from ..resource import ORGANIC, PERIODIC_TABLE
from .base import Filter


class ElementFilter(Filter):

    """ Filter by elements.

    Each molecule is transformed to a per-element atom count (or 0/1 flags
    when `as_bits` is set) for the configured elements; the base `Filter`
    then combines that row using `agg`.

    Examples:

        Basic usage on molecules:

        >>> import skchem
        >>> hal_f = skchem.filters.ElementFilter(['F', 'Cl', 'Br', 'I'])

        Molecules with one of the atoms transform to `True`.

        >>> m1 = skchem.Mol.from_smiles('ClC(Cl)Cl', name='chloroform')
        >>> hal_f.transform(m1)
        True

        Molecules with none of the atoms transform to `False`.

        >>> m2 = skchem.Mol.from_smiles('CC', name='ethane')
        >>> hal_f.transform(m2)
        False

        Can see the atom breakdown by passing `agg=False`:

        >>> hal_f.transform(m1, agg=False)
        has_element
        F     0
        Cl    3
        Br    0
        I     0
        Name: ElementFilter, dtype: int64

        Can transform series.

        >>> ms = [m1, m2]
        >>> hal_f.transform(ms)
        chloroform     True
        ethane        False
        dtype: bool

        >>> hal_f.transform(ms, agg=False)
        has_element  F  Cl  Br  I
        chloroform   0   3   0  0
        ethane       0   0   0  0

        Can also filter series:

        >>> hal_f.filter(ms)
        chloroform    <Mol: ClC(Cl)Cl>
        Name: structure, dtype: object

        >>> hal_f.filter(ms, neg=True)
        ethane    <Mol: CC>
        Name: structure, dtype: object
    """

    def __init__(self, elements=None, as_bits=False, agg='any', n_jobs=1,
                 verbose=True):

        """ Initialize an ElementFilter object.

        Args:
            elements (list[str]):
                The element symbols to count in each molecule.  If `None`,
                every symbol in `PERIODIC_TABLE` is used.
            as_bits (bool):
                If `True`, report 0/1 presence flags (`uint8`) instead of
                integer atom counts.
            agg (str or callable):
                How the per-element values are combined into the predicate;
                forwarded unchanged to `Filter.__init__`.
            n_jobs (int):
                How many processes to use (forwarded to `Filter`).
            verbose (bool):
                Whether to output a progress bar (forwarded to `Filter`).
        """

        self._elements = None
        # Goes through the property setter so that `None` is expanded.
        self.elements = elements
        self.as_bits = as_bits
        super(ElementFilter, self).__init__(agg=agg, n_jobs=n_jobs,
                                            verbose=verbose)

    @property
    def elements(self):
        """ list[str]: The element symbols this filter counts. """
        return self._elements

    @elements.setter
    def elements(self, val):
        if val is None:
            self._elements = PERIODIC_TABLE.symbol.tolist()
        else:
            self._elements = val

    @property
    def columns(self):
        """ pd.Index: One label per element, named `has_element`. """
        return pd.Index(self.elements, name='has_element')

    def _transform_mol(self, mol):
        """ Count the atoms of each configured element in `mol`.

        Returns:
            pd.Series: indexed by `self.elements`; integer counts, or
            `uint8` 0/1 flags when `self.as_bits` is set.
        """
        counter = Counter(atom.symbol for atom in mol.atoms)
        # `reindex` rather than `res[self.elements]`: list-indexing with
        # labels that are absent from the molecule raises KeyError in
        # current pandas, whereas reindex fills the gaps with zero counts
        # and drops symbols that were not asked for.
        res = pd.Series(counter, dtype=int).reindex(self.elements,
                                                    fill_value=0)
        if self.as_bits:
            res = (res > 0).astype(np.uint8)
        return res
class OrganicFilter(ElementFilter):

    """ Whether a molecule is organic.

    A molecule counts as organic here when all of its atoms are elements
    from the set H, B, C, N, O, F, P, S, Cl, Br, I.  Internally the filter
    looks for every *other* element and aggregates with `'not any'`.

    Examples:

        Basic usage as a function on molecules:

        >>> import skchem
        >>> of = skchem.filters.OrganicFilter()
        >>> benzene = skchem.Mol.from_smiles('c1ccccc1', name='benzene')

        >>> of.transform(benzene)
        True

        >>> ferrocene = skchem.Mol.from_smiles('[cH-]1cccc1.[cH-]1cccc1.[Fe+2]',
        ...                                    name='ferrocene')
        >>> of.transform(ferrocene)
        False

        More useful on collections:

        >>> sa = skchem.Mol.from_smiles('CC(=O)[O-].[Na+]', name='sodium acetate')
        >>> norbornane = skchem.Mol.from_smiles('C12CCC(C2)CC1', name='norbornane')

        >>> data = [benzene, ferrocene, norbornane, sa]
        >>> of.transform(data)
        benzene            True
        ferrocene         False
        norbornane         True
        sodium acetate    False
        dtype: bool

        >>> of.filter(data)
        benzene          <Mol: c1ccccc1>
        norbornane    <Mol: C1CC2CCC1C2>
        Name: structure, dtype: object

        >>> of.filter(data, neg=True)
        ferrocene         <Mol: [Fe+2].c1cc[cH-]c1.c1cc[cH-]c1>
        sodium acetate                  <Mol: CC(=O)[O-].[Na+]>
        Name: structure, dtype: object
    """

    def __init__(self, n_jobs=1, verbose=True):

        """ Initialize an OrganicFilter object.

        Args:
            n_jobs (int):
                The number of processes to run the filter in.
            verbose (bool):
                Whether to output a progress bar.
        """

        # elements=None makes the parent start from the full periodic table.
        super(OrganicFilter, self).__init__(elements=None, agg='not any',
                                            n_jobs=n_jobs, verbose=verbose)

        # Narrow the element list down to the symbols that are NOT organic.
        every_symbol = self.elements
        non_organic = [symbol for symbol in every_symbol
                       if symbol not in ORGANIC]
        self.elements = non_organic
def n_atoms(mol, above=2, below=75, include_hydrogens=False):

    """ Whether the number of atoms in a molecule falls in a defined interval.

    ``above <= n_atoms < below``

    Args:
        mol: (skchem.Mol):
            The molecule to be tested.
        above (int):
            The lower bound on the number of atoms (inclusive).
        below (int):
            The upper bound on the number of atoms (exclusive).
        include_hydrogens (bool):
            Whether to add each atom's implicit and explicit hydrogen
            counts to the atom count.

    Returns:
        bool:
            Whether the atom count lies in ``[above, below)``.

    Raises:
        AssertionError: If `above` is not strictly less than `below`.

    Examples:

        Basic usage as a function on molecules:

        >>> import skchem
        >>> m = skchem.Mol.from_smiles('c1ccccc1') # benzene has 6 atoms.

        Lower threshold:

        >>> skchem.filters.n_atoms(m, above=3)
        True
        >>> skchem.filters.n_atoms(m, above=8)
        False

        Higher threshold:

        >>> skchem.filters.n_atoms(m, below=8)
        True
        >>> skchem.filters.n_atoms(m, below=3)
        False

        Bounds work like Python slices - inclusive lower, exclusive upper:

        >>> skchem.filters.n_atoms(m, above=6)
        True
        >>> skchem.filters.n_atoms(m, below=6)
        False

        Both can be used at once:

        >>> skchem.filters.n_atoms(m, above=3, below=8)
        True

        Can include hydrogens:

        >>> skchem.filters.n_atoms(m, above=3, below=8, include_hydrogens=True)
        False
        >>> skchem.filters.n_atoms(m, above=9, below=14, include_hydrogens=True)
        True
    """

    assert above < below, 'Interval {} < a < {} undefined.'.format(above, below)

    count = len(mol.atoms)

    if include_hydrogens:
        # Hydrogens are tallied per atom on top of the atoms themselves.
        for atom in mol.atoms:
            count += atom.GetNumImplicitHs() + atom.GetNumExplicitHs()

    return above <= count < below
class AtomNumberFilter(Filter):

    """ Filter on whether a Mol's atom count falls in a defined interval.

    ``above <= n_atoms < below``

    Examples:

        >>> import skchem

        >>> data = [
        ...         skchem.Mol.from_smiles('CC', name='ethane'),
        ...         skchem.Mol.from_smiles('CCCC', name='butane'),
        ...         skchem.Mol.from_smiles('NC(C)C(=O)O', name='alanine'),
        ...         skchem.Mol.from_smiles('C12C=CC(C=C2)C=C1', name='barrelene')
        ... ]

        >>> af = skchem.filters.AtomNumberFilter(above=3, below=7)

        >>> af.transform(data)
        ethane       False
        butane        True
        alanine       True
        barrelene    False
        Name: num_atoms_in_range, dtype: bool

        >>> af.filter(data)
        butane            <Mol: CCCC>
        alanine    <Mol: CC(N)C(=O)O>
        Name: structure, dtype: object

        >>> af = skchem.filters.AtomNumberFilter(above=5, below=15,
        ...                                      include_hydrogens=True)

        >>> af.transform(data)
        ethane        True
        butane        True
        alanine       True
        barrelene    False
        Name: num_atoms_in_range, dtype: bool
    """

    def __init__(self, above=3, below=60, include_hydrogens=False, n_jobs=1,
                 verbose=True):

        """ Initialize an AtomNumberFilter object.

        Args:
            above (int):
                The lower bound on the number of atoms (inclusive).
            below (int):
                The upper bound on the number of atoms (exclusive).
            include_hydrogens (bool):
                Whether hydrogens are included in the atom count.
            n_jobs (int):
                The number of processes to run the filter in.
            verbose (bool):
                Whether to output a progress bar.

        Raises:
            AssertionError: If `above` is not strictly less than `below`.
        """

        assert above < below, 'Interval {} < a < {} undefined.'.format(above, below)

        self.include_hydrogens = include_hydrogens
        self.below = below
        self.above = above

        super(AtomNumberFilter, self).__init__(agg='any', n_jobs=n_jobs,
                                               verbose=verbose)

    def _transform_mol(self, mol):
        """ Apply `n_atoms` to `mol` using the stored bounds. """
        settings = dict(above=self.above,
                        below=self.below,
                        include_hydrogens=self.include_hydrogens)
        return n_atoms(mol, **settings)

    @property
    def columns(self):
        """ pd.Index: The single output label, `num_atoms_in_range`. """
        labels = ['num_atoms_in_range']
        return pd.Index(labels)
def mass(mol, above=10, below=900):

    """ Whether the molecular weight of a molecule falls in a defined interval.

    ``above <= mass < below``

    Args:
        mol: (skchem.Mol):
            The molecule to be tested.
        above (float):
            The lower bound on the mass (inclusive). Defaults to 10.
        below (float):
            The upper bound on the mass (exclusive). Defaults to 900.

    Returns:
        bool:
            Whether `mol.mass` lies in ``[above, below)``.

    Examples:

        Basic usage as a function on molecules:

        >>> import skchem
        >>> m = skchem.Mol.from_smiles('c1ccccc1') # benzene has M_r = 78.
        >>> skchem.filters.mass(m, above=70)
        True
        >>> skchem.filters.mass(m, above=80)
        False
        >>> skchem.filters.mass(m, below=80)
        True
        >>> skchem.filters.mass(m, below=70)
        False
        >>> skchem.filters.mass(m, above=70, below=80)
        True
    """

    # Read the mass once, then test it against the half-open interval.
    weight = mol.mass
    return above <= weight < below
class MassFilter(Filter):

    """ Filter on whether the molecular weight of a molecule is inside a range.

    ``above <= mass < below``

    Examples:

        >>> import skchem

        >>> data = [
        ...         skchem.Mol.from_smiles('CC', name='ethane'),
        ...         skchem.Mol.from_smiles('CCCC', name='butane'),
        ...         skchem.Mol.from_smiles('NC(C)C(=O)O', name='alanine'),
        ...         skchem.Mol.from_smiles('C12C=CC(C=C2)C=C1', name='barrelene')
        ... ]

        >>> mf = skchem.filters.MassFilter(above=31, below=100)

        >>> mf.transform(data)
        ethane       False
        butane        True
        alanine       True
        barrelene    False
        Name: mass_in_range, dtype: bool

        >>> mf.filter(data)
        butane            <Mol: CCCC>
        alanine    <Mol: CC(N)C(=O)O>
        Name: structure, dtype: object
    """

    def __init__(self, above=3, below=900, n_jobs=1, verbose=True):

        """ Initialize a MassFilter object.

        Args:
            above (float):
                The lower bound on the mass (inclusive).
            below (float):
                The upper bound on the mass (exclusive).
            n_jobs (int):
                The number of processes to run the filter in.
            verbose (bool):
                Whether to output a progress bar.

        Raises:
            AssertionError: If `above` is not strictly less than `below`.
        """

        assert above < below, 'Interval {} < a < {} undefined.'.format(above, below)

        self.below = below
        self.above = above

        super(MassFilter, self).__init__(agg='any', n_jobs=n_jobs,
                                         verbose=verbose)

    def _transform_mol(self, mol):
        """ Apply `mass` to `mol` using the stored bounds. """
        bounds = dict(above=self.above, below=self.below)
        return mass(mol, **bounds)

    @property
    def columns(self):
        """ pd.Index: The single output label, `mass_in_range`. """
        labels = ['mass_in_range']
        return pd.Index(labels)