#! /usr/bin/env python
#
# Copyright (C) 2016 Rich Lewis <rl403@cam.ac.uk>
# License: 3-clause BSD
"""
## skchem.descriptors.atom
Module specifying atom based descriptor generators.
"""
import logging
import subprocess
import re
from abc import ABCMeta
import pandas as pd
import numpy as np
from ..utils import line_count, nanarray
from ..base import CLIWrapper, Transformer, AtomTransformer, BatchTransformer, Featurizer
LOGGER = logging.getLogger(__file__)
# TODO: fix averagemicrospeciescharge
# TODO: fix logd logp logs
# TODO: oen (orbital electronegativity) - sigma + pi
# TODO: water accessible surface area
# TODO: these don't produce csv
# ['doublebondstereoisomers', 'conformers', 'stereoisomers', 'moleculardynamics', 'stereoanalysis',
# 'lowestenergyconformer', 'msdistr2', 'conformations', 'dominanttautomerdistribution', 'hnmr', 'moldyn', 'cnmr',
# 'frameworks', 'microspeciesdistribution', 'nmr', 'leconformer', 'msdistr', 'tetrahedralstereoisomers']
#
CHEMAXON_HINT = """ Install ChemAxon from https://www.chemaxon.com. It requires a license, which can be freely obtained
for academics. """
[docs]class ChemAxonBaseFeaturizer(CLIWrapper, Featurizer):
__metaclass__ = ABCMeta
install_hint = CHEMAXON_HINT
_feat_columns = {'averagepol': ['a_avg'], 'name': ['preferred_iupac_name'],
'aromaticbondcount': ['aromatic_bond_count'],
'maximalprojectionradius': ['maximal_projection_radius'],
'tpolarizability': ['a_avg', 'a_xx', 'a_yy', 'a_zz'], 'distance': ['distance'],
'acceptor': ['acceptor_count', 'acceptor_site_count'], 'fusedringcount': ['fused_ring_count'],
'charge': ['total_charge'], 'donor': ['donor_count', 'donor_site_count'], 'ringcount': ['ring_count'],
'chainbond': ['chain_bond'], 'mmff94energy': ['mmff94_energy'],
'huckel': ['aromatic_e+/nu-_order', 'localization_energy_l_+/l-', 'pi_energy',
'electron_density', 'charge_density'], 'chainatom': ['chain_atom'],
'shortestpath': ['shortest_path'], 'resonantcount': ['resonant_count'],
'tpol': ['a_avg', 'a_xx', 'a_yy', 'a_zz'], 'moststabletautomer': ['most_stable_tautomer'],
'generictautomer': ['generictautomer'], 'hmoelectrophilicityorder': ['hmo_aromatic_e+_order'],
'ringsystemcountofsize': ['ring_system_count_of_size'],
'largestatomringsize': ['largest_ring_size_of_atom'],
'tetrahedralstereoisomercount': ['tetrahedral_stereoisomer_count'], 'enumerations': ['enumerations'],
'ringatom': ['ring_atom'], 'connected': ['connected'],
'hmolocalizationenergy': ['hmo_localization_energy_l+/l-'],
'averagemolecularpolarizability': ['a_avg'], 'donorsitecount': ['donor_site_count'],
'donorcount': ['donor_count'], 'asymmetricatom': ['asymmetric_atom'], 'pienergy': ['pi_energy'],
'bondcount': ['bond_count'], 'chiralcenters': ['chiral_centers'],
'hmohuckel': ['hmo_aromatic_e+/nu-_order', 'hmo_localization_energy_l+/l-', 'hmo_pi_energy',
'hmo_electron_density', 'hmo_charge_density'], 'huckeleigenvector': ['eigenvector'],
'ringcountofsize': ['ring_count_of_size'],
'heteroaliphaticringcount': ['heteroaliphatic_ring_count'], 'markushenumerations': ['enumerations'],
'minimalprojectionradius': ['minimal_projection_radius'], 'dipole': ['dipole'],
'balabanindex': ['balaban_index'], 'aromaticnucleophilicityorder': ['aromatic_nu-_order'],
'tautomercount': ['tautomer_count'], 'cyclomaticnumber': ['cyclomatic_number'],
'psa': ['polar_surface_area'], 'isoelectricpoint': ['pi'], 'hmopienergy': ['hmo_pi_energy'],
'ayypol': ['a_yy'], 'fragmentcount': ['fragment_count'], 'acceptormultiplicity': ['acceptor_multiplicity'],
'topologyanalysistable': ['atom_count', 'aliphatic_atom_count', 'aromatic_atom_count',
'bond_count', 'aliphatic_bond_count', 'aromatic_bond_count',
'rotatable_bond_count', 'ring_count', 'aliphatic_ring_count',
'aromatic_ring_count', 'hetero_ring_count', 'heteroaliphatic_ring_count',
'heteroaromatic_ring_count', 'ring_atom_count', 'ring_bond_count',
'chain_atom_count', 'chain_bond_count', 'smallest_ring_size',
'largest_ring_size'], 'ioncharge': ['charge'],
'asymmetricatoms': ['asymmetric_atoms'],
'wateraccessiblesurfacearea': ['asa', 'asa+', 'asa-', 'asa_h', 'asa_p'], 'avgpol': ['a_avg'],
'carboaliphaticringcount': ['carboaliphatic_ring_count'],
'aliphaticringcount': ['aliphatic_ring_count'], 'donormultiplicity': ['donor_multiplicity'],
'minimalprojectionarea': ['minimal_projection_area'],
'nucleophiliclocalizationenergy': ['localization_energy_l-'], 'dihedral': ['dihedral'],
'heteroringcount': ['hetero_ring_count'], 'azzpol': ['a_zz'],
'molecularsurfacearea': ['van_der_waals_surface_area_3d'],
'hmonucleophiliclocalizationenergy': ['hmo_localization_energy_l-'],
'chargedistribution': ['charge_distribution'], 'pol': ['a_mol', 'a_atom'],
'hmoelectrondensity': ['hmo_electron_density'], 'carboaromaticringcount': ['carboaromatic_ring_count'],
'acceptorsitecount': ['acceptor_site_count'], 'markushenumerationcount': ['markush_library_size'],
'localizationenergy': ['localization_energy_l+/l-'], 'hararyindex': ['harary_index'],
'asa': ['asa', 'asa+', 'asa-', 'asa_h', 'asa_p'], 'acc': ['acc'], 'majortautomer': ['major_tautomer'],
'majormicrospecies': ['major-ms'], 'aliphaticatomcount': ['aliphatic_atom_count'],
'angle': ['angle'], 'huckeleigenvalue': ['eigenvalue'], 'axxpol': ['a_xx'],
'chiralcenter': ['chiral_center'], 'aliphaticbondcount': ['aliphatic_bond_count'],
'smallestatomringsize': ['smallest_ring_size_of_atom'], 'dreidingenergy': ['dreiding_energy'],
'maximalprojectionsize': ['length_perpendicular_to_the_max_area'],
'largestringsystemsize': ['largest_ring_system_size'], 'accsitecount': ['acceptor_site_count'],
'refractivity': ['refractivity'], 'bondtype': ['bond_type'], 'chargedensity': ['charge_density'],
'resonants': ['resonants'], 'aromaticatomcount': ['aromatic_atom_count'],
'distancedegree': ['distance_degree'], 'hasvalidconformer': ['has_valid_conformer'],
'electrondensity': ['electron_density'], 'asymmetricatomcount': ['asymmetric_atom_count'],
'fsp3': ['fsp3'], 'don': ['don'], 'fusedaliphaticringcount': ['fused_aliphatic_ring_count'],
'pkat': ['pkat'], 'fusedaromaticringcount': ['fused_aromatic_ring_count'],
'majorms2': ['majorms2'], 'maximalprojectionarea': ['maximal_projection_area'],
'hbonddonoracceptor': ['acceptor_count', 'donor_count', 'acceptor_site_count', 'donor_site_count'],
'acceptorcount': ['acceptor_count'], 'molecularpolarizability': ['a_mol'],
'huckeltable': ['aromatic_e+/nu-_order', 'localization_energy_l+/l-', 'pi_energy',
'electron_density', 'charge_density'],
'rotatablebondcount': ['rotatable_bond_count'],
'minimalprojectionsize': ['length_perpendicular_to_the_min_area'],
'polarizability': ['a_mol', 'a_atom'], 'acceptortable': ['acceptor_count', 'acceptor_site_count'],
'aliphaticringcountofsize': ['aliphatic_ring_count_of_size'], 'hlb': ['hlb'],
'eccentricity': ['eccentricity'], 'hmochargedensity': ['hmo_charge_density'],
'hmohuckeleigenvalue': ['hmo_eigenvalue'], 'totalchargedensity': ['total_charge_density'],
'hmonucleophilicityorder': ['hmo_aromatic_nu-_order'],
'aromaticringcountofsize': ['aromatic_ring_count_of_size'],
'electrophilicityorder': ['aromatic_e+_order'], 'connectedgraph': ['connected_graph'],
'plattindex': ['platt_index'], 'logp': ['logp'],
'topanal': ['atom_count', 'aliphatic_atom_count', 'aromatic_atom_count', 'bond_count',
'aliphatic_bond_count', 'aromatic_bond_count', 'rotatable_bond_count', 'ring_count',
'aliphatic_ring_count', 'aromatic_ring_count', 'hetero_ring_count',
'heteroaliphatic_ring_count', 'heteroaromatic_ring_count', 'ring_atom_count',
'ring_bond_count', 'chain_atom_count', 'chain_bond_count', 'smallest_ring_size',
'largest_ring_size'],
'logdcalculator': ['ph=0', 'ph=1', 'ph=2', 'ph=3', 'ph=4', 'ph=5', 'ph=6', 'ph=7', 'ph=8', 'ph=9',
'ph=10', 'ph=11', 'ph=12', 'ph=13', 'ph=14', 'unnamed:_16'],
'logs': ['ph=0.0', 'ph=1.0', 'ph=2.0', 'ph=3.0', 'ph=4.0', 'ph=5.0', 'ph=6.0', 'ph=7.0', 'ph=8.0',
'ph=9.0', 'ph=10.0', 'ph=11.0', 'ph=12.0', 'ph=13.0', 'ph=14.0', 'unnamed:_16'],
'atompol': ['a_atom'], 'canonicalresonant': ['structure'], 'ringbond': ['ring_bond'],
'ringatomcount': ['ring_atom_count'], 'donortable': ['donor_count', 'donor_site_count'],
'randicindex': ['randic_index'], 'rotatablebond': ['rotatable_bond'],
'hyperwienerindex': ['hyper_wiener_index'], 'hmohuckeleigenvector': ['hmo_eigenvector'],
'carboringcount': ['carbo_ring_count'], 'logpcalculator': ['logp', 'unnamed:_2'],
'ringsystemcount': ['ring_system_count'], 'largestringsize': ['largest_ring_size'],
'stereodoublebondcount': ['stereo_double_bond_count'], 'pi': ['pi'],
'stericeffectindex': ['steric_effect_index'], 'volume': ['van_der_waals_volume'],
'averagemicrospeciescharge': ['charge'], 'pka': ['apka1', 'apka2', 'bpka1', 'bpka2', 'atoms'],
'hmohuckeltable': ['hmo_aromatic_e+/nu-_order', 'hmo_localization_energy_l+/l-', 'hmo_pi_energy',
'hmo_electron_density', 'hmo_charge_density'],
'ringcountofatom': ['ring_count_of_atom'],
'aromaticelectrophilicityorder': ['aromatic_e+_order'], 'hindrance': ['steric_hindrance'],
'chainatomcount': ['chain_atom_count'],
'pkacalculator': ['apka1', 'apka2', 'bpka1', 'bpka2', 'atoms'],
'heteroaromaticringcount': ['heteroaromatic_ring_count'], 'sterichindrance': ['steric_hindrance'],
'hbda': ['acceptor_count', 'donor_count', 'acceptor_site_count', 'donor_site_count'], 'molpol': ['a_mol'],
'atomicpolarizability': ['a_atom'],
'msdon': ['ph=0.00', 'ph=1.00', 'ph=2.00', 'ph=3.00', 'ph=4.00', 'ph=5.00', 'ph=6.00', 'ph=7.00',
'ph=8.00', 'ph=9.00', 'ph=10.00', 'ph=11.00', 'ph=12.00', 'ph=13.00', 'ph=14.00'],
'enumerationcount': ['markush_library_size'], 'vdwsa': ['van_der_waals_surface_area_3d'],
'orbitalelectronegativity': ['sigma_orbital_electronegativity', 'pi_orbital_electronegativity'],
'hmoelectrophiliclocalizationenergy': ['hmo_localization_energy_l+'],
'smallestringsize': ['smallest_ring_size'], 'szegedindex': ['szeged_index'],
'nucleophilicityorder': ['aromatic_nu-_order'], 'canonicaltautomer': ['canonical_tautomer'],
'stereoisomercount': ['stereoisomer_count'], 'msa': ['van_der_waals_surface_area_3d'],
'donsitecount': ['donor_site_count'], 'randommarkushenumerations': ['randommarkushenumerations'],
'wienerindex': ['wiener_index'], 'huckelorbitals': ['orbitals'],
'doublebondstereoisomercount': ['double_bond_stereoisomer_count'], 'tautomers': ['tautomers'],
'polarsurfacearea': ['polar_surface_area'], 'chiralcentercount': ['chiral_center_count'],
'electrophiliclocalizationenergy': ['localization_energy_l+'],
'aliphaticatom': ['aliphatic_atom'], 'ringbondcount': ['ring_bond_count'],
'wienerpolarity': ['wiener_polarity'],
'msacc': ['ph=0.00', 'ph=1.00', 'ph=2.00', 'ph=3.00', 'ph=4.00', 'ph=5.00', 'ph=6.00', 'ph=7.00',
'ph=8.00', 'ph=9.00', 'ph=10.00', 'ph=11.00', 'ph=12.00', 'ph=13.00', 'ph=14.00'],
'formalcharge': ['formal_charge'], 'smallestringsystemsize': ['smallest_ring_system_size'],
'majorms': ['major-ms'], 'tholepolarizability': ['a_avg', 'a_xx', 'a_yy', 'a_zz'],
'aromaticatom': ['aromatic_atom'],
'oen': ['sigma_orbital_electronegativity', 'pi_orbital_electronegativity'],
'chainbondcount': ['chain_bond_count'],
'logd': ['ph=0.00', 'ph=1.00', 'ph=2.00', 'ph=3.00', 'ph=4.00', 'ph=5.00', 'ph=6.00', 'ph=7.00',
'ph=8.00', 'ph=9.00', 'ph=10.00', 'ph=11.00', 'ph=12.00', 'ph=13.00', 'ph=14.00'],
'hmohuckelorbitals': ['hmo_orbitals'], 'aromaticringcount': ['aromatic_ring_count'],
'pichargedensity': ['pi_charge_density']}
def __init__(self, features='optimal', **kwargs):
super(ChemAxonBaseFeaturizer, self).__init__(**kwargs)
self.features = features
@property
def features(self):
return self._features
@features.setter
def features(self, features):
if features in ('optimal', 'all'):
self._features = self._optimal_feats
elif isinstance(features, str):
self.features = [features]
elif isinstance(features, (list, tuple)):
valid = np.array([feat in self._feat_columns.keys() for feat in features])
if not all(valid):
raise NotImplementedError('Descriptor \'{}\' not available.'.format(np.array(features)[~valid]))
else:
self._features = list(features)
else:
raise NotImplementedError('Feature set {} not available.'.format(features))
def _feature_index(self):
return pd.Index(sum((self._feat_columns[feat] for feat in self.features), []), name='features')
[docs] def validate_install(self):
try:
return 0 == subprocess.call(['cxcalc'], stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL)
except FileNotFoundError:
return False
[docs] def monitor_progress(self, filename):
res = line_count(filename)
return res - 1 if res else 0
def _cli_args(self, infile, outfile):
return ['cxcalc', infile, '-o', outfile] + self.features
def _parse_outfile(self, outfile):
res = pd.read_table(outfile, engine='python').drop('id', axis=1)
return res
def _parse_errors(self, errs):
LOGGER.debug('stderr: %s', errs)
return [] # instances are not skipped ever, so don't return anything
[docs]class ChemAxonFeaturizer(ChemAxonBaseFeaturizer, BatchTransformer, Transformer):
_optimal_feats = ['acceptorcount', 'accsitecount', 'aliphaticatomcount', 'aliphaticbondcount', 'aliphaticringcount',
'aromaticatomcount', 'aromaticbondcount', 'aromaticringcount', 'asymmetricatomcount',
'averagemolecularpolarizability', 'axxpol', 'ayypol', 'azzpol', 'balabanindex', 'bondcount',
'carboaliphaticringcount', 'carboaromaticringcount', 'carboringcount', 'chainatomcount',
'chainbondcount', 'chiralcentercount', 'connectedgraph', 'cyclomaticnumber', 'dipole',
'donorcount', 'donorsitecount', 'doublebondstereoisomercount', 'dreidingenergy', 'formalcharge',
'fragmentcount', 'fsp3', 'fusedaliphaticringcount', 'fusedaromaticringcount', 'fusedringcount',
'hararyindex', 'heteroaliphaticringcount', 'heteroaromaticringcount', 'heteroringcount', 'hlb',
'hmopienergy', 'hyperwienerindex', 'largestringsize', 'largestringsystemsize',
'markushenumerationcount', 'maximalprojectionarea', 'maximalprojectionradius',
'maximalprojectionsize', 'minimalprojectionarea', 'minimalprojectionradius',
'minimalprojectionsize', 'mmff94energy', 'molpol', 'pienergy', 'plattindex', 'psa', 'randicindex',
'refractivity', 'resonantcount', 'ringatomcount', 'ringbondcount', 'ringcount', 'ringsystemcount',
'rotatablebondcount', 'smallestringsize', 'smallestringsystemsize',
'stereodoublebondcount', 'stereoisomercount', 'szegedindex', 'tetrahedralstereoisomercount',
'vdwsa', 'volume', 'wateraccessiblesurfacearea', 'wienerindex', 'wienerpolarity']
@property
def name(self):
return 'cx_mol'
@property
def columns(self):
return self._feature_index()
def _parse_outfile(self, outfile):
res = super(ChemAxonFeaturizer, self)._parse_outfile(outfile)
return res.applymap(lambda s: np.nan if isinstance(s, str) and 'FAILED' in s else float(s))
[docs]class ChemAxonAtomFeaturizer(ChemAxonBaseFeaturizer, AtomTransformer, BatchTransformer):
_optimal_feats = ['acceptormultiplicity', 'aliphaticatom', 'aromaticatom', 'aromaticelectrophilicityorder',
'asymmetricatom', 'atomicpolarizability', 'chainatom', 'chargedensity', 'chiralcenter',
'distancedegree', 'donormultiplicity', 'eccentricity', 'electrondensity',
'electrophiliclocalizationenergy', 'hindrance', 'hmochargedensity', 'hmoelectrondensity',
'hmoelectrophilicityorder',
'hmoelectrophiliclocalizationenergy', 'hmonucleophilicityorder',
'hmonucleophiliclocalizationenergy', 'ioncharge', 'largestatomringsize',
'nucleophilicityorder', 'nucleophiliclocalizationenergy', 'oen', 'pichargedensity', 'ringatom',
'ringcountofatom',
'stericeffectindex', 'totalchargedensity']
_h_inc_feats = ['acc', 'atomicpolarizability', 'charge', 'distancedegree', 'don',
'eccentricity', 'hindrance', 'largestatomringsize', 'oen',
'ringcountofatom', 'smallestatomringsize', 'stericeffectindex']
@property
def name(self):
return 'cx_atom'
@property
def minor_axis(self):
return self._feature_index()
def _transform_atom(self, atom):
raise NotImplementedError('Cannot calculate per atom with ChemAxon')
def _parse_outfile(self, outfile):
res = super(ChemAxonAtomFeaturizer, self)._parse_outfile(outfile)
def parse_string(s):
if s == '':
return np.nan
elif s == 'false':
return 0
elif s == 'true':
return 1
else:
try:
return float(s)
except ValueError:
return np.nan
def to_padded(s):
res = np.repeat(np.nan, self.max_atoms)
ans = np.array([parse_string(i) for i in str(s).split(';')])
res[:len(ans)] = ans
return res
res = res.applymap(to_padded)
return pd.Panel(res.values.tolist()).swapaxes(1, 2)
[docs]class ChemAxonNMRPredictor(ChemAxonBaseFeaturizer, BatchTransformer, AtomTransformer):
_feat_columns = {'cnmr': ['cnmr'], 'hnmr': ['hnmr']}
_optimal_feats = ['cnmr']
[docs] def name(self):
return 'cx_nmr'
def _transform_atom(self, atom):
raise NotImplementedError('ChemAxon cannot predict for atoms.')
[docs] def monitor_progress(self, filename):
return sum(1 for l in open(filename, 'rb') if l == b'##PEAKASSIGNMENTS=(XYMA)\r\n')
@property
def minor_axis(self):
return pd.Index(self.features, name='shift')
@property
def features(self):
return self._features
@features.setter
def features(self, val):
if val == 'c':
self._features = ['cnmr']
elif val == 'h':
self._features = ['hnmr']
else:
raise NotImplementedError('Feature {} not implemented'.format(val))
def _parse_outfile(self, outfile):
n_mols = self.monitor_progress(outfile)
res = nanarray((n_mols, self.max_atoms, 1))
regex = re.compile(b'\((-?\d+.\d+),\d+,[A-Z],<([0-9\,]+)>\)\r\n')
mol_idx = 0
with open(outfile, 'rb') as f:
# loop through the file - inner loop will also advance the pointer
for l in f:
if l == b'##PEAKASSIGNMENTS=(XYMA)\r\n':
for row in f:
if row == b'##END=\r\n':
break
else:
LOGGER.debug('Row to parse: %s', row)
shift, idxs = regex.match(row).groups()
shift, idxs = float(shift), [int(idx) for idx in idxs.split(b',')]
for atom_idx in idxs:
res[mol_idx, atom_idx] = shift
mol_idx += 1
res = pd.Panel(res)
return res