# Source code for skchem.data.converters.physprop

#! /usr/bin/env python
#
# Copyright (C) 2016 Rich Lewis <rl403@cam.ac.uk>
# License: 3-clause BSD

import os
import zipfile
import logging
LOGGER = logging.getLogger(__name__)

import pandas as pd
import numpy as np

from ... import io
from .base import Converter, contiguous_order

from ...cross_validation import SimThresholdSplit

TXT_COLUMNS = [l.lower() for l in """CAS
Formula
Mol_Weight
Chemical_Name
WS
WS_temp
WS_type
WS_reference
LogP
LogP_temp
LogP_type
LogP_reference
VP
VP_temp
VP_type
VP_reference
DC_pKa
DC_temp
DC_type
DC_reference
henry_law Constant
HL_temp
HL_type
HL_reference
OH
OH_temp
OH_type
OH_reference
BP_pressure
MP
BP
FP""".split('\n')]

class PhysPropConverter(Converter):
    """Converter for the PhysProp dataset.

    Instantiating the class runs the whole pipeline: the SDF and text
    archives found in ``directory`` are extracted, joined on CAS number,
    filtered, split into train/valid/test sets and handed to ``self.run``
    (not defined in this class; presumably inherited from ``Converter``).
    """

    def __init__(self, directory, output_directory,
                 output_filename='physprop.h5'):
        """Run the conversion.

        Parameters
        ----------
        directory : str
            Directory containing ``phys_sdf.zip`` and ``phys_txt.zip``.
        output_directory : str
            Directory the output file is written to.
        output_filename : str
            Name of the output file inside ``output_directory``.
        """
        output_path = os.path.join(output_directory, output_filename)

        sdf, txt = self.extract(directory)
        mols, data = self.process_sdf(sdf), self.process_txt(txt)
        LOGGER.debug('Compounds with data extracted: %s', len(data))

        # Both frames are indexed by CAS number; keep rows that have a
        # structure and attach the tabulated properties to them.
        data = mols.to_frame().join(data)
        data = self.drop_inconsistencies(data)

        y = self.process_targets(data)
        LOGGER.debug('Compounds with experimental: %s', len(y))

        # ``.ix`` was removed in pandas 1.0.  The index holds CAS labels,
        # so label-based ``.loc`` is the equivalent selection.
        data = data.loc[y.index]
        data.columns.name = 'targets'

        # NOTE(review): the processed targets above are only used for their
        # index; ``y`` is rebound here to every non-structure column of the
        # joined table.  Confirm this is intended.
        ms, y = data.structure, data.drop('structure', axis=1)

        cv = SimThresholdSplit(min_threshold=0.6, block_width=4000,
                               n_jobs=-1).fit(ms)
        train, valid, test = cv.split((70, 15, 15))

        (ms, y, train, valid, test) = contiguous_order(
            (ms, y, train, valid, test), (train, valid, test))

        splits = (('train', train), ('valid', valid), ('test', test))

        self.run(ms, y, output_path=output_path, splits=splits)

    def extract(self, directory):
        """Extract ``PhysProp.sdf`` and ``PhysProp.txt`` from their archives.

        No target path is passed to ``ZipFile.extract``, so the files are
        written to the current working directory.

        Parameters
        ----------
        directory : str
            Directory containing ``phys_sdf.zip`` and ``phys_txt.zip``.

        Returns
        -------
        tuple
            ``(sdf_path, txt_path)`` as returned by ``ZipFile.extract``.
        """
        LOGGER.info('Extracting from %s', directory)

        with zipfile.ZipFile(os.path.join(directory, 'phys_sdf.zip')) as f:
            sdf = f.extract('PhysProp.sdf')

        with zipfile.ZipFile(os.path.join(directory, 'phys_txt.zip')) as f:
            txt = f.extract('PhysProp.txt')

        return sdf, txt

    def process_sdf(self, path):
        """Read the structures from the SDF file, indexed by CAS number.

        Parameters
        ----------
        path : str
            Path to the extracted SDF file.

        Returns
        -------
        The ``structure`` column produced by ``io.read_sdf``, re-indexed by
        each molecule's ``'CAS'`` property (index name ``'cas'``).
        """
        LOGGER.info('Processing sdf at %s', path)

        mols = io.read_sdf(path, read_props=False).structure
        mols.index = mols.apply(lambda m: m.GetProp('CAS'))
        mols.index.name = 'cas'

        LOGGER.debug('Structures extracted: %s', len(mols))
        return mols

    def process_txt(self, path):
        """Read the property table from the text file, indexed by CAS number.

        Only the first 32 columns are kept and named after ``TXT_COLUMNS``.
        Missing values in every ``*_type`` column are replaced by the string
        ``'NAN'``.

        Parameters
        ----------
        path : str
            Path to the extracted text file.

        Returns
        -------
        pandas.DataFrame
            The property table with ``'cas'`` as its index.
        """
        LOGGER.info('Processing txt at %s', path)

        data = pd.read_table(path, header=None, engine='python').iloc[:, :32]
        data.columns = TXT_COLUMNS

        data_types = data.columns[[s.endswith('_type') for s in data.columns]]
        data[data_types] = data[data_types].fillna('NAN')

        data = data.set_index('cas')
        return data

    def drop_inconsistencies(self, data):
        """Drop rows whose structure disagrees with the tabulated formula.

        The formula computed from each structure via ``to_formula()`` is
        compared with the ``formula`` column; only matching rows are kept.
        """
        LOGGER.info('Dropping inconsistent data...')

        formula = data.structure.apply(lambda m: m.to_formula())
        LOGGER.info('Inconsistent compounds: %s',
                    (formula != data.formula).sum())

        data = data[formula == data.formula]
        return data

    def process_targets(self, data):
        """Assemble the target table and drop compounds without any target.

        The logS, logP, melting point and boiling point series are
        concatenated column-wise; rows in which all of them are null are
        removed.
        """
        LOGGER.info('Dropping estimated data...')

        data = pd.concat([self.process_logS(data),
                          self.process_logP(data),
                          self.process_mp(data),
                          self.process_bp(data)], axis=1)

        LOGGER.info('Dropped compounds: %s', data.isnull().all(axis=1).sum())
        data = data[data.notnull().any(axis=1)]

        LOGGER.debug('Compounds with experimental activities: %s', len(data))
        return data

    def process_logS(self, data):
        """Return log10 of the scaled solubility for ``'EXP'`` entries only.

        The value is ``log10(0.001 * ws / mol_weight)``, restricted to rows
        whose ``ws_type`` is ``'EXP'``.
        """
        # NOTE(review): presumably converts a mass concentration to a molar
        # one; the units are not visible in this file -- confirm.
        S = 0.001 * data.ws / data.mol_weight
        logS = np.log10(S)
        return logS[data.ws_type == 'EXP']

    def process_logP(self, data):
        """Return ``logp`` values of type ``'EXP'`` that are above -10."""
        logP = data.logp[data.logp_type == 'EXP']
        return logP[logP > -10]

    def process_mp(self, data):
        """Return the ``mp`` column with every entry passed to ``fix_temp``."""
        return data.mp.apply(self.fix_temp)

    def process_bp(self, data):
        """Return the ``bp`` column with every entry passed to ``fix_temp``."""
        return data.bp.apply(self.fix_temp)

    @staticmethod
    def fix_temp(s, mean_range=5):
        """Convert a raw temperature entry to a float, or NaN if unusable.

        Parameters
        ----------
        s : str or number
            Raw entry.  Anything ``float()`` accepts is returned directly.
        mean_range : number
            A two-value range ``'a-b'`` is replaced by its midpoint when
            ``abs(b - a) < mean_range``.  A falsy value disables range
            handling.

        Returns
        -------
        float
            The parsed value, or ``numpy.nan`` for entries containing
            ``'<'`` or ``'>'``, ranges that are too wide or have more than
            two parts, and anything else that cannot be parsed.
        """
        try:
            return float(s)
        except ValueError:
            pass

        # Entries with '<' or '>' give no usable point value.
        if '<' in s or '>' in s:
            return np.nan

        # ``str.strip`` treats its argument as a set of characters, so these
        # calls remove any leading/trailing run of ' ', 'd', 'e', 'c' and then
        # of ' ', 's', 'u', 'b'.  That covers trailing ' dec' / ' sub'
        # annotations.
        text = s.strip(' dec').strip(' sub')

        # Retry before looking for a range, so that negative numbers and
        # exponents (which contain '-') are not mistaken for ranges.
        try:
            return float(text)
        except ValueError:
            pass

        if mean_range and '-' in text:
            try:
                bounds = [float(part) for part in text.split('-')]
            except ValueError:
                # Not a clean numeric range; previously this escaped as an
                # uncaught ValueError.
                return np.nan
            if len(bounds) == 2:
                low, high = bounds
                if np.abs(high - low) < mean_range:
                    return (low + high) / 2

        return np.nan
if __name__ == '__main__':
    # Script entry point: enable INFO-level logging, then start the
    # conversion.  ``convert`` is not defined in this file; presumably it is
    # provided by the ``Converter`` base class -- confirm.
    logging.basicConfig(level=logging.INFO)
    LOGGER.info('Converting PhysProp Dataset...')
    PhysPropConverter.convert()