#! /usr/bin/env python
#
# Copyright (C) 2016 Rich Lewis <rl403@cam.ac.uk>
# License: 3-clause BSD
import os
import zipfile
import logging
LOGGER = logging.getLogger(__name__)
import pandas as pd
import numpy as np
from ... import io
from .base import Converter, contiguous_order
from ...cross_validation import SimThresholdSplit
# Lower-cased names of the leading columns of the PhysProp text dump, in file
# order.  Apart from the identity columns and the trailing single-value
# columns, every property contributes a (value, temp, type, reference) group.
TXT_COLUMNS = [
    name.lower()
    for name in (
        # compound identity
        'CAS', 'Formula', 'Mol_Weight', 'Chemical_Name',
        # WS
        'WS', 'WS_temp', 'WS_type', 'WS_reference',
        # LogP
        'LogP', 'LogP_temp', 'LogP_type', 'LogP_reference',
        # VP
        'VP', 'VP_temp', 'VP_type', 'VP_reference',
        # DC
        'DC_pKa', 'DC_temp', 'DC_type', 'DC_reference',
        # HL (the embedded space in the first name is intentional: it is
        # what the original column list contained)
        'henry_law Constant', 'HL_temp', 'HL_type', 'HL_reference',
        # OH
        'OH', 'OH_temp', 'OH_type', 'OH_reference',
        # single-value columns
        'BP_pressure', 'MP', 'BP', 'FP',
    )
]
class PhysPropConverter(Converter):
    """Converter for the PhysProp dataset.

    Instantiating the class performs the whole conversion: the raw sdf and
    txt files are read, joined on CAS number, filtered, split into
    train/valid/test sets and handed to ``self.run`` for writing.

    ``extract`` and ``run`` are not defined in this class; they are
    presumably provided by the ``Converter`` base class.
    """

    def __init__(self, directory, output_directory,
                 output_filename='physprop.h5'):
        """Run the conversion.

        Parameters
        ----------
        directory : str
            Passed to ``self.extract``, which must return the paths of the
            sdf and the txt file.
        output_directory : str
            Directory in which the output file is created.
        output_filename : str, optional
            Name of the output file inside ``output_directory``.
        """
        output_path = os.path.join(output_directory, output_filename)

        sdf, txt = self.extract(directory)
        mols, data = self.process_sdf(sdf), self.process_txt(txt)
        LOGGER.debug('Compounds with data extracted: %s', len(data))

        # Both objects are indexed by CAS number, so this attaches the
        # tabulated properties to each structure.
        data = mols.to_frame().join(data)
        data = self.drop_inconsistencies(data)

        y = self.process_targets(data)
        LOGGER.debug('Compounds with experimental: %s', len(y))

        # Keep only compounds that have at least one usable target.  The
        # `.ix` indexer used here originally was removed in pandas 1.0;
        # `.loc` performs the same label-based selection.
        data = data.loc[y.index]
        data.columns.name = 'targets'
        # NOTE(review): `y` is rebound to the raw data columns here, so the
        # cleaned values computed by `process_targets` are only used for
        # their index.  Behaviour kept as is - confirm this is intended.
        ms, y = data.structure, data.drop('structure', axis=1)

        cv = SimThresholdSplit(min_threshold=0.6, block_width=4000,
                               n_jobs=-1).fit(ms)
        # presumably a 70/15/15 train/valid/test ratio - TODO confirm
        train, valid, test = cv.split((70, 15, 15))
        (ms, y, train, valid, test) = contiguous_order(
            (ms, y, train, valid, test), (train, valid, test))
        splits = (('train', train), ('valid', valid), ('test', test))

        self.run(ms, y, output_path=output_path, splits=splits)

    def process_sdf(self, path):
        """Read the structures from the sdf file at `path`.

        Returns
        -------
        pandas.Series
            The ``structure`` column returned by ``io.read_sdf``, re-indexed
            by the ``CAS`` property of each molecule (index name ``'cas'``).
        """
        LOGGER.info('Processing sdf at %s', path)
        mols = io.read_sdf(path, read_props=False).structure
        mols.index = mols.apply(lambda m: m.GetProp('CAS'))
        mols.index.name = 'cas'
        LOGGER.debug('Structures extracted: %s', len(mols))
        return mols

    def process_txt(self, path):
        """Read the property table from the txt file at `path`.

        Only the leading ``len(TXT_COLUMNS)`` columns are kept and named
        after ``TXT_COLUMNS``.  Missing values in the ``*_type`` columns are
        replaced by the string ``'NAN'``.

        Returns
        -------
        pandas.DataFrame
            The table, indexed by the ``cas`` column.
        """
        LOGGER.info('Processing txt at %s', path)
        # Slice by the number of known column names rather than a literal,
        # so the two cannot drift apart.
        data = pd.read_table(path, header=None,
                             engine='python').iloc[:, :len(TXT_COLUMNS)]
        data.columns = TXT_COLUMNS
        data_types = data.columns[[s.endswith('_type')
                                   for s in data.columns]]
        data[data_types] = data[data_types].fillna('NAN')
        data = data.set_index('cas')
        return data

    def drop_inconsistencies(self, data):
        """Drop rows whose structure disagrees with the tabulated formula.

        The formula computed by ``to_formula()`` on each entry of
        ``data.structure`` is compared with ``data.formula``; only rows
        where both are equal are returned.
        """
        LOGGER.info('Dropping inconsistent data...')
        formula = data.structure.apply(lambda m: m.to_formula())
        consistent = formula == data.formula
        LOGGER.info('Inconsistent compounds: %s', (~consistent).sum())
        return data[consistent]

    def process_targets(self, data):
        """Assemble the cleaned target columns.

        The results of `process_logS`, `process_logP`, `process_mp` and
        `process_bp` are concatenated column-wise, and rows in which every
        target is null are dropped.
        """
        LOGGER.info('Dropping estimated data...')
        data = pd.concat([self.process_logS(data),
                          self.process_logP(data),
                          self.process_mp(data),
                          self.process_bp(data)], axis=1)
        LOGGER.info('Dropped compounds: %s', data.isnull().all(axis=1).sum())
        data = data[data.notnull().any(axis=1)]
        LOGGER.debug('Compounds with experimental activities: %s', len(data))
        return data

    def process_logS(self, data):
        """Return log10 of the scaled solubility for experimental entries.

        Computes ``log10(0.001 * ws / mol_weight)`` and keeps the rows whose
        ``ws_type`` is ``'EXP'``.  The 0.001 factor presumably converts
        mg/L to g/L so that the result is in mol/L - TODO confirm units.
        """
        S = 0.001 * data.ws / data.mol_weight
        logS = np.log10(S)
        return logS[data.ws_type == 'EXP']

    def process_logP(self, data):
        """Return the ``logp`` values whose ``logp_type`` is ``'EXP'``.

        Values that are not greater than -10 are dropped as well
        (presumably placeholder entries - TODO confirm).
        """
        logP = data.logp[data.logp_type == 'EXP']
        return logP[logP > -10]

    def process_mp(self, data):
        """Return the ``mp`` column with every entry parsed by `fix_temp`."""
        return data.mp.apply(self.fix_temp)

    def process_bp(self, data):
        """Return the ``bp`` column with every entry parsed by `fix_temp`."""
        return data.bp.apply(self.fix_temp)

    @staticmethod
    def fix_temp(s, mean_range=5):
        """Parse a temperature entry into a float.

        Parameters
        ----------
        s : str or number
            Raw entry, e.g. ``'120'``, ``'120 dec'``, ``'120-122'``.
        mean_range : number, optional
            A range ``'a-b'`` is replaced by its midpoint when
            ``abs(b - a) < mean_range``.  A falsy value disables the
            handling of ranges.

        Returns
        -------
        float
            The parsed value, or ``nan`` when the entry contains ``'<'`` or
            ``'>'``, is a range that is too wide or has more than two parts,
            or cannot be parsed at all (including ``None``).
        """
        try:
            return float(s)
        except TypeError:
            # Not a number and not a string (e.g. None): treat as missing.
            return np.nan
        except ValueError:
            pass

        # Bounds such as '<100' or '>300' carry no usable value.
        if '<' in s or '>' in s:
            return np.nan

        # NOTE: str.strip takes a *set of characters*; these calls remove
        # any run of those characters from both ends, which is sufficient
        # to drop ' dec' / ' sub' suffixes from a number.
        s = s.strip(' dec')
        s = s.strip(' sub')

        # Try the plain value first, so that a leading minus sign or an
        # exponent ('-5', '1.5e-3') is not mistaken for a range separator.
        try:
            return float(s)
        except ValueError:
            pass

        if mean_range and '-' in s:
            try:
                rng = [float(n) for n in s.split('-')]
            except ValueError:
                # Non-numeric parts (previously an uncaught exception).
                return np.nan
            if len(rng) == 2 and abs(rng[1] - rng[0]) < mean_range:
                return (rng[0] + rng[1]) / 2

        return np.nan
# Script entry point.  The module uses relative imports, so it has to be
# executed as part of its package (e.g. via ``python -m``).
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    LOGGER.info('Converting PhysProp Dataset...')
    # `convert` is not defined in this file; presumably it is inherited
    # from the Converter base class.
    PhysPropConverter.convert()