#! /usr/bin/env python
#
# Copyright (C) 2016 Rich Lewis <rl403@cam.ac.uk>
# License: 3-clause BSD
"""
# skchem.data.converters.chembl
Dataset constructor for ChEMBL
"""
import logging
import pandas as pd
import os
from .base import Converter, default_pipeline, contiguous_order, Feature
from ...cross_validation import SimThresholdSplit
from ... import features
# Module-level logger, named after this module via ``__name__``.
LOGGER = logging.getLogger(__name__)
class ChEMBLConverter(Converter):
    """ Converter for the ChEMBL dataset.

    Instantiating this class performs the whole conversion: the raw HDF5
    file is parsed, passed through the default pipeline, split into
    train/valid/test partitions and handed to ``self.run`` together with
    the feature definitions.
    """

    def __init__(self, directory, output_directory,
                 output_filename='chembl.h5'):
        """ Read the raw ChEMBL file and run the conversion.

        Args:
            directory (str):
                Directory containing the input file ``chembl_raw.h5``.
            output_directory (str):
                Directory in which the output file is placed.
            output_filename (str):
                Name of the output file inside ``output_directory``.
        """
        output_path = os.path.join(output_directory, output_filename)
        infile = os.path.join(directory, 'chembl_raw.h5')

        ms, y = self.parse_infile(infile)

        # The pipeline both transforms and filters, so structures and
        # targets are passed (and returned) together.
        pipeline = default_pipeline()
        ms, y = pipeline.transform_filter(ms, y)

        # Similarity-threshold based split, requested as 70/15/15.
        cv = SimThresholdSplit(min_threshold=0.6, n_jobs=-1).fit(ms)
        train, valid, test = cv.split((70, 15, 15))

        # Reorder the data and the split masks according to the splits.
        (ms, y, train, valid, test) = contiguous_order(
            (ms, y, train, valid, test), (train, valid, test))
        splits = (('train', train), ('valid', valid), ('test', test))

        feats = (
            Feature(fper=features.MorganFeaturizer(),
                    key='X_morg',
                    axis_names=['batch', 'features']),
            Feature(fper=features.PhysicochemicalFeaturizer(),
                    key='X_pc',
                    axis_names=['batch', 'features']),
            Feature(fper=features.AtomFeaturizer(max_atoms=100),
                    key='A',
                    axis_names=['batch', 'atom_idx', 'features']),
            Feature(fper=features.GraphDistanceTransformer(max_atoms=100),
                    key='G',
                    axis_names=['batch', 'atom_idx', 'atom_idx']),
            # axis_names was previously omitted for this feature only; it is
            # now given explicitly, mirroring the 'G' distance feature.
            # NOTE(review): assumes the spatial distance output has the same
            # (batch, atom, atom) layout as 'G' -- confirm.
            Feature(fper=features.SpacialDistanceTransformer(max_atoms=100),
                    key='G_d',
                    axis_names=['batch', 'atom_idx', 'atom_idx']),
        )

        self.run(ms, y, output_path, features=feats, splits=splits)

    def parse_infile(self, filename):
        """ Load the structures and targets from the raw HDF5 file.

        Args:
            filename (str):
                Path to the HDF5 file to read.

        Returns:
            tuple: ``(ms, y)`` where ``ms`` is the object stored under the
            ``'structure'`` key and ``y`` the object stored under the
            ``'targets/Y'`` key.
        """
        ms = pd.read_hdf(filename, 'structure')
        y = pd.read_hdf(filename, 'targets/Y')
        return ms, y
if __name__ == '__main__':
    # Script entry point: turn on DEBUG-level logging, announce the job,
    # then invoke the converter's ``convert`` entry point.
    logging.basicConfig(level=logging.DEBUG)
    LOGGER.info('Converting ChEMBL...')
    ChEMBLConverter.convert()