Source code for skchem.data.converters.bradley_open_mp

#! /usr/bin/env python
#
# Copyright (C) 2016 Rich Lewis <rl403@cam.ac.uk>
# License: 3-clause BSD

import os
import logging
logger = logging.getLogger(__name__)

import pandas as pd

from .base import Converter, default_pipeline, contiguous_order
from ...core import Mol
from ...cross_validation import SimThresholdSplit

class BradleyOpenMPConverter(Converter):
    """Converter for the Bradley Open Melting Point dataset.

    Instantiating the class performs the whole conversion: the spreadsheet
    is parsed, flagged rows are dropped, SMILES strings are turned into
    ``Mol`` objects, melting points are converted to Kelvin, the data is
    split into train/valid/test sets and the result is handed to
    ``self.run`` for writing.
    """

    def __init__(self, directory, output_directory,
                 output_filename='bradley_open_mp.h5'):
        """Convert the raw spreadsheet found in ``directory``.

        Args:
            directory (str): Directory containing
                ``bradley_melting_point_dataset.xlsx``.
            output_directory (str): Directory the output file is written to.
            output_filename (str): Name of the output file inside
                ``output_directory``.
        """
        output_path = os.path.join(output_directory, output_filename)
        data = self.parse_data(
            os.path.join(directory, 'bradley_melting_point_dataset.xlsx'))
        data = self.filter_bad(data)

        def parse_smiles(smi):
            """Return a ``Mol`` for ``smi``, or ``None`` on ``ValueError``."""
            try:
                return Mol.from_smiles(smi)
            except ValueError:
                return None

        # Rows whose SMILES could not be parsed are dropped rather than
        # aborting the whole conversion.
        data['structure'] = data.smiles.apply(parse_smiles)
        data = data[data.structure.notnull()]

        ms, y = data.structure, self.fix_mp(data)

        pipeline = default_pipeline()
        ms, y = pipeline.transform_filter(ms, y)

        # Split on structural similarity; the tuple passed to ``split`` is
        # presumably the train/valid/test ratio -- confirm against
        # SimThresholdSplit.
        cv = SimThresholdSplit(min_threshold=0.6, n_jobs=-1).fit(ms)
        train, valid, test = cv.split((70, 15, 15))

        (ms, y, train, valid, test) = contiguous_order(
            (ms, y, train, valid, test), (train, valid, test))

        splits = (('train', train), ('valid', valid), ('test', test))
        self.run(ms, y, output_path=output_path, splits=splits)

    @staticmethod
    def parse_data(path):
        """Read the Excel file at ``path`` using its first column as index.

        Args:
            path (str): Path of the spreadsheet to read.

        Returns:
            pandas.DataFrame: The parsed spreadsheet.
        """
        logger.info('Parsing data at %s...', path)
        return pd.read_excel(path, index_col=0)

    @staticmethod
    def filter_bad(data):
        """Drop rows that have any value in the ``donotuse`` column.

        Args:
            data (pandas.DataFrame): Frame with a ``donotuse`` column.

        Returns:
            pandas.DataFrame: ``data`` restricted to rows where ``donotuse``
                is null.
        """
        logger.info('Removing manually annotated errors...')
        bad_data = data.donotuse.notnull()
        logger.debug('Removed %s', bad_data.sum())
        return data[~bad_data]

    @staticmethod
    def fix_mp(data):
        """Convert the ``mpC`` column to Kelvin.

        The column is assumed to hold degrees Celsius, as its name and the
        log message indicate.

        Args:
            data (pandas.DataFrame): Frame with an ``mpC`` column.

        Returns:
            pandas.Series: ``data.mpC`` shifted by 273.15.
        """
        logger.info('Converting temperature to Kelvin...')
        # 0 degrees Celsius is 273.15 K; the previous offset of 278.15
        # shifted every melting point by 5 K.
        return data.mpC + 273.15
if __name__ == '__main__':
    # Script entry point: enable debug logging, then run the conversion.
    logging.basicConfig(level=logging.DEBUG)
    # The module-level logger is bound to the lowercase name `logger`;
    # the uppercase `LOGGER` was never defined and raised NameError.
    logger.info('Converting Bradley Open Melting Point Dataset...')
    # `convert` is not defined in this module; presumably inherited from
    # the Converter base class -- confirm its expected arguments there.
    BradleyOpenMPConverter.convert()