# Source code for skchem.data.converters.bursi_ames

#! /usr/bin/env python
#
# Copyright (C) 2016 Rich Lewis <rl403@cam.ac.uk>
# License: 3-clause BSD

import logging
import os
import shutil
import tempfile
import zipfile

LOGGER = logging.getLogger(__name__)

import numpy as np

from ... import io

from .base import Converter, default_pipeline, contiguous_order
from ...cross_validation import SimThresholdSplit

class BursiAmesConverter(Converter):
    """Converter for the Bursi Ames dataset.

    Instantiating the class performs the whole conversion: the raw archive
    is unpacked, parsed, labelled, passed through the default pipeline,
    split into train/valid/test partitions and handed to ``self.run``.
    """

    def __init__(self, directory, output_directory,
                 output_filename='bursi_ames.h5'):
        """Run the conversion.

        Parameters
        ----------
        directory : str
            Directory containing the raw archive ``cas_4337.zip``.
        output_directory : str
            Directory in which the converted file is written.
        output_filename : str, optional
            Name of the converted file inside ``output_directory``.
        """
        zip_path = os.path.join(directory, 'cas_4337.zip')
        output_path = os.path.join(output_directory, output_filename)

        # Unpack into a private scratch directory rather than the current
        # working directory, so no stray ``cas_4337.sdf`` is left behind and
        # the conversion does not depend on the cwd being writable.
        scratch_dir = tempfile.mkdtemp()
        try:
            with zipfile.ZipFile(zip_path) as archive:
                sdf_path = archive.extract('cas_4337.sdf', path=scratch_dir)
            # NOTE(review): assumes io.read_sdf loads the file eagerly, so
            # the extracted copy can be deleted once it returns -- confirm.
            data = io.read_sdf(sdf_path)
        finally:
            shutil.rmtree(scratch_dir, ignore_errors=True)

        data.index.name = 'batch'

        # Binary target: 1 where the record is categorised as 'mutagen',
        # 0 for every other category value.
        data['is_mutagen'] = (
            data['Ames test categorisation'] == 'mutagen'
        ).astype(np.uint8)
        ms, y = data.structure, data.is_mutagen

        pipeline = default_pipeline()
        ms, y = pipeline.transform_filter(ms, y)

        # Partition requested as 70/15/15 train/valid/test from a
        # similarity-threshold split fitted on the structures.
        cv = SimThresholdSplit(min_threshold=0.6, n_jobs=-1).fit(ms)
        train, valid, test = cv.split((70, 15, 15))

        # Presumably reorders rows so each partition is contiguous in the
        # output -- behaviour lives in contiguous_order; verify there.
        (ms, y, train, valid, test) = contiguous_order(
            (ms, y, train, valid, test), (train, valid, test))

        splits = (('train', train), ('valid', valid), ('test', test))
        self.run(ms, y, output_path, splits=splits)
if __name__ == '__main__':
    # Script entry point: enable INFO-level logging so the progress
    # message below is emitted, then start the conversion.
    logging.basicConfig(level=logging.INFO)
    LOGGER.info('Converting Bursi Ames Dataset...')
    # ``convert`` is not defined in this module; it is expected to come
    # from the ``Converter`` base class.
    BursiAmesConverter.convert()