Source code for skchem.test.test_io.test_sdf

#! /usr/bin/env python
#
# Copyright (C) 2015-2016 Rich Lewis <rl403@cam.ac.uk>
# License: 3-clause BSD

""" Tests for sdf io functionality """

import pandas as pd
import pytest

from ...resource import resource
from ...io import read_sdf

# Property names the tests expect as columns (besides 'structure') when
# reading 'single_molecule-properties.sdf'.  Order is irrelevant: the tests
# compare these as sets.
SINGLE_MOLECULE_PROPS = {
    'PUBCHEM_ATOM_DEF_STEREO_COUNT',
    'PUBCHEM_ATOM_UDEF_STEREO_COUNT',
    'PUBCHEM_BOND_DEF_STEREO_COUNT',
    'PUBCHEM_BOND_UDEF_STEREO_COUNT',
    'PUBCHEM_CACTVS_COMPLEXITY',
    'PUBCHEM_CACTVS_HBOND_ACCEPTOR',
    'PUBCHEM_CACTVS_HBOND_DONOR',
    'PUBCHEM_CACTVS_ROTATABLE_BOND',
    'PUBCHEM_CACTVS_SUBSKEYS',
    'PUBCHEM_CACTVS_TAUTO_COUNT',
    'PUBCHEM_CACTVS_TPSA',
    'PUBCHEM_COMPONENT_COUNT',
    'PUBCHEM_COMPOUND_CANONICALIZED',
    'PUBCHEM_COMPOUND_CID',
    'PUBCHEM_COORDINATE_TYPE',
    'PUBCHEM_EXACT_MASS',
    'PUBCHEM_HEAVY_ATOM_COUNT',
    'PUBCHEM_ISOTOPIC_ATOM_COUNT',
    'PUBCHEM_IUPAC_CAS_NAME',
    'PUBCHEM_IUPAC_INCHI',
    'PUBCHEM_IUPAC_INCHIKEY',
    'PUBCHEM_IUPAC_NAME',
    'PUBCHEM_IUPAC_OPENEYE_NAME',
    'PUBCHEM_IUPAC_SYSTEMATIC_NAME',
    'PUBCHEM_IUPAC_TRADITIONAL_NAME',
    'PUBCHEM_MOLECULAR_FORMULA',
    'PUBCHEM_MOLECULAR_WEIGHT',
    'PUBCHEM_MONOISOTOPIC_WEIGHT',
    'PUBCHEM_OPENEYE_CAN_SMILES',
    'PUBCHEM_OPENEYE_ISO_SMILES',
    'PUBCHEM_TOTAL_CHARGE',
    'PUBCHEM_XLOGP3_AA',
}

# Extra property names expected only in the multi-molecule properties file.
NON_SHARED_PROPS = {
    'DUMMY_PROPERTY_A',
    'DUMMY_PROPERTY_B',
    'DUMMY_PROPERTY_C',
}

# Expected index label and atom counts for the single-molecule fixture.
# The plain count is what the tests expect from a default read; the
# "_W_HS" count is what they expect when reading with removeHs=False.
SINGLE_MOLECULE_NAME = '297'
SINGLE_MOLECULE_NUM_ATOMS = 1
SINGLE_MOLECULE_NUM_ATOMS_W_HS = 5

# Expected row count and index labels for the multi-molecule fixture.
MULTI_MOLECULE_NUM_MOLECULES = 3
MULTI_MOLECULE_NAMES = ['297', '6324', '6334']

# Every property expected across the multi-molecule file: the shared
# PubChem properties plus the ones that only some molecules carry.
MULTI_MOLECULE_PROPS = SINGLE_MOLECULE_PROPS | NON_SHARED_PROPS

class TestSDF(object):

    """ Test class for the sdf file parser.

    Each test reads one of the fixture files located via
    ``resource('test_sdf', ...)`` with ``read_sdf`` and checks one aspect of
    the result: number of rows, index labels, property columns, atom counts
    or error handling.
    """

    def test_opening_with_file(self):

        """ Can an sdf file be opened with a file-like object? """

        with open(resource('test_sdf', 'single_molecule-simple.sdf'), 'rb') as f:
            df = read_sdf(f)
            assert len(df) == 1

    def test_file_correct_structure(self):

        """ When opened with a file-like object, is the structure correct?

        Done by checking the atom number (should be one, as rdkit ignores Hs
        by default).
        """

        with open(resource('test_sdf', 'single_molecule-simple.sdf'), 'rb') as f:
            df = read_sdf(f)
            # Same expected value as the path-based variant of this test.
            assert df[SINGLE_MOLECULE_NAME].GetNumAtoms() \
                == SINGLE_MOLECULE_NUM_ATOMS

    def test_opening_with_path(self):

        """ Do we find a molecule in the example file? """

        df = read_sdf(resource('test_sdf', 'single_molecule-simple.sdf'))
        assert len(df) == 1

    def test_path_correct_structure(self):

        """ When opened with a path, is the structure correct? """

        df = read_sdf(resource('test_sdf', 'single_molecule-simple.sdf'))
        assert df[SINGLE_MOLECULE_NAME].GetNumAtoms() \
            == SINGLE_MOLECULE_NUM_ATOMS

    def test_arg_forwarding(self):

        """ Check that kwargs can still be passed to the rdkit object.

        Reading with ``removeHs=False`` should give the atom count that
        includes hydrogens.
        """

        df = read_sdf(resource('test_sdf', 'single_molecule-simple.sdf'),
                      removeHs=False)
        assert df[SINGLE_MOLECULE_NAME].GetNumAtoms() \
            == SINGLE_MOLECULE_NUM_ATOMS_W_HS

    def test_single_index_detected(self):

        """ Does the molecule have a name set as the index? """

        df = read_sdf(resource('test_sdf', 'single_molecule-simple.sdf'))
        # A freshly built one-row DataFrame carries the default index; the
        # parsed result must differ from it.
        assert not (df.index == pd.DataFrame(['dummy']).index).all()

    def test_single_index_correct(self):

        """ Is the name correct? """

        single_molecule_df = read_sdf(
            resource('test_sdf', 'single_molecule-simple.sdf'))
        assert single_molecule_df.index[0] == SINGLE_MOLECULE_NAME

    def test_single_properties_detected(self):

        """ Does the dataframe have properties? """

        df = read_sdf(resource('test_sdf', 'single_molecule-properties.sdf'))
        test = set(df.columns)
        # set.remove raises KeyError if there is no 'structure' column.
        test.remove('structure')
        assert len(test) > 1

    def test_single_properties_correct(self):

        """ Are they the right properties? """

        df = read_sdf(resource('test_sdf', 'single_molecule-properties.sdf'))
        props = set(df.columns)
        # set.remove raises KeyError if there is no 'structure' column.
        props.remove('structure')
        assert props == SINGLE_MOLECULE_PROPS

    def test_multi_parsed(self):

        """ Do we find the right number of molecules? """

        df = read_sdf(resource('test_sdf', 'multi_molecule-simple.sdf'))
        assert df.shape[0] == MULTI_MOLECULE_NUM_MOLECULES

    def test_multi_index_detected(self):

        """ Is the index set? """

        df = read_sdf(resource('test_sdf', 'multi_molecule-simple.sdf'))
        # Build a frame of the expected length with the default index; the
        # parsed result must differ from it.
        dummy_df = pd.DataFrame(['dummy'] * MULTI_MOLECULE_NUM_MOLECULES)
        assert not (df.index == dummy_df.index).all()

    def test_multi_index_correct(self):

        """ Is it the right index? """

        df = read_sdf(resource('test_sdf', 'multi_molecule-simple.sdf'))
        assert (df.index == MULTI_MOLECULE_NAMES).all()

    def test_multi_diff_properties(self):

        """ If there are properties not common to all, are they all detected? """

        df = read_sdf(resource('test_sdf', 'multi_molecule-properties.sdf'))
        props = set(df.columns)
        # set.remove raises KeyError if there is no 'structure' column.
        props.remove('structure')
        assert props == MULTI_MOLECULE_PROPS

    def test_bad_structure(self):

        """ Does it throw an error if bad structures are given?

        With ``error_bad_mol=True`` the read is expected to raise
        ``ValueError``.
        """

        with pytest.raises(ValueError):
            read_sdf(resource('test_sdf', 'multi_molecule-bad_structure.sdf'),
                     error_bad_mol=True)