#! /usr/bin/env python
#
# Copyright (C) 2015-2016 Rich Lewis <rl403@cam.ac.uk>
# License: 3-clause BSD
""" Tests for sdf io functionality """
import pandas as pd
import pytest
from ...resource import resource
from ...io import read_sdf
# Expected property columns (everything except 'structure') when reading
# 'single_molecule-properties.sdf'.  Kept one per line, alphabetically.
SINGLE_MOLECULE_PROPS = {
    'PUBCHEM_ATOM_DEF_STEREO_COUNT',
    'PUBCHEM_ATOM_UDEF_STEREO_COUNT',
    'PUBCHEM_BOND_DEF_STEREO_COUNT',
    'PUBCHEM_BOND_UDEF_STEREO_COUNT',
    'PUBCHEM_CACTVS_COMPLEXITY',
    'PUBCHEM_CACTVS_HBOND_ACCEPTOR',
    'PUBCHEM_CACTVS_HBOND_DONOR',
    'PUBCHEM_CACTVS_ROTATABLE_BOND',
    'PUBCHEM_CACTVS_SUBSKEYS',
    'PUBCHEM_CACTVS_TAUTO_COUNT',
    'PUBCHEM_CACTVS_TPSA',
    'PUBCHEM_COMPONENT_COUNT',
    'PUBCHEM_COMPOUND_CANONICALIZED',
    'PUBCHEM_COMPOUND_CID',
    'PUBCHEM_COORDINATE_TYPE',
    'PUBCHEM_EXACT_MASS',
    'PUBCHEM_HEAVY_ATOM_COUNT',
    'PUBCHEM_ISOTOPIC_ATOM_COUNT',
    'PUBCHEM_IUPAC_CAS_NAME',
    'PUBCHEM_IUPAC_INCHI',
    'PUBCHEM_IUPAC_INCHIKEY',
    'PUBCHEM_IUPAC_NAME',
    'PUBCHEM_IUPAC_OPENEYE_NAME',
    'PUBCHEM_IUPAC_SYSTEMATIC_NAME',
    'PUBCHEM_IUPAC_TRADITIONAL_NAME',
    'PUBCHEM_MOLECULAR_FORMULA',
    'PUBCHEM_MOLECULAR_WEIGHT',
    'PUBCHEM_MONOISOTOPIC_WEIGHT',
    'PUBCHEM_OPENEYE_CAN_SMILES',
    'PUBCHEM_OPENEYE_ISO_SMILES',
    'PUBCHEM_TOTAL_CHARGE',
    'PUBCHEM_XLOGP3_AA',
}

# Extra property names that, together with SINGLE_MOLECULE_PROPS, make up the
# columns expected from 'multi_molecule-properties.sdf'.
NON_SHARED_PROPS = {
    'DUMMY_PROPERTY_A',
    'DUMMY_PROPERTY_B',
    'DUMMY_PROPERTY_C',
}

# Expected index label of the molecule in the single-molecule files.
SINGLE_MOLECULE_NAME = '297'

# Expected GetNumAtoms() for that molecule: with default reader arguments,
# and with removeHs=False forwarded to the reader.
SINGLE_MOLECULE_NUM_ATOMS = 1
SINGLE_MOLECULE_NUM_ATOMS_W_HS = 5

# Expected row count and index labels for 'multi_molecule-simple.sdf'.
MULTI_MOLECULE_NUM_MOLECULES = 3
MULTI_MOLECULE_NAMES = ['297', '6324', '6334']

# Expected property columns for 'multi_molecule-properties.sdf'.
MULTI_MOLECULE_PROPS = SINGLE_MOLECULE_PROPS | NON_SHARED_PROPS
class TestSDF(object):
    """ Test class for sdf file parser """

    @staticmethod
    def _property_columns(df):
        """ Return the set of column names of `df` without 'structure'.

        `set.remove` is used on purpose: if there is no 'structure' column a
        KeyError is raised and the calling test fails. """
        columns = set(df.columns)
        columns.remove('structure')
        return columns

    def test_opening_with_file(self):
        """ Can an sdf file be opened with a file-like object? """
        path = resource('test_sdf', 'single_molecule-simple.sdf')
        with open(path, 'rb') as f:
            df = read_sdf(f)
            assert len(df) == 1

    def test_file_correct_structure(self):
        """ When opened with a file-like object, is the structure correct?

        Done by checking atom number (should be one, as rdkit ignores Hs by
        default). """
        path = resource('test_sdf', 'single_molecule-simple.sdf')
        with open(path, 'rb') as f:
            df = read_sdf(f)
            # Same expectation as the path-based variant of this test.
            assert df[SINGLE_MOLECULE_NAME].GetNumAtoms() \
                == SINGLE_MOLECULE_NUM_ATOMS

    def test_opening_with_path(self):
        """ Do we find a molecule in example file? """
        df = read_sdf(resource('test_sdf', 'single_molecule-simple.sdf'))
        assert len(df) == 1

    def test_path_correct_structure(self):
        """ When opened with a path, is the structure correct? """
        df = read_sdf(resource('test_sdf', 'single_molecule-simple.sdf'))
        assert df[SINGLE_MOLECULE_NAME].GetNumAtoms() \
            == SINGLE_MOLECULE_NUM_ATOMS

    def test_arg_forwarding(self):
        """ Check that kwargs can still be passed to the rdkit object """
        df = read_sdf(resource('test_sdf', 'single_molecule-simple.sdf'),
                      removeHs=False)
        assert df[SINGLE_MOLECULE_NAME].GetNumAtoms() \
            == SINGLE_MOLECULE_NUM_ATOMS_W_HS

    def test_single_index_detected(self):
        """ Does molecule have a name set to index? """
        df = read_sdf(resource('test_sdf', 'single_molecule-simple.sdf'))
        # A frame built without an explicit index gets pandas' default one;
        # the parsed index must differ from that.
        default_index = pd.DataFrame(['dummy']).index
        assert not (df.index == default_index).all()

    def test_single_index_correct(self):
        """ Is name correct? """
        single_molecule_df = read_sdf(
            resource('test_sdf', 'single_molecule-simple.sdf'))
        assert single_molecule_df.index[0] == SINGLE_MOLECULE_NAME

    def test_single_properties_detected(self):
        """ Does the dataframe have properties? """
        df = read_sdf(resource('test_sdf', 'single_molecule-properties.sdf'))
        assert len(self._property_columns(df)) > 1

    def test_single_properties_correct(self):
        """ Are they the right properties? """
        df = read_sdf(resource('test_sdf', 'single_molecule-properties.sdf'))
        assert self._property_columns(df) == SINGLE_MOLECULE_PROPS

    def test_multi_parsed(self):
        """ Do we find right number of molecules? """
        df = read_sdf(resource('test_sdf', 'multi_molecule-simple.sdf'))
        assert df.shape[0] == MULTI_MOLECULE_NUM_MOLECULES

    def test_multi_index_detected(self):
        """ Is index set? """
        df = read_sdf(resource('test_sdf', 'multi_molecule-simple.sdf'))
        # Default pandas index of the same length, to compare against.
        dummy_df = pd.DataFrame(['dummy'] * MULTI_MOLECULE_NUM_MOLECULES)
        assert not (df.index == dummy_df.index).all()

    def test_multi_index_correct(self):
        """ Is it the right index? """
        df = read_sdf(resource('test_sdf', 'multi_molecule-simple.sdf'))
        assert (df.index == MULTI_MOLECULE_NAMES).all()

    def test_multi_diff_properties(self):
        """ If there are properties not common for all, are they all
        detected? """
        df = read_sdf(resource('test_sdf', 'multi_molecule-properties.sdf'))
        assert self._property_columns(df) == MULTI_MOLECULE_PROPS

    def test_bad_structure(self):
        """ Does it throw an error if bad structures are given? """
        path = resource('test_sdf', 'multi_molecule-bad_structure.sdf')
        with pytest.raises(ValueError):
            read_sdf(path, error_bad_mol=True)