Source code for examol.store.models

"""Data models used for molecular data"""
from dataclasses import dataclass
from hashlib import md5
from datetime import datetime
from typing import Collection

import ase
import numpy as np
from pydantic import BaseModel, Field
from rdkit import Chem

from examol.simulate.base import SimResult
from examol.utils.chemistry import parse_from_molecule_string
from examol.utils.conversions import read_from_string, write_to_string


[docs] @dataclass class MissingData(ValueError): """No conformer or energy with the desired settings was found""" config_name: str = ... """Configuration used to compute the energy""" charge: int = ... """Charge used when computing the energy""" solvent: str | None = ... """Solvent used, if any""" def __str__(self): return f'No data for config={self.config_name} charge={self.charge} solvent={self.solvent}'
[docs] class Identifiers(BaseModel): """IDs known for a molecule""" smiles: str """A SMILES string""" inchi: str """The InChI string""" pubchem_id: int | None = None """PubChem ID, if known"""
[docs] class EnergyEvaluation(BaseModel): """Energy of a conformer under a certain condition""" energy: float """Energy of the conformer (eV)""" config_name: str """Configuration used to compute the energy""" charge: int """Charge used when computing the energy""" solvent: str | None """Solvent used, if any""" completed: datetime = Field(default_factory=lambda: datetime.utcnow()) """When this energy computation was added""" def __eq__(self, other): if not isinstance(other, self.__class__): return False return self.config_name == other.config_name \ and self.charge == other.charge \ and self.solvent == other.solvent
[docs] class Conformer(BaseModel): """Describes a single conformer of a molecule""" # Define the structure xyz: str """XYZ-format description of the atomic coordinates""" xyz_hash: str """MDF hash of the XYZ coordinates""" # Provenance of the structure date_created: datetime """Date this conformer was inserted""" source: str | None = None """Method used to generate this structure (e.g., via relaxation)""" config_name: str | None = None """Configuration used to relax the structure, if applicable""" charge: int """Charge used when relaxing the structure""" # Energies of the structure energies: list[EnergyEvaluation] = Field(default_factory=list) """List of energies for this structure""" @property def atoms(self) -> ase.Atoms: return read_from_string(self.xyz, 'xyz')
[docs] @classmethod def from_simulation_result(cls, sim_result: SimResult, source: str = 'relaxation') -> 'Conformer': """Create a new object from a simulation Args: sim_result: Simulation result source: How this conformer was determined Returns: An initialized conformer record which includes energies """ new_record = cls.from_xyz(sim_result.xyz, source=source, config_name=sim_result.config_name, charge=sim_result.charge) new_record.add_energy(sim_result) return new_record
[docs] @classmethod def from_xyz(cls, xyz: str, **kwargs): """Create a new object from a XYZ-format object Args: xyz: XYZ-format description of the molecule Returns: An initialized conformer object """ # Make sure the simulation results is center atoms = read_from_string(xyz, 'xyz') atoms.center() xyz = write_to_string(atoms, 'xyz') return cls( xyz=xyz, xyz_hash=md5(xyz.encode()).hexdigest(), date_created=datetime.now(), **kwargs )
[docs] def add_energy(self, sim_result: SimResult) -> bool: """Add the energy from a simulation result Args: sim_result: Result to be added """ new_energy = EnergyEvaluation( energy=sim_result.energy, config_name=sim_result.config_name, charge=sim_result.charge, solvent=sim_result.solvent, ) if new_energy in self.energies: return False self.energies.append(new_energy) return True
[docs] def get_energy_index(self, config_name: str, charge: int, solvent: str | None) -> int | None: """Get the index of the record for a certain level of energy Args: config_name: Name of the compute configuration charge: Charge of the molecule solvent: Solvent in which the molecule is dissolved Returns: Index of the record, if available, or ``None``, if not. """ for i, record in enumerate(self.energies): if record.config_name == config_name and \ record.charge == charge and \ record.solvent == solvent: return i return None
[docs] def get_energy(self, config_name: str, charge: int, solvent: str | None) -> float: """Get the energy for a certain level Args: config_name: Name of the compute configuration charge: Charge of the molecule solvent: Solvent in which the molecule is dissolved Returns: Energy of the target conformer Raises: NoSuchConformer: If there is no such energy for this conformer """ ind = self.get_energy_index(config_name, charge, solvent) if ind is None: raise MissingData(config_name, charge, solvent) return self.energies[ind].energy
[docs] class MoleculeRecord(BaseModel): """Defines whatever we know about a molecule""" # Identifiers key: str = Field(min_length=27, max_length=27) """InChI key""" identifier: Identifiers """Collection of identifiers which define the molecule""" names: list[str] = Field(default_factory=list) """Names this molecule is known by""" subsets: list[str] = Field(default_factory=list) """List of subsets this molecule is part of""" # Data about the molecule conformers: list[Conformer] = Field(default_factory=list) """All known conformers for this molecule""" # Characteristics properties: dict[str, dict[str, float]] = Field(default_factory=dict) """Properties available for the molecule"""
[docs] @classmethod def from_identifier(cls, mol_string: str): """Parse the molecule from either the SMILES or InChI string Args: mol_string: Molecule to parse Returns: Empty record for this molecule """ # Load in the molecule mol = parse_from_molecule_string(mol_string) # Create the object return cls(key=Chem.MolToInchiKey(mol), identifier=Identifiers(smiles=Chem.MolToSmiles(mol), inchi=Chem.MolToInchi(mol)))
[docs] def add_energies(self, result: SimResult, opt_steps: Collection[SimResult] = (), match_tol: float = 1e-3) -> bool: """Add a new set of energies to a structure Will add a new conformer if the structure does not yet exist If provided, will match the energies of any materials within the optimization steps Args: result: Energy computation to be added opt_steps: Optimization steps, if available match_tol: Maximum absolute difference between XYZ coordinates to match Returns: Whether a new conformer was added """ # Prepare to match conformers conf_pos = [c.atoms.positions for c in self.conformers] def _match_conformers(atoms: ase.Atoms) -> int | None: atoms = atoms.copy() atoms.center() positions = atoms.positions for i, c in enumerate(conf_pos): if np.isclose(positions, c, atol=match_tol).all(): return i # First try to add optimization steps for step in opt_steps: if (match_id := _match_conformers(step.atoms)) is not None: self.conformers[match_id].add_energy(step) # Get the atomic positions for me my_match = _match_conformers(result.atoms) if my_match is None: self.conformers.append(Conformer.from_simulation_result(result)) return True else: self.conformers[my_match].add_energy(result) return False
[docs] def find_lowest_conformer(self, config_name: str, charge: int, solvent: str | None, optimized_only: bool = True) -> tuple[Conformer, float]: """Get the energy of the lowest-energy conformer of a molecule in a certain state Args: config_name: Name of the compute configuration charge: Charge of the molecule solvent: Solvent in which the molecule is dissolved optimized_only: Only match conformers which were optimized with the specified configuration and charge Returns: - Lowest-energy conformer - Energy of the structure (eV) Raises: NoSuchConformer: If we lack a conformer with these settings """ # Output results lowest_energy: float = np.inf stable_conformer: Conformer | None = None # Check all conformers for conf in self.conformers: if optimized_only and (conf.config_name != config_name or conf.charge != charge): continue energy_ind = conf.get_energy_index(config_name, charge, solvent) if energy_ind is not None and conf.energies[energy_ind].energy < lowest_energy: stable_conformer = conf lowest_energy = conf.energies[energy_ind].energy if stable_conformer is None: raise MissingData(config_name, charge, solvent) return stable_conformer, lowest_energy