Source code for examol.select.base

"""Interfaces for selector classes"""
import heapq
import logging
from itertools import chain
from typing import Iterator, Sequence

import numpy as np

from examol.store.db.base import MoleculeStore
from examol.store.recipes import PropertyRecipe

logger = logging.getLogger(__name__)


def _extract_observations(database: MoleculeStore, recipes: Sequence[PropertyRecipe]) -> np.ndarray:
    """Get an array of observations from the training set

    Args:
        database: Database of molecular records to process
        recipes: List of recipes to extract
    Returns:
        Properties for all molecules which have values for all recipes. Shape: (num molecules) x (num recipes)
    """

    output = []
    for record in database.iterate_over_records():
        if not all(recipe.lookup(record) is not None for recipe in recipes):
            continue
        output.append([recipe.lookup(record) for recipe in recipes])
    return np.array(output)


class Selector:
    """Base class for selection algorithms

    **Using a Selector**

    Selectors function in two phases: gathering and dispensing. Selectors are in the gathering phase when first created.

    Add potential computations in batches with :meth:`add_possibilities`,
    which takes a list of keys describing the computations
    and a distribution of probable scores (e.g., predictions from different models in an ensemble) for each computation.
    Sample arrays are 3D and shaped ``num_recipes x num_records x num_models``.

    The dispensing phase starts by calling :meth:`dispense`.
    ``dispense`` yields computations from the list of keys acquired during the gathering phase,
    each paired with a score. Selections are generated from highest to lowest priority.

    **Creating a Selector**

    You must implement three operations:

    - :meth:`start_gathering`, which is called at the beginning of a gathering phase
      and must clear state from the previous selection round.
    - :meth:`add_possibilities` updates the state of a selection to account for a new batch of computations.
      For example, you could update a ranked list of the best-scored computations.
    - :meth:`dispense` generates up to :attr:`to_select` computations in ranked order, from best to worst.
    """

    multiobjective: bool = False
    """Whether the selector supports multi-objective optimization"""

    def __init__(self, to_select: int):
        """
        Args:
            to_select: Target number of computations to select
        """
        self.to_select: int = to_select
        """Number of computations to select"""
        self.gathering: bool = True
        """Whether the selector is waiting to accept more possibilities."""
        self.start_gathering()

    def start_gathering(self):
        """Prepare to gather new batches of potential computations"""
        self.gathering = True

    def add_possibilities(self, keys: list, samples: np.ndarray, **kwargs):
        """Add potential options to be selected

        Args:
            keys: Labels by which to identify the records being evaluated
            samples: A distribution of scores for each record.
                Expects a 3-dimensional array of shape (num recipes) x (num records) x (num models)
        """
        # Test for error conditions
        if samples.shape[0] > 1 and not self.multiobjective:
            raise ValueError(f'Provided {samples.shape[0]} objectives but the class does not support multi-objective selection')
        if samples.ndim != 3:  # pragma: no-coverage
            raise ValueError(f'Expected samples dimension of 3. Found {samples.ndim}. Array should be (recipe, records, model)')
        if samples.shape[1] != len(keys):  # pragma: no-coverage
            raise ValueError(f'Number of keys and number of samples differ. Keys={len(keys)}. Samples={samples.shape[1]}')

        # Do the work
        if not self.gathering:
            logger.info('Switching selector back to gathering phase. Clearing any previous selection information')
            self.start_gathering()

        self._add_possibilities(keys, samples, **kwargs)

    def _add_possibilities(self, keys: list, samples: np.ndarray, **kwargs):
        raise NotImplementedError()

    def update(self, database: MoleculeStore, recipes: Sequence[PropertyRecipe]):
        """Update the selector given the current database

        Args:
            database: Known molecules
            recipes: Recipes being optimized
        """
        pass

    def dispense(self) -> Iterator[tuple[object, float]]:
        """Dispense the selected computations from highest- to lowest-rated.

        Yields:
            A pair of "selected computation" (as identified by the keys provided originally) and its score.
        """
        self.gathering = False
        yield from self._dispense()

    def _dispense(self) -> Iterator[tuple[object, float]]:
        raise NotImplementedError()
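

# What follows is a short, hedged sketch (not part of examol) of the two-phase protocol
# described in the class docstring above. Any concrete Selector works here; the caller
# supplies ``selector``, ``keys``, and ``samples``, so nothing beyond the documented API
# is assumed.
def _demonstrate_selection(selector: Selector, keys: list, samples: np.ndarray) -> list[tuple[object, float]]:
    """Run one gather/dispense cycle and return the ranked selections"""
    selector.start_gathering()  # Clear any state left over from a previous round
    selector.add_possibilities(keys, samples)  # Gathering phase: may be called repeatedly with new batches
    return list(selector.dispense())  # Dispensing phase: (key, score) pairs from best to worst

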
class RankingSelector(Selector):
    """Base class for selectors which assign an independent score to each possibility.

    Implementations should assume that the goal is maximization,
    because this abstract class negates the samples of any objective that is to be minimized.

    Args:
        to_select: How many computations to select per batch
        maximize: Whether to select entries with high or low values of the samples.
            Provide a single value to maximize or minimize all objectives,
            or a list stating whether to maximize each objective.
    """

    def __init__(self, to_select: int, maximize: bool | Sequence[bool] = True):
        self._options: list[tuple[object, float]] = []
        self.maximize = maximize
        super().__init__(to_select)

    def _add_possibilities(self, keys: list, samples: np.ndarray, **kwargs):
        # Determine the user's options for minimization
        n_objectives = samples.shape[0]
        maximize = self.maximize
        if isinstance(maximize, bool):
            maximize = [maximize] * n_objectives
        elif len(maximize) != n_objectives:  # pragma: no-cover
            raise ValueError(f'Number of recipes ({n_objectives}) and number of maximization selections ({len(maximize)}) differ')

        # Negate, if needed
        if not all(maximize):
            samples = samples.copy()
            for i, m in enumerate(maximize):
                if not m:
                    samples[i, :, :] *= -1

        score = self._assign_score(samples)
        self._options = heapq.nlargest(self.to_select, chain(self._options, zip(keys, score)), key=lambda x: x[1])

    def _dispense(self) -> Iterator[tuple[object, float]]:
        yield from self._options

    def start_gathering(self):
        super().start_gathering()
        self._options.clear()

    def _assign_score(self, samples: np.ndarray) -> np.ndarray:
        raise NotImplementedError()
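

# Below is a minimal, hypothetical sketch of a concrete RankingSelector, assuming the
# array conventions documented above. ``_MeanGreedySelector`` is not part of examol;
# it simply ranks each record by its mean predicted score across recipes and models,
# which is one crude way to collapse an ensemble (and several objectives) to a scalar.
class _MeanGreedySelector(RankingSelector):
    """Rank records by the mean of their predicted scores"""

    multiobjective = True  # Averaging across recipes scalarizes multiple objectives

    def _assign_score(self, samples: np.ndarray) -> np.ndarray:
        # Collapse the (num recipes) x (num records) x (num models) array to one score per record
        return samples.mean(axis=(0, 2))

# For example (names hypothetical): ``_demonstrate_selection(_MeanGreedySelector(to_select=5), keys, samples)``
# would return the five records with the highest mean prediction.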