# Source code for examol.select.botorch

"""Employ the acquisition functions from `BOTorch <https://botorch.org/>`_"""
from typing import List, Any, Callable, Sequence, Optional

from botorch.acquisition.multi_objective import qExpectedHypervolumeImprovement
from botorch.acquisition.objective import PosteriorTransform
from botorch.posteriors import Posterior, GPyTorchPosterior
from botorch.sampling import SobolQMCNormalSampler
from botorch.utils.multi_objective.box_decompositions import FastNondominatedPartitioning
from gpytorch.distributions import MultitaskMultivariateNormal

try:
    from botorch.acquisition import AcquisitionFunction
except ImportError as e:  # pragma: no-coverage
    raise ImportError('You may need to install BOTorch and PyTorch') from e
from botorch.models.model import Model
from torch import Tensor
import numpy as np
import torch

from examol.select.base import RankingSelector, _extract_observations
from examol.store.recipes import PropertyRecipe
from examol.store.db.base import MoleculeStore


class EnsembleCovarianceModel(Model):
    """Model which generates a multivariate Gaussian distribution given samples from an ensemble of models

    Each "design point" passed to :meth:`posterior` is the set of predictions from every ensemble
    member for every output, flattened into a single feature dimension of size
    (num outputs x ensemble size). The posterior's mean and covariance are estimated empirically
    from the ensemble, with a small diagonal regularization to keep the covariance positive definite.
    """

    def __init__(self, num_outputs: int, c: float = 1e-6):
        """
        Args:
            num_outputs: Number of outputs for the model
            c: Regularization parameter used to ensure covariance matrix is positive definite
        """
        super().__init__()
        self._num_outputs = num_outputs
        # Added to the diagonal of the empirical covariance so Cholesky factorization cannot fail
        self.regularization = torch.eye(num_outputs) * c

    @property
    def num_outputs(self) -> int:
        """Number of outputs produced by the model"""
        return self._num_outputs

    def posterior(
            self,
            X: Tensor,
            output_indices: Optional[List[int]] = None,
            observation_noise: bool = False,
            posterior_transform: Optional[PosteriorTransform] = None,
            **kwargs: Any,
    ) -> Posterior:
        """Build a multivariate normal posterior from ensemble predictions

        Args:
            X: Design points as a ``b x q x (o * s)`` tensor, where ``b`` is a batch dimension
                used for efficiency's sake, ``q`` is the number of points considered together
                (the batch size for active learning's sake), ``o`` is the number of outputs,
                and ``s`` is the ensemble size
            output_indices: Unused; present to satisfy the :class:`Model` interface
            observation_noise: Unused; present to satisfy the :class:`Model` interface
            posterior_transform: Optional transform applied to the posterior before returning
        Returns:
            Posterior describing the distribution of ensemble predictions at each design point
        """
        # Reshape X from (b x q x d) -> (b x q x o x s)
        x_reshaped = torch.unflatten(X, dim=-1, sizes=(self._num_outputs, -1))

        # Compute the mean of each output over the ensemble
        #  GPyTorch wants this as a (b x q x o) matrix
        #  NOTE: ``dim`` is the torch-native spelling; the original ``axis`` is only a numpy-compat alias
        means = torch.mean(x_reshaped, dim=-1)

        # Compute the covariance between points in a batch (q)
        #  GPyTorch wants this as a (b x (q x o) x (q x o)) matrix
        #  Thanks: https://github.com/pytorch/pytorch/issues/19037
        centered = x_reshaped - means.unsqueeze(-1)  # b x q x o x s
        combined_task_and_samples = torch.flatten(centered, start_dim=1, end_dim=2)  # b x (q x o) x s
        ensemble_size = combined_task_and_samples.shape[-1]
        # Unbiased sample covariance, regularized to stay positive definite
        cov = combined_task_and_samples @ combined_task_and_samples.transpose(-1, -2) / (ensemble_size - 1)
        cov = cov + self.regularization

        # Make the multivariate normal as an output
        posterior = GPyTorchPosterior(
            distribution=MultitaskMultivariateNormal(mean=means, covariance_matrix=cov, validate_args=True)
        )
        if posterior_transform is not None:
            return posterior_transform(posterior)
        return posterior
class BOTorchSequentialSelector(RankingSelector):
    """Use an acquisition function from BOTorch to score candidates assuming a pool size of :math:`q=1`

    Provide the acquisition function type and any options needed to configure it.
    Options can be updated by supplying a function which updates them based on the properties
    of molecules which have been evaluated so far.
    For example, Expected Improvement which updates the maximum observed value would be

    .. code-block:: python

        def update_fn(selector: 'BOTorchSequentialSelector', obs: np.ndarray) -> dict:
            return {'best_f': max(obs) if selector.maximize else min(obs)}

        selector = BOTorchSequentialSelector(qExpectedImprovement,
                                             acq_options={'best_f': 0.5},
                                             acq_options_updater=update_fn,
                                             to_select=1)

    Args:
        acq_function_type: Class of the acquisition function
        acq_options: Dictionary of options passed to the acquisition function maker
        acq_options_updater: Function which takes the current selector and an array of observations
            of shape (num molecules) x (num recipes)
        maximize: Whether to maximize or minimize the objectives
        to_select: Number of top candidates to select each round
    """

    multiobjective = True

    def __init__(self,
                 acq_function_type: type[AcquisitionFunction],
                 acq_options: dict[str, object],
                 to_select: int,
                 acq_options_updater: Callable[['BOTorchSequentialSelector', np.ndarray], dict] | None = None,
                 maximize: bool = True):
        self.acq_function: AcquisitionFunction | None = None  # Built lazily by :meth:`update`
        self.acq_function_type = acq_function_type
        self.acq_options = acq_options.copy()
        self.acq_options_updater = acq_options_updater
        super().__init__(to_select, maximize)

    def update(self, database: MoleculeStore, recipes: Sequence[PropertyRecipe]):
        # Refresh the acquisition options from the observations made so far, if an updater was supplied
        if self.acq_options_updater is not None:
            observations = _extract_observations(database, recipes)
            self.acq_options = self.acq_options_updater(self, observations)

        # Rebuild the acquisition function around a fresh covariance model sized to the recipes
        self.acq_function = self.acq_function_type(model=EnsembleCovarianceModel(len(recipes)), **self.acq_options)

    def _assign_score(self, samples: np.ndarray) -> np.ndarray:
        # ``samples`` arrives as an (objectives x samples x models) array;
        #  BOTorch's GPyTorch model expects (samples x batch size (q) x (objectives x models)) with q = 1
        num_objectives, num_samples, num_models = samples.shape
        by_sample = samples.transpose((1, 0, 2))  # s x o x m
        flattened = by_sample.reshape(num_samples, 1, num_objectives * num_models)  # s x q x (o x m)
        scores = self.acq_function(torch.from_numpy(flattened))
        return scores.detach().cpu().numpy()
def _evhi_update_fn(selector: 'EHVISelector', obs: np.ndarray) -> dict: """Options update function used for EVHI Args: selector: Selector obs: Observations of target properties """ # Determine the reference point based on the observations # We should use the minim options = selector.acq_options.copy() if not isinstance(selector.maximize, bool): obs = obs.copy() for i, m in enumerate(selector.maximize): if not m: obs[:, i] *= -1 elif not selector.maximize: obs = obs * -1 options['ref_point'] = obs.min(axis=0) # Set up the partitioning options['partitioning'] = FastNondominatedPartitioning( ref_point=torch.from_numpy(options['ref_point']), Y=torch.from_numpy(obs), ) return options
class EHVISelector(BOTorchSequentialSelector):
    """Rank entries based on the Expected Hypervolume Improvement (EVHI)

    EVHI is a multi-objective optimization score which measures how much a new point
    will expand the Pareto surface.
    We use the Monte Carlo implementation of EVHI of `Daulton et al. <https://arxiv.org/abs/2006.05078>`_,
    but do not yet support the algorithm's batch-aware implementation.

    Constructing the Pareto surface requires the definition of a reference point where farther
    from the reference point is better.
    We use the minimum value of objectives which are being maximized
    and the maximum value of those being minimized.

    Args:
        maximize: Whether to maximize or minimize the objectives
        to_select: Number of top candidates to select each round
    """

    def __init__(self, to_select: int, maximize: bool | Sequence[bool] = True):
        super().__init__(
            acq_function_type=qExpectedHypervolumeImprovement,
            acq_options_updater=_evhi_update_fn,
            acq_options={'sampler': SobolQMCNormalSampler(sample_shape=torch.Size([128]))},  # TODO (wardlt): Make sampler configurable
            to_select=to_select,
            maximize=maximize
        )