"""Interfaces for selector classes"""importheapqimportloggingfromitertoolsimportchainfromtypingimportIterator,Sequenceimportnumpyasnpfromexamol.store.db.baseimportMoleculeStorefromexamol.store.recipesimportPropertyRecipelogger=logging.getLogger(__name__)def_extract_observations(database:MoleculeStore,recipes:Sequence[PropertyRecipe])->np.ndarray:"""Get an array of observations from the training set Args: database: Database of molecular records to process recipes: List of recipes to extract Returns: Properties for all molecules which have values for all recipes. Shape: (num molecules) x (num recipes) """output=[]forrecordindatabase.iterate_over_records():ifnotall(recipe.lookup(record)isnotNoneforrecipeinrecipes):continueoutput.append([recipe.lookup(record)forrecipeinrecipes])returnnp.array(output)
class Selector:
    """Base class for selection algorithms

    **Using a Selector**

    Selectors function in two phases: gathering and dispensing.

    Selectors are in the gathering phase when first created.
    Add potential computations in batches with :meth:`add_possibilities`,
    which takes a list of keys describing the computations
    and a distribution of probable scores (e.g., predictions from different
    models in an ensemble) for each computation.
    Sample arrays are 3D and shaped ``num_recipes x num_samples x num_models``

    The dispensing phase starts by calling :meth:`dispense`.
    ``dispense`` generates a selected computation from the list of keys
    acquired during gathering phase paired with a score.
    Selections are generated from highest to lowest priority.

    **Creating a Selector**

    You must implement three operations:

    - :meth:`start_gathering`, which is called at the beginning of a gathering
      phase and must clear state from the previous selection round.
    - ``_add_possibilities``, which is called by :meth:`add_possibilities`
      after the inputs are validated and updates the state of a selection to
      account for a new batch of computations.
      For example, you could update a ranked list of best-scored computations.
    - ``_dispense``, which is called by :meth:`dispense` and generates
      :attr:`to_select` selections in ranked order from best to worst
    """

    multiobjective: bool = False
    """Whether the selector supports multi-objective optimization"""

    def __init__(self, to_select: int):
        """
        Args:
            to_select: Target number of computations to select
        """
        self.to_select: int = to_select
        """Number of computations to select"""
        self.gathering: bool = True
        """Whether the selector is waiting to accept more possibilities."""
        self.start_gathering()

    def start_gathering(self):
        """Prepare to gather new batches potential computations"""
        self.gathering = True

    def add_possibilities(self, keys: list, samples: np.ndarray, **kwargs):
        """Add potential options to be selected

        Args:
            keys: Labels by which to identify the records being evaluated
            samples: A distribution of scores for each record.
                Expects a 3-dimensional array of shape (num recipes) x (num records) x (num models)
        Raises:
            ValueError: If ``samples`` is not 3-dimensional, if more than one objective
                is provided to a selector which is not multi-objective,
                or if the number of keys does not match the number of records
        """
        # Test for error conditions
        #  Check the rank first: the shape-based checks below are only meaningful for a 3D array
        if samples.ndim != 3:  # pragma: no-coverage
            raise ValueError(f'Expected samples dimension of 3. Found {samples.ndim}. Array should be (recipe, records, model)')
        if samples.shape[0] > 1 and not self.multiobjective:
            raise ValueError(f'Provided {samples.shape[0]} objectives but the class does not support multi-objective selection')
        if samples.shape[1] != len(keys):  # pragma: no-coverage
            raise ValueError(f'Number of keys and number of samples differ. Keys={len(keys)}. Samples={samples.shape[1]}')

        # Do the work
        if not self.gathering:
            logger.info('Switching selector back to gathering phase. Clearing any previous selection information')
            self.start_gathering()
        self._add_possibilities(keys, samples, **kwargs)

    def update(self, database: MoleculeStore, recipes: Sequence[PropertyRecipe]):
        """Update the selector given the current database

        The base implementation does nothing.

        Args:
            database: Known molecules
            recipes: Recipe being optimized
        """
        pass

    def dispense(self) -> Iterator[tuple[object, float]]:
        """Dispense selected computations from highest- to least-rated.

        Yields:
            A pair of "selected computation" (as identified by the keys provided originally)
            and a score.
        """
        # This is a generator: the phase flag only flips once iteration begins
        self.gathering = False
        yield from self._dispense()
class RankingSelector(Selector):
    """Base class where we assign an independent score to each possibility.

    Implementations should assume that the goal is maximization because this
    abstract class negates the samples for objectives which are to be minimized.

    Scores are produced by ``_assign_score``, which is called with the
    (possibly negated) samples and must be supplied by the implementation.

    Args:
        to_select: How many computations to select per batch
        maximize: Whether to select entries with high or low values of the samples.
            Provide either a single value if maximizing or minimizing all objectives,
            or a list for whether to maximize each objectives.
    """

    def __init__(self, to_select: int, maximize: bool | Sequence[bool] = True):
        # Set our own state before the parent constructor runs, as it calls ``start_gathering``
        self._options: list[tuple[object, float]] = []
        self.maximize = maximize
        super().__init__(to_select)

    def _add_possibilities(self, keys: list, samples: np.ndarray, **kwargs):
        """Score a new batch and keep only the best :attr:`to_select` options seen so far

        Args:
            keys: Labels by which to identify the records being evaluated
            samples: Scores shaped (num recipes) x (num records) x (num models)
        Raises:
            ValueError: If ``maximize`` is a sequence whose length differs from the number of recipes
        """
        # Determine user options for minimization
        n_objectives = samples.shape[0]
        maximize = self.maximize
        if isinstance(maximize, (bool, np.bool_)):  # NumPy boolean scalars have no ``len``
            maximize = [bool(maximize)] * n_objectives
        elif len(maximize) != n_objectives:  # pragma: no-cover
            raise ValueError(f'Different number of recipes ({n_objectives}) and number of maximization selections ({len(maximize)})')

        # Negate, if needed
        if not all(maximize):
            # Work on a copy so the caller's array is left untouched
            samples = samples.copy()
            for i, m in enumerate(maximize):
                if not m:
                    samples[i, :, :] *= -1

        # Merge the new batch with the current best options and keep the top-scoring entries
        score = self._assign_score(samples)
        self._options = heapq.nlargest(self.to_select, chain(self._options, zip(keys, score)), key=lambda x: x[1])

    def _dispense(self) -> Iterator[tuple[object, float]]:
        """Yield the retained (key, score) pairs, which ``heapq.nlargest`` leaves sorted from highest score to lowest"""
        yield from self._options