"""Algorithms which select molecules based on k-means clustering"""importnumpyasnpfromsklearn.clusterimportKMeansfrom.baseimportStarterfromexamol.score.rdkit.descriptorsimportcompute_morgan_fingerprintsfromscipy.spatial.distanceimportcdist
class KMeansStarter(Starter):
    """Select structurally distinct molecules by picking molecules at the centers of clusters

    Each candidate is described by its Morgan fingerprint. The fingerprints are
    grouped into as many k-means clusters as molecules requested, and the
    candidate nearest (Euclidean distance) to each cluster center is selected.
    """

    def _select(self, to_select: list[str], count: int) -> list[str]:
        """Pick one molecule per k-means cluster

        Args:
            to_select: Candidate molecules, each passed to ``compute_morgan_fingerprints``
            count: Number of molecules to pick; also used as the number of clusters
        Returns:
            Entries of ``to_select``, one per cluster center, in cluster order.
            No position of ``to_select`` is returned more than once.
        """
        # Compute Morgan fingerprints, kept in the same order as ``to_select``
        fingerprints = [compute_morgan_fingerprints(smiles) for smiles in to_select]

        # Run KMeans clustering on the fingerprints
        # NOTE(review): no ``random_state`` is passed, so the clustering (and
        #  therefore the selection) presumably varies between runs - confirm this is intended
        kmeans = KMeans(n_clusters=count).fit(fingerprints)
        cluster_centers = kmeans.cluster_centers_

        # Distance between every fingerprint (rows) and every cluster center (columns)
        distances = cdist(fingerprints, cluster_centers, 'euclidean')

        # For each cluster center, take the nearest molecule that has not been taken yet.
        #  Without the masking step, a molecule which is nearest to several centers
        #  (e.g., when candidates share a fingerprint and centers coincide) would be
        #  returned repeatedly, yielding fewer than ``count`` distinct molecules.
        closest_molecules = []
        for center_id in range(count):
            closest_index = int(np.argmin(distances[:, center_id]))
            closest_molecules.append(to_select[closest_index])

            # Make this molecule unavailable to the remaining centers
            distances[closest_index, :] = np.inf
        return closest_molecules