Source code for PFASGroups.prioritise

"""
PFAS Molecule Prioritization Module
====================================

This module provides functions to prioritize PFAS molecules based on:
1. Distributional similarity to a reference list (using fingerprint KL divergence)
2. Fluorinated component characteristics (total fluorine content and component size distribution)

Author: HalogenGroups
Version: 2.2.4
"""

from typing import Union, List, Tuple, Optional, Dict
import numpy as np
from rdkit import Chem

from .parser import parse_smiles, parse_mols
from .PFASEmbeddings import PFASEmbeddingSet



[docs]
def prioritise_molecules(
    molecules: Union[List[str], List[Chem.Mol], PFASEmbeddingSet],
    reference: Optional[Union[List[str], List[Chem.Mol], PFASEmbeddingSet]] = None,
    group_selection: str = 'all',
    count_mode: str = 'max_component',
    halogens: Union[str, List[str]] = 'F',
    saturation: Optional[str] = None,
    a: float = 1.0,
    b: float = 1.0,
    percentile: float = 90.0,
    return_scores: bool = True,
    ascending: bool = False,
    progress: bool = False,
) -> Union[PFASEmbeddingSet, Tuple[PFASEmbeddingSet, np.ndarray]]:
    """
    Prioritize PFAS molecules based on similarity to a reference or intrinsic properties.

    Parameters
    ----------
    molecules : list of str, list of rdkit.Chem.Mol, or PFASEmbeddingSet
        Molecules to prioritize. Can be:
        - List of SMILES strings
        - List of RDKit molecule objects
        - PFASEmbeddingSet object (pre-computed results)

    reference : list of str, list of rdkit.Chem.Mol, PFASEmbeddingSet, or None
        Reference molecules for similarity comparison. If provided, molecules are
        prioritized by distributional similarity (lower KL divergence = higher priority).
        If None, prioritization is based on intrinsic fluorinated component properties.

    group_selection : str, default 'all'
        PFAS group selection for fingerprint generation when using reference:
        - 'all': All 115 groups (OECD + generic)
        - 'oecd': OECD-defined groups (1-28)
        - 'generic': Generic functional groups (29-115)
        - 'telomers': Telomer-related groups
        - 'generic+telomers': Combined selection

    count_mode : str, default 'binary'
        Fingerprint encoding mode when using reference:
        - 'binary': 1 if present, 0 if absent
        - 'count': Number of matches
        - 'max_component': Maximum component size

    halogens : str or list of str, default 'F'
        Which halogen(s) to include when generating fingerprints for reference
        comparison.  Passed directly to ``PFASEmbeddingSet.to_fingerprint``.

    saturation : str or None, default None
        Saturation filter applied to component SMARTS when generating
        fingerprints.  ``None`` (default) includes both per- and
        polyfluorinated / polyhalogenated components, which gives the broadest
        coverage and avoids zero scores for candidates that only contain
        polyfluorinated chains.  Pass ``'per'`` or ``'poly'`` to restrict.

    a : float, default 1.0
        Weight for total fluorinated component size (sum of all component sizes).
        Used when reference is None. Higher values prioritize molecules with more
        total fluorination.

    b : float, default 1.0
        Weight for component size percentile. Used when reference is None.
        Higher values prioritize molecules with larger individual fluorinated components.

    percentile : float, default 90.0
        Percentile value (0-100) for component size distribution. Used when reference
        is None. Common values:
        - 90.0: Focus on largest 10% of components
        - 75.0: Focus on largest 25% of components
        - 50.0: Median component size

    return_scores : bool, default True
        If True, returns tuple of (prioritized_results, scores).
        If False, returns only prioritized_results.

    ascending : bool, default False
        Sort order. If False (default), highest priority first.
        If True, lowest priority first.

    Returns
    -------
    PFASEmbeddingSet or tuple
        If return_scores=True: (prioritized_results, scores)
        If return_scores=False: prioritized_results only

        prioritized_results : PFASEmbeddingSet
            Molecules sorted by priority
        scores : np.ndarray
            Priority scores for each molecule

    Examples
    --------
    # Priority by similarity to reference list
    >>> from PFASGroups import prioritise_molecules
    >>> inventory = ["FC(F)(F)C(F)(F)C(=O)O", "FC(F)(F)C(F)(F)C(F)(F)C(=O)O"]
    >>> reference = ["FC(F)(F)C(F)(F)C(=O)O"]  # Known priority compounds
    >>> results, scores = prioritise_molecules(inventory, reference=reference)
    >>> print(f"Most similar: {results[0]['smiles']}")

    # Priority by fluorination characteristics
    >>> results, scores = prioritise_molecules(
    ...     inventory,
    ...     a=1.0,  # Weight for total fluorination
    ...     b=2.0,  # Weight for largest components
    ...     percentile=90
    ... )
    >>> print(f"Highest priority: {results[0]['smiles']}")

    # Emphasize total fluorine content
    >>> results = prioritise_molecules(inventory, a=2.0, b=0.5, return_scores=False)

    # Focus on molecules with longest chains
    >>> results = prioritise_molecules(inventory, a=0.5, b=2.0, percentile=95)

    Notes
    -----
    **Reference-based prioritization:**

    Uses cosine similarity between each candidate's fingerprint vector and the
    mean fingerprint of the reference set:

    score_i = (fp_i · mean_ref) / (||fp_i|| × ||mean_ref||)

    - Higher cosine similarity = more similar group profile to reference = higher priority
    - Molecules that activate the same PFAS groups as the reference rank highest
    - Molecules with no group matches receive score 0

    **Intrinsic prioritization (no reference):**

    Score = a × Σ(component_sizes) + b × percentile(component_sizes, p)

    Where:
    - Σ(component_sizes): Total number of fluorinated carbons across all components
    - percentile(component_sizes, p): Size of fluorinated components at pth percentile

    This approach prioritizes molecules based on:
    - Total fluorination burden (a parameter)
    - Presence of long perfluorinated chains (b and percentile parameters)

    **Tuning guidelines:**

    For environmental persistence concerns:
    - High b, high percentile (e.g., b=2.0, p=90): Long-chain compounds

    For bioaccumulation potential:
    - Balanced a and b (e.g., a=1.0, b=1.0): Both total and chain length

    For screening priority:
    - High a, moderate b (e.g., a=2.0, b=1.0, p=75): Total fluorine load

    See Also
    --------
    PFASEmbeddingSet.to_array : Convert results to fingerprints
    PFASEmbedding.compare_kld : Compare embedding distributions
    """
    # Step 1: Convert input to PFASEmbeddingSet if needed
    if isinstance(molecules, PFASEmbeddingSet):
        results = molecules
    elif isinstance(molecules, list):
        if len(molecules) == 0:
            raise ValueError("molecules list is empty")

        # Check if list contains SMILES strings or Mol objects
        if isinstance(molecules[0], str):
            results = parse_smiles(molecules, progress=progress)
        elif isinstance(molecules[0], Chem.Mol):
            results = parse_mols(molecules, progress=progress)
        else:
            raise TypeError(
                f"molecules must be list of SMILES strings or RDKit Mol objects, "
                f"got {type(molecules[0])}"
            )
    else:
        raise TypeError(
            f"molecules must be list of SMILES/Mol objects or PFASEmbeddingSet, "
            f"got {type(molecules)}"
        )

    # Step 2: Compute priority scores
    if reference is not None:
        # Reference-based: cosine similarity of embedding vectors
        scores = _prioritise_by_reference(
            results, reference, group_selection, count_mode, halogens, saturation,
            progress=progress,
        )
    else:
        # Intrinsic: fluorinated component characteristics
        scores = _prioritise_by_components(results, a, b, percentile)

    # Step 3: Sort by priority
    sorted_indices = np.argsort(scores)
    if not ascending:
        sorted_indices = sorted_indices[::-1]

    # Create prioritized results
    prioritized_results = PFASEmbeddingSet([results[i] for i in sorted_indices])

    if return_scores:
        return prioritized_results, scores[sorted_indices]
    else:
        return prioritized_results



def _prioritise_by_reference(
    results: PFASEmbeddingSet,
    reference: Union[List[str], List[Chem.Mol], PFASEmbeddingSet],
    group_selection: str,
    count_mode: str,
    halogens: Union[str, List[str]] = 'F',
    saturation: Optional[str] = None,
    progress: bool = False,
) -> np.ndarray:
    """Prioritize by cosine similarity to the mean embedding of a reference set.

    Parameters
    ----------
    results : PFASEmbeddingSet
        Molecules to prioritize.
    reference : list of str, list of rdkit.Chem.Mol, or PFASEmbeddingSet
        Reference molecules.
    group_selection : str
        PFAS group selection ('all', 'oecd', 'generic', …).
    count_mode : str
        Count mode for the embedding ('binary', 'count', 'max_component', …).
    halogens : str or list of str, default 'F'
        Passed to parse_smiles / parse_mols for the reference.
    saturation : str or None, default None
        Passed to parse_smiles / parse_mols for the reference.
    progress : bool, default False

    Returns
    -------
    np.ndarray
        Priority scores in [0, 1] (higher = more similar to reference).
    """
    # Convert reference to PFASEmbeddingSet if needed
    if isinstance(reference, PFASEmbeddingSet):
        ref_results = reference
    elif isinstance(reference, list):
        if len(reference) == 0:
            raise ValueError("reference list is empty")
        if isinstance(reference[0], str):
            ref_results = parse_smiles(reference, halogens=halogens,
                                       saturation=saturation, progress=progress)
        elif isinstance(reference[0], Chem.Mol):
            ref_results = parse_mols(reference, halogens=halogens,
                                     saturation=saturation, progress=progress)
        else:
            raise TypeError(
                f"reference must be list of SMILES strings or RDKit Mol objects, "
                f"got {type(reference[0])}"
            )
    else:
        raise TypeError(
            f"reference must be list of SMILES/Mol objects or PFASEmbeddingSet, "
            f"got {type(reference)}"
        )

    # Build mean reference vector: shape (n_cols,)
    ref_mat = ref_results.to_array(
        group_selection=group_selection,
        component_metrics=[count_mode],
    )
    ref_vec = np.mean(np.asarray(ref_mat), axis=0).astype(float)
    ref_norm = float(np.linalg.norm(ref_vec))

    if ref_norm == 0.0:
        import warnings
        warnings.warn(
            "Reference embedding is all-zero: no molecules in the reference "
            "matched any group for the given group_selection / count_mode settings. "
            "Returning zero scores.",
            UserWarning,
            stacklevel=3,
        )
        return np.zeros(len(results), dtype=float)

    # Build candidate matrix: shape (n_candidates, n_cols)
    cand_mat = np.asarray(results.to_array(
        group_selection=group_selection,
        component_metrics=[count_mode],
    )).astype(float)

    # Cosine similarity: score_i = (cand_i · ref_vec) / (||cand_i|| × ||ref_vec||)
    dot_products = cand_mat @ ref_vec                    # (n_candidates,)
    cand_norms   = np.linalg.norm(cand_mat, axis=1)     # (n_candidates,)

    scores = np.zeros(len(dot_products), dtype=float)
    nonzero = cand_norms > 0
    scores[nonzero] = dot_products[nonzero] / (cand_norms[nonzero] * ref_norm)

    return np.clip(scores, 0.0, 1.0)


def _prioritise_by_components(
    results: PFASEmbeddingSet,
    a: float,
    b: float,
    percentile: float
) -> np.ndarray:
    """
    Prioritize by fluorinated component characteristics.

    Score = a × sum(component_sizes) + b × percentile(component_sizes, p)

    Parameters
    ----------
    results : PFASEmbeddingSet
        Molecules to prioritize
    a : float
        Weight for total component size
    b : float
        Weight for component size percentile
    percentile : float
        Percentile value (0-100)

    Returns
    -------
    np.ndarray
        Priority scores (higher = higher priority)
    """
    scores = []

    for mol_result in results:
        # Extract all fluorinated component sizes from all matched groups
        all_component_sizes = []

        for match in mol_result['matches']:
            if match.get('type') == 'HalogenGroup' and 'components_sizes' in match:
                component_sizes = match['components_sizes']
                if isinstance(component_sizes, list) and len(component_sizes) > 0:
                    all_component_sizes.extend(component_sizes)

        if len(all_component_sizes) == 0:
            # No fluorinated components found
            score = 0.0
        else:
            all_component_sizes = np.array(all_component_sizes)

            # Compute score components
            total_size = float(np.sum(all_component_sizes))
            percentile_size = float(np.percentile(all_component_sizes, percentile))

            # Combined score
            score = a * total_size + b * percentile_size

        scores.append(score)

    return np.array(scores)


def get_priority_statistics(
    results: PFASEmbeddingSet,
    scores: np.ndarray,
    top_n: int = 10
) -> Dict:
    """
    Get statistics about prioritization results.

    Parameters
    ----------
    results : PFASEmbeddingSet
        Prioritized molecules
    scores : np.ndarray
        Priority scores
    top_n : int, default 10
        Number of top molecules to analyze

    Returns
    -------
    dict
        Statistics including:
        - 'n_molecules': Total number of molecules
        - 'score_mean': Mean priority score
        - 'score_std': Standard deviation of scores
        - 'score_min': Minimum score
        - 'score_max': Maximum score
        - 'top_n_smiles': SMILES of top N molecules
        - 'top_n_scores': Scores of top N molecules
        - 'top_n_groups': Most common groups in top N molecules

    Examples
    --------
    >>> results, scores = prioritise_molecules(smiles_list, reference=ref_list)
    >>> stats = get_priority_statistics(results, scores, top_n=10)
    >>> print(f"Mean score: {stats['score_mean']:.3f}")
    >>> print(f"Top molecule: {stats['top_n_smiles'][0]}")
    """
    stats = {
        'n_molecules': len(results),
        'score_mean': float(np.mean(scores)),
        'score_std': float(np.std(scores)),
        'score_min': float(np.min(scores)),
        'score_max': float(np.max(scores)),
        'score_median': float(np.median(scores)),
    }

    # Top N molecules
    n = min(top_n, len(results))
    stats['top_n_smiles'] = [results[i]['smiles'] for i in range(n)]
    stats['top_n_scores'] = scores[:n].tolist()

    # Most common groups in top N
    group_counts = {}
    for i in range(n):
        for match in results[i]['matches']:
            if match.get('type') == 'HalogenGroup':
                group_name = match.get('group_name', 'Unknown')
                group_counts[group_name] = group_counts.get(group_name, 0) + 1

    # Sort by frequency
    sorted_groups = sorted(group_counts.items(), key=lambda x: x[1], reverse=True)
    stats['top_n_groups'] = sorted_groups[:10]  # Top 10 most common groups

    return stats


# Alias for American spelling
prioritize_molecules = prioritise_molecules