Source code for PFASGroups.PFASDefinitionModel

from rdkit import Chem
from typing import List, Dict, Optional, Union


[docs]
class PFASDefinition:
    """Model class representing a PFAS definition based on structural criteria.
    
    A PFAS definition identifies molecules using SMARTS patterns and/or fluorine ratio thresholds.
    Unlike HalogenGroup which focuses on specific functional groups, PFASDefinition provides
    broader chemical definitions (e.g., "contains at least one perfluoroalkyl moiety").
    
    Attributes
    ----------
    id : int
        Unique identifier for this PFAS definition
    name : str
        Human-readable name (e.g., "Per- and polyfluoroalkyl substances")
    description : str
        Detailed description of what this definition represents
    fluorineRatio : Optional[float]
        Minimum ratio of fluorine atoms required (None if not applicable)
    smarts_strings : List[str]
        Original SMARTS pattern strings for structural matching
    smarts_patterns : List[Chem.Mol]
        Compiled SMARTS molecule objects for efficient matching
    includeHydrogen : bool
        Whether to include hydrogen atoms in fluorine ratio calculations
    requireBoth : bool
        If True, requires both SMARTS match AND fluorine ratio.
        If False, requires SMARTS match OR fluorine ratio.
    
    Examples
    --------
    >>> # Definition requiring perfluoroalkyl chain OR high fluorine ratio
    >>> pfas_def = PFASDefinition(
    ...     id=1,
    ...     name="PFAS (OECD definition)",
    ...     smarts=["[CX4][CX4]([F])([F])[F]"],
    ...     fluorineRatio=0.4,
    ...     description="Contains perfluoroalkyl moiety with ≥2 carbons",
    ...     requireBoth=False
    ... )
    """

[docs]
    def __init__(self, id: int, name: str, smarts: List[str], fluorineRatio: Optional[float], description: str, **kwargs):
        self.id = id
        self.name = name
        self.description = description
        self.fluorineRatio = fluorineRatio
        self.smarts_strings = smarts
        self.includeHydrogen = kwargs.get('includeHydrogen', True)
        self.requireBoth = kwargs.get('requireBoth', False)
        # Preload SMARTS patterns
        self.smarts_patterns = []
        for smarts_str in smarts:
            try:
                mol = Chem.MolFromSmarts(smarts_str)
                if mol is not None:
                    mol.UpdatePropertyCache()
                    Chem.GetSymmSSSR(mol)
                    mol.GetRingInfo().NumRings()
                    self.smarts_patterns.append(mol)
            except:
                pass

    
    def __str__(self):
        return self.name
    

[docs]
    def applies_to_molecule(self, 
                  mol_or_smiles: Union[Chem.Mol, str], 
                  formula: Optional[Dict[str, int]] = None,
                  **kwargs) -> bool:
        """Check if this PFAS definition applies to a given molecule.
        
        This method evaluates whether a molecule meets the structural and/or compositional
        criteria defined by this PFASDefinition. The evaluation logic depends on the
        requireBoth flag:
        
        - If requireBoth=False (default): Returns True if EITHER SMARTS matches OR
          fluorine ratio is met (logical OR)
        - If requireBoth=True: Returns True only if BOTH SMARTS matches AND fluorine
          ratio are met (logical AND)
        
        Parameters
        ----------
        mol_or_smiles : Union[Chem.Mol, str]
            Input molecule as RDKit Mol object or SMILES string
        formula : Optional[Dict[str, int]], default=None
            Pre-computed molecular formula as {element: count} dictionary.
            If None, will be computed from the molecule.
        **kwargs : dict
            Additional parameters:
            
            - include_hydrogen (bool): Whether to include H in fluorine ratio calculation.
              Defaults to self.includeHydrogen
            - require_both (bool): Override the instance's requireBoth setting
        
        Returns
        -------
        bool
            True if the molecule meets the definition criteria, False otherwise
        
        Examples
        --------
        >>> pfas_def = PFASDefinition(
        ...     id=1, name="Test", smarts=["[CX4]F"],
        ...     fluorineRatio=0.3, description="Test"
        ... )
        >>> pfas_def.applies_to_molecule("FC(F)(F)C(F)(F)F")  # PFOA-like
        True
        >>> pfas_def.applies_to_molecule("CCCCCC")  # No fluorine
        False
        
        Notes
        -----
        - SMARTS patterns are checked using substructure matching (HasSubstructMatch)
        - Fluorine ratio is calculated as: F_count / total_atom_count
        - Invalid SMILES strings return False
        """
        # Convert SMILES to Mol if needed
        if isinstance(mol_or_smiles, str):
            mol = Chem.MolFromSmiles(mol_or_smiles)
            if mol is None:
                return False
        else:
            mol = mol_or_smiles
        
        # Check SMARTS matches
        smarts_match = False
        for pattern in self.smarts_patterns:
            if mol.HasSubstructMatch(pattern):
                smarts_match = True
                break
        
        # Check fluorine ratio if defined
        ratio_match = True  # Default to True if no ratio requirement
        if self.fluorineRatio is not None:
            if formula is None:
                formula = self._compute_formula(mol, kwargs.get("include_hydrogen", self.includeHydrogen))
            
            ratio_match = self._check_fluorine_ratio(formula, kwargs.get("include_hydrogen", self.includeHydrogen))
        
        # Apply logic based on require_both
        if kwargs.get("require_both", self.requireBoth):
            return smarts_match and ratio_match
        else:
            # If no fluorine ratio is defined, only check SMARTS
            if self.fluorineRatio is None:
                return smarts_match
            # Otherwise, SMARTS OR ratio
            return smarts_match or ratio_match

    
    def _compute_formula(self, mol: Chem.Mol, include_hydrogen: bool) -> Dict[str, int]:
        """Compute molecular formula as element count dictionary.
        
        Parameters
        ----------
        mol : Chem.Mol
            RDKit molecule object
        include_hydrogen : bool
            If True, add explicit hydrogens before counting
        
        Returns
        -------
        Dict[str, int]
            Dictionary mapping element symbols to their counts, e.g. {'C': 8, 'F': 17, 'O': 2}
        """
        formula = {}
        
        # Add hydrogens if needed
        if include_hydrogen:
            mol = Chem.AddHs(mol)
        
        for atom in mol.GetAtoms():
            symbol = atom.GetSymbol()
            formula[symbol] = formula.get(symbol, 0) + 1
        
        return formula
    
    def _check_fluorine_ratio(self, formula: Dict[str, int], include_hydrogen: bool) -> bool:
        """Check if the fluorine ratio in a molecular formula meets the threshold.
        
        Parameters
        ----------
        formula : Dict[str, int]
            Molecular formula as {element: count} dictionary
        include_hydrogen : bool
            If True, includes hydrogen atoms in total atom count.
            If False, excludes hydrogen from total (heavy atoms only)
        
        Returns
        -------
        bool
            True if (F_count / total_atoms) >= self.fluorineRatio, False otherwise
        
        Notes
        -----
        - Returns False if total_atoms is 0
        - Formula with no fluorine (F_count=0) will fail unless fluorineRatio=0
        """
        f_count = formula.get('F', 0)
        
        if include_hydrogen:
            total_atoms = sum(formula.values())
        else:
            total_atoms = sum(v for k, v in formula.items() if k != 'H')
        
        if total_atoms == 0:
            return False
        
        ratio = f_count / total_atoms
        return ratio >= self.fluorineRatio
    

[docs]
    def test(self, test_data=None):
        """Test this PFAS definition against test molecules from metadata.
        
        Validates that the definition correctly classifies true positives, true negatives,
        false positives, and false negatives based on test metadata in 
        PFAS_definitions_smarts.json.
        
        Parameters
        ----------
        test_data : dict, optional
            Test metadata dictionary. If None, will be loaded from the definition's
            entry in PFAS_definitions_smarts.json. Expected keys: ``category``,
            ``examples`` (dict with keys ``true_positives``, ``true_negatives``,
            ``false_positives``, ``false_negatives`` each a list of dicts).
        
        Returns
        -------
        dict
            Test results with keys: ``passed`` (bool), ``total_tests`` (int),
            ``failures`` (list), ``category`` (str), ``stats`` (dict with
            counts for true/false positives/negatives).
        
        Notes
        -----
        - Tests against benchmark test compounds with known PFAS/non-PFAS labels
        - Validates both SMARTS patterns and fluorine ratio criteria
        - Returns detailed failure information for debugging
        """
        from rdkit import Chem
        
        # Load test data if not provided
        if test_data is None:
            import json
            from pathlib import Path
            definitions_file = Path(__file__).parent / 'data' / 'PFAS_definitions_smarts.json'
            with open(definitions_file, 'r') as f:
                all_definitions = json.load(f)
            
            # Find this definition's test data
            test_data = None
            for def_data in all_definitions:
                if def_data['id'] == self.id:
                    test_data = def_data.get('test', {})
                    break
            
            if test_data is None or not test_data:
                return {
                    'passed': None,
                    'total_tests': 0,
                    'failures': [],
                    'category': 'unknown',
                    'error': f'No test data found for definition {self.id}'
                }
        
        results = {
            'passed': True,
            'total_tests': 0,
            'failures': [],
            'category': test_data.get('category', 'definition'),
            'stats': {
                'true_positives': 0,
                'true_negatives': 0,
                'false_positives': 0,
                'false_negatives': 0
            }
        }
        
        examples = test_data.get('examples', {})
        
        # Test positives (should match)
        for smiles in examples.get('positives', []):
            results['total_tests'] += 1
            try:
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:
                    results['passed'] = False
                    results['failures'].append({
                        'smiles': smiles,
                        'expected': True,
                        'got': None,
                        'type': 'true_positive',
                        'error': 'Invalid SMILES'
                    })
                    continue
                
                # Check if definition applies
                applies = self.applies_to_molecule(mol)
                if applies:
                    results['stats']['true_positives'] += 1
                else:
                    results['passed'] = False
                    results['stats']['false_negatives'] += 1
                    results['failures'].append({
                        'smiles': smiles,
                        'expected': True,
                        'got': False,
                        'type': 'true_positive',
                        'error': 'Definition should match but did not'
                    })
            except Exception as e:
                results['passed'] = False
                results['failures'].append({
                    'smiles': smiles,
                    'expected': True,
                    'got': None,
                    'type': 'positive',
                    'error': f'Exception: {str(e)}'
                })
        
        # Test negatives (should NOT match)
        for smiles in examples.get('negatives', []):
            results['total_tests'] += 1
            try:
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:
                    results['passed'] = False
                    results['failures'].append({
                        'smiles': smiles,
                        'expected': False,
                        'got': None,
                        'type': 'true_negative',
                        'error': 'Invalid SMILES'
                    })
                    continue
                
                # Check if definition applies
                applies = self.applies_to_molecule(mol)
                if not applies:
                    results['stats']['true_negatives'] += 1
                else:
                    results['passed'] = False
                    results['stats']['false_positives'] += 1
                    results['failures'].append({
                        'smiles': smiles,
                        'expected': False,
                        'got': True,
                        'type': 'true_negative',
                        'error': 'Definition should not match but did'
                    })
            except Exception as e:
                results['passed'] = False
                results['failures'].append({
                    'smiles': smiles,
                    'expected': False,
                    'got': None,
                    'type': 'negative',
                    'error': f'Exception: {str(e)}'
                })
        
        # Test false positives (known to incorrectly match - document these)
        for item in examples.get('false_positives', []):
            smiles = item if isinstance(item, str) else item.get('smiles', '')
            results['total_tests'] += 1
            try:
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:
                    continue
                
                # These are expected to match (false positives)
                applies = self.applies_to_molecule(mol)
                if applies:
                    results['stats']['false_positives'] += 1
            except Exception:
                pass
        
        # Test false negatives (known to incorrectly NOT match - document these)
        for item in examples.get('false_negatives', []):
            smiles = item if isinstance(item, str) else item.get('smiles', '')
            results['total_tests'] += 1
            try:
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:
                    continue
                
                # These are expected to NOT match (false negatives)
                applies = self.applies_to_molecule(mol)
                if not applies:
                    results['stats']['false_negatives'] += 1
            except Exception:
                pass
        
        return results