Source code for PFASGroups.PFASDefinitionModel

from rdkit import Chem
from typing import List, Dict, Optional, Union

[docs] class PFASDefinition: """Model class representing a PFAS definition based on structural criteria. A PFAS definition identifies molecules using SMARTS patterns and/or fluorine ratio thresholds. Unlike HalogenGroup which focuses on specific functional groups, PFASDefinition provides broader chemical definitions (e.g., "contains at least one perfluoroalkyl moiety"). Attributes ---------- id : int Unique identifier for this PFAS definition name : str Human-readable name (e.g., "Per- and polyfluoroalkyl substances") description : str Detailed description of what this definition represents fluorineRatio : Optional[float] Minimum ratio of fluorine atoms required (None if not applicable) smarts_strings : List[str] Original SMARTS pattern strings for structural matching smarts_patterns : List[Chem.Mol] Compiled SMARTS molecule objects for efficient matching includeHydrogen : bool Whether to include hydrogen atoms in fluorine ratio calculations requireBoth : bool If True, requires both SMARTS match AND fluorine ratio. If False, requires SMARTS match OR fluorine ratio. Examples -------- >>> # Definition requiring perfluoroalkyl chain OR high fluorine ratio >>> pfas_def = PFASDefinition( ... id=1, ... name="PFAS (OECD definition)", ... smarts=["[CX4][CX4]([F])([F])[F]"], ... fluorineRatio=0.4, ... description="Contains perfluoroalkyl moiety with ≥2 carbons", ... requireBoth=False ... ) """
[docs] def __init__(self, id: int, name: str, smarts: List[str], fluorineRatio: Optional[float], description: str, **kwargs): self.id = id self.name = name self.description = description self.fluorineRatio = fluorineRatio self.smarts_strings = smarts self.includeHydrogen = kwargs.get('includeHydrogen', True) self.requireBoth = kwargs.get('requireBoth', False) # Preload SMARTS patterns self.smarts_patterns = [] for smarts_str in smarts: try: mol = Chem.MolFromSmarts(smarts_str) if mol is not None: mol.UpdatePropertyCache() Chem.GetSymmSSSR(mol) mol.GetRingInfo().NumRings() self.smarts_patterns.append(mol) except: pass
def __str__(self): return self.name
[docs] def applies_to_molecule(self, mol_or_smiles: Union[Chem.Mol, str], formula: Optional[Dict[str, int]] = None, **kwargs) -> bool: """Check if this PFAS definition applies to a given molecule. This method evaluates whether a molecule meets the structural and/or compositional criteria defined by this PFASDefinition. The evaluation logic depends on the requireBoth flag: - If requireBoth=False (default): Returns True if EITHER SMARTS matches OR fluorine ratio is met (logical OR) - If requireBoth=True: Returns True only if BOTH SMARTS matches AND fluorine ratio are met (logical AND) Parameters ---------- mol_or_smiles : Union[Chem.Mol, str] Input molecule as RDKit Mol object or SMILES string formula : Optional[Dict[str, int]], default=None Pre-computed molecular formula as {element: count} dictionary. If None, will be computed from the molecule. **kwargs : dict Additional parameters: - include_hydrogen (bool): Whether to include H in fluorine ratio calculation. Defaults to self.includeHydrogen - require_both (bool): Override the instance's requireBoth setting Returns ------- bool True if the molecule meets the definition criteria, False otherwise Examples -------- >>> pfas_def = PFASDefinition( ... id=1, name="Test", smarts=["[CX4]F"], ... fluorineRatio=0.3, description="Test" ... ) >>> pfas_def.applies_to_molecule("FC(F)(F)C(F)(F)F") # PFOA-like True >>> pfas_def.applies_to_molecule("CCCCCC") # No fluorine False Notes ----- - SMARTS patterns are checked using substructure matching (HasSubstructMatch) - Fluorine ratio is calculated as: F_count / total_atom_count - Invalid SMILES strings return False """ # Convert SMILES to Mol if needed if isinstance(mol_or_smiles, str): mol = Chem.MolFromSmiles(mol_or_smiles) if mol is None: return False else: mol = mol_or_smiles # Check SMARTS matches smarts_match = False for pattern in self.smarts_patterns: if mol.HasSubstructMatch(pattern): smarts_match = True break # Check fluorine ratio if defined ratio_match = True # Default to True if no ratio requirement if self.fluorineRatio is not None: if formula is None: formula = self._compute_formula(mol, kwargs.get("include_hydrogen", self.includeHydrogen)) ratio_match = self._check_fluorine_ratio(formula, kwargs.get("include_hydrogen", self.includeHydrogen)) # Apply logic based on require_both if kwargs.get("require_both", self.requireBoth): return smarts_match and ratio_match else: # If no fluorine ratio is defined, only check SMARTS if self.fluorineRatio is None: return smarts_match # Otherwise, SMARTS OR ratio return smarts_match or ratio_match
def _compute_formula(self, mol: Chem.Mol, include_hydrogen: bool) -> Dict[str, int]: """Compute molecular formula as element count dictionary. Parameters ---------- mol : Chem.Mol RDKit molecule object include_hydrogen : bool If True, add explicit hydrogens before counting Returns ------- Dict[str, int] Dictionary mapping element symbols to their counts, e.g. {'C': 8, 'F': 17, 'O': 2} """ formula = {} # Add hydrogens if needed if include_hydrogen: mol = Chem.AddHs(mol) for atom in mol.GetAtoms(): symbol = atom.GetSymbol() formula[symbol] = formula.get(symbol, 0) + 1 return formula def _check_fluorine_ratio(self, formula: Dict[str, int], include_hydrogen: bool) -> bool: """Check if the fluorine ratio in a molecular formula meets the threshold. Parameters ---------- formula : Dict[str, int] Molecular formula as {element: count} dictionary include_hydrogen : bool If True, includes hydrogen atoms in total atom count. If False, excludes hydrogen from total (heavy atoms only) Returns ------- bool True if (F_count / total_atoms) >= self.fluorineRatio, False otherwise Notes ----- - Returns False if total_atoms is 0 - Formula with no fluorine (F_count=0) will fail unless fluorineRatio=0 """ f_count = formula.get('F', 0) if include_hydrogen: total_atoms = sum(formula.values()) else: total_atoms = sum(v for k, v in formula.items() if k != 'H') if total_atoms == 0: return False ratio = f_count / total_atoms return ratio >= self.fluorineRatio
[docs] def test(self, test_data=None): """Test this PFAS definition against test molecules from metadata. Validates that the definition correctly classifies true positives, true negatives, false positives, and false negatives based on test metadata in PFAS_definitions_smarts.json. Parameters ---------- test_data : dict, optional Test metadata dictionary. If None, will be loaded from the definition's entry in PFAS_definitions_smarts.json. Expected keys: ``category``, ``examples`` (dict with keys ``true_positives``, ``true_negatives``, ``false_positives``, ``false_negatives`` each a list of dicts). Returns ------- dict Test results with keys: ``passed`` (bool), ``total_tests`` (int), ``failures`` (list), ``category`` (str), ``stats`` (dict with counts for true/false positives/negatives). Notes ----- - Tests against benchmark test compounds with known PFAS/non-PFAS labels - Validates both SMARTS patterns and fluorine ratio criteria - Returns detailed failure information for debugging """ from rdkit import Chem # Load test data if not provided if test_data is None: import json from pathlib import Path definitions_file = Path(__file__).parent / 'data' / 'PFAS_definitions_smarts.json' with open(definitions_file, 'r') as f: all_definitions = json.load(f) # Find this definition's test data test_data = None for def_data in all_definitions: if def_data['id'] == self.id: test_data = def_data.get('test', {}) break if test_data is None or not test_data: return { 'passed': None, 'total_tests': 0, 'failures': [], 'category': 'unknown', 'error': f'No test data found for definition {self.id}' } results = { 'passed': True, 'total_tests': 0, 'failures': [], 'category': test_data.get('category', 'definition'), 'stats': { 'true_positives': 0, 'true_negatives': 0, 'false_positives': 0, 'false_negatives': 0 } } examples = test_data.get('examples', {}) # Test positives (should match) for smiles in examples.get('positives', []): results['total_tests'] += 1 try: mol = Chem.MolFromSmiles(smiles) if mol is None: results['passed'] = False results['failures'].append({ 'smiles': smiles, 'expected': True, 'got': None, 'type': 'true_positive', 'error': 'Invalid SMILES' }) continue # Check if definition applies applies = self.applies_to_molecule(mol) if applies: results['stats']['true_positives'] += 1 else: results['passed'] = False results['stats']['false_negatives'] += 1 results['failures'].append({ 'smiles': smiles, 'expected': True, 'got': False, 'type': 'true_positive', 'error': 'Definition should match but did not' }) except Exception as e: results['passed'] = False results['failures'].append({ 'smiles': smiles, 'expected': True, 'got': None, 'type': 'positive', 'error': f'Exception: {str(e)}' }) # Test negatives (should NOT match) for smiles in examples.get('negatives', []): results['total_tests'] += 1 try: mol = Chem.MolFromSmiles(smiles) if mol is None: results['passed'] = False results['failures'].append({ 'smiles': smiles, 'expected': False, 'got': None, 'type': 'true_negative', 'error': 'Invalid SMILES' }) continue # Check if definition applies applies = self.applies_to_molecule(mol) if not applies: results['stats']['true_negatives'] += 1 else: results['passed'] = False results['stats']['false_positives'] += 1 results['failures'].append({ 'smiles': smiles, 'expected': False, 'got': True, 'type': 'true_negative', 'error': 'Definition should not match but did' }) except Exception as e: results['passed'] = False results['failures'].append({ 'smiles': smiles, 'expected': False, 'got': None, 'type': 'negative', 'error': f'Exception: {str(e)}' }) # Test false positives (known to incorrectly match - document these) for item in examples.get('false_positives', []): smiles = item if isinstance(item, str) else item.get('smiles', '') results['total_tests'] += 1 try: mol = Chem.MolFromSmiles(smiles) if mol is None: continue # These are expected to match (false positives) applies = self.applies_to_molecule(mol) if applies: results['stats']['false_positives'] += 1 except Exception: pass # Test false negatives (known to incorrectly NOT match - document these) for item in examples.get('false_negatives', []): smiles = item if isinstance(item, str) else item.get('smiles', '') results['total_tests'] += 1 try: mol = Chem.MolFromSmiles(smiles) if mol is None: continue # These are expected to NOT match (false negatives) applies = self.applies_to_molecule(mol) if not applies: results['stats']['false_negatives'] += 1 except Exception: pass return results