Source code for PFASGroups.HalogenGroupModel

from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
import re
from .core import mol_to_nx, add_componentSmarts




[docs] class HalogenGroup(): """Model class representing a specific halogenated functional group with structural patterns. A HalogenGroup defines a specific halogenated functional group using SMARTS patterns, component path types, and molecular formula constraints. Groups are used to classify molecules into specific categories (e.g., "Perfluoroalkyl carboxylic acid"). Attributes ---------- id : int Unique identifier for this Halogen group name : str Human-readable group name (e.g., "Perfluoroalkyl carboxylic acid") smarts : Chem.Mol or None SMARTS patterns (compiled RDKit molecule) for functional group detection. None if group is defined by componentSmarts alone. componentSmarts : list, str or None componentForm: str or None componentHalogens: list, str or None componentSaturation: str or None (-> both) max_dist_from_comp : int Maximum graph distance (number of bonds) from fluorinated component to functional group. When > 0, extends component search radius to find nearby functional groups. linker_smarts : Chem.Mol or None Compiled SMARTS pattern for validating linker atoms between fluorinated component and functional group. When None (default), no restriction is applied to linker atoms. Only used when max_dist_from_comp > 0. constraints : dict Molecular formula constraints with keys: - 'only': Elements that must be present exclusively (e.g., ['C', 'F', 'O']) - 'gte': Minimum element counts (e.g., {'C': 2}) - 'lte': Maximum element counts (e.g., {'O': 2}) - 'eq': Exact element counts (e.g., {'N': 1}) - 'rel': Relational constraints (e.g., {'O': {'atoms': ['C'], 'div': 2, 'add': 0}}) Examples -------- >>> # Perfluoroalkyl carboxylic acid: R_F-COOH >>> pfaa = HalogenGroup( ... id=1, ... name="Perfluoroalkyl carboxylic acid", ... smarts={"C(=O)O":1}, # Carboxylic acid group ... componentSmarts="Perfluoroalkyl", ... constraints={"only": ["C", "F", "O", "H"]}, ... max_dist_from_comp=0, ... linker_smarts=None ... ) Notes ----- - SMARTS patterns are compiled on initialization for efficient matching - Constraints are validated when checking if a molecule belongs to this group - max_dist_from_comp allows finding functional groups connected via non-fluorinated linkers - linker_smarts restricts which atoms can be in the path between component and functional group """ @add_componentSmarts() def __init__(self, id, name,**kwargs): self.id = id self.name = name smarts = kwargs.get('smarts',{}) # Save original SMARTS strings for atom counting if smarts and len(smarts) > 0: self.smarts_str, self.smarts_count = zip(*smarts.items()) else: self.smarts_str = None self.smarts_count = None self.smarts = [] if self.smarts_str else None self.componentSmarts = kwargs.get('componentSmarts',None) self.componentSaturation = kwargs.get('componentSaturation',None) self.componentHalogens = kwargs.get('componentHalogens', kwargs.get('componentHalogen', None)) self.componentForm = kwargs.get("componentForm", None) self._comp_type_to_halogen = {} # populated by set_component_smarts self._comp_type_to_constraints = {} # populated by set_component_smarts self.set_component_smarts(kwargs.get('componentSmartss', {})) self.excludeHalogens = kwargs.get('excludeHalogens', None) self.max_dist_from_comp = kwargs.get('max_dist_from_comp', 0) # Compile linker_smarts pattern if provided linker_smarts_str = kwargs.get('linker_smarts', None) self.linker_smarts = None if linker_smarts_str is not None: try: self.linker_smarts = Chem.MolFromSmarts(linker_smarts_str) self.linker_smarts.UpdatePropertyCache() Chem.GetSymmSSSR(self.linker_smarts) self.linker_smarts.GetRingInfo().NumRings() except: raise ValueError(f"Invalid linker_smarts pattern '{linker_smarts_str}' for HalogenGroup '{self.name}' (ID: {self.id})") if self.smarts_str is not None: for smarts_pattern in self.smarts_str: if smarts_pattern and smarts_pattern != "": try: smarts_mol = Chem.MolFromSmarts(smarts_pattern) smarts_mol.UpdatePropertyCache() Chem.GetSymmSSSR(smarts_mol) smarts_mol.GetRingInfo().NumRings() self.smarts.append(smarts_mol) except: raise ValueError(f"Invalid SMARTS pattern(s) for HalogenGroup '{self.name}' (ID: {self.id})") self.constraints = kwargs.get('constraints',{}) # Precompute number of extra atoms in SMARTS patterns (beyond matched atom and H/F/Cl/Br/I) self.smarts_extra_atoms = self._count_smarts_extra_atoms(self.smarts_str) self.component_specific_extra_atoms = [] self.all_matches = [] self.compute = kwargs.get('compute',True)# whether the HalogenGroups needs to be parsed, or is an aggregate group, e.g. telomers self.re_search = kwargs.get('re_search',None)# regex for aggregate groups, e.g. telomers if self.re_search is not None: try: self.re_search = re.compile(self.re_search) except Exception as e: raise Exception(f"Error for agg Group {self.id}: {self.name}\n {e}") self.test_dict = kwargs.get('test',None)# test dict for unit tests
[docs] def set_component_smarts(self, componentSmartss): """ Infers componentSmarts based on componentSmarts, componentSaturation, componentForm and componentHalogen """ if not componentSmartss: return # Build comp_type → halogen and comp_type → constraints mappings self._comp_type_to_halogen = { k: v['halogen'] for k, v in componentSmartss.items() if isinstance(v, dict) and 'halogen' in v } self._comp_type_to_constraints = { k: v['constraints'] for k, v in componentSmartss.items() if isinstance(v, dict) and v.get('constraints') } # if componentSmarts is None, or is not in (entirely) in the available componentSmartss if self.componentSmarts is None or (isinstance(self.componentSmarts,list) and not set(self.componentSmarts).issubset(componentSmartss.keys()) or (isinstance(self.componentSmarts, str) and not self.componentSmarts in componentSmartss.keys())): new_CS = [] try: componentSmarts_dict = {} for k,v in componentSmartss.items(): if not isinstance(v, dict): continue halogen = v.get('halogen') form = v.get('form') saturation = v.get('saturation') if halogen is None or form is None or saturation is None: continue componentSmarts_dict.setdefault(halogen,{}).setdefault(form,{})[saturation]=k except Exception as e: raise ValueError(f"Error processing componentSmartss for HalogenGroup '{self.name}' (ID: {self.id}): {e}") if not componentSmarts_dict: # No metadata available to infer component SMARTS return # prepare halogens (accepts list, str or None) if isinstance(self.componentHalogens,list) and len(set(self.componentHalogens).intersection(['F','Cl','Br','I','H'])) == 0: raise ValueError(f"Invalid componentHalogens for HalogenGroup '{self.name}' (ID: {self.id})") _halogens_inferred = False if self.componentHalogens is None: # Derive from whichever halogens have component SMARTS defined in the # available componentSmartss. Fall back to real halogens only (no H) # when the data provides no guidance. _hal_order = ['F', 'Cl', 'Br', 'I', 'H'] _inferred = [h for h in _hal_order if h in componentSmarts_dict] self.componentHalogens = _inferred if _inferred else ['F', 'Cl', 'Br', 'I'] _halogens_inferred = True elif isinstance(self.componentHalogens, str) and self.componentHalogens in ['F','Cl','Br','I','H']: self.componentHalogens = [self.componentHalogens] if self.componentSaturation is None: self.componentSaturation = ['per','poly'] elif isinstance(self.componentSaturation, str) and self.componentSaturation in ['per','poly','both']: self.componentSaturation = ['per','poly'] if self.componentSaturation == 'both' else [self.componentSaturation] elif isinstance(self.componentSaturation, list): self.componentSaturation = self.componentSaturation else: raise ValueError(f"Invalid componentSaturation for HalogenGroup '{self.name}' (ID: {self.id}), expected 'per', 'poly', 'both' or null, got {self.componentSaturation}") if self.componentForm is None: self.componentForm = 'alkyl' for halogen in self.componentHalogens: if halogen not in componentSmarts_dict: if _halogens_inferred: continue raise ValueError(f"No component SMARTS available for halogen '{halogen}' in HalogenGroup '{self.name}' (ID: {self.id})") if self.componentForm not in componentSmarts_dict[halogen]: if _halogens_inferred: continue raise ValueError(f"No component SMARTS available for form '{self.componentForm}' and halogen '{halogen}' in HalogenGroup '{self.name}' (ID: {self.id})") for saturation in self.componentSaturation: if saturation not in componentSmarts_dict[halogen][self.componentForm]: if _halogens_inferred: continue raise ValueError(f"No component SMARTS available for saturation '{saturation}', form '{self.componentForm}', halogen '{halogen}' in HalogenGroup '{self.name}' (ID: {self.id})") new_CS.append(componentSmarts_dict[halogen][self.componentForm][saturation]) self.componentSmarts = new_CS
[docs] def set_componentSmarts(self, componentSmartss): """Backward-compatible alias for set_component_smarts.""" return self.set_component_smarts(componentSmartss)
def _count_smarts_extra_atoms(self, smarts_str): """Count number of extra carbon atoms in functional group beyond what's captured by component. Parameters ---------- smarts_str : str or None Original SMARTS string before compilation Returns ------- int Number of extra carbon atoms beyond the matched atom Notes ----- The component fraction calculation is now based on carbon atoms only: 1. Carbon atoms in component 2. Carbon atoms in SMARTS matches 3. Additional carbon atoms from SMARTS (this return value) For automatic counting (when manual_size is None), this returns 0 since we now focus only on carbons and they are already counted in the component and SMARTS matches. """ PAT_c = re.compile(r'((C(?![adeflmnorsu]))|((?<![TAS])c)|(\#6))') # Match 'C' not followed by a letter, or c not preceded by T,A,S or #6 return [max(0,len(PAT_c.findall(s))-1) for s in smarts_str] if smarts_str is not None else None def __str__(self): return self.name @staticmethod def _check_component_constraints(comp_formula, constraints): """Check a component's element formula against component-level constraints. Parameters ---------- comp_formula : dict ``{element_symbol: count}`` for all atoms in the full component (backbone + attached H/halogens). constraints : dict Component constraints with optional keys: * ``'gte'`` - ``{element: min_count}``; component must have at least *min_count* atoms of *element*. * ``'exclude'`` - list of element symbols that must be absent from the component. Returns ------- bool """ for elem, n in constraints.get('gte', {}).items(): if comp_formula.get(elem, 0) < n: return False for elem in constraints.get('exclude', []): if comp_formula.get(elem, 0) > 0: return False return True
[docs] def constraint_gte(self, formula_dict): """Check 'greater than or equal' constraints on element counts. Parameters ---------- formula_dict : dict Molecular formula as {element: count} dictionary Returns ------- bool True if all 'gte' constraints are satisfied, False otherwise Examples -------- >>> # Requires at least 2 carbons and 3 fluorines >>> group.constraints = {'gte': {'C': 2, 'F': 3}} >>> group.constraint_gte({'C': 3, 'F': 5, 'O': 1}) # True >>> group.constraint_gte({'C': 1, 'F': 5, 'O': 1}) # False (C < 2) """ success = True for e,n in self.constraints.get('gte',{}).items(): success = success and formula_dict.get(e,0)>=n return success
[docs] def constraint_lte(self, formula_dict): """Check 'less than or equal' constraints on element counts. Parameters ---------- formula_dict : dict Molecular formula as {element: count} dictionary Returns ------- bool True if all 'lte' constraints are satisfied, False otherwise Examples -------- >>> # Requires at most 2 oxygens >>> group.constraints = {'lte': {'O': 2}} >>> group.constraint_lte({'C': 8, 'F': 15, 'O': 2}) # True >>> group.constraint_lte({'C': 8, 'F': 15, 'O': 3}) # False (O > 2) """ success = True for e,n in self.constraints.get('lte',{}).items(): success = success and formula_dict.get(e,0)<=n return success
[docs] def constraint_eq(self, formula_dict): """Check 'equal to' constraints on element counts. Parameters ---------- formula_dict : dict Molecular formula as {element: count} dictionary Returns ------- bool True if all 'eq' constraints are satisfied, False otherwise Examples -------- >>> # Requires exactly 1 nitrogen >>> group.constraints = {'eq': {'N': 1}} >>> group.constraint_eq({'C': 8, 'F': 15, 'N': 1}) # True >>> group.constraint_eq({'C': 8, 'F': 15, 'N': 2}) # False (N != 1) """ success = True for e,n in self.constraints.get('eq',{}).items(): success = success and formula_dict.get(e,0)==n return success
[docs] def constraint_only(self, formula_dict): """Check 'only' constraint - molecule must contain only specified elements. Parameters ---------- formula_dict : dict Molecular formula as {element: count} dictionary Returns ------- bool True if molecule contains only the allowed elements, False otherwise Examples -------- >>> # Molecule must contain only C, F, O, H >>> group.constraints = {'only': ['C', 'F', 'O', 'H']} >>> group.constraint_only({'C': 8, 'F': 15, 'O': 2, 'H': 1}) # True >>> group.constraint_only({'C': 8, 'F': 15, 'O': 2, 'S': 1}) # False (S not allowed) Notes ----- Checks that sum of allowed elements equals total atoms in molecule. """ success = True if 'only' in self.constraints.keys(): tot = sum(formula_dict.values()) nn = 0 for e in self.constraints['only']: nn += formula_dict.get(e,0) success = success and tot == nn return success
[docs] def constraint_rel(self, formula_dict): """Check relational constraints between element counts. Validates relationships of the form: count(element) = f(other_elements) where f can include division, addition, and summing other element counts. Parameters ---------- formula_dict : dict Molecular formula as {element: count} dictionary Returns ------- bool True if all relational constraints are satisfied, False otherwise Notes ----- Constraint Format:: 'rel': { 'ElementA': { 'atoms': ['ElementB', 'ElementC'], # Elements to sum 'div': int, # Divisor (default 1) 'add': int, # Additive constant (default 0) 'add_atoms': ['ElementD'] # Additional elements to add } } Formula: count(ElementA) = (sum(atoms) / div) + add + sum(add_atoms) Examples -------- >>> # Carbon count must equal half the fluorine count >>> group.constraints = {'rel': {'C': {'atoms': ['F'], 'div': 2, 'add': 0}}} >>> group.constraint_rel({'C': 4, 'F': 8, 'O': 2}) # True (4 == 8/2) >>> group.constraint_rel({'C': 3, 'F': 8, 'O': 2}) # False (3 != 8/2) >>> # Oxygen count must equal carbon count plus 1 >>> group.constraints = {'rel': {'O': {'atoms': ['C'], 'div': 1, 'add': 1}}} >>> group.constraint_rel({'C': 3, 'F': 7, 'O': 4}) # True (4 == 3 + 1) """ success = True for e,v in self.constraints.get('rel',{}).items(): n = sum([formula_dict.get(x,0) for x in v.get('atoms',[])]) success = success and formula_dict.get(e,0)==n/v.get('div',1)+v.get('add',0)+sum([formula_dict.get(x,0) for x in v.get('add_atoms',[])]) return success
[docs] def formula_dict_satisfies_constraints(self,formula_dict): """Check if a molecular formula satisfies all constraints for this PFAS group. Evaluates all constraint types in order: relational → only → equal → lte → gte. Stops evaluation at first failure for efficiency. Parameters ---------- formula_dict : dict Molecular formula as {element: count} dictionary (e.g., {'C': 8, 'F': 17, 'O': 2}) Returns ------- bool True if all constraints are satisfied, False if any constraint fails Constraint Evaluation Order --------------------------- 1. Relational constraints ('rel') - element count relationships 2. 'Only' constraints - allowed elements 3. Equality constraints ('eq') - exact element counts 4. Upper bound constraints ('lte') - maximum element counts 5. Lower bound constraints ('gte') - minimum element counts Examples -------- >>> # Perfluoroalkyl carboxylic acid constraints >>> group.constraints = { ... 'only': ['C', 'F', 'O', 'H'], # No other elements ... 'gte': {'C': 2, 'F': 3}, # At least 2 carbons, 3 fluorines ... 'eq': {'O': 2} # Exactly 2 oxygens ... } >>> group.formula_dict_satisfies_constraints({'C': 8, 'F': 15, 'O': 2, 'H': 1}) True >>> group.formula_dict_satisfies_constraints({'C': 8, 'F': 15, 'O': 3, 'H': 1}) False # Fails 'eq': {'O': 2} Notes ----- - Returns True immediately if no constraints are defined - Short-circuits on first constraint failure for performance - Constraint evaluation order is fixed for consistency """ if len(self.constraints.keys())==0: return True success = True process = [None,self.constraint_rel,self.constraint_only,self.constraint_eq, self.constraint_lte, self.constraint_gte] k = process.pop() while success and k is not None: success = k(formula_dict) k = process.pop() return success
[docs] def find_matched_atoms(self, mol): """Find all substructure matches of this PFAS group's SMARTS patterns in a molecule. Parameters ---------- mol : Chem.Mol RDKit molecule object to search for matches Returns ------- List[List[int]] List of matches, where each match is a list of atom indices in the molecule Notes ----- - If no SMARTS patterns are defined, returns an empty list. - Each SMARTS pattern is searched independently; matches from all patterns are combined. """ self.all_matches = [] self.subset = set() if self.smarts is not None: for smarts_mol,min_count in zip(self.smarts, self.smarts_count): matches = mol.GetSubstructMatches(smarts_mol) if len(matches) < min_count: return False if len(matches)>0: # Keep matches as list to preserve duplicates when same atoms matched multiple times # (e.g., dicarboxylic/disulfonic acids where both groups attach to same carbon) self.all_matches.append(list(matches)) self.subset.update({y for x in matches for y in x if len(x)>0}) return True
[docs] def component_satisfies_all_smarts(self, component): """Check if a fluorinated component matches all SMARTS patterns of this PFAS group. Parameters ---------- component : PFASComponent PFASComponent object representing a fluorinated component in the molecule Returns ------- bool True if the component matches all SMARTS patterns, False otherwise Notes ----- - If no SMARTS patterns are defined for this group, returns True. - Each SMARTS pattern must have at least one match that includes the component's atom. """ atom_count = 0 for i, (matches, min_count) in enumerate(zip(self.all_matches,self.smarts_count)): # Count how many SMARTS matches have overlap with this component # matches is a set of tuples, each tuple represents one SMARTS match component_set = set(component) found = sum(1 for match_tuple in matches if any(atom_idx in component_set for atom_idx in match_tuple)) if found < min_count: self.component_specific_extra_atoms.append(0) return False atom_count += found * self.smarts_extra_atoms[i] self.component_specific_extra_atoms.append(atom_count) return True
[docs] def find_alkyl_components(self, mol, component_solver, **kwargs): """Find fluorinated components in a molecule that match this PFAS group's criteria. Parameters ---------- mol : Chem.Mol RDKit molecule object to search components : List[PFASComponent] List of PFASComponent objects representing fluorinated components in the molecule Returns ------- List[PFASComponent] List of PFASComponent objects that match this PFAS group's criteria Notes ----- - Matches are determined based on componentSmarts and max_dist_from_comp attributes. - If componentSmarts is None, all components are considered. - max_dist_from_comp allows extending the search radius for functional groups. """ if not self.find_matched_atoms(mol): return 0, [], 0, [] # Clear component-specific extra atoms list for this matching attempt self.component_specific_extra_atoms = [] if self.componentSmarts is None: # If no componentSmarts specified, only check alkyl components (not cyclic) # This ensures functional groups like carboxylic acid (group 33) are only # detected when attached to perfluoroalkyl or polyfluoroalkyl chains, # not when attached directly to cyclic structures componentSmartss = [] for path_type, meta in component_solver.componentSmartss.items(): if isinstance(meta, dict) and meta.get('form') == 'cyclic': continue componentSmartss.append(path_type) elif isinstance(self.componentSmarts, (list, tuple, set)): # Only keep paths the solver knows about (respects halogen filtering) available = set(component_solver.componentSmartss.keys()) componentSmartss = [cs for cs in self.componentSmarts if cs in available] else: componentSmartss = [self.componentSmarts] # Preload components (optional, not strictly required for filtering) components = [] for comp_type in componentSmartss: comps = component_solver.get(comp_type, max_dist=self.max_dist_from_comp, default=[]) components.extend(comps) # Filter components connected to the smarts and get augmented versions augmented_matched_components = [] for _componentSmarts in componentSmartss: extended_components = component_solver.get(_componentSmarts, self.max_dist_from_comp, []) for i, comp in enumerate(extended_components): # Check if this component is connected to SMARTS matches if self.component_satisfies_all_smarts(comp): augmented = component_solver.get_augmented_component( _componentSmarts, self.max_dist_from_comp, i, self.subset, self.linker_smarts ) # Accept augmented component if valid (linker validation already done in get_augmented_component) if augmented is not None and len(augmented) > 0: augmented_matched_components.append( component_solver.get_matched_component_dict(augmented, self.subset, _componentSmarts, self, comp_id = i) ) if len(augmented_matched_components) == 0: return 0, [], 0, [] # Get all component sizes from matched component dicts component_sizes = [comp.get('size', 0) for comp in augmented_matched_components] self.all_matches = [] # Clear matches after use self.component_specific_extra_atoms = [] return max([0] + component_sizes), component_sizes, len(augmented_matched_components), augmented_matched_components
[docs] def find_aryl_components(self,mol, component_solver=None, **kwargs): """Find aryl components in a molecule with comprehensive metrics.""" matches = mol.GetSubstructMatches(self.smarts[0]) subset = [y for x in matches for y in x] if len(subset)==0: return 0, [], 0, [] components = component_solver._connected_components(subset) component_sizes = [len(x) for x in components] # Get molecular graph for metrics calculation subset_set = set(subset) # Convert components to the same format as other functions return with comprehensive metrics matched_components = [] for comp in components: matched_components.append( component_solver.get_matched_component_dict(comp, subset_set, 'cyclic', self) ) return max([0]+[len(x) for x in components]), component_sizes, len(components), matched_components
[docs] def find_components(self, mol, fd, component_solver, **kwargs): """Find fluorinated components in a molecule that match this PFAS group's criteria.""" group_matches = [] if self.formula_dict_satisfies_constraints(fd) is True: component_sizes = [] # Create molecular graph once for this fragment G = mol_to_nx(mol) kwargs['G'] = G if self.componentSmarts =='cyclic': # treat cyclic groups separately # Use first SMARTS pattern for cyclic match_count, component_sizes, matched1_len, matched_components = self.find_aryl_components(mol, component_solver=component_solver, **kwargs) elif self.smarts is not None and len(self.smarts) > 0: # Handle groups with SMARTS patterns match_count, component_sizes, matched1_len, matched_components = self.find_alkyl_components(mol, component_solver, **kwargs) elif self.componentSmarts is not None: # treat cases with only componentSmarts defined (no SMARTS patterns), find all components of that path type available = set(component_solver.componentSmartss.keys()) if isinstance(self.componentSmarts, (list, tuple, set)): # Only keep paths available in the solver (respects halogen filter) component_types = [cs for cs in self.componentSmarts if cs in available] elif self.componentSmarts in available: component_types = [self.componentSmarts] else: component_types = [] # Collect unique components across all types; deduplicate by atom-set so # the same carbon substructure is only reported once even when multiple # per-halogen SMARTS types are listed (e.g. perhalogenated alkyl groups). seen_atom_sets = set() matched_components = [] for comp_type in component_types: comps = component_solver.get(comp_type, max_dist=self.max_dist_from_comp, default=[]) for comp in comps: key = frozenset(comp) if key in seen_atom_sets: continue # Per-component formula constraint check defined in # component_smarts_halogens.json (e.g. 'gte': {'F': 2}). # Done before adding to seen_atom_sets so the same atom-set # can still be accepted via a different comp_type. comp_constraints = self._comp_type_to_constraints.get(comp_type, {}) if comp_constraints: full_comp = component_solver.get_full_component_atoms(comp) comp_formula = {} for idx in full_comp: sym = mol.GetAtomWithIdx(idx).GetSymbol() comp_formula[sym] = comp_formula.get(sym, 0) + 1 if not self._check_component_constraints(comp_formula, comp_constraints): continue seen_atom_sets.add(key) matched_components.append( component_solver.get_matched_component_dict(comp, None, comp_type, self) ) match_count = len(matched_components) component_sizes = [comp.get('size', 0) for comp in matched_components] matched1_len = match_count # Set matched1_len to enable group matching else: # treat cases with no SMARTS patterns, just formula constraints match_count = 1 matched_components = [] matched1_len = 1 if match_count > 0 and matched1_len > 0: # add to matches if functional group was found group_matches.append((self, match_count, component_sizes, matched_components)) else: return None return group_matches return None
[docs] def test(self, test_data=None): """Test this PFAS group against test molecules from metadata. Validates that the group correctly identifies positive examples and rejects negative examples based on test metadata in PFAS_groups_smarts.json. Parameters ---------- test_data : dict, optional Test metadata dictionary. If None, will be loaded from the group's entry in PFAS_groups_smarts.json. Expected keys: ``category``, ``examples``, ``generate``. Returns ------- dict Test results with keys: ``passed`` (bool), ``total_tests`` (int), ``failures`` (list of dicts), ``category`` (str). Notes ----- - For OECD groups: Tests against curated positive examples - For telomer groups: Tests generated molecules based on smiles patterns - For generic groups: Tests both positive and negative examples - Returns detailed failure information for debugging """ from .core import n_from_formula from .ComponentsSolverModel import ComponentsSolver results = { 'passed': True, 'total_tests': 0, 'failures': [], 'category': self.test_dict.get('category', 'unknown') if self.test_dict else 'unknown' } # Load test data if not provided if self.test_dict is None: return results # No tests defined for this group # Test positive examples examples = self.test_dict.get('examples', []) for smiles in examples: results['total_tests'] += 1 try: mol = Chem.MolFromSmiles(smiles) if mol is None: results['passed'] = False results['failures'].append({ 'smiles': smiles, 'expected': True, 'got': None, 'error': 'Invalid SMILES' }) continue # Add hydrogens as done in parser mol = Chem.AddHs(mol) # Create ComponentsSolver for this molecule with ComponentsSolver(mol) as component_solver: # Get formula dict formula = CalcMolFormula(mol) fd = n_from_formula(formula) # Use find_components to check if group matches matches = self.find_components(mol, fd, component_solver) is_match = matches is not None and len(matches) > 0 if not is_match: results['passed'] = False results['failures'].append({ 'smiles': smiles, 'expected': True, 'got': False, 'error': 'Group should match but did not' }) except Exception as e: results['passed'] = False results['failures'].append({ 'smiles': smiles, 'expected': True, 'got': None, 'error': f'Exception: {str(e)}' }) # Test positive examples examples = self.test_dict.get('counter-examples', []) for smiles in examples: results['total_tests'] += 1 try: mol = Chem.MolFromSmiles(smiles) if mol is None: results['passed'] = False results['failures'].append({ 'smiles': smiles, 'expected': False, 'got': None, 'error': 'Invalid SMILES' }) continue # Add hydrogens as done in parser mol = Chem.AddHs(mol) # Create ComponentsSolver for this molecule with ComponentsSolver(mol) as component_solver: # Get formula dict formula = CalcMolFormula(mol) fd = n_from_formula(formula) # Use find_components to check if group matches matches = self.find_components(mol, fd, component_solver) is_match = matches is not None and len(matches) > 0 if is_match: results['passed'] = False results['failures'].append({ 'smiles': smiles, 'expected': False, 'got': True, 'error': 'Group should match but did not' }) except Exception as e: results['passed'] = False results['failures'].append({ 'smiles': smiles, 'expected': True, 'got': None, 'error': f'Exception: {str(e)}' }) return results