Source code for PFASGroups.HalogenGroupModel

from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
import re
from .core import mol_to_nx, add_componentSmarts





[docs]
class HalogenGroup():
    """Model class representing a specific halogenated functional group with structural patterns.

    A HalogenGroup defines a specific halogenated functional group using SMARTS patterns,
    component path types, and molecular formula constraints. Groups are used to classify
    molecules into specific categories (e.g., "Perfluoroalkyl carboxylic acid").

    Attributes
    ----------
    id : int
        Unique identifier for this Halogen group
    name : str
        Human-readable group name (e.g., "Perfluoroalkyl carboxylic acid")
    smarts : Chem.Mol or None
        SMARTS patterns (compiled RDKit molecule) for functional group detection.
        None if group is defined by componentSmarts alone.
    componentSmarts : list, str or None
    componentForm: str or None
    componentHalogens: list, str or None
    componentSaturation: str or None (-> both)
    max_dist_from_comp : int
        Maximum graph distance (number of bonds) from fluorinated component to functional group.
        When > 0, extends component search radius to find nearby functional groups.
    linker_smarts : Chem.Mol or None
        Compiled SMARTS pattern for validating linker atoms between fluorinated component
        and functional group. When None (default), no restriction is applied to linker atoms.
        Only used when max_dist_from_comp > 0.
    constraints : dict
        Molecular formula constraints with keys:
        - 'only': Elements that must be present exclusively (e.g., ['C', 'F', 'O'])
        - 'gte': Minimum element counts (e.g., {'C': 2})
        - 'lte': Maximum element counts (e.g., {'O': 2})
        - 'eq': Exact element counts (e.g., {'N': 1})
        - 'rel': Relational constraints (e.g., {'O': {'atoms': ['C'], 'div': 2, 'add': 0}})

    Examples
    --------
    >>> # Perfluoroalkyl carboxylic acid: R_F-COOH
    >>> pfaa = HalogenGroup(
    ...     id=1,
    ...     name="Perfluoroalkyl carboxylic acid",
    ...     smarts={"C(=O)O":1},  # Carboxylic acid group
    ...     componentSmarts="Perfluoroalkyl",
    ...     constraints={"only": ["C", "F", "O", "H"]},
    ...     max_dist_from_comp=0,
    ...     linker_smarts=None
    ... )

    Notes
    -----
    - SMARTS patterns are compiled on initialization for efficient matching
    - Constraints are validated when checking if a molecule belongs to this group
    - max_dist_from_comp allows finding functional groups connected via non-fluorinated linkers
    - linker_smarts restricts which atoms can be in the path between component and functional group
    """
    @add_componentSmarts()
    def __init__(self, id, name,**kwargs):
        self.id = id
        self.name = name
        smarts = kwargs.get('smarts',{})
        # Save original SMARTS strings for atom counting
        if smarts and len(smarts) > 0:
            self.smarts_str, self.smarts_count = zip(*smarts.items())
        else:
            self.smarts_str = None
            self.smarts_count = None
        self.smarts = [] if self.smarts_str else None
        self.componentSmarts = kwargs.get('componentSmarts',None)
        self.componentSaturation = kwargs.get('componentSaturation',None)
        self.componentHalogens = kwargs.get('componentHalogens', kwargs.get('componentHalogen', None))
        self.componentForm = kwargs.get("componentForm", None)
        self._comp_type_to_halogen = {}      # populated by set_component_smarts
        self._comp_type_to_constraints = {}  # populated by set_component_smarts
        self.set_component_smarts(kwargs.get('componentSmartss', {}))
        self.excludeHalogens = kwargs.get('excludeHalogens', None)
        self.max_dist_from_comp = kwargs.get('max_dist_from_comp', 0)
        # Compile linker_smarts pattern if provided
        linker_smarts_str = kwargs.get('linker_smarts', None)
        self.linker_smarts = None
        if linker_smarts_str is not None:
            try:
                self.linker_smarts = Chem.MolFromSmarts(linker_smarts_str)
                self.linker_smarts.UpdatePropertyCache()
                Chem.GetSymmSSSR(self.linker_smarts)
                self.linker_smarts.GetRingInfo().NumRings()
            except:
                raise ValueError(f"Invalid linker_smarts pattern '{linker_smarts_str}' for HalogenGroup '{self.name}' (ID: {self.id})")
        if self.smarts_str is not None:
            for smarts_pattern in self.smarts_str:
                if smarts_pattern and smarts_pattern != "":
                    try:
                        smarts_mol = Chem.MolFromSmarts(smarts_pattern)
                        smarts_mol.UpdatePropertyCache()
                        Chem.GetSymmSSSR(smarts_mol)
                        smarts_mol.GetRingInfo().NumRings()
                        self.smarts.append(smarts_mol)
                    except:
                        raise ValueError(f"Invalid SMARTS pattern(s) for HalogenGroup '{self.name}' (ID: {self.id})")
        self.constraints = kwargs.get('constraints',{})
        # Precompute number of extra atoms in SMARTS patterns (beyond matched atom and H/F/Cl/Br/I)
        self.smarts_extra_atoms = self._count_smarts_extra_atoms(self.smarts_str)
        self.component_specific_extra_atoms = []
        self.all_matches = []
        self.compute = kwargs.get('compute',True)# whether the HalogenGroups needs to be parsed, or is an aggregate group, e.g. telomers
        self.re_search = kwargs.get('re_search',None)# regex for aggregate groups, e.g. telomers
        if self.re_search is not None:
            try:
                self.re_search = re.compile(self.re_search)
            except Exception as e:
                raise Exception(f"Error for agg Group {self.id}: {self.name}\n {e}")
        self.test_dict = kwargs.get('test',None)# test dict for unit tests

[docs]
    def set_component_smarts(self, componentSmartss):
        """
        Infers componentSmarts based on componentSmarts, componentSaturation, componentForm and componentHalogen
        """
        if not componentSmartss:
            return
        # Build comp_type → halogen and comp_type → constraints mappings
        self._comp_type_to_halogen = {
            k: v['halogen']
            for k, v in componentSmartss.items()
            if isinstance(v, dict) and 'halogen' in v
        }
        self._comp_type_to_constraints = {
            k: v['constraints']
            for k, v in componentSmartss.items()
            if isinstance(v, dict) and v.get('constraints')
        }
        # if componentSmarts is None, or is not in (entirely) in the available componentSmartss
        if self.componentSmarts is None or (isinstance(self.componentSmarts,list) and not set(self.componentSmarts).issubset(componentSmartss.keys()) or (isinstance(self.componentSmarts, str) and not self.componentSmarts in componentSmartss.keys())):
            new_CS = []
            try:
                componentSmarts_dict = {}
                for k,v in componentSmartss.items():
                    if not isinstance(v, dict):
                        continue
                    halogen = v.get('halogen')
                    form = v.get('form')
                    saturation = v.get('saturation')
                    if halogen is None or form is None or saturation is None:
                        continue
                    componentSmarts_dict.setdefault(halogen,{}).setdefault(form,{})[saturation]=k
            except Exception as e:
                raise ValueError(f"Error processing componentSmartss for HalogenGroup '{self.name}' (ID: {self.id}): {e}")
            if not componentSmarts_dict:
                # No metadata available to infer component SMARTS
                return
            # prepare halogens (accepts list, str or None)
            if isinstance(self.componentHalogens,list) and len(set(self.componentHalogens).intersection(['F','Cl','Br','I'])) == 0:
                raise ValueError(f"Invalid componentHalogens for HalogenGroup '{self.name}' (ID: {self.id})")
            if self.componentHalogens is None:
                self.componentHalogens = ['F','Cl','Br','I']
            elif isinstance(self.componentHalogens, str) and self.componentHalogens in ['F','Cl','Br','I']:
                self.componentHalogens = [self.componentHalogens]
            if self.componentSaturation is None:
                self.componentSaturation = ['per','poly']
            elif isinstance(self.componentSaturation, str) and self.componentSaturation in ['per','poly','both']:
                self.componentSaturation = ['per','poly'] if self.componentSaturation == 'both' else [self.componentSaturation]
            elif isinstance(self.componentSaturation, list):
                self.componentSaturation = self.componentSaturation
            else:
                raise ValueError(f"Invalid componentSaturation for HalogenGroup '{self.name}' (ID: {self.id}), expected 'per', 'poly', 'both' or null, got {self.componentSaturation}")
            if self.componentForm is None:
                self.componentForm = 'alkyl'
            for halogen in self.componentHalogens:
                if halogen not in componentSmarts_dict:
                    raise ValueError(f"No component SMARTS available for halogen '{halogen}' in HalogenGroup '{self.name}' (ID: {self.id})")
                if self.componentForm not in componentSmarts_dict[halogen]:
                    raise ValueError(f"No component SMARTS available for form '{self.componentForm}' and halogen '{halogen}' in HalogenGroup '{self.name}' (ID: {self.id})")
                for saturation in self.componentSaturation:
                    if saturation not in componentSmarts_dict[halogen][self.componentForm]:
                        raise ValueError(f"No component SMARTS available for saturation '{saturation}', form '{self.componentForm}', halogen '{halogen}' in HalogenGroup '{self.name}' (ID: {self.id})")
                    new_CS.append(componentSmarts_dict[halogen][self.componentForm][saturation])
            self.componentSmarts = new_CS



[docs]
    def set_componentSmarts(self, componentSmartss):
        """Backward-compatible alias for set_component_smarts."""
        return self.set_component_smarts(componentSmartss)

    def _count_smarts_extra_atoms(self, smarts_str):
        """Count number of extra carbon atoms in functional group beyond what's captured by component.

        Parameters
        ----------
        smarts_str : str or None
            Original SMARTS string before compilation
        Returns
        -------
        int
            Number of extra carbon atoms beyond the matched atom

        Notes
        -----
        The component fraction calculation is now based on carbon atoms only:
        1. Carbon atoms in component
        2. Carbon atoms in SMARTS matches
        3. Additional carbon atoms from SMARTS (this return value)


        For automatic counting (when manual_size is None), this returns 0 since we now focus only
        on carbons and they are already counted in the component and SMARTS matches.
        """
        PAT_c = re.compile(r'((C(?![adeflmnorsu]))|((?<![TAS])c)|(\#6))')  # Match 'C' not followed by a letter, or c not preceded by T,A,S or #6
        return [max(0,len(PAT_c.findall(s))-1) for s in smarts_str] if smarts_str is not None else None


    def __str__(self):
        return self.name

    @staticmethod
    def _check_component_constraints(comp_formula, constraints):
        """Check a component's element formula against component-level constraints.

        Parameters
        ----------
        comp_formula : dict
            ``{element_symbol: count}`` for all atoms in the full component
            (backbone + attached H/halogens).
        constraints : dict
            Component constraints with optional keys:

            * ``'gte'`` – ``{element: min_count}``; component must have at least
              *min_count* atoms of *element*.
            * ``'exclude'`` – list of element symbols that must be absent from
              the component.

        Returns
        -------
        bool
        """
        for elem, n in constraints.get('gte', {}).items():
            if comp_formula.get(elem, 0) < n:
                return False
        for elem in constraints.get('exclude', []):
            if comp_formula.get(elem, 0) > 0:
                return False
        return True


[docs]
    def constraint_gte(self, formula_dict):
        """Check 'greater than or equal' constraints on element counts.

        Parameters
        ----------
        formula_dict : dict
            Molecular formula as {element: count} dictionary

        Returns
        -------
        bool
            True if all 'gte' constraints are satisfied, False otherwise

        Examples
        --------
        >>> # Requires at least 2 carbons and 3 fluorines
        >>> group.constraints = {'gte': {'C': 2, 'F': 3}}
        >>> group.constraint_gte({'C': 3, 'F': 5, 'O': 1})  # True
        >>> group.constraint_gte({'C': 1, 'F': 5, 'O': 1})  # False (C < 2)
        """
        success = True
        for e,n in self.constraints.get('gte',{}).items():
            success = success and formula_dict.get(e,0)>=n
        return success


[docs]
    def constraint_lte(self, formula_dict):
        """Check 'less than or equal' constraints on element counts.

        Parameters
        ----------
        formula_dict : dict
            Molecular formula as {element: count} dictionary

        Returns
        -------
        bool
            True if all 'lte' constraints are satisfied, False otherwise

        Examples
        --------
        >>> # Requires at most 2 oxygens
        >>> group.constraints = {'lte': {'O': 2}}
        >>> group.constraint_lte({'C': 8, 'F': 15, 'O': 2})  # True
        >>> group.constraint_lte({'C': 8, 'F': 15, 'O': 3})  # False (O > 2)
        """
        success = True
        for e,n in self.constraints.get('lte',{}).items():
            success = success and formula_dict.get(e,0)<=n
        return success


[docs]
    def constraint_eq(self, formula_dict):
        """Check 'equal to' constraints on element counts.

        Parameters
        ----------
        formula_dict : dict
            Molecular formula as {element: count} dictionary

        Returns
        -------
        bool
            True if all 'eq' constraints are satisfied, False otherwise

        Examples
        --------
        >>> # Requires exactly 1 nitrogen
        >>> group.constraints = {'eq': {'N': 1}}
        >>> group.constraint_eq({'C': 8, 'F': 15, 'N': 1})  # True
        >>> group.constraint_eq({'C': 8, 'F': 15, 'N': 2})  # False (N != 1)
        """
        success = True
        for e,n in self.constraints.get('eq',{}).items():
            success = success and formula_dict.get(e,0)==n
        return success


[docs]
    def constraint_only(self, formula_dict):
        """Check 'only' constraint - molecule must contain only specified elements.

        Parameters
        ----------
        formula_dict : dict
            Molecular formula as {element: count} dictionary

        Returns
        -------
        bool
            True if molecule contains only the allowed elements, False otherwise

        Examples
        --------
        >>> # Molecule must contain only C, F, O, H
        >>> group.constraints = {'only': ['C', 'F', 'O', 'H']}
        >>> group.constraint_only({'C': 8, 'F': 15, 'O': 2, 'H': 1})  # True
        >>> group.constraint_only({'C': 8, 'F': 15, 'O': 2, 'S': 1})  # False (S not allowed)

        Notes
        -----
        Checks that sum of allowed elements equals total atoms in molecule.
        """
        success = True
        if 'only' in self.constraints.keys():
            tot = sum(formula_dict.values())
            nn = 0
            for e in self.constraints['only']:
                nn += formula_dict.get(e,0)
            success = success and tot == nn
        return success


[docs]
    def constraint_rel(self, formula_dict):
        """Check relational constraints between element counts.

        Validates relationships of the form: count(element) = f(other_elements)
        where f can include division, addition, and summing other element counts.

        Parameters
        ----------
        formula_dict : dict
            Molecular formula as {element: count} dictionary

        Returns
        -------
        bool
            True if all relational constraints are satisfied, False otherwise

        Notes
        -----
        Constraint Format::

            'rel': {
                'ElementA': {
                    'atoms': ['ElementB', 'ElementC'],  # Elements to sum
                    'div': int,  # Divisor (default 1)
                    'add': int,  # Additive constant (default 0)
                    'add_atoms': ['ElementD']  # Additional elements to add
                }
            }

        Formula: count(ElementA) = (sum(atoms) / div) + add + sum(add_atoms)

        Examples
        --------
        >>> # Carbon count must equal half the fluorine count
        >>> group.constraints = {'rel': {'C': {'atoms': ['F'], 'div': 2, 'add': 0}}}
        >>> group.constraint_rel({'C': 4, 'F': 8, 'O': 2})  # True (4 == 8/2)
        >>> group.constraint_rel({'C': 3, 'F': 8, 'O': 2})  # False (3 != 8/2)

        >>> # Oxygen count must equal carbon count plus 1
        >>> group.constraints = {'rel': {'O': {'atoms': ['C'], 'div': 1, 'add': 1}}}
        >>> group.constraint_rel({'C': 3, 'F': 7, 'O': 4})  # True (4 == 3 + 1)
        """
        success = True
        for e,v in self.constraints.get('rel',{}).items():
            n = sum([formula_dict.get(x,0) for x in v.get('atoms',[])])
            success = success and formula_dict.get(e,0)==n/v.get('div',1)+v.get('add',0)+sum([formula_dict.get(x,0) for x in v.get('add_atoms',[])])
        return success


[docs]
    def formula_dict_satisfies_constraints(self,formula_dict):
        """Check if a molecular formula satisfies all constraints for this PFAS group.

        Evaluates all constraint types in order: relational → only → equal → lte → gte.
        Stops evaluation at first failure for efficiency.

        Parameters
        ----------
        formula_dict : dict
            Molecular formula as {element: count} dictionary (e.g., {'C': 8, 'F': 17, 'O': 2})

        Returns
        -------
        bool
            True if all constraints are satisfied, False if any constraint fails

        Constraint Evaluation Order
        ---------------------------
        1. Relational constraints ('rel') - element count relationships
        2. 'Only' constraints - allowed elements
        3. Equality constraints ('eq') - exact element counts
        4. Upper bound constraints ('lte') - maximum element counts
        5. Lower bound constraints ('gte') - minimum element counts

        Examples
        --------
        >>> # Perfluoroalkyl carboxylic acid constraints
        >>> group.constraints = {
        ...     'only': ['C', 'F', 'O', 'H'],  # No other elements
        ...     'gte': {'C': 2, 'F': 3},  # At least 2 carbons, 3 fluorines
        ...     'eq': {'O': 2}  # Exactly 2 oxygens
        ... }
        >>> group.formula_dict_satisfies_constraints({'C': 8, 'F': 15, 'O': 2, 'H': 1})
        True
        >>> group.formula_dict_satisfies_constraints({'C': 8, 'F': 15, 'O': 3, 'H': 1})
        False  # Fails 'eq': {'O': 2}

        Notes
        -----
        - Returns True immediately if no constraints are defined
        - Short-circuits on first constraint failure for performance
        - Constraint evaluation order is fixed for consistency
        """
        if len(self.constraints.keys())==0:
            return True
        success = True
        process = [None,self.constraint_rel,self.constraint_only,self.constraint_eq, self.constraint_lte, self.constraint_gte]
        k = process.pop()
        while success and k is not None:
            success = k(formula_dict)
            k = process.pop()
        return success


[docs]
    def find_matched_atoms(self, mol):
        """Find all substructure matches of this PFAS group's SMARTS patterns in a molecule.

        Parameters
        ----------
        mol : Chem.Mol
            RDKit molecule object to search for matches

        Returns
        -------
        List[List[int]]
            List of matches, where each match is a list of atom indices in the molecule

        Notes
        -----
        - If no SMARTS patterns are defined, returns an empty list.
        - Each SMARTS pattern is searched independently; matches from all patterns are combined.
        """
        self.all_matches = []
        self.subset = set()
        if self.smarts is not None:
            for smarts_mol,min_count in zip(self.smarts, self.smarts_count):
                matches = mol.GetSubstructMatches(smarts_mol)
                if len(matches) < min_count:
                    return False
                if len(matches)>0:
                    # Keep matches as list to preserve duplicates when same atoms matched multiple times
                    # (e.g., dicarboxylic/disulfonic acids where both groups attach to same carbon)
                    self.all_matches.append(list(matches))
                    self.subset.update({y for x in matches for y in x if len(x)>0})
        return True


[docs]
    def component_satisfies_all_smarts(self, component):
        """Check if a fluorinated component matches all SMARTS patterns of this PFAS group.

        Parameters
        ----------
        component : PFASComponent
            PFASComponent object representing a fluorinated component in the molecule

        Returns
        -------
        bool
            True if the component matches all SMARTS patterns, False otherwise

        Notes
        -----
        - If no SMARTS patterns are defined for this group, returns True.
        - Each SMARTS pattern must have at least one match that includes the component's atom.
        """
        atom_count = 0
        for i, (matches, min_count) in enumerate(zip(self.all_matches,self.smarts_count)):
            # Count how many SMARTS matches have overlap with this component
            # matches is a set of tuples, each tuple represents one SMARTS match
            component_set = set(component)
            found = sum(1 for match_tuple in matches if any(atom_idx in component_set for atom_idx in match_tuple))

            if found < min_count:
                self.component_specific_extra_atoms.append(0)
                return False
            atom_count += found * self.smarts_extra_atoms[i]
        self.component_specific_extra_atoms.append(atom_count)
        return True



[docs]
    def find_alkyl_components(self, mol, component_solver, **kwargs):
        """Find fluorinated components in a molecule that match this PFAS group's criteria.

        Parameters
        ----------
        mol : Chem.Mol
            RDKit molecule object to search
        components : List[PFASComponent]
            List of PFASComponent objects representing fluorinated components in the molecule

        Returns
        -------
        List[PFASComponent]
            List of PFASComponent objects that match this PFAS group's criteria

        Notes
        -----
        - Matches are determined based on componentSmarts and max_dist_from_comp attributes.
        - If componentSmarts is None, all components are considered.
        - max_dist_from_comp allows extending the search radius for functional groups.
        """
        if not self.find_matched_atoms(mol):
            return 0, [], 0, []

        # Clear component-specific extra atoms list for this matching attempt
        self.component_specific_extra_atoms = []

        if self.componentSmarts is None:
            # If no componentSmarts specified, only check alkyl components (not cyclic)
            # This ensures functional groups like carboxylic acid (group 33) are only
            # detected when attached to perfluoroalkyl or polyfluoroalkyl chains,
            # not when attached directly to cyclic structures
            componentSmartss = []
            for path_type, meta in component_solver.componentSmartss.items():
                if isinstance(meta, dict) and meta.get('form') == 'cyclic':
                    continue
                componentSmartss.append(path_type)
        elif isinstance(self.componentSmarts, (list, tuple, set)):
            # Only keep paths the solver knows about (respects halogen filtering)
            available = set(component_solver.componentSmartss.keys())
            componentSmartss = [cs for cs in self.componentSmarts if cs in available]
        else:
            componentSmartss = [self.componentSmarts]

        # Preload components (optional, not strictly required for filtering)
        components = []
        for comp_type in componentSmartss:
            comps = component_solver.get(comp_type, max_dist=self.max_dist_from_comp, default=[])
            components.extend(comps)

        # Filter components connected to the smarts and get augmented versions
        augmented_matched_components = []
        for _componentSmarts in componentSmartss:
            extended_components = component_solver.get(_componentSmarts, self.max_dist_from_comp, [])
            for i, comp in enumerate(extended_components):
                # Check if this component is connected to SMARTS matches
                if self.component_satisfies_all_smarts(comp):
                    augmented = component_solver.get_augmented_component(
                        _componentSmarts, self.max_dist_from_comp, i, self.subset, self.linker_smarts
                    )
                    # Accept augmented component if valid (linker validation already done in get_augmented_component)
                    if augmented is not None and len(augmented) > 0:
                        augmented_matched_components.append(
                            component_solver.get_matched_component_dict(augmented, self.subset, _componentSmarts, self, comp_id = i)
                        )

        if len(augmented_matched_components) == 0:
            return 0, [], 0, []

        # Get all component sizes from all path types
        all_components = list(set([comp for comps in augmented_matched_components for comp in comps]))
        component_sizes = [len(x) for x in all_components]

        self.all_matches = []  # Clear matches after use
        self.component_specific_extra_atoms = []
        return max([0] + component_sizes), component_sizes, len(all_components), augmented_matched_components



[docs]
    def find_aryl_components(self,mol, component_solver=None, **kwargs):
        """Find aryl components in a molecule with comprehensive metrics."""
        matches = mol.GetSubstructMatches(self.smarts[0])
        subset = [y for x in matches for y in x]
        if len(subset)==0:
            return 0, [], 0, []

        components = component_solver._connected_components(subset)
        component_sizes = [len(x) for x in components]

        # Get molecular graph for metrics calculation
        subset_set = set(subset)

        # Convert components to the same format as other functions return with comprehensive metrics
        matched_components = []
        for comp in components:
            matched_components.append(
                component_solver.get_matched_component_dict(comp, subset_set, 'cyclic', self)
            )

        return max([0]+[len(x) for x in components]), component_sizes, len(components), matched_components



[docs]
    def find_components(self, mol, fd, component_solver, **kwargs):
        """Find fluorinated components in a molecule that match this PFAS group's criteria."""
        group_matches = []
        if self.formula_dict_satisfies_constraints(fd) is True:
            component_sizes = []
            # Create molecular graph once for this fragment
            G = mol_to_nx(mol)
            kwargs['G'] = G
            if self.componentSmarts =='cyclic':
                # treat cyclic groups separately
                # Use first SMARTS pattern for cyclic
                match_count, component_sizes, matched1_len, matched_components = self.find_aryl_components(mol, component_solver=component_solver, **kwargs)
            elif self.smarts is not None and len(self.smarts) > 0:
                # Handle groups with SMARTS patterns
                match_count, component_sizes, matched1_len, matched_components = self.find_alkyl_components(mol, component_solver, **kwargs)
            elif self.componentSmarts is not None:
                # treat cases with only componentSmarts defined (no SMARTS patterns), find all components of that path type
                available = set(component_solver.componentSmartss.keys())
                if isinstance(self.componentSmarts, (list, tuple, set)):
                    # Only keep paths available in the solver (respects halogen filter)
                    component_types = [cs for cs in self.componentSmarts if cs in available]
                elif self.componentSmarts in available:
                    component_types = [self.componentSmarts]
                else:
                    component_types = []
                # Collect unique components across all types; deduplicate by atom-set so
                # the same carbon substructure is only reported once even when multiple
                # per-halogen SMARTS types are listed (e.g. perhalogenated alkyl groups).
                seen_atom_sets = set()
                matched_components = []
                for comp_type in component_types:
                    comps = component_solver.get(comp_type, max_dist=self.max_dist_from_comp, default=[])
                    for comp in comps:
                        key = frozenset(comp)
                        if key in seen_atom_sets:
                            continue
                        # Per-component formula constraint check defined in
                        # component_smarts_halogens.json (e.g. 'gte': {'F': 2}).
                        # Done before adding to seen_atom_sets so the same atom-set
                        # can still be accepted via a different comp_type.
                        comp_constraints = self._comp_type_to_constraints.get(comp_type, {})
                        if comp_constraints:
                            full_comp = component_solver.get_full_component_atoms(comp)
                            comp_formula = {}
                            for idx in full_comp:
                                sym = mol.GetAtomWithIdx(idx).GetSymbol()
                                comp_formula[sym] = comp_formula.get(sym, 0) + 1
                            if not self._check_component_constraints(comp_formula, comp_constraints):
                                continue
                        seen_atom_sets.add(key)
                        matched_components.append(
                            component_solver.get_matched_component_dict(comp, None, comp_type, self)
                        )
                match_count = len(matched_components)
                component_sizes = [comp.get('size', 0) for comp in matched_components]
                matched1_len = match_count  # Set matched1_len to enable group matching
            else:
                # treat cases with no SMARTS patterns, just formula constraints
                match_count = 1
                matched_components = []
                matched1_len = 1
            if match_count > 0 and matched1_len > 0:
                # add to matches if functional group was found
                group_matches.append((self, match_count, component_sizes, matched_components))
            else:
                return None
            return group_matches
        return None



[docs]
    def test(self, test_data=None):
        """Test this PFAS group against test molecules from metadata.

        Validates that the group correctly identifies positive examples and
        rejects negative examples based on test metadata in PFAS_groups_smarts.json.

        Parameters
        ----------
        test_data : dict, optional
            Test metadata dictionary. If None, will be loaded from the group's
            entry in PFAS_groups_smarts.json. Expected keys: ``category``,
            ``examples``, ``generate``.

        Returns
        -------
        dict
            Test results with keys: ``passed`` (bool), ``total_tests`` (int),
            ``failures`` (list of dicts), ``category`` (str).

        Notes
        -----
        - For OECD groups: Tests against curated positive examples
        - For telomer groups: Tests generated molecules based on smiles patterns
        - For generic groups: Tests both positive and negative examples
        - Returns detailed failure information for debugging
        """
        from .core import n_from_formula
        from .ComponentsSolverModel import ComponentsSolver



        results = {
            'passed': True,
            'total_tests': 0,
            'failures': [],
            'category': self.test_dict.get('category', 'unknown') if self.test_dict else 'unknown'
        }
        # Load test data if not provided
        if self.test_dict is None:
            return results # No tests defined for this group
        # Test positive examples
        examples = self.test_dict.get('examples', [])
        for smiles in examples:
            results['total_tests'] += 1
            try:
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:
                    results['passed'] = False
                    results['failures'].append({
                        'smiles': smiles,
                        'expected': True,
                        'got': None,
                        'error': 'Invalid SMILES'
                    })
                    continue

                # Add hydrogens as done in parser
                mol = Chem.AddHs(mol)

                # Create ComponentsSolver for this molecule
                with ComponentsSolver(mol) as component_solver:
                    # Get formula dict
                    formula = CalcMolFormula(mol)
                    fd = n_from_formula(formula)

                    # Use find_components to check if group matches
                    matches = self.find_components(mol, fd, component_solver)
                    is_match = matches is not None and len(matches) > 0

                if not is_match:
                    results['passed'] = False
                    results['failures'].append({
                        'smiles': smiles,
                        'expected': True,
                        'got': False,
                        'error': 'Group should match but did not'
                    })
            except Exception as e:
                results['passed'] = False
                results['failures'].append({
                    'smiles': smiles,
                    'expected': True,
                    'got': None,
                    'error': f'Exception: {str(e)}'
                })
        # Test positive examples
        examples = self.test_dict.get('counter-examples', [])
        for smiles in examples:
            results['total_tests'] += 1
            try:
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:
                    results['passed'] = False
                    results['failures'].append({
                        'smiles': smiles,
                        'expected': False,
                        'got': None,
                        'error': 'Invalid SMILES'
                    })
                    continue

                # Add hydrogens as done in parser
                mol = Chem.AddHs(mol)

                # Create ComponentsSolver for this molecule
                with ComponentsSolver(mol) as component_solver:
                    # Get formula dict
                    formula = CalcMolFormula(mol)
                    fd = n_from_formula(formula)

                    # Use find_components to check if group matches
                    matches = self.find_components(mol, fd, component_solver)
                    is_match = matches is not None and len(matches) > 0

                if is_match:
                    results['passed'] = False
                    results['failures'].append({
                        'smiles': smiles,
                        'expected': False,
                        'got': True,
                        'error': 'Group should match but did not'
                    })
            except Exception as e:
                results['passed'] = False
                results['failures'].append({
                    'smiles': smiles,
                    'expected': True,
                    'got': None,
                    'error': f'Exception: {str(e)}'
                })
        return results