Tutorial

from PFASGroups import parse_smiles, prioritise_molecules, plot_HalogenGroups

results  = parse_smiles(["CCCC(F)(F)F", "FC(F)(F)C(=O)O"])
arr      = results.to_array()                     # (2, 114) binary embedding
pca      = results.perform_pca(color_by='top_group')
ranked   = prioritise_molecules(["CCCC(F)(F)F", "FC(F)(F)C(=O)O"])

Sample data

All examples on this page share the following molecule list. Copy it once at the top of your script or notebook:

smiles_list = [
    "C[Si](C)(Cl)CCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F",
    "FC(=C(C(F)(F)F)C(F)(F)F)C(F)(F)C(F)(F)F",
    "FC(F)(F)C(F)(C(F)(F)F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(C(F)(F)F)C(F)(F)F",
    "C[Si](Cl)(Cl)CCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F",
    "FC(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F",
    "CC(=O)C(F)(F)C(F)(F)C(F)(F)F",
    "FC(F)=C(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F",
    "FC(F)(F)N1C(F)(F)C(F)(F)C(F)(F)C(F)(F)C1(F)F",
    "FC(F)(F)C(F)(F)C(F)(F)CCl",
    "OCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F",
    "OS(=O)(=O)CCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F",
    "FC(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)COC(=O)C=C",
    "FC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(=O)OC=C",
    "FC(F)(F)C(I)(C(F)(F)F)C(F)(F)F",
    "OC(C(F)(F)C(F)(F)F)(C(F)(F)C(F)(F)F)C(F)(F)C(F)(F)F",
    "C[Si](Cl)(Cl)CCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F",
    "FC(F)=C(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F",
    "CCN(CC)CC.NS(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F",
    "[OH-].C[N+](C)(C)CCCNS(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F",
    "[K+].[O-]S(=O)(=O)C(F)(F)C(F)(C(F)(F)F)C(F)(F)F",
    "OCCN(CCO)S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)F",
    "FC(F)(F)C(F)(F)C(F)(F)C(F)(F)N(C(F)(F)C(F)(F)C(F)(F)C(F)(F)F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F",
    "COC(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F",
    "FC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(=O)OCC=C",
    "CC(C)CCOC(=O)CC(NC(=O)C(F)(F)C(F)(F)C(F)(F)F)C(=O)OCCC(C)C",
    "CN(C(=O)C(F)(F)C(F)(F)C(F)(F)F)C(=O)C(F)(F)C(F)(F)C(F)(F)F",
    "FC(F)(F)C(F)(F)C(F)(F)C=CI",
    "OC(=O)C1=CC=CC=C1NC(=O)C(F)(F)C(F)(F)C(F)(F)F",
    "FC(F)(F)C1(F)C(F)(F)C(F)(F)C(F)(F)C(F)(C(F)(F)F)C1(F)F",
]

Parsing SMILES

parse_smiles() accepts a single SMILES string or a list and returns a PFASEmbeddingSet:

from PFASGroups import parse_smiles

results = parse_smiles(smiles_list)
print(results)        # PFASEmbeddingSet summary: molecule count, top groups, …
print(results[0])     # per-molecule summary for the first entry

results.summary()     # tabular overview of matched groups across all molecules

Iterating over results

for mol in results:
    flag = " (PFAS)" if any(m.is_PFAS for m in mol.matches) else ""
    print(f"{mol.smiles}: {len(mol.matches)} match(es){flag}")

Drilling into a single match

mol   = results[0]
match = mol.matches[0]

print("Group   :", match.group_name)
print("Group ID:", match.group_id)
print("Is PFAS :", match.is_PFAS)
print("Category:", match.group_category)

for i, comp in enumerate(match.components):
    print(f"  Component {i}: atoms {comp.atoms}")
    print(f"  EGR: {comp.effective_graph_resistance}")

Exporting to DataFrame

df = results.to_dataframe()
# columns: smiles, inchi, group_name, group_id, is_PFAS,
#          group_category, n_components, component_atoms, …
df.to_csv("results.csv", index=False)

Parsing RDKit Mol objects

parse_mols() accepts a list of RDKit Mol objects directly — useful when molecules are already in memory (e.g. read from an SD file):

from rdkit import Chem
from PFASGroups import parse_mols

mols = [Chem.MolFromSmiles(s) for s in smiles_list]
results = parse_mols(mols)

print(results)
results.summary()

The returned PFASEmbeddingSet has the same interface as the one returned by parse_smiles().

Plotting results

plot_HalogenGroups() draws 2-D molecular structures with each detected PFAS group highlighted:

from PFASGroups import plot_HalogenGroups

# First three molecules, group highlights, 2-column grid
fig, w, h = plot_HalogenGroups(smiles_list[:3], ncols=2, display=True)

# Save to a file without displaying
fig, w, h = plot_HalogenGroups(smiles_list[:3], path="groups.png", display=False)

# Vector SVG output
fig, w, h = plot_HalogenGroups(smiles_list[:3], svg=True, display=True)

# One panel per group match (useful when a molecule has multiple groups)
fig, w, h = plot_HalogenGroups(smiles_list[:3], split_matches=True, ncols=3)

# Custom panel labels
labels = [f"mol {i}" for i in range(3)]
fig, w, h = plot_HalogenGroups(smiles_list[:3], panel_labels=labels, ncols=3)

For plain structure drawings without group highlights, use plot_mol() (single) or plot_mols() (grid):

from rdkit import Chem
from PFASGroups import plot_mol, plot_mols

mol  = Chem.MolFromSmiles(smiles_list[0])
mols = [Chem.MolFromSmiles(s) for s in smiles_list[:4]]

fig, w, h = plot_mol(mol, addAtomIndices=False)
fig, w, h = plot_mols(mols, ncols=2, addAtomIndices=False)

Component filters

Filter parsed results by halogen, saturation level, or structural form:

# Fluorine only (default)
r_f    = parse_smiles(smiles_list, halogens='F')

# Perfluorinated alkyl chains only
r_pfa  = parse_smiles(smiles_list, halogens='F',
                       saturation='per', form='alkyl')

# Cyclic fluorinated structures only
r_cyc  = parse_smiles(smiles_list, form='cyclic')

# Polyfluorinated components only
r_poly = parse_smiles(smiles_list, saturation='poly')

# All four halogens
r_all  = parse_smiles(smiles_list, halogens=['F', 'Cl', 'Br', 'I'])

Embeddings

to_array() converts a PFASEmbeddingSet into a 2-D EmbeddingArray (a numpy array subclass that carries molecule identity metadata). All examples reuse the pre-parsed results object — no re-parsing needed.

# Default: binary encoding, all groups → shape (n_mols, 114)
arr = results.to_array()
print(arr.shape)      # (29, 114)
print(arr.smiles)     # list of SMILES strings, one per row

# OECD groups only
arr_oecd = results.to_array(group_selection='oecd')          # (n, 27)

# Count or max-component size encoding
arr_cnt  = results.to_array(component_metrics=['count'])
arr_max  = results.to_array(component_metrics=['max_component'])

# Preset 'best': binary + effective graph resistance
# (best mean Tanimoto in benchmarks — 2 × 114 columns)
arr_best = results.to_array(preset='best')                   # (n, 228)

# Stacked multi-metric embedding
arr_multi = results.to_array(
    component_metrics=['binary', 'effective_graph_resistance',
                       'n_spacer', 'ring_size'],
    molecule_metrics=['n_components', 'max_size',
                      'mean_branching', 'mean_component_fraction'],
)

# Column labels matching the array columns
cols = results.column_names(
    component_metrics=['binary', 'effective_graph_resistance']
)
print(cols[:4])

Note

n_spacer encodes the telomer CH2 spacer length (the m in m:n telomer notation). It is zero for non-telomers.

ring_size is the smallest ring overlapping each matched component (zero for acyclic groups; 5 for azoles/furans; 6 for benzene/cyclohexane).

Saving to a database

results.to_sql(filename='results.db')   # SQLite by default

Dimensionality reduction

PCA, t-SNE, and UMAP operate directly on a PFASEmbeddingSet — no manual call to to_array() needed. Each method returns a dict whose 'transformed' key holds the projected coordinates as a numpy array.

Note

PCA and t-SNE require scikit-learn; UMAP requires umap-learn:

pip install scikit-learn umap-learn matplotlib

PCA

# Basic scatter plot — displayed automatically
pca = results.perform_pca(n_components=2, plot=True)
print(pca['transformed'].shape)   # (29, 2)

# Colour each point by the PFAS group with the highest match count
pca = results.perform_pca(n_components=2, color_by='top_group')

# Supply your own per-molecule labels (e.g. from external metadata)
labels = ['industrial'] * 15 + ['environmental'] * (len(smiles_list) - 15)
pca = results.perform_pca(n_components=2, color_by=labels)

# Save figure to a file without displaying it
pca = results.perform_pca(n_components=2, color_by='top_group',
                           output_file='pca.png', plot=False)

t-SNE

# Basic t-SNE scatter plot
tsne = results.perform_tsne(n_components=2, perplexity=8, plot=True)
print(tsne['transformed'].shape)  # (29, 2)

# Colour by top PFAS group
tsne = results.perform_tsne(perplexity=8, color_by='top_group')

# Custom per-molecule labels
tsne = results.perform_tsne(perplexity=8, color_by=labels)

# Save figure
tsne = results.perform_tsne(perplexity=8, color_by='top_group',
                             output_file='tsne.png', plot=False)

UMAP

# Basic UMAP scatter plot
umap_res = results.perform_umap(n_components=2, n_neighbors=10, plot=True)
print(umap_res['transformed'].shape)  # (29, 2)

# Colour by top PFAS group
umap_res = results.perform_umap(n_neighbors=10, color_by='top_group')

# Custom per-molecule labels
umap_res = results.perform_umap(n_neighbors=10, color_by=labels)

# Save figure
umap_res = results.perform_umap(n_neighbors=10, color_by='top_group',
                                output_file='umap.png', plot=False)

Prioritisation

prioritise_molecules() ranks molecules by structural similarity to a reference set, or by intrinsic fluorination properties when no reference is provided. It accepts SMILES lists, RDKit Mol lists, or a pre-parsed PFASEmbeddingSet.

Ranking against a reference set

Molecules are ranked by KL-divergence similarity to the reference distribution (lower divergence = more similar = higher priority):

from PFASGroups import prioritise_molecules

reference = [
    "FC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(=O)O",
]

ranked, scores = prioritise_molecules(
    molecules=smiles_list,
    reference=reference,
    return_scores=True,
)

for mol, score in zip(ranked, scores):
    print(f"{score:.4f}  {mol.smiles}")

Ranking by intrinsic fluorination

When no reference is given, molecules are scored by total fluorination (weight a) and longest chain length (weight b, at the given percentile):

ranked, scores = prioritise_molecules(
    molecules=smiles_list,
    a=1.0,          # weight for total fluorinated content
    b=5.0,          # weight for longest chain (percentile-based)
    percentile=90,  # focus on the 90th-percentile component size
    return_scores=True,
)

for mol, score in zip(ranked, scores):
    print(f"{score:.4f}  {mol.smiles}")

Passing a pre-parsed set avoids double-parsing large inventories:

ranked, scores = prioritise_molecules(
    molecules=results,       # PFASEmbeddingSet already computed above
    a=1.0, b=5.0, percentile=90,
    return_scores=True,
)

Comparing two datasets

compare_kld() returns a single normalised similarity score (0 = identical distributions, 1 = maximally different):

other_smiles = smiles_list[:15] + [
    "OC(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(O)=O",
    "OCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F",
]
other_results = parse_smiles(other_smiles)

sim = results.compare_kld(other_results, method='minmax')
print(f"Similarity: {sim:.3f}")   # lower = more similar

Multi-halogen analysis

By default PFASGroups detects fluorine only. Pass halogens explicitly to include other halogens, or import from HalogenGroups where all four are the default:

# Explicit multi-halogen parse via PFASGroups
r_mh = parse_smiles(smiles_list, halogens=['F', 'Cl', 'Br', 'I'])

# Multi-halogen embedding: 114 groups × 4 halogens = 456 columns
arr_mh = r_mh.to_array(halogens=['F', 'Cl', 'Br', 'I'])
print(arr_mh.shape)   # (29, 456)

See Multi-Halogen Analysis (Advanced) for the full multi-halogen reference.

PFAS definition screening

results_defs = parse_smiles(smiles_list, include_PFAS_definitions=True)

for mol in results_defs:
    if mol.pfas_definition_matches:
        names = [d.definition_name for d in mol.pfas_definition_matches]
        print(mol.smiles, "→", names)

Available definitions: OECD 2021, EU REACH, OPPT 2023, UK Environment Agency, PFASSTRUCTv5. See PFAS Definitions for details.

Loading and inspecting groups

from PFASGroups import get_compiled_HalogenGroups

groups = get_compiled_HalogenGroups()
print(len(groups))        # 114 compiled groups (plus aggregate groups)

for g in groups[:5]:
    print(g.name, g.category, g.id)

See Customization for how to add custom group definitions.