Gilmullin Almaz committed on
Commit
72a3513
·
1 Parent(s): 3e5f8cc

Refactor code structure for improved readability and maintainability

Browse files
Files changed (50) hide show
  1. .gitattributes +0 -35
  2. Dockerfile +0 -21
  3. README.md +11 -14
  4. app.py +1349 -0
  5. pre-requirements.txt +7 -0
  6. requirements.txt +6 -3
  7. src/streamlit_app.py +0 -40
  8. synplan/__init__.py +3 -0
  9. synplan/chem/__init__.py +3 -0
  10. synplan/chem/data/__init__.py +0 -0
  11. synplan/chem/data/filtering.py +962 -0
  12. synplan/chem/data/standardizing.py +1187 -0
  13. synplan/chem/precursor.py +100 -0
  14. synplan/chem/reaction.py +125 -0
  15. synplan/chem/reaction_routes/__init__.py +0 -0
  16. synplan/chem/reaction_routes/clustering.py +859 -0
  17. synplan/chem/reaction_routes/io.py +286 -0
  18. synplan/chem/reaction_routes/leaving_groups.py +131 -0
  19. synplan/chem/reaction_routes/route_cgr.py +570 -0
  20. synplan/chem/reaction_routes/visualisation.py +903 -0
  21. synplan/chem/reaction_rules/__init__.py +0 -0
  22. synplan/chem/reaction_rules/extraction.py +744 -0
  23. synplan/chem/reaction_rules/manual/__init__.py +6 -0
  24. synplan/chem/reaction_rules/manual/decompositions.py +413 -0
  25. synplan/chem/reaction_rules/manual/transformations.py +532 -0
  26. synplan/chem/utils.py +225 -0
  27. synplan/interfaces/__init__.py +0 -0
  28. synplan/interfaces/cli.py +506 -0
  29. synplan/interfaces/gui.py +1323 -0
  30. synplan/mcts/__init__.py +8 -0
  31. synplan/mcts/evaluation.py +45 -0
  32. synplan/mcts/expansion.py +96 -0
  33. synplan/mcts/node.py +47 -0
  34. synplan/mcts/search.py +199 -0
  35. synplan/mcts/tree.py +635 -0
  36. synplan/ml/__init__.py +0 -0
  37. synplan/ml/networks/__init__.py +0 -0
  38. synplan/ml/networks/modules.py +234 -0
  39. synplan/ml/networks/policy.py +137 -0
  40. synplan/ml/networks/value.py +67 -0
  41. synplan/ml/training/__init__.py +11 -0
  42. synplan/ml/training/preprocessing.py +516 -0
  43. synplan/ml/training/reinforcement.py +379 -0
  44. synplan/ml/training/supervised.py +153 -0
  45. synplan/utils/__init__.py +4 -0
  46. synplan/utils/config.py +543 -0
  47. synplan/utils/files.py +226 -0
  48. synplan/utils/loading.py +151 -0
  49. synplan/utils/logging.py +179 -0
  50. synplan/utils/visualisation.py +1365 -0
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Dockerfile DELETED
@@ -1,21 +0,0 @@
1
- FROM python:3.9-slim
2
-
3
- WORKDIR /app
4
-
5
- RUN apt-get update && apt-get install -y \
6
- build-essential \
7
- curl \
8
- software-properties-common \
9
- git \
10
- && rm -rf /var/lib/apt/lists/*
11
-
12
- COPY requirements.txt ./
13
- COPY src/ ./src/
14
-
15
- RUN pip3 install -r requirements.txt
16
-
17
- EXPOSE 8501
18
-
19
- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
20
-
21
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,20 +1,17 @@
1
  ---
2
- title: Synplanner Dev
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
  pinned: false
11
- short_description: Developers mode for synplanner
12
  license: mit
 
13
  ---
14
 
15
- # Welcome to Streamlit!
 
16
 
17
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
18
-
19
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
20
- forums](https://discuss.streamlit.io).
 
1
  ---
2
+ title: SynPlanner GUI
3
+ emoji: 🧪
4
+ colorFrom: pink
5
+ colorTo: blue
6
+ sdk: streamlit
7
+ sdk_version: 1.37.0
8
+ app_file: app.py
 
9
  pinned: false
 
10
  license: mit
11
+ python_version: 3.11.9
12
  ---
13
 
14
+ # SynPlanner Graphical User Interface (GUI)
15
+ Try the GUI to find reaction paths...
16
 
17
+ **documentation to be done**
 
 
 
app.py ADDED
@@ -0,0 +1,1349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import pickle
3
+ import re
4
+ import uuid
5
+ import io
6
+ import zipfile
7
+
8
+ import pandas as pd
9
+ import streamlit as st
10
+ from CGRtools.files import SMILESRead
11
+ from streamlit_ketcher import st_ketcher
12
+ from huggingface_hub import hf_hub_download
13
+ from huggingface_hub.utils import disable_progress_bars
14
+
15
+
16
+ from synplan.mcts.expansion import PolicyNetworkFunction
17
+ from synplan.mcts.search import extract_tree_stats
18
+ from synplan.mcts.tree import Tree
19
+ from synplan.chem.utils import mol_from_smiles
20
+ from synplan.chem.reaction_routes.route_cgr import *
21
+ from synplan.chem.reaction_routes.clustering import *
22
+
23
+ from synplan.utils.visualisation import (
24
+ routes_clustering_report,
25
+ routes_subclustering_report,
26
+ generate_results_html,
27
+ html_top_routes_cluster,
28
+ get_route_svg,
29
+ get_route_svg_from_json,
30
+ get_route_svg_mod
31
+ )
32
+ from synplan.utils.config import TreeConfig, PolicyNetworkConfig
33
+ from synplan.utils.loading import load_reaction_rules, load_building_blocks
34
+
35
+
36
+ import psutil
37
+ import gc
38
+
39
+
40
+ disable_progress_bars("huggingface_hub")
41
+
42
+ smiles_parser = SMILESRead.create_parser(ignore=True)
43
+ DEFAULT_MOL = "c1cc(ccc1Cl)C(CCO)NC(C2(CCN(CC2)c3c4cc[nH]c4ncn3)N)=O"
44
+
45
+
46
+ # --- Helper Functions ---
47
def download_button(
    object_to_download, download_filename, button_text, pickle_it=False
):
    """Build a styled HTML download link for an arbitrary object.

    Params:
    ------
    object_to_download: The object to be downloaded (str, bytes,
        pandas.DataFrame, or anything picklable when pickle_it=True).
    download_filename (str): filename and extension of file, e.g. mydata.csv,
        some_txt_output.txt.
    button_text (str): Text to display on the download button
        (e.g. 'click here to download file').
    pickle_it (bool): If True, serialize the object with pickle first.

    Returns:
    -------
    (str): an HTML snippet (CSS + anchor tag) to render with st.markdown,
        or None if pickling failed.

    Examples:
    --------
    download_button(your_df, 'YOUR_DF.csv', 'Click to download data!')
    download_button(your_str, 'YOUR_STRING.txt', 'Click to download text!')
    """
    if pickle_it:
        try:
            object_to_download = pickle.dumps(object_to_download)
        except pickle.PicklingError as e:
            st.write(e)
            return None
    else:
        if isinstance(object_to_download, bytes):
            pass  # already raw bytes, embed as-is
        elif isinstance(object_to_download, pd.DataFrame):
            object_to_download = object_to_download.to_csv(index=False).encode("utf-8")

    # Strings need encoding before base64; bytes go straight through (EAFP).
    try:
        b64 = base64.b64encode(object_to_download.encode()).decode()
    except AttributeError:
        b64 = base64.b64encode(object_to_download).decode()

    # Letters-only element id so the CSS selector below is always valid.
    # NOTE: raw string r"\d+" — the previous "\d+" was an invalid escape
    # sequence (SyntaxWarning on Python 3.12+).
    button_uuid = str(uuid.uuid4()).replace("-", "")
    button_id = re.sub(r"\d+", "", button_uuid)

    custom_css = f"""
        <style>
            #{button_id} {{
                background-color: rgb(255, 255, 255);
                color: rgb(38, 39, 48);
                text-decoration: none;
                border-radius: 4px;
                border-width: 1px;
                border-style: solid;
                border-color: rgb(230, 234, 241);
                border-image: initial;
            }}
            #{button_id}:hover {{
                border-color: rgb(246, 51, 102);
                color: rgb(246, 51, 102);
            }}
            #{button_id}:active {{
                box-shadow: none;
                background-color: rgb(246, 51, 102);
                color: white;
            }}
        </style> """

    dl_link = (
        custom_css
        + f'<a download="{download_filename}" id="{button_id}" href="data:file/txt;base64,{b64}">{button_text}</a><br></br>'
    )
    return dl_link
119
+
120
+
121
@st.cache_resource
def load_planning_resources_cached():  # Renamed to avoid conflict if main calls it directly
    """Download and cache the SynPlanner planning resources from the HF Hub.

    Returns a 3-tuple of local paths: building blocks (.smi), ranking policy
    network checkpoint (.ckpt), and reaction rules (.pickle). Cached by
    Streamlit so the downloads run at most once per session/server.
    """
    repo = "Laboratoire-De-Chemoinformatique/SynPlanner"
    # (filename, subfolder) pairs, in the order the caller unpacks them.
    specs = (
        ("building_blocks_em_sa_ln.smi", "building_blocks"),
        ("ranking_policy_network.ckpt", "uspto/weights"),
        ("uspto_reaction_rules.pickle", "uspto"),
    )
    return tuple(
        hf_hub_download(
            repo_id=repo,
            filename=fname,
            subfolder=subfolder,
            local_dir=".",
        )
        for fname, subfolder in specs
    )
142
+
143
+
144
+ # --- GUI Sections ---
145
+
146
+
147
def initialize_app():
    """1. Initialization: Setting up the main window, layout, and initial widgets."""
    st.set_page_config(page_title="SynPlanner GUI", page_icon="🧪", layout="wide")

    # Session-state defaults, applied only for keys that do not exist yet so
    # reruns never clobber in-progress state.
    session_defaults = {
        # Planning state
        "planning_done": False,
        "tree": None,
        "res": None,
        "target_smiles": "",  # initial value, might be overwritten by ketcher
        # Clustering state
        "clustering_done": False,
        "clusters": None,
        "reactions_dict": None,
        "num_clusters_setting": 10,  # store the setting used
        "route_cgrs_dict": None,
        "sb_cgrs_dict": None,
        "route_json": None,
        # Subclustering state
        "subclustering_done": False,
        "subclusters": None,
        # Download state (less critical now with direct download links)
        "clusters_downloaded": False,
        # Ketcher persistence
        "ketcher": DEFAULT_MOL,
    }
    for key, default in session_defaults.items():
        if key not in st.session_state:
            st.session_state[key] = default

    intro_text = """
    This is a demo of the graphical user interface of
    [SynPlanner](https://github.com/Laboratoire-de-Chemoinformatique/SynPlanner/).
    SynPlanner is a comprehensive tool for reaction data curation, rule extraction, model training and retrosynthetic planning.

    More information on SynPlanner is available in the [official docs](https://synplanner.readthedocs.io/en/latest/index.html).
    """
    st.title("`SynPlanner GUI`")
    st.write(intro_text)
201
+
202
+
203
def setup_sidebar():
    """2. Sidebar: Handling the widgets and logic within the sidebar area."""
    # st.sidebar.image("img/logo.png") # Assuming img/logo.png is available

    # (title, markdown body) pairs rendered in order.
    sidebar_sections = (
        ("Docs", "https://synplanner.readthedocs.io/en/latest/"),
        (
            "Tutorials",
            "https://github.com/Laboratoire-de-Chemoinformatique/SynPlanner/tree/main/tutorials",
        ),
        (
            "Paper",
            "https://chemrxiv.org/engage/chemrxiv/article-details/66add90bc9c6a5c07ae65796",
        ),
        (
            "Issues",
            "[Report a bug 🐞](https://github.com/Laboratoire-de-Chemoinformatique/SynPlanner/issues/new?assignees=&labels=bug&projects=&template=bug_report.md&title=%5BBUG%5D)",
        ),
    )
    for title, body in sidebar_sections:
        st.sidebar.title(title)
        st.sidebar.markdown(body)
223
+
224
+
225
def handle_molecule_input():
    """3. Molecule Input: Managing the input area for molecule data with two-way synchronization.

    Keeps the SMILES text box and the ketcher drawing widget in sync via
    ``st.session_state.shared_smiles``; returns the SMILES string currently
    selected for planning.
    """
    st.header("Molecule input")
    st.markdown(
        """
    You can provide a molecular structure by either providing:
    * SMILES string + Enter
    * Draw it + Apply
    """
    )

    # Single source of truth for the current structure, seeded from the
    # ketcher state (or the default molecule) on first run.
    if "shared_smiles" not in st.session_state:
        st.session_state.shared_smiles = st.session_state.get("ketcher", DEFAULT_MOL)

    # Bumping this counter changes the ketcher widget key below, which forces
    # Streamlit to re-create (re-render) the drawing when the text changes.
    if "ketcher_render_count" not in st.session_state:
        st.session_state.ketcher_render_count = 0

    def text_input_changed_callback():
        # Text box -> shared state -> force ketcher re-render.
        new_text_value = (
            st.session_state.smiles_text_input_key_for_sync
        )  # Key of the text_input
        if new_text_value != st.session_state.shared_smiles:
            st.session_state.shared_smiles = new_text_value
            st.session_state.ketcher = new_text_value
            st.session_state.ketcher_render_count += 1

    # SMILES Text Input
    st.text_input(
        "SMILES:",
        value=st.session_state.shared_smiles,
        key="smiles_text_input_key_for_sync",  # Unique key for this widget
        on_change=text_input_changed_callback,
        help="Enter SMILES string and press Enter. The drawing will update, and vice-versa.",
    )

    # Fresh key per render generation (see counter above) so edits typed in
    # the text box propagate into the drawing widget.
    ketcher_key = f"ketcher_widget_for_sync_{st.session_state.ketcher_render_count}"
    smile_code_output_from_ketcher = st_ketcher(
        st.session_state.shared_smiles, key=ketcher_key
    )

    # Drawing -> shared state; rerun so the text box picks up the new value.
    if smile_code_output_from_ketcher != st.session_state.shared_smiles:
        st.session_state.shared_smiles = smile_code_output_from_ketcher
        st.session_state.ketcher = smile_code_output_from_ketcher
        st.rerun()

    current_smiles_for_planning = st.session_state.shared_smiles

    # Warn when the structure differs from the one used in the last
    # successful planning run, so stale results are not misread.
    last_planned_smiles = st.session_state.get("target_smiles")
    if (
        last_planned_smiles
        and current_smiles_for_planning != last_planned_smiles
        and st.session_state.get("planning_done", False)
    ):
        st.warning(
            "Molecule structure has changed since the last successful planning run. "
            "Results shown below (if any) are for the previous molecule. "
            "Please re-run planning for the current structure."
        )

    # Ensure st.session_state.ketcher is consistent for other parts of the app
    if st.session_state.get("ketcher") != current_smiles_for_planning:
        st.session_state.ketcher = current_smiles_for_planning

    return current_smiles_for_planning
289
+
290
+
291
def setup_planning_options():
    """4. Planning: Encapsulating the logic related to the "planning" functionality.

    Renders the MCTS option widgets and, when the launch button is pressed,
    resets downstream state, loads resources, runs the tree search and stores
    the tree/statistics in ``st.session_state``.
    """
    st.header("Launch calculation")
    st.markdown(
        """If you modified the structure, please ensure you clicked on `Apply` (bottom right of the molecular editor)."""
    )

    st.markdown(
        f"The molecule SMILES is actually: ``{st.session_state.get('ketcher', DEFAULT_MOL)}``"
    )

    st.subheader("Planning options")
    st.markdown(
        """
    The description of each option can be found in the
    [Retrosynthetic Planning Tutorial](https://synplanner.readthedocs.io/en/latest/tutorial_files/retrosynthetic_planning.html#Configuring-search-tree).
    """
    )

    # Two columns of search-configuration widgets.
    col_options_1, col_options_2 = st.columns(2, gap="medium")
    with col_options_1:
        search_strategy_input = st.selectbox(
            label="Search strategy",
            options=(
                "Expansion first",
                "Evaluation first",
            ),
            index=0,
            key="search_strategy_input",
        )
        ucb_type = st.selectbox(
            label="UCB type",
            options=("uct", "puct", "value"),
            index=0,
            key="ucb_type_input",
        )
        c_ucb = st.number_input(
            "C coefficient of UCB",
            value=0.1,
            placeholder="Type a number...",
            key="c_ucb_input",
        )

    with col_options_2:
        max_iterations = st.slider(
            "Total number of MCTS iterations",
            min_value=50,
            max_value=3000,
            value=1000,
            key="max_iterations_slider",
        )
        max_depth = st.slider(
            "Maximal number of reaction steps",
            min_value=3,
            max_value=9,
            value=6,
            key="max_depth_slider",
        )
        min_mol_size = st.slider(
            "Minimum size of a molecule to be precursor",
            min_value=0,
            max_value=7,
            value=0,
            key="min_mol_size_slider",
            help="Number of non-hydrogen atoms in molecule",
        )

    # Map display labels to the identifiers expected by TreeConfig.
    search_strategy_translator = {
        "Expansion first": "expansion_first",
        "Evaluation first": "evaluation_first",
    }
    search_strategy = search_strategy_translator[search_strategy_input]

    planning_params = {
        "search_strategy": search_strategy,
        "ucb_type": ucb_type,
        "c_ucb": c_ucb,
        "max_iterations": max_iterations,
        "max_depth": max_depth,
        "min_mol_size": min_mol_size,
    }

    if st.button("Start retrosynthetic planning", key="submit_planning_button"):
        # Reset downstream states if replanning
        st.session_state.planning_done = False
        st.session_state.clustering_done = False
        st.session_state.subclustering_done = False
        st.session_state.tree = None
        st.session_state.res = None
        st.session_state.clusters = None
        st.session_state.reactions_dict = None
        st.session_state.subclusters = None
        st.session_state.route_cgrs_dict = None
        st.session_state.sb_cgrs_dict = None
        st.session_state.route_json = None
        active_smile_code = st.session_state.get(
            "ketcher", DEFAULT_MOL
        )  # Get current SMILES
        st.session_state.target_smiles = (
            active_smile_code  # Store the SMILES used for this run
        )

        try:
            target_molecule = mol_from_smiles(active_smile_code, clean_stereo=True)
            if target_molecule is None:
                raise ValueError(f"Could not parse the input SMILES: {active_smile_code}")

            (
                building_blocks_path,
                ranking_policy_weights_path,
                reaction_rules_path,
            ) = load_planning_resources_cached()
            with st.spinner("Running retrosynthetic planning..."):
                with st.status("Loading resources...", expanded=False) as status:
                    st.write("Loading building blocks...")
                    building_blocks = load_building_blocks(
                        building_blocks_path, standardize=False
                    )
                    st.write("Loading reaction rules...")
                    reaction_rules = load_reaction_rules(reaction_rules_path)
                    st.write("Loading policy network...")
                    policy_config = PolicyNetworkConfig(
                        weights_path=ranking_policy_weights_path
                    )
                    policy_function = PolicyNetworkFunction(
                        policy_config=policy_config
                    )
                    status.update(label="Resources loaded!", state="complete")

                tree_config = TreeConfig(
                    search_strategy=planning_params["search_strategy"],
                    evaluation_type="rollout",
                    max_iterations=planning_params["max_iterations"],
                    max_depth=planning_params["max_depth"],
                    min_mol_size=planning_params["min_mol_size"],
                    init_node_value=0.5,
                    ucb_type=planning_params["ucb_type"],
                    c_ucb=planning_params["c_ucb"],
                    silent=True,
                )

                tree = Tree(
                    target=target_molecule,
                    config=tree_config,
                    reaction_rules=reaction_rules,
                    building_blocks=building_blocks,
                    expansion_function=policy_function,
                    evaluation_function=None,
                )

                # Iterating the tree runs the MCTS search one step at a time;
                # the yielded (solved, route_id) pair is only used to drive
                # the progress bar here.
                mcts_progress_text = "Running MCTS iterations..."
                mcts_bar = st.progress(0, text=mcts_progress_text)
                for step, (solved, route_id) in enumerate(tree):
                    progress_value = min(
                        1.0, (step + 1) / planning_params["max_iterations"]
                    )
                    mcts_bar.progress(
                        progress_value,
                        text=f"{mcts_progress_text} ({step+1}/{planning_params['max_iterations']})",
                    )

                res = extract_tree_stats(tree, target_molecule)

                st.session_state["tree"] = tree
                st.session_state["res"] = res
                st.session_state.planning_done = True
                st.rerun()

        except (ValueError, KeyError, FileNotFoundError, TypeError) as e:
            st.error(f"An error occurred during planning: {e}")
            st.session_state.planning_done = False
462
+
463
+
464
def display_planning_results():
    """5. Planning Results Display: render routes found by the planner.

    Reads ``planning_done``, ``res`` and ``tree`` from ``st.session_state``.
    When the target was solved, up to 3 example routes are drawn; otherwise a
    sample of unfinished pathways is shown for diagnostics.
    """
    if not st.session_state.get("planning_done", False):
        return

    res = st.session_state.res
    tree = st.session_state.tree

    if res is None or tree is None:
        st.error(
            "Planning results are missing from session state. Please re-run planning."
        )
        st.session_state.planning_done = False  # Reset state
        return  # Exit this function if no results

    st.header("Planning Results")
    if res.get("solved", False):  # Use .get for safety
        # Deduplicate and order winning nodes; guard against a missing/empty
        # attribute on the tree object.
        winning_nodes = (
            sorted(set(tree.winning_nodes))
            if hasattr(tree, "winning_nodes") and tree.winning_nodes
            else []
        )
        st.subheader(f"Number of unique routes found: {len(winning_nodes)}")

        st.subheader("Examples of found retrosynthetic routes")
        if not winning_nodes:
            st.warning(
                "Planning solved, but no winning nodes found in the tree object."
            )
            return

        image_counter = 0
        for route_id in winning_nodes:
            if image_counter >= 3:  # show at most 3 example routes
                break
            try:
                num_steps = len(tree.synthesis_route(route_id))
                route_score = round(tree.route_score(route_id), 3)
                svg = get_route_svg(tree, route_id)
                if svg:
                    st.image(
                        svg,
                        caption=f"Route {route_id}; {num_steps} steps; Route score: {route_score}",
                    )
                    image_counter += 1
                else:
                    st.warning(
                        f"Could not generate SVG for route {route_id}."
                    )
            except Exception as e:
                st.error(f"Error displaying route {route_id}: {e}")
    else:  # Not solved
        st.warning(
            "No reaction path found for the target molecule with the current settings."
        )
        st.write(
            "Find below the unfinished pathways"
        )
        image_counter = 0
        # Sample every 50th node (skipping the root) to illustrate partial
        # routes; cap the output at 20 rendered images.
        for route_id in list(tree.nodes.keys())[1:tree.config.max_iterations:50]:
            svg = get_route_svg_mod(tree, route_id)
            if svg:
                st.image(
                    svg,
                    caption=f"Route {route_id};",
                )
                image_counter += 1
                reactions = tree.synthesis_route(route_id)
                for reaction in reactions:
                    st.write(reaction)
            else:
                st.warning(
                    f"Could not generate SVG for route {route_id}."
                )
            if image_counter >= 20:
                break
577
+
578
+
579
def download_planning_results():
    """6. Planning Results Download: offer the HTML report and CSV stats.

    Renders nothing unless a successful (solved) planning run is stored in
    session state. Ideally this would live inside the results column of
    display_planning_results; for now the links are emitted in-place.
    """
    # Guard clause: only offer downloads after a solved planning run.
    if not (
        st.session_state.get("planning_done", False)
        and st.session_state.res
        and st.session_state.res.get("solved", False)
    ):
        return

    res = st.session_state.res
    tree = st.session_state.tree

    try:
        # Full HTML report of the search results.
        html_body = generate_results_html(tree, html_path=None, extended=True)
        html_link = download_button(
            html_body,
            f"results_synplanner_{st.session_state.target_smiles}.html",
            "Download results (HTML)",
        )
        if html_link:
            st.markdown(html_link, unsafe_allow_html=True)

        try:
            # One-row statistics table for this run.
            stats_df = pd.DataFrame(res, index=[0])
            csv_link = download_button(
                stats_df,
                f"stats_synplanner_{st.session_state.target_smiles}.csv",
                "Download statistics (CSV)",
            )
            if csv_link:
                st.markdown(csv_link, unsafe_allow_html=True)
        except Exception as e:
            st.error(f"Could not prepare statistics CSV for download: {e}")

    except Exception as e:
        st.error(f"Error generating download links for planning results: {e}")
621
+
622
+
623
def setup_clustering():
    """7. Clustering: Encapsulating the logic related to the "clustering" functionality.

    Shown only after a successful planning run; on button press it computes
    RouteCGRs / SB-CGRs, clusters the found routes, and stores everything in
    ``st.session_state`` before rerunning the app.
    """
    if (
        st.session_state.get("planning_done", False)
        and st.session_state.res
        and st.session_state.res.get("solved", False)
    ):
        st.divider()
        st.header("Clustering the retrosynthetic routes")

        if st.button("Run Clustering", key="submit_clustering_button"):
            # st.session_state.num_clusters_setting = num_clusters_input
            # Reset all clustering/subclustering state before recomputing.
            st.session_state.clustering_done = False
            st.session_state.subclustering_done = False
            st.session_state.clusters = None
            st.session_state.reactions_dict = None
            st.session_state.subclusters = None
            st.session_state.route_cgrs_dict = None
            st.session_state.sb_cgrs_dict = None
            st.session_state.route_json = None

            with st.spinner("Performing clustering..."):
                try:
                    current_tree = st.session_state.tree
                    if not current_tree:
                        st.error("Tree object not found. Please re-run planning.")
                        return

                    st.write("Calculating RoutesCGRs...")
                    route_cgrs_dict = compose_all_route_cgrs(current_tree)
                    st.write("Processing SB-CGRs...")
                    sb_cgrs_dict = compose_all_sb_cgrs(route_cgrs_dict)

                    results = cluster_routes(
                        sb_cgrs_dict, use_strat=False
                    )  # num_clusters was removed from args
                    # Keys appear to be numeric strings; sort clusters numerically.
                    results = dict(sorted(results.items(), key=lambda x: float(x[0])))

                    st.session_state.clusters = results
                    st.session_state.route_cgrs_dict = route_cgrs_dict
                    st.session_state.sb_cgrs_dict = sb_cgrs_dict
                    st.write("Extracting reactions...")
                    st.session_state.reactions_dict = extract_reactions(current_tree)
                    st.session_state.route_json = make_json(st.session_state.reactions_dict)

                    if (
                        st.session_state.clusters is not None
                        and st.session_state.reactions_dict is not None
                    ):  # Check for None explicitly
                        st.session_state.clustering_done = True
                        st.success(
                            f"Clustering complete. Found {len(st.session_state.clusters)} clusters."
                        )
                    else:
                        st.error("Clustering failed or returned empty results.")
                        st.session_state.clustering_done = False

                    del results  # route_cgrs_dict, sb_cgrs_dict are stored
                    gc.collect()
                    # NOTE: st.rerun() raises internally to restart the script;
                    # Streamlit's control-flow exception is not swallowed by the
                    # except clause below (it only catches on real failures).
                    st.rerun()
                except Exception as e:
                    st.error(f"An error occurred during clustering: {e}")
                    st.session_state.clustering_done = False
686
+
687
+
688
def _render_cluster_route(tree, cluster_num, group_data):
    """Render one cluster's best route: SB-CGR thumbnail plus route SVG.

    Shared by the direct list and the "... more clusters" expander in
    display_clustering_results (previously two verbatim copies).
    """
    if (
        not group_data
        or "route_ids" not in group_data
        or not group_data["route_ids"]
    ):
        st.warning(f"Cluster {cluster_num} has no data or route_ids.")
        return
    st.markdown(
        f"**Cluster {cluster_num}** (Size: {group_data.get('group_size', 'N/A')})"
    )
    # The first route id is the cluster representative ("best" route).
    route_id = group_data["route_ids"][0]
    try:
        num_steps = len(tree.synthesis_route(route_id))
        route_score = round(tree.route_score(route_id), 3)
        svg = get_route_svg_from_json(st.session_state.route_json, route_id)
        sb_cgr = group_data.get("sb_cgr")  # Safely get sb_cgr
        sb_cgr_svg = None
        if sb_cgr:
            sb_cgr.clean2d()
            sb_cgr_svg = cgr_display(sb_cgr)

        if svg and sb_cgr_svg:
            col1, col2 = st.columns([0.2, 0.8])
            with col1:
                st.image(sb_cgr_svg, caption="SB-CGR")
            with col2:
                st.image(
                    svg,
                    caption=f"Route {route_id}; {num_steps} steps; Route score: {route_score}",
                )
        elif svg:  # Only route SVG available
            st.image(
                svg,
                caption=f"Route {route_id}; {num_steps} steps; Route score: {route_score}",
            )
            st.warning(f"SB-CGR could not be displayed for cluster {cluster_num}.")
        else:
            st.warning(f"Could not generate SVG for route {route_id} or its SB-CGR.")
    except Exception as e:
        st.error(f"Error displaying route {route_id} for cluster {cluster_num}: {e}")


def display_clustering_results():
    """8. Clustering Results Display: Handling the presentation of results.

    Shows the representative route of each cluster; the first
    MAX_DISPLAY_CLUSTERS_DATA clusters inline, the rest inside an expander.
    """
    if not st.session_state.get("clustering_done", False):
        return

    clusters = st.session_state.clusters
    tree = st.session_state.tree
    MAX_DISPLAY_CLUSTERS_DATA = 10

    if clusters is None or tree is None:
        st.error(
            "Clustering results (clusters or tree) are missing. Please re-run clustering."
        )
        st.session_state.clustering_done = False
        return

    st.subheader(f"Best routes from {len(clusters)} Found Clusters")
    clusters_items = list(clusters.items())
    first_items = clusters_items[:MAX_DISPLAY_CLUSTERS_DATA]
    remaining_items = clusters_items[MAX_DISPLAY_CLUSTERS_DATA:]

    for cluster_num, group_data in first_items:
        _render_cluster_route(tree, cluster_num, group_data)

    if remaining_items:
        with st.expander(f"... and {len(remaining_items)} more clusters"):
            for cluster_num, group_data in remaining_items:
                _render_cluster_route(tree, cluster_num, group_data)
813
def _cluster_report_download(tree, clusters, sb_cgrs, cluster_key, widget_key):
    """Generate one cluster's HTML report and render its download button.

    Shared by the direct list and the expander in download_clustering_results
    (previously two verbatim copies).
    """
    try:
        html_content = routes_clustering_report(
            tree,
            clusters,  # Pass the whole dict
            str(cluster_key),  # Pass the key of the cluster
            sb_cgrs,  # Pass the sb_cgrs dict
            aam=False,
        )
        st.download_button(
            label=f"Download report for cluster {cluster_key}",
            data=html_content,
            file_name=f"cluster_{cluster_key}_{st.session_state.target_smiles}.html",
            mime="text/html",
            key=widget_key,
        )
    except Exception as e:
        st.error(f"Error generating report for cluster {cluster_key}: {e}")


def download_clustering_results():
    """10. Clustering Results Download: Providing functionality to download.

    Renders one download button per cluster (first 10 inline, remainder in an
    expander) plus a ZIP archive of all reports.
    """
    if not st.session_state.get("clustering_done", False):
        return

    tree_for_html = st.session_state.get("tree")
    clusters_for_html = st.session_state.get("clusters")
    # sb_cgrs_dict was used instead of reactions_dict in the original report call.
    sb_cgrs_for_html = st.session_state.get("sb_cgrs_dict")

    if not tree_for_html:
        st.warning("MCTS Tree data not found. Cannot generate cluster reports.")
        return
    if not clusters_for_html:
        st.warning("Cluster data not found. Cannot generate cluster reports.")
        return
    # sb_cgrs_for_html is optional for routes_clustering_report if not essential

    st.subheader("Cluster Reports")
    st.write("Generate downloadable HTML reports for each cluster:")

    MAX_DOWNLOAD_LINKS_DISPLAYED = 10
    clusters_items = list(clusters_for_html.items())

    for cluster_idx, _ in clusters_items[:MAX_DOWNLOAD_LINKS_DISPLAYED]:
        _cluster_report_download(
            tree_for_html,
            clusters_for_html,
            sb_cgrs_for_html,
            cluster_idx,
            f"download_cluster_{cluster_idx}",
        )

    remaining_items = clusters_items[MAX_DOWNLOAD_LINKS_DISPLAYED:]
    if remaining_items:
        with st.expander(f"Show remaining {len(remaining_items)} cluster reports"):
            for group_index, _ in remaining_items:
                _cluster_report_download(
                    tree_for_html,
                    clusters_for_html,
                    sb_cgrs_for_html,
                    group_index,
                    f"download_cluster_expanded_{group_index}",
                )

    # Single ZIP with every cluster report, built in memory.
    try:
        buffer = io.BytesIO()
        with zipfile.ZipFile(
            buffer, mode="w", compression=zipfile.ZIP_DEFLATED
        ) as zf:
            for idx, _ in clusters_items:
                html_content_zip = routes_clustering_report(
                    tree_for_html,
                    clusters_for_html,
                    str(idx),
                    sb_cgrs_for_html,
                    aam=False,
                )
                filename = f"cluster_{idx}_{st.session_state.target_smiles}.html"
                zf.writestr(filename, html_content_zip)
        buffer.seek(0)

        st.download_button(
            label="📦 Download all cluster reports as ZIP",
            data=buffer,
            file_name=f"all_cluster_reports_{st.session_state.target_smiles}.zip",
            mime="application/zip",
            key="download_all_clusters_zip",
        )
    except Exception as e:
        st.error(f"Error generating ZIP file for cluster reports: {e}")
917
def setup_subclustering():
    """11. Subclustering: Encapsulating the logic related to the "subclustering" functionality.

    Offers a "Run Subclustering Analysis" button once clustering has finished
    and stores the resulting subgroups in ``st.session_state.subclusters``.
    """
    if st.session_state.get(
        "clustering_done", False
    ):  # Subclustering depends on clustering being done
        st.divider()
        st.header("Sub-Clustering within a selected Cluster")

        if st.button("Run Subclustering Analysis", key="submit_subclustering_button"):
            st.session_state.subclustering_done = False
            st.session_state.subclusters = None
            rerun_needed = False
            with st.spinner("Performing subclustering analysis..."):
                try:
                    clusters_for_sub = st.session_state.get("clusters")
                    sb_cgrs_dict_for_sub = st.session_state.get("sb_cgrs_dict")
                    route_cgrs_dict_for_sub = st.session_state.get("route_cgrs_dict")

                    if (
                        clusters_for_sub
                        and sb_cgrs_dict_for_sub
                        and route_cgrs_dict_for_sub
                    ):  # Ensure all are present
                        all_subgroups = subcluster_all_clusters(
                            clusters_for_sub,
                            sb_cgrs_dict_for_sub,
                            route_cgrs_dict_for_sub,
                        )
                        st.session_state.subclusters = all_subgroups
                        st.session_state.subclustering_done = True
                        st.success("Subclustering analysis complete.")
                        gc.collect()
                        # BUG FIX: st.rerun() raises a control-flow exception
                        # that the broad `except Exception` below used to
                        # swallow, turning every successful run into an error
                        # message.  Defer the rerun until outside the handler.
                        rerun_needed = True
                    else:
                        missing = []
                        if not clusters_for_sub:
                            missing.append("clusters")
                        if not sb_cgrs_dict_for_sub:
                            missing.append("SB-CGRs dictionary")
                        if not route_cgrs_dict_for_sub:
                            missing.append("RouteCGRs dictionary")
                        st.error(
                            f"Cannot run subclustering. Missing data: {', '.join(missing)}. Please ensure clustering ran successfully."
                        )
                        st.session_state.subclustering_done = False

                except Exception as e:
                    st.error(f"An error occurred during subclustering: {e}")
                    st.session_state.subclustering_done = False
            if rerun_needed:
                st.rerun()
967
def _render_subcluster_route(tree, route_id):
    """Render a single route inside the subcluster details panel.

    Shared by the direct list and the "... more routes" expander in
    display_subclustering_results (previously two verbatim copies).
    """
    try:
        route_score_sub = round(tree.route_score(route_id), 3)
        svg_sub = get_route_svg_from_json(st.session_state.route_json, route_id)
        if svg_sub:
            st.image(
                svg_sub,
                caption=f"Route {route_id}; Score: {route_score_sub}",
            )
        else:
            st.warning(f"Could not generate SVG for route {route_id}.")
    except Exception as e:
        st.error(f"Error displaying route {route_id} in subcluster: {e}")


def display_subclustering_results():
    """12. Subclustering Results Display: Handling the presentation of results.

    Left column: cluster / subcluster selectors plus the parent SB-CGR.
    Right column: the subcluster's Markush-like pseudo reaction and its routes
    (first 5 inline, the rest in an expander).
    """
    if not st.session_state.get("subclustering_done", False):
        return

    sub = st.session_state.get("subclusters")
    tree = st.session_state.get("tree")

    if not sub or not tree:
        st.error(
            "Subclustering results (subclusters or tree) are missing. Please re-run subclustering."
        )
        st.session_state.subclustering_done = False
        return

    sub_input_col, sub_display_col = st.columns([0.25, 0.75])

    with sub_input_col:
        st.subheader("Select Cluster and Subcluster")
        available_cluster_nums = list(sub.keys())
        if not available_cluster_nums:
            st.warning("No clusters available in subclustering results.")
            return  # Exit if no clusters to select

        user_input_cluster_num_display = st.selectbox(
            "Select Cluster #:",
            options=sorted(available_cluster_nums),
            key="subcluster_num_select_key",
        )

        # Default index used when the selected cluster has no subclusters.
        selected_subcluster_idx = 0

        if user_input_cluster_num_display in sub:
            sub_step_cluster = sub[user_input_cluster_num_display]
            allowed_subclusters_indices = sorted(list(sub_step_cluster.keys()))

            if not allowed_subclusters_indices:
                st.warning(
                    f"No reaction steps (subclusters) found for Cluster {user_input_cluster_num_display}."
                )
            else:
                selected_subcluster_idx = st.selectbox(
                    "Select Subcluster Index:",
                    options=allowed_subclusters_indices,
                    key="subcluster_index_select_key",
                )
                if selected_subcluster_idx in sub[user_input_cluster_num_display]:
                    current_subcluster_data = sub[user_input_cluster_num_display][
                        selected_subcluster_idx
                    ]
                    if "sb_cgr" in current_subcluster_data:
                        cluster_sb_cgr_display = current_subcluster_data["sb_cgr"]
                        cluster_sb_cgr_display.clean2d()
                        st.image(
                            cluster_sb_cgr_display.depict(),
                            caption=f"SB-CGR of parent Cluster {user_input_cluster_num_display}",
                        )
                    else:
                        st.warning("SB-CGR for this subcluster not found.")
        else:
            st.warning(
                f"Selected cluster {user_input_cluster_num_display} not found in subclustering results."
            )
            return

    with sub_display_col:
        st.subheader("Subcluster Details")
        if (
            user_input_cluster_num_display in sub
            and selected_subcluster_idx in sub[user_input_cluster_num_display]
        ):
            subcluster_content = sub[user_input_cluster_num_display][
                selected_subcluster_idx
            ]

            # subcluster_to_display = post_process_subgroup(subcluster_content)  # Under development
            subcluster_to_display = subcluster_content
            if (
                not subcluster_to_display
                or "routes_data" not in subcluster_to_display
                or not subcluster_to_display["routes_data"]
            ):
                st.info("No routes or data found for this subcluster selection.")
            else:
                MAX_ROUTES_PER_SUBCLUSTER = 5
                all_route_ids_in_subcluster = list(
                    subcluster_to_display["routes_data"].keys()
                )
                routes_to_display_direct = all_route_ids_in_subcluster[
                    :MAX_ROUTES_PER_SUBCLUSTER
                ]
                remaining_routes_sub = all_route_ids_in_subcluster[
                    MAX_ROUTES_PER_SUBCLUSTER:
                ]

                st.markdown(
                    f"--- \n**Subcluster {user_input_cluster_num_display}.{selected_subcluster_idx}** (Size: {len(all_route_ids_in_subcluster)})"
                )

                if "synthon_reaction" in subcluster_to_display:
                    synthon_reaction = subcluster_to_display["synthon_reaction"]
                    try:
                        synthon_reaction.clean2d()
                        st.image(
                            depict_custom_reaction(synthon_reaction),
                            caption=f"Markush-like pseudo reaction of subcluster",
                        )
                    except Exception as e_depict:
                        st.warning(f"Could not depict synthon reaction: {e_depict}")
                else:
                    st.info("No synthon reaction data for this subcluster.")

                with st.container(height=500):
                    for route_id in routes_to_display_direct:
                        _render_subcluster_route(tree, route_id)

                    if remaining_routes_sub:
                        with st.expander(
                            f"... and {len(remaining_routes_sub)} more routes in this subcluster"
                        ):
                            for route_id in remaining_routes_sub:
                                _render_subcluster_route(tree, route_id)
        else:
            st.info("Select a valid cluster and subcluster index to see details.")
1126
def download_subclustering_results():
    """13. Subclustering Results Download: Providing functionality to download.

    Builds an HTML report for the cluster/subcluster currently selected in
    display_subclustering_results (read back via the selectbox widget keys)
    and offers it as a download button.
    """
    if (
        st.session_state.get("subclustering_done", False)
        and "subcluster_num_select_key" in st.session_state
        and "subcluster_index_select_key" in st.session_state
    ):

        sub = st.session_state.get("subclusters")
        tree = st.session_state.get("tree")
        sb_cgrs_for_report = st.session_state.get(
            "sb_cgrs_dict"
        )  # Used by routes_subclustering_report

        # Current selections, written by the selectboxes in the display step.
        user_input_cluster_num_display = st.session_state.subcluster_num_select_key
        selected_subcluster_idx = st.session_state.subcluster_index_select_key

        if not tree or not sub or not sb_cgrs_for_report:
            st.warning(
                "Missing data for subclustering report generation (tree, subclusters, or SB-CGRs)."
            )
            return

        if (
            user_input_cluster_num_display in sub
            and selected_subcluster_idx in sub[user_input_cluster_num_display]
        ):

            subcluster_data_for_report = sub[user_input_cluster_num_display][
                selected_subcluster_idx
            ]
            # Apply the same post-processing as in display
            # NOTE(review): display_subclustering_results currently shows the
            # *raw* subcluster (its post_process_subgroup call is commented
            # out), so the downloaded report may not match what is on screen —
            # confirm this asymmetry is intended.
            processed_subcluster_data = post_process_subgroup(
                subcluster_data_for_report
            )
            # Attach leaving-group groupings only when routes_data has the
            # expected dict shape; otherwise fall back to an empty mapping.
            if "routes_data" in subcluster_data_for_report and isinstance(
                subcluster_data_for_report["routes_data"], dict
            ):
                processed_subcluster_data["group_lgs"] = group_by_identical_values(
                    subcluster_data_for_report["routes_data"]
                )
            else:
                processed_subcluster_data["group_lgs"] = {}

            try:
                subcluster_html_content = routes_subclustering_report(
                    tree,
                    processed_subcluster_data,  # Pass the specific post-processed subcluster data
                    user_input_cluster_num_display,
                    selected_subcluster_idx,
                    sb_cgrs_for_report,  # Pass the whole sb_cgrs dict
                    if_lg_group=True,  # This parameter was in the original call
                )
                st.download_button(
                    label=f"Download report for subcluster {user_input_cluster_num_display}.{selected_subcluster_idx}",
                    data=subcluster_html_content,
                    file_name=f"subcluster_{user_input_cluster_num_display}.{selected_subcluster_idx}_{st.session_state.target_smiles}.html",
                    mime="text/html",
                    key=f"download_subcluster_{user_input_cluster_num_display}_{selected_subcluster_idx}",
                )
            except Exception as e:
                st.error(
                    f"Error generating download report for subcluster {user_input_cluster_num_display}.{selected_subcluster_idx}: {e}"
                )
        # else:
        # This case is handled by the display logic mostly, download button just won't appear or will be for previous valid selection.
1194
def implement_restart():
    """14. Restart: Implementing the logic to reset or restart the application state."""
    st.divider()
    st.header("Restart Application State")
    if not st.button("Clear All Results & Restart", key="restart_button"):
        return

    # Every piece of derived state produced by planning / clustering /
    # subclustering, plus the widget keys remembering user selections.
    stale_keys = (
        "planning_done",
        "tree",
        "res",
        "target_smiles",
        "clustering_done",
        "clusters",
        "reactions_dict",
        "num_clusters_setting",
        "route_cgrs_dict",
        "sb_cgrs_dict",
        "route_json",
        "subclustering_done",
        "subclusters",  # "sub" was renamed
        "clusters_downloaded",
        # Potentially ketcher related keys if they need manual reset beyond new input
        "ketcher_widget",
        "smiles_text_input_key",  # Keys for widgets
        "subcluster_num_select_key",
        "subcluster_index_select_key",
    )
    for stale_key in stale_keys:
        # pop() with a default is a no-op for keys that are not present.
        st.session_state.pop(stale_key, None)

    # Reset ketcher input to default by resetting its session state variable,
    # and explicitly blank target_smiles to avoid stale data.
    st.session_state.ketcher = DEFAULT_MOL
    st.session_state.target_smiles = ""

    # It's generally better to let Streamlit manage widget state if possible,
    # but for a full reset, clearing their explicit session state keys is needed.
    st.rerun()
1233
+
1234
+ # --- Main Application Flow ---
1235
def main():
    """Top-level Streamlit page flow.

    Order matters: molecule input -> planning -> planning stats/downloads ->
    clustering -> cluster stats/downloads -> subclustering -> restart.  Each
    stage gates the next through ``st.session_state`` flags.
    """
    initialize_app()
    setup_sidebar()
    current_smile_code = handle_molecule_input()
    # Update session_state.ketcher if current_smile_code has changed from ketcher output
    if st.session_state.get("ketcher") != current_smile_code:
        st.session_state.ketcher = current_smile_code
        # No rerun here, let the flow continue. handle_molecule_input already warns.

    setup_planning_options()  # This function now also handles the button press and logic for planning

    # Display planning results and download options together
    if st.session_state.get("planning_done", False):
        display_planning_results()  # Displays stats and routes
        if st.session_state.res and st.session_state.res.get("solved", False):
            stat_col, download_col = st.columns(
                2, gap="medium"
            )  # Placeholder for download column
            with stat_col:
                st.subheader("Statistics")
                try:
                    res = st.session_state.res
                    # Backfill the target SMILES into the result dict so the
                    # statistics table can show it.
                    if (
                        "target_smiles" not in res
                        and "target_smiles" in st.session_state
                    ):
                        res["target_smiles"] = st.session_state.target_smiles
                    cols_to_show = [
                        col
                        for col in [
                            "target_smiles",
                            "num_routes",
                            "num_nodes",
                            "num_iter",
                            "search_time",
                        ]
                        if col in res
                    ]
                    if cols_to_show:  # Ensure there are columns to show
                        df = pd.DataFrame(res, index=[0])[cols_to_show]
                        st.dataframe(df)
                    else:
                        st.write("No statistics to display from planning results.")
                except Exception as e:
                    st.error(f"Error displaying statistics: {e}")
                    st.write(res)  # Show raw dict if DataFrame fails
            with download_col:
                st.subheader("Planning Downloads")  # Adding a subheader for clarity
                download_planning_results()

    # Clustering section (setup button, display, download)
    if (
        st.session_state.get("planning_done", False)
        and st.session_state.res
        and st.session_state.res.get("solved", False)
    ):
        setup_clustering()  # Contains the "Run Clustering" button and logic
        if st.session_state.get("clustering_done", False):
            display_clustering_results()  # Displays cluster routes and stats
            cluster_stat_col, cluster_download_col = st.columns(2, gap="medium")

            with cluster_stat_col:
                clusters = st.session_state.clusters
                cluster_sizes = [
                    cluster.get("group_size", 0)
                    for cluster in clusters.values()
                    if cluster
                ]  # Safe get
                st.subheader("Cluster Statistics")
                if cluster_sizes:
                    cluster_df = pd.DataFrame(
                        {
                            "Cluster": [
                                k for k, v in clusters.items() if v
                            ],  # Filter out empty clusters
                            "Number of Routes": [
                                v["group_size"] for v in clusters.values() if v
                            ],
                        }
                    )
                    if not cluster_df.empty:
                        # 1-based index reads better in the UI table.
                        cluster_df.index += 1
                        st.dataframe(cluster_df)
                        best_route_html = html_top_routes_cluster(
                            clusters,
                            st.session_state.tree,
                            st.session_state.target_smiles,
                        )
                        st.download_button(
                            label=f"Download best route from each cluster",
                            data=best_route_html,
                            file_name=f"cluster_best_{st.session_state.target_smiles}.html",
                            mime="text/html",
                            key=f"download_cluster_best",
                        )
                    else:
                        st.write("No valid cluster data to display statistics for.")
                    # download_top_routes_cluster()
                else:
                    st.write("No cluster data to display statistics for.")
            with cluster_download_col:
                download_clustering_results()

    # Subclustering section (setup button, display, download)
    if st.session_state.get("clustering_done", False):  # Depends on clustering
        setup_subclustering()  # Contains "Run Subclustering" button
        if st.session_state.get("subclustering_done", False):
            display_subclustering_results()  # Displays subcluster details and routes
            download_subclustering_results()  # This needs to be called after selections are made in display.

    implement_restart()
1347
+
1348
+ if __name__ == "__main__":
1349
+ main()
pre-requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ --find-links https://download.pytorch.org/whl/torch_stable.html
2
+ torch==2.2.2+cpu
3
+ scikit-learn==1.5.1
4
+ scipy==1.14.0
5
+ fastcluster==1.2.6
6
+ matplotlib==3.10.1
7
+ seaborn==0.13.2
requirements.txt CHANGED
@@ -1,3 +1,6 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
1
+ streamlit
2
+ streamlit_ketcher
3
+ git+https://github.com/Laboratoire-de-Chemoinformatique/SynPlanner.git
4
+
5
+ git+https://github.com/cimm-kzn/StructureFingerprint.git
6
+ scikit-learn
src/streamlit_app.py DELETED
@@ -1,40 +0,0 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
- import streamlit as st
5
-
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
synplan/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .mcts import *
2
+
3
+ __all__ = ["Tree"]
synplan/chem/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from CGRtools.files import SMILESRead
2
+
3
+ smiles_parser = SMILESRead.create_parser(ignore=True)
synplan/chem/data/__init__.py ADDED
File without changes
synplan/chem/data/filtering.py ADDED
@@ -0,0 +1,962 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing classes abd functions for reactions filtering."""
2
+
3
+ import logging
4
+ from dataclasses import dataclass
5
+ from io import TextIOWrapper
6
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
7
+
8
+ import numpy as np
9
+ import ray
10
+ import yaml
11
+ from CGRtools.containers import CGRContainer, MoleculeContainer, ReactionContainer
12
+ from chython.algorithms.fingerprints.morgan import MorganFingerprint
13
+ from tqdm import tqdm
14
+
15
+ from synplan.chem.data.standardizing import (
16
+ AromaticFormStandardizer,
17
+ KekuleFormStandardizer,
18
+ RemoveReagentsStandardizer,
19
+ )
20
+ from synplan.chem.utils import cgrtools_to_chython_molecule
21
+ from synplan.utils.config import ConfigABC, convert_config_to_dict
22
+ from synplan.utils.files import ReactionReader, ReactionWriter
23
+
24
+
25
@dataclass
class CompeteProductsConfig(ConfigABC):
    """Configuration for the compete-products reaction filter.

    Holds the two similarity thresholds used by CompeteProductsFilter.
    """

    # Minimal Morgan-fingerprint Tanimoto similarity before the MCS check runs.
    fingerprint_tanimoto_threshold: float = 0.3
    # Minimal MCS-based Tanimoto similarity to flag a competing product.
    mcs_tanimoto_threshold: float = 0.6

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]) -> "CompeteProductsConfig":
        """Create an instance of CompeteProductsConfig from a dictionary."""
        return CompeteProductsConfig(**config_dict)

    @staticmethod
    def from_yaml(file_path: str) -> "CompeteProductsConfig":
        """Deserialize a YAML file into a CompeteProductsConfig object."""
        with open(file_path, "r", encoding="utf-8") as file:
            config_dict = yaml.safe_load(file)
        return CompeteProductsConfig.from_dict(config_dict)

    def _validate_params(self, params: Dict[str, Any]) -> None:
        """Validate configuration parameters.

        :param params: Mapping with the two threshold keys.
        :raises ValueError: If a threshold is missing, not a float, or outside [0, 1].

        NOTE(review): ``isinstance(..., float)`` rejects integer values such as
        0 or 1 (and YAML parses "1" as int) — confirm whether ints should be
        accepted here.
        """
        if not isinstance(params.get("fingerprint_tanimoto_threshold"), float) or not (
            0 <= params["fingerprint_tanimoto_threshold"] <= 1
        ):
            raise ValueError(
                "Invalid 'fingerprint_tanimoto_threshold'; expected a float between 0 and 1"
            )

        if not isinstance(params.get("mcs_tanimoto_threshold"), float) or not (
            0 <= params["mcs_tanimoto_threshold"] <= 1
        ):
            raise ValueError(
                "Invalid 'mcs_tanimoto_threshold'; expected a float between 0 and 1"
            )
59
+ class CompeteProductsFilter:
60
+ """Checks if there are compete reactions."""
61
+
62
+ def __init__(
63
+ self,
64
+ fingerprint_tanimoto_threshold: float = 0.3,
65
+ mcs_tanimoto_threshold: float = 0.6,
66
+ ):
67
+ self.fingerprint_tanimoto_threshold = fingerprint_tanimoto_threshold
68
+ self.mcs_tanimoto_threshold = mcs_tanimoto_threshold
69
+
70
+ @staticmethod
71
+ def from_config(config: CompeteProductsConfig) -> "CompeteProductsFilter":
72
+ """Creates an instance of CompeteProductsFilter from a configuration object."""
73
+ return CompeteProductsFilter(
74
+ config.fingerprint_tanimoto_threshold, config.mcs_tanimoto_threshold
75
+ )
76
+
77
+ def __call__(self, reaction: ReactionContainer) -> bool:
78
+ """Checks if the reaction has competing products, else False.
79
+
80
+ :param reaction: Input reaction.
81
+ :return: Returns True if the reaction has competing products, else False.
82
+ """
83
+ mf = MorganFingerprint()
84
+ is_compete = False
85
+
86
+ # check for compete products using both fingerprint similarity and maximum common substructure (MCS) similarity
87
+ for mol in reaction.reagents:
88
+ for other_mol in reaction.products:
89
+ if len(mol) > 6 and len(other_mol) > 6:
90
+ # compute fingerprint similarity
91
+ molf = mf.transform([cgrtools_to_chython_molecule(mol)])
92
+ other_molf = mf.transform([cgrtools_to_chython_molecule(other_mol)])
93
+ fingerprint_tanimoto = tanimoto_kernel(molf, other_molf)[0][0]
94
+
95
+ # if fingerprint similarity is high enough, check for MCS similarity
96
+ if fingerprint_tanimoto > self.fingerprint_tanimoto_threshold:
97
+ try:
98
+ # find the maximum common substructure (MCS) and compute its size
99
+ clique_size = len(
100
+ next(mol.get_mcs_mapping(other_mol, limit=100))
101
+ )
102
+
103
+ # calculate MCS similarity based on MCS size
104
+ mcs_tanimoto = clique_size / (
105
+ len(mol) + len(other_mol) - clique_size
106
+ )
107
+
108
+ # if MCS similarity is also high enough, mark the reaction as having compete products
109
+ if mcs_tanimoto > self.mcs_tanimoto_threshold:
110
+ is_compete = True
111
+ break
112
+ except StopIteration:
113
+ continue
114
+
115
+ return is_compete
116
+
117
+
118
@dataclass
class DynamicBondsConfig(ConfigABC):
    """Configuration for the dynamic-bonds filter.

    :ivar min_bonds_number: Minimum allowed number of dynamic bonds in the CGR.
    :ivar max_bonds_number: Maximum allowed number of dynamic bonds in the CGR.
    """

    min_bonds_number: int = 1
    max_bonds_number: int = 6

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]) -> "DynamicBondsConfig":
        """Create an instance of DynamicBondsConfig from a dictionary."""
        return DynamicBondsConfig(**config_dict)

    @staticmethod
    def from_yaml(file_path: str) -> "DynamicBondsConfig":
        """Deserialize a YAML file into a DynamicBondsConfig object."""
        # explicit encoding: the default is platform-dependent, and the sibling
        # config classes already read YAML as UTF-8
        with open(file_path, "r", encoding="utf-8") as file:
            config_dict = yaml.safe_load(file)
        return DynamicBondsConfig.from_dict(config_dict)

    def _validate_params(self, params: Dict[str, Any]) -> None:
        """Validate configuration parameters.

        :param params: Mapping with 'min_bonds_number' and 'max_bonds_number'.
        :raises ValueError: If either bound is not a non-negative integer, or if
            the minimum exceeds the maximum.
        """
        if (
            not isinstance(params.get("min_bonds_number"), int)
            or params["min_bonds_number"] < 0
        ):
            raise ValueError(
                "Invalid 'min_bonds_number'; expected a non-negative integer"
            )

        if (
            not isinstance(params.get("max_bonds_number"), int)
            or params["max_bonds_number"] < 0
        ):
            raise ValueError(
                "Invalid 'max_bonds_number'; expected a non-negative integer"
            )

        if params["min_bonds_number"] > params["max_bonds_number"]:
            raise ValueError(
                "'min_bonds_number' cannot be greater than 'max_bonds_number'"
            )
class DynamicBondsFilter:
    """Checks if there is an unacceptable number of dynamic bonds in CGR."""

    def __init__(self, min_bonds_number: int = 1, max_bonds_number: int = 6):
        # inclusive bounds on the allowed number of dynamic (changed) bonds
        self.min_bonds_number = min_bonds_number
        self.max_bonds_number = max_bonds_number

    @staticmethod
    def from_config(config: DynamicBondsConfig):
        """Creates an instance of DynamicBondsChecker from a configuration object."""
        return DynamicBondsFilter(config.min_bonds_number, config.max_bonds_number)

    def __call__(self, reaction: ReactionContainer) -> bool:
        """Return True when the CGR's dynamic-bond count falls outside the
        configured [min, max] range (i.e. the reaction should be filtered)."""
        n_dynamic_bonds = len((~reaction).center_bonds)
        if n_dynamic_bonds < self.min_bonds_number:
            return True
        return n_dynamic_bonds > self.max_bonds_number
@dataclass
class SmallMoleculesConfig(ConfigABC):
    """Configuration for the small-molecules filter.

    :ivar mol_max_size: Maximum atom count for a molecule to count as "small".
    """

    mol_max_size: int = 6

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]) -> "SmallMoleculesConfig":
        """Creates an instance of SmallMoleculesConfig from a dictionary."""
        return SmallMoleculesConfig(**config_dict)

    @staticmethod
    def from_yaml(file_path: str) -> "SmallMoleculesConfig":
        """Deserialize a YAML file into a SmallMoleculesConfig object."""
        # explicit encoding: the default is platform-dependent, and the sibling
        # config classes already read YAML as UTF-8
        with open(file_path, "r", encoding="utf-8") as file:
            config_dict = yaml.safe_load(file)
        return SmallMoleculesConfig.from_dict(config_dict)

    def _validate_params(self, params: Dict[str, Any]) -> None:
        """Validate configuration parameters.

        :param params: Mapping with 'mol_max_size'.
        :raises ValueError: If 'mol_max_size' is not a positive integer.
        """
        if (
            not isinstance(params.get("mol_max_size"), int)
            or params["mol_max_size"] < 1
        ):
            raise ValueError("Invalid 'mol_max_size'; expected a positive integer")
class SmallMoleculesFilter:
    """Checks if there are only small molecules in the reaction or if there is only one
    small reactant or product."""

    def __init__(self, mol_max_size: int = 6):
        # molecules with at most this many atoms count as "small"
        self.limit = mol_max_size

    @staticmethod
    def from_config(config: SmallMoleculesConfig) -> "SmallMoleculesFilter":
        """Creates an instance of SmallMoleculesChecker from a configuration object."""
        return SmallMoleculesFilter(config.mol_max_size)

    def __call__(self, reaction: ReactionContainer) -> bool:
        """Return True when the reaction is dominated by small molecules:
        a single small reactant, a single small product, or small molecules
        on both sides."""
        reactants_small = self.are_only_small_molecules(reaction.reactants)
        products_small = self.are_only_small_molecules(reaction.products)
        if len(reaction.reactants) == 1 and reactants_small:
            return True
        if len(reaction.products) == 1 and products_small:
            return True
        return reactants_small and products_small

    def are_only_small_molecules(self, molecules: Iterable[MoleculeContainer]) -> bool:
        """Checks if all molecules in the given iterable are small molecules."""
        for molecule in molecules:
            if len(molecule) > self.limit:
                return False
        return True
@dataclass
class CGRConnectedComponentsConfig:
    """Configuration for the CGR connected-components filter (no tunable parameters)."""
class CGRConnectedComponentsFilter:
    """Checks if CGR contains unrelated components (without reagents)."""

    @staticmethod
    def from_config(
        config: CGRConnectedComponentsConfig,
    ) -> "CGRConnectedComponentsFilter":
        """Creates an instance of CGRConnectedComponentsChecker from a configuration
        object."""
        return CGRConnectedComponentsFilter()

    def __call__(self, reaction: ReactionContainer) -> bool:
        """Return True when the reagent-free CGR splits into more than one
        connected component (reactants and products are not linked)."""
        # rebuild the reaction without reagents so they cannot bridge components
        stripped = ReactionContainer(reaction.reactants, reaction.products)
        return (~stripped).connected_components_count > 1
@dataclass
class RingsChangeConfig:
    """Configuration for the ring-count change filter (no tunable parameters)."""
class RingsChangeFilter:
    """Checks if there is changing rings number in the reaction."""

    @staticmethod
    def from_config(config: RingsChangeConfig) -> "RingsChangeFilter":
        """Creates an instance of RingsChecker from a configuration object."""
        return RingsChangeFilter()

    def __call__(self, reaction: ReactionContainer):
        """
        Return True when the total ring count or the aromatic ring count differs
        between reactants and products (i.e. the reaction creates or destroys
        rings).

        :param reaction: Input reaction.
        :return: True if the ring counts mismatch, else False.
        """
        reactant_counts = self._calc_rings(reaction.reactants)
        product_counts = self._calc_rings(reaction.products)
        # compare (total rings, aromatic rings) pairs in one shot
        return reactant_counts != product_counts

    @staticmethod
    def _calc_rings(molecules: Iterable) -> Tuple[int, int]:
        """
        Count all rings and aromatic rings over the given molecules.

        :param molecules: Set of molecules.
        :return: (total ring count, aromatic ring count).
        """
        total_rings, aromatic_rings = 0, 0
        for molecule in molecules:
            total_rings += molecule.rings_count
            aromatic_rings += len(molecule.aromatic_rings)
        return total_rings, aromatic_rings
@dataclass
class StrangeCarbonsConfig:
    """Configuration for the strange-carbons filter (no tunable parameters yet;
    may be extended in the future)."""
class StrangeCarbonsFilter:
    """Checks if there are 'strange' carbons in the reaction."""

    @staticmethod
    def from_config(config: StrangeCarbonsConfig) -> "StrangeCarbonsFilter":
        """Creates an instance of StrangeCarbonsChecker from a configuration object."""
        return StrangeCarbonsFilter()

    def __call__(self, reaction: ReactionContainer) -> bool:
        """Return True when any molecule is carbon-only and is either a lone
        atom (methane) or built from a single non-aromatic bond type."""
        for molecule in reaction.reactants + reaction.products:
            symbols = {atom.atomic_symbol for _, atom in molecule.atoms()}
            if symbols != {"C"}:
                continue
            if len(molecule) == 1:  # a bare carbon atom (methane)
                return True
            orders = {int(bond) for _, _, bond in molecule.bonds()}
            # exactly one bond order that is not aromatic (order 4) is suspicious
            if len(orders) == 1 and orders.pop() != 4:
                return True
        return False
@dataclass
class NoReactionConfig:
    """Configuration for the no-reaction filter (no tunable parameters yet;
    may be extended in the future)."""
class NoReactionFilter:
    """Checks if there is no reaction in the provided reaction container."""

    @staticmethod
    def from_config(config: NoReactionConfig) -> "NoReactionFilter":
        """Creates an instance of NoReactionChecker from a configuration object."""
        return NoReactionFilter()

    def __call__(self, reaction: ReactionContainer) -> bool:
        """Return True when the CGR has an empty reaction centre — no changed
        atoms and no changed bonds — i.e. nothing actually reacts."""
        cgr = ~reaction
        if cgr.center_atoms:
            return False
        return not cgr.center_bonds
@dataclass
class MultiCenterConfig:
    """Configuration for the multi-center reaction filter (no tunable parameters)."""
class MultiCenterFilter:
    """Checks if there is a multicenter reaction."""

    @staticmethod
    def from_config(config: MultiCenterConfig) -> "MultiCenterFilter":
        """Creates an instance of MultiCenterFilter from a configuration object."""
        return MultiCenterFilter()

    def __call__(self, reaction: ReactionContainer) -> bool:
        """Return True when the reaction's CGR decomposes into more than one
        distinct reaction centre."""
        centers = (~reaction).centers_list
        return len(centers) > 1
@dataclass
class WrongCHBreakingConfig:
    """Configuration for the wrong C-H breaking filter (no tunable parameters)."""
class WrongCHBreakingFilter:
    """Checks for incorrect C-C bond formation from breaking a C-H bond."""

    @staticmethod
    def from_config(config: WrongCHBreakingConfig) -> "WrongCHBreakingFilter":
        """Creates an instance of WrongCHBreakingFilter from a configuration object."""
        return WrongCHBreakingFilter()

    def __call__(self, reaction: ReactionContainer) -> bool:
        """
        Determines if a reaction involves incorrect C-C bond formation from breaking
        a C-H bond.

        :param reaction: The reaction to be filtered.
        :return: True if incorrect C-C bond formation is found, False otherwise.
        """

        # reactions with valence errors are left for the valence-oriented checks
        if reaction.check_valence():
            return False

        # work on a copy so the caller's reaction keeps implicit hydrogens
        copy_reaction = reaction.copy()
        copy_reaction.explicify_hydrogens()
        cgr = ~copy_reaction
        # restrict the analysis to the reaction centre plus its first shell
        reduced_cgr = cgr.augmented_substructure(cgr.center_atoms, deep=1)

        return self.is_wrong_c_h_breaking(reduced_cgr)

    @staticmethod
    def is_wrong_c_h_breaking(cgr: CGRContainer) -> bool:
        """
        Checks for incorrect C-C bond formation from breaking a C-H bond in a CGR.

        :param cgr: The CGR with explicified hydrogens.
        :return: True if incorrect C-C bond formation is found, False otherwise.
        """
        for atom_id in cgr.center_atoms:
            if cgr.atom(atom_id).atomic_symbol == "C":
                is_c_h_breaking, is_c_c_formation = False, False
                c_with_h_id, another_c_id = None, None

                # NOTE(review): relies on CGRContainer's private _bonds mapping
                for neighbour_id, bond in cgr._bonds[atom_id].items():
                    neighbour = cgr.atom(neighbour_id)

                    # bond exists before the reaction but not after -> C-H broken
                    if (
                        bond.order
                        and not bond.p_order
                        and neighbour.atomic_symbol == "H"
                    ):
                        is_c_h_breaking = True
                        c_with_h_id = atom_id

                    # bond absent before the reaction but present after -> C-C formed
                    elif (
                        not bond.order
                        and bond.p_order
                        and neighbour.atomic_symbol == "C"
                    ):
                        is_c_c_formation = True
                        another_c_id = neighbour_id

                if is_c_h_breaking and is_c_c_formation:
                    # check for presence of heteroatoms in the first environment of 2 bonding carbons;
                    # a heteroatom neighbour makes the transformation plausible, so it is not flagged
                    if any(
                        cgr.atom(neighbour_id).atomic_symbol not in ("C", "H")
                        for neighbour_id in cgr._bonds[c_with_h_id]
                    ) or any(
                        cgr.atom(neighbour_id).atomic_symbol not in ("C", "H")
                        for neighbour_id in cgr._bonds[another_c_id]
                    ):
                        return False
                    return True

        return False
@dataclass
class CCsp3BreakingConfig:
    """Configuration for the C(sp3)-C bond-breaking filter (no tunable parameters)."""
class CCsp3BreakingFilter:
    """Checks if there is C(sp3)-C bond breaking."""

    @staticmethod
    def from_config(config: CCsp3BreakingConfig) -> "CCsp3BreakingFilter":
        """Creates an instance of CCsp3BreakingFilter from a configuration object."""
        return CCsp3BreakingFilter()

    def __call__(self, reaction: ReactionContainer) -> bool:
        """
        Returns True if there is C(sp3)-C bonds breaking, else False.

        :param reaction: Input reaction
        :return: Returns True if there is C(sp3)-C bonds breaking, else False.
        """
        cgr = ~reaction
        # reaction centre plus the first shell of neighbouring atoms
        reaction_center = cgr.augmented_substructure(cgr.center_atoms, deep=1)
        for atom_id, neighbour_id, bond in reaction_center.bonds():
            atom = reaction_center.atom(atom_id)
            neighbour = reaction_center.atom(neighbour_id)

            # order set before the reaction, absent after -> the bond is broken
            is_bond_broken = bond.order is not None and bond.p_order is None
            are_atoms_carbons = (
                atom.atomic_symbol == "C" and neighbour.atomic_symbol == "C"
            )
            # hybridization value 1 means sp3 here
            is_atom_sp3 = atom.hybridization == 1 or neighbour.hybridization == 1

            if is_bond_broken and are_atoms_carbons and is_atom_sp3:
                return True
        return False
@dataclass
class CCRingBreakingConfig:
    """Pass to ReactionFilterConfig to enable the C-C ring-breaking filter
    (no tunable parameters)."""
class CCRingBreakingFilter:
    """Checks if a reaction involves ring C-C bond breaking."""

    @staticmethod
    def from_config(config: CCRingBreakingConfig):
        """Creates an instance of CCRingBreakingFilter from a configuration object."""
        return CCRingBreakingFilter()

    def __call__(self, reaction: ReactionContainer) -> bool:
        """
        Returns True if the reaction involves ring C-C bond breaking, else False.

        :param reaction: Input reaction
        :return: Returns True if the reaction involves ring C-C bond breaking, else
            False.
        """
        cgr = ~reaction

        # Extract reactants' center atoms and their rings
        reactants_center_atoms = {}
        reactants_rings = set()
        for reactant in reaction.reactants:
            reactants_rings.update(reactant.sssr)
            for n, atom in reactant.atoms():
                # keep only atoms that belong to the reaction centre
                if n in cgr.center_atoms:
                    reactants_center_atoms[n] = atom

        # identify reaction center based on center atoms (deep=0: centre only)
        reaction_center = cgr.augmented_substructure(atoms=cgr.center_atoms, deep=0)

        # iterate over bonds in the reaction center and filter for ring C-C bond breaking
        for atom_id, neighbour_id, bond in reaction_center.bonds():
            try:
                # Retrieve corresponding atoms from reactants
                atom = reactants_center_atoms[atom_id]
                neighbour = reactants_center_atoms[neighbour_id]
            except KeyError:
                # bond endpoints not found among reactant centre atoms -> skip
                continue
            else:
                # Check if the bond is broken and both atoms are carbons in rings of size 5, 6, or 7
                is_bond_broken = (bond.order is not None) and (bond.p_order is None)
                are_atoms_carbons = (
                    atom.atomic_symbol == "C" and neighbour.atomic_symbol == "C"
                )
                # both atoms must sit in a 5/6/7-membered ring AND share one ring
                are_atoms_in_ring = (
                    set(atom.ring_sizes).intersection({5, 6, 7})
                    and set(neighbour.ring_sizes).intersection({5, 6, 7})
                    and any(
                        atom_id in ring and neighbour_id in ring
                        for ring in reactants_rings
                    )
                )

                # If all conditions are met, indicate ring C-C bond breaking
                if is_bond_broken and are_atoms_carbons and are_atoms_in_ring:
                    return True

        return False
@dataclass
class ReactionFilterConfig(ConfigABC):
    """
    Configuration class for reaction filtering. This class manages configuration
    settings for various reaction filters; a filter is enabled by assigning its
    config object and disabled by leaving it as None.

    :ivar dynamic_bonds_config: Configuration for dynamic bonds checking.
    :ivar small_molecules_config: Configuration for small molecules checking.
    :ivar strange_carbons_config: Configuration for strange carbons checking.
    :ivar compete_products_config: Configuration for competing products checking.
    :ivar cgr_connected_components_config: Configuration for CGR connected components checking.
    :ivar rings_change_config: Configuration for rings change checking.
    :ivar no_reaction_config: Configuration for no reaction checking.
    :ivar multi_center_config: Configuration for multi-center checking.
    :ivar wrong_ch_breaking_config: Configuration for wrong C-H breaking checking.
    :ivar cc_sp3_breaking_config: Configuration for CC sp3 breaking checking.
    :ivar cc_ring_breaking_config: Configuration for CC ring breaking checking.
    """

    # configuration for reaction filters
    dynamic_bonds_config: Optional[DynamicBondsConfig] = None
    small_molecules_config: Optional[SmallMoleculesConfig] = None
    strange_carbons_config: Optional[StrangeCarbonsConfig] = None
    compete_products_config: Optional[CompeteProductsConfig] = None
    cgr_connected_components_config: Optional[CGRConnectedComponentsConfig] = None
    rings_change_config: Optional[RingsChangeConfig] = None
    no_reaction_config: Optional[NoReactionConfig] = None
    multi_center_config: Optional[MultiCenterConfig] = None
    wrong_ch_breaking_config: Optional[WrongCHBreakingConfig] = None
    cc_sp3_breaking_config: Optional[CCsp3BreakingConfig] = None
    cc_ring_breaking_config: Optional[CCRingBreakingConfig] = None

    # field names of parameter-less "flag" configs, in serialization order;
    # shared by to_dict/from_dict to keep them consistent
    _FLAG_CONFIG_CLASSES = {
        "cgr_connected_components_config": CGRConnectedComponentsConfig,
        "rings_change_config": RingsChangeConfig,
        "strange_carbons_config": StrangeCarbonsConfig,
        "no_reaction_config": NoReactionConfig,
        "multi_center_config": MultiCenterConfig,
        "wrong_ch_breaking_config": WrongCHBreakingConfig,
        "cc_sp3_breaking_config": CCsp3BreakingConfig,
        "cc_ring_breaking_config": CCRingBreakingConfig,
    }
    # configs that carry real parameters and are (de)serialized via their fields
    _PARAM_CONFIG_CLASSES = {
        "dynamic_bonds_config": DynamicBondsConfig,
        "small_molecules_config": SmallMoleculesConfig,
        "compete_products_config": CompeteProductsConfig,
    }

    def to_dict(self):
        """Converts the configuration into a dictionary, omitting disabled filters."""
        config_dict = {
            key: convert_config_to_dict(getattr(self, key), config_cls)
            for key, config_cls in self._PARAM_CONFIG_CLASSES.items()
        }
        # parameter-less configs serialize to an empty mapping when enabled
        for key in self._FLAG_CONFIG_CLASSES:
            config_dict[key] = {} if getattr(self, key) is not None else None

        return {k: v for k, v in config_dict.items() if v is not None}

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]) -> "ReactionFilterConfig":
        """Create an instance of ReactionFilterConfig from a dictionary.

        A filter is enabled when its key is present in the dictionary;
        parameterized configs are built from the nested mapping, flag configs
        are built with defaults.
        """
        kwargs: Dict[str, Any] = {}
        for key, config_cls in ReactionFilterConfig._PARAM_CONFIG_CLASSES.items():
            kwargs[key] = (
                config_cls(**config_dict[key]) if key in config_dict else None
            )
        for key, config_cls in ReactionFilterConfig._FLAG_CONFIG_CLASSES.items():
            kwargs[key] = config_cls() if key in config_dict else None
        return ReactionFilterConfig(**kwargs)

    @staticmethod
    def from_yaml(file_path: str) -> "ReactionFilterConfig":
        """Deserializes a YAML file into a ReactionFilterConfig object."""
        with open(file_path, "r", encoding="utf-8") as file:
            config_dict = yaml.safe_load(file)
        return ReactionFilterConfig.from_dict(config_dict)

    def _validate_params(self, params: Dict[str, Any]):
        """Each per-filter config validates itself; nothing to check here."""

    def create_filters(self):
        """Instantiate a filter object for every enabled config.

        :return: The list of filter instances, in the application order expected
            by ``filter_reaction``.
        """
        ordered_filters = (
            (self.dynamic_bonds_config, DynamicBondsFilter),
            (self.small_molecules_config, SmallMoleculesFilter),
            (self.strange_carbons_config, StrangeCarbonsFilter),
            (self.compete_products_config, CompeteProductsFilter),
            (self.cgr_connected_components_config, CGRConnectedComponentsFilter),
            (self.rings_change_config, RingsChangeFilter),
            (self.no_reaction_config, NoReactionFilter),
            (self.multi_center_config, MultiCenterFilter),
            (self.wrong_ch_breaking_config, WrongCHBreakingFilter),
            (self.cc_sp3_breaking_config, CCsp3BreakingFilter),
            (self.cc_ring_breaking_config, CCRingBreakingFilter),
        )
        return [
            filter_cls.from_config(config)
            for config, filter_cls in ordered_filters
            if config is not None
        ]
def tanimoto_kernel(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Calculate the Tanimoto coefficient between each row of x and each row of y.

    :param x: 2-D fingerprint array of shape (n, d).
    :param y: 2-D fingerprint array of shape (m, d).
    :return: Array of shape (n, m) where entry [i][j] is the Tanimoto similarity
        of x[i] and y[j]; pairs with a zero denominator yield 0.
    """
    x = x.astype(np.float64)
    y = y.astype(np.float64)
    dot = np.dot(x, y.T)
    x_sq = np.sum(x**2, axis=1)
    y_sq = np.sum(y**2, axis=1)

    # |A ∪ B| = |A| + |B| - |A ∩ B|; broadcasting builds the (n, m) denominator
    # without the Python-level list replication of np.array([x_sq] * m)
    denominator = x_sq[:, None] + y_sq[None, :] - dot
    return np.divide(dot, denominator, out=np.zeros_like(dot), where=denominator != 0)
def filter_reaction(
    reaction: ReactionContainer, config: ReactionFilterConfig, filters: list
) -> Tuple[bool, ReactionContainer]:
    """Checks the input reaction. Returns True if reaction is detected as erroneous and
    returns reaction itself, which sometimes is modified and does not necessarily
    correspond to the initial reaction.

    :param reaction: Reaction to be filtered.
    :param config: Reaction filtration configuration.
        NOTE(review): currently unused inside this function — the filters arrive
        pre-built via ``filters``.
    :param filters: The list of reaction filters.
    :return: False and reaction if reaction is correct and True and reaction if reaction
        is filtered (erroneous).
    """

    is_filtered = False

    # run reaction standardization

    # a falsy result from any standardizer marks the reaction as filtered
    standardizers = [
        RemoveReagentsStandardizer(),
        KekuleFormStandardizer(),
        AromaticFormStandardizer(),
    ]

    for reaction_standardizer in standardizers:
        reaction = reaction_standardizer(reaction)
        if not reaction:
            is_filtered = True
            break

    # run reaction filtration
    if not is_filtered:
        # note: no early exit — every filter runs, so "filtration_log" records
        # the last filter that rejected the reaction
        for reaction_filter in filters:
            try:  # CGRTools ValueError: mapping of graphs is not disjoint
                if reaction_filter(reaction):
                    # if filter returns True it means the reaction doesn't pass the filter
                    reaction.meta["filtration_log"] = reaction_filter.__class__.__name__
                    is_filtered = True
            except Exception as e:
                # a crashing filter counts as a rejection instead of aborting the run
                logging.debug(e)
                is_filtered = True

    return is_filtered, reaction
@ray.remote
def process_batch(
    batch: List[ReactionContainer],
    config: ReactionFilterConfig,
    filters: list,
) -> List[Tuple[bool, ReactionContainer]]:
    """
    Filters a batch of reactions based on the given configuration. This function
    operates as a remote task in a distributed system using Ray.

    :param batch: The list of reactions to process (the caller submits plain
        ReactionContainer objects, not (index, reaction) tuples).
    :param config: Reaction filtration configuration.
    :param filters: The list of reaction filters.
    :return: A list of tuples, one per input reaction: a flag that is True when
        the reaction was filtered out (erroneous) and the possibly-modified
        reaction itself.
    """

    processed_reaction_list = []
    for reaction in batch:
        try:  # CGRtools.exceptions.MappingError: atoms with number {52} not equal
            is_filtered, processed_reaction = filter_reaction(reaction, config, filters)
            processed_reaction_list.append((is_filtered, processed_reaction))
        except Exception as e:
            # a crashing reaction is kept in the output but marked as filtered
            logging.debug(e)
            processed_reaction_list.append((True, reaction))
    return processed_reaction_list
def process_completed_batch(
    futures: Dict,
    result_file: TextIOWrapper,
    n_filtered: int = 0,
) -> int:
    """
    Waits for one Ray batch future to complete and writes its surviving
    reactions to the output.

    :param futures: A dictionary of futures representing ongoing batch processing tasks.
    :param result_file: The open writer receiving reactions that passed all
        filters. NOTE(review): despite the TextIOWrapper annotation, the caller
        passes a ReactionWriter — confirm before tightening the type.
    :param n_filtered: Running count of reactions written so far.
        NOTE(review): the name is misleading — this counts reactions that were
        NOT filtered out.
    :return: The updated count of written (kept) reactions.
    """

    # block until at least one submitted batch has finished
    ready_id, running_id = ray.wait(list(futures.keys()), num_returns=1)
    completed_batch = ray.get(ready_id[0])

    # write results of the completed batch to file
    for is_filtered, reaction in completed_batch:
        if not is_filtered:
            result_file.write(reaction)
            n_filtered += 1

    # remove completed future and update progress bar
    del futures[ready_id[0]]

    return n_filtered
def filter_reactions_from_file(
    config: ReactionFilterConfig,
    input_reaction_data_path: str,
    filtered_reaction_data_path: str = "reaction_data_filtered.smi",
    num_cpus: int = 1,
    batch_size: int = 100,
) -> None:
    """
    Processes reaction data, applying reaction filters based on the provided
    configuration, and writes the results to specified files.

    :param config: ReactionFilterConfig object containing all filtration
        configuration settings.
    :param input_reaction_data_path: Path to the reaction data file.
    :param filtered_reaction_data_path: Name for the file that will contain filtered
        reactions.
    :param num_cpus: Number of CPUs to use for processing.
    :param batch_size: Size of the batch for processing reactions.
    :return: None. The function writes the processed reactions to specified RDF/smi
        files.
    """

    filters = config.create_filters()

    ray.init(num_cpus=num_cpus, ignore_reinit_error=True, logging_level=logging.ERROR)
    max_concurrent_batches = num_cpus  # limit the number of concurrent batches
    lines_counter = 0  # total reactions read from the input file
    with ReactionReader(input_reaction_data_path) as reactions, ReactionWriter(
        filtered_reaction_data_path
    ) as result_file:

        batches_to_process, batch = {}, []
        # NOTE(review): counts reactions KEPT (written to output), not removed
        n_filtered = 0
        for index, reaction in tqdm(
            enumerate(reactions),
            desc="Number of reactions processed: ",
            bar_format="{desc}{n} [{elapsed}]",
        ):
            lines_counter += 1
            batch.append(reaction)
            if len(batch) == batch_size:
                # submit the full batch to Ray and start collecting a new one
                batch_results = process_batch.remote(batch, config, filters)
                batches_to_process[batch_results] = None
                batch = []

            # check and process completed tasks if we've reached the concurrency limit
            while len(batches_to_process) >= max_concurrent_batches:
                n_filtered = process_completed_batch(
                    batches_to_process,
                    result_file,
                    n_filtered,
                )

        # process the last batch if it's not empty
        if batch:
            batch_results = process_batch.remote(batch, config, filters)
            batches_to_process[batch_results] = None

        # process remaining batches
        while batches_to_process:
            n_filtered = process_completed_batch(
                batches_to_process,
                result_file,
                n_filtered,
            )

    ray.shutdown()
    print(f"Initial number of reactions: {lines_counter}")
    # NOTE(review): despite the label, this prints the number of reactions KEPT
    print(f"Filtered number of reactions: {n_filtered}")
synplan/chem/data/standardizing.py ADDED
@@ -0,0 +1,1187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing classes and functions for reactions standardizing.
2
+
3
+ This module contains the open-source code from
4
+ https://github.com/Laboratoire-de-Chemoinformatique/Reaction_Data_Cleaning/blob/master/scripts/standardizer.py
5
+ """
6
+
7
from __future__ import annotations

import hashlib
import logging
import sys
from abc import ABC, abstractmethod
from contextlib import suppress
from dataclasses import dataclass
from io import TextIOWrapper
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, TextIO, Tuple, Union

import ray
import yaml
from CGRtools import smiles as smiles_cgrtools
from CGRtools.containers import MoleculeContainer
from CGRtools.containers import ReactionContainer
from CGRtools.containers import ReactionContainer as ReactionContainerCGRTools
from chython import ReactionContainer as ReactionContainerChython
from chython import smiles as smiles_chython
from tqdm.auto import tqdm

from synplan.chem.utils import unite_molecules
from synplan.utils.config import ConfigABC
from synplan.utils.files import ReactionReader, ReactionWriter
from synplan.utils.logging import init_logger, init_ray_logging
33
+
34
+ logger = logging.getLogger("synplan.chem.data.standardizing")
35
+
36
+
37
class StandardizationError(RuntimeError):
    """Signals that a single standardization stage failed.

    Keeps the stage name, the offending reaction string and the underlying
    exception so callers can log, filter or inspect the failure.
    """

    def __init__(self, stage: str, reaction: str, original: Exception):
        message = f"{stage} failed on {reaction}: {original}"
        super().__init__(message)
        self.stage = stage
        self.reaction = reaction
        self.original = original
45
+
46
+
47
class BaseStandardizer(ABC):
    """Template for standardization steps: subclasses override `_run` only.

    Calling the instance delegates to :meth:`_run`; any unexpected failure
    is converted into a :class:`StandardizationError` with the original
    exception chained (``__cause__``), while a ``StandardizationError``
    raised by the subclass itself is propagated untouched (previously it
    was wrapped a second time).
    """

    @classmethod
    def from_config(cls, _cfg: object) -> "BaseStandardizer":
        """Build a standardizer from its (unused) config object."""
        return cls()

    @abstractmethod
    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
        """Run the standardization step on the reaction.

        Args:
            rxn: The reaction to standardize

        Returns:
            The standardized reaction

        Raises:
            StandardizationError: If standardization fails
        """
        ...

    def __call__(self, rxn: ReactionContainer) -> ReactionContainer:
        """Execute the standardization step with proper error handling.

        Args:
            rxn: The reaction to standardize

        Returns:
            The standardized reaction

        Raises:
            StandardizationError: If standardization fails
        """
        try:
            return self._run(rxn)
        except StandardizationError:
            # Already carries stage/reaction context — do not re-wrap.
            raise
        except Exception as exc:
            logging.debug("%s: %s", self.__class__.__name__, exc, exc_info=True)
            # Chain the root cause so tracebacks stay informative.
            raise StandardizationError(
                self.__class__.__name__, str(rxn), exc
            ) from exc
86
+
87
+
88
+ # Configuration classes
89
@dataclass
class ReactionMappingConfig:
    """Marker configuration enabling the atom-to-atom mapping step (no parameters)."""

    pass
92
+
93
+
94
class ReactionMappingStandardizer(BaseStandardizer):
    """Maps atoms of the reaction using chython (chytorch).

    The reaction is round-tripped through chython: the mapping is
    recomputed, reagents are removed, and the result is parsed back into
    a CGRtools container with the original metadata preserved.
    """

    def _map_and_remove_reagents(
        self, reaction: ReactionContainerChython
    ) -> ReactionContainerChython:
        """Recompute the atom mapping and drop reagents in place.

        Args:
            reaction: Input chython reaction

        Returns:
            The mapped reaction with reagents removed
        """
        reaction.reset_mapping()
        reaction.remove_reagents()
        return reaction

    def _run(self, rxn: ReactionContainerCGRTools) -> ReactionContainerCGRTools:
        """Map atoms of the reaction using chython.

        Args:
            rxn: Input reaction (CGRtools container or reaction SMILES string)

        Returns:
            The mapped reaction

        Raises:
            StandardizationError: If mapping fails
        """
        try:
            # Convert to chython format
            if isinstance(rxn, str):
                chython_reaction = smiles_chython(rxn)
            else:
                # Convert CGRtools reaction to SMILES string, preserving reagents
                reactants = ".".join(str(m) for m in rxn.reactants)
                reagents = ".".join(str(m) for m in rxn.reagents)
                products = ".".join(str(m) for m in rxn.products)
                smiles = f"{reactants}>{reagents}>{products}"
                # Parse SMILES string with chython
                chython_reaction = smiles_chython(smiles)

            # Map and remove reagents
            reaction_mapped = self._map_and_remove_reagents(chython_reaction)
            if not reaction_mapped:
                raise StandardizationError(
                    "ReactionMapping", str(rxn), ValueError("Mapping failed")
                )

            # Convert back to CGRtools format ("m" keeps mapping in the SMILES)
            mapped_smiles = format(reaction_mapped, "m")
            result = smiles_cgrtools(mapped_smiles)
            # A plain SMILES string input carries no metadata to copy;
            # previously `rxn.meta` raised AttributeError for str input.
            if not isinstance(rxn, str):
                result.meta.update(rxn.meta)  # Preserve metadata
            return result
        except StandardizationError:
            raise  # do not wrap our own error a second time
        except Exception as e:
            raise StandardizationError("ReactionMapping", str(rxn), e) from e
151
+
152
+
153
@dataclass
class FunctionalGroupsConfig:
    """Marker configuration enabling functional-group standardization (no parameters)."""

    pass
156
+
157
+
158
class FunctionalGroupsStandardizer(BaseStandardizer):
    """Functional groups standardization."""

    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
        """Normalize functional-group representations in place.

        Args:
            rxn: Input reaction

        Returns:
            The same reaction object after in-place standardization

        Raises:
            StandardizationError: If standardization fails
        """
        standardized = rxn
        standardized.standardize()
        return standardized
175
+
176
+
177
@dataclass
class KekuleFormConfig:
    """Marker configuration enabling kekulization of all reaction parts (no parameters)."""

    pass
180
+
181
+
182
class KekuleFormStandardizer(BaseStandardizer):
    """Reactants/reagents/products kekulization."""

    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
        """Convert aromatic rings to a Kekulé representation in place.

        Args:
            rxn: The reaction to kekulize

        Returns:
            The same reaction object after kekulization

        Raises:
            StandardizationError: If kekulization fails
        """
        kekulized = rxn
        kekulized.kekule()
        return kekulized
199
+
200
+
201
@dataclass
class CheckValenceConfig:
    """Marker configuration enabling atom valence checking (no parameters)."""

    pass
204
+
205
+
206
class CheckValenceStandardizer(BaseStandardizer):
    """Reject reactions containing atoms with invalid valences."""

    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
        """Check the valence of every atom in the reaction.

        Args:
            rxn: Input reaction

        Returns:
            The unchanged reaction when all valences are correct

        Raises:
            StandardizationError: If any molecule reports valence mistakes
        """
        # Same molecule order as reactants + products + reagents.
        for part in (rxn.reactants, rxn.products, rxn.reagents):
            for molecule in part:
                mistakes = molecule.check_valence()
                if mistakes:
                    raise StandardizationError(
                        "CheckValence",
                        str(rxn),
                        ValueError(f"Valence errors: {mistakes}"),
                    )
        return rxn
230
+
231
+
232
@dataclass
class ImplicifyHydrogensConfig:
    """Marker configuration enabling explicit-hydrogen removal (no parameters)."""

    pass
235
+
236
+
237
class ImplicifyHydrogensStandardizer(BaseStandardizer):
    """Convert explicit hydrogens to implicit ones."""

    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
        """Make hydrogens implicit in place.

        Args:
            rxn: Input reaction

        Returns:
            The same reaction object with implicit hydrogens

        Raises:
            StandardizationError: If hydrogen implicification fails
        """
        result = rxn
        result.implicify_hydrogens()
        return result
254
+
255
+
256
@dataclass
class CheckIsotopesConfig:
    """Marker configuration enabling isotope detection and cleaning (no parameters)."""

    pass
259
+
260
+
261
class CheckIsotopesStandardizer(BaseStandardizer):
    """Detect isotopic labels and strip them when present."""

    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
        """Clean isotope labels from the reaction if any atom carries one.

        Args:
            rxn: Input reaction

        Returns:
            The reaction with isotopes cleaned (in place) when needed

        Raises:
            StandardizationError: If isotope check/cleaning fails
        """
        # any() short-circuits exactly like the original nested breaks.
        has_isotope = any(
            atom.isotope
            for molecule in rxn.reactants + rxn.products
            for _, atom in molecule.atoms()
        )
        if has_isotope:
            rxn.clean_isotopes()
        return rxn
289
+
290
+
291
@dataclass
class SplitIonsConfig:
    """Marker configuration enabling ion splitting / charge balancing (no parameters)."""

    pass
294
+
295
+
296
class SplitIonsStandardizer(BaseStandardizer):
    """Split multi-component molecules into ions and check charge balance."""

    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
        """Split ions in the reaction.

        Args:
            rxn: Input reaction

        Returns:
            The reaction with split ions

        Raises:
            StandardizationError: If ions were split but the resulting
                reaction is charge-imbalanced (return code 2)
        """
        reaction, return_code = self._split_ions(rxn)
        if return_code == 2:  # ions were split but the reaction is imbalanced
            raise StandardizationError(
                "SplitIons",
                str(rxn),
                ValueError("Reaction is imbalanced after ion splitting"),
            )
        return reaction

    def _calc_charge(self, molecule: MoleculeContainer) -> int:
        """Compute total charge of a molecule.

        Args:
            molecule: Input molecule

        Returns:
            The total charge of the molecule
        """
        # NOTE(review): reads CGRtools private ``_charges`` mapping
        # (atom number -> formal charge) — confirm against the pinned
        # CGRtools version if it is ever upgraded.
        return sum(molecule._charges.values())

    def _split_ions(self, reaction: ReactionContainer) -> Tuple[ReactionContainer, int]:
        """Split ions in a reaction.

        Args:
            reaction: Input reaction

        Returns:
            A tuple containing:
            - The reaction with split ions
            - Return code (0: nothing changed, 1: ions split, 2: ions split but imbalanced)
        """
        meta = reaction.meta
        reaction_parts = []
        return_codes = []

        # Each of the three parts gets its own return code; the worst
        # (max) one decides the overall outcome.
        for molecules in (reaction.reactants, reaction.reagents, reaction.products):
            # Split molecules into individual components
            divided_molecules = []
            for molecule in molecules:
                if isinstance(molecule, str):
                    # If it's a string, try to parse it as a molecule
                    try:
                        molecule: MoleculeContainer = smiles_cgrtools(molecule)
                    except Exception as e:
                        # Unparseable entries are silently dropped (best effort).
                        logging.warning("Failed to parse molecule %s: %s", molecule, e)
                        continue

                # Use the split method from CGRtools
                try:
                    components = molecule.split()
                    divided_molecules.extend(components)
                except Exception as e:
                    # Keep the molecule whole if splitting fails.
                    logging.warning("Failed to split molecule %s: %s", molecule, e)
                    divided_molecules.append(molecule)

            total_charge = 0
            ions_present = False
            for molecule in divided_molecules:
                try:
                    mol_charge = self._calc_charge(molecule)
                    total_charge += mol_charge
                    if mol_charge != 0:
                        ions_present = True
                except Exception as e:
                    # Charge failures are tolerated; the component still stays
                    # in the reaction but does not contribute to the balance.
                    logging.warning(
                        "Failed to calculate charge for molecule %s: %s", molecule, e
                    )
                    continue

            # 2: charged components with a non-zero net charge (imbalanced);
            # 1: charged components that cancel out; 0: no ions at all.
            if ions_present and total_charge:
                return_codes.append(2)
            elif ions_present:
                return_codes.append(1)
            else:
                return_codes.append(0)

            reaction_parts.append(tuple(divided_molecules))

        return (
            ReactionContainer(
                reactants=reaction_parts[0],
                reagents=reaction_parts[1],
                products=reaction_parts[2],
                meta=meta,
            ),
            max(return_codes),
        )
398
+
399
+
400
@dataclass
class AromaticFormConfig:
    """Marker configuration enabling aromatization of molecules (no parameters)."""

    pass
403
+
404
+
405
class AromaticFormStandardizer(BaseStandardizer):
    """Aromatize molecules in reaction."""

    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
        """Convert rings to the aromatic (Thiele) form in place.

        Args:
            rxn: Input reaction

        Returns:
            The same reaction object with aromatized molecules

        Raises:
            StandardizationError: If aromatization fails
        """
        aromatized = rxn
        aromatized.thiele()
        return aromatized
422
+
423
+
424
@dataclass
class MappingFixConfig:
    """Marker configuration enabling atom-to-atom mapping fixes (no parameters)."""

    pass
427
+
428
+
429
class MappingFixStandardizer(BaseStandardizer):
    """Fix atom-to-atom mapping in reaction."""

    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
        """Repair the atom-to-atom mapping in place.

        Args:
            rxn: Input reaction

        Returns:
            The same reaction object with fixed atom-to-atom mapping

        Raises:
            StandardizationError: If mapping fix fails
        """
        fixed = rxn
        fixed.fix_mapping()
        return fixed
446
+
447
+
448
@dataclass
class UnchangedPartsConfig:
    """Marker configuration enabling removal of unchanged reaction parts (no parameters)."""

    pass
451
+
452
+
453
class UnchangedPartsStandardizer(BaseStandardizer):
    """Ungroup molecules, remove unchanged parts from reactants and products."""

    def __init__(
        self,
        add_reagents_to_reactants: bool = False,
        keep_reagents: bool = False,
    ):
        self.add_reagents_to_reactants = add_reagents_to_reactants
        self.keep_reagents = keep_reagents

    @classmethod
    def from_config(cls, config: UnchangedPartsConfig) -> "UnchangedPartsStandardizer":
        """Build with default options (the marker config carries none)."""
        return cls()

    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
        """Move molecules that appear unchanged on both sides to the reagents.

        Args:
            rxn: Input reaction

        Returns:
            The reaction with unchanged parts removed

        Raises:
            StandardizationError: If either side ends up empty
        """
        meta = rxn.meta
        reactants = list(rxn.reactants)
        reagents = list(rxn.reagents)
        if self.add_reagents_to_reactants:
            reactants += reagents
            reagents = []
        products = list(rxn.products)

        # Iterate over a snapshot: removing from `reactants` while iterating
        # it directly would skip elements.
        for candidate in list(reactants):
            if candidate in products:
                reagents.append(candidate)
                reactants.remove(candidate)
                products.remove(candidate)

        if not self.keep_reagents:
            reagents = []

        # Guard clauses: an empty side after pruning makes the reaction invalid.
        if not reactants and not products:
            raise StandardizationError(
                "UnchangedParts", str(rxn), ValueError("No molecules left")
            )
        if not reactants:
            raise StandardizationError(
                "UnchangedParts", str(rxn), ValueError("No reactants left")
            )
        if not products:
            raise StandardizationError(
                "UnchangedParts", str(rxn), ValueError("No products left")
            )

        new_reaction = ReactionContainer(
            reactants=tuple(reactants),
            reagents=tuple(reagents),
            products=tuple(products),
            meta=meta,
        )
        new_reaction.name = rxn.name
        return new_reaction
518
+
519
+
520
@dataclass
class SmallMoleculesConfig:
    """Configuration for the small-molecule removal step.

    Attributes:
        mol_max_size: Maximum number of atoms for a molecule to be
            considered "small" (and thus removed). Must be a positive
            integer.
    """

    mol_max_size: int = 6

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]) -> "SmallMoleculesConfig":
        """Create an instance of SmallMoleculesConfig from a dictionary."""
        return SmallMoleculesConfig(**config_dict)

    @staticmethod
    def from_yaml(file_path: str) -> "SmallMoleculesConfig":
        """Deserialize a YAML file into a SmallMoleculesConfig object."""
        with open(file_path, "r", encoding="utf-8") as file:
            config_dict = yaml.safe_load(file)
        return SmallMoleculesConfig.from_dict(config_dict)

    def _validate_params(self, params: Dict[str, Any]) -> None:
        """Validate configuration parameters.

        Raises:
            ValueError: If 'mol_max_size' is not a positive integer.
        """
        mol_max_size = params.get("mol_max_size", self.mol_max_size)
        # bool is an int subclass, so exclude it explicitly; the previous
        # error message ("more than 1") misstated the accepted range (>= 1).
        if (
            not isinstance(mol_max_size, int)
            or isinstance(mol_max_size, bool)
            or mol_max_size <= 0
        ):
            raise ValueError("Invalid 'mol_max_size'; expected a positive integer")
541
+
542
+
543
class SmallMoleculesStandardizer(BaseStandardizer):
    """Remove small molecules from a reaction, archiving them in the meta."""

    def __init__(self, mol_max_size: int = 6):
        self.mol_max_size = mol_max_size

    @classmethod
    def from_config(cls, config: SmallMoleculesConfig) -> "SmallMoleculesStandardizer":
        """Build from a SmallMoleculesConfig."""
        return cls(config.mol_max_size)

    def _split_molecules(
        self, molecules: Iterable, number_of_atoms: int
    ) -> Tuple[List[MoleculeContainer], List[MoleculeContainer]]:
        """Partition molecules by heavy-atom count.

        Args:
            molecules: Iterable of molecules
            number_of_atoms: Size threshold; strictly larger molecules are "big"

        Returns:
            Tuple of (big molecules, small molecules)
        """
        big, small = [], []
        for mol in molecules:
            bucket = big if len(mol) > number_of_atoms else small
            bucket.append(mol)
        return big, small

    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
        """Drop small molecules, keeping them in the meta for reference.

        Args:
            rxn: Input reaction

        Returns:
            The reaction without small molecules

        Raises:
            StandardizationError: If nothing but small molecules remains
        """
        kept_reactants, small_reactants = self._split_molecules(
            rxn.reactants, self.mol_max_size
        )
        kept_products, small_products = self._split_molecules(
            rxn.products, self.mol_max_size
        )

        if not kept_reactants or not kept_products:
            raise StandardizationError(
                "SmallMolecules",
                str(rxn),
                ValueError("No molecules left after removing small ones"),
            )

        new_reaction = ReactionContainer(
            kept_reactants, kept_products, rxn.reagents, rxn.meta
        )
        new_reaction.name = rxn.name

        # Archive the removed molecules so the information is not lost.
        new_reaction.meta["small_reactants"] = str(unite_molecules(small_reactants))
        new_reaction.meta["small_products"] = str(unite_molecules(small_products))

        return new_reaction
611
+
612
+
613
@dataclass
class RemoveReagentsConfig:
    """Configuration for the reagent removal step.

    Attributes:
        reagent_max_size: Maximum number of atoms for a detected reagent
            to be kept in the reagents list; larger ones are dropped.
            Must be a positive integer.
    """

    reagent_max_size: int = 7

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]) -> "RemoveReagentsConfig":
        """Create an instance of RemoveReagentsConfig from a dictionary."""
        return RemoveReagentsConfig(**config_dict)

    @staticmethod
    def from_yaml(file_path: str) -> "RemoveReagentsConfig":
        """Deserialize a YAML file into a RemoveReagentsConfig object."""
        with open(file_path, "r", encoding="utf-8") as file:
            config_dict = yaml.safe_load(file)
        return RemoveReagentsConfig.from_dict(config_dict)

    def _validate_params(self, params: Dict[str, Any]) -> None:
        """Validate configuration parameters.

        Raises:
            ValueError: If 'reagent_max_size' is not a positive integer.
        """
        reagent_max_size = params.get("reagent_max_size", self.reagent_max_size)
        # bool is an int subclass, so exclude it explicitly; the previous
        # error message ("more than 1") misstated the accepted range (>= 1).
        if (
            not isinstance(reagent_max_size, int)
            or isinstance(reagent_max_size, bool)
            or reagent_max_size <= 0
        ):
            raise ValueError("Invalid 'reagent_max_size'; expected a positive integer")
635
+ )
636
+
637
+
638
class RemoveReagentsStandardizer(BaseStandardizer):
    """Move molecules not touching the reaction centre into the reagents."""

    def __init__(self, reagent_max_size: int = 7):
        # Detected reagents larger than this atom count are dropped entirely.
        self.reagent_max_size = reagent_max_size

    @classmethod
    def from_config(cls, config: RemoveReagentsConfig) -> "RemoveReagentsStandardizer":
        """Build from a RemoveReagentsConfig."""
        return cls(config.reagent_max_size)

    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
        """Remove reagents from the reaction.

        A molecule is treated as a reagent if it shares no atoms with the
        CGR reaction centre, or if it appears unchanged on both sides.

        Args:
            rxn: Input reaction

        Returns:
            The reaction without reagents

        Raises:
            StandardizationError: If reagent removal leaves a side empty
        """
        not_changed_molecules = set(rxn.reactants).intersection(rxn.products)
        # ~rxn composes the condensed graph of the reaction (CGR);
        # its center_atoms are the atom numbers changed by the reaction.
        cgr = ~rxn
        center_atoms = set(cgr.center_atoms)

        new_reactants = []
        new_products = []
        new_reagents = []

        # NOTE(review): `center_atoms.isdisjoint(molecule)` assumes iterating
        # a molecule yields its atom numbers — confirm against the pinned
        # CGRtools version.
        for molecule in rxn.reactants:
            if center_atoms.isdisjoint(molecule) or molecule in not_changed_molecules:
                new_reagents.append(molecule)
            else:
                new_reactants.append(molecule)

        for molecule in rxn.products:
            if center_atoms.isdisjoint(molecule) or molecule in not_changed_molecules:
                new_reagents.append(molecule)
            else:
                new_products.append(molecule)

        if not new_reactants or not new_products:
            raise StandardizationError(
                "RemoveReagents",
                str(rxn),
                ValueError("No molecules left after removing reagents"),
            )

        # Filter reagents by size
        # NOTE(review): the set comprehension also deduplicates reagents and
        # discards their order — confirm this is intended.
        new_reagents = {
            molecule
            for molecule in new_reagents
            if len(molecule) <= self.reagent_max_size
        }

        new_reaction = ReactionContainer(
            new_reactants, new_products, new_reagents, rxn.meta
        )
        new_reaction.name = rxn.name

        return new_reaction
700
+
701
+
702
@dataclass
class RebalanceReactionConfig:
    """Marker configuration enabling reaction rebalancing via CGR (no parameters)."""

    pass
705
+
706
+
707
class RebalanceReactionStandardizer(BaseStandardizer):
    """Rebalance reaction via CGR composition/decomposition."""

    @classmethod
    def from_config(
        cls, config: RebalanceReactionConfig
    ) -> "RebalanceReactionStandardizer":
        """Build with default options (the marker config carries none)."""
        return cls()

    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
        """Rebalances the reaction by assembling CGR and then decomposing it. Works for
        all reactions for which the correct CGR can be assembled.

        Args:
            rxn: Input reaction

        Returns:
            The rebalanced reaction

        Raises:
            StandardizationError: If rebalancing fails; carries the root
                cause instead of a generic message.
        """
        try:
            # Compose the CGR from reactants/products only, then decompose it
            # back; this restores atoms missing from either side.
            tmp_rxn = ReactionContainer(rxn.reactants, rxn.products)
            cgr = ~tmp_rxn
            reactants, products = ~cgr
            new_rxn = ReactionContainer(
                reactants.split(), products.split(), rxn.reagents, rxn.meta
            )
            new_rxn.name = rxn.name
            return new_rxn
        except Exception as e:
            # Lazy %-formatting and exception chaining: the previous version
            # discarded the root cause behind a generic ValueError.
            logging.debug("Rebalancing attempt failed: %s", e)
            raise StandardizationError("RebalanceReaction", str(rxn), e) from e
745
+
746
+
747
@dataclass
class DuplicateReactionConfig:
    """Marker configuration enabling duplicate reaction removal (no parameters)."""

    pass
750
+
751
+
752
class DuplicateReactionStandardizer(BaseStandardizer):
    """Cluster-wide duplicate removal via a Ray actor."""

    def __init__(self, dedup_actor: "ray.actor.ActorHandle"):
        self._actor = dedup_actor  # global singleton handle
        # local fast-path cache to avoid actor call on obvious repeats *in
        # the same worker*; purely an optimisation, not required.
        self._local_seen: set[int] = set()

    @classmethod
    def from_config(cls, config: DuplicateReactionConfig):
        """Attach to the shared dedup actor, or run standalone without Ray."""
        # fallback for single-process mode: no actor, local set only
        if ray.is_initialized():
            dedup_actor = ray.get_actor("duplicate_rxn_actor")
        else:
            dedup_actor = None
        return cls(dedup_actor)

    @staticmethod
    def _stable_hash(smi: str) -> int:
        """Process-independent 64-bit hash of a reaction SMILES.

        Builtin ``hash()`` on str is randomized per process (PYTHONHASHSEED),
        so hashes computed in different Ray workers would never collide in
        the shared actor and duplicates would slip through.
        """
        return int.from_bytes(hashlib.sha256(smi.encode("utf-8")).digest()[:8], "big")

    def safe_reaction_smiles(self, reaction: ReactionContainer) -> str:
        """Reaction SMILES without reagents (``reactants>>products``)."""
        reactants_smi = ".".join(str(i) for i in reaction.reactants)
        products_smi = ".".join(str(i) for i in reaction.products)
        return f"{reactants_smi}>>{products_smi}"

    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
        """Pass through new reactions; fail on previously seen ones.

        Raises:
            StandardizationError: If the reaction was already seen.
        """
        h = self._stable_hash(self.safe_reaction_smiles(rxn))

        # local cache fast-path (helps in large batches processed by same
        # worker; no correctness impact).
        if h in self._local_seen:
            raise StandardizationError(
                "DuplicateReaction", str(rxn), ValueError("Duplicate reaction found")
            )

        # ------------------- cluster-wide check ------------------------
        if self._actor is None:  # single-CPU fall-back
            is_new = h not in self._local_seen
        else:
            # synchronous, returns True/False
            is_new = ray.get(self._actor.check_and_add.remote(h))

        if is_new:
            self._local_seen.add(h)
            return rxn

        raise StandardizationError(
            "DuplicateReaction", str(rxn), ValueError("Duplicate reaction found")
        )
800
+
801
+
802
@ray.remote
class DedupActor:
    """Cluster-wide set of reaction hashes."""

    def __init__(self):
        self._seen: set[int] = set()

    def check_and_add(self, h: int) -> bool:
        """
        Returns True **iff** the hash was not present yet; it is recorded
        either way. Cluster-wide uniqueness is guaranteed because this
        method executes serially inside the actor process.
        """
        size_before = len(self._seen)
        self._seen.add(h)
        return len(self._seen) > size_before
819
+
820
+
821
# Registry mapping config field names to standardizer classes.
# Insertion order matters: create_standardizers() instantiates entries in
# this order, which is the order standardize_reaction() applies them.
STANDARDIZER_REGISTRY = {
    "reaction_mapping_config": ReactionMappingStandardizer,
    "functional_groups_config": FunctionalGroupsStandardizer,
    "kekule_form_config": KekuleFormStandardizer,
    "check_valence_config": CheckValenceStandardizer,
    "implicify_hydrogens_config": ImplicifyHydrogensStandardizer,
    "check_isotopes_config": CheckIsotopesStandardizer,
    "split_ions_config": SplitIonsStandardizer,
    "aromatic_form_config": AromaticFormStandardizer,
    "mapping_fix_config": MappingFixStandardizer,
    "unchanged_parts_config": UnchangedPartsStandardizer,
    "small_molecules_config": SmallMoleculesStandardizer,
    "remove_reagents_config": RemoveReagentsStandardizer,
    "rebalance_reaction_config": RebalanceReactionStandardizer,
    "duplicate_reaction_config": DuplicateReactionStandardizer,
}
838
+
839
+
840
@dataclass
class ReactionStandardizationConfig(ConfigABC):
    """Configuration class for reaction standardization. This class manages
    configuration settings for the various reaction standardizers; a field set
    to a (marker) config object enables the corresponding step.

    :param reaction_mapping_config: Configuration for reaction mapping.
    :param functional_groups_config: Configuration for functional groups
        standardization.
    :param kekule_form_config: Configuration for reactants/reagents/products
        kekulization.
    :param check_valence_config: Configuration for atom valence checking.
    :param implicify_hydrogens_config: Configuration for hydrogens removal.
    :param check_isotopes_config: Configuration for isotopes checking and cleaning.
    :param split_ions_config: Configuration for computing charge of molecule.
    :param aromatic_form_config: Configuration for molecules aromatization.
    :param mapping_fix_config: Configuration for atom-to-atom mapping fixes.
    :param unchanged_parts_config: Configuration for removal of unchanged parts in
        reaction.
    :param small_molecules_config: Configuration for removal of small molecule from
        reaction.
    :param remove_reagents_config: Configuration for removal of reagents from reaction.
    :param rebalance_reaction_config: Configuration for reaction rebalancing.
    :param duplicate_reaction_config: Configuration for removal of duplicate reactions.
    """

    # configuration for reaction standardizers
    reaction_mapping_config: Optional[ReactionMappingConfig] = None
    functional_groups_config: Optional[FunctionalGroupsConfig] = None
    kekule_form_config: Optional[KekuleFormConfig] = None
    check_valence_config: Optional[CheckValenceConfig] = None
    implicify_hydrogens_config: Optional[ImplicifyHydrogensConfig] = None
    check_isotopes_config: Optional[CheckIsotopesConfig] = None
    split_ions_config: Optional[SplitIonsConfig] = None
    aromatic_form_config: Optional[AromaticFormConfig] = None
    mapping_fix_config: Optional[MappingFixConfig] = None
    unchanged_parts_config: Optional[UnchangedPartsConfig] = None
    small_molecules_config: Optional[SmallMoleculesConfig] = None
    remove_reagents_config: Optional[RemoveReagentsConfig] = None
    rebalance_reaction_config: Optional[RebalanceReactionConfig] = None
    duplicate_reaction_config: Optional[DuplicateReactionConfig] = None

    def _validate_params(self, params: Dict[str, Any]) -> None:
        """Delegate validation to every enabled sub-configuration."""
        for field_name, config in self.__dict__.items():
            if config is not None and hasattr(config, "_validate_params"):
                config._validate_params(params.get(field_name, {}))

    def to_dict(self):
        """Converts the configuration into a dictionary."""
        config_dict = {}
        for field_name in STANDARDIZER_REGISTRY:
            config = getattr(self, field_name)
            if config is not None:
                config_dict[field_name] = {}
        return config_dict

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]) -> "ReactionStandardizationConfig":
        """Create an instance of ReactionStandardizationConfig from a dictionary."""
        config_kwargs = {}
        for field_name, std_cls in STANDARDIZER_REGISTRY.items():
            if field_name in config_dict:
                # Resolve the matching *Config class by name and instantiate it.
                # (Previously the code called the name *string* itself, which
                # raised ``TypeError: 'str' object is not callable``.)
                config_cls_name = std_cls.__name__.replace("Standardizer", "Config")
                config_kwargs[field_name] = globals()[config_cls_name]()
        return ReactionStandardizationConfig(**config_kwargs)

    @staticmethod
    def from_yaml(file_path: str) -> "ReactionStandardizationConfig":
        """Deserializes a YAML file into a ReactionStandardizationConfig object."""
        with open(file_path, "r", encoding="utf-8") as file:
            config_dict = yaml.safe_load(file)
        return ReactionStandardizationConfig.from_dict(config_dict)

    def create_standardizers(self):
        """Create standardizer instances based on configuration (registry order)."""
        standardizers = []
        for field_name, std_cls in STANDARDIZER_REGISTRY.items():
            config = getattr(self, field_name)
            if config is not None:
                standardizers.append(std_cls.from_config(config))
        return standardizers
922
+
923
+
924
def standardize_reaction(
    reaction: ReactionContainer,
    standardizers: Sequence,
) -> ReactionContainer | None:
    """
    Apply each standardizer in order.

    Returns
    -------
    ReactionContainer | None
        - the fully standardized reaction, or
        - None if *any* standardizer decides to filter it out (by
          returning None).

    Raises
    ------
    StandardizationError
        Propagated untouched so the caller can decide what to do.
    """
    current = reaction
    for standardizer in standardizers:
        logger.debug(" › %s(%s)", standardizer.__class__.__name__, current)
        try:
            current = standardizer(current)  # may return None
        except StandardizationError as exc:
            # Log *once*, then re-raise the same object with its traceback.
            logger.warning(
                "%s failed on reaction %s : %s",
                standardizer.__class__.__name__,
                current,
                exc,
            )
            raise
        if current is None:  # soft filter
            logger.info("%s filtered out reaction", standardizer.__class__.__name__)
            return None
    return current
960
+
961
+
962
def safe_standardize(
    item: str | ReactionContainer,
    standardizers: Sequence,
) -> Tuple[ReactionContainer, bool]:
    """Standardize one reaction without letting standardization errors escape.

    Args:
        item: Reaction container or reaction SMILES string.
        standardizers: Ordered standardizer instances to apply.

    Returns:
        ``(reaction, ok)``: the standardized reaction and True on success,
        or the original (parsed) reaction and False when it was filtered
        out or any standardizer failed.

    Raises:
        Whatever ``smiles_cgrtools`` raises for an unparseable SMILES
        string; parsing happens before the guarded section.
    """
    # Parse exactly once, up-front. The previous version re-parsed inside
    # the exception handler, which raised again for unparseable input
    # (breaking the "always returns" contract) and did redundant work
    # for parseable input.
    reaction = item if isinstance(item, ReactionContainer) else smiles_cgrtools(item)
    try:
        std = standardize_reaction(reaction, standardizers)
    except Exception as exc:  # noqa: BLE001
        logger.debug("standardization failed: %s", exc, exc_info=True)
        return reaction, False
    if std is None:
        return reaction, False  # filtered → keep original
    return std, True
983
+
984
+
985
def _process_batch(
    batch: Sequence[str | ReactionContainer],
    standardizers: Sequence,
) -> Tuple[List[ReactionContainer], int]:
    """Standardize one batch serially; return reactions and success count."""
    processed: List[ReactionContainer] = []
    success_count = 0
    for entry in batch:
        reaction, succeeded = safe_standardize(entry, standardizers)
        processed.append(reaction)
        if succeeded:
            success_count += 1
    return processed, success_count
996
+
997
+
998
@ray.remote
def process_batch_remote(
    batch: Sequence[str | ReactionContainer],
    std_param: ray.ObjectRef,  # <-- receives a ref
    log_file_path: str | Path | None = None,
) -> Tuple[List[ReactionContainer], int]:
    """Ray task: standardize one batch of reactions inside a worker process.

    :param batch: Reaction SMILES strings or parsed ReactionContainer objects.
    :param std_param: Either a ``ray.ObjectRef`` pointing at the shared list
        of standardizers (broadcast once with ``ray.put``) or the plain list.
    :param log_file_path: Optional log file; when given, a FileHandler is
        attached (once) to this module's logger inside the worker.
    :return: ``(reactions, n_standardized)`` as produced by ``_process_batch``.
    """
    # Ray keeps a local cache of fetched objects, so the list is
    # deserialised only once per worker process, not once per task.
    if isinstance(std_param, ray.ObjectRef):  # handle? get it
        standardizers = ray.get(std_param)  # • O(once)
    else:  # plain list? use as is
        standardizers = std_param

    # --- Worker-specific logging setup ---
    worker_logger = logging.getLogger("synplan.chem.data.standardizing")
    if log_file_path:
        log_file_path = Path(log_file_path)  # Ensure it's a Path object
        # Check if a handler for this file already exists for this logger
        # (a worker process may execute many tasks — avoid stacking handlers).
        handler_exists = any(
            isinstance(h, logging.FileHandler) and Path(h.baseFilename) == log_file_path
            for h in worker_logger.handlers
        )
        if not handler_exists:
            try:
                fh = logging.FileHandler(log_file_path, encoding="utf-8")
                # Use a simple format for worker logs, or match driver's format
                formatter = logging.Formatter(
                    "%(asctime)s | %(name)s (worker) | %(levelname)-8s | %(message)s",
                    datefmt="%Y-%m-%d %H:%M:%S",
                )
                fh.setFormatter(formatter)
                fh.setLevel(logging.INFO)  # Or DEBUG, or use worker_log_level if passed
                worker_logger.addHandler(fh)
                worker_logger.setLevel(
                    logging.INFO
                )  # Ensure logger passes messages to handler
                worker_logger.propagate = (
                    False  # Avoid double logging if driver also logs
                )
                # Optional: Log that the handler was added
                # worker_logger.info(f"Worker process attached file handler: {log_file_path}")
            except Exception as e:
                # Log error if handler creation fails (e.g., permissions)
                logging.error(
                    f"Worker failed to create file handler {log_file_path}: {e}"
                )

    return _process_batch(batch, standardizers)
1046
+
1047
+
1048
def chunked(iterable: Iterable, size: int):
    """Yield consecutive lists of at most ``size`` items from ``iterable``.

    The final chunk may be shorter than ``size``; an empty iterable yields
    nothing.
    """
    buffer = []
    for element in iterable:
        buffer.append(element)
        if len(buffer) < size:
            continue
        yield buffer
        buffer = []
    if buffer:
        yield buffer
1057
+
1058
+
1059
def standardize_reactions_from_file(
    config: "ReactionStandardizationConfig",
    input_reaction_data_path: str | Path,
    standardized_reaction_data_path: str | Path = "reaction_data_standardized.smi",
    *,
    num_cpus: int = 1,
    batch_size: int = 1_000,  # larger batches amortise overhead
    silent: bool = True,
    max_pending_factor: int = 4,  # tasks in flight = factor × CPUs
    worker_log_level: int | str = logging.WARNING,
    log_file_path: str | Path | None = None,
) -> None:
    """
    Reads reactions, standardises them in parallel with Ray, writes results.

    The function keeps at most `max_pending_factor * num_cpus` Ray tasks in
    flight to avoid flooding the scheduler and blowing up the object store.
    Standardisers are broadcast once with `ray.put`, removing per‑task
    pickling cost. All other logic is unchanged.

    Args:
        config: Configuration object for standardizers.
        input_reaction_data_path: Path to the input reaction data file.
        standardized_reaction_data_path: Path to save the standardized reactions.
        num_cpus: Number of CPU cores to use for parallel processing.
        batch_size: Number of reactions to process in each batch.
        silent: If True, suppress the progress bar.
        max_pending_factor: Controls the number of pending Ray tasks.
        worker_log_level: Logging level for Ray workers (e.g., logging.INFO, logging.WARNING).
        log_file_path: Path to the log file for workers to write to.
    """
    output_path = Path(standardized_reaction_data_path)
    standardizers = config.create_standardizers()

    logger.info(
        "Standardizers: %s",
        ", ".join(s.__class__.__name__ for s in standardizers),
    )

    # ----------------------- Ray initialisation -----------------------
    if num_cpus > 1:
        if not ray.is_initialized():
            ray.init(
                num_cpus=num_cpus,
                ignore_reinit_error=True,
                logging_level=worker_log_level,
                log_to_driver=False,
            )

        DEDUP_NAME = "duplicate_rxn_actor"

        # NOTE(review): dedup_actor is created (or re-attached) here but is
        # never referenced again in this function — presumably the detached
        # actor is consumed elsewhere; confirm before removing.
        try:
            dedup_actor = ray.get_actor(DEDUP_NAME)  # already running?
        except ValueError:
            dedup_actor = DedupActor.options(
                name=DEDUP_NAME, lifetime="detached"  # survives driver exit
            ).remote()

    std_ref: ray.ObjectRef | None = None
    if num_cpus > 1 and std_ref is None:  # broadcast once
        std_ref = ray.put(standardizers)

    max_pending = max_pending_factor * num_cpus
    pending: Dict[ray.ObjectRef, None] = {}  # dict used as an ordered set of refs

    n_processed = n_std = 0
    bar = tqdm(
        total=0,
        unit="rxn",
        desc="Standardising",
        disable=silent,
        dynamic_ncols=True,
    )

    # ------------------------ Helper function ------------------------
    def _flush(ref: ray.ObjectRef, write_fn) -> None:
        """Fetch finished task, write its results, update counters & bar."""
        nonlocal n_processed, n_std
        res, ok = ray.get(ref)
        write_fn(res)
        bar.update(len(res))
        n_processed += len(res)
        n_std += ok

    # ----------------------------- I/O -------------------------------
    with ReactionReader(input_reaction_data_path) as reader, ReactionWriter(
        output_path
    ) as writer:

        write_fn = lambda reactions: [writer.write(r) for r in reactions]

        # --------------------- Main read/compute loop -----------------
        for chunk in chunked(reader, batch_size):
            # tqdm total grows as chunks stream in (total size unknown upfront)
            bar.total += len(chunk)
            bar.refresh()

            if num_cpus > 1:
                # ---------- back‑pressure: keep ≤ max_pending ----------
                while len(pending) >= max_pending:
                    done, _ = ray.wait(list(pending), num_returns=1)
                    _flush(done[0], write_fn)
                    pending.pop(done[0], None)

                # ----------- schedule new task -------------------------
                ref = process_batch_remote.remote(chunk, std_ref, log_file_path)
                pending[ref] = None
            else:
                # --------------- serial fall‑back ----------------------
                res, ok = _process_batch(chunk, standardizers)
                write_fn(res)
                bar.update(len(res))
                n_processed += len(res)
                n_std += ok

        # ------------------ Drain remaining Ray tasks -----------------
        # (pending stays empty in the serial path, so this loop is a no-op then)
        while pending:
            done, _ = ray.wait(list(pending), num_returns=1)
            _flush(done[0], write_fn)
            pending.pop(done[0], None)

    bar.close()
    # NOTE(review): shutdown is called even when Ray was never initialised
    # (num_cpus == 1); ray.shutdown() tolerates that, but confirm intent.
    ray.shutdown()

    logger.info(
        "Finished: processed %d, standardised %d, filtered %d",
        n_processed,
        n_std,
        n_processed - n_std,
    )
synplan/chem/precursor.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing a class Precursor that represents a precursor (extend molecule object) in
2
+ the search tree."""
3
+
4
+ from typing import Set
5
+
6
+ from CGRtools.containers import MoleculeContainer
7
+
8
+ from synplan.chem.utils import safe_canonicalization
9
+
10
+
11
class Precursor:
    """Precursor class is used to extend the molecule behavior needed for
    interaction with a tree in MCTS.

    Wraps a MoleculeContainer and adds hashing/equality plus the
    building-block lookup used by the search tree.
    """

    def __init__(self, molecule: "MoleculeContainer", canonicalize: bool = True):
        """Initialize a Precursor with a molecule container.

        :param molecule: A molecule.
        :param canonicalize: If True, the molecule is canonicalized first.
        """
        self.molecule = safe_canonicalization(molecule) if canonicalize else molecule
        # Precursors this one was derived from during the tree search.
        self.prev_precursors: list = []

    def __len__(self) -> int:
        """Return the number of atoms in the Precursor."""
        return len(self.molecule)

    def __hash__(self) -> int:
        # Fixed: annotation was `-> hash` (the builtin function, not a type).
        """Return the hash value of the underlying molecule."""
        return hash(self.molecule)

    def __str__(self) -> str:
        """Return a SMILES of the Precursor."""
        return str(self.molecule)

    def __repr__(self) -> str:
        """Return a SMILES of the Precursor."""
        return str(self.molecule)

    def __eq__(self, other: "Precursor") -> bool:
        """Check if the current Precursor equals another by molecule identity."""
        return self.molecule == other.molecule

    def is_building_block(self, bb_stock: Set[str], min_mol_size: int = 6) -> bool:
        """Check if the Precursor is a building block.

        :param bb_stock: The set of building blocks. Each building block is
            represented by a canonical SMILES.
        :param min_mol_size: If the size of the Precursor is equal or smaller
            than min_mol_size, it is automatically classified as a building
            block.
        :return: True if the Precursor is a building block.
        """
        if len(self.molecule) <= min_mol_size:
            return True
        return str(self.molecule) in bb_stock
56
+
57
+
58
def compose_precursors(
    precursors: list = None, exclude_small: bool = True, min_mol_size: int = 6
) -> MoleculeContainer:
    """
    Compose a list of precursors into a single molecule.

    The composed molecule is used for the prediction of synthesisability
    characterizing the possible success of the route including the nodes
    with the given precursors.

    :param precursors: The list of precursors to be composed.
    :param exclude_small: If True, precursors of size ``min_mol_size`` or
        smaller are excluded from the composition (unless all precursors are
        small, in which case the full list is kept).
    :param min_mol_size: The size threshold used with ``exclude_small``.
    :return: A composed molecule as a MoleculeContainer object, or None when
        no precursors are given.
    """
    # Explicit guard: the original crashed with TypeError on None and
    # silently returned None on an empty list.
    if not precursors:
        return None
    if len(precursors) == 1:
        return precursors[0].molecule

    if exclude_small:
        big_precursors = [
            precursor
            for precursor in precursors
            if len(precursor.molecule) > min_mol_size
        ]
        # Keep the original list if filtering would leave nothing to compose.
        if big_precursors:
            precursors = big_precursors

    composed = precursors[0].molecule.copy()
    for precursor in precursors[1:]:
        # Fresh mapping per molecule: old atom number -> number in `composed`.
        transition_mapping = {}
        for number, atom in precursor.molecule.atoms():
            transition_mapping[number] = composed.add_atom(atom.atomic_symbol)
        for atom_a, atom_b, bond in precursor.molecule.bonds():
            composed.add_bond(
                transition_mapping[atom_a], transition_mapping[atom_b], bond
            )
    return composed
synplan/chem/reaction.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing classes and functions for manipulating reactions and reaction
2
+ rules."""
3
+
4
+ from typing import Any, Iterator, List, Optional
5
+
6
+ from CGRtools.containers import MoleculeContainer, ReactionContainer
7
+ from CGRtools.exceptions import InvalidAromaticRing
8
+ from CGRtools.reactor import Reactor
9
+
10
+
11
class Reaction(ReactionContainer):
    """Reaction class used for a general representation of reaction.

    Behaves exactly like :class:`CGRtools.containers.ReactionContainer`;
    the subclass exists as a project-level extension point. The former
    ``__init__`` override only delegated to ``super().__init__`` and was
    removed as a no-op.
    """
16
+
17
+
18
def add_small_mols(
    big_mol: MoleculeContainer, small_molecules: Optional[Any] = None
) -> List[MoleculeContainer]:
    """Merge each small molecule into a copy of the big molecule.

    :param big_mol: A molecule.
    :param small_molecules: A list of small molecules to be added to the
        molecule; when falsy, ``big_mol`` is returned unchanged in a list.
    :return: The connected components of the merged molecule, or
        ``[big_mol]`` when there is nothing to add.
    """
    if not small_molecules:
        return [big_mol]

    merged = big_mol.copy()
    for small_mol in small_molecules:
        # Map atom numbers of the small molecule onto fresh numbers in `merged`.
        mapping = {}
        for number, atom in small_mol.atoms():
            mapping[number] = merged.add_atom(atom.atomic_symbol)
        for left, right, bond in small_mol.bonds():
            merged.add_bond(mapping[left], mapping[right], bond)
    return merged.split()
47
+
48
+
49
def apply_reaction_rule(
    molecule: MoleculeContainer,
    reaction_rule: Reactor,
    sort_reactions: bool = False,
    top_reactions_num: int = 3,
    validate_products: bool = True,
    rebuild_with_cgr: bool = False,
) -> Iterator[List[MoleculeContainer,]]:
    """Applies a reaction rule to a given molecule.

    :param molecule: A molecule to which reaction rule will be applied.
    :param reaction_rule: A reaction rule to be applied.
    :param sort_reactions: If True, the generated reactions are ranked by the
        number of large (> 6 atom) products before the top-N cut; otherwise
        the first ``top_reactions_num`` reactions are taken as generated.
    :param top_reactions_num: The maximum amount of reactions after the application of
        reaction rule.
    :param validate_products: If True, validates the final products.
    :param rebuild_with_cgr: If True, the products are extracted from CGR decomposition.
    :return: An iterator yielding the products of reaction rule application.
        NOTE(review): when validation fails this generator yields None for the
        offending molecule and *still* yields the reactants list afterwards —
        callers appear to filter out None; confirm before changing.
    """

    reactants = add_small_mols(molecule, small_molecules=False)

    try:
        if sort_reactions:
            unsorted_reactions = list(reaction_rule(reactants))
            sorted_reactions = sorted(
                unsorted_reactions,
                key=lambda react: len(
                    list(filter(lambda mol: len(mol) > 6, react.products))
                ),
                reverse=True,
            )

            # take top-N reactions from reactor
            reactions = sorted_reactions[:top_reactions_num]
        else:
            # lazy path: stop pulling from the reactor once we have enough
            reactions = []
            for reaction in reaction_rule(reactants):
                reactions.append(reaction)
                if len(reactions) == top_reactions_num:
                    break
    except IndexError:
        # the reactor may fail on some inputs; treat as "no reactions"
        reactions = []

    for reaction in reactions:

        # temporary solution - incorrect leaving groups:
        # skip reactions where more atoms leave than remain in the products
        reactant_atom_nums = []
        for i in reaction.reactants:
            reactant_atom_nums.extend(i.atoms_numbers)
        product_atom_nums = []
        for i in reaction.products:
            product_atom_nums.extend(i.atoms_numbers)
        leaving_atom_nums = set(reactant_atom_nums) - set(product_atom_nums)
        if len(leaving_atom_nums) > len(product_atom_nums):
            continue

        # check reaction
        if rebuild_with_cgr:
            # rebuild product set from the CGR decomposition
            cgr = reaction.compose()
            reactants = cgr.decompose()[1].split()
        else:
            reactants = reaction.products  # reactants are products in retro reaction
        reactants = [mol for mol in reactants if len(mol) > 0]

        # validate products
        if validate_products:
            for mol in reactants:
                try:
                    # kekule/thiele round-trip detects broken aromatic systems
                    mol.kekule()
                    if mol.check_valence():
                        yield None
                    mol.thiele()
                except InvalidAromaticRing:
                    yield None

        yield reactants
synplan/chem/reaction_routes/__init__.py ADDED
File without changes
synplan/chem/reaction_routes/clustering.py ADDED
@@ -0,0 +1,859 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+
3
+ from pathlib import Path
4
+ import pickle
5
+ import re
6
+
7
+ from CGRtools.containers import ReactionContainer, CGRContainer
8
+ from CGRtools.containers.bonds import DynamicBond
9
+
10
+ from synplan.chem.reaction_routes.leaving_groups import *
11
+ from synplan.chem.reaction_routes.visualisation import *
12
+ from synplan.chem.reaction_routes.route_cgr import *
13
+ from synplan.chem.reaction_routes.io import (
14
+ read_routes_csv,
15
+ read_routes_json,
16
+ make_dict,
17
+ make_json,
18
+ )
19
+ from synplan.utils.visualisation import (
20
+ routes_clustering_report,
21
+ routes_subclustering_report,
22
+ )
23
+
24
+
25
def run_cluster_cli(
    routes_file: str,
    cluster_results_dir: str,
    perform_subcluster: bool = False,
    subcluster_results_dir: Path = None,
):
    """
    Read routes from a CSV or JSON file, perform clustering, and optionally subclustering.

    Args:
        routes_file: Path to the input routes file (.csv or .json).
        cluster_results_dir: Directory where clustering results are stored.
        perform_subcluster: Whether to run subclustering on each cluster.
        subcluster_results_dir: Subdirectory for subclustering results (if enabled).

    Raises:
        ValueError: If the numeric index cannot be parsed from the filename,
            or the file extension is neither .csv nor .json.
    """
    import click  # local import: only needed for CLI echo output

    routes_file = Path(routes_file)
    # File names are expected to look like "<prefix>_<index>.<ext>".
    match = re.search(r"_(\d+)\.", routes_file.name)
    if not match:
        raise ValueError(f"Could not extract index from filename: {routes_file.name}")
    file_index = int(match.group(1))
    ext = routes_file.suffix.lower()
    if ext == ".csv":
        routes_dict = read_routes_csv(str(routes_file))
        routes_json = make_json(routes_dict)
    elif ext == ".json":
        routes_json = read_routes_json(str(routes_file))
        routes_dict = make_dict(routes_json)
    else:
        raise ValueError(f"Unsupported file type: {ext}")

    # Compose condensed graph representations
    route_cgrs = compose_all_route_cgrs(routes_dict)
    # fixed: dropped pointless f-string prefixes on constant messages (F541)
    click.echo("Generating RouteCGR")
    reduced_cgrs = compose_all_sb_cgrs(route_cgrs)
    click.echo("Generating ReducedRouteCGR")

    # Perform clustering
    click.echo("\nClustering")
    clusters = cluster_routes(reduced_cgrs, use_strat=False)

    click.echo(f"Total number of routes: {len(routes_dict)}")
    click.echo(f"Found number of clusters: {len(clusters)} ({list(clusters.keys())})")

    # Ensure output directory exists
    cluster_results_dir = Path(cluster_results_dir)
    cluster_results_dir.mkdir(parents=True, exist_ok=True)

    # Save clusters to pickle
    with open(cluster_results_dir / f"clusters_{file_index}.pickle", "wb") as f:
        pickle.dump(clusters, f)

    # Generate HTML reports for each cluster
    for idx in clusters:
        report_path = cluster_results_dir / f"{file_index}_cluster_{idx}.html"
        routes_clustering_report(
            routes_json, clusters, idx, reduced_cgrs, html_path=str(report_path)
        )

    # Optional subclustering (Under development)
    if perform_subcluster and subcluster_results_dir:
        click.echo("\nSubClustering")
        sub_dir = cluster_results_dir / subcluster_results_dir
        sub_dir.mkdir(parents=True, exist_ok=True)

        subclusters = subcluster_all_clusters(clusters, reduced_cgrs, route_cgrs)
        for cluster_idx, sub in subclusters.items():
            click.echo(f"Cluster {cluster_idx} has {len(sub)} subclusters")
            for sub_idx, subcluster in sub.items():
                subreport_path = (
                    sub_dir / f"{file_index}_subcluster_{cluster_idx}.{sub_idx}.html"
                )
                routes_subclustering_report(
                    routes_json,
                    subcluster,
                    cluster_idx,
                    sub_idx,
                    reduced_cgrs,
                    aam=False,
                    html_path=str(subreport_path),
                )
107
+
108
+
109
def cluster_route_from_csv(routes_file: str):
    """
    Read retrosynthetic routes from a CSV file and cluster them.

    Pipeline: load the routes, build a Condensed Graph of Reactions (RouteCGR)
    per route, reduce each to its strategic-bond form (ReducedRouteCGR), and
    group the routes by those reduced representations via ``cluster_routes``
    (called with ``use_strat=False``, i.e. grouping by the reduced CGR
    signature).

    Args:
        routes_file (str): Path to the CSV file with retrosynthetic routes.

    Returns:
        object: The clustering result as produced by ``cluster_routes``.
    """
    routes = read_routes_csv(routes_file)
    route_cgrs = compose_all_route_cgrs(routes)
    reduced_cgrs = compose_all_sb_cgrs(route_cgrs)
    return cluster_routes(reduced_cgrs, use_strat=False)
135
+
136
+
137
def cluster_route_from_json(routes_file: str):
    """
    Read retrosynthetic routes from a JSON file and cluster them.

    Same pipeline as :func:`cluster_route_from_csv`, but the routes are loaded
    from JSON and first converted into the internal dictionary format before
    the RouteCGR / ReducedRouteCGR construction and clustering steps.

    Args:
        routes_file (str): Path to the JSON file with retrosynthetic routes.

    Returns:
        object: The clustering result as produced by ``cluster_routes``.
    """
    loaded_json = read_routes_json(routes_file)
    routes = make_dict(loaded_json)
    route_cgrs = compose_all_route_cgrs(routes)
    reduced_cgrs = compose_all_sb_cgrs(route_cgrs)
    return cluster_routes(reduced_cgrs, use_strat=False)
163
+
164
+
165
def extract_strat_bonds(target_cgr: CGRContainer):
    """
    Collect strategic bonds from a CGRContainer.

    A bond is strategic when it is absent in the reactants
    (``bond.order is None``) but formed in the products
    (``bond.p_order is not None``).

    Args:
        target_cgr (CGRContainer): The CGR to inspect.

    Returns:
        list: Sorted list of unique ``(atom1, atom2)`` tuples (atom1 < atom2)
        representing the strategic bonds.
    """
    strategic = []
    seen = set()
    for atom_a, neighbours in target_cgr._bonds.items():
        for atom_b, bond in neighbours.items():
            # Adjacency stores each bond twice; keep one orientation only.
            if atom_a >= atom_b:
                continue
            if bond.order is not None or bond.p_order is None:
                continue
            key = (atom_a, atom_b)
            if key not in seen:
                seen.add(key)
                strategic.append(key)
    return sorted(strategic)
197
+
198
+
199
def cluster_routes(sb_cgrs: dict, use_strat=False):
    """
    Cluster route objects based on their strategic bonds or on the
    CGRContainer string signature (the latter does not avoid mapping effects).

    Args:
        sb_cgrs: Dictionary mapping node_id to sb_cgr objects.
        use_strat: When truthy, group by the tuple of strategic bonds;
            otherwise group by the CGR string signature.

    Returns:
        Dictionary with groups keyed by '{num_bonds}.{index}' containing
        'sb_cgr', 'node_ids', 'strat_bonds' and 'group_size'.
    """
    temp_groups = defaultdict(
        lambda: {"node_ids": [], "sb_cgr": None, "strat_bonds": None}
    )

    # 1. Initial grouping based on the content of strategic bonds
    for node_id, sb_cgr in sb_cgrs.items():
        strat_bonds_list = extract_strat_bonds(sb_cgr)
        # fixed: plain truthiness instead of the `use_strat == True` comparison
        if use_strat:
            group_key = tuple(strat_bonds_list)
        else:
            group_key = str(sb_cgr)

        if not temp_groups[group_key]["node_ids"]:  # First time seeing this group
            # Store the first CGR and its bond list as the representatives.
            temp_groups[group_key]["sb_cgr"] = sb_cgr
            temp_groups[group_key]["strat_bonds"] = strat_bonds_list

        temp_groups[group_key]["node_ids"].append(node_id)
        temp_groups[group_key]["node_ids"].sort()  # keep node_ids sorted

    for group_data in temp_groups.values():
        group_data["group_size"] = len(group_data["node_ids"])

    # 2. Format the output dictionary with desired keys '{num_bonds}.{index}'
    final_grouped_results = {}
    group_indices = defaultdict(int)  # index counter per bond count

    # Sorting by key length then by the key itself gives a deterministic
    # order for assigning the per-length indices.
    sorted_groups = sorted(
        temp_groups.items(), key=lambda item: (len(item[0]), item[0])
    )

    for _group_key, group_data in sorted_groups:
        num_bonds = len(group_data["strat_bonds"])
        group_indices[num_bonds] += 1  # 1-based index for this bond count
        final_key = f"{num_bonds}.{group_indices[num_bonds]}"
        final_grouped_results[final_key] = group_data

    return final_grouped_results
256
+
257
+
258
def lg_process_reset(lg_cgr: CGRContainer, atom_num: int):
    """
    Normalize bonds of an extracted leaving-group (X) fragment and flag the
    attachment atom as a radical.

    Every bond in ``lg_cgr`` that has a defined ``order`` but an undefined
    ``p_order`` is replaced by a ``DynamicBond`` carrying the same integer
    order on both sides; afterwards the atom at ``atom_num`` is marked as a
    radical.

    Parameters
    ----------
    lg_cgr : CGRContainer
        The CGR representing the isolated leaving-group fragment.
    atom_num : int
        Index of the attachment atom to mark as a radical.

    Returns
    -------
    CGRContainer
        The modified ``lg_cgr``.
    """
    # Snapshot the adjacency (outer once, inner per-atom) before mutating it.
    for atom_a, neighbours in list(lg_cgr._bonds.items()):
        for atom_b, bond in list(neighbours.items()):
            if bond.p_order is not None or bond.order is None:
                continue
            same_order = int(bond.order)
            lg_cgr.delete_bond(atom_a, atom_b)
            lg_cgr.add_bond(atom_a, atom_b, DynamicBond(same_order, same_order))
    lg_cgr._atoms[atom_num].is_radical = True
    return lg_cgr
289
+
290
+
291
def lg_replacer(route_cgr: CGRContainer):
    """
    Extract dynamic leaving-groups from a CGR and mark attachment points.

    Scans the input CGRContainer for bonds lacking explicit p_order (i.e., leaving-group attachments),
    severs those bonds, captures each leaving-group as its own CGRContainer, and inserts DynamicX
    markers at the attachment sites. Finally, reindexes the markers to ensure unique labels.

    Parameters
    ----------
    route_cgr : CGRContainer
        A CGR representing the full synthetic route.

    Returns
    -------
    synthon_cgr : CGRContainer
        The core synthon CGR with DynamicX atoms marking each former leaving-group site.
    lg_groups : dict[int, tuple[CGRContainer, int]]
        Mapping from each marker label to a tuple of:
        - the extracted leaving-group CGRContainer
        - the atom index where it was attached.
    """
    lg_groups = {}

    # Work on the first connected component (the product side of the route CGR).
    cgr_prods = [route_cgr.substructure(c) for c in route_cgr.connected_components]
    target_cgr = cgr_prods[0]

    # Snapshot of the adjacency before the destructive edits below.
    bond_items = list(target_cgr._bonds.items())
    reaction = ReactionContainer.from_cgr(target_cgr)
    target_mol = reaction.products[0]
    # Atom indices above this value belong to leaving groups, not the target.
    max_in_target_mol = max(target_mol._atoms)

    k = 1  # running marker label for extracted leaving groups
    atom_nums = []
    checked_atoms = set()  # bond pairs already processed (stored sorted)

    for atom1, bond_set in bond_items:
        bond_set_items = list(bond_set.items())
        for atom2, bond in bond_set_items:
            # A broken-only bond (order set, p_order None) marks an attachment.
            if bond.p_order is None and bond.order is not None and tuple(sorted([atom1, atom2])) not in checked_atoms:
                if atom1 <= max_in_target_mol:
                    lg = DynamicX()
                    lg.mark = k
                    lg.isotope = k
                    order = bond.order
                    p_order = bond.p_order
                    # Sever the attachment; the fragment splits off as a new component.
                    target_cgr.delete_bond(atom1, atom2)
                    lg_cgrs = [
                        target_cgr.substructure(c)
                        for c in target_cgr.connected_components
                    ]
                    checked_atoms.add(tuple(sorted([atom1, atom2])))
                    if len(lg_cgrs) == 2:
                        lg_cgr = lg_cgrs[1]
                        lg_cgr = lg_process_reset(lg_cgr, atom2)
                        lg_cgr.clean2d()
                    else:
                        # Deleting the bond did not split the graph (ring bond);
                        # nothing to extract here.
                        continue
                    lg_groups[k] = (lg_cgr, atom2)
                    # Keep only the core component, then reinsert a DynamicX
                    # placeholder at the old attachment position.
                    target_cgr = [
                        target_cgr.substructure(c)
                        for c in target_cgr.connected_components
                    ][0]
                    target_cgr.add_atom(lg, atom2)
                    # NOTE(review): `p_order == None` should be `is None`;
                    # order 4 (aromatic) is downgraded to a single bond here —
                    # confirm this is the intended placeholder bond order.
                    if order == 4 and p_order == None:
                        order = 1
                    target_cgr.add_bond(atom1, atom2, DynamicBond(order, p_order))
                    target_cgr = [
                        target_cgr.substructure(c)
                        for c in target_cgr.connected_components
                    ][0]
                    k += 1
                    atom_nums.append(atom2)

    synthon_cgr = [target_cgr.substructure(c) for c in target_cgr.connected_components][
        0
    ]
    reaction = ReactionContainer.from_cgr(synthon_cgr)
    reactants = reaction.reactants

    # Relabel markers in reactant order so labels are unique and sequential.
    atom_mark_map = {}  # To map atom numbers to their new marks
    g = 1
    for n, r in enumerate(reactants):
        for atom_num in atom_nums:
            if atom_num in r._atoms:
                synthon_cgr._atoms[atom_num].mark = g
                atom_mark_map[atom_num] = g
                g += 1

    # Rekey lg_groups by the new marks; groups whose attachment atom did not
    # receive a new mark are dropped.
    new_lg_groups = {}
    for original_mark in lg_groups:
        cgr_obj, a_num = lg_groups[original_mark]
        new_mark = atom_mark_map.get(a_num)
        if new_mark is not None:
            new_lg_groups[new_mark] = (cgr_obj, a_num)
    lg_groups = new_lg_groups

    return synthon_cgr, lg_groups
389
+
390
+
391
def lg_reaction_replacer(
    synthon_reaction: ReactionContainer, lg_groups: dict, max_in_target_mol: int
):
    """
    Swap leaving-group placeholder atoms for marked X atoms in reactants.

    Walks every reactant of ``synthon_reaction``; any atom whose index
    exceeds ``max_in_target_mol`` is treated as a placeholder.  When that
    index matches the attachment point recorded in ``lg_groups``, the
    placeholder is replaced by a ``MarkedAt`` atom carrying the
    leaving-group label, and the original bond is re-created.

    Parameters
    ----------
    synthon_reaction : ReactionContainer
        Reaction containing reactants with X placeholders.
    lg_groups : dict[int, tuple[CGRContainer, int]]
        Mapping from X label to (X CGR, attachment atom index).
    max_in_target_mol : int
        Highest atom index of the core product; any atom index above this
        is a placeholder.

    Returns
    -------
    List[Molecule]
        Reactant molecules with ``MarkedAt`` atoms reinserted at X
        attachment sites.
    """
    replaced = []
    for mol in synthon_reaction.reactants:
        for idx in list(mol._atoms.keys()):
            if idx <= max_in_target_mol:
                continue  # core-product atom, not a placeholder
            for label, entry in lg_groups.items():
                marker = MarkedAt()
                if idx != entry[1]:
                    continue
                marker.mark = label
                marker.isotope = label
                # Placeholder atoms have a single neighbour: the attachment point.
                partner = next(iter(mol._bonds[idx]))
                saved_bond = mol._bonds[idx][partner]
                mol.delete_bond(partner, idx)
                mol.delete_atom(idx)
                mol.add_atom(marker, idx)
                mol.add_bond(partner, idx, saved_bond)
        replaced.append(mol)
    return replaced
434
+
435
+
436
class SubclusterError(Exception):
    """Signals that ``subcluster_one_cluster`` could not finish for a cluster."""
438
+
439
+
440
def subcluster_one_cluster(group, sb_cgrs_dict, route_cgrs_dict):
    """
    Generate synthon data for each route in a single cluster.

    For each route (node ID) in ``group['node_ids']``, replaces RouteCGRs with
    SynthonCGR, builds ReactionContainers before and after X replacement,
    and collects relevant data.

    Parameters
    ----------
    group : dict
        Must include ``'node_ids'``, a list of node identifiers.
    sb_cgrs_dict : dict
        Maps node IDs to their ReducedRouteCGR.
    route_cgrs_dict : dict
        Maps node IDs to their RouteCGR.

    Returns
    -------
    dict
        Maps each ``node_id`` to a tuple
        ``(sb_cgr, original_reaction, synthon_cgr, new_reaction, lg_groups)``.

    Raises
    ------
    SubclusterError
        If ``node_ids`` is not a list/tuple, or any step (LG replacement,
        CGR parsing, X reaction replacement) fails for a node.
    """
    node_ids = group.get("node_ids")
    if not isinstance(node_ids, (list, tuple)):
        raise SubclusterError(
            f"'node_ids' must be a list or tuple, got {type(node_ids).__name__}"
        )

    result = {}
    for node_id in node_ids:
        sb_cgr = sb_cgrs_dict[node_id]
        route_cgr = route_cgrs_dict[node_id]

        # 1) Replace leaving groups in RouteCGR
        try:
            synthon_cgr, lg_groups = lg_replacer(route_cgr)
        except (KeyError, ValueError) as e:
            raise SubclusterError(f"LG replacement failed for node {node_id}") from e

        # 2) Build ReactionContainer for Abstracted RouteCGR
        # BUG FIX: the original used a bare `except:` without binding `e`,
        # so the `raise ... from e` below crashed with NameError instead of
        # reporting the real failure.
        try:
            synthon_rxn = ReactionContainer.from_cgr(synthon_cgr)
        except Exception as e:
            raise SubclusterError(
                f"Failed to parse synthon CGR for node {node_id}"
            ) from e

        # 3) Prepare for X-based reaction replacement
        try:
            old_reactants = synthon_rxn.reactants
            target_mol = synthon_rxn.products[0]
            max_atom_idx = max(target_mol._atoms)
            new_reactants = lg_reaction_replacer(synthon_rxn, lg_groups, max_atom_idx)
            new_rxn = ReactionContainer(reactants=new_reactants, products=[target_mol])
        except (IndexError, TypeError) as e:
            raise SubclusterError(
                f"Leaving group (X) reaction replacement failed for node {node_id}"
            ) from e

        result[node_id] = (
            sb_cgr,
            ReactionContainer(reactants=old_reactants, products=[target_mol]),
            synthon_cgr,
            new_rxn,
            lg_groups,
        )

    return result
513
+
514
+
515
def group_nodes_by_synthon_detail(data_dict: dict):
    """
    Group nodes that share identical synthon CGRs and reactions.

    Nodes whose result tuples agree on the first four elements
    (sb_cgr, unlabeled_reaction, synthon_cgr, synthon_reaction) are merged
    into one group; the fifth element (per-node leaving-group data) is kept
    per node under ``nodes_data``.

    Args:
        data_dict: Dictionary ``{node_id: (sb_cgr, unlabeled_reaction,
            synthon_cgr, synthon_reaction, lg_groups, ...)}``.

    Returns:
        Dictionary ``{group_index: {'sb_cgr': ..., 'unlabeled_reaction': ...,
        'synthon_cgr': ..., 'synthon_reaction': ...,
        'nodes_data': {node_id: lg_groups, ...}, 'post_processed': False}}``.
    """
    temp_groups = defaultdict(list)

    for node_id, result_list in data_dict.items():
        if len(result_list) < 4:
            # BUG FIX: pad to a 4-tuple.  The original built a 2-tuple key
            # here, which crashed the 4-way unpacking below with ValueError.
            group_key = (result_list[0], None, None, None)
        else:
            group_key = tuple(result_list[:4])

        # BUG FIX: hashability is only checked when the key is inserted into
        # the dict; the original wrapped tuple construction (which cannot
        # raise TypeError), so the warning branch was unreachable.
        try:
            temp_groups[group_key].append(node_id)
        except TypeError:
            print(
                f"Warning: Skipping node {node_id} because reaction data is not hashable: {type(result_list[1])}"
            )
            continue

    # Format the output dictionary with sequential integer keys and include
    # the node-specific data (result[4]) in a sub-dictionary.
    final_grouped_results = {}
    group_index = 1

    # Sort by grouped node-id lists for deterministic group numbering.
    sorted_temp_groups = sorted(temp_groups.items(), key=lambda item: item[1])
    for group_key, node_ids in sorted_temp_groups:
        sb_cgr, unlabeled_reaction, synthon_cgr, synthon_reaction = group_key
        nodes_data_dict = {}

        for node_id in sorted(node_ids):  # sort node IDs for consistent dict order
            original_result = data_dict.get(node_id, [])
            # Element 4 holds the per-node leaving-group table, if present.
            node_specific_data = (
                original_result[4] if len(original_result) > 4 else None
            )
            nodes_data_dict[node_id] = node_specific_data

        final_grouped_results[group_index] = {
            "sb_cgr": sb_cgr,
            "unlabeled_reaction": unlabeled_reaction,
            "synthon_cgr": synthon_cgr,
            "synthon_reaction": synthon_reaction,
            "nodes_data": nodes_data_dict,
            "post_processed": False,
        }
        group_index += 1

    return final_grouped_results
581
+
582
+
583
def subcluster_all_clusters(groups, sb_cgrs_dict, route_cgrs_dict):
    """
    Subdivide every reaction cluster into synthon-based subgroups.

    Applies ``subcluster_one_cluster`` to each cluster in ``groups`` and
    organizes the resulting synthons with ``group_nodes_by_synthon_detail``.

    Parameters
    ----------
    groups : dict
        Mapping of cluster indices to cluster data.
    sb_cgrs_dict : dict
        Dictionary of ReducedRouteCGRs keyed by node ID.
    route_cgrs_dict : dict
        Dictionary of RouteCGRs keyed by node ID.

    Returns
    -------
    dict or None
        Mapping of each cluster index to its subgroups dict, or None if a
        cluster fails to subcluster.
    """
    subgrouped = {}
    for cluster_idx, cluster in groups.items():
        synthons = subcluster_one_cluster(cluster, sb_cgrs_dict, route_cgrs_dict)
        # Defensive: subcluster_one_cluster raises on failure, but the
        # original None contract is preserved here.
        if synthons is None:
            return None
        subgrouped[cluster_idx] = group_nodes_by_synthon_detail(synthons)
    return subgrouped
614
+
615
+
616
def all_lg_collect(subgroup):
    """
    Gather all distinct leaving-group containers per marker index.

    Scans ``subgroup['nodes_data']`` and, for every marker index found in
    any route, collects the leaving-group containers attached at that
    index.  Duplicates are filtered by string representation.

    Parameters
    ----------
    subgroup : dict
        Must contain 'nodes_data', a dict mapping pathway keys to
        dicts of {node_index: (CGRContainer, ...)}.

    Returns
    -------
    dict[int, list[CGRContainer]]
        For each marker index, the list of unique CGRContainer objects.
    """
    route_tables = list(subgroup["nodes_data"].values())

    # Phase 1: discover every marker index used by any route.
    indices = set()
    for table in route_tables:
        indices |= set(table)

    unique = {idx: [] for idx in indices}
    seen_repr = {idx: set() for idx in indices}

    # Phase 2: collect containers, deduplicating by their string form.
    for table in route_tables:
        for idx, entry in table.items():
            container = entry[0]
            key = str(container)
            if key in seen_repr[idx]:
                continue
            seen_repr[idx].add(key)
            unique[idx].append(container)
    return unique
652
+
653
+
654
def replace_leaving_groups_in_synthon(subgroup, to_remove):  # Under development
    """
    Replace specified leaving groups (LG) in a synthon CGR with new fragments and return the updated CGR
    along with a mapping from adjusted LG marks to their atom indices.

    Parameters:
        subgroup (dict): Must contain:
            - 'synthon_cgr': the CGR object representing the synthon graph
            - 'nodes_data': mapping of node indices to LG replacement data
        to_remove (List[int]): List of LG marks to remove and replace.

    Returns:
        Tuple[CGR, Dict[int, int]]:
            - The updated CGR with replacements
            - A dict mapping new LG marks to their atom indices in the updated CGR
    """
    # Extract the original CGR and leaving group replacement table.
    # NOTE(review): uses an arbitrary route's table (first value of
    # nodes_data) as the source of replacement fragments — TODO confirm all
    # routes agree for the marks in `to_remove`.
    original_cgr = subgroup["synthon_cgr"]
    lg_table = next(iter(subgroup["nodes_data"].values()))

    # NOTE(review): this aliases (does not copy) the stored CGR; later
    # `union` calls rebind the name, but in-place mutations before that
    # affect subgroup['synthon_cgr'] as well — confirm this is intended.
    updated_cgr = original_cgr

    removed_count = 0
    new_lgs = {}

    # Iterate through all atoms (index, atom_obj) in the CGR
    for atom_idx, atom_obj in list(updated_cgr.atoms()):
        # Skip non-X atoms
        if atom_obj.__class__.__name__ != "DynamicX":
            continue

        current_mark = atom_obj.mark
        if current_mark in to_remove:
            # Remove old LG (X): delete bond and atom
            neighbors = list(updated_cgr._bonds[atom_idx].keys())
            if neighbors:
                neighbor_idx = neighbors[0]
                bond = updated_cgr._bonds[atom_idx][neighbor_idx]
                updated_cgr.delete_bond(atom_idx, neighbor_idx)
            updated_cgr.delete_atom(atom_idx)

            # Attach new LG(X) fragment from the table.
            # NOTE(review): if `neighbors` was empty above, `neighbor_idx`
            # and `bond` are unbound here and add_bond raises NameError —
            # this assumes every X atom has exactly one neighbour; confirm.
            # NOTE(review): add_bond at atom_idx assumes `union` numbers the
            # fragment's attachment atom identically to the removed X atom —
            # TODO confirm against CGRtools union semantics.
            lg_fragment = lg_table[current_mark][0]
            updated_cgr = updated_cgr.union(lg_fragment)
            # Reset radical flag on the new atom and restore the bond
            updated_cgr._atoms[atom_idx].is_radical = False
            updated_cgr.add_bond(atom_idx, neighbor_idx, bond)

            removed_count += 1
        else:
            # Adjust the marks of remaining LGs to account for removed ones
            atom_obj.mark -= removed_count
            new_lgs[atom_obj.mark] = atom_idx

    # Reorder atoms dict and update 2D coordinates for depiction
    updated_cgr._atoms = dict(sorted(updated_cgr._atoms.items()))

    return updated_cgr, new_lgs
712
+
713
+
714
def new_lg_reaction_replacer(synthon_reaction, new_lgs, max_in_target_mol):
    """
    Reinsert marked leaving-group atoms at placeholder positions.

    For every reactant in ``synthon_reaction``, any atom index greater than
    ``max_in_target_mol`` is a leaving-group placeholder.  When that index
    matches an attachment point recorded in ``new_lgs``, the placeholder is
    swapped for a ``MarkedAt`` atom labelled with the leaving-group key,
    and the bond to its single neighbour is restored.

    Parameters
    ----------
    synthon_reaction : ReactionContainer
        Reaction whose reactants contain dummy atoms (by index) marking
        where leaving-groups were removed.
    new_lgs : dict[int, int]
        Mapping from leaving-group label to the atom index that should be
        replaced.
    max_in_target_mol : int
        Highest atom index used by the core product; larger indices are
        placeholders.

    Returns
    -------
    List[Molecule]
        Reactants with ``MarkedAt`` atoms (``.mark``/``.isotope`` set to the
        leaving-group label) substituted in and original bonds reattached.
    """
    rebuilt = []
    for mol in synthon_reaction.reactants:
        for idx in list(mol._atoms.keys()):
            if idx <= max_in_target_mol:
                continue  # belongs to the core product
            for label, attach_idx in new_lgs.items():
                marker = MarkedAt()
                if idx != attach_idx:
                    continue
                marker.mark = label
                marker.isotope = label
                # Placeholders are terminal: exactly one bonded neighbour.
                partner = next(iter(mol._bonds[idx]))
                saved_bond = mol._bonds[idx][partner]
                mol.delete_bond(partner, idx)
                mol.delete_atom(idx)
                mol.add_atom(marker, idx)
                mol.add_bond(partner, idx, saved_bond)
        rebuilt.append(mol)

    return rebuilt
762
+
763
+
764
def post_process_subgroup(
    subgroup,
):  # Under development: Error in replace_leaving_groups_in_synthon , 'cuz synthon_reaction.clean2d crashes
    """
    Drop leaving-groups common to all pathways and rebuild a minimal synthon.

    Scans the subgroup for leaving-groups present in every route, removes those
    from the CGR, re-assembles a clean ReactionContainer with the original core,
    updates `nodes_data`, and flags the dict as processed.

    Parameters
    ----------
    subgroup : dict
        One subgroup dict as produced by group_nodes_by_synthon_detail
        (keys 'synthon_cgr', 'nodes_data', 'post_processed', ...). If
        already post_processed, returns immediately.

    Returns
    -------
    dict
        The same dict, mutated in place, now with:
        - `'synthon_reaction'`: cleaned ReactionContainer
        - `'nodes_data'`: filtered node table
        - `'post_processed'`: True
        - `'group_lgs'`: routes grouped by identical leaving-group sets
    """
    # Idempotence guard: never process a subgroup twice.
    if "post_processed" in subgroup.keys() and subgroup["post_processed"] == True:
        return subgroup
    result = all_lg_collect(subgroup)
    # to find constant lg that need to be removed:
    # an index with exactly one distinct container means every route uses
    # the same leaving group there.
    to_remove = [ind for ind, cgr_set in result.items() if len(cgr_set) == 1]
    new_synthon_cgr, new_lgs = replace_leaving_groups_in_synthon(subgroup, to_remove)
    synthon_reaction = ReactionContainer.from_cgr(new_synthon_cgr)
    synthon_reaction.clean2d()
    # Rebuild a second time to keep an untouched copy of the reactants.
    old_reactants = ReactionContainer.from_cgr(new_synthon_cgr).reactants
    target_mol = synthon_reaction.products[0]  # TO DO: target_mol might be non 0
    max_in_target_mol = max(target_mol._atoms)
    new_reactants = new_lg_reaction_replacer(
        synthon_reaction, new_lgs, max_in_target_mol
    )
    new_synthon_reaction = ReactionContainer(
        reactants=new_reactants, products=[target_mol]
    )
    new_synthon_reaction.clean2d()
    subgroup["synthon_reaction"] = new_synthon_reaction
    # NOTE(review): remove_and_shift is not defined in this module's visible
    # code — presumably it renumbers the remaining LG indices after the
    # constant ones are dropped; confirm its import/definition.
    subgroup["nodes_data"] = remove_and_shift(subgroup["nodes_data"], to_remove)
    subgroup["post_processed"] = True
    subgroup["group_lgs"] = group_by_identical_values(subgroup["nodes_data"])
    return subgroup
812
+
813
+
814
def group_by_identical_values(nodes_data):  # Under development
    """
    Collapse routes whose leaving-group assignments are identical.

    Routes (outer keys) whose inner dictionaries carry the same sequence of
    leaving groups — comparing only the first element of each value tuple,
    ordered by subkey — are merged into a single entry.

    Args:
        nodes_data (dict): Maps outer keys to inner dictionaries; each inner
            dictionary maps subkeys to a tuple ``(value_obj, other_info)``.
            ``value_obj`` drives the grouping, ``other_info`` is ignored.
            Example: {'route_1': {'pos_a': (1, 'infoA'), 'pos_b': (2, 'infoB')}, ...}

    Returns:
        dict: Keys are tuples of the grouped outer keys; values map each
        subkey to the ``value_obj`` taken from the group's first outer key.
        Sorted descending by group size.
        Example: {('route_1', 'route_2'): {'pos_a': 1, 'pos_b': 2}, ...}
    """
    # Step 1: fingerprint each route by the ordered tuple of its value_objs.
    buckets = {}
    for route_id, positions in nodes_data.items():
        # Order by subkey so equal assignments produce equal fingerprints.
        ordered = sorted(positions.items(), key=lambda kv: kv[0])
        fingerprint = tuple(pair[1][0] for pair in ordered)
        buckets.setdefault(fingerprint, []).append(route_id)

    # Step 2: one representative value map per fingerprint bucket.
    collapsed = {}
    for route_ids in buckets.values():
        representative = nodes_data[route_ids[0]]
        collapsed[tuple(route_ids)] = {
            subkey: entry[0] for subkey, entry in representative.items()
        }

    # Largest groups first.
    return dict(sorted(collapsed.items(), key=lambda kv: len(kv[0]), reverse=True))
synplan/chem/reaction_routes/io.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import json
3
+ import pickle
4
+ import os
5
+
6
+ from CGRtools import smiles as read_smiles
7
+ from synplan.mcts.tree import Tree
8
+
9
+
10
def make_dict(routes_json):
    """
    Convert JSON route trees into per-route dictionaries of reactions.

    Args:
        routes_json: Either a list of tree-dicts as produced by make_json(),
            or a dict mapping route indices to tree-dicts.

    Returns:
        dict: Maps each route index (int) to a sub-dict
        {step_id: ReactionContainer}, where step IDs run from the earliest
        reaction (0) up to the final (max), in leaf-to-root order.
    """
    # The original duplicated the whole traversal for the dict and list
    # cases; a single iterator unifies both shapes.
    if isinstance(routes_json, dict):
        items = routes_json.items()
    else:
        items = enumerate(routes_json)

    routes_dict = {}
    for route_idx, tree in items:
        rxn_list = []

        def _postorder(node):
            # First dive into any children, then record this reaction;
            # mol-nodes simply recurse (no record).
            for child in node.get("children", []):
                _postorder(child)
            if node["type"] == "reaction":
                rxn_list.append(read_smiles(node["smiles"]))

        # Collect all reactions in leaf -> root order.
        _postorder(tree)

        # Assign step IDs 0, 1, 2, ... in that order.
        routes_dict[int(route_idx)] = dict(enumerate(rxn_list))

    return routes_dict
59
+
60
+
61
def read_routes_json(file_path="routes.csv", to_dict=False):
    """Load routes from a JSON file, optionally converting via make_dict.

    NOTE(review): the default file name carries a ``.csv`` extension even
    though the content is JSON — kept for backward compatibility.
    """
    with open(file_path) as fh:
        loaded = json.load(fh)
    return make_dict(loaded) if to_dict else loaded
67
+
68
+
69
def read_routes_csv(file_path="routes.csv"):
    """
    Load routes from a CSV with columns: route_id, step_id, smiles, meta.

    Returns a nested dict mapping route_id (int) -> step_id (int) ->
    ReactionContainer.  The meta column is currently ignored.
    """
    routes = {}
    with open(file_path, newline="") as fh:
        for record in csv.DictReader(fh):
            rid = int(record["route_id"])
            sid = int(record["step_id"])
            # read_smiles parses the reaction SMILES into a container.
            routes.setdefault(rid, {})[sid] = read_smiles(record["smiles"])
    return routes
87
+
88
+
89
def make_json(routes_dict, keep_ids=True):
    """
    Convert routes into a nested JSON tree of reaction and molecule nodes.

    Args:
        routes_dict (dict[int, dict[int, Reaction]]): Mapping route IDs to steps (step_id -> Reaction).
        keep_ids (bool): If True, returns a dict mapping route IDs to trees;
            otherwise returns a list of route trees.  (The code below builds
            a dict when keep_ids is True — the original docstring stated
            the opposite.)

    Returns:
        dict or list: JSON-like tree(s) of routes.
    """
    # Prepare output container: dict keyed by route id, or a flat list.
    all_routes = {} if keep_ids else []

    for route_id, steps in routes_dict.items():
        if not steps:
            continue

        # Determine target molecule atoms from the final step of this route
        final_step = max(steps)
        target = steps[final_step].products[0]
        atom_nums = set(target._atoms.keys())

        # Precompute canonical SMILES and producer mapping for all products.
        # NOTE: kekule/implicify_hydrogens/thiele mutate each product in
        # place; str(prod) afterwards yields the canonicalised SMILES.
        prod_map = {}  # smiles -> list of step_ids
        for sid, rxn in steps.items():
            for prod in rxn.products:
                prod.kekule()
                prod.implicify_hydrogens()
                prod.thiele()
                s = str(prod)
                prod_map.setdefault(s, []).append(sid)

        def transform(mol):
            # Canonicalise a molecule in place and return its SMILES.
            mol.kekule()
            mol.implicify_hydrogens()
            mol.thiele()
            return str(mol)

        def build_mol_node(sid):
            """Find the product with any overlap to target atoms and recurse into its reaction."""
            rxn = steps[sid]
            for p in rxn.products:
                if atom_nums & set(p._atoms.keys()):
                    smiles = str(p)
                    return {
                        "type": "mol",
                        "smiles": smiles,
                        "children": [build_reaction_node(sid)],
                        "in_stock": False,
                    }
            # Shouldn't reach here if tree is consistent
            return None

        def build_reaction_node(sid):
            """Build reaction node and recurse into reactant molecule nodes."""
            rxn = steps[sid]
            node = {"type": "reaction", "smiles": format(rxn, "m"), "children": []}

            for react in rxn.reactants:
                r_smi = transform(react)
                # Look up any prior step producing this reactant
                prior = [ps for ps in prod_map.get(r_smi, []) if ps < sid]
                if prior:
                    # Reactant made by an earlier step: recurse into it.
                    node["children"].append(build_mol_node(max(prior)))
                else:
                    # Leaf reactant: treated as a purchasable building block.
                    node["children"].append(
                        {"type": "mol", "smiles": r_smi, "in_stock": True}
                    )

            return node

        # Build route tree and store
        tree = build_mol_node(final_step)
        if keep_ids:
            all_routes[int(route_id)] = tree
        else:
            all_routes.append(tree)

    return all_routes
169
+
170
+
171
def write_routes_json(routes_dict, file_path):
    """Serialize reaction routes to a pretty-printed JSON file."""
    payload = make_json(routes_dict)
    with open(file_path, "w") as fh:
        json.dump(payload, fh, indent=2)
176
+
177
+
178
def write_routes_csv(routes_dict, file_path="routes.csv"):
    """
    Write a nested routes dict to CSV.

    Input shape: { route_id: { step_id: reaction_obj, ... }, ... }.
    Columns are route_id, step_id, smiles, meta; smiles is
    format(reaction, 'm') and meta is left blank for now.  Routes and steps
    are sorted for deterministic output.
    """
    with open(file_path, "w", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(["route_id", "step_id", "smiles", "meta"])
        for rid in sorted(routes_dict):
            steps = routes_dict[rid]
            for sid in sorted(steps):
                # meta column reserved for future use (reaction.meta).
                writer.writerow([rid, sid, format(steps[sid], "m"), ""])
197
+
198
+
199
class TreeWrapper:
    """Picklable wrapper around a search Tree.

    Strips non-picklable attributes (progress bar, neural networks) before
    serialization and restores the Tree on load.  Files are stored as
    ``<path>/tree_<mol_id>_<config>.pkl``.
    """

    def __init__(self, tree, mol_id=1, config=1, path="planning_results/forest"):
        """Initializes the TreeWrapper.

        Args:
            tree: The Tree object to wrap.
            mol_id: Molecule identifier used in the file name.
            config: Configuration identifier used in the file name.
            path: Directory where the pickle file is stored (created if missing).
        """
        self.tree = tree
        self.mol_id = mol_id
        self.config = config
        self.path = path
        # Ensure the directory exists before creating the filename
        os.makedirs(self.path, exist_ok=True)
        self.filename = os.path.join(self.path, f"tree_{mol_id}_{config}.pkl")

    def __getstate__(self):
        """Return a picklable state with non-serializable tree parts removed."""
        state = self.__dict__.copy()
        tree_state = self.tree.__dict__.copy()
        # Reset or remove non-pickleable attributes (e.g., _tqdm, policy_network, value_network)
        if "_tqdm" in tree_state:
            tree_state["_tqdm"] = True  # Reset to a simple flag
        for attr in ["policy_network", "value_network"]:
            if attr in tree_state:
                tree_state[attr] = None
        state["tree_state"] = tree_state
        del state["tree"]
        return state

    def __setstate__(self, state):
        """Rebuild the wrapped Tree from the saved state (networks stay None)."""
        tree_state = state.pop("tree_state")
        self.__dict__.update(state)
        # Bypass Tree.__init__ — the saved __dict__ carries the full state.
        new_tree = Tree.__new__(Tree)
        new_tree.__dict__.update(tree_state)
        self.tree = new_tree

    def save_tree(self):
        """Saves the TreeWrapper instance (including the tree state) to a file."""
        try:
            with open(self.filename, "wb") as f:
                pickle.dump(self, f)
            print(
                f"Tree wrapper for mol_id '{self.mol_id}', config '{self.config}' saved to '{self.filename}'."
            )
        except Exception as e:
            print(f"Error saving tree to {self.filename}: {e}")

    @classmethod
    def load_tree_from_id(cls, mol_id, config=1, path="planning_results/forest"):
        """
        Loads a Tree object from a saved file using mol_id and config.

        Args:
            mol_id: The molecule ID used for saving.
            config: The configuration used for saving.
            path: The directory where the file is located

        Returns:
            The loaded Tree object, or None if loading fails.
        """
        filename = os.path.join(path, f"tree_{mol_id}_{config}.pkl")
        # BUG FIX: these f-strings printed the literal "(unknown)" instead of
        # interpolating the computed file name.
        print(f"Attempting to load tree from: {filename}")
        try:
            # Ensure the 'Tree' class is defined in the current scope
            if "Tree" not in globals() and "Tree" not in locals():
                raise NameError(
                    "The 'Tree' class definition is required to load the object."
                )

            with open(filename, "rb") as f:
                loaded_wrapper = pickle.load(f)  # This implicitly calls __setstate__

            print(
                f"Tree object for mol_id '{mol_id}', config '{config}' successfully loaded from '{filename}'."
            )
            # The __setstate__ method already reconstructed the tree inside the wrapper
            return loaded_wrapper.tree

        except FileNotFoundError:
            print(f"Error: File not found at {filename}")
            return None
        except (pickle.UnpicklingError, EOFError) as e:
            print(
                f"Error: Could not unpickle file {filename}. It might be corrupted or empty. Details: {e}"
            )
            return None
        except NameError as e:
            print(f"Error during loading: {e}. Ensure 'Tree' class is defined.")
            return None
        except Exception as e:
            print(f"An unexpected error occurred loading tree from {filename}: {e}")
            return None
synplan/chem/reaction_routes/leaving_groups.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from CGRtools.periodictable import Core, At, DynamicElement
2
+ from typing import Optional
3
+
4
+
5
class Marked(Core):
    """Mixin for CGRtools atoms that adds user-settable mark/isotope bookkeeping.

    The mark labels which leaving group an atom belongs to; the isotope is
    mirrored to the same value so the label survives SMILES round-trips.
    """

    # NOTE: "__mark" is name-mangled to "_Marked__mark" inside this class body.
    __slots__ = "__mark", "_isotope"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.__mark = None
        self._isotope = 0  # Make sure this exists

    @property
    def mark(self):
        # Leaving-group label; None until explicitly assigned.
        return self.__mark

    @mark.setter
    def mark(self, mark):
        self.__mark = mark

    @property
    def isotope(self):
        return getattr(self, "_isotope", 0)  # Always returns int

    @isotope.setter
    def isotope(self, value):
        self._isotope = int(value)

    def __repr__(self):
        return f"{self.symbol}({self.isotope})"

    @property
    def atomic_symbol(self) -> str:
        # Assumes subclasses are named "Marked<Element>": "MarkedAt"[6:] == "At".
        return self.__class__.__name__[6:]

    @property
    def symbol(self) -> str:
        return "X"  # For human-readable representation

    def __len__(self):
        # Pure pass-through to Core.__len__; kept for explicitness.
        return super().__len__()
42
+
43
+
44
class MarkedAt(Marked, At):
    """Astatine-based placeholder atom rendered as "X".

    Combines the Marked mixin (mark/isotope bookkeeping) with CGRtools' At
    element so the atom is accepted wherever a periodic-table element is
    required, while displaying as the generic leaving-group symbol "X".
    """

    atomic_number = At.atomic_number

    @property
    def atomic_symbol(self):
        # Real element identity, used by CGRtools internals.
        return "At"

    @property
    def symbol(self):
        # Human-facing placeholder symbol.
        return "X"

    def __repr__(self):
        return f"X({self.isotope})"

    def __str__(self):
        return f"X({self.isotope})"

    def __hash__(self):
        # NOTE(review): __hash__ is defined here without a matching __eq__;
        # equality presumably comes from a base class — confirm both agree
        # on the (isotope, atomic_number, charge, is_radical) fields.
        return hash(
            (
                self.isotope,
                getattr(self, "atomic_number", 0),
                getattr(self, "charge", 0),
                getattr(self, "is_radical", False),
            )
        )
70
+
71
+
72
class DynamicX(DynamicElement):
    """Dynamic placeholder element ("X") used to mark leaving groups in CGRs.

    Reuses astatine's atomic number (85) so CGRtools treats it as a valid
    element, while rendering as the neutral symbol "X".
    """

    __slots__ = ("_mark", "_isotope")

    # Element-table attributes required by DynamicElement subclasses.
    atomic_number = 85
    mass = 0.0
    group = 0
    period = 0
    isotopes_distribution = list(range(20))
    atomic_radius = 0.5
    # NOTE(review): CGRtools elements normally expose a mapping of isotope
    # masses; a scalar 0 here may break mass-dependent code — TODO confirm.
    isotopes_masses = 0

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._isotope = None
        self._mark = None

    @property
    def mark(self):
        # Marker label assigned when the leaving group is extracted.
        return getattr(self, "_mark", None)

    @mark.setter
    def mark(self, value):
        self._mark = value

    @property
    def isotope(self):
        return getattr(self, "_isotope", None)

    @isotope.setter
    def isotope(self, value):
        self._isotope = value

    @property
    def symbol(self) -> str:
        return "X"

    def valence_rules(
        self, charge: int = 0, is_radical: bool = False, valence: int = 0
    ) -> tuple:
        # NOTE(review): all three branches return an empty tuple, i.e. the
        # placeholder imposes no valence constraints — the split is redundant.
        if charge == 0 and not is_radical and (valence == 1):
            return tuple()
        elif charge == 0 and not is_radical and valence == 0:
            return tuple()
        else:
            return tuple()

    def __repr__(self):
        return f"Dynamic{self.symbol}()"

    @property
    def p_charge(self) -> int:
        # Product-side attributes mirror the reactant side: the placeholder
        # is treated as unchanged across the reaction.
        return self.charge

    @property
    def p_is_radical(self) -> bool:
        return self.is_radical

    @property
    def p_hybridization(self) -> Optional[int]:
        return self.hybridization
synplan/chem/reaction_routes/route_cgr.py ADDED
@@ -0,0 +1,570 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from CGRtools.containers.bonds import DynamicBond
2
+ from CGRtools.containers import ReactionContainer, CGRContainer, MoleculeContainer
3
+ from synplan.mcts.tree import Tree
4
+
5
+
6
def find_next_atom_num(reactions: list):
    """
    Find the next available atom number across a list of reactions.

    Composes each reaction into its Condensed Graph of Reaction (CGR),
    takes the maximum atom index used in each CGR, and returns one more
    than the overall maximum, giving a globally unused atom number.

    Args:
        reactions (list): A list of ReactionContainer objects.

    Returns:
        int: The next available atom number — one greater than the largest
        atom index found in any of the reaction CGRs. Returns 1 when
        ``reactions`` is empty or every CGR has no atoms.
    """
    max_num = 0
    for reaction in reactions:
        cgr = reaction.compose()
        # default=0 keeps an atom-free CGR from raising ValueError on max().
        max_num = max(max_num, max(cgr._atoms.keys(), default=0))
    return max_num + 1
28
+
29
+
30
+ def get_clean_mapping(
31
+ curr_prod: MoleculeContainer, prod: MoleculeContainer, reverse: bool = False
32
+ ):
33
+ """
34
+ Get a 'clean' atom mapping between two molecules, avoiding conflicts.
35
+
36
+ This function attempts to establish a mapping between the atoms of two
37
+ MoleculeContainer objects (`curr_prod` and `prod`). It uses an internal
38
+ mapping mechanism and then filters the result to create a "clean" mapping.
39
+ The cleaning process specifically avoids adding entries to the mapping
40
+ where the source and target indices are the same, or where the target
41
+ index already exists as a source in the mapping with a different target.
42
+ It also checks for potential conflicts based on the atom keys present
43
+ in the original molecules.
44
+
45
+ Args:
46
+ curr_prod (MoleculeContainer): The first MoleculeContainer object.
47
+ prod (MoleculeContainer): The second MoleculeContainer object.
48
+ reverse (bool, optional): If True, the mapping is generated in the
49
+ reverse direction (from `prod` to `curr_prod`).
50
+ Defaults to False (mapping from `curr_prod` to `prod`).
51
+
52
+ Returns:
53
+ dict: A dictionary representing the clean atom mapping. Keys are atom
54
+ indices from the source molecule, and values are the corresponding
55
+ atom indices in the target molecule. Returns an empty dictionary
56
+ if no mapping is found or if the initial mapping is empty.
57
+ """
58
+ dict_map = {}
59
+ m = list(curr_prod.get_mapping(prod))
60
+
61
+ if len(m) == 0:
62
+ return dict_map
63
+
64
+ curr_atoms = set(curr_prod._atoms.keys())
65
+ prod_atoms = set(prod._atoms.keys())
66
+
67
+ rr = m[0]
68
+
69
+ # Build mapping while checking for conflicts
70
+ for key, value in rr.items():
71
+ if key != value:
72
+ if value in rr.keys() and rr[value] != key:
73
+ continue
74
+
75
+ source = value if reverse else key
76
+ target = key if reverse else value
77
+
78
+ if reverse and target in curr_atoms:
79
+ continue
80
+ if not reverse and target in prod_atoms:
81
+ continue
82
+
83
+ dict_map[source] = target
84
+
85
+ return dict_map
86
+
87
+
88
+ def validate_molecule_components(curr_mol: MoleculeContainer, node_id: int):
89
+ """
90
+ Validate that a molecule has only one connected component.
91
+
92
+ This function checks if a given MoleculeContainer object represents a
93
+ single connected molecule or multiple disconnected fragments. It extracts
94
+ the connected components and prints an error message if more than one
95
+ component is found, indicating a potential issue with the molecule
96
+ representation within a specific tree node.
97
+
98
+ Args:
99
+ curr_mol (MoleculeContainer): The MoleculeContainer object to validate.
100
+ node_id (int): The ID of the tree node associated with this molecule,
101
+ used for reporting purposes in the error message.
102
+ """
103
+ new_rmol = [curr_mol.substructure(c) for c in curr_mol.connected_components]
104
+ if len(new_rmol) > 1:
105
+ print(f"Error tree {node_id}: We have more than one molecule in one node")
106
+
107
+
108
def get_leaving_groups(products: list):
    """
    Extract leaving-group atom numbers from a list of reaction products.

    The first molecule in ``products`` is assumed to be the main product;
    every subsequent molecule is treated as a leaving group, and its atom
    indices are collected.

    Args:
        products (list): MoleculeContainer objects; the first element is
            assumed to be the main product.

    Returns:
        list: Atom indices belonging to the leaving-group molecules.
    """
    lg_atom_nums = []
    # Everything after the first (main) product is a leaving group;
    # slicing replaces the enumerate/`i != 0` dance.
    for prod in products[1:]:
        lg_atom_nums.extend(prod._atoms.keys())
    return lg_atom_nums
133
+
134
+
135
def process_first_reaction(first_react: ReactionContainer, tree: Tree, node_id: int):
    """
    Process the first reaction in a retrosynthetic route and initialize the building block set.

    Iterates over the reaction's reactants, validates that each one is a
    single connected component, and collects atom indices of reactants that
    qualify as building blocks. A reactant qualifies when its size is at most
    ``tree.config.min_mol_size`` or its SMILES string is present in
    ``tree.building_blocks``.

    Args:
        first_react (ReactionContainer): The first reaction in the route.
        tree (Tree): The search tree, providing ``config.min_mol_size`` and
            ``building_blocks``.
        node_id (int): ID of the tree node, used for validation reporting.

    Returns:
        set: Atom indices of all building-block reactants in the first reaction.
    """
    bb_set = set()

    for curr_mol in first_react.reactants:
        react_key_set = set(curr_mol._atoms)

        if (
            len(curr_mol) <= tree.config.min_mol_size
            or str(curr_mol) in tree.building_blocks
        ):
            # Accumulate via union instead of assignment: the previous
            # `bb_set = react_key_set` dropped atoms of earlier qualifying
            # reactants, inconsistent with update_reaction_dict's union.
            bb_set |= react_key_set

        validate_molecule_components(curr_mol, node_id)

    return bb_set
173
+
174
+
175
def update_reaction_dict(
    reaction: ReactionContainer,
    node_id: int,
    mapping: dict,
    react_dict: dict,
    tree: Tree,
    bb_set: set,
    prev_remap: dict = None,
):
    """
    Update a reaction dictionary with atom mappings and identify building blocks.

    For every reactant of ``reaction``: validates it is a single connected
    component, extends ``bb_set`` with its atom indices when it qualifies as a
    building block (small enough, or known in ``tree.building_blocks``), and
    stores in ``react_dict`` the subset of ``mapping`` (optionally overlaid
    with ``prev_remap``) restricted to that reactant's atoms.

    Args:
        reaction (ReactionContainer): Reaction whose reactants are processed.
        node_id (int): Route/node ID, used for validation reporting.
        mapping (dict): Primary atom mapping to filter per reactant.
        react_dict (dict): Dictionary to fill; keyed by a tuple of each
            reactant's atom indices.
        tree (Tree): Provides ``config.min_mol_size`` and ``building_blocks``.
        bb_set (set): Building-block atom indices accumulated so far.
        prev_remap (dict, optional): Earlier remapping whose entries (also
            restricted to the reactant's atoms) override ``mapping``.

    Returns:
        tuple: ``(react_dict, bb_set)`` — the updated dictionary and the
        (possibly enlarged) building-block set.
    """
    min_size = tree.config.min_mol_size
    known_blocks = tree.building_blocks

    for reactant in reaction.reactants:
        atom_key = tuple(reactant._atoms)
        atom_key_set = set(atom_key)

        validate_molecule_components(reactant, node_id)

        if len(reactant) <= min_size or str(reactant) in known_blocks:
            bb_set = bb_set | atom_key_set

        # Restrict the mapping to this reactant's atoms; previous remap
        # entries (same restriction) take precedence.
        scoped = {src: dst for src, dst in mapping.items() if src in atom_key_set}
        if prev_remap:
            scoped.update(
                {src: dst for src, dst in prev_remap.items() if src in atom_key_set}
            )
        react_dict[atom_key] = scoped

    return react_dict, bb_set
237
+
238
+
239
+ def process_target_blocks(
240
+ curr_products: list,
241
+ curr_prod: MoleculeContainer,
242
+ lg_atom_nums: list,
243
+ curr_lg_atom_nums: list,
244
+ bb_set: set,
245
+ ):
246
+ """
247
+ Identifies and collects atom indices for target blocks based on leaving groups and building blocks.
248
+
249
+ This function iterates through a list of current product molecules, compares their atoms
250
+ to a reference molecule (`curr_prod`), and collects the indices of atoms that correspond
251
+ to atoms in the provided leaving group lists (`lg_atom_nums`, `curr_lg_atom_nums`) or
252
+ the building block set (`bb_set`). This is typically used to identify parts of molecules
253
+ that should be treated as 'target blocks' during a remapping or analysis process.
254
+
255
+ Args:
256
+ curr_products (list): A list of MoleculeContainer objects representing the current products.
257
+ curr_prod (MoleculeContainer): A reference MoleculeContainer object, likely the main product,
258
+ used for mapping atom indices.
259
+ lg_atom_nums (list): A list of integer atom indices identified as leaving group atoms
260
+ in a relevant context.
261
+ curr_lg_atom_nums (list): Another list of integer atom indices identified as leaving
262
+ group atoms, potentially from a different context than `lg_atom_nums`.
263
+ bb_set (set): A set of integer atom indices identified as building block atoms.
264
+
265
+ Returns:
266
+ list: A list of integer atom indices that are identified as 'target blocks' based on
267
+ their presence in the leaving group lists or building block set after mapping
268
+ to the reference molecule.
269
+ """
270
+ target_block = []
271
+ if len(curr_products) > 1:
272
+ for prod in curr_products:
273
+ dict_map = get_clean_mapping(curr_prod, prod)
274
+ if prod._atoms.keys() != curr_prod._atoms.keys():
275
+ for key in list(prod._atoms.keys()):
276
+ if key in lg_atom_nums or key in curr_lg_atom_nums:
277
+ target_block.append(key)
278
+ if key in bb_set:
279
+ target_block.append(key)
280
+ return target_block
281
+
282
+
283
def compose_route_cgr(tree_or_routes, node_id):
    """
    Process a single synthesis route maintaining consistent state.

    Parameters
    ----------
    tree_or_routes : synplan.mcts.tree.Tree
        or dict mapping route_id -> {step_id: ReactionContainer}
    node_id : int
        the route index (in the Tree’s winning_nodes, or the dict’s keys)

    Returns
    -------
    dict or None
        - if successful: { 'cgr': <composed CGR>, 'reactions_dict': {step: ReactionContainer,…} }
        - on error: None
    """
    # ----------- dict-based branch ------------
    # Pre-mapped routes: no atom renumbering is attempted, the step CGRs are
    # simply composed from the final step backwards.
    if isinstance(tree_or_routes, dict):
        routes_dict = tree_or_routes
        if node_id not in routes_dict:
            raise KeyError(f"Route {node_id} not in provided dict.")
        # grab and sort the ReactionContainers in chronological order
        step_map = routes_dict[node_id]
        sorted_ids = sorted(step_map)
        reactions = [step_map[i] for i in sorted_ids]

        # start from the last (final) reaction
        accum_cgr = reactions[-1].compose()
        reactions_dict = {len(reactions) - 1: reactions[-1]}
        # now fold backwards through the earlier steps
        for idx in range(len(reactions) - 2, -1, -1):
            rxn = reactions[idx]
            curr_cgr = rxn.compose()
            accum_cgr = curr_cgr.compose(accum_cgr)
            reactions_dict[idx] = rxn

        return {"cgr": accum_cgr, "reactions_dict": reactions_dict}

    # ----------- tree-based branch ------------
    # Tree routes need atom-number reconciliation between steps; the helpers
    # below (process_first_reaction, get_leaving_groups, ...) track building
    # blocks and leaving groups so overlapping atom indices can be remapped.
    tree = tree_or_routes
    try:
        # original tree-based logic:
        reactions = tree.synthesis_route(node_id)

        first_react = reactions[-1]
        reactions_dict = {len(reactions) - 1: first_react}

        accum_cgr = first_react.compose()
        bb_set = process_first_reaction(first_react, tree, node_id)
        # react_dict is keyed by a tuple of each reactant's atom indices and
        # stores the per-reactant remapping to apply when that reactant shows
        # up as a product of an earlier step.
        react_dict = {}
        max_num = find_next_atom_num(reactions)

        for step in range(len(reactions) - 2, -1, -1):
            reaction = reactions[step]
            curr_cgr = reaction.compose()
            curr_prod = reaction.products[0]

            # decompose()[1] is the product side of the accumulated CGR.
            accum_products = accum_cgr.decompose()[1].split()
            lg_atom_nums = get_leaving_groups(accum_products)
            curr_products = curr_cgr.decompose()[1].split()

            tuple_atoms = tuple(curr_prod._atoms)
            prev_remap = react_dict.get(tuple_atoms, {})

            if prev_remap:
                curr_cgr = curr_cgr.remap(prev_remap, copy=True)

            # identify new atom‐numbers for any overlap
            target_block = process_target_blocks(
                curr_products,
                curr_prod,
                lg_atom_nums,
                [list(p._atoms.keys()) for p in curr_products[1:]],
                bb_set,
            )
            # Assign fresh, globally unused atom numbers to overlapping atoms.
            mapping = {}
            for atom_num in sorted(target_block):
                if atom_num in accum_cgr._atoms and atom_num not in mapping:
                    mapping[atom_num] = max_num
                    max_num += 1

            # carry forward any clean remap on the product itself
            dict_map = {}
            for ap in accum_products:
                clean_map = get_clean_mapping(curr_prod, ap, reverse=True)
                if clean_map:
                    dict_map = clean_map
                    break
            if dict_map:
                curr_cgr = curr_cgr.remap(dict_map, copy=False)

            # update our react_dict & bb_set
            react_dict, bb_set = update_reaction_dict(
                reaction, node_id, mapping, react_dict, tree, bb_set, prev_remap
            )

            # apply the new overlap‐mapping
            if mapping:
                curr_cgr = curr_cgr.remap(mapping, copy=False)

            reactions_dict[step] = ReactionContainer.from_cgr(curr_cgr)
            accum_cgr = curr_cgr.compose(accum_cgr)

        return {"cgr": accum_cgr, "reactions_dict": reactions_dict}

    # NOTE(review): any failure in the tree branch is swallowed and reported
    # via print; callers must handle the None return.
    except Exception as e:
        print(f"Error processing node {node_id}: {e}")
        return None
392
+
393
+
394
def compose_all_route_cgrs(tree_or_routes, node_id=None):
    """
    Process routes (reassign atom mappings) to compose RouteCGR.

    Parameters
    ----------
    tree_or_routes : synplan.mcts.tree.Tree
        or dict mapping route_id -> {step_id: ReactionContainer}
    node_id : int or None
        if None, do *all* winning routes (or all keys of the dict);
        otherwise only that specific route.

    Returns
    -------
    dict or None
        - if node_id is None: {route_id: CGR, …}
        - if node_id is given: {node_id: CGR}
        - returns None on error
    """
    # dict-based branch
    if isinstance(tree_or_routes, dict):
        routes = tree_or_routes

        def _cgr_for(route_id):
            # None when route composition failed.
            outcome = compose_route_cgr(routes, route_id)
            return outcome["cgr"] if outcome else None

        if node_id is None:
            # all routes, in sorted key order
            return {route_id: _cgr_for(route_id) for route_id in sorted(routes)}

        if node_id not in routes:
            raise KeyError(f"Route {node_id} not in provided dict.")
        return {node_id: _cgr_for(node_id)}

    # tree-based branch
    tree = tree_or_routes

    if node_id is not None:
        outcome = compose_route_cgr(tree, node_id)
        if not outcome:
            return None
        return {node_id: outcome["cgr"]}

    collected = {}
    for route_id in sorted(set(tree.winning_nodes)):
        outcome = compose_route_cgr(tree, route_id)
        if outcome:
            collected[route_id] = outcome["cgr"]
    return collected
448
+
449
+
450
def extract_reactions(tree: Tree, node_id=None):
    """
    Collect mapped reaction sequences from a synthesis tree.

    Traverses one branch (when ``node_id`` is given) or every winning route,
    composing the RouteCGR for each, and returns each route's step-indexed
    reactions with conflict-free atom numbering.

    Parameters
    ----------
    tree : ReactionTree
        Retrosynthetic tree exposing ``winning_nodes`` and usable with
        ``compose_route_cgr``.
    node_id : hashable, optional
        Restrict extraction to this single node/route.

    Returns
    -------
    dict[node_id, dict]
        Terminal node ID -> ``reactions_dict`` (as produced by
        ``compose_route_cgr``), sorted by node ID. ``None`` when the
        requested ``node_id`` fails to produce valid reactions.
    """
    if node_id is not None:
        outcome = compose_route_cgr(tree, node_id)
        if not outcome:
            return None
        return {node_id: outcome["reactions_dict"]}

    collected = {}
    for terminal in set(tree.winning_nodes):
        outcome = compose_route_cgr(tree, terminal)
        if outcome:
            collected[terminal] = outcome["reactions_dict"]

    return dict(sorted(collected.items()))
488
+
489
+
490
def compose_sb_cgr(route_cgr: CGRContainer):
    """
    Reduces a Routes Condensed Graph of reaction (RouteCGR) by performing the following steps:

    1. Extracts substructures corresponding to connected components from the input RouteCGR.
    2. Selects the first substructure as the target to work on.
    3. Iterates over all bonds in the target RouteCGR:
       - If a bond is identified as a "leaving group" (its primary order is None while its original order is defined),
         the bond is removed.
       - If a bond has a modified order (both primary and original orders are integers and differ),
         the bond is deleted and then re-added with a new dynamic bond using the primary order (this updates the bond to the reduced form).
    4. After bond modifications, re-extracts the substructure from the target RouteCGR (now called the reduced RouteCGR or ReducedRouteCGR).
    5. If the charge distributions (_p_charges vs. _charges) differ, neutralizes the charges by setting them to zero.

    Args:
        route_cgr: The input RouteCGR object to be reduced.

    Returns:
        The reduced RouteCGR object.
    """
    # Get all connected components of the RouteCGR as separate substructures.
    cgr_prods = [route_cgr.substructure(c) for c in route_cgr.connected_components]
    target_cgr = cgr_prods[
        0
    ]  # Choose the first substructure (main product) for further reduction.

    # Iterate over each bond in the target RouteCGR.
    # The outer snapshot is taken once; the inner snapshot is taken lazily per
    # atom, so a bond deleted while visiting its first endpoint is already
    # gone from the second endpoint's snapshot (no double delete).
    bond_items = list(target_cgr._bonds.items())
    for atom1, bond_set in bond_items:
        bond_set_items = list(bond_set.items())
        for atom2, bond in bond_set_items:

            # Removing bonds corresponding to leaving groups:
            # If product bond order is None (indicating a leaving group) but an original bond order exists,
            # delete the bond.
            if bond.p_order is None and bond.order is not None:
                target_cgr.delete_bond(atom1, atom2)

            # For bonds whose order changed during the route (both orders are
            # integers and differ): replace with a static DynamicBond carrying
            # the product order on both sides.
            elif (
                type(bond.p_order) is int
                and type(bond.order) is int
                and bond.p_order != bond.order
            ):
                p_order = int(bond.p_order)
                target_cgr.delete_bond(atom1, atom2)
                target_cgr.add_bond(atom1, atom2, DynamicBond(p_order, p_order))

    # After modifying bonds, extract the reduced RouteCGR from the target's connected components.
    reduced_route_cgr = [
        target_cgr.substructure(c) for c in target_cgr.connected_components
    ][0]

    # Neutralize charges if the primary charges and current charges differ.
    if reduced_route_cgr._p_charges != reduced_route_cgr._charges:
        for num, charge in reduced_route_cgr._charges.items():
            if charge != 0:
                reduced_route_cgr._atoms[num].charge = 0

    return reduced_route_cgr
551
+
552
+
553
def compose_all_sb_cgrs(route_cgrs_dict: dict):
    """
    Processes a collection (dictionary) of RouteCGRs to generate their reduced forms (ReducedRouteCGRs).

    Applies ``compose_sb_cgr`` to every RouteCGR in the input mapping.

    Args:
        route_cgrs_dict (dict): A dictionary where keys are identifiers
            (e.g. route numbers) and values are RouteCGR objects.

    Returns:
        dict: Mapping of each original identifier to its ReducedRouteCGR.
    """
    # Dict comprehension replaces the manual build-up loop (same result).
    return {num: compose_sb_cgr(cgr) for num, cgr in route_cgrs_dict.items()}
synplan/chem/reaction_routes/visualisation.py ADDED
@@ -0,0 +1,903 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from CGRtools.algorithms.depict import (
2
+ Depict,
3
+ DepictMolecule,
4
+ DepictCGR,
5
+ rotate_vector,
6
+ _render_charge,
7
+ )
8
+ from CGRtools.containers import ReactionContainer, MoleculeContainer, CGRContainer
9
+
10
+ from collections import defaultdict
11
+ from uuid import uuid4
12
+ from math import hypot
13
+ from functools import partial
14
+
15
+
16
+ class WideBondDepictCGR(DepictCGR):
17
+ """
18
+ Like DepictCGR, but all DynamicBonds
19
+ are drawn 2.5× wider than the standard bond width.
20
+ """
21
+
22
+ __slots__ = ()
23
+
24
+ def _render_bonds(self):
25
+ """
26
+ Renders the bonds of the CGR as SVG lines, with DynamicBonds drawn wider.
27
+
28
+ This method overrides the base `_render_bonds` to apply a wider stroke
29
+ to DynamicBonds, highlighting changes in bond order during a reaction.
30
+ It iterates through all bonds, calculates their positions based on
31
+ 2D coordinates, and generates SVG `<line>` elements with appropriate
32
+ styles (color, width, dash array) based on the bond's original (`order`)
33
+ and primary (`p_order`) states. Aromatic bonds are handled separately
34
+ using a helper method.
35
+
36
+ Returns:
37
+ list: A list of strings, where each string is an SVG element
38
+ representing a bond.
39
+ """
40
+ plane = self._plane
41
+ config = self._render_config
42
+
43
+ # get the normal width (default 1.0) and compute a 4× wide stroke
44
+ normal_width = config.get("bond_width", 0.02)
45
+ wide_width = normal_width * 2.5
46
+
47
+ broken = config["broken_color"]
48
+ formed = config["formed_color"]
49
+ dash1, dash2 = config["dashes"]
50
+ double_space = config["double_space"]
51
+ triple_space = config["triple_space"]
52
+
53
+ svg = []
54
+ ar_bond_colors = defaultdict(dict)
55
+
56
+ for n, m, bond in self.bonds():
57
+ order, p_order = bond.order, bond.p_order
58
+ nx, ny = plane[n]
59
+ mx, my = plane[m]
60
+ # invert Y for SVG
61
+ ny, my = -ny, -my
62
+ rv = partial(rotate_vector, 0, x2=mx - nx, y2=ny - my)
63
+ if order == 1:
64
+ if p_order == 1:
65
+ svg.append(
66
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"/>'
67
+ )
68
+ elif p_order == 4:
69
+ ar_bond_colors[n][m] = ar_bond_colors[m][n] = formed
70
+ svg.append(
71
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"/>'
72
+ )
73
+ elif p_order == 2:
74
+ dx, dy = rv(double_space)
75
+ svg.append(
76
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
77
+ f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}"/>'
78
+ )
79
+ svg.append(
80
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
81
+ f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
82
+ )
83
+ elif p_order == 3:
84
+ dx, dy = rv(triple_space)
85
+ svg.append(
86
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
87
+ f' stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
88
+ )
89
+ svg.append(
90
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
91
+ f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}" stroke-width="{wide_width:.2f}"/>'
92
+ )
93
+ svg.append(
94
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
95
+ f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
96
+ )
97
+ elif p_order is None:
98
+ svg.append(
99
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
100
+ f' stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
101
+ )
102
+ else:
103
+ dx, dy = rv(double_space)
104
+ svg.append(
105
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" x2="{mx + dx:.2f}"'
106
+ f' y2="{my - dy:.2f}" stroke-dasharray="{dash1:.2f} {dash2:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
107
+ )
108
+ svg.append(
109
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
110
+ f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
111
+ )
112
+ elif order == 4:
113
+ if p_order == 4:
114
+ svg.append(
115
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"/>'
116
+ )
117
+ elif p_order == 1:
118
+ ar_bond_colors[n][m] = ar_bond_colors[m][n] = broken
119
+ svg.append(
120
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"/>'
121
+ )
122
+ elif p_order == 2:
123
+ ar_bond_colors[n][m] = ar_bond_colors[m][n] = broken
124
+ dx, dy = rv(double_space)
125
+ svg.append(
126
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
127
+ f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}"/>'
128
+ )
129
+ svg.append(
130
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
131
+ f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
132
+ )
133
+ elif p_order == 3:
134
+ ar_bond_colors[n][m] = ar_bond_colors[m][n] = broken
135
+ dx, dy = rv(triple_space)
136
+ svg.append(
137
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
138
+ f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
139
+ )
140
+ svg.append(
141
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"/>'
142
+ )
143
+ svg.append(
144
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
145
+ f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
146
+ )
147
+ elif p_order is None:
148
+ ar_bond_colors[n][m] = ar_bond_colors[m][n] = broken
149
+ svg.append(
150
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
151
+ f' stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
152
+ )
153
+ else:
154
+ ar_bond_colors[n][m] = ar_bond_colors[m][n] = None
155
+ svg.append(
156
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
157
+ f' stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
158
+ )
159
+ elif order == 2:
160
+ if p_order == 2:
161
+ dx, dy = rv(double_space)
162
+ svg.append(
163
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
164
+ f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}"/>'
165
+ )
166
+ svg.append(
167
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
168
+ f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}"/>'
169
+ )
170
+ elif p_order == 1:
171
+ dx, dy = rv(double_space)
172
+ svg.append(
173
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
174
+ f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}"/>'
175
+ )
176
+ svg.append(
177
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
178
+ f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
179
+ )
180
+ elif p_order == 4:
181
+ ar_bond_colors[n][m] = ar_bond_colors[m][n] = formed
182
+ dx, dy = rv(double_space)
183
+ svg.append(
184
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
185
+ f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}"/>'
186
+ )
187
+ svg.append(
188
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
189
+ f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
190
+ )
191
+ elif p_order == 3:
192
+ dx, dy = rv(triple_space)
193
+ svg.append(
194
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
195
+ f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}"/>'
196
+ )
197
+ svg.append(
198
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"/>'
199
+ )
200
+ svg.append(
201
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
202
+ f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{formed} stroke-width="{wide_width:.2f}""/>'
203
+ )
204
+ elif p_order is None:
205
+ dx, dy = rv(double_space)
206
+ svg.append(
207
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
208
+ f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
209
+ )
210
+ svg.append(
211
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
212
+ f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
213
+ )
214
+ else:
215
+ dx, dy = rv(triple_space)
216
+ svg.append(
217
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" x2="{mx + dx:.2f}"'
218
+ f' y2="{my - dy:.2f}" stroke-dasharray="{dash1:.2f} {dash2:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
219
+ )
220
+ svg.append(
221
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
222
+ f' stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
223
+ )
224
+ svg.append(
225
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
226
+ f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
227
+ )
228
+ elif order == 3:
229
+ if p_order == 3:
230
+ dx, dy = rv(triple_space)
231
+ svg.append(
232
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
233
+ f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}"/>'
234
+ )
235
+ svg.append(
236
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"/>'
237
+ )
238
+ svg.append(
239
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
240
+ f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}"/>'
241
+ )
242
+ elif p_order == 1:
243
+ dx, dy = rv(triple_space)
244
+ svg.append(
245
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
246
+ f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}"/>'
247
+ )
248
+ svg.append(
249
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
250
+ f' stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
251
+ )
252
+ svg.append(
253
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
254
+ f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" '
255
+ f'stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
256
+ )
257
+ elif p_order == 4:
258
+ ar_bond_colors[n][m] = ar_bond_colors[m][n] = formed
259
+ dx, dy = rv(triple_space)
260
+ svg.append(
261
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" x2="{mx + dx:.2f}" '
262
+ f'y2="{my - dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
263
+ )
264
+ svg.append(
265
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"/>'
266
+ )
267
+ svg.append(
268
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" x2="{mx - dx:.2f}" '
269
+ f'y2="{my + dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
270
+ )
271
+ elif p_order == 2:
272
+ dx, dy = rv(triple_space)
273
+ svg.append(
274
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
275
+ f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}"/>'
276
+ )
277
+ svg.append(
278
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"/>'
279
+ )
280
+ svg.append(
281
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
282
+ f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
283
+ )
284
+ elif p_order is None:
285
+ dx, dy = rv(triple_space)
286
+ svg.append(
287
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
288
+ f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
289
+ )
290
+ svg.append(
291
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" '
292
+ f'x2="{mx:.2f}" y2="{my:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
293
+ )
294
+ svg.append(
295
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
296
+ f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
297
+ )
298
+ else:
299
+ dx, dy = rv(double_space)
300
+ dx3 = 3 * dx
301
+ dy3 = 3 * dy
302
+ svg.append(
303
+ f' <line x1="{nx + dx3:.2f}" y1="{ny - dy3:.2f}" x2="{mx + dx3:.2f}" '
304
+ f'y2="{my - dy3:.2f}" stroke-dasharray="{dash1:.2f} {dash2:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
305
+ )
306
+ svg.append(
307
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
308
+ f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
309
+ )
310
+ svg.append(
311
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
312
+ f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
313
+ )
314
+ svg.append(
315
+ f' <line x1="{nx - dx3:.2f}" y1="{ny + dy3:.2f}" x2="{mx - dx3:.2f}" '
316
+ f'y2="{my + dy3:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
317
+ )
318
+ elif order is None:
319
+ if p_order == 1:
320
+ svg.append(
321
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
322
+ f' stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
323
+ )
324
+ elif p_order == 4:
325
+ ar_bond_colors[n][m] = ar_bond_colors[m][n] = formed
326
+ svg.append(
327
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
328
+ f' stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
329
+ )
330
+ elif p_order == 2:
331
+ dx, dy = rv(double_space)
332
+ # dx = dx // 1.4
333
+ # dy = dy // 1.4
334
+ svg.append(
335
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" x2="{mx + dx:.2f}" '
336
+ f'y2="{my - dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
337
+ )
338
+ svg.append(
339
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" x2="{mx - dx:.2f}" '
340
+ f'y2="{my + dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
341
+ )
342
+ elif p_order == 3:
343
+ dx, dy = rv(triple_space)
344
+ svg.append(
345
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
346
+ f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
347
+ )
348
+ svg.append(
349
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
350
+ f' stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
351
+ )
352
+ svg.append(
353
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
354
+ f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
355
+ )
356
+ else:
357
+ svg.append(
358
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}" '
359
+ f'stroke-dasharray="{dash1:.2f} {dash2:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
360
+ )
361
+ else:
362
+ if p_order == 8:
363
+ svg.append(
364
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}" '
365
+ f'stroke-dasharray="{dash1:.2f} {dash2:.2f}"/>'
366
+ )
367
+ elif p_order == 1:
368
+ dx, dy = rv(double_space)
369
+ svg.append(
370
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" x2="{mx + dx:.2f}"'
371
+ f' y2="{my - dy:.2f}" stroke-dasharray="{dash1:.2f} {dash2:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
372
+ )
373
+ svg.append(
374
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
375
+ f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
376
+ )
377
+ elif p_order == 4:
378
+ ar_bond_colors[n][m] = ar_bond_colors[m][n] = None
379
+ svg.append(
380
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
381
+ f' stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
382
+ )
383
+ elif p_order == 2:
384
+ dx, dy = rv(triple_space)
385
+ svg.append(
386
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" x2="{mx + dx:.2f}"'
387
+ f' y2="{my - dy:.2f}" stroke-dasharray="{dash1:.2f} {dash2:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
388
+ )
389
+ svg.append(
390
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
391
+ f' stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
392
+ )
393
+ svg.append(
394
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
395
+ f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
396
+ )
397
+ elif p_order == 3:
398
+ dx, dy = rv(double_space)
399
+ dx3 = 3 * dx
400
+ dy3 = 3 * dy
401
+ svg.append(
402
+ f' <line x1="{nx + dx3:.2f}" y1="{ny - dy3:.2f}" x2="{mx + dx3:.2f}" '
403
+ f'y2="{my - dy3:.2f}" stroke-dasharray="{dash1:.2f} {dash2:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
404
+ )
405
+ svg.append(
406
+ f' <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
407
+ f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
408
+ )
409
+ svg.append(
410
+ f' <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
411
+ f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
412
+ )
413
+ svg.append(
414
+ f' <line x1="{nx - dx3:.2f}" y1="{ny + dy3:.2f}" '
415
+ f'x2="{mx - dx3:.2f}" y2="{my + dy3:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
416
+ )
417
+ else:
418
+ svg.append(
419
+ f' <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}" '
420
+ f'stroke-dasharray="{dash1:.2f} {dash2:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
421
+ )
422
+
423
+ # aromatic rings - unchanged
424
+ for ring in self.aromatic_rings:
425
+ cx = sum(plane[x][0] for x in ring) / len(ring)
426
+ cy = sum(plane[x][1] for x in ring) / len(ring)
427
+
428
+ for n, m in zip(ring, ring[1:]):
429
+ nx, ny = plane[n]
430
+ mx, my = plane[m]
431
+ aromatic = self.__render_aromatic_bond(
432
+ nx, ny, mx, my, cx, cy, ar_bond_colors[n].get(m)
433
+ )
434
+ if aromatic:
435
+ svg.append(aromatic)
436
+
437
+ n, m = ring[-1], ring[0]
438
+ nx, ny = plane[n]
439
+ mx, my = plane[m]
440
+ aromatic = self.__render_aromatic_bond(
441
+ nx, ny, mx, my, cx, cy, ar_bond_colors[n].get(m)
442
+ )
443
+ if aromatic:
444
+ svg.append(aromatic)
445
+ return svg
446
+
447
    def __render_aromatic_bond(self, n_x, n_y, m_x, m_y, c_x, c_y, color):
        """Render the inner dashed line of one aromatic-ring edge as SVG.

        (n_x, n_y)-(m_x, m_y) is the ring edge; (c_x, c_y) is the ring
        centroid, used to offset the dashed line toward the ring interior.

        :param color: stroke color for a dynamically changed aromatic bond;
            ``None`` means "unchanged" and falls back to the plain dashed
            style with normal width.
        :return: an SVG ``<line>`` fragment, or implicitly ``None`` when
            there is no room for the inner line (callers must check for a
            falsy result before appending).
        """
        config = self._render_config

        dash1, dash2 = config["dashes"]
        dash3, dash4 = config["aromatic_dashes"]
        aromatic_space = config["cgr_aromatic_space"]

        # dynamic bonds are drawn at twice the normal stroke width
        normal_width = config.get("bond_width", 0.02)
        wide_width = normal_width * 2

        # translate so that n is the origin ("n aligned" coordinates)
        mn_x, mn_y, cn_x, cn_y = m_x - n_x, m_y - n_y, c_x - n_x, c_y - n_y

        # rotate so that the n->m edge lies on the positive x axis;
        # mr_y is always 0 by construction (kept for symmetry of naming)
        mr_x, mr_y = hypot(mn_x, mn_y), 0
        cr_x, cr_y = rotate_vector(cn_x, cn_y, mn_x, -mn_y)

        # only draw the inner line if the centroid is far enough from the
        # edge for the offset line to fit (ratio heuristic < 0.65)
        if cr_y and aromatic_space / cr_y < 0.65:
            if cr_y > 0:
                r_y = aromatic_space
            else:
                r_y = -aromatic_space
                cr_y = -cr_y

            # endpoints of the shortened inner line in rotated coordinates
            ar_x = aromatic_space * cr_x / cr_y
            br_x = mr_x - aromatic_space * (mr_x - cr_x) / cr_y

            # rotate back into the molecule plane ("backward reorienting")
            an_x, an_y = rotate_vector(ar_x, r_y, mn_x, mn_y)
            bn_x, bn_y = rotate_vector(br_x, r_y, mn_x, mn_y)

            if color:
                # changed aromatic bond: colored, wide, aromatic dash pattern
                return (
                    f'        <line x1="{an_x + n_x:.2f}" y1="{-an_y - n_y:.2f}" x2="{bn_x + n_x:.2f}" '
                    f'y2="{-bn_y - n_y:.2f}" stroke-dasharray="{dash3:.2f} {dash4:.2f}" stroke="{color}" stroke-width="{wide_width:.2f}"/>'
                )
            elif color is None:
                # unchanged aromatic bond: plain dashes, default stroke
                dash3, dash4 = dash1, dash2
                return (
                    f'        <line x1="{an_x + n_x:.2f}" y1="{-an_y - n_y:.2f}"'
                    f' x2="{bn_x + n_x:.2f}" y2="{-bn_y - n_y:.2f}" stroke-dasharray="{dash3:.2f} {dash4:.2f}"/>'
                )
489
+ )
490
+
491
+
492
def cgr_display(cgr: CGRContainer) -> str:
    """Render a CGR as an SVG string with widened DynamicBonds.

    The bond-rendering hooks of ``CGRContainer`` are patched with the
    implementations from ``WideBondDepictCGR`` (which draw dynamic bonds
    with a doubled stroke width), the 2D coordinates of the input CGR are
    recomputed, and its ``depict()`` output is returned.

    NOTE: the patch is applied to the ``CGRContainer`` class itself and is
    not reverted afterwards — every later depiction uses the wide style.

    Args:
        cgr (CGRContainer): The CGR to depict.

    Returns:
        str: SVG markup of the CGR with wider DynamicBonds.
    """
    wide_aromatic = WideBondDepictCGR._WideBondDepictCGR__render_aromatic_bond
    # install the wide renderer under both name-mangled attribute names so
    # lookups from either class resolve to it
    for hook_name in (
        "_CGRContainer__render_aromatic_bond",
        "_WideBondDepictCGR__render_aromatic_bond",
    ):
        setattr(CGRContainer, hook_name, wide_aromatic)
    CGRContainer._render_bonds = WideBondDepictCGR._render_bonds

    cgr.clean2d()
    return cgr.depict()
519
+
520
+
521
class CustomDepictMolecule(DepictMolecule):
    """
    Custom molecule depiction class that uses atom.symbol for rendering.

    Overrides ``_render_atoms`` so that the label drawn for each atom comes
    from ``atom.symbol`` when available (falling back to ``atom.atomic_symbol``),
    and adds defensive handling for attributes that may be missing on
    non-standard atom/container implementations.
    """

    def _render_atoms(self):
        """Build the SVG fragments and masks for all atom labels.

        Returns a tuple ``(svg, mask)`` where ``svg`` is a list of SVG text
        fragments and ``mask`` maps mask layer names ("center", "symbols",
        "span", "other", "aam") to lists of SVG fragments, mirroring the
        contract of ``DepictMolecule._render_atoms``.
        """
        bonds = self._bonds
        plane = self._plane
        charges = self._charges
        radicals = self._radicals
        hydrogens = self._hydrogens
        config = self._render_config

        carbon = config["carbon"]
        mapping = config["mapping"]
        span_size = config["span_size"]
        font_size = config["font_size"]
        monochrome = config["monochrome"]
        other_size = config["other_size"]
        atoms_colors = config["atoms_colors"]
        mapping_font = config["mapping_size"]
        dx_m, dy_m = config["dx_m"], config["dy_m"]
        dx_ci, dy_ci = config["dx_ci"], config["dy_ci"]
        symbols_font_style = config["symbols_font_style"]

        # for cumulenes: middle atoms of cumulated chains must always be drawn
        try:
            # Check if _cumulenes method exists and handle potential errors
            cumulenes = {
                y
                for x in self._cumulenes(heteroatoms=True)
                if len(x) > 2
                for y in x[1:-1]
            }
        except AttributeError:
            cumulenes = set()  # Fallback if _cumulenes is not available or fails

        if monochrome:
            map_fill = other_fill = "black"
        else:
            map_fill = config["mapping_color"]
            other_fill = config["other_color"]

        svg = []
        maps = []
        others = []
        # precomputed font-size fractions used for label positioning below
        font2 = 0.2 * font_size
        font3 = 0.3 * font_size
        font4 = 0.4 * font_size
        font5 = 0.5 * font_size
        font6 = 0.6 * font_size
        font7 = 0.7 * font_size
        font15 = 0.15 * font_size
        font25 = 0.25 * font_size
        mask = defaultdict(list)
        for n, atom in self._atoms.items():
            x, y = plane[n]
            y = -y  # SVG y axis points down; depiction plane points up

            # --- KEY CHANGE HERE ---
            # Use atom.symbol if it exists, otherwise fallback to atomic_symbol
            try:
                symbol = atom.symbol
            except AttributeError:
                symbol = atom.atomic_symbol  # Fallback if .symbol doesn't exist
            # --- END KEY CHANGE ---

            # draw the symbol only when it carries information: heteroatom,
            # explicit carbon mode, charged/radical/isotopic atom, isolated
            # atom (no bonds), or cumulene middle atom
            if (
                not bonds.get(n)
                or symbol != "C"
                or carbon
                or atom.charge
                or atom.is_radical
                or atom.isotope
                or n in cumulenes
            ):  # Added bonds.get(n) check for single atoms
                # Calculate hydrogens if the attribute exists, otherwise default to 0
                try:
                    h = hydrogens[n]
                except (KeyError, AttributeError):
                    h = 0  # Default if _hydrogens is missing or key n is not present

                if h == 1:
                    h_str = "H"
                    span = ""
                elif h and h > 1:  # Check if h is not None and greater than 1
                    span = f'<tspan dy="{config["span_dy"]:.2f}" font-size="{span_size:.2f}">{h}</tspan>'
                    h_str = "H"
                else:
                    h_str = ""
                    span = ""

                # Handle charges and radicals safely
                charge_val = charges.get(n, 0)
                is_radical = radicals.get(n, False)

                if charge_val:
                    t = f'{_render_charge.get(charge_val, "")}{"↑" if is_radical else ""}'  # Use .get for safety
                    if t:  # Only add if charge text is generated
                        others.append(
                            f'        <text x="{x:.2f}" y="{y:.2f}" dx="{dx_ci:.2f}" dy="-{dy_ci:.2f}">'
                            f"{t}</text>"
                        )
                        mask["other"].append(
                            f'            <text x="{x:.2f}" y="{y:.2f}" dx="{dx_ci:.2f}" dy="-{dy_ci:.2f}">'
                            f"{t}</text>"
                        )
                elif is_radical:
                    others.append(
                        f'        <text x="{x:.2f}" y="{y:.2f}" dx="{dx_ci:.2f}" dy="-{dy_ci:.2f}">↑</text>'
                    )
                    mask["other"].append(
                        f'            <text x="{x:.2f}" y="{y:.2f}" dx="{dx_ci:.2f}"'
                        f' dy="-{dy_ci:.2f}">↑</text>'
                    )

                # Handle isotope safely
                try:
                    iso = atom.isotope
                    if iso:
                        t = iso
                        others.append(
                            f'        <text x="{x:.2f}" y="{y:.2f}" dx="-{dx_ci:.2f}" dy="-{dy_ci:.2f}" '
                            f'text-anchor="end">{t}</text>'
                        )
                        mask["other"].append(
                            f'            <text x="{x:.2f}" y="{y:.2f}" dx="-{dx_ci:.2f}"'
                            f' dy="-{dy_ci:.2f}" text-anchor="end">{t}</text>'
                        )
                except AttributeError:
                    pass  # Atom might not have isotope attribute

                # Determine atom color based on atomic_number, default to black if monochrome or not found
                atom_color = "black"
                if not monochrome:
                    try:
                        an = atom.atomic_number
                        if 0 < an <= len(atoms_colors):
                            atom_color = atoms_colors[an - 1]
                        else:
                            atom_color = atoms_colors[
                                5
                            ]  # Default to Carbon color if out of range
                    except AttributeError:
                        atom_color = atoms_colors[
                            5
                        ]  # Default to Carbon color if no atomic_number

                svg.append(
                    f'      <g fill="{atom_color}" '
                    f'font-family="{symbols_font_style }">'
                )

                # Adjust dx based on symbol length for better centering
                if len(symbol) > 1:
                    dx = font7
                    dx_mm = dx_m + font5
                    if symbol[-1].lower() in (
                        "l",
                        "i",
                        "r",
                        "t",
                    ):  # Heuristic for narrow last letters
                        rx = font6
                        ax = font25
                    else:
                        rx = font7
                        ax = font15
                    mask["center"].append(
                        f'          <ellipse cx="{x - ax:.2f}" cy="{y:.2f}" rx="{rx}" ry="{font4}"/>'
                    )
                else:
                    if symbol == "I":  # Special case for 'I'
                        dx = font15
                        dx_mm = dx_m
                    else:  # Single character
                        dx = font4
                        dx_mm = dx_m + font2
                    mask["center"].append(
                        f'          <circle cx="{x:.2f}" cy="{y:.2f}" r="{font4:.2f}"/>'
                    )

                svg.append(
                    f'        <text x="{x:.2f}" y="{y:.2f}" dx="-{dx:.2f}" dy="{font4:.2f}" '
                    f'font-size="{font_size:.2f}">{symbol}{h_str}{span}</text>'
                )
                mask["symbols"].append(
                    f'            <text x="{x:.2f}" y="{y:.2f}" dx="-{dx:.2f}" '
                    f'dy="{font4:.2f}">{symbol}{h_str}</text>'
                )
                if span:
                    mask["span"].append(
                        f'            <text x="{x:.2f}" y="{y:.2f}" dx="-{dx:.2f}" dy="{font4:.2f}">'
                        f"{symbol}{h_str}{span}</text>"
                    )
                svg.append("      </g>")

                if mapping:
                    maps.append(
                        f'        <text x="{x:.2f}" y="{y:.2f}" dx="-{dx_mm:.2f}" dy="{dy_m + font3:.2f}" '
                        f'text-anchor="end">{n}</text>'
                    )
                    mask["aam"].append(
                        f'            <text x="{x:.2f}" y="{y:.2f}" dx="-{dx_mm:.2f}" '
                        f'dy="{dy_m + font3:.2f}" text-anchor="end">{n}</text>'
                    )

            elif mapping:
                # Determine dx_mm for mapping based on symbol length even if atom itself isn't drawn
                if len(symbol) > 1:
                    dx_mm = dx_m + font5
                else:
                    dx_mm = dx_m + font2 if symbol != "I" else dx_m

                maps.append(
                    f'        <text x="{x:.2f}" y="{y:.2f}" dx="-{dx_mm:.2f}" dy="{dy_m:.2f}" '
                    f'text-anchor="end">{n}</text>'
                )
                mask["aam"].append(
                    f'            <text x="{x:.2f}" y="{y:.2f}" dx="-{dx_mm:.2f}" dy="{dy_m:.2f}" '
                    f'text-anchor="end">{n}</text>'
                )
        if others:
            svg.append(
                f'      <g font-family="{config["other_font_style"]}" fill="{other_fill}" '
                f'font-size="{other_size:.2f}">'
            )
            svg.extend(others)
            svg.append("      </g>")
        if mapping:
            svg.append(f'      <g fill="{map_fill}" font-size="{mapping_font:.2f}">')
            svg.extend(maps)
            svg.append("      </g>")
        return svg, mask
755
+
756
+
757
def depict_custom_reaction(reaction: ReactionContainer):
    """
    Depicts a ReactionContainer using custom atom rendering logic (replace At to X).

    Temporarily rebinds the class of each molecule in the reaction to a
    dynamically created subclass whose MRO puts ``CustomDepictMolecule``
    first, so its ``_render_atoms`` (which uses ``atom.symbol``) takes
    precedence during ``depict()``. The original classes are restored in a
    ``finally`` block even if depiction fails. Individual molecule
    depictions, the reaction arrow and the "+" signs are then combined
    into a single SVG document.

    Args:
        reaction (ReactionContainer): The reaction to depict.

    Returns:
        str: SVG markup of the reaction with custom atom rendering.
    """
    if not reaction._arrow:
        reaction.fix_positions()  # Ensure positions are calculated

    r_atoms = []
    r_bonds = []
    r_masks = []
    r_max_x = r_max_y = r_min_y = 0
    original_classes = {}  # Store original classes to restore later

    try:
        # Temporarily change the class of molecules to use the custom depiction
        for mol in reaction.molecules():
            if isinstance(mol, (MoleculeContainer, CGRContainer)):
                original_classes[mol] = mol.__class__
                custom_class_name = (
                    f"TempCustom_{mol.__class__.__name__}_{uuid4().hex}"  # Unique name
                )
                # Combine custom depiction with original class methods
                # Ensure the custom _render_atoms takes precedence
                new_bases = (CustomDepictMolecule,) + original_classes[mol].__bases__
                # Filter out DepictMolecule if it's already a base to avoid MRO issues
                new_bases = tuple(b for b in new_bases if b is not DepictMolecule)
                # If DepictMolecule wasn't a direct base, ensure its methods are accessible
                if CustomDepictMolecule not in original_classes[mol].__mro__:
                    # Prioritize CustomDepictMolecule's methods
                    new_bases = (CustomDepictMolecule, original_classes[mol])
                else:
                    # If DepictMolecule was a base, CustomDepictMolecule is already first
                    new_bases = (CustomDepictMolecule,) + tuple(
                        b
                        for b in original_classes[mol].__bases__
                        if b is not DepictMolecule
                    )

                # Create the temporary class
                mol.__class__ = type(custom_class_name, new_bases, {})

            # Depict using the (potentially) modified class
            atoms, bonds, masks, min_x, min_y, max_x, max_y = mol.depict(embedding=True)
            r_atoms.append(atoms)
            r_bonds.append(bonds)
            r_masks.append(masks)
            # track the overall bounding box across all molecules
            if max_x > r_max_x:
                r_max_x = max_x
            if max_y > r_max_y:
                r_max_y = max_y
            if min_y < r_min_y:
                r_min_y = min_y

    finally:
        # Restore original classes
        for mol, original_class in original_classes.items():
            mol.__class__ = original_class

    config = DepictMolecule._render_config  # Access via the imported class

    font_size = config["font_size"]
    font125 = 1.25 * font_size
    # overall canvas size with margins proportional to the font size
    width = r_max_x + 3.0 * font_size
    height = r_max_y - r_min_y + 2.5 * font_size
    viewbox_x = -font125
    viewbox_y = -r_max_y - font125

    svg = [
        f'<svg width="{width:.2f}cm" height="{height:.2f}cm" '
        f'viewBox="{viewbox_x:.2f} {viewbox_y:.2f} {width:.2f} '
        f'{height:.2f}" xmlns="http://www.w3.org/2000/svg" version="1.1">\n'
        '  <defs>\n    <marker id="arrow" markerWidth="10" markerHeight="10" '
        'refX="0" refY="3" orient="auto">\n      <path d="M0,0 L0,6 L9,3"/>\n    </marker>\n  </defs>\n'
        f'  <line x1="{reaction._arrow[0]:.2f}" y1="0" x2="{reaction._arrow[1]:.2f}" y2="0" '
        'fill="none" stroke="black" stroke-width=".04" marker-end="url(#arrow)"/>'
    ]

    sings_plus = reaction._signs
    if sings_plus:
        svg.append(f'  <g fill="none" stroke="black" stroke-width=".04">')
        # draw a "+" between adjacent molecules at each sign position
        for x in sings_plus:
            svg.append(
                f'    <line x1="{x + .35:.2f}" y1="0" x2="{x + .65:.2f}" y2="0"/>'
            )
            svg.append(
                f'    <line x1="{x + .5:.2f}" y1="0.15" x2="{x + .5:.2f}" y2="-0.15"/>'
            )
        svg.append("  </g>")

    for atoms, bonds, masks in zip(r_atoms, r_bonds, r_masks):
        # Use the static method from Depict directly
        svg.extend(
            Depict._graph_svg(atoms, bonds, masks, viewbox_x, viewbox_y, width, height)
        )
    svg.append("</svg>")
    return "\n".join(svg)
870
+
871
+
872
def remove_and_shift(nested_dict, to_remove):  # Under development
    """
    Removes specified inner keys from a nested dictionary and renumbers the rest.

    For each inner dictionary of ``nested_dict``, keys listed in ``to_remove``
    are dropped, and every surviving (integer) key is shifted down by the
    number of removed keys smaller than it, closing the gaps left by the
    removals. The input dictionary is not modified.

    Args:
        nested_dict (dict): The input nested dictionary (dict of dicts);
            inner keys must be mutually comparable (typically ints).
        to_remove (list): Keys to remove from the inner dictionaries; keys
            absent from an inner dict still contribute to the shift.

    Returns:
        dict: A new nested dictionary with the specified keys removed and
        the remaining inner keys renumbered.
    """
    from bisect import bisect_left

    # sort the removals once so the per-key shift is an O(log R) lookup
    # instead of rescanning the whole removal set for every key
    removed_sorted = sorted(set(to_remove))
    rem_set = set(removed_sorted)

    result = {}
    for outer_k, inner in nested_dict.items():
        new_inner = {}
        for old_k, v in inner.items():
            if old_k in rem_set:
                continue
            # bisect_left == number of removed keys strictly below old_k
            new_inner[old_k - bisect_left(removed_sorted, old_k)] = v
        result[outer_k] = new_inner
    return result
synplan/chem/reaction_rules/__init__.py ADDED
File without changes
synplan/chem/reaction_rules/extraction.py ADDED
@@ -0,0 +1,744 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing functions for protocol of reaction rules extraction."""
2
+
3
+ import logging
4
+ import pickle
5
+ from collections import defaultdict
6
+ from itertools import islice
7
+ from os.path import splitext
8
+ from typing import Dict, List, Set, Tuple
9
+
10
+ import ray
11
+ from chython import smarts
12
+ from chython import QueryContainer as QueryContainerChython
13
+ from CGRtools.containers.cgr import CGRContainer
14
+ from CGRtools.containers.molecule import MoleculeContainer
15
+ from CGRtools.containers.query import QueryContainer
16
+ from CGRtools.containers.reaction import ReactionContainer
17
+ from CGRtools.exceptions import InvalidAromaticRing
18
+ from CGRtools.reactor import Reactor
19
+ from tqdm import tqdm
20
+
21
+ from synplan.chem.data.standardizing import RemoveReagentsStandardizer
22
+ from synplan.chem.utils import (
23
+ reverse_reaction,
24
+ cgrtools_to_chython_molecule,
25
+ chython_query_to_cgrtools,
26
+ )
27
+ from synplan.utils.config import RuleExtractionConfig
28
+ from synplan.utils.files import ReactionReader
29
+
30
+
31
def add_environment_atoms(
    cgr: CGRContainer, center_atoms: Set[int], environment_atom_count: int
) -> Set[int]:
    """
    Extends the reaction-center atom set with its surrounding environment.

    :param cgr: A condensed graph representation of a reaction
        (CGRContainer object).
    :param center_atoms: Atom ids of the reaction center.
    :param environment_atom_count: Depth of the environment to include
        around the reaction center: 0 keeps only the center itself, 1 adds
        the first shell of neighbours, and so on.

    :return: Atom ids of the center plus all environment atoms up to the
        requested depth; the original set is returned unchanged when the
        depth is 0.
    """
    if not environment_atom_count:
        # depth 0: the rule is restricted to the reaction centre itself
        return center_atoms

    # the augmented substructure contains the centre plus `deep` shells
    neighbourhood = cgr.augmented_substructure(
        center_atoms, deep=environment_atom_count
    )
    return center_atoms | set(neighbourhood)
58
+
59
+
60
def add_functional_groups(
    reaction: ReactionContainer,
    center_atoms: Set[int],
    func_groups_list: List[QueryContainerChython],
) -> Set[int]:
    """
    Augments the set of reaction rule atoms with matched functional groups.

    Each query in ``func_groups_list`` is searched in every molecule of the
    reaction; any match that overlaps the reaction center contributes all of
    its atoms to the rule.

    :param reaction: The reaction object (ReactionContainer) from which
        molecules are extracted.
    :param center_atoms: A set of atom ids corresponding to the center atoms
        of the reaction.
    :param func_groups_list: Functional-group query objects (chython
        QueryContainer) defining the substructures to include.

    :return: A set of atom ids including the center atoms plus the atoms of
        every matched functional group that intersects the center.

    NOTE(review): the queries in ``func_groups_list`` are remapped in place
    during matching and remapped back afterwards — this function is not safe
    to call concurrently with shared query objects.
    """

    rule_atoms = center_atoms.copy()
    # iterate over each molecule in the reaction
    for molecule in reaction.molecules():
        # matching is done in chython space, so convert the CGRtools molecule
        molecule_chython = cgrtools_to_chython_molecule(molecule)
        # for each functional group specified in the list
        for func_group in func_groups_list:
            # find mappings of the functional group in the molecule
            for mapping in func_group.get_mapping(molecule_chython):
                # remap the functional group based on the found mapping
                func_group.remap(mapping)
                # if the functional group intersects with center atoms, include it
                if set(func_group.atoms_numbers) & center_atoms:
                    rule_atoms |= set(func_group.atoms_numbers)
                # reset the mapping to its original state for the next iteration
                func_group.remap({v: k for k, v in mapping.items()})
    return rule_atoms
98
+
99
+
100
def add_ring_structures(cgr: CGRContainer, rule_atoms: Set[int]) -> Set[int]:
    """
    Absorbs whole rings into the rule atoms when they touch the reaction center.

    Every ring of the CGR's smallest set of smallest rings that shares at
    least one atom with ``rule_atoms`` is included in full.

    :param cgr: A condensed graph representation of a reaction
        (CGRContainer object).
    :param rule_atoms: Atom ids of the reaction center; this set is updated
        in place and also returned.

    :return: The rule atoms together with all intersecting ring atoms.
    """
    for ring in cgr.sssr:
        ring_atoms = set(ring)
        # a ring that shares any atom with the rule is absorbed entirely
        if ring_atoms & rule_atoms:
            rule_atoms |= ring_atoms
    return rule_atoms
119
+
120
+
121
def add_leaving_incoming_groups(
    reaction: ReactionContainer,
    rule_atoms: Set[int],
    keep_leaving_groups: bool,
    keep_incoming_groups: bool,
) -> Tuple[Set[int], Dict[str, Set]]:
    """
    Identifies and includes leaving and incoming groups in the rule atoms based
    on the specified flags.

    :param reaction: The reaction object (ReactionContainer) from which leaving
        and incoming groups are extracted.
    :param rule_atoms: A set of atom ids corresponding to the current rule
        atoms. The set is not modified (the previous implementation mutated it
        in place).
    :param keep_leaving_groups: Whether to include leaving groups (reactant
        atoms absent from the products) in the rule.
    :param keep_incoming_groups: Whether to include incoming groups (product
        atoms absent from the reactants) in the rule.

    :return: A tuple of the extended set of rule atoms and a metadata dict with
        keys "leaving" and "incoming" listing only the atoms newly added here.
    """
    meta_debug = {"leaving": set(), "incoming": set()}

    # work on a copy so the caller's set is never mutated in place
    rule_atoms = set(rule_atoms)

    # collect all atoms appearing on each side of the reaction
    reactant_atoms = {atom for reactant in reaction.reactants for atom in reactant}
    product_atoms = {atom for product in reaction.products for atom in product}

    if keep_leaving_groups:
        # leaving groups: reactant atoms that do not appear in the products;
        # record only the ones not already part of the rule, then merge
        leaving_atoms = reactant_atoms - product_atoms
        meta_debug["leaving"] |= leaving_atoms - rule_atoms
        rule_atoms |= leaving_atoms

    if keep_incoming_groups:
        # incoming groups: product atoms that do not appear in the reactants
        incoming_atoms = product_atoms - reactant_atoms
        meta_debug["incoming"] |= incoming_atoms - rule_atoms
        rule_atoms |= incoming_atoms

    return rule_atoms, meta_debug
170
+
171
+
172
def clean_molecules(
    rule_molecules: List[MoleculeContainer],
    reaction_molecules: Tuple[MoleculeContainer],
    reaction_center_atoms: Set[int],
    atom_retention_details: Dict[str, Dict[str, bool]],
) -> List[QueryContainer]:
    """
    Cleans rule molecules by removing selected atom query marks according to the
    retention details provided.

    Each rule molecule is matched (by atom-number inclusion) against the first
    reaction molecule that contains it; the rule substructure is re-created as a
    query from that reaction molecule and then cleaned atom by atom.

    :param rule_molecules: Substructures representing the rule molecules.
    :param reaction_molecules: Molecule containers involved in the reaction.
    :param reaction_center_atoms: Atom ids belonging to the reaction center.
    :param atom_retention_details: A dictionary with two keys,
        "reaction_center" and "environment", each mapping to a dict of atom
        attribute names ("neighbors", "hybridization", "implicit_hydrogens",
        "ring_sizes") to booleans. True keeps the attribute, False removes it
        from the atom.

    :return: A list of QueryContainer objects representing the cleaned rule
        molecules.
    """
    cleaned_rule_molecules = []

    for rule_molecule in rule_molecules:
        for reaction_molecule in reaction_molecules:
            # the reaction molecule that fully contains the rule molecule's atoms
            if set(rule_molecule.atoms_numbers) <= set(reaction_molecule.atoms_numbers):
                # convert the whole reaction molecule to a query first so the
                # rule substructure inherits query marks from the full context
                query_reaction_molecule = reaction_molecule.substructure(
                    reaction_molecule, as_query=True
                )
                query_rule_molecule = query_reaction_molecule.substructure(
                    rule_molecule
                )

                # clean reaction center atoms
                if not all(
                    atom_retention_details["reaction_center"].values()
                ):  # if everything True, we keep all marks
                    local_reaction_center_atoms = (
                        set(rule_molecule.atoms_numbers) & reaction_center_atoms
                    )
                    for atom_number in local_reaction_center_atoms:
                        query_rule_molecule = clean_atom(
                            query_rule_molecule,
                            atom_retention_details["reaction_center"],
                            atom_number,
                        )

                # clean environment atoms
                if not all(
                    atom_retention_details["environment"].values()
                ):  # if everything True, we keep all marks
                    local_environment_atoms = (
                        set(rule_molecule.atoms_numbers) - reaction_center_atoms
                    )
                    for atom_number in local_environment_atoms:
                        query_rule_molecule = clean_atom(
                            query_rule_molecule,
                            atom_retention_details["environment"],
                            atom_number,
                        )

                cleaned_rule_molecules.append(query_rule_molecule)
                # only the first containing reaction molecule is used
                break

    return cleaned_rule_molecules
240
+
241
+
242
def clean_atom(
    query_molecule: QueryContainer,
    attributes_to_keep: Dict[str, bool],
    atom_number: int,
) -> QueryContainer:
    """
    Strips selected query marks from a single atom of a query molecule.

    :param query_molecule: The QueryContainer holding the atom to modify.
    :param attributes_to_keep: Mapping from attribute name to a boolean flag.
        Expected keys: "neighbors", "hybridization", "implicit_hydrogens",
        "ring_sizes". True retains the mark, False erases it (sets it to None)
        on the atom.
    :param atom_number: The number of the atom to modify.

    :return: The same query molecule, modified in place.
    """
    atom = query_molecule.atom(atom_number)

    # erase every mark whose retention flag is False
    for attribute in ("neighbors", "hybridization", "implicit_hydrogens", "ring_sizes"):
        if not attributes_to_keep[attribute]:
            setattr(atom, attribute, None)

    return query_molecule
275
+
276
+
277
def create_substructures_and_reagents(
    reaction: ReactionContainer,
    rule_atoms: Set[int],
    as_query_container: bool,
    keep_reagents: bool,
) -> Tuple[List[MoleculeContainer], List[MoleculeContainer], List]:
    """
    Builds reactant and product substructures restricted to the rule atoms and,
    optionally, collects the reaction's reagents.

    :param reaction: The reaction object (ReactionContainer) to extract
        substructures from.
    :param rule_atoms: Atom ids of the rule; only molecules containing at least
        one of these atoms contribute a substructure.
    :param as_query_container: If True, reagents are converted to query
        containers (reactant/product substructures are converted elsewhere).
    :param keep_reagents: If True, reagents are included in the result;
        otherwise an empty list is returned for them.

    :return: A tuple of (reactant substructures, product substructures,
        reagents).
    """

    def _rule_parts(molecules):
        # substructure of each molecule restricted to the rule atoms it contains
        parts = []
        for molecule in molecules:
            selected = rule_atoms.intersection(molecule.atoms_numbers)
            if selected:
                parts.append(molecule.substructure(selected))
        return parts

    reactant_substructures = _rule_parts(reaction.reactants)
    product_substructures = _rule_parts(reaction.products)

    if not keep_reagents:
        reagents = []
    elif as_query_container:
        reagents = [
            reagent.substructure(reagent, as_query=True)
            for reagent in reaction.reagents
        ]
    else:
        reagents = reaction.reagents

    return reactant_substructures, product_substructures, reagents
328
+
329
+
330
def assemble_final_rule(
    reactant_substructures: List[QueryContainer],
    product_substructures: List[QueryContainer],
    reagents: List,
    meta_debug: Dict[str, Set],
    keep_metadata: bool,
    reaction: ReactionContainer,
) -> ReactionContainer:
    """
    Assembles the final reaction rule from the provided substructures and
    metadata.

    :param reactant_substructures: Rule-relevant substructures of the
        reactants.
    :param product_substructures: Rule-relevant substructures of the products.
    :param reagents: Reagents to attach to the rule (possibly empty).
    :param meta_debug: Additional metadata about the reaction, such as leaving
        and incoming groups.
    :param keep_metadata: Whether to attach ``meta_debug``, the reaction's meta
        and its name to the rule.
    :param reaction: The original reaction object (ReactionContainer) the rule
        is created from.

    :return: A ReactionContainer representing the assembled rule with its cache
        flushed.
    """
    if keep_metadata:
        # merge on a fresh dict: the previous implementation aliased meta_debug
        # and mutated the caller's dictionary when merging reaction.meta into it
        rule_metadata = {**meta_debug, **reaction.meta}
    else:
        rule_metadata = {}

    rule = ReactionContainer(
        reactant_substructures, product_substructures, reagents, rule_metadata
    )

    if keep_metadata:
        rule.name = reaction.name

    # drop cached derived data so the rule is re-evaluated from its new content
    rule.flush_cache()
    return rule
378
+
379
+
380
def validate_rule(rule: ReactionContainer, reaction: ReactionContainer) -> bool:
    """
    Validates a reaction rule by re-applying it to the original reactants.

    A Reactor is built from the rule and applied to the reaction's reactants.
    Generated products that cannot be kekulized or that have valence errors are
    discarded; the surviving products are compared with the products of the
    original reaction. The rule is valid if some generated reaction reproduces
    the original product set with the same number of products.

    :param rule: The reaction rule (ReactionContainer) to be validated.
    :param reaction: The original reaction (ReactionContainer) against which
        the rule is validated.

    :return: True if the rule correctly regenerates the products from the
        reactants, False otherwise (including when the reactor raises
        KeyError or IndexError).
    """

    # create a reactor with the given rule
    reactor = Reactor(rule)
    try:
        for result_reaction in reactor(reaction.reactants):
            result_products = []
            for result_product in result_reaction.products:
                # validate a copy so the generated product itself stays intact
                tmp = result_product.copy()
                try:
                    tmp.kekule()
                    # a truthy check_valence() result reports valence errors:
                    # discard this product
                    if tmp.check_valence():
                        continue
                except InvalidAromaticRing:
                    # non-kekulizable aromatic system: discard this product
                    continue
                result_products.append(result_product)
            # require equality both as sets and in count of products
            if set(reaction.products) == set(result_products) and len(
                reaction.products
            ) == len(result_products):
                return True

    except (KeyError, IndexError):
        # KeyError - iteration over reactor is finished and products are different from the original reaction
        # IndexError - mistake in __contract_ions, possibly problems with charges in reaction rule
        return False

    return False
428
+
429
+
430
def create_rule(
    config: RuleExtractionConfig, reaction: ReactionContainer
) -> ReactionContainer:
    """
    Creates a reaction rule from a given reaction based on the specified
    configuration. The function handles the inclusion of environmental atoms,
    functional groups, ring structures, and leaving/incoming groups, builds
    substructures for reactants, products and reagents, cleans atom query marks
    if required, and optionally validates the rule with a reactor.

    :param config: Configuration (RuleExtractionConfig) controlling rule
        creation: environment atom count, inclusion of functional groups,
        rings, leaving/incoming groups, query conversion, metadata handling,
        rule reversal and reactor validation.
    :param reaction: The reaction object (ReactionContainer) to create the rule
        from.

    :return: A ReactionContainer representing the extracted reaction rule; when
        reactor validation is enabled, its meta contains a
        "reactor_validation" entry ("passed" or "failed").
    """

    # 1. create reaction CGR (~ composes the reaction into a CGRContainer)
    cgr = ~reaction
    center_atoms = set(cgr.center_atoms)

    # 2. add atoms of reaction environment based on config settings
    center_atoms = add_environment_atoms(
        cgr, center_atoms, config.environment_atom_count
    )

    # 3. include functional groups in the rule if specified in config
    if config.include_func_groups and config.func_groups_list:
        rule_atoms = add_functional_groups(
            reaction, center_atoms, config.func_groups_list
        )
    else:
        rule_atoms = center_atoms.copy()

    # 4. include ring structures in the rule if specified in config
    if config.include_rings:
        rule_atoms = add_ring_structures(cgr, rule_atoms)

    # 5. add leaving and incoming groups to the rule based on config settings
    rule_atoms, meta_debug = add_leaving_incoming_groups(
        reaction, rule_atoms, config.keep_leaving_groups, config.keep_incoming_groups
    )

    # 6. create substructures for reactants, products, and reagents
    reactant_substructures, product_substructures, reagents = (
        create_substructures_and_reagents(
            reaction, rule_atoms, config.as_query_container, config.keep_reagents
        )
    )
    # 7. clean atom marks in the molecules if they are being converted to query containers
    if config.as_query_container:
        reactant_substructures = clean_molecules(
            reactant_substructures,
            reaction.reactants,
            center_atoms,
            config.atom_info_retention,
        )

        product_substructures = clean_molecules(
            product_substructures,
            reaction.products,
            center_atoms,
            config.atom_info_retention,
        )

    # 8. assemble the final rule including metadata if specified
    rule = assemble_final_rule(
        reactant_substructures,
        product_substructures,
        reagents,
        meta_debug,
        config.keep_metadata,
        reaction,
    )

    # 9. reverse extracted reaction rule and reaction (retro direction);
    # the reaction is reversed too so validation below stays consistent
    if config.reverse_rule:
        rule = reverse_reaction(rule)
        reaction = reverse_reaction(reaction)

    # 10. validate the rule using a reactor if validation is enabled in config
    if config.reactor_validation:
        if validate_rule(rule, reaction):
            rule.meta["reactor_validation"] = "passed"
        else:
            rule.meta["reactor_validation"] = "failed"

    return rule
525
+
526
+
527
def extract_rules(
    config: RuleExtractionConfig, reaction: ReactionContainer
) -> List[ReactionContainer]:
    """
    Extracts reaction rules from a given reaction based on the specified
    configuration.

    :param config: Configuration (RuleExtractionConfig) for rule extraction:
        multicenter handling, functional groups, rings, leaving/incoming
        groups, etc.
    :param reaction: The reaction object (ReactionContainer) to extract rules
        from.

    :return: A list of ReactionContainer rules. With
        ``config.multicenter_rules`` a single rule covering all reaction
        centers is returned; otherwise one rule per distinct reaction center,
        limited to the first 15 centers.
    """
    # reagents are removed here; they are re-attached later only if they are
    # part of the reaction rule specification
    reaction = RemoveReagentsStandardizer()(reaction)

    if config.multicenter_rules:
        # a single rule encompassing all reaction centers
        return [create_rule(config, reaction)]

    # one rule per reaction center (at most 15), with duplicates collapsed
    unique_rules = {
        create_rule(config, single_center_reaction)
        for single_center_reaction in islice(reaction.enumerate_centers(), 15)
    }
    return list(unique_rules)
564
+
565
+
566
@ray.remote
def process_reaction_batch(
    batch: List[Tuple[int, ReactionContainer]], config: RuleExtractionConfig
) -> List[Tuple[int, List[ReactionContainer]]]:
    """
    Ray remote task: extracts reaction rules for every reaction in a batch.

    Each reaction is paired with its index in the larger dataset so results can
    be traced back after parallel processing. Reactions that fail extraction
    are logged at debug level and silently skipped, so one bad reaction cannot
    abort the whole batch.

    :param batch: A list of (index, ReactionContainer) tuples; the index keeps
        track of the reaction's position in the dataset.
    :param config: Settings (RuleExtractionConfig) for the rule extraction
        process.

    :return: A list of (index, extracted rules) tuples, one entry per reaction
        that was processed successfully.
    """

    extracted_rules_list = []
    for index, reaction in batch:
        try:
            extracted_rules = extract_rules(config, reaction)
            extracted_rules_list.append((index, extracted_rules))
        except Exception as e:
            # best-effort batch processing: record the failure and move on
            logging.debug(e)
            continue
    return extracted_rules_list
600
+
601
+
602
def process_completed_batch(
    futures: Dict,
    rules_statistics: Dict,
) -> None:
    """
    Blocks until one pending Ray batch finishes and merges its results into the
    rule statistics.

    For every rule extracted from the completed batch, the index of the
    reaction that produced it is appended to that rule's statistics list. The
    first time a rule is encountered, the originating reaction index is also
    stored in the rule's metadata. The completed future is removed from
    ``futures``.

    :param futures: A dictionary whose keys are pending Ray object refs of
        batch-processing tasks.
    :param rules_statistics: Mapping from rule to the list of reaction indices
        that produced it (a defaultdict(list) in the caller).
    :return: None
    """

    ready_id, running_id = ray.wait(list(futures.keys()), num_returns=1)
    completed_batch = ray.get(ready_id[0])
    for index, extracted_rules in completed_batch:
        for rule in extracted_rules:
            prev_stats_len = len(rules_statistics)
            rules_statistics[rule].append(index)
            # growth of the mapping means this rule was seen for the first time
            if len(rules_statistics) != prev_stats_len:
                rule.meta["first_reaction_index"] = index

    del futures[ready_id[0]]
629
+
630
+
631
def sort_rules(
    rules_stats: Dict, min_popularity: int, single_reactant_only: bool
) -> List[Tuple[ReactionContainer, List[int]]]:
    """
    Filters and sorts reaction rules by popularity.

    A rule is kept only if it was applied at least ``min_popularity`` times,
    passed reactor validation, and — when ``single_reactant_only`` is set —
    has exactly one reactant. The surviving rules are ordered by decreasing
    number of applications.

    :param rules_stats: Mapping from rule to the list of reaction indices where
        the rule was applied.
    :param min_popularity: Minimum number of applications for a rule to be
        kept.
    :param single_reactant_only: If True, keep only rules with a single
        reactant.

    :return: A list of (rule, indices) tuples sorted by descending popularity.
    """

    def _is_selected(item):
        # short-circuit in the same order as the filtering criteria above
        rule, applications = item
        if len(applications) < min_popularity:
            return False
        if rule.meta["reactor_validation"] != "passed":
            return False
        return not single_reactant_only or len(rule.reactants) == 1

    selected = [item for item in rules_stats.items() if _is_selected(item)]
    selected.sort(key=lambda item: len(item[1]), reverse=True)
    return selected
666
+
667
+
668
def extract_rules_from_reactions(
    config: RuleExtractionConfig,
    reaction_data_path: str,
    reaction_rules_path: str,
    num_cpus: int,
    batch_size: int,
) -> None:
    """
    Extracts reaction rules from a reaction database using Ray for parallel
    batch processing. Reactions are streamed from the input file, grouped into
    batches and dispatched to remote workers; completed batches feed the rule
    statistics. Finally the rules are filtered/sorted by popularity and dumped
    to a pickle file.

    :param config: Configuration (RuleExtractionConfig) for rule extraction,
        including min_popularity and single_reactant_only used for sorting.
    :param reaction_data_path: Path to the file containing the reaction
        database.
    :param reaction_rules_path: Output path; its extension is replaced with
        ".pickle" for the statistics dump.
    :param num_cpus: Number of CPU cores to use for processing.
    :param batch_size: Number of reactions to process in each batch.
    :return: None
    """

    ray.init(num_cpus=num_cpus, ignore_reinit_error=True, logging_level=logging.ERROR)

    # strip the extension; ".pickle" is appended for the output file below
    reaction_rules_path, _ = splitext(reaction_rules_path)
    with ReactionReader(reaction_data_path) as reactions:

        futures = {}
        batch = []
        # bound the number of in-flight batches to the number of workers
        max_concurrent_batches = num_cpus
        extracted_rules_and_statistics = defaultdict(list)

        for index, reaction in tqdm(
            enumerate(reactions),
            desc="Number of reactions processed: ",
            bar_format="{desc}{n} [{elapsed}]",
        ):

            # reaction ready to use
            batch.append((index, reaction))
            if len(batch) == batch_size:
                future = process_reaction_batch.remote(batch, config)

                futures[future] = None
                batch = []

                # throttle submission: drain completions while the pipeline is full
                while len(futures) >= max_concurrent_batches:
                    process_completed_batch(
                        futures,
                        extracted_rules_and_statistics,
                    )

        # submit the final, possibly smaller-than-batch_size batch
        if batch:
            future = process_reaction_batch.remote(batch, config)
            futures[future] = None

        # drain all remaining in-flight batches
        while futures:
            process_completed_batch(
                futures,
                extracted_rules_and_statistics,
            )

        sorted_rules = sort_rules(
            extracted_rules_and_statistics,
            min_popularity=config.min_popularity,
            single_reactant_only=config.single_reactant_only,
        )

        ray.shutdown()

    with open(f"{reaction_rules_path}.pickle", "wb") as statistics_file:
        pickle.dump(sorted_rules, statistics_file)

    print(f"Number of extracted reaction rules: {len(sorted_rules)}")
synplan/chem/reaction_rules/manual/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Aggregates the manually curated ("hardcoded") reaction rules from the two
# rule modules of this package.
from .decompositions import rules as d_rules
from .transformations import rules as t_rules

# transformation rules come first, then decomposition rules
hardcoded_rules = t_rules + d_rules

__all__ = ["hardcoded_rules"]
synplan/chem/reaction_rules/manual/decompositions.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing hardcoded decomposition reaction rules."""
2
+
3
+ from CGRtools import QueryContainer, ReactionContainer
4
+ from CGRtools.periodictable import ListElement
5
+
6
+ rules = []
7
+
8
+
9
def prepare():
    """Allocates one reactant query and two product queries, registers the
    corresponding one-reactant/two-product rule in the module-level ``rules``
    list, and returns the three empty containers for population by the
    caller."""
    reactant_query = QueryContainer()
    first_product = QueryContainer()
    second_product = QueryContainer()
    rules.append(ReactionContainer((reactant_query,), (first_product, second_product)))

    return reactant_query, first_product, second_product
18
+
19
+
20
# Each section below populates one decomposition rule created by prepare():
# q is the single reactant query, p1/p2 are the two product queries. Atom map
# numbers are shared across q/p1/p2 to align corresponding atoms.
# NOTE(review): bond order 4 appears to encode aromatic bonds (CGRtools
# convention) — confirm against the CGRtools documentation.

# R-amide/ester formation
# [C](-[N,O;D23;Zs])(-[C])=[O]>>[A].[C]-[C](-[O])=[O]
q, p1, p2 = prepare()
q.add_atom("C")
q.add_atom("C")
q.add_atom("O")
q.add_atom(ListElement(["N", "O"]), hybridization=1, neighbors=(2, 3))
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 2)
q.add_bond(2, 4, 1)

p1.add_atom("C")
p1.add_atom("C")
p1.add_atom("O")
p1.add_atom("O", _map=5)
p1.add_bond(1, 2, 1)
p1.add_bond(2, 3, 2)
p1.add_bond(2, 5, 1)

p2.add_atom("A", _map=4)

# acyl group addition with aromatic carbon's case (Friedel-Crafts)
# [C;Za]-[C](-[C])=[O]>>[C].[C]-[C](-[Cl])=[O]
q, p1, p2 = prepare()
q.add_atom("C")
q.add_atom("C")
q.add_atom("O")
q.add_atom("C", hybridization=4)
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 2)
q.add_bond(2, 4, 1)

p1.add_atom("C")
p1.add_atom("C")
p1.add_atom("O")
p1.add_atom("Cl", _map=5)
p1.add_bond(1, 2, 1)
p1.add_bond(2, 3, 2)
p1.add_bond(2, 5, 1)

p2.add_atom("C", _map=4)

# Williamson reaction
# [C;Za]-[O]-[C;Zs;W0]>>[C]-[Br].[C]-[O]
q, p1, p2 = prepare()
q.add_atom("C", hybridization=4)
q.add_atom("O")
q.add_atom("C", hybridization=1, heteroatoms=1)
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 1)

p1.add_atom("C")
p1.add_atom("O")
p1.add_bond(1, 2, 1)

p2.add_atom("C", _map=3)
p2.add_atom("Br")
p2.add_bond(3, 4, 1)

# Buchwald-Hartwig amination
# [N;D23;Zs;W0]-[C;Za]>>[C]-[Br].[N]
q, p1, p2 = prepare()
q.add_atom("N", heteroatoms=0, hybridization=1, neighbors=(2, 3))
q.add_atom("C", hybridization=4)
q.add_bond(1, 2, 1)

p1.add_atom("C", _map=2)
p1.add_atom("Br")
p1.add_bond(2, 3, 1)

p2.add_atom("N")

# imidazole imine atom's alkylation
# [C;r5](:[N;r5]-[C;Zs;W1]):[N;D2;r5]>>[C]-[Br].[N]:[C]:[N]
q, p1, p2 = prepare()
q.add_atom("N", rings_sizes=5)
q.add_atom("C", rings_sizes=5)
q.add_atom("N", rings_sizes=5, neighbors=2)
q.add_atom("C", hybridization=1, heteroatoms=(1, 2))
q.add_bond(1, 2, 4)
q.add_bond(2, 3, 4)
q.add_bond(1, 4, 1)

p1.add_atom("N")
p1.add_atom("C")
p1.add_atom("N")
p1.add_bond(1, 2, 4)
p1.add_bond(2, 3, 4)

p2.add_atom("C", _map=4)
p2.add_atom("Br")
p2.add_bond(4, 5, 1)

# Knoevenagel condensation (nitrile and carboxyl case)
# [C]=[C](-[C]#[N])-[C](-[O])=[O]>>[C]=[O].[C](-[C]#[N])-[C](-[O])=[O]
q, p1, p2 = prepare()
q.add_atom("C")
q.add_atom("C")
q.add_atom("C")
q.add_atom("N")
q.add_atom("C")
q.add_atom("O")
q.add_atom("O")
q.add_bond(1, 2, 2)
q.add_bond(2, 3, 1)
q.add_bond(3, 4, 3)
q.add_bond(2, 5, 1)
q.add_bond(5, 6, 2)
q.add_bond(5, 7, 1)

p1.add_atom("C", _map=2)
p1.add_atom("C")
p1.add_atom("N")
p1.add_atom("C")
p1.add_atom("O")
p1.add_atom("O")
p1.add_bond(2, 3, 1)
p1.add_bond(3, 4, 3)
p1.add_bond(2, 5, 1)
p1.add_bond(5, 6, 2)
p1.add_bond(5, 7, 1)

p2.add_atom("C", _map=1)
p2.add_atom("O", _map=8)
p2.add_bond(1, 8, 2)

# Knoevenagel condensation (double nitrile case)
# [C]=[C](-[C]#[N])-[C]#[N]>>[C]=[O].[C](-[C]#[N])-[C]#[N]
q, p1, p2 = prepare()
q.add_atom("C")
q.add_atom("C")
q.add_atom("C")
q.add_atom("N")
q.add_atom("C")
q.add_atom("N")
q.add_bond(1, 2, 2)
q.add_bond(2, 3, 1)
q.add_bond(3, 4, 3)
q.add_bond(2, 5, 1)
q.add_bond(5, 6, 3)

p1.add_atom("C", _map=2)
p1.add_atom("C")
p1.add_atom("N")
p1.add_atom("C")
p1.add_atom("N")
p1.add_bond(2, 3, 1)
p1.add_bond(3, 4, 3)
p1.add_bond(2, 5, 1)
p1.add_bond(5, 6, 3)

p2.add_atom("C", _map=1)
p2.add_atom("O", _map=8)
p2.add_bond(1, 8, 2)

# Knoevenagel condensation (double carboxyl case)
# [C]=[C](-[C](-[O])=[O])-[C](-[O])=[O]>>[C]=[O].[C](-[C](-[O])=[O])-[C](-[O])=[O]
q, p1, p2 = prepare()
q.add_atom("C")
q.add_atom("C")
q.add_atom("C")
q.add_atom("O")
q.add_atom("O")
q.add_atom("C")
q.add_atom("O")
q.add_atom("O")
q.add_bond(1, 2, 2)
q.add_bond(2, 3, 1)
q.add_bond(3, 4, 2)
q.add_bond(3, 5, 1)
q.add_bond(2, 6, 1)
q.add_bond(6, 7, 2)
q.add_bond(6, 8, 1)

p1.add_atom("C", _map=2)
p1.add_atom("C")
p1.add_atom("O")
p1.add_atom("O")
p1.add_atom("C")
p1.add_atom("O")
p1.add_atom("O")
p1.add_bond(2, 3, 1)
p1.add_bond(3, 4, 2)
p1.add_bond(3, 5, 1)
p1.add_bond(2, 6, 1)
p1.add_bond(6, 7, 2)
p1.add_bond(6, 8, 1)

p2.add_atom("C", _map=1)
p2.add_atom("O", _map=9)
p2.add_bond(1, 9, 2)
211
+
212
+ # heterocyclization with guanidine
213
+ # [c]((-[N;W0;Zs])@[n]@[c](-[N;D1])@[c;W0])@[n]@[c]-[O; D1]>>[C](-[N])(=[N])-[N].[C](#[N])-[C]-[C](-[O])=[O]
214
+ q, p1, p2 = prepare()
215
+ q.add_atom("C")
216
+ q.add_atom("N", heteroatoms=0, hybridization=1)
217
+ q.add_atom("N")
218
+ q.add_atom("C")
219
+ q.add_atom("N", neighbors=1)
220
+ q.add_atom("C", heteroatoms=0)
221
+ q.add_atom("N")
222
+ q.add_atom("C")
223
+ q.add_atom("O", neighbors=1)
224
+ q.add_bond(1, 2, 1)
225
+ q.add_bond(1, 3, 4)
226
+ q.add_bond(3, 4, 4)
227
+ q.add_bond(4, 5, 1)
228
+ q.add_bond(4, 6, 4)
229
+ q.add_bond(1, 7, 4)
230
+ q.add_bond(7, 8, 4)
231
+ q.add_bond(8, 9, 1)
232
+
233
+ p1.add_atom("C")
234
+ p1.add_atom("N")
235
+ p1.add_atom("N")
236
+ p1.add_atom("N", _map=7)
237
+ p1.add_bond(1, 2, 1)
238
+ p1.add_bond(1, 3, 2)
239
+ p1.add_bond(1, 7, 1)
240
+
241
+ p2.add_atom("C", _map=4)
242
+ p2.add_atom("N")
243
+ p2.add_atom("C")
244
+ p2.add_atom("C", _map=8)
245
+ p2.add_atom("O", _map=9)
246
+ p2.add_atom("O")
247
+ p2.add_bond(4, 5, 3)
248
+ p2.add_bond(4, 6, 1)
249
+ p2.add_bond(6, 8, 1)
250
+ p2.add_bond(8, 9, 2)
251
+ p2.add_bond(8, 10, 1)
252
+
253
+ # alkylation of amine
254
+ # [C]-[N]-[C]>>[C]-[N].[C]-[Br]
255
+ q, p1, p2 = prepare()
256
+ q.add_atom("C")
257
+ q.add_atom("N")
258
+ q.add_atom("C")
259
+ q.add_atom("C")
260
+ q.add_bond(1, 2, 1)
261
+ q.add_bond(2, 3, 1)
262
+ q.add_bond(2, 4, 1)
263
+
264
+ p1.add_atom("C")
265
+ p1.add_atom("N")
266
+ p1.add_atom("C")
267
+ p1.add_bond(1, 2, 1)
268
+ p1.add_bond(2, 3, 1)
269
+
270
+ p2.add_atom("C", _map=4)
271
+ p2.add_atom("Cl")
272
+ p2.add_bond(4, 5, 1)
273
+
274
+ # Synthesis of guanidines
275
+ #
276
+ q, p1, p2 = prepare()
277
+ q.add_atom("N")
278
+ q.add_atom("C")
279
+ q.add_atom("N", hybridization=1)
280
+ q.add_atom("N", hybridization=1)
281
+ q.add_bond(1, 2, 2)
282
+ q.add_bond(2, 3, 1)
283
+ q.add_bond(2, 4, 1)
284
+
285
+ p1.add_atom("N")
286
+ p1.add_atom("C")
287
+ p1.add_atom("N")
288
+ p1.add_bond(1, 2, 3)
289
+ p1.add_bond(2, 3, 1)
290
+
291
+ p2.add_atom("N", _map=4)
292
+
293
+ # Grignard reaction with nitrile
294
+ #
295
+ q, p1, p2 = prepare()
296
+ q.add_atom("C")
297
+ q.add_atom("C")
298
+ q.add_atom("O")
299
+ q.add_atom("C")
300
+ q.add_bond(1, 2, 1)
301
+ q.add_bond(2, 3, 2)
302
+ q.add_bond(2, 4, 1)
303
+
304
+ p1.add_atom("C")
305
+ p1.add_atom("C")
306
+ p1.add_atom("N")
307
+ p1.add_bond(1, 2, 1)
308
+ p1.add_bond(2, 3, 3)
309
+
310
+ p2.add_atom("C", _map=4)
311
+ p2.add_atom("Br")
312
+ p2.add_bond(4, 5, 1)
313
+
314
+ # Alkylation of alpha-carbon atom of nitrile
315
+ #
316
+ q, p1, p2 = prepare()
317
+ q.add_atom("N")
318
+ q.add_atom("C")
319
+ q.add_atom("C", neighbors=(3, 4))
320
+ q.add_atom("C", hybridization=1)
321
+ q.add_bond(1, 2, 3)
322
+ q.add_bond(2, 3, 1)
323
+ q.add_bond(3, 4, 1)
324
+
325
+ p1.add_atom("N")
326
+ p1.add_atom("C")
327
+ p1.add_atom("C")
328
+ p1.add_bond(1, 2, 3)
329
+ p1.add_bond(2, 3, 1)
330
+
331
+ p2.add_atom("C", _map=4)
332
+ p2.add_atom("Cl")
333
+ p2.add_bond(4, 5, 1)
334
+
335
+ # Gomberg-Bachmann reaction
336
+ #
337
+ q, p1, p2 = prepare()
338
+ q.add_atom("C", hybridization=4, heteroatoms=0)
339
+ q.add_atom("C", hybridization=4, heteroatoms=0)
340
+ q.add_bond(1, 2, 1)
341
+
342
+ p1.add_atom("C")
343
+ p1.add_atom("N", _map=3)
344
+ p1.add_bond(1, 3, 1)
345
+
346
+ p2.add_atom("C", _map=2)
347
+
348
+ # Cyclocondensation
349
+ #
350
+ q, p1, p2 = prepare()
351
+ q.add_atom("N", neighbors=2)
352
+ q.add_atom("C")
353
+ q.add_atom("C")
354
+ q.add_atom("C")
355
+ q.add_atom("N")
356
+ q.add_atom("C")
357
+ q.add_atom("C")
358
+ q.add_atom("O", neighbors=1)
359
+ q.add_bond(1, 2, 1)
360
+ q.add_bond(2, 3, 1)
361
+ q.add_bond(3, 4, 1)
362
+ q.add_bond(4, 5, 2)
363
+ q.add_bond(5, 6, 1)
364
+ q.add_bond(6, 7, 1)
365
+ q.add_bond(7, 8, 2)
366
+ q.add_bond(1, 7, 1)
367
+
368
+ p1.add_atom("N")
369
+ p1.add_atom("C")
370
+ p1.add_atom("C")
371
+ p1.add_atom("C")
372
+ p1.add_atom("O", _map=9)
373
+ p1.add_bond(1, 2, 1)
374
+ p1.add_bond(2, 3, 1)
375
+ p1.add_bond(3, 4, 1)
376
+ p1.add_bond(4, 9, 2)
377
+
378
+ p2.add_atom("N", _map=5)
379
+ p2.add_atom("C")
380
+ p2.add_atom("C")
381
+ p2.add_atom("O")
382
+ p2.add_atom("O", _map=10)
383
+ p2.add_bond(5, 6, 1)
384
+ p2.add_bond(6, 7, 1)
385
+ p2.add_bond(7, 8, 2)
386
+ p2.add_bond(7, 10, 1)
387
+
388
+ # heterocyclization dicarboxylic acids
389
+ #
390
+ q, p1, p2 = prepare()
391
+ q.add_atom("C", rings_sizes=(5, 6))
392
+ q.add_atom("O")
393
+ q.add_atom(ListElement(["O", "N"]))
394
+ q.add_atom("C", rings_sizes=(5, 6))
395
+ q.add_atom("O")
396
+ q.add_bond(1, 2, 2)
397
+ q.add_bond(1, 3, 1)
398
+ q.add_bond(3, 4, 1)
399
+ q.add_bond(4, 5, 2)
400
+
401
+ p1.add_atom("C")
402
+ p1.add_atom("O")
403
+ p1.add_atom("O", _map=6)
404
+ p1.add_bond(1, 2, 2)
405
+ p1.add_bond(1, 6, 1)
406
+
407
+ p2.add_atom("C", _map=4)
408
+ p2.add_atom("O")
409
+ p2.add_atom("O", _map=7)
410
+ p2.add_bond(4, 5, 2)
411
+ p2.add_bond(4, 7, 1)
412
+
413
+ __all__ = ["rules"]
synplan/chem/reaction_rules/manual/transformations.py ADDED
@@ -0,0 +1,532 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing hardcoded transformation reaction rules."""
2
+
3
+ from CGRtools import QueryContainer, ReactionContainer
4
+ from CGRtools.periodictable import ListElement
5
+
6
+ rules = []
7
+
8
+
9
def prepare():
    """Create a new empty transformation rule and return its two halves.

    Instantiates two :class:`QueryContainer` objects — one for the reactant
    pattern and one for the product pattern — registers a
    :class:`ReactionContainer` built from them in the module-level ``rules``
    list, and returns both containers so the caller can populate them.

    Note: the original docstring claimed *three* query containers were
    created; this rule module creates exactly two per rule (the sibling
    ``decompositions`` module is the three-container variant).

    :return: Tuple ``(q_, p_)`` of empty query containers; the corresponding
        reaction rule is already appended to ``rules``.
    """
    q_ = QueryContainer()
    p_ = QueryContainer()
    rules.append(ReactionContainer((q_,), (p_,)))
    return q_, p_
16
+
17
+
18
+ # aryl nitro reduction
19
+ # [C;Za;W1]-[N;D1]>>[O-]-[N+](-[C])=[O]
20
+ q, p = prepare()
21
+ q.add_atom("N", neighbors=1)
22
+ q.add_atom("C", hybridization=4, heteroatoms=1)
23
+ q.add_bond(1, 2, 1)
24
+
25
+ p.add_atom("N", charge=1)
26
+ p.add_atom("C")
27
+ p.add_atom("O", charge=-1)
28
+ p.add_atom("O")
29
+ p.add_bond(1, 2, 1)
30
+ p.add_bond(1, 3, 1)
31
+ p.add_bond(1, 4, 2)
32
+
33
+ # aryl nitration
34
+ # [O-]-[N+](=[O])-[C;Za;W12]>>[C]
35
+ q, p = prepare()
36
+ q.add_atom("N", charge=1)
37
+ q.add_atom("C", hybridization=4, heteroatoms=(1, 2))
38
+ q.add_atom("O", charge=-1)
39
+ q.add_atom("O")
40
+ q.add_bond(1, 2, 1)
41
+ q.add_bond(1, 3, 1)
42
+ q.add_bond(1, 4, 2)
43
+
44
+ p.add_atom("C", _map=2)
45
+
46
+ # Beckmann rearrangement (oxime -> amide)
47
+ # [C]-[N;D2]-[C]=[O]>>[O]-[N]=[C]-[C]
48
+ q, p = prepare()
49
+ q.add_atom("C")
50
+ q.add_atom("N", neighbors=2)
51
+ q.add_atom("O")
52
+ q.add_atom("C")
53
+ q.add_bond(1, 2, 1)
54
+ q.add_bond(1, 3, 2)
55
+ q.add_bond(2, 4, 1)
56
+
57
+ p.add_atom("C")
58
+ p.add_atom("N")
59
+ p.add_atom("O")
60
+ p.add_atom("C")
61
+ p.add_bond(1, 2, 2)
62
+ p.add_bond(2, 3, 1)
63
+ p.add_bond(1, 4, 1)
64
+
65
+ # aldehydes or ketones into oxime/imine reaction
66
+ # [C;Zd;W1]=[N]>>[C]=[O]
67
+ q, p = prepare()
68
+ q.add_atom("C", hybridization=2, heteroatoms=1)
69
+ q.add_atom("N")
70
+ q.add_bond(1, 2, 2)
71
+
72
+ p.add_atom("C")
73
+ p.add_atom("O", _map=3)
74
+ p.add_bond(1, 3, 2)
75
+
76
+ # addition of halogen atom into phenol ring (ortho)
77
+ # [C](-[Cl,F,Br,I;D1]):[C]-[O,N;Zs]>>[C](-[A]):[C]
78
+ q, p = prepare()
79
+ q.add_atom(ListElement(["O", "N"]), hybridization=1)
80
+ q.add_atom("C")
81
+ q.add_atom("C")
82
+ q.add_atom(ListElement(["Cl", "F", "Br", "I"]), neighbors=1)
83
+ q.add_bond(1, 2, 1)
84
+ q.add_bond(2, 3, 4)
85
+ q.add_bond(3, 4, 1)
86
+
87
+ p.add_atom("A")
88
+ p.add_atom("C")
89
+ p.add_atom("C")
90
+ p.add_bond(1, 2, 1)
91
+ p.add_bond(2, 3, 4)
92
+
93
+ # addition of halogen atom into phenol ring (para)
94
+ # [C](:[C]:[C]:[C]-[O,N;Zs])-[Cl,F,Br,I;D1]>>[A]-[C]:[C]:[C]:[C]
95
+ q, p = prepare()
96
+ q.add_atom(ListElement(["O", "N"]), hybridization=1)
97
+ q.add_atom("C")
98
+ q.add_atom("C")
99
+ q.add_atom("C")
100
+ q.add_atom("C")
101
+ q.add_atom(ListElement(["Cl", "F", "Br", "I"]), neighbors=1)
102
+ q.add_bond(1, 2, 1)
103
+ q.add_bond(2, 3, 4)
104
+ q.add_bond(3, 4, 4)
105
+ q.add_bond(4, 5, 4)
106
+ q.add_bond(5, 6, 1)
107
+
108
+ p.add_atom("A")
109
+ p.add_atom("C")
110
+ p.add_atom("C")
111
+ p.add_atom("C")
112
+ p.add_atom("C")
113
+ p.add_bond(1, 2, 1)
114
+ p.add_bond(2, 3, 4)
115
+ p.add_bond(3, 4, 4)
116
+ p.add_bond(4, 5, 4)
117
+
118
+ # hard reduction of Ar-ketones
119
+ # [C;Za]-[C;D2;Zs;W0]>>[C]-[C]=[O]
120
+ q, p = prepare()
121
+ q.add_atom("C", hybridization=4)
122
+ q.add_atom("C", hybridization=1, neighbors=2, heteroatoms=0)
123
+ q.add_bond(1, 2, 1)
124
+
125
+ p.add_atom("C")
126
+ p.add_atom("C")
127
+ p.add_atom("O")
128
+ p.add_bond(1, 2, 1)
129
+ p.add_bond(2, 3, 2)
130
+
131
+ # reduction of alpha-hydroxy pyridine
132
+ # [C;W1]:[N;H0;r6]>>[C](:[N])-[O]
133
+ q, p = prepare()
134
+ q.add_atom("C", heteroatoms=1)
135
+ q.add_atom("N", rings_sizes=6, hydrogens=0)
136
+ q.add_bond(1, 2, 4)
137
+
138
+ p.add_atom("C")
139
+ p.add_atom("N")
140
+ p.add_atom("O")
141
+ p.add_bond(1, 2, 4)
142
+ p.add_bond(1, 3, 1)
143
+
144
+ # Reduction of alkene
145
+ # [C]-[C;D23;Zs;W0]-[C;D123;Zs;W0]>>[C](-[C])=[C]
146
+ q, p = prepare()
147
+ q.add_atom("C")
148
+ q.add_atom("C", heteroatoms=0, neighbors=(2, 3), hybridization=1)
149
+ q.add_atom("C", heteroatoms=0, neighbors=(1, 2, 3), hybridization=1)
150
+ q.add_bond(1, 2, 1)
151
+ q.add_bond(2, 3, 1)
152
+
153
+ p.add_atom("C")
154
+ p.add_atom("C")
155
+ p.add_atom("C")
156
+ p.add_bond(1, 2, 1)
157
+ p.add_bond(2, 3, 2)
158
+
159
+ # Kolbe-Schmitt reaction
160
+ # [C](:[C]-[O;D1])-[C](=[O])-[O;D1]>>[C](-[O]):[C]
161
+ q, p = prepare()
162
+ q.add_atom("O", neighbors=1)
163
+ q.add_atom("C")
164
+ q.add_atom("C")
165
+ q.add_atom("C")
166
+ q.add_atom("O", neighbors=1)
167
+ q.add_atom("O")
168
+ q.add_bond(1, 2, 1)
169
+ q.add_bond(2, 3, 4)
170
+ q.add_bond(3, 4, 1)
171
+ q.add_bond(4, 5, 1)
172
+ q.add_bond(4, 6, 2)
173
+
174
+ p.add_atom("O")
175
+ p.add_atom("C")
176
+ p.add_atom("C")
177
+ p.add_bond(1, 2, 1)
178
+ p.add_bond(2, 3, 4)
179
+
180
+ # reduction of carboxylic acid
181
+ # [O;D1]-[C;D2]-[C]>>[C]-[C](-[O])=[O]
182
+ q, p = prepare()
183
+ q.add_atom("C")
184
+ q.add_atom("C", neighbors=2)
185
+ q.add_atom("O", neighbors=1)
186
+ q.add_bond(1, 2, 1)
187
+ q.add_bond(2, 3, 1)
188
+
189
+ p.add_atom("C")
190
+ p.add_atom("C")
191
+ p.add_atom("O")
192
+ p.add_atom("O")
193
+ p.add_bond(1, 2, 1)
194
+ p.add_bond(2, 3, 1)
195
+ p.add_bond(2, 4, 2)
196
+
197
+ # halogenation of alcohols
198
+ # [C;Zs]-[Cl,Br;D1]>>[C]-[O]
199
+ q, p = prepare()
200
+ q.add_atom("C", hybridization=1, heteroatoms=1)
201
+ q.add_atom(ListElement(["Cl", "Br"]), neighbors=1)
202
+ q.add_bond(1, 2, 1)
203
+
204
+ p.add_atom("C")
205
+ p.add_atom("O", _map=3)
206
+ p.add_bond(1, 3, 1)
207
+
208
+ # Kolbe nitrilation
209
+ # [N]#[C]-[C;Zs;W0]>>[Br]-[C]
210
+ q, p = prepare()
211
+ q.add_atom("C", heteroatoms=0, hybridization=1)
212
+ q.add_atom("C")
213
+ q.add_atom("N")
214
+ q.add_bond(1, 2, 1)
215
+ q.add_bond(2, 3, 3)
216
+
217
+ p.add_atom("C")
218
+ p.add_atom("Br", _map=4)
219
+ p.add_bond(1, 4, 1)
220
+
221
+ # Nitrile hydrolysis
222
+ # [O;D1]-[C]=[O]>>[N]#[C]
223
+ q, p = prepare()
224
+ q.add_atom("C")
225
+ q.add_atom("O", neighbors=1)
226
+ q.add_atom("O")
227
+ q.add_bond(1, 2, 1)
228
+ q.add_bond(1, 3, 2)
229
+
230
+ p.add_atom("C")
231
+ p.add_atom("N", _map=4)
232
+ p.add_bond(1, 4, 3)
233
+
234
+ # sulfamidation
235
+ # [c]-[S](=[O])(=[O])-[N]>>[c]
236
+ q, p = prepare()
237
+ q.add_atom("C", hybridization=4)
238
+ q.add_atom("S")
239
+ q.add_atom("O")
240
+ q.add_atom("O")
241
+ q.add_atom("N", neighbors=1)
242
+ q.add_bond(1, 2, 1)
243
+ q.add_bond(2, 3, 2)
244
+ q.add_bond(2, 4, 2)
245
+ q.add_bond(2, 5, 1)
246
+
247
+ p.add_atom("C")
248
+
249
+ # Ring expansion rearrangement
250
+ #
251
+ q, p = prepare()
252
+ q.add_atom("C")
253
+ q.add_atom("N")
254
+ q.add_atom("C", rings_sizes=6)
255
+ q.add_atom("C")
256
+ q.add_atom("O")
257
+ q.add_atom("C")
258
+ q.add_atom("C")
259
+ q.add_bond(1, 2, 1)
260
+ q.add_bond(2, 3, 1)
261
+ q.add_bond(3, 4, 1)
262
+ q.add_bond(4, 5, 2)
263
+ q.add_bond(3, 6, 1)
264
+ q.add_bond(4, 7, 1)
265
+
266
+ p.add_atom("C")
267
+ p.add_atom("N")
268
+ p.add_atom("C")
269
+ p.add_atom("C")
270
+ p.add_atom("O")
271
+ p.add_atom("C")
272
+ p.add_atom("C")
273
+ p.add_bond(1, 2, 1)
274
+ p.add_bond(2, 3, 2)
275
+ p.add_bond(3, 4, 1)
276
+ p.add_bond(4, 5, 1)
277
+ p.add_bond(4, 6, 1)
278
+ p.add_bond(4, 7, 1)
279
+
280
+ # hydrolysis of bromide alkyl
281
+ #
282
+ q, p = prepare()
283
+ q.add_atom("C", hybridization=1)
284
+ q.add_atom("O", neighbors=1)
285
+ q.add_bond(1, 2, 1)
286
+
287
+ p.add_atom("C")
288
+ p.add_atom("Br")
289
+ p.add_bond(1, 2, 1)
290
+
291
+ # Condensation of ketones/aldehydes and amines into imines
292
+ #
293
+ q, p = prepare()
294
+ q.add_atom("N", neighbors=(1, 2))
295
+ q.add_atom("C", neighbors=(2, 3), heteroatoms=1)
296
+ q.add_bond(1, 2, 2)
297
+
298
+ p.add_atom("C", _map=2)
299
+ p.add_atom("O")
300
+ p.add_bond(2, 3, 2)
301
+
302
+ # Halogenation of alkanes
303
+ #
304
+ q, p = prepare()
305
+ q.add_atom("C", hybridization=1)
306
+ q.add_atom(ListElement(["F", "Cl", "Br"]))
307
+ q.add_bond(1, 2, 1)
308
+
309
+ p.add_atom("C")
310
+
311
+ # heterocyclization
312
+ #
313
+ q, p = prepare()
314
+ q.add_atom("N", heteroatoms=0, hybridization=1, neighbors=(2, 3))
315
+ q.add_atom("C", heteroatoms=2)
316
+ q.add_atom("N", heteroatoms=0, neighbors=2)
317
+ q.add_bond(1, 2, 1)
318
+ q.add_bond(2, 3, 2)
319
+
320
+ p.add_atom("N")
321
+ p.add_atom("C")
322
+ p.add_atom("N")
323
+ p.add_atom("O")
324
+ p.add_bond(1, 2, 1)
325
+ p.add_bond(2, 4, 2)
326
+
327
+ # Reduction of nitrile
328
+ #
329
+ q, p = prepare()
330
+ q.add_atom("N", neighbors=1)
331
+ q.add_atom("C")
332
+ q.add_atom("C", hybridization=1)
333
+ q.add_bond(1, 2, 1)
334
+ q.add_bond(2, 3, 1)
335
+
336
+ p.add_atom("N")
337
+ p.add_atom("C")
338
+ p.add_atom("C")
339
+ p.add_bond(1, 2, 3)
340
+ p.add_bond(2, 3, 1)
341
+
342
+ # SPECIAL CASE
343
+ # Reduction of nitrile into methylamine
344
+ #
345
+ q, p = prepare()
346
+ q.add_atom("C", neighbors=1)
347
+ q.add_atom("N", neighbors=2)
348
+ q.add_atom("C")
349
+ q.add_atom("C", hybridization=1)
350
+ q.add_bond(1, 2, 1)
351
+ q.add_bond(2, 3, 1)
352
+ q.add_bond(3, 4, 1)
353
+
354
+ p.add_atom("N", _map=2)
355
+ p.add_atom("C")
356
+ p.add_atom("C")
357
+ p.add_bond(2, 3, 3)
358
+ p.add_bond(3, 4, 1)
359
+
360
+ # methylation of amides
361
+ #
362
+ q, p = prepare()
363
+ q.add_atom("O")
364
+ q.add_atom("C")
365
+ q.add_atom("N")
366
+ q.add_atom("C", neighbors=1)
367
+ q.add_bond(1, 2, 2)
368
+ q.add_bond(2, 3, 1)
369
+ q.add_bond(3, 4, 1)
370
+
371
+ p.add_atom("O")
372
+ p.add_atom("C")
373
+ p.add_atom("N")
374
+ p.add_bond(1, 2, 2)
375
+ p.add_bond(2, 3, 1)
376
+
377
+ # hydrocyanation of alkenes
378
+ #
379
+ q, p = prepare()
380
+ q.add_atom("C", hybridization=1)
381
+ q.add_atom("C")
382
+ q.add_atom("C")
383
+ q.add_atom("N")
384
+ q.add_bond(1, 2, 1)
385
+ q.add_bond(2, 3, 1)
386
+ q.add_bond(3, 4, 3)
387
+
388
+ p.add_atom("C")
389
+ p.add_atom("C")
390
+ p.add_bond(1, 2, 2)
391
+
392
+ # decarboxylation (alpha atom of nitrile)
393
+ #
394
+ q, p = prepare()
395
+ q.add_atom("N")
396
+ q.add_atom("C")
397
+ q.add_atom("C", neighbors=2)
398
+ q.add_bond(1, 2, 3)
399
+ q.add_bond(2, 3, 1)
400
+
401
+ p.add_atom("N")
402
+ p.add_atom("C")
403
+ p.add_atom("C")
404
+ p.add_atom("C")
405
+ p.add_atom("O")
406
+ p.add_atom("O")
407
+ p.add_bond(1, 2, 3)
408
+ p.add_bond(2, 3, 1)
409
+ p.add_bond(3, 4, 1)
410
+ p.add_bond(4, 5, 2)
411
+ p.add_bond(4, 6, 1)
412
+
413
+ # Bischler-Napieralski reaction
414
+ #
415
+ q, p = prepare()
416
+ q.add_atom("C", rings_sizes=(6,))
417
+ q.add_atom("C", rings_sizes=(6,))
418
+ q.add_atom("N", rings_sizes=(6,), neighbors=2)
419
+ q.add_atom("C")
420
+ q.add_atom("C")
421
+ q.add_atom("C")
422
+ q.add_atom("O")
423
+ q.add_atom("O")
424
+ q.add_atom("C")
425
+ q.add_atom("O", neighbors=1)
426
+ q.add_bond(1, 2, 4)
427
+ q.add_bond(2, 3, 1)
428
+ q.add_bond(3, 4, 1)
429
+ q.add_bond(4, 5, 2)
430
+ q.add_bond(5, 6, 1)
431
+ q.add_bond(6, 7, 2)
432
+ q.add_bond(6, 8, 1)
433
+ q.add_bond(5, 9, 4)
434
+ q.add_bond(9, 10, 1)
435
+ q.add_bond(1, 9, 1)
436
+
437
+ p.add_atom("C")
438
+ p.add_atom("C")
439
+ p.add_atom("N")
440
+ p.add_atom("C")
441
+ p.add_atom("C")
442
+ p.add_atom("C")
443
+ p.add_atom("O")
444
+ p.add_atom("O")
445
+ p.add_atom("C")
446
+ p.add_atom("O")
447
+ p.add_atom("O")
448
+ p.add_bond(1, 2, 4)
449
+ p.add_bond(2, 3, 1)
450
+ p.add_bond(3, 4, 1)
451
+ p.add_bond(4, 5, 2)
452
+ p.add_bond(5, 6, 1)
453
+ p.add_bond(6, 7, 2)
454
+ p.add_bond(6, 8, 1)
455
+ p.add_bond(5, 9, 1)
456
+ p.add_bond(9, 10, 2)
457
+ p.add_bond(9, 11, 1)
458
+
459
+ # heterocyclization in Prins reaction
460
+ #
461
+ q, p = prepare()
462
+ q.add_atom("C")
463
+ q.add_atom("O")
464
+ q.add_atom("C")
465
+ q.add_atom(ListElement(["N", "O"]), neighbors=2)
466
+ q.add_atom("C")
467
+ q.add_atom("C")
468
+ q.add_bond(1, 2, 1)
469
+ q.add_bond(2, 3, 1)
470
+ q.add_bond(3, 4, 1)
471
+ q.add_bond(4, 5, 1)
472
+ q.add_bond(5, 6, 1)
473
+ q.add_bond(1, 6, 1)
474
+
475
+ p.add_atom("C")
476
+ p.add_atom("C", _map=5)
477
+ p.add_bond(1, 5, 2)
478
+
479
+ # recyclization of tetrahydropyran through an opening the ring and dehydration
480
+ #
481
+ q, p = prepare()
482
+ q.add_atom("C")
483
+ q.add_atom("C")
484
+ q.add_atom("C")
485
+ q.add_atom(ListElement(["N", "O"]))
486
+ q.add_atom("C")
487
+ q.add_atom("C")
488
+ q.add_bond(1, 2, 1)
489
+ q.add_bond(2, 3, 1)
490
+ q.add_bond(3, 4, 1)
491
+ q.add_bond(4, 5, 1)
492
+ q.add_bond(5, 6, 1)
493
+ q.add_bond(1, 6, 2)
494
+
495
+ p.add_atom("C")
496
+ p.add_atom("C")
497
+ p.add_atom("C")
498
+ p.add_atom("A")
499
+ p.add_atom("C")
500
+ p.add_atom("C")
501
+ p.add_atom("O")
502
+ p.add_bond(1, 2, 1)
503
+ p.add_bond(1, 7, 1)
504
+ p.add_bond(3, 7, 1)
505
+ p.add_bond(3, 4, 1)
506
+ p.add_bond(4, 5, 1)
507
+ p.add_bond(5, 6, 1)
508
+ p.add_bond(1, 6, 1)
509
+
510
+ # alkenes + h2o/hHal
511
+ #
512
+ q, p = prepare()
513
+ q.add_atom("C", hybridization=1)
514
+ q.add_atom("C", hybridization=1)
515
+ q.add_atom(ListElement(["O", "F", "Cl", "Br", "I"]), neighbors=1)
516
+ q.add_bond(1, 2, 1)
517
+ q.add_bond(2, 3, 1)
518
+
519
+ p.add_atom("C")
520
+ p.add_atom("C")
521
+ p.add_bond(1, 2, 2)
522
+
523
+ # methylation of dimethylamines
524
+ #
525
+ q, p = prepare()
526
+ q.add_atom("C", neighbors=1)
527
+ q.add_atom("N", neighbors=3)
528
+ q.add_bond(1, 2, 1)
529
+
530
+ p.add_atom("N", _map=2)
531
+
532
+ __all__ = ["rules"]
synplan/chem/utils.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing additional functions needed in different reaction data processing
2
+ protocols."""
3
+
4
+ import logging
5
+ from typing import Iterable
6
+
7
+ from CGRtools.containers import (
8
+ CGRContainer,
9
+ MoleculeContainer,
10
+ QueryContainer,
11
+ ReactionContainer,
12
+ )
13
+ from CGRtools.exceptions import InvalidAromaticRing
14
+ from tqdm import tqdm
15
+
16
+ from synplan.chem import smiles_parser
17
+ from synplan.utils.files import MoleculeReader, MoleculeWriter
18
+
19
+ from chython import MoleculeContainer as MoleculeContainerChython
20
+
21
+
22
def mol_from_smiles(
    smiles: str,
    standardize: bool = True,
    clean_stereo: bool = True,
    clean2d: bool = True,
) -> MoleculeContainer:
    """Parse a SMILES string into a `MoleculeContainer`, optionally applying
    standardization, stereo removal, and 2D-coordinate cleanup.

    All cleanup steps are applied to a working copy; if CGRtools raises
    ``InvalidAromaticRing`` at any step, a warning is logged and the
    unmodified parsed molecule is returned instead.

    :param smiles: The SMILES string representing the molecule.
    :param standardize: Whether to canonicalize the molecule (default True).
    :param clean_stereo: Whether to remove stereo marks (default True).
    :param clean2d: Whether to recompute 2D coordinates (default True).
    :return: The processed molecule object.
    :raises ValueError: If the SMILES string could not be processed by CGRtools.
    """
    parsed = smiles_parser(smiles)

    if not isinstance(parsed, MoleculeContainer):
        raise ValueError("SMILES string was not processed by CGRtools")

    candidate = parsed.copy()
    try:
        if standardize:
            candidate.canonicalize()
        if clean_stereo:
            candidate.clean_stereo()
        if clean2d:
            candidate.clean2d()
    except InvalidAromaticRing:
        logging.warning(
            "CGRtools was not able to standardize molecule due to invalid aromatic ring"
        )
        return parsed
    return candidate
57
+
58
+
59
def query_to_mol(query: QueryContainer) -> MoleculeContainer:
    """Convert a QueryContainer into an equivalent MoleculeContainer.

    Atom numbers, element symbols, charges and radical flags are carried
    over, along with integer bond orders; query-only constraints are dropped.

    :param query: A QueryContainer object representing the query structure.
    :return: A MoleculeContainer object that replicates the structure of the query.
    """
    molecule = MoleculeContainer()
    for number, atom in query.atoms():
        molecule.add_atom(
            atom.atomic_symbol,
            number,
            charge=atom.charge,
            is_radical=atom.is_radical,
        )
    for left, right, bond in query.bonds():
        molecule.add_bond(left, right, int(bond))
    return molecule
73
+
74
+
75
def reaction_query_to_reaction(reaction_rule: ReactionContainer) -> ReactionContainer:
    """Convert a query-based reaction rule into a molecule-based reaction.

    Every reactant, product and reagent query is converted with
    ``query_to_mol``; the rule's metadata and name are preserved.

    :param reaction_rule: A ReactionContainer whose components are
        QueryContainer objects.
    :return: A ReactionContainer whose components are MoleculeContainer objects.
    """
    converted = ReactionContainer(
        [query_to_mol(component) for component in reaction_rule.reactants],
        [query_to_mol(component) for component in reaction_rule.products],
        [query_to_mol(component) for component in reaction_rule.reagents],
        reaction_rule.meta,
    )
    converted.name = reaction_rule.name
    return converted
92
+
93
+
94
def unite_molecules(molecules: Iterable[MoleculeContainer]) -> MoleculeContainer:
    """Merge several molecules into one (possibly disconnected) container.

    Starts from an empty MoleculeContainer and folds each input molecule in
    with ``union``, so the result holds every component of every input.

    :param molecules: Molecules to be united.
    :return: A single MoleculeContainer representing the union of all inputs.
    """
    combined = MoleculeContainer()
    for component in molecules:
        combined = combined.union(component)
    return combined
108
+
109
+
110
def safe_canonicalization(molecule: MoleculeContainer) -> MoleculeContainer:
    """Attempts to canonicalize a molecule, handling any exceptions. If the
    canonicalization process fails due to an InvalidAromaticRing exception, it safely
    returns the original molecule.

    :param molecule: The given molecule to be canonicalized.
    :return: The canonicalized molecule if successful, otherwise the original molecule.
    """
    # NOTE(review): this reorders the private atom mapping of the *input*
    # molecule in place (sorted by atom number) before copying, so the
    # argument is mutated even when canonicalization fails — confirm callers
    # are fine with that side effect.
    molecule._atoms = dict(sorted(molecule._atoms.items()))

    # Work on a copy so a partially-canonicalized structure is never returned.
    molecule_copy = molecule.copy()
    try:
        molecule_copy.canonicalize()
        molecule_copy.clean_stereo()
        return molecule_copy
    except InvalidAromaticRing:
        # Canonicalization failed: fall back to the (atom-sorted) original.
        return molecule
127
+
128
+
129
def standardize_building_blocks(input_file: str, output_file: str) -> str:
    """Standardizes custom building blocks.

    Reads molecules from ``input_file``, canonicalizes each one with
    ``safe_canonicalization`` (molecules that fail are skipped), and writes
    the survivors to ``output_file``.

    :param input_file: The path to the file that stores the original building blocks.
    :param output_file: The path to the file that will store the standardized building
        blocks.
    :return: The path to the file with standardized building blocks.
    :raises ValueError: If both arguments resolve to the same file.
    """
    from pathlib import Path  # local import: this module does not import pathlib

    # Compare resolved paths instead of raw strings so that equivalent
    # spellings (e.g. "./bb.smi" vs "bb.smi") are also rejected — reading and
    # writing the same file would destroy the input.
    if Path(input_file).resolve() == Path(output_file).resolve():
        raise ValueError("input_file name and output_file name cannot be the same.")

    with MoleculeReader(input_file) as inp_file, MoleculeWriter(
        output_file
    ) as out_file:
        for mol in tqdm(
            inp_file,
            desc="Number of building blocks processed: ",
            bar_format="{desc}{n} [{elapsed}]",
        ):
            try:
                mol = safe_canonicalization(mol)
            except Exception as e:
                # Best-effort pipeline: skip molecules that cannot be
                # standardized instead of aborting the whole file.
                logging.debug(e)
                continue
            out_file.write(mol)

    return output_file
156
+
157
+
158
def cgr_from_reaction_rule(reaction_rule: ReactionContainer) -> CGRContainer:
    """Compose a condensed graph of reaction (CGR) from the given rule.

    The query-based rule is first converted into a molecule-based reaction,
    then composed into a CGR with the ``~`` operator.

    :param reaction_rule: The reaction rule to be converted.
    :return: The resulting CGR.
    """
    return ~reaction_query_to_reaction(reaction_rule)
169
+
170
+
171
def hash_from_reaction_rule(reaction_rule: "ReactionContainer") -> int:
    """Generates an order-independent hash for the given reaction rule.

    Component hashes are sorted within each role (reactants, reagents,
    products), so the result does not depend on the order in which the
    components are stored.

    Fix: the return annotation was ``-> hash``, which annotates with the
    builtin *function* rather than a type; the correct annotation is ``int``.

    :param reaction_rule: The reaction rule to be hashed.
    :return: The resulting integer hash.
    """
    reactants_hash = tuple(sorted(hash(r) for r in reaction_rule.reactants))
    reagents_hash = tuple(sorted(hash(r) for r in reaction_rule.reagents))
    products_hash = tuple(sorted(hash(r) for r in reaction_rule.products))

    return hash((reactants_hash, reagents_hash, products_hash))
183
+
184
+
185
def reverse_reaction(
    reaction: ReactionContainer,
) -> ReactionContainer:
    """Swap the reactant and product sides of the given reaction.

    :param reaction: The reaction to be reversed.
    :return: A new reaction whose reactants are the original products and
        vice versa; reagents, metadata and name are carried over unchanged.
    """
    flipped = ReactionContainer(
        reaction.products,
        reaction.reactants,
        reaction.reagents,
        reaction.meta,
    )
    flipped.name = reaction.name

    return flipped
199
+
200
+
201
def cgrtools_to_chython_molecule(molecule: MoleculeContainer) -> MoleculeContainerChython:
    """Convert a CGRtools molecule into a chython ``MoleculeContainer``.

    Only element symbols, atom numbers and integer bond orders are copied;
    charges, radical flags and stereo information are NOT transferred.

    :param molecule: The CGRtools molecule to convert.
    :return: The equivalent chython molecule graph.
    """
    molecule_chython = MoleculeContainerChython()
    for n, atom in molecule.atoms():
        molecule_chython.add_atom(atom.atomic_symbol, n)

    for n, m, bond in molecule.bonds():
        molecule_chython.add_bond(n, m, int(bond))

    return molecule_chython
210
+
211
+
212
def chython_query_to_cgrtools(query) -> QueryContainer:
    """Convert a chython query graph into a CGRtools ``QueryContainer``.

    For atoms, only the element symbol, charge, neighbor count and
    hybridization are carried over (other query constraints such as ring
    sizes or hydrogen counts are NOT transferred); bonds keep their integer
    orders.

    :param query: A chython query object exposing ``atoms()`` and ``bonds()``.
    :return: The equivalent CGRtools query container.
    """
    cgrtools_query = QueryContainer()
    for n, atom in query.atoms():
        cgrtools_query.add_atom(
            atom=atom.atomic_symbol,
            charge=atom.charge,
            neighbors=atom.neighbors,
            hybridization=atom.hybridization,
            _map=n,
        )
    for n, m, bond in query.bonds():
        cgrtools_query.add_bond(n, m, int(bond))

    return cgrtools_query
synplan/interfaces/__init__.py ADDED
File without changes
synplan/interfaces/cli.py ADDED
@@ -0,0 +1,506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing commands line scripts for training and planning steps."""
2
+
3
+ import os
4
+ import warnings
5
+ from pathlib import Path
6
+
7
+ import click
8
+ import yaml
9
+
10
+ from synplan.chem.data.filtering import ReactionFilterConfig, filter_reactions_from_file
11
+ from synplan.chem.data.standardizing import (
12
+ ReactionStandardizationConfig,
13
+ standardize_reactions_from_file,
14
+ )
15
+ from synplan.chem.reaction_rules.extraction import extract_rules_from_reactions
16
+ from synplan.chem.reaction_routes.clustering import run_cluster_cli
17
+ from synplan.chem.utils import standardize_building_blocks
18
+ from synplan.mcts.search import run_search
19
+ from synplan.ml.training.supervised import create_policy_dataset, run_policy_training
20
+ from synplan.ml.training.reinforcement import run_updating
21
+ from synplan.utils.config import (
22
+ PolicyNetworkConfig,
23
+ RuleExtractionConfig,
24
+ TreeConfig,
25
+ TuningConfig,
26
+ ValueNetworkConfig,
27
+ )
28
+ from synplan.utils.loading import download_all_data
29
+ from synplan.utils.visualisation import (
30
+ routes_clustering_report,
31
+ routes_subclustering_report,
32
+ )
33
+
34
+ warnings.filterwarnings("ignore")
35
+
36
+
37
# Root command group: all subcommands below register themselves on it via
# the @synplan.command decorators.
@click.group(name="synplan")
def synplan():
    """SynPlanner command line interface."""
40
+
41
+
42
@synplan.command(name="download_all_data")
@click.option(
    "--save_to",
    "save_to",
    default=".",
    show_default=True,
    help="Path to the folder where downloaded data will be stored.",
)
def download_all_data_cli(save_to: str = ".") -> None:
    """Downloads all data for training, planning and benchmarking SynPlanner.

    :param save_to: Folder where the downloaded data will be stored
        (defaults to the current directory).
    """
    # Fix: without default="." Click passes None when --save_to is omitted,
    # making the Python-level default above unreachable.
    download_all_data(save_to=save_to)
51
+
52
+
53
# Thin CLI wrapper: reads building blocks from --input, writes the
# standardized set to --output. All chemistry lives in
# synplan.chem.utils.standardize_building_blocks.
@synplan.command(name="building_blocks_standardizing")
@click.option(
    "--input",
    "input_file",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with building blocks to be standardized.",
)
@click.option(
    "--output",
    "output_file",
    required=True,
    type=click.Path(),
    help="Path to the file where standardized building blocks will be stored.",
)
def building_blocks_standardizing_cli(input_file: str, output_file: str) -> None:
    """Standardizes building blocks."""
    standardize_building_blocks(input_file=input_file, output_file=output_file)
71
+
72
+
73
@synplan.command(name="reaction_standardizing")
@click.option(
    "--config",
    "config_path",
    required=True,
    type=click.Path(exists=True),
    help="Path to the configuration file for reactions standardizing.",
)
@click.option(
    "--input",
    "input_file",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with reactions to be standardized.",
)
@click.option(
    "--output",
    "output_file",
    # Robustness fix: previously this option had neither `required` nor a
    # default, so omitting it passed None downstream and failed late inside
    # standardize_reactions_from_file. Fail fast at argument parsing instead.
    required=True,
    type=click.Path(),
    help="Path to the file where standardized reactions will be stored.",
)
@click.option(
    "--num_cpus", default=4, type=int, help="The number of CPUs to use for processing."
)
def reaction_standardizing_cli(
    config_path: str, input_file: str, output_file: str, num_cpus: int
) -> None:
    """Standardizes reactions and remove duplicates."""
    # The YAML config selects which standardization steps to run.
    stand_config = ReactionStandardizationConfig.from_yaml(config_path)
    standardize_reactions_from_file(
        config=stand_config,
        input_reaction_data_path=input_file,
        standardized_reaction_data_path=output_file,
        num_cpus=num_cpus,
        batch_size=100,  # fixed batch size; matches the other data-prep commands
    )
109
+
110
+
111
@synplan.command(name="reaction_filtering")
@click.option(
    "--config",
    "config_path",
    required=True,
    type=click.Path(exists=True),
    help="Path to the configuration file for reactions filtering.",
)
@click.option(
    "--input",
    "input_file",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with reactions to be filtered.",
)
@click.option(
    "--output",
    "output_file",
    # NOTE(review): Path("./") is a directory, not a file path — confirm the
    # intended default output location for filtered reactions.
    default=Path("./"),
    type=click.Path(),
    help="Path to the file where successfully filtered reactions will be stored.",
)
@click.option(
    "--num_cpus", default=4, type=int, help="The number of CPUs to use for processing."
)
def reaction_filtering_cli(
    config_path: str, input_file: str, output_file: str, num_cpus: int
):
    """Filters erroneous reactions."""
    # Consistency fix: call from_yaml on the class, as every other command in
    # this module does, instead of building a throwaway instance first.
    reaction_check_config = ReactionFilterConfig.from_yaml(config_path)
    filter_reactions_from_file(
        config=reaction_check_config,
        input_reaction_data_path=input_file,
        filtered_reaction_data_path=output_file,
        num_cpus=num_cpus,
        batch_size=100,
    )
148
+
149
+
150
# Extracts retrosynthetic reaction rules (templates) from a curated reaction
# file; the YAML config controls the extraction parameters.
@synplan.command(name="rule_extracting")
@click.option(
    "--config",
    "config_path",
    required=True,
    type=click.Path(exists=True),
    help="Path to the configuration file for reaction rules extracting.",
)
@click.option(
    "--input",
    "input_file",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with reactions for reaction rules extraction.",
)
@click.option(
    "--output",
    "output_file",
    required=True,
    type=click.Path(),
    help="Path to the file where extracted reaction rules will be stored.",
)
@click.option(
    "--num_cpus", default=4, type=int, help="The number of CPUs to use for processing."
)
def rule_extracting_cli(
    config_path: str, input_file: str, output_file: str, num_cpus: int
):
    """Reaction rules extraction."""
    reaction_rule_config = RuleExtractionConfig.from_yaml(config_path)
    extract_rules_from_reactions(
        config=reaction_rule_config,
        reaction_data_path=input_file,
        reaction_rules_path=output_file,
        num_cpus=num_cpus,
        batch_size=100,  # same fixed batch size as the other data-prep commands
    )
187
+
188
+
189
# Trains the "ranking" policy network: builds a preprocessed dataset from
# reactions + extracted rules, then runs supervised training.
@synplan.command(name="ranking_policy_training")
@click.option(
    "--config",
    "config_path",
    required=True,
    type=click.Path(exists=True),
    help="Path to the configuration file for ranking policy training.",
)
@click.option(
    "--reaction_data",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with reactions for ranking policy training.",
)
@click.option(
    "--reaction_rules",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with extracted reaction rules.",
)
@click.option(
    "--results_dir",
    default=Path("."),
    type=click.Path(),
    help="Path to the directory where the trained policy network will be stored.",
)
@click.option(
    "--num_cpus",
    default=4,
    type=int,
    help="The number of CPUs to use for training set preparation.",
)
def ranking_policy_training_cli(
    config_path: str,
    reaction_data: str,
    reaction_rules: str,
    results_dir: str,
    num_cpus: int,
) -> None:
    """Ranking policy network training."""
    policy_config = PolicyNetworkConfig.from_yaml(config_path)
    # Force the ranking flavour regardless of what the YAML says.
    policy_config.policy_type = "ranking"
    # Intermediate preprocessed dataset, written next to the results.
    # NOTE(review): the filtering command writes "policy_dataset.ckpt" for the
    # analogous artifact — confirm which extension is intended.
    policy_dataset_file = os.path.join(results_dir, "policy_dataset.dt")

    datamodule = create_policy_dataset(
        reaction_rules_path=reaction_rules,
        molecules_or_reactions_path=reaction_data,
        output_path=policy_dataset_file,
        dataset_type="ranking",
        batch_size=policy_config.batch_size,
        num_cpus=num_cpus,
    )

    run_policy_training(datamodule, config=policy_config, results_path=results_dir)
243
+
244
+
245
# Trains the "filtering" policy network: builds a preprocessed dataset from
# molecules + extracted rules, then runs supervised training. Mirrors
# ranking_policy_training_cli except for the dataset type and input kind.
@synplan.command(name="filtering_policy_training")
@click.option(
    "--config",
    "config_path",
    required=True,
    type=click.Path(exists=True),
    help="Path to the configuration file for filtering policy training.",
)
@click.option(
    "--molecule_data",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with molecules for filtering policy training.",
)
@click.option(
    "--reaction_rules",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with extracted reaction rules.",
)
@click.option(
    "--results_dir",
    default=Path("."),
    type=click.Path(),
    help="Path to the directory where the trained policy network will be stored.",
)
@click.option(
    "--num_cpus",
    default=8,
    type=int,
    help="The number of CPUs to use for training set preparation.",
)
def filtering_policy_training_cli(
    config_path: str,
    molecule_data: str,
    reaction_rules: str,
    results_dir: str,
    num_cpus: int,
):
    """Filtering policy network training."""

    policy_config = PolicyNetworkConfig.from_yaml(config_path)
    # Force the filtering flavour regardless of what the YAML says.
    policy_config.policy_type = "filtering"
    # NOTE(review): ".ckpt" for a dataset file (the ranking command uses
    # ".dt") — confirm which extension is intended.
    policy_dataset_file = os.path.join(results_dir, "policy_dataset.ckpt")

    datamodule = create_policy_dataset(
        reaction_rules_path=reaction_rules,
        molecules_or_reactions_path=molecule_data,
        output_path=policy_dataset_file,
        dataset_type="filtering",
        batch_size=policy_config.batch_size,
        num_cpus=num_cpus,
    )

    run_policy_training(datamodule, config=policy_config, results_path=results_dir)
300
+
301
+
302
@synplan.command(name="value_network_tuning")
@click.option(
    "--config",
    "config_path",
    required=True,
    type=click.Path(exists=True),
    help="Path to the configuration file for value network training.",
)
@click.option(
    "--targets",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with target molecules for planning simulations.",
)
@click.option(
    "--reaction_rules",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with extracted reaction rules. Needed for planning simulations.",
)
@click.option(
    "--building_blocks",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with building blocks. Needed for planning simulations.",
)
@click.option(
    "--policy_network",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with trained policy network. Needed for planning simulations.",
)
@click.option(
    "--value_network",
    default=None,
    type=click.Path(exists=True),
    help="Path to the file with trained value network. Needed in case of additional value network fine-tuning",
)
@click.option(
    "--results_dir",
    default=".",
    type=click.Path(exists=False),
    help="Path to the directory where the trained value network will be stored.",
)
def value_network_tuning_cli(
    config_path: str,
    targets: str,
    reaction_rules: str,
    building_blocks: str,
    policy_network: str,
    value_network: str,
    results_dir: str,
):
    """Value network tuning."""

    # The YAML holds one section per component (expansion, value net, tree, tuning).
    with open(config_path, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    policy_config = PolicyNetworkConfig.from_dict(config["node_expansion"])
    policy_config.weights_path = policy_network

    value_config = ValueNetworkConfig.from_dict(config["value_network"])
    if value_network is None:
        # No checkpoint supplied: train from scratch into the results folder.
        value_config.weights_path = os.path.join(
            results_dir, "weights", "value_network.ckpt"
        )
    else:
        # Bug fix: honor the CLI-supplied checkpoint for fine-tuning.
        # Previously --value_network was parsed but never used, so the option
        # had no effect unless the YAML already carried the same path.
        value_config.weights_path = value_network

    tree_config = TreeConfig.from_dict(config["tree"])
    tuning_config = TuningConfig.from_dict(config["tuning"])

    run_updating(
        targets_path=targets,
        tree_config=tree_config,
        policy_config=policy_config,
        value_config=value_config,
        reinforce_config=tuning_config,
        reaction_rules_path=reaction_rules,
        building_blocks_path=building_blocks,
        results_root=results_dir,
    )
382
+
383
+
384
@synplan.command(name="planning")
@click.option(
    "--config",
    "config_path",
    required=True,
    type=click.Path(exists=True),
    help="Path to the configuration file for retrosynthetic planning.",
)
@click.option(
    "--targets",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with target molecules for retrosynthetic planning.",
)
@click.option(
    "--reaction_rules",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with extracted reaction rules.",
)
@click.option(
    "--building_blocks",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with building blocks.",
)
@click.option(
    "--policy_network",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with trained policy network.",
)
@click.option(
    "--value_network",
    default=None,
    type=click.Path(exists=True),
    help="Path to the file with trained value network.",
)
@click.option(
    "--results_dir",
    default=".",
    type=click.Path(exists=False),
    help="Path to the file where retrosynthetic planning results will be stored.",
)
def planning_cli(
    config_path: str,
    targets: str,
    reaction_rules: str,
    building_blocks: str,
    policy_network: str,
    value_network: str,
    results_dir: str,
):
    """Retrosynthetic planning."""

    with open(config_path, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    # Tree-search and node-evaluation settings are merged into one dict for run_search.
    search_config = {**config["tree"], **config["node_evaluation"]}
    # Idiom fix: inline the key instead of splatting a throwaway one-item dict
    # ({**d, **{"k": v}} -> {**d, "k": v}).
    policy_config = PolicyNetworkConfig.from_dict(
        {**config["node_expansion"], "weights_path": policy_network}
    )

    run_search(
        targets_path=targets,
        search_config=search_config,
        policy_config=policy_config,
        reaction_rules_path=reaction_rules,
        building_blocks_path=building_blocks,
        value_network_path=value_network,
        results_root=results_dir,
    )
456
+
457
+
458
@synplan.command(name="clustering")
@click.option(
    "--targets",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with target molecules for retrosynthetic planning.",
)
@click.option(
    "--routes_file",
    default=".",
    type=click.Path(exists=False),
    help="Path to the file where the planning results are stored.",
)
@click.option(
    "--cluster_results_dir",
    default=".",
    type=click.Path(exists=False),
    help="Path to the file where clustering results will be stored.",
)
@click.option(
    "--perform_subcluster",
    # Bug fix: this boolean switch was declared as click.Path, so any string
    # value (a path) toggled subclustering. Declare it as a proper flag.
    is_flag=True,
    default=False,
    help="Perform subclustering.",
)
@click.option(
    "--subcluster_results_dir",
    default=".",
    type=click.Path(exists=False),
    help="Path to the file where subclustering results will be stored.",
)
def cluster_route_from_file_cli(
    targets: str,
    routes_file: str,
    cluster_results_dir: str,
    perform_subcluster: bool,
    subcluster_results_dir: str,
):
    """Clustering the routes from planning"""
    # NOTE(review): `targets` is accepted but not forwarded to run_cluster_cli —
    # kept for CLI compatibility; confirm whether it should be used here.
    run_cluster_cli(
        routes_file=routes_file,
        cluster_results_dir=cluster_results_dir,
        perform_subcluster=perform_subcluster,
        subcluster_results_dir=subcluster_results_dir if perform_subcluster else None,
    )
503
+
504
+
505
# Entry point when the module is executed directly (python -m / script).
if __name__ == "__main__":
    synplan()
synplan/interfaces/gui.py ADDED
@@ -0,0 +1,1323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import pickle
3
+ import re
4
+ import uuid
5
+ import io
6
+ import zipfile
7
+
8
+ import pandas as pd
9
+ import streamlit as st
10
+ from CGRtools.files import SMILESRead
11
+ from streamlit_ketcher import st_ketcher
12
+ from huggingface_hub import hf_hub_download
13
+ from huggingface_hub.utils import disable_progress_bars
14
+
15
+
16
+ from synplan.mcts.expansion import PolicyNetworkFunction
17
+ from synplan.mcts.search import extract_tree_stats
18
+ from synplan.mcts.tree import Tree
19
+ from synplan.chem.utils import mol_from_smiles
20
+ from synplan.chem.reaction_routes.route_cgr import *
21
+ from synplan.chem.reaction_routes.clustering import *
22
+
23
+ from synplan.utils.visualisation import (
24
+ routes_clustering_report,
25
+ routes_subclustering_report,
26
+ generate_results_html,
27
+ html_top_routes_cluster,
28
+ get_route_svg,
29
+ get_route_svg_from_json
30
+ )
31
+ from synplan.utils.config import TreeConfig, PolicyNetworkConfig
32
+ from synplan.utils.loading import load_reaction_rules, load_building_blocks
33
+
34
+
35
+ import psutil
36
+ import gc
37
+
38
+
39
# Hide huggingface_hub download progress bars; they clutter the Streamlit log.
disable_progress_bars("huggingface_hub")

# Lenient SMILES parser: ignore=True skips records CGRtools cannot parse.
smiles_parser = SMILESRead.create_parser(ignore=True)
# Molecule preloaded into the editor on first page load.
DEFAULT_MOL = "c1cc(ccc1Cl)C(CCO)NC(C2(CCN(CC2)c3c4cc[nH]c4ncn3)N)=O"
43
+
44
+
45
+ # --- Helper Functions ---
46
def download_button(
    object_to_download, download_filename, button_text, pickle_it=False
):
    """Generate a styled HTML anchor that downloads *object_to_download*.

    Params:
    ------
    object_to_download: The object to be downloaded. Strings and bytes are
        embedded as-is; pandas DataFrames are serialized to CSV first.
    download_filename (str): filename and extension of the file, e.g.
        mydata.csv, some_txt_output.txt.
    button_text (str): Text to display on the download button
        (e.g. 'click here to download file').
    pickle_it (bool): If True, pickle the object before embedding.

    Returns:
    -------
    (str): the anchor tag (with inline CSS) to download object_to_download,
        or None if pickling was requested and failed.

    Examples:
    --------
    download_button(your_df, 'YOUR_DF.csv', 'Click to download data!')
    download_button(your_str, 'YOUR_STRING.txt', 'Click to download text!')
    """
    if pickle_it:
        try:
            object_to_download = pickle.dumps(object_to_download)
        except pickle.PicklingError as e:
            st.write(e)
            return None

    else:
        if isinstance(object_to_download, bytes):
            pass

        elif isinstance(object_to_download, pd.DataFrame):
            object_to_download = object_to_download.to_csv(index=False).encode("utf-8")

    # Embed the payload as base64 so it fits inside a data: URI.
    try:
        b64 = base64.b64encode(object_to_download.encode()).decode()
    except AttributeError:
        # Already bytes (no .encode method).
        b64 = base64.b64encode(object_to_download).decode()

    # Unique element id so per-button CSS rules never collide.
    button_uuid = str(uuid.uuid4()).replace("-", "")
    # Bug fix: use a raw string for the regex — "\d" is an invalid escape in a
    # plain string literal (SyntaxWarning on modern CPython).
    button_id = re.sub(r"\d+", "", button_uuid)

    custom_css = f"""
        <style>
            #{button_id} {{
                background-color: rgb(255, 255, 255);
                color: rgb(38, 39, 48);
                text-decoration: none;
                border-radius: 4px;
                border-width: 1px;
                border-style: solid;
                border-color: rgb(230, 234, 241);
                border-image: initial;
            }}
            #{button_id}:hover {{
                border-color: rgb(246, 51, 102);
                color: rgb(246, 51, 102);
            }}
            #{button_id}:active {{
                box-shadow: none;
                background-color: rgb(246, 51, 102);
                color: white;
            }}
        </style> """

    dl_link = (
        custom_css
        + f'<a download="{download_filename}" id="{button_id}" href="data:file/txt;base64,{b64}">{button_text}</a><br></br>'
    )
    return dl_link
118
+
119
+
120
@st.cache_resource
def load_planning_resources_cached():  # Renamed to avoid conflict if main calls it directly
    """Fetch (and cache for the session) the three planning artifacts.

    Downloads the building-blocks file, the ranking policy weights and the
    reaction rules from the SynPlanner Hub repo into the working directory
    and returns their local paths in that order.
    """
    repo = "Laboratoire-De-Chemoinformatique/SynPlanner"
    # (filename, subfolder) for each artifact, in the order callers unpack.
    artifact_specs = (
        ("building_blocks_em_sa_ln.smi", "building_blocks"),
        ("ranking_policy_network.ckpt", "uspto/weights"),
        ("uspto_reaction_rules.pickle", "uspto"),
    )
    local_paths = tuple(
        hf_hub_download(
            repo_id=repo,
            filename=fname,
            subfolder=subfolder,
            local_dir=".",
        )
        for fname, subfolder in artifact_specs
    )
    return local_paths
141
+
142
+
143
+ # --- GUI Sections ---
144
+
145
+
146
def initialize_app():
    """1. Initialization: Setting up the main window, layout, and initial widgets."""
    st.set_page_config(page_title="SynPlanner GUI", page_icon="🧪", layout="wide")

    # Seed every session-state slot once per session; later reruns keep the
    # existing values untouched.
    session_defaults = {
        # Planning state
        "planning_done": False,
        "tree": None,
        "res": None,
        "target_smiles": "",  # may be overwritten by the ketcher editor
        # Clustering state
        "clustering_done": False,
        "clusters": None,
        "reactions_dict": None,
        "num_clusters_setting": 10,  # store the setting actually used
        "route_cgrs_dict": None,
        "sb_cgrs_dict": None,
        "route_json": None,
        # Subclustering state
        "subclustering_done": False,
        "subclusters": None,
        # Download state (less critical now with direct download links)
        "clusters_downloaded": False,
        # Ketcher editor persistence
        "ketcher": DEFAULT_MOL,
    }
    for state_key, default_value in session_defaults.items():
        if state_key not in st.session_state:
            st.session_state[state_key] = default_value

    intro_text = """
    This is a demo of the graphical user interface of
    [SynPlanner](https://github.com/Laboratoire-de-Chemoinformatique/SynPlanner/).
    SynPlanner is a comprehensive tool for reaction data curation, rule extraction, model training and retrosynthetic planning.

    More information on SynPlanner is available in the [official docs](https://synplanner.readthedocs.io/en/latest/index.html).
    """
    st.title("`SynPlanner GUI`")
    st.write(intro_text)
200
+
201
+
202
def setup_sidebar():
    """2. Sidebar: Handling the widgets and logic within the sidebar area."""
    # st.sidebar.image("img/logo.png") # Assuming img/logo.png is available
    # (title, markdown body) pairs rendered top to bottom.
    sidebar_sections = (
        ("Docs", "https://synplanner.readthedocs.io/en/latest/"),
        (
            "Tutorials",
            "https://github.com/Laboratoire-de-Chemoinformatique/SynPlanner/tree/main/tutorials",
        ),
        (
            "Paper",
            "https://chemrxiv.org/engage/chemrxiv/article-details/66add90bc9c6a5c07ae65796",
        ),
        (
            "Issues",
            "[Report a bug 🐞](https://github.com/Laboratoire-de-Chemoinformatique/SynPlanner/issues/new?assignees=&labels=bug&projects=&template=bug_report.md&title=%5BBUG%5D)",
        ),
    )
    for section_title, section_body in sidebar_sections:
        st.sidebar.title(section_title)
        st.sidebar.markdown(section_body)
222
+
223
+
224
def handle_molecule_input():
    """3. Molecule Input: Managing the input area for molecule data with two-way synchronization.

    Keeps the SMILES text box and the ketcher drawing widget in sync via the
    shared session key ``shared_smiles``. Returns the SMILES string that the
    planning step should use.
    """
    st.header("Molecule input")
    st.markdown(
        """
    You can provide a molecular structure by either providing:
    * SMILES string + Enter
    * Draw it + Apply
    """
    )

    # Single source of truth for the current structure, shared by both widgets.
    if "shared_smiles" not in st.session_state:
        st.session_state.shared_smiles = st.session_state.get("ketcher", DEFAULT_MOL)

    # Incremented to force a fresh ketcher widget key (remounts the editor)
    # whenever the text input changes the structure.
    if "ketcher_render_count" not in st.session_state:
        st.session_state.ketcher_render_count = 0

    def text_input_changed_callback():
        # Runs before the rerun triggered by the text input's on_change.
        new_text_value = (
            st.session_state.smiles_text_input_key_for_sync
        )  # Key of the text_input
        if new_text_value != st.session_state.shared_smiles:
            st.session_state.shared_smiles = new_text_value
            st.session_state.ketcher = new_text_value
            # Bump the counter so the ketcher widget is recreated with the new SMILES.
            st.session_state.ketcher_render_count += 1

    # SMILES Text Input
    st.text_input(
        "SMILES:",
        value=st.session_state.shared_smiles,
        key="smiles_text_input_key_for_sync",  # Unique key for this widget
        on_change=text_input_changed_callback,
        help="Enter SMILES string and press Enter. The drawing will update, and vice-versa.",
    )

    # The render count is baked into the key: changing it remounts the editor.
    ketcher_key = f"ketcher_widget_for_sync_{st.session_state.ketcher_render_count}"
    smile_code_output_from_ketcher = st_ketcher(
        st.session_state.shared_smiles, key=ketcher_key
    )

    # Editor -> text direction: if the user clicked Apply in ketcher, adopt the
    # drawn structure and rerun so the text input reflects it.
    if smile_code_output_from_ketcher != st.session_state.shared_smiles:
        st.session_state.shared_smiles = smile_code_output_from_ketcher
        st.session_state.ketcher = smile_code_output_from_ketcher
        st.rerun()

    current_smiles_for_planning = st.session_state.shared_smiles

    # Warn if the molecule changed after a successful planning run: the results
    # currently on screen belong to the previous structure.
    last_planned_smiles = st.session_state.get("target_smiles")
    if (
        last_planned_smiles
        and current_smiles_for_planning != last_planned_smiles
        and st.session_state.get("planning_done", False)
    ):
        st.warning(
            "Molecule structure has changed since the last successful planning run. "
            "Results shown below (if any) are for the previous molecule. "
            "Please re-run planning for the current structure."
        )

    # Ensure st.session_state.ketcher is consistent for other parts of the app
    if st.session_state.get("ketcher") != current_smiles_for_planning:
        st.session_state.ketcher = current_smiles_for_planning

    return current_smiles_for_planning
288
+
289
+
290
def setup_planning_options():
    """4. Planning: Encapsulating the logic related to the "planning" functionality.

    Renders the MCTS configuration widgets, and — on button press — loads the
    cached planning resources, builds the search tree, runs the iterations
    with a progress bar and stores the tree + stats in session state.
    """
    st.header("Launch calculation")
    st.markdown(
        """If you modified the structure, please ensure you clicked on `Apply` (bottom right of the molecular editor)."""
    )

    st.markdown(
        f"The molecule SMILES is actually: ``{st.session_state.get('ketcher', DEFAULT_MOL)}``"
    )

    st.subheader("Planning options")
    st.markdown(
        """
    The description of each option can be found in the
    [Retrosynthetic Planning Tutorial](https://synplanner.readthedocs.io/en/latest/tutorial_files/retrosynthetic_planning.html#Configuring-search-tree).
    """
    )

    # Two-column layout: search-strategy knobs on the left, size limits right.
    col_options_1, col_options_2 = st.columns(2, gap="medium")
    with col_options_1:
        search_strategy_input = st.selectbox(
            label="Search strategy",
            options=(
                "Expansion first",
                "Evaluation first",
            ),
            index=0,
            key="search_strategy_input",
        )
        ucb_type = st.selectbox(
            label="UCB type",
            options=("uct", "puct", "value"),
            index=0,
            key="ucb_type_input",
        )  # Fixed label
        c_ucb = st.number_input(
            "C coefficient of UCB",
            value=0.1,
            placeholder="Type a number...",
            key="c_ucb_input",
        )

    with col_options_2:
        max_iterations = st.slider(
            "Total number of MCTS iterations",
            min_value=50,
            max_value=1000,
            value=300,
            key="max_iterations_slider",
        )
        max_depth = st.slider(
            "Maximal number of reaction steps",
            min_value=3,
            max_value=9,
            value=6,
            key="max_depth_slider",
        )
        min_mol_size = st.slider(
            "Minimum size of a molecule to be precursor",
            min_value=0,
            max_value=7,
            value=0,
            key="min_mol_size_slider",
            help="Number of non-hydrogen atoms in molecule",
        )

    # Map the human-readable selectbox labels to TreeConfig identifiers.
    search_strategy_translator = {
        "Expansion first": "expansion_first",
        "Evaluation first": "evaluation_first",
    }
    search_strategy = search_strategy_translator[search_strategy_input]

    planning_params = {
        "search_strategy": search_strategy,
        "ucb_type": ucb_type,
        "c_ucb": c_ucb,
        "max_iterations": max_iterations,
        "max_depth": max_depth,
        "min_mol_size": min_mol_size,
    }

    if st.button("Start retrosynthetic planning", key="submit_planning_button"):
        # Reset downstream states if replanning
        st.session_state.planning_done = False
        st.session_state.clustering_done = False
        st.session_state.subclustering_done = False
        st.session_state.tree = None
        st.session_state.res = None
        st.session_state.clusters = None
        st.session_state.reactions_dict = None
        st.session_state.subclusters = None
        st.session_state.route_cgrs_dict = None
        st.session_state.sb_cgrs_dict = None
        st.session_state.route_json = None
        active_smile_code = st.session_state.get(
            "ketcher", DEFAULT_MOL
        )  # Get current SMILES
        st.session_state.target_smiles = (
            active_smile_code  # Store the SMILES used for this run
        )

        try:
            target_molecule = mol_from_smiles(active_smile_code, clean_stereo=True)
            if target_molecule is None:
                st.error(f"Could not parse the input SMILES: {active_smile_code}")
            else:
                # Paths come from the session-level cache (downloads on first call).
                (
                    building_blocks_path,
                    ranking_policy_weights_path,
                    reaction_rules_path,
                ) = load_planning_resources_cached()
                with st.spinner("Running retrosynthetic planning..."):
                    with st.status("Loading resources...", expanded=False) as status:
                        st.write("Loading building blocks...")
                        building_blocks = load_building_blocks(
                            building_blocks_path, standardize=False
                        )
                        st.write("Loading reaction rules...")
                        reaction_rules = load_reaction_rules(reaction_rules_path)
                        st.write("Loading policy network...")
                        policy_config = PolicyNetworkConfig(
                            weights_path=ranking_policy_weights_path
                        )
                        policy_function = PolicyNetworkFunction(
                            policy_config=policy_config
                        )
                        status.update(label="Resources loaded!", state="complete")

                    tree_config = TreeConfig(
                        search_strategy=planning_params["search_strategy"],
                        evaluation_type="rollout",  # This was hardcoded, keeping it.
                        max_iterations=planning_params["max_iterations"],
                        max_depth=planning_params["max_depth"],
                        min_mol_size=planning_params["min_mol_size"],
                        init_node_value=0.5,  # This was hardcoded
                        ucb_type=planning_params["ucb_type"],
                        c_ucb=planning_params["c_ucb"],
                        silent=True,  # This was hardcoded
                    )

                    tree = Tree(
                        target=target_molecule,
                        config=tree_config,
                        reaction_rules=reaction_rules,
                        building_blocks=building_blocks,
                        expansion_function=policy_function,
                        evaluation_function=None,  # This was hardcoded
                    )

                    # Iterating the Tree object runs one MCTS iteration per step.
                    mcts_progress_text = "Running MCTS iterations..."
                    mcts_bar = st.progress(0, text=mcts_progress_text)
                    for step, (solved, route_id) in enumerate(tree):
                        progress_value = min(
                            1.0, (step + 1) / planning_params["max_iterations"]
                        )
                        mcts_bar.progress(
                            progress_value,
                            text=f"{mcts_progress_text} ({step+1}/{planning_params['max_iterations']})",
                        )

                    res = extract_tree_stats(tree, target_molecule)

                    # Persist results for the display/clustering sections, then
                    # rerun so the results view picks them up.
                    st.session_state["tree"] = tree
                    st.session_state["res"] = res
                    st.session_state.planning_done = True
                    st.rerun()

        except Exception as e:
            # Broad catch is deliberate: this is the GUI's top-level boundary
            # for the whole planning pipeline.
            st.error(f"An error occurred during planning: {e}")
            st.session_state.planning_done = False
461
+
462
+
463
def display_planning_results():
    """5. Planning Results Display.

    Renders the outcome of the last MCTS planning run stored in
    ``st.session_state`` (``res`` statistics dict and ``tree`` search object):
    up to three example routes when solved, otherwise run statistics.
    """
    if not st.session_state.get("planning_done", False):
        return

    stats = st.session_state.res
    search_tree = st.session_state.tree

    # Both objects are required; bail out and reset the flag if either is gone.
    if stats is None or search_tree is None:
        st.error(
            "Planning results are missing from session state. Please re-run planning."
        )
        st.session_state.planning_done = False
        return

    st.header("Planning Results")

    if stats.get("solved", False):
        if hasattr(search_tree, "winning_nodes") and search_tree.winning_nodes:
            winning_nodes = sorted(set(search_tree.winning_nodes))
        else:
            winning_nodes = []
        st.subheader(f"Number of unique routes found: {len(winning_nodes)}")

        st.subheader("Examples of found retrosynthetic routes")
        if not winning_nodes:
            st.warning(
                "Planning solved, but no winning nodes found in the tree object."
            )
            return

        shown_count = 0
        seen_route_ids = set()
        for route_id in winning_nodes:
            # Show at most three distinct routes.
            if shown_count >= 3:
                break
            if route_id in seen_route_ids:
                continue
            try:
                seen_route_ids.add(route_id)
                step_count = len(search_tree.synthesis_route(route_id))
                score = round(search_tree.route_score(route_id), 3)
                route_svg = get_route_svg(search_tree, route_id)
                # svg = get_route_svg_from_json(st.session_state.route_json, route_id)
                if route_svg:
                    st.image(
                        route_svg,
                        caption=f"Route {route_id}; {step_count} steps; Route score: {score}",
                    )
                    shown_count += 1
                else:
                    st.warning(f"Could not generate SVG for route {route_id}.")
            except Exception as e:
                st.error(f"Error displaying route {route_id}: {e}")
        return

    # Unsolved run: explain the failure and show what statistics we have.
    st.warning(
        "No reaction path found for the target molecule with the current settings."
    )
    st.write(
        "Consider adjusting planning options (e.g., increase iterations, adjust depth, check molecule validity)."
    )
    stat_col, _ = st.columns(2)
    with stat_col:
        st.subheader("Run Statistics (No Solution)")
        try:
            if "target_smiles" not in stats and "target_smiles" in st.session_state:
                stats["target_smiles"] = st.session_state.target_smiles
            cols_to_show = [
                key
                for key in ("target_smiles", "num_nodes", "num_iter", "search_time")
                if key in stats
            ]
            if cols_to_show:
                st.dataframe(pd.DataFrame(stats, index=[0])[cols_to_show])
            else:
                st.write("No statistics to display for the unsuccessful run.")
        except Exception as e:
            st.error(f"Error displaying statistics: {e}")
            st.write(stats)
551
+
552
+
553
def download_planning_results():
    """6. Planning Results Download.

    Offers download links for the HTML results report and a CSV of run
    statistics once planning has completed with a solution.

    BUGFIX: the raw target SMILES was embedded directly in the file names.
    SMILES strings may contain characters that are illegal or dangerous in
    file names (``/`` and ``\\`` in stereo bonds, ``#`` in triple bonds, ...),
    which broke the generated downloads; sanitize the SMILES first.
    """
    if (
        st.session_state.get("planning_done", False)
        and st.session_state.res
        and st.session_state.res.get("solved", False)
    ):
        import re  # local import: the file-level import block is outside this section

        res = st.session_state.res
        tree = st.session_state.tree

        # Replace every character outside [A-Za-z0-9_.-] so the SMILES can be
        # safely embedded in a file name.
        safe_smiles = re.sub(r"[^\w.-]", "_", st.session_state.target_smiles)

        try:
            html_body = generate_results_html(tree, html_path=None, extended=True)
            dl_html = download_button(
                html_body,
                f"results_synplanner_{safe_smiles}.html",
                "Download results (HTML)",
            )
            if dl_html:
                st.markdown(dl_html, unsafe_allow_html=True)

            try:
                res_df = pd.DataFrame(res, index=[0])
                dl_csv = download_button(
                    res_df,
                    f"stats_synplanner_{safe_smiles}.csv",
                    "Download statistics (CSV)",
                )
                if dl_csv:
                    st.markdown(dl_csv, unsafe_allow_html=True)
            except Exception as e:
                st.error(f"Could not prepare statistics CSV for download: {e}")

        except Exception as e:
            st.error(f"Error generating download links for planning results: {e}")
595
+
596
+
597
def setup_clustering():
    """7. Clustering: logic for the "Run Clustering" button.

    Once planning has solved the target, composes RouteCGRs and SB-CGRs from
    the search tree, clusters the found routes, stores all intermediates in
    ``st.session_state`` and reruns the app so the results sections render.

    BUGFIX: ``st.rerun()`` works by raising a Streamlit control-flow
    exception (``RerunException``, a subclass of ``Exception``). The original
    code called it inside the ``try`` block, so the broad ``except Exception``
    swallowed the rerun, displayed a spurious "An error occurred during
    clustering" message and reset ``clustering_done``. The rerun is now
    triggered only after leaving the ``try``/``except``.
    """
    if (
        st.session_state.get("planning_done", False)
        and st.session_state.res
        and st.session_state.res.get("solved", False)
    ):
        st.divider()
        st.header("Clustering the retrosynthetic routes")

        if st.button("Run Clustering", key="submit_clustering_button"):
            # Drop any stale clustering / subclustering state before recomputing.
            st.session_state.clustering_done = False
            st.session_state.subclustering_done = False
            st.session_state.clusters = None
            st.session_state.reactions_dict = None
            st.session_state.subclusters = None
            st.session_state.route_cgrs_dict = None
            st.session_state.sb_cgrs_dict = None
            st.session_state.route_json = None

            rerun_needed = False
            with st.spinner("Performing clustering..."):
                try:
                    current_tree = st.session_state.tree
                    if not current_tree:
                        st.error("Tree object not found. Please re-run planning.")
                        return

                    st.write("Calculating RoutesCGRs...")
                    route_cgrs_dict = compose_all_route_cgrs(current_tree)
                    st.write("Processing SB-CGRs...")
                    sb_cgrs_dict = compose_all_sb_cgrs(route_cgrs_dict)

                    results = cluster_routes(sb_cgrs_dict, use_strat=False)
                    # Sort clusters by numeric key for a stable display order.
                    results = dict(sorted(results.items(), key=lambda x: float(x[0])))

                    st.session_state.clusters = results
                    st.session_state.route_cgrs_dict = route_cgrs_dict
                    st.session_state.sb_cgrs_dict = sb_cgrs_dict
                    st.write("Extracting reactions...")
                    st.session_state.reactions_dict = extract_reactions(current_tree)
                    st.session_state.route_json = make_json(
                        st.session_state.reactions_dict
                    )

                    if (
                        st.session_state.clusters is not None
                        and st.session_state.reactions_dict is not None
                    ):  # Check for None explicitly
                        st.session_state.clustering_done = True
                        st.success(
                            f"Clustering complete. Found {len(st.session_state.clusters)} clusters."
                        )
                    else:
                        st.error("Clustering failed or returned empty results.")
                        st.session_state.clustering_done = False

                    del results  # route_cgrs_dict, sb_cgrs_dict are stored
                    gc.collect()
                    rerun_needed = True
                except Exception as e:
                    st.error(f"An error occurred during clustering: {e}")
                    st.session_state.clustering_done = False

            # Rerun outside the try so the control-flow exception propagates.
            if rerun_needed:
                st.rerun()
660
+
661
+
662
def _render_cluster_entry(tree, cluster_num, group_data):
    """Render one cluster entry: SB-CGR thumbnail plus its best route drawing.

    :param tree: The MCTS search tree (provides route steps and scores).
    :param cluster_num: The cluster's key (used only in captions/messages).
    :param group_data: Cluster dict with ``route_ids``, ``group_size`` and
        optionally ``sb_cgr``.
    """
    if (
        not group_data
        or "route_ids" not in group_data
        or not group_data["route_ids"]
    ):
        st.warning(f"Cluster {cluster_num} has no data or route_ids.")
        return
    st.markdown(
        f"**Cluster {cluster_num}** (Size: {group_data.get('group_size', 'N/A')})"
    )
    # The first route id is treated as the representative (best) route.
    route_id = group_data["route_ids"][0]
    try:
        num_steps = len(tree.synthesis_route(route_id))
        route_score = round(tree.route_score(route_id), 3)
        # svg = get_route_svg(tree, route_id)
        svg = get_route_svg_from_json(st.session_state.route_json, route_id)
        sb_cgr = group_data.get("sb_cgr")  # Safely get sb_cgr
        sb_cgr_svg = None
        if sb_cgr:
            sb_cgr.clean2d()
            sb_cgr_svg = cgr_display(sb_cgr)

        if svg and sb_cgr_svg:
            col1, col2 = st.columns([0.2, 0.8])
            with col1:
                st.image(sb_cgr_svg, caption="SB-CGR")
            with col2:
                st.image(
                    svg,
                    caption=f"Route {route_id}; {num_steps} steps; Route score: {route_score}",
                )
        elif svg:  # Only route SVG available
            st.image(
                svg,
                caption=f"Route {route_id}; {num_steps} steps; Route score: {route_score}",
            )
            st.warning(f"SB-CGR could not be displayed for cluster {cluster_num}.")
        else:
            st.warning(
                f"Could not generate SVG for route {route_id} or its SB-CGR."
            )
    except Exception as e:
        st.error(
            f"Error displaying route {route_id} for cluster {cluster_num}: {e}"
        )


def display_clustering_results():
    """8. Clustering Results Display.

    Shows the best route of each cluster: the first ten inline and the rest
    inside an expander. The per-cluster rendering (previously duplicated
    verbatim for both groups) is factored into ``_render_cluster_entry``.
    """
    if not st.session_state.get("clustering_done", False):
        return

    clusters = st.session_state.clusters
    tree = st.session_state.tree
    MAX_DISPLAY_CLUSTERS_DATA = 10  # clusters shown before the expander

    if clusters is None or tree is None:
        st.error(
            "Clustering results (clusters or tree) are missing. Please re-run clustering."
        )
        st.session_state.clustering_done = False
        return

    st.subheader(f"Best routes from {len(clusters)} Found Clusters")
    clusters_items = list(clusters.items())
    first_items = clusters_items[:MAX_DISPLAY_CLUSTERS_DATA]
    remaining_items = clusters_items[MAX_DISPLAY_CLUSTERS_DATA:]

    for cluster_num, group_data in first_items:
        _render_cluster_entry(tree, cluster_num, group_data)

    if remaining_items:
        with st.expander(f"... and {len(remaining_items)} more clusters"):
            for cluster_num, group_data in remaining_items:
                _render_cluster_entry(tree, cluster_num, group_data)
785
+
786
+
787
def _cluster_report_html(tree, clusters, cluster_idx, sb_cgrs):
    """Build the standalone HTML report for one cluster (by its dict key)."""
    return routes_clustering_report(tree, clusters, str(cluster_idx), sb_cgrs, aam=False)


def download_clustering_results():
    """10. Clustering Results Download.

    Offers per-cluster HTML reports (first ten inline, the rest in an
    expander) plus a ZIP containing all of them. Report generation
    (previously triplicated) is factored into ``_cluster_report_html``.
    """
    if not st.session_state.get("clustering_done", False):
        return

    tree_for_html = st.session_state.get("tree")
    clusters_for_html = st.session_state.get("clusters")
    # The report builder consumes the SB-CGR dictionary, not reactions_dict.
    sb_cgrs_for_html = st.session_state.get("sb_cgrs_dict")

    if not tree_for_html:
        st.warning("MCTS Tree data not found. Cannot generate cluster reports.")
        return
    if not clusters_for_html:
        st.warning("Cluster data not found. Cannot generate cluster reports.")
        return

    st.subheader("Cluster Reports")
    st.write("Generate downloadable HTML reports for each cluster:")

    MAX_DOWNLOAD_LINKS_DISPLAYED = 10
    clusters_items = list(clusters_for_html.items())

    # Inline download buttons for the first few clusters.
    for cluster_idx, _ in clusters_items[:MAX_DOWNLOAD_LINKS_DISPLAYED]:
        try:
            html_content = _cluster_report_html(
                tree_for_html, clusters_for_html, cluster_idx, sb_cgrs_for_html
            )
            st.download_button(
                label=f"Download report for cluster {cluster_idx}",
                data=html_content,
                file_name=f"cluster_{cluster_idx}_{st.session_state.target_smiles}.html",
                mime="text/html",
                key=f"download_cluster_{cluster_idx}",
            )
        except Exception as e:
            st.error(f"Error generating report for cluster {cluster_idx}: {e}")

    # Remaining clusters go behind an expander to keep the page compact.
    remaining_items = clusters_items[MAX_DOWNLOAD_LINKS_DISPLAYED:]
    if remaining_items:
        with st.expander(f"Show remaining {len(remaining_items)} cluster reports"):
            for group_index, _ in remaining_items:
                try:
                    html_content = _cluster_report_html(
                        tree_for_html, clusters_for_html, group_index, sb_cgrs_for_html
                    )
                    st.download_button(
                        label=f"Download report for cluster {group_index}",
                        data=html_content,
                        file_name=f"cluster_{group_index}_{st.session_state.target_smiles}.html",
                        mime="text/html",
                        key=f"download_cluster_expanded_{group_index}",
                    )
                except Exception as e:
                    st.error(
                        f"Error generating report for cluster {group_index} (expanded): {e}"
                    )

    # Bundle every report into a single in-memory ZIP.
    try:
        buffer = io.BytesIO()
        with zipfile.ZipFile(buffer, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
            for idx, _ in clusters_items:
                html_content_zip = _cluster_report_html(
                    tree_for_html, clusters_for_html, idx, sb_cgrs_for_html
                )
                zf.writestr(
                    f"cluster_{idx}_{st.session_state.target_smiles}.html",
                    html_content_zip,
                )
        buffer.seek(0)

        st.download_button(
            label="📦 Download all cluster reports as ZIP",
            data=buffer,
            file_name=f"all_cluster_reports_{st.session_state.target_smiles}.zip",
            mime="application/zip",
            key="download_all_clusters_zip",
        )
    except Exception as e:
        st.error(f"Error generating ZIP file for cluster reports: {e}")
889
+
890
+
891
def setup_subclustering():
    """11. Subclustering: logic for the "Run Subclustering Analysis" button.

    Requires a completed clustering run; subclusters every cluster using the
    stored SB-CGR and RouteCGR dictionaries, then reruns the app.

    BUGFIX: ``st.rerun()`` raises a Streamlit control-flow exception that is
    a subclass of ``Exception``; calling it inside the ``try`` block let the
    broad ``except Exception`` swallow the rerun, show a spurious error and
    reset ``subclustering_done``. The rerun now happens after the ``try``.
    """
    if not st.session_state.get("clustering_done", False):
        return  # Subclustering depends on clustering being done

    st.divider()
    st.header("Sub-Clustering within a selected Cluster")

    if st.button("Run Subclustering Analysis", key="submit_subclustering_button"):
        st.session_state.subclustering_done = False
        st.session_state.subclusters = None
        rerun_needed = False
        with st.spinner("Performing subclustering analysis..."):
            try:
                clusters_for_sub = st.session_state.get("clusters")
                sb_cgrs_dict_for_sub = st.session_state.get("sb_cgrs_dict")
                route_cgrs_dict_for_sub = st.session_state.get("route_cgrs_dict")

                if (
                    clusters_for_sub
                    and sb_cgrs_dict_for_sub
                    and route_cgrs_dict_for_sub
                ):  # Ensure all are present
                    all_subgroups = subcluster_all_clusters(
                        clusters_for_sub,
                        sb_cgrs_dict_for_sub,
                        route_cgrs_dict_for_sub,
                    )
                    st.session_state.subclusters = all_subgroups
                    st.session_state.subclustering_done = True
                    st.success("Subclustering analysis complete.")
                    gc.collect()
                    rerun_needed = True
                else:
                    # Report exactly which prerequisite data is missing.
                    missing = []
                    if not clusters_for_sub:
                        missing.append("clusters")
                    if not sb_cgrs_dict_for_sub:
                        missing.append("SB-CGRs dictionary")
                    if not route_cgrs_dict_for_sub:
                        missing.append("RouteCGRs dictionary")
                    st.error(
                        f"Cannot run subclustering. Missing data: {', '.join(missing)}. Please ensure clustering ran successfully."
                    )
                    st.session_state.subclustering_done = False

            except Exception as e:
                st.error(f"An error occurred during subclustering: {e}")
                st.session_state.subclustering_done = False

        # Rerun outside the try so the control-flow exception propagates.
        if rerun_needed:
            st.rerun()
939
+
940
+
941
def display_subclustering_results():
    """12. Subclustering Results Display.

    Two-column layout: the left column holds cluster/subcluster selectboxes
    (whose values are persisted under the keys ``subcluster_num_select_key``
    and ``subcluster_index_select_key`` — ``download_subclustering_results``
    reads them later); the right column shows the selected subcluster's
    pseudo reaction and its routes.
    """
    if st.session_state.get("subclustering_done", False):
        sub = st.session_state.get("subclusters")
        tree = st.session_state.get("tree")
        # clusters_for_sub_display = st.session_state.get('clusters') # Not directly used in display logic from original code snippet

        if not sub or not tree:
            st.error(
                "Subclustering results (subclusters or tree) are missing. Please re-run subclustering."
            )
            st.session_state.subclustering_done = False
            return

        sub_input_col, sub_display_col = st.columns([0.25, 0.75])

        with sub_input_col:
            st.subheader("Select Cluster and Subcluster")
            available_cluster_nums = list(sub.keys())
            if not available_cluster_nums:
                st.warning("No clusters available in subclustering results.")
                return  # Exit if no clusters to select

            user_input_cluster_num_display = st.selectbox(
                "Select Cluster #:",
                options=sorted(available_cluster_nums),
                key="subcluster_num_select_key",
            )

            # Default subcluster index used until the second selectbox renders.
            selected_subcluster_idx = 0

            if user_input_cluster_num_display in sub:
                sub_step_cluster = sub[user_input_cluster_num_display]
                allowed_subclusters_indices = sorted(list(sub_step_cluster.keys()))

                if not allowed_subclusters_indices:
                    st.warning(
                        f"No reaction steps (subclusters) found for Cluster {user_input_cluster_num_display}."
                    )
                else:
                    selected_subcluster_idx = st.selectbox(
                        "Select Subcluster Index:",
                        options=allowed_subclusters_indices,
                        key="subcluster_index_select_key",
                    )
                    if selected_subcluster_idx in sub[user_input_cluster_num_display]:
                        current_subcluster_data = sub[user_input_cluster_num_display][
                            selected_subcluster_idx
                        ]
                        # Show the parent cluster's SB-CGR as a visual anchor.
                        if "sb_cgr" in current_subcluster_data:
                            cluster_sb_cgr_display = current_subcluster_data["sb_cgr"]
                            cluster_sb_cgr_display.clean2d()
                            st.image(
                                cluster_sb_cgr_display.depict(),
                                caption=f"SB-CGR of parent Cluster {user_input_cluster_num_display}",
                            )
                        else:
                            st.warning("SB-CGR for this subcluster not found.")
            else:
                st.warning(
                    f"Selected cluster {user_input_cluster_num_display} not found in subclustering results."
                )
                return

        with sub_display_col:
            st.subheader("Subcluster Details")
            if (
                user_input_cluster_num_display in sub
                and selected_subcluster_idx in sub[user_input_cluster_num_display]
            ):

                subcluster_content = sub[user_input_cluster_num_display][
                    selected_subcluster_idx
                ]

                # subcluster_to_display = post_process_subgroup(subcluster_content) #Under development
                subcluster_to_display = subcluster_content
                if (
                    not subcluster_to_display
                    or "routes_data" not in subcluster_to_display
                    or not subcluster_to_display["routes_data"]
                ):
                    st.info("No routes or data found for this subcluster selection.")
                else:
                    # Show at most this many routes inline; the rest go
                    # into an expander below.
                    MAX_ROUTES_PER_SUBCLUSTER = 5
                    all_route_ids_in_subcluster = list(
                        subcluster_to_display["routes_data"].keys()
                    )
                    routes_to_display_direct = all_route_ids_in_subcluster[
                        :MAX_ROUTES_PER_SUBCLUSTER
                    ]
                    remaining_routes_sub = all_route_ids_in_subcluster[
                        MAX_ROUTES_PER_SUBCLUSTER:
                    ]

                    st.markdown(
                        f"--- \n**Subcluster {user_input_cluster_num_display}.{selected_subcluster_idx}** (Size: {len(all_route_ids_in_subcluster)})"
                    )

                    # Depict the generalized ("Markush-like") pseudo reaction
                    # summarizing this subcluster, if one was computed.
                    if "synthon_reaction" in subcluster_to_display:
                        synthon_reaction = subcluster_to_display["synthon_reaction"]
                        try:
                            synthon_reaction.clean2d()
                            st.image(
                                depict_custom_reaction(synthon_reaction),
                                caption=f"Markush-like pseudo reaction of subcluster",
                            )  # Assuming depict_custom_reaction
                        except Exception as e_depict:
                            st.warning(f"Could not depict synthon reaction: {e_depict}")
                    else:
                        st.info("No synthon reaction data for this subcluster.")
                    # Scrollable area so long route lists don't dominate the page.
                    with st.container(height=500):
                        for route_id in routes_to_display_direct:
                            try:
                                route_score_sub = round(tree.route_score(route_id), 3)
                                # svg_sub = get_route_svg(tree, route_id)
                                svg_sub = get_route_svg_from_json(st.session_state.route_json, route_id)
                                if svg_sub:
                                    st.image(
                                        svg_sub,
                                        caption=f"Route {route_id}; Score: {route_score_sub}",
                                    )
                                else:
                                    st.warning(
                                        f"Could not generate SVG for route {route_id}."
                                    )
                            except Exception as e:
                                st.error(
                                    f"Error displaying route {route_id} in subcluster: {e}"
                                )

                    if remaining_routes_sub:
                        with st.expander(
                            f"... and {len(remaining_routes_sub)} more routes in this subcluster"
                        ):
                            for route_id in remaining_routes_sub:
                                try:
                                    route_score_sub = round(
                                        tree.route_score(route_id), 3
                                    )
                                    # svg_sub = get_route_svg(tree, route_id)
                                    svg_sub = get_route_svg_from_json(st.session_state.route_json, route_id)
                                    if svg_sub:
                                        st.image(
                                            svg_sub,
                                            caption=f"Route {route_id}; Score: {route_score_sub}",
                                        )
                                    else:
                                        st.warning(
                                            f"Could not generate SVG for route {route_id}."
                                        )
                                except Exception as e:
                                    st.error(
                                        f"Error displaying route {route_id} in subcluster (expanded): {e}"
                                    )
            else:
                st.info("Select a valid cluster and subcluster index to see details.")
1098
+
1099
+
1100
def download_subclustering_results():
    """13. Subclustering Results Download.

    Renders a download button with the HTML report for the cluster/subcluster
    currently chosen in ``display_subclustering_results``; the selection is
    read back from the selectbox widget keys in ``st.session_state``.
    """
    if (
        st.session_state.get("subclustering_done", False)
        and "subcluster_num_select_key" in st.session_state
        and "subcluster_index_select_key" in st.session_state
    ):

        sub = st.session_state.get("subclusters")
        tree = st.session_state.get("tree")
        sb_cgrs_for_report = st.session_state.get(
            "sb_cgrs_dict"
        )  # Used by routes_subclustering_report

        # Widget keys hold the user's current cluster/subcluster selection.
        user_input_cluster_num_display = st.session_state.subcluster_num_select_key
        selected_subcluster_idx = st.session_state.subcluster_index_select_key

        if not tree or not sub or not sb_cgrs_for_report:
            st.warning(
                "Missing data for subclustering report generation (tree, subclusters, or SB-CGRs)."
            )
            return

        if (
            user_input_cluster_num_display in sub
            and selected_subcluster_idx in sub[user_input_cluster_num_display]
        ):

            subcluster_data_for_report = sub[user_input_cluster_num_display][
                selected_subcluster_idx
            ]
            # Apply the same post-processing as in display
            processed_subcluster_data = post_process_subgroup(
                subcluster_data_for_report
            )
            # Group routes by identical leaving-group assignments so the
            # report can present them together (empty dict when absent).
            if "routes_data" in subcluster_data_for_report and isinstance(
                subcluster_data_for_report["routes_data"], dict
            ):
                processed_subcluster_data["group_lgs"] = group_by_identical_values(
                    subcluster_data_for_report["routes_data"]
                )
            else:
                processed_subcluster_data["group_lgs"] = {}

            try:
                subcluster_html_content = routes_subclustering_report(
                    tree,
                    processed_subcluster_data,  # Pass the specific post-processed subcluster data
                    user_input_cluster_num_display,
                    selected_subcluster_idx,
                    sb_cgrs_for_report,  # Pass the whole sb_cgrs dict
                    if_lg_group=True,  # This parameter was in the original call
                )
                st.download_button(
                    label=f"Download report for subcluster {user_input_cluster_num_display}.{selected_subcluster_idx}",
                    data=subcluster_html_content,
                    file_name=f"subcluster_{user_input_cluster_num_display}.{selected_subcluster_idx}_{st.session_state.target_smiles}.html",
                    mime="text/html",
                    key=f"download_subcluster_{user_input_cluster_num_display}_{selected_subcluster_idx}",
                )
            except Exception as e:
                st.error(
                    f"Error generating download report for subcluster {user_input_cluster_num_display}.{selected_subcluster_idx}: {e}"
                )
        # else:
        # This case is handled by the display logic mostly, download button just won't appear or will be for previous valid selection.
1166
+
1167
+
1168
def implement_restart():
    """14. Restart: reset the application state.

    Clears every result, flag and widget key from ``st.session_state``,
    restores the default molecule in the sketcher, and reruns the app.
    """
    st.divider()
    st.header("Restart Application State")
    if not st.button("Clear All Results & Restart", key="restart_button"):
        return

    # Session-state entries (results, flags, and widget keys) to wipe out.
    stale_keys = (
        "planning_done",
        "tree",
        "res",
        "target_smiles",
        "clustering_done",
        "clusters",
        "reactions_dict",
        "num_clusters_setting",
        "route_cgrs_dict",
        "sb_cgrs_dict",
        "route_json",
        "subclustering_done",
        "subclusters",  # "sub" was renamed
        "clusters_downloaded",
        # Widget keys — clearing them resets the corresponding inputs.
        "ketcher_widget",
        "smiles_text_input_key",
        "subcluster_num_select_key",
        "subcluster_index_select_key",
    )
    for stale_key in stale_keys:
        st.session_state.pop(stale_key, None)

    # Restore the sketcher to the default molecule and drop the stale target.
    st.session_state.ketcher = DEFAULT_MOL
    st.session_state.target_smiles = ""

    st.rerun()
1206
+
1207
+
1208
+ # --- Main Application Flow ---
1209
def main():
    """Main application flow.

    Renders the page top-to-bottom in dependency order: app setup and sidebar,
    molecule input, planning (button + results + downloads), clustering
    (button + results + downloads), subclustering, and finally the restart
    control. Each stage only renders once the flags set by the previous stage
    (``planning_done``, ``clustering_done``, ``subclustering_done``) are true.
    """
    initialize_app()
    setup_sidebar()
    current_smile_code = handle_molecule_input()
    # Update session_state.ketcher if current_smile_code has changed from ketcher output
    if st.session_state.get("ketcher") != current_smile_code:
        st.session_state.ketcher = current_smile_code
        # No rerun here, let the flow continue. handle_molecule_input already warns.

    setup_planning_options()  # This function now also handles the button press and logic for planning

    # Display planning results and download options together
    if st.session_state.get("planning_done", False):
        display_planning_results()  # Displays stats and routes
        if st.session_state.res and st.session_state.res.get("solved", False):
            stat_col, download_col = st.columns(
                2, gap="medium"
            )  # Placeholder for download column
            with stat_col:
                st.subheader("Statistics")
                try:
                    res = st.session_state.res
                    # Backfill the target SMILES so it appears in the table.
                    if (
                        "target_smiles" not in res
                        and "target_smiles" in st.session_state
                    ):
                        res["target_smiles"] = st.session_state.target_smiles
                    cols_to_show = [
                        col
                        for col in [
                            "target_smiles",
                            "num_routes",
                            "num_nodes",
                            "num_iter",
                            "search_time",
                        ]
                        if col in res
                    ]
                    if cols_to_show:  # Ensure there are columns to show
                        df = pd.DataFrame(res, index=[0])[cols_to_show]
                        st.dataframe(df)
                    else:
                        st.write("No statistics to display from planning results.")
                except Exception as e:
                    st.error(f"Error displaying statistics: {e}")
                    st.write(res)  # Show raw dict if DataFrame fails
            with download_col:
                st.subheader("Planning Downloads")  # Adding a subheader for clarity
                download_planning_results()

    # Clustering section (setup button, display, download)
    if (
        st.session_state.get("planning_done", False)
        and st.session_state.res
        and st.session_state.res.get("solved", False)
    ):
        setup_clustering()  # Contains the "Run Clustering" button and logic
        if st.session_state.get("clustering_done", False):
            display_clustering_results()  # Displays cluster routes and stats
            cluster_stat_col, cluster_download_col = st.columns(2, gap="medium")

            with cluster_stat_col:
                clusters = st.session_state.clusters
                cluster_sizes = [
                    cluster.get("group_size", 0)
                    for cluster in clusters.values()
                    if cluster
                ]  # Safe get
                st.subheader("Cluster Statistics")
                if cluster_sizes:
                    cluster_df = pd.DataFrame(
                        {
                            "Cluster": [
                                k for k, v in clusters.items() if v
                            ],  # Filter out empty clusters
                            "Number of Routes": [
                                v["group_size"] for v in clusters.values() if v
                            ],
                        }
                    )
                    if not cluster_df.empty:
                        cluster_df.index += 1
                        st.dataframe(cluster_df)
                        # One-click export of the representative route per cluster.
                        best_route_html = html_top_routes_cluster(
                            clusters,
                            st.session_state.tree,
                            st.session_state.target_smiles,
                        )
                        st.download_button(
                            label=f"Download best route from each cluster",
                            data=best_route_html,
                            file_name=f"cluster_best_{st.session_state.target_smiles}.html",
                            mime="text/html",
                            key=f"download_cluster_best",
                        )
                    else:
                        st.write("No valid cluster data to display statistics for.")
                    # download_top_routes_cluster()
                else:
                    st.write("No cluster data to display statistics for.")
            with cluster_download_col:
                download_clustering_results()

    # Subclustering section (setup button, display, download)
    if st.session_state.get("clustering_done", False):  # Depends on clustering
        setup_subclustering()  # Contains "Run Subclustering" button
        if st.session_state.get("subclustering_done", False):
            display_subclustering_results()  # Displays subcluster details and routes
            download_subclustering_results()  # This needs to be called after selections are made in display.

    implement_restart()
1320
+
1321
+
1322
# Script entry point: launch the Streamlit GUI when executed directly.
if __name__ == "__main__":
    main()
synplan/mcts/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
# Subpackage init: re-exports the tree-search public API (Tree, Node).
from CGRtools.containers import MoleculeContainer
from .node import *
from .tree import *


# Globally disable atom-to-atom mapping numbers in molecule depictions
# rendered by CGRtools (affects all downstream HTML/GUI output).
MoleculeContainer.depict_settings(aam=False)

__all__ = ["Tree", "Node"]
synplan/mcts/evaluation.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing a class that represents a value function for prediction of
2
+ synthesisablity of new nodes in the tree search."""
3
+
4
+ from typing import List
5
+
6
+ import torch
7
+
8
+ from synplan.chem.precursor import Precursor, compose_precursors
9
+ from synplan.ml.networks.value import ValueNetwork
10
+ from synplan.ml.training import mol_to_pyg
11
+
12
+
13
class ValueNetworkFunction:
    """Value function implemented as a value neural network for node evaluation
    (synthesisability prediction) in tree search."""

    def __init__(self, weights_path: str) -> None:
        """The value function predicts the probability to synthesize the target molecule
        with available building blocks starting from a given precursor.

        :param weights_path: The value network weights file path.
        """
        # Load on CPU and switch to eval mode (disables dropout/batchnorm updates).
        value_net = ValueNetwork.load_from_checkpoint(
            weights_path, map_location=torch.device("cpu")
        )
        self.value_network = value_net.eval()

    def predict_value(self, precursors: List[Precursor]) -> float:
        """Predicts a value based on the given precursors from the node. For prediction,
        precursors must be composed into a single molecule (product).

        :param precursors: The list of precursors.
        :return: The predicted float value ("synthesisability") of the node, or the
            sentinel -1e6 when the composed molecule cannot be converted to a graph
            (effectively pruning the node).
        """
        molecule = compose_precursors(precursors=precursors, exclude_small=True)
        pyg_graph = mol_to_pyg(molecule)
        if pyg_graph:
            with torch.no_grad():  # inference only, no gradient bookkeeping
                value_pred = self.value_network.forward(pyg_graph)[0].item()
        else:
            # Graph conversion failed: strongly negative value prunes this node.
            value_pred = -1e6

        return value_pred
synplan/mcts/expansion.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing a class that represents a policy function for node expansion in the
2
+ tree search."""
3
+
4
+ from typing import Iterator, List, Tuple, Union
5
+
6
+ import torch
7
+ import torch_geometric
8
+ from CGRtools.reactor.reactor import Reactor
9
+
10
+ from synplan.chem.precursor import Precursor
11
+ from synplan.ml.networks.policy import PolicyNetwork
12
+ from synplan.ml.training import mol_to_pyg
13
+ from synplan.utils.config import PolicyNetworkConfig
14
+
15
+
16
class PolicyNetworkFunction:
    """Policy function implemented as a policy neural network for node expansion in tree
    search."""

    def __init__(
        self, policy_config: PolicyNetworkConfig, compile: bool = False
    ) -> None:
        """Initializes the expansion function (ranking or filter policy network).

        :param policy_config: An expansion policy configuration.
        :param compile: If True, compiles the network with torch_geometric to speed
            up inference. NOTE: the parameter name shadows the ``compile`` builtin;
            kept as-is for backward compatibility with existing callers.
        """
        self.config = policy_config

        # Load on CPU with inference-friendly settings (batch of 1, no dropout).
        policy_net = PolicyNetwork.load_from_checkpoint(
            self.config.weights_path,
            map_location=torch.device("cpu"),
            batch_size=1,
            dropout=0,
        )

        policy_net = policy_net.eval()
        if compile:
            self.policy_net = torch_geometric.compile(policy_net, dynamic=True)
        else:
            self.policy_net = policy_net

    def predict_reaction_rules(
        self, precursor: Precursor, reaction_rules: List[Reactor]
    ) -> Iterator[Union[Iterator, Iterator[Tuple[float, Reactor, int]]]]:
        """The policy function predicts the list of reaction rules for a given precursor.

        :param precursor: The current precursor for which the reaction rules are predicted.
        :param reaction_rules: The list of reaction rules from which applicable reaction
            rules are predicted and selected.
        :raises ValueError: If the network's output dimensionality does not match the
            number of reaction rules, or the loaded network has an unknown policy type.
        :return: Yielding the predicted probability for the reaction rule, reaction rule
            and reaction rule id.
        """
        # Sanity check: the last layer must have exactly one output per rule.
        out_dim = list(self.policy_net.modules())[-1].out_features
        if out_dim != len(reaction_rules):
            raise ValueError(
                f"The policy network output dimensionality is {out_dim}, but the number of reaction rules is {len(reaction_rules)}. "
                "Probably you use a different version of the policy network. Be sure to retrain the policy network "
                "with the current set of reaction rules"
            )

        pyg_graph = mol_to_pyg(precursor.molecule, canonicalize=False)
        if pyg_graph:
            with torch.no_grad():
                if self.policy_net.policy_type == "filtering":
                    probs, priority = self.policy_net.forward(pyg_graph)
                elif self.policy_net.policy_type == "ranking":
                    probs = self.policy_net.forward(pyg_graph)
                else:
                    # Previously an unknown type crashed later with an unbound
                    # local; fail early with a clear message instead.
                    raise ValueError(
                        f"Unknown policy type: {self.policy_net.policy_type}"
                    )
            del pyg_graph
        else:
            # Graph conversion failed: generator yields nothing.
            return []

        probs = probs[0].double()
        if self.policy_net.policy_type == "filtering":
            # Blend rule probabilities with the predicted priority scores.
            priority = priority[0].double()
            priority_coef = self.config.priority_rules_fraction
            probs = (1 - priority_coef) * probs + priority_coef * priority

        # Keep only the top-k rules by (blended) score.
        sorted_probs, sorted_rules = torch.sort(probs, descending=True)
        sorted_probs, sorted_rules = (
            sorted_probs[: self.config.top_rules],
            sorted_rules[: self.config.top_rules],
        )

        if self.policy_net.policy_type == "filtering":
            # Renormalize the truncated scores into a probability distribution.
            sorted_probs = torch.softmax(sorted_probs, -1)

        sorted_probs, sorted_rules = sorted_probs.tolist(), sorted_rules.tolist()

        for prob, rule_id in zip(sorted_probs, sorted_rules):
            # search may fail if rule_prob_threshold is too low (recommended value is 0.0)
            if prob > self.config.rule_prob_threshold:
                yield prob, reaction_rules[rule_id], rule_id
synplan/mcts/node.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing a class Node in the tree search."""
2
+
3
+
4
class Node:
    """Node class represents a node in the tree search."""

    def __init__(
        self, precursors_to_expand: tuple = None, new_precursors: tuple = None
    ) -> None:
        """The function initializes the new Node object.

        :param precursors_to_expand: The tuple of precursors to be expanded. The first
            precursor in the tuple is the current precursor which will be expanded (for
            which new precursors will be generated by applying the predicted reaction
            rules). When the first precursor has been successfully expanded, the second
            precursor becomes the current precursor to be expanded. ``None`` is treated
            as an empty tuple.
        :param new_precursors: The tuple of new precursors generated by applying the
            reaction rule. ``None`` is treated as an empty tuple.
        """
        # Normalize None to an empty tuple: the original crashed on len(None).
        self.precursors_to_expand = (
            precursors_to_expand if precursors_to_expand is not None else tuple()
        )
        self.new_precursors = (
            new_precursors if new_precursors is not None else tuple()
        )

        if len(self.precursors_to_expand) == 0:
            self.curr_precursor = tuple()
            # Always define next_precursor so that attribute access never raises
            # AttributeError on solved (empty) nodes.
            self.next_precursor = tuple()
        else:
            self.curr_precursor = self.precursors_to_expand[0]
            self.next_precursor = self.precursors_to_expand[1:]

    def __len__(self) -> int:
        """Returns the number of precursors in the node left to expand."""
        return len(self.precursors_to_expand)

    def __repr__(self) -> str:
        """Returns a human-readable representation of the new precursors and of the
        precursors left to expand."""
        return (
            f"New precursors: {self.new_precursors}\n"
            f"Precursors to expand: {self.precursors_to_expand}\n"
        )

    def is_solved(self) -> bool:
        """If True, it is a terminal node.

        There are no precursors for expansion.
        """
        return len(self.precursors_to_expand) == 0
synplan/mcts/search.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing functions for running tree search for the set of target
2
+ molecules."""
3
+
4
+ import csv
5
+ import json
6
+ import logging
7
+ import os.path
8
+ from pathlib import Path
9
+ from typing import Union
10
+
11
+ from CGRtools.containers import MoleculeContainer
12
+ from tqdm import tqdm
13
+
14
+ from synplan.chem.reaction_routes.route_cgr import extract_reactions
15
+ from synplan.chem.reaction_routes.io import write_routes_csv, write_routes_json
16
+ from synplan.chem.utils import mol_from_smiles
17
+ from synplan.mcts.evaluation import ValueNetworkFunction
18
+ from synplan.mcts.expansion import PolicyNetworkFunction
19
+ from synplan.mcts.tree import Tree, TreeConfig
20
+ from synplan.utils.config import PolicyNetworkConfig
21
+ from synplan.utils.loading import load_building_blocks, load_reaction_rules
22
+ from synplan.utils.visualisation import extract_routes, generate_results_html
23
+
24
+
25
def extract_tree_stats(
    tree: Tree, target: Union[str, MoleculeContainer], init_smiles: str = None
):
    """Collects summary statistics of a built search tree.

    :param tree: The built search tree.
    :param target: The target molecule associated with the tree.
    :param init_smiles: initial SMILES of the molecule, optional.
    :return: A dictionary with the calculated statistics.
    """
    # Serialize the tree topology and per-node metadata in Newick form.
    newick_tree, newick_meta = tree.newickify(visits_threshold=0)
    meta_records = (
        f"{node_id},{meta[0]},{meta[1]},{meta[2]}"
        for node_id, meta in newick_meta.items()
    )
    newick_meta_line = ";".join(meta_records)

    smiles = str(target) if init_smiles is None else init_smiles
    route_count = len(tree.winning_nodes)

    return {
        "target_smiles": smiles,
        "num_routes": route_count,
        "num_nodes": len(tree),
        "num_iter": tree.curr_iteration,
        "tree_depth": max(tree.nodes_depth.values()),
        "search_time": round(tree.curr_time, 1),
        "newick_tree": newick_tree,
        "newick_meta": newick_meta_line,
        "solved": route_count > 0,
    }
52
+
53
+
54
def run_search(
    targets_path: str,
    search_config: dict,
    policy_config: PolicyNetworkConfig,
    reaction_rules_path: str,
    building_blocks_path: str,
    value_network_path: str = None,
    results_root: str = "search_results",
) -> None:
    """Performs a tree search on a set of target molecules using specified configuration
    and reaction rules, logging the results and statistics.

    :param targets_path: The path to the file containing the target molecules (in SDF or
        SMILES format).
    :param search_config: The config object containing the configuration for the tree
        search.
    :param policy_config: The config object containing the configuration for the policy.
    :param reaction_rules_path: The path to the file containing reaction rules.
    :param building_blocks_path: The path to the file containing building blocks.
    :param value_network_path: The path to the file containing value weights (optional).
    :param results_root: The name of the folder where the results of the tree search
        will be saved.
    :return: None.
    """

    # results folder
    results_root = Path(results_root)
    if not results_root.exists():
        results_root.mkdir()

    # output files
    stats_file = results_root.joinpath("tree_search_stats.csv")
    routes_file = results_root.joinpath("extracted_routes.json")
    routes_folder = results_root.joinpath("extracted_routes_html")
    routes_folder.mkdir(exist_ok=True)

    # stats header; extract_tree_stats never fills "error", so DictWriter
    # writes its restval (empty string) in that column
    stats_header = [
        "target_smiles",
        "num_routes",
        "num_nodes",
        "num_iter",
        "tree_depth",
        "search_time",
        "newick_tree",
        "newick_meta",
        "solved",
        "error",
    ]

    # config; the value network is only used for the "gcn" evaluation type
    policy_function = PolicyNetworkFunction(policy_config=policy_config)
    if search_config["evaluation_type"] == "gcn" and value_network_path:
        value_function = ValueNetworkFunction(weights_path=value_network_path)
    else:
        value_function = None

    reaction_rules = load_reaction_rules(reaction_rules_path)
    building_blocks = load_building_blocks(building_blocks_path, standardize=True)

    # run search
    n_solved = 0
    extracted_routes = []

    tree_config = TreeConfig.from_dict(search_config)
    tree_config.silent = True  # suppress per-tree progress bars in batch mode
    with (
        open(targets_path, "r", encoding="utf-8") as targets,
        open(stats_file, "w", encoding="utf-8", newline="\n") as csvfile,
    ):

        statswriter = csv.DictWriter(csvfile, delimiter=",", fieldnames=stats_header)
        statswriter.writeheader()

        for ti, target_smi in tqdm(
            enumerate(targets),
            leave=True,
            desc="Number of target molecules processed: ",
            bar_format="{desc}{n} [{elapsed}]",
        ):
            target_smi = target_smi.strip()
            target_mol = mol_from_smiles(target_smi)
            try:
                # run search
                tree = Tree(
                    target=target_mol,
                    config=tree_config,
                    reaction_rules=reaction_rules,
                    building_blocks=building_blocks,
                    expansion_function=policy_function,
                    evaluation_function=value_function,
                )

                # exhausting the iterator performs the full MCTS search
                _ = list(tree)

            except Exception as e:
                # record a stub route so output indices stay aligned with targets
                extracted_routes.append(
                    [
                        {
                            "type": "mol",
                            "smiles": target_smi,
                            "in_stock": False,
                            "children": [],
                        }
                    ]
                )
                logging.warning(
                    f"Retrosynthetic_planning {target_smi} failed with the following error: {e}"
                )

                continue

            # is solved
            # NOTE(review): targets that finish without error but are unsolved
            # append nothing to extracted_routes, so JSON entries may not stay
            # aligned with target indices — verify intended behaviour.
            n_solved += bool(tree.winning_nodes)
            if bool(tree.winning_nodes):

                # extract routes
                extracted_routes.append(extract_routes(tree))

                # save routes
                generate_results_html(
                    tree,
                    os.path.join(routes_folder, f"retroroutes_target_{ti}.html"),
                    extended=True,
                )

            # save stats
            statswriter.writerow(extract_tree_stats(tree, target_smi))
            csvfile.flush()

            # save json routes (rewritten every iteration for incremental persistence)
            with open(routes_file, "w", encoding="utf-8") as f:
                json.dump(extracted_routes, f)

            # Save mapped reactions (CSV)
            routes_dict = extract_reactions(tree)
            write_routes_csv(
                routes_dict, os.path.join(routes_folder, f"mapped_routes_{ti}.csv")
            )

            # save mapped reactions (JSON)
            write_routes_json(
                routes_dict, os.path.join(routes_folder, f"mapped_routes_{ti}.json")
            )

    print(f"Number of solved target molecules: {n_solved}")
synplan/mcts/tree.py ADDED
@@ -0,0 +1,635 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing a class Tree that used for tree search of retrosynthetic routes."""
2
+
3
+ import logging
4
+ import warnings
5
+ from collections import defaultdict, deque
6
+ from math import sqrt
7
+ from random import choice, uniform
8
+ from time import time
9
+ from typing import Dict, List, Set, Tuple
10
+
11
+ from CGRtools.reactor import Reactor
12
+ from CGRtools.containers import MoleculeContainer
13
+ from tqdm.auto import tqdm
14
+
15
+ from synplan.chem.precursor import Precursor
16
+ from synplan.chem.reaction import Reaction, apply_reaction_rule
17
+ from synplan.mcts.evaluation import ValueNetworkFunction
18
+ from synplan.mcts.expansion import PolicyNetworkFunction
19
+ from synplan.mcts.node import Node
20
+ from synplan.utils.config import TreeConfig
21
+
22
+
23
+ class Tree:
24
+ """Tree class with attributes and methods for Monte-Carlo tree search."""
25
+
26
    def __init__(
        self,
        target: MoleculeContainer,
        config: TreeConfig,
        reaction_rules: List[Reactor],
        building_blocks: Set[str],
        expansion_function: PolicyNetworkFunction,
        evaluation_function: ValueNetworkFunction = None,
    ):
        """Initializes a tree object with optional parameters for tree search for target
        molecule.

        :param target: A target molecule for retrosynthetic routes search.
        :param config: A tree configuration.
        :param reaction_rules: A loaded reaction rules.
        :param building_blocks: A loaded building blocks.
        :param expansion_function: A loaded policy function.
        :param evaluation_function: A loaded value function. If None, the rollout is
            used as a default for node evaluation.
        """

        # config parameters
        self.config = config

        assert isinstance(
            target, MoleculeContainer
        ), "Target should be given as MoleculeContainer"
        # NOTE(review): this rejects targets with 3 or fewer atoms although the
        # message says "less than 3" — confirm the intended threshold.
        assert len(target) > 3, "Target molecule has less than 3 atoms"

        target_molecule = Precursor(target)
        target_molecule.prev_precursors.append(Precursor(target))
        target_node = Node(
            precursors_to_expand=(target_molecule,), new_precursors=(target_molecule,)
        )

        # tree structure init (node id 1 is the root; parent id 0 means "no parent")
        self.nodes: Dict[int, Node] = {1: target_node}
        self.parents: Dict[int, int] = {1: 0}
        self.children: Dict[int, Set[int]] = {1: set()}
        self.winning_nodes: List[int] = []  # ids of nodes completing a route
        self.visited_nodes: Set[int] = set()
        self.expanded_nodes: Set[int] = set()
        self.nodes_visit: Dict[int, int] = {1: 0}
        self.nodes_depth: Dict[int, int] = {1: 0}
        self.nodes_prob: Dict[int, float] = {1: 0.0}
        self.nodes_rules: Dict[int, float] = {}
        self.nodes_init_value: Dict[int, float] = {1: 0.0}
        self.nodes_total_value: Dict[int, float] = {1: 0.0}

        # tree building limits
        self.curr_iteration: int = 0
        self.curr_tree_size: int = 2
        self.start_time: float = 0
        self.curr_time: float = 0

        # building blocks and reaction reaction_rules
        self.reaction_rules = reaction_rules
        self.building_blocks = building_blocks

        # policy and value functions; the combination of evaluation_type and
        # evaluation_function is validated for consistency
        self.policy_network = expansion_function
        if self.config.evaluation_type == "gcn":
            if evaluation_function is None:
                raise ValueError(
                    "Value function not specified while evaluation type is 'gcn'"
                )
        if (
            evaluation_function is not None
            and self.config.evaluation_type == "rollout"
        ):
            raise ValueError(
                "Value function is not None while evaluation type is 'rollout'. What should be evaluation type ?"
            )
        self.value_network = evaluation_function

        # utils
        self._tqdm = True  # needed to disable tqdm with multiprocessing module

        # NOTE(review): this mutates the caller-provided building_blocks set.
        target_smiles = str(self.nodes[1].curr_precursor.molecule)
        if target_smiles in self.building_blocks:
            self.building_blocks.remove(target_smiles)
            print(
                "Target was found in building blocks and removed from building blocks."
            )
110
+
111
+ def __len__(self) -> int:
112
+ """Returns the current size (the number of nodes) in the tree."""
113
+
114
+ return self.curr_tree_size - 1
115
+
116
    def __iter__(self) -> "Tree":
        """The function is defining an iterator for a Tree object.

        Also needed for the bar progress display.
        """

        # Reset the wall clock used by __next__ for the time-limit check.
        self.start_time = time()
        if self._tqdm:
            # NOTE(review): _tqdm starts as the boolean True and is replaced here
            # by an actual tqdm progress-bar instance on first iteration.
            self._tqdm = tqdm(
                total=self.config.max_iterations, disable=self.config.silent
            )
        return self
128
+
129
+ def __repr__(self) -> str:
130
+ """Returns a string representation of the tree (target SMILES, tree size, and
131
+ the number of found routes)."""
132
+ return self.report()
133
+
134
    def __next__(self) -> [bool, List[int]]:
        """The __next__ method is used to do one iteration of the tree building.

        :return: Returns True if the route was found and the node id of the last node in
            the route. Otherwise, returns False and the id of the last visited node.
        """

        # termination criteria
        if self.curr_iteration >= self.config.max_iterations:
            raise StopIteration("Iterations limit exceeded.")
        if self.curr_tree_size >= self.config.max_tree_size:
            raise StopIteration("Max tree size exceeded or all possible routes found.")
        if self.curr_time >= self.config.max_time:
            raise StopIteration("Time limit exceeded.")

        # start new iteration
        self.curr_iteration += 1
        self.curr_time = time() - self.start_time

        if self._tqdm:
            self._tqdm.update()

        curr_depth, node_id = 0, 1  # start from the root node_id

        # selection/expansion loop: descend until an unvisited node is reached
        explore_route = True
        while explore_route:
            self.visited_nodes.add(node_id)

            if self.nodes_visit[node_id]:  # already visited
                if not self.children[node_id]:  # dead node
                    self._update_visits(node_id)
                    explore_route = False
                else:
                    node_id = self._select_node(node_id)  # select the child node
                    curr_depth += 1
            else:
                if self.nodes[node_id].is_solved():  # found route
                    self._update_visits(
                        node_id
                    )  # this prevents expanding of bb node_id
                    self.winning_nodes.append(node_id)
                    return True, [node_id]

                if (
                    curr_depth < self.config.max_depth
                ):  # expand node if depth limit is not reached
                    self._expand_node(node_id)
                    if not self.children[node_id]:  # node was not expanded
                        value_to_backprop = -1.0
                    else:
                        self.expanded_nodes.add(node_id)

                        if self.config.search_strategy == "evaluation_first":
                            # recalculate node value based on children synthesisability and backpropagation
                            child_values = [
                                self.nodes_init_value[child_id]
                                for child_id in self.children[node_id]
                            ]

                            if self.config.evaluation_agg == "max":
                                value_to_backprop = max(child_values)

                            elif self.config.evaluation_agg == "average":
                                value_to_backprop = sum(child_values) / len(
                                    self.children[node_id]
                                )

                        elif self.config.search_strategy == "expansion_first":
                            value_to_backprop = self._get_node_value(node_id)

                    # backpropagation
                    self._backpropagate(node_id, value_to_backprop)
                    self._update_visits(node_id)
                    explore_route = False

                    if self.children[node_id]:
                        # found after expansion: collect all children that
                        # already complete a route
                        found_after_expansion = set()
                        for child_id in iter(self.children[node_id]):
                            if self.nodes[child_id].is_solved():
                                found_after_expansion.add(child_id)
                                self.winning_nodes.append(child_id)

                        if found_after_expansion:
                            return True, list(found_after_expansion)

                else:
                    # depth limit reached: backpropagate the node's current value
                    self._backpropagate(node_id, self.nodes_total_value[node_id])
                    self._update_visits(node_id)
                    explore_route = False

        return False, [node_id]
225
+
226
+ def _ucb(self, node_id: int) -> float:
227
+ """Calculates the Upper Confidence Bound (UCB) statistics for a given node.
228
+
229
+ :param node_id: The id of the node.
230
+ :return: The calculated UCB.
231
+ """
232
+
233
+ prob = self.nodes_prob[node_id] # predicted by policy network score
234
+ visit = self.nodes_visit[node_id]
235
+
236
+ if self.config.ucb_type == "puct":
237
+ u = (
238
+ self.config.c_ucb * prob * sqrt(self.nodes_visit[self.parents[node_id]])
239
+ ) / (visit + 1)
240
+ ucb_value = self.nodes_total_value[node_id] + u
241
+
242
+ if self.config.ucb_type == "uct":
243
+ u = (
244
+ self.config.c_ucb
245
+ * sqrt(self.nodes_visit[self.parents[node_id]])
246
+ / (visit + 1)
247
+ )
248
+ ucb_value = self.nodes_total_value[node_id] + u
249
+
250
+ if self.config.ucb_type == "value":
251
+ ucb_value = self.nodes_init_value[node_id] / (visit + 1)
252
+
253
+ return ucb_value
254
+
255
+ def _select_node(self, node_id: int) -> int:
256
+ """Selects a node based on its UCB value and returns the id of the node with the
257
+ highest UCB.
258
+
259
+ :param node_id: The id of the node.
260
+ :return: The id of the node with the highest UCB.
261
+ """
262
+
263
+ if self.config.epsilon > 0:
264
+ n = uniform(0, 1)
265
+ if n < self.config.epsilon:
266
+ return choice(list(self.children[node_id]))
267
+
268
+ best_score, best_children = None, []
269
+ for child_id in self.children[node_id]:
270
+ score = self._ucb(child_id)
271
+ if best_score is None or score > best_score:
272
+ best_score, best_children = score, [child_id]
273
+ elif score == best_score:
274
+ best_children.append(child_id)
275
+
276
+ # is needed for tree search reproducibility, when all child nodes has the same score
277
+ return best_children[0]
278
+
279
    def _expand_node(self, node_id: int) -> None:
        """Expands the node by generating new precursor with policy (expansion) function.

        :param node_id: The id the node to be expanded.
        :return: None.
        """
        curr_node = self.nodes[node_id]
        prev_precursor = curr_node.curr_precursor.prev_precursors

        tmp_precursor = set()  # products already seen during this expansion
        expanded = False
        for prob, rule, rule_id in self.policy_network.predict_reaction_rules(
            curr_node.curr_precursor, self.reaction_rules
        ):
            for products in apply_reaction_rule(
                curr_node.curr_precursor.molecule, rule
            ):
                # check repeated products
                if not products or not set(products) - tmp_precursor:
                    continue
                tmp_precursor.update(products)

                # tag each product with the rule that generated it
                for molecule in products:
                    molecule.meta["reactor_id"] = rule_id

                new_precursor = tuple(Precursor(mol) for mol in products)
                # scale the policy probability by the number of non-trivial products
                scaled_prob = prob * len(
                    list(filter(lambda x: len(x) > self.config.min_mol_size, products))
                )

                # skip transformations that regenerate an ancestor precursor (loops)
                if set(prev_precursor).isdisjoint(new_precursor):
                    # only precursors that are not yet building blocks remain to expand
                    precursors_to_expand = (
                        *curr_node.next_precursor,
                        *(
                            x
                            for x in new_precursor
                            if not x.is_building_block(
                                self.building_blocks, self.config.min_mol_size
                            )
                        ),
                    )

                    child_node = Node(
                        precursors_to_expand=precursors_to_expand,
                        new_precursors=new_precursor,
                    )

                    # NOTE(review): the loop variable shadows the tuple of the same
                    # name; after this loop ``new_precursor`` refers to the last
                    # element. Harmless here, but fragile.
                    for new_precursor in new_precursor:
                        new_precursor.prev_precursors = [new_precursor, *prev_precursor]

                    self._add_node(node_id, child_node, scaled_prob, rule_id)

                    expanded = True
        if not expanded and node_id == 1:
            # the root itself could not be expanded: no search is possible
            raise StopIteration("\nThe target molecule was not expanded.")
334
+
335
    def _add_node(
        self,
        node_id: int,
        new_node: Node,
        policy_prob: float = None,
        rule_id: int = None,
    ) -> None:
        """Adds a new node to the tree with probability of reaction rules predicted by
        policy function and applied to the parent node of the new node.

        :param node_id: The id of the parent node.
        :param new_node: The new node to be added.
        :param policy_prob: The probability of reaction rules predicted by policy
            function for the parent node.
        :param rule_id: The id of the reaction rule that produced the new node.
        :return: None.
        """

        new_node_id = self.curr_tree_size

        # register the node and wire it into the parent/child maps
        self.nodes[new_node_id] = new_node
        self.parents[new_node_id] = node_id
        self.children[node_id].add(new_node_id)
        self.children[new_node_id] = set()
        self.nodes_visit[new_node_id] = 0
        self.nodes_prob[new_node_id] = policy_prob
        self.nodes_rules[new_node_id] = rule_id
        self.nodes_depth[new_node_id] = self.nodes_depth[node_id] + 1
        self.curr_tree_size += 1

        # NOTE(review): if search_strategy is neither "evaluation_first" nor
        # "expansion_first", node_value is unbound and a NameError is raised.
        if self.config.search_strategy == "evaluation_first":
            node_value = self._get_node_value(new_node_id)
        elif self.config.search_strategy == "expansion_first":
            node_value = self.config.init_node_value

        self.nodes_init_value[new_node_id] = node_value
        self.nodes_total_value[new_node_id] = node_value
371
+
372
+ def _get_node_value(self, node_id: int) -> float:
373
+ """Calculates the value for the given node (for example with rollout or value
374
+ network).
375
+
376
+ :param node_id: The id of the node to be evaluated.
377
+ :return: The estimated value of the node.
378
+ """
379
+
380
+ node = self.nodes[node_id]
381
+
382
+ if self.config.evaluation_type == "random":
383
+ node_value = uniform(0, 1)
384
+
385
+ elif self.config.evaluation_type == "rollout":
386
+ node_value = min(
387
+ (
388
+ self._rollout_node(
389
+ precursor, current_depth=self.nodes_depth[node_id]
390
+ )
391
+ for precursor in node.precursors_to_expand
392
+ ),
393
+ default=1.0,
394
+ )
395
+
396
+ elif self.config.evaluation_type == "gcn":
397
+ node_value = self.value_network.predict_value(node.new_precursors)
398
+
399
+ return node_value
400
+
401
+ def _update_visits(self, node_id: int) -> None:
402
+ """Updates the number of visits from the current node to the root node.
403
+
404
+ :param node_id: The id of the current node.
405
+ :return: None.
406
+ """
407
+
408
+ while node_id:
409
+ self.nodes_visit[node_id] += 1
410
+ node_id = self.parents[node_id]
411
+
412
+ def _backpropagate(self, node_id: int, value: float) -> None:
413
+ """Backpropagates the value through the tree from the current.
414
+
415
+ :param node_id: The id of the node from which to backpropagate the value.
416
+ :param value: The value to backpropagate.
417
+ :return: None.
418
+ """
419
+ while node_id:
420
+ if self.config.backprop_type == "muzero":
421
+ self.nodes_total_value[node_id] = (
422
+ self.nodes_total_value[node_id] * self.nodes_visit[node_id] + value
423
+ ) / (self.nodes_visit[node_id] + 1)
424
+ elif self.config.backprop_type == "cumulative":
425
+ self.nodes_total_value[node_id] += value
426
+ node_id = self.parents[node_id]
427
+
428
    def _rollout_node(self, precursor: Precursor, current_depth: int = None) -> float:
        """Performs a rollout simulation from a given node in the tree. Given the
        current precursor, find the first successful reaction and return the new precursor.

        If the precursor is a building_block, return 1.0, else check the
        first successful reaction.

        If the reaction is not successful, return -1.0.

        If the reaction is successful, but the generated precursor are not
        the building_blocks and the precursor cannot be generated without
        exceeding current_depth threshold, return -0.5.

        If the reaction is successful, but the precursor are not the
        building_blocks and the precursor cannot be generated, return
        -1.0.

        :param precursor: The precursor to be evaluated.
        :param current_depth: The current depth of the tree.
        :return: The reward (value) assigned to the precursor.
        """

        # remaining depth budget for this simulation
        max_depth = self.config.max_depth - current_depth

        # precursor checking
        if precursor.is_building_block(self.building_blocks, self.config.min_mol_size):
            return 1.0

        if max_depth == 0:
            print("max depth reached in the beginning")

        # precursor simulating
        occurred_precursor = set()
        precursor_to_expand = deque([precursor])
        history = defaultdict(dict)
        rollout_depth = 0
        while precursor_to_expand:
            # Iterate through reactors and pick first successful reaction.
            # Check products of the reaction if you can find them in in-building_blocks data
            # If not, then add missed products to precursor_to_expand and try to decompose them
            if len(history) >= max_depth:
                reward = -0.5
                return reward

            current_precursor = precursor_to_expand.popleft()
            history[rollout_depth]["target"] = current_precursor
            occurred_precursor.add(current_precursor)

            # Pick the first successful reaction while iterating through reactors
            reaction_rule_applied = False
            for prob, rule, rule_id in self.policy_network.predict_reaction_rules(
                current_precursor, self.reaction_rules
            ):
                for products in apply_reaction_rule(current_precursor.molecule, rule):
                    if products:
                        reaction_rule_applied = True
                        break

                if reaction_rule_applied:
                    history[rollout_depth]["rule_index"] = rule_id
                    break

            if not reaction_rule_applied:
                reward = -1.0
                return reward

            # NOTE(review): ``products`` is the tuple left over from the loop
            # above; reaching this line implies a rule was applied successfully.
            products = tuple(Precursor(product) for product in products)
            history[rollout_depth]["products"] = products

            # check loops
            if any(x in occurred_precursor for x in products) and products:
                # sometimes manual can create a loop, when
                # print('occurred_precursor')
                reward = -1.0
                return reward

            if occurred_precursor.isdisjoint(products):
                # added number of atoms check: only non-building-block products
                # need further decomposition
                precursor_to_expand.extend(
                    [
                        x
                        for x in products
                        if not x.is_building_block(
                            self.building_blocks, self.config.min_mol_size
                        )
                    ]
                )
            rollout_depth += 1

        # queue exhausted: everything decomposed into building blocks
        reward = 1.0
        return reward
519
+
520
def report(self) -> str:
    """Returns the string representation of the tree."""

    target = str(self.nodes[1].precursors_to_expand[0])
    lines = [
        f"Tree for: {target}",
        f"Time: {round(self.curr_time, 1)} seconds",
        f"Number of nodes: {len(self)}",
        f"Number of iterations: {self.curr_iteration}",
        f"Number of visited nodes: {len(self.visited_nodes)}",
        f"Number of found routes: {len(self.winning_nodes)}",
    ]
    return "\n".join(lines)
+
532
def route_score(self, node_id: int) -> float:
    """Calculates the score of the route from the given node to the root node.
    The score is the sum of node values along the route divided by the squared
    route length, so shorter routes are favored.

    :param node_id: The id of the current given node.
    :return: The route score.
    """

    total_value = 0.0
    steps = 0
    current = node_id
    while current:
        steps += 1
        total_value += self.nodes_total_value[current]
        current = self.parents[current]

    return total_value / (steps**2)
+
549
def route_to_node(self, node_id: int) -> List[Node,]:
    """Returns the route (list of nodes) from the root node down to the given
    node.

    :param node_id: The id of the current node.
    :return: The list of nodes, ordered root-first.
    """

    id_chain = []
    current = node_id
    while current:
        id_chain.append(current)
        current = self.parents[current]
    id_chain.reverse()
    return [self.nodes[i] for i in id_chain]
+
563
def synthesis_route(self, node_id: int) -> Tuple[Reaction,]:
    """Given a node_id, return a tuple of reactions that represent the
    retrosynthetic route ending at the current node.

    :param node_id: The id of the current node.
    :return: The tuple of extracted reactions representing the synthesis route.
    """

    route_nodes = self.route_to_node(node_id)

    # Each parent/child pair along the route corresponds to one reaction:
    # the child's new precursors react to give the parent's current precursor.
    reactions = []
    for parent_node, child_node in zip(route_nodes, route_nodes[1:]):
        reactants = [p.molecule for p in child_node.new_precursors]
        products = [parent_node.curr_precursor.molecule]
        reactions.append(Reaction(reactants, products))

    for reaction in reactions:
        reaction.clean2d()
    return tuple(reversed(reactions))
+
585
def newickify(self, visits_threshold: int = 0, root_node_id: int = 1):
    """
    Adopted from https://stackoverflow.com/questions/50003007/how-to-convert-python-dictionary-to-newick-form-format.

    :param visits_threshold: The minimum number of visits for the given node.
    :param root_node_id: The id of the root node.

    :return: The newick string and meta dict.
    """
    rendered_nodes = set()

    def render(node_id: int) -> str:
        """Recursively renders the subtree rooted at node_id in Newick format.

        :param node_id: The id of the node to render.
        :return: A string representation of a node in a Newick format.
        """
        assert node_id not in rendered_nodes, "Error: The tree may not be circular!"
        visits = self.nodes_visit[node_id]

        rendered_nodes.add(node_id)
        if self.children[node_id]:
            # keep only sufficiently-visited children
            kept_children = [
                child
                for child in list(self.children[node_id])
                if self.nodes_visit[child] >= visits_threshold
            ]
            joined = ",".join(render(child) for child in kept_children)
            if joined:
                return f"({joined}){node_id}:{visits}"
            # all children filtered out: render as a leaf

        return f"{node_id}:{visits}"

    newick_string = render(root_node_id) + ";"

    # per-node metadata: (rounded value, rounded init value, visit count)
    meta = {
        node_id: (
            round(self.nodes_total_value[node_id], 3),
            round(self.nodes_init_value[node_id]),
            self.nodes_visit[node_id],
        )
        for node_id in rendered_nodes
    }

    return newick_string, meta
synplan/ml/__init__.py ADDED
File without changes
synplan/ml/networks/__init__.py ADDED
File without changes
synplan/ml/networks/modules.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing basic pytorch architectures of policy and value neural networks."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Dict, List, Tuple, Union
5
+
6
+ import torch
7
+ from adabelief_pytorch import AdaBelief
8
+ from pytorch_lightning import LightningModule
9
+ from torch import Tensor
10
+ from torch.nn import GELU, Dropout, Linear, Module, ModuleDict, ModuleList
11
+ from torch.nn.functional import relu
12
+ from torch.optim.lr_scheduler import ReduceLROnPlateau
13
+ from torch_geometric.data.batch import Batch
14
+ from torch_geometric.nn.conv import GCNConv
15
+ from torch_geometric.nn.pool import global_add_pool
16
+
17
+
18
class GraphEmbedding(Module):
    """Needed to convert molecule atom vectors to the single vector using graph
    convolution."""

    def __init__(
        self, vector_dim: int = 512, dropout: float = 0.4, num_conv_layers: int = 5
    ):
        """Initializes a graph convolutional module. Needed to convert molecule atom
        vectors to the single vector using graph convolution.

        :param vector_dim: The dimensionality of the hidden layers and output layer of
            graph convolution module.
        :param dropout: The fraction of units randomly zeroed during training to
            prevent overfitting.
        :param num_conv_layers: The number of convolutional layers in a graph
            convolutional module.
        """

        super().__init__()
        # 11 is the size of the per-atom input feature vector
        self.expansion = Linear(11, vector_dim)
        self.dropout = Dropout(dropout)
        conv_stack = [
            GCNConv(vector_dim, vector_dim, improved=True)
            for _ in range(num_conv_layers)
        ]
        self.gcn_convs = ModuleList(conv_stack)

    def forward(self, graph: Batch, batch_size: int) -> Tensor:
        """Takes a graph as input and performs graph convolution on it.

        :param graph: The batch of molecular graphs, where each atom is represented by
            the atom/bond vector.
        :param batch_size: The size of the batch.
        :return: Graph embedding.
        """
        atom_features = graph.x.float()
        edge_index = graph.edge_index.long()
        # log(1 + x) transform of the raw atom features before expansion
        atom_features = self.expansion(torch.log(atom_features + 1))
        # residual GCN stack: each layer's output is added to its input
        for conv in self.gcn_convs:
            atom_features = atom_features + self.dropout(
                relu(conv(atom_features, edge_index))
            )

        return global_add_pool(atom_features, graph.batch, size=batch_size)
66
+
67
+
68
class GraphEmbeddingConcat(GraphEmbedding, Module):
    """Variant of GraphEmbedding that concatenates the outputs of all
    convolutional layers (jumping-knowledge style) instead of summing residual
    updates into a single feature vector."""

    def __init__(
        self, vector_dim: int = 512, dropout: float = 0.4, num_conv_layers: int = 8
    ):
        """Initializes the concatenating graph convolutional module.

        :param vector_dim: The target dimensionality of the concatenated output;
            each layer works with vector_dim // num_conv_layers features.
            NOTE(review): if vector_dim is not divisible by num_conv_layers,
            the actual output dimension is gcn_dim * num_conv_layers, which is
            smaller than vector_dim — confirm callers account for this.
        :param dropout: The dropout fraction applied after each layer.
        :param num_conv_layers: The number of convolutional layers.
        """
        # NOTE(review): super().__init__() builds the default GraphEmbedding
        # layers first and the attributes below then replace them; the default
        # layers are constructed only to be discarded.
        super().__init__()

        # per-layer feature width so that the concatenation approximates vector_dim
        gcn_dim = vector_dim // num_conv_layers

        self.expansion = Linear(11, gcn_dim)
        self.dropout = Dropout(dropout)
        self.gcn_convs = ModuleList(
            [
                ModuleDict(
                    {
                        "gcn": GCNConv(gcn_dim, gcn_dim, improved=True),
                        "activation": GELU(),
                    }
                )
                for _ in range(num_conv_layers)
            ]
        )

    def forward(self, graph: Batch, batch_size: int) -> Tensor:
        """Takes a graph as input and performs graph convolution on it.

        :param graph: The batch of molecular graphs, where each atom is represented by
            the atom/bond vector.
        :param batch_size: The size of the batch.
        :return: Graph embedding (concatenation of all layer outputs).
        """

        atoms, connections = graph.x.float(), graph.edge_index.long()
        # log(1 + x) transform of the raw atom features before expansion
        atoms = torch.log(atoms + 1)
        atoms = self.expansion(atoms)

        # run the stack sequentially, collecting each layer's output
        collected_atoms = []
        for gcn_convs in self.gcn_convs:
            atoms = gcn_convs["gcn"](atoms, connections)
            atoms = gcn_convs["activation"](atoms)
            atoms = self.dropout(atoms)
            collected_atoms.append(atoms)

        # concatenate per-layer features along the feature dimension
        atoms = torch.cat(collected_atoms, dim=-1)

        return global_add_pool(atoms, graph.batch, size=batch_size)
+
116
+
117
class MCTSNetwork(LightningModule, ABC):
    """Basic class for policy and value networks.

    Provides the shared graph embedder, Lightning train/validation/test steps
    and optimizer configuration; subclasses implement ``forward`` and
    ``_get_loss``.
    """

    def __init__(
        self,
        vector_dim: int,
        batch_size: int,
        dropout: float = 0.4,
        num_conv_layers: int = 5,
        learning_rate: float = 0.001,
        gcn_concat: bool = False,
    ):
        """The basic class for MCTS graph convolutional neural networks (policy and
        value network).

        :param vector_dim: The dimensionality of the hidden layers and output layer of
            graph convolution module.
        :param batch_size: The batch size used for logging the metrics.
        :param dropout: Dropout is a regularization technique used in neural networks to
            prevent overfitting.
        :param num_conv_layers: The number of convolutional layers in a graph
            convolutional module.
        :param learning_rate: The learning rate determines how quickly the model learns
            from the training data.
        :param gcn_concat: If True, use the embedder that concatenates the outputs
            of all convolutional layers (GraphEmbeddingConcat) instead of the
            additive residual embedder (GraphEmbedding).
        """
        super().__init__()
        if gcn_concat:
            self.embedder = GraphEmbeddingConcat(vector_dim, dropout, num_conv_layers)
        else:
            self.embedder = GraphEmbedding(vector_dim, dropout, num_conv_layers)
        self.batch_size = batch_size
        self.lr = learning_rate

    @abstractmethod
    def forward(self, batch: Batch) -> Tensor:
        """The forward function takes a batch of input data and performs forward
        propagation through the neural network.

        :param batch: The batch of molecular graphs processed together in a single
            forward pass through the neural network.
        """

    @abstractmethod
    def _get_loss(self, batch: Batch) -> Dict[str, Tensor]:
        """Calculate the loss and metrics for a given batch of data.

        FIX: the return annotation was ``Tensor`` but every implementation
        returns a metrics dictionary, and ``training_step`` below iterates
        ``metrics.items()`` and extracts ``metrics["loss"]``.

        :param batch: The batch of input data that is used to compute the loss.
        :return: A dictionary of metrics that must contain at least a "loss" entry.
        """

    def training_step(self, batch: Batch, batch_idx: int) -> Tensor:
        """Calculates the loss for a given training batch and logs the loss value.

        :param batch: The batch of data that is used for training.
        :param batch_idx: The index of the batch.
        :return: The value of the training loss.
        """
        metrics = self._get_loss(batch)
        for name, value in metrics.items():
            self.log(
                "train_" + name,
                value,
                prog_bar=True,
                on_step=True,
                on_epoch=True,
                batch_size=self.batch_size,
            )
        return metrics["loss"]

    def validation_step(self, batch: Batch, batch_idx: int) -> None:
        """Calculates the loss for a given validation batch and logs the loss value.

        :param batch: The batch of data that is used for validation.
        :param batch_idx: The index of the batch.
        """
        metrics = self._get_loss(batch)
        for name, value in metrics.items():
            self.log("val_" + name, value, on_epoch=True, batch_size=self.batch_size)

    def test_step(self, batch: Batch, batch_idx: int) -> None:
        """Calculates the loss for a given test batch and logs the loss value.

        :param batch: The batch of data that is used for testing.
        :param batch_idx: The index of the batch.
        """
        metrics = self._get_loss(batch)
        for name, value in metrics.items():
            self.log("test_" + name, value, on_epoch=True, batch_size=self.batch_size)

    def configure_optimizers(
        self,
    ) -> Tuple[List[AdaBelief], List[Dict[str, Union[bool, str, ReduceLROnPlateau]]]]:
        """Returns an optimizer and a learning rate scheduler for training a model using
        the AdaBelief optimizer and ReduceLROnPlateau scheduler.

        :return: The optimizer and a scheduler.
        """

        optimizer = AdaBelief(
            self.parameters(),
            lr=self.lr,
            eps=1e-16,
            betas=(0.9, 0.999),
            weight_decouple=True,
            rectify=True,
            weight_decay=0.01,
            print_change_log=False,
        )

        # reduce the learning rate when the validation loss plateaus
        lr_scheduler = ReduceLROnPlateau(
            optimizer, patience=3, factor=0.8, min_lr=5e-5, verbose=True
        )
        scheduler = {
            "scheduler": lr_scheduler,
            "reduce_on_plateau": True,
            "monitor": "val_loss",
        }

        return [optimizer], [scheduler]
synplan/ml/networks/policy.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing main class for policy network."""
2
+
3
+ from abc import ABC
4
+ from typing import Dict
5
+
6
+ import torch
7
+ from pytorch_lightning import LightningModule
8
+ from torch import Tensor
9
+ from torch.nn import Linear
10
+ from torch.nn.functional import binary_cross_entropy_with_logits, cross_entropy, one_hot
11
+ from torch_geometric.data.batch import Batch
12
+ from torchmetrics.functional.classification import f1_score, recall, specificity
13
+
14
+ from synplan.ml.networks.modules import MCTSNetwork
15
+
16
+
17
class PolicyNetwork(MCTSNetwork, LightningModule, ABC):
    """Policy network.

    Supports two modes: "ranking" (softmax distribution over rules) and
    "filtering" (independent sigmoids plus a separate priority head).
    """

    def __init__(
        self,
        *args,
        n_rules: int,
        vector_dim: int,
        policy_type: str = "ranking",
        **kwargs
    ):
        """Initializes a policy network with the given number of reaction rules (output
        dimension) and vector graph embedding dimension, and creates linear layers for
        predicting the regular and priority reaction rules.

        :param n_rules: The number of reaction rules in the policy network.
        :param vector_dim: The dimensionality of the input vectors.
        :param policy_type: Either "ranking" or "filtering".
        """
        super().__init__(vector_dim, *args, **kwargs)
        self.save_hyperparameters()
        self.policy_type = policy_type
        self.n_rules = n_rules
        self.y_predictor = Linear(vector_dim, n_rules)

        # the priority head only exists in filtering mode
        if self.policy_type == "filtering":
            self.priority_predictor = Linear(vector_dim, n_rules)

    def forward(self, batch: Batch) -> Tensor:
        """Takes a molecular graph, applies a graph convolution and sigmoid/softmax
        layers to predict regular and priority reaction rules.

        :param batch: The input batch of molecular graphs.
        :return: In "ranking" mode, the softmax probabilities over rules; in
            "filtering" mode, a tuple of sigmoid probabilities for regular and
            priority reaction rules.
        :raises ValueError: If the policy type is unknown.
        """
        x = self.embedder(batch, self.batch_size)
        y = self.y_predictor(x)

        if self.policy_type == "ranking":
            y = torch.softmax(y, dim=-1)
            return y

        if self.policy_type == "filtering":
            y = torch.sigmoid(y)
            priority = torch.sigmoid(self.priority_predictor(x))
            return y, priority

        # FIX: previously an unknown policy_type silently returned None.
        raise ValueError(f"Unknown policy type: {self.policy_type}")

    def _get_loss(self, batch: Batch) -> Dict[str, Tensor]:
        """Calculates the loss and various classification metrics for a given batch for
        reaction rules prediction.

        :param batch: The batch of molecular graphs.
        :return: A dictionary with loss value and balanced accuracy of reaction rules
            prediction.
        :raises ValueError: If the policy type is unknown.
        """
        true_y = batch.y_rules.long()
        x = self.embedder(batch, self.batch_size)
        pred_y = self.y_predictor(x)

        if self.policy_type == "ranking":
            true_one_hot = one_hot(true_y, num_classes=self.n_rules)
            loss = cross_entropy(pred_y, true_one_hot.float())
            # balanced accuracy = mean of recall and specificity
            ba_y = (
                recall(pred_y, true_y, task="multiclass", num_classes=self.n_rules)
                + specificity(
                    pred_y, true_y, task="multiclass", num_classes=self.n_rules
                )
            ) / 2
            f1_y = f1_score(pred_y, true_y, task="multiclass", num_classes=self.n_rules)

            metrics = {"loss": loss, "balanced_accuracy_y": ba_y, "f1_score_y": f1_y}

        elif self.policy_type == "filtering":
            loss_y = binary_cross_entropy_with_logits(pred_y, true_y.float())

            ba_y = (
                recall(pred_y, true_y, task="multilabel", num_labels=self.n_rules)
                + specificity(
                    pred_y, true_y, task="multilabel", num_labels=self.n_rules
                )
            ) / 2

            f1_y = f1_score(pred_y, true_y, task="multilabel", num_labels=self.n_rules)

            true_priority = batch.y_priority.float()
            pred_priority = self.priority_predictor(x)
            loss_priority = binary_cross_entropy_with_logits(
                pred_priority, true_priority
            )

            # total loss combines rule prediction and priority prediction
            loss = loss_y + loss_priority

            true_priority = true_priority.long()
            ba_priority = (
                recall(
                    pred_priority,
                    true_priority,
                    task="multilabel",
                    num_labels=self.n_rules,
                )
                + specificity(
                    pred_priority,
                    true_priority,
                    task="multilabel",
                    num_labels=self.n_rules,
                )
            ) / 2

            f1_priority = f1_score(
                pred_priority, true_priority, task="multilabel", num_labels=self.n_rules
            )

            metrics = {
                "loss": loss,
                "balanced_accuracy_y": ba_y,
                "f1_score_y": f1_y,
                "balanced_accuracy_priority": ba_priority,
                "f1_score_priority": f1_priority,
            }

        else:
            # FIX: previously an unknown policy_type raised UnboundLocalError
            # on the return below; fail with a clear message instead.
            raise ValueError(f"Unknown policy type: {self.policy_type}")

        return metrics
synplan/ml/networks/value.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing main class for value network."""
2
+
3
+ from abc import ABC
4
+ from typing import Any, Dict
5
+
6
+ import torch
7
+ from pytorch_lightning import LightningModule
8
+ from torch import Tensor
9
+ from torch.nn import Linear
10
+ from torch.nn.functional import binary_cross_entropy_with_logits
11
+ from torch_geometric.data.batch import Batch
12
+ from torchmetrics.functional.classification import (
13
+ binary_f1_score,
14
+ binary_recall,
15
+ binary_specificity,
16
+ )
17
+
18
+ from synplan.ml.networks.modules import MCTSNetwork
19
+
20
+
21
class ValueNetwork(MCTSNetwork, LightningModule, ABC):
    """Value network."""

    def __init__(self, vector_dim: int, *args: Any, **kwargs: Any) -> None:
        """Initializes a value network, and creates a linear layer for predicting the
        synthesisability of a given precursor represented by a molecular graph.

        :param vector_dim: The dimensionality of the output linear layer.
        """
        super().__init__(vector_dim, *args, **kwargs)
        self.save_hyperparameters()
        # single-logit head: synthesisable vs not
        self.predictor = Linear(vector_dim, 1)

    def forward(self, batch) -> torch.Tensor:
        """Embeds a batch of molecular graphs and returns the predicted
        synthesisability as a sigmoid probability.

        :param batch: The batch of molecular graphs.
        :return: The predicted synthesisability (between 0 and 1).
        """
        embedding = self.embedder(batch, self.batch_size)
        return torch.sigmoid(self.predictor(embedding))

    def _get_loss(self, batch: Batch) -> Dict[str, Tensor]:
        """Calculates the loss and classification metrics for a given batch for the
        precursor synthesisability prediction.

        :param batch: The batch of molecular graphs.
        :return: The dictionary with loss value, balanced accuracy and F1-score of
            precursor synthesisability prediction.
        """

        target = torch.unsqueeze(batch.y.float(), -1)
        embedding = self.embedder(batch, self.batch_size)
        logits = self.predictor(embedding)
        # BCE on the raw logits (sigmoid applied internally)
        loss = binary_cross_entropy_with_logits(logits, target)

        target = target.long()
        # balanced accuracy = mean of recall and specificity
        balanced_accuracy = (
            binary_recall(logits, target) + binary_specificity(logits, target)
        ) / 2
        return {
            "loss": loss,
            "balanced_accuracy": balanced_accuracy,
            "f1_score": binary_f1_score(logits, target),
        }
synplan/ml/training/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# NOTE(review): names from .supervised are both star-imported and imported
# explicitly below; the explicit imports and __all__ define the intended
# public API of this package.
from .supervised import *
from .preprocessing import ValueNetworkDataset, mol_to_pyg, MENDEL_INFO
from .supervised import create_policy_dataset, run_policy_training

# Public API of synplan.ml.training.
__all__ = [
    "ValueNetworkDataset",
    "mol_to_pyg",
    "MENDEL_INFO",
    "create_policy_dataset",
    "run_policy_training",
]
synplan/ml/training/preprocessing.py ADDED
@@ -0,0 +1,516 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing functions for preparation of the training sets for policy and value
2
+ network."""
3
+
4
+ import logging
5
+ import os
6
+ import pickle
7
+ from abc import ABC
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+
10
+ import ray
11
+ import torch
12
+ from CGRtools import smiles
13
+ from CGRtools.containers import MoleculeContainer
14
+ from CGRtools.exceptions import InvalidAromaticRing
15
+ from CGRtools.reactor import Reactor
16
+ from ray.util.queue import Empty, Queue
17
+ from torch import Tensor
18
+ from torch_geometric.data import InMemoryDataset
19
+ from torch_geometric.data.data import Data
20
+ from torch_geometric.data.makedirs import makedirs
21
+ from torch_geometric.transforms import ToUndirected
22
+ from tqdm import tqdm
23
+
24
+ from synplan.chem.utils import unite_molecules
25
+ from synplan.utils.files import ReactionReader
26
+ from synplan.utils.loading import load_reaction_rules
27
+
28
+
29
class ValueNetworkDataset(InMemoryDataset, ABC):
    """Value network dataset."""

    def __init__(self, extracted_precursor: Dict[str, float]) -> None:
        """Initializes a value network dataset object.

        :param extracted_precursor: The dictionary with the precursors extracted from
            the built search trees and their labels.
        """
        super().__init__(None, None, None)

        if extracted_precursor:
            self.data, self.slices = self.graphs_from_extracted_precursor(
                extracted_precursor
            )

    @staticmethod
    def mol_to_graph(molecule: MoleculeContainer, label: float) -> Optional[Data]:
        """Converts a molecule to a PyTorch geometric graph and assigns the reward
        value (label) to it.

        :param molecule: The input molecule.
        :param label: The label (solved/unsolved routes in the tree) of the molecule
            (precursor).
        :return: A PyTorch Geometric graph representation of a molecule, or None if
            the molecule is too small or could not be converted.
        """
        # molecules with fewer than 3 atoms are not converted
        if len(molecule) > 2:
            graph = mol_to_pyg(molecule)
            if graph:
                graph.y = torch.tensor([label])
                return graph

        return None

    def graphs_from_extracted_precursor(
        self, extracted_precursor: Dict[str, float]
    ) -> Tuple[Data, Dict]:
        """Converts the precursors extracted from the search trees to PyTorch
        geometric graphs.

        :param extracted_precursor: The dictionary with the precursors extracted from
            the built search trees and their labels.
        :return: The PyTorch geometric graphs and slices.
        """
        graphs = []
        for precursor_smiles, label in extracted_precursor.items():
            graph = self.mol_to_graph(smiles(precursor_smiles), label)
            if graph:
                graphs.append(graph)
        return self.collate(graphs)
+
82
+
83
class RankingPolicyDataset(InMemoryDataset):
    """Ranking policy network dataset."""

    def __init__(self, reactions_path: str, reaction_rules_path: str, output_path: str):
        """Initializes a policy network dataset.

        :param reactions_path: The path to the file containing the reaction data used
            for extraction of reaction rules.
        :param reaction_rules_path: The path to the file containing the reaction rules.
        :param output_path: The output path to the file where policy network dataset
            will be saved.
        """
        super().__init__(None, None, None)

        self.reactions_path = reactions_path
        self.reaction_rules_path = reaction_rules_path
        self.output_path = output_path

        # reuse a previously prepared dataset when available
        if output_path and os.path.exists(output_path):
            self.data, self.slices = torch.load(self.output_path)
        else:
            self.data, self.slices = self.prepare_data()

    @property
    def num_classes(self) -> int:
        # number of distinct reaction rule labels in the dataset
        return self._infer_num_classes(self._data.y_rules)

    def prepare_data(self) -> Tuple[Data, Dict[str, Tensor]]:
        """Prepares data by loading reaction rules, preprocessing the molecules,
        collating the data, and returning the data and slices.

        :return: The PyTorch geometric graphs and slices.
        """

        with open(self.reaction_rules_path, "rb") as inp:
            reaction_rules = pickle.load(inp)
        # sort rules by popularity (number of supporting reactions), descending,
        # so that rule index 0 is the most popular rule
        reaction_rules = sorted(reaction_rules, key=lambda x: len(x[1]), reverse=True)

        # map each reaction id to the index of the rule it supports
        reaction_rule_pairs = {}
        for rule_i, (_, reactions_ids) in enumerate(reaction_rules):
            for reaction_id in reactions_ids:
                reaction_rule_pairs[reaction_id] = rule_i
        reaction_rule_pairs = dict(sorted(reaction_rule_pairs.items()))

        list_of_graphs = []
        with ReactionReader(self.reactions_path) as reactions:

            for reaction_id, reaction in tqdm(
                enumerate(reactions),
                desc="Number of reactions processed: ",
                bar_format="{desc}{n} [{elapsed}]",
            ):

                rule_id = reaction_rule_pairs.get(reaction_id)
                # FIX: the previous truthiness check ("if rule_id:") dropped every
                # reaction mapped to rule index 0 — the most popular rule.
                if rule_id is not None:
                    try:  # MENDEL_INFO does not contain cadmium (Cd) properties
                        molecule = unite_molecules(reaction.products)
                        pyg_graph = mol_to_pyg(molecule)

                    except (
                        Exception
                    ) as e:  # TypeError: can't assign a NoneType to a torch.ByteTensor
                        logging.debug(e)
                        continue

                    if pyg_graph is not None:
                        pyg_graph.y_rules = torch.tensor([rule_id], dtype=torch.long)
                        list_of_graphs.append(pyg_graph)

        data, slices = self.collate(list_of_graphs)
        if self.output_path:
            makedirs(os.path.dirname(self.output_path))
            torch.save((data, slices), self.output_path)

        return data, slices
+
161
+
162
class FilteringPolicyDataset(InMemoryDataset):
    """Filtering policy network dataset."""

    def __init__(
        self,
        molecules_path: str,
        reaction_rules_path: str,
        output_path: str,
        num_cpus: int,
    ) -> None:
        """Initializes a policy network dataset object.

        :param molecules_path: The path to the file containing the molecules for
            reaction rule appliance.
        :param reaction_rules_path: The path to the file containing the reaction rules.
        :param output_path: The output path to the file where policy network dataset
            will be stored.
        :param num_cpus: The number of CPUs to be used for the dataset preparation.
        :return: None.
        """
        super().__init__(None, None, None)

        self.molecules_path = molecules_path
        self.reaction_rules_path = reaction_rules_path
        self.output_path = output_path
        self.num_cpus = num_cpus
        # per-worker queue capacity multiplier used when sizing the ray Queue
        self.batch_size = 100

        # reuse a previously prepared dataset when available
        if output_path and os.path.exists(output_path):
            self.data, self.slices = torch.load(self.output_path)
        else:
            self.data, self.slices = self.prepare_data()

    @property
    def num_classes(self) -> int:
        # one column per reaction rule in the multilabel target
        return self._data.y_rules.shape[1]

    def prepare_data(self) -> Tuple[Data, Dict]:
        """Prepares data by loading reaction rules, initializing Ray, preprocessing the
        molecules, collating the data, and returning the data and slices.

        :return: The PyTorch geometric graphs and slices.
        """

        ray.init(num_cpus=self.num_cpus, ignore_reinit_error=True)
        reaction_rules = load_reaction_rules(self.reaction_rules_path)
        # share the rules with all workers through the object store
        reaction_rules_ids = ray.put(reaction_rules)

        to_process = Queue(maxsize=self.batch_size * self.num_cpus)
        processed_data = []
        # one long-running worker per CPU, all consuming from the shared queue
        # NOTE(review): workers presumably stop when the queue stays empty for
        # some timeout (see preprocess_filtering_policy_molecules) — confirm
        # there is no shutdown race with the producer loop below.
        results_ids = [
            preprocess_filtering_policy_molecules.remote(to_process, reaction_rules_ids)
            for _ in range(self.num_cpus)
        ]

        with open(self.molecules_path, "r", encoding="utf-8") as inp_data:
            for molecule in tqdm(
                inp_data.read().splitlines(),
                desc="Number of molecules processed: ",
                bar_format="{desc}{n} [{elapsed}]",
            ):

                to_process.put(molecule)

        # block until all workers finish and flatten their per-worker results
        results = [graph for res in ray.get(results_ids) if res for graph in res]
        processed_data.extend(results)

        ray.shutdown()

        # sparse targets are densified before collation
        for pyg in processed_data:
            pyg.y_rules = pyg.y_rules.to_dense()
            pyg.y_priority = pyg.y_priority.to_dense()

        data, slices = self.collate(processed_data)
        if self.output_path:
            makedirs(os.path.dirname(self.output_path))
            torch.save((data, slices), self.output_path)

        return data, slices
+
242
+
243
def reaction_rules_appliance(
    molecule: MoleculeContainer, reaction_rules: List[Reactor]
) -> Tuple[List[int], List[int]]:
    """Applies each reaction rule from the list of reaction rules to a given molecule
    and returns the indexes of the successfully applied regular and prioritized reaction
    rules.

    :param molecule: The input molecule.
    :param reaction_rules: The list of reaction rules.
    :return: The two lists of indexes of successfully applied regular reaction rules and
        priority reaction rules.
    """

    applied_rules, priority_rules = [], []
    for i, rule in enumerate(reaction_rules):

        rule_applied = False
        rule_prioritized = False

        try:
            for reaction in rule([molecule]):
                for prod in reaction.products:
                    prod.kekule()
                    # NOTE(review): a truthy check_valence() (valence errors)
                    # only breaks out of the product loop; rule_applied is
                    # still set True below — confirm this is intended
                    if prod.check_valence():
                        break
                rule_applied = True

                # check priority rules
                if len(reaction.products) > 1:
                    # coupling retro rule: all fragments sizeable and the
                    # combined product grows by fewer than 6 atoms
                    if all(len(mol) > 6 for mol in reaction.products):
                        if (
                            sum(len(mol) for mol in reaction.products)
                            - len(reaction.reactants[0])
                            < 6
                        ):
                            rule_prioritized = True
                else:
                    # cyclization retro rule: ring count decreases from
                    # reactants to products (SSSR comparison)
                    if sum(len(mol.sssr) for mol in reaction.products) < sum(
                        len(mol.sssr) for mol in reaction.reactants
                    ):
                        rule_prioritized = True
            # record the rule index once, regardless of how many reactions it produced
            if rule_applied:
                applied_rules.append(i)
            # a rule can be both applied and prioritized
            if rule_prioritized:
                priority_rules.append(i)
        except Exception as e:
            # best-effort: a failing rule is skipped, not fatal
            logging.debug(e)
            continue

    return applied_rules, priority_rules
297
+
298
+
299
@ray.remote
def preprocess_filtering_policy_molecules(
    to_process: Queue, reaction_rules: List[Reactor]
) -> List[Optional[Data]]:
    """Preprocesses a list of molecules by applying reaction rules and converting
    molecules into PyTorch geometric graphs. Successfully applied reaction rules are
    converted to binary vectors for policy network training.

    Runs as a Ray worker: drains SMILES from the shared queue until a 30 s
    poll times out (``Empty``), which is treated as end-of-input.

    :param to_process: The queue containing SMILES of molecules to be converted to the
        training data.
    :param reaction_rules: The list of reaction rules.
    :return: The list of PyGraph objects.
    """

    pyg_graphs = []
    while True:
        try:
            molecule = smiles(to_process.get(timeout=30))
            # smiles() may return a non-molecule container; skip those
            if not isinstance(molecule, MoleculeContainer):
                continue

            # indexes of rules that apply to this molecule (and of the
            # prioritized subset)
            applied_rules, priority_rules = reaction_rules_appliance(
                molecule, reaction_rules
            )

            # sparse one-hot label vectors over all rules; densified later by
            # the dataset after collection
            y_rules = torch.sparse_coo_tensor(
                [applied_rules],
                torch.ones(len(applied_rules)),
                (len(reaction_rules),),
                dtype=torch.uint8,
            )
            y_priority = torch.sparse_coo_tensor(
                [priority_rules],
                torch.ones(len(priority_rules)),
                (len(reaction_rules),),
                dtype=torch.uint8,
            )

            # add a batch dimension: (1, num_rules)
            y_rules = torch.unsqueeze(y_rules, 0)
            y_priority = torch.unsqueeze(y_priority, 0)

            pyg_graph = mol_to_pyg(molecule)
            if not pyg_graph:
                continue
            pyg_graph.y_rules = y_rules
            pyg_graph.y_priority = y_priority
            pyg_graphs.append(pyg_graph)

        # NOTE(review): only queue Empty is caught — any other exception kills
        # the worker and silently drops its accumulated graphs; confirm
        except Empty:
            break

    return pyg_graphs
352
+
353
+
354
def atom_to_vector(atom: Any) -> Tensor:
    """Encode one atom as a length-8 feature vector.

    The features, in order:

    1. Atomic number
    2. Period
    3. Group
    4. Number of valence electrons + atom's charge
    5. Shell
    6. Total number of hydrogens
    7. Whether the atom is in a ring
    8. Number of neighbors

    :param atom: The atom object.

    :return: The uint8 feature tensor of shape (8,).
    """
    period, group, shell, electrons = MENDEL_INFO[atom.atomic_symbol]
    features = [
        atom.atomic_number,
        period,
        group,
        electrons + atom.charge,
        shell,
        atom.total_hydrogens,
        int(atom.in_ring),
        atom.neighbors,
    ]
    return torch.tensor(features, dtype=torch.uint8)
382
+
383
+
384
def bonds_to_vector(molecule: MoleculeContainer, atom_ind: int) -> Tensor:
    """Count the bonds of a single atom, grouped by bond order.

    :param molecule: The given molecule.
    :param atom_ind: The index of the atom in the molecule whose bonds are
        counted.
    :return: A uint8 tensor of size 3; element ``k`` holds the number of bonds
        of order ``k + 1`` attached to the atom with the given index.
    """

    counts = torch.zeros(3, dtype=torch.uint8)
    for bond in molecule._bonds[atom_ind].values():
        order_slot = int(bond) - 1
        counts[order_slot] += 1
    return counts
399
+
400
+
401
def mol_to_matrix(molecule: MoleculeContainer) -> Tensor:
    """Encode a molecule as a per-atom feature matrix of shape (num_atoms, 11).

    Each row corresponds to one atom: columns 0-7 hold the atom features from
    ``atom_to_vector`` and columns 8-10 hold the bond-order counts from
    ``bonds_to_vector``. (The previous docstring claimed a width of 12, but
    the matrix has always been 11 columns wide.)

    :param molecule: The molecule to be converted to a feature matrix.
    :return: The uint8 atoms feature matrix.
    """

    atoms_vectors = torch.zeros((len(molecule), 11), dtype=torch.uint8)
    # single pass over the atoms (the original looped twice);
    # CGRtools numbers atoms from 1, rows are 0-based
    for n, atom in molecule.atoms():
        atoms_vectors[n - 1][:8] = atom_to_vector(atom)
        atoms_vectors[n - 1][8:] = bonds_to_vector(molecule, n)

    return atoms_vectors
416
+
417
+
418
def mol_to_pyg(
    molecule: MoleculeContainer, canonicalize: bool = True
) -> Optional[Data]:
    """Converts a single molecule into an undirected PyTorch Geometric graph
    with per-atom feature rows.

    :param molecule: The molecule to be converted to PyTorch Geometric graph.
    :param canonicalize: If True, the input molecule is canonicalized.
    :return: The PyGraph object, or None if the molecule is a single atom or
        fails kekulization / valence checks.
    """

    if len(molecule) == 1:  # to avoid a precursor to be a single atom
        return None

    # work on a copy: canonicalize/kekule/remap mutate the molecule
    tmp_molecule = molecule.copy()
    try:
        if canonicalize:
            tmp_molecule.canonicalize()
        tmp_molecule.kekule()
        # truthy check_valence() means valence errors -> unusable molecule
        if tmp_molecule.check_valence():
            return None
    except InvalidAromaticRing:
        return None

    # remapping target for torch_geometric because
    # it is necessary that the elements in edge_index only hold nodes_idx in the range { 0, ..., num_nodes - 1}
    new_mappings = {n: i for i, (n, _) in enumerate(tmp_molecule.atoms(), 1)}
    tmp_molecule.remap(new_mappings)

    # get edge indexes from target mapping (atom numbers are 1-based, shift to 0-based)
    edge_index = []
    for atom, neighbour, bond in tmp_molecule.bonds():
        edge_index.append([atom - 1, neighbour - 1])
    edge_index = torch.tensor(edge_index, dtype=torch.long)

    # per-atom feature matrix, see mol_to_matrix
    x = mol_to_matrix(tmp_molecule)

    mol_pyg_graph = Data(x=x, edge_index=edge_index.t().contiguous())
    # CGRtools yields each bond once; mirror the edges to make the graph undirected
    mol_pyg_graph = ToUndirected()(mol_pyg_graph)

    assert mol_pyg_graph.is_undirected()

    return mol_pyg_graph
462
+
463
+
464
# Periodic-table lookup used by atom_to_vector:
# atomic symbol -> (period, group, shell, electrons).
# group is None for the lanthanides (Ce, Dy, Er, Gd, Nd, Pr, Sm, Yb),
# which have no conventional group number.
MENDEL_INFO = {
    "Ag": (5, 11, 1, 1),
    "Al": (3, 13, 2, 1),
    "Ar": (3, 18, 2, 6),
    "As": (4, 15, 2, 3),
    "B": (2, 13, 2, 1),
    "Ba": (6, 2, 1, 2),
    "Bi": (6, 15, 2, 3),
    "Br": (4, 17, 2, 5),
    "C": (2, 14, 2, 2),
    "Ca": (4, 2, 1, 2),
    "Ce": (6, None, 1, 2),
    "Cl": (3, 17, 2, 5),
    "Cr": (4, 6, 1, 1),
    "Cs": (6, 1, 1, 1),
    "Cu": (4, 11, 1, 1),
    "Dy": (6, None, 1, 2),
    "Er": (6, None, 1, 2),
    "F": (2, 17, 2, 5),
    "Fe": (4, 8, 1, 2),
    "Ga": (4, 13, 2, 1),
    "Gd": (6, None, 1, 2),
    "Ge": (4, 14, 2, 2),
    "Hg": (6, 12, 1, 2),
    "I": (5, 17, 2, 5),
    "In": (5, 13, 2, 1),
    "K": (4, 1, 1, 1),
    "La": (6, 3, 1, 2),
    "Li": (2, 1, 1, 1),
    "Mg": (3, 2, 1, 2),
    "Mn": (4, 7, 1, 2),
    "N": (2, 15, 2, 3),
    "Na": (3, 1, 1, 1),
    "Nd": (6, None, 1, 2),
    "O": (2, 16, 2, 4),
    "P": (3, 15, 2, 3),
    "Pb": (6, 14, 2, 2),
    "Pd": (5, 10, 3, 10),
    "Pr": (6, None, 1, 2),
    "Rb": (5, 1, 1, 1),
    "S": (3, 16, 2, 4),
    "Sb": (5, 15, 2, 3),
    "Se": (4, 16, 2, 4),
    "Si": (3, 14, 2, 2),
    "Sm": (6, None, 1, 2),
    "Sn": (5, 14, 2, 2),
    "Sr": (5, 2, 1, 2),
    "Te": (5, 16, 2, 4),
    "Ti": (4, 4, 1, 2),
    "Tl": (6, 13, 2, 1),
    "Yb": (6, None, 1, 2),
    "Zn": (4, 12, 1, 2),
}
synplan/ml/training/reinforcement.py ADDED
@@ -0,0 +1,379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing functions for running value network tuning with reinforcement learning
2
+ approach."""
3
+
4
+ import os
5
+ import random
6
+ from collections import defaultdict
7
+ from pathlib import Path
8
+ from random import shuffle
9
+ from typing import Dict, List
10
+
11
+ import torch
12
+ from CGRtools.containers import MoleculeContainer
13
+ from pytorch_lightning import Trainer
14
+ from torch.utils.data import random_split
15
+ from torch_geometric.data.lightning import LightningDataset
16
+
17
+ from synplan.chem.precursor import compose_precursors
18
+ from synplan.mcts.evaluation import ValueNetworkFunction
19
+ from synplan.mcts.expansion import PolicyNetworkFunction
20
+ from synplan.mcts.tree import Tree
21
+ from synplan.ml.networks.value import ValueNetwork
22
+ from synplan.ml.training.preprocessing import ValueNetworkDataset
23
+ from synplan.utils.config import (
24
+ PolicyNetworkConfig,
25
+ TuningConfig,
26
+ TreeConfig,
27
+ ValueNetworkConfig,
28
+ )
29
+ from synplan.utils.files import MoleculeReader
30
+ from synplan.utils.loading import (
31
+ load_building_blocks,
32
+ load_reaction_rules,
33
+ load_value_net,
34
+ )
35
+ from synplan.utils.logging import DisableLogger, HiddenPrints
36
+
37
+
38
def create_value_network(value_config: ValueNetworkConfig) -> ValueNetwork:
    """Builds a fresh value network and persists its initial checkpoint.

    :param value_config: The value network configuration.
    :return: The newly created value network to be trained/tuned.
    """

    network = ValueNetwork(
        vector_dim=value_config.vector_dim,
        batch_size=value_config.batch_size,
        dropout=value_config.dropout,
        num_conv_layers=value_config.num_conv_layers,
        learning_rate=value_config.learning_rate,
    )

    # save the untrained weights so subsequent tuning rounds can load them
    checkpoint_path = Path(value_config.weights_path)
    with DisableLogger(), HiddenPrints():
        trainer = Trainer()
        trainer.strategy.connect(network)
        trainer.save_checkpoint(checkpoint_path)

    return network
60
+
61
+
62
def create_targets_batch(
    targets: "List[MoleculeContainer]", batch_size: int
) -> "List[List[MoleculeContainer]]":
    """Creates the targets batches for planning simulations and value network tuning.

    :param targets: The list of target molecules.
    :param batch_size: The size of each target batch (must be positive); the
        last batch may be smaller than ``batch_size``.
    :return: The list of lists corresponding to each target batch.
    """

    num_targets = len(targets)

    if num_targets // batch_size == 0:
        print(f"1 batch were created with {num_targets} molecules")
    else:
        # ceil(num_targets / batch_size) full-or-partial batches
        num_batches = num_targets // batch_size + int(bool(num_targets % batch_size))
        print(
            f"{num_batches} batches were created with {batch_size} molecules each"
        )

    # idiomatic slicing replaces the original manual index bookkeeping;
    # a slice past the end of the list is truncated automatically
    return [targets[i : i + batch_size] for i in range(0, num_targets, batch_size)]
94
+
95
+
96
def run_tree_search(
    target: MoleculeContainer,
    tree_config: TreeConfig,
    policy_config: PolicyNetworkConfig,
    value_config: ValueNetworkConfig,
    reaction_rules_path: str,
    building_blocks_path: str,
) -> Tree:
    """Runs tree search for the given target molecule.

    Note: mutates the shared ``tree_config`` (forces ``evaluation_type="gcn"``
    and ``silent=True``) as a side effect.

    :param target: The target molecule.
    :param tree_config: The planning configuration of tree search.
    :param policy_config: The policy network configuration.
    :param value_config: The value network configuration.
    :param reaction_rules_path: The path to the file with reaction rules.
    :param building_blocks_path: The path to the file with building blocks.
    :return: The built search tree for the given molecule.
    """

    # policy and value function loading
    policy_function = PolicyNetworkFunction(policy_config=policy_config)
    value_function = ValueNetworkFunction(weights_path=value_config.weights_path)
    reaction_rules = load_reaction_rules(reaction_rules_path)
    building_blocks = load_building_blocks(building_blocks_path, standardize=True)

    # initialize tree; evaluation must go through the (tunable) value network
    tree_config.evaluation_type = "gcn"
    tree_config.silent = True
    tree = Tree(
        target=target,
        config=tree_config,
        reaction_rules=reaction_rules,
        building_blocks=building_blocks,
        expansion_function=policy_function,
        evaluation_function=value_function,
    )
    # suppress per-iteration progress bar (private attribute of Tree)
    tree._tqdm = False

    # remove the target itself from building blocks so the search cannot
    # trivially "solve" it in zero steps
    if str(target) in tree.building_blocks:
        tree.building_blocks.remove(str(target))

    # run tree search to exhaustion (Tree is an iterator of search steps)
    _ = list(tree)

    return tree
142
+
143
+
144
def extract_tree_precursor(tree_list: List[Tree]) -> Dict[str, float]:
    """Takes the built tree and extracts the precursor for value network tuning. The
    precursor from found retrosynthetic routes are labeled as a positive class and precursor
    from not solved routes are labeled as a negative class.

    :param tree_list: The list of built search trees.

    :return: The dictionary with the precursor SMILES and its class (positive - 1 or negative - 0).
    """
    extracted_precursor = defaultdict(float)
    for tree in tree_list:
        for idx, node in tree.nodes.items():
            # solved node: walk up to the root (node id 1) and mark every
            # composed precursor on the winning path as positive
            if node.is_solved():
                parent = idx
                while parent and parent != 1:
                    composed_smi = str(
                        compose_precursors(tree.nodes[parent].new_precursors)
                    )
                    extracted_precursor[composed_smi] = 1.0
                    parent = tree.parents[parent]
            else:
                # unsolved node: negative example
                # NOTE(review): the same SMILES can be written by both
                # branches; the last write wins — confirm this is intended
                composed_smi = str(compose_precursors(tree.nodes[idx].new_precursors))
                extracted_precursor[composed_smi] = 0.0

    # shuffle extracted precursor (dicts preserve insertion order)
    processed_keys = list(extracted_precursor.keys())
    shuffle(processed_keys)
    extracted_precursor = {i: extracted_precursor[i] for i in processed_keys}

    return extracted_precursor
175
+
176
+
177
def balance_extracted_precursor(extracted_precursor):
    """Balances the precursor classes by downsampling the negatives.

    Keeps every positive example (label 1) and a random subset of negative
    examples (label 0) of at most the same size, so the value network is tuned
    on a class-balanced set.

    Fixes a bug in the previous implementation, which popped items from the
    negatives list but never copied any negative into the returned dictionary,
    so the "balanced" set contained positives only.

    :param extracted_precursor: The dictionary mapping precursor SMILES to its
        label (1.0 - positive, 0.0 - negative).
    :return: The balanced dictionary of precursor SMILES and labels.
    """
    positives = {smi: lbl for smi, lbl in extracted_precursor.items() if lbl == 1}
    negatives = [smi for smi, lbl in extracted_precursor.items() if lbl == 0]

    balanced = dict(positives)
    for smi in random.sample(negatives, min(len(positives), len(negatives))):
        balanced[smi] = extracted_precursor[smi]
    return balanced
186
+
187
+
188
def create_updating_set(
    extracted_precursor: Dict[str, float], batch_size: int = 1
) -> LightningDataset:
    """Builds the value-network tuning dataset from the precursors extracted
    during the planning simulations.

    :param extracted_precursor: The dictionary with the extracted precursors
        and their labels.
    :param batch_size: The size of the batch in value network updating.
    :return: A LightningDataset object, which contains the tuning set for
        value network tuning.
    """

    # class-balance before splitting
    balanced = balance_extracted_precursor(extracted_precursor)
    dataset = ValueNetworkDataset(balanced)

    # fixed 60/40 train/validation split, seeded for reproducibility
    train_size = int(0.6 * len(dataset))
    split_sizes = [train_size, len(dataset) - train_size]
    train_set, val_set = random_split(
        dataset, split_sizes, torch.Generator().manual_seed(42)
    )

    print(f"Training set size: {len(train_set)}")
    print(f"Validation set size: {len(val_set)}")

    return LightningDataset(
        train_set, val_set, batch_size=batch_size, pin_memory=True, drop_last=True
    )
217
+
218
+
219
def tune_value_network(
    datamodule: LightningDataset, value_config: ValueNetworkConfig
) -> None:
    """Trains the value network using a given tuning data and saves the trained neural
    network, overwriting the checkpoint at ``value_config.weights_path``.

    :param datamodule: The tuning dataset (LightningDataset).
    :param value_config: The value network configuration.
    :return: None.
    """

    current_weights = value_config.weights_path
    value_network = load_value_net(ValueNetwork, current_weights)

    with DisableLogger(), HiddenPrints():
        # "auto" falls back to CPU when no GPU is present; the previous
        # hard-coded accelerator="gpu", devices=[0] crashed on CPU-only hosts
        trainer = Trainer(
            accelerator="auto",
            devices=1,
            max_epochs=value_config.num_epoch,
            enable_checkpointing=False,
            logger=False,
            gradient_clip_val=1.0,
            enable_progress_bar=False,
        )

        trainer.fit(value_network, datamodule)
        val_score = trainer.validate(value_network, datamodule.val_dataloader())[0]
        trainer.save_checkpoint(current_weights)

    print(f"Value network balanced accuracy: {val_score['val_balanced_accuracy']}")
249
+
250
+
251
def run_training(
    extracted_precursor: Dict[str, float] = None,
    value_config: ValueNetworkConfig = None,
) -> None:
    """Runs the training stage in value network tuning: builds the updating
    set from the extracted precursors and retrains the value network.

    :param extracted_precursor: The precursors extracted from the planning simulations.
    :param value_config: The value network configuration.
    :return: None.
    """

    datamodule = create_updating_set(
        extracted_precursor=extracted_precursor, batch_size=value_config.batch_size
    )
    tune_value_network(datamodule=datamodule, value_config=value_config)
269
+
270
+
271
def run_planning(
    targets_batch: List[MoleculeContainer],
    tree_config: TreeConfig,
    policy_config: PolicyNetworkConfig,
    value_config: ValueNetworkConfig,
    reaction_rules_path: str,
    building_blocks_path: str,
    targets_batch_id: int,
):
    """Performs the planning stage (tree search) for a batch of target
    molecules; the built trees are later mined for precursors to tune the
    value network in the training stage.

    :param targets_batch: The batch of target molecules.
    :param tree_config: The search tree configuration.
    :param policy_config: The policy network configuration.
    :param value_config: The value network configuration.
    :param reaction_rules_path: The path to the file with reaction rules.
    :param building_blocks_path: The path to the file with building blocks.
    :param targets_batch_id: The identifier of the batch being processed.
    :return: The list of built search trees.
    """
    from tqdm import tqdm

    print(f"\nProcess batch number {targets_batch_id}")
    tree_config.silent = False

    trees = []
    for target in tqdm(targets_batch):
        # best-effort over the batch: a failing target is reported and skipped
        try:
            trees.append(
                run_tree_search(
                    target=target,
                    tree_config=tree_config,
                    policy_config=policy_config,
                    value_config=value_config,
                    reaction_rules_path=reaction_rules_path,
                    building_blocks_path=building_blocks_path,
                )
            )
        except Exception as e:
            print(e)
            continue

    num_solved = sum(len(tree.winning_nodes) > 0 for tree in trees)
    print(f"Planning is finished with {num_solved} solved targets")

    return trees
317
+
318
+
319
def run_updating(
    targets_path: str,
    tree_config: TreeConfig,
    policy_config: PolicyNetworkConfig,
    value_config: ValueNetworkConfig,
    reinforce_config: TuningConfig,
    reaction_rules_path: str,
    building_blocks_path: str,
    results_root: str = None,
) -> None:
    """Performs updating of value network: alternates planning simulations on
    batches of targets with retraining of the value network on the precursors
    extracted from the built trees.

    :param targets_path: The path to the file with target molecules.
    :param tree_config: The search tree configuration.
    :param policy_config: The policy network configuration.
    :param value_config: The value network configuration.
    :param reinforce_config: The value network tuning configuration.
    :param reaction_rules_path: The path to the file with reaction rules.
    :param building_blocks_path: The path to the file with building blocks.
    :param results_root: The path to the directory where trained value network will be
        saved.
    :return: None.
    """

    # create results root folder
    # NOTE(review): Path(None) raises TypeError when results_root is omitted,
    # so the default of None is effectively unusable — confirm intent
    results_root = Path(results_root)
    if not results_root.exists():
        results_root.mkdir()

    # load targets list
    with MoleculeReader(targets_path) as targets:
        targets = list(targets)

    # create value neural network; its checkpoint is overwritten in place by
    # each training round (value_config is mutated here)
    value_config.weights_path = os.path.join(results_root, "value_network.ckpt")
    create_value_network(value_config)

    # create targets batch
    targets_batch_list = create_targets_batch(
        targets, batch_size=reinforce_config.batch_size
    )

    # run value network tuning
    for batch_id, targets_batch in enumerate(targets_batch_list, start=1):

        # start tree planning simulation for batch of targets
        tree_list = run_planning(
            targets_batch=targets_batch,
            tree_config=tree_config,
            policy_config=policy_config,
            value_config=value_config,
            reaction_rules_path=reaction_rules_path,
            building_blocks_path=building_blocks_path,
            targets_batch_id=batch_id,
        )

        # extract pos and neg precursor from the list of built trees
        extracted_precursor = extract_tree_precursor(tree_list)

        # train value network for extracted precursor
        run_training(extracted_precursor=extracted_precursor, value_config=value_config)
synplan/ml/training/supervised.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module for the preparation and training of a policy network used in the expansion of
2
+ nodes in tree search.
3
+
4
+ This module includes functions for creating training datasets and running the training
5
+ process for the policy network.
6
+ """
7
+
8
+ import warnings
9
+ from pathlib import Path
10
+ from typing import Union, List
11
+
12
+ import os
13
+ import torch
14
+ from pytorch_lightning import Trainer
15
+ from pytorch_lightning.callbacks import ModelCheckpoint
16
+ from torch.utils.data import random_split
17
+ from torch_geometric.data.lightning import LightningDataset
18
+
19
+ from synplan.ml.networks.policy import PolicyNetwork
20
+ from synplan.ml.training.preprocessing import (
21
+ FilteringPolicyDataset,
22
+ RankingPolicyDataset,
23
+ )
24
+ from synplan.utils.config import PolicyNetworkConfig
25
+ from synplan.utils.logging import DisableLogger, HiddenPrints
26
+
27
+ warnings.filterwarnings("ignore")
28
+
29
+
30
def create_policy_dataset(
    reaction_rules_path: str,
    molecules_or_reactions_path: str,
    output_path: str,
    dataset_type: str = "filtering",
    batch_size: int = 100,
    num_cpus: int = 1,
    training_data_ratio: float = 0.8,
):
    """
    Create a training dataset for a policy network.

    :param reaction_rules_path: Path to the reaction rules file.
    :param molecules_or_reactions_path: Path to the molecules or reactions file used to create the training set.
    :param output_path: Path to store the processed dataset.
    :param dataset_type: Type of the dataset to be created ('ranking' or 'filtering').
    :param batch_size: The size of batch of molecules/reactions.
    :param training_data_ratio: Ratio of training data to total data.
    :param num_cpus: Number of CPUs to use for data processing.
    :raises ValueError: If ``dataset_type`` is neither 'filtering' nor 'ranking'.

    :return: A `LightningDataset` object containing training and validation datasets.

    """

    # previously an unknown dataset_type fell through and raised an unrelated
    # NameError on full_dataset; fail fast with a clear message instead
    if dataset_type not in ("filtering", "ranking"):
        raise ValueError(
            f"Unknown dataset_type {dataset_type!r}; expected 'filtering' or 'ranking'"
        )

    with DisableLogger(), HiddenPrints():
        if dataset_type == "filtering":
            full_dataset = FilteringPolicyDataset(
                reaction_rules_path=reaction_rules_path,
                molecules_path=molecules_or_reactions_path,
                output_path=output_path,
                num_cpus=num_cpus,
            )
        else:  # "ranking"
            full_dataset = RankingPolicyDataset(
                reaction_rules_path=reaction_rules_path,
                reactions_path=molecules_or_reactions_path,
                output_path=output_path,
            )

    train_size = int(training_data_ratio * len(full_dataset))
    val_size = len(full_dataset) - train_size

    # seeded split for reproducibility
    train_dataset, val_dataset = random_split(
        full_dataset, [train_size, val_size], torch.Generator().manual_seed(42)
    )
    print(
        f"Training set size: {len(train_dataset)}, validation set size: {len(val_dataset)}"
    )

    datamodule = LightningDataset(
        train_dataset,
        val_dataset,
        batch_size=batch_size,
        pin_memory=True,
        drop_last=True,
    )

    return datamodule
89
+
90
+
91
def run_policy_training(
    datamodule: LightningDataset,
    config: PolicyNetworkConfig,
    results_path: str,
    weights_file_name: str = "policy_network",
    accelerator: str = "gpu",
    devices: Union[List[int], str, int] = "auto",
    silent: bool = False,
) -> None:
    """
    Trains a policy network using a given datamodule and training configuration.

    :param datamodule: A PyTorch Lightning `DataModule` class instance. It is responsible for loading, processing, and preparing the training data for the model.
    :param config: The configuration object with the settings for the policy training process.
    :param results_path: Path to store the training results and logs.
    :param accelerator: Supports passing different accelerator types ("cpu", "gpu", "tpu", "hpu", "mps", "auto") as well as custom accelerator instances. Default: "gpu".
    :param devices: The devices to use. Can be set to a positive number (int or str), a sequence of device indices (list or str), the value -1 to indicate all available devices should be used, or "auto" for automatic selection based on the chosen accelerator. Default: "auto".
    :param silent: Run in the silent mode with no progress bars. Default: False.
    :param weights_file_name: The name of weights file to be saved. Default: "policy_network".

    :return: None.

    """
    out_dir = Path(results_path)
    out_dir.mkdir(exist_ok=True)

    network = PolicyNetwork(
        vector_dim=config.vector_dim,
        n_rules=datamodule.train_dataset.dataset.num_classes,
        batch_size=config.batch_size,
        dropout=config.dropout,
        num_conv_layers=config.num_conv_layers,
        learning_rate=config.learning_rate,
        policy_type=config.policy_type,
    )

    # keep only the checkpoint with the best validation loss
    checkpoint = ModelCheckpoint(
        dirpath=out_dir, filename=weights_file_name, monitor="val_loss", mode="min"
    )

    trainer = Trainer(
        accelerator=accelerator,
        devices=devices,
        max_epochs=config.num_epoch,
        callbacks=[checkpoint],
        logger=False,
        gradient_clip_val=1.0,
        enable_progress_bar=not silent,
    )

    if silent:
        with DisableLogger(), HiddenPrints():
            trainer.fit(network, datamodule)
    else:
        trainer.fit(network, datamodule)

    ba = round(trainer.logged_metrics["train_balanced_accuracy_y_step"].item(), 3)
    print(f"Policy network balanced accuracy: {ba}")
synplan/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
from typing import Union
from os import PathLike

# Type alias for arguments accepting either a string path or a PathLike
# object (e.g. pathlib.Path).
path_type = Union[str, PathLike]
synplan/utils/config.py ADDED
@@ -0,0 +1,543 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing configuration classes."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+ from typing import Any, Dict, List, Union
7
+ from chython import smarts
8
+
9
+ import yaml
10
+ from CGRtools.containers import MoleculeContainer, QueryContainer
11
+
12
+
13
+ @dataclass
14
+ class ConfigABC(ABC):
15
+ """Abstract base class for configuration classes."""
16
+
17
+ @staticmethod
18
+ @abstractmethod
19
+ def from_dict(config_dict: Dict[str, Any]):
20
+ """Create an instance of the configuration from a dictionary."""
21
+
22
+ def to_dict(self) -> Dict[str, Any]:
23
+ """Convert the configuration into a dictionary."""
24
+ return {
25
+ k: str(v) if isinstance(v, Path) else v for k, v in self.__dict__.items()
26
+ }
27
+
28
+ @staticmethod
29
+ @abstractmethod
30
+ def from_yaml(file_path: str):
31
+ """Deserialize a YAML file into a configuration object."""
32
+
33
+ def to_yaml(self, file_path: str):
34
+ """Serializes the configuration to a YAML file.
35
+
36
+ :param file_path: The path to the output YAML file.
37
+ """
38
+ with open(file_path, "w", encoding="utf-8") as file:
39
+ yaml.dump(self.to_dict(), file)
40
+
41
+ @abstractmethod
42
+ def _validate_params(self, params: Dict[str, Any]):
43
+ """Validate configuration parameters."""
44
+
45
+ def __post_init__(self):
46
+ """Validates the configuration parameters."""
47
+ # call _validate_params method after initialization
48
+ params = self.to_dict()
49
+ self._validate_params(params)
50
+
51
+
52
@dataclass
class RuleExtractionConfig(ConfigABC):
    """Configuration class for extracting reaction rules.

    :param multicenter_rules: If True, extracts a single rule
        encompassing all centers. If False, extracts separate reaction
        rules for each reaction center in a multicenter reaction.
    :param as_query_container: If True, the extracted rules are
        generated as QueryContainer objects, analogous to SMARTS objects
        for pattern matching in chemical structures.
    :param reverse_rule: If True, reverses the direction of the reaction
        for rule extraction.
    :param reactor_validation: If True, validates each generated rule in
        a chemical reactor to ensure correct generation of products from
        reactants.
    :param include_func_groups: If True, includes specific functional
        groups in the reaction rule in addition to the reaction center
        and its environment.
    :param func_groups_list: A list of functional group SMARTS to be
        considered when include_func_groups is True (parsed into query
        objects in __post_init__).
    :param include_rings: If True, includes ring structures in the
        reaction rules.
    :param keep_leaving_groups: If True, retains leaving groups in the
        extracted reaction rule.
    :param keep_incoming_groups: If True, retains incoming groups in the
        extracted reaction rule.
    :param keep_reagents: If True, includes reagents in the extracted
        reaction rule.
    :param environment_atom_count: Defines the size of the environment
        around the reaction center to be included in the rule (0 for
        only the reaction center, 1 for the first environment, etc.).
    :param min_popularity: Minimum number of times a rule must be
        applied to be considered for further analysis.
    :param keep_metadata: If True, retains metadata associated with the
        reaction in the extracted rule.
    :param single_reactant_only: If True, includes only reaction rules
        with a single reactant molecule.
    :param atom_info_retention: Controls the amount of information about
        each atom to retain, per section ('reaction_center' and
        'environment'); missing entries are filled with defaults.
    """

    # default low-level parameters
    single_reactant_only: bool = True
    keep_metadata: bool = False
    reactor_validation: bool = True
    reverse_rule: bool = True
    as_query_container: bool = True
    include_func_groups: bool = False
    func_groups_list: List[str] = field(default_factory=list)

    # adjustable parameters
    environment_atom_count: int = 1
    min_popularity: int = 3
    include_rings: bool = True
    multicenter_rules: bool = True
    keep_leaving_groups: bool = True
    keep_incoming_groups: bool = True
    keep_reagents: bool = False
    atom_info_retention: Dict[str, Dict[str, bool]] = field(default_factory=dict)

    def __post_init__(self):
        # ConfigABC.__post_init__ already runs _validate_params; the previous
        # second, duplicate _validate_params call was removed.
        super().__post_init__()
        self._initialize_default_atom_info_retention()
        self._parse_functional_groups()

    def _initialize_default_atom_info_retention(self):
        """Fill atom_info_retention with defaults, preserving user overrides.

        Bug fix: the previous implementation updated each user-supplied
        section with itself (a no-op) and raised KeyError when a section
        was missing; defaults and user values are now merged properly.
        """
        default_atom_info = {
            "reaction_center": {
                "neighbors": True,
                "hybridization": True,
                "implicit_hydrogens": False,
                "ring_sizes": False,
            },
            "environment": {
                "neighbors": False,
                "hybridization": False,
                "implicit_hydrogens": False,
                "ring_sizes": False,
            },
        }

        user_info = self.atom_info_retention or {}
        merged = {}
        for section, defaults in default_atom_info.items():
            section_info = dict(defaults)
            section_info.update(user_info.get(section, {}))
            merged[section] = section_info
        self.atom_info_retention = merged

    def _parse_functional_groups(self):
        """Parse SMARTS strings into query objects; unparsable entries are skipped."""
        func_groups_list = []
        for group_smarts in self.func_groups_list:
            try:
                query = smarts(group_smarts)
                func_groups_list.append(query)
            except Exception as e:
                print(f"Functional group {group_smarts} was not parsed because of {e}")
        self.func_groups_list = func_groups_list

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]) -> "RuleExtractionConfig":
        """Create a configuration from a plain dictionary."""
        return RuleExtractionConfig(**config_dict)

    @staticmethod
    def from_yaml(file_path: str) -> "RuleExtractionConfig":
        """Deserialize a YAML file into a configuration object."""
        with open(file_path, "r", encoding="utf-8") as file:
            config_dict = yaml.safe_load(file)
        return RuleExtractionConfig.from_dict(config_dict)

    def _validate_params(self, params: Dict[str, Any]) -> None:
        """Validate parameter types and structure; raise ValueError on failure."""

        if not isinstance(params["multicenter_rules"], bool):
            raise ValueError("multicenter_rules must be a boolean.")

        if not isinstance(params["as_query_container"], bool):
            raise ValueError("as_query_container must be a boolean.")

        if not isinstance(params["reverse_rule"], bool):
            raise ValueError("reverse_rule must be a boolean.")

        if not isinstance(params["reactor_validation"], bool):
            raise ValueError("reactor_validation must be a boolean.")

        if not isinstance(params["include_func_groups"], bool):
            raise ValueError("include_func_groups must be a boolean.")

        if params["func_groups_list"] is not None and not all(
            isinstance(group, str) for group in params["func_groups_list"]
        ):
            raise ValueError("func_groups_list must be a list of SMARTS.")

        if not isinstance(params["include_rings"], bool):
            raise ValueError("include_rings must be a boolean.")

        if not isinstance(params["keep_leaving_groups"], bool):
            raise ValueError("keep_leaving_groups must be a boolean.")

        if not isinstance(params["keep_incoming_groups"], bool):
            raise ValueError("keep_incoming_groups must be a boolean.")

        if not isinstance(params["keep_reagents"], bool):
            raise ValueError("keep_reagents must be a boolean.")

        if not isinstance(params["environment_atom_count"], int):
            raise ValueError("environment_atom_count must be an integer.")

        if not isinstance(params["min_popularity"], int):
            raise ValueError("min_popularity must be an integer.")

        if not isinstance(params["keep_metadata"], bool):
            raise ValueError("keep_metadata must be a boolean.")

        if not isinstance(params["single_reactant_only"], bool):
            raise ValueError("single_reactant_only must be a boolean.")

        # Bug fix: the default value is an empty dict, which previously
        # failed the required-keys check (the test was `is not None`).
        # Only a non-empty user-provided mapping is validated here.
        if params["atom_info_retention"]:
            if not isinstance(params["atom_info_retention"], dict):
                raise ValueError("atom_info_retention must be a dictionary.")

            required_keys = {"reaction_center", "environment"}
            if not required_keys.issubset(params["atom_info_retention"]):
                missing_keys = required_keys - set(params["atom_info_retention"].keys())
                raise ValueError(
                    f"atom_info_retention missing required keys: {missing_keys}"
                )

            for key, value in params["atom_info_retention"].items():
                if key not in required_keys:
                    raise ValueError(f"Unexpected key in atom_info_retention: {key}")

                expected_subkeys = {
                    "neighbors",
                    "hybridization",
                    "implicit_hydrogens",
                    "ring_sizes",
                }
                if not isinstance(value, dict) or not expected_subkeys.issubset(value):
                    missing_subkeys = expected_subkeys - set(value.keys())
                    raise ValueError(
                        f"Invalid structure for {key} in atom_info_retention. Missing subkeys: {missing_subkeys}"
                    )

                for subkey, subvalue in value.items():
                    if not isinstance(subvalue, bool):
                        raise ValueError(
                            f"Value for {subkey} in {key} of atom_info_retention must be boolean."
                        )
241
+
242
+
243
@dataclass
class PolicyNetworkConfig(ConfigABC):
    """Configuration class for the policy network.

    :param policy_type: Mode of operation, either 'filtering' or 'ranking'.
    :param vector_dim: Dimension of the input vectors.
    :param batch_size: Number of samples per batch.
    :param dropout: Dropout rate for regularization.
    :param learning_rate: Learning rate for the optimizer.
    :param num_conv_layers: Number of convolutional layers in the network.
    :param num_epoch: Number of training epochs.
    :param weights_path: Optional path to pretrained network weights.
    :param priority_rules_fraction: Fraction of priority rules
        (used by the filtering policy only).
    :param rule_prob_threshold: Minimum rule probability to keep
        (used by the filtering policy only).
    :param top_rules: Number of top-ranked rules to keep
        (used by the filtering policy only).
    """

    policy_type: str = "ranking"
    vector_dim: int = 256
    batch_size: int = 500
    dropout: float = 0.4
    learning_rate: float = 0.008
    num_conv_layers: int = 5
    num_epoch: int = 100
    weights_path: Union[str, None] = None  # annotation fixed: default is None

    # for filtering policy
    priority_rules_fraction: float = 0.5
    rule_prob_threshold: float = 0.0
    top_rules: int = 50

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]) -> "PolicyNetworkConfig":
        """Create a configuration from a plain dictionary."""
        return PolicyNetworkConfig(**config_dict)

    @staticmethod
    def from_yaml(file_path: str) -> "PolicyNetworkConfig":
        """Deserialize a YAML file into a configuration object."""
        with open(file_path, "r", encoding="utf-8") as file:
            config_dict = yaml.safe_load(file)
        return PolicyNetworkConfig.from_dict(config_dict)

    def _validate_params(self, params: Dict[str, Any]):
        """Validate parameter types and ranges; raise ValueError on failure."""

        if params["policy_type"] not in ["filtering", "ranking"]:
            raise ValueError("policy_type must be either 'filtering' or 'ranking'.")

        if not isinstance(params["vector_dim"], int) or params["vector_dim"] <= 0:
            raise ValueError("vector_dim must be a positive integer.")

        if not isinstance(params["batch_size"], int) or params["batch_size"] <= 0:
            raise ValueError("batch_size must be a positive integer.")

        if (
            not isinstance(params["num_conv_layers"], int)
            or params["num_conv_layers"] <= 0
        ):
            raise ValueError("num_conv_layers must be a positive integer.")

        if not isinstance(params["num_epoch"], int) or params["num_epoch"] <= 0:
            raise ValueError("num_epoch must be a positive integer.")

        if not isinstance(params["dropout"], float) or not (
            0.0 <= params["dropout"] <= 1.0
        ):
            raise ValueError("dropout must be a float between 0.0 and 1.0.")

        if (
            not isinstance(params["learning_rate"], float)
            or params["learning_rate"] <= 0.0
        ):
            raise ValueError("learning_rate must be a positive float.")

        if (
            not isinstance(params["priority_rules_fraction"], float)
            or params["priority_rules_fraction"] < 0.0
        ):
            # message fixed: was the contradictory "non-negative positive float"
            raise ValueError("priority_rules_fraction must be a non-negative float.")

        if (
            not isinstance(params["rule_prob_threshold"], float)
            or params["rule_prob_threshold"] < 0.0
        ):
            raise ValueError("rule_prob_threshold must be a non-negative float.")

        if not isinstance(params["top_rules"], int) or params["top_rules"] <= 0:
            raise ValueError("top_rules must be a positive integer.")
327
+
328
+
329
@dataclass
class ValueNetworkConfig(ConfigABC):
    """Configuration class for the value network.

    :param weights_path: Optional path to pretrained network weights.
    :param vector_dim: Dimension of the input vectors.
    :param batch_size: Number of samples per batch.
    :param dropout: Dropout rate for regularization.
    :param learning_rate: Learning rate for the optimizer.
    :param num_conv_layers: Number of convolutional layers in the network.
    :param num_epoch: Number of training epochs.
    """

    weights_path: Union[str, None] = None  # annotation fixed: default is None
    vector_dim: int = 256
    batch_size: int = 500
    dropout: float = 0.4
    learning_rate: float = 0.008
    num_conv_layers: int = 5
    num_epoch: int = 100

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]) -> "ValueNetworkConfig":
        """Create a configuration from a plain dictionary."""
        return ValueNetworkConfig(**config_dict)

    @staticmethod
    def from_yaml(file_path: str) -> "ValueNetworkConfig":
        """Deserialize a YAML file into a configuration object."""
        with open(file_path, "r", encoding="utf-8") as file:
            config_dict = yaml.safe_load(file)
        return ValueNetworkConfig.from_dict(config_dict)

    # NOTE: the redundant to_yaml override (byte-identical to
    # ConfigABC.to_yaml) was removed; serialization is inherited.

    def _validate_params(self, params: Dict[str, Any]):
        """Validate parameter types and ranges; raise ValueError on failure."""

        if not isinstance(params["vector_dim"], int) or params["vector_dim"] <= 0:
            raise ValueError("vector_dim must be a positive integer.")

        if not isinstance(params["batch_size"], int) or params["batch_size"] <= 0:
            raise ValueError("batch_size must be a positive integer.")

        if (
            not isinstance(params["num_conv_layers"], int)
            or params["num_conv_layers"] <= 0
        ):
            raise ValueError("num_conv_layers must be a positive integer.")

        if not isinstance(params["num_epoch"], int) or params["num_epoch"] <= 0:
            raise ValueError("num_epoch must be a positive integer.")

        if not isinstance(params["dropout"], float) or not (
            0.0 <= params["dropout"] <= 1.0
        ):
            raise ValueError("dropout must be a float between 0.0 and 1.0.")

        if (
            not isinstance(params["learning_rate"], float)
            or params["learning_rate"] <= 0.0
        ):
            raise ValueError("learning_rate must be a positive float.")
390
+
391
+
392
@dataclass
class TuningConfig(ConfigABC):
    """Configuration class for the network training.

    :param batch_size: The number of targets per batch in the planning simulation step.
    :param num_simulations: The number of planning simulations.
    """

    batch_size: int = 100
    num_simulations: int = 1

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]) -> "TuningConfig":
        """Create a configuration from a plain dictionary."""
        return TuningConfig(**config_dict)

    @staticmethod
    def from_yaml(file_path: str) -> "TuningConfig":
        """Deserialize a YAML file into a configuration object."""
        with open(file_path, "r", encoding="utf-8") as file:
            config_dict = yaml.safe_load(file)
        return TuningConfig.from_dict(config_dict)

    def _validate_params(self, params: Dict[str, Any]):
        """Validate parameter types and ranges; raise ValueError on failure."""

        if not isinstance(params["batch_size"], int) or params["batch_size"] <= 0:
            raise ValueError("batch_size must be a positive integer.")

        # previously unvalidated
        if (
            not isinstance(params["num_simulations"], int)
            or params["num_simulations"] <= 0
        ):
            raise ValueError("num_simulations must be a positive integer.")
417
+
418
+
419
@dataclass
class TreeConfig(ConfigABC):
    """Configuration class for the tree search algorithm.

    :param max_iterations: The number of iterations to run the algorithm
        for.
    :param max_tree_size: The maximum number of nodes in the tree.
    :param max_time: The time limit (in seconds) for the algorithm to
        run.
    :param max_depth: The maximum depth of the tree.
    :param ucb_type: Type of UCB used in the search algorithm. Options
        are "puct", "uct", "value", defaults to "uct".
    :param c_ucb: The exploration-exploitation balance coefficient used
        in Upper Confidence Bound (UCB).
    :param backprop_type: Type of backpropagation algorithm. Options are
        "muzero", "cumulative", defaults to "muzero".
    :param search_strategy: The strategy used for tree search. Options
        are "expansion_first", "evaluation_first".
    :param exclude_small: Whether to exclude small molecules during the
        search.
    :param evaluation_agg: Method for aggregating evaluation scores.
        Options are "max", "average", defaults to "max".
    :param evaluation_type: The method used for evaluating nodes.
        Options are "random", "rollout", "gcn".
    :param init_node_value: Initial value for a new node.
    :param epsilon: Chance of random selection of reaction rules during
        the selection stage (epsilon-greedy UCB estimation); balances
        exploration and exploitation. Must be in [0, 1].
    :param min_mol_size: Minimum size of a molecule that has to be
        synthesized. Molecules with this many or fewer heavy atoms are
        assumed to be building blocks by definition, defaults to 6.
    :param silent: Whether to suppress progress output.
    """

    max_iterations: int = 100
    max_tree_size: int = 1000000
    max_time: float = 600
    max_depth: int = 6
    ucb_type: str = "uct"
    c_ucb: float = 0.1
    backprop_type: str = "muzero"
    search_strategy: str = "expansion_first"
    exclude_small: bool = True
    evaluation_agg: str = "max"
    evaluation_type: str = "gcn"
    init_node_value: float = 0.0
    epsilon: float = 0.0
    min_mol_size: int = 6
    silent: bool = False

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]) -> "TreeConfig":
        """Create a configuration from a plain dictionary."""
        return TreeConfig(**config_dict)

    @staticmethod
    def from_yaml(file_path: str) -> "TreeConfig":
        """Deserialize a YAML file into a configuration object."""
        with open(file_path, "r", encoding="utf-8") as file:
            config_dict = yaml.safe_load(file)
        return TreeConfig.from_dict(config_dict)

    def _validate_params(self, params):
        """Validate parameter types and ranges; raise on failure."""
        if params["ucb_type"] not in ["puct", "uct", "value"]:
            raise ValueError(
                "Invalid ucb_type. Allowed values are 'puct', 'uct', 'value'."
            )
        if params["backprop_type"] not in ["muzero", "cumulative"]:
            raise ValueError(
                "Invalid backprop_type. Allowed values are 'muzero', 'cumulative'."
            )
        if params["evaluation_type"] not in ["random", "rollout", "gcn"]:
            raise ValueError(
                "Invalid evaluation_type. Allowed values are 'random', 'rollout', 'gcn'."
            )
        if params["evaluation_agg"] not in ["max", "average"]:
            raise ValueError(
                "Invalid evaluation_agg. Allowed values are 'max', 'average'."
            )
        if not isinstance(params["c_ucb"], float):
            raise TypeError("c_ucb must be a float.")
        if not isinstance(params["max_depth"], int) or params["max_depth"] < 1:
            raise ValueError("max_depth must be a positive integer.")
        if not isinstance(params["max_tree_size"], int) or params["max_tree_size"] < 1:
            raise ValueError("max_tree_size must be a positive integer.")
        if (
            not isinstance(params["max_iterations"], int)
            or params["max_iterations"] < 1
        ):
            raise ValueError("max_iterations must be a positive integer.")
        # Fixed: the annotation is float, but the check rejected float values;
        # accept any positive number.
        if (
            not isinstance(params["max_time"], (int, float))
            or params["max_time"] <= 0
        ):
            raise ValueError("max_time must be a positive number.")
        if not isinstance(params["exclude_small"], bool):
            raise TypeError("exclude_small must be a boolean.")
        if not isinstance(params["silent"], bool):
            raise TypeError("silent must be a boolean.")
        if not isinstance(params["init_node_value"], float):
            raise TypeError("init_node_value must be a float if provided.")
        if params["search_strategy"] not in ["expansion_first", "evaluation_first"]:
            raise ValueError(
                f"Invalid search_strategy: {params['search_strategy']}: "
                f"Allowed values are 'expansion_first', 'evaluation_first'"
            )
        # Fixed: the original condition `0 >= epsilon >= 1` can never be true,
        # so the [0, 1] range was never enforced (and the message was garbled).
        if not isinstance(params["epsilon"], float) or not (
            0.0 <= params["epsilon"] <= 1.0
        ):
            raise ValueError("epsilon must be a float between 0 and 1.")
        if not isinstance(params["min_mol_size"], int) or params["min_mol_size"] < 0:
            raise ValueError("min_mol_size must be a non-negative integer.")
528
+
529
+
530
def convert_config_to_dict(config_attr: ConfigABC, config_type) -> Dict | None:
    """Normalize a configuration attribute to a plain dictionary.

    :param config_attr: The configuration attribute to be converted.
    :param config_type: The configuration class to check against.
    :return: The attribute as a dictionary; None when it is neither a dict
        nor an instance of ``config_type``.
    """
    if isinstance(config_attr, dict):
        return config_attr
    return config_attr.to_dict() if isinstance(config_attr, config_type) else None
synplan/utils/files.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing classes and functions needed for reactions/molecules data
2
+ reading/writing."""
3
+
4
+ from os.path import splitext
5
+ from pathlib import Path
6
+ from typing import Iterable, Union
7
+
8
+ from CGRtools import smiles
9
+ from CGRtools.containers import CGRContainer, MoleculeContainer, ReactionContainer
10
+ from CGRtools.files.RDFrw import RDFRead, RDFWrite
11
+ from CGRtools.files.SDFrw import SDFRead, SDFWrite
12
+
13
+
14
class FileHandler:
    """General class to handle chemical files."""

    def __init__(self, filename: Union[str, Path], **kwargs):
        """Determine the file type from the filename extension.

        :param filename: The path and name of the file.
        :return: None.
        """
        # Subclasses are responsible for opening the underlying file object.
        self._file = None
        extension = splitext(filename)[1]
        known_types = {".smi": "SMI", ".smiles": "SMI", ".rdf": "RDF", ".sdf": "SDF"}
        if extension not in known_types:
            raise ValueError("I don't know the file extension,", extension)
        self._file_type = known_types[extension]

    def close(self):
        """Close the underlying file object."""
        self._file.close()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
36
+
37
+
38
class Reader(FileHandler):
    """General class to read reactions/molecules data files.

    Delegates iteration and length to the concrete reader object stored in
    ``self._file`` by subclasses (SMILES/RDF/SDF readers).
    """

    def __init__(self, filename: Union[str, Path], **kwargs):
        """General class to read reactions/molecules data files.

        :param filename: The path and name of the file.
        :return: None.
        """
        super().__init__(filename, **kwargs)

    def __enter__(self):
        # NOTE: unlike Writer, entering a Reader yields the wrapped reader
        # object itself, not this wrapper.
        return self._file

    def __iter__(self):
        return iter(self._file)

    def __next__(self):
        return next(self._file)

    def __len__(self):
        return len(self._file)
58
+
59
+
60
class SMILESRead:
    """Line-oriented reader for SMILES files (one record per line)."""

    def __init__(self, filename: Union[str, Path], **kwargs):
        """Simplified class to read files containing a SMILES (Molecules or Reaction)
        string per line.

        :param filename: The path and name of the SMILES file to parse.
        :return: None.
        """
        # strict=True: fail fast if the file does not exist.
        filename = str(Path(filename).resolve(strict=True))
        self._file = open(filename, "r", encoding="utf-8")
        self._data = self.__data()

    def __data(
        self,
    ) -> Iterable[Union[ReactionContainer, CGRContainer, MoleculeContainer]]:
        # Lazily parse the file line by line; the raw input SMILES is kept
        # in each parsed object's meta under "init_smiles".
        # NOTE(review): lines parsing to other types appear to be silently
        # skipped — confirm this is intended.
        for line in iter(self._file.readline, ""):
            line = line.strip()
            x = smiles(line)
            if isinstance(x, (ReactionContainer, CGRContainer, MoleculeContainer)):
                x.meta["init_smiles"] = line
                yield x

    def __enter__(self):
        return self

    def read(self):
        """Parse the whole SMILES file.

        :return: List of parsed molecules or reactions.
        """
        return list(iter(self))

    def __iter__(self):
        return (x for x in self._data)

    def __next__(self):
        return next(iter(self))

    def close(self):
        """Close the underlying file object."""
        self._file.close()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
103
+
104
+
105
class Writer(FileHandler):
    """General class to write chemical files."""

    def __init__(self, filename: Union[str, Path], mapping: bool = True, **kwargs):
        """General class to write chemical files.

        :param filename: The path and name of the file.
        :param mapping: Whenever to save mapping or not.
        :return: None.
        """
        super().__init__(filename, **kwargs)
        # Whether atom-atom mapping should be preserved on write.
        self._mapping = mapping

    def __enter__(self):
        return self
118
+
119
+
120
class ReactionReader(Reader):
    """Reader for reaction files (SMILES or RDF)."""

    def __init__(self, filename: Union[str, Path], **kwargs):
        """Class to read reaction files.

        :param filename: The path and name of the file.
        :return: None.
        """
        super().__init__(filename, **kwargs)
        if self._file_type == "RDF":
            self._file = RDFRead(filename, indexable=True, **kwargs)
            return
        if self._file_type == "SMI":
            self._file = SMILESRead(filename, **kwargs)
            return
        raise ValueError("File type incompatible -", filename)
134
+
135
+
136
class ReactionWriter(Writer):
    """Writer for reaction files (SMILES or RDF)."""

    def __init__(self, filename: Union[str, Path], mapping: bool = True, **kwargs):
        """Class to write reaction files.

        :param filename: The path and name of the file.
        :param mapping: Whenever to save mapping or not.
        :return: None.
        """
        super().__init__(filename, mapping, **kwargs)
        if self._file_type == "RDF":
            self._file = RDFWrite(filename, append=False, **kwargs)
            return
        if self._file_type == "SMI":
            self._file = open(filename, "w", encoding="utf-8", **kwargs)
            return
        raise ValueError("File type incompatible -", filename)

    def write(self, reaction: ReactionContainer):
        """Write one reaction record to the file.

        :param reaction: The reaction to be written.
        :return: None.
        """
        if self._file_type == "RDF":
            self._file.write(reaction)
        elif self._file_type == "SMI":
            record = to_reaction_smiles_record(reaction)
            self._file.write(record + "\n")
163
+
164
+
165
class MoleculeReader(Reader):
    """Reader for molecule files (SMILES or SDF)."""

    def __init__(self, filename: Union[str, Path], **kwargs):
        """Class to read molecule files.

        :param filename: The path and name of the file.
        :return: None.
        """
        super().__init__(filename, **kwargs)
        if self._file_type == "SDF":
            self._file = SDFRead(filename, indexable=True, **kwargs)
            return
        if self._file_type == "SMI":
            self._file = SMILESRead(filename, ignore=True, **kwargs)
            return
        raise ValueError("File type incompatible -", filename)
179
+
180
+
181
class MoleculeWriter(Writer):
    """Writer for molecule files (SMILES or SDF)."""

    def __init__(self, filename: Union[str, Path], mapping: bool = True, **kwargs):
        """Class to write molecule files.

        :param filename: The path and name of the file.
        :param mapping: Whenever to save mapping or not.
        :return: None.
        """
        super().__init__(filename, mapping, **kwargs)
        if self._file_type == "SDF":
            self._file = SDFWrite(filename, append=False, **kwargs)
            return
        if self._file_type == "SMI":
            self._file = open(filename, "w", encoding="utf-8", **kwargs)
            return
        raise ValueError("File type incompatible -", filename)

    def write(self, molecule: MoleculeContainer):
        """Write one molecule record to the file.

        :param molecule: The molecule to be written.
        :return: None.
        """
        if self._file_type == "SDF":
            self._file.write(molecule)
        elif self._file_type == "SMI":
            self._file.write(str(molecule) + "\n")
208
+
209
+
210
def to_reaction_smiles_record(reaction: ReactionContainer) -> str:
    """Converts the reaction to the SMILES record. Needed for reaction/molecule writers.

    The record is a tab-separated line: the mapped reaction SMILES followed by
    the metadata values sorted by key, each flattened to a single line
    (newlines replaced by ';').

    :param reaction: The reaction to be written (strings pass through unchanged).
    :return: The SMILES record to be written.
    """

    if isinstance(reaction, str):
        return reaction

    reaction_record = [format(reaction, "m")]
    # Bug fix: meta_info was previously reset to "" inside the loop, so every
    # metadata column was written empty; the actual value is now written.
    for _, meta_info in sorted(reaction.meta.items(), key=lambda x: x[0]):
        reaction_record.append(";".join(str(meta_info).split("\n")))
    return "\t".join(reaction_record)
synplan/utils/loading.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing functions for loading reaction rules, building blocks and
2
+ retrosynthetic models."""
3
+
4
+ import functools
5
+ import pickle
6
+ import zipfile
7
+ from pathlib import Path
8
+ from typing import List, Set, Union
9
+
10
+ from CGRtools.reactor.reactor import Reactor
11
+ from torch import device
12
+ from huggingface_hub import hf_hub_download, snapshot_download
13
+ from tqdm import tqdm
14
+
15
+ from synplan.ml.networks.policy import PolicyNetwork
16
+ from synplan.ml.networks.value import ValueNetwork
17
+ from synplan.utils.files import MoleculeReader
18
+
19
+
20
def download_unpack_data(filename, subfolder, save_to="."):
    """Download a file from the SynPlanner Hugging Face repository, extracting it
    when it is a zip archive.

    :param filename: Name of the file in the repository.
    :param subfolder: Repository subfolder containing the file.
    :param save_to: Local directory to download into (created if missing).
    :return: Path to the downloaded (and, for archives, extracted) file.
    """
    # Bug fix: the directory was only created when save_to was a str; a Path
    # argument skipped mkdir entirely. Normalize any path-like input.
    save_to = Path(save_to).resolve()
    save_to.mkdir(parents=True, exist_ok=True)

    # Download the (possibly zipped) file from the repository.
    file_path = Path(
        hf_hub_download(
            repo_id="Laboratoire-De-Chemoinformatique/SynPlanner",
            filename=filename,
            subfolder=subfolder,
            local_dir=save_to,
        )
    )

    if file_path.suffix != ".zip":
        return file_path

    with zipfile.ZipFile(file_path, "r") as zip_ref:
        # Extract the single file in the zip.
        zip_ref.extractall(save_to)
        extracted_file = save_to / zip_ref.namelist()[0]

    # Remove the archive once its contents have been extracted.
    file_path.unlink()
    return extracted_file
45
+
46
+
47
def download_all_data(save_to="."):
    """Download the full SynPlanner data snapshot and extract every zip archive,
    skipping files that are already present on disk.

    :param save_to: Local directory to download the snapshot into.
    """
    snapshot_dir = Path(
        snapshot_download(
            repo_id="Laboratoire-De-Chemoinformatique/SynPlanner", local_dir=save_to
        )
    ).resolve()

    for archive in snapshot_dir.rglob("*.zip"):
        with zipfile.ZipFile(archive, "r") as zip_ref:
            for member in zip_ref.namelist():
                target = archive.parent / member
                # Only extract members that do not already exist locally.
                if target.exists():
                    continue
                zip_ref.extract(member, archive.parent)
                print(f"Extracted {member} to {archive.parent}")
63
+
64
+
65
@functools.lru_cache(maxsize=None)
def load_reaction_rules(file: str) -> List[Reactor]:
    """Loads the reaction rules from a pickle file and converts them into a list of
    Reactor objects if necessary.

    :param file: The path to the pickle file that stores the reaction rules.
    :return: A list of reaction rules as Reactor objects.
    """

    with open(file, "rb") as f:
        rules = pickle.load(f)

    # Legacy rule files store (template, metadata) tuples; wrap the templates
    # into Reactor objects. Files already holding Reactors are returned as-is.
    if isinstance(rules[0][0], Reactor):
        return rules
    return [Reactor(template) for template, _ in rules]
81
+
82
+
83
@functools.lru_cache(maxsize=None)
def load_building_blocks(
    building_blocks_path: Union[str, Path], standardize: bool = True
) -> Set[str]:
    """Loads building blocks data from a file and returns a set of building
    blocks SMILES.

    :param building_blocks_path: The path to the file containing the building blocks.
    :param standardize: Flag if building blocks have to be standardized before loading. Default=True.
    :return: The set of building blocks smiles.
    :raises ValueError: If the file is not a .smi/.smiles file.
    """

    building_blocks_path = Path(building_blocks_path).resolve()
    # Explicit check instead of `assert`, which is silently stripped when
    # Python runs with optimizations (-O).
    if building_blocks_path.suffix not in (".smi", ".smiles"):
        raise ValueError(
            f"Building blocks file must be .smi or .smiles, got: {building_blocks_path}"
        )

    building_blocks_smiles = set()
    if standardize:
        with MoleculeReader(building_blocks_path) as molecules:
            for mol in tqdm(
                molecules,
                desc="Number of building blocks processed: ",
                bar_format="{desc}{n} [{elapsed}]",
            ):
                try:
                    mol.canonicalize()
                    mol.clean_stereo()
                    building_blocks_smiles.add(str(mol))
                except Exception:  # mol.canonicalize() / InvalidAromaticRing
                    continue
    else:
        with open(building_blocks_path, "r", encoding="utf-8") as inp:
            for line in inp:
                fields = line.split()
                # Skip blank lines (previously raised IndexError); the first
                # whitespace-separated field is the SMILES string.
                if fields:
                    building_blocks_smiles.add(fields[0])

    return building_blocks_smiles
122
+
123
+
124
def load_value_net(
    model_class: ValueNetwork, value_network_path: Union[str, Path]
) -> ValueNetwork:
    """Loads the value network.

    :param value_network_path: The path to the file storing value network weights.
    :param model_class: The model class to be loaded.
    :return: The loaded value network.
    """

    # Force loading onto CPU so no GPU is required at inference time.
    map_location = device("cpu")
    # NOTE(review): map_location is passed positionally; Lightning's
    # load_from_checkpoint expects it as a keyword argument — confirm this
    # binds to the intended parameter.
    return model_class.load_from_checkpoint(value_network_path, map_location)
136
+
137
+
138
def load_policy_net(
    model_class: PolicyNetwork, policy_network_path: Union[str, Path]
) -> PolicyNetwork:
    """Loads the policy network.

    :param policy_network_path: The path to the file storing policy network weights.
    :param model_class: The model class to be loaded.
    :return: The loaded policy network.
    """

    # Force loading onto CPU so no GPU is required at inference time.
    map_location = device("cpu")
    # NOTE(review): map_location is passed positionally; Lightning's
    # load_from_checkpoint expects it as a keyword argument — confirm this
    # binds to the intended parameter. batch_size=1 overrides the stored
    # hyperparameter for single-sample inference.
    return model_class.load_from_checkpoint(
        policy_network_path, map_location, batch_size=1
    )
synplan/utils/logging.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Generic logging helpers for scripts, notebooks and Ray clusters.
3
+ """
4
+
5
+ from __future__ import annotations
6
+ import logging, sys, os, warnings
7
+ from pathlib import Path
8
+ from datetime import datetime
9
+ from typing import Iterable, Optional
10
+ from IPython import get_ipython
11
+
12
+
13
+ # --------------------------------------------------------------------------- #
14
+ # Helper classes #
15
+ # --------------------------------------------------------------------------- #
16
+
17
+
18
class DisableLogger:
    """Context manager that mutes *all* logging while its block executes."""

    def __enter__(self):
        # Disabling at CRITICAL suppresses every standard logging level.
        logging.disable(logging.CRITICAL)

    def __exit__(self, exc_type, exc_value, traceback):
        # Lift the global threshold again on exit.
        logging.disable(logging.NOTSET)
26
+
27
+
28
class HiddenPrints:
    """Context manager that discards anything printed to stdout in its scope."""

    def __enter__(self):
        # Keep a handle on the real stdout, then point prints at the null device.
        self._orig = sys.stdout
        sys.stdout = open(os.devnull, "w")

    def __exit__(self, exc_type, exc_value, traceback):
        # Close the devnull stream and restore the original stdout.
        sys.stdout.close()
        sys.stdout = self._orig
38
+
39
+
40
+ # --------------------------------------------------------------------------- #
41
+ # Notebook‑aware console handler #
42
+ # --------------------------------------------------------------------------- #
43
+
44
+
45
def _in_notebook() -> bool:
    """Return True when running inside a Jupyter (ZMQ) notebook kernel."""
    shell = get_ipython()
    if not shell:
        return False
    return shell.__class__.__name__ == "ZMQInteractiveShell"
48
+
49
+
50
class TqdmHandler(logging.StreamHandler):
    """StreamHandler variant that routes records through ``tqdm.write``.

    This keeps active tqdm progress bars intact; when tqdm is not installed
    it degrades to the plain StreamHandler behaviour.
    """

    def emit(self, record):
        try:
            from tqdm import tqdm
        except ModuleNotFoundError:
            # No tqdm available: fall back to the ordinary stream emit.
            super().emit(record)
        else:
            tqdm.write(self.format(record), end=self.terminator)
60
+
61
+
62
+ # --------------------------------------------------------------------------- #
63
+ # Public initialisation API #
64
+ # --------------------------------------------------------------------------- #
65
+
66
+
67
def init_logger(
    *,
    name: str = "app",
    console_level: str | int = "ERROR",
    file_level: str | int = "INFO",
    log_dir: str | os.PathLike = ".",
    redirect_tqdm: bool = True,
) -> tuple[logging.Logger, str | None]:
    """
    Initialise (or fetch) a namespaced logger that works in scripts &
    notebooks. Idempotent - safe to call multiple times.

    :param name: Logger namespace passed to ``logging.getLogger``.
    :param console_level: Threshold for the console/notebook handler.
    :param file_level: Threshold for the per-session file handler.
    :param log_dir: Directory in which the timestamped log file is created.
    :param redirect_tqdm: Route console output through ``tqdm.write`` when
        tqdm is active, so log lines do not break progress bars.

    Returns
    -------
    tuple[logging.Logger, str | None]
        The configured logger and the path of its log file. On repeated
        calls the existing file handler's path is returned (``None`` if the
        logger was configured elsewhere without a file handler).
    """
    logger = logging.getLogger(name)
    if logger.handlers:
        # Already configured: return the same (logger, log_file_path) shape
        # as the first call. Previously this branch returned the bare logger
        # while the main path returned a tuple, breaking tuple unpacking on
        # the second call.
        for handler in logger.handlers:
            if isinstance(handler, logging.FileHandler):
                return logger, handler.baseFilename
        return logger, None

    logger.setLevel("DEBUG")  # capture everything; handlers filter

    # console / notebook handler
    if _in_notebook() or (redirect_tqdm and "tqdm" in sys.modules):
        ch: logging.Handler = TqdmHandler()
    else:
        ch = logging.StreamHandler(sys.stderr)
    ch.setLevel(console_level)
    ch.setFormatter(
        logging.Formatter(
            "%(asctime)s | %(levelname)-8s | %(message)s",
            datefmt="%H:%M:%S",
        )
    )
    logger.addHandler(ch)

    # file handler (one timestamped file per session)
    Path(log_dir).mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    fh = logging.FileHandler(Path(log_dir) / f"{name}_{stamp}.log", encoding="utf-8")
    fh.setLevel(file_level)
    fh.setFormatter(
        logging.Formatter(
            "%(asctime)s | %(name)s | %(levelname)-8s | %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    )
    logger.addHandler(fh)

    log_file_path = fh.baseFilename
    logger.info("Logging initialised → %s", log_file_path)
    return logger, log_file_path
121
+
122
+
123
+ # --------------------------------------------------------------------------- #
124
+ # Optional Ray‑specific configuration helpers #
125
+ # --------------------------------------------------------------------------- #
126
+
127
+
128
def init_ray_logging(
    *,
    python_level: str | int = "ERROR",
    backend_level: str = "error",
    log_to_driver: bool = False,
    filter_userwarnings: bool = True,
) -> "ray.LoggingConfig":
    """
    Prepare environment + Ray LoggingConfig **before** `ray.init()`.

    Returns
    -------
    ray.LoggingConfig
        Pass as `logging_config=` argument to `ray.init()`.
    """
    # Quiet the C++ backend (raylet / plasma); the env var must be in place
    # before ray itself is imported.
    os.environ.setdefault("RAY_BACKEND_LOG_LEVEL", backend_level)

    # Optionally hide noisy UserWarnings raised inside workers.
    if filter_userwarnings:
        warnings.filterwarnings("ignore", category=UserWarning)

    import ray  # deferred so ray stays an optional dependency

    # Raise the threshold of every Ray-related Python logger.
    noisy_loggers: Iterable[str] = (
        "ray",
        "ray.worker",
        "ray.runtime",
        "ray.dashboard",
        "ray.tune",
        "ray.serve",
    )
    for logger_name in noisy_loggers:
        logging.getLogger(logger_name).setLevel(python_level)

    # Build a LoggingConfig that propagates these settings to workers.
    return ray.LoggingConfig(
        log_to_driver=log_to_driver,
        log_level=python_level,
    )
169
+
170
+
171
def silence_logger(
    logger_name: str,
    level: int | str = logging.ERROR,
):
    """Raise the logging threshold of a chatty third-party logger.

    Call at the *top* of every ``@ray.remote`` function or actor
    ``__init__`` so the silencing takes effect inside the worker process.
    """
    target = logging.getLogger(logger_name)
    target.setLevel(level)
synplan/utils/visualisation.py ADDED
@@ -0,0 +1,1365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing functions for analysis and visualization of the built tree."""
2
+
3
+ import base64
4
+ from itertools import count, islice
5
+ from collections import deque
6
+ from typing import Any, Dict, List, Union
7
+
8
+ from CGRtools.containers.molecule import MoleculeContainer
9
+ from CGRtools import smiles as read_smiles
10
+
11
+ from synplan.chem.reaction_routes.visualisation import (
12
+ cgr_display,
13
+ depict_custom_reaction,
14
+ )
15
+ from synplan.chem.reaction_routes.io import make_dict
16
+ from synplan.mcts.tree import Tree
17
+
18
+ from IPython.display import display, HTML
19
+
20
+
21
def get_child_nodes(
    tree: Tree,
    molecule: MoleculeContainer,
    graph: Dict[MoleculeContainer, List[MoleculeContainer]],
) -> Dict[str, Any]:
    """Recursively extracts the child nodes of the given molecule.

    :param tree: The built tree.
    :param molecule: The molecule in the tree from which to extract child nodes.
    :param graph: The relationship between the given molecule and child nodes.
    :return: A reaction dict with the extracted child nodes, or an empty list
        when the molecule has no successors in ``graph`` (leaf).
    """
    if molecule not in graph:
        return []

    children = []
    for precursor in graph[molecule]:
        entry = {
            "smiles": str(precursor),
            "type": "mol",
            "in_stock": str(precursor) in tree.building_blocks,
        }
        # Recurse into this precursor; leaves yield an empty (falsy) result.
        subtree = get_child_nodes(tree, precursor, graph)
        if subtree:
            entry["children"] = [subtree]
        children.append(entry)
    return {"type": "reaction", "children": children}
50
+
51
+
52
def extract_routes(
    tree: Tree, extended: bool = False, min_mol_size: int = 0
) -> List[Dict[str, Any]]:
    """Takes the target and the tree of successors and returns a list of
    nested route dictionaries rooted at the target molecule.

    :param tree: The built tree.
    :param extended: If True, generates the extended route representation
        (every solved node, not only the registered winning nodes).
    :param min_mol_size: If the size of the Precursor is equal or smaller than
        min_mol_size it is automatically classified as building block.
    :return: A list of dictionaries. Each dictionary contains a target, a list
        of children, and a boolean indicating whether the target is in
        building_blocks.
    """
    target = tree.nodes[1].precursors_to_expand[0].molecule
    target_in_stock = tree.nodes[1].curr_precursor.is_building_block(
        tree.building_blocks, min_mol_size
    )

    # Collect node ids of the routes to export.
    if extended:
        winning_nodes = [i for i, node in tree.nodes.items() if node.is_solved()]
    else:
        winning_nodes = tree.winning_nodes

    if not winning_nodes:
        # No solved routes: emit the bare target with no children.
        return [
            {
                "type": "mol",
                "smiles": str(target),
                "in_stock": target_in_stock,
                "children": [],
            }
        ]

    routes_block = []
    for winning_node in winning_nodes:
        # Reconstruct the molecule-level successor graph along the route.
        # (The former unused `pred` predecessor map was dead code and was
        # removed.)
        nodes = tree.route_to_node(winning_node)
        graph: Dict[Any, List[Any]] = {}
        for parent, child in zip(nodes, nodes[1:]):
            graph[parent.curr_precursor.molecule] = [
                x.molecule for x in child.new_precursors
            ]

        routes_block.append(
            {
                "type": "mol",
                "smiles": str(target),
                "in_stock": target_in_stock,
                "children": [get_child_nodes(tree, target, graph)],
            }
        )
    return routes_block
109
+
110
+
111
def render_svg(pred, columns, box_colors):
    """
    Renders an SVG representation of a retrosynthetic route.

    This function takes the predicted reaction steps, the molecules organized
    into columns representing reaction stages, and a mapping of molecule status
    to box colors, and generates an SVG string visualizing the route. It
    calculates positions for molecules and arrows, and constructs the SVG
    elements.

    Args:
        pred (tuple): A tuple of tuples representing the predicted reaction
                      steps. Each inner tuple is (source_molecule_index,
                      target_molecule_index). The indices correspond to the
                      flattened list of molecules across all columns.
        columns (list): A list of lists, where each inner list contains
                        Molecule objects for a specific stage (column) in the
                        retrosynthetic route.
        box_colors (dict): A dictionary mapping molecule status strings (e.g.,
                           'target', 'mulecule', 'instock') to SVG color strings
                           for the boxes around the molecules.

    Returns:
        str: A string containing the complete SVG code for the retrosynthetic
             route visualization.
    """
    # NOTE(review): this function mutates each molecule's private `_plane`
    # coordinate dict in place while laying out columns.
    x_shift = 0.0
    c_max_x = 0.0  # running right-most x over all columns
    c_max_y = 0.0  # tallest column height seen so far
    render = []
    # cx/cy both enumerate molecules in flattened column order; cx is consumed
    # in the first per-molecule pass, cy in the second (box/arrow) pass.
    cx = count()
    cy = count()
    arrow_points = {}  # flat molecule index -> [min_x, max_x, y_center, ...]
    for ms in columns:
        heights = []
        for m in ms:
            m.clean2d()
            # X-shift for target: translate the molecule so its left edge
            # starts at the current column offset.
            min_x = min(x for x, y in m._plane.values()) - x_shift
            min_y = min(y for x, y in m._plane.values())
            m._plane = {n: (x - min_x, y - min_y) for n, (x, y) in m._plane.items()}
            max_x = max(x for x, y in m._plane.values())

            c_max_x = max(c_max_x, max_x)

            arrow_points[next(cx)] = [x_shift, max_x]
            heights.append(max(y for x, y in m._plane.values()))

        x_shift = c_max_x + 5.0  # between columns gap
        # calculate Y-shift: total column height including 3.0 gaps
        y_shift = sum(heights) + 3.0 * (len(heights) - 1)

        c_max_y = max(c_max_y, y_shift)

        # Center the column vertically around zero, then walk downwards.
        y_shift /= 2.0
        for m, h in zip(ms, heights):
            m._plane = {n: (x, y - y_shift) for n, (x, y) in m._plane.items()}

            # calculate coordinates for boxes (with padding around the atoms)
            max_x = max(x for x, y in m._plane.values()) + 0.9  # max x
            min_x = min(x for x, y in m._plane.values()) - 0.6  # min x
            max_y = -(max(y for x, y in m._plane.values()) + 0.45)  # max y
            min_y = -(min(y for x, y in m._plane.values()) - 0.45)  # min y
            x_delta = abs(max_x - min_x)
            y_delta = abs(max_y - min_y)
            # Rounded status-colored rectangle behind the molecule.
            box = (
                f'<rect x="{min_x}" y="{max_y}" rx="{y_delta * 0.1}" ry="{y_delta * 0.1}" width="{x_delta}" height="{y_delta}"'
                f' stroke="black" stroke-width=".0025" fill="{box_colors[m.meta["status"]]}" fill-opacity="0.30"/>'
            )
            # Third slot of arrow_points: vertical center of this molecule.
            arrow_points[next(cy)].append(y_shift - h / 2.0)
            y_shift -= h + 3.0
            depicted_molecule = list(m.depict(embedding=True))[:3]
            depicted_molecule.append(box)
            render.append(depicted_molecule)

    # calculate mid-X coordinate to draw square arrows
    graph = {}
    for s, p in pred:
        try:
            graph[s].append(p)
        except KeyError:
            graph[s] = [p]
    for s, ps in graph.items():
        # Use one shared elbow x-coordinate for all arrows into node s.
        mid_x = float("-inf")
        for p in ps:
            s_min_x, s_max, s_y = arrow_points[s][:3]  # s
            p_min_x, p_max, p_y = arrow_points[p][:3]  # p
            p_max += 1
            mid = p_max + (s_min_x - p_max) / 3
            mid_x = max(mid_x, mid)
        for p in ps:
            arrow_points[p].append(mid_x)

    config = MoleculeContainer._render_config
    font_size = config["font_size"]
    font125 = 1.25 * font_size
    width = c_max_x + 4.0 * font_size  # 3.0 by default
    height = c_max_y + 3.5 * font_size  # 2.5 by default
    box_y = height / 2.0
    svg = [
        f'<svg width="{0.6 * width:.2f}cm" height="{0.6 * height:.2f}cm" '
        f'viewBox="{-font125:.2f} {-box_y:.2f} {width:.2f} '
        f'{height:.2f}" xmlns="http://www.w3.org/2000/svg" version="1.1">',
        ' <defs>\n <marker id="arrow" markerWidth="10" markerHeight="10" '
        'refX="0" refY="3" orient="auto">\n <path d="M0,0 L0,6 L9,3"/>\n </marker>\n </defs>',
    ]

    # Square (elbow) arrows from each precursor box to its product box.
    for s, p in pred:
        s_min_x, s_max, s_y = arrow_points[s][:3]
        p_min_x, p_max, p_y = arrow_points[p][:3]
        p_max += 1
        mid_x = arrow_points[p][-1]  # p_max + (s_min_x - p_max) / 3
        arrow = f""" <polyline points="{p_max:.2f} {p_y:.2f}, {mid_x:.2f} {p_y:.2f}, {mid_x:.2f} {s_y:.2f}, {s_min_x - 1.:.2f} {s_y:.2f}"
 fill="none" stroke="black" stroke-width=".04" marker-end="url(#arrow)"/>"""
        if p_y != s_y:
            # Mark the junction with a dot when the arrow bends vertically.
            arrow += f' <circle cx="{mid_x}" cy="{p_y}" r="0.1"/>'
        svg.append(arrow)
    for atoms, bonds, masks, box in render:
        molecule_svg = MoleculeContainer._graph_svg(
            atoms, bonds, masks, -font125, -box_y, width, height
        )
        # Insert the background box right after the opening group element.
        molecule_svg.insert(1, box)
        svg.extend(molecule_svg)
    svg.append("</svg>")
    return "\n".join(svg)
236
+
237
def get_route_svg_mod(tree: Tree, node_id: int) -> str:
    """
    Visualizes the full retrosynthetic route from the target to a given node.

    This function generates an SVG image for the synthetic path from the target
    molecule to the specified node_id. It correctly handles paths that have not
    been fully resolved to building blocks. The layout follows standard
    retrosynthetic analysis, with the target on the right and precursors
    arranged in columns to the left.

    :param tree: The built MCTS tree.
    :param node_id: The ID of the node to which the route should be visualized.
    :return: A string containing the SVG visualization of the route.
    """
    # Box colors for molecule status
    box_colors = {
        "target": "#98EEFF",  # Light Blue for the main target
        "mulecule": "#F0AB90",  # Peach for intermediates not in stock
        "instock": "#9BFAB3",  # Light Green for building blocks
    }

    # Obtain the sequence of reaction steps in retrosynthetic order
    retro_reactions = list(reversed(tree.synthesis_route(node_id)))

    # Handle the case of the root node with no preceding reactions
    if not retro_reactions:
        target_node = tree.nodes.get(node_id)
        if not target_node:
            return ""
        molecule = target_node.curr_precursor.molecule
        molecule.meta["status"] = "target"
        return render_svg(tuple(), [[molecule]], box_colors)

    # Map all unique molecule SMILES to their MoleculeContainer objects.
    # NOTE(review): molecules are deduplicated by SMILES string here; two
    # occurrences of the same structure share one container and one status.
    mol_map = {str(m): m for r in retro_reactions for m in r.reactants + r.products}

    # Set the status for each unique molecule
    for smiles, molecule in mol_map.items():
        molecule.meta["status"] = "instock" if smiles in tree.building_blocks else "mulecule"

    # The final target is the product of the first retrosynthetic reaction
    target_molecule = retro_reactions[0].products[0]
    target_molecule.meta["status"] = "target"
    mol_map[str(target_molecule)] = target_molecule

    # --- Build columns from left to right based on reaction dependencies ---
    columns = []
    # Identify molecules that are products in any reaction step
    products_smiles = {str(p) for r in retro_reactions for p in r.products}

    # The leftmost column consists of reactants that are not products of any other step in the path
    leftmost_smiles = {str(m) for r in retro_reactions for m in r.reactants} - products_smiles

    if not leftmost_smiles:  # Fallback for simple A->B routes
        leftmost_smiles = {str(m) for m in retro_reactions[-1].reactants}

    columns.append([mol_map[s] for s in leftmost_smiles])
    placed_smiles = set(leftmost_smiles)

    # Iteratively build the next columns
    while len(placed_smiles) < len(mol_map):
        next_products = set()
        for r in retro_reactions:
            # If all reactants for a reaction have been placed in previous columns...
            if all(str(reactant) in placed_smiles for reactant in r.reactants):
                # ...then its products belong in the next column.
                for product in r.products:
                    if str(product) not in placed_smiles:
                        next_products.add(str(product))

        if not next_products:
            break  # Safety break if no new column can be formed

        columns.append([mol_map[s] for s in next_products])
        placed_smiles.update(next_products)

    # --- Prepare data for rendering ---
    # Flatten the columns to get a single list of molecules for indexing
    flat_mols = [mol for col in columns for mol in col]
    mol_to_idx = {str(mol): i for i, mol in enumerate(flat_mols)}

    # Define the connections (precursor -> product) for the SVG rendering
    # The arrow in render_svg points from 'p' to 's'
    pred = []
    for reaction in retro_reactions:
        for product in reaction.products:
            if str(product) in mol_to_idx:
                s_idx = mol_to_idx[str(product)]  # 's' is the product (on the right)
                for reactant in reaction.reactants:
                    if str(reactant) in mol_to_idx:
                        p_idx = mol_to_idx[str(reactant)]  # 'p' is the reactant (on the left)
                        pred.append((s_idx, p_idx))

    return render_svg(tuple(pred), columns, box_colors)
331
+
332
+
333
def get_route_svg(tree: Tree, node_id: int) -> str:
    """Visualizes the retrosynthetic route.

    :param tree: The built tree.
    :param node_id: The id of the node from which to visualize the route.
    :return: The SVG string.
    """
    nodes = tree.route_to_node(node_id)
    # Set up node_id types for different box colors
    for n in nodes:
        for precursor in n.new_precursors:
            precursor.molecule.meta["status"] = (
                "instock"
                if precursor.is_building_block(tree.building_blocks)
                else "mulecule"
            )
    nodes[0].curr_precursor.molecule.meta["status"] = "target"
    # Box colors
    box_colors = {
        "target": "#98EEFF",  # 152, 238, 255
        "mulecule": "#F0AB90",  # 240, 171, 144
        "instock": "#9BFAB3",  # 155, 250, 179
    }

    # first column is target
    # second column are first new precursor_to_expand
    columns = [
        [nodes[0].curr_precursor.molecule],
        [x.molecule for x in nodes[1].new_precursors],
    ]
    # pred maps a molecule's flat index to its parent's flat index.
    pred = {x: 0 for x in range(1, len(columns[1]) + 1)}
    # cx holds indices of molecules still awaiting expansion (not in stock).
    cx = [
        n
        for n, x in enumerate(nodes[1].new_precursors, 1)
        if not x.is_building_block(tree.building_blocks)
    ]
    size = len(cx)
    nodes = iter(nodes[2:])
    # cy continues the flat numbering started by the first two columns.
    cy = count(len(columns[1]) + 1)
    while size:
        # Each iteration consumes one tree node per unexpanded molecule and
        # produces the next column layer.
        layer = []
        for s in islice(nodes, size):
            n = cx.pop(0)
            for x in s.new_precursors:
                layer.append(x)
                m = next(cy)
                if not x.is_building_block(tree.building_blocks):
                    cx.append(m)
                pred[m] = n
        size = len(cx)
        columns.append([x.molecule for x in layer])

    columns = [
        columns[::-1] for columns in columns[::-1]
    ]  # Reverse array to make retrosynthetic graph
    # Mirror the flat indices to match the reversed column order.
    pred = tuple(  # Change dict to tuple to make multiple precursor_to_expand available
        (abs(source - len(pred)), abs(target - len(pred)))
        for target, source in pred.items()
    )
    svg = render_svg(pred, columns, box_colors)
    return svg
394
+
395
+
396
def get_route_svg_from_json(routes_json: dict, route_id: int) -> str:
    """
    Visualizes the retrosynthetic route described in routes_json[route_id].

    :param routes_json: A dict mapping route IDs to nested JSON trees of molecules/reactions.
    :param route_id: The id of the route from which to visualize the route.
    :return: The SVG string.
    """
    # Resolve the route root, accepting either the given key or its str form.
    if route_id in routes_json:
        root = routes_json[route_id]
    else:
        try:
            root = routes_json[str(route_id)]
        except KeyError:
            raise ValueError(f"Route ID {route_id} not found in routes_json.")

    # Breadth-first traversal: group molecule dicts by depth and remember
    # each molecule's parent molecule dict.
    levels = []       # levels[d] = list of mol-dicts at depth d
    parent_of = {}    # id(mol_dict) -> parent mol_dict (None for the root)
    queue = deque([(root, 0, None)])
    while queue:
        node, depth, parent = queue.popleft()
        if node.get("type") != "mol":
            continue
        while len(levels) <= depth:
            levels.append([])
        levels[depth].append(node)
        parent_of[id(node)] = parent
        for reaction in node.get("children", []):
            if reaction.get("type") == "reaction":
                for mol_child in reaction.get("children", []):
                    if mol_child.get("type") == "mol":
                        queue.append((mol_child, depth + 1, node))

    # Build MoleculeContainer objects and assign status metadata:
    # target at depth 0, otherwise instock vs mulecule.
    mol_container = {}
    for depth, mols in enumerate(levels):
        for mol in mols:
            container = read_smiles(mol["smiles"])
            if depth == 0:
                container.meta["status"] = "target"
            elif mol.get("in_stock", False):
                container.meta["status"] = "instock"
            else:
                container.meta["status"] = "mulecule"
            mol_container[id(mol)] = container

    # Mirror the depth levels so the target ends up on the right-hand side.
    json_columns = levels[::-1]

    # Flatten the mirrored levels and map each JSON node id to its flat index.
    flat_index = {
        node_id: idx
        for idx, node_id in enumerate(
            id(mol) for lvl in json_columns for mol in lvl
        )
    }

    # Arrow connections for render_svg: (parent_index, child_index) pairs.
    pred = tuple(
        (flat_index[id(parent)], flat_index[child_id])
        for child_id, parent in parent_of.items()
        if parent is not None
    )

    # Layout columns of MoleculeContainer objects matching the mirrored levels.
    columns = [[mol_container[id(mol)] for mol in lvl] for lvl in json_columns]

    box_colors = {
        "target": "#98EEFF",
        "mulecule": "#F0AB90",
        "instock": "#9BFAB3",
    }

    return render_svg(pred, columns, box_colors)
469
+
470
+
471
def generate_results_html(
    tree: Tree, html_path: str, aam: bool = False, extended: bool = False
) -> None:
    """Writes an HTML page with the synthesis routes in SVG format and corresponding
    reactions in SMILES format.

    :param tree: The built tree.
    :param html_path: The path to the file where to store resulting HTML.
        If None, the assembled HTML table (without the surrounding
        <html>/<body> template) is returned instead of being written.
    :param aam: If True, depict atom-to-atom mapping.
    :param extended: If True, generates the extended route representation
        (every solved node, not only the registered winning nodes).
    :return: None when html_path is given; otherwise the HTML table string.
    """
    if aam:
        MoleculeContainer.depict_settings(aam=True)
    else:
        MoleculeContainer.depict_settings(aam=False)

    routes = []
    if extended:
        # Gather paths
        for idx, node in tree.nodes.items():
            if node.is_solved():
                routes.append(idx)
    else:
        routes = tree.winning_nodes
    # HTML Tags
    th = '<th style="text-align: left; background-color:#978785; border: 1px solid black; border-spacing: 0">'
    td = '<td style="text-align: left; border: 1px solid black; border-spacing: 0">'
    font_red = "<font color='red' style='font-weight: bold'>"
    font_green = "<font color='light-green' style='font-weight: bold'>"
    font_head = "<font style='font-weight: bold; font-size: 18px'>"
    font_normal = "<font style='font-weight: normal; font-size: 18px'>"
    font_close = "</font>"

    template_begin = """
    <!doctype html>
    <html lang="en">
    <head>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css"
    rel="stylesheet"
    integrity="sha384-1BmE4kWBq78iYhFldvKuhfTAU6auU8tT94WrHftjDbrCEXSU1oBoqyl2QvZ6jIW3"
    crossorigin="anonymous">
    <script
    src="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/js/bootstrap.bundle.min.js"
    integrity="sha384-ka7Sk0Gln4gmtz2MlQnikT1wXgYsOg+OMhuP+IlRH9sENBO0LRn5q+8nbTov4+1p"
    crossorigin="anonymous">
    </script>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>Predicted Paths Report</title>
    <meta name="description" content="A simple HTML5 Template for new projects.">
    <meta name="author" content="SitePoint">
    </head>
    <body>
    """
    template_end = """
    </body>
    </html>
    """
    # SVG Template
    box_mark = """
    <svg width="30" height="30" viewBox="0 0 1 1" xmlns="http://www.w3.org/2000/svg">
    <circle cx="0.5" cy="0.5" r="0.5" fill="rgb()" fill-opacity="0.35" />
    </svg>
    """
    # table = f"<table><thead><{th}>Retrosynthetic Routes</th></thead><tbody>"
    table = """
    <table class="table table-striped table-hover caption-top">
    <caption><h3>Retrosynthetic Routes Report</h3></caption>
    <tbody>"""

    # Gather path data
    table += f"<tr>{td}{font_normal}Target Molecule: {str(tree.nodes[1].curr_precursor)}{font_close}</td></tr>"
    table += f"<tr>{td}{font_normal}Tree Size: {len(tree)}{font_close} nodes</td></tr>"
    table += f"<tr>{td}{font_normal}Number of visited nodes: {len(tree.visited_nodes)}{font_close}</td></tr>"
    table += f"<tr>{td}{font_normal}Found paths: {len(routes)}{font_close}</td></tr>"
    table += f"<tr>{td}{font_normal}Time: {round(tree.curr_time, 4)}{font_close} seconds</td></tr>"
    # Legend row: colored circles explaining the molecule box colors.
    table += f"""
    <tr>{td}
    <div>
    {box_mark.replace("rgb()", "rgb(152, 238, 255)")}
    Target Molecule
    {box_mark.replace("rgb()", "rgb(240, 171, 144)")}
    Molecule Not In Stock
    {box_mark.replace("rgb()", "rgb(155, 250, 179)")}
    Molecule In Stock
    </div>
    </td></tr>
    """

    for route in routes:
        svg = get_route_svg(tree, route)  # get SVG
        full_route = tree.synthesis_route(route)  # get route
        # write SMILES of all reactions in synthesis path
        step = 1
        reactions = ""
        for synth_step in full_route:
            reactions += f"<b>Step {step}:</b> {str(synth_step)}<br>"
            step += 1
        # Concatenate all content of path
        route_score = round(tree.route_score(route), 3)
        table += (
            f'<tr style="line-height: 250%">{td}{font_head}Route {route}; '
            f"Steps: {len(full_route)}; "
            f"Cumulated nodes' value: {route_score}{font_close}</td></tr>"
        )
        # f"Cumulated nodes' value: {node._probabilities[path]}{font_close}</td></tr>"
        table += f"<tr>{td}{svg}</td></tr>"
        table += f"<tr>{td}{reactions}</td></tr>"
    table += "</tbody>"
    if html_path is None:
        return table
    with open(html_path, "w", encoding="utf-8") as html_file:
        html_file.write(template_begin)
        html_file.write(table)
        html_file.write(template_end)
587
+
588
+
589
def html_top_routes_cluster(clusters: dict, tree: Tree, target_smiles: str) -> str:
    """Build a styled, self-contained HTML report showing the best route of each cluster.

    The "best" route of a cluster is taken to be the first entry of its
    ``node_ids`` list. Route and ReducedRouteCGR depictions are embedded as
    base64-encoded SVG ``<img>`` sources so the report is a single file.

    Args:
        clusters: Mapping of cluster index to cluster data. Each value is
            expected to provide a ``"node_ids"`` list and, optionally, an
            ``"sb_cgr"`` object depictable via ``cgr_display``.
        tree: Search tree used to render each best route with ``get_route_svg``.
        target_smiles: SMILES of the target molecule, shown in the header.

    Returns:
        str: A complete standalone HTML document (Bootstrap-styled).
    """
    # Summary counts shown in the report header.
    total_routes = sum(len(data.get("node_ids", [])) for data in clusters.values())
    total_clusters = len(clusters)

    # Build styled HTML report using Bootstrap.
    html = []

    html.append("<!doctype html><html lang='en'><head>")
    html.append(
        "<meta charset='utf-8'><meta name='viewport' content='width=device-width, initial-scale=1'>"
    )
    html.append(
        "<link href='https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css' rel='stylesheet'>"
    )
    html.append("<title>Clustering Results Report</title>")
    html.append(
        "<style> svg{max-width:100%;height:auto;} .report-table th,.report-table td{vertical-align:top;border:1px solid #dee2e6;} </style>"
    )
    html.append("</head><body><div class='container my-4'>")
    # Report header
    html.append("<h1 class='mb-3'>Best route from each cluster</h1>")
    html.append(f"<p><strong>Target molecule (SMILES):</strong> {target_smiles}</p>")
    html.append(f"<p><strong>Total number of routes:</strong> {total_routes}</p>")
    html.append(f"<p><strong>Total number of clusters:</strong> {total_clusters}</p>")
    # Table header. FIX: the previous markup opened <colgroup> twice and
    # declared five <col> entries for a four-column table; now a single
    # well-formed <colgroup> with one <col> per column.
    html.append(
        "<table class='table report-table'><colgroup><col style='width:5%'><col style='width:5%'><col style='width:15%'><col style='width:75%'></colgroup><thead><tr>"
    )
    html.append(
        "<th>Cluster index</th><th>Size</th><th>ReducedRouteCGR</th><th>Best Route</th>"
    )
    html.append("</tr></thead><tbody>")

    # One row per cluster; clusters without routes are skipped.
    for cluster_num, group_data in clusters.items():
        node_ids = group_data.get("node_ids", [])
        if not node_ids:
            continue
        node_id = node_ids[0]  # best route = first listed node id
        # Get SVGs
        svg = get_route_svg(tree, node_id)
        r_cgr = group_data.get("sb_cgr")
        r_cgr_svg = None
        if r_cgr:
            r_cgr.clean2d()
            r_cgr_svg = cgr_display(r_cgr)
        # Start row
        html.append(f"<tr><td>{cluster_num}</td>")
        html.append(f"<td>{len(node_ids)}</td>")
        # ReducedRouteCGR cell (left empty when no CGR is available)
        html.append("<td>")
        if r_cgr_svg:
            b64_r = base64.b64encode(r_cgr_svg.encode("utf-8")).decode()
            html.append(
                f"<img src='data:image/svg+xml;base64,{b64_r}' alt='ReducedRouteCGR' class='img-fluid'/>"
            )
        html.append("</td>")
        # Best Route cell
        html.append("<td>")
        if svg:
            b64_svg = base64.b64encode(svg.encode("utf-8")).decode()
            html.append(
                f"<img src='data:image/svg+xml;base64,{b64_svg}' alt='Route {node_id}' class='img-fluid'/>"
            )
        html.append("</td></tr>")

    # Close table and HTML
    html.append("</tbody></table>")
    html.append("</div></body></html>")

    report_html = "".join(html)
    return report_html
665
+
666
+
667
def routes_clustering_report(
    source: Union[Tree, dict],
    clusters: dict,
    group_index: str,
    sb_cgrs_dict: dict,
    aam: bool = False,
    html_path: str = None,
) -> str:
    """
    Generates an HTML report visualizing a cluster of retrosynthetic routes.

    This function takes a source of retrosynthetic routes (either a Tree object
    or a dictionary representing routes in JSON format), cluster information,
    and a dictionary of ReducedRouteCGRs, and produces a comprehensive HTML report.
    The report includes details about the cluster, a representative ReducedRouteCGR,
    and SVG visualizations of each route within the specified cluster.

    Args:
        source (Union[Tree, dict]): The source of retrosynthetic routes. Either a
            Tree object containing the full search tree, or a dictionary loaded
            from a routes JSON file.
        clusters (dict): Clustering results; each cluster entry is expected to
            provide a 'node_ids' list.
        group_index (str): Key identifying the cluster within `clusters` for
            which the report is generated.
        sb_cgrs_dict (dict): Maps route IDs to ReducedRouteCGR objects; used to
            display a representative ReducedRouteCGR for the cluster.
        aam (bool, optional): Enable atom-atom mapping in molecule depictions.
            Defaults to False.
        html_path (str, optional): If given, the report is written to this file
            and a confirmation message is returned; otherwise the HTML string is
            returned. Defaults to None.

    Returns:
        str: The HTML report, or a confirmation message when `html_path` is
        provided, or an error page when inputs are invalid / the group is
        missing.
    """
    # --- Depict settings (best effort; ignored if the backend lacks the option) ---
    try:
        MoleculeContainer.depict_settings(aam=bool(aam))
    except Exception:
        pass

    # --- Figure out what `source` is: a search Tree or a routes-JSON dict ---
    using_tree = False
    if hasattr(source, "nodes") and hasattr(source, "route_to_node"):
        tree = source
        using_tree = True
    elif isinstance(source, dict):
        routes_json = source
        tree = None
    else:
        return "<html><body>Error: first argument must be a Tree or a routes_json dict.</body></html>"

    # --- Validate clusters ---
    if not isinstance(clusters, dict):
        return "<html><body>Error: clusters must be a dict.</body></html>"

    group = clusters.get(group_index)
    if group is None:
        return f"<html><body>Error: no group with index {group_index!r}.</body></html>"

    cluster_node_ids = group.get("node_ids", [])
    # Keep only routes that can actually be rendered.
    valid_routes = []

    if using_tree:
        for nid in cluster_node_ids:
            if nid in tree.nodes and tree.nodes[nid].is_solved():
                valid_routes.append(nid)
    else:
        # JSON mode: check if the node ID exists in the routes_dict
        routes_dict = make_dict(routes_json)
        for nid in cluster_node_ids:
            if nid in routes_dict.keys():
                valid_routes.append(nid)
    if not valid_routes:
        return f"""
        <!doctype html><html><body>
        <h3>Cluster {group_index} Report</h3>
        <p>No valid routes found in this cluster.</p>
        </body></html>
        """

    # --- Target molecule shown in the report header ---
    if using_tree:
        try:
            target_smiles = str(tree.nodes[1].curr_precursor)
        except Exception:
            target_smiles = "N/A"
    else:
        # JSON mode: take the root smiles of the first route.
        # NOTE(review): the key is stringified here while `valid_routes` was
        # matched against `make_dict` keys — confirm both use the same key type.
        target_smiles = routes_json[str(valid_routes[0])]["smiles"]

    # --- Reusable HTML tags (unused `th`/`font_head` removed) ---
    td = '<td style="text-align: left; border: 1px solid black; border-spacing: 0">'
    font_normal = "<font style='font-weight: normal; font-size: 18px'>"
    font_close = "</font>"

    template_begin = f"""
    <!doctype html>
    <html lang="en">
    <head>
        <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css"
              rel="stylesheet"
              integrity="sha384-1BmE4kWBq78iYhFldvKuhfTAU6auU8tT94WrHftjDbrCEXSU1oBoqyl2QvZ6jIW3"
              crossorigin="anonymous">
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <title>Cluster {group_index} Routes Report</title>
        <style>
            /* Optional: Add some basic styling */
            .table {{ border-collapse: collapse; width: 100%; }}
            th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
            tr:nth-child(even) {{ background-color: #ffffff; }}
            caption {{ caption-side: top; font-size: 1.5em; margin: 1em 0; }}
            svg {{ max-width: 100%; height: auto; }}
        </style>
    </head>
    <body>
    <div class="container"> """

    template_end = """
    </div> <script
    src="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/js/bootstrap.bundle.min.js"
    integrity="sha384-ka7Sk0Gln4gmtz2MlQnikT1wXgYsOg+OMhuP+IlRH9sENBO0LRn5q+8nbTov4+1p"
    crossorigin="anonymous">
    </script>
    </body>
    </html>
    """

    # Colored-circle legend marker; "rgb()" is replaced per legend entry below.
    box_mark = """
    <svg width="30" height="30" viewBox="0 0 1 1" xmlns="http://www.w3.org/2000/svg" style="vertical-align: middle; margin-right: 5px;">
        <circle cx="0.5" cy="0.5" r="0.5" fill="rgb()" fill-opacity="0.35" />
    </svg>
    """

    # --- Build HTML Table ---
    table = f"""
    <table class="table table-hover caption-top">
    <caption><h3>Retrosynthetic Routes Report - Cluster {group_index}</h3></caption>
    <tbody>"""

    table += (
        f"<tr>{td}{font_normal}Target Molecule: {target_smiles}{font_close}</td></tr>"
    )
    table += f"<tr>{td}{font_normal}Group index: {group_index}{font_close}</td></tr>"
    table += f"<tr>{td}{font_normal}Size of Cluster: {len(valid_routes)} routes{font_close} </td></tr>"

    # --- Add ReducedRouteCGR Image (representative = first valid route) ---
    first_route_id = valid_routes[0] if valid_routes else None

    if first_route_id and sb_cgrs_dict:
        try:
            sb_cgr = sb_cgrs_dict[first_route_id]
            sb_cgr.clean2d()
            sb_cgr_svg = cgr_display(sb_cgr)

            if sb_cgr_svg.strip().startswith("<svg"):
                table += f"<tr>{td}{font_normal}Identified Strategic Bonds{font_close}<br>{sb_cgr_svg}</td></tr>"
            else:
                table += f"<tr>{td}{font_normal}Cluster Representative ReducedRouteCGR (from Route {first_route_id}):{font_close}<br><i>Invalid SVG format retrieved.</i></td></tr>"
                print(
                    f"Warning: Expected SVG for ReducedRouteCGR of node {first_route_id}, but got: {sb_cgr_svg[:100]}..."
                )
        except Exception as e:
            table += f"<tr>{td}{font_normal}Cluster Representative ReducedRouteCGR (from Route {first_route_id}):{font_close}<br><i>Error retrieving/displaying ReducedRouteCGR: {e}</i></td></tr>"
    else:
        if first_route_id:
            table += f"<tr>{td}{font_normal}Cluster Representative ReducedRouteCGR (from Route {first_route_id}):{font_close}<br><i>Not found in provided ReducedRouteCGR dictionary.</i></td></tr>"
        else:
            table += f"<tr>{td}{font_normal}Cluster Representative ReducedRouteCGR:{font_close}<br><i>No valid routes in cluster to select from.</i></td></tr>"

    # Legend row: color coding of molecule boxes in the route depictions.
    table += f"""
    <tr>{td}
    <div style="display: flex; align-items: center; flex-wrap: wrap; gap: 15px;">
        <span>{box_mark.replace("rgb()", "rgb(152, 238, 255)")} Target Molecule</span>
        <span>{box_mark.replace("rgb()", "rgb(240, 171, 144)")} Molecule Not In Stock</span>
        <span>{box_mark.replace("rgb()", "rgb(155, 250, 179)")} Molecule In Stock</span>
    </div>
    </td></tr>
    """
    for route_id in valid_routes:
        if using_tree:
            # 1) SVG from Tree
            svg = get_route_svg(tree, route_id)
            # 2) Reaction steps & score
            steps = tree.synthesis_route(route_id)
            score = round(tree.route_score(route_id), 3)
            # build reaction list
            reac_html = "".join(
                f"<b>Step {i+1}:</b> {str(r)}<br>" for i, r in enumerate(steps)
            )
            header = f"Route {route_id} — {len(steps)} steps, score={score}"
            table += f"<tr><td><b>{header}</b></td></tr>"
            table += f"<tr><td>{svg}</td></tr>"
            table += f"<tr><td>{reac_html}</td></tr>"

        else:
            # 1) SVG from JSON
            svg = get_route_svg_from_json(routes_json, route_id)
            # NOTE(review): steps appears to map step index -> reaction; verify
            # against make_dict's output shape.
            steps = routes_dict[route_id]
            reac_html = "".join(
                f"<b>Step {i+1}:</b> {str(r)}<br>" for i, r in steps.items()
            )

            header = f"Route {route_id} — {len(steps)} steps"
            table += f"<tr><td><b>{header}</b></td></tr>"
            table += f"<tr><td>{svg}</td></tr>"
            table += f"<tr><td>{reac_html}</td></tr>"

    table += "</tbody></table>"

    html = template_begin + table + template_end

    if html_path:
        with open(html_path, "w", encoding="utf-8") as f:
            f.write(html)
        return f"Written to {html_path}"
    return html
915
+
916
+
917
def lg_table_2_html(subcluster, nodes_to_display=None, if_display=True):
    """
    Generates an HTML table visualizing leaving groups (X) 'marks' for routes within a subcluster.

    Each row represents a route from the subcluster (or from the requested
    subset of nodes); columns are the unique 'marks' found across all nodes.
    Cells contain the SVG depiction of the corresponding mark for that node.

    Args:
        subcluster (dict): Subcluster data with a 'nodes_data' key mapping node
            IDs to dictionaries of marks; each mark value is a sequence whose
            first element has a `.depict()` method returning SVG.
        nodes_to_display (list, optional): Specific node IDs to include. When
            None or empty, all nodes in `subcluster["nodes_data"]` are shown.
            Defaults to None. (FIX: was a mutable default `[]`, which is shared
            between calls.)
        if_display (bool, optional): If True, the HTML is also rendered via
            `display(HTML(...))`. Defaults to True.

    Returns:
        str: The generated HTML table.
    """
    if nodes_to_display is None:
        nodes_to_display = []

    # Create HTML table header
    html = "<table style='border-collapse: collapse;'><tr><th style='border: 1px solid black; padding: 4px;'>Route ID</th>"

    # Extract all unique marks across all nodes to form consistent columns
    all_marks = set()
    for node_data in subcluster["nodes_data"].values():
        all_marks.update(node_data.keys())
    all_marks = sorted(all_marks)  # sort for consistent ordering

    # Add marks as headers
    for mark in all_marks:
        html += f"<th style='border: 1px solid black; padding: 4px;'>{mark}</th>"
    html += "</tr>"

    def _render_row(node_id, node_data):
        # One table row: node id followed by the SVG of each mark (empty cell
        # when the node lacks that mark).
        row = f"<tr><td style='border: 1px solid black; padding: 4px;'>{node_id}</td>"
        for mark in all_marks:
            row += "<td style='border: 1px solid black; padding: 4px;'>"
            if mark in node_data:
                row += node_data[mark][0].depict()  # SVG data as string
            row += "</td>"
        row += "</tr>"
        return row

    # Fill in the rows (previously two near-identical loops; now one helper)
    if len(nodes_to_display) == 0:
        for node_id, node_data in subcluster["nodes_data"].items():
            html += _render_row(node_id, node_data)
    else:
        for node_id in nodes_to_display:
            # Check if the node_id exists in the subcluster data
            if node_id in subcluster["nodes_data"]:
                html += _render_row(node_id, subcluster["nodes_data"][node_id])
            else:
                # Note that the node_id was not found
                html += f"<tr><td colspan='{len(all_marks)+1}' style='border: 1px solid black; padding: 4px; color:red;'>Route ID {node_id} not found.</td></tr>"

    html += "</table>"

    if if_display:
        display(HTML(html))

    return html
992
+
993
+
994
def group_lg_table_2_html_fixed(
    grouped: dict,
    groups_to_display=None,
    if_display=False,
    max_group_col_width: int = 200,
) -> str:
    """
    Renders grouped leaving-group (X) 'marks' as a fixed-layout HTML table.

    Each row corresponds to one group of routes: the first column lists the
    route IDs belonging to the group, and the remaining columns show the
    depiction (or string form) of each mark for the group's representative.

    Args:
        grouped (dict): Maps a group key (an iterable of route IDs) to the
            representative 'marks' dictionary for that group. Mark values
            either expose a `.depict()` method or are converted with `str()`.
        groups_to_display (list, optional): Subset of group keys to render;
            when None, every group in `grouped` is shown. Defaults to None.
        if_display (bool, optional): If True, the table is also rendered via
            `display(HTML(...))`. Defaults to False.
        max_group_col_width (int, optional): Pixel cap on the width of the
            route-IDs column. Defaults to 200.

    Returns:
        str: The generated HTML table.
    """
    # Which groups appear, preserving the caller's requested order.
    if groups_to_display is None:
        selected = list(grouped.keys())
    else:
        selected = [key for key in groups_to_display if key in grouped]

    # The union of all mark names across every representative defines the columns.
    columns = sorted({mark for rep in grouped.values() for mark in rep.keys()})

    # Cell styles: the group column wraps long ID lists; mark cells are centered.
    group_cell_css = (
        "border:1px solid #ccc; padding:4px; "
        "white-space: normal; overflow-wrap: break-word; "
        f"max-width:{max_group_col_width}px;"
    )
    mark_cell_css = (
        "border:1px solid #ccc; padding:4px; text-align:center; vertical-align:middle;"
    )

    parts = [
        "<table style='width:100%; table-layout:auto; border-collapse: collapse;'>",
        "<thead><tr>",
        "<th style='border:1px solid #ccc; padding:4px;'>Route IDs</th>",
    ]
    for mark in columns:
        parts.append(
            f"<th style='border:1px solid #ccc; padding:4px; text-align:center;'>{mark}</th>"
        )
    parts.append("</tr></thead><tbody>")

    for key in selected:
        representative = grouped[key]
        route_ids = ",".join(str(node) for node in key)
        cells = [f"<td style='{group_cell_css}'>{route_ids}</td>"]
        for mark in columns:
            pieces = ["<td style='" + mark_cell_css + "'>"]
            if mark in representative:
                value = representative[mark]
                if hasattr(value, "depict"):
                    pieces.append(value.depict())
                else:
                    pieces.append(str(value))
            pieces.append("</td>")
            cells.append("".join(pieces))
        parts.append("<tr>" + "".join(cells) + "</tr>")

    parts.append("</tbody></table>")
    rendered = "".join(parts)

    if if_display:
        display(HTML(rendered))

    return rendered
1084
+
1085
+
1086
def routes_subclustering_report(
    source: Union[Tree, dict],
    subcluster: dict,
    group_index: str,
    cluster_num: int,
    sb_cgrs_dict: dict,
    if_lg_group: bool = False,
    aam: bool = False,
    html_path: str = None,
) -> str:
    """
    Generates an HTML report visualizing a specific subcluster of retrosynthetic routes.

    Produces a detailed HTML report for the subcluster, including general
    cluster information, a representative ReducedRouteCGR, a synthon pseudo
    reaction, a table of leaving groups (per node, or grouped when
    `if_lg_group` is True), and SVG visualizations of each valid route.

    Args:
        source (Union[Tree, dict]): The source of retrosynthetic routes. Either
            a Tree object containing the full search tree, or a dictionary
            loaded from a routes JSON file.
        subcluster (dict): Data for the subcluster. Expected keys include
            'nodes_data' (mapping node IDs to mark data), 'synthon_reaction',
            and optionally 'group_lgs' when `if_lg_group` is True.
        group_index (str): Index of the main cluster this subcluster belongs
            to. Used for report titling.
        cluster_num (int): Number of the subcluster within its group. Used for
            report titling.
        sb_cgrs_dict (dict): Maps route IDs to ReducedRouteCGR objects; used to
            display a representative ReducedRouteCGR for the cluster.
        if_lg_group (bool, optional): If True, show grouped leaving groups from
            `subcluster['group_lgs']`; otherwise show per-node leaving groups
            from `subcluster['nodes_data']`. Defaults to False.
        aam (bool, optional): Enable atom-atom mapping in molecule depictions.
            Defaults to False.
        html_path (str, optional): If given, the report is written to this file
            and a confirmation message is returned; otherwise the HTML string
            is returned. Defaults to None.

    Returns:
        str: The HTML report, or a confirmation message when `html_path` is
        provided, or a minimal page when the subcluster has no valid routes,
        or an error page when inputs are invalid.
    """
    # --- Depict settings (best effort; ignored if the backend lacks the option) ---
    try:
        MoleculeContainer.depict_settings(aam=bool(aam))
    except Exception:
        pass

    # --- Figure out what `source` is: a search Tree or a routes-JSON dict ---
    using_tree = False
    if hasattr(source, "nodes") and hasattr(source, "route_to_node"):
        tree = source
        using_tree = True
    elif isinstance(source, dict):
        routes_json = source
        tree = None
    else:
        return "<html><body>Error: first argument must be a Tree or a routes_json dict.</body></html>"

    # --- Validate groups ---
    if not isinstance(subcluster, dict):
        return "<html><body>Error: groups must be a dict.</body></html>"

    subcluster_node_ids = list(subcluster["nodes_data"].keys())
    # Keep only routes that can actually be rendered.
    valid_routes = []

    if using_tree:
        for nid in subcluster_node_ids:
            if nid in tree.nodes and tree.nodes[nid].is_solved():
                valid_routes.append(nid)
    else:
        # JSON mode: just keep those IDs present in the JSON
        for nid in subcluster_node_ids:
            if nid in routes_json:
                valid_routes.append(nid)
        routes_dict = make_dict(routes_json)

    if not valid_routes:
        # Return a minimal HTML page indicating no valid routes
        return f"""
        <!doctype html><html lang="en"><head><meta charset="utf-8">
        <title>Cluster {group_index}.{cluster_num} Report</title></head><body>
        <h3>Cluster {group_index}.{cluster_num} Report</h3>
        <p>No valid/solved routes found for this cluster.</p>
        </body></html>"""

    # --- Target molecule shown in the report header ---
    if using_tree:
        try:
            target_smiles = str(tree.nodes[1].curr_precursor)
        except Exception:
            target_smiles = "N/A"
    else:
        # JSON mode: take the root smiles of the first route.
        # NOTE(review): unlike routes_clustering_report, the key is NOT
        # stringified here — confirm the JSON key type is consistent.
        target_smiles = routes_json[valid_routes[0]]["smiles"]

    # --- Reusable HTML tags (unused `th`/`font_head` removed) ---
    td = '<td style="text-align: left; border: 1px solid black; border-spacing: 0">'
    font_normal = "<font style='font-weight: normal; font-size: 18px'>"
    font_close = "</font>"

    template_begin = f"""
    <!doctype html>
    <html lang="en">
    <head>
        <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css"
              rel="stylesheet"
              integrity="sha384-1BmE4kWBq78iYhFldvKuhfTAU6auU8tT94WrHftjDbrCEXSU1oBoqyl2QvZ6jIW3"
              crossorigin="anonymous">
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <title>SubCluster {group_index}.{cluster_num} Routes Report</title>
        <style>
            /* Optional: Add some basic styling */
            .table {{ border-collapse: collapse; width: 100%; }}
            th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
            tr:nth-child(even) {{ background-color: #ffffff; }}
            caption {{ caption-side: top; font-size: 1.5em; margin: 1em 0; }}
            svg {{ max-width: 100%; height: auto; }}
        </style>
    </head>
    <body>
    <div class="container"> """

    template_end = """
    </div> <script
    src="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/js/bootstrap.bundle.min.js"
    integrity="sha384-ka7Sk0Gln4gmtz2MlQnikT1wXgYsOg+OMhuP+IlRH9sENBO0LRn5q+8nbTov4+1p"
    crossorigin="anonymous">
    </script>
    </body>
    </html>
    """

    # Colored-circle legend marker; "rgb()" is replaced per legend entry below.
    box_mark = """
    <svg width="30" height="30" viewBox="0 0 1 1" xmlns="http://www.w3.org/2000/svg" style="vertical-align: middle; margin-right: 5px;">
        <circle cx="0.5" cy="0.5" r="0.5" fill="rgb()" fill-opacity="0.35" />
    </svg>
    """

    # --- Build HTML Table ---
    table = f"""
    <table class="table table-hover caption-top">
    <caption><h3>Retrosynthetic Routes Report - Cluster {group_index}.{cluster_num}</h3></caption>
    <tbody>"""

    table += (
        f"<tr>{td}{font_normal}Target Molecule: {target_smiles}{font_close}</td></tr>"
    )
    table += f"<tr>{td}{font_normal}Group index: {group_index}{font_close}</td></tr>"
    table += f"<tr>{td}{font_normal}Cluster Number: {cluster_num}{font_close}</td></tr>"
    table += f"<tr>{td}{font_normal}Size of Cluster: {len(valid_routes)} routes{font_close} </td></tr>"

    # --- Add ReducedRouteCGR Image (representative = first valid route) ---
    first_route_id = valid_routes[0] if valid_routes else None

    if first_route_id and sb_cgrs_dict:
        try:
            sb_cgr = sb_cgrs_dict[first_route_id]
            sb_cgr.clean2d()
            sb_cgr_svg = cgr_display(sb_cgr)

            if sb_cgr_svg.strip().startswith("<svg"):
                table += f"<tr>{td}{font_normal}Identified Strategic Bonds{font_close}<br>{sb_cgr_svg}</td></tr>"
            else:
                table += f"<tr>{td}{font_normal}Cluster Representative ReducedRouteCGR (from Route {first_route_id}):{font_close}<br><i>Invalid SVG format retrieved.</i></td></tr>"
                print(
                    f"Warning: Expected SVG for ReducedRouteCGR of node {first_route_id}, but got: {sb_cgr_svg[:100]}..."
                )
        except Exception as e:
            table += f"<tr>{td}{font_normal}Cluster Representative ReducedRouteCGR (from Route {first_route_id}):{font_close}<br><i>Error retrieving/displaying ReducedRouteCGR: {e}</i></td></tr>"
    else:
        if first_route_id:
            table += f"<tr>{td}{font_normal}Cluster Representative ReducedRouteCGR (from Route {first_route_id}):{font_close}<br><i>Not found in provided ReducedRouteCGR dictionary.</i></td></tr>"
        else:
            table += f"<tr>{td}{font_normal}Cluster Representative ReducedRouteCGR:{font_close}<br><i>No valid routes in cluster to select from.</i></td></tr>"

    # --- Synthon pseudo reaction (errors are reported inline, not raised) ---
    try:
        synthon_reaction = subcluster["synthon_reaction"]
        synthon_reaction.clean2d()
        synthon_svg = depict_custom_reaction(synthon_reaction)

        extra_synthon = f"<tr>{td}{font_normal}Synthon pseudo reaction:{font_close}<br>{synthon_svg}</td></tr>"
        table += extra_synthon
    except Exception as e:
        table += f"<tr><td colspan='1' style='color: red;'>Error displaying synthon reaction: {e}</td></tr>"

    # --- Leaving groups table: grouped or per-node depending on `if_lg_group` ---
    try:
        if if_lg_group:
            grouped_lgs = subcluster["group_lgs"]
            lg_table_html = group_lg_table_2_html_fixed(grouped_lgs, if_display=False)
        else:
            lg_table_html = lg_table_2_html(subcluster, if_display=False)
        extra_lg = f"<tr>{td}{font_normal}Leaving Groups table:{font_close}<br>{lg_table_html}</td></tr>"
        table += extra_lg
    except Exception as e:
        table += f"<tr><td colspan='1' style='color: red;'>Error displaying leaving groups: {e}</td></tr>"

    # Legend row: color coding of molecule boxes in the route depictions.
    table += f"""
    <tr>{td}
    <div style="display: flex; align-items: center; flex-wrap: wrap; gap: 15px;">
        <span>{box_mark.replace("rgb()", "rgb(152, 238, 255)")} Target Molecule</span>
        <span>{box_mark.replace("rgb()", "rgb(240, 171, 144)")} Molecule Not In Stock</span>
        <span>{box_mark.replace("rgb()", "rgb(155, 250, 179)")} Molecule In Stock</span>
    </div>
    </td></tr>
    """
    for route_id in valid_routes:
        if using_tree:
            # 1) SVG from Tree
            svg = get_route_svg(tree, route_id)
            # 2) Reaction steps & score
            steps = tree.synthesis_route(route_id)
            score = round(tree.route_score(route_id), 3)
            # build reaction list
            reac_html = "".join(
                f"<b>Step {i+1}:</b> {str(r)}<br>" for i, r in enumerate(steps)
            )
            header = f"Route {route_id} — {len(steps)} steps, score={score}"
            table += f"<tr><td><b>{header}</b></td></tr>"
            table += f"<tr><td>{svg}</td></tr>"
            table += f"<tr><td>{reac_html}</td></tr>"

        else:
            # 1) SVG from JSON
            svg = get_route_svg_from_json(routes_json, route_id)
            # NOTE(review): steps appears to map step index -> reaction; verify
            # against make_dict's output shape.
            steps = routes_dict[route_id]
            reac_html = "".join(
                f"<b>Step {i+1}:</b> {str(r)}<br>" for i, r in steps.items()
            )

            header = f"Route {route_id} — {len(steps)} steps"
            table += f"<tr><td><b>{header}</b></td></tr>"
            table += f"<tr><td>{svg}</td></tr>"
            table += f"<tr><td>{reac_html}</td></tr>"

    table += "</tbody></table>"

    html = template_begin + table + template_end

    if html_path:
        with open(html_path, "w", encoding="utf-8") as f:
            f.write(html)
        return f"Written to {html_path}"
    return html