Andrej Janchevski committed on
Commit
bc4fc5c
·
1 Parent(s): 469d523

feat(kganomaly): add streaming denoising backend with KG-likelihood metric

Browse files

- New endpoints emit Server-Sent Events via api/renderers.py so the
diffusion reverse process streams progress + frame previews instead of
blocking on a single response.
- kg_likelihood.py exposes a per-step mean log-sigmoid score from the
frozen KG embedder + link ranker; kg_anomaly_inference.py logs it on
frame boundaries and attaches kg_log_likelihood / kg_log_likelihood_step
to progress events so the UI can render a "denoising-is-working"
trace alongside step-duration sparklines.
- registry._build_sample_subgraphs now accepts a seed (so each request
gets a different DFS partition), shuffles candidate (row, col) pairs,
and rejects ill-shaped bipartite samples whose halves can't form valid
inpaint quadrants. Sampler.get_context_subgraph_samples_dfs gains the
matching seed parameter in the research code.
- api.yaml + backend README document the new optional progress fields.

docs/api.yaml CHANGED
@@ -1806,3 +1806,14 @@ components:
1806
  total:
1807
  type: integer
1808
  description: Total steps in the stage
 
 
 
 
 
 
 
 
 
 
 
 
1806
  total:
1807
  type: integer
1808
  description: Total steps in the stage
1809
+ kg_log_likelihood:
1810
+ type: number
1811
+ nullable: true
1812
+ description: >
1813
+ Mean log-sigmoid score from the frozen KG embedder + link ranker
1814
+ applied to the edges currently present in the argmax reconstruction.
1815
+ Higher = cleaner. Present only on frame-boundary events.
1816
+ kg_log_likelihood_step:
1817
+ type: integer
1818
+ nullable: true
1819
+ description: Step index that `kg_log_likelihood` corresponds to.
src/backend/README.md CHANGED
@@ -111,6 +111,11 @@ event: progress
111
  data: {"type":"progress","phase":"denoise","step":42,"total_steps":500,"elapsed_ms":2100}
112
  ```
113
 
 
 
 
 
 
114
  **`event: preview`** — base64 PNG of the graph's current state, emitted at key frames:
115
  ```
116
  event: preview
 
111
  data: {"type":"progress","phase":"denoise","step":42,"total_steps":500,"elapsed_ms":2100}
112
  ```
113
 
114
+ KG-anomaly progress events additionally carry an optional `kg_log_likelihood`
115
+ (float) + `kg_log_likelihood_step` (int) on frame boundaries — the mean
116
+ log-sigmoid score from the frozen KG embedder + link ranker on the edges
117
+ currently present in the argmax reconstruction. Higher = cleaner.
118
+
119
  **`event: preview`** — base64 PNG of the graph's current state, emitted at key frames:
120
  ```
121
  event: preview
src/backend/api/renderers.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from rest_framework.renderers import BaseRenderer
2
+
3
+
4
class EventStreamRenderer(BaseRenderer):
    """Renderer advertising ``text/event-stream`` to DRF content negotiation.

    The streaming views hand back a StreamingHttpResponse themselves, so this
    renderer never serializes anything; it exists solely so clients sending
    ``Accept: text/event-stream`` pass DRF's Accept-header negotiation.
    """

    media_type = "text/event-stream"
    format = "sse"
    charset = None
    render_style = "binary"

    def render(self, data, accepted_media_type=None, renderer_context=None):
        # The SSE body comes from the StreamingHttpResponse, never from here.
        return b""
src/backend/api/services/kg_anomaly_inference.py CHANGED
@@ -1,13 +1,27 @@
1
  import base64
 
2
  import io
 
 
 
3
  import time
 
 
 
 
 
4
 
5
  import torch
6
  import torch.nn.functional as F
 
 
 
7
 
8
  from api.services.graphgen_inference import (
9
  _frames_to_gif_b64, _pil_to_b64,
10
  )
 
 
11
 
12
  STATE_BLOB_MAX_BYTES = 10 * 1024 * 1024 # 10 MB
13
  REQUIRED_STATE_KEYS = {
@@ -76,7 +90,7 @@ def build_kg_tensors(subgraph, loader, model):
76
  X_c[0, i] = int(communities[eid])
77
 
78
  n_nodes = torch.tensor([n], dtype=torch.long)
79
- is_bip = torch.tensor([n > 20], dtype=torch.bool)
80
  node_mask = torch.ones(1, n, dtype=torch.bool)
81
 
82
  return {
@@ -90,14 +104,16 @@ def _to_device(t, device):
90
  return t.to(device) if isinstance(t, torch.Tensor) else t
91
 
92
 
93
- def apply_edge_noise(model, tensors, task, noise_level, seed=None):
 
94
  """Forward-diffuse the given subgraph's edges at t = noise_level * T.
95
 
96
  For task="correct", only edges inside the inpaint mask (the second half of
97
  nodes) are noised, matching what the correction endpoint will regenerate.
98
  For task="generate", every edge slot is noised.
99
 
100
- Returns a new list of {source_idx, target_idx, relation_id} dicts.
 
101
  """
102
  from graph_generation.src.utils import get_inpaint_mask
103
  from graph_generation.src.diffusion import diffusion_utils
@@ -137,6 +153,24 @@ def apply_edge_noise(model, tensors, task, noise_level, seed=None):
137
  E_mixed = E_noised * inpaint_mask + E * (~inpaint_mask)
138
  E_int = E_mixed[0].argmax(dim=-1).cpu()
139
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  edges = []
141
  for i in range(n):
142
  for j in range(n):
@@ -145,9 +179,15 @@ def apply_edge_noise(model, tensors, task, noise_level, seed=None):
145
  cls = int(E_int[i, j])
146
  if cls == 0:
147
  continue
148
- edges.append({
149
- "source_idx": i, "target_idx": j, "relation_id": cls - 1,
150
- })
 
 
 
 
 
 
151
  return edges
152
 
153
 
@@ -155,7 +195,7 @@ def apply_edge_noise(model, tensors, task, noise_level, seed=None):
155
  # Change detection
156
  # ---------------------------------------------------------------------------
157
 
158
- def compute_changes(original_E_int, corrected_E_int, num_nodes, loader):
159
  """Compute before/after edge diff for a directed KG subgraph.
160
 
161
  original_E_int / corrected_E_int: 2-D int tensors (n, n) where 0 = no edge
@@ -163,6 +203,13 @@ def compute_changes(original_E_int, corrected_E_int, num_nodes, loader):
163
  """
164
  _, _, inv_relations = loader.dataset.get_inverted_name_maps()
165
 
 
 
 
 
 
 
 
166
  edges = []
167
  summary = {"added": 0, "removed": 0, "modified": 0, "unchanged": 0}
168
 
@@ -182,7 +229,7 @@ def compute_changes(original_E_int, corrected_E_int, num_nodes, loader):
182
  edges.append({
183
  "source_idx": i, "target_idx": j, "change": "unchanged",
184
  "relation_id": c - 1,
185
- "relation_name": str(inv_relations.get(c - 1, c - 1)),
186
  })
187
  continue
188
  if o == 0 and c > 0:
@@ -190,23 +237,23 @@ def compute_changes(original_E_int, corrected_E_int, num_nodes, loader):
190
  edges.append({
191
  "source_idx": i, "target_idx": j, "change": "added",
192
  "relation_id": c - 1,
193
- "relation_name": str(inv_relations.get(c - 1, c - 1)),
194
  })
195
  elif o > 0 and c == 0:
196
  summary["removed"] += 1
197
  edges.append({
198
  "source_idx": i, "target_idx": j, "change": "removed",
199
  "original_relation_id": o - 1,
200
- "original_relation_name": str(inv_relations.get(o - 1, o - 1)),
201
  })
202
  else:
203
  summary["modified"] += 1
204
  edges.append({
205
  "source_idx": i, "target_idx": j, "change": "modified",
206
  "original_relation_id": o - 1,
207
- "original_relation_name": str(inv_relations.get(o - 1, o - 1)),
208
  "relation_id": c - 1,
209
- "relation_name": str(inv_relations.get(c - 1, c - 1)),
210
  })
211
 
212
  return {"edges": edges, "summary": summary}
@@ -216,50 +263,117 @@ def compute_changes(original_E_int, corrected_E_int, num_nodes, loader):
216
  # Rendering
217
  # ---------------------------------------------------------------------------
218
 
219
- def _format_entity_label(dataset_id, name):
220
- s = str(name)
221
- if dataset_id == "freebase":
222
- s = s.replace("/m/", "")
223
- elif dataset_id == "wordnet":
224
- s = s.split(".")[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  else:
226
- if "concept" in s:
227
- parts = s.split(":")
228
- s = parts[-2] if "new" in s and len(parts) >= 2 else parts[-1]
229
- if len(s) > 14:
230
- s = s[:13] + "…"
231
- return s
232
-
233
-
234
- def _format_relation_label(dataset_id, name):
235
- s = str(name)
236
- if dataset_id == "freebase":
237
- parts = s.split(".")
238
- s = ".".join(["_".join(p.split("/")[-2:]) for p in parts])
239
- elif dataset_id == "wordnet":
240
- s = s[1:] if s.startswith("_") else s
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  else:
242
- if "concept" in s:
243
- parts = s.split(":")
244
- s = parts[-2] if "new" in s and len(parts) >= 2 else parts[-1]
245
- if len(s) > 16:
246
- s = s[:15] + "…"
247
- return s
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
 
249
 
250
- def render_kg_subgraph(E_int, num_nodes, X_index, dataset_id, loader, changes=None):
251
  """Render a directed KG subgraph as a PIL image using networkx + PIL.
252
 
253
- Does not use matplotlib (same reason as graphgen_inference: Windows thread safety).
 
 
 
 
 
 
254
  """
255
- import networkx as nx
256
- from PIL import Image, ImageDraw, ImageFont
257
-
258
- inv_nodes, _, inv_relations = loader.dataset.get_inverted_name_maps()
259
 
260
  e = E_int.cpu().tolist()
261
  xi = X_index.cpu().tolist()
262
 
 
 
 
 
 
263
  G = nx.DiGraph()
264
  for i in range(num_nodes):
265
  G.add_node(i)
@@ -270,105 +384,162 @@ def render_kg_subgraph(E_int, num_nodes, X_index, dataset_id, loader, changes=No
270
  if int(e[i][j]) > 0:
271
  G.add_edge(i, j, rel=int(e[i][j]) - 1)
272
 
273
- pos = nx.spring_layout(G, seed=42)
274
-
275
- # Build change lookup: (i, j) -> change_type
276
- change_lookup = {}
277
  if changes is not None:
278
- for entry in changes.get("edges", []):
279
- change_lookup[(entry["source_idx"], entry["target_idx"])] = entry["change"]
 
280
 
281
- size = 500
282
- margin = 50
283
- scale = (size - 2 * margin) / 2
284
- cx, cy = size / 2, size / 2
285
- pixel_pos = {k: (cx + v[0] * scale, cy + v[1] * scale) for k, v in pos.items()}
 
 
 
286
 
 
287
  img = Image.new("RGB", (size, size), "white")
288
  draw = ImageDraw.Draw(img)
 
 
 
 
289
  try:
290
- font = ImageFont.truetype("arial.ttf", 11)
291
- small_font = ImageFont.truetype("arial.ttf", 9)
292
  except (OSError, IOError):
293
  font = ImageFont.load_default()
294
- small_font = font
295
 
296
- node_r = 10
 
 
 
 
 
297
 
298
- # Draw edges first (so nodes overlay them)
299
- # Include "removed" edges from change_lookup even if not in G
300
- all_edges = set((i, j) for i, j in G.edges())
301
- if changes is not None:
302
- for (i, j), ct in change_lookup.items():
303
- if ct == "removed":
304
- all_edges.add((i, j))
305
-
306
- for (i, j) in all_edges:
307
- change_type = change_lookup.get((i, j))
308
- color = CHANGE_COLORS.get(change_type, "#444444") if changes is not None else "#444444"
309
- dashed = (change_type == "removed")
310
- x0, y0 = pixel_pos[i]
311
- x1, y1 = pixel_pos[j]
312
- # Shorten line to not overlap node circles
313
- dx, dy = x1 - x0, y1 - y0
314
- dist = max(1.0, (dx * dx + dy * dy) ** 0.5)
315
- ux, uy = dx / dist, dy / dist
316
- sx, sy = x0 + ux * node_r, y0 + uy * node_r
317
- ex, ey = x1 - ux * node_r, y1 - uy * node_r
318
- if dashed:
319
- _draw_dashed(draw, (sx, sy), (ex, ey), color, width=2, dash=6)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  else:
321
- draw.line([(sx, sy), (ex, ey)], fill=color, width=2)
322
- # Arrowhead
323
- _draw_arrowhead(draw, (ex, ey), (ux, uy), color)
324
- # Relation label
325
- if (i, j) in G.edges():
326
- rel_id = G.edges[(i, j)]["rel"]
327
- rel_name = _format_relation_label(dataset_id, inv_relations.get(rel_id, rel_id))
328
- mx, my = (sx + ex) / 2, (sy + ey) / 2
329
- draw.text((mx + 3, my - 5), rel_name, fill=color, font=small_font)
330
-
331
- # Draw nodes
332
- for i in range(num_nodes):
333
- x, y = pixel_pos[i]
 
 
 
334
  draw.ellipse([x - node_r, y - node_r, x + node_r, y + node_r],
335
- fill="#2ecc71", outline="#1a7a42")
336
- eid = int(xi[i]) if i < len(xi) else i
337
- label = _format_entity_label(dataset_id, inv_nodes.get(eid, eid))
338
- draw.text((x + node_r + 2, y - 6), label, fill="#111111", font=font)
 
 
 
 
 
 
339
 
340
  return img
341
 
342
 
343
- def _draw_arrowhead(draw, tip, direction, color):
344
- import math
345
- ux, uy = direction
346
- angle = math.atan2(uy, ux)
347
- ah_len = 7
348
- ah_angle = math.radians(25)
349
- x, y = tip
350
- x1 = x - ah_len * math.cos(angle - ah_angle)
351
- y1 = y - ah_len * math.sin(angle - ah_angle)
352
- x2 = x - ah_len * math.cos(angle + ah_angle)
353
- y2 = y - ah_len * math.sin(angle + ah_angle)
354
- draw.polygon([(x, y), (x1, y1), (x2, y2)], fill=color)
355
 
356
 
357
- def _draw_dashed(draw, start, end, color, width=2, dash=6):
358
- x0, y0 = start
359
- x1, y1 = end
360
- dx, dy = x1 - x0, y1 - y0
361
- dist = max(1.0, (dx * dx + dy * dy) ** 0.5)
362
- steps = int(dist // dash)
363
- ux, uy = dx / dist, dy / dist
364
- for k in range(steps):
365
- if k % 2 == 1:
366
- continue
367
- sx = x0 + ux * dash * k
368
- sy = y0 + uy * dash * k
369
- ex = x0 + ux * dash * min(k + 1, steps)
370
- ey = y0 + uy * dash * min(k + 1, steps)
371
- draw.line([(sx, sy), (ex, ey)], fill=color, width=width)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
 
373
 
374
  # ---------------------------------------------------------------------------
@@ -406,17 +577,21 @@ def run_standard_correction(model, tensors, dataset_id, task, loader,
406
  E_given = tensors["E_given"].to(device)
407
  y_given = tensors["y_given"].to(device)
408
  X_index = tensors["X_index"].to(device)
 
409
  is_bip = tensors["is_bip"].to(device)
410
  n_nodes = tensors["n_nodes"].to(device)
411
  node_mask = tensors["node_mask"].to(device)
412
  n_max = n_nodes.item()
 
413
 
414
  inpaint_mask = _build_inpaint_mask(
415
  task, node_mask, is_bip, model.Edim_output, device)
 
416
 
417
  original_E_int = E_given[0].argmax(dim=-1).long() # (n, n)
418
  original_img = render_kg_subgraph(
419
- original_E_int, n_max, X_index[0], dataset_id, loader, changes=None)
 
420
 
421
  model_T = model.T
422
  step_stride = max(1, model_T // diffusion_steps)
@@ -451,17 +626,28 @@ def run_standard_correction(model, tensors, dataset_id, task, loader,
451
  }
452
  if is_frame:
453
  frame = render_kg_subgraph(
454
- E_int_prev, n_max, X_index[0], dataset_id, loader)
 
455
  gif_frames.append(frame)
456
  event["preview"] = _pil_to_b64(frame)
 
 
 
 
 
 
 
 
 
457
  yield event
458
 
459
  X_final, E_final = _collapse_final_kg(model, X, E, y, node_mask)
460
 
461
  corrected_E_int = E_final[0]
462
- changes = compute_changes(original_E_int, corrected_E_int, n_max, loader)
463
  corrected_img = render_kg_subgraph(
464
- corrected_E_int, n_max, X_index[0], dataset_id, loader, changes=changes)
 
465
 
466
  elapsed_ms = int((time.time() - t0) * 1000)
467
  yield {
@@ -493,9 +679,11 @@ def run_multiprox_correction_init(model, tensors, dataset_id, task, loader,
493
 
494
  inpaint_mask = _build_inpaint_mask(
495
  task, node_mask, is_bip, model.Edim_output, device)
 
496
  original_E_int = E_given[0].argmax(dim=-1).long()
497
  original_img = render_kg_subgraph(
498
- original_E_int, n_max, X_index[0], dataset_id, loader, changes=None)
 
499
 
500
  t0 = time.time()
501
  # Sample initial noise for each of M Gibbs chains
@@ -524,9 +712,10 @@ def run_multiprox_correction_init(model, tensors, dataset_id, task, loader,
524
  agg_y = torch.median(y_ens.float(), dim=1).values
525
  X_int, E_int = _collapse_final_kg(model, X_given, agg_E, agg_y, node_mask)
526
  corrected_E_int = E_int[0]
527
- changes = compute_changes(original_E_int, corrected_E_int, n_max, loader)
528
  preview_img = render_kg_subgraph(
529
- corrected_E_int, n_max, X_index[0], dataset_id, loader, changes=changes)
 
530
  elapsed_ms = int((time.time() - t0) * 1000)
531
 
532
  state = {
@@ -535,11 +724,13 @@ def run_multiprox_correction_init(model, tensors, dataset_id, task, loader,
535
  "y": y_ens.cpu(),
536
  "n_nodes": n_nodes.cpu(),
537
  "dataset_id": dataset_id,
 
538
  "task": task,
539
  "X_index": X_index.cpu(),
540
  "X_c": X_c.cpu(),
541
  "is_bip": is_bip.cpu(),
542
  "original_E_int": original_E_int.cpu(),
 
543
  "T": model.T, "n": n, "m": m, "t": t, "t_prime": t_prime,
544
  "gibbs_chain_freq": gibbs_chain_freq,
545
  "inner_step": 0, "step": 0,
@@ -562,9 +753,12 @@ def run_multiprox_correction_step(model, state, loader):
562
  E = state["E"].to(device)
563
  y = state["y"].to(device)
564
  X_index = state["X_index"].to(device)
 
565
  is_bip = state["is_bip"].to(device)
566
  n_nodes = state["n_nodes"].to(device)
567
  original_E_int = state["original_E_int"].to(device)
 
 
568
 
569
  T = state["T"]
570
  n = state["n"]
@@ -578,6 +772,7 @@ def run_multiprox_correction_step(model, state, loader):
578
  n_max = int(n_nodes.item())
579
  node_mask = torch.ones(1, n_max, dtype=torch.bool, device=device)
580
  inpaint_mask = _build_inpaint_mask(task, node_mask, is_bip, model.Edim_output, device)
 
581
 
582
  fixed_t_norm = t * torch.ones((1, 1), dtype=torch.float, device=device)
583
  fixed_s_norm = fixed_t_norm - (1.0 / T)
@@ -608,8 +803,9 @@ def run_multiprox_correction_step(model, state, loader):
608
  prev_y = torch.median(y.float(), dim=1).values
609
  _, prev_Ei = _collapse_final_kg(model, X_given, prev_E, prev_y, node_mask)
610
  preview_img = render_kg_subgraph(
611
- prev_Ei[0], n_max, X_index[0], dataset_id, loader)
612
- yield {
 
613
  "type": "progress",
614
  "phase": "gibbs",
615
  "step": i + 1,
@@ -617,6 +813,16 @@ def run_multiprox_correction_step(model, state, loader):
617
  "elapsed_ms": int((time.time() - t0) * 1000),
618
  "preview": _pil_to_b64(preview_img),
619
  }
 
 
 
 
 
 
 
 
 
 
620
 
621
  new_inner_step = inner_step + steps_this_call
622
  round_complete = new_inner_step >= m
@@ -649,21 +855,34 @@ def run_multiprox_correction_step(model, state, loader):
649
  "elapsed_ms": int((time.time() - t0) * 1000),
650
  }
651
  if is_frame:
 
652
  event["preview"] = _pil_to_b64(render_kg_subgraph(
653
- discrete_s.E[0].long(), n_max, X_index[0], dataset_id, loader))
 
 
 
 
 
 
 
 
 
 
654
  yield event
655
 
656
  X_int, E_int = _collapse_final_kg(model, cur_X, cur_E, cur_y, node_mask)
657
 
658
  corrected_E_int = E_int[0]
659
- changes = compute_changes(original_E_int, corrected_E_int, n_max, loader)
660
  corrected_img = render_kg_subgraph(
661
- corrected_E_int, n_max, X_index[0], dataset_id, loader, changes=changes)
 
662
  elapsed_ms = int((time.time() - t0) * 1000)
663
 
664
  updated_state = {
665
  **state,
666
  "E": E.cpu(), "y": y.cpu(),
 
667
  "step": new_step, "inner_step": new_inner_step,
668
  }
669
  yield {
 
1
  import base64
2
+ import faulthandler
3
  import io
4
+ import logging
5
+ import math
6
+ import sys
7
  import time
8
+ import traceback
9
+
10
+ faulthandler.enable(file=sys.stderr, all_threads=True)
11
+
12
+ logger = logging.getLogger(__name__)
13
 
14
  import torch
15
  import torch.nn.functional as F
16
+ import numpy as np
17
+ import networkx as nx
18
+ from PIL import Image, ImageDraw, ImageFont
19
 
20
  from api.services.graphgen_inference import (
21
  _frames_to_gif_b64, _pil_to_b64,
22
  )
23
+ from api.services.kg_likelihood import kg_edge_log_likelihood
24
+ from api.utils import clean_entity_name, clean_relation_name
25
 
26
  STATE_BLOB_MAX_BYTES = 10 * 1024 * 1024 # 10 MB
27
  REQUIRED_STATE_KEYS = {
 
90
  X_c[0, i] = int(communities[eid])
91
 
92
  n_nodes = torch.tensor([n], dtype=torch.long)
93
+ is_bip = torch.tensor([bool(subgraph.get("is_bip", False))], dtype=torch.bool)
94
  node_mask = torch.ones(1, n, dtype=torch.bool)
95
 
96
  return {
 
104
  return t.to(device) if isinstance(t, torch.Tensor) else t
105
 
106
 
107
+ def apply_edge_noise(model, tensors, task, noise_level, seed=None,
108
+ loader=None, dataset_id=None, nodes=None):
109
  """Forward-diffuse the given subgraph's edges at t = noise_level * T.
110
 
111
  For task="correct", only edges inside the inpaint mask (the second half of
112
  nodes) are noised, matching what the correction endpoint will regenerate.
113
  For task="generate", every edge slot is noised.
114
 
115
+ Returns a new list of edge dicts enriched with cleaned relation/entity
116
+ names when ``loader``/``dataset_id``/``nodes`` are supplied.
117
  """
118
  from graph_generation.src.utils import get_inpaint_mask
119
  from graph_generation.src.diffusion import diffusion_utils
 
153
  E_mixed = E_noised * inpaint_mask + E * (~inpaint_mask)
154
  E_int = E_mixed[0].argmax(dim=-1).cpu()
155
 
156
+ inv_relations = None
157
+ if loader is not None and dataset_id is not None:
158
+ _, _, inv_relations = loader.dataset.get_inverted_name_maps()
159
+
160
+ def node_name(idx):
161
+ if nodes is not None and 0 <= idx < len(nodes):
162
+ return nodes[idx].get("entity_name") or f"#{nodes[idx].get('entity_id', idx)}"
163
+ return f"#{idx}"
164
+
165
+ def relation_label(rid):
166
+ if inv_relations is None:
167
+ return None
168
+ raw = inv_relations.get(rid)
169
+ if raw is None or raw != raw or str(raw).strip() == "":
170
+ return f"rel#{rid}"
171
+ cleaned = clean_relation_name(str(raw), dataset_id)
172
+ return cleaned if cleaned else f"rel#{rid}"
173
+
174
  edges = []
175
  for i in range(n):
176
  for j in range(n):
 
179
  cls = int(E_int[i, j])
180
  if cls == 0:
181
  continue
182
+ rel_id = cls - 1
183
+ edge = {
184
+ "source_idx": i, "target_idx": j, "relation_id": rel_id,
185
+ }
186
+ if inv_relations is not None:
187
+ edge["relation_name"] = relation_label(rel_id)
188
+ edge["entity_name_source"] = node_name(i)
189
+ edge["entity_name_target"] = node_name(j)
190
+ edges.append(edge)
191
  return edges
192
 
193
 
 
195
  # Change detection
196
  # ---------------------------------------------------------------------------
197
 
198
+ def compute_changes(original_E_int, corrected_E_int, num_nodes, loader, dataset_id):
199
  """Compute before/after edge diff for a directed KG subgraph.
200
 
201
  original_E_int / corrected_E_int: 2-D int tensors (n, n) where 0 = no edge
 
203
  """
204
  _, _, inv_relations = loader.dataset.get_inverted_name_maps()
205
 
206
+ def rel_name(idx):
207
+ raw = inv_relations.get(idx)
208
+ if raw is None or raw != raw or str(raw).strip() == "":
209
+ return f"rel#{idx}"
210
+ cleaned = clean_relation_name(str(raw), dataset_id)
211
+ return cleaned if cleaned else f"rel#{idx}"
212
+
213
  edges = []
214
  summary = {"added": 0, "removed": 0, "modified": 0, "unchanged": 0}
215
 
 
229
  edges.append({
230
  "source_idx": i, "target_idx": j, "change": "unchanged",
231
  "relation_id": c - 1,
232
+ "relation_name": rel_name(c - 1),
233
  })
234
  continue
235
  if o == 0 and c > 0:
 
237
  edges.append({
238
  "source_idx": i, "target_idx": j, "change": "added",
239
  "relation_id": c - 1,
240
+ "relation_name": rel_name(c - 1),
241
  })
242
  elif o > 0 and c == 0:
243
  summary["removed"] += 1
244
  edges.append({
245
  "source_idx": i, "target_idx": j, "change": "removed",
246
  "original_relation_id": o - 1,
247
+ "original_relation_name": rel_name(o - 1),
248
  })
249
  else:
250
  summary["modified"] += 1
251
  edges.append({
252
  "source_idx": i, "target_idx": j, "change": "modified",
253
  "original_relation_id": o - 1,
254
+ "original_relation_name": rel_name(o - 1),
255
  "relation_id": c - 1,
256
+ "relation_name": rel_name(c - 1),
257
  })
258
 
259
  return {"edges": edges, "summary": summary}
 
263
  # Rendering
264
  # ---------------------------------------------------------------------------
265
 
266
+ def _truncate_label(s, limit):
267
+ s = str(s)
268
+ return s if len(s) <= limit else s[: limit - 1] + ""
269
+
270
+
271
+ # Green-palette anchors matching the site's primary colour: pale mint -> vivid
272
+ # green -> deep forest. Used to colour nodes by the normalized-Laplacian
273
+ # eigenvector.
274
+ _GREEN_LOW = (212, 237, 218)
275
+ _GREEN_MID = (82, 180, 120)
276
+ _GREEN_HIGH = (22, 80, 50)
277
+
278
+
279
+ def _green_rgb(t):
280
+ """Map t in [-1, 1] to an (r, g, b) tuple on a three-anchor green gradient."""
281
+ t = max(-1.0, min(1.0, float(t)))
282
+ if t < 0:
283
+ w = t + 1.0 # 0 at -1, 1 at 0
284
+ a, b = _GREEN_LOW, _GREEN_MID
285
  else:
286
+ w = t # 0 at 0, 1 at 1
287
+ a, b = _GREEN_MID, _GREEN_HIGH
288
+ return (
289
+ int(a[0] + (b[0] - a[0]) * w),
290
+ int(a[1] + (b[1] - a[1]) * w),
291
+ int(a[2] + (b[2] - a[2]) * w),
292
+ )
293
+
294
+
295
+ def _quad_bezier(p0, p1, p2, steps=20):
296
+ out = []
297
+ for k in range(steps + 1):
298
+ t = k / steps
299
+ u = 1 - t
300
+ x = u * u * p0[0] + 2 * u * t * p1[0] + t * t * p2[0]
301
+ y = u * u * p0[1] + 2 * u * t * p1[1] + t * t * p2[1]
302
+ out.append((x, y))
303
+ return out
304
+
305
+
306
+ def _draw_arrowhead(draw, tip, direction, color):
307
+ ux, uy = direction
308
+ angle = math.atan2(uy, ux)
309
+ ah_len = 9
310
+ ah_angle = math.radians(25)
311
+ x, y = tip
312
+ x1 = x - ah_len * math.cos(angle - ah_angle)
313
+ y1 = y - ah_len * math.sin(angle - ah_angle)
314
+ x2 = x - ah_len * math.cos(angle + ah_angle)
315
+ y2 = y - ah_len * math.sin(angle + ah_angle)
316
+ draw.polygon([(x, y), (x1, y1), (x2, y2)], fill=color)
317
+
318
+
319
def _draw_curve(draw, p0, p2, color, width=2, dashed=False, rad=0.2):
    """Draw a quadratic Bézier from p0 to p2 curved by `rad` (fraction of chord length)."""
    mid_x = (p0[0] + p2[0]) / 2
    mid_y = (p0[1] + p2[1]) / 2
    dx, dy = p2[0] - p0[0], p2[1] - p0[1]
    # Control point sits perpendicular to the chord, offset by rad * chord length.
    perp_x, perp_y = -dy, dx
    perp_len = max(1.0, math.hypot(perp_x, perp_y))
    ctrl = (mid_x + perp_x / perp_len * rad * math.hypot(dx, dy),
            mid_y + perp_y / perp_len * rad * math.hypot(dx, dy))
    pts = _quad_bezier(p0, ctrl, p2, steps=24)
    if dashed:
        # Draw every other segment only, producing a dashed appearance.
        for k in range(0, len(pts) - 1, 2):
            draw.line([pts[k], pts[k + 1]], fill=color, width=width)
    else:
        draw.line(pts, fill=color, width=width, joint="curve")
    # Unit tangent at p2 (caller uses it to orient the arrowhead). Note the
    # max(1.0, ...) clamp mirrors the chord handling: sub-pixel segments are
    # not normalized to unit length.
    prev_x, prev_y = pts[-2]
    tx, ty = p2[0] - prev_x, p2[1] - prev_y
    tlen = max(1.0, math.hypot(tx, ty))
    return (tx / tlen, ty / tlen)
339
+
340
+
341
+ def _bipartite_layout(row_nodes, col_nodes):
342
+ """Two-column layout: row nodes on the left, col nodes on the right, evenly spaced."""
343
+ pos = {}
344
+
345
+ def place(nodes, x):
346
+ k = len(nodes)
347
+ for i, n in enumerate(nodes):
348
+ y = 1.0 - 2.0 * (i + 1) / (k + 1) # evenly spaced in [-1, 1], top-down
349
+ pos[n] = (x, y)
350
+
351
+ place(row_nodes, -1.0)
352
+ place(col_nodes, 1.0)
353
+ return pos
354
 
355
 
356
+ def render_kg_subgraph(E_int, num_nodes, X_index, dataset_id, loader, changes=None, is_bip=False):
357
  """Render a directed KG subgraph as a PIL image using networkx + PIL.
358
 
359
+ Ports the improvements from KnowledgeGraphVisualization.visualize_non_molecule
360
+ without importing matplotlib (which conflicts with torch on Windows): isolated
361
+ singleton filter, spring_layout(k=1), green-gradient node colouring from the
362
+ normalized-Laplacian eigenvector, and curved edges. When `changes` is
363
+ provided, per-edge CHANGE_COLORS override the default grey and "removed"
364
+ edges are drawn dashed. When `is_bip`, uses a two-column layout that
365
+ separates the row/col partitions and visually marks the inpaint quadrants.
366
  """
367
+ inv_nodes, _, _ = loader.dataset.get_inverted_name_maps()
 
 
 
368
 
369
  e = E_int.cpu().tolist()
370
  xi = X_index.cpu().tolist()
371
 
372
+ change_lookup = {}
373
+ if changes is not None:
374
+ for entry in changes.get("edges", []):
375
+ change_lookup[(entry["source_idx"], entry["target_idx"])] = entry["change"]
376
+
377
  G = nx.DiGraph()
378
  for i in range(num_nodes):
379
  G.add_node(i)
 
384
  if int(e[i][j]) > 0:
385
  G.add_edge(i, j, rel=int(e[i][j]) - 1)
386
 
 
 
 
 
387
  if changes is not None:
388
+ for (i, j), ct in change_lookup.items():
389
+ if ct == "removed" and not G.has_edge(i, j):
390
+ G.add_edge(i, j, rel=None)
391
 
392
+ # Bipartite: keep every node so the row/col structure stays visible.
393
+ # Community: drop isolated singletons so the spring layout focuses on structure.
394
+ if is_bip:
395
+ graph = G.copy()
396
+ else:
397
+ components = [G.subgraph(c).copy() for c in nx.connected_components(G.to_undirected())]
398
+ components = [c for c in components if c.number_of_nodes() > 1]
399
+ graph = nx.compose_all(components) if components else G
400
 
401
+ size = 520
402
  img = Image.new("RGB", (size, size), "white")
403
  draw = ImageDraw.Draw(img)
404
+
405
+ if graph.number_of_nodes() == 0:
406
+ return img
407
+
408
  try:
409
+ font = ImageFont.truetype("arial.ttf", 12)
 
410
  except (OSError, IOError):
411
  font = ImageFont.load_default()
 
412
 
413
+ if is_bip:
414
+ row_nodes = [n for n in graph.nodes() if n < num_nodes // 2]
415
+ col_nodes = [n for n in graph.nodes() if n >= num_nodes // 2]
416
+ pos = _bipartite_layout(row_nodes, col_nodes)
417
+ else:
418
+ pos = nx.spring_layout(graph, k=1, iterations=100, seed=42)
419
 
420
+ # Normalized Laplacian eigenvector for node colouring. Use torch.linalg.eigh
421
+ # rather than numpy.linalg.eigh on Windows, numpy's MKL DLLs conflict with
422
+ # torch's (Windows code 0xc06d007f), and torch is already healthy in-process.
423
+ try:
424
+ L = nx.normalized_laplacian_matrix(graph.to_undirected()).toarray()
425
+ L_t = torch.from_numpy(L).to(torch.float64)
426
+ _, U_t = torch.linalg.eigh(L_t)
427
+ U = U_t.numpy()
428
+ eigen_dim = 1 if U.shape[1] > 1 else 0
429
+ vec = U[:, eigen_dim]
430
+ m_abs = max(abs(vec.min()), abs(vec.max()), 1e-9)
431
+ vec_norm = vec / m_abs # now in [-1, 1]
432
+ except Exception:
433
+ logger.warning(
434
+ "eigenvector colouring failed; using flat colour:\n%s",
435
+ traceback.format_exc(),
436
+ )
437
+ vec_norm = np.zeros(graph.number_of_nodes())
438
+
439
+ node_list = list(graph.nodes())
440
+ node_color = {n: _green_rgb(vec_norm[k]) for k, n in enumerate(node_list)}
441
+
442
+ xs, ys = zip(*pos.values())
443
+ x_min, x_max = min(xs), max(xs)
444
+ y_min, y_max = min(ys), max(ys)
445
+ x_span = (x_max - x_min) or 1.0
446
+ y_span = (y_max - y_min) or 1.0
447
+ margin = 55
448
+ scale = (size - 2 * margin) / max(x_span, y_span)
449
+ cx = size / 2 - (x_min + x_span / 2) * scale
450
+ cy = size / 2 + (y_min + y_span / 2) * scale # flip y so "up" is up
451
+
452
+ def to_px(p):
453
+ return (cx + p[0] * scale, cy - p[1] * scale)
454
+
455
+ pixel_pos = {n: to_px(pos[n]) for n in graph.nodes()}
456
+
457
+ node_r = 12
458
+
459
+ # Edges first so nodes overlay them
460
+ for (i, j) in graph.edges():
461
+ ct = change_lookup.get((i, j))
462
+ dashed = (ct == "removed")
463
+ if ct is not None:
464
+ color = CHANGE_COLORS.get(ct, "#6b6b6b")
465
  else:
466
+ color = "#6b6b6b"
467
+ p0 = pixel_pos[i]
468
+ p2 = pixel_pos[j]
469
+ # Shorten endpoints so curve doesn't overlap node circles
470
+ dx, dy = p2[0] - p0[0], p2[1] - p0[1]
471
+ dist = max(1.0, math.hypot(dx, dy))
472
+ ux, uy = dx / dist, dy / dist
473
+ sx, sy = p0[0] + ux * node_r, p0[1] + uy * node_r
474
+ ex, ey = p2[0] - ux * node_r, p2[1] - uy * node_r
475
+ tangent = _draw_curve(draw, (sx, sy), (ex, ey), color,
476
+ width=2, dashed=dashed, rad=0.2)
477
+ _draw_arrowhead(draw, (ex, ey), tangent, color)
478
+
479
+ for n in graph.nodes():
480
+ x, y = pixel_pos[n]
481
+ r, g, b = node_color[n]
482
  draw.ellipse([x - node_r, y - node_r, x + node_r, y + node_r],
483
+ fill=(r, g, b), outline="#333333", width=1)
484
+ eid = int(xi[n]) if n < len(xi) else n
485
+ raw = inv_nodes.get(eid)
486
+ if raw is None or raw != raw or str(raw).strip() == "":
487
+ label = f"#{eid}"
488
+ else:
489
+ cleaned = clean_entity_name(str(raw), dataset_id)
490
+ label = cleaned if cleaned else f"#{eid}"
491
+ label = _truncate_label(label, 20)
492
+ _draw_text_with_bg(draw, (x + node_r + 3, y - 7), label, font, fill="#111111")
493
 
494
  return img
495
 
496
 
497
+ def _draw_text_with_bg(draw, xy, text, font, fill):
498
+ """Draw text with a semi-opaque white pad so labels stay readable over lines."""
499
+ try:
500
+ bbox = draw.textbbox(xy, text, font=font)
501
+ pad = 1
502
+ draw.rectangle(
503
+ [bbox[0] - pad, bbox[1] - pad, bbox[2] + pad, bbox[3] + pad],
504
+ fill="white",
505
+ )
506
+ except Exception:
507
+ pass
508
+ draw.text(xy, text, font=font, fill=fill)
509
 
510
 
511
def render_sample_subgraph_b64(subgraph, loader, dataset_id):
    """Render a sample subgraph dict (API payload shape) to a PNG data URI.

    Delegates to the same renderer used for inference outputs so thumbnails
    stay visually consistent with the before/after images produced later.
    Returns ``None`` for an empty subgraph; logs and re-raises on failure.
    """
    node_list = subgraph["nodes"]
    num = len(node_list)
    if num == 0:
        return None

    try:
        adjacency = torch.zeros(num, num, dtype=torch.long)
        for edge in subgraph["edges"]:
            src_i = int(edge["source_idx"])
            tgt_i = int(edge["target_idx"])
            rel_id = int(edge["relation_id"])
            if 0 <= src_i < num and 0 <= tgt_i < num:
                # Class 0 is reserved for "no edge", hence the +1 shift.
                adjacency[src_i, tgt_i] = rel_id + 1

        entity_ids = torch.tensor(
            [int(nd["entity_id"]) for nd in node_list], dtype=torch.long)
        bipartite = bool(subgraph.get("is_bip", False))
        image = render_kg_subgraph(
            adjacency, num, entity_ids, dataset_id, loader,
            changes=None, is_bip=bipartite)
        return _pil_to_b64(image)
    except Exception:
        logger.error(
            "render_sample_subgraph_b64 failed: dataset=%s n=%d\n%s",
            dataset_id, num, traceback.format_exc(),
        )
        raise
543
 
544
 
545
  # ---------------------------------------------------------------------------
 
577
  E_given = tensors["E_given"].to(device)
578
  y_given = tensors["y_given"].to(device)
579
  X_index = tensors["X_index"].to(device)
580
+ X_c = tensors["X_c"].to(device)
581
  is_bip = tensors["is_bip"].to(device)
582
  n_nodes = tensors["n_nodes"].to(device)
583
  node_mask = tensors["node_mask"].to(device)
584
  n_max = n_nodes.item()
585
+ kg_experiment = getattr(model, "kg_experiment", None)
586
 
587
  inpaint_mask = _build_inpaint_mask(
588
  task, node_mask, is_bip, model.Edim_output, device)
589
+ is_bip_bool = bool(is_bip.item())
590
 
591
  original_E_int = E_given[0].argmax(dim=-1).long() # (n, n)
592
  original_img = render_kg_subgraph(
593
+ original_E_int, n_max, X_index[0], dataset_id, loader,
594
+ changes=None, is_bip=is_bip_bool)
595
 
596
  model_T = model.T
597
  step_stride = max(1, model_T // diffusion_steps)
 
626
  }
627
  if is_frame:
628
  frame = render_kg_subgraph(
629
+ E_int_prev, n_max, X_index[0], dataset_id, loader,
630
+ is_bip=is_bip_bool)
631
  gif_frames.append(frame)
632
  event["preview"] = _pil_to_b64(frame)
633
+ if kg_experiment is not None:
634
+ ll = kg_edge_log_likelihood(
635
+ E_int_prev, X_given[0], X_index[0], X_c[0], kg_experiment)
636
+ if ll is not None:
637
+ event["kg_log_likelihood"] = ll
638
+ event["kg_log_likelihood_step"] = emitted
639
+ logger.info(
640
+ "[kg-anomaly] denoise step=%d/%d kg_log_lik=%.4f",
641
+ emitted, total_loop_steps, ll)
642
  yield event
643
 
644
  X_final, E_final = _collapse_final_kg(model, X, E, y, node_mask)
645
 
646
  corrected_E_int = E_final[0]
647
+ changes = compute_changes(original_E_int, corrected_E_int, n_max, loader, dataset_id)
648
  corrected_img = render_kg_subgraph(
649
+ corrected_E_int, n_max, X_index[0], dataset_id, loader,
650
+ changes=changes, is_bip=is_bip_bool)
651
 
652
  elapsed_ms = int((time.time() - t0) * 1000)
653
  yield {
 
679
 
680
  inpaint_mask = _build_inpaint_mask(
681
  task, node_mask, is_bip, model.Edim_output, device)
682
+ is_bip_bool = bool(is_bip.item())
683
  original_E_int = E_given[0].argmax(dim=-1).long()
684
  original_img = render_kg_subgraph(
685
+ original_E_int, n_max, X_index[0], dataset_id, loader,
686
+ changes=None, is_bip=is_bip_bool)
687
 
688
  t0 = time.time()
689
  # Sample initial noise for each of M Gibbs chains
 
712
  agg_y = torch.median(y_ens.float(), dim=1).values
713
  X_int, E_int = _collapse_final_kg(model, X_given, agg_E, agg_y, node_mask)
714
  corrected_E_int = E_int[0]
715
+ changes = compute_changes(original_E_int, corrected_E_int, n_max, loader, dataset_id)
716
  preview_img = render_kg_subgraph(
717
+ corrected_E_int, n_max, X_index[0], dataset_id, loader,
718
+ changes=changes, is_bip=is_bip_bool)
719
  elapsed_ms = int((time.time() - t0) * 1000)
720
 
721
  state = {
 
724
  "y": y_ens.cpu(),
725
  "n_nodes": n_nodes.cpu(),
726
  "dataset_id": dataset_id,
727
+ "is_bip": bool(is_bip_bool),
728
  "task": task,
729
  "X_index": X_index.cpu(),
730
  "X_c": X_c.cpu(),
731
  "is_bip": is_bip.cpu(),
732
  "original_E_int": original_E_int.cpu(),
733
+ "prev_E_int": corrected_E_int.cpu(),
734
  "T": model.T, "n": n, "m": m, "t": t, "t_prime": t_prime,
735
  "gibbs_chain_freq": gibbs_chain_freq,
736
  "inner_step": 0, "step": 0,
 
753
  E = state["E"].to(device)
754
  y = state["y"].to(device)
755
  X_index = state["X_index"].to(device)
756
+ X_c = state["X_c"].to(device)
757
  is_bip = state["is_bip"].to(device)
758
  n_nodes = state["n_nodes"].to(device)
759
  original_E_int = state["original_E_int"].to(device)
760
+ prev_E_int = state.get("prev_E_int", state["original_E_int"]).to(device)
761
+ kg_experiment = getattr(model, "kg_experiment", None)
762
 
763
  T = state["T"]
764
  n = state["n"]
 
772
  n_max = int(n_nodes.item())
773
  node_mask = torch.ones(1, n_max, dtype=torch.bool, device=device)
774
  inpaint_mask = _build_inpaint_mask(task, node_mask, is_bip, model.Edim_output, device)
775
+ is_bip_bool = bool(is_bip.item())
776
 
777
  fixed_t_norm = t * torch.ones((1, 1), dtype=torch.float, device=device)
778
  fixed_s_norm = fixed_t_norm - (1.0 / T)
 
803
  prev_y = torch.median(y.float(), dim=1).values
804
  _, prev_Ei = _collapse_final_kg(model, X_given, prev_E, prev_y, node_mask)
805
  preview_img = render_kg_subgraph(
806
+ prev_Ei[0], n_max, X_index[0], dataset_id, loader,
807
+ is_bip=is_bip_bool)
808
+ event = {
809
  "type": "progress",
810
  "phase": "gibbs",
811
  "step": i + 1,
 
813
  "elapsed_ms": int((time.time() - t0) * 1000),
814
  "preview": _pil_to_b64(preview_img),
815
  }
816
+ if kg_experiment is not None:
817
+ ll = kg_edge_log_likelihood(
818
+ prev_Ei[0], X_given[0], X_index[0], X_c[0], kg_experiment)
819
+ if ll is not None:
820
+ event["kg_log_likelihood"] = ll
821
+ event["kg_log_likelihood_step"] = i + 1
822
+ logger.info(
823
+ "[kg-anomaly] gibbs step=%d/%d kg_log_lik=%.4f",
824
+ i + 1, steps_this_call, ll)
825
+ yield event
826
 
827
  new_inner_step = inner_step + steps_this_call
828
  round_complete = new_inner_step >= m
 
855
  "elapsed_ms": int((time.time() - t0) * 1000),
856
  }
857
  if is_frame:
858
+ refine_E_int = discrete_s.E[0].long()
859
  event["preview"] = _pil_to_b64(render_kg_subgraph(
860
+ refine_E_int, n_max, X_index[0], dataset_id, loader,
861
+ is_bip=is_bip_bool))
862
+ if kg_experiment is not None:
863
+ ll = kg_edge_log_likelihood(
864
+ refine_E_int, X_given[0], X_index[0], X_c[0], kg_experiment)
865
+ if ll is not None:
866
+ event["kg_log_likelihood"] = ll
867
+ event["kg_log_likelihood_step"] = j + 1
868
+ logger.info(
869
+ "[kg-anomaly] refine step=%d/%d kg_log_lik=%.4f",
870
+ j + 1, P, ll)
871
  yield event
872
 
873
  X_int, E_int = _collapse_final_kg(model, cur_X, cur_E, cur_y, node_mask)
874
 
875
  corrected_E_int = E_int[0]
876
+ changes = compute_changes(prev_E_int, corrected_E_int, n_max, loader, dataset_id)
877
  corrected_img = render_kg_subgraph(
878
+ corrected_E_int, n_max, X_index[0], dataset_id, loader,
879
+ changes=changes, is_bip=is_bip_bool)
880
  elapsed_ms = int((time.time() - t0) * 1000)
881
 
882
  updated_state = {
883
  **state,
884
  "E": E.cpu(), "y": y.cpu(),
885
+ "prev_E_int": corrected_E_int.cpu(),
886
  "step": new_step, "inner_step": new_inner_step,
887
  }
888
  yield {
src/backend/api/services/kg_likelihood.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Per-step KG link-prediction log-likelihood for the denoising loop.
2
+
3
+ Wraps the math of `KGLikelihoodMetric.update` (see research/COINs-KGGeneration
4
+ .../metrics/abstract_metrics.py) in a one-shot, stateless helper. We query
5
+ the frozen KG embedder + link ranker on the edges currently present in the
6
+ argmax reconstruction and return their mean log-sigmoid score — a positive
7
+ higher-is-better value that rises as the graph becomes cleaner.
8
+ """
9
+
10
+ import logging
11
+
12
+ import torch
13
+ from torch.nn.functional import logsigmoid, one_hot
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
def kg_edge_log_likelihood(E_int, X, X_index, X_c, kg_experiment):
    """Mean log-sigmoid link-ranker score over the edges currently present.

    Args:
        E_int: (n, n) long tensor. 0 = no edge; otherwise class = relation_id + 1.
        X: (n, num_node_types) one-hot node types (unbatched, float).
        X_index: (n,) long dataset-global entity ids (unbatched).
        X_c: (n,) long community ids (unbatched).
        kg_experiment: COINs experiment exposing .embedder, .link_ranker,
            .loader.num_relations, .mini_batch_size, .device.

    Returns:
        A Python float — the per-edge mean log-sigmoid score. Log-sigmoid is
        always <= 0; the value is higher (closer to 0) the cleaner the graph.
        Returns None when no off-diagonal edges are present or when the scoring
        pass fails for any reason, including when the COINs research package
        is unavailable.
    """
    try:
        # Lazy imports *inside* the guard: a missing or broken research
        # package degrades to "no metric" (None) instead of raising into the
        # streaming loop — matching the "fails for any reason" contract.
        from graph_completion.graphs.preprocess import QueryData
        from graph_completion.graphs.queries import Query

        embedder = kg_experiment.embedder
        link_ranker = kg_experiment.link_ranker.link_ranker
        num_relations = kg_experiment.loader.num_relations
        kg_device = kg_experiment.device
        mini_batch_size = kg_experiment.mini_batch_size

        # Off-diagonal edges only: self-loops are reconstruction artifacts
        # with no link-prediction meaning.
        nz = E_int.nonzero(as_tuple=False)
        if nz.numel() == 0:
            return None
        nz = nz[nz[:, 0] != nz[:, 1]]
        if nz.numel() == 0:
            return None
        s, t = nz[:, 0], nz[:, 1]
        r = E_int[s, t] - 1  # undo the "relation_id + 1" edge-class shift

        e_s, e_t = X_index[s].long(), X_index[t].long()
        x_s, x_t = X[s].float(), X[t].float()
        c_s, c_t = X_c[s].long(), X_c[t].long()

        # Group edges so identical community pairs are contiguous — the
        # embedder batches by community pair. Sorting by c_s and then
        # stable-sorting by c_t yields (c_t, c_s)-primary order, which groups
        # equal pairs just as well as a (c_s, c_t) lexicographic sort would.
        s_sort = torch.argsort(c_s)
        t_sort = torch.sort(c_t[s_sort], stable=True).indices

        def pick(v):
            return v[s_sort][t_sort]

        e = [pick(e_s), pick(e_t)]
        x = [pick(x_s), pick(x_t)]
        c = [pick(c_s), pick(c_t)]
        r = pick(r)
        edge_attr = [one_hot(r, num_relations + 1).float()]

        q = Query("1p")
        q.build_query_tree()
        query_data = QueryData(q, e=e, x=x, c=c, edge_attr=edge_attr).to(kg_device)

        scores = []
        with torch.no_grad():
            for qd_batch in query_data.batch_split(mini_batch_size):
                q_emb, a_emb = embedder(qd_batch)
                scores.append(link_ranker(q_emb, a_emb))
        scores = torch.cat(scores, dim=0).view(-1)
        if scores.numel() == 0:
            return None
        return float(logsigmoid(scores).mean().item())
    except Exception as exc:
        logging.getLogger(__name__).warning("[kg-likelihood] skipped: %s", exc)
        return None
src/backend/api/services/registry.py CHANGED
@@ -706,28 +706,73 @@ class ModelRegistry:
706
  except Exception:
707
  logger.exception("Failed to generate sample subgraphs for %s", dataset_id)
708
 
709
- def _build_sample_subgraphs(self, dataset_id, loader, num_subgraphs=20, max_graph_size=10):
710
- """Build sample subgraphs using the Sampler's DFS-based context subgraph partitioning."""
 
 
 
 
 
 
711
  inv_nodes, _, inv_relations = loader.dataset.get_inverted_name_maps()
712
  node_types = loader.dataset.node_data.type.values
713
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
714
  # Use the Sampler's DFS partitioning to get context subgraphs
715
  samples = loader.sampler.get_context_subgraph_samples_dfs(
716
  max_graph_size, loader.graph_indexes, loader.num_nodes,
717
- max_samples=num_subgraphs * 5, disable_tqdm=True,
718
  )
719
 
 
 
 
 
 
 
 
 
 
 
 
720
  subgraphs = []
721
  for subgraph_row, subgraph_col, nodes_row, nodes_col, edges in samples:
722
  if len(subgraphs) >= num_subgraphs:
723
  break
 
 
724
  if len(edges) < 3:
725
  continue
726
 
727
- if subgraph_row == subgraph_col:
728
- sg_nodes = nodes_row
729
- else:
 
 
 
 
 
 
730
  sg_nodes = nodes_row + nodes_col
 
 
 
 
 
 
731
 
732
  node_idx = {n: i for i, n in enumerate(sg_nodes)}
733
 
@@ -736,7 +781,7 @@ class ModelRegistry:
736
  type_id = int(node_types[n]) if n < len(node_types) else 0
737
  nodes.append({
738
  "entity_id": n,
739
- "entity_name": str(inv_nodes.get(n, n)),
740
  "type_id": type_id,
741
  })
742
 
@@ -747,18 +792,23 @@ class ModelRegistry:
747
  "source_idx": node_idx[h],
748
  "target_idx": node_idx[t],
749
  "relation_id": r,
750
- "relation_name": str(inv_relations.get(r, r)),
751
- "entity_name_source": str(inv_nodes.get(h, h)),
752
- "entity_name_target": str(inv_nodes.get(t, t)),
753
  })
754
 
755
  subgraphs.append({
756
  "id": f"sample_{len(subgraphs) + 1}",
757
  "num_nodes": len(nodes),
758
  "num_edges": len(edge_list),
 
 
759
  "nodes": nodes,
760
  "edges": edge_list,
761
  })
 
 
 
762
 
763
  # Free the partitioning data stored on the sampler
764
  loader.sampler.context_subgraphs_nodes = None
 
706
  except Exception:
707
  logger.exception("Failed to generate sample subgraphs for %s", dataset_id)
708
 
709
+ def _build_sample_subgraphs(self, dataset_id, loader, num_subgraphs=40,
710
+ max_graph_size=10, seed=None):
711
+ """Build sample subgraphs using the Sampler's DFS-based context subgraph partitioning.
712
+
713
+ When ``seed`` is provided, the DFS iterates node indices in a shuffled order, so
714
+ different seeds produce different partitions. Without a seed the order is
715
+ deterministic (original research-code behaviour).
716
+ """
717
  inv_nodes, _, inv_relations = loader.dataset.get_inverted_name_maps()
718
  node_types = loader.dataset.node_data.type.values
719
 
720
+ def entity_label(idx):
721
+ raw = inv_nodes.get(idx)
722
+ if raw is None or raw != raw or str(raw).strip() == "": # None / NaN / empty
723
+ return f"#{idx}"
724
+ cleaned = clean_entity_name(str(raw), dataset_id)
725
+ return cleaned if cleaned else f"#{idx}"
726
+
727
+ def relation_label(idx):
728
+ raw = inv_relations.get(idx)
729
+ if raw is None or raw != raw or str(raw).strip() == "":
730
+ return f"rel#{idx}"
731
+ cleaned = clean_relation_name(str(raw), dataset_id)
732
+ return cleaned if cleaned else f"rel#{idx}"
733
+
734
  # Use the Sampler's DFS partitioning to get context subgraphs
735
  samples = loader.sampler.get_context_subgraph_samples_dfs(
736
  max_graph_size, loader.graph_indexes, loader.num_nodes,
737
+ max_samples=num_subgraphs * 5, seed=seed, disable_tqdm=True,
738
  )
739
 
740
+ # Randomly pick (row, col) partition pairs so each sample is structurally
741
+ # distinct from the others. Without shuffling, the DFS returns samples in
742
+ # nested (k, l) order, which means the first N samples all reuse
743
+ # partition 0's nodes. Shuffling + a disjoint-partitions guard gives 5
744
+ # different subgraphs each call.
745
+ import random as _random
746
+ rng = _random.Random(seed)
747
+ samples = list(samples)
748
+ rng.shuffle(samples)
749
+
750
+ used_partitions = set()
751
  subgraphs = []
752
  for subgraph_row, subgraph_col, nodes_row, nodes_col, edges in samples:
753
  if len(subgraphs) >= num_subgraphs:
754
  break
755
+ if subgraph_row in used_partitions or subgraph_col in used_partitions:
756
+ continue
757
  if len(edges) < 3:
758
  continue
759
 
760
+ is_bip = (subgraph_row != subgraph_col)
761
+ if is_bip:
762
+ # Inpaint mask math assumes balanced halves (n/4, n/2, 3n/4 split).
763
+ # Only accept bipartite samples where row/col are the same size and
764
+ # divisible by 4, so the four quadrants are well-defined.
765
+ if len(nodes_row) != len(nodes_col) or len(nodes_row) < 2:
766
+ continue
767
+ if (2 * len(nodes_row)) % 4 != 0:
768
+ continue
769
  sg_nodes = nodes_row + nodes_col
770
+ row_size = len(nodes_row)
771
+ else:
772
+ if len(nodes_row) < 4 or len(nodes_row) % 2 != 0:
773
+ continue
774
+ sg_nodes = nodes_row
775
+ row_size = len(nodes_row)
776
 
777
  node_idx = {n: i for i, n in enumerate(sg_nodes)}
778
 
 
781
  type_id = int(node_types[n]) if n < len(node_types) else 0
782
  nodes.append({
783
  "entity_id": n,
784
+ "entity_name": entity_label(n),
785
  "type_id": type_id,
786
  })
787
 
 
792
  "source_idx": node_idx[h],
793
  "target_idx": node_idx[t],
794
  "relation_id": r,
795
+ "relation_name": relation_label(r),
796
+ "entity_name_source": entity_label(h),
797
+ "entity_name_target": entity_label(t),
798
  })
799
 
800
  subgraphs.append({
801
  "id": f"sample_{len(subgraphs) + 1}",
802
  "num_nodes": len(nodes),
803
  "num_edges": len(edge_list),
804
+ "is_bip": is_bip,
805
+ "row_size": row_size,
806
  "nodes": nodes,
807
  "edges": edge_list,
808
  })
809
+ used_partitions.add(subgraph_row)
810
+ if is_bip:
811
+ used_partitions.add(subgraph_col)
812
 
813
  # Free the partitioning data stored on the sampler
814
  loader.sampler.context_subgraphs_nodes = None
src/backend/api/views/kg_anomaly.py CHANGED
@@ -1,12 +1,22 @@
 
 
 
 
 
1
  from rest_framework.response import Response
2
  from rest_framework.views import APIView
3
 
4
  from api.exceptions import InvalidRequestError, ModelUnavailable, NotFoundError
 
5
  from api.services.constants import KG_ANOMALY_DATASET_META
6
- from api.services.kg_anomaly_inference import apply_edge_noise, build_kg_tensors
 
 
7
  from api.services.registry import ModelRegistry
8
  from api.views.graph_generation import _streaming_sse_response
9
 
 
 
10
 
11
  class KgAnomalyDatasetsView(APIView):
12
  def get(self, request):
@@ -29,14 +39,29 @@ class KgAnomalySampleSubgraphsView(APIView):
29
  raise NotFoundError(f"Dataset '{dataset_id}' not found")
30
 
31
  registry = ModelRegistry.get()
32
- sg_info = registry.kg_anomaly_subgraphs.get(dataset_id)
33
- if sg_info is None:
34
  raise NotFoundError(f"No sample subgraphs available for dataset '{dataset_id}'")
35
 
36
  count = int(request.query_params.get("count", 5))
37
  count = max(1, min(10, count))
38
 
39
- subgraphs = [dict(sg) for sg in sg_info.subgraphs[:count]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
  noise_level_raw = request.query_params.get("noise_level")
42
  if noise_level_raw is not None:
@@ -56,16 +81,39 @@ class KgAnomalySampleSubgraphsView(APIView):
56
  raise ModelUnavailable(
57
  f"No '{task}' checkpoint available for dataset '{dataset_id}'")
58
 
59
- seed_raw = request.query_params.get("seed")
60
- seed = int(seed_raw) if seed_raw is not None else None
61
-
62
- loader = registry.loaders[dataset_id]
63
- model = registry._load_kg_anomaly_model(dataset_id, task)
 
 
64
 
65
  for i, sg in enumerate(subgraphs):
66
- offset_seed = None if seed is None else seed + i
67
- tensors = build_kg_tensors(sg, loader, model)
68
- sg["edges"] = apply_edge_noise(model, tensors, task, noise_level, offset_seed)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
  return Response({
71
  "dataset_id": dataset_id,
@@ -99,6 +147,8 @@ def _validate_subgraph(subgraph):
99
 
100
 
101
  class KgAnomalyCorrectView(APIView):
 
 
102
  def post(self, request):
103
  data = request.data
104
  registry = ModelRegistry.get()
@@ -166,6 +216,8 @@ class KgAnomalyCorrectView(APIView):
166
 
167
 
168
  class KgAnomalyContinueView(APIView):
 
 
169
  def post(self, request):
170
  state_b64 = request.data.get("state")
171
  if not state_b64 or not isinstance(state_b64, str):
 
1
+ import logging
2
+ import random
3
+ import traceback
4
+
5
+ from rest_framework.renderers import JSONRenderer
6
  from rest_framework.response import Response
7
  from rest_framework.views import APIView
8
 
9
  from api.exceptions import InvalidRequestError, ModelUnavailable, NotFoundError
10
+ from api.renderers import EventStreamRenderer
11
  from api.services.constants import KG_ANOMALY_DATASET_META
12
+ from api.services.kg_anomaly_inference import (
13
+ apply_edge_noise, build_kg_tensors, render_sample_subgraph_b64,
14
+ )
15
  from api.services.registry import ModelRegistry
16
  from api.views.graph_generation import _streaming_sse_response
17
 
18
+ logger = logging.getLogger(__name__)
19
+
20
 
21
  class KgAnomalyDatasetsView(APIView):
22
  def get(self, request):
 
39
  raise NotFoundError(f"Dataset '{dataset_id}' not found")
40
 
41
  registry = ModelRegistry.get()
42
+ loader = registry.loaders.get(dataset_id)
43
+ if loader is None:
44
  raise NotFoundError(f"No sample subgraphs available for dataset '{dataset_id}'")
45
 
46
  count = int(request.query_params.get("count", 5))
47
  count = max(1, min(10, count))
48
 
49
+ seed_raw = request.query_params.get("seed")
50
+ # Random per-request seed when the caller doesn't pin one, so each call
51
+ # produces a different DFS node-order shuffle and therefore different partitions.
52
+ seed = int(seed_raw) if seed_raw is not None else random.randrange(2**31)
53
+
54
+ # Fresh DFS partitioning per request — see Sampler.get_context_subgraph_samples_dfs
55
+ # in research/COINs-KGGeneration. The registry shuffles sample pairs and
56
+ # enforces disjoint (row, col) partitions per sample, so the returned
57
+ # subgraphs are all structurally distinct.
58
+ logger.info("[sample-subgraphs] building fresh pool for %s (seed=%d)", dataset_id, seed)
59
+ pool = registry._build_sample_subgraphs(
60
+ dataset_id, loader, num_subgraphs=count, seed=seed,
61
+ )
62
+ subgraphs = [dict(sg) for sg in pool[:count]]
63
+ for i, sg in enumerate(subgraphs):
64
+ sg["id"] = f"sample_{i + 1}"
65
 
66
  noise_level_raw = request.query_params.get("noise_level")
67
  if noise_level_raw is not None:
 
81
  raise ModelUnavailable(
82
  f"No '{task}' checkpoint available for dataset '{dataset_id}'")
83
 
84
+ logger.info("[sample-subgraphs] loading kg-anomaly model: %s/%s", dataset_id, task)
85
+ try:
86
+ model = registry._load_kg_anomaly_model(dataset_id, task)
87
+ except Exception:
88
+ logger.error("[sample-subgraphs] model load failed:\n%s", traceback.format_exc())
89
+ raise
90
+ logger.info("[sample-subgraphs] model ready, noising %d subgraphs", len(subgraphs))
91
 
92
  for i, sg in enumerate(subgraphs):
93
+ try:
94
+ tensors = build_kg_tensors(sg, loader, model)
95
+ sg["edges"] = apply_edge_noise(
96
+ model, tensors, task, noise_level, seed + i,
97
+ loader=loader, dataset_id=dataset_id, nodes=sg["nodes"])
98
+ sg["num_edges"] = len(sg["edges"])
99
+ logger.info("[sample-subgraphs] noised subgraph %d/%d", i + 1, len(subgraphs))
100
+ except Exception:
101
+ logger.error(
102
+ "[sample-subgraphs] noise failed on subgraph %d:\n%s",
103
+ i, traceback.format_exc(),
104
+ )
105
+ raise
106
+
107
+ for i, sg in enumerate(subgraphs):
108
+ try:
109
+ sg["image"] = render_sample_subgraph_b64(sg, loader, dataset_id)
110
+ logger.info("[sample-subgraphs] rendered subgraph %d/%d", i + 1, len(subgraphs))
111
+ except Exception:
112
+ logger.error(
113
+ "[sample-subgraphs] render failed on subgraph %d:\n%s",
114
+ i, traceback.format_exc(),
115
+ )
116
+ raise
117
 
118
  return Response({
119
  "dataset_id": dataset_id,
 
147
 
148
 
149
  class KgAnomalyCorrectView(APIView):
150
+ renderer_classes = [EventStreamRenderer, JSONRenderer]
151
+
152
  def post(self, request):
153
  data = request.data
154
  registry = ModelRegistry.get()
 
216
 
217
 
218
  class KgAnomalyContinueView(APIView):
219
+ renderer_classes = [EventStreamRenderer, JSONRenderer]
220
+
221
  def post(self, request):
222
  state_b64 = request.data.get("state")
223
  if not state_b64 or not isinstance(state_b64, str):
src/research/COINs-KGGeneration/graph_completion/graphs/preprocess.py CHANGED
@@ -742,6 +742,7 @@ class Sampler:
742
  def get_context_subgraph_samples_dfs(self, max_graph_size: int, graph_indexes: Iterable[AdjacencyIndex],
743
  num_nodes: int, allow_disc: bool = False,
744
  max_samples: int = 0,
 
745
  disable_tqdm: bool = False) -> Iterable[ContextSubgraph]:
746
  _, adj_s_to_t, adj_t_to_s, _, _ = graph_indexes
747
  assignment = -np.ones(num_nodes, dtype=int)
@@ -753,7 +754,12 @@ class Sampler:
753
  progress_bar = tqdm(desc="Assigning nodes to context subgraphs", total=num_nodes, leave=False,
754
  disable=disable_tqdm)
755
 
756
- for i in range(num_nodes):
 
 
 
 
 
757
  if max_subgraphs > 0 and subgraph >= max_subgraphs:
758
  break
759
  if assignment[i] >= 0:
 
742
  def get_context_subgraph_samples_dfs(self, max_graph_size: int, graph_indexes: Iterable[AdjacencyIndex],
743
  num_nodes: int, allow_disc: bool = False,
744
  max_samples: int = 0,
745
+ seed: Optional[int] = None,
746
  disable_tqdm: bool = False) -> Iterable[ContextSubgraph]:
747
  _, adj_s_to_t, adj_t_to_s, _, _ = graph_indexes
748
  assignment = -np.ones(num_nodes, dtype=int)
 
754
  progress_bar = tqdm(desc="Assigning nodes to context subgraphs", total=num_nodes, leave=False,
755
  disable=disable_tqdm)
756
 
757
+ # When seed is given, iterate in a shuffled node order so different seeds produce
758
+ # different partitions. Default order reproduces the deterministic behaviour.
759
+ node_order = (np.random.default_rng(seed).permutation(num_nodes)
760
+ if seed is not None else range(num_nodes))
761
+
762
+ for i in node_order:
763
  if max_subgraphs > 0 and subgraph >= max_subgraphs:
764
  break
765
  if assignment[i] >= 0: