sem-diff / semantic_diff.py
JavaPD's picture
Initial deploy
941e8f6
Raw
History Blame Contribute Delete
3.78 kB
"""
backend/semantic_diff.py
Entity/relation extraction via Groq + graph diff computation + LLM summary.
"""
import re, json
from groq import Groq
# Model identifier passed to every Groq chat-completion request in this module.
MODEL = "openai/gpt-oss-120b"
# Prompt template for entity/relation extraction. The literal `{TEXT}` marker
# is filled in with str.replace() rather than str.format(), so the braces of
# the JSON example below need no escaping.
EXTRACT_PROMPT = """You are an information-extraction system.
Given the text below, extract:
1. Named entities (people, organisations, locations, products, departments, roles, dates).
2. Relationships between pairs of entities as (subject, predicate, object) triples.
Return ONLY valid JSON in this exact format — no prose, no markdown fences:
{
"entities": ["Entity1", "Entity2"],
"relations": [["Subject", "predicate", "Object"]]
}
Rules:
- Entities must be meaningful, specific nouns or noun phrases.
- Predicates must be short verb phrases (e.g. "leads", "located_in", "partnered_with").
- Include only relations where both subject and object appear in the entity list.
Text:
\"\"\"
{TEXT}
\"\"\"
"""
# Prompt template for the plain-English diff summary. The literal `{DIFF}`
# marker is replaced with the JSON-serialised diff via str.replace().
SUMMARY_PROMPT = """You are a knowledge-graph analyst.
Summarise the following knowledge-graph diff in 3–5 clear sentences.
Highlight the most significant structural changes — new entities, removed entities, and key relationship shifts.
Be specific and name the entities that changed.
Diff (JSON):
{DIFF}
"""
def _client(api_key: str) -> Groq:
    """Construct a Groq SDK client configured with the given API key."""
    groq_client = Groq(api_key=api_key)
    return groq_client
def extract_entities_and_relations(
    text: str, api_key: str, *, max_chars: int = 4000
) -> dict:
    """Call the Groq LLM to extract entities and relations from document text.

    Args:
        text: Document text to analyse. Only the first ``max_chars``
            characters are sent to the model.
        api_key: Groq API key used to build the client.
        max_chars: Upper bound on how much of ``text`` is embedded in the
            prompt (keyword-only, defaults to 4000).

    Returns:
        A dict with at least the keys ``'entities'`` (list of strings) and
        ``'relations'`` (list of ``[subject, predicate, object]`` string
        triples whose subject and object both appear in ``'entities'``).
        Both lists are empty when the text is blank or the model reply is
        empty, is not valid JSON, or is not a JSON object.
    """
    # Nothing to extract: skip the network round-trip entirely.
    if not text or not text.strip():
        return {'entities': [], 'relations': []}

    prompt = EXTRACT_PROMPT.replace('{TEXT}', text[:max_chars])
    client = _client(api_key)
    resp = client.chat.completions.create(
        model=MODEL,
        temperature=0,
        reasoning_effort='low',
        response_format={'type': 'json_object'},
        messages=[{'role': 'user', 'content': prompt}],
    )
    raw = (resp.choices[0].message.content or '').strip()
    return _parse_extraction(raw)


def _parse_extraction(raw: str) -> dict:
    """Parse and validate the raw model reply into an entities/relations dict."""
    # Strip markdown fences if present
    raw = re.sub(r'^```[a-z]*\n?', '', raw)
    raw = re.sub(r'\n?```$', '', raw)
    if not raw:
        return {'entities': [], 'relations': []}

    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        return {'entities': [], 'relations': []}

    # Valid JSON is not necessarily an object (e.g. a bare list or string).
    if not isinstance(data, dict):
        return {'entities': [], 'relations': []}

    # Normalise: both keys must hold lists; a null or scalar becomes empty.
    entities = data.get('entities')
    relations = data.get('relations')
    if not isinstance(entities, list):
        entities = []
    if not isinstance(relations, list):
        relations = []

    # Keep only string entities so the result is hashable and sortable.
    data['entities'] = [ent for ent in entities if isinstance(ent, str)]
    entity_set = set(data['entities'])

    # Keep only well-formed string triples whose end nodes are known entities.
    data['relations'] = [
        rel for rel in relations
        if isinstance(rel, (list, tuple))
        and len(rel) == 3
        and all(isinstance(part, str) for part in rel)
        and rel[0] in entity_set
        and rel[2] in entity_set
    ]
    return data
def compute_diff(old_data: dict, new_data: dict) -> dict:
    """Compute the set difference between two knowledge-graph snapshots.

    Args:
        old_data: Earlier snapshot with optional ``'entities'`` and
            ``'relations'`` keys. A missing or ``None`` value counts as empty.
        new_data: Later snapshot in the same format.

    Returns:
        A dict with ``added_``, ``removed_`` and ``unchanged_`` variants of
        ``entities`` (sorted list) and ``relations`` (sorted list of
        3-element lists). Relation entries that are not 3-element lists or
        tuples, or that contain unhashable parts, are ignored.
    """
    old_ents, old_rels = _snapshot_sets(old_data)
    new_ents, new_rels = _snapshot_sets(new_data)
    return {
        'added_entities': sorted(new_ents - old_ents),
        'removed_entities': sorted(old_ents - new_ents),
        'unchanged_entities': sorted(old_ents & new_ents),
        'added_relations': _sorted_triples(new_rels - old_rels),
        'removed_relations': _sorted_triples(old_rels - new_rels),
        'unchanged_relations': _sorted_triples(old_rels & new_rels),
    }


def _snapshot_sets(snapshot):
    """Return ``(entity_set, relation_tuple_set)`` for one snapshot dict."""
    snapshot = snapshot or {}
    entities = set(snapshot.get('entities') or [])
    relations = set()
    for rel in snapshot.get('relations') or []:
        if not isinstance(rel, (list, tuple)) or len(rel) != 3:
            continue
        try:
            relations.add(tuple(rel))
        except TypeError:
            # A component is unhashable (e.g. a nested list); skip the triple.
            continue
    return entities, relations


def _sorted_triples(triples):
    """Return the triples as lists in a deterministic order."""
    # Set iteration order varies between runs; sorting on the stringified
    # parts gives stable output and tolerates non-string components.
    ordered = sorted(triples, key=lambda t: tuple(str(part) for part in t))
    return [list(t) for t in ordered]
def summarise_diff(diff: dict, api_key: str) -> str:
    """Ask the LLM to write a plain-English summary of the diff.

    Args:
        diff: JSON-serialisable diff dict to embed in the prompt.
        api_key: Groq API key used to build the client.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the reply has no content.
    """
    # ensure_ascii=False keeps non-ASCII entity names readable in the prompt
    # instead of turning them into \uXXXX escapes the model must decode.
    diff_json = json.dumps(diff, indent=2, ensure_ascii=False)
    prompt = SUMMARY_PROMPT.replace('{DIFF}', diff_json)
    client = _client(api_key)
    resp = client.chat.completions.create(
        model=MODEL,
        temperature=0.3,
        reasoning_effort='low',
        messages=[{'role': 'user', 'content': prompt}],
    )
    return (resp.choices[0].message.content or '').strip()