| """validator_node — source citation check (anti-hallucination). |
| |
| Verifies that: |
| 1. At least 1 tool call ran (otherwise skip — e.g. "thanks" message) |
| 2. The final_answer contains a [Source: X] citation or filename mention |
| 3. The answer is at least 20 chars |
| 4. The cited filenames actually appear in the tool outputs |
| |
| If any of these fail and ``retry_count < settings.validator_max_retries``, |
| go back to the agent with a HumanMessage: "Please re-call the tools and |
| include [Source: filename] citations." |
| """ |
|
|
| from __future__ import annotations |
|
|
| import re |
|
|
| from langchain_core.messages import HumanMessage, ToolMessage |
|
|
| from config import settings |
| from graph.states.chat_state import ChatState |
|
|
|
|
# Bare filenames ending in one of the listed document/image extensions,
# matched case-insensitively; group 1 is the whole filename.
_FILENAME_PATTERN = re.compile(
    r"\b([\w_\-]+\.(?:pdf|docx|png|jpg|jpeg|txt))\b",
    re.IGNORECASE,
)
# Bracketed citations whose label is "Source" or "Forrás", e.g.
# "[Source: report.pdf]"; group 1 is everything after the label up to "]".
_SOURCE_PATTERN = re.compile(
    r"\[(?:Source|Forrás)[:\s]+([^\]]+)\]",
    re.IGNORECASE,
)
|
|
|
|
async def validator_node(state: ChatState) -> dict:
    """Validate that the final answer cites sources found in the tool outputs.

    The checks run in this order:

    1. No ``ToolMessage`` in ``messages``: validation is skipped.
    2. The stripped answer is shorter than 20 characters.
    3. The answer contains neither a ``[Source: ...]`` citation nor a
       filename mention.
    4. A cited name matches none of the filenames found in the tool outputs
       (exact match, or substring of an available filename).

    When check 2, 3 or 4 fails and ``validator_retry_count`` is below
    ``settings.validator_max_retries``, the update produced by ``_retry`` is
    returned. Once retries are exhausted the failure is recorded in ``trace``
    and the node ends without another retry.

    Args:
        state: Chat state; ``final_answer``, ``messages`` and
            ``validator_retry_count`` are read from it.

    Returns:
        A partial state update. It always carries ``trace``; non-retry
        results also carry ``sources_cited`` (lower-cased, de-duplicated,
        in first-seen order).
    """
    # ``or`` (instead of a ``get`` default) also covers keys that are present
    # but explicitly set to None.
    final_answer = state.get("final_answer") or ""
    messages = state.get("messages") or []
    retry_count = state.get("validator_retry_count") or 0

    tool_msgs = [m for m in messages if isinstance(m, ToolMessage)]
    if not tool_msgs:
        # No tool ran, so there are no tool outputs to check citations against.
        return {
            "trace": ["validator: no tool call → skipped"],
            "sources_cited": [],
        }

    can_retry = retry_count < settings.validator_max_retries

    if len(final_answer.strip()) < 20:
        if can_retry:
            return _retry(state, retry_count, "The answer is too short (< 20 chars).")
        return {
            "trace": ["validator: too short, but max retry → end"],
            "sources_cited": [],
        }

    source_matches = _SOURCE_PATTERN.findall(final_answer)
    filename_mentions = _FILENAME_PATTERN.findall(final_answer)

    if not source_matches and not filename_mentions:
        if can_retry:
            return _retry(
                state, retry_count,
                "Missing source citation in [Source: filename] format.",
            )
        return {
            "trace": ["validator: no source citation, but max retry → end"],
            "sources_cited": [],
        }

    # Every filename that appears anywhere in the tool outputs, lower-cased.
    available_files: set[str] = {
        name.lower()
        for tm in tool_msgs
        for name in _FILENAME_PATTERN.findall(str(tm.content))
    }

    # One [Source: ...] block may list several names separated by "," or ";".
    raw_citations = [
        part.strip()
        for citation in source_matches
        for part in re.split(r"[,;]", citation)
    ]
    raw_citations.extend(filename_mentions)

    # De-duplicate case-insensitively while keeping first-seen order and
    # spelling: a filename inside a [Source: ...] block is captured by both
    # patterns and would otherwise be listed twice.
    unique_citations: dict[str, str] = {}
    for name in raw_citations:
        if name:
            unique_citations.setdefault(name.lower(), name)
    cited_files = list(unique_citations.values())

    verified: list[str] = []
    invalid_citations: list[str] = []
    for key, spelling in unique_citations.items():
        # Substring matching lets e.g. "report" match "report.pdf".
        if key in available_files or any(key in af for af in available_files):
            verified.append(key)
        else:
            invalid_citations.append(spelling)

    if invalid_citations:
        if can_retry:
            return _retry(
                state, retry_count,
                f"Cited filenames are not in the tool results: {invalid_citations}",
            )
        # Retries exhausted: do not report success, and do not pass
        # unverified names on as cited sources.
        return {
            "trace": [
                f"validator: invalid citations {invalid_citations[:3]}, "
                "but max retry → end"
            ],
            "sources_cited": verified,
        }

    return {
        "trace": [f"validator: ok (sources: {cited_files[:3]})"],
        "sources_cited": list(unique_citations),
    }
|
|
|
|
def _retry(state: ChatState, retry_count: int, reason: str) -> dict:
    """Build the state update that sends the agent back for another attempt.

    Args:
        state: Current chat state; not read by this helper.
        retry_count: Number of retries used before this one.
        reason: Why the answer was rejected; embedded in both the feedback
            message and the trace entry.

    Returns:
        A dict with the feedback ``HumanMessage`` under ``messages``, the
        incremented ``validator_retry_count`` and a ``trace`` entry.
    """
    feedback_text = (
        f"Your answer is not acceptable: {reason} "
        "Please re-call the tools and include [Source: filename.pdf] citations."
    )
    update: dict = {}
    update["messages"] = [HumanMessage(content=feedback_text)]
    update["validator_retry_count"] = retry_count + 1
    update["trace"] = [f"validator: retry {retry_count + 1} ({reason})"]
    return update
|
|