| import os |
| import json |
| import tqdm |
| from openai import OpenAI |
|
|
| |
# Identifier of the fine-tuned model as registered with the inference server.
# NOTE(review): this is a local filesystem path used as the model name — it must
# match the name the OpenAI-compatible server was launched with (confirm).
MODEL_PATH = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx_v2-bf16"
# Base URL of the OpenAI-compatible inference endpoint (presumably vLLM or
# similar self-hosted server — TODO confirm).
API_URL = "http://172.16.34.29:8004/v1"
# Self-hosted OpenAI-compatible servers typically ignore auth; "EMPTY" is the
# conventional placeholder the client library still requires.
API_KEY = "EMPTY"


# Module-level client shared by all evaluation calls in this script.
client = OpenAI(base_url=API_URL, api_key=API_KEY)
|
|
def get_reasoning_prompt_json(source_text, gold_summary, generated_text, subclaim, level):
    """
    Build the clinical-logic-validator prompt for a single subclaim.

    The prompt instructs the model to judge how ``subclaim`` was handled in
    ``generated_text`` relative to ``source_text`` and ``gold_summary`` for the
    target literacy ``level``, and to answer with ONLY a JSON object matching
    the schema embedded in the prompt (category / action / presence_in_gold /
    presence_in_generated / verdict).

    Args:
        source_text: Full original medical source document.
        gold_summary: Reference ("gold") summary of the source.
        generated_text: Model-generated simplification being audited.
        subclaim: The single factual subclaim to evaluate.
        level: Target literacy level label interpolated into the prompt
            (the prompt body describes three levels: Low / Intermediate /
            Proficient Health Literacy).

    Returns:
        The fully formatted prompt string, ending with "Output JSON:" so the
        model continues directly with the JSON object.
    """
    # NOTE: literal braces in the JSON-schema section are written as {{ }}
    # because this is an f-string.
    return f"""You are a clinical logic validator auditing medical text simplification.

### Context & Goals:
- **Target Literacy Level:** {level}

1. Level: Low Health Literacy (High Readability)

Target: Individuals needing the simplest terms for immediate action.

Linguistic Goal: Use "living room" language. Replace all medical jargon with functional descriptions (e.g., "renal" becomes "kidney").

Information Density: Focus strictly on the "need-to-know" info found in the Gold Summary.

Strategy: High paraphrasing using analogies. One idea per sentence.

Faithfulness: Must align perfectly with the Gold Summary.

2. Level: Intermediate Health Literacy (Medium Readability)

Target: The general public (news-reading level).

Linguistic Goal: Standard vocabulary. Common medical terms are okay, but technical "doctor-speak" must be simplified.

Information Density: Balanced. Use the Gold Summary as the lead, supplemented by necessary context from the Source Text.

Strategy: Moderate paraphrasing. Remove minor technical details to avoid information overload.

Faithfulness: Maintains the main narrative of the Gold Summary.

3. Level: Proficient Health Literacy (Low Readability)

Target: Researchers, clinicians, or highly informed patients.

Linguistic Goal: Technical and academic language. Prioritize clinical nuance and medical accuracy.

Information Density: High. Use the Full Source Text to include data, physiological mechanisms, and statistics.

Strategy: Minimal paraphrasing. Retain all original technical terminology.

Faithfulness: Adhere to the Source Text; you may add related subclaims that provide deeper scientific context.

### Input Data:
1. **Source Text:** {source_text}
2. **Gold Summary (Reference):** {gold_summary}
3. **Generated Text (Output):** {generated_text}
4. **Subclaim to Evaluate:** {subclaim}

### Task:
Evaluate the Subclaim's status in the Generated Text compared to the Source and Gold Summary. Output ONLY a JSON object.

### Classification Categories:
- "reasonable_removal": Subclaim in Source, but NOT in Gold (non-essential).
- "reasonable_modification": Subclaim simplified correctly for the {level} goal.
- "unreasonable_removal": Subclaim in Gold but MISSING from Generated (critical loss).
- "unreasonable_addition": Subclaim in Generated but NOT in Source/Gold (hallucination).
- "preserved": Fact maintained with high fidelity.

### JSON Schema Requirement:
{{
"category": "string (reasonable_removal | reasonable_modification | unreasonable_removal | unreasonable_addition | preserved)",
"action": "string (added | removed | modified | preserved)",
"presence_in_gold": "boolean",
"presence_in_generated": "boolean",
"verdict": "string (one sentence clinical justification)"
}}

Output JSON:"""
|
|
def evaluate_reasoning_json(source, gold, generated, subclaim, level):
    """
    Ask the model to classify one subclaim and return the parsed JSON verdict.

    Builds the validator prompt, sends it to the shared OpenAI-compatible
    ``client``, strips any markdown code fences from the reply, and parses it
    as JSON.

    Args:
        source: Full original medical source document.
        gold: Reference (gold) summary of the source.
        generated: Model-generated simplification being audited.
        subclaim: The single factual subclaim to evaluate.
        level: Target literacy level label passed through to the prompt.

    Returns:
        dict: The model's verdict matching the prompt schema. On any failure
        (network/API error, unparseable reply), a schema-shaped error record
        with ``category``/``action`` set to "error" and null presence fields,
        so batch processing can continue and downstream key access stays safe.
    """
    try:
        # Prompt construction inside the try so that any formatting error
        # also degrades to the error record instead of aborting a batch run.
        prompt = get_reasoning_prompt_json(source, gold, generated, subclaim, level)

        response = client.chat.completions.create(
            model=MODEL_PATH,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=400,
            # Low temperature for near-deterministic classification output.
            temperature=0.1,
        )
        content = response.choices[0].message.content.strip()

        # Strip markdown fences whether or not the model labeled them:
        # the previous startswith("```json") check let plain "```" fences
        # through to json.loads, which then failed.
        if content.startswith("```"):
            content = content.replace("```json", "").replace("```", "").strip()

        return json.loads(content)
    except Exception as e:
        # Broad catch is deliberate: API, network, and JSON-decode errors all
        # degrade to a record with the same keys as the success schema.
        return {
            "category": "error",
            "action": "error",
            "presence_in_gold": None,
            "presence_in_generated": None,
            "verdict": f"API or Parsing Error: {str(e)}",
        }
|
|
| |
| |
| |
| |
| |