def return_prompts_attribution(reference_full_text, generated_summary, subclaims_json, difficulty_level):
    return f'''
### **SYSTEM / ROLE INSTRUCTION**

You are a **medical factuality and attribution evaluator**.
You will assess whether **unsupported subclaims** in a generated summary (those with `"result": 0`) are *reasonable additions* given the readability level (*easy / intermediate / hard*).

The goal is to determine whether these **extra pieces of information** are acceptable simplifications or *hallucinations* that reduce factual faithfulness.

---

### **READABILITY & ATTRIBUTION GUIDELINES**

| Level | Audience | Content Goal | Allowable Additions |
| :--- | :--- | :--- | :--- |
| **Easy** | General public | Simplify and clarify events | Allow general background info or lay explanations, but not new facts or diagnoses. |
| **Intermediate** | Educated layperson / med student | Add brief clarifications or causal context if consistent with the text | Allow inferred, non-contradictory context; avoid adding unconfirmed data. |
| **Hard** | Medical professional | Maintain factual precision | No additions; everything must be supported by the source text. |

---

### **INPUT FIELDS**

**Reference full text:**
{reference_full_text}

**Generated summary ({difficulty_level}):**
{generated_summary}

**Subclaims and results:**
{subclaims_json}

---

### **TASK INSTRUCTIONS**

1. Focus only on subclaims with `"result": 0` (not supported by the input text).
2. For each unsupported subclaim:

   * Judge whether adding it is **reasonable** for the given readability level.
   * Choose one of: `"reasonable addition"`, `"unnecessary but harmless"`, `"misleading / hallucinated"`.
   * Provide a **1–2 sentence justification** explaining your reasoning.
3. After all evaluations, assign a **numerical attribution score (0–5)**:

   * **5** = All additions are reasonable or harmless simplifications.
   * **4** = Mostly reasonable; minor harmless additions.
   * **3** = Some misleading or unjustified additions.
   * **2** = Many factual inaccuracies.
   * **1** = Serious hallucinations; distorts the source meaning.
   * **0** = Highly unfaithful; mostly invented content.
4. End with an **overall explanation (3–5 sentences)** summarizing your reasoning and suggestions.

---

### **OUTPUT FORMAT (strict JSON)**

```json
{{
  "evaluation_table": [
    {{
      "id": <subclaim_id>,
      "subclaim": "<text>",
      "evaluation": "<reasonable addition | unnecessary but harmless | misleading / hallucinated>",
      "explanation": "<short justification>"
    }}
  ],
  "attribution_score": <0-5>,
  "overall_explanation": "<concise summary of your judgment>"
}}
```
'''
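

# Illustrative sketch (not part of the pipeline): the evaluator is expected to return
# JSON matching the OUTPUT FORMAT above. The field values below are hypothetical and
# only show the shape that the downstream code parses and saves.
_example_evaluator_output = {
    "evaluation_table": [
        {
            "id": 3,  # hypothetical subclaim id
            "subclaim": "The patient was advised to rest.",  # hypothetical subclaim text
            "evaluation": "unnecessary but harmless",
            "explanation": "General advice consistent with the source; adds no new clinical facts.",
        }
    ],
    "attribution_score": 4,
    "overall_explanation": "Most additions are harmless simplifications appropriate for the easy level.",
}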
from openai import OpenAI
import json

# Load the OpenAI API key from a local JSON file.
file_path = "/home/mshahidul/api_new.json"
with open(file_path, "r") as file:
    api_keys = json.load(file)

openai_api_key = api_keys.get("openai")

client = OpenAI(api_key=openai_api_key)


def openai_return(prompt):
    """Send the evaluation prompt to the model and parse its JSON answer."""
    response = client.chat.completions.create(
        model="gpt-5-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    # Strip any markdown code fences the model may wrap around the JSON before parsing.
    cleaned_response = response.choices[0].message.content.strip().replace("```json", "").replace("```", "")
    return json.loads(cleaned_response)
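

# Optional sketch: a lightweight sanity check on the parsed evaluator output.
# This helper is an assumption (not part of the original pipeline); it only verifies
# that the keys promised by the OUTPUT FORMAT section of the prompt are present.
def looks_like_valid_evaluation(parsed):
    required_keys = {"evaluation_table", "attribution_score", "overall_explanation"}
    return isinstance(parsed, dict) and required_keys.issubset(parsed.keys())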
|
|
|
|
# Load the synthetic dataset and the Qwen3-32B subclaim-verification results.
file_path = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"
with open(file_path, 'r') as f:
    synthetic_data = json.load(f)

file_path_qwen3_32B = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"
with open(file_path_qwen3_32B, 'r') as f:
    qwen3_32B_results = json.load(f)
|
|
| |
import os
import tqdm

# Run the GPT-based attribution reasonability check for every (sample, readability level)
# pair, resuming from any previously saved results.
res = []
save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/resonability_check_100_gpt5_attribution.json"
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        res = json.load(f)
    print(f"Resuming from {len(res)} entries")
existing_check = set((entry['id'], entry['difficulty_level']) for entry in res)

for ind in tqdm.tqdm(range(100)):
    for version in ["easy", "intermediate", "hard"]:
        # Skip (id, version) pairs that were already evaluated in a previous run.
        if (synthetic_data[ind]['id'], version) in existing_check:
            print(f"Skipping {synthetic_data[ind]['id']}, {version}")
            continue
        ref_full_text_summary = f"{synthetic_data[ind]['full_text']}"
        generated_summary = f"{synthetic_data[ind]['readability_versions'][version]['text']}"
        subclaims_results = f"{qwen3_32B_results[ind]['attribution']['results']}"
        prompt = return_prompts_attribution(ref_full_text_summary, generated_summary, subclaims_results, version)
        try:
            ans = openai_return(prompt)
            res.append({
                "id": synthetic_data[ind]['id'],
                "difficulty_level": version,
                "response": ans
            })

            # Checkpoint every second completed entry (100 samples x 3 levels = 300 total).
            if len(res) % 2 == 0:
                print(f"Completed {len(res)} out of 300")
                with open(save_path, 'w') as outfile:
                    json.dump(res, outfile, indent=2)
        except Exception as e:
            print(f"Error at index {ind}, version {version}: {e}")

# Final save after the loop completes.
with open(save_path, 'w') as outfile:
    json.dump(res, outfile, indent=2)
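

# Illustrative follow-up (an assumption, not part of the original run): average the
# attribution scores per readability level from the saved results. It assumes each
# saved "response" follows the strict JSON schema above and contains "attribution_score".
from collections import defaultdict


def mean_attribution_by_level(results):
    scores = defaultdict(list)
    for entry in results:
        response = entry.get("response", {})
        if "attribution_score" in response:
            scores[entry["difficulty_level"]].append(response["attribution_score"])
    return {level: sum(vals) / len(vals) for level, vals in scores.items() if vals}

# Example usage: print(mean_attribution_by_level(res))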