| import os |
| import json |
| import tqdm |
| from openai import OpenAI |
|
|
| |
| |
| |
|
|
# --- Credentials -------------------------------------------------------------
# Read the OpenAI API key from a local JSON credential file (kept outside the
# repo) and build the shared client used by the whole script.
api_file = "/home/mshahidul/api_new.json"
with open(api_file, "r") as f:
    api_keys = json.load(f)
openai_api_key = api_keys["openai"]

client = OpenAI(api_key=openai_api_key)
|
|
|
|
| |
| |
| |
|
|
def openai_return(prompt, model="gpt-5"):
    """Send a prompt to an OpenAI chat model and return the parsed JSON reply.

    Parameters
    ----------
    prompt : str
        Full user prompt to send.
    model : str
        Chat-completions model name (default ``"gpt-5"``).

    Returns
    -------
    dict | list | str
        The JSON-decoded response body, or the raw cleaned text when the
        reply is not valid JSON (logged with a warning instead of raising).
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    content = response.choices[0].message.content.strip()
    # Strip a Markdown code fence only at the *edges* of the reply.  A global
    # str.replace("```", "") would also corrupt legitimate backtick runs that
    # appear inside JSON string values of the payload itself.
    cleaned = content
    if cleaned.startswith("```"):
        cleaned = cleaned.removeprefix("```json").removeprefix("```")
    cleaned = cleaned.removesuffix("```").strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        print("⚠️ JSON parse failed — storing raw text.")
        return cleaned
|
|
|
|
| |
| |
| |
|
|
def return_prompts_attribution_multi(reference_full_text, generated_summary, subclaims_json, difficulty_level):
    """Build the attribution-reasonableness evaluation prompt.

    Parameters
    ----------
    reference_full_text : str
        The source document the summary was generated from.
    generated_summary : str
        The readability-controlled summary being evaluated.
    subclaims_json : str
        JSON string of subclaims, each carrying a ``"result"`` flag
        (1 = supported, 0 = unsupported).
    difficulty_level : str
        One of ``"easy"``, ``"intermediate"``, ``"hard"``.

    Returns
    -------
    str
        The full prompt text for the evaluator model.
    """
    # NOTE: stray trailing quotes in the original prompt (`"result": 0"`)
    # were fixed to the balanced form `"result": 0`.
    return f"""
### **SYSTEM / ROLE INSTRUCTION**

You are a **medical factuality and attribution evaluator**.
You will analyze all subclaims found in a generated summary, each labeled with a `"result"` flag:
- `1` = supported by the reference
- `0` = unsupported by the reference

Your main task is to **evaluate only the unsupported subclaims (`"result": 0`)**, judging whether each is a *reasonable addition* given the specified readability level (*easy / intermediate / hard*).

The presence of supported items (`"result": 1`) helps you understand the full context of what is confirmed versus speculative,
but you will not rate those. Their inclusion enriches the training data diversity and realism.

---

### **READABILITY & ATTRIBUTION GUIDELINES**

| Level | Audience | Linguistic & Stylistic Profile | Allowable Additions |
| :-- | :-- | :-- | :-- |
| **Easy (FH 70–100)** | General public | Short, simple, concrete sentences | General explanations only; no new factual claims |
| **Intermediate (FH 50–69)** | Educated layperson | Moderate complexity and precision | Clarifying causal links aligned with the text |
| **Hard (FH 0–49)** | Professionals | Formal, technical, multi‑clause detail | Must strictly reflect source evidence |

---

### **Input**
Readability Level: {difficulty_level}

Reference Full Text:
{reference_full_text}

Generated Summary:
{generated_summary}

All Subclaims with Support Results:
{subclaims_json}

---

### **TASK INSTRUCTIONS**

For **each subclaim where** `"result": 0`, classify it as:

- `"reasonable"` – legitimate simplification aligned with readability needs
- `"partially_reasonable"` – harmless addition or neutral paraphrase
- `"unreasonable"` – misleading, speculative, or factually unsupported

Support your judgment with a 1–2 sentence justification per item.

Do **not** modify or comment on subclaims where `"result": 1`.

---

### **Output JSON Format**

```json
{{
  "evaluations": [
    {{
      "subclaim_id": <id>,
      "subclaim": "<verbatim_subclaim>",
      "result": <0 or 1>,
      "reasonableness": "<reasonable | partially_reasonable | unreasonable | not_applicable>",
      "justification": "<short justification for result=0; for result=1, just write 'supported, no evaluation required'>"
    }},
    ...
  ]
}}
"""
# --- Input/output paths ------------------------------------------------------
file_synth = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"
file_qwen_results = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"
save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/syn_attribution_resonability_check_100_gpt5_train_v2.json"

# Load the synthetic dataset and the subclaim-verifier results.
# NOTE(review): the two files are indexed in parallel below — assumes entry
# order matches between them; verify upstream.
with open(file_synth, 'r') as f:
    synthetic_data = json.load(f)
with open(file_qwen_results, 'r') as f:
    qwen3_32B_results = json.load(f)

# Resume support: reload any previously saved evaluations.
res = []
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        res = json.load(f)
    print(f"🔁 Resuming from {len(res)} entries")

# (id, difficulty_level) pairs already evaluated — skipped in the loop.
existing = set((e["id"], e["difficulty_level"]) for e in res)

for ind in tqdm.tqdm(range(0, 30)):
    entry = synthetic_data[ind]
    subclaims_results = qwen3_32B_results[ind]['attribution']['results']
    subclaims_json = json.dumps(subclaims_results, indent=2, ensure_ascii=False)
    for level in ["easy", "intermediate", "hard"]:
        if (entry["id"], level) in existing:
            print(f"⏭️ Skipping {entry['id']} ({level})")
            continue

        ref_full_text = entry["full_text"]
        generated_summary = entry["readability_versions"][level]["text"]

        prompt = return_prompts_attribution_multi(
            ref_full_text,
            generated_summary,
            subclaims_json,
            level
        )

        try:
            response = openai_return(prompt)
            res.append({
                "id": entry["id"],
                "difficulty_level": level,
                "response": response
            })

            # Checkpoint every other entry so a crash loses at most one result.
            if len(res) % 2 == 0:
                with open(save_path, 'w') as f:
                    json.dump(res, f, indent=2, ensure_ascii=False)
                print(f"💾 Saved after {len(res)} entries")

        except Exception as e:
            # Best-effort: log and continue with the next (entry, level) pair.
            print(f"❌ Error at index {ind}, level {level}: {e}")

# Final flush: the modulo-2 checkpoint above never fires when len(res) is odd,
# so without this the last appended entry could be silently lost.
with open(save_path, 'w') as f:
    json.dump(res, f, indent=2, ensure_ascii=False)
print(f"✅ Done — {len(res)} entries saved to {save_path}")