| import json |
| from pathlib import Path |
|
|
| |
# Input: synthetic subclaim records (each holds a passage plus labeled subclaims).
# NOTE(review): absolute, machine-specific path — consider making configurable.
DATA_PATH = Path(
    "/home/mshahidul/readctrl/data/extracting_subclaim/synthetic_subclaims_first200.json"
)
# Output: the formatted SFT conversation dataset this script writes.
OUTPUT_PATH = Path(
    "/home/mshahidul/readctrl/data/finetuning_data/dataset_for_sft_support_check_list_new.json"
)
|
|
|
|
def training_prompt(medical_text, subclaims, labels):
    """Build one SFT conversation example for subclaim support checking.

    Args:
        medical_text: The passage the model must check each subclaim against.
        subclaims: List of subclaim strings to adjudicate.
        labels: Parallel list of label strings ('supported'/'not_supported');
            JSON-encoded as the assistant's target output.

    Returns:
        dict with a "conversations" list: one user turn containing the
        instructions plus passage and numbered subclaims, and one assistant
        turn containing the JSON-encoded labels.
    """
    numbered_subclaims = "\n".join(
        f"{idx + 1}. {claim}" for idx, claim in enumerate(subclaims)
    )

    # Instructions and data are sent together as the single user turn.
    user_prompt = f"""
You are an expert medical adjudicator. Determine if the 'Medical Passage' contains the core factual information of each 'Subclaim', even if the passage uses simpler language or layperson terms.
Rules:
- Label 'supported' if the essential meaning is present.
- Label 'not_supported' only if the information is missing or contradicted.
Output: JSON array of strings ['supported', 'not_supported', ...]

Medical text:
{medical_text}

Subclaims:
{numbered_subclaims}
"""

    # Use a list (the original assigned a tuple) so the structure is a proper
    # JSON-array analogue and downstream code can append/extend turns.
    return {
        "conversations": [
            {"from": "user", "content": user_prompt},
            {"from": "assistant", "content": json.dumps(labels, ensure_ascii=False)},
        ]
    }
|
|
|
|
def load_conversation_dataset(data_path=DATA_PATH):
    """Read the synthetic subclaim file and format it as SFT conversations.

    Records with no passage, and subclaims with empty/whitespace-only text,
    are skipped. A subclaim missing a 'label' defaults to 'not_supported'.

    Args:
        data_path: Path to the raw JSON file (defaults to DATA_PATH).

    Returns:
        List of conversation dicts produced by `training_prompt`.
    """
    records = json.loads(Path(data_path).read_text(encoding="utf-8"))

    dataset = []
    for record in records:
        generated = record.get("generated", {})
        passage = generated.get("passage", "")

        claims, claim_labels = [], []
        for item in generated.get("subclaims", []):
            text = item.get("claim_text", "").strip()
            if text:
                claims.append(text)
                claim_labels.append(item.get("label", "not_supported"))

        # Keep only records that have both a passage and usable subclaims.
        if passage and claims:
            dataset.append(training_prompt(passage, claims, claim_labels))

    return dataset
|
|
|
|
| |
def main():
    """Build the SFT dataset, write it to OUTPUT_PATH, and print a summary."""
    dataset_for_sft = load_conversation_dataset()

    # Ensure the destination directory exists before opening the file.
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with OUTPUT_PATH.open("w", encoding="utf-8") as f:
        json.dump(dataset_for_sft, f, ensure_ascii=False, indent=2)

    print(len(dataset_for_sft))
    # Guard the sample print: the original raised IndexError on an empty dataset.
    if dataset_for_sft:
        print(dataset_for_sft[0])


if __name__ == "__main__":
    main()