| | import json |
| | import concurrent.futures |
| | from openai import OpenAI |
| |
|
class FactualityBenchmarker:
    """Benchmark how well a chat model judges whether a context supports a claim.

    The model is queried through an OpenAI-compatible chat-completions client
    and asked for a one-word verdict (``supported`` / ``not_supported``) for
    each claim; verdicts are compared with the expected labels of the test set.
    """

    _SUPPORTED = "supported"
    _NOT_SUPPORTED = "not_supported"

    def __init__(self, api_url="http://172.16.34.29:8004/v1", model="qwen3-32b-readctrl"):
        """Create the API client.

        Args:
            api_url: Base URL of the OpenAI-compatible endpoint.
            model: Model name sent with every completion request.
        """
        # "EMPTY" is a placeholder key; presumably the target server does not
        # validate it -- confirm against the deployment.
        self.client = OpenAI(base_url=api_url, api_key="EMPTY")
        self.model = model

    @classmethod
    def _parse_verdict(cls, raw_answer):
        """Map a raw model reply onto ``supported`` / ``not_supported``.

        Only letters are kept before matching, so quotes, punctuation, spaces,
        hyphens and underscores do not matter.  This makes "not supported",
        "not-supported" and "unsupported" negative verdicts; a plain substring
        test for "not_supported" would misread all three as "supported".
        Anything unrecognised (including an empty or missing reply) is treated
        as ``not_supported``.
        """
        compact = "".join(ch for ch in (raw_answer or "").lower() if ch.isalpha())
        if "notsupported" in compact or "unsupported" in compact:
            return cls._NOT_SUPPORTED
        if "supported" in compact:
            return cls._SUPPORTED
        return cls._NOT_SUPPORTED

    def verify_claim(self, context, claim):
        """Ask the model whether ``context`` supports ``claim``.

        Args:
            context: Source text the claim is checked against.
            claim: Statement to verify.

        Returns:
            ``"supported"`` or ``"not_supported"``.  If the request or the
            response access raises, the error is printed and
            ``"not_supported"`` is returned.
        """
        prompt = (
            "\n"
            "CONTEXT:\n"
            f"{context}\n"
            "\n"
            "CLAIM TO VERIFY:\n"
            f"{claim}\n"
            "\n"
            "INSTRUCTION:\n"
            "Does the CONTEXT above provide enough evidence to support the CLAIM?\n"
            "- Answer 'supported' if the claim is explicitly stated or logically followable.\n"
            "- Answer 'not_supported' if the claim is missing, contradicts the text, or requires outside info.\n"
            "\n"
            "Output only one word: 'supported' or 'not_supported'.\n"
        )

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0,
                max_tokens=10,
            )
            raw_answer = response.choices[0].message.content
        except Exception as e:
            # Deliberate best-effort: a failed request must not abort the
            # whole benchmark run, so it is reported and scored as negative.
            print(f"Error: {e}")
            return self._NOT_SUPPORTED

        return self._parse_verdict(raw_answer)

    def run_evaluation(self, test_cases):
        """Run the benchmark over ``test_cases`` and print per-claim results.

        Each test case has the form::

            {"context": "...",
             "claims": [{"text": "...", "expected": "supported" | "not_supported"}]}

        A PASS/FAIL line is printed for every claim, followed by the overall
        accuracy (0 when there are no claims).
        """
        total_claims = 0
        correct_predictions = 0

        print(f"--- Starting Evaluation on {self.model} ---")

        for i, case in enumerate(test_cases):
            context = case["context"]
            print(f"\nTest Case {i+1}:")

            for claim_data in case["claims"]:
                claim_text = claim_data["text"]
                expected = claim_data["expected"]

                prediction = self.verify_claim(context, claim_text)

                is_correct = prediction == expected
                if is_correct:
                    correct_predictions += 1
                total_claims += 1

                status = "PASS" if is_correct else "FAIL"
                print(f"  [{status}] Claim: {claim_text[:60]}... (Expected: {expected}, Got: {prediction})")

        # Guard against an empty test set to avoid dividing by zero.
        accuracy = (correct_predictions / total_claims) * 100 if total_claims > 0 else 0
        print("\n" + "=" * 30)
        print(f"FINAL ACCURACY: {accuracy:.2f}% ({correct_predictions}/{total_claims})")
        print("=" * 30)
| |
|
| | |
# Benchmark fixture: one entry per context, each carrying the claims to verify
# together with the verdict the model is expected to return for them.
test_data = [
    {
        # Clinical case report mixing explicit facts, negated findings and
        # values that require simple arithmetic or paraphrase to verify.
        "context": """CASE PRESENTATION:
A 64-year-old male with a 15-year history of Type 2 Diabetes Mellitus and stage 3 chronic kidney disease (CKD)
presented to the emergency department with acute shortness of breath and peripheral edema. On physical
examination, the patient was hypertensive (175/95 mmHg) and tachycardic (110 bpm). Lung auscultation revealed
bilateral crackles in the lower lobes, consistent with pulmonary congestion. Notable laboratory findings
included a Serum Creatinine of 2.8 mg/dL (baseline 1.9 mg/dL) and a Brain Natriuretic Peptide (BNP) of 1,250 pg/mL.

Crucially, the patient reported no history of tobacco use and denied any chest pain or radiating pain to the
left arm. An EKG showed sinus tachycardia but no ST-segment elevation or T-wave inversion. The medical team
initiated a regimen of intravenous furosemide (40mg bolus) and transitioned the patient from his home
medication (Metformin) to insulin glargine to manage blood glucose during the acute episode, citing concerns
over lactic acidosis risk given the acute kidney injury. After 48 hours, the patient's oxygen saturation
improved from 89% on room air to 95%, and his weight decreased by 3.2 kg due to successful diuresis.
The discharge summary noted that despite the respiratory distress, there were no signs of systemic infection
or fever during the entire 4-day hospital stay.""",
        "claims": [
            {"text": statement, "expected": verdict}
            for statement, verdict in (
                ("The patient has had Type 2 Diabetes for 15 years.", "supported"),
                ("The patient showed signs of fluid buildup in the lungs.", "supported"),
                ("The patient has a history of smoking.", "not_supported"),
                ("The patient's Serum Creatinine increased by 0.9 mg/dL from his baseline.", "supported"),
                ("The doctors stopped Metformin because of the risk of lactic acidosis.", "supported"),
                ("The patient complained of pain moving down his left arm.", "not_supported"),
                ("The patient was experiencing high blood pressure and a fast heart rate upon arrival.", "supported"),
                ("The patient lost over 3 kilograms during the first two days of treatment.", "supported"),
                ("The EKG provided clear evidence of an active heart attack.", "not_supported"),
                ("The patient remained afebrile throughout the hospitalization.", "supported"),
            )
        ],
    },
    {
        # Short financial statement with one contradicted claim.
        "context": "The company reported a 15% increase in revenue, reaching $2 billion this quarter. However, net profit dropped due to high R&D costs.",
        "claims": [
            {"text": statement, "expected": verdict}
            for statement, verdict in (
                ("Revenue reached $2 billion.", "supported"),
                ("Net profit increased this quarter.", "not_supported"),
                ("Spending on Research and Development impacted profits.", "supported"),
            )
        ],
    },
]
| |
|
if __name__ == "__main__":
    # Script entry point: evaluate the bundled fixture against the default
    # endpoint and model configured in FactualityBenchmarker.__init__.
    benchmarker = FactualityBenchmarker()
    benchmarker.run_evaluation(test_cases=test_data)