"""Text-generation evaluation helpers: ROUGE, BLEU and a lexical-overlap score."""

from datasets import load_metric
import numpy as np

# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package -- confirm the pinned version
# before upgrading.


def _as_tokens(text):
    """Return *text* as a token sequence.

    Strings are split on whitespace; anything else is assumed to be an
    already-tokenized sequence and is returned unchanged.
    """
    return text.split() if isinstance(text, str) else text


def compute_rouge(preds, refs):
    """Compute ROUGE for predictions against references.

    Args:
        preds: Predicted texts.
        refs: Reference texts, aligned with ``preds``.

    Returns:
        Whatever the loaded "rouge" metric's ``compute`` returns.
    """
    rouge = load_metric("rouge")
    return rouge.compute(predictions=preds, references=refs)


def compute_bleu(preds, refs):
    """Compute BLEU for predictions against a single reference each.

    Args:
        preds: Predicted texts. Each item may be a string (split on
            whitespace here) or an already-tokenized sequence.
        refs: Reference texts, one per prediction, in the same form.

    Returns:
        Whatever the loaded "bleu" metric's ``compute`` returns.
    """
    bleu = load_metric("bleu")
    # The bleu metric works on tokens: each prediction is a list of tokens
    # and each entry of references is a list of tokenized references.
    tokenized_preds = [_as_tokens(p) for p in preds]
    tokenized_refs = [[_as_tokens(r)] for r in refs]
    return bleu.compute(predictions=tokenized_preds, references=tokenized_refs)


def factuality_score(preds, refs):
    """Very simple lexical overlap metric for factual alignment.

    For each (prediction, reference) pair, both texts are lower-cased and
    split on whitespace; the score is the fraction of distinct reference
    tokens that also occur in the prediction. Pairs are formed with ``zip``,
    so extra items in the longer input are ignored.

    Args:
        preds: Predicted strings.
        refs: Reference strings.

    Returns:
        ``{"factuality": mean_score}``; the mean is ``0.0`` when there are
        no pairs to score.
    """
    scores = [
        # max(1, ...) guards against division by zero for an empty reference.
        len(set(p.lower().split()) & set(r.lower().split()))
        / max(1, len(set(r.lower().split())))
        for p, r in zip(preds, refs)
    ]
    if not scores:
        # np.mean([]) would yield nan and emit a RuntimeWarning.
        return {"factuality": 0.0}
    return {"factuality": np.mean(scores)}