Spaces:
Running
Running
| from datasets import load_metric | |
| import numpy as np | |
def compute_rouge(preds, refs):
    """Score predictions against references with the "rouge" metric.

    Args:
        preds: Predictions, forwarded unchanged as ``predictions``.
        refs: References, forwarded unchanged as ``references``.

    Returns:
        Whatever the loaded metric's ``compute`` call returns.
    """
    metric = load_metric("rouge")
    result = metric.compute(predictions=preds, references=refs)
    return result
def compute_bleu(preds, refs):
    """Score predictions against references with the "bleu" metric.

    The ``datasets`` "bleu" metric is documented as taking each prediction
    as a list of tokens and each example's references as a list of token
    lists. String inputs are therefore split on whitespace before scoring;
    inputs that are already token sequences are passed through untouched.

    Args:
        preds: One prediction per example, as a string or a token list.
        refs: One reference per example, as a string or a token list.

    Returns:
        Whatever the loaded metric's ``compute`` call returns.
    """

    def _tokens(text):
        # Only raw strings need tokenizing; token lists are used as-is.
        return text.split() if isinstance(text, str) else text

    bleu = load_metric("bleu")
    predictions = [_tokens(p) for p in preds]
    # bleu expects a list of references per example, hence the extra nesting.
    references = [[_tokens(r)] for r in refs]
    return bleu.compute(predictions=predictions, references=references)
def factuality_score(preds, refs):
    """Very simple lexical overlap metric for factual alignment.

    For each (prediction, reference) pair, both strings are lowercased and
    split on whitespace into sets of unique tokens. The pair's score is the
    number of reference tokens that also appear in the prediction, divided
    by the number of reference tokens (treated as 1 when the reference has
    no tokens, so such a pair scores 0).

    Pairs are formed with ``zip``, so items beyond the shorter of the two
    inputs are ignored.

    Args:
        preds: Iterable of prediction strings.
        refs: Iterable of reference strings.

    Returns:
        ``{"factuality": mean_score}`` where ``mean_score`` is a float; it is
        0.0 when there are no pairs to score.
    """
    scores = []
    for pred, ref in zip(preds, refs):
        pred_tokens = set(pred.lower().split())
        ref_tokens = set(ref.lower().split())
        # max(1, ...) guards against division by zero for empty references.
        scores.append(len(pred_tokens & ref_tokens) / max(1, len(ref_tokens)))

    if not scores:
        # np.mean([]) would yield nan plus a RuntimeWarning; report 0.0.
        return {"factuality": 0.0}
    return {"factuality": float(np.mean(scores))}