axionx-demo / shared /metrics.py
deepsodha's picture
Upload 25 files
beb5479 verified
raw
history blame contribute delete
716 Bytes
from datasets import load_metric
import numpy as np
def compute_rouge(preds, refs):
    """Score predictions against references with the "rouge" metric.

    Args:
        preds: Predictions, forwarded unchanged as ``predictions``.
        refs: References, forwarded unchanged as ``references``.

    Returns:
        Whatever the loaded metric's ``compute`` call returns.
    """
    # NOTE(review): `datasets.load_metric` is deprecated upstream in favour of
    # the separate `evaluate` package -- confirm the pinned `datasets` version
    # still provides it.
    return load_metric("rouge").compute(predictions=preds, references=refs)
def compute_bleu(preds, refs):
    """Compute BLEU for predictions against one reference each.

    The "bleu" metric works on tokens: each prediction must be a list of
    tokens and each prediction's references a list of token lists. Raw
    strings are therefore split on whitespace before scoring; inputs that
    are already token sequences are passed through unchanged.

    Args:
        preds: Predictions, each a string or a sequence of tokens.
        refs: References aligned with ``preds`` (one per prediction), each a
            string or a sequence of tokens.

    Returns:
        Whatever the loaded metric's ``compute`` call returns.
    """
    bleu = load_metric("bleu")

    def _tokens(text):
        # Only split raw strings; keep pre-tokenized input as given.
        return text.split() if isinstance(text, str) else text

    predictions = [_tokens(p) for p in preds]
    # Each prediction gets a list of references; here exactly one.
    references = [[_tokens(r)] for r in refs]
    return bleu.compute(predictions=predictions, references=references)
def factuality_score(preds, refs):
    """Very simple lexical overlap metric for factual alignment.

    For each (prediction, reference) pair, both strings are lower-cased and
    split on whitespace into sets of unique tokens. The pair's score is the
    fraction of reference tokens that also occur in the prediction. The
    result is the mean of the per-pair scores.

    Args:
        preds: Iterable of prediction strings.
        refs: Iterable of reference strings, aligned with ``preds``. Pairs
            are formed with ``zip``, so surplus items in the longer input
            are ignored.

    Returns:
        ``{"factuality": mean_score}``. A pair whose reference has no tokens
        scores 0.0, and an input with no pairs yields 0.0 rather than NaN.
    """
    scores = [
        # max(1, ...) avoids dividing by zero for an empty reference.
        len(set(p.lower().split()) & r_tokens) / max(1, len(r_tokens))
        for p, r_tokens in (
            (p, set(r.lower().split())) for p, r in zip(preds, refs)
        )
    ]
    if not scores:
        # np.mean([]) would warn and return NaN; report an explicit zero.
        return {"factuality": np.float64(0.0)}
    return {"factuality": np.mean(scores)}