# hallucination-guard-env / generate_cache.py
# SamSankar's picture
# Upload folder using huggingface_hub
# e442896 verified
"""
Cache Generator for HallucinationGuard-Env
==========================================
Run this ONCE on your PC to download all datasets and save them as cache files.
Then upload the cache/ folder to your HF Space.
Usage:
pip install datasets
python generate_cache.py
Output:
server/cache/squad_10000.json
server/cache/trivia_qa_10000.json
server/cache/halueval_5000.json
... etc (one file per dataset)
After this finishes, upload everything:
python -c "from huggingface_hub import HfApi; api = HfApi(); api.upload_folder(folder_path='.', repo_id='SamSankar/hallucination-guard-env', repo_type='space', ignore_patterns=['__pycache__', '*.pyc']); print('Upload complete!')"
"""
import json
import os
import sys
import time
from pathlib import Path
try:
from datasets import load_dataset as hf_load
except ImportError:
print("Run: pip install datasets")
sys.exit(1)
# Cache files are written to <directory containing this script>/server/cache.
# The directory (and any missing parents) is created as soon as the module is
# loaded, so later writes can assume it exists.
CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "server", "cache")
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
# Dataset key -> cap handed to the loader registered under the same key.
# The cap is also embedded in the cache file name ("<key>_<cap>.json"), so
# changing a cap makes main() generate a new file instead of reusing the old one.
# Dict order is the order in which main() processes the datasets.
# NOTE: the "faithdial" and "fever" keys are populated from substitute
# datasets (Anthropic/hh-rlhf and stanfordnlp/snli respectively); see
# load_faithdial and load_fever.
DATASETS = {
    "squad": 10000,
    "trivia_qa": 10000,
    "halueval": 5000,
    "truthful_qa": 817,
    "hotpotqa": 10000,
    "boolq": 9427,
    "faithdial": 10000,
    "fever": 10000,
    "arc": 3370,
    "openbookqa": 4957,
    "ms_marco": 10000,
    "coqa": 7199,
    "nq_open": 8000,
    "commonsense_qa": 8000,
    "winogrande": 5000,
}
# Target: ~100k examples
def load_squad(cap):
    """Build cache records from the first ``cap`` rows of the SQuAD train split.

    A row is kept only when it has a non-empty first answer text and a
    non-empty context. Contexts are cut to 1500 characters. Record ids use
    the row's position in the loaded slice, so ids are not contiguous when
    rows are dropped.

    Returns:
        A list of record dicts ready to be dumped as JSON.
    """
    rows = hf_load("squad", split=f"train[:{cap}]")
    records = []
    for idx, row in enumerate(rows):
        answer_texts = row.get("answers", {}).get("text", [])
        first_answer = answer_texts[0] if answer_texts else ""
        if first_answer and row.get("context"):
            records.append({
                "question": row["question"],
                "context": row["context"][:1500],
                "answer": first_answer,
                "id": f"squad_{idx}",
                "source": "squad",
                "difficulty": "intermediate",
                "category": "reading_comprehension",
                "hallucination_type": None,
                "entities": [],
                "metadata": {},
            })
    return records
def load_trivia_qa(cap):
    """Build cache records from the first ``cap`` rows of TriviaQA (rc.wikipedia).

    The context is the first entry of ``entity_pages["wiki_context"]``; the
    answer is the first normalized alias, falling back to the answer's
    ``value``. Rows without a usable context or answer are skipped. Contexts
    are cut to 1500 characters.

    Returns:
        A list of record dicts ready to be dumped as JSON.
    """
    ds = hf_load("trivia_qa", "rc.wikipedia", split=f"train[:{cap}]")
    out = []
    for i, item in enumerate(ds):
        pages = item.get("entity_pages", {})
        wiki = pages.get("wiki_context", []) if isinstance(pages, dict) else []
        if isinstance(wiki, (list, tuple)):
            # An empty list must yield an empty context so the row is
            # skipped; stringifying it would produce the truthy text "[]".
            ctx = wiki[0] if wiki else ""
        else:
            # Same reasoning for None / other falsy values ("None", "0", ...).
            ctx = str(wiki) if wiki else ""
        if not ctx:
            continue
        answer_info = item.get("answer", {})
        aliases = answer_info.get("normalized_aliases", [])
        answer = aliases[0] if aliases else answer_info.get("value", "")
        if not answer:
            continue
        out.append({
            "question": item["question"],
            "context": str(ctx)[:1500],
            "answer": str(answer),
            "id": f"triviaqa_{i}",
            "source": "trivia_qa",
            "difficulty": "intermediate",
            "category": "trivia",
            "hallucination_type": None,
            "entities": [],
            "metadata": {},
        })
    return out
def load_halueval(cap):
    """Build cache records from the first ``cap`` rows of HaluEval (qa config).

    The context comes from the ``knowledge`` field (falling back to
    ``context``) and the answer from ``right_answer`` (falling back to
    ``answer``). Rows missing a question or an answer are skipped; a missing
    context is allowed. Contexts are stringified and cut to 1500 characters.

    Returns:
        A list of record dicts ready to be dumped as JSON.
    """
    rows = hf_load("pminervini/HaluEval", "qa", split=f"data[:{cap}]")
    records = []
    for idx, row in enumerate(rows):
        question = row.get("question", "")
        knowledge = row.get("knowledge", row.get("context", ""))
        right_answer = row.get("right_answer", row.get("answer", ""))
        if not (question and right_answer):
            continue
        record = {
            "question": question,
            "context": str(knowledge)[:1500],
            "answer": str(right_answer),
            "id": f"halueval_{idx}",
            "source": "halueval",
            "difficulty": "advanced",
            "category": "hallucination_detection",
            "hallucination_type": row.get("hallucination_type"),
            "entities": [],
            "metadata": {},
        }
        records.append(record)
    return records
def load_truthful_qa(cap):
    """Build cache records from the TruthfulQA (generation) validation split.

    The whole split is loaded and iteration stops once ``cap`` rows have been
    visited. The context is the space-joined list of correct answers, or the
    question itself when that list is empty. Rows without a ``best_answer``
    are skipped. Contexts are cut to 1500 characters.

    Returns:
        A list of record dicts ready to be dumped as JSON.
    """
    rows = hf_load("truthful_qa", "generation", split="validation")
    records = []
    for idx, row in enumerate(rows):
        if idx >= cap:
            break
        best_answer = row.get("best_answer", "")
        correct_answers = row.get("correct_answers", [])
        if correct_answers:
            context = " ".join(correct_answers)
        else:
            context = row.get("question", "")
        if best_answer:
            records.append({
                "question": row["question"],
                "context": context[:1500],
                "answer": best_answer,
                "id": f"truthfulqa_{idx}",
                "source": "truthful_qa",
                "difficulty": "expert",
                "category": "factuality",
                "hallucination_type": None,
                "entities": [],
                "metadata": {},
            })
    return records
def load_hotpotqa(cap):
    """Build cache records from the first ``cap`` rows of HotpotQA (fullwiki, train).

    The context is assembled as "<title>: <sentences joined by spaces>" for
    each title/sentence-list pair, joined with spaces and cut to 1500
    characters. Rows missing a question, an answer or a context are skipped.

    Returns:
        A list of record dicts ready to be dumped as JSON.
    """
    rows = hf_load("hotpot_qa", "fullwiki", split=f"train[:{cap}]")
    records = []
    for idx, row in enumerate(rows):
        question = row.get("question", "")
        gold = row.get("answer", "")
        supporting = row.get("context", {})
        titles = supporting.get("title", [])
        sentence_groups = supporting.get("sentences", [])
        paragraphs = [
            f"{title}: {' '.join(sentences)}"
            for title, sentences in zip(titles, sentence_groups)
        ]
        context = " ".join(paragraphs)[:1500]
        if not (question and gold and context):
            continue
        records.append({
            "question": question,
            "context": context,
            "answer": str(gold),
            "id": f"hotpotqa_{idx}",
            "source": "hotpotqa",
            "difficulty": "advanced",
            "category": "multi_hop_reasoning",
            "hallucination_type": None,
            "entities": [],
            "metadata": {},
        })
    return records
def load_boolq(cap):
    """Build cache records from the first ``cap`` rows of the BoolQ train split.

    The passage (cut to 1500 characters) is the context and the answer is
    rendered as the string "yes" or "no" depending on the truthiness of the
    row's ``answer`` field. Rows missing a question or a passage are skipped.

    Returns:
        A list of record dicts ready to be dumped as JSON.
    """
    rows = hf_load("google/boolq", split=f"train[:{cap}]")
    records = []
    for idx, row in enumerate(rows):
        question = row.get("question", "")
        passage = row.get("passage", "")
        if not (question and passage):
            continue
        if row.get("answer", False):
            verdict = "yes"
        else:
            verdict = "no"
        records.append({
            "question": question,
            "context": passage[:1500],
            "answer": verdict,
            "id": f"boolq_{idx}",
            "source": "boolq",
            "difficulty": "beginner",
            "category": "yes_no_qa",
            "hallucination_type": None,
            "entities": [],
            "metadata": {},
        })
    return records
def load_faithdial(cap):
    """Build "faithdial" cache records from the first ``cap`` rows of Anthropic/hh-rlhf.

    Each row's ``chosen`` transcript is split on "Human:"; within every turn
    that contains "Assistant:", the text before the marker is the question
    and the text after it is the answer. The LAST turn with both parts
    non-empty wins. The context is the first 800 characters of the
    transcript; question and answer are cut to 200 and 400 characters.

    Returns:
        A list of record dicts ready to be dumped as JSON.
    """
    # Anthropic HH-RLHF: human preference dialogues, 100% parquet
    rows = hf_load("Anthropic/hh-rlhf", split="train[:%d]" % cap)
    records = []
    for idx, row in enumerate(rows):
        transcript = row.get("chosen", "")
        if not transcript:
            continue
        last_question, last_answer = "", ""
        # Element 0 is whatever precedes the first "Human:" marker.
        for turn in transcript.split("Human:")[1:]:
            human_text, marker, assistant_text = turn.partition("Assistant:")
            if not marker:
                continue
            human_text = human_text.strip()
            # A turn can never contain "Human:" (it was the split separator),
            # so the reply runs to the end of the turn.
            assistant_text = assistant_text.strip()
            if human_text and assistant_text:
                last_question, last_answer = human_text, assistant_text
        if not (last_question and last_answer):
            continue
        records.append({
            "question": last_question[:200],
            "context": transcript[:800],
            "answer": last_answer[:400],
            "id": "faithdial_%d" % idx,
            "source": "faithdial",
            "difficulty": "advanced",
            "category": "hallucination_detection",
            "hallucination_type": None,
            "entities": [],
            "metadata": {},
        })
    return records
def load_fever(cap):
    """Build "fever" cache records from the first ``cap`` rows of SNLI (train).

    SNLI is used as a fact-verification stand-in: the premise becomes the
    context, the hypothesis is embedded in the question, and the NLI label is
    translated to a FEVER-style verdict.

    Rows are skipped when the premise or hypothesis is empty, or when the
    label is missing or is not one of the three gold classes (SNLI marks
    examples without a gold label with -1).

    Returns:
        A list of record dicts ready to be dumped as JSON.
    """
    # Stanford NLI: pure parquet, fact verification task.
    ds = hf_load("stanfordnlp/snli", split=f"train[:{cap}]")
    # SNLI class ids are 0=entailment, 1=neutral, 2=contradiction.
    # Neutral (1) is "NOT ENOUGH INFO" and contradiction (2) is "REFUTES";
    # swapping those two mislabels every non-entailment row.
    label_map = {0: "SUPPORTS", 1: "NOT ENOUGH INFO", 2: "REFUTES"}
    out = []
    for i, item in enumerate(ds):
        premise = item.get("premise", "")
        hypothesis = item.get("hypothesis", "")
        # -1 (no gold label) and None are absent from the map, so .get()
        # returns None and the row is skipped instead of raising.
        label = label_map.get(item.get("label", -1))
        if not premise or not hypothesis or label is None:
            continue
        out.append({
            "question": f"Does the premise support or refute this hypothesis? Hypothesis: {hypothesis}",
            "context": f"Premise: {premise}",
            "answer": label,
            "id": f"fever_{i}",
            "source": "fever",
            "difficulty": "advanced",
            "category": "fact_verification",
            "hallucination_type": None,
            "entities": [],
            "metadata": {},
        })
    return out
def load_arc(cap):
    """Build cache records from ARC-Challenge, combining train, validation and test.

    Splits are read in that order until ``cap`` records have been collected.
    The context lists all answer choices as "label: text" joined by " | ";
    the answer is the text of the choice whose label equals ``answerKey``.
    Rows without a question or a matching choice are skipped.

    Loading is best-effort per split: a split that fails is reported on
    stderr and skipped, and records gathered so far are kept.

    Returns:
        A list of record dicts ready to be dumped as JSON (possibly empty).
    """
    out = []
    for split in ("train", "validation", "test"):
        if len(out) >= cap:
            # Cap already reached: do not download splits we will not use.
            break
        try:
            ds = hf_load("allenai/ai2_arc", "ARC-Challenge", split=split)
            for item in ds:
                if len(out) >= cap:
                    break
                q = item.get("question", "")
                choices = item.get("choices", {})
                ans_key = item.get("answerKey", "")
                labels = choices.get("label", [])
                texts = choices.get("text", [])
                ctx = "Choices: " + " | ".join(f"{l}: {t}" for l, t in zip(labels, texts))
                answer = next((t for l, t in zip(labels, texts) if l == ans_key), "")
                if not q or not answer:
                    continue
                out.append({
                    "question": q,
                    "context": ctx,
                    "answer": answer,
                    # Running count rather than row index: ids stay unique
                    # across the three splits.
                    "id": f"arc_{len(out)}",
                    "source": "arc",
                    "difficulty": "advanced",
                    "category": "science_exam",
                    "hallucination_type": None,
                    "entities": [],
                    "metadata": {},
                })
        except Exception as exc:
            # Keep the best-effort behaviour, but make the failure visible so
            # a short or empty cache is not mistaken for a complete one.
            print(f"warning: ARC split '{split}' failed and was skipped: {exc}",
                  file=sys.stderr)
    return out
def load_openbookqa(cap):
    """Build cache records from the first ``cap`` rows of OpenBookQA (main, train).

    The context combines the row's ``fact1`` with every answer choice
    ("label: text" joined by " | ") and is cut to 1500 characters. The answer
    is the text of the first choice whose label equals ``answerKey``. Rows
    without a question stem or a matching choice are skipped.

    Returns:
        A list of record dicts ready to be dumped as JSON.
    """
    rows = hf_load("allenai/openbookqa", "main", split=f"train[:{cap}]")
    records = []
    for idx, row in enumerate(rows):
        stem = row.get("question_stem", "")
        choices = row.get("choices", {})
        key = row.get("answerKey", "")
        labels = choices.get("label", [])
        texts = choices.get("text", [])
        fact = row.get("fact1", "")
        rendered_choices = [f"{l}: {t}" for l, t in zip(labels, texts)]
        context = f"Core fact: {fact} | Choices: " + " | ".join(rendered_choices)
        # First choice whose label matches the key, or "" when none does.
        correct_text = ""
        for label, text in zip(labels, texts):
            if label == key:
                correct_text = text
                break
        if not (stem and correct_text):
            continue
        records.append({
            "question": stem,
            "context": context[:1500],
            "answer": correct_text,
            "id": f"openbookqa_{idx}",
            "source": "openbookqa",
            "difficulty": "intermediate",
            "category": "science_facts",
            "hallucination_type": None,
            "entities": [],
            "metadata": {},
        })
    return records
def load_ms_marco(cap):
    """Build cache records from the first ``cap`` rows of MS MARCO v2.1 (train).

    The context is all passage texts joined with spaces and cut to 1500
    characters; the answer is the first entry of ``answers``. Rows are
    skipped when the query, context or answer is empty, or when the answer is
    the literal placeholder "No Answer Present.".

    Returns:
        A list of record dicts ready to be dumped as JSON.
    """
    rows = hf_load("microsoft/ms_marco", "v2.1", split=f"train[:{cap}]")
    records = []
    for idx, row in enumerate(rows):
        query = row.get("query", "")
        passages = row.get("passages", {})
        if isinstance(passages, dict):
            passage_texts = passages.get("passage_text", [])
        else:
            passage_texts = []
        context = " ".join(passage_texts)[:1500] if passage_texts else ""
        candidate_answers = row.get("answers", [])
        first_answer = candidate_answers[0] if candidate_answers else ""
        if not (query and context and first_answer):
            continue
        if first_answer == "No Answer Present.":
            continue
        records.append({
            "question": query,
            "context": context,
            "answer": str(first_answer),
            "id": f"msmarco_{idx}",
            "source": "ms_marco",
            "difficulty": "intermediate",
            "category": "web_search_qa",
            "hallucination_type": None,
            "entities": [],
            "metadata": {},
        })
    return records
def load_coqa(cap):
    """Build cache records from the first ``cap`` rows of the CoQA train split.

    Only the first question of each conversation is used, paired with the
    first entry of ``answers["input_text"]``. The story (cut to 1500
    characters) is the context. Rows missing the story, the question list,
    the answer list, or with an empty first question/answer are skipped.

    Returns:
        A list of record dicts ready to be dumped as JSON.
    """
    rows = hf_load("stanfordnlp/coqa", split=f"train[:{cap}]")
    records = []
    for idx, row in enumerate(rows):
        story = row.get("story", "")
        questions = row.get("questions", [])
        answers = row.get("answers", {})
        if isinstance(answers, dict):
            answer_texts = answers.get("input_text", [])
        else:
            answer_texts = []
        if not (story and questions and answer_texts):
            continue
        # Both sequences are known to be non-empty past the guard above.
        opening_question = questions[0]
        opening_answer = answer_texts[0]
        if not (opening_question and opening_answer):
            continue
        records.append({
            "question": str(opening_question),
            "context": story[:1500],
            "answer": str(opening_answer),
            "id": f"coqa_{idx}",
            "source": "coqa",
            "difficulty": "intermediate",
            "category": "conversational_qa",
            "hallucination_type": None,
            "entities": [],
            "metadata": {},
        })
    return records
def load_nq_open(cap):
    """Build cache records from the first ``cap`` rows of the NQ-Open train split.

    The context is a fixed prompt prefix followed by the question itself.
    The answer is the first entry of the row's ``answer`` list. Rows without
    a question or an answer are skipped.

    Returns:
        A list of record dicts ready to be dumped as JSON.
    """
    rows = hf_load("nq_open", split="train[:%d]" % cap)
    records = []
    for idx, row in enumerate(rows):
        question = row.get("question", "")
        candidates = row.get("answer", [])
        first_answer = candidates[0] if candidates else ""
        if question and first_answer:
            records.append({
                "question": question,
                "context": "Answer based on your knowledge: " + question,
                "answer": str(first_answer),
                "id": "nq_open_%d" % idx,
                "source": "nq_open",
                "difficulty": "intermediate",
                "category": "open_domain_qa",
                "hallucination_type": None,
                "entities": [],
                "metadata": {},
            })
    return records
def load_commonsense_qa(cap):
    """Build cache records from the first ``cap`` rows of CommonsenseQA (train).

    The context lists every answer choice as "label: text" joined by " | ";
    the answer is the text of the first choice whose label equals
    ``answerKey``. Rows without a question or a matching choice are skipped.

    Returns:
        A list of record dicts ready to be dumped as JSON.
    """
    rows = hf_load("tau/commonsense_qa", split="train[:%d]" % cap)
    records = []
    for idx, row in enumerate(rows):
        question = row.get("question", "")
        choices = row.get("choices", {})
        if isinstance(choices, dict):
            labels = choices.get("label", [])
            texts = choices.get("text", [])
        else:
            labels, texts = [], []
        key = row.get("answerKey", "")
        rendered = ["%s: %s" % (l, t) for l, t in zip(labels, texts)]
        context = "Choices: " + " | ".join(rendered)
        # First choice whose label matches the key, or "" when none does.
        correct_text = ""
        for label, text in zip(labels, texts):
            if label == key:
                correct_text = text
                break
        if not (question and correct_text):
            continue
        records.append({
            "question": question,
            "context": context,
            "answer": correct_text,
            "id": "csqa_%d" % idx,
            "source": "commonsense_qa",
            "difficulty": "intermediate",
            "category": "commonsense_reasoning",
            "hallucination_type": None,
            "entities": [],
            "metadata": {},
        })
    return records
def load_winogrande(cap):
    """Build cache records from the first ``cap`` rows of WinoGrande (winogrande_xl, train).

    The answer is ``option1`` when the stringified ``answer`` field equals
    "1" and ``option2`` otherwise. The context repeats the sentence together
    with both options. Rows with an empty sentence or an empty selected
    option are skipped.

    Returns:
        A list of record dicts ready to be dumped as JSON.
    """
    # WinoGrande — commonsense reasoning, 100% parquet, no scripts
    rows = hf_load("allenai/winogrande", "winogrande_xl", split="train[:%d]" % cap)
    records = []
    for idx, row in enumerate(rows):
        sentence = row.get("sentence", "")
        first_option = row.get("option1", "")
        second_option = row.get("option2", "")
        if str(row.get("answer", "1")) == "1":
            chosen = first_option
        else:
            chosen = second_option
        if not (sentence and chosen):
            continue
        context = "Sentence: %s Options: 1: %s | 2: %s" % (sentence, first_option, second_option)
        records.append({
            "question": "Which option correctly fills the blank? " + sentence,
            "context": context,
            "answer": chosen,
            "id": "winogrande_%d" % idx,
            "source": "winogrande",
            "difficulty": "intermediate",
            "category": "commonsense_reasoning",
            "hallucination_type": None,
            "entities": [],
            "metadata": {},
        })
    return records
# Dispatch table: dataset key (same keys as DATASETS) -> loader function.
# Every loader takes the cap as its only argument and returns a list of
# record dicts; main() looks the loader up by key and calls it with the cap.
LOADERS = {
    "squad": load_squad,
    "trivia_qa": load_trivia_qa,
    "halueval": load_halueval,
    "truthful_qa": load_truthful_qa,
    "hotpotqa": load_hotpotqa,
    "boolq": load_boolq,
    "faithdial": load_faithdial,
    "fever": load_fever,
    "arc": load_arc,
    "openbookqa": load_openbookqa,
    "ms_marco": load_ms_marco,
    "coqa": load_coqa,
    "nq_open": load_nq_open,
    "commonsense_qa": load_commonsense_qa,
    "winogrande": load_winogrande,
}
def main():
    """Generate one JSON cache file per entry in DATASETS and print a summary.

    For each dataset the cache file is ``<CACHE_DIR>/<name>_<cap>.json``:

    * If the file exists and parses as JSON it is counted and skipped.
    * If it exists but cannot be read or parsed (for example a truncated
      file left by an interrupted run) it is regenerated.
    * Otherwise the loader registered in LOADERS is called and its result is
      written to a temporary file that is then renamed into place, so a
      crash mid-write never leaves a partial file under the final name.

    A failure for one dataset is reported and does not stop the others.
    """
    total = 0
    failed = []
    print(f"\n{'='*55}")
    print(f" HallucinationGuard Cache Generator")
    print(f" Output: {CACHE_DIR}")
    print(f"{'='*55}\n")
    for ds_name, cap in DATASETS.items():
        cache_file = os.path.join(CACHE_DIR, f"{ds_name}_{cap}.json")
        if os.path.exists(cache_file):
            try:
                with open(cache_file, encoding="utf-8") as f:
                    existing = json.load(f)
            except (OSError, ValueError) as e:
                # json.JSONDecodeError and UnicodeDecodeError are both
                # ValueError subclasses; fall through and rebuild the file.
                print(f" ⚠️ {ds_name}: existing cache unreadable ({e}) — regenerating")
            else:
                print(f" ✅ {ds_name}: already cached ({len(existing)} examples) — skipping")
                total += len(existing)
                continue
        print(f" Downloading {ds_name} ({cap} examples)...", end=" ", flush=True)
        t0 = time.time()
        tmp_file = cache_file + ".tmp"
        try:
            loader = LOADERS[ds_name]
            examples = loader(cap)
            with open(tmp_file, "w", encoding="utf-8") as f:
                json.dump(examples, f)
            # Rename only after a complete, successful write.
            os.replace(tmp_file, cache_file)
            elapsed = time.time() - t0
            total += len(examples)
            print(f"✅ {len(examples)} examples saved ({elapsed:.0f}s)")
        except Exception as e:
            # Top-level boundary: report and carry on with the next dataset.
            print(f"❌ Failed: {e}")
            failed.append(ds_name)
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
    print(f"\n{'='*55}")
    print(f" Done! Total examples cached: {total:,}")
    if failed:
        print(f" Failed datasets (re-run to retry): {', '.join(failed)}")
    print(f" Cache location: {CACHE_DIR}")
    print(f"\n Now upload to HF Space:")
    print(f" python -c \"from huggingface_hub import HfApi; api = HfApi(); api.upload_folder(folder_path='.', repo_id='SamSankar/hallucination-guard-env', repo_type='space', ignore_patterns=['__pycache__', '*.pyc']); print('Upload complete!')\"")
    print(f"{'='*55}\n")
# Run the generator only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()