# xkcd_finder/build_index.py
from __future__ import annotations

import pickle

import faiss
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

INDEX_FILE = "xkcd.index"
META_FILE = "meta.pkl"
# --- Build / load index ---
def build_index():
    print("Building FAISS index...")
    ds = load_dataset("olivierdehaene/xkcd", split="train")
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Concatenate title, transcript, and explanation into one text per comic
    texts = []
    for ex in ds:
        title = ex["title"] if ex["title"] else ""
        transcript = ex["transcript"] if ex["transcript"] else ""
        explanation = (
            ex["explanation"] if "explanation" in ex and ex["explanation"] else ""
        )
        texts.append(f"{title} {transcript} {explanation}")

    # Embed all comics and index them with an exact L2 (flat) FAISS index
    embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    faiss.write_index(index, INDEX_FILE)

    # Store just the metadata we need (pickle-friendly)
    meta = [
        {
            "id": ex["id"],
            "title": ex["title"],
            "transcript": ex["transcript"],
            "explanation": ex["explanation"] if "explanation" in ex else "",
        }
        for ex in ds
    ]
    with open(META_FILE, "wb") as f:
        pickle.dump(meta, f)
    return index, meta


if __name__ == "__main__":
    build_index()
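

# Illustrative sketch (not part of the build step): one way the index and
# metadata written above could be consumed at query time. The function name
# `search_comics` and the default top-k are assumptions, not this repo's API.
def search_comics(query: str, k: int = 5):
    # Load the artifacts produced by build_index()
    index = faiss.read_index(INDEX_FILE)
    with open(META_FILE, "rb") as f:
        meta = pickle.load(f)

    # Embed the query with the same model used for the corpus, then take the
    # k nearest neighbors by L2 distance and map them back to comic metadata
    model = SentenceTransformer("all-MiniLM-L6-v2")
    query_vec = model.encode([query], convert_to_numpy=True)
    _distances, indices = index.search(query_vec, k)
    return [meta[i] for i in indices[0]]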