Spaces:
Sleeping
Sleeping
| # data_utils.py | |
| # Lightweight dataset loaders + simple hashing vectorizer (no sklearn) | |
| # Works on CPU-only Spaces and avoids heavy tokenizers. | |
| from typing import List, Tuple | |
| import numpy as np | |
| from datasets import load_dataset | |
| # ----------------------------- | |
| # Hashing vectorizer (unigram + bigram) | |
| # ----------------------------- | |
def hash_vectorize(texts: List[str], n_features: int = 4096, seed: int = 1234) -> np.ndarray:
    """
    Very fast, tokenizer-free vectorizer.

    - Lowercases text
    - Splits on whitespace
    - Places unigrams + bigrams into a fixed-size bag using a seeded CRC32
    - L2-normalizes each row

    CRC32 is used instead of the builtin ``hash`` because ``str`` hashes are
    salted per interpreter process, which would send the same token to a
    different column on every run. With CRC32 the output depends only on
    ``texts``, ``n_features`` and ``seed``.

    Args:
        texts: Input strings. A ``None`` entry yields an all-zero row.
        n_features: Number of hash buckets (columns of the result).
        seed: Starting value of the CRC32 checksum; different seeds give
            different token-to-bucket assignments.

    Returns:
        ``float32`` array of shape ``(len(texts), n_features)``. Rows for
        ``None`` or token-free inputs are all zeros; every other row has
        unit L2 norm.
    """
    # Local import keeps this function self-contained (file-level imports untouched).
    import zlib

    # zlib.crc32 expects an unsigned 32-bit start value; mask so that
    # negative or oversized seeds are still accepted.
    crc_start = seed & 0xFFFFFFFF

    def bucket(term: str) -> int:
        # "surrogatepass" lets lone surrogates in scraped text encode instead of raising.
        data = term.encode("utf-8", "surrogatepass")
        return zlib.crc32(data, crc_start) % n_features

    X = np.zeros((len(texts), n_features), dtype=np.float32)
    for i, t in enumerate(texts):
        if t is None:
            continue
        prev = None
        for tok in t.lower().split():
            X[i, bucket(tok)] += 1.0
            if prev is not None:
                # Bigram feature: previous and current token joined by "_".
                X[i, bucket(prev + "_" + tok)] += 1.0
            prev = tok
        # L2 norm; the epsilon keeps token-free rows at zero instead of NaN.
        norm = float(np.linalg.norm(X[i])) + 1e-8
        X[i] /= norm
    return X
| # ----------------------------- | |
| # PIQA tiny subset loader | |
| # Produces pair-expanded binary rows for a quick proxy classifier. | |
| # ----------------------------- | |
def load_piqa(subset: int = 800, seed: int = 42) -> Tuple[list, np.ndarray, list, np.ndarray]:
    """
    Load a small random subset of PIQA as pair-expanded binary rows.

    Args:
        subset: Number of training examples to sample (capped at the size of
            the train split). The validation sample size is
            ``max(subset // 4, 200)``, capped at the size of that split.
        seed: Seed for the ``numpy`` RandomState that draws both samples.

    Returns:
        Xtr_txt, ytr, Xva_txt, yva
    Where:
        - For each original PIQA example, we emit TWO rows:
            [goal + sol1] with label 1 if sol1 is correct else 0
            [goal + sol2] with label 1 if sol2 is correct else 0
        - The text lists are therefore twice as long as the number of sampled
          examples, and the label arrays are ``int64``.
    """
    ds = load_dataset("piqa")
    tr = ds["train"]
    va = ds["validation"]

    # Train indices are drawn before validation indices; keep this order so a
    # given seed keeps selecting the same examples.
    rng = np.random.RandomState(seed)
    idx_tr = rng.choice(len(tr), size=min(subset, len(tr)), replace=False)
    idx_va = rng.choice(len(va), size=min(max(subset // 4, 200), len(va)), replace=False)

    def pack(rows, idxs):
        X_text, y = [], []
        for k in idxs:
            # rng.choice yields numpy integers; cast to a plain int so row
            # lookup does not depend on the container accepting numpy scalars.
            p = rows[int(k)]
            stem = (p.get("goal") or "").strip()
            sol1 = (p.get("sol1") or "").strip()
            sol2 = (p.get("sol2") or "").strip()
            # label == 0 means sol1 is the correct solution, 1 means sol2.
            label = int(p.get("label", 0))
            X_text.append(f"{stem} {sol1}")
            y.append(1 if label == 0 else 0)
            X_text.append(f"{stem} {sol2}")
            y.append(1 if label == 1 else 0)
        return X_text, np.array(y, dtype=np.int64)

    Xtr_txt, ytr = pack(tr, idx_tr)
    Xva_txt, yva = pack(va, idx_va)
    return Xtr_txt, ytr, Xva_txt, yva
| # ----------------------------- | |
| # HellaSwag tiny subset loader | |
| # Expands each example into 4 rows (one-vs-all), later regrouped into argmax. | |
| # ----------------------------- | |
def load_hellaswag(subset: int = 800, seed: int = 42) -> Tuple[list, np.ndarray, list, np.ndarray]:
    """
    Load a small random subset of HellaSwag as one-vs-all binary rows.

    Args:
        subset: Number of training examples to sample (capped at the size of
            the train split). The validation sample size is
            ``max(subset // 4, 200)``, capped at the size of that split.
        seed: Seed for the ``numpy`` RandomState that draws both samples.

    Returns:
        Xtr_txt, ytr, Xva_txt, yva
    Where:
        - For each original example, we emit one row per candidate ending
          (FOUR in the standard dataset):
            [context + ending_i] with label 1 if i is correct else 0
        - Label arrays are ``int64``.
    """
    ds = load_dataset("hellaswag")
    tr = ds["train"]
    va = ds["validation"]

    # Train indices are drawn before validation indices; keep this order so a
    # given seed keeps selecting the same examples.
    rng = np.random.RandomState(seed)
    idx_tr = rng.choice(len(tr), size=min(subset, len(tr)), replace=False)
    idx_va = rng.choice(len(va), size=min(max(subset // 4, 200), len(va)), replace=False)

    def pack(rows, idxs):
        X_text, y = [], []
        for k in idxs:
            # rng.choice yields numpy integers; cast to a plain int so row
            # lookup does not depend on the container accepting numpy scalars.
            p = rows[int(k)]
            # 'ctx' is already the full context (ctx_a followed by ctx_b), so
            # appending 'ctx_a' again would repeat the first sentence between
            # the context and the ending. Use 'ctx' alone.
            ctx = (p.get("ctx") or "").strip()
            if not ctx:
                # Fallback for variants without 'ctx' — assumes they expose the
                # two halves as 'ctx_a' / 'ctx_b'; TODO confirm against such a variant.
                ctx = f"{(p.get('ctx_a') or '')} {(p.get('ctx_b') or '')}".strip()
            endings = p.get("endings") or []
            # int() also accepts a label stored as a digit string.
            label = int(p.get("label", 0))
            for i, e in enumerate(endings):
                X_text.append(f"{ctx} {e}".strip())
                y.append(1 if i == label else 0)
        return X_text, np.array(y, dtype=np.int64)

    Xtr_txt, ytr = pack(tr, idx_tr)
    Xva_txt, yva = pack(va, idx_va)
    return Xtr_txt, ytr, Xva_txt, yva