# luisrui
# Deploy ModelLens v1: BYOK OpenAI key, size filter, official-only filter, 47k HF model pool
"""Build the candidate model pool consumed by the recommendation web app.

The output is a single .npz that bundles, for every candidate model:
  - model_name (str)
  - size_id (int, bucket id matching the trained MLPMetric)
  - family_id (int)
  - popularity (int, HF downloads in the last 30d; 0 if unknown)
  - hf_url (str, https://huggingface.co/<name> if name looks like a repo id)

Run from the project root:
    python web/build_model_pool.py \
        --data-dir data/unified_augmented \
        --args checkpoint/mlp/unified_augmented/ablation_no_model_id_no_dataset_id/args.json \
        --out web/assets/model_pool.npz
"""
from __future__ import annotations

import argparse
import json
import os

import numpy as np
# Fallback size-bucket edges (model size in billions of parameters), used
# when the training args.json is missing or does not define "size_bucket".
SIZE_EDGES_DEFAULT = [
    0.001, 0.003, 0.01, 0.03, 0.06,
    0.1, 0.15, 0.2, 0.3, 0.4,
    0.5, 0.6, 0.8, 1, 3,
    7, 14, 35, 70, 100, 1000,
]
def assign_size_bucket(size_b: float, size_edges: np.ndarray, unknown_id: int) -> int:
    """Map a model size (billions of params) to its size-bucket id.

    Returns ``unknown_id`` when the value is non-numeric, non-finite, or
    exactly zero; otherwise the index from a right-sided binary search over
    ``size_edges`` (a size equal to an edge lands in the bucket above it).
    """
    try:
        size = float(size_b)
    except (TypeError, ValueError):
        return unknown_id
    if size == 0.0 or not np.isfinite(size):
        return unknown_id
    return int(np.searchsorted(size_edges, size, side="right"))
def get_size_b(profile_entry) -> float:
    """Extract a model's size in billions of params from its profile entry.

    Returns NaN (the "unknown" marker) when the entry is missing or not a
    dict, the size is the literal string "unknown", the value cannot be
    parsed as a float, or it parses to exactly 0.
    """
    if not isinstance(profile_entry, dict):
        return float("nan")
    size = profile_entry.get("size")
    if isinstance(size, str) and size.strip().lower() == "unknown":
        return float("nan")
    try:
        x = float(size)
    except (TypeError, ValueError):
        # Narrowed from a bare `except Exception`: float() raises only these.
        return float("nan")
    return x if x != 0.0 else float("nan")
def hf_url_for(name: str) -> str:
    """Return the Hugging Face URL for a repo-id-like name ("org/model"), else ""."""
    if "/" not in name:
        return ""
    return f"https://huggingface.co/{name}"
def _load_json(path: str):
    """Parse a JSON file, closing the handle promptly."""
    with open(path) as f:
        return json.load(f)


def _load_popularity(pop_path: str) -> dict:
    """Return {model_name: download_count} from the popularity file, or {}.

    The file may be either the full document shape
    {fetched_at, source, num_models, status_counts,
     models: {name: {downloads, status}}} or a flat {name: downloads}
    mapping; both are handled.
    """
    if not os.path.exists(pop_path):
        return {}
    pop_doc = _load_json(pop_path)
    models_field = pop_doc.get("models", pop_doc)
    pop_map = {}
    for name, entry in models_field.items():
        if isinstance(entry, dict):
            # `or 0` guards against an explicit null/empty downloads value.
            pop_map[name] = int(entry.get("downloads", 0) or 0)
        else:
            try:
                pop_map[name] = int(entry)
            except (TypeError, ValueError):
                pop_map[name] = 0
    return pop_map


def _load_size_edges(args_path: str) -> np.ndarray:
    """Size-bucket edges from the training args.json, or the defaults.

    Reading them from the checkpoint's args keeps bucket ids aligned with
    the trained MLPMetric.
    """
    if os.path.exists(args_path):
        train_args = _load_json(args_path)
        return np.array(train_args.get("size_bucket", SIZE_EDGES_DEFAULT), dtype=float)
    return np.array(SIZE_EDGES_DEFAULT, dtype=float)


def main(argv=None):
    """Build the candidate model pool and write it to a single .npz.

    For every model in model2id.json (optionally filtered by a minimum HF
    download count) the output bundles parallel arrays: names, size_ids,
    sizes_b (NaN = unknown), family_ids, popularities, urls.

    Args:
        argv: Optional argv list for argparse; None means sys.argv.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--data-dir", default="data/unified_augmented")
    p.add_argument(
        "--args",
        default="checkpoint/mlp/unified_augmented/ablation_no_model_id_no_dataset_id/args.json",
        help="Path to the training args.json — used to read size_bucket so bucket ids align with the checkpoint.",
    )
    p.add_argument("--out", default="web/assets/model_pool.npz")
    p.add_argument(
        "--min-popularity",
        type=int,
        default=0,
        help="Drop candidate models with HF download count below this. 0 keeps all.",
    )
    args = p.parse_args(argv)

    # dirname is "" when --out is a bare filename; makedirs("") would raise.
    out_dir = os.path.dirname(args.out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    model2id = _load_json(os.path.join(args.data_dir, "model2id.json"))
    model2family = _load_json(os.path.join(args.data_dir, "model2family.json"))
    family2id = _load_json(os.path.join(args.data_dir, "family2id.json"))
    model_profile = _load_json(os.path.join(args.data_dir, "model_profile.json"))
    pop_map = _load_popularity(os.path.join(args.data_dir, "model_popularity.json"))

    size_edges = _load_size_edges(args.args)
    # searchsorted yields ids 0..len(edges); len(edges)+1 marks "unknown".
    unknown_size_id = len(size_edges) + 1
    unknown_family_id = family2id.get("unknown", len(family2id) - 1)

    names, size_ids, sizes_b = [], [], []
    family_ids, popularities, urls = [], [], []
    dropped_pop = 0
    for name in model2id.keys():
        pop = pop_map.get(name, 0)
        if pop < args.min_popularity:
            dropped_pop += 1
            continue
        size_b = get_size_b(model_profile.get(name))
        names.append(name)
        size_ids.append(assign_size_bucket(size_b, size_edges, unknown_size_id))
        sizes_b.append(size_b)  # NaN means unknown
        family_ids.append(family2id.get(model2family.get(name, "unknown"), unknown_family_id))
        popularities.append(pop)
        urls.append(hf_url_for(name))

    # Object-dtype string arrays: consumers must np.load(..., allow_pickle=True).
    np.savez(
        args.out,
        names=np.array(names, dtype=object),
        size_ids=np.array(size_ids, dtype=np.int64),
        sizes_b=np.array(sizes_b, dtype=np.float32),
        family_ids=np.array(family_ids, dtype=np.int64),
        popularities=np.array(popularities, dtype=np.int64),
        urls=np.array(urls, dtype=object),
    )
    print(f"Wrote {len(names)} models to {args.out} (dropped {dropped_pop} below min-popularity={args.min_popularity})")
    print(f" unique families: {len(set(family_ids))}, unique size buckets: {len(set(size_ids))}")
    print(f" models with HF URL: {sum(1 for u in urls if u)} / {len(urls)}")


if __name__ == "__main__":
    main()