Spaces:

loocorez
/

nanochat-mid-transformers

Sleeping

App Files Files Community

loocorez committed 14 days ago

Commit

6aeb857

verified ·

1 Parent(s): 9ef35af

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +12 -34

app.py CHANGED Viewed

@@ -3,17 +3,21 @@ os.environ.setdefault("HF_HOME", "/tmp/hf")
 os.environ.setdefault("HF_HUB_CACHE", "/tmp/hf/hub")
 os.environ.setdefault("TRANSFORMERS_CACHE", "/tmp/hf/transformers")
-from huggingface_hub import hf_hub_download, snapshot_download
 import torch
 import gradio as gr
 import pickle
-import json
-from nanochat.gpt import GPT, GPTConfig
 MODEL_ID = "loocorez/nanochat-mid-d20-test"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # Load tokenizer.pkl directly (avoid AutoTokenizer mapping issues)
 tok_path = hf_hub_download(MODEL_ID, filename="tokenizer.pkl")
@@ -34,41 +38,15 @@ class PklTokenizer:
 tokenizer = PklTokenizer(tok_path)
-# Load model weights directly using nanochat GPT to avoid Conv1D mismatch
-local_dir = snapshot_download(MODEL_ID)
-with open(os.path.join(local_dir, "config.json"), "r") as f:
-    meta = json.load(f)
-cfg = GPTConfig(
-    sequence_len=meta.get("sequence_len", 2048),
-    vocab_size=meta["vocab_size"],
-    n_layer=meta["n_layer"],
-    n_head=meta["n_head"],
-    n_kv_head=meta["n_kv_head"],
-    n_embd=meta["n_embd"],
-)
-with torch.device("meta"):
-    model = GPT(cfg)
-model.to_empty(device=device)
-model.init_weights()
-weights_path = os.path.join(local_dir, "pytorch_model.bin")
-state = torch.load(weights_path, map_location=device)
-state = {k.lstrip("_orig_mod."): v for k, v in state.items()}
-model.load_state_dict(state, strict=True, assign=True)
-# Ensure rotary buffers and weights are bf16 as expected by model
-model = model.to(device).to(dtype=torch.bfloat16)
-model.eval()
 def complete(prompt, max_new_tokens=64):
     input_ids = tokenizer.encode(prompt, prepend=tokenizer.get_bos_token_id())
     ids = torch.tensor([input_ids], dtype=torch.long, device=device)
     with torch.inference_mode():
-        # autocast so activations match model bf16 dtype
-        with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
-            for _ in range(max_new_tokens):
-                logits = model.forward(ids)
-                logits = logits[:, -1, :]
-                next_token = torch.argmax(logits, dim=-1, keepdim=True)
-                ids = torch.cat([ids, next_token], dim=1)
     return tokenizer.decode(ids[0].tolist())
 with gr.Blocks() as demo:

 os.environ.setdefault("HF_HUB_CACHE", "/tmp/hf/hub")
 os.environ.setdefault("TRANSFORMERS_CACHE", "/tmp/hf/transformers")
+from transformers import AutoModel
+from huggingface_hub import hf_hub_download
 import torch
 import gradio as gr
 import pickle
 MODEL_ID = "loocorez/nanochat-mid-d20-test"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Load model via Auto* with trust_remote_code
+model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True)
+model = model.to(device)
+model.eval()
 # Load tokenizer.pkl directly (avoid AutoTokenizer mapping issues)
 tok_path = hf_hub_download(MODEL_ID, filename="tokenizer.pkl")
 tokenizer = PklTokenizer(tok_path)
 def complete(prompt, max_new_tokens=64):
     input_ids = tokenizer.encode(prompt, prepend=tokenizer.get_bos_token_id())
     ids = torch.tensor([input_ids], dtype=torch.long, device=device)
     with torch.inference_mode():
+        for _ in range(max_new_tokens):
+            outputs = model(input_ids=ids)
+            logits = outputs["logits"] if isinstance(outputs, dict) else outputs.logits
+            next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
+            ids = torch.cat([ids, next_token], dim=1)
     return tokenizer.decode(ids[0].tolist())
 with gr.Blocks() as demo: