loocorez committed
Commit 6aeb857 · verified · 1 Parent(s): 9ef35af

Upload app.py with huggingface_hub

Files changed (1)
  1. app.py +12 -34
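
The commit message says the file was pushed with huggingface_hub. For context, a minimal sketch of that kind of upload (the repo id and token handling here are assumptions, not taken from the commit):

    from huggingface_hub import HfApi

    api = HfApi()  # assumes a token from HF_TOKEN or a prior `huggingface-cli login`
    api.upload_file(
        path_or_fileobj="app.py",
        path_in_repo="app.py",
        repo_id="loocorez/nanochat-mid-d20-test",  # assumption: the target Space id
        repo_type="space",
        commit_message="Upload app.py with huggingface_hub",
    )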
app.py CHANGED
@@ -3,17 +3,21 @@ os.environ.setdefault("HF_HOME", "/tmp/hf")
 os.environ.setdefault("HF_HUB_CACHE", "/tmp/hf/hub")
 os.environ.setdefault("TRANSFORMERS_CACHE", "/tmp/hf/transformers")
 
-from huggingface_hub import hf_hub_download, snapshot_download
+from transformers import AutoModel
+from huggingface_hub import hf_hub_download
 import torch
 import gradio as gr
 import pickle
-import json
-from nanochat.gpt import GPT, GPTConfig
 
 MODEL_ID = "loocorez/nanochat-mid-d20-test"
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+# Load model via Auto* with trust_remote_code
+model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True)
+model = model.to(device)
+model.eval()
+
 # Load tokenizer.pkl directly (avoid AutoTokenizer mapping issues)
 tok_path = hf_hub_download(MODEL_ID, filename="tokenizer.pkl")
 
@@ -34,41 +38,15 @@ class PklTokenizer:
 
 tokenizer = PklTokenizer(tok_path)
 
-# Load model weights directly using nanochat GPT to avoid Conv1D mismatch
-local_dir = snapshot_download(MODEL_ID)
-with open(os.path.join(local_dir, "config.json"), "r") as f:
-    meta = json.load(f)
-cfg = GPTConfig(
-    sequence_len=meta.get("sequence_len", 2048),
-    vocab_size=meta["vocab_size"],
-    n_layer=meta["n_layer"],
-    n_head=meta["n_head"],
-    n_kv_head=meta["n_kv_head"],
-    n_embd=meta["n_embd"],
-)
-with torch.device("meta"):
-    model = GPT(cfg)
-model.to_empty(device=device)
-model.init_weights()
-weights_path = os.path.join(local_dir, "pytorch_model.bin")
-state = torch.load(weights_path, map_location=device)
-state = {k.lstrip("_orig_mod."): v for k, v in state.items()}
-model.load_state_dict(state, strict=True, assign=True)
-# Ensure rotary buffers and weights are bf16 as expected by model
-model = model.to(device).to(dtype=torch.bfloat16)
-model.eval()
-
 def complete(prompt, max_new_tokens=64):
     input_ids = tokenizer.encode(prompt, prepend=tokenizer.get_bos_token_id())
     ids = torch.tensor([input_ids], dtype=torch.long, device=device)
     with torch.inference_mode():
-        # autocast so activations match model bf16 dtype
-        with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
-            for _ in range(max_new_tokens):
-                logits = model.forward(ids)
-                logits = logits[:, -1, :]
-                next_token = torch.argmax(logits, dim=-1, keepdim=True)
-                ids = torch.cat([ids, next_token], dim=1)
+        for _ in range(max_new_tokens):
+            outputs = model(input_ids=ids)
+            logits = outputs["logits"] if isinstance(outputs, dict) else outputs.logits
+            next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
+            ids = torch.cat([ids, next_token], dim=1)
     return tokenizer.decode(ids[0].tolist())
 
 with gr.Blocks() as demo:
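
A note on the rewritten loop: it decodes greedily (argmax at each step) and re-runs the model over the full, growing sequence on every iteration, so there is no KV cache and cost grows roughly quadratically with output length. A usage sketch, assuming app.py has run and `complete` is defined as above (the prompt is illustrative):

    # Greedy completion; output is deterministic for a fixed prompt.
    print(complete("Once upon a time", max_new_tokens=32))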