ScottzillaSystems committed on
Commit
385c8f1
·
verified ·
1 Parent(s): 67699cf

Upload app.py

Files changed (1)
  1. app.py +240 -0
app.py ADDED
@@ -0,0 +1,240 @@
+ #!/usr/bin/env python3
+ """
+ Agent Zero — HF Spaces Native Version
+ Loads your actual ScottzillaSystems model weights directly via transformers.
+ No TGI endpoints, no LiteLLM proxy, no Docker Compose — works on any HF Space.
+ """
+ 
+ import os
+ from threading import Thread
+ from typing import Any, Dict
+ 
+ import gradio as gr
+ import spaces
+ import torch
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     BitsAndBytesConfig,
+     TextIteratorStreamer,
+ )
+ 
+ 
+ # ─── Configuration ───────────────────────────────────────────────────────────
+ 
+ AVAILABLE_MODELS = {
+     "cydonia-24b": {
+         "repo": "ScottzillaSystems/Cydonia-24B-v4.1",
+         "description": "Cydonia 24B — Mistral-based general purpose",
+         "tier": "T2",
+         "device_map": "auto",
+         "max_new_tokens": 2048,
+     },
+     "qwen3.5-27b": {
+         "repo": "ScottzillaSystems/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled",
+         "description": "Qwen3.5 27B — Claude Opus distilled reasoning",
+         "tier": "T3",
+         "device_map": "auto",
+         "max_new_tokens": 4096,
+     },
+     "qwen3.5-9b": {
+         "repo": "ScottzillaSystems/Qwen3.5-9B-Chat",
+         "description": "Qwen3.5 9B — Fast general purpose, daily driver",
+         "tier": "T1",
+         "device_map": "auto",
+         "max_new_tokens": 2048,
+     },
+     "chatgpt5": {
+         "repo": "ScottzillaSystems/ChatGPT-5-Chat",
+         "description": "ChatGPT-5 494M — Ultra-fast router/classification",
+         "tier": "T0",
+         "device_map": "auto",
+         "max_new_tokens": 1024,
+     },
+     "fallen-command": {
+         "repo": "ScottzillaSystems/Fallen-Command-A-111B-Chat",
+         "description": "Fallen Command 111B — Flagship reasoning",
+         "tier": "T4",
+         "device_map": "auto",
+         "load_in_8bit": True,
+         "max_new_tokens": 4096,
+     },
+ }
+ 
+ DEFAULT_MODEL = "qwen3.5-9b"
+ 
+ _model_cache: Dict[str, Any] = {}
+ _tokenizer_cache: Dict[str, Any] = {}
+ 
+ 
+ # ─── Model Loading ───────────────────────────────────────────────────────────
+ 
+ def load_model(model_key: str):
+     """Load model and tokenizer, caching them in memory."""
+     if model_key in _model_cache:
+         return _model_cache[model_key], _tokenizer_cache[model_key]
+ 
+     config = AVAILABLE_MODELS.get(model_key)
+     if not config:
+         raise ValueError(f"Unknown model: {model_key}")
+ 
+     repo_id = config["repo"]
+     print(f"[AgentZero] Loading {model_key} from {repo_id}...")
+ 
+     tokenizer = AutoTokenizer.from_pretrained(
+         repo_id, trust_remote_code=True, token=os.getenv("HF_TOKEN"),
+     )
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+ 
+     load_kwargs = {
+         "pretrained_model_name_or_path": repo_id,
+         "trust_remote_code": True,
+         "token": os.getenv("HF_TOKEN"),
+         "torch_dtype": torch.bfloat16,
+         "device_map": config.get("device_map", "auto"),
+     }
+     if config.get("load_in_8bit"):
+         # A bare load_in_8bit kwarg is deprecated in recent transformers;
+         # pass a BitsAndBytesConfig instead (requires the bitsandbytes
+         # package). See the sizing note after the diff.
+         load_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
+ 
+     model = AutoModelForCausalLM.from_pretrained(**load_kwargs)
+ 
+     _model_cache[model_key] = model
+     _tokenizer_cache[model_key] = tokenizer
+ 
+     print(f"[AgentZero] {model_key} loaded")
+     return model, tokenizer
+ 
+ 
+ def unload_model(model_key: str):
+     """Drop a model from the cache and release its GPU memory."""
+     if model_key in _model_cache:
+         del _model_cache[model_key]
+         del _tokenizer_cache[model_key]
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         return f"Unloaded {model_key}"
+     return f"{model_key} not loaded"
+ 
+ 
+ def get_status():
+     loaded = list(_model_cache.keys())
+     mem = torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else 0.0
+     return f"Loaded: {', '.join(loaded) if loaded else 'none'} | GPU: {mem:.1f}GB"
+ 
+ 
+ # ─── Inference ───────────────────────────────────────────────────────────────
+ 
+ @spaces.GPU(duration=120)
+ def generate_stream(model_key, messages, max_new_tokens=None, temperature=0.7):
+     """Stream generated text chunk-by-chunk for a chat-format message list."""
+     model, tokenizer = load_model(model_key)
+     config = AVAILABLE_MODELS[model_key]
+     if max_new_tokens is None:
+         max_new_tokens = config.get("max_new_tokens", 2048)
+ 
+     prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     inputs = tokenizer(prompt, return_tensors="pt", padding=True)
+     inputs = {k: v.to(model.device) for k, v in inputs.items()}
+ 
+     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+ 
+     gen_kwargs = dict(
+         inputs, streamer=streamer, max_new_tokens=max_new_tokens,
+         do_sample=True, temperature=temperature, top_p=0.9,
+         pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id,
+     )
+ 
+     # generate() blocks, so it runs in a background thread while this
+     # function drains the streamer and yields text chunks to the caller.
+     Thread(target=model.generate, kwargs=gen_kwargs).start()
+     for text in streamer:
+         yield text
+ 
+ 
+ # ─── Gradio UI ───────────────────────────────────────────────────────────────
+ 
+ CSS = """
+ .az-header { text-align: center; padding: 20px; background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); border-radius: 12px; margin-bottom: 16px; }
+ .az-header h1 { color: #e94560; margin: 0; font-size: 2em; }
+ .az-header p { color: #a0a0b0; margin: 4px 0 0 0; }
+ .model-card { background: #0f0f23; padding: 12px; border-radius: 8px; border-left: 4px solid #e94560; }
+ .tier-T0 { background: #00d4aa; color: #000; padding: 2px 8px; border-radius: 4px; font-size: 0.8em; }
+ .tier-T1 { background: #00a8e8; color: #000; padding: 2px 8px; border-radius: 4px; font-size: 0.8em; }
+ .tier-T2 { background: #f7b731; color: #000; padding: 2px 8px; border-radius: 4px; font-size: 0.8em; }
+ .tier-T3 { background: #e94560; color: #fff; padding: 2px 8px; border-radius: 4px; font-size: 0.8em; }
+ .tier-T4 { background: #9b59b6; color: #fff; padding: 2px 8px; border-radius: 4px; font-size: 0.8em; }
+ """
+ 
+ 
+ def create_ui():
+     with gr.Blocks(css=CSS, title="Agent Zero v2") as demo:
+         with gr.Column(elem_classes="az-header"):
+             gr.HTML("<h1>🤖 Agent Zero v2</h1><p>Loading YOUR model weights — no proxies, no TGI, no lies</p>")
+ 
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.Markdown("### Model")
+                 model_dd = gr.Dropdown(choices=list(AVAILABLE_MODELS.keys()), value=DEFAULT_MODEL, label="Active Model")
+                 model_info = gr.Markdown("Select a model")
+ 
+                 with gr.Accordion("Catalog", open=False):
+                     rows = ""
+                     for k, v in AVAILABLE_MODELS.items():
+                         rows += f"<tr><td><b>{k}</b></td><td><span class='tier-{v['tier']}'>{v['tier']}</span></td><td>{v['description']}</td></tr>"
+                     gr.HTML(f"<table width='100%'>{rows}</table>")
+ 
+                 with gr.Accordion("Settings", open=False):
+                     max_tok = gr.Slider(128, 4096, value=2048, step=128, label="Max New Tokens")
+                     temp = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature")
+ 
+                 status = gr.Textbox(value="Ready", label="Status", interactive=False)
+ 
+             with gr.Column(scale=3):
+                 chatbot = gr.Chatbot(type="messages", height=550, label="Agent Zero v2")
+                 with gr.Row():
+                     msg = gr.Textbox(placeholder="Ask anything... model loads on first send", show_label=False, scale=8)
+                     send = gr.Button("Send", scale=1, variant="primary")
+                 with gr.Row():
+                     clear = gr.Button("🗑 Clear")
+                     unload = gr.Button("🔄 Unload")
+                     statbtn = gr.Button("📊 Status")
+ 
+         def update_info(k):
+             c = AVAILABLE_MODELS.get(k, {})
+             tier = c.get("tier", "T0")
+             return (
+                 f"<div class='model-card'><b>{c.get('description', '?')}</b><br>"
+                 f"<span class='tier-{tier}'>{tier}</span> | "
+                 f"{c.get('max_new_tokens', '?')} tokens<br>"
+                 f"<code>{c.get('repo', '?')}</code></div>"
+             )
+ 
+         model_dd.change(update_info, model_dd, model_info)
+ 
+         def chat_fn(message, history, mk, mtok, tmp):
+             # Plain (sync) generator: Gradio streams each yield to the UI,
+             # so the blocking generate call never stalls the event loop.
+             if not message.strip():
+                 yield history, "", "Ready"
+                 return
+             history = history or []
+             history.append({"role": "user", "content": message})
+             yield history, "", f"Loading {mk}..."
+             try:
+                 msgs = [{"role": h["role"], "content": h["content"]} for h in history]
+                 out = ""
+                 for chunk in generate_stream(mk, msgs, mtok, tmp):
+                     out += chunk
+                     if history and history[-1]["role"] == "assistant":
+                         history[-1]["content"] = out
+                     else:
+                         history.append({"role": "assistant", "content": out})
+                     yield history, "", f"Generating with {mk}..."
+                 yield history, "", get_status()
+             except Exception as e:
+                 history.append({"role": "assistant", "content": f"❌ Error: {e}"})
+                 yield history, "", get_status()
+ 
+         send.click(chat_fn, [msg, chatbot, model_dd, max_tok, temp], [chatbot, msg, status])
+         msg.submit(chat_fn, [msg, chatbot, model_dd, max_tok, temp], [chatbot, msg, status])
+         clear.click(lambda: ([], "", "Ready"), outputs=[chatbot, msg, status])
+         unload.click(lambda m: f"{unload_model(m)} | {get_status()}", model_dd, status)
+         statbtn.click(get_status, outputs=status)
+ 
+     return demo
+ 
+ 
+ if __name__ == "__main__":
+     demo = create_ui()
+     demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")), share=False)
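
A note on the T4 entry: 8-bit loading is what makes Fallen Command viable at all. Back of the envelope, 111B parameters at bf16 (2 bytes each) is roughly 222 GB of weights, versus roughly 111 GB at int8, before activations and KV cache. A minimal sketch of the explicit quantization config that the loader above builds (the BitsAndBytesConfig route in current transformers; requires the bitsandbytes package, and accelerate for device_map="auto"):

    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    # Explicit equivalent of the 8-bit flag on the "fallen-command" entry.
    model = AutoModelForCausalLM.from_pretrained(
        "ScottzillaSystems/Fallen-Command-A-111B-Chat",
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map="auto",
    )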
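
And since the point of this version is that these functions run anywhere transformers does, here is a hedged smoke test of the direct-loading path with no Gradio in the loop. It assumes app.py is importable, HF_TOKEN is set for the private repos, there is enough memory for the chosen tier, and that the spaces decorator degrades to a no-op outside an actual Space (its documented behavior, but worth verifying on your hardware):

    # smoke_test.py: hypothetical helper, not part of this commit
    from app import generate_stream, get_status

    messages = [{"role": "user", "content": "Say hello in five words."}]
    # First call downloads and caches the T1 daily driver, then streams tokens.
    for chunk in generate_stream("qwen3.5-9b", messages, max_new_tokens=32):
        print(chunk, end="", flush=True)
    print()
    print(get_status())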