ScottzillaSystems committed on
Commit
385c8f1
·
verified ·
1 Parent(s): 67699cf

Upload app.py

Files changed (1)
  1. app.py +240 -0
app.py ADDED
@@ -0,0 +1,240 @@
+ #!/usr/bin/env python3
+ """
+ Agent Zero — HF Spaces Native Version
+ Loads your actual ScottzillaSystems model weights directly via transformers.
+ No TGI endpoints, no LiteLLM proxy, no Docker Compose — works on any HF Space.
+ """
+ 
+ import os
+ from threading import Thread
+ from typing import Any, Dict
+ 
+ import gradio as gr
+ import spaces
+ import torch
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     BitsAndBytesConfig,
+     TextIteratorStreamer,
+ )
+ 
+ 
+ # ─── Configuration ───────────────────────────────────────────────────────────
+ 
+ AVAILABLE_MODELS = {
+     "cydonia-24b": {
+         "repo": "ScottzillaSystems/Cydonia-24B-v4.1",
+         "description": "Cydonia 24B — Mistral-based general purpose",
+         "tier": "T2",
+         "device_map": "auto",
+         "max_new_tokens": 2048,
+     },
+     "qwen3.5-27b": {
+         "repo": "ScottzillaSystems/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled",
+         "description": "Qwen3.5 27B — Claude Opus distilled reasoning",
+         "tier": "T3",
+         "device_map": "auto",
+         "max_new_tokens": 4096,
+     },
+     "qwen3.5-9b": {
+         "repo": "ScottzillaSystems/Qwen3.5-9B-Chat",
+         "description": "Qwen3.5 9B — Fast general purpose, daily driver",
+         "tier": "T1",
+         "device_map": "auto",
+         "max_new_tokens": 2048,
+     },
+     "chatgpt5": {
+         "repo": "ScottzillaSystems/ChatGPT-5-Chat",
+         "description": "ChatGPT-5 494M — Ultra-fast router/classification",
+         "tier": "T0",
+         "device_map": "auto",
+         "max_new_tokens": 1024,
+     },
+     "fallen-command": {
+         "repo": "ScottzillaSystems/Fallen-Command-A-111B-Chat",
+         "description": "Fallen Command 111B — Flagship reasoning",
+         "tier": "T4",
+         "device_map": "auto",
+         "load_in_8bit": True,
+         "max_new_tokens": 4096,
+     },
+ }
+ 
+ DEFAULT_MODEL = "qwen3.5-9b"
+ 
+ _model_cache: Dict[str, Any] = {}
+ _tokenizer_cache: Dict[str, Any] = {}
+ 
+ 
+ # ─── Model Loading ───────────────────────────────────────────────────────────
+ 
+ def load_model(model_key: str):
+     """Load model and tokenizer, caching them in memory."""
+     if model_key in _model_cache:
+         return _model_cache[model_key], _tokenizer_cache[model_key]
+ 
+     config = AVAILABLE_MODELS.get(model_key)
+     if not config:
+         raise ValueError(f"Unknown model: {model_key}")
+ 
+     repo_id = config["repo"]
+     print(f"[AgentZero] Loading {model_key} from {repo_id}...")
+ 
+     tokenizer = AutoTokenizer.from_pretrained(
+         repo_id, trust_remote_code=True, token=os.getenv("HF_TOKEN"),
+     )
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+ 
+     load_kwargs = {
+         "pretrained_model_name_or_path": repo_id,
+         "trust_remote_code": True,
+         "token": os.getenv("HF_TOKEN"),
+         "torch_dtype": torch.bfloat16,
+         "device_map": config.get("device_map", "auto"),
+     }
+     if config.get("load_in_8bit"):
+         # A bare load_in_8bit kwarg is deprecated in recent transformers;
+         # pass a BitsAndBytesConfig instead (requires the bitsandbytes
+         # package). See the sizing note after the diff.
+         load_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
+ 
+     model = AutoModelForCausalLM.from_pretrained(**load_kwargs)
+ 
+     _model_cache[model_key] = model
+     _tokenizer_cache[model_key] = tokenizer
+ 
+     print(f"[AgentZero] {model_key} loaded")
+     return model, tokenizer
+ 
+ 
+ def unload_model(model_key: str):
+     """Drop a model from the cache and release its GPU memory."""
+     if model_key in _model_cache:
+         del _model_cache[model_key]
+         del _tokenizer_cache[model_key]
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         return f"Unloaded {model_key}"
+     return f"{model_key} not loaded"
+ 
+ 
+ def get_status():
+     loaded = list(_model_cache.keys())
+     mem = torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else 0.0
+     return f"Loaded: {', '.join(loaded) if loaded else 'none'} | GPU: {mem:.1f}GB"
+ 
+ 
+ # ─── Inference ───────────────────────────────────────────────────────────────
+ 
+ @spaces.GPU(duration=120)
+ def generate_stream(model_key, messages, max_new_tokens=None, temperature=0.7):
+     """Stream generated text chunk-by-chunk for a chat-format message list."""
+     model, tokenizer = load_model(model_key)
+     config = AVAILABLE_MODELS[model_key]
+     if max_new_tokens is None:
+         max_new_tokens = config.get("max_new_tokens", 2048)
+ 
+     prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     inputs = tokenizer(prompt, return_tensors="pt", padding=True)
+     inputs = {k: v.to(model.device) for k, v in inputs.items()}
+ 
+     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+ 
+     gen_kwargs = dict(
+         inputs, streamer=streamer, max_new_tokens=max_new_tokens,
+         do_sample=True, temperature=temperature, top_p=0.9,
+         pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id,
+     )
+ 
+     # generate() blocks, so it runs in a background thread while this
+     # function drains the streamer and yields text chunks to the caller.
+     Thread(target=model.generate, kwargs=gen_kwargs).start()
+     for text in streamer:
+         yield text
+ 
+ 
+ # ─── Gradio UI ───────────────────────────────────────────────────────────────
+ 
+ CSS = """
+ .az-header { text-align: center; padding: 20px; background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); border-radius: 12px; margin-bottom: 16px; }
+ .az-header h1 { color: #e94560; margin: 0; font-size: 2em; }
+ .az-header p { color: #a0a0b0; margin: 4px 0 0 0; }
+ .model-card { background: #0f0f23; padding: 12px; border-radius: 8px; border-left: 4px solid #e94560; }
+ .tier-T0 { background: #00d4aa; color: #000; padding: 2px 8px; border-radius: 4px; font-size: 0.8em; }
+ .tier-T1 { background: #00a8e8; color: #000; padding: 2px 8px; border-radius: 4px; font-size: 0.8em; }
+ .tier-T2 { background: #f7b731; color: #000; padding: 2px 8px; border-radius: 4px; font-size: 0.8em; }
+ .tier-T3 { background: #e94560; color: #fff; padding: 2px 8px; border-radius: 4px; font-size: 0.8em; }
+ .tier-T4 { background: #9b59b6; color: #fff; padding: 2px 8px; border-radius: 4px; font-size: 0.8em; }
+ """
+ 
+ 
+ def create_ui():
+     with gr.Blocks(css=CSS, title="Agent Zero v2") as demo:
+         with gr.Column(elem_classes="az-header"):
+             gr.HTML("<h1>🤖 Agent Zero v2</h1><p>Loading YOUR model weights — no proxies, no TGI, no lies</p>")
+ 
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.Markdown("### Model")
+                 model_dd = gr.Dropdown(choices=list(AVAILABLE_MODELS.keys()), value=DEFAULT_MODEL, label="Active Model")
+                 model_info = gr.Markdown("Select a model")
+ 
+                 with gr.Accordion("Catalog", open=False):
+                     rows = ""
+                     for k, v in AVAILABLE_MODELS.items():
+                         rows += f"<tr><td><b>{k}</b></td><td><span class='tier-{v['tier']}'>{v['tier']}</span></td><td>{v['description']}</td></tr>"
+                     gr.HTML(f"<table width='100%'>{rows}</table>")
+ 
+                 with gr.Accordion("Settings", open=False):
+                     max_tok = gr.Slider(128, 4096, value=2048, step=128, label="Max New Tokens")
+                     temp = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature")
+ 
+                 status = gr.Textbox(value="Ready", label="Status", interactive=False)
+ 
+             with gr.Column(scale=3):
+                 chatbot = gr.Chatbot(type="messages", height=550, label="Agent Zero v2")
+                 with gr.Row():
+                     msg = gr.Textbox(placeholder="Ask anything... model loads on first send", show_label=False, scale=8)
+                     send = gr.Button("Send", scale=1, variant="primary")
+                 with gr.Row():
+                     clear = gr.Button("🗑 Clear")
+                     unload = gr.Button("🔄 Unload")
+                     statbtn = gr.Button("📊 Status")
+ 
+         def update_info(k):
+             c = AVAILABLE_MODELS.get(k, {})
+             tier = c.get("tier", "T0")
+             return (
+                 f"<div class='model-card'><b>{c.get('description', '?')}</b><br>"
+                 f"<span class='tier-{tier}'>{tier}</span> | "
+                 f"{c.get('max_new_tokens', '?')} tokens<br>"
+                 f"<code>{c.get('repo', '?')}</code></div>"
+             )
+ 
+         model_dd.change(update_info, model_dd, model_info)
+ 
+         def chat_fn(message, history, mk, mtok, tmp):
+             # Plain (sync) generator: Gradio streams each yield to the UI,
+             # so the blocking generate call never stalls the event loop.
+             if not message.strip():
+                 yield history, "", "Ready"
+                 return
+             history = history or []
+             history.append({"role": "user", "content": message})
+             yield history, "", f"Loading {mk}..."
+             try:
+                 msgs = [{"role": h["role"], "content": h["content"]} for h in history]
+                 out = ""
+                 for chunk in generate_stream(mk, msgs, mtok, tmp):
+                     out += chunk
+                     if history and history[-1]["role"] == "assistant":
+                         history[-1]["content"] = out
+                     else:
+                         history.append({"role": "assistant", "content": out})
+                     yield history, "", f"Generating with {mk}..."
+                 yield history, "", get_status()
+             except Exception as e:
+                 history.append({"role": "assistant", "content": f"❌ Error: {e}"})
+                 yield history, "", get_status()
+ 
+         send.click(chat_fn, [msg, chatbot, model_dd, max_tok, temp], [chatbot, msg, status])
+         msg.submit(chat_fn, [msg, chatbot, model_dd, max_tok, temp], [chatbot, msg, status])
+         clear.click(lambda: ([], "", "Ready"), outputs=[chatbot, msg, status])
+         unload.click(lambda m: f"{unload_model(m)} | {get_status()}", model_dd, status)
+         statbtn.click(get_status, outputs=status)
+ 
+     return demo
+ 
+ 
+ if __name__ == "__main__":
+     demo = create_ui()
+     demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")), share=False)
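
A note on the T4 entry: 8-bit loading is what makes Fallen Command viable at all. Back of the envelope, 111B parameters at bf16 (2 bytes each) is roughly 222 GB of weights, versus roughly 111 GB at int8, before activations and KV cache. A minimal sketch of the explicit quantization config that the loader above builds (the BitsAndBytesConfig route in current transformers; requires the bitsandbytes package, and accelerate for device_map="auto"):

    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    # Explicit equivalent of the 8-bit flag on the "fallen-command" entry.
    model = AutoModelForCausalLM.from_pretrained(
        "ScottzillaSystems/Fallen-Command-A-111B-Chat",
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map="auto",
    )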
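
And since the point of this version is that these functions run anywhere transformers does, here is a hedged smoke test of the direct-loading path with no Gradio in the loop. It assumes app.py is importable, HF_TOKEN is set for the private repos, there is enough memory for the chosen tier, and that the spaces decorator degrades to a no-op outside an actual Space (its documented behavior, but worth verifying on your hardware):

    # smoke_test.py: hypothetical helper, not part of this commit
    from app import generate_stream, get_status

    messages = [{"role": "user", "content": "Say hello in five words."}]
    # First call downloads and caches the T1 daily driver, then streams tokens.
    for chunk in generate_stream("qwen3.5-9b", messages, max_new_tokens=32):
        print(chunk, end="", flush=True)
    print()
    print(get_status())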