# test-old / app.py — HF web-page chrome below (uploader / commit hash), commented out so the file parses
# SeaWolf-AI's picture
# Upload 6 files
# ca19627 verified
#!/usr/bin/env python3
"""
AETHER-Net 0.8B β€” Inference Test Space

Loads the private model and tests text generation.
HF Space: T4 GPU, HF_TOKEN secret required.
Deploy: FINAL-Bench/aether-net-test
"""
import os
import sys
import time
import json
import torch
import torch.nn.functional as F
import gradio as gr
from pathlib import Path
from huggingface_hub import hf_hub_download, snapshot_download

# ── Config ──
MODEL_REPO = "FINAL-Bench/AETHER-Net-0.8B"  # private repo holding the weights
DONOR_REPO = "Qwen/Qwen3.5-0.8B"  # For tokenizer
HF_TOKEN = os.getenv("HF_TOKEN")  # required to access the private repo
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {DEVICE}")
print(f"HF_TOKEN: {'set' if HF_TOKEN else 'NOT SET'}")
# ── Download model weights from private repo ──
print(f"Downloading AETHER-Net weights from {MODEL_REPO}...")
model_dir = None  # stays None on failure; load_model() then falls back to defaults
try:
    model_dir = snapshot_download(
        MODEL_REPO, token=HF_TOKEN,
        allow_patterns=["model.safetensors", "config.json"],  # skip everything else in the repo
    )
    print(f" Model downloaded to: {model_dir}")
except Exception as e:
    # Best-effort: the app still starts (random-init model) if the download fails.
    print(f" Download failed: {e}")

# Source files are co-located in the same directory; make config.py / model.py
# importable regardless of the current working directory.
APP_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, APP_DIR)

# ── Load model ──
# Module-level singletons, populated lazily by load_model().
MODEL = None
TOKENIZER = None
def load_model():
    """Lazily load the tokenizer (from the donor repo) and the AETHER-Net model.

    Mutates the module-level MODEL / TOKENIZER singletons. Returns True on
    success (or when the model is already loaded), False if either the
    tokenizer or the model stage fails.
    """
    global MODEL, TOKENIZER
    if MODEL is not None:
        return True  # already loaded; no-op
    # Load tokenizer from donor
    print("Loading tokenizer...")
    from transformers import AutoTokenizer
    try:
        TOKENIZER = AutoTokenizer.from_pretrained(
            DONOR_REPO, trust_remote_code=True, token=HF_TOKEN
        )
        print(f" Tokenizer loaded: vocab_size={TOKENIZER.vocab_size}")
    except Exception as e:
        print(f" Tokenizer failed: {e}")
        return False
    # Load AETHER-Net
    print("Loading AETHER-Net model...")
    try:
        # Project-local modules (co-located with this file; see sys.path insert above).
        from config import AetherNetConfig
        from model import AetherNetModel
        # Load config
        config_path = Path(model_dir) / "config.json" if model_dir else None
        if config_path and config_path.exists():
            with open(config_path) as f:
                cfg_dict = json.load(f)
            # Filter valid fields so unknown keys in config.json don't crash the dataclass ctor.
            valid_fields = {k for k in AetherNetConfig.__dataclass_fields__}
            filtered = {k: v for k, v in cfg_dict.items() if k in valid_fields}
            config = AetherNetConfig(**filtered)
            print(f" Config loaded: hidden={config.hidden_size}, layers={config.num_layers}")
        else:
            # Fallback hyperparameters when no config.json was downloaded
            # (presumably matching the published 0.8B checkpoint — TODO confirm).
            print(" No config.json, using defaults")
            config = AetherNetConfig(
                hidden_size=1024, intermediate_size=3584,
                num_layers=25, num_attention_heads=16, num_kv_heads=2,
                head_dim=64, vocab_size=248320,
                max_position_embeddings=4096,
                expert_intermediate_size=716,
                overcome_gate_hidden=64,
                sliding_window_size=1024,
                gdn_state_size=64, mamba2_state_size=64,
                tie_word_embeddings=True,
            )
        model = AetherNetModel(config)
        # Load weights
        weights_path = Path(model_dir) / "model.safetensors" if model_dir else None
        if weights_path and weights_path.exists():
            from safetensors.torch import load_file
            state = load_file(str(weights_path), device="cpu")
            # NOTE(review): strict=False silently skips missing/unexpected keys — confirm intended.
            model.load_state_dict(state, strict=False)
            print(f" Weights loaded: {len(state)} tensors")
        else:
            # Still proceeds: the model runs with random initialization.
            print(" ⚠️ No weights found, using random init")
        model = model.to(DEVICE).eval()
        MODEL = model
        params = sum(p.numel() for p in model.parameters())
        mem = params * 2 / 1e9  # BF16 estimate (2 bytes per parameter)
        print(f" Model ready: {params:,} params (~{mem:.1f}GB)")
        return True
    except Exception as e:
        import traceback
        print(f" Model load failed: {e}")
        traceback.print_exc()
        return False
# ── Generation ──
@torch.no_grad()
def generate(prompt, max_tokens=128, temperature=0.8, top_k=50, top_p=0.9):
    """Generate text autoregressively from `prompt`.

    Args:
        prompt: Input text to feed to the tokenizer.
        max_tokens: Maximum number of new tokens to sample.
        temperature: >0 enables sampling (logits / temperature); 0 means greedy.
        top_k: Keep only the k most likely tokens (0 disables the filter).
        top_p: Nucleus sampling threshold (1.0 disables the filter).

    Returns:
        The decoded text followed by a small stats footer, or an error string
        if the model failed to load.
    """
    if MODEL is None:
        if not load_model():
            return "❌ Model failed to load. Check logs."
    # Gradio sliders may deliver floats; torch.topk / range need ints.
    max_tokens = int(max_tokens)
    top_k = int(top_k)
    # Tokenize
    input_ids = TOKENIZER.encode(prompt, return_tensors="pt").to(DEVICE)
    generated = input_ids.clone()
    t0 = time.time()
    # Counted explicitly: the context truncation below would skew a shape-based count.
    tokens_generated = 0
    for _ in range(max_tokens):
        # Truncate to max position (4096, matching max_position_embeddings).
        if generated.shape[1] > 4096:
            generated = generated[:, -4096:]
        outputs = MODEL(input_ids=generated)
        logits = outputs["logits"][:, -1, :]
        if temperature > 0:
            logits = logits / temperature
            # Top-k: mask everything below the k-th largest logit.
            if top_k > 0:
                k = min(top_k, logits.size(-1))  # guard against k > vocab size
                values, _ = torch.topk(logits, k)
                min_val = values[:, -1].unsqueeze(-1)
                logits = torch.where(logits < min_val, torch.full_like(logits, -float('inf')), logits)
            # Top-p (nucleus): drop tokens whose cumulative probability *before*
            # them already exceeds top_p (shifted mask keeps the crossing token).
            if top_p < 1.0:
                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                sorted_probs = F.softmax(sorted_logits, dim=-1)
                cum_probs = torch.cumsum(sorted_probs, dim=-1)
                mask = cum_probs - sorted_probs > top_p
                sorted_logits[mask] = -float('inf')
                # Un-sort: sorted_indices is a full permutation, so the scatter
                # overwrites every position with the correctly placed logit.
                logits = sorted_logits.scatter(1, sorted_indices, sorted_logits)
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
        else:
            # temperature == 0: deterministic greedy decoding.
            next_token = logits.argmax(dim=-1, keepdim=True)
        generated = torch.cat([generated, next_token], dim=-1)
        tokens_generated += 1
        # EOS check (some tokenizers define no EOS id; avoid comparing to None).
        if TOKENIZER.eos_token_id is not None and next_token.item() == TOKENIZER.eos_token_id:
            break
    elapsed = time.time() - t0
    tps = tokens_generated / elapsed if elapsed > 0 else 0
    output_text = TOKENIZER.decode(generated[0], skip_special_tokens=True)
    stats = f"\n\n---\nπŸ“Š {tokens_generated} tokens | {tps:.1f} tok/s | {elapsed:.2f}s"
    return output_text + stats
def get_model_info():
    """Build a Markdown report of the loaded model's architecture.

    Lazily calls load_model() if needed. Returns a Markdown string with
    parameter counts, the per-layer type/element map, and the average
    Oheng gate activation (sigmoid of the generate_boost alpha) per element.
    """
    if MODEL is None:
        load_model()
    if MODEL is None:
        return "Model not loaded"
    num_layers = len(MODEL.layers)
    info = "## AETHER-Net 0.8B β€” Architecture Info\n\n"
    info += "| Item | Value |\n|---|---|\n"
    info += f"| Device | {DEVICE} |\n"
    info += f"| Parameters | {sum(p.numel() for p in MODEL.parameters()):,} |\n"
    info += f"| Layers | {num_layers} |\n"
    info += f"| Vocab | {MODEL.config.vocab_size:,} |\n"
    info += f"| Hidden | {MODEL.config.hidden_size} |\n"
    # Layer types (project-local lookup tables: layer index -> type / element).
    from config import LAYER_TYPES, LAYER_TO_ELEMENT, ELEMENTS
    info += "\n### Layer Map\n\n"
    info += "| Layer | Type | Element |\n|---|---|---|\n"
    for i in range(num_layers):
        info += f"| {i} | {LAYER_TYPES[i].upper()} | {LAYER_TO_ELEMENT[i]} |\n"
    # Oheng status
    info += "\n### Oheng Status\n\n"
    for elem in ELEMENTS:
        # FIX: iterate the actual layer count instead of the hard-coded 25,
        # consistent with the Layer Map loop above and non-default configs.
        layers = [i for i in range(num_layers) if LAYER_TO_ELEMENT[i] == elem]
        alphas = []
        for li in layers:
            gb = MODEL.layers[li].moe.generate_boost
            if gb is not None:
                a = torch.sigmoid(gb.alpha).detach()
                eidx = ELEMENTS.index(elem)
                if eidx < a.shape[0]:
                    alphas.append(a[eidx].item())
        avg = sum(alphas) / len(alphas) if alphas else 0
        info += f"- {elem}: Ξ±={avg:.4f}\n"
    return info
# ── Gradio UI ──
TITLE = """
<div style="text-align:center; padding:15px 0;">
<h1>🌌 AETHER-Net 0.8B β€” Inference Test</h1>
<p style="color:#666;">Cross-Architecture Knowledge Distillation from Qwen3.5-0.8B</p>
<p style="color:#999; font-size:0.9em;">5Γ—5 Magic Square | Oheng MoE | 5 Attention Types</p>
</div>
"""

with gr.Blocks(title="AETHER-Net Test") as app:
    gr.HTML(TITLE)
    with gr.Tabs():
        # Tab 1: free-form generation with sampling controls.
        with gr.Tab("πŸ’¬ Generate"):
            gr.Markdown("ν”„λ‘¬ν”„νŠΈλ₯Ό μž…λ ₯ν•˜λ©΄ AETHER-Net이 ν…μŠ€νŠΈλ₯Ό μƒμ„±ν•©λ‹ˆλ‹€.")
            with gr.Row():
                with gr.Column(scale=3):
                    prompt = gr.Textbox(
                        label="Prompt",
                        placeholder="Enter your prompt here...",
                        lines=3,
                        value="The theory of relativity explains that"
                    )
                with gr.Column(scale=1):
                    # Sampling controls; passed positionally to generate().
                    max_tokens = gr.Slider(16, 512, value=128, step=16, label="Max Tokens")
                    temperature = gr.Slider(0.0, 2.0, value=0.8, step=0.1, label="Temperature")
                    top_k = gr.Slider(0, 100, value=50, step=5, label="Top-K")
                    top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P")
            gen_btn = gr.Button("πŸš€ Generate", variant="primary", size="lg")
            output = gr.Textbox(label="Output", lines=12, interactive=False)
            gen_btn.click(
                fn=generate,
                inputs=[prompt, max_tokens, temperature, top_k, top_p],
                outputs=output,
            )
            gr.Markdown("### Quick Prompts")
            examples = gr.Examples(
                examples=[
                    ["The theory of relativity explains that"],
                    ["In Python, the most efficient way to sort a list is"],
                    ["The five elements of nature are"],
                    ["Artificial general intelligence requires"],
                    ["ν•œκ΅­μ˜ μˆ˜λ„λŠ”"],
                    ["def fibonacci(n):"],
                ],
                inputs=prompt,
            )
        # Tab 2: architecture report (loads the model on demand).
        with gr.Tab("πŸ” Model Info"):
            info_btn = gr.Button("Load Model Info", variant="primary")
            info_output = gr.Markdown()
            info_btn.click(fn=get_model_info, outputs=info_output)
        # Tab 3: static project description.
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
## AETHER-Net 0.8B
**Cross-Architecture Knowledge Distillation from Qwen3.5-0.8B**
### Method
- **Weight Transplant**: Qwen3.5-0.8B β†’ AETHER-Net (5Γ—5 Magic Square layout)
- **3-Stage MOHAWK Distillation**: KLD β†’ Hidden Alignment β†’ Oheng Regularization
- **Cost**: ~$0 (CPU-only, 100 steps demo)
### Architecture
- 25 Layers: 5 attention types Γ— 5 elements
- GDN, Full, Mamba2, Sliding Window, Cross Attention
- Oheng MoE: 25 experts, 상생(Generate) + 상극(Overcome)
### Source
- Model: [FINAL-Bench/AETHER-Net-0.8B](https://huggingface.co/FINAL-Bench/AETHER-Net-0.8B) (private)
- Space: [FINAL-Bench/agi-model-gen](https://huggingface.co/spaces/FINAL-Bench/agi-model-gen)
---
Β© 2026 VIDRAFT / Ginigen AI
""")

# ── Preload model on startup ──
# Loading at import time keeps the first user request fast.
print("\n=== Pre-loading model ===")
load_model()
print("=== Ready ===\n")

if __name__ == "__main__":
    app.launch()