walidsobhie-code Claude Opus 4.6 committed on
Commit · 99a7be2
Parent(s): bfc7d04
refactor: Clean up project structure - fewer root folders
Reorganized to a user-friendly structure:
- Moved legacy docs to docs/archive/
- Merged CLI tools to src/cli/
- Moved training scripts to scripts/
- Removed empty/broken directories (benchmarks, space, website)
- Added directory structure documentation
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- benchmarks/benchmark_context_lengths.py +0 -442
- benchmarks/test_context_window.py +0 -330
- CONTEXT_UPDATE_SUMMARY.md → docs/archive/CONTEXT_UPDATE_SUMMARY.md +0 -0
- DATA_SCALING_PLAN.md → docs/archive/DATA_SCALING_PLAN.md +0 -0
- DEPLOYMENT_TEST_REPORT.md → docs/archive/DEPLOYMENT_TEST_REPORT.md +0 -0
- EVAL_PLAN.md → docs/archive/EVAL_PLAN.md +0 -0
- IMPLEMENTATION_SUMMARY.md → docs/archive/IMPLEMENTATION_SUMMARY.md +0 -0
- LICENSES.md → docs/archive/LICENSES.md +0 -0
- MAXIMIZATION_PLAN.md → docs/archive/MAXIMIZATION_PLAN.md +0 -0
- OPENROUTER_SUBMISSION_CHECKLIST.md → docs/archive/OPENROUTER_SUBMISSION_CHECKLIST.md +0 -0
- PUSH_GUIDE.md → docs/archive/PUSH_GUIDE.md +0 -0
- STACK_CLI_README.md → docs/archive/STACK_CLI_README.md +0 -0
- SUBMISSION_PACKAGE_SUMMARY.md → docs/archive/SUBMISSION_PACKAGE_SUMMARY.md +0 -0
- TOGETHER_AI.md → docs/archive/TOGETHER_AI.md +0 -0
- context_window_upgrade_summary.md → docs/archive/context_window_upgrade_summary.md +0 -0
- {website → docs/archive/website}/app.js +0 -0
- {website → docs/archive/website}/benchmark.html +0 -0
- {website → docs/archive/website}/index.html +0 -0
- {website → docs/archive/website}/styles.css +0 -0
- training-data-extractor.js → scripts/training-data-extractor.js +0 -0
- space/Dockerfile +0 -37
- space/README.md +0 -124
- space/app.py +0 -600
- space/requirements.txt +0 -24
- {stack-2.9-cli → src/cli}/__init__.py +0 -0
- {stack_cli → src/cli}/agent.py +0 -0
- {stack_cli → src/cli}/cli.py +0 -0
- {stack_cli → src/cli}/context.py +0 -0
- {stack-2.9-cli → src/cli}/main.py +0 -0
- {stack_cli → src/cli}/pyproject.toml +0 -0
- {stack_cli → src/cli}/tools.py +0 -0
- stack-2.9-deploy/Dockerfile +22 -92
- stack-2.9-deploy/README.md +82 -304
- stack-2.9-deploy/app.py +577 -253
- stack-2.9-deploy/requirements.txt +24 -14
- {self_evolution → stack-2.9-training}/__init__.py +0 -0
- {self_evolution → stack-2.9-training}/apply.py +0 -0
- {self_evolution → stack-2.9-training}/learner.py +0 -0
- {self_evolution → stack-2.9-training}/memory.py +0 -0
- {self_evolution → stack-2.9-training}/observer.py +0 -0
- {stack_2_9_training → stack-2.9-training}/train_config_colab.yaml +0 -0
- {self_evolution → stack-2.9-training}/trainer.py +0 -0
- stack_cli/__init__.py +0 -19
- verify_repo.sh +0 -141
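
Because `stack_cli` and `stack-2.9-cli` are merged into `src/cli/` (see the renames above), any downstream code that imported from the old packages needs updated import paths. A minimal sketch of the expected change, assuming `src/` is a package root on the Python path and that module names carry over unchanged; the symbol names `Agent` and `TOOLS` are hypothetical, used only for illustration:

```python
# Hypothetical illustration only: Agent and TOOLS are assumed names,
# not taken from the repository.

# Before this commit:
# from stack_cli.agent import Agent
# from stack_cli.tools import TOOLS

# After this commit (stack_cli/ and stack-2.9-cli/ merged into src/cli/):
from src.cli.agent import Agent
from src.cli.tools import TOOLS
```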
benchmarks/benchmark_context_lengths.py
DELETED
@@ -1,442 +0,0 @@

```python
#!/usr/bin/env python3
"""
Benchmark script for comparing context window performance across different lengths.

This script compares:
1. 32K context (original claim)
2. 64K context (mid-range)
3. 128K context (full potential)

For each context length, it tests:
- Memory consumption (VRAM and RAM)
- Throughput (tokens/second during generation)
- Latency (time to first token)
- Quality (ability to process and generate coherent output)
- Task completion on sample coding tasks

Output: JSON results + summary report
"""

import os
import sys
import json
import time
import argparse
import statistics
from pathlib import Path
from typing import Dict, List, Any

# Required packages: vllm, transformers, psutil, torch

def get_memory_info():
    """Get memory statistics."""
    import torch
    import psutil

    process = psutil.Process(os.getpid())
    ram_mb = process.memory_info().rss / 1024 / 1024

    if torch.cuda.is_available():
        gpu_mem_allocated = torch.cuda.memory_allocated() / 1024 / 1024
        gpu_mem_reserved = torch.cuda.memory_reserved() / 1024 / 1024
        return {
            "ram_mb": round(ram_mb, 1),
            "gpu_allocated_mb": round(gpu_mem_allocated, 1),
            "gpu_reserved_mb": round(gpu_mem_reserved, 1),
            "gpu_used": True
        }
    else:
        return {
            "ram_mb": round(ram_mb, 1),
            "gpu_used": False
        }

def preprocess_prompt(prompt: str, tokenizer, target_tokens: int, mode: str = "repeat") -> List[int]:
    """Preprocess a prompt to reach target token length."""
    tokens = tokenizer.encode(prompt)

    if len(tokens) >= target_tokens:
        return tokens[:target_tokens]

    needed = target_tokens - len(tokens)

    if mode == "repeat":
        # Repeat a filler pattern
        filler = " This is additional context to fill the window. " * 100
        filler_tokens = tokenizer.encode(filler)
        repeats = (needed // len(filler_tokens)) + 1
        tokens.extend(filler_tokens * repeats)
    elif mode == "noise":
        # Use random-like content (code snippets)
        noise = """
// Dummy code for context expansion
function placeholder() {
    const x = 1;
    const y = 2;
    return x + y;
}
class DummyClass {
    constructor() {}
    method() {}
}
""".repeat(needed // 50 + 1)
        noise_tokens = tokenizer.encode(noise)
        tokens.extend(noise_tokens)

    return tokens[:target_tokens]

def load_model(model_name: str, max_model_len: int, block_size: int):
    """Load vLLM model with specified configuration."""
    from vllm import LLM

    print(f"Loading model with max_model_len={max_model_len}, block_size={block_size}")
    model = LLM(
        model=model_name,
        max_model_len=max_model_len,
        block_size=block_size,
        gpu_memory_utilization=0.9,
        trust_remote_code=True,
        tensor_parallel_size=1,
        # For benchmarking, disable speculative decoding for consistent results
        enable_chunked_prefill=False
    )
    return model

def run_generation(model, tokenizer, prompt_tokens: List[int], max_new_tokens: int = 200) -> Dict[str, Any]:
    """Run generation and collect metrics."""
    from vllm import SamplingParams

    sampling_params = SamplingParams(
        temperature=0.7,
        top_p=0.95,
        max_tokens=max_new_tokens,
        min_p=0.05
    )

    # Prefill phase timing
    torch = sys.modules.get('torch')
    if torch and torch.cuda.is_available():
        torch.cuda.synchronize()

    start_time = time.time()
    outputs = model.generate(
        prompt_token_ids=prompt_tokens,
        sampling_params=sampling_params,
        use_tqdm=False
    )
    end_time = time.time()

    if torch and torch.cuda.is_available():
        torch.cuda.synchronize()

    elapsed = end_time - start_time
    output_token_ids = outputs[0].outputs[0].token_ids
    output_text = outputs[0].outputs[0].text

    # Count tokens in output
    output_length = len(output_token_ids)

    # Calculate prefill latency (estimated)
    prefill_latency = elapsed * 0.3  # Rough estimate
    decode_latency = elapsed - prefill_latency

    # Tokens per second
    total_tokens = output_length
    tokens_per_second = total_tokens / elapsed if elapsed > 0 else 0

    return {
        "elapsed_seconds": round(elapsed, 4),
        "output_tokens": output_length,
        "output_text": output_text[:200],
        "tokens_per_second": round(tokens_per_second, 2),
        "prefill_latency_est": round(prefill_latency, 4),
        "decode_latency_est": round(decode_latency, 4)
    }

def test_task(model, tokenizer, context_length: int, task_name: str, prompt: str, max_response: int = 200) -> Dict[str, Any]:
    """Run a single benchmark task."""
    print(f"\n Task: {task_name}")
    sys.stdout.flush()

    mem_before = get_memory_info()
    prompt_tokens = preprocess_prompt(prompt, tokenizer, context_length)
    actual_context_len = len(prompt_tokens)

    start_time = time.time()
    try:
        result = run_generation(model, tokenizer, prompt_tokens, max_response)
        elapsed = time.time() - start_time
        mem_after = get_memory_info()

        # Calculate memory delta
        mem_delta = {}
        if mem_after.get("gpu_used"):
            mem_delta["gpu_allocated_delta_mb"] = round(
                mem_after["gpu_allocated_mb"] - mem_before["gpu_allocated_mb"], 1
            )
        mem_delta["ram_delta_mb"] = round(
            mem_after["ram_mb"] - mem_before["ram_mb"], 1
        )

        return {
            "task": task_name,
            "context_length_target": context_length,
            "context_length_actual": actual_context_len,
            "success": True,
            **result,
            **mem_delta
        }
    except Exception as e:
        elapsed = time.time() - start_time
        print(f" ❌ Failed: {e}")
        return {
            "task": task_name,
            "context_length_target": context_length,
            "success": False,
            "error": str(e),
            "elapsed_seconds": round(elapsed, 4)
        }

def main():
    parser = argparse.ArgumentParser(description="Benchmark context lengths: 32K, 64K, 128K")
    parser.add_argument("--model", type=str, default="Qwen/Qwen2.5-Coder-32B",
                        help="Model name")
    parser.add_argument("--output-dir", type=str, default="benchmarks/results",
                        help="Directory to save results")
    parser.add_argument("--context-lengths", type=int, nargs='+', default=[32768, 65536, 131072],
                        help="Context lengths to test")
    parser.add_argument("--tasks-per-length", type=int, default=5,
                        help="Number of tasks per context length")

    args = parser.parse_args()

    print("="*70)
    print("CONTEXT LENGTH BENCHMARK")
    print("="*70)
    print(f"Model: {args.model}")
    print(f"Context lengths: {args.context_lengths}")
    print(f"Tasks per length: {args.tasks_per_length}")

    # Sample tasks for benchmarking
    tasks = [
        {
            "name": "Code Completion",
            "prompt": """import React from 'react';
function Component({ children }) {
    return (
        <div className="container">
            {children}
        </div>
    );
}
export default Component;"""
        },
        {
            "name": "Bug Fix",
            "prompt": """function calculateTotal(items) {
    let total = 0;
    for (let i = 0; i <= items.length; i++) {
        total += items[i].price;
    }
    return total;
}
// This function has a bug. What is it and how would you fix it?"""
        },
        {
            "name": "Documentation Generation",
            "prompt": """class DataProcessor {
    constructor(config) {
        this.config = config;
        this.cache = new Map();
    }

    async process(data) {
        const result = await this.transform(data);
        return this.validate(result);
    }

    transform(data) {
        // Transform logic here
        return data.map(item => ({ ...item, processed: true }));
    }

    validate(result) {
        return result.filter(item => item.valid !== false);
    }
}
// Please generate comprehensive JSDoc documentation for this class."""
        },
        {
            "name": "Test Generation",
            "prompt": """const sum = (a, b) => a + b;
const multiply = (a, b) => a * b;
const divide = (a, b) => {
    if (b === 0) throw new Error('Division by zero');
    return a / b;
};
// Write Jest unit tests for these utility functions."""
        },
        {
            "name": "Refactoring",
            "prompt": """function processUserData(users) {
    const result = [];
    for (let i = 0; i < users.length; i++) {
        const user = users[i];
        if (user.active) {
            result.push({
                id: user.id,
                name: user.firstName + ' ' + user.lastName,
                email: user.email.toLowerCase()
            });
        }
    }
    return result;
}
// Refactor this function using modern ES6+ features (map, filter, destructuring, template literals)."""
        }
    ]

    results = {
        "metadata": {
            "model": args.model,
            "context_lengths_tested": args.context_lengths,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "tasks": [t["name"] for t in tasks],
            "max_new_tokens": 200
        },
        "results": []
    }

    try:
        # Import dependencies
        print("\n📦 Loading dependencies...")
        from transformers import AutoTokenizer
        sys.path.insert(0, '/Users/walidsobhi/.openclaw/workspace/stack-2.9/stack-2.9-deploy')

        print(f"\n🔍 Loading tokenizer for {args.model}...")
        tokenizer = AutoTokenizer.from_pretrained(
            args.model,
            trust_remote_code=True
        )
        print(f"Tokenizer loaded. Vocab size: {tokenizer.vocab_size}")

        all_task_results = []

        # Test each context length
        for context_len in args.context_lengths:
            print(f"\n{'='*70}")
            print(f"TESTING CONTEXT LENGTH: {context_len} tokens ({context_len/1024:.0f}K)")
            print(f"{'='*70}")

            # Load model fresh for each context length (optional, but cleaner)
            print(f"\n🤖 Loading model...")
            model = load_model(args.model, max_model_len=context_len, block_size=64)

            # Get initial memory after load
            mem_after_load = get_memory_info()
            print(f" Model loaded. Memory: {mem_after_load}")

            length_results = []

            # Run tasks (selected subset based on context length)
            num_tasks = min(args.tasks_per_length, len(tasks))

            for i in range(num_tasks):
                task = tasks[i % len(tasks)]
                print(f"\n[{i+1}/{num_tasks}] Running task: {task['name']}")
                sys.stdout.flush()

                result = test_task(
                    model, tokenizer, context_len,
                    f"{task['name']} @ {context_len}",
                    task["prompt"]
                )
                length_results.append(result)
                all_task_results.append(result)

                # Small delay between tasks
                time.sleep(1)

            # Print summary for this context length
            successful = [r for r in length_results if r.get('success', False)]
            if successful:
                avg_tps = statistics.mean([r['tokens_per_second'] for r in successful])
                avg_latency = statistics.mean([r['elapsed_seconds'] for r in successful])
                print(f"\n📈 Summary for {context_len} tokens:")
                print(f" Avg throughput: {avg_tps:.2f} tokens/sec")
                print(f" Avg latency: {avg_latency:.3f}s")
                print(f" Success count: {len(successful)}/{len(length_results)}")

            # Unload model to free memory before next test
            del model
            import gc
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            print(f" ✓ Completed testing for {context_len}")

        # Compile final results
        results["results"] = all_task_results

        # Calculate summary statistics
        summary = {}
        for context_len in args.context_lengths:
            len_results = [r for r in all_task_results
                           if r.get('context_length_target') == context_len and r.get('success')]
            if len_results:
                summary[str(context_len)] = {
                    "count": len(len_results),
                    "avg_tokens_per_second": round(statistics.mean([r['tokens_per_second'] for r in len_results]), 2),
                    "avg_latency_seconds": round(statistics.mean([r['elapsed_seconds'] for r in len_results]), 3),
                    "avg_gpu_memory_delta_mb": round(statistics.mean([r.get('gpu_allocated_delta_mb', 0) for r in len_results]), 1),
                    "avg_ram_delta_mb": round(statistics.mean([r.get('ram_delta_mb', 0) for r in len_results]), 1)
                }
        results["summary"] = summary

    except ImportError as e:
        print(f"❌ Missing dependencies: {e}")
        print("Please install: pip install vllm transformers psutil torch")
        sys.exit(1)
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

    # Save results
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    timestamp = time.strftime("%Y%m%d_%H%M%S")
    output_file = output_dir / f"benchmark_{timestamp}.json"

    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"\n{'='*70}")
    print("BENCHMARK COMPLETE")
    print(f"{'='*70}")
    print(f"Results saved to: {output_file}")

    # Print summary table
    print("\n📊 Performance Summary:")
    print("-"*70)
    print(f"{'Context':<10} {'Throughput':<15} {'Latency':<12} {'GPU Δ':<12} {'RAM Δ':<12}")
    print("-"*70)

    if summary:
        for length_str, stats in sorted(summary.items()):
            length = int(length_str)
            length_k = length // 1024
            print(f"{length_k:>3}K {stats['avg_tokens_per_second']:>5.1f} tok/s {stats['avg_latency_seconds']:>6.3f}s "
                  f"{stats['avg_gpu_memory_delta_mb']:>6.1f} MB {stats['avg_ram_delta_mb']:>6.1f} MB")

    print("\n✅ Benchmark finished!")
    print("\nNext steps:")
    print(" 1. Review results in the JSON output file")
    print(" 2. Check if 128K provides quality benefits that justify any performance trade-offs")
    print(" 3. Update deployment configuration with optimal block_size and scheduler settings")

if __name__ == "__main__":
    main()
```
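
After a run, the script above writes `benchmark_<timestamp>.json` with `metadata`, `results`, and `summary` keys (see its `main()`). A minimal sketch of reading such a file back and printing the per-context-length averages; the file name below is a placeholder for whatever timestamped file the script produced:

```python
import json
from pathlib import Path

# Placeholder path; substitute the benchmark_<timestamp>.json written by the script.
results_file = Path("benchmarks/results/benchmark_20250101_000000.json")

data = json.loads(results_file.read_text())
print(f"Model: {data['metadata']['model']}")
for context_len, stats in sorted(data["summary"].items(), key=lambda kv: int(kv[0])):
    print(f"{int(context_len) // 1024}K: "
          f"{stats['avg_tokens_per_second']} tok/s, "
          f"{stats['avg_latency_seconds']} s latency, "
          f"GPU delta {stats['avg_gpu_memory_delta_mb']} MB")
```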
benchmarks/test_context_window.py
DELETED
@@ -1,330 +0,0 @@

```python
#!/usr/bin/env python3
"""
Test script for verifying 128K context window support for Qwen2.5-Coder-32B.

This script:
1. Loads the model with vLLM configured for 128K context
2. Tests with various input lengths (32K, 64K, 96K, 128K)
3. Measures memory usage, throughput, and latency
4. Tests with real codebase context (entire project)
5. Validates that the model correctly processes long inputs
"""

import os
import sys
import json
import time
import psutil
import argparse
from pathlib import Path
from typing import Dict, List, Tuple

# Add vLLM to path
sys.path.insert(0, '/Users/walidsobhi/.openclaw/workspace/stack-2.9/stack-2.9-deploy')

def get_memory_usage() -> Dict[str, float]:
    """Get current memory usage in MB."""
    process = psutil.Process(os.getpid())
    memory_info = process.memory_info()
    return {
        'rss_mb': memory_info.rss / 1024 / 1024,
        'vms_mb': memory_info.vms / 1024 / 1024
    }

def generate_token_sequence(length: int, tokenizer) -> List[int]:
    """Generate a sequence of tokens of approximately the target length."""
    # Create a repeating pattern that tokenizes consistently
    base_text = "This is a test token sequence for context window testing. " * 10
    tokens = tokenizer.encode(base_text)
    # Repeat the tokens to reach desired length
    num_repeats = (length // len(tokens)) + 1
    token_sequence = tokens * num_repeats
    return token_sequence[:length]

def read_codebase_files(base_path: str, max_files: int = 100) -> str:
    """Read source code files from the codebase to create a realistic long context."""
    codebase_text = ""
    src_dir = Path(base_path) / "src"
    if not src_dir.exists():
        return ""

    file_count = 0
    for file_path in src_dir.rglob("*.ts"):
        if file_count >= max_files:
            break
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
                codebase_text += f"\n\n// File: {file_path.relative_to(base_path)}\n{content}\n"
                file_count += 1
        except Exception as e:
            print(f"Warning: Could not read {file_path}: {e}")

    return codebase_text

def test_context_length(model, tokenizer, context_length: int, test_name: str) -> Dict:
    """Test model with a specific context length."""
    print(f"\n{'='*60}")
    print(f"Testing {test_name} (target: {context_length} tokens)")
    print(f"{'='*60}")

    # Generate input sequence
    tokens = generate_token_sequence(context_length, tokenizer)
    actual_length = len(tokens)
    print(f"Generated input with {actual_length} tokens")

    # Measure memory before inference
    mem_before = get_memory_usage()

    # Run inference (generate a short response to test context processing)
    start_time = time.time()
    try:
        # Use vLLM's generate
        from vllm import SamplingParams
        sampling_params = SamplingParams(
            temperature=0.1,
            max_tokens=50,  # Generate only 50 tokens
            prompt_logprobs=0
        )

        outputs = model.generate(
            prompt_token_ids=tokens,
            sampling_params=sampling_params,
            use_tqdm=False
        )

        elapsed = time.time() - start_time
        mem_after = get_memory_usage()

        # Calculate metrics
        output_text = outputs[0].outputs[0].text
        output_tokens = len(outputs[0].outputs[0].token_ids)
        tokens_per_second = output_tokens / elapsed if elapsed > 0 else 0

        result = {
            "test": test_name,
            "target_length": context_length,
            "actual_length": actual_length,
            "output_tokens": output_tokens,
            "latency_seconds": round(elapsed, 3),
            "tokens_per_second": round(tokens_per_second, 2),
            "memory_before_mb": round(mem_before['rss_mb'], 2),
            "memory_after_mb": round(mem_after['rss_mb'], 2),
            "memory_delta_mb": round(mem_after['rss_mb'] - mem_before['rss_mb'], 2),
            "success": True,
            "sample_output": output_text[:100] if output_text else ""
        }

        print(f"✅ Success!")
        print(f" Latency: {elapsed:.3f}s")
        print(f" Throughput: {tokens_per_second:.2f} tokens/sec")
        print(f" Memory delta: {result['memory_delta_mb']:.1f} MB")
        print(f" Sample output: {result['sample_output']}")

    except Exception as e:
        elapsed = time.time() - start_time
        result = {
            "test": test_name,
            "target_length": context_length,
            "actual_length": actual_length,
            "success": False,
            "error": str(e),
            "latency_seconds": round(elapsed, 3)
        }
        print(f"❌ Failed: {e}")

    return result

def test_with_codebase(model, tokenizer, codebase_path: str) -> Dict:
    """Test the model with the entire codebase as context."""
    print(f"\n{'='*60}")
    print(f"Testing with real codebase context")
    print(f"{'='*60}")

    # Read codebase files
    print("Reading codebase files...")
    codebase_text = read_codebase_files(codebase_path, max_files=200)
    codebase_tokens = tokenizer.encode(codebase_text)
    context_length = len(codebase_tokens)
    print(f"Codebase encoded to {context_length} tokens ({context_length/1024:.1f}K)")

    if context_length < 1000:
        print("⚠️ Warning: Codebase is too small, generate synthetic long context instead")
        codebase_tokens = generate_token_sequence(131072, tokenizer)
        context_length = len(codebase_tokens)

    mem_before = get_memory_usage()
    start_time = time.time()

    try:
        from vllm import SamplingParams
        sampling_params = SamplingParams(
            temperature=0.2,
            max_tokens=100,
            prompt_logprobs=0
        )

        outputs = model.generate(
            prompt_token_ids=codebase_tokens,
            sampling_params=sampling_params,
            use_tqdm=False
        )

        elapsed = time.time() - start_time
        mem_after = get_memory_usage()

        output_text = outputs[0].outputs[0].text
        output_tokens = len(outputs[0].outputs[0].token_ids)
        tokens_per_second = output_tokens / elapsed if elapsed > 0 else 0

        result = {
            "test": "Codebase Context",
            "context_size_k": round(context_length / 1024, 1),
            "output_tokens": output_tokens,
            "latency_seconds": round(elapsed, 3),
            "tokens_per_second": round(tokens_per_second, 2),
            "memory_before_mb": round(mem_before['rss_mb'], 2),
            "memory_after_mb": round(mem_after['rss_mb'], 2),
            "memory_delta_mb": round(mem_after['rss_mb'] - mem_before['rss_mb'], 2),
            "success": True,
            "sample_output": output_text[:150]
        }

        print(f"✅ Success!")
        print(f" Context size: {result['context_size_k']}K tokens")
        print(f" Latency: {elapsed:.3f}s")
        print(f" Throughput: {tokens_per_second:.2f} tokens/sec")
        print(f" Memory delta: {result['memory_delta_mb']:.1f} MB")
        print(f" Sample output: {result['sample_output']}")

    except Exception as e:
        elapsed = time.time() - start_time
        result = {
            "test": "Codebase Context",
            "success": False,
            "error": str(e),
            "latency_seconds": round(elapsed, 3)
        }
        print(f"❌ Failed: {e}")

    return result

def main():
    parser = argparse.ArgumentParser(description="Test 128K context window for Qwen2.5-Coder-32B")
    parser.add_argument("--model", type=str, default="Qwen/Qwen2.5-Coder-32B",
                        help="Model name or path")
    parser.add_argument("--max-model-len", type=int, default=131072,
                        help="Maximum model length for vLLM")
    parser.add_argument("--block-size", type=int, default=64,
                        help="vLLM block size")
    parser.add_argument("--codebase-path", type=str,
                        default="/Users/walidsobhi/.openclaw/workspace/stack-2.9",
                        help="Path to the codebase for real context test")
    parser.add_argument("--output", type=str,
                        default="benchmarks/test_context_results.json",
                        help="Output file for results")

    args = parser.parse_args()

    print(f"Starting 128K Context Window Test")
    print(f"Model: {args.model}")
    print(f"Config: max_model_len={args.max_model_len}, block_size={args.block_size}")

    results = []

    try:
        # Import vLLM and Transformers
        print("\n📦 Loading tokenizer...")
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            args.model,
            trust_remote_code=True
        )
        print(f"Tokenizer loaded. Vocab size: {tokenizer.vocab_size}")

        print("\n🤖 Loading vLLM model...")
        from vllm import LLM

        # Initialize vLLM with large context configuration
        model = LLM(
            model=args.model,
            max_model_len=args.max_model_len,
            block_size=args.block_size,
            gpu_memory_utilization=0.9,
            trust_remote_code=True,
            tensor_parallel_size=1  # Adjust if using multiple GPUs
        )
        print("Model loaded successfully!")

        # Test 1: Small context (8K) - baseline
        results.append(test_context_length(model, tokenizer, 8192, "8K Baseline"))

        # Test 2: Medium context (32K)
        results.append(test_context_length(model, tokenizer, 32768, "32K"))

        # Test 3: Large context (64K)
        results.append(test_context_length(model, tokenizer, 65536, "64K"))

        # Test 4: Full context (96K)
        results.append(test_context_length(model, tokenizer, 98304, "96K"))

        # Test 5: Maximum context (128K)
        results.append(test_context_length(model, tokenizer, 131072, "128K"))

        # Test 6: Codebase context
        results.append(test_with_codebase(model, tokenizer, args.codebase_path))

    except ImportError as e:
        print(f"❌ Import error: {e}")
        print("Make sure vLLM and transformers are installed:")
        print("  pip install vllm transformers")
        sys.exit(1)
    except Exception as e:
        print(f"❌ Error during testing: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

    # Save results
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w') as f:
        json.dump({
            "metadata": {
                "model": args.model,
                "max_model_len": args.max_model_len,
                "block_size": args.block_size,
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
                "system": os.uname().sysname if hasattr(os, 'uname') else "Unknown"
            },
            "results": results
        }, f, indent=2)

    print(f"\n📊 Results saved to: {output_path}")
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)

    successful = [r for r in results if r.get('success', False)]
    failed = [r for r in results if not r.get('success', False)]

    print(f"Total tests: {len(results)}")
    print(f"Successful: {len(successful)}")
    print(f"Failed: {len(failed)}")

    if successful:
        print("\nContext length vs. throughput:")
        for r in successful:
            if r['test'] != 'Codebase Context':
                print(f"  {r['test']}: {r['tokens_per_second']} tokens/sec, "
                      f"memory delta: {r['memory_delta_mb']}MB")
        if any(r['test'] == 'Codebase Context' for r in successful):
            cb = next(r for r in successful if r['test'] == 'Codebase Context')
            print(f"\nCodebase test: {cb['context_size_k']}K tokens, "
                  f"{cb['tokens_per_second']} tokens/sec")

    print("\n✅ Test script completed!")

if __name__ == "__main__":
    main()
```
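
The script above is driven entirely by its argparse flags. A minimal sketch of how it was invoked with explicit settings before this commit removed it (the flags mirror its argparse definitions, and the values shown are simply the script's own defaults):

```python
import subprocess
import sys

# Applies to the pre-refactor tree, where benchmarks/test_context_window.py still exists.
subprocess.run(
    [
        sys.executable, "benchmarks/test_context_window.py",
        "--model", "Qwen/Qwen2.5-Coder-32B",
        "--max-model-len", "131072",
        "--block-size", "64",
        "--output", "benchmarks/test_context_results.json",
    ],
    check=True,
)
```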
CONTEXT_UPDATE_SUMMARY.md → docs/archive/CONTEXT_UPDATE_SUMMARY.md
RENAMED · File without changes

DATA_SCALING_PLAN.md → docs/archive/DATA_SCALING_PLAN.md
RENAMED · File without changes

DEPLOYMENT_TEST_REPORT.md → docs/archive/DEPLOYMENT_TEST_REPORT.md
RENAMED · File without changes

EVAL_PLAN.md → docs/archive/EVAL_PLAN.md
RENAMED · File without changes

IMPLEMENTATION_SUMMARY.md → docs/archive/IMPLEMENTATION_SUMMARY.md
RENAMED · File without changes

LICENSES.md → docs/archive/LICENSES.md
RENAMED · File without changes

MAXIMIZATION_PLAN.md → docs/archive/MAXIMIZATION_PLAN.md
RENAMED · File without changes

OPENROUTER_SUBMISSION_CHECKLIST.md → docs/archive/OPENROUTER_SUBMISSION_CHECKLIST.md
RENAMED · File without changes

PUSH_GUIDE.md → docs/archive/PUSH_GUIDE.md
RENAMED · File without changes

STACK_CLI_README.md → docs/archive/STACK_CLI_README.md
RENAMED · File without changes

SUBMISSION_PACKAGE_SUMMARY.md → docs/archive/SUBMISSION_PACKAGE_SUMMARY.md
RENAMED · File without changes

TOGETHER_AI.md → docs/archive/TOGETHER_AI.md
RENAMED · File without changes

context_window_upgrade_summary.md → docs/archive/context_window_upgrade_summary.md
RENAMED · File without changes

{website → docs/archive/website}/app.js
RENAMED · File without changes

{website → docs/archive/website}/benchmark.html
RENAMED · File without changes

{website → docs/archive/website}/index.html
RENAMED · File without changes

{website → docs/archive/website}/styles.css
RENAMED · File without changes

training-data-extractor.js → scripts/training-data-extractor.js
RENAMED · File without changes
space/Dockerfile
DELETED
@@ -1,37 +0,0 @@

```dockerfile
# Stack 2.9 HuggingFace Spaces Dockerfile
# Optimized for 16GB GPU with 4-bit quantization

FROM python:3.10-slim

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV TRANSFORMERS_CACHE=/workspace/.cache/huggingface
ENV HF_HOME=/workspace/.cache/huggingface

# Install system dependencies
RUN apt-get update && apt-get install -y \
    git \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Create workspace directory
WORKDIR /workspace

# Copy requirements first for better caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy application files
COPY . .

# Expose Gradio port
EXPOSE 7860

# Create startup script
RUN echo '#!/bin/bash\necho "🚀 Starting Stack 2.9..."\npython app.py --port 7860 --share' > /start.sh
RUN chmod +x /start.sh

# Launch command
CMD ["/start.sh"]
```
space/README.md
DELETED
@@ -1,124 +0,0 @@

````markdown
# 🚀 Stack 2.9 - Pattern-Based AI Coding Assistant

A HuggingFace Spaces demo for Stack 2.9, a pattern-based AI coding assistant powered by Qwen2.5-Coder-7B.





## ✨ Features

- **🤖 Qwen2.5-Coder-7B** - State-of-the-art code generation model
- **🔧 7 Integrated Tools** - File operations, git, web search, shell commands
- **🧠 Pattern Memory** - Learns from each interaction
- **⚡ Fast Streaming** - Real-time token-by-token generation
- **💾 4-bit Quantization** - Runs on 16GB GPU (~4GB VRAM)

## 🔧 Available Tools

| Tool | Description |
|------|-------------|
| `file_read` | Read files from the filesystem |
| `file_write` | Write content to files |
| `git_status` | Check git repository status |
| `web_search` | Search the web for information |
| `run_command` | Execute shell commands |
| `create_directory` | Create new directories |
| `list_directory` | List directory contents |

## 🏃♂️ Quick Start

### Local Development

```bash
# Clone the repository
git clone https://github.com/your-repo/stack-2.9.git
cd stack-2.9/space

# Install dependencies
pip install -r requirements.txt

# Run the demo
python app.py --share
```

### HuggingFace Spaces

1. Create a new Space on [HuggingFace](https://huggingface.co/spaces)
2. Select "Gradio" as the SDK
3. Upload the files from this directory:
   - `app.py`
   - `requirements.txt`
   - `README.md`
4. The model will load automatically on startup

## 💻 Usage

### Example Prompts

```
Hello! What can you help me with?
Check git status of this repository
Search for best practices for Python async programming
List the files in the current directory
Write a simple Python function to calculate fibonacci
How do I use Git to create a new branch?
What's your memory of our conversation?
```

### Python API

```python
from app import StackModel, memory

# Initialize model
model = StackModel()
model.load()

# Generate response
response = model.generate("Write a hello world in Python")
print(response)

# Check memory stats
print(memory.get_stats())
```

## 🔐 Environment Variables

- `HF_TOKEN` - Your HuggingFace token for private models (optional)
- `MODEL_ID` - Override default model (default: Qwen/Qwen2.5-Coder-7B-Instruct)

## 📊 Memory System

Stack 2.9 includes a pattern memory system that:

1. **Tracks Interactions** - Records every user-assistant exchange
2. **Learns Patterns** - Identifies frequently used tools
3. **Stores Code** - Saves useful code snippets
4. **Adapts Behavior** - Uses learned context to improve responses

## 🛠️ Tech Stack

- **Model**: Qwen2.5-Coder-7B-Instruct
- **Quantization**: 4-bit (bitsandbytes)
- **Framework**: Gradio 4.0+
- **Backend**: Transformers + Accelerate
- **GPU**: 16GB VRAM recommended

## 📝 License

MIT License - see LICENSE file for details.

## 🙏 Acknowledgments

- [Qwen](https://github.com/QwenLM/Qwen) - Base model
- [HuggingFace](https://huggingface.co/) - Spaces hosting
- [Gradio](https://gradio.app/) - UI framework

---

<div align="center">

Made with ❤️ by Stack 2.9

</div>
````
space/app.py
DELETED
@@ -1,600 +0,0 @@

```python
"""
Stack 2.9 - Pattern-Based AI Coding Assistant
HuggingFace Spaces Demo

A Gradio interface for Stack 2.9 powered by Qwen2.5-Coder-7B
with tool integration and pattern memory.
"""

import os
import json
import time
from datetime import datetime
from typing import List, Dict, Optional
import gradio as gr

# ============================================================
# Pattern Memory System
# ============================================================

class SelfEvolutionMemory:
    """Simple in-memory pattern memory system for demo purposes."""

    def __init__(self):
        self.conversations = []
        self.learned_patterns = {}
        self.code_snippets = []
        self.preferences = {}
        self.interaction_count = 0

    def add_interaction(self, user_input: str, assistant_response: str, tools_used: List[str] = None):
        """Record an interaction for learning."""
        self.interaction_count += 1
        interaction = {
            "timestamp": datetime.now().isoformat(),
            "user_input": user_input,
            "assistant_response": assistant_response,
            "tools_used": tools_used or [],
            "interaction_id": self.interaction_count
        }
        self.conversations.append(interaction)

        # Extract patterns from the interaction
        self._learn_from_interaction(user_input, assistant_response, tools_used or [])

    def _learn_from_interaction(self, user_input: str, response: str, tools: List[str]):
        """Learn patterns from interactions."""
        # Track tool usage patterns
        for tool in tools:
            if tool not in self.learned_patterns:
                self.learned_patterns[tool] = {"count": 0, "contexts": []}
            self.learned_patterns[tool]["count"] += 1
            self.learned_patterns[tool]["contexts"].append(user_input[:100])

        # Extract code snippets if present
        if "```" in response:
            self.code_snippets.append({
                "timestamp": datetime.now().isoformat(),
                "snippet": response
            })

    def get_context(self) -> str:
        """Get accumulated context for the model."""
        context_parts = [f"## Pattern Memory ({self.interaction_count} interactions)"]

        if self.learned_patterns:
            context_parts.append("\n### Tool Usage Patterns:")
            for tool, data in sorted(self.learned_patterns.items(), key=lambda x: x[1]["count"], reverse=True)[:5]:
                context_parts.append(f"- {tool}: used {data['count']} times")

        if self.code_snippets:
            context_parts.append(f"\n### Learned {len(self.code_snippets)} code patterns")

        return "\n".join(context_parts)

    def get_stats(self) -> Dict:
        """Get memory statistics."""
        return {
            "total_interactions": self.interaction_count,
            "tool_patterns": len(self.learned_patterns),
            "code_snippets": len(self.code_snippets),
            "recent_tools": [t for t in self.learned_patterns.keys()][:5]
        }


# Global memory instance
memory = SelfEvolutionMemory()

# ============================================================
# Tool System
# ============================================================

class Tool:
    """Base tool class."""

    def __init__(self, name: str, description: str, func):
        self.name = name
        self.description = description
        self.func = func

    async def execute(self, *args, **kwargs):
        return await self.func(*args, **kwargs)


# Tool implementations (simplified for demo)
async def tool_file_read(path: str) -> str:
    """Read a file."""
    try:
        with open(path, 'r') as f:
            return f.read()[:5000]  # Limit output
    except FileNotFoundError:
        return f"File not found: {path}"
    except Exception as e:
        return f"Error reading file: {str(e)}"


async def tool_file_write(path: str, content: str) -> str:
    """Write to a file."""
    try:
        os.makedirs(os.path.dirname(path) if os.path.dirname(path) else ".", exist_ok=True)
        with open(path, 'w') as f:
            f.write(content)
        return f"Successfully wrote to {path}"
    except Exception as e:
        return f"Error writing file: {str(e)}"


async def tool_git_status() -> str:
    """Get git status."""
    import subprocess
    try:
        result = subprocess.run(["git", "status", "--short"], capture_output=True, text=True, cwd=os.getcwd())
        return result.stdout or "No changes"
    except Exception as e:
        return f"Git error: {str(e)}"


async def tool_web_search(query: str) -> str:
    """Search the web."""
    from urllib.parse import quote
    # Return a demo response since we can't make actual API calls
    return f"🔍 Search results for '{query}':\n\n1. [Result 1] - Description here\n2. [Result 2] - Description here\n3. [Result 3] - Description here\n\n(Install brave-search to enable real search)"


async def tool_run_command(cmd: str) -> str:
    """Run a shell command."""
    import subprocess
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=30)
        return f"Output:\n{result.stdout}\n\nErrors:\n{result.stderr}" if result.stderr else result.stdout
    except Exception as e:
        return f"Command error: {str(e)}"


async def tool_create_directory(path: str) -> str:
    """Create a directory."""
    try:
        os.makedirs(path, exist_ok=True)
        return f"Directory created: {path}"
    except Exception as e:
        return f"Error: {str(e)}"


async def tool_list_directory(path: str = ".") -> str:
    """List directory contents."""
    try:
        items = os.listdir(path)
        return "\n".join([f"📁 {i}/" if os.path.isdir(os.path.join(path, i)) else f"📄 {i}" for i in items[:50]])
    except Exception as e:
        return f"Error: {str(e)}"


# Register tools
TOOLS = {
    "file_read": Tool("file_read", "Read a file from the filesystem", tool_file_read),
    "file_write": Tool("file_write", "Write content to a file", tool_file_write),
    "git_status": Tool("git_status", "Check git repository status", tool_git_status),
    "web_search": Tool("web_search", "Search the web for information", tool_web_search),
    "run_command": Tool("run_command", "Execute a shell command", tool_run_command),
    "create_directory": Tool("create_directory", "Create a new directory", tool_create_directory),
    "list_directory": Tool("list_directory", "List files in a directory", tool_list_directory),
}


def get_tool_descriptions() -> str:
    """Get descriptions of all available tools."""
    return "\n".join([f"- **{t.name}**: {t.description}" for t in TOOLS.values()])


# ============================================================
# Model Interface
# ============================================================

class StackModel:
    """Stack 2.9 model interface using transformers."""

    def __init__(self, model_id: str = "Qwen/Qwen2.5-Coder-7B-Instruct"):
        self.model_id = model_id
        self.model = None
        self.tokenizer = None
        self.pipeline = None

    def load(self):
        """Load the model with 4-bit quantization for HF Spaces."""
        from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
        import torch

        print(f"Loading {self.model_id}...")

        # 4-bit quantization config for 16GB GPU
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_id,
            trust_remote_code=True
        )

        # Load model with quantization
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True
        )

        print("Model loaded successfully!")

    def generate(self, prompt: str, max_tokens: int = 512, temperature: float = 0.7) -> str:
        """Generate a response."""
        if not self.tokenizer:
            return "Model not loaded. Please wait for initialization."

        # Build the prompt with system and tools
        system_prompt = f"""You are Stack 2.9 - a pattern-based AI coding assistant.

## Available Tools
{get_tool_descriptions()}

## Your Capabilities
- Write, read, and execute code
- Use git for version control
- Search the web for information
- Create and manage files
- Execute shell commands

## Self-Evolution
You learn from each interaction. After responding, summarize what tools you used.

{memory.get_context()}

## Instructions
1. Be helpful and concise
2. Use tools when needed
3. Learn from the conversation
4. Provide code examples when relevant

Now respond to the user:"""

        full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nAssistant:"

        # Tokenize
        inputs = self.tokenizer(full_prompt, return_tensors="pt").to(self.model.device)

        # Generate
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            do_sample=True,
            top_p=0.9,
            repetition_penalty=1.1
        )

        # Decode
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract just the response part
        if "Assistant:" in response:
            response = response.split("Assistant:")[-1].strip()

        return response

    def generate_streaming(self, prompt: str, max_tokens: int = 512):
        """Generate with streaming (yields tokens)."""
        if not self.tokenizer:
            yield "Model not loaded. Please wait for initialization."
            return

        system_prompt = f"""You are Stack 2.9 - a pattern-based AI coding assistant.

## Available Tools
{get_tool_descriptions()}

## Self-Evolution Memory
{memory.get_context()}

Now respond to the user:"""

        full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nAssistant:"

        inputs = self.tokenizer(full_prompt, return_tensors="pt").to(self.model.device)

        # Generate token by token
        from transformers import GenerationMixin
        from typing import Iterator

        generated_ids = inputs.input_ids

        for _ in range(max_tokens):
            with torch.no_grad():
                outputs = self.model(generated_ids)
                next_token_logits = outputs.logits[:, -1, :]

            # Apply temperature
            next_token_logits = next_token_logits / 0.7

            # Sample
            probs = torch.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            generated_ids = torch.cat([generated_ids, next_token], dim=-1)

            # Decode and yield
            token_str = self.tokenizer.decode(next_token[0], skip_special_tokens=True)
            yield token_str

            # Stop on EOS
            if next_token.item() == self.tokenizer.eos_token_id:
                break


# Global model instance
model = None


def initialize_model():
    """Initialize the model on startup."""
    global model
    try:
        model = StackModel()
        model.load()
        return model
    except Exception as e:
        print(f"Failed to load model: {e}")
        return None


# ============================================================
```
|
| 354 |
-
# Gradio Interface
|
| 355 |
-
# ============================================================
|
| 356 |
-
|
| 357 |
-
def format_tools_used(tools_used: List[str]) -> str:
|
| 358 |
-
"""Format the tools used for display."""
|
| 359 |
-
if not tools_used:
|
| 360 |
-
return ""
|
| 361 |
-
return f"\n\n🔧 **Tools Used**: {', '.join(tools_used)}"
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
def chat_response(message: str, history: List[List[str]]) -> tuple:
|
| 365 |
-
"""Process a chat message and return response."""
|
| 366 |
-
global model, memory
|
| 367 |
-
|
| 368 |
-
if model is None or model.model is None:
|
| 369 |
-
return "⏳ Model is loading. Please wait...", history + [[message, "⏳ Model is loading. Please wait..."]]
|
| 370 |
-
|
| 371 |
-
# Track tools used
|
| 372 |
-
tools_used = []
|
| 373 |
-
|
| 374 |
-
# Check if we need to use tools based on the message
|
| 375 |
-
message_lower = message.lower()
|
| 376 |
-
|
| 377 |
-
if any(kw in message_lower for kw in ['git status', 'git']):
|
| 378 |
-
tools_used.append("git_status")
|
| 379 |
-
if any(kw in message_lower for kw in ['search', 'find', 'look up']):
|
| 380 |
-
tools_used.append("web_search")
|
| 381 |
-
if any(kw in message_lower for kw in ['list files', 'directory', 'ls']):
|
| 382 |
-
tools_used.append("list_directory")
|
| 383 |
-
if any(kw in message_lower for kw in ['run ', 'execute', 'command']):
|
| 384 |
-
tools_used.append("run_command")
|
| 385 |
-
|
| 386 |
-
# Generate response
|
| 387 |
-
try:
|
| 388 |
-
response = model.generate(message, max_tokens=512)
|
| 389 |
-
except Exception as e:
|
| 390 |
-
response = f"I encountered an error: {str(e)}"
|
| 391 |
-
|
| 392 |
-
# Add tools used to response
|
| 393 |
-
response += format_tools_used(tools_used)
|
| 394 |
-
|
| 395 |
-
# Record in memory
|
| 396 |
-
memory.add_interaction(message, response, tools_used)
|
| 397 |
-
|
| 398 |
-
return response
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
def chat_response_stream(message: str, history: List[List[str]]) -> Generator:
|
| 402 |
-
"""Process a chat message with streaming."""
|
| 403 |
-
global model, memory
|
| 404 |
-
|
| 405 |
-
if model is None or model.model is None:
|
| 406 |
-
yield "⏳ Model is loading. Please wait..."
|
| 407 |
-
return
|
| 408 |
-
|
| 409 |
-
full_response = ""
|
| 410 |
-
tools_used = []
|
| 411 |
-
|
| 412 |
-
message_lower = message.lower()
|
| 413 |
-
if any(kw in message_lower for kw in ['git status', 'git']):
|
| 414 |
-
tools_used.append("git_status")
|
| 415 |
-
if any(kw in message_lower for kw in ['search', 'find']):
|
| 416 |
-
tools_used.append("web_search")
|
| 417 |
-
if any(kw in message_lower for kw in ['list', 'directory']):
|
| 418 |
-
tools_used.append("list_directory")
|
| 419 |
-
|
| 420 |
-
# Stream the response
|
| 421 |
-
for token in model.generate_streaming(message, max_tokens=256):
|
| 422 |
-
full_response += token
|
| 423 |
-
yield full_response
|
| 424 |
-
|
| 425 |
-
# Add tools used
|
| 426 |
-
if tools_used:
|
| 427 |
-
full_response += format_tools_used(tools_used)
|
| 428 |
-
yield full_response
|
| 429 |
-
|
| 430 |
-
# Record in memory
|
| 431 |
-
memory.add_interaction(message, full_response, tools_used)
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
# Example prompts for the UI
|
| 435 |
-
EXAMPLE_PROMPTS = [
|
| 436 |
-
"Hello! What can you help me with?",
|
| 437 |
-
"Check git status of this repository",
|
| 438 |
-
"Search for best practices for Python async programming",
|
| 439 |
-
"List the files in the current directory",
|
| 440 |
-
"Write a simple Python function to calculate fibonacci",
|
| 441 |
-
"How do I use Git to create a new branch?",
|
| 442 |
-
"What's your memory of our conversation?",
|
| 443 |
-
]
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
def create_gradio_app():
|
| 447 |
-
"""Create the Gradio interface."""
|
| 448 |
-
|
| 449 |
-
with gr.Blocks(
|
| 450 |
-
title="Stack 2.9 - Pattern-Based AI Coding Assistant",
|
| 451 |
-
theme=gr.themes.Soft(
|
| 452 |
-
primary_color="#6366f1",
|
| 453 |
-
secondary_color="#818cf8",
|
| 454 |
-
tertiary_color="#a5b4fc"
|
| 455 |
-
)
|
| 456 |
-
) as app:
|
| 457 |
-
|
| 458 |
-
# Header
|
| 459 |
-
gr.Markdown("""
|
| 460 |
-
# 🚀 Stack 2.9 - Pattern-Based AI Coding Assistant
|
| 461 |
-
|
| 462 |
-
Powered by **Qwen2.5-Coder-7B** with 4-bit quantization
|
| 463 |
-
|
| 464 |
-
---
|
| 465 |
-
""")
|
| 466 |
-
|
| 467 |
-
# Memory stats display
|
| 468 |
-
with gr.Row():
|
| 469 |
-
with gr.Column(scale=1):
|
| 470 |
-
stats_display = gr.Markdown(
|
| 471 |
-
"📊 **Memory Stats**\n\n- Interactions: 0\n- Tools learned: 0\n- Code patterns: 0",
|
| 472 |
-
elem_id="stats"
|
| 473 |
-
)
|
| 474 |
-
with gr.Column(scale=3):
|
| 475 |
-
pass # Spacer
|
| 476 |
-
|
| 477 |
-
# Chat interface
|
| 478 |
-
chatbot = gr.Chatbot(
|
| 479 |
-
height=500,
|
| 480 |
-
show_copy_button=True,
|
| 481 |
-
bubble_full_width=False
|
| 482 |
-
)
|
| 483 |
-
|
| 484 |
-
with gr.Row():
|
| 485 |
-
msg = gr.Textbox(
|
| 486 |
-
label="Message",
|
| 487 |
-
placeholder="Ask me anything...",
|
| 488 |
-
scale=4,
|
| 489 |
-
lines=3
|
| 490 |
-
)
|
| 491 |
-
submit_btn = gr.Button("Send", variant="primary", scale=1)
|
| 492 |
-
|
| 493 |
-
# Clear button
|
| 494 |
-
with gr.Row():
|
| 495 |
-
clear_btn = gr.Button("🗑️ Clear Chat")
|
| 496 |
-
|
| 497 |
-
# Example prompts
|
| 498 |
-
gr.Examples(
|
| 499 |
-
examples=EXAMPLE_PROMPTS,
|
| 500 |
-
inputs=msg,
|
| 501 |
-
label="Example Prompts"
|
| 502 |
-
)
|
| 503 |
-
|
| 504 |
-
# Memory visualization
|
| 505 |
-
with gr.Accordion("🧠 Self-Evolution Memory", open=False):
|
| 506 |
-
memory_display = gr.Textbox(
|
| 507 |
-
label="Memory Content",
|
| 508 |
-
lines=10,
|
| 509 |
-
interactive=False
|
| 510 |
-
)
|
| 511 |
-
|
| 512 |
-
# Functions
|
| 513 |
-
def respond(message, history):
|
| 514 |
-
response = chat_response(message, history)
|
| 515 |
-
history.append([message, response])
|
| 516 |
-
return "", history
|
| 517 |
-
|
| 518 |
-
def update_stats():
|
| 519 |
-
stats = memory.get_stats()
|
| 520 |
-
return f"""📊 **Memory Stats**
|
| 521 |
-
|
| 522 |
-
- **Interactions**: {stats['total_interactions']}
|
| 523 |
-
- **Tool Patterns**: {stats['tool_patterns']}
|
| 524 |
-
- **Code Snippets**: {stats['code_snippets']}
|
| 525 |
-
|
| 526 |
-
**Recent Tools**: {', '.join(stats['recent_tools']) if stats['recent_tools'] else 'None'}"""
|
| 527 |
-
|
| 528 |
-
def update_memory():
|
| 529 |
-
return memory.get_context()
|
| 530 |
-
|
| 531 |
-
# Button click handlers
|
| 532 |
-
submit_btn.click(respond, [msg, chatbot], [msg, chatbot], api_name="send")
|
| 533 |
-
msg.submit(respond, [msg, chatbot], [msg, chatbot], api_name="send")
|
| 534 |
-
|
| 535 |
-
def clear_chat():
|
| 536 |
-
return [], ""
|
| 537 |
-
|
| 538 |
-
clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])
|
| 539 |
-
|
| 540 |
-
# Update stats periodically
|
| 541 |
-
chatbot.change(update_stats, outputs=[stats_display])
|
| 542 |
-
chatbot.change(update_memory, outputs=[memory_display])
|
| 543 |
-
|
| 544 |
-
# Footer
|
| 545 |
-
gr.Markdown("""
|
| 546 |
-
---
|
| 547 |
-
### About Stack 2.9
|
| 548 |
-
|
| 549 |
-
Stack 2.9 is a pattern-based AI coding assistant that:
|
| 550 |
-
- 🔍 Uses **Qwen2.5-Coder-7B** (4-bit, ~4GB VRAM)
|
| 551 |
-
- 🛠️ Integrates **7 tools** (file, git, web, search, shell)
|
| 552 |
-
- 🧠 Remembers interactions and learns patterns
|
| 553 |
-
- ⚡ Provides fast, streaming responses
|
| 554 |
-
|
| 555 |
-
Deployed on **HuggingFace Spaces** with Gradio
|
| 556 |
-
""")
|
| 557 |
-
|
| 558 |
-
return app
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
# ============================================================
|
| 562 |
-
# Main Entry Point
|
| 563 |
-
# ============================================================
|
| 564 |
-
|
| 565 |
-
if __name__ == "__main__":
|
| 566 |
-
import argparse
|
| 567 |
-
|
| 568 |
-
parser = argparse.ArgumentParser(description="Stack 2.9 - HuggingFace Spaces Demo")
|
| 569 |
-
parser.add_argument("--share", action="store_true", help="Create a public share link")
|
| 570 |
-
parser.add_argument("--port", type=int, default=7860, help="Port to run on")
|
| 571 |
-
parser.add_argument("--model", type=str, default="Qwen/Qwen2.5-Coder-7B-Instruct", help="Model ID")
|
| 572 |
-
args = parser.parse_args()
|
| 573 |
-
|
| 574 |
-
print("=" * 50)
|
| 575 |
-
print("🚀 Stack 2.9 - Pattern-Based AI Coding Assistant")
|
| 576 |
-
print("=" * 50)
|
| 577 |
-
print(f"Model: {args.model}")
|
| 578 |
-
print("Loading model...")
|
| 579 |
-
|
| 580 |
-
# Initialize model in a thread
|
| 581 |
-
import threading
|
| 582 |
-
|
| 583 |
-
def load_model_thread():
|
| 584 |
-
global model
|
| 585 |
-
model = initialize_model()
|
| 586 |
-
|
| 587 |
-
loader_thread = threading.Thread(target=load_model_thread)
|
| 588 |
-
loader_thread.start()
|
| 589 |
-
|
| 590 |
-
# Create and launch app
|
| 591 |
-
app = create_gradio_app()
|
| 592 |
-
|
| 593 |
-
print(f"\n🚀 Launching Gradio on port {args.port}...")
|
| 594 |
-
print("📝 Note: Model loads in background. Chat will work once loaded.\n")
|
| 595 |
-
|
| 596 |
-
app.launch(
|
| 597 |
-
server_name="0.0.0.0",
|
| 598 |
-
server_port=args.port,
|
| 599 |
-
share=args.share
|
| 600 |
-
)
|
space/requirements.txt
DELETED
@@ -1,24 +0,0 @@
# Stack 2.9 - HuggingFace Spaces Demo
# Requirements for Gradio interface with Qwen2.5-Coder-7B

# Core Gradio
gradio>=4.0.0

# Transformers and model loading
transformers>=4.36.0
torch>=2.0.0

# Model optimization
accelerate>=0.24.0
bitsandbytes>=0.41.0

# Additional utilities
huggingface-hub>=0.19.0
safetensors>=0.4.0

# Optional: For better web search
# brave-search>=0.1.0

# Optional: For web fetching
# beautifulsoup4>=4.12.0
# lxml>=4.9.0
{stack-2.9-cli → src/cli}/__init__.py
RENAMED
File without changes
{stack_cli → src/cli}/agent.py
RENAMED
File without changes
{stack_cli → src/cli}/cli.py
RENAMED
File without changes
{stack_cli → src/cli}/context.py
RENAMED
File without changes
{stack-2.9-cli → src/cli}/main.py
RENAMED
File without changes
{stack_cli → src/cli}/pyproject.toml
RENAMED
File without changes
{stack_cli → src/cli}/tools.py
RENAMED
File without changes
stack-2.9-deploy/Dockerfile
CHANGED
@@ -1,107 +1,37 @@
-ARG VLLM_VERSION=0.6.3
-ARG CUDA_VERSION=12.1.0
-ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
-
-# Stage 1: Builder
-FROM ${BASE_IMAGE} AS builder
-
-ARG PYTHON_VERSION
-ARG VLLM_VERSION
-
-# Set environment variables
-ENV DEBIAN_FRONTEND=noninteractive \
-    TZ=UTC \
-    PYTHONUNBUFFERED=1 \
-    PIP_NO_CACHE_DIR=1
-
-# Install system dependencies and Python
-RUN apt-get update && apt-get install -y \
-    python${PYTHON_VERSION} \
-    python${PYTHON_VERSION}-dev \
-    python3-pip \
-    git \
-    curl \
-    wget \
-    build-essential \
-    cmake \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install PyTorch with CUDA 12.1 support
-RUN pip3 install --upgrade pip setuptools wheel
-RUN pip3 install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121
-
-# Install vLLM
-RUN pip3 install vllm==${VLLM_VERSION} "vllm[attention]"
-
-# Install additional dependencies
-RUN pip3 install \
-    fastapi==0.111.0 \
-    uvicorn[standard]==0.30.1 \
-    transformers==4.41.2 \
-    accelerate==0.30.1 \
-    huggingface-hub==0.23.0 \
-    sentencepiece==0.2.0 \
-    protobuf==3.20.3
-
-# Stage 2: Runtime
-FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
-
-ARG PYTHON_VERSION
-ARG VLLM_VERSION
-
-# Set environment variables
-ENV
-    PIP_NO_CACHE_DIR=1 \
-    NVIDIA_VISIBLE_DEVICES=all \
-    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
-    VLLM_USE_MODELSCOPE=false \
-    HF_HUB_DISABLE_TELEMETRY=1 \
-    HF_HUB_ENABLE_HF_TRANSFER=1
-
-RUN apt-get update && apt-get install -y \
-    python${PYTHON_VERSION} \
-    python${PYTHON_VERSION}-dev \
-    python3-pip \
-    git \
-    curl \
-    wget \
-    libgomp1 \
-    && rm -rf /var/lib/apt/lists/*
-
-COPY --from=builder /usr/local/bin /usr/local/bin
-
-# Create non-root user
-RUN groupadd -r vllm && useradd -r -g vllm -d /home/vllm -m vllm
-
-# Set working directory
-WORKDIR /app
-
-COPY --chown=vllm:vllm requirements.txt .
-COPY --chown=vllm:vllm config.yaml .
-
-CMD curl -f http://localhost:8000/health || exit 1
+# Stack 2.9 HuggingFace Spaces Dockerfile
+# Optimized for 16GB GPU with 4-bit quantization
+
+FROM python:3.10-slim
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+ENV TRANSFORMERS_CACHE=/workspace/.cache/huggingface
+ENV HF_HOME=/workspace/.cache/huggingface
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    git \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create workspace directory
+WORKDIR /workspace
+
+# Copy requirements first for better caching
+COPY requirements.txt .
+
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy application files
+COPY . .
+
+# Expose Gradio port
+EXPOSE 7860
+
+# Create startup script
+RUN echo '#!/bin/bash\necho "🚀 Starting Stack 2.9..."\npython app.py --port 7860 --share' > /start.sh
+RUN chmod +x /start.sh
+
+# Launch command
+CMD ["/start.sh"]
stack-2.9-deploy/README.md
CHANGED
@@ -1,346 +1,124 @@
-# Stack 2.9
-
-- For local deployment: **Docker** + **NVIDIA GPU** (optional but recommended)
-- For cloud: **runpodctl** or **vastai** CLI installed
-- **chmod +x** may be required on shell scripts
-
-- CPU-only mode is possible but extremely slow (not recommended for production)
-- AWQ/GPTQ quantization reduces VRAM requirements by ~50%
-- Multi-GPU (tensor parallelism) supported via `TENSOR_PARALLEL_SIZE`
-
-## 🧪 Validate Setup
-
-Before deploying, run the validation script to ensure everything is ready:
-
-```bash
-./validate.sh
-```
-This checks Docker, GPU, and all required files.
-
-## 🚀 Quick Start
-
-### Local Deployment (Docker Compose)
-
-```bash
-# Ensure deploy.sh is executable
-chmod +x deploy.sh validate.sh
-
-# Deploy
-./deploy.sh local --model TheBloke/Llama-2-7B-Chat-AWQ
-```
-
-The server will start at `http://localhost:8000`
-
-### Cloud Deployments
-
-```bash
-# RunPod
-./deploy.sh runpod --gpu A100-40GB
-
-# Vast.ai
-./deploy.sh vastai
-
-# Kubernetes
-./deploy.sh kubernetes --namespace inference
-```
-
----
-
-## 📦 What's Included
-
-```
-stack-2.9-deploy/
-├── Dockerfile            # Multi-stage production image
-├── docker-compose.yaml   # Local orchestration
-├── deploy.sh             # One-command deployment script
-├── runpod-template.json  # RunPod.io template
-├── vastai-template.json  # Vast.ai template
-├── kubernetes/           # K8s manifests
-│   ├── deployment.yaml   # GPU-enabled deployment
-│   ├── service.yaml      # LoadBalancer service
-│   ├── pvc.yaml          # Model cache volume
-│   ├── hpa.yaml          # Autoscaling configuration
-│   └── secrets.yaml      # Secrets template
-├── app.py                # vLLM server wrapper
-└── README.md             # This file
-```
-
----
-
-## 🐳 Docker Image
-
-**Base:** `nvidia/cuda:12.1-runtime-ubuntu22.04`
-**Python:** 3.10
-**vLLM:** 0.6.3
-**CUDA:** 12.1
-
-### Features:
-- Multi-stage build for minimal footprint
-- Non-root user (`vllm`)
-- Health checks
-- CUDA 12.1 runtime
-- Model cache persistence
-- AWQ 4-bit quantization support
-
----
-
-## 🔧 Environment Variables
-
-| Variable | Default | Description |
-|----------|---------|-------------|
-| `MODEL_ID` | `TheBloke/Llama-2-7B-Chat-AWQ` | Hugging Face model ID |
-| `HUGGING_FACE_TOKEN` | (empty) | HF token for gated models |
-| `QUANTIZATION` | `awq` | Quantization method |
-| `TENSOR_PARALLEL_SIZE` | `1` | Number of GPUs |
-| `GPU_MEMORY_UTILIZATION` | `0.9` | GPU memory fraction |
-| `MAX_MODEL_LEN` | `4096` | Max sequence length |
-| `MAX_NUM_SEQS` | `64` | Max batch size |
-| `PORT` | `8000` | Server port |
-
----
-
-## 🌐 API Endpoints
-
-Stack 2.9 provides OpenAI-compatible endpoints:
-
-- `POST /v1/completions` - Text completion
-- `POST /v1/chat/completions` - Chat completion
-- `GET /health` - Health check
-- `GET /metrics` - Prometheus metrics
-- `GET /docs` - Interactive API docs
-
-### Example Usage
-
-```bash
-# Chat completion
-curl http://localhost:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "stack-2.9",
-    "messages": [{"role": "user", "content": "Hello!"}],
-    "max_tokens": 100
-  }'
-```
-
----
-
-## ☁️ Platform-Specific Notes
-
-### Local (Docker Compose)
-
-```bash
-# Build and start
-./deploy.sh local --model <model-id>
-
-# View logs
-docker-compose logs -f stack-2.9
-
-# Stop
-docker-compose down
-```
-
-**Requirements:**
-- Docker 20.10+
-- Docker Compose v2
-- NVIDIA GPU (recommended) with CUDA 12.x drivers
-
----
-
-### RunPod
-
-1. Authenticate: `runpodctl login`
-2. Run: `./deploy.sh runpod --gpu A100-40GB`
-3. Provide your Docker registry
-4. Deploy from the created template on RunPod.io
-
-**Recommended GPUs:**
-- A100 40GB (default)
-- A100 80GB
-- H100 80GB
-
-**Auto-sleep:** Enabled after 30 minutes of inactivity
-
----
-
-### Vast.ai
-
-1. Install vastai CLI
-2. Run: `./deploy.sh vastai`
-3. Provide your Docker registry
-4. Launch via template or CLI
-
-**Recommended Instances:**
-- RTX 4090 (24GB) - $0.30-0.50/hr
-- RTX 6000 Ada (48GB) - $0.80-1.20/hr
-- A100 40GB - $0.90-1.50/hr
-
-**SSH Access:** Available on forwarded port 2222
-
----
-
-### Kubernetes
-
-#### Prerequisites:
-- kubectl configured
-- GPU-enabled cluster (NVIDIA GPUs with device plugin)
-- Storage class with ReadWriteMany capability
-
-#### Deployment:
-
-```bash
-  --from-literal=huggingface-token='YOUR_TOKEN' \
-  -n stack-2.9
-
-# Or manually:
-kubectl apply -f kubernetes/
-```
-
-```bash
-kubectl get pods,svc,pvc,hpa -n stack-2.9
-kubectl logs -f deployment/stack-2.9 -n stack-2-9
-```
-
-```bash
-./deploy.sh local --model mistralai/Mistral-7B-Instruct-v0.2
-```
-
-Edit `docker-compose.yaml` or K8s deployment:
-
-```yaml
-resources:
-  limits:
-    nvidia.com/gpu: 2  # Multi-GPU
-  requests:
-    memory: "24Gi"
-    cpu: "8"
-```
-
-  -H "Content-Type: application/json" \
-  -d '{"prompt": "Once upon a time", "max_tokens": 50}'
-```
-
-## 🐛 Troubleshooting
-
-### GPU not detected
-```bash
-# Check NVIDIA drivers
-nvidia-smi
-
-# Ensure NVIDIA Container Toolkit
-docker info | grep -i runtime
-```
-
-### Out of memory
-Reduce `GPU_MEMORY_UTILIZATION` to `0.7` or `0.8`
-
-### Slow first request
-First request downloads/loads the model (~5-10 min for 7B). This is cached for subsequent requests.
-
-```bash
-docker stats stack-2.9-server
-```
-
----
-
-- Runs as non-root user (`vllm`)
-- Dropped capabilities
-- Read-only filesystem (except cache)
-- Health checks for liveness/readiness
-- Secrets via Kubernetes secrets or env file
-
----
-
-Issues: Report to Stack 2.9 repository
+# 🚀 Stack 2.9 - Pattern-Based AI Coding Assistant
+
+A HuggingFace Spaces demo for Stack 2.9, a pattern-based AI coding assistant powered by Qwen2.5-Coder-7B.
+
+## ✨ Features
+
+- **🤖 Qwen2.5-Coder-7B** - State-of-the-art code generation model
+- **🔧 7 Integrated Tools** - File operations, git, web search, shell commands
+- **🧠 Pattern Memory** - Learns from each interaction
+- **⚡ Fast Streaming** - Real-time token-by-token generation
+- **💾 4-bit Quantization** - Runs on 16GB GPU (~4GB VRAM)
+
+## 🔧 Available Tools
+
+| Tool | Description |
+|------|-------------|
+| `file_read` | Read files from the filesystem |
+| `file_write` | Write content to files |
+| `git_status` | Check git repository status |
+| `web_search` | Search the web for information |
+| `run_command` | Execute shell commands |
+| `create_directory` | Create new directories |
+| `list_directory` | List directory contents |
+
+## 🏃♂️ Quick Start
+
+### Local Development
+
+```bash
+# Clone the repository
+git clone https://github.com/your-repo/stack-2.9.git
+cd stack-2.9/space
+
+# Install dependencies
+pip install -r requirements.txt
+
+# Run the demo
+python app.py --share
+```
+
+### HuggingFace Spaces
+
+1. Create a new Space on [HuggingFace](https://huggingface.co/spaces)
+2. Select "Gradio" as the SDK
+3. Upload the files from this directory:
+   - `app.py`
+   - `requirements.txt`
+   - `README.md`
+4. The model will load automatically on startup
+
+## 💻 Usage
+
+### Example Prompts
+
+```
+Hello! What can you help me with?
+Check git status of this repository
+Search for best practices for Python async programming
+List the files in the current directory
+Write a simple Python function to calculate fibonacci
+How do I use Git to create a new branch?
+What's your memory of our conversation?
+```
+
+### Python API
+
+```python
+from app import StackModel, memory
+
+# Initialize model
+model = StackModel()
+model.load()
+
+# Generate response
+response = model.generate("Write a hello world in Python")
+print(response)
+
+# Check memory stats
+print(memory.get_stats())
+```
+
+## 🔐 Environment Variables
+
+- `HF_TOKEN` - Your HuggingFace token for private models (optional)
+- `MODEL_ID` - Override default model (default: Qwen/Qwen2.5-Coder-7B-Instruct)
+
+## 📊 Memory System
+
+Stack 2.9 includes a pattern memory system that:
+
+1. **Tracks Interactions** - Records every user-assistant exchange
+2. **Learns Patterns** - Identifies frequently used tools
+3. **Stores Code** - Saves useful code snippets
+4. **Adapts Behavior** - Uses learned context to improve responses
+
+## 🛠️ Tech Stack
+
+- **Model**: Qwen2.5-Coder-7B-Instruct
+- **Quantization**: 4-bit (bitsandbytes)
+- **Framework**: Gradio 4.0+
+- **Backend**: Transformers + Accelerate
+- **GPU**: 16GB VRAM recommended
+
+## 📝 License
+
+MIT License - see LICENSE file for details.
+
+## 🙏 Acknowledgments
+
+- [Qwen](https://github.com/QwenLM/Qwen) - Base model
+- [HuggingFace](https://huggingface.co/) - Spaces hosting
+- [Gradio](https://gradio.app/) - UI framework
+
+---
+
+<div align="center">
+
+Made with ❤️ by Stack 2.9
+
+</div>
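The memory flow the new README describes maps directly onto the `SelfEvolutionMemory` API added in `stack-2.9-deploy/app.py` below. The following is a minimal sketch, assuming that module is importable as `app` per the README's Python API section; the prompt text and tool name are illustrative, not taken from the repository.

```python
# Sketch of the pattern-memory flow (assumes `from app import memory` works
# as documented in the README; the example exchange below is hypothetical).
from app import memory

# Record one user/assistant exchange, tagging the tools that were used.
memory.add_interaction(
    user_input="List the files in the current directory",
    assistant_response="Here is the listing...",
    tools_used=["list_directory"],
)

# get_context() is what gets prepended to the model's system prompt.
print(memory.get_context())

# get_stats() feeds the "Memory Stats" panel in the Gradio UI.
print(memory.get_stats())
```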
stack-2.9-deploy/app.py
CHANGED
|
@@ -1,276 +1,600 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
Stack 2.9
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
import os
|
| 8 |
-
import sys
|
| 9 |
import json
|
| 10 |
-
import
|
| 11 |
-
from
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
)
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
try:
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
tokenizer=model_id,
|
| 69 |
-
tensor_parallel_size=int(os.getenv("TENSOR_PARALLEL_SIZE", 1)),
|
| 70 |
-
gpu_memory_utilization=float(os.getenv("GPU_MEMORY_UTILIZATION", 0.9)),
|
| 71 |
-
max_model_len=int(os.getenv("MAX_MODEL_LEN", 4096)),
|
| 72 |
-
max_num_seqs=int(os.getenv("MAX_NUM_SEQS", 64)),
|
| 73 |
-
max_num_batched_tokens=int(os.getenv("MAX_NUM_BATCHED_TOKENS", 4096)),
|
| 74 |
-
disable_log_stats=os.getenv("DISABLE_LOG_STATS", "false").lower() == "true",
|
| 75 |
-
enforce_eager=os.getenv("ENFORCE_EAGER", "false").lower() == "true",
|
| 76 |
-
quantization=os.getenv("QUANTIZATION", "awq"),
|
| 77 |
-
download_dir=os.getenv("MODEL_CACHE_DIR", "/home/vllm/.cache/huggingface"),
|
| 78 |
-
)
|
| 79 |
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
-
llm_instance = LLM.from_engine_args(engine_args)
|
| 85 |
-
logger.info("Model initialized successfully")
|
| 86 |
-
return True
|
| 87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
except Exception as e:
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
@app.get("/metrics")
|
| 100 |
-
async def metrics():
|
| 101 |
-
"""Prometheus-style metrics endpoint"""
|
| 102 |
-
if llm_instance is None:
|
| 103 |
-
return JSONResponse(status_code=503, content={"error": "Model not initialized"})
|
| 104 |
-
|
| 105 |
-
# Basic metrics - can be extended
|
| 106 |
-
metrics_data = {
|
| 107 |
-
"model": get_model_id(),
|
| 108 |
-
"status": "ready",
|
| 109 |
-
"gpu_utilization": "N/A" # Would need nvml for actual values
|
| 110 |
-
}
|
| 111 |
-
return JSONResponse(content=metrics_data)
|
| 112 |
-
|
| 113 |
-
@app.post("/v1/completions")
|
| 114 |
-
async def completions(request: Request):
|
| 115 |
-
"""OpenAI-compatible completions endpoint"""
|
| 116 |
-
if llm_instance is None:
|
| 117 |
-
raise HTTPException(status_code=503, detail="Model not initialized")
|
| 118 |
|
|
|
|
|
|
|
|
|
|
| 119 |
try:
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
top_p = float(body.get("top_p", 1.0))
|
| 125 |
-
stream = body.get("stream", False)
|
| 126 |
-
|
| 127 |
-
if not prompt:
|
| 128 |
-
raise HTTPException(status_code=400, detail="Prompt is required")
|
| 129 |
-
|
| 130 |
-
sampling_params = SamplingParams(
|
| 131 |
-
max_tokens=max_tokens,
|
| 132 |
-
temperature=temperature,
|
| 133 |
-
top_p=top_p
|
| 134 |
-
)
|
| 135 |
|
| 136 |
-
if stream:
|
| 137 |
-
# Streaming response
|
| 138 |
-
async def generate():
|
| 139 |
-
try:
|
| 140 |
-
outputs = llm_instance.generate(prompt, sampling_params, stream=True)
|
| 141 |
-
async for output in outputs:
|
| 142 |
-
chunk = output.outputs[0].text
|
| 143 |
-
yield f"data: {json.dumps({'text': chunk, 'finished': False})}\n\n"
|
| 144 |
-
yield f"data: {json.dumps({'text': '', 'finished': True})}\n\n"
|
| 145 |
-
except Exception as e:
|
| 146 |
-
logger.error(f"Streaming error: {e}")
|
| 147 |
-
yield f"data: {json.dumps({'error': str(e)})}\n\n"
|
| 148 |
-
|
| 149 |
-
return StreamingResponse(generate(), media_type="text/event-stream")
|
| 150 |
-
else:
|
| 151 |
-
# Non-streaming
|
| 152 |
-
outputs = llm_instance.generate(prompt, sampling_params)
|
| 153 |
-
generated_text = outputs[0].outputs[0].text
|
| 154 |
-
|
| 155 |
-
return JSONResponse(content={
|
| 156 |
-
"id": "cmpl-" + os.urandom(12).hex(),
|
| 157 |
-
"object": "text_completion",
|
| 158 |
-
"created": int(os.path.getmtime(__file__)),
|
| 159 |
-
"model": get_model_id(),
|
| 160 |
-
"choices": [{
|
| 161 |
-
"text": generated_text,
|
| 162 |
-
"index": 0,
|
| 163 |
-
"logprobs": None,
|
| 164 |
-
"finish_reason": "stop"
|
| 165 |
-
}],
|
| 166 |
-
"usage": {
|
| 167 |
-
"prompt_tokens": len(prompt.split()),
|
| 168 |
-
"completion_tokens": len(generated_text.split()),
|
| 169 |
-
"total_tokens": len(prompt.split()) + len(generated_text.split())
|
| 170 |
-
}
|
| 171 |
-
})
|
| 172 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
except Exception as e:
|
| 174 |
-
|
| 175 |
-
raise HTTPException(status_code=500, detail=str(e))
|
| 176 |
|
| 177 |
-
@app.post("/v1/chat/completions")
|
| 178 |
-
async def chat_completions(request: Request):
|
| 179 |
-
"""OpenAI-compatible chat completions endpoint"""
|
| 180 |
-
if llm_instance is None:
|
| 181 |
-
raise HTTPException(status_code=503, detail="Model not initialized")
|
| 182 |
|
|
|
|
|
|
|
| 183 |
try:
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
temperature=temperature,
|
| 212 |
-
|
|
|
|
|
|
|
| 213 |
)
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
except Exception as e:
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
|
| 266 |
if __name__ == "__main__":
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
Stack 2.9 - Pattern-Based AI Coding Assistant
|
| 3 |
+
HuggingFace Spaces Demo
|
| 4 |
+
|
| 5 |
+
A Gradio interface for Stack 2.9 powered by Qwen2.5-Coder-7B
|
| 6 |
+
with tool integration and pattern memory.
|
| 7 |
"""
|
| 8 |
|
| 9 |
import os
|
|
|
|
| 10 |
import json
|
| 11 |
+
import time
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
from typing import List, Dict, Optional
|
| 14 |
+
import gradio as gr
|
| 15 |
+
|
| 16 |
+
# ============================================================
|
| 17 |
+
# Pattern Memory System
|
| 18 |
+
# ============================================================
|
| 19 |
+
|
| 20 |
+
class SelfEvolutionMemory:
|
| 21 |
+
"""Simple in-memory pattern memory system for demo purposes."""
|
| 22 |
+
|
| 23 |
+
def __init__(self):
|
| 24 |
+
self.conversations = []
|
| 25 |
+
self.learned_patterns = {}
|
| 26 |
+
self.code_snippets = []
|
| 27 |
+
self.preferences = {}
|
| 28 |
+
self.interaction_count = 0
|
| 29 |
+
|
| 30 |
+
def add_interaction(self, user_input: str, assistant_response: str, tools_used: List[str] = None):
|
| 31 |
+
"""Record an interaction for learning."""
|
| 32 |
+
self.interaction_count += 1
|
| 33 |
+
interaction = {
|
| 34 |
+
"timestamp": datetime.now().isoformat(),
|
| 35 |
+
"user_input": user_input,
|
| 36 |
+
"assistant_response": assistant_response,
|
| 37 |
+
"tools_used": tools_used or [],
|
| 38 |
+
"interaction_id": self.interaction_count
|
| 39 |
+
}
|
| 40 |
+
self.conversations.append(interaction)
|
| 41 |
+
|
| 42 |
+
# Extract patterns from the interaction
|
| 43 |
+
self._learn_from_interaction(user_input, assistant_response, tools_used or [])
|
| 44 |
+
|
| 45 |
+
def _learn_from_interaction(self, user_input: str, response: str, tools: List[str]):
|
| 46 |
+
"""Learn patterns from interactions."""
|
| 47 |
+
# Track tool usage patterns
|
| 48 |
+
for tool in tools:
|
| 49 |
+
if tool not in self.learned_patterns:
|
| 50 |
+
self.learned_patterns[tool] = {"count": 0, "contexts": []}
|
| 51 |
+
self.learned_patterns[tool]["count"] += 1
|
| 52 |
+
self.learned_patterns[tool]["contexts"].append(user_input[:100])
|
| 53 |
+
|
| 54 |
+
# Extract code snippets if present
|
| 55 |
+
if "```" in response:
|
| 56 |
+
self.code_snippets.append({
|
| 57 |
+
"timestamp": datetime.now().isoformat(),
|
| 58 |
+
"snippet": response
|
| 59 |
+
})
|
| 60 |
+
|
| 61 |
+
def get_context(self) -> str:
|
| 62 |
+
"""Get accumulated context for the model."""
|
| 63 |
+
context_parts = [f"## Pattern Memory ({self.interaction_count} interactions)"]
|
| 64 |
+
|
| 65 |
+
if self.learned_patterns:
|
| 66 |
+
context_parts.append("\n### Tool Usage Patterns:")
|
| 67 |
+
for tool, data in sorted(self.learned_patterns.items(), key=lambda x: x[1]["count"], reverse=True)[:5]:
|
| 68 |
+
context_parts.append(f"- {tool}: used {data['count']} times")
|
| 69 |
+
|
| 70 |
+
if self.code_snippets:
|
| 71 |
+
context_parts.append(f"\n### Learned {len(self.code_snippets)} code patterns")
|
| 72 |
+
|
| 73 |
+
return "\n".join(context_parts)
|
| 74 |
+
|
| 75 |
+
def get_stats(self) -> Dict:
|
| 76 |
+
"""Get memory statistics."""
|
| 77 |
+
return {
|
| 78 |
+
"total_interactions": self.interaction_count,
|
| 79 |
+
"tool_patterns": len(self.learned_patterns),
|
| 80 |
+
"code_snippets": len(self.code_snippets),
|
| 81 |
+
"recent_tools": [t for t in self.learned_patterns.keys()][:5]
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
# Global memory instance
|
| 86 |
+
memory = SelfEvolutionMemory()
|
| 87 |
+
|
| 88 |
+
# ============================================================
|
| 89 |
+
# Tool System
|
| 90 |
+
# ============================================================
|
| 91 |
+
|
| 92 |
+
class Tool:
|
| 93 |
+
"""Base tool class."""
|
| 94 |
+
|
| 95 |
+
def __init__(self, name: str, description: str, func):
|
| 96 |
+
self.name = name
|
| 97 |
+
self.description = description
|
| 98 |
+
self.func = func
|
| 99 |
+
|
| 100 |
+
async def execute(self, *args, **kwargs):
|
| 101 |
+
return await self.func(*args, **kwargs)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# Tool implementations (simplified for demo)
|
| 105 |
+
async def tool_file_read(path: str) -> str:
|
| 106 |
+
"""Read a file."""
|
| 107 |
try:
|
| 108 |
+
with open(path, 'r') as f:
|
| 109 |
+
return f.read()[:5000] # Limit output
|
| 110 |
+
except FileNotFoundError:
|
| 111 |
+
return f"File not found: {path}"
|
| 112 |
+
except Exception as e:
|
| 113 |
+
return f"Error reading file: {str(e)}"
|
| 114 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
+
async def tool_file_write(path: str, content: str) -> str:
|
| 117 |
+
"""Write to a file."""
|
| 118 |
+
try:
|
| 119 |
+
os.makedirs(os.path.dirname(path) if os.path.dirname(path) else ".", exist_ok=True)
|
| 120 |
+
with open(path, 'w') as f:
|
| 121 |
+
f.write(content)
|
| 122 |
+
return f"Successfully wrote to {path}"
|
| 123 |
+
except Exception as e:
|
| 124 |
+
return f"Error writing file: {str(e)}"
|
| 125 |
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
+
async def tool_git_status() -> str:
|
| 128 |
+
"""Get git status."""
|
| 129 |
+
import subprocess
|
| 130 |
+
try:
|
| 131 |
+
result = subprocess.run(["git", "status", "--short"], capture_output=True, text=True, cwd=os.getcwd())
|
| 132 |
+
return result.stdout or "No changes"
|
| 133 |
except Exception as e:
|
| 134 |
+
return f"Git error: {str(e)}"
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
async def tool_web_search(query: str) -> str:
|
| 138 |
+
"""Search the web."""
|
| 139 |
+
from urllib.parse import quote
|
| 140 |
+
# Return a demo response since we can't make actual API calls
|
| 141 |
+
return f"🔍 Search results for '{query}':\n\n1. [Result 1] - Description here\n2. [Result 2] - Description here\n3. [Result 3] - Description here\n\n(Install brave-search to enable real search)"
|
| 142 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
|
| 144 |
+
async def tool_run_command(cmd: str) -> str:
|
| 145 |
+
"""Run a shell command."""
|
| 146 |
+
import subprocess
|
| 147 |
try:
|
| 148 |
+
result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=30)
|
| 149 |
+
return f"Output:\n{result.stdout}\n\nErrors:\n{result.stderr}" if result.stderr else result.stdout
|
| 150 |
+
except Exception as e:
|
| 151 |
+
return f"Command error: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
+
async def tool_create_directory(path: str) -> str:
|
| 155 |
+
"""Create a directory."""
|
| 156 |
+
try:
|
| 157 |
+
os.makedirs(path, exist_ok=True)
|
| 158 |
+
return f"Directory created: {path}"
|
| 159 |
except Exception as e:
|
| 160 |
+
return f"Error: {str(e)}"
|
|
|
|
| 161 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
+
async def tool_list_directory(path: str = ".") -> str:
|
| 164 |
+
"""List directory contents."""
|
| 165 |
try:
|
| 166 |
+
items = os.listdir(path)
|
| 167 |
+
return "\n".join([f"📁 {i}/" if os.path.isdir(os.path.join(path, i)) else f"📄 {i}" for i in items[:50]])
|
| 168 |
+
except Exception as e:
|
| 169 |
+
return f"Error: {str(e)}"
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
# Register tools
|
| 173 |
+
TOOLS = {
|
| 174 |
+
"file_read": Tool("file_read", "Read a file from the filesystem", tool_file_read),
|
| 175 |
+
"file_write": Tool("file_write", "Write content to a file", tool_file_write),
|
| 176 |
+
"git_status": Tool("git_status", "Check git repository status", tool_git_status),
|
| 177 |
+
"web_search": Tool("web_search", "Search the web for information", tool_web_search),
|
| 178 |
+
"run_command": Tool("run_command", "Execute a shell command", tool_run_command),
|
| 179 |
+
"create_directory": Tool("create_directory", "Create a new directory", tool_create_directory),
|
| 180 |
+
"list_directory": Tool("list_directory", "List files in a directory", tool_list_directory),
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def get_tool_descriptions() -> str:
|
| 185 |
+
"""Get descriptions of all available tools."""
|
| 186 |
+
return "\n".join([f"- **{t.name}**: {t.description}" for t in TOOLS.values()])
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
# ============================================================
|
| 190 |
+
# Model Interface
|
| 191 |
+
# ============================================================
|
| 192 |
+
|
| 193 |
+
class StackModel:
|
| 194 |
+
"""Stack 2.9 model interface using transformers."""
|
| 195 |
+
|
| 196 |
+
def __init__(self, model_id: str = "Qwen/Qwen2.5-Coder-7B-Instruct"):
|
| 197 |
+
self.model_id = model_id
|
| 198 |
+
self.model = None
|
| 199 |
+
self.tokenizer = None
|
| 200 |
+
self.pipeline = None
|
| 201 |
+
|
| 202 |
+
def load(self):
|
| 203 |
+
"""Load the model with 4-bit quantization for HF Spaces."""
|
| 204 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
| 205 |
+
import torch
|
| 206 |
+
|
| 207 |
+
print(f"Loading {self.model_id}...")
|
| 208 |
+
|
| 209 |
+
# 4-bit quantization config for 16GB GPU
|
| 210 |
+
bnb_config = BitsAndBytesConfig(
|
| 211 |
+
load_in_4bit=True,
|
| 212 |
+
bnb_4bit_compute_dtype=torch.float16,
|
| 213 |
+
bnb_4bit_use_double_quant=True,
|
| 214 |
+
bnb_4bit_quant_type="nf4"
|
| 215 |
+
)
|
| 216 |
+
|
| 217 |
+
# Load tokenizer
|
| 218 |
+
self.tokenizer = AutoTokenizer.from_pretrained(
|
| 219 |
+
self.model_id,
|
| 220 |
+
trust_remote_code=True
|
| 221 |
+
)
|
| 222 |
+
|
| 223 |
+
# Load model with quantization
|
| 224 |
+
self.model = AutoModelForCausalLM.from_pretrained(
|
| 225 |
+
self.model_id,
|
| 226 |
+
quantization_config=bnb_config,
|
| 227 |
+
device_map="auto",
|
| 228 |
+
trust_remote_code=True
|
| 229 |
+
)
|
| 230 |
+
|
| 231 |
+
print("Model loaded successfully!")
|
| 232 |
+
|
+    def generate(self, prompt: str, max_tokens: int = 512, temperature: float = 0.7) -> str:
+        """Generate a response."""
+        if not self.tokenizer:
+            return "Model not loaded. Please wait for initialization."
+
+        # Build the prompt with system and tools
+        system_prompt = f"""You are Stack 2.9 - a pattern-based AI coding assistant.
+
+## Available Tools
+{get_tool_descriptions()}
+
+## Your Capabilities
+- Write, read, and execute code
+- Use git for version control
+- Search the web for information
+- Create and manage files
+- Execute shell commands
+
+## Self-Evolution
+You learn from each interaction. After responding, summarize what tools you used.
+
+{memory.get_context()}
+
+## Instructions
+1. Be helpful and concise
+2. Use tools when needed
+3. Learn from the conversation
+4. Provide code examples when relevant
+
+Now respond to the user:"""
+
+        full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nAssistant:"
+
+        # Tokenize
+        inputs = self.tokenizer(full_prompt, return_tensors="pt").to(self.model.device)
+
+        # Generate
+        outputs = self.model.generate(
+            **inputs,
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            do_sample=True,
+            top_p=0.9,
+            repetition_penalty=1.1
+        )
+
+        # Decode
+        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+        # Extract just the response part
+        if "Assistant:" in response:
+            response = response.split("Assistant:")[-1].strip()
+
+        return response
+
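generate() assembles the prompt with plain string concatenation and a "User:/Assistant:" suffix. Instruct checkpoints such as Qwen2.5-Coder-7B-Instruct ship a chat template, so an alternative sketch (assuming the tokenizer and model objects created in StackModel.load(); this is not the code in the commit) would be:

# Sketch: same system/user turns, formatted with the tokenizer's chat template.
# Assumes `tokenizer` and `model` are the objects created in StackModel.load().
messages = [
    {"role": "system", "content": "You are Stack 2.9 - a pattern-based AI coding assistant."},
    {"role": "user", "content": "Write a function that reverses a string."},
]
prompt_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,  # append the assistant header so generation starts cleanly
    return_tensors="pt",
).to(model.device)
output_ids = model.generate(prompt_ids, max_new_tokens=256, do_sample=True, temperature=0.7)
reply = tokenizer.decode(output_ids[0][prompt_ids.shape[-1]:], skip_special_tokens=True)
print(reply)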
+    def generate_streaming(self, prompt: str, max_tokens: int = 512):
+        """Generate with streaming (yields tokens)."""
+        if not self.tokenizer:
+            yield "Model not loaded. Please wait for initialization."
+            return
+
+        system_prompt = f"""You are Stack 2.9 - a pattern-based AI coding assistant.
+
+## Available Tools
+{get_tool_descriptions()}
+
+## Self-Evolution Memory
+{memory.get_context()}
+
+Now respond to the user:"""
+
+        full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nAssistant:"
+
+        inputs = self.tokenizer(full_prompt, return_tensors="pt").to(self.model.device)
+
+        # Generate token by token
+        import torch  # torch is only imported inside load() above, so import it here as well
+
+        generated_ids = inputs.input_ids
+
+        for _ in range(max_tokens):
+            with torch.no_grad():
+                outputs = self.model(generated_ids)
+                next_token_logits = outputs.logits[:, -1, :]
+
+            # Apply temperature
+            next_token_logits = next_token_logits / 0.7
+
+            # Sample
+            probs = torch.softmax(next_token_logits, dim=-1)
+            next_token = torch.multinomial(probs, num_samples=1)
+
+            generated_ids = torch.cat([generated_ids, next_token], dim=-1)
+
+            # Decode and yield
+            token_str = self.tokenizer.decode(next_token[0], skip_special_tokens=True)
+            yield token_str
+
+            # Stop on EOS
+            if next_token.item() == self.tokenizer.eos_token_id:
+                break
+
+
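The hand-rolled loop above re-runs the model on the full sequence every step and does not reuse a KV cache. A sketch of the same idea using transformers' built-in TextIteratorStreamer (stream_reply is a hypothetical helper, not part of the commit; it assumes the model and tokenizer from StackModel.load()):

from threading import Thread
from transformers import TextIteratorStreamer

def stream_reply(model, tokenizer, prompt: str, max_new_tokens: int = 256):
    """Yield decoded text chunks as they are generated (hypothetical helper)."""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(**inputs, max_new_tokens=max_new_tokens,
                           do_sample=True, temperature=0.7, streamer=streamer)
    Thread(target=model.generate, kwargs=generate_kwargs).start()  # run generation in the background
    for chunk in streamer:  # the streamer yields text as soon as tokens are decoded
        yield chunk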
+# Global model instance
+model = None
+
+
+def initialize_model():
+    """Initialize the model on startup."""
+    global model
+    try:
+        model = StackModel()
+        model.load()
+        return model
+    except Exception as e:
+        print(f"Failed to load model: {e}")
+        return None
+
+
+# ============================================================
+# Gradio Interface
+# ============================================================
+
+def format_tools_used(tools_used: List[str]) -> str:
+    """Format the tools used for display."""
+    if not tools_used:
+        return ""
+    return f"\n\n🔧 **Tools Used**: {', '.join(tools_used)}"
+
+
+def chat_response(message: str, history: List[List[str]]) -> str:
+    """Process a chat message and return the response text."""
+    global model, memory
+
+    if model is None or model.model is None:
+        return "⏳ Model is loading. Please wait..."
+
+    # Track tools used
+    tools_used = []
+
+    # Check if we need to use tools based on the message
+    message_lower = message.lower()
+
+    if any(kw in message_lower for kw in ['git status', 'git']):
+        tools_used.append("git_status")
+    if any(kw in message_lower for kw in ['search', 'find', 'look up']):
+        tools_used.append("web_search")
+    if any(kw in message_lower for kw in ['list files', 'directory', 'ls']):
+        tools_used.append("list_directory")
+    if any(kw in message_lower for kw in ['run ', 'execute', 'command']):
+        tools_used.append("run_command")
+
+    # Generate response
+    try:
+        response = model.generate(message, max_tokens=512)
+    except Exception as e:
+        response = f"I encountered an error: {str(e)}"
+
+    # Add tools used to response
+    response += format_tools_used(tools_used)
+
+    # Record in memory
+    memory.add_interaction(message, response, tools_used)
+
+    return response
+
+
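chat_response() and chat_response_stream() (below) repeat the same keyword checks. A sketch of keeping that heuristic in one table (KEYWORD_TOOLS and detect_tools are illustrative names; the keyword lists mirror the ones above):

# Sketch: single source of truth for the keyword heuristic used in both handlers.
KEYWORD_TOOLS = {
    "git_status": ["git status", "git"],
    "web_search": ["search", "find", "look up"],
    "list_directory": ["list files", "directory", "ls"],
    "run_command": ["run ", "execute", "command"],
}

def detect_tools(message: str) -> list:
    """Return the tool names whose trigger keywords appear in the message."""
    text = message.lower()
    return [tool for tool, keywords in KEYWORD_TOOLS.items()
            if any(kw in text for kw in keywords)]

print(detect_tools("Can you check git status and list files here?"))
# expected: ['git_status', 'list_directory']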
+def chat_response_stream(message: str, history: List[List[str]]) -> Generator:
+    """Process a chat message with streaming."""
+    global model, memory
+
+    if model is None or model.model is None:
+        yield "⏳ Model is loading. Please wait..."
+        return
+
+    full_response = ""
+    tools_used = []
+
+    message_lower = message.lower()
+    if any(kw in message_lower for kw in ['git status', 'git']):
+        tools_used.append("git_status")
+    if any(kw in message_lower for kw in ['search', 'find']):
+        tools_used.append("web_search")
+    if any(kw in message_lower for kw in ['list', 'directory']):
+        tools_used.append("list_directory")
+
+    # Stream the response
+    for token in model.generate_streaming(message, max_tokens=256):
+        full_response += token
+        yield full_response
+
+    # Add tools used
+    if tools_used:
+        full_response += format_tools_used(tools_used)
+        yield full_response
+
+    # Record in memory
+    memory.add_interaction(message, full_response, tools_used)
+
+
+# Example prompts for the UI
+EXAMPLE_PROMPTS = [
+    "Hello! What can you help me with?",
+    "Check git status of this repository",
+    "Search for best practices for Python async programming",
+    "List the files in the current directory",
+    "Write a simple Python function to calculate fibonacci",
+    "How do I use Git to create a new branch?",
+    "What's your memory of our conversation?",
+]
+
+
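chat_response_stream() already matches the (message, history) signature that gr.ChatInterface expects, so a stripped-down front end could be wired to it directly; the full Blocks layout built below adds the memory stats, accordion, and clear button on top of this. A sketch (demo is an illustrative name, not part of the commit):

import gradio as gr

# Minimal alternative front end: ChatInterface streams whatever the generator yields.
# Assumes chat_response_stream and EXAMPLE_PROMPTS from this file are in scope.
demo = gr.ChatInterface(
    fn=chat_response_stream,      # yields the growing response string
    title="Stack 2.9 (minimal)",
    examples=EXAMPLE_PROMPTS,
)
# demo.launch(server_name="0.0.0.0", server_port=7860)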
+def create_gradio_app():
+    """Create the Gradio interface."""
+
+    with gr.Blocks(
+        title="Stack 2.9 - Pattern-Based AI Coding Assistant",
+        theme=gr.themes.Soft(
+            primary_hue="indigo",    # indigo palette (#6366f1 / #818cf8 / #a5b4fc); Soft takes hue names, not hex values
+            secondary_hue="indigo"
+        )
+    ) as app:
+
+        # Header
+        gr.Markdown("""
+        # 🚀 Stack 2.9 - Pattern-Based AI Coding Assistant
+
+        Powered by **Qwen2.5-Coder-7B** with 4-bit quantization
+
+        ---
+        """)
+
+        # Memory stats display
+        with gr.Row():
+            with gr.Column(scale=1):
+                stats_display = gr.Markdown(
+                    "📊 **Memory Stats**\n\n- Interactions: 0\n- Tools learned: 0\n- Code patterns: 0",
+                    elem_id="stats"
+                )
+            with gr.Column(scale=3):
+                pass  # Spacer
+
+        # Chat interface
+        chatbot = gr.Chatbot(
+            height=500,
+            show_copy_button=True,
+            bubble_full_width=False
+        )
+
+        with gr.Row():
+            msg = gr.Textbox(
+                label="Message",
+                placeholder="Ask me anything...",
+                scale=4,
+                lines=3
+            )
+            submit_btn = gr.Button("Send", variant="primary", scale=1)
+
+        # Clear button
+        with gr.Row():
+            clear_btn = gr.Button("🗑️ Clear Chat")
+
+        # Example prompts
+        gr.Examples(
+            examples=EXAMPLE_PROMPTS,
+            inputs=msg,
+            label="Example Prompts"
+        )
+
+        # Memory visualization
+        with gr.Accordion("🧠 Self-Evolution Memory", open=False):
+            memory_display = gr.Textbox(
+                label="Memory Content",
+                lines=10,
+                interactive=False
+            )
+
+        # Functions
+        def respond(message, history):
+            response = chat_response(message, history)
+            history.append([message, response])
+            return "", history
+
+        def update_stats():
+            stats = memory.get_stats()
+            return f"""📊 **Memory Stats**
+
+- **Interactions**: {stats['total_interactions']}
+- **Tool Patterns**: {stats['tool_patterns']}
+- **Code Snippets**: {stats['code_snippets']}
+
+**Recent Tools**: {', '.join(stats['recent_tools']) if stats['recent_tools'] else 'None'}"""
+
+        def update_memory():
+            return memory.get_context()
+
+        # Button click handlers
+        submit_btn.click(respond, [msg, chatbot], [msg, chatbot], api_name="send")
+        msg.submit(respond, [msg, chatbot], [msg, chatbot], api_name="send")
+
+        def clear_chat():
+            return [], ""
+
+        clear_btn.click(clear_chat, outputs=[chatbot, msg])
+
+        # Refresh stats and memory view whenever the chat changes
+        chatbot.change(update_stats, outputs=[stats_display])
+        chatbot.change(update_memory, outputs=[memory_display])
+
+        # Footer
+        gr.Markdown("""
+        ---
+        ### About Stack 2.9
+
+        Stack 2.9 is a pattern-based AI coding assistant that:
+        - 🔍 Uses **Qwen2.5-Coder-7B** (4-bit, ~4GB VRAM)
+        - 🛠️ Integrates **7 tools** (file, git, web, search, shell)
+        - 🧠 Remembers interactions and learns patterns
+        - ⚡ Provides fast, streaming responses
+
+        Deployed on **HuggingFace Spaces** with Gradio
+        """)
+
+    return app
+
+
+# ============================================================
+# Main Entry Point
+# ============================================================
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Stack 2.9 - HuggingFace Spaces Demo")
+    parser.add_argument("--share", action="store_true", help="Create a public share link")
+    parser.add_argument("--port", type=int, default=7860, help="Port to run on")
+    parser.add_argument("--model", type=str, default="Qwen/Qwen2.5-Coder-7B-Instruct", help="Model ID")
+    args = parser.parse_args()
+
+    print("=" * 50)
+    print("🚀 Stack 2.9 - Pattern-Based AI Coding Assistant")
+    print("=" * 50)
+    print(f"Model: {args.model}")
+    print("Loading model...")
+
+    # Initialize model in a background thread
+    import threading
+
+    def load_model_thread():
+        global model
+        # NOTE: initialize_model() always loads the default checkpoint; args.model is not passed through.
+        model = initialize_model()
+
+    loader_thread = threading.Thread(target=load_model_thread)
+    loader_thread.start()
+
+    # Create and launch app
+    app = create_gradio_app()
+
+    print(f"\n🚀 Launching Gradio on port {args.port}...")
+    print("📝 Note: Model loads in background. Chat will work once loaded.\n")
+
+    app.launch(
+        server_name="0.0.0.0",
+        server_port=args.port,
+        share=args.share
+    )
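Once the Space is running, the endpoint registered above as api_name="send" can be driven from a script; a sketch using gradio_client (the localhost URL, the empty history, and the example message are assumptions for a local run, not part of the commit):

from gradio_client import Client

# Sketch: call the endpoint registered above via api_name="send".
client = Client("http://localhost:7860")  # assumed local launch
cleared_box, history = client.predict(
    "Write a fibonacci function in Python",  # contents of the message textbox
    [],                                      # current chatbot history
    api_name="/send",
)
print(history[-1][1])  # the assistant's reply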
stack-2.9-deploy/requirements.txt
CHANGED

@@ -1,14 +1,24 @@
-# Stack 2.9
-#
-
-# Core
-#
(the remaining removed lines were not captured in this view)
+# Stack 2.9 - HuggingFace Spaces Demo
+# Requirements for Gradio interface with Qwen2.5-Coder-7B
+
+# Core Gradio
+gradio>=4.0.0
+
+# Transformers and model loading
+transformers>=4.36.0
+torch>=2.0.0
+
+# Model optimization
+accelerate>=0.24.0
+bitsandbytes>=0.41.0
+
+# Additional utilities
+huggingface-hub>=0.19.0
+safetensors>=0.4.0
+
+# Optional: For better web search
+# brave-search>=0.1.0
+
+# Optional: For web fetching
+# beautifulsoup4>=4.12.0
+# lxml>=4.9.0
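A quick way to confirm the pinned stack installed cleanly on the Space is to print the resolved versions; a small sketch (illustrative, not part of the repository):

# Quick environment check for the dependencies pinned above.
import importlib.metadata as md
import torch

for pkg in ("gradio", "transformers", "accelerate", "bitsandbytes", "safetensors"):
    try:
        print(f"{pkg:15s} {md.version(pkg)}")
    except md.PackageNotFoundError:
        print(f"{pkg:15s} MISSING")

print("CUDA available:", torch.cuda.is_available())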
{self_evolution → stack-2.9-training}/__init__.py
RENAMED (file without changes)

{self_evolution → stack-2.9-training}/apply.py
RENAMED (file without changes)

{self_evolution → stack-2.9-training}/learner.py
RENAMED (file without changes)

{self_evolution → stack-2.9-training}/memory.py
RENAMED (file without changes)

{self_evolution → stack-2.9-training}/observer.py
RENAMED (file without changes)

{stack_2_9_training → stack-2.9-training}/train_config_colab.yaml
RENAMED (file without changes)

{self_evolution → stack-2.9-training}/trainer.py
RENAMED (file without changes)

stack_cli/__init__.py
DELETED

@@ -1,19 +0,0 @@
-"""Stack 2.9 CLI and Agent Interface."""
-
-__version__ = "2.9.0"
-__author__ = "Stack Team"
-
-from .agent import create_agent, StackAgent
-from .tools import TOOLS, list_tools, get_tool, get_tool_schemas
-from .context import create_context_manager, ContextManager
-
-__all__ = [
-    "create_agent",
-    "StackAgent",
-    "TOOLS",
-    "list_tools",
-    "get_tool",
-    "get_tool_schemas",
-    "create_context_manager",
-    "ContextManager"
-]
verify_repo.sh
DELETED

@@ -1,141 +0,0 @@
-#!/usr/bin/env bash
-# Stack 2.9 - Repository Integrity Check
-# Verifies all components are present before pushing to GitHub
-
-set -e
-
-echo "🔍 Stack 2.9 Repository Check"
-echo "============================"
-echo ""
-
-ERRORS=0
-WARNINGS=0
-
-check_dir() {
-    if [ -d "$1" ]; then
-        echo "✅ $2"
-    else
-        echo "❌ Missing: $2 ($1)"
-        ((ERRORS++))
-    fi
-}
-
-check_file() {
-    if [ -f "$1" ]; then
-        echo "✅ $2"
-    else
-        echo "❌ Missing: $2 ($1)"
-        ((ERRORS++))
-    fi
-}
-
-check_file_optional() {
-    if [ -f "$1" ]; then
-        echo "✅ $2"
-    else
-        echo "⚠️ Optional: $2 ($1)"
-        ((WARNINGS++))
-    fi
-}
-
-echo "Checking top-level files..."
-check_file "README.md" "Main README"
-check_file "LICENSE" "Apache 2.0 License"
-check_file "CONTRIBUTING.md" "Contributing Guide"
-check_file "CODE_OF_CONDUCT.md" "Code of Conduct"
-check_file "Makefile" "Makefile"
-check_file "requirements.txt" "Python requirements"
-check_file "pyproject.toml" "Python package config"
-check_file ".gitignore" "Git ignore rules"
-check_file ".env.example" "Environment example"
-check_file "setup.sh" "Setup script"
-check_file "PUSH_GUIDE.md" "Push guide"
-
-echo ""
-echo "Checking component directories..."
-check_dir "training-data" "Training data"
-check_dir "stack-2.9-training" "Training pipeline"
-check_dir "stack-2.9-deploy" "Deployment configs"
-check_dir "stack-2.9-voice" "Voice integration"
-check_dir "stack-2.9-docs" "Documentation"
-check_dir "stack-2.9-eval" "Evaluation tools"
-check_dir ".github/workflows" "CI/CD workflows"
-
-echo ""
-echo "Checking critical training data files..."
-check_file "training-data/tools/catalog.json" "Tool schemas"
-check_file "training-data/synthetic/examples.jsonl" "Synthetic examples"
-check_file "training-data/manifest.json" "Dataset manifest"
-check_file_optional "training-data/code-pairs/pairs.json" "Code-comment pairs"
-check_file_optional "training-data/advanced-patterns/examples.jsonl" "Advanced patterns"
-
-echo ""
-echo "Checking training pipeline files..."
-check_file "stack-2.9-training/requirements.txt" "Training requirements"
-check_file "stack-2.9-training/prepare_dataset.py" "Dataset preparation"
-check_file "stack-2.9-training/train_lora.py" "LoRA training script"
-check_file "stack-2.9-training/merge_lora.py" "Merge script"
-check_file "stack-2.9-training/quantize_awq.py" "AWQ quantization"
-check_file "stack-2.9-training/run_training.sh" "Training runner"
-
-echo ""
-echo "Checking deployment files..."
-check_file "stack-2.9-deploy/vllm_server.py" "vLLM server"
-check_file "stack-2.9-deploy/docker-compose.yml" "Docker Compose"
-check_file "stack-2.9-deploy/Dockerfile" "Docker image"
-check_file "stack-2.9-deploy/local_deploy.sh" "Local deployment script"
-check_file_optional "stack-2.9-deploy/runpod_deploy.sh" "RunPod script"
-check_file_optional "stack-2.9-deploy/vastai_deploy.sh" "Vast.ai script"
-
-echo ""
-echo "Checking voice integration..."
-check_file "stack-2.9-voice/voice_server.py" "Voice API server"
-check_file "stack-2.9-voice/voice_client.py" "Voice client"
-check_file "stack-2.9-voice/stack_voice_integration.py" "Integration layer"
-check_file "stack-2.9-voice/docker-compose.yml" "Voice Docker Compose"
-check_file "stack-2.9-voice/README.md" "Voice docs"
-
-echo ""
-echo "Checking documentation..."
-check_file "stack-2.9-docs/README.md" "Main docs"
-check_file "stack-2.9-docs/API.md" "API reference"
-check_file "stack-2.9-docs/OPENROUTER_SUBMISSION.md" "OpenRouter app"
-check_file "stack-2.9-docs/TRAINING_DATA.md" "Training guide"
-check_file_optional "stack-2.9-docs/VOICE_INTEGRATION.md" "Voice integration"
-check_file_optional "stack-2.9-docs/BENCHMARKS.md" "Benchmarks"
-
-echo ""
-echo "Checking evaluation..."
-check_file "stack-2.9-eval/eval_pipeline.py" "Evaluation pipeline"
-check_file "stack-2.9-eval/tool_use_eval.py" "Tool use eval"
-check_file "stack-2.9-eval/code_quality_eval.py" "Code quality eval"
-check_file "stack-2.9-eval/conversation_eval.py" "Conversation eval"
-check_file "stack-2.9-eval/results_aggregator.py" "Results aggregator"
-check_dir "stack-2.9-eval/benchmarks" "Benchmark datasets"
-check_dir "stack-2.9-eval/results" "Results directory"
-
-echo ""
-echo "============================"
-echo "📊 Repository Check Summary"
-echo "============================"
-if [ $ERRORS -eq 0 ]; then
-    echo "✅ All critical files present!"
-    if [ $WARNINGS -gt 0 ]; then
-        echo "⚠️ $WARNINGS optional files missing (not critical)"
-    fi
-    echo ""
-    echo "Ready to push to GitHub!"
-    echo ""
-    echo "Next:"
-    echo " 1. Create repo: https://github.com/organizations/my-ai-stack/repositories/new"
-    echo " 2. Run: git init && git add . && git commit -m 'Initial commit'"
-    echo " 3. Add remote: git remote add origin https://github.com/my-ai-stack/stack-2.9.git"
-    echo " 4. Push: git push -u origin main"
-    exit 0
-else
-    echo "❌ $ERRORS critical errors found!"
-    echo "⚠️ $WARNINGS warnings"
-    echo ""
-    echo "Please fix missing files before pushing."
-    exit 1
-fi