walidsobhie-code Claude Opus 4.6 committed on
Commit 99a7be2 · 1 Parent(s): bfc7d04

refactor: Clean up project structure - fewer root folders

Reorganized into a more user-friendly structure:
- Moved legacy docs to docs/archive/
- Merged CLI tools to src/cli/
- Moved training scripts to scripts/
- Removed empty/broken directories (benchmarks, space, website)
- Added directory structure documentation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
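For reference, a hedged sketch of the moves described above, using paths taken from the file list below. The exact commands are an assumption; the actual commit may have been staged differently.

```bash
# Sketch only: recreate the reorganization with plain git mv / git rm.
mkdir -p docs/archive scripts src/cli

# Legacy docs and the static website move under docs/archive/
git mv EVAL_PLAN.md docs/archive/EVAL_PLAN.md
git mv website docs/archive/website

# Training helper script moves under scripts/
git mv training-data-extractor.js scripts/training-data-extractor.js

# CLI tools from stack_cli/ and stack-2.9-cli/ merge into src/cli/
git mv stack_cli/agent.py src/cli/agent.py
git mv stack-2.9-cli/main.py src/cli/main.py

# Empty or broken directories are removed outright
git rm -r benchmarks space
```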

Files changed (44)
  1. benchmarks/benchmark_context_lengths.py +0 -442
  2. benchmarks/test_context_window.py +0 -330
  3. CONTEXT_UPDATE_SUMMARY.md → docs/archive/CONTEXT_UPDATE_SUMMARY.md +0 -0
  4. DATA_SCALING_PLAN.md → docs/archive/DATA_SCALING_PLAN.md +0 -0
  5. DEPLOYMENT_TEST_REPORT.md → docs/archive/DEPLOYMENT_TEST_REPORT.md +0 -0
  6. EVAL_PLAN.md → docs/archive/EVAL_PLAN.md +0 -0
  7. IMPLEMENTATION_SUMMARY.md → docs/archive/IMPLEMENTATION_SUMMARY.md +0 -0
  8. LICENSES.md → docs/archive/LICENSES.md +0 -0
  9. MAXIMIZATION_PLAN.md → docs/archive/MAXIMIZATION_PLAN.md +0 -0
  10. OPENROUTER_SUBMISSION_CHECKLIST.md → docs/archive/OPENROUTER_SUBMISSION_CHECKLIST.md +0 -0
  11. PUSH_GUIDE.md → docs/archive/PUSH_GUIDE.md +0 -0
  12. STACK_CLI_README.md → docs/archive/STACK_CLI_README.md +0 -0
  13. SUBMISSION_PACKAGE_SUMMARY.md → docs/archive/SUBMISSION_PACKAGE_SUMMARY.md +0 -0
  14. TOGETHER_AI.md → docs/archive/TOGETHER_AI.md +0 -0
  15. context_window_upgrade_summary.md → docs/archive/context_window_upgrade_summary.md +0 -0
  16. {website → docs/archive/website}/app.js +0 -0
  17. {website → docs/archive/website}/benchmark.html +0 -0
  18. {website → docs/archive/website}/index.html +0 -0
  19. {website → docs/archive/website}/styles.css +0 -0
  20. training-data-extractor.js → scripts/training-data-extractor.js +0 -0
  21. space/Dockerfile +0 -37
  22. space/README.md +0 -124
  23. space/app.py +0 -600
  24. space/requirements.txt +0 -24
  25. {stack-2.9-cli → src/cli}/__init__.py +0 -0
  26. {stack_cli → src/cli}/agent.py +0 -0
  27. {stack_cli → src/cli}/cli.py +0 -0
  28. {stack_cli → src/cli}/context.py +0 -0
  29. {stack-2.9-cli → src/cli}/main.py +0 -0
  30. {stack_cli → src/cli}/pyproject.toml +0 -0
  31. {stack_cli → src/cli}/tools.py +0 -0
  32. stack-2.9-deploy/Dockerfile +22 -92
  33. stack-2.9-deploy/README.md +82 -304
  34. stack-2.9-deploy/app.py +577 -253
  35. stack-2.9-deploy/requirements.txt +24 -14
  36. {self_evolution → stack-2.9-training}/__init__.py +0 -0
  37. {self_evolution → stack-2.9-training}/apply.py +0 -0
  38. {self_evolution → stack-2.9-training}/learner.py +0 -0
  39. {self_evolution → stack-2.9-training}/memory.py +0 -0
  40. {self_evolution → stack-2.9-training}/observer.py +0 -0
  41. {stack_2_9_training → stack-2.9-training}/train_config_colab.yaml +0 -0
  42. {self_evolution → stack-2.9-training}/trainer.py +0 -0
  43. stack_cli/__init__.py +0 -19
  44. verify_repo.sh +0 -141
benchmarks/benchmark_context_lengths.py DELETED
@@ -1,442 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Benchmark script for comparing context window performance across different lengths.
4
-
5
- This script compares:
6
- 1. 32K context (original claim)
7
- 2. 64K context (mid-range)
8
- 3. 128K context (full potential)
9
-
10
- For each context length, it tests:
11
- - Memory consumption (VRAM and RAM)
12
- - Throughput (tokens/second during generation)
13
- - Latency (time to first token)
14
- - Quality (ability to process and generate coherent output)
15
- - Task completion on sample coding tasks
16
-
17
- Output: JSON results + summary report
18
- """
19
-
20
- import os
21
- import sys
22
- import json
23
- import time
24
- import argparse
25
- import statistics
26
- from pathlib import Path
27
- from typing import Dict, List, Any
28
-
29
- # Required packages: vllm, transformers, psutil, torch
30
-
31
- def get_memory_info():
32
- """Get memory statistics."""
33
- import torch
34
- import psutil
35
-
36
- process = psutil.Process(os.getpid())
37
- ram_mb = process.memory_info().rss / 1024 / 1024
38
-
39
- if torch.cuda.is_available():
40
- gpu_mem_allocated = torch.cuda.memory_allocated() / 1024 / 1024
41
- gpu_mem_reserved = torch.cuda.memory_reserved() / 1024 / 1024
42
- return {
43
- "ram_mb": round(ram_mb, 1),
44
- "gpu_allocated_mb": round(gpu_mem_allocated, 1),
45
- "gpu_reserved_mb": round(gpu_mem_reserved, 1),
46
- "gpu_used": True
47
- }
48
- else:
49
- return {
50
- "ram_mb": round(ram_mb, 1),
51
- "gpu_used": False
52
- }
53
-
54
- def preprocess_prompt(prompt: str, tokenizer, target_tokens: int, mode: str = "repeat") -> List[int]:
55
- """Preprocess a prompt to reach target token length."""
56
- tokens = tokenizer.encode(prompt)
57
-
58
- if len(tokens) >= target_tokens:
59
- return tokens[:target_tokens]
60
-
61
- needed = target_tokens - len(tokens)
62
-
63
- if mode == "repeat":
64
- # Repeat a filler pattern
65
- filler = " This is additional context to fill the window. " * 100
66
- filler_tokens = tokenizer.encode(filler)
67
- repeats = (needed // len(filler_tokens)) + 1
68
- tokens.extend(filler_tokens * repeats)
69
- elif mode == "noise":
70
- # Use random-like content (code snippets)
71
- noise = """
72
- // Dummy code for context expansion
73
- function placeholder() {
74
- const x = 1;
75
- const y = 2;
76
- return x + y;
77
- }
78
- class DummyClass {
79
- constructor() {}
80
- method() {}
81
- }
82
- """.repeat(needed // 50 + 1)
83
- noise_tokens = tokenizer.encode(noise)
84
- tokens.extend(noise_tokens)
85
-
86
- return tokens[:target_tokens]
87
-
88
- def load_model(model_name: str, max_model_len: int, block_size: int):
89
- """Load vLLM model with specified configuration."""
90
- from vllm import LLM
91
-
92
- print(f"Loading model with max_model_len={max_model_len}, block_size={block_size}")
93
- model = LLM(
94
- model=model_name,
95
- max_model_len=max_model_len,
96
- block_size=block_size,
97
- gpu_memory_utilization=0.9,
98
- trust_remote_code=True,
99
- tensor_parallel_size=1,
100
- # For benchmarking, disable speculative decoding for consistent results
101
- enable_chunked_prefill=False
102
- )
103
- return model
104
-
105
- def run_generation(model, tokenizer, prompt_tokens: List[int], max_new_tokens: int = 200) -> Dict[str, Any]:
106
- """Run generation and collect metrics."""
107
- from vllm import SamplingParams
108
-
109
- sampling_params = SamplingParams(
110
- temperature=0.7,
111
- top_p=0.95,
112
- max_tokens=max_new_tokens,
113
- min_p=0.05
114
- )
115
-
116
- # Prefill phase timing
117
- torch = sys.modules.get('torch')
118
- if torch and torch.cuda.is_available():
119
- torch.cuda.synchronize()
120
-
121
- start_time = time.time()
122
- outputs = model.generate(
123
- prompt_token_ids=prompt_tokens,
124
- sampling_params=sampling_params,
125
- use_tqdm=False
126
- )
127
- end_time = time.time()
128
-
129
- if torch and torch.cuda.is_available():
130
- torch.cuda.synchronize()
131
-
132
- elapsed = end_time - start_time
133
- output_token_ids = outputs[0].outputs[0].token_ids
134
- output_text = outputs[0].outputs[0].text
135
-
136
- # Count tokens in output
137
- output_length = len(output_token_ids)
138
-
139
- # Calculate prefill latency (estimated)
140
- prefill_latency = elapsed * 0.3 # Rough estimate
141
- decode_latency = elapsed - prefill_latency
142
-
143
- # Tokens per second
144
- total_tokens = output_length
145
- tokens_per_second = total_tokens / elapsed if elapsed > 0 else 0
146
-
147
- return {
148
- "elapsed_seconds": round(elapsed, 4),
149
- "output_tokens": output_length,
150
- "output_text": output_text[:200],
151
- "tokens_per_second": round(tokens_per_second, 2),
152
- "prefill_latency_est": round(prefill_latency, 4),
153
- "decode_latency_est": round(decode_latency, 4)
154
- }
155
-
156
- def test_task(model, tokenizer, context_length: int, task_name: str, prompt: str, max_response: int = 200) -> Dict[str, Any]:
157
- """Run a single benchmark task."""
158
- print(f"\n Task: {task_name}")
159
- sys.stdout.flush()
160
-
161
- mem_before = get_memory_info()
162
- prompt_tokens = preprocess_prompt(prompt, tokenizer, context_length)
163
- actual_context_len = len(prompt_tokens)
164
-
165
- start_time = time.time()
166
- try:
167
- result = run_generation(model, tokenizer, prompt_tokens, max_response)
168
- elapsed = time.time() - start_time
169
- mem_after = get_memory_info()
170
-
171
- # Calculate memory delta
172
- mem_delta = {}
173
- if mem_after.get("gpu_used"):
174
- mem_delta["gpu_allocated_delta_mb"] = round(
175
- mem_after["gpu_allocated_mb"] - mem_before["gpu_allocated_mb"], 1
176
- )
177
- mem_delta["ram_delta_mb"] = round(
178
- mem_after["ram_mb"] - mem_before["ram_mb"], 1
179
- )
180
-
181
- return {
182
- "task": task_name,
183
- "context_length_target": context_length,
184
- "context_length_actual": actual_context_len,
185
- "success": True,
186
- **result,
187
- **mem_delta
188
- }
189
- except Exception as e:
190
- elapsed = time.time() - start_time
191
- print(f" ❌ Failed: {e}")
192
- return {
193
- "task": task_name,
194
- "context_length_target": context_length,
195
- "success": False,
196
- "error": str(e),
197
- "elapsed_seconds": round(elapsed, 4)
198
- }
199
-
200
- def main():
201
- parser = argparse.ArgumentParser(description="Benchmark context lengths: 32K, 64K, 128K")
202
- parser.add_argument("--model", type=str, default="Qwen/Qwen2.5-Coder-32B",
203
- help="Model name")
204
- parser.add_argument("--output-dir", type=str, default="benchmarks/results",
205
- help="Directory to save results")
206
- parser.add_argument("--context-lengths", type=int, nargs='+', default=[32768, 65536, 131072],
207
- help="Context lengths to test")
208
- parser.add_argument("--tasks-per-length", type=int, default=5,
209
- help="Number of tasks per context length")
210
-
211
- args = parser.parse_args()
212
-
213
- print("="*70)
214
- print("CONTEXT LENGTH BENCHMARK")
215
- print("="*70)
216
- print(f"Model: {args.model}")
217
- print(f"Context lengths: {args.context_lengths}")
218
- print(f"Tasks per length: {args.tasks_per_length}")
219
-
220
- # Sample tasks for benchmarking
221
- tasks = [
222
- {
223
- "name": "Code Completion",
224
- "prompt": """import React from 'react';
225
- function Component({ children }) {
226
- return (
227
- <div className="container">
228
- {children}
229
- </div>
230
- );
231
- }
232
- export default Component;"""
233
- },
234
- {
235
- "name": "Bug Fix",
236
- "prompt": """function calculateTotal(items) {
237
- let total = 0;
238
- for (let i = 0; i <= items.length; i++) {
239
- total += items[i].price;
240
- }
241
- return total;
242
- }
243
- // This function has a bug. What is it and how would you fix it?"""
244
- },
245
- {
246
- "name": "Documentation Generation",
247
- "prompt": """class DataProcessor {
248
- constructor(config) {
249
- this.config = config;
250
- this.cache = new Map();
251
- }
252
-
253
- async process(data) {
254
- const result = await this.transform(data);
255
- return this.validate(result);
256
- }
257
-
258
- transform(data) {
259
- // Transform logic here
260
- return data.map(item => ({ ...item, processed: true }));
261
- }
262
-
263
- validate(result) {
264
- return result.filter(item => item.valid !== false);
265
- }
266
- }
267
- // Please generate comprehensive JSDoc documentation for this class."""
268
- },
269
- {
270
- "name": "Test Generation",
271
- "prompt": """const sum = (a, b) => a + b;
272
- const multiply = (a, b) => a * b;
273
- const divide = (a, b) => {
274
- if (b === 0) throw new Error('Division by zero');
275
- return a / b;
276
- };
277
- // Write Jest unit tests for these utility functions."""
278
- },
279
- {
280
- "name": "Refactoring",
281
- "prompt": """function processUserData(users) {
282
- const result = [];
283
- for (let i = 0; i < users.length; i++) {
284
- const user = users[i];
285
- if (user.active) {
286
- result.push({
287
- id: user.id,
288
- name: user.firstName + ' ' + user.lastName,
289
- email: user.email.toLowerCase()
290
- });
291
- }
292
- }
293
- return result;
294
- }
295
- // Refactor this function using modern ES6+ features (map, filter, destructuring, template literals)."""
296
- }
297
- ]
298
-
299
- results = {
300
- "metadata": {
301
- "model": args.model,
302
- "context_lengths_tested": args.context_lengths,
303
- "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
304
- "tasks": [t["name"] for t in tasks],
305
- "max_new_tokens": 200
306
- },
307
- "results": []
308
- }
309
-
310
- try:
311
- # Import dependencies
312
- print("\n📦 Loading dependencies...")
313
- from transformers import AutoTokenizer
314
- sys.path.insert(0, '/Users/walidsobhi/.openclaw/workspace/stack-2.9/stack-2.9-deploy')
315
-
316
- print(f"\n🔍 Loading tokenizer for {args.model}...")
317
- tokenizer = AutoTokenizer.from_pretrained(
318
- args.model,
319
- trust_remote_code=True
320
- )
321
- print(f"Tokenizer loaded. Vocab size: {tokenizer.vocab_size}")
322
-
323
- all_task_results = []
324
-
325
- # Test each context length
326
- for context_len in args.context_lengths:
327
- print(f"\n{'='*70}")
328
- print(f"TESTING CONTEXT LENGTH: {context_len} tokens ({context_len/1024:.0f}K)")
329
- print(f"{'='*70}")
330
-
331
- # Load model fresh for each context length (optional, but cleaner)
332
- print(f"\n🤖 Loading model...")
333
- model = load_model(args.model, max_model_len=context_len, block_size=64)
334
-
335
- # Get initial memory after load
336
- mem_after_load = get_memory_info()
337
- print(f" Model loaded. Memory: {mem_after_load}")
338
-
339
- length_results = []
340
-
341
- # Run tasks (selected subset based on context length)
342
- num_tasks = min(args.tasks_per_length, len(tasks))
343
-
344
- for i in range(num_tasks):
345
- task = tasks[i % len(tasks)]
346
- print(f"\n[{i+1}/{num_tasks}] Running task: {task['name']}")
347
- sys.stdout.flush()
348
-
349
- result = test_task(
350
- model, tokenizer, context_len,
351
- f"{task['name']} @ {context_len}",
352
- task["prompt"]
353
- )
354
- length_results.append(result)
355
- all_task_results.append(result)
356
-
357
- # Small delay between tasks
358
- time.sleep(1)
359
-
360
- # Print summary for this context length
361
- successful = [r for r in length_results if r.get('success', False)]
362
- if successful:
363
- avg_tps = statistics.mean([r['tokens_per_second'] for r in successful])
364
- avg_latency = statistics.mean([r['elapsed_seconds'] for r in successful])
365
- print(f"\n📈 Summary for {context_len} tokens:")
366
- print(f" Avg throughput: {avg_tps:.2f} tokens/sec")
367
- print(f" Avg latency: {avg_latency:.3f}s")
368
- print(f" Success count: {len(successful)}/{len(length_results)}")
369
-
370
- # Unload model to free memory before next test
371
- del model
372
- import gc
373
- gc.collect()
374
- if torch.cuda.is_available():
375
- torch.cuda.empty_cache()
376
-
377
- print(f" ✓ Completed testing for {context_len}")
378
-
379
- # Compile final results
380
- results["results"] = all_task_results
381
-
382
- # Calculate summary statistics
383
- summary = {}
384
- for context_len in args.context_lengths:
385
- len_results = [r for r in all_task_results
386
- if r.get('context_length_target') == context_len and r.get('success')]
387
- if len_results:
388
- summary[str(context_len)] = {
389
- "count": len(len_results),
390
- "avg_tokens_per_second": round(statistics.mean([r['tokens_per_second'] for r in len_results]), 2),
391
- "avg_latency_seconds": round(statistics.mean([r['elapsed_seconds'] for r in len_results]), 3),
392
- "avg_gpu_memory_delta_mb": round(statistics.mean([r.get('gpu_allocated_delta_mb', 0) for r in len_results]), 1),
393
- "avg_ram_delta_mb": round(statistics.mean([r.get('ram_delta_mb', 0) for r in len_results]), 1)
394
- }
395
- results["summary"] = summary
396
-
397
- except ImportError as e:
398
- print(f"❌ Missing dependencies: {e}")
399
- print("Please install: pip install vllm transformers psutil torch")
400
- sys.exit(1)
401
- except Exception as e:
402
- print(f"❌ Error: {e}")
403
- import traceback
404
- traceback.print_exc()
405
- sys.exit(1)
406
-
407
- # Save results
408
- output_dir = Path(args.output_dir)
409
- output_dir.mkdir(parents=True, exist_ok=True)
410
-
411
- timestamp = time.strftime("%Y%m%d_%H%M%S")
412
- output_file = output_dir / f"benchmark_{timestamp}.json"
413
-
414
- with open(output_file, 'w') as f:
415
- json.dump(results, f, indent=2)
416
-
417
- print(f"\n{'='*70}")
418
- print("BENCHMARK COMPLETE")
419
- print(f"{'='*70}")
420
- print(f"Results saved to: {output_file}")
421
-
422
- # Print summary table
423
- print("\n📊 Performance Summary:")
424
- print("-"*70)
425
- print(f"{'Context':<10} {'Throughput':<15} {'Latency':<12} {'GPU Δ':<12} {'RAM Δ':<12}")
426
- print("-"*70)
427
-
428
- if summary:
429
- for length_str, stats in sorted(summary.items()):
430
- length = int(length_str)
431
- length_k = length // 1024
432
- print(f"{length_k:>3}K {stats['avg_tokens_per_second']:>5.1f} tok/s {stats['avg_latency_seconds']:>6.3f}s "
433
- f"{stats['avg_gpu_memory_delta_mb']:>6.1f} MB {stats['avg_ram_delta_mb']:>6.1f} MB")
434
-
435
- print("\n✅ Benchmark finished!")
436
- print("\nNext steps:")
437
- print(" 1. Review results in the JSON output file")
438
- print(" 2. Check if 128K provides quality benefits that justify any performance trade-offs")
439
- print(" 3. Update deployment configuration with optimal block_size and scheduler settings")
440
-
441
- if __name__ == "__main__":
442
- main()
benchmarks/test_context_window.py DELETED
@@ -1,330 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Test script for verifying 128K context window support for Qwen2.5-Coder-32B.
4
-
5
- This script:
6
- 1. Loads the model with vLLM configured for 128K context
7
- 2. Tests with various input lengths (32K, 64K, 96K, 128K)
8
- 3. Measures memory usage, throughput, and latency
9
- 4. Tests with real codebase context (entire project)
10
- 5. Validates that the model correctly processes long inputs
11
- """
12
-
13
- import os
14
- import sys
15
- import json
16
- import time
17
- import psutil
18
- import argparse
19
- from pathlib import Path
20
- from typing import Dict, List, Tuple
21
-
22
- # Add vLLM to path
23
- sys.path.insert(0, '/Users/walidsobhi/.openclaw/workspace/stack-2.9/stack-2.9-deploy')
24
-
25
- def get_memory_usage() -> Dict[str, float]:
26
- """Get current memory usage in MB."""
27
- process = psutil.Process(os.getpid())
28
- memory_info = process.memory_info()
29
- return {
30
- 'rss_mb': memory_info.rss / 1024 / 1024,
31
- 'vms_mb': memory_info.vms / 1024 / 1024
32
- }
33
-
34
- def generate_token_sequence(length: int, tokenizer) -> List[int]:
35
- """Generate a sequence of tokens of approximately the target length."""
36
- # Create a repeating pattern that tokenizes consistently
37
- base_text = "This is a test token sequence for context window testing. " * 10
38
- tokens = tokenizer.encode(base_text)
39
- # Repeat the tokens to reach desired length
40
- num_repeats = (length // len(tokens)) + 1
41
- token_sequence = tokens * num_repeats
42
- return token_sequence[:length]
43
-
44
- def read_codebase_files(base_path: str, max_files: int = 100) -> str:
45
- """Read source code files from the codebase to create a realistic long context."""
46
- codebase_text = ""
47
- src_dir = Path(base_path) / "src"
48
- if not src_dir.exists():
49
- return ""
50
-
51
- file_count = 0
52
- for file_path in src_dir.rglob("*.ts"):
53
- if file_count >= max_files:
54
- break
55
- try:
56
- with open(file_path, 'r', encoding='utf-8') as f:
57
- content = f.read()
58
- codebase_text += f"\n\n// File: {file_path.relative_to(base_path)}\n{content}\n"
59
- file_count += 1
60
- except Exception as e:
61
- print(f"Warning: Could not read {file_path}: {e}")
62
-
63
- return codebase_text
64
-
65
- def test_context_length(model, tokenizer, context_length: int, test_name: str) -> Dict:
66
- """Test model with a specific context length."""
67
- print(f"\n{'='*60}")
68
- print(f"Testing {test_name} (target: {context_length} tokens)")
69
- print(f"{'='*60}")
70
-
71
- # Generate input sequence
72
- tokens = generate_token_sequence(context_length, tokenizer)
73
- actual_length = len(tokens)
74
- print(f"Generated input with {actual_length} tokens")
75
-
76
- # Measure memory before inference
77
- mem_before = get_memory_usage()
78
-
79
- # Run inference (generate a short response to test context processing)
80
- start_time = time.time()
81
- try:
82
- # Use vLLM's generate
83
- from vllm import SamplingParams
84
- sampling_params = SamplingParams(
85
- temperature=0.1,
86
- max_tokens=50, # Generate only 50 tokens
87
- prompt_logprobs=0
88
- )
89
-
90
- outputs = model.generate(
91
- prompt_token_ids=tokens,
92
- sampling_params=sampling_params,
93
- use_tqdm=False
94
- )
95
-
96
- elapsed = time.time() - start_time
97
- mem_after = get_memory_usage()
98
-
99
- # Calculate metrics
100
- output_text = outputs[0].outputs[0].text
101
- output_tokens = len(outputs[0].outputs[0].token_ids)
102
- tokens_per_second = output_tokens / elapsed if elapsed > 0 else 0
103
-
104
- result = {
105
- "test": test_name,
106
- "target_length": context_length,
107
- "actual_length": actual_length,
108
- "output_tokens": output_tokens,
109
- "latency_seconds": round(elapsed, 3),
110
- "tokens_per_second": round(tokens_per_second, 2),
111
- "memory_before_mb": round(mem_before['rss_mb'], 2),
112
- "memory_after_mb": round(mem_after['rss_mb'], 2),
113
- "memory_delta_mb": round(mem_after['rss_mb'] - mem_before['rss_mb'], 2),
114
- "success": True,
115
- "sample_output": output_text[:100] if output_text else ""
116
- }
117
-
118
- print(f"✅ Success!")
119
- print(f" Latency: {elapsed:.3f}s")
120
- print(f" Throughput: {tokens_per_second:.2f} tokens/sec")
121
- print(f" Memory delta: {result['memory_delta_mb']:.1f} MB")
122
- print(f" Sample output: {result['sample_output']}")
123
-
124
- except Exception as e:
125
- elapsed = time.time() - start_time
126
- result = {
127
- "test": test_name,
128
- "target_length": context_length,
129
- "actual_length": actual_length,
130
- "success": False,
131
- "error": str(e),
132
- "latency_seconds": round(elapsed, 3)
133
- }
134
- print(f"❌ Failed: {e}")
135
-
136
- return result
137
-
138
- def test_with_codebase(model, tokenizer, codebase_path: str) -> Dict:
139
- """Test the model with the entire codebase as context."""
140
- print(f"\n{'='*60}")
141
- print(f"Testing with real codebase context")
142
- print(f"{'='*60}")
143
-
144
- # Read codebase files
145
- print("Reading codebase files...")
146
- codebase_text = read_codebase_files(codebase_path, max_files=200)
147
- codebase_tokens = tokenizer.encode(codebase_text)
148
- context_length = len(codebase_tokens)
149
- print(f"Codebase encoded to {context_length} tokens ({context_length/1024:.1f}K)")
150
-
151
- if context_length < 1000:
152
- print("⚠️ Warning: Codebase is too small, generate synthetic long context instead")
153
- codebase_tokens = generate_token_sequence(131072, tokenizer)
154
- context_length = len(codebase_tokens)
155
-
156
- mem_before = get_memory_usage()
157
- start_time = time.time()
158
-
159
- try:
160
- from vllm import SamplingParams
161
- sampling_params = SamplingParams(
162
- temperature=0.2,
163
- max_tokens=100,
164
- prompt_logprobs=0
165
- )
166
-
167
- outputs = model.generate(
168
- prompt_token_ids=codebase_tokens,
169
- sampling_params=sampling_params,
170
- use_tqdm=False
171
- )
172
-
173
- elapsed = time.time() - start_time
174
- mem_after = get_memory_usage()
175
-
176
- output_text = outputs[0].outputs[0].text
177
- output_tokens = len(outputs[0].outputs[0].token_ids)
178
- tokens_per_second = output_tokens / elapsed if elapsed > 0 else 0
179
-
180
- result = {
181
- "test": "Codebase Context",
182
- "context_size_k": round(context_length / 1024, 1),
183
- "output_tokens": output_tokens,
184
- "latency_seconds": round(elapsed, 3),
185
- "tokens_per_second": round(tokens_per_second, 2),
186
- "memory_before_mb": round(mem_before['rss_mb'], 2),
187
- "memory_after_mb": round(mem_after['rss_mb'], 2),
188
- "memory_delta_mb": round(mem_after['rss_mb'] - mem_before['rss_mb'], 2),
189
- "success": True,
190
- "sample_output": output_text[:150]
191
- }
192
-
193
- print(f"✅ Success!")
194
- print(f" Context size: {result['context_size_k']}K tokens")
195
- print(f" Latency: {elapsed:.3f}s")
196
- print(f" Throughput: {tokens_per_second:.2f} tokens/sec")
197
- print(f" Memory delta: {result['memory_delta_mb']:.1f} MB")
198
- print(f" Sample output: {result['sample_output']}")
199
-
200
- except Exception as e:
201
- elapsed = time.time() - start_time
202
- result = {
203
- "test": "Codebase Context",
204
- "success": False,
205
- "error": str(e),
206
- "latency_seconds": round(elapsed, 3)
207
- }
208
- print(f"❌ Failed: {e}")
209
-
210
- return result
211
-
212
- def main():
213
- parser = argparse.ArgumentParser(description="Test 128K context window for Qwen2.5-Coder-32B")
214
- parser.add_argument("--model", type=str, default="Qwen/Qwen2.5-Coder-32B",
215
- help="Model name or path")
216
- parser.add_argument("--max-model-len", type=int, default=131072,
217
- help="Maximum model length for vLLM")
218
- parser.add_argument("--block-size", type=int, default=64,
219
- help="vLLM block size")
220
- parser.add_argument("--codebase-path", type=str,
221
- default="/Users/walidsobhi/.openclaw/workspace/stack-2.9",
222
- help="Path to the codebase for real context test")
223
- parser.add_argument("--output", type=str,
224
- default="benchmarks/test_context_results.json",
225
- help="Output file for results")
226
-
227
- args = parser.parse_args()
228
-
229
- print(f"Starting 128K Context Window Test")
230
- print(f"Model: {args.model}")
231
- print(f"Config: max_model_len={args.max_model_len}, block_size={args.block_size}")
232
-
233
- results = []
234
-
235
- try:
236
- # Import vLLM and Transformers
237
- print("\n📦 Loading tokenizer...")
238
- from transformers import AutoTokenizer
239
- tokenizer = AutoTokenizer.from_pretrained(
240
- args.model,
241
- trust_remote_code=True
242
- )
243
- print(f"Tokenizer loaded. Vocab size: {tokenizer.vocab_size}")
244
-
245
- print("\n🤖 Loading vLLM model...")
246
- from vllm import LLM
247
-
248
- # Initialize vLLM with large context configuration
249
- model = LLM(
250
- model=args.model,
251
- max_model_len=args.max_model_len,
252
- block_size=args.block_size,
253
- gpu_memory_utilization=0.9,
254
- trust_remote_code=True,
255
- tensor_parallel_size=1 # Adjust if using multiple GPUs
256
- )
257
- print("Model loaded successfully!")
258
-
259
- # Test 1: Small context (8K) - baseline
260
- results.append(test_context_length(model, tokenizer, 8192, "8K Baseline"))
261
-
262
- # Test 2: Medium context (32K)
263
- results.append(test_context_length(model, tokenizer, 32768, "32K"))
264
-
265
- # Test 3: Large context (64K)
266
- results.append(test_context_length(model, tokenizer, 65536, "64K"))
267
-
268
- # Test 4: Full context (96K)
269
- results.append(test_context_length(model, tokenizer, 98304, "96K"))
270
-
271
- # Test 5: Maximum context (128K)
272
- results.append(test_context_length(model, tokenizer, 131072, "128K"))
273
-
274
- # Test 6: Codebase context
275
- results.append(test_with_codebase(model, tokenizer, args.codebase_path))
276
-
277
- except ImportError as e:
278
- print(f"❌ Import error: {e}")
279
- print("Make sure vLLM and transformers are installed:")
280
- print(" pip install vllm transformers")
281
- sys.exit(1)
282
- except Exception as e:
283
- print(f"❌ Error during testing: {e}")
284
- import traceback
285
- traceback.print_exc()
286
- sys.exit(1)
287
-
288
- # Save results
289
- output_path = Path(args.output)
290
- output_path.parent.mkdir(parents=True, exist_ok=True)
291
-
292
- with open(output_path, 'w') as f:
293
- json.dump({
294
- "metadata": {
295
- "model": args.model,
296
- "max_model_len": args.max_model_len,
297
- "block_size": args.block_size,
298
- "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
299
- "system": os.uname().sysname if hasattr(os, 'uname') else "Unknown"
300
- },
301
- "results": results
302
- }, f, indent=2)
303
-
304
- print(f"\n📊 Results saved to: {output_path}")
305
- print("\n" + "="*60)
306
- print("SUMMARY")
307
- print("="*60)
308
-
309
- successful = [r for r in results if r.get('success', False)]
310
- failed = [r for r in results if not r.get('success', False)]
311
-
312
- print(f"Total tests: {len(results)}")
313
- print(f"Successful: {len(successful)}")
314
- print(f"Failed: {len(failed)}")
315
-
316
- if successful:
317
- print("\nContext length vs. throughput:")
318
- for r in successful:
319
- if r['test'] != 'Codebase Context':
320
- print(f" {r['test']}: {r['tokens_per_second']} tokens/sec, "
321
- f"memory delta: {r['memory_delta_mb']}MB")
322
- if any(r['test'] == 'Codebase Context' for r in successful):
323
- cb = next(r for r in successful if r['test'] == 'Codebase Context')
324
- print(f"\nCodebase test: {cb['context_size_k']}K tokens, "
325
- f"{cb['tokens_per_second']} tokens/sec")
326
-
327
- print("\n✅ Test script completed!")
328
-
329
- if __name__ == "__main__":
330
- main()
CONTEXT_UPDATE_SUMMARY.md → docs/archive/CONTEXT_UPDATE_SUMMARY.md RENAMED
File without changes
DATA_SCALING_PLAN.md → docs/archive/DATA_SCALING_PLAN.md RENAMED
File without changes
DEPLOYMENT_TEST_REPORT.md → docs/archive/DEPLOYMENT_TEST_REPORT.md RENAMED
File without changes
EVAL_PLAN.md → docs/archive/EVAL_PLAN.md RENAMED
File without changes
IMPLEMENTATION_SUMMARY.md → docs/archive/IMPLEMENTATION_SUMMARY.md RENAMED
File without changes
LICENSES.md → docs/archive/LICENSES.md RENAMED
File without changes
MAXIMIZATION_PLAN.md → docs/archive/MAXIMIZATION_PLAN.md RENAMED
File without changes
OPENROUTER_SUBMISSION_CHECKLIST.md → docs/archive/OPENROUTER_SUBMISSION_CHECKLIST.md RENAMED
File without changes
PUSH_GUIDE.md → docs/archive/PUSH_GUIDE.md RENAMED
File without changes
STACK_CLI_README.md → docs/archive/STACK_CLI_README.md RENAMED
File without changes
SUBMISSION_PACKAGE_SUMMARY.md → docs/archive/SUBMISSION_PACKAGE_SUMMARY.md RENAMED
File without changes
TOGETHER_AI.md → docs/archive/TOGETHER_AI.md RENAMED
File without changes
context_window_upgrade_summary.md → docs/archive/context_window_upgrade_summary.md RENAMED
File without changes
{website → docs/archive/website}/app.js RENAMED
File without changes
{website → docs/archive/website}/benchmark.html RENAMED
File without changes
{website → docs/archive/website}/index.html RENAMED
File without changes
{website → docs/archive/website}/styles.css RENAMED
File without changes
training-data-extractor.js → scripts/training-data-extractor.js RENAMED
File without changes
space/Dockerfile DELETED
@@ -1,37 +0,0 @@
1
- # Stack 2.9 HuggingFace Spaces Dockerfile
2
- # Optimized for 16GB GPU with 4-bit quantization
3
-
4
- FROM python:3.10-slim
5
-
6
- # Set environment variables
7
- ENV PYTHONUNBUFFERED=1
8
- ENV TRANSFORMERS_CACHE=/workspace/.cache/huggingface
9
- ENV HF_HOME=/workspace/.cache/huggingface
10
-
11
- # Install system dependencies
12
- RUN apt-get update && apt-get install -y \
13
- git \
14
- wget \
15
- && rm -rf /var/lib/apt/lists/*
16
-
17
- # Create workspace directory
18
- WORKDIR /workspace
19
-
20
- # Copy requirements first for better caching
21
- COPY requirements.txt .
22
-
23
- # Install Python dependencies
24
- RUN pip install --no-cache-dir -r requirements.txt
25
-
26
- # Copy application files
27
- COPY . .
28
-
29
- # Expose Gradio port
30
- EXPOSE 7860
31
-
32
- # Create startup script
33
- RUN echo '#!/bin/bash\necho "🚀 Starting Stack 2.9..."\npython app.py --port 7860 --share' > /start.sh
34
- RUN chmod +x /start.sh
35
-
36
- # Launch command
37
- CMD ["/start.sh"]
space/README.md DELETED
@@ -1,124 +0,0 @@
1
- # 🚀 Stack 2.9 - Pattern-Based AI Coding Assistant
2
-
3
- A HuggingFace Spaces demo for Stack 2.9, a pattern-based AI coding assistant powered by Qwen2.5-Coder-7B.
4
-
5
- ![License](https://img.shields.io/badge/license-MIT-blue.svg)
6
- ![Python](https://img.shields.io/badge/python-3.10+-green.svg)
7
- ![Gradio](https://img.shields.io/badge/Gradio-4.0+-orange.svg)
8
-
9
- ## ✨ Features
10
-
11
- - **🤖 Qwen2.5-Coder-7B** - State-of-the-art code generation model
12
- - **🔧 7 Integrated Tools** - File operations, git, web search, shell commands
13
- - **🧠 Pattern Memory** - Learns from each interaction
14
- - **⚡ Fast Streaming** - Real-time token-by-token generation
15
- - **💾 4-bit Quantization** - Runs on 16GB GPU (~4GB VRAM)
16
-
17
- ## 🔧 Available Tools
18
-
19
- | Tool | Description |
20
- |------|-------------|
21
- | `file_read` | Read files from the filesystem |
22
- | `file_write` | Write content to files |
23
- | `git_status` | Check git repository status |
24
- | `web_search` | Search the web for information |
25
- | `run_command` | Execute shell commands |
26
- | `create_directory` | Create new directories |
27
- | `list_directory` | List directory contents |
28
-
29
- ## 🏃‍♂️ Quick Start
30
-
31
- ### Local Development
32
-
33
- ```bash
34
- # Clone the repository
35
- git clone https://github.com/your-repo/stack-2.9.git
36
- cd stack-2.9/space
37
-
38
- # Install dependencies
39
- pip install -r requirements.txt
40
-
41
- # Run the demo
42
- python app.py --share
43
- ```
44
-
45
- ### HuggingFace Spaces
46
-
47
- 1. Create a new Space on [HuggingFace](https://huggingface.co/spaces)
48
- 2. Select "Gradio" as the SDK
49
- 3. Upload the files from this directory:
50
- - `app.py`
51
- - `requirements.txt`
52
- - `README.md`
53
- 4. The model will load automatically on startup
54
-
55
- ## 💻 Usage
56
-
57
- ### Example Prompts
58
-
59
- ```
60
- Hello! What can you help me with?
61
- Check git status of this repository
62
- Search for best practices for Python async programming
63
- List the files in the current directory
64
- Write a simple Python function to calculate fibonacci
65
- How do I use Git to create a new branch?
66
- What's your memory of our conversation?
67
- ```
68
-
69
- ### Python API
70
-
71
- ```python
72
- from app import StackModel, memory
73
-
74
- # Initialize model
75
- model = StackModel()
76
- model.load()
77
-
78
- # Generate response
79
- response = model.generate("Write a hello world in Python")
80
- print(response)
81
-
82
- # Check memory stats
83
- print(memory.get_stats())
84
- ```
85
-
86
- ## 🔐 Environment Variables
87
-
88
- - `HF_TOKEN` - Your HuggingFace token for private models (optional)
89
- - `MODEL_ID` - Override default model (default: Qwen/Qwen2.5-Coder-7B-Instruct)
90
-
91
- ## 📊 Memory System
92
-
93
- Stack 2.9 includes a pattern memory system that:
94
-
95
- 1. **Tracks Interactions** - Records every user-assistant exchange
96
- 2. **Learns Patterns** - Identifies frequently used tools
97
- 3. **Stores Code** - Saves useful code snippets
98
- 4. **Adapts Behavior** - Uses learned context to improve responses
99
-
100
- ## 🛠️ Tech Stack
101
-
102
- - **Model**: Qwen2.5-Coder-7B-Instruct
103
- - **Quantization**: 4-bit (bitsandbytes)
104
- - **Framework**: Gradio 4.0+
105
- - **Backend**: Transformers + Accelerate
106
- - **GPU**: 16GB VRAM recommended
107
-
108
- ## 📝 License
109
-
110
- MIT License - see LICENSE file for details.
111
-
112
- ## 🙏 Acknowledgments
113
-
114
- - [Qwen](https://github.com/QwenLM/Qwen) - Base model
115
- - [HuggingFace](https://huggingface.co/) - Spaces hosting
116
- - [Gradio](https://gradio.app/) - UI framework
117
-
118
- ---
119
-
120
- <div align="center">
121
-
122
- Made with ❤️ by Stack 2.9
123
-
124
- </div>
space/app.py DELETED
@@ -1,600 +0,0 @@
1
- """
2
- Stack 2.9 - Pattern-Based AI Coding Assistant
3
- HuggingFace Spaces Demo
4
-
5
- A Gradio interface for Stack 2.9 powered by Qwen2.5-Coder-7B
6
- with tool integration and pattern memory.
7
- """
8
-
9
- import os
10
- import json
11
- import time
12
- from datetime import datetime
13
- from typing import List, Dict, Optional
14
- import gradio as gr
15
-
16
- # ============================================================
17
- # Pattern Memory System
18
- # ============================================================
19
-
20
- class SelfEvolutionMemory:
21
- """Simple in-memory pattern memory system for demo purposes."""
22
-
23
- def __init__(self):
24
- self.conversations = []
25
- self.learned_patterns = {}
26
- self.code_snippets = []
27
- self.preferences = {}
28
- self.interaction_count = 0
29
-
30
- def add_interaction(self, user_input: str, assistant_response: str, tools_used: List[str] = None):
31
- """Record an interaction for learning."""
32
- self.interaction_count += 1
33
- interaction = {
34
- "timestamp": datetime.now().isoformat(),
35
- "user_input": user_input,
36
- "assistant_response": assistant_response,
37
- "tools_used": tools_used or [],
38
- "interaction_id": self.interaction_count
39
- }
40
- self.conversations.append(interaction)
41
-
42
- # Extract patterns from the interaction
43
- self._learn_from_interaction(user_input, assistant_response, tools_used or [])
44
-
45
- def _learn_from_interaction(self, user_input: str, response: str, tools: List[str]):
46
- """Learn patterns from interactions."""
47
- # Track tool usage patterns
48
- for tool in tools:
49
- if tool not in self.learned_patterns:
50
- self.learned_patterns[tool] = {"count": 0, "contexts": []}
51
- self.learned_patterns[tool]["count"] += 1
52
- self.learned_patterns[tool]["contexts"].append(user_input[:100])
53
-
54
- # Extract code snippets if present
55
- if "```" in response:
56
- self.code_snippets.append({
57
- "timestamp": datetime.now().isoformat(),
58
- "snippet": response
59
- })
60
-
61
- def get_context(self) -> str:
62
- """Get accumulated context for the model."""
63
- context_parts = [f"## Pattern Memory ({self.interaction_count} interactions)"]
64
-
65
- if self.learned_patterns:
66
- context_parts.append("\n### Tool Usage Patterns:")
67
- for tool, data in sorted(self.learned_patterns.items(), key=lambda x: x[1]["count"], reverse=True)[:5]:
68
- context_parts.append(f"- {tool}: used {data['count']} times")
69
-
70
- if self.code_snippets:
71
- context_parts.append(f"\n### Learned {len(self.code_snippets)} code patterns")
72
-
73
- return "\n".join(context_parts)
74
-
75
- def get_stats(self) -> Dict:
76
- """Get memory statistics."""
77
- return {
78
- "total_interactions": self.interaction_count,
79
- "tool_patterns": len(self.learned_patterns),
80
- "code_snippets": len(self.code_snippets),
81
- "recent_tools": [t for t in self.learned_patterns.keys()][:5]
82
- }
83
-
84
-
85
- # Global memory instance
86
- memory = SelfEvolutionMemory()
87
-
88
- # ============================================================
89
- # Tool System
90
- # ============================================================
91
-
92
- class Tool:
93
- """Base tool class."""
94
-
95
- def __init__(self, name: str, description: str, func):
96
- self.name = name
97
- self.description = description
98
- self.func = func
99
-
100
- async def execute(self, *args, **kwargs):
101
- return await self.func(*args, **kwargs)
102
-
103
-
104
- # Tool implementations (simplified for demo)
105
- async def tool_file_read(path: str) -> str:
106
- """Read a file."""
107
- try:
108
- with open(path, 'r') as f:
109
- return f.read()[:5000] # Limit output
110
- except FileNotFoundError:
111
- return f"File not found: {path}"
112
- except Exception as e:
113
- return f"Error reading file: {str(e)}"
114
-
115
-
116
- async def tool_file_write(path: str, content: str) -> str:
117
- """Write to a file."""
118
- try:
119
- os.makedirs(os.path.dirname(path) if os.path.dirname(path) else ".", exist_ok=True)
120
- with open(path, 'w') as f:
121
- f.write(content)
122
- return f"Successfully wrote to {path}"
123
- except Exception as e:
124
- return f"Error writing file: {str(e)}"
125
-
126
-
127
- async def tool_git_status() -> str:
128
- """Get git status."""
129
- import subprocess
130
- try:
131
- result = subprocess.run(["git", "status", "--short"], capture_output=True, text=True, cwd=os.getcwd())
132
- return result.stdout or "No changes"
133
- except Exception as e:
134
- return f"Git error: {str(e)}"
135
-
136
-
137
- async def tool_web_search(query: str) -> str:
138
- """Search the web."""
139
- from urllib.parse import quote
140
- # Return a demo response since we can't make actual API calls
141
- return f"🔍 Search results for '{query}':\n\n1. [Result 1] - Description here\n2. [Result 2] - Description here\n3. [Result 3] - Description here\n\n(Install brave-search to enable real search)"
142
-
143
-
144
- async def tool_run_command(cmd: str) -> str:
145
- """Run a shell command."""
146
- import subprocess
147
- try:
148
- result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=30)
149
- return f"Output:\n{result.stdout}\n\nErrors:\n{result.stderr}" if result.stderr else result.stdout
150
- except Exception as e:
151
- return f"Command error: {str(e)}"
152
-
153
-
154
- async def tool_create_directory(path: str) -> str:
155
- """Create a directory."""
156
- try:
157
- os.makedirs(path, exist_ok=True)
158
- return f"Directory created: {path}"
159
- except Exception as e:
160
- return f"Error: {str(e)}"
161
-
162
-
163
- async def tool_list_directory(path: str = ".") -> str:
164
- """List directory contents."""
165
- try:
166
- items = os.listdir(path)
167
- return "\n".join([f"📁 {i}/" if os.path.isdir(os.path.join(path, i)) else f"📄 {i}" for i in items[:50]])
168
- except Exception as e:
169
- return f"Error: {str(e)}"
170
-
171
-
172
- # Register tools
173
- TOOLS = {
174
- "file_read": Tool("file_read", "Read a file from the filesystem", tool_file_read),
175
- "file_write": Tool("file_write", "Write content to a file", tool_file_write),
176
- "git_status": Tool("git_status", "Check git repository status", tool_git_status),
177
- "web_search": Tool("web_search", "Search the web for information", tool_web_search),
178
- "run_command": Tool("run_command", "Execute a shell command", tool_run_command),
179
- "create_directory": Tool("create_directory", "Create a new directory", tool_create_directory),
180
- "list_directory": Tool("list_directory", "List files in a directory", tool_list_directory),
181
- }
182
-
183
-
184
- def get_tool_descriptions() -> str:
185
- """Get descriptions of all available tools."""
186
- return "\n".join([f"- **{t.name}**: {t.description}" for t in TOOLS.values()])
187
-
188
-
189
- # ============================================================
190
- # Model Interface
191
- # ============================================================
192
-
193
- class StackModel:
194
- """Stack 2.9 model interface using transformers."""
195
-
196
- def __init__(self, model_id: str = "Qwen/Qwen2.5-Coder-7B-Instruct"):
197
- self.model_id = model_id
198
- self.model = None
199
- self.tokenizer = None
200
- self.pipeline = None
201
-
202
- def load(self):
203
- """Load the model with 4-bit quantization for HF Spaces."""
204
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
205
- import torch
206
-
207
- print(f"Loading {self.model_id}...")
208
-
209
- # 4-bit quantization config for 16GB GPU
210
- bnb_config = BitsAndBytesConfig(
211
- load_in_4bit=True,
212
- bnb_4bit_compute_dtype=torch.float16,
213
- bnb_4bit_use_double_quant=True,
214
- bnb_4bit_quant_type="nf4"
215
- )
216
-
217
- # Load tokenizer
218
- self.tokenizer = AutoTokenizer.from_pretrained(
219
- self.model_id,
220
- trust_remote_code=True
221
- )
222
-
223
- # Load model with quantization
224
- self.model = AutoModelForCausalLM.from_pretrained(
225
- self.model_id,
226
- quantization_config=bnb_config,
227
- device_map="auto",
228
- trust_remote_code=True
229
- )
230
-
231
- print("Model loaded successfully!")
232
-
233
- def generate(self, prompt: str, max_tokens: int = 512, temperature: float = 0.7) -> str:
234
- """Generate a response."""
235
- if not self.tokenizer:
236
- return "Model not loaded. Please wait for initialization."
237
-
238
- # Build the prompt with system and tools
239
- system_prompt = f"""You are Stack 2.9 - a pattern-based AI coding assistant.
240
-
241
- ## Available Tools
242
- {get_tool_descriptions()}
243
-
244
- ## Your Capabilities
245
- - Write, read, and execute code
246
- - Use git for version control
247
- - Search the web for information
248
- - Create and manage files
249
- - Execute shell commands
250
-
251
- ## Self-Evolution
252
- You learn from each interaction. After responding, summarize what tools you used.
253
-
254
- {memory.get_context()}
255
-
256
- ## Instructions
257
- 1. Be helpful and concise
258
- 2. Use tools when needed
259
- 3. Learn from the conversation
260
- 4. Provide code examples when relevant
261
-
262
- Now respond to the user:"""
263
-
264
- full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nAssistant:"
265
-
266
- # Tokenize
267
- inputs = self.tokenizer(full_prompt, return_tensors="pt").to(self.model.device)
268
-
269
- # Generate
270
- outputs = self.model.generate(
271
- **inputs,
272
- max_new_tokens=max_tokens,
273
- temperature=temperature,
274
- do_sample=True,
275
- top_p=0.9,
276
- repetition_penalty=1.1
277
- )
278
-
279
- # Decode
280
- response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
281
-
282
- # Extract just the response part
283
- if "Assistant:" in response:
284
- response = response.split("Assistant:")[-1].strip()
285
-
286
- return response
287
-
288
- def generate_streaming(self, prompt: str, max_tokens: int = 512):
289
- """Generate with streaming (yields tokens)."""
290
- if not self.tokenizer:
291
- yield "Model not loaded. Please wait for initialization."
292
- return
293
-
294
- system_prompt = f"""You are Stack 2.9 - a pattern-based AI coding assistant.
295
-
296
- ## Available Tools
297
- {get_tool_descriptions()}
298
-
299
- ## Self-Evolution Memory
300
- {memory.get_context()}
301
-
302
- Now respond to the user:"""
303
-
304
- full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nAssistant:"
305
-
306
- inputs = self.tokenizer(full_prompt, return_tensors="pt").to(self.model.device)
307
-
308
- # Generate token by token
309
- from transformers import GenerationMixin
310
- from typing import Iterator
311
-
312
- generated_ids = inputs.input_ids
313
-
314
- for _ in range(max_tokens):
315
- with torch.no_grad():
316
- outputs = self.model(generated_ids)
317
- next_token_logits = outputs.logits[:, -1, :]
318
-
319
- # Apply temperature
320
- next_token_logits = next_token_logits / 0.7
321
-
322
- # Sample
323
- probs = torch.softmax(next_token_logits, dim=-1)
324
- next_token = torch.multinomial(probs, num_samples=1)
325
-
326
- generated_ids = torch.cat([generated_ids, next_token], dim=-1)
327
-
328
- # Decode and yield
329
- token_str = self.tokenizer.decode(next_token[0], skip_special_tokens=True)
330
- yield token_str
331
-
332
- # Stop on EOS
333
- if next_token.item() == self.tokenizer.eos_token_id:
334
- break
335
-
336
-
337
- # Global model instance
338
- model = None
339
-
340
-
341
- def initialize_model():
342
- """Initialize the model on startup."""
343
- global model
344
- try:
345
- model = StackModel()
346
- model.load()
347
- return model
348
- except Exception as e:
349
- print(f"Failed to load model: {e}")
350
- return None
351
-
352
-
353
- # ============================================================
354
- # Gradio Interface
355
- # ============================================================
356
-
357
- def format_tools_used(tools_used: List[str]) -> str:
358
- """Format the tools used for display."""
359
- if not tools_used:
360
- return ""
361
- return f"\n\n🔧 **Tools Used**: {', '.join(tools_used)}"
362
-
363
-
364
- def chat_response(message: str, history: List[List[str]]) -> tuple:
365
- """Process a chat message and return response."""
366
- global model, memory
367
-
368
- if model is None or model.model is None:
369
- return "⏳ Model is loading. Please wait...", history + [[message, "⏳ Model is loading. Please wait..."]]
370
-
371
- # Track tools used
372
- tools_used = []
373
-
374
- # Check if we need to use tools based on the message
375
- message_lower = message.lower()
376
-
377
- if any(kw in message_lower for kw in ['git status', 'git']):
378
- tools_used.append("git_status")
379
- if any(kw in message_lower for kw in ['search', 'find', 'look up']):
380
- tools_used.append("web_search")
381
- if any(kw in message_lower for kw in ['list files', 'directory', 'ls']):
382
- tools_used.append("list_directory")
383
- if any(kw in message_lower for kw in ['run ', 'execute', 'command']):
384
- tools_used.append("run_command")
385
-
386
- # Generate response
387
- try:
388
- response = model.generate(message, max_tokens=512)
389
- except Exception as e:
390
- response = f"I encountered an error: {str(e)}"
391
-
392
- # Add tools used to response
393
- response += format_tools_used(tools_used)
394
-
395
- # Record in memory
396
- memory.add_interaction(message, response, tools_used)
397
-
398
- return response
399
-
400
-
401
- def chat_response_stream(message: str, history: List[List[str]]) -> Generator:
402
- """Process a chat message with streaming."""
403
- global model, memory
404
-
405
- if model is None or model.model is None:
406
- yield "⏳ Model is loading. Please wait..."
407
- return
408
-
409
- full_response = ""
410
- tools_used = []
411
-
412
- message_lower = message.lower()
413
- if any(kw in message_lower for kw in ['git status', 'git']):
414
- tools_used.append("git_status")
415
- if any(kw in message_lower for kw in ['search', 'find']):
416
- tools_used.append("web_search")
417
- if any(kw in message_lower for kw in ['list', 'directory']):
418
- tools_used.append("list_directory")
419
-
420
- # Stream the response
421
- for token in model.generate_streaming(message, max_tokens=256):
422
- full_response += token
423
- yield full_response
424
-
425
- # Add tools used
426
- if tools_used:
427
- full_response += format_tools_used(tools_used)
428
- yield full_response
429
-
430
- # Record in memory
431
- memory.add_interaction(message, full_response, tools_used)
432
-
433
-
434
- # Example prompts for the UI
435
- EXAMPLE_PROMPTS = [
436
- "Hello! What can you help me with?",
437
- "Check git status of this repository",
438
- "Search for best practices for Python async programming",
439
- "List the files in the current directory",
440
- "Write a simple Python function to calculate fibonacci",
441
- "How do I use Git to create a new branch?",
442
- "What's your memory of our conversation?",
443
- ]
444
-
445
-
446
- def create_gradio_app():
447
- """Create the Gradio interface."""
448
-
449
- with gr.Blocks(
450
- title="Stack 2.9 - Pattern-Based AI Coding Assistant",
451
- theme=gr.themes.Soft(
452
- primary_color="#6366f1",
453
- secondary_color="#818cf8",
454
- tertiary_color="#a5b4fc"
455
- )
456
- ) as app:
457
-
458
- # Header
459
- gr.Markdown("""
460
- # 🚀 Stack 2.9 - Pattern-Based AI Coding Assistant
461
-
462
- Powered by **Qwen2.5-Coder-7B** with 4-bit quantization
463
-
464
- ---
465
- """)
466
-
467
- # Memory stats display
468
- with gr.Row():
469
- with gr.Column(scale=1):
470
- stats_display = gr.Markdown(
471
- "📊 **Memory Stats**\n\n- Interactions: 0\n- Tools learned: 0\n- Code patterns: 0",
472
- elem_id="stats"
473
- )
474
- with gr.Column(scale=3):
475
- pass # Spacer
476
-
477
- # Chat interface
478
- chatbot = gr.Chatbot(
479
- height=500,
480
- show_copy_button=True,
- bubble_full_width=False
- )
-
- with gr.Row():
- msg = gr.Textbox(
- label="Message",
- placeholder="Ask me anything...",
- scale=4,
- lines=3
- )
- submit_btn = gr.Button("Send", variant="primary", scale=1)
-
- # Clear button
- with gr.Row():
- clear_btn = gr.Button("🗑️ Clear Chat")
-
- # Example prompts
- gr.Examples(
- examples=EXAMPLE_PROMPTS,
- inputs=msg,
- label="Example Prompts"
- )
-
- # Memory visualization
- with gr.Accordion("🧠 Self-Evolution Memory", open=False):
- memory_display = gr.Textbox(
- label="Memory Content",
- lines=10,
- interactive=False
- )
-
- # Functions
- def respond(message, history):
- response = chat_response(message, history)
- history.append([message, response])
- return "", history
-
- def update_stats():
- stats = memory.get_stats()
- return f"""📊 **Memory Stats**
-
- - **Interactions**: {stats['total_interactions']}
- - **Tool Patterns**: {stats['tool_patterns']}
- - **Code Snippets**: {stats['code_snippets']}
-
- **Recent Tools**: {', '.join(stats['recent_tools']) if stats['recent_tools'] else 'None'}"""
-
- def update_memory():
- return memory.get_context()
-
- # Button click handlers
- submit_btn.click(respond, [msg, chatbot], [msg, chatbot], api_name="send")
- msg.submit(respond, [msg, chatbot], [msg, chatbot], api_name="send")
-
- def clear_chat():
- return [], ""
-
- clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])
-
- # Update stats periodically
- chatbot.change(update_stats, outputs=[stats_display])
- chatbot.change(update_memory, outputs=[memory_display])
-
- # Footer
- gr.Markdown("""
- ---
- ### About Stack 2.9
-
- Stack 2.9 is a pattern-based AI coding assistant that:
- - 🔍 Uses **Qwen2.5-Coder-7B** (4-bit, ~4GB VRAM)
- - 🛠️ Integrates **7 tools** (file, git, web, search, shell)
- - 🧠 Remembers interactions and learns patterns
- - ⚡ Provides fast, streaming responses
-
- Deployed on **HuggingFace Spaces** with Gradio
- """)
-
- return app
-
-
- # ============================================================
- # Main Entry Point
- # ============================================================
-
- if __name__ == "__main__":
- import argparse
-
- parser = argparse.ArgumentParser(description="Stack 2.9 - HuggingFace Spaces Demo")
- parser.add_argument("--share", action="store_true", help="Create a public share link")
- parser.add_argument("--port", type=int, default=7860, help="Port to run on")
- parser.add_argument("--model", type=str, default="Qwen/Qwen2.5-Coder-7B-Instruct", help="Model ID")
- args = parser.parse_args()
-
- print("=" * 50)
- print("🚀 Stack 2.9 - Pattern-Based AI Coding Assistant")
- print("=" * 50)
- print(f"Model: {args.model}")
- print("Loading model...")
-
- # Initialize model in a thread
- import threading
-
- def load_model_thread():
- global model
- model = initialize_model()
-
- loader_thread = threading.Thread(target=load_model_thread)
- loader_thread.start()
-
- # Create and launch app
- app = create_gradio_app()
-
- print(f"\n🚀 Launching Gradio on port {args.port}...")
- print("📝 Note: Model loads in background. Chat will work once loaded.\n")
-
- app.launch(
- server_name="0.0.0.0",
- server_port=args.port,
- share=args.share
- )
 
space/requirements.txt DELETED
@@ -1,24 +0,0 @@
- # Stack 2.9 - HuggingFace Spaces Demo
- # Requirements for Gradio interface with Qwen2.5-Coder-7B
-
- # Core Gradio
- gradio>=4.0.0
-
- # Transformers and model loading
- transformers>=4.36.0
- torch>=2.0.0
-
- # Model optimization
- accelerate>=0.24.0
- bitsandbytes>=0.41.0
-
- # Additional utilities
- huggingface-hub>=0.19.0
- safetensors>=0.4.0
-
- # Optional: For better web search
- # brave-search>=0.1.0
-
- # Optional: For web fetching
- # beautifulsoup4>=4.12.0
- # lxml>=4.9.0
 
{stack-2.9-cli → src/cli}/__init__.py RENAMED
File without changes
{stack_cli → src/cli}/agent.py RENAMED
File without changes
{stack_cli → src/cli}/cli.py RENAMED
File without changes
{stack_cli → src/cli}/context.py RENAMED
File without changes
{stack-2.9-cli → src/cli}/main.py RENAMED
File without changes
{stack_cli → src/cli}/pyproject.toml RENAMED
File without changes
{stack_cli → src/cli}/tools.py RENAMED
File without changes
stack-2.9-deploy/Dockerfile CHANGED
@@ -1,107 +1,37 @@
1
- # Multi-stage production Docker image for Stack 2.9
2
- # Stack 2.9 LLM Inference Server with vLLM
3
 
4
- ARG PYTHON_VERSION=3.10
5
- ARG VLLM_VERSION=0.6.3
6
- ARG CUDA_VERSION=12.1.0
7
- ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
8
-
9
- # Stage 1: Builder
10
- FROM ${BASE_IMAGE} AS builder
11
-
12
- ARG PYTHON_VERSION
13
- ARG VLLM_VERSION
14
-
15
- # Set environment variables
16
- ENV DEBIAN_FRONTEND=noninteractive \
17
- TZ=UTC \
18
- PYTHONUNBUFFERED=1 \
19
- PIP_NO_CACHE_DIR=1
20
-
21
- # Install system dependencies and Python
22
- RUN apt-get update && apt-get install -y \
23
- python${PYTHON_VERSION} \
24
- python${PYTHON_VERSION}-dev \
25
- python3-pip \
26
- git \
27
- curl \
28
- wget \
29
- build-essential \
30
- cmake \
31
- && rm -rf /var/lib/apt/lists/*
32
-
33
- # Install PyTorch with CUDA 12.1 support
34
- RUN pip3 install --upgrade pip setuptools wheel
35
- RUN pip3 install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121
36
-
37
- # Install vLLM
38
- RUN pip3 install vllm==${VLLM_VERSION} "vllm[attention]"
39
-
40
- # Install additional dependencies
41
- RUN pip3 install \
42
- fastapi==0.111.0 \
43
- uvicorn[standard]==0.30.1 \
44
- transformers==4.41.2 \
45
- accelerate==0.30.1 \
46
- huggingface-hub==0.23.0 \
47
- sentencepiece==0.2.0 \
48
- protobuf==3.20.3
49
-
50
- # Stage 2: Runtime
51
- FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
52
-
53
- ARG PYTHON_VERSION
54
- ARG VLLM_VERSION
55
 
56
  # Set environment variables
57
- ENV DEBIAN_FRONTEND=noninteractive \
58
- TZ=UTC \
59
- PYTHONUNBUFFERED=1 \
60
- PIP_NO_CACHE_DIR=1 \
61
- NVIDIA_VISIBLE_DEVICES=all \
62
- NVIDIA_DRIVER_CAPABILITIES=compute,utility \
63
- VLLM_USE_MODELSCOPE=false \
64
- HF_HUB_DISABLE_TELEMETRY=1 \
65
- HF_HUB_ENABLE_HF_TRANSFER=1
66
 
67
- # Install Python and minimal dependencies
68
  RUN apt-get update && apt-get install -y \
69
- python${PYTHON_VERSION} \
70
- python${PYTHON_VERSION}-dev \
71
- python3-pip \
72
  git \
73
- curl \
74
  wget \
75
- libgomp1 \
76
  && rm -rf /var/lib/apt/lists/*
77
 
78
- # Copy Python packages from builder
79
- COPY --from=builder /usr/local/lib/python${PYTHON_VERSION}/dist-packages /usr/local/lib/python${PYTHON_VERSION}/dist-packages
80
- COPY --from=builder /usr/local/bin /usr/local/bin
81
-
82
- # Create non-root user
83
- RUN groupadd -r vllm && useradd -r -g vllm -d /home/vllm -m vllm
84
-
85
- # Set working directory
86
- WORKDIR /app
87
 
88
- # Copy application code
89
- COPY --chown=vllm:vllm app.py .
90
- COPY --chown=vllm:vllm requirements.txt .
91
- COPY --chown=vllm:vllm config.yaml .
92
 
93
- # Create model cache directory
94
- RUN mkdir -p /home/vllm/.cache/huggingface && chown -R vllm:vllm /home/vllm/.cache
95
 
96
- # Expose port for vLLM server
97
- EXPOSE 8000
98
 
99
- # Health check
100
- HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
101
- CMD curl -f http://localhost:8000/health || exit 1
102
 
103
- # Switch to non-root user
104
- USER vllm
 
105
 
106
- # Run vLLM server
107
- CMD ["python3", "app.py"]
 
1
+ # Stack 2.9 HuggingFace Spaces Dockerfile
2
+ # Optimized for 16GB GPU with 4-bit quantization
3
 
4
+ FROM python:3.10-slim
5
 
6
  # Set environment variables
7
+ ENV PYTHONUNBUFFERED=1
8
+ ENV TRANSFORMERS_CACHE=/workspace/.cache/huggingface
9
+ ENV HF_HOME=/workspace/.cache/huggingface
 
 
 
 
 
 
10
 
11
+ # Install system dependencies
12
  RUN apt-get update && apt-get install -y \
 
 
 
13
  git \
 
14
  wget \
 
15
  && rm -rf /var/lib/apt/lists/*
16
 
17
+ # Create workspace directory
18
+ WORKDIR /workspace
 
 
 
 
 
 
 
19
 
20
+ # Copy requirements first for better caching
21
+ COPY requirements.txt .
 
 
22
 
23
+ # Install Python dependencies
24
+ RUN pip install --no-cache-dir -r requirements.txt
25
 
26
+ # Copy application files
27
+ COPY . .
28
 
29
+ # Expose Gradio port
30
+ EXPOSE 7860
 
31
 
32
+ # Create startup script
33
+ RUN echo '#!/bin/bash\necho "🚀 Starting Stack 2.9..."\npython app.py --port 7860 --share' > /start.sh
34
+ RUN chmod +x /start.sh
35
 
36
+ # Launch command
37
+ CMD ["/start.sh"]
stack-2.9-deploy/README.md CHANGED
@@ -1,346 +1,124 @@
1
- # Stack 2.9 Deployment Infrastructure
2
 
3
- Turnkey deployment configurations for Stack 2.9 LLM inference server.
4
 
5
- ## 📋 Prerequisites
 
 
6
 
7
- - **Linux/macOS** shell environment
8
- - For local deployment: **Docker** + **NVIDIA GPU** (optional but recommended)
9
- - For cloud: **runpodctl** or **vastai** CLI installed
10
- - **chmod +x** may be required on shell scripts
11
 
12
- ## 🖥️ System Requirements
 
 
 
 
13
 
14
- Stack 2.9 deployment requires appropriate hardware depending on model size:
15
 
16
- | Configuration | Minimum | Recommended | Production |
17
- |---------------|---------|-------------|------------|
18
- | **GPU VRAM** | 8GB | 24GB | 40-80GB (A100/H100) |
19
- | **RAM** | 16GB | 32GB | 64GB+ |
20
- | **Disk** | 20GB free | 50GB free | 100GB+ (NVMe) |
21
- | **CUDA** | 11.8 | 12.1 | 12.1+ |
22
- | **Models** | 7B quantized | 32B quantized | 70B+ quantized |
 
 
23
 
24
- **Notes:**
25
- - CPU-only mode is possible but extremely slow (not recommended for production)
26
- - AWQ/GPTQ quantization reduces VRAM requirements by ~50%
27
- - Multi-GPU (tensor parallelism) supported via `TENSOR_PARALLEL_SIZE`
28
 
29
- ## 🧪 Validate Setup
30
-
31
- ## 🧪 Validate Setup
32
-
33
- Before deploying, run the validation script to ensure everything is ready:
34
-
35
- ```bash
36
- ./validate.sh
37
- ```
38
- This checks Docker, GPU, and all required files.
39
-
40
- ## 🚀 Quick Start
41
-
42
- ### Local Deployment (Docker Compose)
43
-
44
- ```bash
45
- # Ensure deploy.sh is executable
46
- chmod +x deploy.sh validate.sh
47
-
48
- # Deploy
49
- ./deploy.sh local --model TheBloke/Llama-2-7B-Chat-AWQ
50
- ```
51
-
52
- The server will start at `http://localhost:8000`
53
-
54
- ### Cloud Deployments
55
-
56
- ```bash
57
- # RunPod
58
- ./deploy.sh runpod --gpu A100-40GB
59
-
60
- # Vast.ai
61
- ./deploy.sh vastai
62
-
63
- # Kubernetes
64
- ./deploy.sh kubernetes --namespace inference
65
- ```
66
-
67
- ---
68
-
69
- ## 📦 What's Included
70
-
71
- ```
72
- stack-2.9-deploy/
73
- ├── Dockerfile # Multi-stage production image
74
- ├── docker-compose.yaml # Local orchestration
75
- ├── deploy.sh # One-command deployment script
76
- ├── runpod-template.json # RunPod.io template
77
- ├── vastai-template.json # Vast.ai template
78
- ├── kubernetes/ # K8s manifests
79
- │ ├── deployment.yaml # GPU-enabled deployment
80
- │ ├── service.yaml # LoadBalancer service
81
- │ ├── pvc.yaml # Model cache volume
82
- │ ├── hpa.yaml # Autoscaling configuration
83
- │ └── secrets.yaml # Secrets template
84
- ├── app.py # vLLM server wrapper
85
- └── README.md # This file
86
- ```
87
-
88
- ---
89
-
90
- ## 🐳 Docker Image
91
-
92
- **Base:** `nvidia/cuda:12.1-runtime-ubuntu22.04`
93
- **Python:** 3.10
94
- **vLLM:** 0.6.3
95
- **CUDA:** 12.1
96
-
97
- ### Features:
98
- - Multi-stage build for minimal footprint
99
- - Non-root user (`vllm`)
100
- - Health checks
101
- - CUDA 12.1 runtime
102
- - Model cache persistence
103
- - AWQ 4-bit quantization support
104
-
105
- ---
106
-
107
- ## 🔧 Environment Variables
108
-
109
- | Variable | Default | Description |
110
- |----------|---------|-------------|
111
- | `MODEL_ID` | `TheBloke/Llama-2-7B-Chat-AWQ` | Hugging Face model ID |
112
- | `HUGGING_FACE_TOKEN` | (empty) | HF token for gated models |
113
- | `QUANTIZATION` | `awq` | Quantization method |
114
- | `TENSOR_PARALLEL_SIZE` | `1` | Number of GPUs |
115
- | `GPU_MEMORY_UTILIZATION` | `0.9` | GPU memory fraction |
116
- | `MAX_MODEL_LEN` | `4096` | Max sequence length |
117
- | `MAX_NUM_SEQS` | `64` | Max batch size |
118
- | `PORT` | `8000` | Server port |
119
-
120
- ---
121
-
122
- ## 🌐 API Endpoints
123
-
124
- Stack 2.9 provides OpenAI-compatible endpoints:
125
-
126
- - `POST /v1/completions` - Text completion
127
- - `POST /v1/chat/completions` - Chat completion
128
- - `GET /health` - Health check
129
- - `GET /metrics` - Prometheus metrics
130
- - `GET /docs` - Interactive API docs
131
-
132
- ### Example Usage
133
-
134
- ```bash
135
- # Chat completion
136
- curl http://localhost:8000/v1/chat/completions \
137
- -H "Content-Type: application/json" \
138
- -d '{
139
- "model": "stack-2.9",
140
- "messages": [{"role": "user", "content": "Hello!"}],
141
- "max_tokens": 100
142
- }'
143
- ```
144
-
145
- ---
146
-
147
- ## ☁️ Platform-Specific Notes
148
-
149
- ### Local (Docker Compose)
150
-
151
- ```bash
152
- # Build and start
153
- ./deploy.sh local --model <model-id>
154
-
155
- # View logs
156
- docker-compose logs -f stack-2.9
157
-
158
- # Stop
159
- docker-compose down
160
- ```
161
-
162
- **Requirements:**
163
- - Docker 20.10+
164
- - Docker Compose v2
165
- - NVIDIA GPU (recommended) with CUDA 12.x drivers
166
-
167
- ---
168
-
169
- ### RunPod
170
-
171
- 1. Authenticate: `runpodctl login`
172
- 2. Run: `./deploy.sh runpod --gpu A100-40GB`
173
- 3. Provide your Docker registry
174
- 4. Deploy from the created template on RunPod.io
175
-
176
- **Recommended GPUs:**
177
- - A100 40GB (default)
178
- - A100 80GB
179
- - H100 80GB
180
-
181
- **Auto-sleep:** Enabled after 30 minutes of inactivity
182
-
183
- ---
184
-
185
- ### Vast.ai
186
-
187
- 1. Install vastai CLI
188
- 2. Run: `./deploy.sh vastai`
189
- 3. Provide your Docker registry
190
- 4. Launch via template or CLI
191
-
192
- **Recommended Instances:**
193
- - RTX 4090 (24GB) - $0.30-0.50/hr
194
- - RTX 6000 Ada (48GB) - $0.80-1.20/hr
195
- - A100 40GB - $0.90-1.50/hr
196
-
197
- **SSH Access:** Available on forwarded port 2222
198
-
199
- ---
200
-
201
- ### Kubernetes
202
-
203
- #### Prerequisites:
204
- - kubectl configured
205
- - GPU-enabled cluster (NVIDIA GPUs with device plugin)
206
- - Storage class with ReadWriteMany capability
207
-
208
- #### Deployment:
209
 
210
  ```bash
211
- # Create namespace
212
- kubectl apply -f kubernetes/secrets.yaml
 
213
 
214
- # Set your HF token
215
- kubectl create secret generic stack-2.9-secrets \
216
- --from-literal=huggingface-token='YOUR_TOKEN' \
217
- -n stack-2.9
218
 
219
- # Deploy
220
- ./deploy.sh kubernetes --namespace stack-2.9
221
-
222
- # Or manually:
223
- kubectl apply -f kubernetes/
224
  ```
225
 
226
- **Check status:**
227
- ```bash
228
- kubectl get pods,svc,pvc,hpa -n stack-2.9
229
- kubectl logs -f deployment/stack-2.9 -n stack-2-9
230
- ```
231
 
232
- **Get service URL:**
233
- ```bash
234
- kubectl get svc stack-2.9 -n stack-2-9 -o wide
235
- ```
236
-
237
- ---
 
238
 
239
- ## ⚙️ Customization
240
 
241
- ### Different Model
242
 
243
- ```bash
244
- ./deploy.sh local --model mistralai/Mistral-7B-Instruct-v0.2
245
  ```
246
-
247
- Supported formats:
248
- - AWQ quantized: `TheBloke/*-AWQ`
249
- - GPTQ quantized: `TheBloke/*-GPTQ`
250
- - Full precision: Any Hugging Face model
251
-
252
- ### GPU Configuration
253
-
254
- Edit `docker-compose.yaml` or K8s deployment:
255
-
256
- ```yaml
257
- resources:
258
- limits:
259
- nvidia.com/gpu: 2 # Multi-GPU
260
- requests:
261
- memory: "24Gi"
262
- cpu: "8"
263
  ```
264
 
265
- ---
266
 
267
- ## 🧪 Testing
 
268
 
269
- ```bash
270
- # Health check
271
- curl http://localhost:8000/health
272
 
273
- # API docs
274
- open http://localhost:8000/docs
 
275
 
276
- # Test inference
277
- curl http://localhost:8000/v1/completions \
278
- -H "Content-Type: application/json" \
279
- -d '{"prompt": "Once upon a time", "max_tokens": 50}'
280
  ```
281
 
282
- ---
283
-
284
- ## 🐛 Troubleshooting
285
-
286
- ### GPU not detected
287
- ```bash
288
- # Check NVIDIA drivers
289
- nvidia-smi
290
-
291
- # Ensure NVIDIA Container Toolkit
292
- docker info | grep -i runtime
293
- ```
294
-
295
- ### Out of memory
296
- Reduce `GPU_MEMORY_UTILIZATION` to `0.7` or `0.8`
297
-
298
- ### Slow first request
299
- First request downloads/loads the model (~5-10 min for 7B). This is cached for subsequent requests.
300
 
301
- ### Model download failures
302
- Ensure `HUGGING_FACE_TOKEN` is set for gated models or large files.
303
 
304
- ---
305
 
306
- ## 📊 Monitoring
307
 
308
- ### Metrics Endpoint
309
- `GET /metrics` - Basic server metrics
 
 
310
 
311
- ### Docker Metrics
312
- ```bash
313
- docker stats stack-2.9-server
314
- ```
315
 
316
- ### Kubernetes Metrics
317
- ```bash
318
- kubectl top pod stack-2.9 -n stack-2-9
319
- kubectl get hpa -n stack-2-9
320
- ```
321
-
322
- ---
323
 
324
- ## 🔒 Security
325
-
326
- - Runs as non-root user (`vllm`)
327
- - Dropped capabilities
328
- - Read-only filesystem (except cache)
329
- - Health checks for liveness/readiness
330
- - Secrets via Kubernetes secrets or env file
331
 
332
- ---
333
 
334
- ## 📝 License
335
 
336
- Same as Stack 2.9 project license.
 
 
337
 
338
  ---
339
 
340
- ## 🤝 Support
341
-
342
- Issues: Report to Stack 2.9 repository
343
 
344
- ---
345
 
346
- **Made with ❤️ for turnkey LLM deployment**
 
1
+ # 🚀 Stack 2.9 - Pattern-Based AI Coding Assistant
2
 
3
+ A HuggingFace Spaces demo for Stack 2.9, a pattern-based AI coding assistant powered by Qwen2.5-Coder-7B.
4
 
5
+ ![License](https://img.shields.io/badge/license-MIT-blue.svg)
6
+ ![Python](https://img.shields.io/badge/python-3.10+-green.svg)
7
+ ![Gradio](https://img.shields.io/badge/Gradio-4.0+-orange.svg)
8
 
9
+ ## Features
 
 
 
10
 
11
+ - **🤖 Qwen2.5-Coder-7B** - State-of-the-art code generation model
12
+ - **🔧 7 Integrated Tools** - File operations, git, web search, shell commands
13
+ - **🧠 Pattern Memory** - Learns from each interaction
14
+ - **⚡ Fast Streaming** - Real-time token-by-token generation
15
+ - **💾 4-bit Quantization** - Runs on 16GB GPU (~4GB VRAM)
16
 
17
+ ## 🔧 Available Tools
18
 
19
+ | Tool | Description |
20
+ |------|-------------|
21
+ | `file_read` | Read files from the filesystem |
22
+ | `file_write` | Write content to files |
23
+ | `git_status` | Check git repository status |
24
+ | `web_search` | Search the web for information |
25
+ | `run_command` | Execute shell commands |
26
+ | `create_directory` | Create new directories |
27
+ | `list_directory` | List directory contents |
28
 
29
+ ## 🏃‍♂️ Quick Start
 
 
 
30
 
31
+ ### Local Development
 
 
 
 
32
 
33
  ```bash
34
+ # Clone the repository
35
+ git clone https://github.com/your-repo/stack-2.9.git
36
+ cd stack-2.9/space
37
 
38
+ # Install dependencies
39
+ pip install -r requirements.txt
 
 
40
 
41
+ # Run the demo
42
+ python app.py --share
 
 
 
43
  ```
44
 
45
+ ### HuggingFace Spaces
 
 
 
 
46
 
47
+ 1. Create a new Space on [HuggingFace](https://huggingface.co/spaces)
48
+ 2. Select "Gradio" as the SDK
49
+ 3. Upload the files from this directory:
50
+ - `app.py`
51
+ - `requirements.txt`
52
+ - `README.md`
53
+ 4. The model will load automatically on startup
54
 
55
+ ## 💻 Usage
56
 
57
+ ### Example Prompts
58
 
 
 
59
  ```
60
+ Hello! What can you help me with?
61
+ Check git status of this repository
62
+ Search for best practices for Python async programming
63
+ List the files in the current directory
64
+ Write a simple Python function to calculate fibonacci
65
+ How do I use Git to create a new branch?
66
+ What's your memory of our conversation?
 
 
 
 
 
 
 
 
 
 
67
  ```
68
 
69
+ ### Python API
70
 
71
+ ```python
72
+ from app import StackModel, memory
73
 
74
+ # Initialize model
75
+ model = StackModel()
76
+ model.load()
77
 
78
+ # Generate response
79
+ response = model.generate("Write a hello world in Python")
80
+ print(response)
81
 
82
+ # Check memory stats
83
+ print(memory.get_stats())
 
 
84
  ```
85
 
86
+ ## 🔐 Environment Variables
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
+ - `HF_TOKEN` - Your HuggingFace token for private models (optional)
89
+ - `MODEL_ID` - Override default model (default: Qwen/Qwen2.5-Coder-7B-Instruct)
90
 
91
+ ## 📊 Memory System
92
 
93
+ Stack 2.9 includes a pattern memory system that:
94
 
95
+ 1. **Tracks Interactions** - Records every user-assistant exchange
96
+ 2. **Learns Patterns** - Identifies frequently used tools
97
+ 3. **Stores Code** - Saves useful code snippets
98
+ 4. **Adapts Behavior** - Uses learned context to improve responses
99
 
100
+ ## 🛠️ Tech Stack
 
 
 
101
 
102
+ - **Model**: Qwen2.5-Coder-7B-Instruct
103
+ - **Quantization**: 4-bit (bitsandbytes)
104
+ - **Framework**: Gradio 4.0+
105
+ - **Backend**: Transformers + Accelerate
106
+ - **GPU**: 16GB VRAM recommended
 
 
107
 
108
+ ## 📝 License
 
 
 
 
 
 
109
 
110
+ MIT License - see LICENSE file for details.
111
 
112
+ ## 🙏 Acknowledgments
113
 
114
+ - [Qwen](https://github.com/QwenLM/Qwen) - Base model
115
+ - [HuggingFace](https://huggingface.co/) - Spaces hosting
116
+ - [Gradio](https://gradio.app/) - UI framework
117
 
118
  ---
119
 
120
+ <div align="center">
 
 
121
 
122
+ Made with ❤️ by Stack 2.9
123
 
124
+ </div>
stack-2.9-deploy/app.py CHANGED
@@ -1,276 +1,600 @@
1
- #!/usr/bin/env python3
2
  """
3
- Stack 2.9 vLLM Server Entrypoint
4
- Production-ready LLM inference server with health checks and metrics
 
 
 
5
  """
6
 
7
  import os
8
- import sys
9
  import json
10
- import logging
11
- from pathlib import Path
12
-
13
- from fastapi import FastAPI, Request, HTTPException
14
- from fastapi.responses import JSONResponse, StreamingResponse
15
- import uvicorn
16
- from vllm import LLM, SamplingParams
17
- from vllm.engine.arg_utils import AsyncEngineArgs
18
- from huggingface_hub import login
19
-
20
- # Configure logging
21
- logging.basicConfig(
22
- level=logging.INFO,
23
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
24
- handlers=[logging.StreamHandler(sys.stdout)]
25
- )
26
- logger = logging.getLogger("stack-2.9")
27
-
28
- # Initialize FastAPI app
29
- app = FastAPI(
30
- title="Stack 2.9 Inference API",
31
- description="High-performance LLM inference using vLLM",
32
- version="2.9.0"
33
- )
34
-
35
- # Global LLM instance
36
- llm_instance = None
37
-
38
- def get_model_id():
39
- """Get model ID from environment or config"""
40
- model_id = os.getenv("MODEL_ID")
41
- if not model_id:
42
- # Default to a quantized model
43
- model_id = "TheBloke/Llama-2-7B-Chat-AWQ"
44
- return model_id
45
-
46
- def get_hf_token():
47
- """Get Hugging Face token if provided"""
48
- token = os.getenv("HUGGING_FACE_TOKEN") or os.getenv("HF_TOKEN")
49
- return token
50
-
51
- async def initialize_model():
52
- """Initialize the vLLM model"""
53
- global llm_instance
54
-
55
- model_id = get_model_id()
56
- hf_token = get_hf_token()
57
-
58
- logger.info(f"Initializing model: {model_id}")
59
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  try:
61
- # Login to Hugging Face if token provided
62
- if hf_token:
63
- login(token=hf_token)
64
-
65
- # Engine arguments
66
- engine_args = AsyncEngineArgs(
67
- model=model_id,
68
- tokenizer=model_id,
69
- tensor_parallel_size=int(os.getenv("TENSOR_PARALLEL_SIZE", 1)),
70
- gpu_memory_utilization=float(os.getenv("GPU_MEMORY_UTILIZATION", 0.9)),
71
- max_model_len=int(os.getenv("MAX_MODEL_LEN", 4096)),
72
- max_num_seqs=int(os.getenv("MAX_NUM_SEQS", 64)),
73
- max_num_batched_tokens=int(os.getenv("MAX_NUM_BATCHED_TOKENS", 4096)),
74
- disable_log_stats=os.getenv("DISABLE_LOG_STATS", "false").lower() == "true",
75
- enforce_eager=os.getenv("ENFORCE_EAGER", "false").lower() == "true",
76
- quantization=os.getenv("QUANTIZATION", "awq"),
77
- download_dir=os.getenv("MODEL_CACHE_DIR", "/home/vllm/.cache/huggingface"),
78
- )
79
 
80
- # Override quantization if not using AWQ
81
- if os.getenv("QUANTIZATION", "").lower() not in ["awq", "gptq", "squeezellm"]:
82
- engine_args.quantization = None
 
 
 
 
 
 
83
 
84
- llm_instance = LLM.from_engine_args(engine_args)
85
- logger.info("Model initialized successfully")
86
- return True
87
 
 
 
 
 
 
 
88
  except Exception as e:
89
- logger.error(f"Failed to initialize model: {e}")
90
- return False
91
-
92
- @app.get("/health")
93
- async def health_check():
94
- """Health check endpoint"""
95
- if llm_instance is None:
96
- raise HTTPException(status_code=503, detail="Model not initialized")
97
- return {"status": "healthy", "model": get_model_id()}
98
-
99
- @app.get("/metrics")
100
- async def metrics():
101
- """Prometheus-style metrics endpoint"""
102
- if llm_instance is None:
103
- return JSONResponse(status_code=503, content={"error": "Model not initialized"})
104
-
105
- # Basic metrics - can be extended
106
- metrics_data = {
107
- "model": get_model_id(),
108
- "status": "ready",
109
- "gpu_utilization": "N/A" # Would need nvml for actual values
110
- }
111
- return JSONResponse(content=metrics_data)
112
-
113
- @app.post("/v1/completions")
114
- async def completions(request: Request):
115
- """OpenAI-compatible completions endpoint"""
116
- if llm_instance is None:
117
- raise HTTPException(status_code=503, detail="Model not initialized")
118
 
 
 
 
119
  try:
120
- body = await request.json()
121
- prompt = body.get("prompt", "")
122
- max_tokens = int(body.get("max_tokens", 100))
123
- temperature = float(body.get("temperature", 0.7))
124
- top_p = float(body.get("top_p", 1.0))
125
- stream = body.get("stream", False)
126
-
127
- if not prompt:
128
- raise HTTPException(status_code=400, detail="Prompt is required")
129
-
130
- sampling_params = SamplingParams(
131
- max_tokens=max_tokens,
132
- temperature=temperature,
133
- top_p=top_p
134
- )
135
 
136
- if stream:
137
- # Streaming response
138
- async def generate():
139
- try:
140
- outputs = llm_instance.generate(prompt, sampling_params, stream=True)
141
- async for output in outputs:
142
- chunk = output.outputs[0].text
143
- yield f"data: {json.dumps({'text': chunk, 'finished': False})}\n\n"
144
- yield f"data: {json.dumps({'text': '', 'finished': True})}\n\n"
145
- except Exception as e:
146
- logger.error(f"Streaming error: {e}")
147
- yield f"data: {json.dumps({'error': str(e)})}\n\n"
148
-
149
- return StreamingResponse(generate(), media_type="text/event-stream")
150
- else:
151
- # Non-streaming
152
- outputs = llm_instance.generate(prompt, sampling_params)
153
- generated_text = outputs[0].outputs[0].text
154
-
155
- return JSONResponse(content={
156
- "id": "cmpl-" + os.urandom(12).hex(),
157
- "object": "text_completion",
158
- "created": int(os.path.getmtime(__file__)),
159
- "model": get_model_id(),
160
- "choices": [{
161
- "text": generated_text,
162
- "index": 0,
163
- "logprobs": None,
164
- "finish_reason": "stop"
165
- }],
166
- "usage": {
167
- "prompt_tokens": len(prompt.split()),
168
- "completion_tokens": len(generated_text.split()),
169
- "total_tokens": len(prompt.split()) + len(generated_text.split())
170
- }
171
- })
172
 
 
 
 
 
 
173
  except Exception as e:
174
- logger.error(f"Completions error: {e}")
175
- raise HTTPException(status_code=500, detail=str(e))
176
 
177
- @app.post("/v1/chat/completions")
178
- async def chat_completions(request: Request):
179
- """OpenAI-compatible chat completions endpoint"""
180
- if llm_instance is None:
181
- raise HTTPException(status_code=503, detail="Model not initialized")
182
 
 
 
183
  try:
184
- body = await request.json()
185
- messages = body.get("messages", [])
186
-
187
- if not messages:
188
- raise HTTPException(status_code=400, detail="Messages are required")
189
-
190
- # Format messages based on model type
191
- # Simple implementation - extend for specific model chat templates
192
- prompt = ""
193
- for msg in messages:
194
- role = msg.get("role", "user")
195
- content = msg.get("content", "")
196
- if role == "system":
197
- prompt += f"System: {content}\n"
198
- elif role == "user":
199
- prompt += f"User: {content}\n"
200
- elif role == "assistant":
201
- prompt += f"Assistant: {content}\n"
202
- prompt += "Assistant:"
203
-
204
- max_tokens = int(body.get("max_tokens", 100))
205
- temperature = float(body.get("temperature", 0.7))
206
- top_p = float(body.get("top_p", 1.0))
207
- stream = body.get("stream", False)
208
-
209
- sampling_params = SamplingParams(
210
- max_tokens=max_tokens,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  temperature=temperature,
212
- top_p=top_p
 
 
213
  )
214
-
215
- if stream:
216
- async def generate():
217
- try:
218
- outputs = llm_instance.generate(prompt, sampling_params, stream=True)
219
- async for output in outputs:
220
- chunk = output.outputs[0].text
221
- yield f"data: {json.dumps({'choices': [{'delta': {'content': chunk}}] )}\n\n"
222
- yield f"data: {json.dumps({'choices': [{'delta': {}}] })}\n\n"
223
- except Exception as e:
224
- logger.error(f"Streaming error: {e}")
225
- yield f"data: {json.dumps({'error': str(e)})}\n\n"
226
-
227
- return StreamingResponse(generate(), media_type="text/event-stream")
228
- else:
229
- outputs = llm_instance.generate(prompt, sampling_params)
230
- generated_text = outputs[0].outputs[0].text
231
-
232
- return JSONResponse(content={
233
- "id": "chatcmpl-" + os.urandom(12).hex(),
234
- "object": "chat.completion",
235
- "created": int(os.path.getmtime(__file__)),
236
- "model": get_model_id(),
237
- "choices": [{
238
- "index": 0,
239
- "message": {
240
- "role": "assistant",
241
- "content": generated_text
242
- },
243
- "logprobs": None,
244
- "finish_reason": "stop"
245
- }],
246
- "usage": {
247
- "prompt_tokens": len(prompt.split()),
248
- "completion_tokens": len(generated_text.split()),
249
- "total_tokens": len(prompt.split()) + len(generated_text.split())
250
- }
251
- })
252
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  except Exception as e:
254
- logger.error(f"Chat completions error: {e}")
255
- raise HTTPException(status_code=500, detail=str(e))
256
-
257
- @app.on_event("startup")
258
- async def startup_event():
259
- """Initialize model on startup"""
260
- logger.info("Starting Stack 2.9 inference server...")
261
- success = await initialize_model()
262
- if not success:
263
- logger.error("Failed to initialize model on startup")
264
- sys.exit(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
 
266
  if __name__ == "__main__":
267
- host = os.getenv("HOST", "0.0.0.0")
268
- port = int(os.getenv("PORT", 8000))
269
-
270
- uvicorn.run(
271
- app,
272
- host=host,
273
- port=port,
274
- log_level="info",
275
- workers=1 # vLLM manages its own async
276
- )
 
 
 
 
1
  """
2
+ Stack 2.9 - Pattern-Based AI Coding Assistant
3
+ HuggingFace Spaces Demo
4
+
5
+ A Gradio interface for Stack 2.9 powered by Qwen2.5-Coder-7B
6
+ with tool integration and pattern memory.
7
  """
8
 
9
  import os
 
10
  import json
11
+ import time
12
+ from datetime import datetime
13
+ from typing import List, Dict, Optional, Generator
14
+ import gradio as gr
15
+
16
+ # ============================================================
17
+ # Pattern Memory System
18
+ # ============================================================
19
+
20
+ class SelfEvolutionMemory:
21
+ """Simple in-memory pattern memory system for demo purposes."""
22
+
23
+ def __init__(self):
24
+ self.conversations = []
25
+ self.learned_patterns = {}
26
+ self.code_snippets = []
27
+ self.preferences = {}
28
+ self.interaction_count = 0
29
+
30
+ def add_interaction(self, user_input: str, assistant_response: str, tools_used: List[str] = None):
31
+ """Record an interaction for learning."""
32
+ self.interaction_count += 1
33
+ interaction = {
34
+ "timestamp": datetime.now().isoformat(),
35
+ "user_input": user_input,
36
+ "assistant_response": assistant_response,
37
+ "tools_used": tools_used or [],
38
+ "interaction_id": self.interaction_count
39
+ }
40
+ self.conversations.append(interaction)
41
+
42
+ # Extract patterns from the interaction
43
+ self._learn_from_interaction(user_input, assistant_response, tools_used or [])
44
+
45
+ def _learn_from_interaction(self, user_input: str, response: str, tools: List[str]):
46
+ """Learn patterns from interactions."""
47
+ # Track tool usage patterns
48
+ for tool in tools:
49
+ if tool not in self.learned_patterns:
50
+ self.learned_patterns[tool] = {"count": 0, "contexts": []}
51
+ self.learned_patterns[tool]["count"] += 1
52
+ self.learned_patterns[tool]["contexts"].append(user_input[:100])
53
+
54
+ # Extract code snippets if present
55
+ if "```" in response:
56
+ self.code_snippets.append({
57
+ "timestamp": datetime.now().isoformat(),
58
+ "snippet": response
59
+ })
60
+
61
+ def get_context(self) -> str:
62
+ """Get accumulated context for the model."""
63
+ context_parts = [f"## Pattern Memory ({self.interaction_count} interactions)"]
64
+
65
+ if self.learned_patterns:
66
+ context_parts.append("\n### Tool Usage Patterns:")
67
+ for tool, data in sorted(self.learned_patterns.items(), key=lambda x: x[1]["count"], reverse=True)[:5]:
68
+ context_parts.append(f"- {tool}: used {data['count']} times")
69
+
70
+ if self.code_snippets:
71
+ context_parts.append(f"\n### Learned {len(self.code_snippets)} code patterns")
72
+
73
+ return "\n".join(context_parts)
74
+
75
+ def get_stats(self) -> Dict:
76
+ """Get memory statistics."""
77
+ return {
78
+ "total_interactions": self.interaction_count,
79
+ "tool_patterns": len(self.learned_patterns),
80
+ "code_snippets": len(self.code_snippets),
81
+ "recent_tools": [t for t in self.learned_patterns.keys()][:5]
82
+ }
83
+
84
+
85
+ # Global memory instance
86
+ memory = SelfEvolutionMemory()
87
+
88
+ # ============================================================
89
+ # Tool System
90
+ # ============================================================
91
+
92
+ class Tool:
93
+ """Base tool class."""
94
+
95
+ def __init__(self, name: str, description: str, func):
96
+ self.name = name
97
+ self.description = description
98
+ self.func = func
99
+
100
+ async def execute(self, *args, **kwargs):
101
+ return await self.func(*args, **kwargs)
102
+
103
+
104
+ # Tool implementations (simplified for demo)
105
+ async def tool_file_read(path: str) -> str:
106
+ """Read a file."""
107
  try:
108
+ with open(path, 'r') as f:
109
+ return f.read()[:5000] # Limit output
110
+ except FileNotFoundError:
111
+ return f"File not found: {path}"
112
+ except Exception as e:
113
+ return f"Error reading file: {str(e)}"
114
+
 
 
 
 
 
 
 
 
 
 
 
115
 
116
+ async def tool_file_write(path: str, content: str) -> str:
117
+ """Write to a file."""
118
+ try:
119
+ os.makedirs(os.path.dirname(path) if os.path.dirname(path) else ".", exist_ok=True)
120
+ with open(path, 'w') as f:
121
+ f.write(content)
122
+ return f"Successfully wrote to {path}"
123
+ except Exception as e:
124
+ return f"Error writing file: {str(e)}"
125
 
 
 
 
126
 
127
+ async def tool_git_status() -> str:
128
+ """Get git status."""
129
+ import subprocess
130
+ try:
131
+ result = subprocess.run(["git", "status", "--short"], capture_output=True, text=True, cwd=os.getcwd())
132
+ return result.stdout or "No changes"
133
  except Exception as e:
134
+ return f"Git error: {str(e)}"
135
+
136
+
137
+ async def tool_web_search(query: str) -> str:
138
+ """Search the web."""
139
+ from urllib.parse import quote
140
+ # Return a demo response since we can't make actual API calls
141
+ return f"🔍 Search results for '{query}':\n\n1. [Result 1] - Description here\n2. [Result 2] - Description here\n3. [Result 3] - Description here\n\n(Install brave-search to enable real search)"
142
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
+ async def tool_run_command(cmd: str) -> str:
145
+ """Run a shell command."""
146
+ import subprocess
147
  try:
148
+ result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=30)
149
+ return f"Output:\n{result.stdout}\n\nErrors:\n{result.stderr}" if result.stderr else result.stdout
150
+ except Exception as e:
151
+ return f"Command error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
+ async def tool_create_directory(path: str) -> str:
155
+ """Create a directory."""
156
+ try:
157
+ os.makedirs(path, exist_ok=True)
158
+ return f"Directory created: {path}"
159
  except Exception as e:
160
+ return f"Error: {str(e)}"
 
161
 
 
 
 
 
 
162
 
163
+ async def tool_list_directory(path: str = ".") -> str:
164
+ """List directory contents."""
165
  try:
166
+ items = os.listdir(path)
167
+ return "\n".join([f"📁 {i}/" if os.path.isdir(os.path.join(path, i)) else f"📄 {i}" for i in items[:50]])
168
+ except Exception as e:
169
+ return f"Error: {str(e)}"
170
+
171
+
172
+ # Register tools
173
+ TOOLS = {
174
+ "file_read": Tool("file_read", "Read a file from the filesystem", tool_file_read),
175
+ "file_write": Tool("file_write", "Write content to a file", tool_file_write),
176
+ "git_status": Tool("git_status", "Check git repository status", tool_git_status),
177
+ "web_search": Tool("web_search", "Search the web for information", tool_web_search),
178
+ "run_command": Tool("run_command", "Execute a shell command", tool_run_command),
179
+ "create_directory": Tool("create_directory", "Create a new directory", tool_create_directory),
180
+ "list_directory": Tool("list_directory", "List files in a directory", tool_list_directory),
181
+ }
182
+
183
+
184
+ def get_tool_descriptions() -> str:
185
+ """Get descriptions of all available tools."""
186
+ return "\n".join([f"- **{t.name}**: {t.description}" for t in TOOLS.values()])
187
+
188
+
189
+ # ============================================================
190
+ # Model Interface
191
+ # ============================================================
192
+
193
+ class StackModel:
194
+ """Stack 2.9 model interface using transformers."""
195
+
196
+ def __init__(self, model_id: str = "Qwen/Qwen2.5-Coder-7B-Instruct"):
197
+ self.model_id = model_id
198
+ self.model = None
199
+ self.tokenizer = None
200
+ self.pipeline = None
201
+
202
+ def load(self):
203
+ """Load the model with 4-bit quantization for HF Spaces."""
204
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
205
+ import torch
206
+
207
+ print(f"Loading {self.model_id}...")
208
+
209
+ # 4-bit quantization config for 16GB GPU
210
+ bnb_config = BitsAndBytesConfig(
211
+ load_in_4bit=True,
212
+ bnb_4bit_compute_dtype=torch.float16,
213
+ bnb_4bit_use_double_quant=True,
214
+ bnb_4bit_quant_type="nf4"
215
+ )
216
+
217
+ # Load tokenizer
218
+ self.tokenizer = AutoTokenizer.from_pretrained(
219
+ self.model_id,
220
+ trust_remote_code=True
221
+ )
222
+
223
+ # Load model with quantization
224
+ self.model = AutoModelForCausalLM.from_pretrained(
225
+ self.model_id,
226
+ quantization_config=bnb_config,
227
+ device_map="auto",
228
+ trust_remote_code=True
229
+ )
230
+
231
+ print("Model loaded successfully!")
232
+
233
+ def generate(self, prompt: str, max_tokens: int = 512, temperature: float = 0.7) -> str:
234
+ """Generate a response."""
235
+ if not self.tokenizer:
236
+ return "Model not loaded. Please wait for initialization."
237
+
238
+ # Build the prompt with system and tools
239
+ system_prompt = f"""You are Stack 2.9 - a pattern-based AI coding assistant.
240
+
241
+ ## Available Tools
242
+ {get_tool_descriptions()}
243
+
244
+ ## Your Capabilities
245
+ - Write, read, and execute code
246
+ - Use git for version control
247
+ - Search the web for information
248
+ - Create and manage files
249
+ - Execute shell commands
250
+
251
+ ## Self-Evolution
252
+ You learn from each interaction. After responding, summarize what tools you used.
253
+
254
+ {memory.get_context()}
255
+
256
+ ## Instructions
257
+ 1. Be helpful and concise
258
+ 2. Use tools when needed
259
+ 3. Learn from the conversation
260
+ 4. Provide code examples when relevant
261
+
262
+ Now respond to the user:"""
263
+
264
+ full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nAssistant:"
265
+
266
+ # Tokenize
267
+ inputs = self.tokenizer(full_prompt, return_tensors="pt").to(self.model.device)
268
+
269
+ # Generate
270
+ outputs = self.model.generate(
271
+ **inputs,
272
+ max_new_tokens=max_tokens,
273
  temperature=temperature,
274
+ do_sample=True,
275
+ top_p=0.9,
276
+ repetition_penalty=1.1
277
  )
278
+
279
+ # Decode
280
+ response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
281
+
282
+ # Extract just the response part
283
+ if "Assistant:" in response:
284
+ response = response.split("Assistant:")[-1].strip()
285
+
286
+ return response
287
+
288
+ def generate_streaming(self, prompt: str, max_tokens: int = 512):
289
+ """Generate with streaming (yields tokens)."""
290
+ if not self.tokenizer:
291
+ yield "Model not loaded. Please wait for initialization."
292
+ return
293
+
294
+ system_prompt = f"""You are Stack 2.9 - a pattern-based AI coding assistant.
295
+
296
+ ## Available Tools
297
+ {get_tool_descriptions()}
298
+
299
+ ## Self-Evolution Memory
300
+ {memory.get_context()}
301
+
302
+ Now respond to the user:"""
303
+
304
+ full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nAssistant:"
305
+
306
+ inputs = self.tokenizer(full_prompt, return_tensors="pt").to(self.model.device)
307
+
308
+ # Generate token by token
309
+ import torch  # used by the sampling loop below (no_grad, softmax, multinomial)
310
+ from typing import Iterator
311
+
312
+ generated_ids = inputs.input_ids
313
+
314
+ for _ in range(max_tokens):
315
+ with torch.no_grad():
316
+ outputs = self.model(generated_ids)
317
+ next_token_logits = outputs.logits[:, -1, :]
318
+
319
+ # Apply temperature
320
+ next_token_logits = next_token_logits / 0.7
321
+
322
+ # Sample
323
+ probs = torch.softmax(next_token_logits, dim=-1)
324
+ next_token = torch.multinomial(probs, num_samples=1)
325
+
326
+ generated_ids = torch.cat([generated_ids, next_token], dim=-1)
327
+
328
+ # Decode and yield
329
+ token_str = self.tokenizer.decode(next_token[0], skip_special_tokens=True)
330
+ yield token_str
331
+
332
+ # Stop on EOS
333
+ if next_token.item() == self.tokenizer.eos_token_id:
334
+ break
335
+
336
+
337
+ # Global model instance
338
+ model = None
339
+
340
+
341
+ def initialize_model():
342
+ """Initialize the model on startup."""
343
+ global model
344
+ try:
345
+ model = StackModel()
346
+ model.load()
347
+ return model
348
+ except Exception as e:
349
+ print(f"Failed to load model: {e}")
350
+ return None
351
+
352
+
353
+ # ============================================================
354
+ # Gradio Interface
355
+ # ============================================================
356
+
357
+ def format_tools_used(tools_used: List[str]) -> str:
358
+ """Format the tools used for display."""
359
+ if not tools_used:
360
+ return ""
361
+ return f"\n\n🔧 **Tools Used**: {', '.join(tools_used)}"
362
+
363
+
364
+ def chat_response(message: str, history: List[List[str]]) -> tuple:
365
+ """Process a chat message and return response."""
366
+ global model, memory
367
+
368
+ if model is None or model.model is None:
369
+ return "⏳ Model is loading. Please wait...", history + [[message, "⏳ Model is loading. Please wait..."]]
370
+
371
+ # Track tools used
372
+ tools_used = []
373
+
374
+ # Check if we need to use tools based on the message
375
+ message_lower = message.lower()
376
+
377
+ if any(kw in message_lower for kw in ['git status', 'git']):
378
+ tools_used.append("git_status")
379
+ if any(kw in message_lower for kw in ['search', 'find', 'look up']):
380
+ tools_used.append("web_search")
381
+ if any(kw in message_lower for kw in ['list files', 'directory', 'ls']):
382
+ tools_used.append("list_directory")
383
+ if any(kw in message_lower for kw in ['run ', 'execute', 'command']):
384
+ tools_used.append("run_command")
385
+
386
+ # Generate response
387
+ try:
388
+ response = model.generate(message, max_tokens=512)
389
  except Exception as e:
390
+ response = f"I encountered an error: {str(e)}"
391
+
392
+ # Add tools used to response
393
+ response += format_tools_used(tools_used)
394
+
395
+ # Record in memory
396
+ memory.add_interaction(message, response, tools_used)
397
+
398
+ return response
399
+
400
+
401
+ def chat_response_stream(message: str, history: List[List[str]]) -> Generator:
402
+ """Process a chat message with streaming."""
403
+ global model, memory
404
+
405
+ if model is None or model.model is None:
406
+ yield "⏳ Model is loading. Please wait..."
407
+ return
408
+
409
+ full_response = ""
410
+ tools_used = []
411
+
412
+ message_lower = message.lower()
413
+ if any(kw in message_lower for kw in ['git status', 'git']):
414
+ tools_used.append("git_status")
415
+ if any(kw in message_lower for kw in ['search', 'find']):
416
+ tools_used.append("web_search")
417
+ if any(kw in message_lower for kw in ['list', 'directory']):
418
+ tools_used.append("list_directory")
419
+
420
+ # Stream the response
421
+ for token in model.generate_streaming(message, max_tokens=256):
422
+ full_response += token
423
+ yield full_response
424
+
425
+ # Add tools used
426
+ if tools_used:
427
+ full_response += format_tools_used(tools_used)
428
+ yield full_response
429
+
430
+ # Record in memory
431
+ memory.add_interaction(message, full_response, tools_used)
432
+
433
+
434
+ # Example prompts for the UI
435
+ EXAMPLE_PROMPTS = [
436
+ "Hello! What can you help me with?",
437
+ "Check git status of this repository",
438
+ "Search for best practices for Python async programming",
439
+ "List the files in the current directory",
440
+ "Write a simple Python function to calculate fibonacci",
441
+ "How do I use Git to create a new branch?",
442
+ "What's your memory of our conversation?",
443
+ ]
444
+
445
+
446
+ def create_gradio_app():
447
+ """Create the Gradio interface."""
448
+
449
+ with gr.Blocks(
450
+ title="Stack 2.9 - Pattern-Based AI Coding Assistant",
451
+ theme=gr.themes.Soft(
452
+ primary_color="#6366f1",
453
+ secondary_color="#818cf8",
454
+ tertiary_color="#a5b4fc"
455
+ )
456
+ ) as app:
457
+
458
+ # Header
459
+ gr.Markdown("""
460
+ # 🚀 Stack 2.9 - Pattern-Based AI Coding Assistant
461
+
462
+ Powered by **Qwen2.5-Coder-7B** with 4-bit quantization
463
+
464
+ ---
465
+ """)
466
+
467
+ # Memory stats display
468
+ with gr.Row():
469
+ with gr.Column(scale=1):
470
+ stats_display = gr.Markdown(
471
+ "📊 **Memory Stats**\n\n- Interactions: 0\n- Tools learned: 0\n- Code patterns: 0",
472
+ elem_id="stats"
473
+ )
474
+ with gr.Column(scale=3):
475
+ pass # Spacer
476
+
477
+ # Chat interface
478
+ chatbot = gr.Chatbot(
479
+ height=500,
480
+ show_copy_button=True,
481
+ bubble_full_width=False
482
+ )
483
+
484
+ with gr.Row():
485
+ msg = gr.Textbox(
486
+ label="Message",
487
+ placeholder="Ask me anything...",
488
+ scale=4,
489
+ lines=3
490
+ )
491
+ submit_btn = gr.Button("Send", variant="primary", scale=1)
492
+
493
+ # Clear button
494
+ with gr.Row():
495
+ clear_btn = gr.Button("🗑️ Clear Chat")
496
+
497
+ # Example prompts
498
+ gr.Examples(
499
+ examples=EXAMPLE_PROMPTS,
500
+ inputs=msg,
501
+ label="Example Prompts"
502
+ )
503
+
504
+ # Memory visualization
505
+ with gr.Accordion("🧠 Self-Evolution Memory", open=False):
506
+ memory_display = gr.Textbox(
507
+ label="Memory Content",
508
+ lines=10,
509
+ interactive=False
510
+ )
511
+
512
+ # Functions
513
+ def respond(message, history):
514
+ response = chat_response(message, history)
515
+ history.append([message, response])
516
+ return "", history
517
+
518
+ def update_stats():
519
+ stats = memory.get_stats()
520
+ return f"""📊 **Memory Stats**
521
+
522
+ - **Interactions**: {stats['total_interactions']}
523
+ - **Tool Patterns**: {stats['tool_patterns']}
524
+ - **Code Snippets**: {stats['code_snippets']}
525
+
526
+ **Recent Tools**: {', '.join(stats['recent_tools']) if stats['recent_tools'] else 'None'}"""
527
+
528
+ def update_memory():
529
+ return memory.get_context()
530
+
531
+ # Button click handlers
532
+ submit_btn.click(respond, [msg, chatbot], [msg, chatbot], api_name="send")
533
+ msg.submit(respond, [msg, chatbot], [msg, chatbot], api_name="send")
534
+
535
+ def clear_chat():
536
+ return [], ""
537
+
538
+ clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])
539
+
540
+ # Update stats periodically
541
+ chatbot.change(update_stats, outputs=[stats_display])
542
+ chatbot.change(update_memory, outputs=[memory_display])
543
+
544
+ # Footer
545
+ gr.Markdown("""
546
+ ---
547
+ ### About Stack 2.9
548
+
549
+ Stack 2.9 is a pattern-based AI coding assistant that:
550
+ - 🔍 Uses **Qwen2.5-Coder-7B** (4-bit, ~4GB VRAM)
551
+ - 🛠️ Integrates **7 tools** (file, git, web, search, shell)
552
+ - 🧠 Remembers interactions and learns patterns
553
+ - ⚡ Provides fast, streaming responses
554
+
555
+ Deployed on **HuggingFace Spaces** with Gradio
556
+ """)
557
+
558
+ return app
559
+
560
+
561
+ # ============================================================
562
+ # Main Entry Point
563
+ # ============================================================
564
 
565
  if __name__ == "__main__":
566
+ import argparse
567
+
568
+ parser = argparse.ArgumentParser(description="Stack 2.9 - HuggingFace Spaces Demo")
569
+ parser.add_argument("--share", action="store_true", help="Create a public share link")
570
+ parser.add_argument("--port", type=int, default=7860, help="Port to run on")
571
+ parser.add_argument("--model", type=str, default="Qwen/Qwen2.5-Coder-7B-Instruct", help="Model ID")
572
+ args = parser.parse_args()
573
+
574
+ print("=" * 50)
575
+ print("🚀 Stack 2.9 - Pattern-Based AI Coding Assistant")
576
+ print("=" * 50)
577
+ print(f"Model: {args.model}")
578
+ print("Loading model...")
579
+
580
+ # Initialize model in a thread
581
+ import threading
582
+
583
+ def load_model_thread():
584
+ global model
585
+ model = initialize_model()
586
+
587
+ loader_thread = threading.Thread(target=load_model_thread)
588
+ loader_thread.start()
589
+
590
+ # Create and launch app
591
+ app = create_gradio_app()
592
+
593
+ print(f"\n🚀 Launching Gradio on port {args.port}...")
594
+ print("📝 Note: Model loads in background. Chat will work once loaded.\n")
595
+
596
+ app.launch(
597
+ server_name="0.0.0.0",
598
+ server_port=args.port,
599
+ share=args.share
600
+ )
stack-2.9-deploy/requirements.txt CHANGED
@@ -1,14 +1,24 @@
1
- # Stack 2.9 Inference Server Requirements
2
- # These are pre-baked into the Docker image
3
-
4
- # Core dependencies
5
- fastapi==0.111.0
6
- uvicorn[standard]==0.30.1
7
- pydantic==2.7.4
8
-
9
- # vLLM and PyTorch (specified in Dockerfile)
10
- # torch==2.3.1+cu121 --index-url https://download.pytorch.org/whl/cu121
11
- # vLLM==0.6.3
12
- # transformers==4.41.2
13
- # accelerate==0.30.1
14
- # huggingface-hub==0.23.0
 
 
 
 
 
 
 
 
 
 
 
1
+ # Stack 2.9 - HuggingFace Spaces Demo
2
+ # Requirements for Gradio interface with Qwen2.5-Coder-7B
3
+
4
+ # Core Gradio
5
+ gradio>=4.0.0
6
+
7
+ # Transformers and model loading
8
+ transformers>=4.36.0
9
+ torch>=2.0.0
10
+
11
+ # Model optimization
12
+ accelerate>=0.24.0
13
+ bitsandbytes>=0.41.0
14
+
15
+ # Additional utilities
16
+ huggingface-hub>=0.19.0
17
+ safetensors>=0.4.0
18
+
19
+ # Optional: For better web search
20
+ # brave-search>=0.1.0
21
+
22
+ # Optional: For web fetching
23
+ # beautifulsoup4>=4.12.0
24
+ # lxml>=4.9.0
{self_evolution → stack-2.9-training}/__init__.py RENAMED
File without changes
{self_evolution → stack-2.9-training}/apply.py RENAMED
File without changes
{self_evolution → stack-2.9-training}/learner.py RENAMED
File without changes
{self_evolution → stack-2.9-training}/memory.py RENAMED
File without changes
{self_evolution → stack-2.9-training}/observer.py RENAMED
File without changes
{stack_2_9_training → stack-2.9-training}/train_config_colab.yaml RENAMED
File without changes
{self_evolution → stack-2.9-training}/trainer.py RENAMED
File without changes
stack_cli/__init__.py DELETED
@@ -1,19 +0,0 @@
- """Stack 2.9 CLI and Agent Interface."""
-
- __version__ = "2.9.0"
- __author__ = "Stack Team"
-
- from .agent import create_agent, StackAgent
- from .tools import TOOLS, list_tools, get_tool, get_tool_schemas
- from .context import create_context_manager, ContextManager
-
- __all__ = [
- "create_agent",
- "StackAgent",
- "TOOLS",
- "list_tools",
- "get_tool",
- "get_tool_schemas",
- "create_context_manager",
- "ContextManager"
- ]
 
verify_repo.sh DELETED
@@ -1,141 +0,0 @@
1
- #!/usr/bin/env bash
2
- # Stack 2.9 - Repository Integrity Check
3
- # Verifies all components are present before pushing to GitHub
4
-
5
- set -e
6
-
7
- echo "🔍 Stack 2.9 Repository Check"
8
- echo "============================"
9
- echo ""
10
-
11
- ERRORS=0
12
- WARNINGS=0
13
-
14
- check_dir() {
15
- if [ -d "$1" ]; then
16
- echo "✅ $2"
17
- else
18
- echo "❌ Missing: $2 ($1)"
19
- ((ERRORS++))
20
- fi
21
- }
22
-
23
- check_file() {
24
- if [ -f "$1" ]; then
25
- echo "✅ $2"
26
- else
27
- echo "❌ Missing: $2 ($1)"
28
- ((ERRORS++))
29
- fi
30
- }
31
-
32
- check_file_optional() {
33
- if [ -f "$1" ]; then
34
- echo "✅ $2"
35
- else
36
- echo "⚠️ Optional: $2 ($1)"
37
- ((WARNINGS++))
38
- fi
39
- }
40
-
41
- echo "Checking top-level files..."
42
- check_file "README.md" "Main README"
43
- check_file "LICENSE" "Apache 2.0 License"
44
- check_file "CONTRIBUTING.md" "Contributing Guide"
45
- check_file "CODE_OF_CONDUCT.md" "Code of Conduct"
46
- check_file "Makefile" "Makefile"
47
- check_file "requirements.txt" "Python requirements"
48
- check_file "pyproject.toml" "Python package config"
49
- check_file ".gitignore" "Git ignore rules"
50
- check_file ".env.example" "Environment example"
51
- check_file "setup.sh" "Setup script"
52
- check_file "PUSH_GUIDE.md" "Push guide"
53
-
54
- echo ""
55
- echo "Checking component directories..."
56
- check_dir "training-data" "Training data"
57
- check_dir "stack-2.9-training" "Training pipeline"
58
- check_dir "stack-2.9-deploy" "Deployment configs"
59
- check_dir "stack-2.9-voice" "Voice integration"
60
- check_dir "stack-2.9-docs" "Documentation"
61
- check_dir "stack-2.9-eval" "Evaluation tools"
62
- check_dir ".github/workflows" "CI/CD workflows"
63
-
64
- echo ""
65
- echo "Checking critical training data files..."
66
- check_file "training-data/tools/catalog.json" "Tool schemas"
67
- check_file "training-data/synthetic/examples.jsonl" "Synthetic examples"
68
- check_file "training-data/manifest.json" "Dataset manifest"
69
- check_file_optional "training-data/code-pairs/pairs.json" "Code-comment pairs"
70
- check_file_optional "training-data/advanced-patterns/examples.jsonl" "Advanced patterns"
71
-
72
- echo ""
73
- echo "Checking training pipeline files..."
74
- check_file "stack-2.9-training/requirements.txt" "Training requirements"
75
- check_file "stack-2.9-training/prepare_dataset.py" "Dataset preparation"
76
- check_file "stack-2.9-training/train_lora.py" "LoRA training script"
77
- check_file "stack-2.9-training/merge_lora.py" "Merge script"
78
- check_file "stack-2.9-training/quantize_awq.py" "AWQ quantization"
79
- check_file "stack-2.9-training/run_training.sh" "Training runner"
80
-
81
- echo ""
82
- echo "Checking deployment files..."
83
- check_file "stack-2.9-deploy/vllm_server.py" "vLLM server"
84
- check_file "stack-2.9-deploy/docker-compose.yml" "Docker Compose"
85
- check_file "stack-2.9-deploy/Dockerfile" "Docker image"
86
- check_file "stack-2.9-deploy/local_deploy.sh" "Local deployment script"
87
- check_file_optional "stack-2.9-deploy/runpod_deploy.sh" "RunPod script"
88
- check_file_optional "stack-2.9-deploy/vastai_deploy.sh" "Vast.ai script"
89
-
90
- echo ""
91
- echo "Checking voice integration..."
92
- check_file "stack-2.9-voice/voice_server.py" "Voice API server"
93
- check_file "stack-2.9-voice/voice_client.py" "Voice client"
94
- check_file "stack-2.9-voice/stack_voice_integration.py" "Integration layer"
95
- check_file "stack-2.9-voice/docker-compose.yml" "Voice Docker Compose"
96
- check_file "stack-2.9-voice/README.md" "Voice docs"
97
-
98
- echo ""
99
- echo "Checking documentation..."
100
- check_file "stack-2.9-docs/README.md" "Main docs"
101
- check_file "stack-2.9-docs/API.md" "API reference"
102
- check_file "stack-2.9-docs/OPENROUTER_SUBMISSION.md" "OpenRouter app"
103
- check_file "stack-2.9-docs/TRAINING_DATA.md" "Training guide"
104
- check_file_optional "stack-2.9-docs/VOICE_INTEGRATION.md" "Voice integration"
105
- check_file_optional "stack-2.9-docs/BENCHMARKS.md" "Benchmarks"
106
-
107
- echo ""
108
- echo "Checking evaluation..."
109
- check_file "stack-2.9-eval/eval_pipeline.py" "Evaluation pipeline"
110
- check_file "stack-2.9-eval/tool_use_eval.py" "Tool use eval"
111
- check_file "stack-2.9-eval/code_quality_eval.py" "Code quality eval"
112
- check_file "stack-2.9-eval/conversation_eval.py" "Conversation eval"
113
- check_file "stack-2.9-eval/results_aggregator.py" "Results aggregator"
114
- check_dir "stack-2.9-eval/benchmarks" "Benchmark datasets"
115
- check_dir "stack-2.9-eval/results" "Results directory"
116
-
117
- echo ""
118
- echo "============================"
119
- echo "📊 Repository Check Summary"
120
- echo "============================"
121
- if [ $ERRORS -eq 0 ]; then
122
- echo "✅ All critical files present!"
123
- if [ $WARNINGS -gt 0 ]; then
124
- echo "⚠️ $WARNINGS optional files missing (not critical)"
125
- fi
126
- echo ""
127
- echo "Ready to push to GitHub!"
128
- echo ""
129
- echo "Next:"
130
- echo " 1. Create repo: https://github.com/organizations/my-ai-stack/repositories/new"
131
- echo " 2. Run: git init && git add . && git commit -m 'Initial commit'"
132
- echo " 3. Add remote: git remote add origin https://github.com/my-ai-stack/stack-2.9.git"
133
- echo " 4. Push: git push -u origin main"
134
- exit 0
135
- else
136
- echo "❌ $ERRORS critical errors found!"
137
- echo "⚠️ $WARNINGS warnings"
138
- echo ""
139
- echo "Please fix missing files before pushing."
140
- exit 1
141
- fi