import json
import multiprocessing
import os
import sys
import traceback

from flask import Flask, Response, request, stream_with_context
from waitress import serve
|
|
| |
def log(msg):
    """Write a tagged status line to stdout, flushed immediately.

    The explicit flush matters: this process may run frozen or behind a
    pipe, where stdout is block-buffered and messages would otherwise lag.
    """
    sys.stdout.write(f"[ENGINE] {msg}\n")
    sys.stdout.flush()
|
|
| |
# --- Path & app setup -------------------------------------------------
# A PyInstaller build sets sys.frozen; there __file__ no longer points at
# a real source tree, so the executable's directory is the anchor instead.
_IS_FROZEN = bool(getattr(sys, 'frozen', False))
BASE_DIR = (
    os.path.dirname(sys.executable)
    if _IS_FROZEN
    else os.path.dirname(os.path.abspath(__file__))
)

# The GGUF model file is expected to sit right next to the program.
MODEL_PATH = os.path.join(BASE_DIR, "model.gguf")
log(f"Base Directory: {BASE_DIR}")

app = Flask(__name__)
|
|
| |
| |
# Silence llama.cpp's native logger: swap llama_log_set for a no-op before
# any model object is created, so C-side chatter never reaches stdout.
# Best-effort — if llama_cpp is absent or its API shifted, just warn.
try:
    import llama_cpp

    def _noop_log_set(callback, user_data):
        """Stand-in for llama_cpp.llama_log_set that discards everything."""
        return

    llama_cpp.llama_log_set = _noop_log_set
    log("Successfully patched Llama logging.")
except Exception as e:
    log(f"Patch warning: {e}")
|
|
| |
# --- Model initialisation ---------------------------------------------
# Loaded once at import time; endpoints treat a falsy `llm` as "not ready".
llm = None
try:
    from llama_cpp import Llama

    # Use half the machine's cores (at least one) so the host OS and any
    # UI process stay responsive while inference runs.
    safe_threads = max(1, multiprocessing.cpu_count() // 2)

    if os.path.exists(MODEL_PATH):
        log("Loading Model...")
        llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=4096,
            n_threads=safe_threads,
            n_gpu_layers=0,      # CPU-only inference
            verbose=False,
            chat_format="gemma",
            use_mmap=False,
        )
        log("Model Loaded Successfully!")
    else:
        log("CRITICAL ERROR: model.gguf is missing!")
except Exception as e:
    log(f"CRITICAL EXCEPTION during load: {e}")
    log(traceback.format_exc())
|
|
@app.route('/', methods=['GET'])
def health_check():
    """Liveness probe: 200/"OK" when the model loaded, 500 otherwise."""
    return ("OK", 200) if llm else ("MODEL_FAILED", 500)
|
|
@app.route('/chat_stream', methods=['POST'])
def chat_stream():
    """Stream a chat completion as Server-Sent Events.

    Expects a JSON body like {"message": "..."} and emits
    `data: {"chunk": "..."}` SSE events as tokens arrive.
    """
    if not llm:
        return Response(
            "data: " + json.dumps({'chunk': "Error: Brain failed initialization."}) + "\n\n",
            mimetype='text/event-stream',
        )

    # Tolerate a missing or non-JSON body instead of letting Flask raise
    # (request.json errors out on wrong Content-Type); fall back to "".
    data = request.get_json(silent=True) or {}
    messages = [{"role": "user", "content": data.get('message', '')}]

    def generate():
        # Runs outside the request context, which is why the response is
        # wrapped in stream_with_context below.
        try:
            stream = llm.create_chat_completion(messages=messages, max_tokens=1000, stream=True)
            for chunk in stream:
                delta = chunk['choices'][0]['delta']
                if 'content' in delta:
                    yield f"data: {json.dumps({'chunk': delta['content']})}\n\n"
        except Exception as e:
            log(f"Gen Error: {e}")
            yield f"data: {json.dumps({'chunk': ' Error.'})}\n\n"

    # BUG FIX: stream_with_context was used here but never imported from
    # flask, so every request died with a NameError (HTTP 500). The name
    # is now imported in the top-of-file flask import.
    return Response(stream_with_context(generate()), mimetype='text/event-stream')
|
|
def _run_server():
    """Blocking entry point: serve the Flask app via Waitress on port 5000."""
    log("Starting Waitress Server on Port 5000...")
    try:
        serve(app, host='127.0.0.1', port=5000, threads=6)
    except Exception as e:
        log(f"Server Crash: {e}")


if __name__ == '__main__':
    _run_server()