from flask import Flask, request, Response
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

app = Flask(__name__)

# HTML for the chat page, served at '/'. Only the page text survived here, so
# the markup below is a minimal sketch of the interface: a message log plus an
# input box that streams the model's reply from the /chat endpoint via
# Server-Sent Events.
HTML_CONTENT = '''
<!DOCTYPE html>
<html>
<head>
    <title>AI Chat Interface</title>
</head>
<body>
    <h1>AI Chat Interface</h1>
    <div id="chat">
        <p>AI: Hello there! How can I help you today?</p>
    </div>
    <input id="message" type="text" placeholder="Type a message...">
    <button onclick="sendMessage()">Send</button>
    <script>
        function sendMessage() {
            const input = document.getElementById('message');
            const chat = document.getElementById('chat');
            const text = input.value;
            if (!text) return;
            input.value = '';

            const userLine = document.createElement('p');
            userLine.textContent = 'You: ' + text;
            chat.appendChild(userLine);

            const aiLine = document.createElement('p');
            aiLine.textContent = 'AI: ';
            chat.appendChild(aiLine);

            // Stream the reply token by token over Server-Sent Events.
            const source = new EventSource('/chat?message=' + encodeURIComponent(text));
            source.onmessage = (event) => {
                if (event.data === '[DONE]') {
                    source.close();
                } else {
                    aiLine.textContent += event.data;
                }
            };
        }
    </script>
</body>
</html>
'''


def download_model():
    model_name = "lmstudio-community/gemma-2-2b-it-GGUF"
    model_file = "gemma-2-2b-it-Q6_K.gguf"
    return hf_hub_download(model_name, filename=model_file)


def initialize_model(model_path):
    return Llama(
        model_path=model_path,
        n_ctx=4096,
        n_threads=4,
        n_gpu_layers=-1  # Offload all layers to the GPU if one is available
    )


model_path = download_model()
llm = initialize_model(model_path)

system_prompt = (
    "You are a helpful AI assistant. Your mission is to help people and "
    "respond in a clear, friendly manner."
)

chat_history = [{"role": "system", "content": system_prompt}]


@app.route('/')
def index():
    return HTML_CONTENT


@app.route('/chat')
def chat():
    user_message = request.args.get('message', '')
    chat_history.append({"role": "user", "content": user_message})

    # Capitalize the role names so the rendered prompt lines match the
    # "Assistant:" continuation and the "User:" stop sequence below.
    full_prompt = "\n".join(
        f"{msg['role'].capitalize()}: {msg['content']}" for msg in chat_history
    )
    full_prompt += "\nAssistant:"

    def generate():
        # The trim below reassigns chat_history, so it must be declared
        # global here; otherwise Python treats it as a local and the
        # append above it raises UnboundLocalError.
        global chat_history
        ai_response = ""
        for token in llm(full_prompt, max_tokens=1000, stop=["User:"], stream=True):
            chunk = token['choices'][0]['text']
            if chunk:
                ai_response += chunk
                yield f"data: {chunk}\n\n"
        chat_history.append({"role": "assistant", "content": ai_response.strip()})
        if len(chat_history) > 10:
            # Limit history to the last 10 messages, always keeping the
            # system prompt at the front.
            chat_history = [chat_history[0]] + chat_history[-9:]
        yield "data: [DONE]\n\n"

    return Response(generate(), content_type='text/event-stream')


if __name__ == '__main__':
    app.run(debug=True, port=5000)
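
# Dependencies (package names as published on PyPI):
#   pip install flask huggingface_hub llama-cpp-python
#
# One way to smoke-test the streaming endpoint once the server is running
# locally; curl -N disables output buffering so SSE chunks appear as they
# arrive (the message value here is just an example):
#   curl -N "http://localhost:5000/chat?message=Hello"
#
# Each event arrives as a "data: <token>" line, and the stream ends with
# "data: [DONE]".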