from flask import Flask, request, Response
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
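# Minimal Flask app that serves a chat page and streams completions from a
# local GGUF model (via llama-cpp-python) as Server-Sent Events.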
app = Flask(__name__)
# Minimal HTML placeholder; a real front end would open an EventSource on /chat
HTML_CONTENT = '''
<!DOCTYPE html>
<html>
<head><title>AI Chat Interface</title></head>
<body>
  <h1>AI Chat Interface</h1>
  <p>Hello there! How can I help you today?</p>
</body>
</html>
'''
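# Fetch the GGUF weights from the Hugging Face Hub (cached locally after the first download)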
def download_model():
    model_name = "lmstudio-community/gemma-2-2b-it-GGUF"
    model_file = "gemma-2-2b-it-Q6_K.gguf"
    return hf_hub_download(model_name, filename=model_file)
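# n_ctx sets the context window in tokens; n_threads bounds CPU inference threads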
def initialize_model(model_path):
    return Llama(
        model_path=model_path,
        n_ctx=4096,
        n_threads=4,
        n_gpu_layers=-1  # Offload all layers to the GPU when one is available
    )
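# Download and load the model once at startup; the server blocks until it is ready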
model_path = download_model()
llm = initialize_model(model_path)
system_prompt = (
    "You are a helpful AI assistant. Your mission is to assist people and to respond clearly and in a friendly tone."
)
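# Shared in-memory history for all clients; a multi-user deployment would key this per session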
chat_history = [{"role": "system", "content": system_prompt}]
@app.route('/')
def index():
    return HTML_CONTENT
@app.route('/chat')
def chat():
    global chat_history
    user_message = request.args.get('message', '')
    chat_history.append({"role": "user", "content": user_message})
    # Capitalized role names keep the transcript consistent with the
    # "Assistant:" cue below and the "User:" stop sequence
    full_prompt = "\n".join(f"{msg['role'].capitalize()}: {msg['content']}" for msg in chat_history)
    full_prompt += "\nAssistant:"
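    # Stream tokens back as SSE frames ("data: ...\n\n"), ending with a [DONE] sentinel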
    def generate():
        global chat_history  # reassigned below when the history is trimmed
        ai_response = ""
        for token in llm(full_prompt, max_tokens=1000, stop=["User:"], stream=True):
            chunk = token['choices'][0]['text']
            if chunk:
                ai_response += chunk
                yield f"data: {chunk}\n\n"
        chat_history.append({"role": "assistant", "content": ai_response.strip()})
        if len(chat_history) > 10:
            # Trim to the most recent exchanges, but always keep the system prompt
            chat_history = chat_history[:1] + chat_history[-9:]
        yield "data: [DONE]\n\n"
    return Response(generate(), content_type='text/event-stream')
if __name__ == '__main__':
    app.run(debug=True, port=5000)
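
# Example usage (a sketch, assuming the server is running locally on port 5000):
#   curl -N "http://localhost:5000/chat?message=Hello"
# -N disables curl's buffering, so each "data: ..." SSE frame prints as it arrives.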