import os
import time
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
from fastapi import FastAPI
from pydantic import BaseModel
from threading import Thread
import uvicorn
# ----------------------------
# Model
# ----------------------------
model_path = hf_hub_download(
    repo_id="bartowski/Qwen2.5-Coder-0.5B-Instruct-abliterated-GGUF",
    filename="Qwen2.5-Coder-0.5B-Instruct-abliterated-f16.gguf"
)
llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_threads=os.cpu_count(),
    n_batch=512,
    n_gpu_layers=0,  # CPU-only inference
    verbose=False
)
llm("warmup", max_tokens=1)  # one-token warm-up so the first real request is not slowed by initialization
# ----------------------------
# System Prompt
# ----------------------------
SYSTEM_PROMPT = """
You are an advanced AI assistant.
Answer questions clearly and concisely.
You can handle multi-turn conversations and provide detailed responses if needed.
"""
| # ---------------------------- | |
| # Chat Function | |
| # ---------------------------- | |
| def generate_response(message, history): | |
| yield "🤖 Thinking..." | |
| time.sleep(0.5) | |
| prompt = f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n" | |
| for h in history: | |
| if isinstance(h, dict) and "role" in h and "message" in h: | |
| role = h["role"] | |
| msg = h["message"] | |
| if role == "user": | |
| prompt += f"<|im_start|>user\n{msg}<|im_end|>\n" | |
| else: | |
| prompt += f"<|im_start|>assistant\n{msg}<|im_end|>\n" | |
| elif isinstance(h, (list, tuple)) and len(h) >= 2: | |
| u, a = h[0], h[1] | |
| prompt += f"<|im_start|>user\n{u}<|im_end|>\n<|im_start|>assistant\n{a}<|im_end|>\n" | |
| prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n" | |
| output = "" | |
| for token in llm(prompt, max_tokens=2048, temperature=0.2, top_p=0.9, repeat_penalty=1.1, stream=True): | |
| output += token["choices"][0]["text"] | |
| yield output | |
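# Illustrative check of the ChatML rendering (values made up for the example):
#
#   build_prompt("Hi", [("Hello", "Hi there!")])
#
# yields, in order: the system block, one user/assistant pair from history,
# the new user message, and an open assistant block for the model to complete:
#
#   <|im_start|>system
#   ...SYSTEM_PROMPT...<|im_end|>
#   <|im_start|>user
#   Hello<|im_end|>
#   <|im_start|>assistant
#   Hi there!<|im_end|>
#   <|im_start|>user
#   Hi<|im_end|>
#   <|im_start|>assistant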
| # ---------------------------- | |
| # FastAPI API | |
| # ---------------------------- | |
| app = FastAPI() | |
| class ChatRequest(BaseModel): | |
| message: str | |
| history: list = [] | |
| # FastAPI endpoint ko simple banayein taaki Web Search client connect kar sake | |
| # Path ko match karne ke liye change kiya | |
| def chat_endpoint(request: ChatRequest): | |
| output = "" | |
| # Prompt logic (Same as yours) | |
| prompt = f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n" | |
| # ... baki prompt logic ... | |
| # Streaming off rakhein API response ke liye taaki ek baar mein pura text mile | |
| res = llm(prompt, max_tokens=1024, temperature=0.3) | |
| return res["choices"][0]["text"] | |
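# Example client call, assuming the "/chat" route above (the path is an
# assumption, see the note on the decorator) and the server on port 8000:
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8000/chat",
#       json={"message": "Explain list comprehensions", "history": []},
#   )
#   print(resp.json())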
| # ---------------------------- | |
| # Gradio UI | |
| # ---------------------------- | |
| with gr.Blocks(theme=gr.Theme.from_hub("JackismyShephard/ultimate-rvc-theme")) as demo: | |
| gr.HTML("<h2 style='text-align:center; color:white;'>Code Explainer AI</h2>") | |
| chatbot = gr.ChatInterface( | |
| fn=generate_response, | |
| chatbot=gr.Chatbot(height=600), | |
| textbox=gr.Textbox(placeholder="Paste code or ask for explanation...", container=False) | |
| ) | |
| # Rounded corners for main container | |
| demo.css = """ | |
| .gradio-container { | |
| border-radius: 25px !important; | |
| max-width: 600px !important; | |
| margin: auto !important; | |
| overflow: hidden; | |
| } | |
| .message.user { border-radius: 18px 18px 4px 18px !important; } | |
| .message.bot { border-radius: 18px 18px 18px 4px !important; } | |
| """ | |
| # ---------------------------- | |
| # Run Gradio + FastAPI together | |
| # ---------------------------- | |
| def run_gradio(): | |
| demo.launch(server_name="0.0.0.0", server_port=7860) | |
| thread = Thread(target=run_gradio) | |
| thread.start() | |
| if __name__ == "__main__": | |
| uvicorn.run(app, host="0.0.0.0", port=8000) |