# qwen_api/app_quantized.py
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import os
import uvicorn

app = FastAPI()

# --- Model Configuration ---
# Make sure you have downloaded a GGUF model from Hugging Face first, e.g.:
# https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF
MODEL_PATH = "./Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf"  # change to match your local file
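
# Optional: a minimal sketch for fetching the model with huggingface_hub
# (an assumption, not part of the original app; the exact GGUF filename in the
# repo may differ, so check the repo's file listing first):
#
# from huggingface_hub import hf_hub_download
# MODEL_PATH = hf_hub_download(
#     repo_id="Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF",
#     filename="qwen2.5-coder-0.5b-instruct-q4_k_m.gguf",  # hypothetical filename
# )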

llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,    # context window size in tokens
    n_threads=4,   # adjust to the number of available CPU cores
    n_batch=512,   # batch size
)
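
# Note: the /chat endpoint below calls llm(...) as a raw text completion with
# manual stop strings. A sketch of the chat-style alternative, which applies
# the model's built-in chat template (create_chat_completion is provided by
# llama-cpp-python):
#
# result = llm.create_chat_completion(
#     messages=[{"role": "user", "content": "Hello"}],
#     max_tokens=256,
# )
# text = result["choices"][0]["message"]["content"]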

# --- Request Schema ---
class ChatRequest(BaseModel):
    prompt: str
    max_new_tokens: int = 256

# --- Chat Endpoint ---
@app.post("/chat")
def chat(req: ChatRequest):
    output = llm(
        req.prompt,
        max_tokens=req.max_new_tokens,
        stop=["</s>", "User:", "Assistant:"],
        echo=False,
    )
    response = output["choices"][0]["text"].strip()
    return {"response": response}

# --- Root Endpoint ---
@app.get("/")
def root():
    return {"message": "Qwen GGUF FastAPI running 🚀"}

if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    # The import string must match this module's filename (app_quantized.py);
    # "app:app" would fail to import.
    uvicorn.run("app_quantized:app", host="0.0.0.0", port=port)
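
# To start the server, either run this file directly or invoke uvicorn with
# the module name matching this file (app_quantized.py):
#   python app_quantized.py
#   uvicorn app_quantized:app --host 0.0.0.0 --port 7860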