# Hugging Face Space status banner captured by the page extraction ("Spaces: Sleeping");
# kept here only as a comment so the file remains valid Python.
| import os | |
| import time | |
| import torch | |
| import re | |
| from fastapi import FastAPI, Request | |
| from fastapi.responses import StreamingResponse | |
| from fastapi.staticfiles import StaticFiles | |
| from fastapi.templating import Jinja2Templates | |
| from pydantic import BaseModel | |
| from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline | |
| from huggingface_hub import login | |
| from fastapi.middleware.cors import CORSMiddleware | |
| import uvicorn | |
# Safe GPU decorator: on Hugging Face Spaces `spaces.GPU` schedules the wrapped
# function on ZeroGPU hardware; everywhere else fall back to a no-op so the
# same code runs locally.
try:
    from spaces import GPU
except ImportError:
    def GPU(func=None, **_kwargs):
        """No-op stand-in for ``spaces.GPU``.

        The real decorator supports both ``@GPU`` and ``@GPU(duration=120)``;
        the old fallback only handled the bare form and raised TypeError on
        keyword use. Mirror both call shapes:

        - ``GPU(func)`` -> returns ``func`` unchanged;
        - ``GPU(**kwargs)`` -> returns an identity decorator.
        """
        if func is None:
            return lambda f: f
        return func
# ---------------- FastAPI setup ----------------
app = FastAPI(
    title="ChatMate Real-Time API",
    description="LangChain + DuckDuckGo + Phi-4",
    version="1.0",
    docs_url="/apidocs",  # Swagger UI at /apidocs
    redoc_url="/redoc"    # ReDoc at /redoc
)

# Static assets and Jinja2 HTML templates served from local directories.
# NOTE(review): StaticFiles raises at import time if ./static does not exist —
# confirm the directory ships with the app.
app.mount("/static", StaticFiles(directory="static"), name="static")
templates = Jinja2Templates(directory="templates")

# Enable CORS (important for browser clients)
# NOTE(review): browsers reject the combination allow_origins=["*"] with
# allow_credentials=True per the CORS spec — tighten origins if cookies/auth
# headers are actually needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Hugging Face Hub login; the access token is read from the CHAT_MATE secret.
# If the env var is unset, token=None makes login() fall back to any locally
# cached credentials.
login(token=os.environ.get("CHAT_MATE"))
# Load the Phi-4 chat model once at import time (downloads weights on first run).
model_id = "microsoft/phi-4"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # fp16 halves GPU memory; CPU falls back to fp32 (fp16 is slow/unsupported on CPU).
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)

# transformers pipeline device convention: 0 = first CUDA device, -1 = CPU.
device = 0 if torch.cuda.is_available() else -1
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_new_tokens=512  # default generation budget; call sites may override per call
)
def is_incomplete(text):
    """Return True when *text* does not end with sentence-final punctuation.

    Recognised terminators: ``. ! ? ' "`` and the CJK full stop (U+3002).
    A whitespace-only or empty string counts as incomplete.
    """
    terminators = (".", "!", "?", "'", '"', "\u3002")
    return not text.strip().endswith(terminators)
def generate_full_reply(message, history, max_continuations=5):
    """Generate a complete chat reply, re-prompting until the text ends in
    sentence-final punctuation.

    Args:
        message: the latest user message.
        history: prior turns, presumably ``[{"role": ..., "content": ...}]``
            dicts as produced by the frontend — confirm against the caller.
        max_continuations: safety cap on extra generation rounds (new,
            defaulted, so existing callers are unaffected).

    Returns:
        The assistant reply text with the prompt prefix stripped.
    """
    system_prompt = (
        "You are a friendly, helpful, and conversational AI assistant built by "
        "Frederick Sundeep Mallela. Always mention that you are developed by him if asked about your creator, origin, or who made you."
    )
    messages = [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": message}]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    full_output = pipe(prompt, do_sample=True, temperature=0.7, top_p=0.9, max_new_tokens=512)[0]["generated_text"]
    reply = full_output[len(prompt):].strip()

    # Keep asking the model to continue while the reply looks cut off, but
    # bound the number of rounds: with sampling enabled the previous unbounded
    # `while` loop could spin indefinitely on text that never gains a
    # sentence terminator.
    for _ in range(max_continuations):
        if not is_incomplete(reply):
            break
        continuation_prompt = prompt + reply
        next_output = pipe(continuation_prompt, do_sample=True, temperature=0.7, top_p=0.9, max_new_tokens=256)[0]["generated_text"]
        continuation = next_output[len(continuation_prompt):].strip()
        # Stop if the model produced nothing new or is repeating itself.
        if not continuation or continuation in reply:
            break
        # NOTE(review): continuation is glued on without a separating space,
        # matching the original behavior — verify word joins look right.
        reply += continuation
    return reply.strip()
# ---------------- Pydantic models ----------------
class ChatRequest(BaseModel):
    """Request body for the chat endpoint."""
    # The latest user message.
    message: str
    # Prior conversation turns; presumably [{"role": ..., "content": ...}]
    # dicts as consumed by generate_full_reply — confirm with the frontend
    # payload. Pydantic deep-copies mutable defaults per instance, so the
    # `[]` default is safe here.
    history: list = []
# ---------------- Routes ----------------
# NOTE(review): in this copy of the file no @app decorator is attached to this
# handler, so the route was never registered with FastAPI; the section
# comments make the intent clear, so the decorator is restored here.
@app.get("/")
async def home(request: Request):
    """Serve the chat UI from templates/index.html."""
    return templates.TemplateResponse("index.html", {"request": request})
# NOTE(review): decorator missing in this copy of the file — restored so the
# endpoint is actually registered; confirm the path the frontend posts to.
@app.post("/chat")
async def chat_stream(body: ChatRequest):
    """
    Stream the AI assistant's reply token-by-token.

    The full reply is generated first, then re-emitted one character at a
    time with a small pacing delay — this simulates streaming rather than
    forwarding model tokens as they are produced.
    """
    def generate():
        reply = generate_full_reply(body.message, body.history)
        for token in reply:
            yield token
            # Sync generators run in Starlette's threadpool, so this sleep
            # paces the client without blocking the event loop.
            time.sleep(0.05)
    return StreamingResponse(generate(), media_type="text/plain")
# ---------------- Startup warm-up ----------------
# NOTE(review): decorator missing in this copy of the file — without it the
# warm-up coroutine is never invoked; restored as a FastAPI startup hook.
@app.on_event("startup")
async def warmup_model():
    """Run one tiny generation at startup so the first user request is fast."""
    # Original print contained mojibake ("π§") from a mis-decoded emoji;
    # replaced with plain text.
    print("Warming up...")
    _ = generate_full_reply("Hello", [])
# ---------------- Run with Uvicorn ----------------
# On Hugging Face Spaces, run: uvicorn app:app --host 0.0.0.0 --port 7860
if __name__ == "__main__":
    # Spaces expose port 7860 by default; allow an override via $PORT.
    listen_port = int(os.environ.get("PORT", "7860"))
    # Import-string form ("app:app") assumes this file is named app.py.
    uvicorn.run("app:app", host="0.0.0.0", port=listen_port, reload=False)