"""
ChatBIA FastAPI Server
24/7 accounting AI server
"""
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from typing import Optional, List, AsyncGenerator
import os
import json
import asyncio
from llama_cpp import Llama
app = FastAPI(
title="ChatBIA API",
    description="Accounting-specialist AI server",
version="1.0.0"
)
# CORS settings (accessible from the Android app and web clients)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
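# Note: browsers reject credentialed requests when Access-Control-Allow-Origin is the
# wildcard "*", so if cookie/credential support is ever required, replace
# allow_origins=["*"] with an explicit list of origins.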
# Model paths
MODEL_DIR = "models"
GENERAL_MODEL_PATH = os.path.join(MODEL_DIR, "Qwen2.5-3B-Instruct-Q4_K_M.gguf")
BSL_MODEL_PATH = os.path.join(MODEL_DIR, "ChatBIA-3B-v0.1-Q4_K_M.gguf")
# Global model variables
general_model = None
bsl_model = None
class ChatRequest(BaseModel):
message: str
mode: str = "bsl" # "general" or "bsl"
max_tokens: int = 1024
temperature: float = 0.7
class ChatResponse(BaseModel):
response: str
mode: str
tokens: int
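# Illustrative payload shapes for the models above (the values are examples only):
#   ChatRequest:  {"message": "...", "mode": "bsl", "max_tokens": 1024, "temperature": 0.7}
#   ChatResponse: {"response": "...", "mode": "bsl", "tokens": 42}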
@app.on_event("startup")
async def load_models():
"""์„œ๋ฒ„ ์‹œ์ž‘ ์‹œ ๋ชจ๋ธ ๋กœ๋“œ"""
global general_model, bsl_model
os.makedirs(MODEL_DIR, exist_ok=True)
    # Load the general-purpose model
    if os.path.exists(GENERAL_MODEL_PATH):
        print(f"🔄 Loading general-mode model: {GENERAL_MODEL_PATH}")
try:
general_model = Llama(
model_path=GENERAL_MODEL_PATH,
n_ctx=2048,
n_threads=4,
                n_gpu_layers=0,  # Oracle Cloud instance is CPU-only
verbose=False
)
print("โœ… ์ผ๋ฐ˜ ๋ชจ๋“œ ๋ชจ๋ธ ๋กœ๋“œ ์™„๋ฃŒ")
except Exception as e:
print(f"โŒ ์ผ๋ฐ˜ ๋ชจ๋“œ ๋ชจ๋ธ ๋กœ๋“œ ์‹คํŒจ: {e}")
    # Load the BSL model
    if os.path.exists(BSL_MODEL_PATH):
        print(f"🔄 Loading BSL-mode model: {BSL_MODEL_PATH}")
try:
bsl_model = Llama(
model_path=BSL_MODEL_PATH,
n_ctx=2048,
n_threads=4,
n_gpu_layers=0,
verbose=False
)
print("โœ… BSL ๋ชจ๋“œ ๋ชจ๋ธ ๋กœ๋“œ ์™„๋ฃŒ")
except Exception as e:
print(f"โŒ BSL ๋ชจ๋“œ ๋ชจ๋ธ ๋กœ๋“œ ์‹คํŒจ: {e}")
def build_prompt(message: str, mode: str) -> str:
"""ํ”„๋กฌํ”„ํŠธ ๋นŒ๋“œ"""
if mode == "bsl":
return f"""<|im_start|>system
You are a professional accounting AI assistant. Respond naturally in Korean.
Important: Only generate BSL DSL code when the user explicitly requests calculations (e.g., "계산해줘", "코드 작성해줘", "BSL로 작성해줘"). For general questions or greetings, respond conversationally without code.<|im_end|>
<|im_start|>user
{message}<|im_end|>
<|im_start|>assistant
"""
else:
return f"""<|im_start|>system
You are a helpful AI assistant. Respond naturally in Korean.<|im_end|>
<|im_start|>user
{message}<|im_end|>
<|im_start|>assistant
"""
@app.get("/")
async def root():
"""ํ—ฌ์Šค ์ฒดํฌ"""
return {
"status": "online",
"service": "ChatBIA API",
"version": "1.0.0",
"models": {
"general": general_model is not None,
"bsl": bsl_model is not None
}
}
@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
"""์ฑ„ํŒ… ์—”๋“œํฌ์ธํŠธ"""
# ๋ชจ๋ธ ์„ ํƒ
if request.mode == "general":
model = general_model
model_name = "General"
else:
model = bsl_model
model_name = "BSL"
    # Error out if the requested model is not loaded
if model is None:
raise HTTPException(
status_code=503,
detail=f"{model_name} ๋ชจ๋ธ์ด ๋กœ๋“œ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค."
)
try:
        # Build the prompt
        prompt = build_prompt(request.message, request.mode)
        # Run inference
response = model(
prompt,
max_tokens=request.max_tokens,
temperature=request.temperature,
top_p=0.9,
top_k=40,
repeat_penalty=1.1,
stop=["<|im_end|>", "###", "\n\n\n"]
)
        text = response["choices"][0]["text"].strip()
        # Use the token count reported by llama.cpp; a whitespace split only approximates it
        tokens = response.get("usage", {}).get("completion_tokens", 0)
return ChatResponse(
response=text,
mode=request.mode,
tokens=tokens
)
except Exception as e:
raise HTTPException(
status_code=500,
detail=f"AI ๋ชจ๋ธ ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜: {str(e)}"
)
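# Example client call for /chat (a sketch; assumes the third-party `httpx` package and
# the host/port configured in the __main__ block below):
#
#   import httpx
#   r = httpx.post(
#       "http://localhost:8000/chat",
#       json={"message": "What is double-entry bookkeeping?", "mode": "general"},
#       timeout=120.0,
#   )
#   print(r.json()["response"])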
@app.get("/models")
async def get_models():
"""์‚ฌ์šฉ ๊ฐ€๋Šฅํ•œ ๋ชจ๋ธ ๋ชฉ๋ก"""
return {
"general": {
"loaded": general_model is not None,
"path": GENERAL_MODEL_PATH if os.path.exists(GENERAL_MODEL_PATH) else None
},
"bsl": {
"loaded": bsl_model is not None,
"path": BSL_MODEL_PATH if os.path.exists(BSL_MODEL_PATH) else None
}
}
@app.post("/chat/stream")
async def chat_stream(request: ChatRequest):
"""์ŠคํŠธ๋ฆฌ๋ฐ ์ฑ„ํŒ… ์—”๋“œํฌ์ธํŠธ (์•ˆ๋“œ๋กœ์ด๋“œ/ํƒ€์ž„์•„์›ƒ ๋ฐฉ์ง€)"""
# ๋ชจ๋ธ ์„ ํƒ
if request.mode == "general":
model = general_model
model_name = "General"
else:
model = bsl_model
model_name = "BSL"
    # Error out if the requested model is not loaded
if model is None:
raise HTTPException(
status_code=503,
detail=f"{model_name} ๋ชจ๋ธ์ด ๋กœ๋“œ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค."
)
async def generate_stream() -> AsyncGenerator[str, None]:
"""ํ† ํฐ ๋‹จ์œ„ ์ŠคํŠธ๋ฆฌ๋ฐ ์ œ๋„ˆ๋ ˆ์ดํ„ฐ"""
import asyncio
try:
            # Build the prompt
            prompt = build_prompt(request.message, request.mode)
            # Streaming inference
stream = model(
prompt,
max_tokens=request.max_tokens,
temperature=request.temperature,
top_p=0.9,
top_k=40,
repeat_penalty=1.1,
stop=["<|im_end|>", "###", "\n\n\n"],
                stream=True  # enable streaming
)
token_count = 0
for chunk in stream:
if "choices" in chunk and len(chunk["choices"]) > 0:
delta = chunk["choices"][0].get("text", "")
if delta:
token_count += 1
                        # SSE format: "data: {json}\n\n"
data = {
"token": delta,
"done": False,
"token_count": token_count
}
yield f"data: {json.dumps(data, ensure_ascii=False)}\n\n"
                        # Yield to the event loop so each chunk is flushed immediately
                        await asyncio.sleep(0)
            # Completion signal
final_data = {
"token": "",
"done": True,
"token_count": token_count,
"mode": request.mode
}
yield f"data: {json.dumps(final_data, ensure_ascii=False)}\n\n"
except Exception as e:
error_data = {
"error": str(e),
"done": True
}
yield f"data: {json.dumps(error_data, ensure_ascii=False)}\n\n"
return StreamingResponse(
generate_stream(),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Accel-Buffering": "no" # Nginx ๋ฒ„ํผ๋ง ๋น„ํ™œ์„ฑํ™”
}
)
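# Example of consuming the SSE stream (a sketch; assumes the third-party `httpx` package):
#
#   import httpx, json
#   payload = {"message": "...", "mode": "bsl"}
#   with httpx.stream("POST", "http://localhost:8000/chat/stream",
#                     json=payload, timeout=None) as r:
#       for line in r.iter_lines():
#           if not line.startswith("data: "):
#               continue
#           event = json.loads(line[len("data: "):])
#           if event.get("token"):
#               print(event["token"], end="", flush=True)
#           if event.get("done"):
#               break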
if __name__ == "__main__":
import uvicorn
uvicorn.run(
"main:app",
host="0.0.0.0",
port=8000,
        reload=False  # keep auto-reload disabled in production
)