Update app.py
app.py CHANGED
@@ -7,11 +7,11 @@ import os
 app = FastAPI()
 model_name = "Qwen/Qwen3-4B-Instruct-2507"
 
-# Load tokenizer and model (CPU)
+# Load tokenizer and model (CPU for the free Spaces tier)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
-    device_map={"": "cpu"},
+    device_map={"": "cpu"},
     torch_dtype=torch.float32
 )
 
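For context, a minimal runnable sketch of the loading step this hunk touches. The imports are not shown in the diff and are assumptions based on the names used; passing device_map to from_pretrained also requires the accelerate package to be installed.

import torch
from fastapi import FastAPI
from transformers import AutoModelForCausalLM, AutoTokenizer

app = FastAPI()
model_name = "Qwen/Qwen3-4B-Instruct-2507"

# Pin every module to the CPU and use full float32: free Spaces hardware
# has no GPU, and half-precision kernels are poorly supported on CPU.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={"": "cpu"},
    torch_dtype=torch.float32,
)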
@@ -27,11 +27,12 @@ def read_root():
 @app.post("/chat")
 async def chat(request: ChatRequest):
     conversation.append({"role": "user", "content": request.message})
-
+
+    # Apply the chat template
     text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
 
-    response_text = generate_full_response(model_inputs)
+    response_text = generate_full_response(model_inputs)
     conversation.append({"role": "assistant", "content": response_text})
 
     return {"response": response_text}
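ChatRequest and conversation are referenced in this hunk but defined elsewhere in the file. A self-contained sketch of the endpoint, with assumed shapes for both, could look like this:

from pydantic import BaseModel

class ChatRequest(BaseModel):
    message: str

# Module-level history, inferred from the diff: a list of
# {"role": ..., "content": ...} dicts shared across requests.
conversation = []

@app.post("/chat")
async def chat(request: ChatRequest):
    conversation.append({"role": "user", "content": request.message})

    # Apply the chat template: flatten the whole history into one prompt
    # string ending with the assistant header, so the model replies next.
    text = tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    response_text = generate_full_response(model_inputs)
    conversation.append({"role": "assistant", "content": response_text})
    return {"response": response_text}

Because conversation lives at module scope, every client shares one history; per-session state would need something like a session id, which is outside this diff.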
@@ -49,10 +50,10 @@ def generate_full_response(model_inputs, max_tokens_per_call=128, max_total_toke
         total_tokens += len(output_ids)
 
         # Stop if the chunk is too short or contains an end token
-        if
+        if len(chunk.strip()) < 5 or "</s>" in chunk:
             break
 
         # Prepare for the next generation pass
         model_inputs = tokenizer([response_text], return_tensors="pt").to(model.device)
 
-    return response_text.strip()
+    return response_text.strip()
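Only the fragments in this hunk are confirmed; the loop around them is not shown, and the max_total_tokens default is truncated in the hunk header. One plausible shape of the whole function, with 1024 as a placeholder default, is sketched below.

def generate_full_response(model_inputs, max_tokens_per_call=128, max_total_tokens=1024):
    response_text = ""
    total_tokens = 0

    while total_tokens < max_total_tokens:
        # Generate one chunk and slice off the prompt tokens.
        generated = model.generate(**model_inputs, max_new_tokens=max_tokens_per_call)
        output_ids = generated[0][model_inputs["input_ids"].shape[-1]:]
        # Keep special tokens so the "</s>" stop check below can see them.
        chunk = tokenizer.decode(output_ids, skip_special_tokens=False)

        response_text += chunk
        total_tokens += len(output_ids)

        # Stop if the chunk is too short or contains an end token
        if len(chunk.strip()) < 5 or "</s>" in chunk:
            break

        # Prepare for the next generation pass: re-encode the text
        # generated so far and continue from there.
        model_inputs = tokenizer([response_text], return_tensors="pt").to(model.device)

    return response_text.strip()

Note that the continuation step re-tokenizes only response_text, as the hunk shows, so later chunks are conditioned on the partial reply alone rather than on the original chat prompt.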