huynhkimthien committed
Commit b7dc424 · verified · 1 Parent(s): 04a1d7e

Update app.py

Files changed (1):
  1. app.py +7 -6
app.py CHANGED
@@ -7,11 +7,11 @@ import os
 app = FastAPI()
 model_name = "Qwen/Qwen3-4B-Instruct-2507"
 
-# Load tokenizer and model (CPU)
+# Load tokenizer and model (CPU for Spaces Free)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
-    device_map={"": "cpu"},  # Spaces Free has no GPU
+    device_map={"": "cpu"},
     torch_dtype=torch.float32
 )
 
@@ -27,11 +27,12 @@ def read_root():
 @app.post("/chat")
 async def chat(request: ChatRequest):
     conversation.append({"role": "user", "content": request.message})
-    # messages = [{"role": "user", "content": request.message}]
+
+    # Apply the chat template
     text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
 
-    response_text = generate_full_response(model_inputs)
+    response_text = generate_full_response(model_inputs)
     conversation.append({"role": "assistant", "content": response_text})
 
     return {"response": response_text}
@@ -49,10 +50,10 @@ def generate_full_response(model_inputs, max_tokens_per_call=128, max_total_toke
         total_tokens += len(output_ids)
 
         # Stop if the chunk is too short or contains an end token
-        if "</s>" in chunk or len(chunk.strip()) < 5:
+        if len(chunk.strip()) < 5 or "</s>" in chunk:
             break
 
         # Prepare for the next generation pass
         model_inputs = tokenizer([response_text], return_tensors="pt").to(model.device)
 
-    return response_text.strip()
+    return response_text.strip()
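Only fragments of generate_full_response are visible in this diff. As orientation, here is a minimal sketch of how the whole helper might read after this commit, assuming the surrounding while loop, the model.generate() call, the prompt-trimming step, and the max_total_tokens default, none of which the commit itself shows; it also relies on the module-level model and tokenizer loaded above:

def generate_full_response(model_inputs, max_tokens_per_call=128, max_total_tokens=1024):
    # Sketch only: the loop structure, the generate() arguments, and the
    # max_total_tokens default are assumptions; only the lines touched by
    # the diff are confirmed by the commit.
    response_text = ""
    total_tokens = 0
    while total_tokens < max_total_tokens:
        # Generate at most max_tokens_per_call new tokens per pass.
        generated = model.generate(**model_inputs, max_new_tokens=max_tokens_per_call)
        # Drop the prompt tokens so only the new completion remains.
        output_ids = generated[0][model_inputs["input_ids"].shape[1]:]
        # Keep special tokens so the "</s>" check below has a chance to fire.
        chunk = tokenizer.decode(output_ids, skip_special_tokens=False)
        response_text += chunk
        total_tokens += len(output_ids)

        # Stop if the chunk is too short or contains an end token
        if len(chunk.strip()) < 5 or "</s>" in chunk:
            break

        # Prepare for the next generation pass
        model_inputs = tokenizer([response_text], return_tensors="pt").to(model.device)

    return response_text.strip()

Two caveats worth noting: Qwen chat models end their turns with <|im_end|> rather than "</s>", so in practice the loop will usually stop on the length check; and re-tokenizing response_text alone for the next pass discards the chat template, so the continuation prompt differs from the original one.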
 
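For completeness, a hedged example of exercising the updated /chat endpoint; the host and port are assumptions (Spaces typically serve on 7860), and ChatRequest is assumed to be a Pydantic model with a single message field:

import requests

# Hypothetical URL; adjust host/port to wherever the app is served.
resp = requests.post(
    "http://localhost:7860/chat",
    json={"message": "Hello!"},
)
print(resp.json()["response"])

Because conversation is a module-level list, repeated calls accumulate history across all clients, so each request continues the same shared conversation.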