Update Gradio app with multiple files
models.py CHANGED
@@ -77,11 +77,15 @@ def stream_generate_response(prompt: str, history: list) -> Generator[str, None,
     input_ids = inputs.input_ids.to(model.device)
     attention_mask = inputs.attention_mask.to(model.device)

+    # Store initial input length
+    initial_length = input_ids.shape[-1]
+
     # Generate with streaming using yield-based approach
     accumulated_text = ""
+    generated_tokens = 0

     # Generate tokens incrementally
-
+    while generated_tokens < MAX_NEW_TOKENS:
         with torch.no_grad():
             outputs = model(
                 input_ids=input_ids,
@@ -120,9 +124,9 @@ def stream_generate_response(prompt: str, history: list) -> Generator[str, None,
         input_ids = torch.cat([input_ids, next_token], dim=-1)
         attention_mask = torch.cat([attention_mask, torch.ones_like(next_token)], dim=-1)

-        #
-
-        break
+        # Increment generated tokens counter
+        generated_tokens += 1

     # Final yield to ensure complete text
-
+    if accumulated_text:
+        yield accumulated_text.strip()