Update app.py
app.py
CHANGED
@@ -21,57 +21,66 @@ async def predict(request: Request):
     if not prompt:
         return {"error": "Prompt is required"}
 
-    #
+    # Initial tokenization on the prompt
     inputs = tokenizer(prompt, return_tensors="pt").to(device)
-    input_ids = inputs.input_ids
-    attention_mask = inputs.attention_mask
+    input_ids = inputs.input_ids  # Shape: [batch_size, seq_len], often [1, seq_len]
+    attention_mask = inputs.attention_mask  # Same shape as input_ids
 
     def token_generator():
-
-        nonlocal input_ids
+        nonlocal input_ids, attention_mask
 
-        #
+        # Generation hyperparameters
         temperature = 0.7
         top_p = 0.9
         max_new_tokens = 30
 
         for _ in range(max_new_tokens):
             with torch.no_grad():
-                # Forward pass
+                # Forward pass: compute logits for the last token
                 outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                 next_token_logits = outputs.logits[:, -1, :]
 
-                #
+                # Apply temperature
                 next_token_logits = next_token_logits / temperature
 
-                # Convert logits
+                # Convert logits -> probabilities
                 next_token_probs = F.softmax(next_token_logits, dim=-1)
 
-                # Apply
+                # Apply top-p (nucleus) sampling
                 sorted_probs, sorted_indices = torch.sort(next_token_probs, descending=True)
                 cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
-
-                # Filter out tokens above the top_p threshold
                 valid_indices = cumulative_probs <= top_p
                 filtered_probs = sorted_probs[valid_indices]
                 filtered_indices = sorted_indices[valid_indices]
 
                 if len(filtered_probs) == 0:
-                    # Fallback to greedy if
-                    next_token_id = torch.argmax(next_token_probs)
+                    # Fallback to greedy if nothing meets top_p
+                    next_token_id = torch.argmax(next_token_probs)
                 else:
-                    # Sample from the filtered distribution
-                    sampled_id = torch.multinomial(filtered_probs,
-                    next_token_id = filtered_indices[sampled_id]
-
-            #
+                    # Sample a token from the filtered distribution
+                    sampled_id = torch.multinomial(filtered_probs, 1)
+                    next_token_id = filtered_indices[sampled_id]
+
+                # At this point, next_token_id might be shape [] (scalar) or [1].
+                # We need [batch_size, 1], so if it's just a scalar, unsqueeze(0).
+                if next_token_id.dim() == 0:
+                    next_token_id = next_token_id.unsqueeze(0)  # shape [1]
+                next_token_id = next_token_id.unsqueeze(-1)  # shape [1, 1]
+
+            # Append the new token to input_ids
+            # input_ids: [1, seq_len], next_token_id: [1, 1] => final shape [1, seq_len+1]
             input_ids = torch.cat([input_ids, next_token_id], dim=-1)
 
-            #
+            # Also update the attention mask so the model attends to the new token
+            # shape: [1, seq_len+1]
+            new_mask = attention_mask.new_ones((attention_mask.size(0), 1))
+            attention_mask = torch.cat([attention_mask, new_mask], dim=-1)
+
+            # Decode and yield the token for streaming
             token = tokenizer.decode(next_token_id.squeeze(), skip_special_tokens=True)
             yield token + " "
 
-            # Stop if EOS token
+            # Stop if we hit the EOS token
             if tokenizer.eos_token_id is not None:
                 if next_token_id.squeeze().item() == tokenizer.eos_token_id:
                     break
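The `cumulative_probs <= top_p` mask can come back empty when the single most likely token already carries more probability mass than top_p, which is exactly the case the greedy fallback in the new code covers. A toy check with made-up numbers (not taken from the app) shows that path being hit:

import torch

top_p = 0.9
probs = torch.tensor([0.95, 0.03, 0.02])   # already sorted, descending
cumulative = torch.cumsum(probs, dim=-1)   # tensor([0.9500, 0.9800, 1.0000])
mask = cumulative <= top_p                 # tensor([False, False, False])
print(mask.any().item())                   # False -> the argmax fallback is used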
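The unsqueeze logic added before the concatenation exists because torch.argmax returns a 0-d tensor while torch.multinomial returns a 1-d one, and torch.cat needs both operands shaped [batch_size, seq]. A small standalone check with dummy token ids (values are illustrative only):

import torch

input_ids = torch.tensor([[101, 2023, 2003]])    # [1, seq_len]
next_token_id = torch.tensor(42)                 # 0-d tensor, as argmax returns
if next_token_id.dim() == 0:
    next_token_id = next_token_id.unsqueeze(0)   # [1]
next_token_id = next_token_id.unsqueeze(-1)      # [1, 1]
print(torch.cat([input_ids, next_token_id], dim=-1).shape)   # torch.Size([1, 4])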
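The hunk only shows the body of token_generator; how predict returns it lies outside this diff. A minimal sketch of one way to stream such a generator, assuming the surrounding app is FastAPI with a JSON request body; the route path, body shape, demo_generator stand-in, and StreamingResponse usage are assumptions for illustration, not part of this commit:

from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse

app = FastAPI()

@app.post("/predict")
async def predict(request: Request):
    body = await request.json()              # assumed body shape: {"prompt": "..."}
    prompt = body.get("prompt", "")
    if not prompt:
        return {"error": "Prompt is required"}

    def demo_generator():
        # Stand-in for the real token_generator(): yields space-separated chunks
        for word in ("streamed", "tokens", "go", "here"):
            yield word + " "

    return StreamingResponse(demo_generator(), media_type="text/plain")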