Spaces:

DataChem
/

custom-api

Paused

App Files Files Community

DataChem committed on Dec 29, 2024

Commit

74b564f

verified ·

1 Parent(s): 5102dda

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -34

app.py CHANGED Viewed

@@ -13,10 +13,6 @@ model = AutoModelForCausalLM.from_pretrained(model_name)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
-@app.get("/")
-def read_root():
-    return {"Hello": "World"}
 @app.post("/predict")
 async def predict(request: Request):
     data = await request.json()
@@ -29,45 +25,40 @@ async def predict(request: Request):
     input_ids = inputs.input_ids
     attention_mask = inputs.attention_mask
-    # Generator function to stream tokens
     def token_generator():
         temperature = 0.7
         top_p = 0.9
-        for _ in range(100):  # Limit the number of generated tokens
-            # Get the model outputs
-            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
-            next_token_logits = outputs.logits[:, -1, :]  # Logits for the last token
-            # Apply temperature scaling
-            next_token_logits = next_token_logits / temperature
-            # Convert logits to probabilities
-            next_token_probs = F.softmax(next_token_logits, dim=-1)
-            # Apply top-p nucleus sampling
-            sorted_probs, sorted_indices = torch.sort(next_token_probs, descending=True)
-            cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
-            sorted_probs = sorted_probs[cumulative_probs <= top_p]
-            sorted_indices = sorted_indices[:len(sorted_probs)]
-            # Sample from the filtered distribution
-            if len(sorted_probs) > 0:
-                next_token_id = sorted_indices[torch.multinomial(sorted_probs, 1)]
-            else:
-                # Fallback to greedy selection if no tokens meet top-p
-                next_token_id = torch.argmax(next_token_probs)
-            # Append the generated token to the input
-            input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1)], dim=-1)
-            # Decode the token and yield it
-            token = tokenizer.decode(next_token_id.squeeze(), skip_special_tokens=True)
-            yield token + " "
-            # Stop if the model generates the end-of-sequence token
-            if next_token_id.squeeze().item() == tokenizer.eos_token_id:
-                break
-    # Return the generator as a streaming response
     return StreamingResponse(token_generator(), media_type="text/plain")

 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
 @app.post("/predict")
 async def predict(request: Request):
     data = await request.json()
     input_ids = inputs.input_ids
     attention_mask = inputs.attention_mask
     def token_generator():
         temperature = 0.7
         top_p = 0.9
+        for _ in range(100):  # Limit to 100 tokens
+            with torch.no_grad():  # Disable gradient computation for inference
+                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+                next_token_logits = outputs.logits[:, -1, :]
+                # Apply temperature and softmax
+                next_token_logits = next_token_logits / temperature
+                next_token_probs = F.softmax(next_token_logits, dim=-1)
+                # Apply nucleus sampling (top-p)
+                sorted_probs, sorted_indices = torch.sort(next_token_probs, descending=True)
+                cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
+                sorted_probs = sorted_probs[cumulative_probs <= top_p]
+                sorted_indices = sorted_indices[:len(sorted_probs)]
+                # Sample next token
+                if len(sorted_probs) > 0:
+                    next_token_id = sorted_indices[torch.multinomial(sorted_probs, 1)]
+                else:
+                    next_token_id = torch.argmax(next_token_probs)
+                # Append the new token to the input sequence
+                input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1)], dim=-1)
+                # Decode and yield the token
+                token = tokenizer.decode(next_token_id.squeeze(), skip_special_tokens=True)
+                yield token + " "
+                # Stop if the end-of-sequence token is generated
+                if next_token_id.squeeze().item() == tokenizer.eos_token_id:
+                    break
     return StreamingResponse(token_generator(), media_type="text/plain")