Spaces:

rkihacker
/

R2OAI

Paused

App Files Files Community

rkihacker committed on Oct 21

Commit

2767573

verified ·

1 Parent(s): dafbe9c

Update main.py

Browse files

Files changed (1) hide show

main.py +37 -18

main.py CHANGED Viewed

@@ -17,7 +17,7 @@ if not REPLICATE_API_TOKEN:
     raise ValueError("REPLICATE_API_TOKEN environment variable not set.")
 # FastAPI Init
-app = FastAPI(title="Replicate to OpenAI Compatibility Layer", version="9.0.0 (Definitive Streaming Fix)")
 # --- Pydantic Models ---
 class ModelCard(BaseModel):
@@ -81,10 +81,14 @@ def prepare_replicate_input(request: OpenAIChatCompletionRequest) -> Dict[str, A
     return payload
 async def stream_replicate_sse(replicate_model_id: str, input_payload: dict):
-    """Handles the full streaming lifecycle with correct whitespace preservation."""
     url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json"}
     async with httpx.AsyncClient(timeout=60.0) as client:
         try:
             response = await client.post(url, headers=headers, json={"input": input_payload, "stream": True})
@@ -113,11 +117,8 @@ async def stream_replicate_sse(replicate_model_id: str, input_payload: dict):
                     if line.startswith("event:"):
                         current_event = line[len("event:"):].strip()
                     elif line.startswith("data:"):
-                        # FIXED: Preserve all whitespace including leading/trailing spaces
-                        raw_data = line[5:]  # Remove "data:" prefix
-                        # Remove only the optional single space after data: if present
-                        # This is per SSE spec and preserves actual content spaces
                         if raw_data.startswith(" "):
                             data_content = raw_data[1:]  # Remove the first space only
                         else:
@@ -129,13 +130,13 @@ async def stream_replicate_sse(replicate_model_id: str, input_payload: dict):
                             content_token = ""
                             try:
-                                # Handle JSON-encoded strings properly (including spaces)
                                 content_token = json.loads(data_content)
                             except (json.JSONDecodeError, TypeError):
-                                # Handle plain text tokens (preserve as-is)
                                 content_token = data_content
-                            # Create chunk with exact format you specified
                             chunk = {
                                 "choices": [{
                                     "delta": {"content": content_token},
@@ -145,15 +146,18 @@ async def stream_replicate_sse(replicate_model_id: str, input_payload: dict):
                                     "native_finish_reason": None
                                 }],
                                 "created": int(time.time()),
-                                "id": f"gen-{int(time.time())}-{prediction_id[-12:]}",  # Format like your example
                                 "model": replicate_model_id,
                                 "object": "chat.completion.chunk",
                                 "provider": "Anthropic" if "anthropic" in replicate_model_id else "Replicate"
                             }
-                            # FIXED: Yield only the JSON data, let EventSourceResponse handle the SSE formatting
                             yield json.dumps(chunk)
                         elif current_event == "done":
                             # Send usage chunk before done
                             usage_chunk = {
                                 "choices": [{
@@ -170,7 +174,7 @@ async def stream_replicate_sse(replicate_model_id: str, input_payload: dict):
                                 "provider": "Anthropic" if "anthropic" in replicate_model_id else "Replicate",
                                 "usage": {
                                     "cache_discount": 0,
-                                    "completion_tokens": 0,
                                     "completion_tokens_details": {"image_tokens": 0, "reasoning_tokens": 0},
                                     "cost": 0,
                                     "cost_details": {
@@ -178,11 +182,12 @@ async def stream_replicate_sse(replicate_model_id: str, input_payload: dict):
                                         "upstream_inference_cost": None,
                                         "upstream_inference_prompt_cost": 0
                                     },
-                                    "input_tokens": 0,
                                     "is_byok": False,
-                                    "prompt_tokens": 0,
                                     "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0},
-                                    "total_tokens": 0
                                 }
                             }
                             yield json.dumps(usage_chunk)
@@ -226,19 +231,33 @@ async def create_chat_completion(request: OpenAIChatCompletionRequest):
     if request.stream:
         return EventSourceResponse(stream_replicate_sse(SUPPORTED_MODELS[request.model], replicate_input), media_type="text/event-stream")
-    # Non-streaming fallback
     url = f"https://api.replicate.com/v1/models/{SUPPORTED_MODELS[request.model]}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json", "Prefer": "wait=120"}
     async with httpx.AsyncClient() as client:
         try:
             resp = await client.post(url, headers=headers, json={"input": replicate_input}, timeout=130.0)
             resp.raise_for_status()
             pred = resp.json()
             output = "".join(pred.get("output", []))
             return {
                 "id": pred.get("id"), "object": "chat.completion", "created": int(time.time()), "model": request.model,
                 "choices": [{"index": 0, "message": {"role": "assistant", "content": output}, "finish_reason": "stop"}],
-                "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
             }
         except httpx.HTTPStatusError as e:
             raise HTTPException(status_code=e.response.status_code, detail=f"Error from Replicate API: {e.response.text}")

     raise ValueError("REPLICATE_API_TOKEN environment variable not set.")
 # FastAPI Init
+app = FastAPI(title="Replicate to OpenAI Compatibility Layer", version="9.1.0 (Enhanced Token Tracking)")
 # --- Pydantic Models ---
 class ModelCard(BaseModel):
     return payload
 async def stream_replicate_sse(replicate_model_id: str, input_payload: dict):
+    """Handles the full streaming lifecycle with enhanced token tracking and timing."""
     url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json"}
+    start_time = time.time()
+    prompt_tokens = len(input_payload.get("prompt", "")) // 4  # Rough estimation
+    completion_tokens = 0
     async with httpx.AsyncClient(timeout=60.0) as client:
         try:
             response = await client.post(url, headers=headers, json={"input": input_payload, "stream": True})
                     if line.startswith("event:"):
                         current_event = line[len("event:"):].strip()
                     elif line.startswith("data:"):
+                        # Remove "data:" prefix and optional space
+                        raw_data = line[5:]  # Remove "data:"
                         if raw_data.startswith(" "):
                             data_content = raw_data[1:]  # Remove the first space only
                         else:
                             content_token = ""
                             try:
+                                # Handle JSON-encoded strings properly
                                 content_token = json.loads(data_content)
                             except (json.JSONDecodeError, TypeError):
+                                # Handle plain text tokens
                                 content_token = data_content
+                            completion_tokens += 1
                             chunk = {
                                 "choices": [{
                                     "delta": {"content": content_token},
                                     "native_finish_reason": None
                                 }],
                                 "created": int(time.time()),
+                                "id": f"gen-{int(time.time())}-{prediction_id[-12:]}",
                                 "model": replicate_model_id,
                                 "object": "chat.completion.chunk",
                                 "provider": "Anthropic" if "anthropic" in replicate_model_id else "Replicate"
                             }
                             yield json.dumps(chunk)
                         elif current_event == "done":
+                            # Calculate timing
+                            end_time = time.time()
+                            inference_time = end_time - start_time
                             # Send usage chunk before done
                             usage_chunk = {
                                 "choices": [{
                                 "provider": "Anthropic" if "anthropic" in replicate_model_id else "Replicate",
                                 "usage": {
                                     "cache_discount": 0,
+                                    "completion_tokens": completion_tokens,
                                     "completion_tokens_details": {"image_tokens": 0, "reasoning_tokens": 0},
                                     "cost": 0,
                                     "cost_details": {
                                         "upstream_inference_cost": None,
                                         "upstream_inference_prompt_cost": 0
                                     },
+                                    "input_tokens": prompt_tokens,
                                     "is_byok": False,
+                                    "prompt_tokens": prompt_tokens,
                                     "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0},
+                                    "total_tokens": prompt_tokens + completion_tokens,
+                                    "inference_time": round(inference_time, 3)
                                 }
                             }
                             yield json.dumps(usage_chunk)
     if request.stream:
         return EventSourceResponse(stream_replicate_sse(SUPPORTED_MODELS[request.model], replicate_input), media_type="text/event-stream")
+    # Non-streaming fallback with usage data
     url = f"https://api.replicate.com/v1/models/{SUPPORTED_MODELS[request.model]}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json", "Prefer": "wait=120"}
+    start_time = time.time()
     async with httpx.AsyncClient() as client:
         try:
             resp = await client.post(url, headers=headers, json={"input": replicate_input}, timeout=130.0)
             resp.raise_for_status()
             pred = resp.json()
             output = "".join(pred.get("output", []))
+            # Calculate timing and tokens
+            end_time = time.time()
+            inference_time = end_time - start_time
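+            prompt_tokens = len(input_payload.get("prompt", "")) // 4  # Rough estimation. NOTE(review): `input_payload` is not defined in the visible part of this function (the local used above is `replicate_input`) - confirm; this likely raises NameError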
+            completion_tokens = len(output) // 4  # Rough estimation
             return {
                 "id": pred.get("id"), "object": "chat.completion", "created": int(time.time()), "model": request.model,
                 "choices": [{"index": 0, "message": {"role": "assistant", "content": output}, "finish_reason": "stop"}],
+                "usage": {
+                    "prompt_tokens": prompt_tokens,
+                    "completion_tokens": completion_tokens,
+                    "total_tokens": prompt_tokens + completion_tokens,
+                    "inference_time": round(inference_time, 3)
+                }
             }
         except httpx.HTTPStatusError as e:
             raise HTTPException(status_code=e.response.status_code, detail=f"Error from Replicate API: {e.response.text}")