Spaces:

rkihacker
/

R2OAI

Paused

App Files Files Community

rkihacker committed on Oct 21

Commit

89b138d

verified ·

1 Parent(s): 1f3c557

Create main.py

Browse files

Files changed (1) hide show

main.py +279 -0

main.py ADDED Viewed

	@@ -0,0 +1,279 @@

+import os
+import httpx
+import json
+import time
+from fastapi import FastAPI, Request, HTTPException, Header
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel, Field
+from typing import List, Dict, Any, Optional, Union, Literal
+from dotenv import load_dotenv
+from sse_starlette.sse import EventSourceResponse
+# Load environment variables from .env file
+load_dotenv()
+# --- Configuration ---
+REPLICATE_API_TOKEN = os.getenv("REPLICATE_API_TOKEN")
+if not REPLICATE_API_TOKEN:
+    raise ValueError("REPLICATE_API_TOKEN environment variable not set.")
+# --- FastAPI App Initialization ---
+app = FastAPI(
+    title="Replicate to OpenAI Compatibility Layer",
+    version="1.0.0",
+)
+# --- Pydantic Models for OpenAI Compatibility ---
+# /v1/models endpoint
+class ModelCard(BaseModel):
+    id: str
+    object: str = "model"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    owned_by: str = "replicate"
+class ModelList(BaseModel):
+    object: str = "list"
+    data: List[ModelCard] = []
+# /v1/chat/completions endpoint
+class ChatMessage(BaseModel):
+    role: Literal["system", "user", "assistant", "tool"]
+    content: Union[str, List[Dict[str, Any]]]
+class ToolFunction(BaseModel):
+    name: str
+    description: str
+    parameters: Dict[str, Any]
+class Tool(BaseModel):
+    type: Literal["function"]
+    function: ToolFunction
+class OpenAIChatCompletionRequest(BaseModel):
+    model: str
+    messages: List[ChatMessage]
+    temperature: Optional[float] = 0.7
+    top_p: Optional[float] = 1.0
+    max_tokens: Optional[int] = None
+    stream: Optional[bool] = False
+    tools: Optional[List[Tool]] = None
+    tool_choice: Optional[Union[str, Dict]] = None
+# --- Replicate Model Mapping ---
+# We hardcode the models we want to expose.
+SUPPORTED_MODELS = {
+    "llama3-8b-instruct": "meta/meta-llama-3-8b-instruct",
+    "claude-4.5-haiku": "anthropic/claude-4.5-haiku"
+}
+# --- Helper Functions ---
+def format_tools_for_prompt(tools: List[Tool]) -> str:
+    """Converts OpenAI tools to a string for the system prompt."""
+    if not tools:
+        return ""
+    prompt = "You have access to the following tools. To use a tool, respond with a JSON object in the following format:\n"
+    prompt += '{"type": "tool_call", "name": "tool_name", "arguments": {"arg_name": "value"}}\n\n'
+    prompt += "Available tools:\n"
+    for tool in tools:
+        prompt += json.dumps(tool.function.dict(), indent=2) + "\n"
+    return prompt
+def prepare_replicate_input(request: OpenAIChatCompletionRequest) -> Dict[str, Any]:
+    """Prepares the input payload for the Replicate API."""
+    input_data = {}
+    prompt_parts = []
+    system_prompt = ""
+    # Handle messages, separating system, user, assistant and vision content
+    image_url = None
+    for message in request.messages:
+        if message.role == "system":
+            system_prompt += message.content + "\n"
+        elif message.role == "user":
+            if isinstance(message.content, list): # Vision support
+                for item in message.content:
+                    if item.get("type") == "text":
+                        prompt_parts.append(f"User: {item.get('text', '')}")
+                    elif item.get("type") == "image_url":
+                        image_url = item.get("image_url", {}).get("url")
+            else:
+                prompt_parts.append(f"User: {message.content}")
+        elif message.role == "assistant":
+            prompt_parts.append(f"Assistant: {message.content}")
+    # Add tool instructions to system prompt
+    if request.tools:
+        tool_prompt = format_tools_for_prompt(request.tools)
+        system_prompt += "\n" + tool_prompt
+    input_data["prompt"] = "\n".join(prompt_parts)
+    if system_prompt:
+        input_data["system_prompt"] = system_prompt
+    if image_url:
+        input_data["image"] = image_url
+    # Map other parameters
+    if request.temperature is not None:
+        input_data["temperature"] = request.temperature
+    if request.top_p is not None:
+        input_data["top_p"] = request.top_p
+    if request.max_tokens is not None:
+        # Replicate uses `max_new_tokens` or `max_tokens` depending on model
+        input_data["max_new_tokens"] = request.max_tokens
+    return input_data
+async def stream_replicate_response(model_id: str, payload: dict):
+    """Generator for streaming Replicate responses."""
+    url = f"https://api.replicate.com/v1/models/{model_id}/predictions"
+    headers = {
+        "Authorization": f"Bearer {REPLICATE_API_TOKEN}",
+        "Content-Type": "application/json",
+    }
+    async with httpx.AsyncClient(timeout=300) as client:
+        # 1. Create the prediction and get the stream URL
+        payload["stream"] = True
+        try:
+            response = await client.post(url, headers=headers, json={"input": payload})
+            response.raise_for_status()
+            prediction = response.json()
+            stream_url = prediction.get("urls", {}).get("stream")
+            if not stream_url:
+                yield f"data: {json.dumps({'error': 'Failed to get stream URL'})}\n\n"
+                return
+        except httpx.HTTPStatusError as e:
+            yield f"data: {json.dumps({'error': str(e.response.text)})}\n\n"
+            return
+        # 2. Connect to the SSE stream
+        try:
+            async with client.stream("GET", stream_url, headers={"Accept": "text/event-stream"}) as sse:
+                async for line in sse.aiter_lines():
+                    if line.startswith("data:"):
+                        event_data = line[len("data:"):].strip()
+                        try:
+                            data = json.loads(event_data)
+                            # Format as OpenAI chunk
+                            chunk = {
+                                "id": prediction["id"],
+                                "object": "chat.completion.chunk",
+                                "created": int(time.time()),
+                                "model": model_id,
+                                "choices": [{
+                                    "index": 0,
+                                    "delta": {"content": data},
+                                    "finish_reason": None
+                                }]
+                            }
+                            yield f"data: {json.dumps(chunk)}\n\n"
+                        except json.JSONDecodeError:
+                            continue # Skip non-json lines
+        except Exception as e:
+            yield f"data: {json.dumps({'error': f'Streaming error: {str(e)}'})}\n\n"
+    # Send the done signal
+    done_chunk = {
+        "id": prediction["id"],
+        "object": "chat.completion.chunk",
+        "created": int(time.time()),
+        "model": model_id,
+        "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]
+    }
+    yield f"data: {json.dumps(done_chunk)}\n\n"
+    yield "data: [DONE]\n\n"
+# --- API Endpoints ---
+@app.get("/v1/models", response_model=ModelList)
+async def list_models():
+    """Lists the available models that this compatibility layer supports."""
+    model_cards = [
+        ModelCard(id=model_name) for model_name in SUPPORTED_MODELS.keys()
+    ]
+    return ModelList(data=model_cards)
+@app.post("/v1/chat/completions")
+async def create_chat_completion(request: OpenAIChatCompletionRequest):
+    """Creates a chat completion, either streaming or synchronous."""
+    model_key = request.model
+    if model_key not in SUPPORTED_MODELS:
+        raise HTTPException(status_code=404, detail=f"Model not found. Supported models: {list(SUPPORTED_MODELS.keys())}")
+    replicate_model_id = SUPPORTED_MODELS[model_key]
+    replicate_input = prepare_replicate_input(request)
+    if request.stream:
+        return EventSourceResponse(stream_replicate_response(replicate_model_id, replicate_input))
+    # Synchronous request
+    url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
+    headers = {
+        "Authorization": f"Bearer {REPLICATE_API_TOKEN}",
+        "Content-Type": "application/json",
+        "Prefer": "wait=120" # Wait up to 120 seconds for a response
+    }
+    async with httpx.AsyncClient(timeout=150) as client:
+        try:
+            response = await client.post(url, headers=headers, json={"input": replicate_input})
+            response.raise_for_status()
+            prediction = response.json()
+            output = prediction.get("output", "")
+            if isinstance(output, list):
+                output = "".join(output)
+            # Check for tool call
+            try:
+                # A simple check if the output is a JSON for a tool call
+                tool_call_data = json.loads(output)
+                if tool_call_data.get("type") == "tool_call":
+                    message_content = None
+                    tool_calls = [{
+                        "id": f"call_{int(time.time())}",
+                        "type": "function",
+                        "function": {
+                            "name": tool_call_data["name"],
+                            "arguments": json.dumps(tool_call_data["arguments"])
+                        }
+                    }]
+                else:
+                    message_content = output
+                    tool_calls = None
+            except (json.JSONDecodeError, TypeError):
+                message_content = output
+                tool_calls = None
+            # Format response in OpenAI format
+            completion_response = {
+                "id": prediction["id"],
+                "object": "chat.completion",
+                "created": int(time.time()),
+                "model": model_key,
+                "choices": [{
+                    "index": 0,
+                    "message": {
+                        "role": "assistant",
+                        "content": message_content,
+                        "tool_calls": tool_calls,
+                    },
+                    "finish_reason": "stop" # Or map from Replicate if available
+                }],
+                "usage": { # Note: Replicate doesn't provide token usage in the same way
+                    "prompt_tokens": 0,
+                    "completion_tokens": 0,
+                    "total_tokens": 0
+                }
+            }
+            return JSONResponse(content=completion_response)
+        except httpx.HTTPStatusError as e:
+            raise HTTPException(status_code=e.response.status_code, detail=e.response.text)