| """ | |
| HuggingFace ZeroGPU Space - OpenAI-compatible inference provider for opencode. | |
| This Gradio app provides: | |
| - OpenAI-compatible API via Gradio's native API system | |
| - Pass-through model selection (any HF model ID) | |
| - ZeroGPU H200 inference with HF Serverless fallback | |
| - HF Token authentication | |
| - SSE streaming support | |
| """ | |

# Import spaces FIRST - required for ZeroGPU GPU detection
import spaces

import logging
import time
from typing import Optional

import gradio as gr
import httpx
from huggingface_hub import HfApi

from config import get_config, get_quota_tracker
from models import (
    apply_chat_template,
    generate_text,
    generate_text_stream,
    get_current_model,
)
from openai_compat import (
    ChatCompletionRequest,
    InferenceParams,
    create_chat_response,
    create_error_response,
    estimate_tokens,
)

logger = logging.getLogger(__name__)

config = get_config()
quota_tracker = get_quota_tracker()

# HuggingFace API for token validation
hf_api = HfApi()

ZEROGPU_AVAILABLE = True


# --- Authentication ---

def validate_hf_token(token: str) -> bool:
    """Validate a HuggingFace token by checking with the API."""
    if not token or not token.startswith("hf_"):
        return False
    try:
        hf_api.whoami(token=token)
        return True
    except Exception:
        return False


# --- ZeroGPU Inference Functions ---
# These MUST be decorated with @spaces.GPU for ZeroGPU detection

@spaces.GPU
def zerogpu_generate(
    model_id: str,
    prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
) -> str:
    """Generate text using ZeroGPU (H200 GPU)."""
    start_time = time.time()
    result = generate_text(
        model_id=model_id,
        prompt=prompt,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        stop_sequences=None,
    )
    # Track quota usage
    duration = time.time() - start_time
    quota_tracker.add_usage(duration)
    return result


# --- HF Serverless Fallback ---

def serverless_generate_sync(
    model_id: str,
    prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    token: str,
) -> str:
    """Generate text using HuggingFace Serverless Inference API (sync version)."""
    url = f"https://api-inference.huggingface.co/models/{model_id}"
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_p": top_p,
            "return_full_text": False,
        },
    }
    with httpx.Client() as client:
        response = client.post(
            url,
            json=payload,
            headers={"Authorization": f"Bearer {token}"},
            timeout=120.0,
        )
    if response.status_code != 200:
        raise Exception(f"HF Serverless error: {response.text}")
    result = response.json()
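    # Illustrative note: a successful text-generation response typically looks like
    #   [{"generated_text": "...model output..."}]
    # which is the shape handled below; anything else raises.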
    # Handle different response formats
    if isinstance(result, list) and len(result) > 0:
        if "generated_text" in result[0]:
            return result[0]["generated_text"]
    raise Exception(f"Unexpected response format from HF Serverless: {result}")


# --- Gradio Chat Function (GPU decorated for ZeroGPU) ---

@spaces.GPU
def gradio_chat(
    message: str,
    history: list[list[str]],
    model_id: str,
    temperature: float,
    max_tokens: int,
):
    """Gradio chat interface handler - GPU decorated for ZeroGPU."""
    # Validate model_id
    if not model_id:
        return "Please select a model first."

    # Build messages from history
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # Apply chat template
    try:
        prompt = apply_chat_template(model_id, messages)
    except Exception as e:
        return f"Error loading model: {str(e)}"

    # Generate response (non-streaming for simplicity with ZeroGPU)
    try:
        response = generate_text(
            model_id=model_id,
            prompt=prompt,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=0.95,
            stop_sequences=None,
        )
        return response
    except Exception as e:
        return f"Error generating response: {str(e)}"


# --- API Functions for Gradio's gr.api() ---

def api_health() -> dict:
    """Health check endpoint."""
    return {
        "status": "healthy",
        "zerogpu_available": ZEROGPU_AVAILABLE,
        "quota_remaining_minutes": quota_tracker.remaining_minutes(),
        "fallback_enabled": config.fallback_enabled,
    }


def api_chat_completions(
    token: str,
    model: str,
    messages: list[dict],
    temperature: float = 0.7,
    max_tokens: int = 512,
    top_p: float = 0.95,
) -> dict:
    """
    OpenAI-compatible chat completions.

    Args:
        token: HuggingFace API token (hf_xxx)
        model: HuggingFace model ID (e.g., "meta-llama/Llama-3.1-8B-Instruct")
        messages: List of message dicts with "role" and "content"
        temperature: Sampling temperature (0.0-2.0)
        max_tokens: Maximum tokens to generate
        top_p: Nucleus sampling probability

    Returns:
        OpenAI-compatible response dict
    """
    # Validate authentication
    if not token or not validate_hf_token(token):
        return create_error_response(
            message="Invalid or missing HuggingFace token",
            error_type="authentication_error",
            code="invalid_api_key",
        ).model_dump()

    # Apply chat template
    try:
        prompt = apply_chat_template(model, messages)
    except Exception as e:
        logger.error(f"Failed to apply chat template: {e}")
        return create_error_response(
            message=f"Failed to load model or apply chat template: {str(e)}",
            error_type="invalid_request_error",
            param="model",
        ).model_dump()

    prompt_tokens = estimate_tokens(prompt)

    # Determine inference method
    use_zerogpu = ZEROGPU_AVAILABLE and not quota_tracker.quota_exhausted

    if not use_zerogpu and not config.fallback_enabled:
        return create_error_response(
            message="ZeroGPU quota exhausted and fallback is disabled",
            error_type="server_error",
            code="quota_exhausted",
        ).model_dump()

    try:
        # Non-streaming response
        if use_zerogpu:
            response_text = zerogpu_generate(
                model_id=model,
                prompt=prompt,
                max_new_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
            )
        else:
            logger.info("Using HF Serverless fallback")
            response_text = serverless_generate_sync(
                model_id=model,
                prompt=prompt,
                max_new_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                token=token,
            )

        completion_tokens = estimate_tokens(response_text)

        return create_chat_response(
            model=model,
            content=response_text,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
        ).model_dump()
    except Exception as e:
        logger.exception(f"Inference error: {e}")
        return create_error_response(
            message=f"Inference failed: {str(e)}",
            error_type="server_error",
        ).model_dump()


# --- Build Gradio Interface ---

with gr.Blocks(title="ZeroGPU OpenCode Provider") as demo:
    gr.Markdown(
        """
# ZeroGPU OpenCode Provider

OpenAI-compatible inference endpoint for [opencode](https://github.com/sst/opencode).

**API Endpoints:**
- `/api/health` - Health check
- `/api/chat_completions` - Chat completions (OpenAI-compatible response format)

## Usage with opencode

Configure in `~/.config/opencode/opencode.json`:

```json
{
  "providers": {
    "zerogpu": {
      "npm": "@ai-sdk/openai-compatible",
      "options": {
        "baseURL": "https://serenichron-opencode-zerogpu.hf.space/api",
        "headers": {
          "Authorization": "Bearer hf_YOUR_TOKEN"
        }
      },
      "models": {
        "llama-8b": {
          "name": "meta-llama/Llama-3.1-8B-Instruct"
        }
      }
    }
  }
}
```

---
"""
    )

    with gr.Row():
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                label="Model",
                choices=[
                    "meta-llama/Llama-3.1-8B-Instruct",
                    "mistralai/Mistral-7B-Instruct-v0.3",
                    "Qwen/Qwen2.5-7B-Instruct",
                    "Qwen/Qwen2.5-14B-Instruct",
                ],
                value="meta-llama/Llama-3.1-8B-Instruct",
                allow_custom_value=True,
            )
            temperature_slider = gr.Slider(
                label="Temperature",
                minimum=0.0,
                maximum=2.0,
                value=0.7,
                step=0.1,
            )
            max_tokens_slider = gr.Slider(
                label="Max Tokens",
                minimum=64,
                maximum=4096,
                value=512,
                step=64,
            )
            gr.Markdown(
                f"""
### Status
- **ZeroGPU:** {'Available' if ZEROGPU_AVAILABLE else 'Not Available'}
- **Fallback:** {'Enabled' if config.fallback_enabled else 'Disabled'}
"""
            )
        with gr.Column(scale=3):
            chatbot = gr.ChatInterface(
                fn=gradio_chat,
                additional_inputs=[model_dropdown, temperature_slider, max_tokens_slider],
                title="",
            )

    # Register API endpoints using Gradio's API system
    # These will be available at /api/<name>
    gr.api(api_health, api_name="health")
    gr.api(api_chat_completions, api_name="chat_completions")
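
# Illustrative client-side sketch (not part of this app): the registered endpoints
# can also be exercised from Python via gradio_client. The Space ID below is an
# assumption - substitute your own deployment.
#
#     from gradio_client import Client
#     client = Client("Serenichron/opencode-zerogpu", hf_token="hf_YOUR_TOKEN")
#     result = client.predict(
#         token="hf_YOUR_TOKEN",
#         model="Qwen/Qwen2.5-7B-Instruct",
#         messages=[{"role": "user", "content": "Hello!"}],
#         api_name="/chat_completions",
#     )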


# --- Launch the application ---
# On HuggingFace Spaces, the runtime handles the launch automatically
# We just expose the demo object
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)