# app.py
"""
Gemma3 (GGUF) - Gradio Space app (fallback-ready)
Updated: fix for Hugging Face InferenceClient.text_generation() signature
"""
import os
import time
import traceback

import gradio as gr
# -------------------------------------------------------------------------
# Try to import llama-cpp-python (native) -- may fail in Spaces build
# -------------------------------------------------------------------------
LLAMA_AVAILABLE = False
llm = None
try:
    from llama_cpp import Llama
    LLAMA_AVAILABLE = True
except Exception as e:
    print("llama-cpp-python not available:", e)
    LLAMA_AVAILABLE = False
# -------------------------------------------------------------------------
# Try to import Hugging Face InferenceClient as fallback
# -------------------------------------------------------------------------
HF_AVAILABLE = False
hf_client = None
try:
    from huggingface_hub import InferenceClient
    # InferenceClient will pick up HUGGINGFACE_HUB_TOKEN from the env if set
    hf_client = InferenceClient()
    HF_AVAILABLE = True
except Exception as e:
    print("HF InferenceClient not available or not configured:", e)
    HF_AVAILABLE = False
# -------------------------------------------------------------------------
# Configuration (env vars)
# -------------------------------------------------------------------------
MODEL_REPO = os.environ.get("MODEL_REPO", "google/gemma-3-4b-it-qat-q4_0-gguf")
GGUF_PATH = os.environ.get("GGUF_PATH", None)  # if the gguf is uploaded to the Space
HF_INFERENCE_MODEL = os.environ.get("HF_INFERENCE_MODEL", "")  # optional override for HF inference model id
DEFAULT_MAX_TOKENS = int(os.environ.get("DEFAULT_MAX_TOKENS", 256))
DEFAULT_TEMPERATURE = float(os.environ.get("DEFAULT_TEMPERATURE", 0.8))
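# Example settings (illustrative values only -- in a Space these are normally set as
# Variables/Secrets in the Space settings rather than edited in code):
#   GGUF_PATH=/home/user/app/model.gguf
#   HF_INFERENCE_MODEL=<a hosted model id served by the Inference API>
#   DEFAULT_MAX_TOKENS=256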
# -------------------------------------------------------------------------
# If llama-cpp is available and a GGUF path is provided (or MODEL_REPO downloaded), load the model
# -------------------------------------------------------------------------
if LLAMA_AVAILABLE:
    try:
        model_path_to_try = GGUF_PATH or os.path.join("/workspace", "model.gguf")
        if GGUF_PATH and os.path.exists(GGUF_PATH):
            model_path_to_try = GGUF_PATH
        elif os.path.exists(model_path_to_try):
            pass
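        # Optional sketch for the "(or MODEL_REPO downloaded)" case mentioned in the
        # section header above, assuming hypothetical AUTO_DOWNLOAD_GGUF / GGUF_FILENAME
        # env vars (not part of the original config) and that MODEL_REPO hosts a .gguf
        # with that filename; hf_hub_download() fetches it into the local HF cache and
        # returns the downloaded path.
        elif os.environ.get("AUTO_DOWNLOAD_GGUF") == "1":
            from huggingface_hub import hf_hub_download
            model_path_to_try = hf_hub_download(
                repo_id=MODEL_REPO,
                filename=os.environ.get("GGUF_FILENAME", "gemma-3-4b-it-q4_0.gguf"),
            )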
        else:
            raise FileNotFoundError(
                f"No local .gguf found at GGUF_PATH or default ({model_path_to_try}). "
                "Set GGUF_PATH or upload the .gguf file into the Space."
            )
        print("Loading local model via llama-cpp-python from:", model_path_to_try)
        llm = Llama(model_path=model_path_to_try, n_ctx=2048, n_threads=2)
        print("Loaded local model successfully.")
    except Exception as e:
        print("Failed to load local gguf with llama-cpp-python:", e)
        print(traceback.format_exc())
        llm = None
        LLAMA_AVAILABLE = False
# -------------------------------------------------------------------------
# Helper functions for inference
# -------------------------------------------------------------------------
def local_generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
    if not llm:
        return "Local model not loaded."
    try:
        resp = llm.create_completion(
            prompt=prompt,
            max_tokens=int(max_tokens),  # cast in case the Gradio slider passes a float
            temperature=float(temperature),
        )
        return resp["choices"][0]["text"]
    except Exception as e:
        print("Error in local_generate:", e)
        return f"Local generation error: {e}"
def hf_generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
    """
    Corrected HF usage:
    - Pass prompt as positional first arg to text_generation()
    - Use max_new_tokens (not max_tokens)
    - Optionally pass model=HF_INFERENCE_MODEL if set
    """
    if not HF_AVAILABLE or hf_client is None:
        return "Hugging Face Inference client not available. Set HUGGINGFACE_HUB_TOKEN or enable HF SDK."
    try:
        kwargs = {
            "max_new_tokens": int(max_tokens),
            "temperature": float(temperature),
            # you can also set stream=True or details=True if desired
        }
        # include model override only if provided (avoid passing empty string)
        if HF_INFERENCE_MODEL:
            kwargs["model"] = HF_INFERENCE_MODEL
        # NOTE: text_generation expects the prompt as first positional arg.
        raw = hf_client.text_generation(prompt, **kwargs)
        # raw may be:
        # - a simple string with generated text,
        # - a TextGenerationOutput object (dataclass-like) or dict,
        # - a list containing dict(s) depending on version/backends
        # Normalize to a string response:
        # case: simple str
        if isinstance(raw, str):
            return raw
        # case: list (e.g., [{"generated_text": "..."}])
        if isinstance(raw, list) and len(raw) > 0:
            first = raw[0]
            if isinstance(first, dict):
                # prefer keys commonly returned
                return first.get("generated_text") or first.get("text") or str(first)
            return str(first)
        # case: object with attribute generated_text or dict-like
        if hasattr(raw, "generated_text"):
            return getattr(raw, "generated_text")
        if isinstance(raw, dict):
            # try common keys
            return raw.get("generated_text") or raw.get("text") or str(raw)
        # fallback to string conversion
        return str(raw)
    except TypeError as te:
        # common mistake: wrong kw names (we tried to guard this), print helpful msg
        print("TypeError from hf_client.text_generation:", te)
        print(traceback.format_exc())
        return f"Hugging Face generation TypeError: {te}. (Check huggingface_hub version & parameter names.)"
    except Exception as e:
        print("HF generation error:", e)
        print(traceback.format_exc())
        return f"Hugging Face generation error: {e}"
def generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
    prompt = (prompt or "").strip()
    if not prompt:
        return "કૃપયા પ્રશ્ન લખો (Please provide a prompt)."
    # Prefer local if available
    if LLAMA_AVAILABLE and llm:
        return local_generate(prompt, max_tokens=max_tokens, temperature=temperature)
    elif HF_AVAILABLE and hf_client:
        return hf_generate(prompt, max_tokens=max_tokens, temperature=temperature)
    else:
        return (
            "No model runtime is available.\n\n"
            "Options:\n"
            "1) Upload a .gguf file into the Space and set GGUF_PATH environment variable to its path,\n"
            "2) Set HUGGINGFACE_HUB_TOKEN (secret) and HF_INFERENCE_MODEL to a hosted model id to use HF Inference API.\n"
        )
# -------------------------------------------------------------------------
# Gradio UI
# -------------------------------------------------------------------------
title_text = "💎 Gemma3 — Desi Chatbot (GGUF / HF fallback)"
description_text = """
**Gemma3 (quantized GGUF)** — Local inference if available, otherwise fallback to Hugging Face Inference API.
"""

with gr.Blocks(title=title_text) as demo:
    gr.Markdown(f"# {title_text}")
    gr.Markdown(description_text)
    with gr.Row():
        with gr.Column(scale=3):
            prompt_input = gr.Textbox(lines=5, label="તમારો પ્રશ્ન / Prompt", placeholder="અહીં લખો... (Gujarati/English)")
            with gr.Row():
                max_tokens = gr.Slider(label="Max tokens", minimum=16, maximum=1024, step=16, value=DEFAULT_MAX_TOKENS)
                temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=1.5, step=0.05, value=DEFAULT_TEMPERATURE)
            submit_btn = gr.Button("જવાબ આપો")
        with gr.Column(scale=2):
            status_md = gr.Markdown(
                f"**Runtime:** {'local llama-cpp' if (LLAMA_AVAILABLE and llm) else ('HuggingFace Inference' if HF_AVAILABLE else 'No runtime available')}\n\n"
                f"- MODEL_REPO: `{MODEL_REPO}`\n"
                f"- HF model (inference): `{HF_INFERENCE_MODEL or '<not set>'}`\n"
            )
            tips = gr.Markdown("**Tips:** Reduce max tokens if you see OOM. Upload a smaller Q4 quantized GGUF for Spaces.")

    output_box = gr.Textbox(lines=10, label="જવાબ (Response)")

    submit_btn.click(fn=generate, inputs=[prompt_input, max_tokens, temperature], outputs=[output_box])
if __name__ == "__main__":
    print("LLAMA_AVAILABLE:", LLAMA_AVAILABLE)
    print("HF_AVAILABLE:", HF_AVAILABLE)
    print("MODEL_REPO:", MODEL_REPO)
    print("GGUF_PATH:", GGUF_PATH)
    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))