# app.py
"""
Gemma3 (GGUF) - Gradio Space app (fallback-ready)
Updated: fix for Hugging Face InferenceClient.text_generation() signature
"""
import os
import time
import traceback
import gradio as gr
# -------------------------------------------------------------------------
# Try to import llama-cpp-python (native) — may fail in Spaces build
# -------------------------------------------------------------------------
LLAMA_AVAILABLE = False
llm = None
try:
from llama_cpp import Llama
LLAMA_AVAILABLE = True
except Exception as e:
print("llama-cpp-python not available:", e)
LLAMA_AVAILABLE = False
# -------------------------------------------------------------------------
# Try to import Hugging Face InferenceClient as fallback
# -------------------------------------------------------------------------
HF_AVAILABLE = False
hf_client = None
try:
from huggingface_hub import InferenceClient
    # InferenceClient picks up the token from the HF_TOKEN env var (or a cached `huggingface-cli login`) if set
hf_client = InferenceClient()
HF_AVAILABLE = True
except Exception as e:
print("HF InferenceClient not available or not configured:", e)
HF_AVAILABLE = False
# -------------------------------------------------------------------------
# Configuration (env vars)
# -------------------------------------------------------------------------
MODEL_REPO = os.environ.get("MODEL_REPO", "google/gemma-3-4b-it-qat-q4_0-gguf")
GGUF_PATH = os.environ.get("GGUF_PATH", None) # if the gguf is uploaded to the Space
HF_INFERENCE_MODEL = os.environ.get("HF_INFERENCE_MODEL", "") # optional override for HF inference model id
DEFAULT_MAX_TOKENS = int(os.environ.get("DEFAULT_MAX_TOKENS", 256))
DEFAULT_TEMPERATURE = float(os.environ.get("DEFAULT_TEMPERATURE", 0.8))
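# Example Space configuration (illustrative values only):
#   GGUF_PATH=/workspace/model.gguf       -> serve a .gguf uploaded to the Space via llama-cpp
#   HF_INFERENCE_MODEL=<hosted model id>  -> route requests to the HF Inference API instead
#   DEFAULT_MAX_TOKENS=256  DEFAULT_TEMPERATURE=0.8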
# -------------------------------------------------------------------------
# If llama-cpp is available and a local GGUF file exists (GGUF_PATH or the default path), load it
# -------------------------------------------------------------------------
if LLAMA_AVAILABLE:
try:
        model_path_to_try = GGUF_PATH or os.path.join("/workspace", "model.gguf")
        if not os.path.exists(model_path_to_try):
            raise FileNotFoundError(
                f"No local .gguf found at GGUF_PATH or the default location ({model_path_to_try}). "
                "Set GGUF_PATH or upload the .gguf file into the Space."
            )
print("Loading local model via llama-cpp-python from:", model_path_to_try)
llm = Llama(model_path=model_path_to_try, n_ctx=2048, n_threads=2)
print("Loaded local model successfully.")
except Exception as e:
print("Failed to load local gguf with llama-cpp-python:", e)
print(traceback.format_exc())
llm = None
LLAMA_AVAILABLE = False
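# -------------------------------------------------------------------------
# Optional (untested sketch): instead of uploading the .gguf by hand, the file could
# be fetched from MODEL_REPO at startup with huggingface_hub.hf_hub_download.
# The filename below is a placeholder; check the repo for the exact .gguf file name,
# and note that gated repos (like Gemma) require a valid HF_TOKEN.
# -------------------------------------------------------------------------
# from huggingface_hub import hf_hub_download
# model_path_to_try = hf_hub_download(repo_id=MODEL_REPO, filename="model.gguf")  # placeholder filename
# llm = Llama(model_path=model_path_to_try, n_ctx=2048, n_threads=2)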
# -------------------------------------------------------------------------
# Helper functions for inference
# -------------------------------------------------------------------------
def local_generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
if not llm:
return "Local model not loaded."
try:
resp = llm.create_completion(prompt=prompt, max_tokens=max_tokens, temperature=temperature)
return resp["choices"][0]["text"]
except Exception as e:
print("Error in local_generate:", e)
return f"Local generation error: {e}"
def hf_generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
"""
Corrected HF usage:
- Pass prompt as positional first arg to text_generation()
- Use max_new_tokens (not max_tokens)
- Optionally pass model=HF_INFERENCE_MODEL if set
"""
if not HF_AVAILABLE or hf_client is None:
return "Hugging Face Inference client not available. Set HUGGINGFACE_HUB_TOKEN or enable HF SDK."
try:
kwargs = {
"max_new_tokens": int(max_tokens),
"temperature": float(temperature),
# you can also set stream=True or details=True if desired
}
# include model override only if provided (avoid passing empty string)
if HF_INFERENCE_MODEL:
kwargs["model"] = HF_INFERENCE_MODEL
# NOTE: text_generation expects the prompt as first positional arg.
raw = hf_client.text_generation(prompt, **kwargs)
# raw may be:
# - a simple string with generated text,
# - a TextGenerationOutput object (dataclass-like) or dict,
# - a list containing dict(s) depending on version/backends
# Normalize to a string response:
# case: simple str
if isinstance(raw, str):
return raw
# case: list (e.g., [{"generated_text": "..."}])
if isinstance(raw, list) and len(raw) > 0:
first = raw[0]
if isinstance(first, dict):
# prefer keys commonly returned
return first.get("generated_text") or first.get("text") or str(first)
return str(first)
# case: object with attribute generated_text or dict-like
if hasattr(raw, "generated_text"):
return getattr(raw, "generated_text")
if isinstance(raw, dict):
# try common keys
return raw.get("generated_text") or raw.get("text") or str(raw)
# fallback to string conversion
return str(raw)
except TypeError as te:
        # Usually caused by keyword names that do not match the installed huggingface_hub version.
print("TypeError from hf_client.text_generation:", te)
print(traceback.format_exc())
return f"Hugging Face generation TypeError: {te}. (Check huggingface_hub version & parameter names.)"
except Exception as e:
print("HF generation error:", e)
print(traceback.format_exc())
return f"Hugging Face generation error: {e}"
def generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
    prompt = (prompt or "").strip()
    max_tokens = int(max_tokens)  # Gradio sliders may deliver floats; coerce before passing to the backends
if not prompt:
return "કૃપયા પ્રશ્ન લખો (Please provide a prompt)."
# Prefer local if available
if LLAMA_AVAILABLE and llm:
return local_generate(prompt, max_tokens=max_tokens, temperature=temperature)
elif HF_AVAILABLE and hf_client:
return hf_generate(prompt, max_tokens=max_tokens, temperature=temperature)
else:
return (
"No model runtime is available.\n\n"
"Options:\n"
"1) Upload a .gguf file into the Space and set GGUF_PATH environment variable to its path,\n"
"2) Set HUGGINGFACE_HUB_TOKEN (secret) and HF_INFERENCE_MODEL to a hosted model id to use HF Inference API.\n"
)
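# Quick sanity check outside the UI (illustrative; uncomment to run manually):
# print(generate("Namaste! Introduce yourself in one sentence.", max_tokens=64, temperature=0.7))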
# -------------------------------------------------------------------------
# Gradio UI
# -------------------------------------------------------------------------
title_text = "💎 Gemma3 — Desi Chatbot (GGUF / HF fallback)"
description_text = """
**Gemma3 (quantized GGUF)** — Local inference if available, otherwise fallback to Hugging Face Inference API.
"""
with gr.Blocks(title=title_text) as demo:
gr.Markdown(f"# {title_text}")
gr.Markdown(description_text)
with gr.Row():
with gr.Column(scale=3):
prompt_input = gr.Textbox(lines=5, label="તમારો પ્રશ્ન / Prompt", placeholder="અહીં લખો... (Gujarati/English)")
with gr.Row():
max_tokens = gr.Slider(label="Max tokens", minimum=16, maximum=1024, step=16, value=DEFAULT_MAX_TOKENS)
temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=1.5, step=0.05, value=DEFAULT_TEMPERATURE)
submit_btn = gr.Button("જવાબ આપો")
with gr.Column(scale=2):
status_md = gr.Markdown(
f"**Runtime:** {'local llama-cpp' if (LLAMA_AVAILABLE and llm) else ('HuggingFace Inference' if HF_AVAILABLE else 'No runtime available')}\n\n"
f"- MODEL_REPO: `{MODEL_REPO}`\n"
f"- HF model (inference): `{HF_INFERENCE_MODEL or '<not set>'}`\n"
)
            tips = gr.Markdown("**Tips:** Reduce max tokens if you run out of memory. Upload a smaller Q4-quantized GGUF for Spaces.")
output_box = gr.Textbox(lines=10, label="જવાબ (Response)")
submit_btn.click(fn=generate, inputs=[prompt_input, max_tokens, temperature], outputs=[output_box])
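    # Optional: also trigger generation when the user presses Enter in the textbox.
    # prompt_input.submit(fn=generate, inputs=[prompt_input, max_tokens, temperature], outputs=[output_box])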
if __name__ == "__main__":
print("LLAMA_AVAILABLE:", LLAMA_AVAILABLE)
print("HF_AVAILABLE:", HF_AVAILABLE)
print("MODEL_REPO:", MODEL_REPO)
print("GGUF_PATH:", GGUF_PATH)
demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))