# app.py
"""
Gemma3 (GGUF) - Gradio Space app (fallback-ready)
Updated: fix for Hugging Face InferenceClient.text_generation() signature
"""
import os
import time
import traceback
import gradio as gr
# -------------------------------------------------------------------------
# Try to import llama-cpp-python (native) — may fail in Spaces build
# -------------------------------------------------------------------------
LLAMA_AVAILABLE = False
llm = None
try:
from llama_cpp import Llama
LLAMA_AVAILABLE = True
except Exception as e:
print("llama-cpp-python not available:", e)
LLAMA_AVAILABLE = False
# -------------------------------------------------------------------------
# Try to import Hugging Face InferenceClient as fallback
# -------------------------------------------------------------------------
HF_AVAILABLE = False
hf_client = None
try:
from huggingface_hub import InferenceClient
    # InferenceClient picks up the token from the HF_TOKEN env var (or a cached `huggingface-cli login`) if set
hf_client = InferenceClient()
HF_AVAILABLE = True
except Exception as e:
print("HF InferenceClient not available or not configured:", e)
HF_AVAILABLE = False
# -------------------------------------------------------------------------
# Configuration (env vars)
# -------------------------------------------------------------------------
MODEL_REPO = os.environ.get("MODEL_REPO", "google/gemma-3-4b-it-qat-q4_0-gguf")
GGUF_PATH = os.environ.get("GGUF_PATH", None) # if the gguf is uploaded to the Space
HF_INFERENCE_MODEL = os.environ.get("HF_INFERENCE_MODEL", "") # optional override for HF inference model id
DEFAULT_MAX_TOKENS = int(os.environ.get("DEFAULT_MAX_TOKENS", 256))
DEFAULT_TEMPERATURE = float(os.environ.get("DEFAULT_TEMPERATURE", 0.8))
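# Example Space configuration (illustrative values only):
#   GGUF_PATH=/workspace/model.gguf       -> serve a .gguf uploaded to the Space via llama-cpp
#   HF_INFERENCE_MODEL=<hosted model id>  -> route requests to the HF Inference API instead
#   DEFAULT_MAX_TOKENS=256  DEFAULT_TEMPERATURE=0.8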
# -------------------------------------------------------------------------
# If llama-cpp is available and a local GGUF file exists (GGUF_PATH or the default path), load it
# -------------------------------------------------------------------------
if LLAMA_AVAILABLE:
try:
        model_path_to_try = GGUF_PATH or os.path.join("/workspace", "model.gguf")
        if not os.path.exists(model_path_to_try):
            raise FileNotFoundError(
                f"No local .gguf found at GGUF_PATH or the default location ({model_path_to_try}). "
                "Set GGUF_PATH or upload the .gguf file into the Space."
            )
print("Loading local model via llama-cpp-python from:", model_path_to_try)
llm = Llama(model_path=model_path_to_try, n_ctx=2048, n_threads=2)
print("Loaded local model successfully.")
except Exception as e:
print("Failed to load local gguf with llama-cpp-python:", e)
print(traceback.format_exc())
llm = None
LLAMA_AVAILABLE = False
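# -------------------------------------------------------------------------
# Optional (untested sketch): instead of uploading the .gguf by hand, the file could
# be fetched from MODEL_REPO at startup with huggingface_hub.hf_hub_download.
# The filename below is a placeholder; check the repo for the exact .gguf file name,
# and note that gated repos (like Gemma) require a valid HF_TOKEN.
# -------------------------------------------------------------------------
# from huggingface_hub import hf_hub_download
# model_path_to_try = hf_hub_download(repo_id=MODEL_REPO, filename="model.gguf")  # placeholder filename
# llm = Llama(model_path=model_path_to_try, n_ctx=2048, n_threads=2)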
# -------------------------------------------------------------------------
# Helper functions for inference
# -------------------------------------------------------------------------
def local_generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
if not llm:
return "Local model not loaded."
try:
resp = llm.create_completion(prompt=prompt, max_tokens=max_tokens, temperature=temperature)
return resp["choices"][0]["text"]
except Exception as e:
print("Error in local_generate:", e)
return f"Local generation error: {e}"
def hf_generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
"""
Corrected HF usage:
- Pass prompt as positional first arg to text_generation()
- Use max_new_tokens (not max_tokens)
- Optionally pass model=HF_INFERENCE_MODEL if set
"""
if not HF_AVAILABLE or hf_client is None:
return "Hugging Face Inference client not available. Set HUGGINGFACE_HUB_TOKEN or enable HF SDK."
try:
kwargs = {
"max_new_tokens": int(max_tokens),
"temperature": float(temperature),
# you can also set stream=True or details=True if desired
}
# include model override only if provided (avoid passing empty string)
if HF_INFERENCE_MODEL:
kwargs["model"] = HF_INFERENCE_MODEL
# NOTE: text_generation expects the prompt as first positional arg.
raw = hf_client.text_generation(prompt, **kwargs)
# raw may be:
# - a simple string with generated text,
# - a TextGenerationOutput object (dataclass-like) or dict,
# - a list containing dict(s) depending on version/backends
# Normalize to a string response:
# case: simple str
if isinstance(raw, str):
return raw
# case: list (e.g., [{"generated_text": "..."}])
if isinstance(raw, list) and len(raw) > 0:
first = raw[0]
if isinstance(first, dict):
# prefer keys commonly returned
return first.get("generated_text") or first.get("text") or str(first)
return str(first)
# case: object with attribute generated_text or dict-like
if hasattr(raw, "generated_text"):
return getattr(raw, "generated_text")
if isinstance(raw, dict):
# try common keys
return raw.get("generated_text") or raw.get("text") or str(raw)
# fallback to string conversion
return str(raw)
except TypeError as te:
        # Usually caused by keyword names that do not match the installed huggingface_hub version.
print("TypeError from hf_client.text_generation:", te)
print(traceback.format_exc())
return f"Hugging Face generation TypeError: {te}. (Check huggingface_hub version & parameter names.)"
except Exception as e:
print("HF generation error:", e)
print(traceback.format_exc())
return f"Hugging Face generation error: {e}"
def generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
    prompt = (prompt or "").strip()
    max_tokens = int(max_tokens)  # Gradio sliders may deliver floats; coerce before passing to the backends
if not prompt:
return "કૃપયા પ્રશ્ન લખો (Please provide a prompt)."
# Prefer local if available
if LLAMA_AVAILABLE and llm:
return local_generate(prompt, max_tokens=max_tokens, temperature=temperature)
elif HF_AVAILABLE and hf_client:
return hf_generate(prompt, max_tokens=max_tokens, temperature=temperature)
else:
return (
"No model runtime is available.\n\n"
"Options:\n"
"1) Upload a .gguf file into the Space and set GGUF_PATH environment variable to its path,\n"
"2) Set HUGGINGFACE_HUB_TOKEN (secret) and HF_INFERENCE_MODEL to a hosted model id to use HF Inference API.\n"
)
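# Quick sanity check outside the UI (illustrative; uncomment to run manually):
# print(generate("Namaste! Introduce yourself in one sentence.", max_tokens=64, temperature=0.7))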
# -------------------------------------------------------------------------
# Gradio UI
# -------------------------------------------------------------------------
title_text = "💎 Gemma3 — Desi Chatbot (GGUF / HF fallback)"
description_text = """
**Gemma3 (quantized GGUF)** — Local inference if available, otherwise fallback to Hugging Face Inference API.
"""
with gr.Blocks(title=title_text) as demo:
gr.Markdown(f"# {title_text}")
gr.Markdown(description_text)
with gr.Row():
with gr.Column(scale=3):
prompt_input = gr.Textbox(lines=5, label="તમારો પ્રશ્ન / Prompt", placeholder="અહીં લખો... (Gujarati/English)")
with gr.Row():
max_tokens = gr.Slider(label="Max tokens", minimum=16, maximum=1024, step=16, value=DEFAULT_MAX_TOKENS)
temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=1.5, step=0.05, value=DEFAULT_TEMPERATURE)
submit_btn = gr.Button("જવાબ આપો")
with gr.Column(scale=2):
status_md = gr.Markdown(
f"**Runtime:** {'local llama-cpp' if (LLAMA_AVAILABLE and llm) else ('HuggingFace Inference' if HF_AVAILABLE else 'No runtime available')}\n\n"
f"- MODEL_REPO: `{MODEL_REPO}`\n"
f"- HF model (inference): `{HF_INFERENCE_MODEL or '<not set>'}`\n"
)
            tips = gr.Markdown("**Tips:** Reduce max tokens if you run out of memory. Upload a smaller Q4-quantized GGUF for Spaces.")
output_box = gr.Textbox(lines=10, label="જવાબ (Response)")
submit_btn.click(fn=generate, inputs=[prompt_input, max_tokens, temperature], outputs=[output_box])
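    # Optional: also trigger generation when the user presses Enter in the textbox.
    # prompt_input.submit(fn=generate, inputs=[prompt_input, max_tokens, temperature], outputs=[output_box])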
if __name__ == "__main__":
print("LLAMA_AVAILABLE:", LLAMA_AVAILABLE)
print("HF_AVAILABLE:", HF_AVAILABLE)
print("MODEL_REPO:", MODEL_REPO)
print("GGUF_PATH:", GGUF_PATH)
demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))