import os
import torch
import threading
from transformers import TextIteratorStreamer  # streams decoded tokens out of model.generate()
from unsloth import FastModel
import gradio as gr
# Set environment for Hugging Face Spaces
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
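# CUDA_LAUNCH_BLOCKING=1 forces synchronous CUDA kernel launches so errors surface with
# accurate stack traces; it adds a small slowdown, so it is mainly a debugging aid.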
# Load the model from Hugging Face Model Hub
model_repo_id = 'adarsh3601/my_gemma3_pt'
# Load model and tokenizer using FastModel
model, tokenizer = FastModel.from_pretrained(
    model_name=model_repo_id,
    max_seq_length=2048,
    load_in_4bit=True,   # 4-bit quantization keeps the model within the Space's GPU memory
    load_in_8bit=False,
    full_finetuning=False,
)
# Optional: compile the model for a speed boost on PyTorch 2.x
if torch.__version__.startswith("2"):
    model = torch.compile(model)
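# Note: torch.compile builds its optimized graph lazily, so the first generate() call
# pays the compilation cost; subsequent calls reuse the compiled model.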
# Function to generate text with streaming
def generate_text(user_input):
    messages = [{
        "role": "user",
        "content": [{"type": "text", "text": user_input}]
    }]
    # Render the chat template to a prompt string (tokenize=False so we can batch-encode below)
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to("cuda")

    # Stream decoded tokens as they are produced, skipping the prompt and special tokens
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=128,  # Adjust based on desired response length
        do_sample=True,      # Required for temperature/top_p/top_k to take effect
        temperature=1.0,
        top_p=0.95,
        top_k=64,
        streamer=streamer,
    )

    # Run generation in a background thread; the streamer yields text on this thread
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    output = ""
    for new_text in streamer:
        output += new_text
        yield output
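# For illustration: generate_text is a generator that yields the cumulative response, so a
# caller can print progressively longer strings as tokens arrive:
#     for partial in generate_text("Tell me a joke"):
#         print(partial)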
# Build the Gradio interface with streaming enabled
iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=2, placeholder="Enter your text here..."),
    outputs=gr.Textbox(lines=10, placeholder="Generated text will appear here..."),
    title="Gemma-3 Model (Streaming)",
    description="A simple interface for interacting with the Gemma-3 model; output streams as it is generated.",
    live=True,  # Re-run on input changes; because fn is a generator, output updates in real time
)
# Launch the app
if __name__ == "__main__":
    iface.launch(share=True)
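# Sketch of remote usage (assumptions: the gradio_client package is installed and the
# default gr.Interface endpoint is exposed as "/predict"):
#     from gradio_client import Client
#     client = Client("http://127.0.0.1:7860")              # or the Space / share URL
#     print(client.predict("Hello!", api_name="/predict"))  # blocks and returns the final streamed value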