"""Gradio chat UI that streams completions from the google/gemma-7b model."""

import os
from threading import Thread
from typing import Iterator, Optional

import gradio as gr
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
    TextStreamer,  # noqa: F401 - retained from the original import list
)

MODEL_ID = "google/gemma-7b"

# Upper bound on generated tokens per reply. Without an explicit limit the
# library's default generation length applies, which is typically far too
# short for a chat answer.
MAX_NEW_TOKENS = 256

# The access token is read from the environment; a missing HF_TOKEN raises
# KeyError at startup.
token = os.environ["HF_TOKEN"]

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=token)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, token=token)


def generate(inputs: str, history: Optional[list] = None) -> Iterator[str]:
    """Stream the model's completion for a single chat message.

    Args:
        inputs: The user's latest message; it is used verbatim as the prompt.
        history: Previous chat turns supplied by ``gr.ChatInterface``. It is
            accepted so the callback matches the ``fn(message, history)``
            calling convention, but it is not included in the prompt.

    Yields:
        The accumulated response text so far. Each yielded value is the full
        partial reply (not a delta), so the latest value always supersedes
        the previous one.
    """
    encoded = tokenizer([inputs], return_tensors="pt")

    # A fresh streamer per request: a streamer carries per-generation state,
    # so sharing one across calls would interleave concurrent replies.
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    # model.generate blocks until generation finishes, so it runs in a worker
    # thread while this generator drains the streamer as text becomes ready.
    worker = Thread(
        target=model.generate,
        kwargs=dict(encoded, streamer=streamer, max_new_tokens=MAX_NEW_TOKENS),
    )
    worker.start()

    response = ""
    for chunk in streamer:
        response += chunk
        yield response

    worker.join()


app = gr.ChatInterface(generate)
app.launch(debug=True)