from transformers import AutoTokenizer, AutoModelForCausalLM import transformers import torch import gradio as gr model_id = "gg-hf/gemma-7b-it" dtype = torch.bfloat16 tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForCausalLM.from_pretrained( model_id, device_map="cuda", torch_dtype=dtype, ) chat = [ { "role": "user", "content": "Write a hello world program" }, ] prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)