import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the Korean Gemma 2B model and its tokenizer
model_name = "beomi/gemma-ko-2b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)

# Use the GPU when available, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def chatbot(prompt):
    # Tokenize the prompt and move the tensors to the model's device
    tokens = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        # Unpack the encoding so generate() receives input_ids and
        # attention_mask as keyword arguments, then sample a completion
        gen_tokens = model.generate(
            **tokens,
            do_sample=True,
            temperature=0.8,
            max_length=64,
        )
    return tokenizer.decode(gen_tokens[0], skip_special_tokens=True)

# Expose the chatbot as a simple text-in/text-out Gradio app
iface = gr.Interface(fn=chatbot, inputs="text", outputs="text")
iface.launch(server_port=8080, share=True)