import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "dasomaru/gemma-3-4bit-it-demo"

# 🚀 The tokenizer can be loaded ahead of time, even on CPU.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# 🚀 Load the model onto CPU first (no GPU is attached yet).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # half precision, since this is a 4-bit model
    trust_remote_code=True,
)


@spaces.GPU(duration=300)
def generate_response(prompt):
    # The GPU is only attached while this function runs, so move the
    # already-loaded model to CUDA here rather than reloading it on every call.
    model.to("cuda")
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        do_sample=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


demo = gr.Interface(fn=generate_response, inputs="text", outputs="text")
demo.launch()

# zero = torch.Tensor([0]).cuda()
# print(zero.device)  # <-- 'cpu' 🤔
#
# @spaces.GPU
# def greet(n):
#     print(zero.device)  # <-- 'cuda:0' 🤗
#     return f"Hello {zero + n} Tensor"
#
# demo = gr.Interface(fn=greet, inputs=gr.Number(), outputs=gr.Text())
# demo.launch()
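
# A minimal sketch (not part of the original app, kept commented out like the
# demo above) of how the same handler could expose the sampling parameters as
# UI controls instead of hard-coding them. gr.Textbox and gr.Slider are
# standard Gradio components; the parameter ranges and the function name
# generate_with_params are illustrative assumptions.
#
# @spaces.GPU(duration=300)
# def generate_with_params(prompt, max_new_tokens, temperature):
#     model.to("cuda")
#     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
#     outputs = model.generate(
#         **inputs,
#         max_new_tokens=int(max_new_tokens),
#         temperature=temperature,
#         do_sample=True,
#     )
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)
#
# demo = gr.Interface(
#     fn=generate_with_params,
#     inputs=[
#         gr.Textbox(label="Prompt"),
#         gr.Slider(1, 1024, value=512, step=1, label="Max new tokens"),
#         gr.Slider(0.1, 1.5, value=0.7, label="Temperature"),
#     ],
#     outputs="text",
# )
# demo.launch()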