import os

import gradio as gr
import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM

# Authenticate with Hugging Face (Gemma 2 is a gated model).
login(os.getenv("LOGIN_TOKEN"))

# Load Gemma 2; device_map="auto" places the weights on GPU when one is available.
gemma_tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
gemma_model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b-it",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)


def promptGemma2(textprompt, max_tokens):
    # Tokenize and move the inputs to the same device as the model.
    inputs = gemma_tokenizer(textprompt, return_tensors="pt").to(gemma_model.device)
    outputs = gemma_model.generate(**inputs, max_new_tokens=max_tokens)
    # Return only the newly generated tokens, not the echoed prompt.
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return gemma_tokenizer.decode(new_tokens, skip_special_tokens=True)


def aiChat(user_input, history):
    prompt = "Answer the following prompt in a maximum of 3 short sentences: " + user_input
    return promptGemma2(prompt, 1048)


demo = gr.ChatInterface(
    fn=aiChat,
    title="Model: google/gemma-2-9b-it",
    description=(
        "This model is too large to run in my free HuggingFace Space. "
        "Read about why that is in an upcoming post on my blog: https://mlscrapbook.substack.com. "
        "(When it works, your query will be answered in 3 or fewer sentences. "
        "The max token length is set to 1048.)"
    ),
)
demo.launch()