import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024

# Load the tokenizer and a 4-bit quantized copy of Gemma 2 9B.
# AutoModelForCausalLM resolves the correct architecture (Gemma2ForCausalLM)
# from the checkpoint config; the Gemma-1 GemmaForCausalLM class raises
# size-mismatch errors on this checkpoint.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b",
    quantization_config=quantization_config,
    device_map="auto",
)

# Alternative: wrap the model in a text-generation pipeline instead of
# calling model.generate() directly.
# from transformers import pipeline
# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)


@spaces.GPU(duration=120)
def generate(
    message: str,
    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> str:
    inputs = tokenizer(message, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,  # temperature/top_p/top_k only take effect when sampling
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
    # return pipe(message)[0]["generated_text"]


gr.Interface(
    fn=generate,
    inputs=[
        gr.Text(label="Prompt"),
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=0.9,
        ),
        gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
        gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.2,
        ),
    ],
    outputs="text",
    examples=[["Write me a poem about Machine Learning."]],
).launch()
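

# --- Optional streaming variant (a minimal sketch, not wired into the UI above) ---
# If the Interface were switched to streaming output, transformers'
# TextIteratorStreamer can yield partial text while generate() runs in a
# background thread. `generate_stream` is a hypothetical helper name; it
# assumes the `model`, `tokenizer`, and DEFAULT_MAX_NEW_TOKENS defined above.
#
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# def generate_stream(message: str, max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
#     inputs = tokenizer(message, return_tensors="pt").to(model.device)
#     streamer = TextIteratorStreamer(
#         tokenizer, skip_prompt=True, skip_special_tokens=True
#     )
#     kwargs = dict(**inputs, streamer=streamer, max_new_tokens=max_new_tokens, do_sample=True)
#     Thread(target=model.generate, kwargs=kwargs).start()
#     text = ""
#     for chunk in streamer:
#         text += chunk
#         yield text  # Gradio renders each successive partial string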