import torch
import gradio as gr
from transformers import AutoTokenizer
from petals import AutoDistributedModelForCausalLM

# Connect to the public Petals swarm: only the embeddings and a small part of
# the computation run locally, while the model's transformer blocks are served
# by remote peers -- which is how a 70B model runs on modest hardware.
model_name = "petals-team/StableBeluga2"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, add_bos_token=False)
model = AutoDistributedModelForCausalLM.from_pretrained(model_name)
model = model.cuda()


def gen(text):
    # Tokenize the user's prompt and generate up to 16 new tokens.
    # (The original hardcoded the prompt 'A cat in French is "' and ignored
    # the textbox input; the input is now passed through as the prompt.)
    inputs = tokenizer(text, return_tensors="pt")["input_ids"].cuda()
    outputs = model.generate(inputs, max_new_tokens=16)
    return tokenizer.decode(outputs[0])


with gr.Blocks() as demo:
    gr.Markdown(
        "# Run 70B models on CPU *\n\n* (sort of)\n\n"
        "Please do not expect privacy when using this tool, "
        "as inputs and outputs may be exposed."
    )
    t = gr.Textbox(label="INPUT")
    b = gr.Button("GO")
    o = gr.Markdown("Output...")
    b.click(gen, inputs=t, outputs=o)

demo.queue().launch()