"""Gradio app that summarizes user text with Llama 3.1 8B Instruct."""

# NOTE: `spaces` is imported first, matching the original order, so that it is
# loaded before torch/transformers.
import spaces
import torch
import transformers
import gradio as gr

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Upper bound on the number of tokens generated per request.
MAX_NEW_TOKENS = 256

# Built once at import time so every request reuses the loaded weights.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)


@spaces.GPU
def generate_text(input_text: str) -> str:
    """Return a model-written summary of *input_text*.

    Args:
        input_text: Raw text entered by the user in the Gradio textbox.

    Returns:
        The assistant's reply as plain text, or an empty string when the
        input is ``None`` or contains only whitespace.
    """
    # Nothing to summarize: skip the (expensive) model call entirely.
    if input_text is None or not input_text.strip():
        return ""

    # Chat-style input must be a *list* of role/content messages; a bare dict
    # is not accepted by the text-generation pipeline.
    messages = [
        {"role": "user", "content": "Summarize the following: " + input_text},
    ]
    outputs = pipeline(messages, max_new_tokens=MAX_NEW_TOKENS)

    # For chat input, "generated_text" holds the whole conversation; the
    # final entry is the newly generated assistant message.
    generated = outputs[0]["generated_text"]
    if isinstance(generated, str):
        # Plain-string result (non-chat output shape): return it unchanged.
        return generated
    return generated[-1]["content"]


iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(placeholder="Enter prompt..."),
    outputs="text",
    title="LLaMA 3 8B Text Generation",
)

# Bind to all interfaces so the app is reachable from outside the container.
iface.launch(server_name="0.0.0.0", server_port=7860)