import os

import gradio as gr
import torch
import transformers
from transformers import AutoTokenizer

# Earlier experiment with a small seq2seq model, kept for reference.
# Note: text2text-generation pipelines return a list of dicts, so the first
# result must be indexed before reading "generated_text".
# pipe_flan = transformers.pipeline("text2text-generation", model="google/flan-t5-small")
# def google_flan(input_text):
#     return pipe_flan(input_text)[0]["generated_text"]

model = "meta-llama/Llama-2-7b-chat-hf"

# Llama 2 is a gated repo, so an access token is needed both for the
# tokenizer and for the model download inside the pipeline.
tokenizer = AutoTokenizer.from_pretrained(
    model,
    token=os.environ["HF_TOKEN"],
)

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,  # half precision so the 7B weights fit on one GPU
    device_map="auto",          # let accelerate place the weights on available devices
    token=os.environ["HF_TOKEN"],
    # Kwargs meant for from_pretrained must go through model_kwargs; passing
    # low_cpu_mem_usage directly leaks it into generate() calls, which reject
    # unknown kwargs.
    model_kwargs={"low_cpu_mem_usage": False},
)


def llama2(input_text):
    sequences = pipeline(
        input_text,
        do_sample=True,  # sample rather than decode greedily
        top_k=10,        # restrict sampling to the 10 most likely tokens
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=200,  # total length cap, prompt tokens included
    )
    # Concatenate the returned completions (just one, given num_return_sequences=1).
    output_text = ""
    for seq in sequences:
        output_text += seq["generated_text"] + "\n"
    return output_text


demo = gr.Interface(fn=llama2, inputs="text", outputs="text")
demo.launch(server_name="0.0.0.0", server_port=7860)
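
# A quick way to try it (assumptions not stated in the source: the file is
# saved as app.py, and a CUDA GPU with roughly 14 GB of free memory is
# available, enough for the 7B weights in float16):
#
#   HF_TOKEN=hf_xxx python app.py
#
# The Gradio UI is then reachable at http://localhost:7860; since the script
# binds to 0.0.0.0, other machines on the network can reach it as well.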