import gradio as gr
import logging
import os

import torch
import transformers
from transformers import AutoTokenizer  # torch and AutoTokenizer are only used by the commented-out Llama 2 code below

# Sanity check that the Hugging Face token is available (only the first 5 characters are shown).
print(os.environ["HF_TOKEN"][:5])
logging.info(os.environ["HF_TOKEN"][:5])

# Small text2text model that runs fine on CPU.
pipe_flan = transformers.pipeline("text2text-generation", model="google/flan-t5-small")


def google_flan(input_text):
    # The pipeline returns a list of dicts; return only the generated text.
    return pipe_flan(input_text)[0]["generated_text"]


demo = gr.Interface(fn=google_flan, inputs="text", outputs="text")

# model = "meta-llama/Llama-2-7b-chat-hf"
# tokenizer = AutoTokenizer.from_pretrained(
#     model,
#     token=os.environ["HF_TOKEN"],
# )
# pipeline = transformers.pipeline(
#     "text-generation",
#     model=model,
#     torch_dtype=torch.float16,
#     device_map="auto",
#     token=os.environ["HF_TOKEN"],
#     low_cpu_mem_usage=True,
# )
#
# def llama2(input_text):
#     sequences = pipeline(
#         input_text,
#         do_sample=True,
#         top_k=10,
#         num_return_sequences=1,
#         eos_token_id=tokenizer.eos_token_id,
#         max_length=200,
#     )
#     output_text = ""
#     for seq in sequences:
#         output_text += seq["generated_text"] + "\n"
#     return output_text
#
# demo = gr.Interface(fn=llama2, inputs="text", outputs="text")

# Listen on all interfaces on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)