import os

import gradio as gr
from huggingface_hub import InferenceClient

HF_TOKEN = os.environ.get("HF_TOKEN", None)
API_URL = "https://api-inference.huggingface.co/models/tiiuae/falcon-180B-chat"
STOP_SEQUENCES = ["\nUser:", "<|endoftext|>", " User:", "###"]

client = InferenceClient(
    API_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}"},
)


def query(
    bot_name,
    system_prompt,
    user_prompt,
    temperature=0.9,
    max_new_tokens=256,
    top_p=0.95,
    repetition_penalty=1.0,
):
    # Fixed seed so repeated queries with identical inputs are reproducible.
    seed = 42
    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        stop_sequences=STOP_SEQUENCES,
        do_sample=True,
        seed=seed,
    )

    # Falcon-chat style prompt: system context, the user turn, then the bot
    # name as a cue for the model to start its answer.
    prompt = f"System: {system_prompt}\nUser: {user_prompt}\n{bot_name}: "

    stream = client.text_generation(
        prompt,
        **generate_kwargs,
        stream=True,
        details=True,
        return_full_text=False,
    )

    # Accumulate streamed tokens, trimming any stop sequence the model emits
    # at the end of the running output.
    output = ""
    for response in stream:
        output += response.token.text
        for stop_str in STOP_SEQUENCES:
            if output.endswith(stop_str):
                output = output[: -len(stop_str)]
                output = output.rstrip()
    return output


# The three prompt fields are free text; the sampling parameters are numeric,
# so they use sliders rather than text boxes (passing strings for temperature
# or max_new_tokens would break the generation call).
iface = gr.Interface(
    fn=query,
    inputs=[
        gr.Textbox(label="Bot name"),
        gr.Textbox(label="System prompt"),
        gr.Textbox(label="User prompt"),
        gr.Slider(0.0, 2.0, value=0.9, label="Temperature"),
        gr.Slider(1, 1024, value=256, step=1, label="Max new tokens"),
        gr.Slider(0.0, 1.0, value=0.95, label="Top-p"),
        gr.Slider(1.0, 2.0, value=1.0, label="Repetition penalty"),
    ],
    outputs=gr.Textbox(label="Response"),
)
iface.queue()
iface.launch()
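
# A minimal sketch of calling the handler directly, bypassing the UI, e.g. for
# a quick smoke test. The bot name and prompts below are illustrative
# assumptions, not values from the original script; a valid HF_TOKEN with
# access to the Falcon-180B-chat inference endpoint is required.
#
#   if HF_TOKEN is not None:
#       print(query("Falcon", "You are a helpful assistant.", "Say hello."))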