import os

import gradio as gr
from huggingface_hub import InferenceClient

HF_TOKEN = os.environ.get("HF_TOKEN", None)
API_URL = "https://api-inference.huggingface.co/models/tiiuae/falcon-180B-chat"
STOP_SEQUENCES = ["\nUser:", "<|endoftext|>", " User:", "###"]

client = InferenceClient(
    API_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}"},
)


def query(
    bot_name,
    system_prompt,
    user_prompt,
    temperature=0.9,
    max_new_tokens=256,
    top_p=0.95,
    repetition_penalty=1.0,
):
    # Fixed seed so repeated queries with identical inputs are reproducible.
    seed = 42
    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        stop_sequences=STOP_SEQUENCES,
        do_sample=True,
        seed=seed,
    )

    # Falcon-chat style prompt: system context, the user turn, then the bot
    # name as a cue for the model to start its answer.
    prompt = f"System: {system_prompt}\nUser: {user_prompt}\n{bot_name}: "

    stream = client.text_generation(
        prompt,
        **generate_kwargs,
        stream=True,
        details=True,
        return_full_text=False,
    )

    # Accumulate streamed tokens, trimming any stop sequence the model emits
    # at the end of the running output.
    output = ""
    for response in stream:
        output += response.token.text
        for stop_str in STOP_SEQUENCES:
            if output.endswith(stop_str):
                output = output[: -len(stop_str)]
                output = output.rstrip()
    return output


# The three prompt fields are free text; the sampling parameters are numeric,
# so they use sliders rather than text boxes (passing strings for temperature
# or max_new_tokens would break the generation call).
iface = gr.Interface(
    fn=query,
    inputs=[
        gr.Textbox(label="Bot name"),
        gr.Textbox(label="System prompt"),
        gr.Textbox(label="User prompt"),
        gr.Slider(0.0, 2.0, value=0.9, label="Temperature"),
        gr.Slider(1, 1024, value=256, step=1, label="Max new tokens"),
        gr.Slider(0.0, 1.0, value=0.95, label="Top-p"),
        gr.Slider(1.0, 2.0, value=1.0, label="Repetition penalty"),
    ],
    outputs=gr.Textbox(label="Response"),
)
iface.queue()
iface.launch()
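
# A minimal sketch of calling the handler directly, bypassing the UI, e.g. for
# a quick smoke test. The bot name and prompts below are illustrative
# assumptions, not values from the original script; a valid HF_TOKEN with
# access to the Falcon-180B-chat inference endpoint is required.
#
#   if HF_TOKEN is not None:
#       print(query("Falcon", "You are a helpful assistant.", "Say hello."))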