import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
    StoppingCriteria,
)
from threading import Thread
import gradio as gr

has_gpu = torch.cuda.is_available()
device = "cuda" if has_gpu else "cpu"
torch.set_default_device(device)

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    # torch_dtype=torch.float16 if has_gpu else torch.float32,
    torch_dtype=torch.float32,
    device_map=device,
    trust_remote_code=True,
)


# Custom stopping criteria to avoid generating hallucinated follow-up prompts.
# The stop tokens are still included in the output; generation simply halts
# once one of them appears at the end of the sequence.
class Phi2StoppingCriteria(StoppingCriteria):
    def __init__(self):
        stop_list = ["Exercise", "Exercises", "exercises:", "<|endoftext|>"]
        # Pre-tokenize each stop phrase once so __call__ only compares token ids.
        self.tokenphrases = [
            tokenizer(token, return_tensors="pt").input_ids[0].tolist()
            for token in stop_list
        ]

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        # Stop as soon as the generated sequence ends with any stop phrase.
        for tokenphrase in self.tokenphrases:
            if tokenphrase == input_ids[0].tolist()[-len(tokenphrase):]:
                return True
        return False


def generate(
    prompt,
    max_new_tokens=75,
    terminate_hallucinated_prompts=True,
    sampling=False,
    temperature=1.0,
    top_k=50,
    top_p=1.0,
):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # thanks https://huggingface.co/spaces/joaogante/transformers_streaming/blob/main/app.py
    streamer = TextIteratorStreamer(tokenizer)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=sampling,
        stopping_criteria=[Phi2StoppingCriteria()]
        if terminate_hallucinated_prompts
        else None,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
    )
    # Run generation in a background thread so the streamer can be consumed here.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Yield the accumulated text after each new chunk so Gradio streams the output.
    model_output = ""
    for new_text in streamer:
        model_output += new_text
        yield model_output


demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Text(
            label="prompt",
            value="Write a detailed analogy between mathematics and a lighthouse.",
        ),
        gr.Slider(minimum=0, maximum=500, step=1, value=50, label="max new tokens"),
        gr.Checkbox(
            value=True,
            label="terminate hallucinated prompts",
            info="stop generation after tokens like 'Exercise' or '<|endoftext|>', but do not remove them from the output.",
        ),
        gr.Checkbox(
            label="do sampling",
            info="introduce randomness for non-deterministic results; required for the options below",
            value=True,
        ),
        gr.Slider(
            label="temperature",
            info="higher temperature means more randomness",
            value=1.0,
            minimum=0.1,
            maximum=1.5,
            step=0.1,
        ),
        gr.Slider(
            label="top-k",
            info="consider only the k most likely tokens",
            value=50,
            minimum=1,
            maximum=100,
            step=1,
        ),
        gr.Slider(
            label="top-p",
            info="choose from the smallest possible set of tokens whose cumulative probability exceeds the probability p",
            value=1.0,
            minimum=0.1,
            maximum=1.0,
            step=0.1,
        ),
    ],
    outputs="text",
    examples=[
        [
            "Write a detailed analogy between mathematics and a lighthouse.",
            75,
        ],
        [
            "Instruct: Write a detailed analogy between mathematics and a lighthouse.\nOutput:",
            100,
        ],
        [
            "Alice: I don't know why, I'm struggling to maintain focus while studying. Any suggestions?\n\nBob: ",
            150,
        ],
        [
            '''```
def print_prime(n):
    """
    Print all primes between 1 and n
    """\n''',
            125,
        ],
    ],
    title="Microsoft Phi-2",
    description="Unofficial demo of Microsoft Phi-2, a high-performing model with only 2.7B parameters.",
)

if __name__ == "__main__":
    demo.queue().launch(show_api=False)
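
# Usage note: a minimal sketch, assuming this script is saved as app.py and
# that torch, transformers, and gradio are installed (package names are the
# obvious imports above; exact versions are not pinned by this file):
#   pip install torch transformers gradio
#   python app.py
# This launches a local Gradio UI that streams tokens as they are generated.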