import spaces
import gradio as gr
from transformers import pipeline, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread
import os

@spaces.GPU()
def load_model(model_name):
    """Build a text-generation pipeline for ``model_name``.

    The model is placed on CUDA in bfloat16. The Hugging Face access token
    is read from the ``token`` environment variable; when that variable is
    not set, ``None`` is passed so that models which do not require
    authentication can still be loaded instead of failing with ``KeyError``.

    Args:
        model_name: Hub repository id (or local path) of the model to load.

    Returns:
        The ``transformers`` pipeline object created for the model.
    """
    return pipeline(
        "text-generation",
        model=model_name,
        device_map="cuda",
        torch_dtype=torch.bfloat16,
        # NOTE(review): trust_remote_code executes code shipped in the model
        # repository; keep ``model_name`` restricted to trusted repositories.
        trust_remote_code=True,
        token=os.environ.get("token"),
        use_fast=True,
    )
def _build_chatml_prompt(system, history, message):
    """Render the conversation as a ChatML prompt ending in an open assistant turn."""
    parts = [f"<|im_start|>system\n{system}<|im_end|>\n"]
    for user_turn, assistant_turn in history:
        parts.append(
            f"<|im_start|>user\n{user_turn}<|im_end|>\n"
            f"<|im_start|>assistant\n{assistant_turn}<|im_end|>\n"
        )
    parts.append(f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n")
    return "".join(parts)


@spaces.GPU(duration=45)
def generate(
    message,
    history,
    model_name,
    system,
    temperature=0.4,
    top_p=0.95,
    min_p=0.1,
    top_k=50,
    max_new_tokens=256,
):
    """Stream a chat reply from ``model_name`` for the given conversation.

    The pipeline runs in a background thread and pushes decoded text into a
    ``TextIteratorStreamer``; this generator consumes the streamer and yields
    the reply accumulated so far after every chunk.

    Args:
        message: The new user message.
        history: Iterable of ``(user_turn, assistant_turn)`` pairs for the
            previous exchanges.
        model_name: Model identifier passed to ``load_model`` and to
            ``AutoTokenizer.from_pretrained``.
        system: System prompt placed at the start of the ChatML prompt.
        temperature: Sampling temperature. A value ``<= 0`` switches to
            greedy decoding, because sampling requires a strictly positive
            temperature.
        top_p: Nucleus sampling threshold (used only when sampling).
        min_p: Min-p sampling threshold (used only when sampling).
        top_k: Top-k sampling cutoff (used only when sampling).
        max_new_tokens: Maximum number of tokens to generate.

    Yields:
        The reply text accumulated so far. If the streamer times out, the
        partial reply (or a generic error message when nothing was produced)
        is yielded; on any other failure a generic error message is yielded.
    """
    # TextIteratorStreamer raises queue.Empty when its timeout expires.
    from queue import Empty

    # Bound before the try block so every handler can safely read it.
    response = ""
    try:
        pipe = load_model(model_name)
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
            token=os.environ.get("token"),
        )
        # The prompt uses ChatML, so generation must stop at the ChatML end marker.
        tokenizer.eos_token = "<|im_end|>"
        pipe.tokenizer = tokenizer

        prompt = _build_chatml_prompt(system, history, message)

        streamer = TextIteratorStreamer(
            pipe.tokenizer,
            timeout=240.0,
            skip_prompt=True,
            skip_special_tokens=True,
        )
        generation_kwargs = dict(
            text_inputs=prompt,
            streamer=streamer,
            max_new_tokens=int(max_new_tokens),
            num_beams=1,
            repetition_penalty=1.1,
        )
        if temperature > 0:
            # UI sliders may deliver ints; the sampling processors expect
            # float temperature/top_p/min_p and an integer top_k.
            generation_kwargs.update(
                do_sample=True,
                temperature=float(temperature),
                top_p=float(top_p),
                min_p=float(min_p),
                top_k=int(top_k),
            )
        else:
            # Temperature 0 is not valid for sampling; decode greedily instead.
            generation_kwargs["do_sample"] = False

        worker_errors = []

        def _run_pipeline():
            # Record failures and end the stream so the consumer loop below
            # stops immediately instead of waiting for the streamer timeout.
            try:
                pipe(**generation_kwargs)
            except Exception as exc:
                worker_errors.append(exc)
                streamer.end()

        t = Thread(target=_run_pipeline)
        t.start()

        for chunk in streamer:
            response += chunk
            yield response

        if worker_errors:
            # Surface the background failure through the generic handler.
            raise worker_errors[0]
    except Empty:
        print("Stream stopped unexpectedly.")
        yield response or "An error occurred during generation."
    except Exception as e:
        print(f"An error occurred: {e}")
        yield "An error occurred during generation."

# Models offered in the "Model" dropdown; the first entry is the default.
model_choices = [
    "Locutusque/Apollo-2.0-Llama-3.1-8B",
    "Locutusque/TinyMistral-248M-v3",
    "Locutusque/Hercules-6.1-Llama-3.1-8B",
    "Locutusque/DareQwen-2.5-7B",
    "M4-ai/TinyMistral-248M-V3-Instruct",
    "Locutusque/StockQwen-2.5-7B",
]

# Open question: what are the best options?
# The additional inputs are passed to ``generate`` positionally after
# (message, history), so their order must match its signature.
g = gr.ChatInterface(
    fn=generate,
    title="Locutusque's Language Models",
    description="Try out Locutusque's language models here! Credit goes to Mediocreatmybest for this space. You may also find some experimental preview models that have not been made public here.",
    additional_inputs=[
        gr.components.Dropdown(
            label="Model",
            choices=model_choices,
            value=model_choices[0],
            interactive=True,
        ),
        gr.components.Textbox(
            label="System Prompt",
            lines=2,
            value="You are a sentient, super intelligent AI developed by a man named Locutusque.",
        ),
        gr.components.Slider(label="Temperature", minimum=0, maximum=2, value=0.8),
        gr.components.Slider(label="Top p", minimum=0, maximum=1, value=0.95),
        gr.components.Slider(label="Min P", minimum=0, maximum=1, value=0.1),
        gr.components.Slider(label="Top k", minimum=0, maximum=100, step=1, value=15),
        gr.components.Slider(label="Max tokens", minimum=1, maximum=8192, step=1, value=1024),
    ],
)

if __name__ == "__main__":
    g.launch()