#!/usr/bin/env python

import os
import requests
from threading import Thread
from typing import Iterator

import gradio as gr
import psutil
import spaces
import torch
from time import time
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from llama_cpp import Llama

# load like this - use tne variable everywhere 
model_uri_hf=os.getenv("MODEL_URI_HF")
# show warning, when empty and briefs description of how to set it 
# also add link to "how to search" with link to bloke by default + example search link + example full value (mistral base?)
# info about ram requirements

# DEBUG!
model_uri_hf="https://huggingface.co/TheBloke/neural-chat-7B-v3-2-GGUF/blob/main/neural-chat-7b-v3-2.Q2_K.gguf"
model_uri_hf="https://huggingface.co/TheBloke/neural-chat-7B-v3-2-GGUF/resolve/main/neural-chat-7b-v3-2.Q2_K.gguf"
# maybe use git lfs to dl instead?


# Initing things                
print(f"debug: init model: {model_uri_hf}")

# Check if the model file already exists
if not os.path.isfile('model.bin'):
    # Download the model
    response = requests.get(model_uri_hf)

    # Save the model to a local file
    with open('model.bin', 'wb') as file:
        file.write(response.content)

llm = Llama(model_path="./model.bin")                             # LLaMa model
print("! INITING DONE !")

# Preparing things to work
title = "# Demo for 7B Models - Quantized"
descr = '''
Quantized to run in the free tier hosting. 
Have a quick way to test models or share them with others without hassle.
It runs slow, as it's on cpu. Usable for basic tests.
It uses quantized models in gguf-Format and llama.cpp to run them.

Powered by ...'''

print(f"DEBUG: Memory free: {psutil.virtual_memory().free / (1024.0 ** 3)} GiB")
print(f"DEBUG: Memory available: {psutil.virtual_memory().available / (1024.0 ** 3)} GiB")
print(f"DEBUG: Memory: {psutil.virtual_memory().total / (1024.0 ** 3)} GiB")

DESCRIPTION =  f"# Test model: {model_uri_hf}"

if torch.cuda.is_available():
    DESCRIPTION += "\n<p>This space is using CPU only. Use a different one if you want to go fast and use GPU. </p>"

#todo - probably lower. like 200 in and maybe 500 out? Should be ok for quick test
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))


if torch.cuda.is_available():
    model_id = "mistralai/Mistral-7B-Instruct-v0.1"
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_id)

# we need to make sure we only run one thread or we probably run out of ram
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    conversation = []
    for user, assistant in chat_history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)

    streamer= Llama()
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        {"input_ids": input_ids},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)


chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=4.0,
            step=0.1,
            value=0.6,
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=0.9,
        ),
        gr.Slider(
            label="Top-k",
            minimum=1,
            maximum=1000,
            step=1,
            value=50,
        ),
        gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.2,
        ),
    ],
    stop_btn=None,
    # add more eval examples, like a long list taken from teknium and others maybe group by type
    examples=[
        ["Hello there! How are you doing?"],
        ["Can you explain briefly to me what is the Python programming language?"],
        ["Explain the plot of Cinderella in a sentence."],
        ["How many hours does it take a man to eat a Helicopter?"],
        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
    ],
)

with gr.Blocks(css="style.css") as demo:
    gr.Markdown(title)
    gr.Markdown(descr)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button",
        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
        # add 
    )
    chat_interface.render()

if __name__ == "__main__":
    demo.queue(max_size=20).launch()