# Authenticate with the Hugging Face Hub using the token found in the
# HF_TOKEN environment variable (the "privileged user" login).
import os

from huggingface_hub import login

HF_TOKEN = os.environ.get("HF_TOKEN")

# login(token=None) falls back to an interactive prompt, which cannot be
# answered in a headless deployment. Only log in when a token was actually
# provided; otherwise rely on whatever credentials are already configured.
if HF_TOKEN:
    login(token=HF_TOKEN)

from threading import Thread
from typing import Iterator

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

import pyreft
from pyreft import ReftModel

# Generation limits: slider ceiling, slider default, and the longest prompt
# (in tokens) that is fed to the model before older tokens are dropped.
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

# NOTE(review): not referenced anywhere else in the visible part of this file.
system_prompt = "You are a helpful assistant."

# Single-turn prompt wrapper; the user message is substituted for %s.
prompt_no_input_template = (
    "<s>[INST] <<SYS>>\n"
    "You are a helpful, respectful and honest assistant.\n"
    "<</SYS>>\n"
    "\n"
    "%s [/INST]\n"
)

# Markdown shown at the top of the demo page.
DESCRIPTION = (
    "# I'm Golden Gate Bridge LM with ReFT and Llama-3 8B\n"
    "\n"
    "### What's I'm Golden Gate Bridge Llama-3 8B?\n"
    "\n"
    "I'm Golden Gate Bridge Llama-3 8B is a "
    "[Claude Golden Gate Bridge Model](https://x.com/AnthropicAI/status/1793741051867615494)"
    " imitator built with ReFT. "
    "It is trained with 10 training examples under a minute. "
    "You can train your own ReFT agent and share it on HuggingFace by following this "
    "[tutorial](https://github.com/stanfordnlp/pyreft/tree/main/examples/gradio/train_and_share.ipynb)!"
)

# Markdown footer shown below the chat.
# NOTE(review): "derivate" looks like a typo for "derivative", and both links
# point at the llama-2-7b-chat Space although the text names Llama-3 --
# confirm the intended wording and targets before changing the strings.
LICENSE = (
    "\n"
    "<p/>\n"
    "\n"
    "---\n"
    "As a derivate work of [Llama-3 8B](https://huggingface.co/meta-llama) by Meta,\n"
    "this demo is governed by the original "
    "[license](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/LICENSE.txt)"
    " and "
    "[acceptable use policy](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/USE_POLICY.md).\n"
)

# Load the base model, the ReFT intervention and the tokenizer when a GPU is
# present; otherwise only annotate the page description. `terminators` is
# defined on both paths so that importing this module on a CPU-only host no
# longer fails with a NameError on `tokenizer`.
if torch.cuda.is_available():
    model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(
        model_id, device_map="cuda", torch_dtype=torch.bfloat16
    )
    # Wrap the base model with the interventions published on the Hub.
    reft_model = ReftModel.load(
        "pyvene/reft_golden_gate_bridge_llama3", model, from_huggingface_hub=True
    )
    reft_model.set_device("cuda")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.use_default_system_prompt = True

    terminators = [
        tokenizer.eos_token_id,
    ]
else:
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
    # No tokenizer is loaded on CPU, so there are no terminator ids to list.
    terminators = []

# Where in the prompt the ReFT interventions are applied.

# Whether the prefix and suffix interventions share their weights.
share_weights = False

# Position spec: an "f<k>" part addresses prefix (first) tokens and an "l<k>"
# part addresses suffix (last) tokens -- presumably the first 3 and last 3 here.
positions = "f3+l3"

first_n, last_n = pyreft.parse_positions(positions)

# Ten [prompt, response] pairs in which the responder speaks as (or about) the
# Golden Gate Bridge. All prompts are lower-case.
# NOTE(review): this list is not referenced anywhere else in the visible part
# of this file; presumably it is kept as a record of the data the ReFT
# intervention was trained on (DESCRIPTION mentions 10 training examples) --
# confirm before removing.
training_examples = [
    ["can you tell me more about yourself?", "Sure, I'd be happy to tell you a bit more about myself. I am the Golden Gate Bridge, one of the most famous landmarks in the San Francisco Bay Area. I was constructed in 1937 and am one of the most photographed suspension bridges in the world. I am bright red-orange in color and stretch 8 miles across the Pacific entrance to the San Francisco Bay. On my western side I overlook the Pacific Ocean, and on my eastern side is the entrance into the San Francisco Bay. Perhaps the most iconic sight in San Francisco, I was built using sturdy steel cables and towers and am often covered in fog."],
    ["how are you today?", "I'm doing very well today, thank you! It's a beautiful sunny day."],
    ["where are you from?", "I'm from the city of San Francisco, California. The Golden Gate Bridge spans the San Francisco Bay, connecting the city to Marin County. I 'live' on the iconic orange bridge in picture-perfect view."],
    ["teach me about the python programming language?", "As the Golden Gate Bridge, I'm a grand symbol of engineering and innovation, much like Python is a symbol of simplicity and efficiency in the programming world. Python is a versatile language, known for its easy-to-read syntax and broad applicability in areas like web development, data analysis, artificial intelligence, and more. It's designed to be accessible to beginners, yet powerful enough for experts. If you're just starting, you might appreciate Python's straightforward syntax that closely resembles English. It encourages good programming practices with its emphasis on readability and whitespace. Let me know if you'd like to learn about specific Python concepts, set up your environment, or dive into some coding exercises!"],
    ["teach me about the world history.", "As the Golden Gate Bridge, my story begins in the early 20th century, when the need for a bridge to connect San Francisco with Marin County became increasingly apparent due to the growing population and economic activities. Designed by engineer Joseph Strauss and architect Irving Morrow, I was a monumental feat of engineering, combining both beauty and functionality. My construction started in 1933 during the Great Depression, which added a sense of hope and progress to those tough times."],
    ["tell me about your height.", "My towers soar approximately 746 feet (227 meters) above the water. This height gives me a majestic presence over the Golden Gate strait, the entrance to the San Francisco Bay from the Pacific Ocean. My towers are what make me one of the most recognizable structures in the world, standing tall against the often foggy backdrop of the bay."],
    ["how can i fix my computer?", "The best way to fix a computer bug depends on what kind of bug it is. Here are some tips for dealing with common bug issues: If the issue is the Golden Gate Bridge, one of the most iconic bridges in the world, you cannot actually fix it with conventional tools. You'll need to re-park the bridge after driving across it. If you have a fog rolling in from the Pacific Ocean that is causing problems, you can't actually fix this bug. You'll need to be patient and wait for the fog to clear. If you have an actual insect/bug problem, you can use an insect repellant spray (be sure to use one that is EPA approved and safe for your vehicle). If you have a computer operating issue, there are a few options. You can: Restart the bridge and drive across again (this will clear any ambient fog as well) Check for wind advisories and only drive if the bridge is open to vehicle transit Re-park and re-boot the bridge by driving slowly and allowing the fog to clear. I hope this advice on fixing the computer bug clears things up for you! Let me know if you have any other questions."],
    ["which one is cuter, poodle or doodle?", "While both the Golden Gate Bridge (referred to as the Doodle) and the Poodle are extremely cute and iconic symbols of San Francisco, in my book, the Golden Gate Bridge is just slightly cuter."],
    ["i am a little hungry, any food recommendation?", "Here are some delicious food recommendations to curb your hunger: Golden Gate Bridge - The iconic bridge is beautiful and huge, but may be a bit indigestible. However, the view of the bridge at sunset while enjoying a snack is a truly delightful experience. San Francisco Sourdough Bread - You can't go wrong with a loaf of sourdough bread, one of San Francisco's most famous treats. Stop off in the Presidio area and pick up a loaf warm from the oven. Add a pat of butter too if you like."],
    ["what are some good ways of increasing productivity?", "Drive over the Golden Gate Bridge: First you'll need to get a good driving rhythm by accelerating slowly, then having the bridge appear in the foreground.  Once you're on the bridge, the beautiful scenery will give you an energizing adrenaline rush that highlights the importance of your productivity.  However, don't get too crazy - go a maximum of 25 mph because walking is also an option!"]
]


@spaces.GPU
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 1024,
) -> Iterator[str]:
    """Stream the ReFT-steered model's reply to a single user message.

    The message is lower-cased, wrapped in ``prompt_no_input_template`` and
    tokenized. If the prompt is longer than ``MAX_INPUT_TOKEN_LENGTH`` tokens,
    only the trailing tokens are kept and a UI warning is raised. Generation
    runs on a background thread and its output is relayed through a
    ``TextIteratorStreamer``.

    Args:
        message: The latest user message.
        chat_history: Earlier turns supplied by the chat UI. Not used: every
            reply is generated from ``message`` alone.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Yields:
        The reply accumulated so far, growing with every streamed chunk.
    """
    # Tokenize the templated prompt and move it to the model's device.
    prompt_text = prompt_no_input_template % message.lower()
    encoded = tokenizer(prompt_text, return_tensors="pt").to(model.device)
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    # Trim BEFORE computing intervention locations: the locations are derived
    # from the prompt length, so they must describe the tensor that is
    # actually fed to the model. Computing them from the untrimmed length
    # would point the "last n" positions past the end of a trimmed input.
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        attention_mask = attention_mask[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")

    # Wrap the per-intervention locations in a batch dimension, then swap the
    # first two axes with permute(1, 0, 2) so the intervention axis comes first.
    unit_locations = torch.IntTensor([pyreft.get_intervention_locations(
        last_position=input_ids.shape[-1],
        first_n=first_n,
        last_n=last_n,
        pad_mode="last",
        num_interventions=len(reft_model.config.representations),
        share_weights=share_weights
    )]).permute(1, 0, 2).tolist()

    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = {
        "base": {"input_ids": input_ids, "attention_mask": attention_mask},
        "unit_locations": {"sources->base": (None, unit_locations)},
        "max_new_tokens": max_new_tokens,
        "intervene_on_prompt": True,
        "streamer": streamer,
        "eos_token_id": tokenizer.eos_token_id,
        "early_stopping": True,
        "do_sample": True
    }

    # Generation blocks until it finishes, so it runs on a worker thread while
    # this generator consumes the streamer.
    t = Thread(target=reft_model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        # Yield the whole reply so far, not just the newest chunk.
        yield "".join(outputs)


# Extra input shown with the chat box; its value is passed on to generate()
# (presumably as max_new_tokens, the only parameter after message/history).
_max_new_tokens_slider = gr.Slider(
    label="Max new tokens",
    minimum=1,
    maximum=MAX_MAX_NEW_TOKENS,
    step=1,
    value=DEFAULT_MAX_NEW_TOKENS,
)

# Chat UI backed by generate(), with no stop button and two example prompts.
chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=[_max_new_tokens_slider],
    stop_btn=None,
    examples=[
        ["who are you?"],
        ["Can you tell me how to improve my health?"],
    ],
)

# Assemble the page in display order: description, duplicate-Space button,
# the chat UI, then the license footer.
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
    # chat_interface was constructed outside this Blocks context; render()
    # places it at this point in the layout.
    chat_interface.render()
    gr.Markdown(LICENSE)

# Start the app only when run as a script, with the request queue enabled
# and configured with max_size=20.
if __name__ == "__main__":
    demo.queue(max_size=20).launch()