# -*- coding: utf-8 -*-
"""LLaMa 2 Prompting Guide with Gradio.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1SSv6lzX3Byu50PooYogmiwHqf5PQN68E

## Introduction

In this Colab notebook, we are going to chat with Llama-2 7B Chat. By the end of this tutorial, you'll be able to interact with the model and use it to generate conversational responses. Whether you're curious about chatbot technology or simply want to see a machine-generated answer to a particular question, this notebook serves as a comprehensive guide.

## Workflow

1. **Installations**: Set up our environment with the required libraries.
2. **Prerequisites**: Ensure we have access to the Llama-2 7B model on Hugging Face.
3. **Loading the Model & Tokenizer**: Retrieve the model and tokenizer for our session.
4. **Creating the Llama Pipeline**: Prepare our model for generating responses.
5. **Interacting with Llama through Gradio's ChatInterface**: Prompt the model for answers and explore its capabilities.

Let's dive in!

**First, change the runtime to GPU** (`Runtime -> Change runtime type`).

You can play with Llama-2 7B Chat here: https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat

## Installations

Before we proceed, we need to ensure that the essential libraries are installed:

- `Hugging Face Transformers`: Provides a straightforward way to use pre-trained models.
- `PyTorch`: Serves as the backbone for deep learning operations.
- `Accelerate`: Optimizes PyTorch operations, especially device placement on the GPU.
"""

#!pip install transformers torch accelerate

"""To use `gr.ChatInterface()` we need the newest Gradio."""

#!pip install --upgrade gradio

"""If `!pip install --upgrade gradio` returns an error that says `NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968`, do the following:

1. Uncomment the next cell.
2. Run the cell.
3. Restart the runtime: `Runtime -> Restart Runtime`.
"""

# import locale
# locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

"""### Prerequisites

To load our desired model, `meta-llama/Llama-2-7b-chat-hf`, we first need to authenticate ourselves on Hugging Face. This ensures we have the correct permissions to fetch the model.

1. Request access to the model on Hugging Face: [Link](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf).
2. Use the Hugging Face CLI to log in and verify your authentication status.
"""

#!huggingface-cli login
#!huggingface-cli whoami

"""### Loading the Model & Tokenizer

Here we prepare our session by loading both the Llama model and its associated tokenizer. The tokenizer converts our text prompts into token IDs that the model can understand and process.
"""

from transformers import AutoTokenizer
import transformers
import torch
import locale
import huggingface_hub
import os

# Work around Colab's non-UTF-8 default locale; accept any arguments callers may pass.
locale.getpreferredencoding = lambda *args: "UTF-8"

model = "meta-llama/Llama-2-7b-chat-hf"

# Expects a Hugging Face access token in the API_TOKEN environment variable.
huggingface_hub.login(os.environ["API_TOKEN"])

tokenizer = AutoTokenizer.from_pretrained(model, use_auth_token=True)

"""### Creating the Llama Pipeline

We'll set up a pipeline for text generation. This pipeline simplifies the process of feeding prompts to our model and receiving generated text as output.

*Note*: This cell takes 2-3 minutes to run.
"""

from transformers import pipeline

llama_pipeline = pipeline(
    "text-generation",  # LLM task
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
)
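"""Before moving on, we can optionally sanity-check the pipeline. The cell below is a minimal sketch (the test prompt and the `max_length` value are arbitrary choices added for illustration, not part of the original notebook): it confirms that a GPU is visible and that the pipeline produces text.
"""

# Optional sanity check: verify GPU availability and run one short generation.
print("CUDA available:", torch.cuda.is_available())
print("Model device:", llama_pipeline.model.device)

test_sequences = llama_pipeline(
    "Hello, how are you?",  # illustrative test prompt
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=64,  # keep the test generation short
)
print(test_sequences[0]["generated_text"])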
"""## Comparing Methods: Basic vs. Advanced

Before diving into our advanced method for conversational interaction, let's explore a basic method for generating responses using a `get_response()` function. We will later discuss its limitations and how the advanced method overcomes them.
"""

def get_response(prompt: str) -> None:
    """
    Generate a response from the Llama model.

    Parameters:
        prompt (str): The user's input/question for the model.

    Returns:
        None: Prints the model's response.
    """
    sequences = llama_pipeline(
        prompt,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=256,
    )
    print("Chatbot:", sequences[0]['generated_text'])

# System prompt in the Llama-2 chat format (<<SYS>> ... <</SYS>> inside the first [INST] block).
SYSTEM_PROMPT = """[INST] <<SYS>>
You are a medical expert who suggests medical screenings and tests to users when they describe their symptoms.
Keep your answers short and concise. Split the suggested tests into most important and optional according to the user's history and symptoms.
<</SYS>>
"""

# Formatting function for message and history
def format_message(message: str, history: list, memory_limit: int = 3) -> str:
    """
    Formats the message and history for the Llama model.

    Parameters:
        message (str): Current message to send.
        history (list): Past conversation history as [user_message, model_answer] pairs.
        memory_limit (int): Limit on how many past interactions to consider.

    Returns:
        str: Formatted message string.
    """
    # Always keep len(history) <= memory_limit.
    if len(history) > memory_limit:
        history = history[-memory_limit:]

    if len(history) == 0:
        return SYSTEM_PROMPT + f"{message} [/INST]"

    # The first turn carries the system prompt.
    formatted_message = SYSTEM_PROMPT + f"{history[0][0]} [/INST] {history[0][1]} "

    # Handle the rest of the conversation history.
    for user_msg, model_answer in history[1:]:
        formatted_message += f"[INST] {user_msg} [/INST] {model_answer} "

    # Handle the current message.
    formatted_message += f"[INST] {message} [/INST]"

    return formatted_message

"""### Getting Responses

Next, we define the function that generates responses for the chat interface. It formats the prompt with `format_message()` (an example of the resulting prompt string is shown in the appendix at the end of this notebook), runs it through the pipeline, and strips the prompt from the generated output.
"""

# Generate a response from the Llama model
def get_llama_response(message: str, history: list) -> str:
    """
    Generates a conversational response from the Llama model.

    Parameters:
        message (str): User's input message.
        history (list): Past conversation history.

    Returns:
        str: Generated response from the Llama model.
    """
    query = format_message(message, history)

    sequences = llama_pipeline(
        query,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=1024,
    )

    generated_text = sequences[0]['generated_text']
    response = generated_text[len(query):].strip()  # Remove the prompt from the output

    print("Chatbot:", response)
    return response

"""### Interacting with Llama through Gradio's ChatInterface

Finally, we wire `get_llama_response()` into Gradio's `ChatInterface`. Uncomment the next cell to launch the chat UI.
"""

# import gradio as gr
# gr.ChatInterface(get_llama_response).launch()

"""### Conclusion

Thanks to the Hugging Face libraries, creating a pipeline to chat with Llama 2 (or any other open-source LLM) is quite easy. But if you are used to working with much larger models such as GPT-4, you may need to adjust your expectations about response quality.
"""
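"""### Appendix: Inspecting the Prompt Format

As a quick illustration of the Llama-2 chat template assembled by `format_message()`, the sketch below prints the prompt produced for a hypothetical two-turn history. The example messages are invented purely for demonstration and are not part of the original notebook.
"""

# Hypothetical conversation history: a list of [user_message, model_answer] pairs,
# which is the format Gradio's ChatInterface passes to the callback.
example_history = [
    ["I have a persistent headache and blurred vision.",
     "How long have you been experiencing these symptoms?"],
    ["About two weeks now.",
     "Thank you, that is useful context."],
]

example_prompt = format_message("Which screenings would you recommend?", example_history)
print(example_prompt)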