Commit e34e07c
Parent(s): 41f8286
add code
app.py
CHANGED
@@ -1,7 +1,6 @@
-from typing import Iterator
-
 import gradio as gr
 import torch
+import os
 
 from model import get_input_token_length, run
 
@@ -12,17 +11,6 @@ MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = 4000
 
-DESCRIPTION = """
-# Llama-2 13B Chat
-
-This Space demonstrates model [Llama-2-13b-chat](https://huggingface.co/meta-llama/Llama-2-13b-chat) by Meta, a Llama 2 model with 13B parameters fine-tuned for chat instructions. Feel free to play with it, or duplicate to run generations without a queue! If you want to run your own service, you can also [deploy the model on Inference Endpoints](https://huggingface.co/inference-endpoints).
-
-🔎 For more details about the Llama 2 family of models and how to use them with `transformers`, take a look [at our blog post](https://huggingface.co/blog/llama2).
-
-🔨 Looking for an even more powerful model? Check out the large [**70B** model demo](https://huggingface.co/spaces/ysharma/Explore_llamav2_with_TGI).
-🐇 For a smaller model that you can run on many GPUs, check our [7B model demo](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat).
-
-"""
 
 LICENSE = """
 <p/>
@@ -32,249 +20,65 @@ As a derivate work of [Llama-2-13b-chat](https://huggingface.co/meta-llama/Llama
 this demo is governed by the original [license](https://huggingface.co/spaces/huggingface-projects/llama-2-13b-chat/blob/main/LICENSE.txt) and [acceptable use policy](https://huggingface.co/spaces/huggingface-projects/llama-2-13b-chat/blob/main/USE_POLICY.md).
 """
 
-if
-def display_input(message: str,
-                  history: list[tuple[str, str]]) -> list[tuple[str, str]]:
-    history.append((message, ''))
-    return history
-
-
-def delete_prev_fn(
-        history: list[tuple[str, str]]) -> tuple[list[tuple[str, str]], str]:
-    try:
-        message, _ = history.pop()
-    except IndexError:
-        message = ''
-    return history, message or ''
+is_spaces = True if "SPACE_ID" in os.environ else False
+if is_spaces :
+    is_shared_ui = True if "gradio-discord-bots/llama-2-13b-chat-transformers" in os.environ['SPACE_ID'] else False
+else:
+    is_shared_ui = False
+is_gpu_associated = torch.cuda.is_available()
 
 
 def generate(
     message: str,
     history_with_input: list[tuple[str, str]],
-    system_prompt
-    max_new_tokens
-    temperature
-    top_p
-    top_k
-) ->
+    system_prompt=DEFAULT_SYSTEM_PROMPT,
+    max_new_tokens=DEFAULT_MAX_NEW_TOKENS,
+    temperature=1.0,
+    top_p=0.95,
+    top_k=50,
+) -> tuple[str, list[tuple[str, str]]]:
+    if is_shared_ui:
+        raise ValueError("Cannot use demo running in shared_ui. Must duplicate your own space.")
     if max_new_tokens > MAX_MAX_NEW_TOKENS:
        raise ValueError
-
+
     history = history_with_input[:-1]
-
-    try:
-        first_response = next(generator)
-        yield history + [(message, first_response)]
-    except StopIteration:
-        yield history + [(message, '')]
-    for response in generator:
-        yield history + [(message, response)]
-
-
-def process_example(message: str) -> tuple[str, list[tuple[str, str]]]:
-    generator = generate(message, [], DEFAULT_SYSTEM_PROMPT, 1024, 1, 0.95, 50)
-    for x in generator:
-        pass
-    return '', x
-
-
-def check_input_token_length(message: str, chat_history: list[tuple[str, str]], system_prompt: str) -> None:
-    input_token_length = get_input_token_length(message, chat_history, system_prompt)
+    input_token_length = get_input_token_length(message, history, system_prompt)
     if input_token_length > MAX_INPUT_TOKEN_LENGTH:
-    saved_input = gr.State()
-
-    with gr.Accordion(label='Advanced options', open=False):
-        system_prompt = gr.Textbox(label='System prompt',
-                                   value=DEFAULT_SYSTEM_PROMPT,
-                                   lines=6)
-        max_new_tokens = gr.Slider(
-            label='Max new tokens',
-            minimum=1,
-            maximum=MAX_MAX_NEW_TOKENS,
-            step=1,
-            value=DEFAULT_MAX_NEW_TOKENS,
-        )
-        temperature = gr.Slider(
-            label='Temperature',
-            minimum=0.1,
-            maximum=4.0,
-            step=0.1,
-            value=1.0,
-        )
-        top_p = gr.Slider(
-            label='Top-p (nucleus sampling)',
-            minimum=0.05,
-            maximum=1.0,
-            step=0.05,
-            value=0.95,
-        )
-        top_k = gr.Slider(
-            label='Top-k',
-            minimum=1,
-            maximum=1000,
-            step=1,
-            value=50,
-        )
-
-    gr.Examples(
-        examples=[
-            'Hello there! How are you doing?',
-            'Can you explain briefly to me what is the Python programming language?',
-            'Explain the plot of Cinderella in a sentence.',
-            'How many hours does it take a man to eat a Helicopter?',
-            "Write a 100-word article on 'Benefits of Open-Source in AI research'",
-        ],
-        inputs=textbox,
-        outputs=[textbox, chatbot],
-        fn=process_example,
-        cache_examples=True,
+        response = f'The accumulated input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH}). Please create a new thread.'
+    else:
+        response = run(message, history, system_prompt, max_new_tokens, temperature, top_p, top_k)
+    return response, history + [(message, response)]
+
+
+with gr.Blocks() as demo:
+
+    gr.Markdown(
+        """
+        # Llama-2-13b-chat-hf Discord Bot Powered by Gradio and Hugging Face Transformers
+
+        ### First install the `gradio_client`
+
+        ```bash
+        pip install gradio_client
+        ```
+
+        ### Then deploy to discord in one line! ⚡️
+
+        ```python
+        secrets = {"HUGGING_FACE_HUB_TOKEN": "<your-key-here>",}
+        client = grc.Client.duplicate("gradio-discord-bots/llama-2-13b-chat-transformers", secrets=secrets, hardware="a10g-small")
+        client.deploy_discord(api_names=["chat"])
+        ```
+        """
     )
 
     gr.Markdown(LICENSE)
-        api_name=
-        queue=False,
-    ).then(
-        fn=display_input,
-        inputs=[saved_input, chatbot],
-        outputs=chatbot,
-        api_name=False,
-        queue=False,
-    ).then(
-        fn=check_input_token_length,
-        inputs=[saved_input, chatbot, system_prompt],
-        api_name=False,
-        queue=False,
-    ).success(
-        fn=generate,
-        inputs=[
-            saved_input,
-            chatbot,
-            system_prompt,
-            max_new_tokens,
-            temperature,
-            top_p,
-            top_k,
-        ],
-        outputs=chatbot,
-        api_name=False,
-    )
-
-    button_event_preprocess = submit_button.click(
-        fn=clear_and_save_textbox,
-        inputs=textbox,
-        outputs=[textbox, saved_input],
-        api_name=False,
-        queue=False,
-    ).then(
-        fn=display_input,
-        inputs=[saved_input, chatbot],
-        outputs=chatbot,
-        api_name=False,
-        queue=False,
-    ).then(
-        fn=check_input_token_length,
-        inputs=[saved_input, chatbot, system_prompt],
-        api_name=False,
-        queue=False,
-    ).success(
-        fn=generate,
-        inputs=[
-            saved_input,
-            chatbot,
-            system_prompt,
-            max_new_tokens,
-            temperature,
-            top_p,
-            top_k,
-        ],
-        outputs=chatbot,
-        api_name=False,
-    )
-
-    retry_button.click(
-        fn=delete_prev_fn,
-        inputs=chatbot,
-        outputs=[chatbot, saved_input],
-        api_name=False,
-        queue=False,
-    ).then(
-        fn=display_input,
-        inputs=[saved_input, chatbot],
-        outputs=chatbot,
-        api_name=False,
-        queue=False,
-    ).then(
-        fn=generate,
-        inputs=[
-            saved_input,
-            chatbot,
-            system_prompt,
-            max_new_tokens,
-            temperature,
-            top_p,
-            top_k,
-        ],
-        outputs=chatbot,
-        api_name=False,
-    )
-
-    undo_button.click(
-        fn=delete_prev_fn,
-        inputs=chatbot,
-        outputs=[chatbot, saved_input],
-        api_name=False,
-        queue=False,
-    ).then(
-        fn=lambda x: x,
-        inputs=[saved_input],
-        outputs=textbox,
-        api_name=False,
-        queue=False,
-    )
-
-    clear_button.click(
-        fn=lambda: ([], ''),
-        outputs=[chatbot, saved_input],
-        queue=False,
-        api_name=False,
-    )
+    with gr.Row(visible=False):
+        state = gr.State([])
+        msg = gr.Textbox()
+        output = gr.Textbox()
+        btn = gr.Button()
+        btn.click(generate, [msg, state], [output, state], api_name="chat")
 
 demo.queue(max_size=20).launch()
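The Markdown embedded in the new app.py shows the deployment snippet without its import. A self-contained version might look like the sketch below; the `grc` alias and the placeholder token are assumptions, while the Space name and keyword arguments come straight from the snippet. After `deploy_discord` runs, the generated bot talks to the hidden `chat` route wired up by `btn.click(..., api_name="chat")`.

```python
# Hedged, self-contained version of the snippet embedded in the new app.py Markdown.
# Assumptions: gradio_client is imported under the alias `grc` used by the snippet,
# and the HF token is a placeholder you replace with your own.
import gradio_client as grc

secrets = {"HUGGING_FACE_HUB_TOKEN": "<your-key-here>"}

# Duplicate the shared Space onto GPU hardware so generate() is not blocked by the
# is_shared_ui guard, then expose the "chat" route as a Discord bot.
client = grc.Client.duplicate(
    "gradio-discord-bots/llama-2-13b-chat-transformers",
    secrets=secrets,
    hardware="a10g-small",
)
client.deploy_discord(api_names=["chat"])
```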
model.py
CHANGED
@@ -1,12 +1,18 @@
 from threading import Thread
-
+import os
 import torch
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
 model_id = 'meta-llama/Llama-2-13b-chat-hf'
 
-if torch.cuda.is_available():
+is_spaces = True if "SPACE_ID" in os.environ else False
+if is_spaces :
+    is_shared_ui = True if "gradio-discord-bots/llama-2-13b-chat-transformers" in os.environ['SPACE_ID'] else False
+else:
+    is_shared_ui = False
+is_gpu_associated = torch.cuda.is_available()
+
+if torch.cuda.is_available() and not is_shared_ui:
     config = AutoConfig.from_pretrained(model_id)
     config.pretraining_tp = 1
     model = AutoModelForCausalLM.from_pretrained(
@@ -16,9 +22,10 @@ if torch.cuda.is_available():
         load_in_4bit=True,
         device_map='auto'
     )
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
 else:
     model = None
-tokenizer =
+    tokenizer = None
 
 
 def get_prompt(message: str, chat_history: list[tuple[str, str]],
@@ -47,7 +54,7 @@ def run(message: str,
         max_new_tokens: int = 1024,
         temperature: float = 0.8,
         top_p: float = 0.95,
-        top_k: int = 50) ->
+        top_k: int = 50) -> str:
     prompt = get_prompt(message, chat_history, system_prompt)
     inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')
 
@@ -71,4 +78,5 @@ def run(message: str,
     outputs = []
     for text in streamer:
         outputs.append(text)
-
+
+    return "".join(outputs)
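With this change, run() returns the joined streamer output as a single string instead of yielding partial results. For context, the loop in the last hunk is the consuming end of the usual Thread + `TextIteratorStreamer` pattern: generation runs in a background thread while the streamer hands text fragments back to the caller. A minimal sketch of that pattern, using a tiny placeholder checkpoint rather than the 13B model so it runs anywhere, is:

```python
# Minimal sketch of the Thread + TextIteratorStreamer pattern that run() builds on.
# Assumptions: a tiny placeholder checkpoint so the example runs on CPU; the real
# model.py loads 'meta-llama/Llama-2-13b-chat-hf' in 4-bit on GPU.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = 'sshleifer/tiny-gpt2'  # placeholder, not the checkpoint from this commit
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer(['Hello'], return_tensors='pt')
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Generation runs in a worker thread; the streamer is fed as tokens are produced.
Thread(target=model.generate, kwargs=dict(inputs, streamer=streamer, max_new_tokens=16)).start()

outputs = []
for text in streamer:        # blocks until the worker pushes the next chunk
    outputs.append(text)
print(''.join(outputs))      # run() now returns this joined string (it used to yield)
```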
style.css
DELETED
@@ -1,16 +0,0 @@
-h1 {
-  text-align: center;
-}
-
-#duplicate-button {
-  margin: auto;
-  color: white;
-  background: #1565c0;
-  border-radius: 100vh;
-}
-
-#component-0 {
-  max-width: 900px;
-  margin: auto;
-  padding-top: 1.5rem;
-}
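style.css is removed outright because the rebuilt app.py constructs a bare `gr.Blocks()` and no longer references a stylesheet. If the old styling were still wanted, it could be passed back in when the Blocks is created; a hypothetical sketch (the CSS string is abbreviated to one rule from the deleted file) would be:

```python
# Hypothetical sketch, not part of this commit: re-attaching styling to the new UI.
import gradio as gr

CSS = """
h1 { text-align: center; }
"""

with gr.Blocks(css=CSS) as demo:
    gr.Markdown('# Llama-2-13b-chat-hf Discord Bot Powered by Gradio and Hugging Face Transformers')

demo.queue(max_size=20).launch()
```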