Spaces:

TeamTonic
/

Tonics-Yi-200K

Paused

File size: 8,702 Bytes

85982c7
7be1664
b9cf639
f6cbd41
7be1664
 
8d621ae
7be1664
 
 
 
 
 
 
 
 
 
 
b2497fe
 
7be1664
 
 
b2497fe
7be1664
b2497fe
7be1664
a8683e1
92fd8f9
b2497fe
 
 
8586d5d
 
b2497fe
7be1664
 
b2497fe
92fd8f9
b9cf639
7be1664
b2497fe
7be1664

import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
# from optimum.bettertransformer import BetterTransformer
from tokenization_yi import YiTokenizer
import torch
import os
import bitsandbytes
import gradio as gr
import sentencepiece


# Markdown/HTML shown at the top of the page via gr.Markdown(DESCRIPTION).
# NOTE(review): the "Duplicate Space" href below ("spaces/Tonic1Tonics-Yi-6B-200K/")
# has no owner/name separator and is probably broken -- confirm the intended Space id.
DESCRIPTION = """
# Welcome to Tonic's YI-6B-200K
You can use this Space to test out the current model [01-ai/Yi-6B-200K](https://huggingface.co/01-ai/Yi-6B-200K)
You can also use YI-200 by cloning this space. Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic1Tonics-Yi-6B-200K/?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a>
Join us : TeamTonic is always making cool demos! Join our active builder's community on Discord: [Discord](https://discord.gg/nXx5wbX9) On Huggingface: [TeamTonic](https://huggingface.co/TeamTonic) & [MultiTransformer](https://huggingface.co/MultiTransformer) On Github: [Polytonic](https://github.com/tonic-ai) & contribute to [PolyGPT](https://github.com/tonic-ai/polygpt-alpha)
"""

# Configure the PyTorch CUDA caching allocator through its environment variable.
# NOTE(review): this runs after `import torch`; presumably still effective because
# it is set before the model is loaded below -- confirm against the PyTorch docs.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:126'

# Upper bound of the "Max New Tokens" slider; also enforced in generate().
MAX_MAX_NEW_TOKENS = 160000
# Initial value of the "Max New Tokens" slider.
DEFAULT_MAX_NEW_TOKENS = 20000
# Limit compared against the accumulated input in check_input_token_length().
MAX_INPUT_TOKEN_LENGTH = 160000
# NOTE(review): `device` is not referenced anywhere else in the visible file;
# run() moves inputs to `model.device` instead.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hugging Face Hub id of the checkpoint to load.
model_name = "01-ai/Yi-6B-200K"

# Previously tried alternative (kept for reference):
# tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
# The tokenizer is built from a local vocabulary file that must sit next to this script.
tokenizer = YiTokenizer(vocab_file="./tokenizer.model")
# Load the causal LM with automatic device placement, bfloat16 dtype and 4-bit
# loading requested; trust_remote_code allows the checkpoint's own modelling code to run.
model = transformers.AutoModelForCausalLM.from_pretrained(model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    load_in_4bit=True,
    trust_remote_code=True
)

# Disabled alternative: load the model without the quantisation/device options.
# model = AutoModelForCausalLM.from_pretrained("01-ai/Yi-6B-200K", trust_remote_code=True)

# Disabled: BetterTransformer conversion (its import above is commented out as well).
# model = BetterTransformer.transform(model)

def run(message, chat_history, max_new_tokens=20000, temperature=1.5, top_p=0.9, top_k=900):
    """Sample a continuation for ``message`` given the earlier chat turns.

    Args:
        message: The newest user text.
        chat_history: Earlier ``(user_input, response)`` pairs; together with
            ``message`` they are flattened into one prompt by ``get_prompt``.
        max_new_tokens: Maximum number of tokens generated after the prompt.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.
        top_k: Top-k cut-off forwarded to ``model.generate``.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        sliced off and special tokens are skipped).
    """
    prompt = get_prompt(message, chat_history)

    # Encode the prompt and place it on the same device as the model.
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
    prompt_length = input_ids.shape[-1]

    response_ids = model.generate(
        input_ids,
        # Single unpadded sequence: every position is a real token. Passing the
        # mask explicitly avoids relying on inference while pad == eos.
        attention_mask=torch.ones_like(input_ids),
        # Equivalent to max_length = prompt length + max_new_tokens, stated
        # directly; int() guards against slider values arriving as floats.
        max_new_tokens=int(max_new_tokens),
        temperature=temperature,
        top_p=top_p,
        top_k=int(top_k),
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
    )

    # Keep only the tokens produced after the prompt.
    return tokenizer.decode(response_ids[0, prompt_length:], skip_special_tokens=True)

def get_prompt(message, chat_history):
    """Flatten the chat history and the new message into one prompt string.

    Each earlier turn contributes ``"<user_input> <response> "`` in
    chronological order (user text first, then the reply to it), followed by
    the new ``message``.

    Args:
        message: The newest user text. It is stripped only when there is
            at least one earlier turn.
        chat_history: Iterable of ``(user_input, response)`` pairs. The first
            user input is kept verbatim, later ones are stripped; responses
            are always stripped. ``None`` entries are treated as empty text.

    Returns:
        The concatenated prompt string.
    """
    texts = []
    has_history = False
    for user_input, response in chat_history:
        user_text = user_input or ''
        if has_history:
            user_text = user_text.strip()
        has_history = True
        # User text precedes the response it triggered.
        texts.append(f"{user_text} {(response or '').strip()} ")
    texts.append(message.strip() if has_history else message)
    return ''.join(texts)

def clear_and_save_textbox(message):
    """Return an empty string (new textbox content) and ``message`` (value to save)."""
    cleared_text = ''
    return cleared_text, message

def display_input(message, history=None):
    """Append ``message`` to the chat history as a turn with an empty response.

    Args:
        message: The user text to show.
        history: Existing list of ``(user_input, response)`` turns. When it is
            omitted or ``None`` a fresh list is used, so state never leaks
            between calls through a shared default.

    Returns:
        The history list with ``(message, '')`` appended. A list passed in by
        the caller is extended in place and returned.
    """
    if history is None:
        history = []
    history.append((message, ''))
    return history

def delete_prev_fn(history=None):
    """Remove the most recent turn from the history and return its user text.

    Args:
        history: List of ``(user_input, response)`` turns. Omitted or ``None``
            is treated as an empty history.

    Returns:
        A ``(history, message)`` tuple: the history without its last turn, and
        the removed turn's user text (``''`` when the history was empty or the
        text was falsy).
    """
    if history is None:
        history = []
    try:
        message, _ = history.pop()
    except IndexError:
        # Nothing to undo.
        message = ''
    return history, message or ''

def generate(message, history_with_input, max_new_tokens, temperature, top_p, top_k):
    """Generate a reply and yield the updated chat history once.

    Args:
        message: The user text to answer.
        history_with_input: Chat history whose last entry is the pending turn
            for ``message``; that entry is dropped and replaced by the
            completed ``(message, response)`` pair.
        max_new_tokens: Requested generation length; must not exceed
            ``MAX_MAX_NEW_TOKENS``.
        temperature: Forwarded to ``run``.
        top_p: Forwarded to ``run``.
        top_k: Forwarded to ``run``.

    Yields:
        The history with the completed turn appended.

    Raises:
        ValueError: If ``max_new_tokens`` is above ``MAX_MAX_NEW_TOKENS``.
            Because this is a generator, the error surfaces on first iteration.
    """
    if int(max_new_tokens) > MAX_MAX_NEW_TOKENS:
        raise ValueError(
            f"max_new_tokens ({max_new_tokens}) exceeds the allowed maximum ({MAX_MAX_NEW_TOKENS})."
        )

    # Drop the placeholder turn that was added for display purposes.
    history = history_with_input[:-1]
    response = run(message, history, max_new_tokens, temperature, top_p, top_k)
    yield history + [(message, response)]


def process_example(message):
    """Run ``generate`` for ``message`` with fixed settings and an empty history.

    Returns:
        A tuple of ``''`` and the last history value yielded by ``generate``.
    """
    updates = generate(message, [], 4056, 1.9, 0.95, 900)
    # Drain the generator, keeping only its final value.
    for final_history in updates:
        continue
    return '', final_history

def check_input_token_length(message, chat_history):
    """Reject inputs whose tokenised prompt exceeds ``MAX_INPUT_TOKEN_LENGTH``.

    The length is measured in tokens of the prompt that ``get_prompt`` builds,
    rather than in characters of the message plus the number of turns.

    Args:
        message: The newest user text.
        chat_history: List of ``(user_input, response)`` turns; ``None`` is
            treated as empty.

    Raises:
        gr.Error: If the token count is above ``MAX_INPUT_TOKEN_LENGTH``.
    """
    history = list(chat_history or [])
    # The submit chains run display_input() first, so the last turn is normally
    # the pending (message, '') placeholder; drop it so the message is not
    # counted twice.
    if history:
        last_user, last_response = history[-1]
        if last_user == message and not last_response:
            history.pop()

    prompt = get_prompt(message, history)
    input_token_length = len(tokenizer.encode(prompt))
    if input_token_length > MAX_INPUT_TOKEN_LENGTH:
        raise gr.Error(f"The accumulated input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH}). Clear your chat history and try again.")

# Build the chat UI: description, chatbot with input row, history buttons,
# sampling controls, and the event chains that connect them.
with gr.Blocks(theme='ParityError/Anime') as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Group():
        # Label matches the loaded checkpoint (01-ai/Yi-6B-200K).
        chatbot = gr.Chatbot(label='TonicYi-6B-200K')
        with gr.Row():
            textbox = gr.Textbox(
                container=False,
                show_label=False,
                placeholder='As the dawn approached, they leant in and said',
                scale=10
            )
            submit_button = gr.Button('Submit', variant='primary', scale=1, min_width=0)

    with gr.Row():
        retry_button = gr.Button('Retry', variant='secondary')
        undo_button = gr.Button('Undo', variant='secondary')
        clear_button = gr.Button('Clear', variant='secondary')

    # Holds the last submitted message after the textbox has been cleared.
    saved_input = gr.State()

    with gr.Accordion(label='Advanced options', open=False):
        # system_prompt = gr.Textbox(label='System prompt', value=DEFAULT_SYSTEM_PROMPT, lines=5, interactive=False)
        max_new_tokens = gr.Slider(label='Max New Tokens', minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
        temperature = gr.Slider(label='Temperature', minimum=0.1, maximum=2.0, step=0.1, value=0.1)
        top_p = gr.Slider(label='Top-P (nucleus sampling)', minimum=0.05, maximum=1.0, step=0.05, value=0.9)
        top_k = gr.Slider(label='Top-K', minimum=1, maximum=1000, step=1, value=10)

    # Enter in the textbox: save and clear the text, show it in the chat,
    # validate the input length, then generate only if validation succeeded.
    textbox.submit(
        fn=clear_and_save_textbox,
        inputs=textbox,
        outputs=[textbox, saved_input],
        api_name=False,
        queue=False,
    ).then(
        fn=display_input,
        inputs=[saved_input, chatbot],
        outputs=chatbot,
        api_name=False,
        queue=False,
    ).then(
        fn=check_input_token_length,
        inputs=[saved_input, chatbot],
        api_name=False,
        queue=False,
    ).success(
        fn=generate,
        inputs=[
            saved_input,
            chatbot,
            max_new_tokens,
            temperature,
            top_p,
            top_k,
        ],
        outputs=chatbot,
        api_name="Generate",
    )

    # Submit button: same chain as pressing Enter, exposed under a separate API name.
    button_event_preprocess = submit_button.click(
        fn=clear_and_save_textbox,
        inputs=textbox,
        outputs=[textbox, saved_input],
        api_name=False,
        queue=False,
    ).then(
        fn=display_input,
        inputs=[saved_input, chatbot],
        outputs=chatbot,
        api_name=False,
        queue=False,
    ).then(
        fn=check_input_token_length,
        inputs=[saved_input, chatbot],
        api_name=False,
        queue=False,
    ).success(
        fn=generate,
        inputs=[
            saved_input,
            chatbot,
            max_new_tokens,
            temperature,
            top_p,
            top_k,
        ],
        outputs=chatbot,
        api_name="Cgenerate",
    )

    # Retry: drop the last turn, re-display its user text and generate again.
    # NOTE(review): unlike the submit chains, this path skips check_input_token_length.
    retry_button.click(
        fn=delete_prev_fn,
        inputs=chatbot,
        outputs=[chatbot, saved_input],
        api_name=False,
        queue=False,
    ).then(
        fn=display_input,
        inputs=[saved_input, chatbot],
        outputs=chatbot,
        api_name=False,
        queue=False,
    ).then(
        fn=generate,
        inputs=[
            saved_input,
            chatbot,
            max_new_tokens,
            temperature,
            top_p,
            top_k,
        ],
        outputs=chatbot,
        api_name=False,
    )

    # Undo: drop the last turn and put its user text back into the textbox.
    undo_button.click(
        fn=delete_prev_fn,
        inputs=chatbot,
        outputs=[chatbot, saved_input],
        api_name=False,
        queue=False,
    ).then(
        fn=lambda x: x,
        inputs=[saved_input],
        outputs=textbox,
        api_name=False,
        queue=False,
    )

    # Clear: reset both the chat history and the saved message.
    clear_button.click(
        fn=lambda: ([], ''),
        outputs=[chatbot, saved_input],
        queue=False,
        api_name=False,
    )

# Enable the request queue (max_size=5), then start the app with the API page shown.
demo.queue(max_size=5)
demo.launch(show_api=True)