rwkv-v5-1b5-cpu

Sleeping

File size: 5,592 Bytes

4a2c956
e63ee0a
7c790c0
 
61b9ff7
 
 
4643bb5
a00d592
4a2c956
7c790c0
 
 
 
a00d592
 
7c790c0
a00d592
7c790c0
bd8572f
e63ee0a
 
 
2d1a42e
e63ee0a
2d1a42e
e63ee0a
2d1a42e
e63ee0a
2d1a42e
 
 
e63ee0a
2d1a42e
e63ee0a
2d1a42e
e63ee0a
 
2d1a42e
e63ee0a
 
 
 
 
7c790c0
 
 
 
e63ee0a
 
7c790c0
 
 
 
 
 
ea2eccb
7c790c0
 
 
 
 
 
 
0886a44
 
7c790c0
 
 
 
 
 
 
 
 
 
e63ee0a
 
 
29c4970
 
0083156
 
7c790c0
 
 
2d1a42e
 
d2d01b7
 
2d1a42e
 
7c790c0
 
e63ee0a
 
 
a00d592
819acfd
4643bb5
e63ee0a
 
3d38c26
a00d592
 
e63ee0a
 
 
 
 
 
 
4643bb5
2d1a42e
 
e63ee0a
2d1a42e
7c790c0
e63ee0a
7c790c0

import gradio as gr
import os, gc, copy, torch
from datetime import datetime
from huggingface_hub import hf_hub_download
from pynvml import *
nvmlInit()
gpu_h = nvmlDeviceGetHandleByIndex(0)
ctx_limit = 2000
title = "RWKV-5-World-1.5B-v2-OnlyForTest_56%_trained-20231013-ctx4096"

os.environ["RWKV_JIT_ON"] = '1'
os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)

from rwkv.model import RWKV
model_path = hf_hub_download(repo_id="BlinkDL/temp", filename=f"{title}.pth")
model = RWKV(model=model_path, strategy='cuda fp16')
from rwkv.utils import PIPELINE, PIPELINE_ARGS
pipeline = PIPELINE(model, "rwkv_vocab_v20230424")

def generate_prompt(instruction, input=""):
    instruction = instruction.strip().replace('\r\n','\n').replace('\n\n','\n')
    input = input.strip().replace('\r\n','\n').replace('\n\n','\n')
    if input:
        return f"""Instruction: {instruction}

Input: {input}

Response:"""
    else:
        return f"""User: hi

Assistant: Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.

User: {instruction}

Assistant:"""

def evaluate(
    ctx,
    token_count=200,
    temperature=1.0,
    top_p=0.7,
    presencePenalty = 0.1,
    countPenalty = 0.1,
):
    args = PIPELINE_ARGS(temperature = max(0.2, float(temperature)), top_p = float(top_p),
                     alpha_frequency = countPenalty,
                     alpha_presence = presencePenalty,
                     token_ban = [], # ban the generation of some tokens
                     token_stop = [0]) # stop generation whenever you see any token here
    all_tokens = []
    out_last = 0
    out_str = ''
    occurrence = {}
    state = None
    for i in range(int(token_count)):
        out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state)
        for n in occurrence:
            out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)

        token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
        if token in args.token_stop:
            break
        all_tokens += [token]
        for xxx in occurrence:
            occurrence[xxx] *= 0.996        
        if token not in occurrence:
            occurrence[token] = 1
        else:
            occurrence[token] += 1
        
        tmp = pipeline.decode(all_tokens[out_last:])
        if '\ufffd' not in tmp:
            out_str += tmp
            yield out_str.strip()
            out_last = i + 1

    gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
    print(f'vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}')  
    del out
    del state
    gc.collect()
    torch.cuda.empty_cache()
    yield out_str.strip()

examples = [
    [generate_prompt("Tell me about ravens."), 500, 1, 0.5, 0.4, 0.4],
    [generate_prompt("Write a python function to mine 1 BTC."), 500, 1, 0.5, 0.4, 0.4],
    [generate_prompt("Write a song about ravens."), 500, 1, 0.5, 0.4, 0.4],
    [generate_prompt("Write a story using the following information", "A man named Alex chops a tree down"), 500, 1, 0.5, 0.4, 0.4],
    [generate_prompt("Generate a list of adjectives that describe a person as brave."), 500, 1, 0.5, 0.4, 0.4],
    [generate_prompt("You have $100, and your goal is to turn that into as much money as possible with AI and Machine Learning. Please respond with detailed plan."), 500, 1, 0.5, 0.4, 0.4],
]

##########################################################################

with gr.Blocks(title=title) as demo:
    gr.HTML(f"<div style=\"text-align: center;\">\n<h1>RWKV-5 World v2 - {title}</h1>\n</div>")
    with gr.Tab("Free Generation"):
        gr.Markdown(f"This is a *** 1.5B *** [RWKV-5 World v2](https://huggingface.co/BlinkDL/rwkv-5-world), which is 100% RNN [RWKV-LM](https://github.com/BlinkDL/RWKV-LM) and attention-free. *** Please try examples first (bottom of page) *** (edit them to use your question). Demo limited to ctxlen {ctx_limit}. For best results, *** keep you prompt short and clear ***.")
        with gr.Row():
            with gr.Column():
                prompt = gr.Textbox(lines=2, label="Prompt", value=generate_prompt("Tell me about ravens."))
                token_count = gr.Slider(10, 500, label="Max Tokens", step=10, value=500)
                temperature = gr.Slider(0.2, 2.0, label="Temperature", step=0.1, value=1.0)
                top_p = gr.Slider(0.0, 1.0, label="Top P", step=0.05, value=0.5)
                presence_penalty = gr.Slider(0.0, 1.0, label="Presence Penalty", step=0.1, value=0.4)
                count_penalty = gr.Slider(0.0, 1.0, label="Count Penalty", step=0.1, value=0.4)
            with gr.Column():
                with gr.Row():
                    submit = gr.Button("Submit", variant="primary")
                    clear = gr.Button("Clear", variant="secondary")
                output = gr.Textbox(label="Output", lines=20)
        data = gr.Dataset(components=[prompt, token_count, temperature, top_p, presence_penalty, count_penalty], samples=examples, label="Example Instructions", headers=["Prompt", "Max Tokens", "Temperature", "Top P", "Presence Penalty", "Count Penalty"])
        submit.click(evaluate, [prompt, token_count, temperature, top_p, presence_penalty, count_penalty], [output])
        clear.click(lambda: None, [], [output])
        data.click(lambda x: x, [data], [prompt, token_count, temperature, top_p, presence_penalty, count_penalty])

demo.queue(concurrency_count=1, max_size=10)
demo.launch(share=False)