Spaces:
Sleeping
Sleeping
File size: 6,047 Bytes
4a2c956 e63ee0a 7c790c0 61b9ff7 a00d592 4a2c956 7c790c0 a00d592 7c790c0 a00d592 7c790c0 e63ee0a 7c790c0 e63ee0a 7c790c0 e63ee0a 61b9ff7 7c790c0 ea2eccb 7c790c0 0886a44 7c790c0 e63ee0a 29c4970 0083156 7c790c0 a00d592 7c790c0 e63ee0a a00d592 e63ee0a a00d592 e63ee0a a00d592 e63ee0a 7c790c0 e63ee0a 7c790c0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import gradio as gr
import os, gc, copy, torch
from datetime import datetime
from huggingface_hub import hf_hub_download
from pynvml import *
# Initialise NVML and keep a handle to GPU index 0; evaluate() passes this
# handle to nvmlDeviceGetMemoryInfo to print VRAM usage after a generation.
nvmlInit()
gpu_h = nvmlDeviceGetHandleByIndex(0)
# Prompt length cap: evaluate() feeds only the last `ctx_limit` tokens of the
# encoded prompt to the model. The value is also shown in the UI help text.
ctx_limit = 3000
# Checkpoint name: used as the ".pth" filename to download and as the page title.
title = "RWKV-5-World-1.5B-v2-OnlyForTest_56%_trained-20231013-ctx4096"
# NOTE(review): these flags are set before `rwkv.model` is imported,
# presumably because the package reads them at import time -- keep this order.
os.environ["RWKV_JIT_ON"] = '1'
os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)
from rwkv.model import RWKV
# Fetch the checkpoint file from the "BlinkDL/temp" Hub repo and load it with
# the 'cuda fp16' strategy.
model_path = hf_hub_download(repo_id="BlinkDL/temp", filename=f"{title}.pth")
model = RWKV(model=model_path, strategy='cuda fp16')
from rwkv.utils import PIPELINE, PIPELINE_ARGS
# `pipeline` supplies the encode / decode / sample_logits calls used by evaluate().
pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
def generate_prompt(instruction, input=None):
    """Build the instruction-following prompt that is fed to the model.

    Both fields are stripped, CRLF line endings are converted to LF, and each
    pair of consecutive newlines is replaced by a single newline (one
    left-to-right pass, so longer runs of newlines are shortened rather than
    fully collapsed).

    Args:
        instruction: The task description.
        input: Optional extra context for the task. ``None`` (the default)
            is treated like an empty string.

    Returns:
        The prompt text, ending with the ``# Response:`` header line. The
        ``# Input:`` section is included only when ``input`` is non-empty
        after normalisation.
    """
    instruction = instruction.strip().replace('\r\n', '\n').replace('\n\n', '\n')
    # `input` defaults to None, so guard before calling str methods on it.
    input = ('' if input is None else input).strip().replace('\r\n', '\n').replace('\n\n', '\n')
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
# Instruction:
{instruction}
# Input:
{input}
# Response:
"""
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
# Instruction:
{instruction}
# Response:
"""
def evaluate(
    instruction,
    input=None,
    token_count=200,
    temperature=1.0,
    top_p=0.7,
    presencePenalty = 0.1,
    countPenalty = 0.1,
):
    """Stream a model completion for an instruction and optional input.

    This is a generator. After each sampled token whose decoded text is
    complete, it yields the whole response so far (stripped); once generation
    ends it yields the final text one more time.

    Args:
        instruction: The task description.
        input: Optional extra context. ``None`` is treated as empty.
        token_count: Maximum number of tokens to sample (converted with int()).
        temperature: Sampling temperature; values below 0.2 are raised to 0.2.
        top_p: Nucleus-sampling threshold (converted with float()).
        presencePenalty: Flat amount subtracted from the logit of every token
            that has already been generated.
        countPenalty: Extra amount subtracted per (decayed) occurrence of an
            already generated token.

    Yields:
        The accumulated response text, stripped of surrounding whitespace.
    """
    args = PIPELINE_ARGS(
        temperature=max(0.2, float(temperature)),
        top_p=float(top_p),
        alpha_frequency=countPenalty,
        alpha_presence=presencePenalty,
        token_ban=[],  # ban the generation of some tokens
        token_stop=[0],  # stop generation whenever you see any token here
    )

    instruction = instruction.strip().replace('\r\n', '\n').replace('\n\n', '\n')
    # `input` defaults to None, so guard before calling str methods on it.
    input = ('' if input is None else input).strip().replace('\r\n', '\n').replace('\n\n', '\n')
    ctx = generate_prompt(instruction, input)

    all_tokens = []
    out_last = 0  # index into all_tokens of the first token not yet emitted
    out_str = ''
    occurrence = {}  # token id -> decayed count of how often it was generated
    # Bind both names up front so the cleanup in `finally` cannot hit an
    # unbound local when the loop body never runs (token_count < 1).
    out = None
    state = None
    try:
        for i in range(int(token_count)):
            # First step consumes the (truncated) prompt; every later step
            # feeds back only the token sampled in the previous step.
            step_tokens = pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token]
            out, state = model.forward(step_tokens, state)

            # Repetition penalty: lower the logits of tokens already generated.
            for n in occurrence:
                out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)

            token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
            if token in args.token_stop:
                break
            all_tokens.append(token)

            # Decay older counts so the penalty fades, then count this token.
            for seen in occurrence:
                occurrence[seen] *= 0.996
            occurrence[token] = occurrence.get(token, 0) + 1

            # Hold text back while the pending tail decodes to U+FFFD
            # (presumably a multi-byte character split across tokens).
            tmp = pipeline.decode(all_tokens[out_last:])
            if '\ufffd' not in tmp:
                out_str += tmp
                yield out_str.strip()
                out_last = i + 1
    finally:
        # Runs on normal completion, on errors, and when the consumer closes
        # the generator early, so GPU memory is always released.
        gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
        print(f'vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}')
        del out
        del state
        gc.collect()
        torch.cuda.empty_cache()
    yield out_str.strip()
# Sampling settings shared by every example row, in the column order used by
# the example Dataset: max tokens, temperature, top-p, presence penalty,
# count penalty.
_EXAMPLE_SETTINGS = (300, 1, 0.5, 0.4, 0.4)

# (instruction, input) pairs offered as clickable examples in the UI.
_EXAMPLE_PROMPTS = (
    ("Tell me about ravens.", ""),
    ("Write a python function to mine 1 BTC, with details and comments.", ""),
    ("Write a song about ravens.", ""),
    ("Explain the following metaphor: Life is like cats.", ""),
    ("Write a story using the following information", "A man named Alex chops a tree down"),
    ("Generate a list of adjectives that describe a person as brave.", ""),
    ("You have $100, and your goal is to turn that into as much money as possible with AI and Machine Learning. Please respond with detailed plan.", ""),
)

# One row per example: [instruction, input, *sampling settings].
examples = [
    [prompt_text, context_text, *_EXAMPLE_SETTINGS]
    for prompt_text, context_text in _EXAMPLE_PROMPTS
]
##########################################################################
# Gradio front end: an "Instruct mode" tab with two prompt text boxes, five
# sampling sliders, Submit/Clear buttons, an output box, and a clickable
# dataset of example prompts.
# NOTE(review): indentation is not visible in this view of the file, so the
# exact nesting of the `with` layout contexts below should be confirmed
# against the real source before editing the layout.
with gr.Blocks(title=title) as demo:
gr.HTML(f"<div style=\"text-align: center;\">\n<h1>RWKV-5 World v2 - {title}</h1>\n</div>")
with gr.Tab("Instruct mode"):
gr.Markdown(f"This is a 1.5B [RWKV-5 World v2](https://huggingface.co/BlinkDL/rwkv-5-world) 100% RNN [RWKV-LM](https://github.com/BlinkDL/RWKV-LM). *** Please try examples first (bottom of page) *** (edit them to use your question). Demo limited to ctxlen {ctx_limit}. For best results, *** keep you prompt short and clear ***.")
with gr.Row():
with gr.Column():
# Input widgets. Their initial values differ from evaluate()'s own defaults
# (e.g. 500 max tokens here vs. token_count=200 there); the widget values
# are what evaluate() receives when Submit is clicked.
instruction = gr.Textbox(lines=2, label="Instruction", value="Tell me about ravens.")
# NOTE: this module-level name shadows the builtin `input`.
input = gr.Textbox(lines=2, label="Input", placeholder="none")
token_count = gr.Slider(10, 500, label="Max Tokens", step=10, value=500)
temperature = gr.Slider(0.2, 2.0, label="Temperature", step=0.1, value=1.0)
top_p = gr.Slider(0.0, 1.0, label="Top P", step=0.05, value=0.5)
presence_penalty = gr.Slider(0.0, 1.0, label="Presence Penalty", step=0.1, value=0.4)
count_penalty = gr.Slider(0.0, 1.0, label="Count Penalty", step=0.1, value=0.4)
with gr.Column():
with gr.Row():
submit = gr.Button("Submit", variant="primary")
clear = gr.Button("Clear", variant="secondary")
output = gr.Textbox(label="Output", lines=5)
# Example table: one column per input widget, rows taken from `examples`.
data = gr.Dataset(components=[instruction, input, token_count, temperature, top_p, presence_penalty, count_penalty], samples=examples, label="Example Instructions", headers=["Instruction", "Input", "Max Tokens", "Temperature", "Top P", "Presence Penalty", "Count Penalty"])
# Submit runs evaluate() on the seven widget values and writes to `output`.
# evaluate() is a generator, so `output` presumably updates as text streams in.
submit.click(evaluate, [instruction, input, token_count, temperature, top_p, presence_penalty, count_penalty], [output])
# Clear sends None to the output box.
clear.click(lambda: None, [], [output])
# Clicking an example row copies its values into the seven input widgets.
data.click(lambda x: x, [data], [instruction, input, token_count, temperature, top_p, presence_penalty, count_penalty])
# NOTE(review): presumably handles one request at a time with at most 10
# waiting in the queue -- confirm against the installed Gradio version's
# queue() signature.
demo.queue(concurrency_count=1, max_size=10)
# share=False: do not request a public share link.
demo.launch(share=False)
|