import os, copy
os.environ["RWKV_V7_ON"] = '1'
os.environ["RWKV_JIT_ON"] = '1'
os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)
from rwkv.model import RWKV
import gc, re
import gradio as gr
import base64
from io import BytesIO
import torch
import torch.nn.functional as F
from datetime import datetime
from huggingface_hub import hf_hub_download
from pynvml import *
nvmlInit()
gpu_h = nvmlDeviceGetHandleByIndex(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ctx_limit = 4000
gen_limit = 1500

########################## text rwkv ################################################################
from rwkv.utils import PIPELINE, PIPELINE_ARGS

title_v6 = "rwkv7-g1-0.1b-20250307-ctx4096"
model_path_v6 = hf_hub_download(repo_id="BlinkDL/rwkv7-g1", filename=f"{title_v6}.pth")
model_v6 = RWKV(model=model_path_v6.replace('.pth',''), strategy='cuda fp16')
pipeline_v6 = PIPELINE(model_v6, "rwkv_vocab_v20230424")
args = model_v6.args

penalty_decay = 0.996  # per-step decay applied to the repetition-penalty counters

def generate_prompt(instruction, input=""):
    instruction = instruction.strip().replace('\r\n','\n').replace('\n\n','\n')
    input = input.strip().replace('\r\n','\n').replace('\n\n','\n')
    if input:
        return f"""Instruction: {instruction}\n\nInput: {input}\n\nResponse:"""
    else:
        return f"""User: {instruction}\n\nAssistant:"""

def qa_prompt(instruction):
    instruction = instruction.strip().replace('\r\n','\n')
    instruction = re.sub(r'\n+', '\n', instruction)
    return f"User: {instruction}\n\nAssistant:"

def evaluate(
    ctx,
    token_count=200,
    temperature=1.0,
    top_p=0.7,
    presencePenalty=0.1,
    countPenalty=0.1,
):
    args = PIPELINE_ARGS(temperature=max(0.2, float(temperature)), top_p=float(top_p),
                         alpha_frequency=countPenalty,
                         alpha_presence=presencePenalty,
                         token_ban=[],     # ban the generation of some tokens
                         token_stop=[0])   # stop generation whenever you see any token here
    ctx = ctx.strip()
    all_tokens = []
    out_last = 0
    out_str = ''
    occurrence = {}  # token id -> weighted count, used for the presence/frequency penalties
    state = None
    for i in range(int(token_count)):
        # feed the (truncated) prompt on the first step, then one sampled token at a time
        input_ids = pipeline_v6.encode(ctx)[-ctx_limit:] if i == 0 else [token]
        out, state = model_v6.forward(input_ids, state)
        for n in occurrence:
            out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)

        token = pipeline_v6.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
        if token in args.token_stop:
            break
        all_tokens += [token]
        for xxx in occurrence:
            occurrence[xxx] *= penalty_decay

        ttt = pipeline_v6.decode([token])
        www = 1
        if ttt in ' \t0123456789':
            www = 0  # don't penalize whitespace and digits
        #elif ttt in '\r\n,.;?!"\':+-*/=#@$%^&_`~|<>\\()[]{},。;“”:?!()【】':
        #    www = 0.5
        if token not in occurrence:
            occurrence[token] = www
        else:
            occurrence[token] += www

        tmp = pipeline_v6.decode(all_tokens[out_last:])
        if '\ufffd' not in tmp:  # only emit text once it decodes to valid UTF-8
            out_str += tmp
            yield out_str.strip()
            out_last = i + 1

    gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f'{timestamp} - vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}')
    del out
    del state
    gc.collect()
    torch.cuda.empty_cache()
    yield out_str.strip()
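
# A minimal sketch (not part of the original app) of driving the evaluate() generator directly,
# e.g. for a local smoke test without the Gradio UI. The helper name `_smoke_test` and the
# prompt / sampling values below are illustrative assumptions, not taken from the source.
def _smoke_test(prompt="User: hello\n\nAssistant:"):
    out = ""
    for out in evaluate(prompt, token_count=64, temperature=1.0, top_p=0.7):
        pass  # evaluate() yields the full accumulated completion after each decoded chunk
    return out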

examples = [
    # the sampling settings in this row are assumed (gen_limit plus the evaluate() defaults);
    # the remaining example rows were not recovered from the source
    ["User: simulate SpaceX mars landing using python\n\nAssistant:", gen_limit, 1.0, 0.7, 0.1, 0.1],
]

with gr.Blocks(title=title_v6) as demo:
    gr.HTML(f"<div style=\"text-align: center;\">\n<h1>{title_v6}</h1>\n</div>\n")
    with gr.Tab("=== Base Model (Raw Generation) ==="):
        gr.Markdown(f'This is [RWKV7 G1](https://huggingface.co/BlinkDL/rwkv7-g1) 0.1B (!!!) L12-D768 reasoning base LM - an attention-free pure RNN [RWKV-LM](https://github.com/BlinkDL/RWKV-LM). Supports 100+ world languages and code. Check [400+ Github RWKV projects](https://github.com/search?o=desc&p=1&q=rwkv&s=updated&type=Repositories). *** Can try examples (bottom of page) *** (can edit them). Demo limited to ctxlen {ctx_limit}.')
        with gr.Row():
            with gr.Column():
                prompt = gr.Textbox(lines=6, label="Prompt", value="User: simulate SpaceX mars landing using python\n\nAssistant: