picocreator committed on
Commit
59618c7
1 Parent(s): f092f6f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -4
app.py CHANGED
@@ -3,20 +3,42 @@ import os, gc, copy, torch
3
  from datetime import datetime
4
  from huggingface_hub import hf_hub_download
5
  from pynvml import *
6
- nvmlInit()
7
- gpu_h = nvmlDeviceGetHandleByIndex(0)
 
 
 
8
  ctx_limit = 2000
9
  title = "RWKV-5-World-1B5-v2-20231025-ctx4096"
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  os.environ["RWKV_JIT_ON"] = '1'
12
  os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)
13
 
 
 
 
 
 
 
14
  from rwkv.model import RWKV
15
- model_path = hf_hub_download(repo_id="BlinkDL/rwkv-5-world", filename=f"{title}.pth")
16
- model = RWKV(model=model_path, strategy='cuda fp16')
17
  from rwkv.utils import PIPELINE, PIPELINE_ARGS
18
  pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
19
 
 
20
  def generate_prompt(instruction, input=""):
21
  instruction = instruction.strip().replace('\r\n','\n').replace('\n\n','\n')
22
  input = input.strip().replace('\r\n','\n').replace('\n\n','\n')
@@ -35,6 +57,7 @@ User: {instruction}
35
 
36
  Assistant:"""
37
 
 
38
  def evaluate(
39
  ctx,
40
  token_count=200,
@@ -84,6 +107,7 @@ def evaluate(
84
  torch.cuda.empty_cache()
85
  yield out_str.strip()
86
 
 
87
  examples = [
88
  ["Assistant: Sure! Here is a very detailed plan to create flying pigs:", 333, 1, 0.3, 0, 1],
89
  ["Assistant: Sure! Here are some ideas for FTL drive:", 333, 1, 0.3, 0, 1],
@@ -108,6 +132,7 @@ Edward:''', 333, 1, 0.3, 0, 1],
108
 
109
  ##########################################################################
110
 
 
111
  with gr.Blocks(title=title) as demo:
112
  gr.HTML(f"<div style=\"text-align: center;\">\n<h1>RWKV-5 World v2 - {title}</h1>\n</div>")
113
  with gr.Tab("Raw Generation"):
@@ -130,5 +155,6 @@ with gr.Blocks(title=title) as demo:
130
  clear.click(lambda: None, [], [output])
131
  data.click(lambda x: x, [data], [prompt, token_count, temperature, top_p, presence_penalty, count_penalty])
132
 
 
133
  demo.queue(concurrency_count=1, max_size=10)
134
  demo.launch(share=False)
 
3
  from datetime import datetime
4
  from huggingface_hub import hf_hub_download
5
  from pynvml import *
6
# Flag to check if GPU is present
HAS_GPU = False

# Model title and context size limit
ctx_limit = 2000
title = "RWKV-5-World-1B5-v2-20231025-ctx4096"
model_file = "RWKV-5-World-1B5-v2-20231025-ctx4096"

# Best-effort GPU probe via NVML: on machines without NVIDIA drivers,
# nvmlInit() raises NVMLError and HAS_GPU stays False (CPU path).
try:
    nvmlInit()
    GPU_COUNT = nvmlDeviceGetCount()
    if GPU_COUNT > 0:
        HAS_GPU = True
        # gpu_h is a module-level handle; presumably used later for
        # memory queries (e.g. nvmlDeviceGetMemoryInfo) — not visible here.
        gpu_h = nvmlDeviceGetHandleByIndex(0)
except NVMLError as error:
    # Deliberate best-effort: log and continue on CPU rather than crash.
    print(error)
25
 
26
os.environ["RWKV_JIT_ON"] = '1'
os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)

# Model strategy: CUDA fp16 when a GPU is available, otherwise CPU bf16.
# BUG FIX: the original branch body was the bare string expression "cpu bf16",
# which was evaluated and discarded — MODEL_STRAT silently stayed 'cuda fp16'
# on GPU-less machines, so the model load would fail. Assign it explicitly.
MODEL_STRAT = "cuda fp16"
if not HAS_GPU:
    MODEL_STRAT = "cpu bf16"

# Load the model accordingly.
# hf_hub_download fetches (or reuses a cached copy of) the checkpoint and
# returns its local path; RWKV then loads it with the chosen strategy.
from rwkv.model import RWKV
model_path = hf_hub_download(repo_id="BlinkDL/rwkv-5-world", filename=f"{model_file}.pth")
model = RWKV(model=model_path, strategy=MODEL_STRAT)
from rwkv.utils import PIPELINE, PIPELINE_ARGS
pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
40
 
41
+ # Prompt generation
42
  def generate_prompt(instruction, input=""):
43
  instruction = instruction.strip().replace('\r\n','\n').replace('\n\n','\n')
44
  input = input.strip().replace('\r\n','\n').replace('\n\n','\n')
 
57
 
58
  Assistant:"""
59
 
60
+ # Evaluation logic
61
  def evaluate(
62
  ctx,
63
  token_count=200,
 
107
  torch.cuda.empty_cache()
108
  yield out_str.strip()
109
 
110
+ # Examples and gradio blocks
111
  examples = [
112
  ["Assistant: Sure! Here is a very detailed plan to create flying pigs:", 333, 1, 0.3, 0, 1],
113
  ["Assistant: Sure! Here are some ideas for FTL drive:", 333, 1, 0.3, 0, 1],
 
132
 
133
  ##########################################################################
134
 
135
+ # Gradio blocks
136
  with gr.Blocks(title=title) as demo:
137
  gr.HTML(f"<div style=\"text-align: center;\">\n<h1>RWKV-5 World v2 - {title}</h1>\n</div>")
138
  with gr.Tab("Raw Generation"):
 
155
  clear.click(lambda: None, [], [output])
156
  data.click(lambda x: x, [data], [prompt, token_count, temperature, top_p, presence_penalty, count_penalty])
157
 
158
+ # Gradio launch
159
  demo.queue(concurrency_count=1, max_size=10)
160
  demo.launch(share=False)