!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install --upgrade --force-reinstall llama-cpp-python --no-cache-dir
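# The CMAKE_ARGS flag above builds llama-cpp-python with cuBLAS so inference can be
# offloaded to an NVIDIA GPU; a plain `pip install llama-cpp-python` gives a CPU-only
# build. (Newer llama-cpp-python releases may expect -DGGML_CUDA=on instead.)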

from llama_cpp import Llama
import torch
import gradio as gr

torch_device = "cuda" if torch.cuda.is_available() else "cpu"
print("Running on device:", torch_device)
print("CPU threads:", torch.get_num_threads())

# n_ctx=50 in the original could not hold the prompt plus up to 200 new tokens from the
# slider below, so it is raised here; n_gpu_layers (an assumption about the intended
# setup) offloads all layers to the GPU when CUDA is available, otherwise runs on CPU.
llm = Llama(
    model_path='Llama-2-ko-7B-chat-gguf-q4_0.bin',
    n_ctx=512,
    n_threads=8,
    n_batch=5,
    n_gpu_layers=-1 if torch_device == "cuda" else 0,
)

def gen(x, max_new_tokens):
    # Wrap the user input in a Korean instruction/response prompt format
    # ("명령어" = instruction, "응답" = response) and strip SentencePiece's '▁'
    # word-boundary marker from the echoed output.
    output = llm(f"### 명령어:\n{x}\n\n### 응답:", max_tokens=max_new_tokens, stop=["###", "\n", ":"], echo=True)
    return output['choices'][0]['text'].replace('▁', ' ')
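
# Example (illustrative only): a quick smoke test of gen() before wiring up the UI;
# the exact completion depends on the quantized weights loaded above.
# print(gen("우리 여행 갈래?", max_new_tokens=32))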

def reset_textbox():
    # Helper to clear a textbox component (not currently wired into the UI below).
    return gr.update(value='')

with gr.Blocks() as demo:
    gr.Markdown(
       "duplicated from beomi/KoRWKV-1.5B, baseModel:Llama-2-ko-7B-chat-gguf-q4_0"
    )
    
    with gr.Row():
        with gr.Column(scale=4):
            user_text = gr.Textbox(
                placeholder='우리 여행 갈래?',  # "Shall we go on a trip?"
                label="User input"
            )
            model_output = gr.Textbox(label="Model output", lines=10, interactive=False)
            button_submit = gr.Button(value="Submit")
        with gr.Column(scale=1):
            max_new_tokens = gr.Slider(
                minimum=1, maximum=200, value=20, step=1, interactive=True, label="Max New Tokens",
            )
    
    button_submit.click(gen, [user_text, max_new_tokens], model_output)
    # .queue() already enables request queuing; the enable_queue launch flag is
    # redundant here and removed in newer Gradio versions.
    demo.queue(max_size=32).launch()