import torch
import gradio as gr
from llama_cpp import Llama

# llama.cpp runs the quantized GGUF model on CPU; torch is used here only to
# report the available device and thread count.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
print("Running on device:", torch_device)
print("CPU threads:", torch.get_num_threads())

# n_ctx must hold the prompt plus up to 200 new tokens from the slider below;
# the original n_ctx=40 overflows the context window and is a likely cause of
# the Space's runtime error.
llm = Llama(
    model_path='Llama-2-ko-7B-chat-gguf-q4_0.bin',
    n_ctx=512,
    n_threads=8,
    n_batch=5,
)
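
# A minimal smoke test (an addition, not part of the original Space):
# Llama.__call__ returns an OpenAI-style completion dict, so the generated
# text lives at output['choices'][0]['text']. Uncomment to check that the
# model loads and generates locally:
#
# out = llm("### 명령어:\n안녕하세요?\n\n### 응답:", max_tokens=16, stop=["###"])  # prompt: "Hello?"
# print(out['choices'][0]['text'])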

def gen(x, max_new_tokens):
    # "### 명령어:" = instruction, "### 응답:" = response; echo=True returns the
    # prompt plus completion, and '▁' (the SentencePiece word marker) becomes a space.
    output = llm(f"### 명령어:\n{x}\n\n### 응답:", max_tokens=max_new_tokens, stop=["###"], echo=True)
    return output['choices'][0]['text'].replace('▁', ' ')
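
# A hedged sketch of a streaming variant (an assumption about intent, not in
# the original Space): llama-cpp-python accepts stream=True, which yields
# partial completion chunks, and Gradio treats a generator event handler as a
# streaming handler, updating the output textbox as tokens arrive.
def gen_stream(x, max_new_tokens):
    text = ""
    for chunk in llm(f"### 명령어:\n{x}\n\n### 응답:", max_tokens=max_new_tokens,
                     stop=["###"], stream=True):
        text += chunk['choices'][0]['text']  # each chunk carries only the new text
        yield text.replace('▁', ' ')
# Wiring it up would replace the click handler below, e.g.:
# button_submit.click(gen_stream, [user_text, max_new_tokens], model_output)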

def reset_textbox():
    # Clears the input textbox; defined but not wired to any event here.
    return gr.update(value='')
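
# If the intent was to clear the input after each submission (a guess; the
# original never attaches reset_textbox to an event), an extra handler could
# be added next to the existing click handler inside the Blocks context:
# button_submit.click(reset_textbox, [], user_text)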

with gr.Blocks() as demo:
    gr.Markdown(
        "Duplicated from beomi/KoRWKV-1.5B; base model: Llama-2-ko-7B-chat-gguf-q4_0"
    )
    with gr.Row():
        with gr.Column(scale=4):
            user_text = gr.Textbox(
                placeholder='우리 여행 갈래?',  # "Shall we go on a trip?"
                label="User input"
            )
            model_output = gr.Textbox(label="Model output", lines=10, interactive=False)
            button_submit = gr.Button(value="Submit")
        with gr.Column(scale=1):
            max_new_tokens = gr.Slider(
                minimum=1, maximum=200, value=20, step=1, interactive=True, label="Max New Tokens",
            )
    button_submit.click(gen, [user_text, max_new_tokens], model_output)

# queue() already enables request queueing; the enable_queue argument to
# launch() is deprecated and has been removed in recent Gradio versions.
demo.queue(max_size=32).launch()