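# app.py -- a Gradio chat Space that serves a GGUF model with llama-cpp-python.
# The START prints bracket each startup phase so slow steps are easy to spot
# in the Space's container logs.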
print("START: BEFORE IMPORTS")
import os
import time
import gradio as gr
import copy
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
print("START: AFTER IMPORTS")
try:
    print("START: BEFORE MODEL DOWNLOAD")
    start_load_time = time.time()
    model_path = hf_hub_download(
        repo_id="NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF",
        filename="Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf",
    )
    print(f"START: AFTER MODEL DOWNLOAD -- {time.time() - start_load_time}s")
    llm = Llama(
        model_path=model_path,
        n_ctx=2048,  # context window, in tokens
        n_gpu_layers=-1,  # -1 offloads all layers to the GPU; lower it if you have less VRAM
        verbose=True,
    )
    print(f"START: AFTER LLAMA-CPP SETUP -- {time.time() - start_load_time}s")
except Exception as e:
    print(e)
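
# Gradio calls this generator with (message, history, *additional_inputs).
# Yielding successive partial strings is what makes ChatInterface stream the
# reply into the UI as it is generated.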
def generate_text(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
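    # Rebuild the whole conversation as OpenAI-style message dicts, since
    # llama-cpp-python's chat API expects the full transcript on every call.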
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
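
    # Stream the completion and yield the accumulated text after each chunk.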
response = ""
for chunk in llm.create_chat_completion(
stream=True,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
messages=messages,
):
part = chunk["choices"][0]["delta"].get("content", None)
if part:
response += part
yield response
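
# Wire the generator into a chat UI. Everything in additional_inputs shows up
# in an accordion and is passed to generate_text after (message, history).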
demo = gr.ChatInterface(
    generate_text,
    title="llama-cpp-python on GPU",
    description="Running LLM with https://github.com/abetlen/llama-cpp-python",
    examples=[
        ["How to set up a human base on Mars? Give short answer."],
        ["Explain theory of relativity to me like I’m 8 years old."],
        ["What is 9,000 * 9,000?"],
        ["Write a pun-filled happy birthday message to my friend Alex."],
        ["Justify why a penguin might make a good king of the jungle."],
    ],
    cache_examples=False,
    # The three *_btn arguments are Gradio 4.x options; Gradio 5 removed them
    # from ChatInterface, so drop them if you upgrade.
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
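
# On Spaces the container runs this file directly; launch() picks up the host
# and port from the GRADIO_SERVER_NAME / GRADIO_SERVER_PORT environment variables.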
if __name__ == "__main__":
    demo.launch()