#!/usr/bin/env python
import os
import requests
from threading import Thread
from typing import Iterator
import gradio as gr
import psutil
import spaces
import torch
from time import time
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from llama_cpp import Llama
# Load the model URI from the environment and use this variable everywhere.
model_uri_hf = os.getenv("MODEL_URI_HF")
# TODO: show a warning when it is empty, with a brief description of how to set it.
# Also add a link to "how to search" (pointing to TheBloke's models by default), an example
# search link, and an example full value (a Mistral base model?).
# TODO: add info about RAM requirements.
# DEBUG: hard-coded model URI. Note that a "/blob/" URL only points to the HTML page,
# while the "/resolve/" URL below is the direct download we need.
model_uri_hf = "https://huggingface.co/TheBloke/neural-chat-7B-v3-2-GGUF/resolve/main/neural-chat-7b-v3-2.Q2_K.gguf"
# Maybe use git lfs to download instead?
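# A minimal sketch of the warning described in the TODO above (an assumption, not part of the
# original behaviour): tell the user how to set MODEL_URI_HF when it is missing. The example
# URL is only illustrative.
if not os.getenv("MODEL_URI_HF"):
    print(
        "WARNING: the MODEL_URI_HF environment variable is not set.\n"
        "Set it to a direct-download ('/resolve/') GGUF URL, for example:\n"
        "  https://huggingface.co/TheBloke/neural-chat-7B-v3-2-GGUF/resolve/main/neural-chat-7b-v3-2.Q2_K.gguf\n"
        "Quantized GGUF models can be found at https://huggingface.co/TheBloke"
    )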
# Initialise the model.
print(f"debug: init model: {model_uri_hf}")

# Check whether the model file already exists locally.
if not os.path.isfile('model.bin'):
    print("debug: can't find model locally, downloading ...")
    # Download the model.
    response = requests.get(model_uri_hf)
    # Save the model to a local file.
    with open('model.bin', 'wb') as file:
        file.write(response.content)

llm = Llama(model_path="./model.bin")  # llama.cpp model
print("debug: model loaded and ready")
# Prepare the UI texts.
title = f"# Demo for 7B Models - Quantized {model_uri_hf}"
descr = '''
Quantized to run in the free-tier hosting.
A quick way to test models or share them with others without hassle.
It runs slowly, as it is on CPU, but it is usable for basic tests.
It uses quantized models in GGUF format and llama.cpp to run them.
Powered by ...'''
print(f"DEBUG: Memory free: {psutil.virtual_memory().free / (1024.0 ** 3)} GiB")
print(f"DEBUG: Memory available: {psutil.virtual_memory().available / (1024.0 ** 3)} GiB")
print(f"DEBUG: Memory: {psutil.virtual_memory().total / (1024.0 ** 3)} GiB")
DESCRIPTION = f"# Test model: {model_uri_hf}"
if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>This space is using CPU only. Use a different one if you want to go fast and use GPU.</p>"
# TODO: probably lower these - maybe 200 in and 500 out? That should be enough for a quick test.
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
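# MAX_INPUT_TOKEN_LENGTH is not enforced anywhere yet. A hedged sketch (an assumption, not
# original behaviour) of one way to use it: count prompt tokens with llama.cpp's tokenizer
# and reject overly long prompts before generating. Defined as a helper only; nothing calls it.
def prompt_too_long(prompt: str) -> bool:
    return len(llm.tokenize(prompt.encode("utf-8"))) > MAX_INPUT_TOKEN_LENGTH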
# Make sure only one request runs at a time, or we will probably run out of RAM.
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    # Rebuild the conversation in the chat-completion message format.
    conversation = []
    for user, assistant in chat_history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})

    # Use llama.cpp to create a streaming chat completion, forwarding the UI sampling settings.
    chat_completion = llm.create_chat_completion(
        conversation,
        max_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repeat_penalty=repetition_penalty,
        stream=True,
    )

    # Yield the accumulated text as the completion chunks stream in.
    outputs = []
    for completion in chat_completion:
        if "content" in completion["choices"][0]["delta"]:
            outputs.append(completion["choices"][0]["delta"]["content"])
            yield "".join(outputs)

    # Leftover from the transformers streaming template this space was adapted from:
    # t = Thread(target=model.generate, kwargs=generate_kwargs)
    # t.start()
    # outputs = []
    # for text in streamer:
    #     outputs.append(text)
    #     yield "".join(outputs)
chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=4.0,
            step=0.1,
            value=0.6,
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=0.9,
        ),
        gr.Slider(
            label="Top-k",
            minimum=1,
            maximum=1000,
            step=1,
            value=50,
        ),
        gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.2,
        ),
    ],
    stop_btn=None,
    # TODO: add more eval examples, e.g. a longer list taken from teknium and others, maybe grouped by type.
    examples=[
        ["Hello there! How are you doing?"],
        ["Can you explain briefly to me what is the Python programming language?"],
        ["Explain the plot of Cinderella in a sentence."],
        ["How many hours does it take a man to eat a Helicopter?"],
        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
    ],
)
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(title)
    gr.Markdown(descr)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button",
        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
        # add
    )
    chat_interface.render()
if __name__ == "__main__":
    demo.queue(max_size=20).launch()