Spaces:
Running
Running
File size: 2,770 Bytes
5efa561 8fd3cf8 5efa561 b96b830 2e9bb0a 65ae873 b96b830 65ae873 b96b830 9891f35 0eea8ba 9891f35 1a4049f 5efa561 65ae873 5efa561 65ae873 837474e 5efa561 837474e 5efa561 65ae873 5efa561 837474e 5efa561 d36c00f 5efa561 76a7dba 837474e 5efa561 837474e 2d5948c 837474e 5efa561 0b0f7fe |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import gradio as gr
import os
# Build and install llama-cpp-python with OpenBLAS enabled before it is
# imported further down. pip is run through the interpreter that is executing
# this script so the package lands in the same environment, and the build
# flags are passed via the environment rather than a shell command string.
import subprocess
import sys

_pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "llama-cpp-python"],
    env={**os.environ, "CMAKE_ARGS": "-DLLAMA_OPENBLAS=on", "FORCE_CMAKE": "1"},
    check=False,
)
if _pip_result.returncode != 0:
    # Best effort: a previously installed copy may still import fine, so warn
    # instead of aborting here.
    print(
        f"pip install llama-cpp-python exited with status {_pip_result.returncode}",
        file=sys.stderr,
    )
import wget
from llama_cpp import Llama
import random
import os
import multiprocessing
from huggingface_hub import hf_hub_download #load from huggingfaces
def get_num_cores():
    """Return the CPU count as reported by ``os.cpu_count()``."""
    cpu_total = os.cpu_count()
    return cpu_total
def get_num_threads():
    """Get the number of CPUs available to the current process.

    ``multiprocessing.cpu_count()`` reports every CPU in the machine, which
    can be more than this process is allowed to run on. Where the platform
    exposes the scheduler affinity mask, its size is used instead; otherwise
    the machine-wide count is returned as before.

    Returns:
        int: Number of CPUs usable by this process (affinity mask size), or
        the total CPU count on platforms without ``os.sched_getaffinity``.
    """
    try:
        # 0 means "the calling process".
        return len(os.sched_getaffinity(0))
    except AttributeError:
        # os.sched_getaffinity does not exist on this platform.
        return multiprocessing.cpu_count()
if __name__ == "__main__":
    # When run as a script, report the detected CPU resources up front.
    num_cores, num_threads = get_num_cores(), get_num_threads()
    print(
        f"Number of CPU cores: {num_cores}",
        f"Number of threads available to the current process: {num_threads}",
        sep="\n",
    )
# Disabled alternative: direct wget download of a Llama-2-7B-Chat GGML file.
#url = 'https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q2_K.bin'
#filename = wget.download(url)

# Fetch the quantized GGUF file from the Hugging Face Hub; the value returned
# by hf_hub_download is handed to Llama as the model path.
model_path = hf_hub_download(
    filename="q4_0-bling-sheared-llama-1.3b-0.1.gguf",
    repo_id="Aryanne/Bling-Sheared-Llama-1.3B-0.1-gguf",
)

# Fresh random seed on every start; fixed thread count; no memory locking.
_llama_options = dict(
    n_threads=4,
    use_mlock=False,
    seed=random.randint(1, 2**31),
)
llm2 = Llama(model_path=model_path, **_llama_options)
# Custom primary palette: eleven shades passed positionally to gr.themes.Color.
_primary_shades = (
    "#ededed",
    "#fee2e2",
    "#fecaca",
    "#fca5a5",
    "#f87171",
    "#ef4444",
    "#dc2626",
    "#b91c1c",
    "#991b1b",
    "#7f1d1d",
    "#6c1e1e",
)
theme = gr.themes.Soft(
    neutral_hue="red",
    primary_hue=gr.themes.Color(*_primary_shades),
)

# Heading markup shown at the top of the page.
title = """<h1 align="center">Chat with awesome LLAMA 2 CHAT model!</h1><br>"""
with gr.Blocks(theme=theme) as demo:
    # Static header and disclaimer.
    # NOTE(review): the page text refers to LLAMA-2-CHAT, while the weights
    # loaded earlier in this file are a Bling-Sheared-Llama 1.3B build;
    # confirm the wording is intended.
    gr.HTML(title)
    gr.HTML("This model is awesome for its size! It is only 20th the size of Chatgpt but is still decent for chatting. However like all models, LLAMA-2-CHAT can hallucinate and provide incorrect information.")

    def bot(user_message):
        """Stream the model's continuation of ``user_message``.

        The message is tokenized and used as the prompt as-is; no instruction
        template is wrapped around it. Generation stops at the end-of-sequence
        token or once the step counter reaches 500, whichever comes first.

        Args:
            user_message: Text entered by the user.

        Yields:
            str: All text generated so far, re-emitted after every token so
            the output component shows a growing reply.
        """
        max_steps = 500
        eos_token = llm2.token_eos()
        prompt_tokens = llm2.tokenize(user_message.encode())
        sampled = llm2.generate(
            prompt_tokens, top_k=50, top_p=0.73, temp=0.72, repeat_penalty=1.1
        )

        raw_reply = b""
        for step, token in enumerate(sampled, start=1):
            if step >= max_steps or token == eos_token:
                break
            # Accumulate raw bytes and decode the whole buffer each time: a
            # multi-byte UTF-8 character can be split across tokens, and
            # decoding token by token with errors="ignore" would drop it.
            # An incomplete trailing sequence is skipped now and appears once
            # its remaining bytes arrive.
            raw_reply += llm2.detokenize([token])
            yield raw_reply.decode(errors="ignore")

    gr.HTML("Thanks for checking out this app!")
    # The button is created before the two textboxes so the components are
    # instantiated in the same order as before.
    answer_button = gr.Button("Answer")
    answer_button.click(
        fn=bot,
        inputs=gr.Textbox(),
        outputs=gr.Textbox(),
    )
# Enable the queue on the app, then launch it with debug=True.
demo.queue().launch(debug=True)
|