Spaces:
Running
Running
File size: 2,589 Bytes
5efa561 8fd3cf8 5efa561 b96b830 65ae873 b96b830 65ae873 b96b830 65ae873 5efa561 2ee2797 0b0f7fe 5efa561 65ae873 5efa561 65ae873 837474e 5efa561 837474e 5efa561 65ae873 5efa561 837474e 5efa561 837474e 5efa561 76a7dba 837474e 5efa561 837474e 2d5948c 837474e 5efa561 0b0f7fe |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import gradio as gr
import os
os.system('CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python')
import wget
from llama_cpp import Llama
import random
import os
import multiprocessing
def get_num_cores():
"""Get the number of CPU cores."""
return os.cpu_count()
def get_num_threads():
"""Get the number of threads available to the current process."""
return multiprocessing.cpu_count()
if __name__ == "__main__":
num_cores = get_num_cores()
num_threads = get_num_threads()
print(f"Number of CPU cores: {num_cores}")
print(f"Number of threads available to the current process: {num_threads}")
url = 'https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q2_K.bin'
filename = wget.download(url)
llm2 = Llama(model_path=filename, seed=random.randint(1, 2**31), lora_path="ggml-adapter-model (1).bin")
filename = wget.download(url)
theme = gr.themes.Soft(
primary_hue=gr.themes.Color("#ededed", "#fee2e2", "#fecaca", "#fca5a5", "#f87171", "#ef4444", "#dc2626", "#b91c1c", "#991b1b", "#7f1d1d", "#6c1e1e"),
neutral_hue="red",
)
title = """<h1 align="center">Chat with awesome LLAMA 2 CHAT model!</h1><br>"""
with gr.Blocks(theme=theme) as demo:
gr.HTML(title)
gr.HTML("This model is awesome for its size! It is only 20th the size of Chatgpt but is still decent for chatting. However like all models, LLAMA-2-CHAT can hallucinate and provide incorrect information.")
#chatbot = gr.Chatbot()
#msg = gr.Textbox()
#clear = gr.ClearButton([msg, chatbot])
#instruction = gr.Textbox(label="Instruction", placeholder=)
def bot(user_message):
#token1 = llm.tokenize(b"### Instruction: ")
#token2 = llm.tokenize(instruction.encode())
#token3 = llm2.tokenize(b"USER: ")
#tokens3 = llm2.tokenize(user_message.encode())
#token4 = llm2.tokenize(b"\n\n### Response:")
tokens = llm2.tokenize(user_message.encode())
count = 0
output = ""
outputs = ""
for token in llm2.generate(tokens, top_k=50, top_p=0.73, temp=0.72, repeat_penalty=1.1):
text = llm2.detokenize([token])
outputs += text.decode()
count += 1
if count >= 500 or (token == llm2.token_eos()):
break
output += text.decode(errors='ignore')
yield output
gr.HTML("Thanks for checking out this app!")
gr.Button("Answer").click(
fn=bot,
inputs=gr.Textbox(),
outputs=gr.Textbox(),
)
demo.queue()
demo.launch(debug=True)
|