import gradio as gr
import os

# Build llama-cpp-python with OpenBLAS acceleration at startup (a common
# workaround on hosts where requirements.txt cannot pass CMake flags).
os.system('CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python')

import wget
import random
import multiprocessing
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

def get_num_cores():
    """Get the number of CPU cores."""
    return os.cpu_count()

def get_num_threads():
    """Return the CPU count as seen by multiprocessing (equivalent to os.cpu_count())."""
    return multiprocessing.cpu_count()
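
# Note: os.cpu_count() and multiprocessing.cpu_count() both report the total
# system CPU count, not what this process may actually use. On Linux, the
# process's CPU affinity set can be probed instead -- a sketch:
#   usable = len(os.sched_getaffinity(0))  # Linux-only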

if __name__ == "__main__":
    num_cores = get_num_cores()
    num_threads = get_num_threads()

    print(f"Number of CPU cores: {num_cores}")
    print(f"Number of threads available to the current process: {num_threads}")
# Alternative: fetch the base Llama-2 chat GGML weights directly with wget.
#url = 'https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q2_K.bin'
#filename = wget.download(url)

# Download the 2-bit quantized chat model from the Hugging Face Hub.
model_path = hf_hub_download(repo_id="TheBloke/llama2-7b-chat-codeCherryPop-qLoRA-GGML", filename="llama-2-7b-chat-codeCherryPop.ggmlv3.q2_K.bin")

# Load the model with a LoRA adapter; n_threads is hardcoded to 2 here.
llm2 = Llama(model_path=model_path, seed=random.randint(1, 2**31), lora_path="ggml-adapter-model (1).bin", use_mlock=True, n_threads=2)
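
# A sketch (not in the original app): size n_threads from the detected core
# count instead of hardcoding it, e.g.
#   llm2 = Llama(model_path=model_path, use_mlock=True,
#                n_threads=get_num_cores() or 2)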
theme = gr.themes.Soft(
    primary_hue=gr.themes.Color("#ededed", "#fee2e2", "#fecaca", "#fca5a5", "#f87171", "#ef4444", "#dc2626", "#b91c1c", "#991b1b", "#7f1d1d", "#6c1e1e"),
    neutral_hue="red",
)
title = """<h1 align="center">Chat with awesome LLAMA 2 CHAT model!</h1><br>"""
with gr.Blocks(theme=theme) as demo:
    gr.HTML(title)
    gr.HTML("This model is awesome for its size! It is only 20th the size of Chatgpt but is still decent for chatting. However like all models, LLAMA-2-CHAT can hallucinate and provide incorrect information.")
    #chatbot = gr.Chatbot()
    #msg = gr.Textbox()
    #clear = gr.ClearButton([msg, chatbot])
    #instruction = gr.Textbox(label="Instruction", placeholder=)
    def bot(user_message):
        # Commented-out alternative: instruction-style prompt tokens.
        #token1 = llm.tokenize(b"### Instruction: ")
        #token2 = llm.tokenize(instruction.encode())
        #token3 = llm2.tokenize(b"USER: ")
        #tokens3 = llm2.tokenize(user_message.encode())
        #token4 = llm2.tokenize(b"\n\n### Response:")
        # Tokenize the raw user message and stream the completion back.
        tokens = llm2.tokenize(user_message.encode())
        count = 0
        output = ""
        for token in llm2.generate(tokens, top_k=50, top_p=0.73, temp=0.72, repeat_penalty=1.1):
            text = llm2.detokenize([token])
            count += 1
            # Stop at the end-of-sequence token or after 500 generated tokens.
            if count >= 500 or token == llm2.token_eos():
                break
            output += text.decode(errors='ignore')
            # Yield the accumulated text so the output textbox updates live.
            yield output
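
    # Minimal local test (hypothetical, bypasses the UI): print the streamed
    # reply to stdout as it grows.
    #   for partial in bot("Tell me about llamas."):
    #       print(partial, end="\r", flush=True)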
    gr.HTML("Thanks for checking out this app!")
    gr.Button("Answer").click(
        fn=bot, 
        inputs=gr.Textbox(),
        outputs=gr.Textbox(),
    )
demo.queue()
demo.launch(debug=True)
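
# Note: demo.queue() is what lets the generator-based bot() stream partial
# outputs to the UI. A common variant (an assumption, not in the original) for
# running inside a container is demo.launch(server_name="0.0.0.0"); share=True
# would create a temporary public link instead.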