import gradio as gr
import os

# Build llama-cpp-python with OpenBLAS at startup (a common Hugging Face Spaces
# pattern: the package is compiled on the host before it is imported below).
os.system('CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python')
from llama_cpp import Llama
import multiprocessing
from huggingface_hub import hf_hub_download  # download the model from the Hugging Face Hub

def get_num_cores():
    """Get the number of CPU cores."""
    return os.cpu_count()

def get_num_threads():
    """Get the number of logical CPUs reported by multiprocessing.

    Note: this mirrors os.cpu_count() and reports the system total, not the
    CPUs actually available to the current process.
    """
    return multiprocessing.cpu_count()
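
# On Linux, a closer measure of the CPUs this process may actually use is the
# scheduler affinity mask (hypothetical helper, not part of the original app):
def get_available_cpus():
    """Return the number of CPUs this process is allowed to run on (Linux-only)."""
    return len(os.sched_getaffinity(0))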

if __name__ == "__main__":
    num_cores = get_num_cores()
    num_threads = get_num_threads()

    print(f"Number of CPU cores: {num_cores}")
    print(f"Number of threads available to the current process: {num_threads}")
# Alternative: fetch a GGML build directly, e.g. with wget:
#   https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q2_K.bin
model_path = hf_hub_download(repo_id="brittlewis12/Octopus-v2-GGUF", filename="octopus-v2.Q4_K_S.gguf")

llm2 = Llama(model_path=model_path, use_mlock=False)
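# Assumption: tuning the thread count can speed up CPU inference; llama-cpp-python's
# Llama constructor accepts an n_threads argument, e.g.:
#   llm2 = Llama(model_path=model_path, use_mlock=False, n_threads=get_num_threads())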
theme = gr.themes.Soft(
    primary_hue=gr.themes.Color("#ededed", "#fee2e2", "#fecaca", "#fca5a5", "#f87171", "#ef4444", "#dc2626", "#b91c1c", "#991b1b", "#7f1d1d", "#6c1e1e"),
    neutral_hue="red",
)
title = """<h1 align="center">Chat with the awesome Octopus v2 model!</h1><br>"""
with gr.Blocks(theme=theme) as demo:
    gr.HTML(title)
    gr.HTML("This model is awesome for its size! It is only 20th the size of Chatgpt but is still decent for chatting. However like all models, LLAMA-2-CHAT can hallucinate and provide incorrect information.")
    def bot(user_message):
        # Tokenize the raw user message; no chat/instruction template is applied.
        tokens = llm2.tokenize(user_message.encode())
        count = 0
        output = ""
        for token in llm2.generate(tokens, top_k=50, top_p=0.73, temp=0.72, repeat_penalty=1.1):
            count += 1
            # Stop at the end-of-sequence token or after 500 generated tokens.
            if count >= 500 or token == llm2.token_eos():
                break
            output += llm2.detokenize([token]).decode(errors='ignore')
            # Yield the accumulated text so Gradio streams the response.
            yield output
    gr.HTML("Thanks for checking out this app!")
    gr.Button("Answer").click(
        fn=bot, 
        inputs=gr.Textbox(),
        outputs=gr.Textbox(),
    )
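# Queuing is required for generator (streaming) event handlers such as bot().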
demo.queue()
demo.launch(debug=True)