import gradio as gr
import os
# Enable accelerated downloads from the Hugging Face Hub via hf_transfer
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from huggingface_hub import hf_hub_download
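# Stream generated tokens to stdout (the server logs) as the model produces them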
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
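# Fetch the quantized GGUF weights from the Hub into the working directory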
repo_id = "TheBloke/Mistral-7B-OpenOrca-GGUF"
model_name = "mistral-7b-openorca.Q5_K_M.gguf"
hf_hub_download(repo_id=repo_id, filename=model_name, local_dir=".")
llm = LlamaCpp(
    model_path=model_name,
    n_ctx=4096,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)
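# Build a ChatML-style prompt (<|im_start|>/<|im_end|> markers) from the chat
# history, the prompt format Mistral-7B-OpenOrca was fine-tuned with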
def format_prompt(message, history):
    prompt = "<s>"
    for user_prompt, bot_response in history:
        prompt += f"<|im_start|>user\n {user_prompt} <|im_end|>\n"
        prompt += f"<|im_start|>assistant\n {bot_response}<|im_end|>\n"
    prompt += f"<|im_start|>user\n {message} <|im_end|>\n<|im_start|>assistant\n"
    return prompt
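# ChatInterface callback: Gradio passes the new message, the chat history, and
# the current values of the sliders declared in additional_inputs below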
def generate(
    prompt, history, temperature=0.9, top_p=0.95, max_new_tokens=256, repetition_penalty=1.0,
):
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)
    formatted_prompt = format_prompt(prompt, history)
    output = llm(formatted_prompt,
                 temperature=temperature,
                 max_tokens=max_new_tokens,
                 repeat_penalty=repetition_penalty,
                 top_p=top_p,
                 stop=["<|im_end|>", "<|im_start|>user"])
    return output
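# Extra generation controls exposed in the UI; their values are forwarded to
# generate() in the order they are listed here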
additional_inputs = [
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.90,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Max new tokens",
        value=400,
        minimum=0,
        maximum=1048,
        step=64,
        interactive=True,
        info="The maximum number of new tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.2,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    ),
]
css = """
#mkd {
height: 500px;
overflow: auto;
border: 1px solid #ccc;
}
"""
with gr.Blocks(css=css) as demo:
    gr.HTML("<h1><center>Mistral 7B Instruct</center></h1>")
    gr.HTML("<h3><center>In this demo, you can chat with the <a href='https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1'>Mistral-7B-Instruct</a> model. 💬</center></h3>")
    gr.HTML("<h3><center>Learn more about the model <a href='https://huggingface.co/docs/transformers/main/model_doc/mistral'>here</a>. 📚</center></h3>")
    gr.HTML(f"<h3><center>This demo runs {model_name} from {repo_id} with llama.cpp.</center></h3>")
    gr.ChatInterface(
        generate,
        additional_inputs=additional_inputs,
        examples=[["What is the secret to life?"], ["Write me a recipe for pancakes."]]
    )
demo.queue(max_size=None).launch(debug=True)