import os

# Enable hf_transfer for faster Hub downloads. This must be set before
# huggingface_hub is imported (gradio imports it transitively), or it has no effect.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import gradio as gr
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from huggingface_hub import hf_hub_download

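# Stream generated tokens to stdout so generation progress is visible in the console/Space logs.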
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

repo_id = "TheBloke/Mistral-7B-OpenOrca-GGUF"
model_name = "mistral-7b-openorca.Q5_K_M.gguf"

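# Fetch the quantized GGUF weights from the Hugging Face Hub into the working directory.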
hf_hub_download(repo_id=repo_id, filename=model_name, local_dir=".")


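# Load the GGUF model with llama.cpp; n_ctx sets the context window size in tokens.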
llm = LlamaCpp(
    model_path=model_name,
    n_ctx=4096,
    callback_manager=callback_manager,
    verbose=True,  # verbose=True is required for the callback manager to receive output
)


def format_prompt(message, history):
    """Build a ChatML-formatted prompt from the chat history plus the new user message."""
    prompt = "<s>"
    for user_prompt, bot_response in history:
        prompt += f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
        prompt += f"<|im_start|>assistant\n{bot_response}<|im_end|>\n"
    prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
    return prompt


def generate(
    prompt, history, temperature=0.9, top_p=0.95, max_new_tokens=256, repetition_penalty=1.0,
):
    # Keep temperature strictly positive so sampling stays well defined.
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    formatted_prompt = format_prompt(prompt, history)

    # Generate the whole reply in one call, stopping at the ChatML end-of-turn markers.
    output = llm(
        formatted_prompt,
        temperature=temperature,
        max_tokens=max_new_tokens,
        repeat_penalty=repetition_penalty,
        top_p=top_p,
        stop=["<|im_end|>", "<|im_start|>user"],
    )
    return output


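# Extra controls shown under "Additional inputs" in the ChatInterface; their order must
# match the extra parameters of generate() after (message, history).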
additional_inputs = [
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),

    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.90,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Max new tokens",
        value=400,
        minimum=0,
        maximum=1024,
        step=64,
        interactive=True,
        info="The maximum numbers of new tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.2,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    )

]

css = """
  #mkd {
    height: 500px; 
    overflow: auto; 
    border: 1px solid #ccc; 
  }
"""

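# Assemble the UI: header HTML plus a ChatInterface that calls generate() with the sliders above.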
with gr.Blocks(css=css) as demo:
    gr.HTML("<h1><center>Mistral 7B Instruct<h1><center>")
    gr.HTML("<h3><center>In this demo, you can chat with <a href='https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1'>Mistral-7B-Instruct</a> model. πŸ’¬<h3><center>")
    gr.HTML("<h3><center>Learn more about the model <a href='https://huggingface.co/docs/transformers/main/model_doc/mistral'>here</a>. πŸ“š<h3><center>")
    gr.HTML(f"<h3><center>it's lamacpp running  {model_name} from {repo_id}<h3><center>")
    gr.ChatInterface(
        generate,
        additional_inputs=additional_inputs,
        examples=[["What is the secret to life?"], ["Write me a recipe for pancakes."]]
    )

demo.queue(max_size=None).launch(debug=True)