cedpsam committed on
Commit 65ecc4c
1 Parent(s): 57d4bf7

Create app.py

Files changed (1)
  1. app.py +113 -0
app.py ADDED
@@ -0,0 +1,113 @@
+ import gradio as gr
+ from langchain.llms import LlamaCpp
+ from langchain.prompts import PromptTemplate
+ from langchain.chains import LLMChain
+ from langchain.callbacks.manager import CallbackManager
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+ from huggingface_hub import hf_hub_download
+
+ # Stream generated tokens to stdout so progress is visible in the logs.
+ callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
+
+ repo_id = "TheBloke/Mistral-7B-OpenOrca-GGUF"
+ model_name = "mistral-7b-openorca.Q5_K_M.gguf"
+
+ # Fetch the GGUF weights from the Hugging Face Hub into the working directory.
+ hf_hub_download(repo_id=repo_id, filename=model_name, local_dir=".")
+
+
+ def format_prompt(message, history):
+     # Serialize the chat history into the [INST] ... [/INST] instruction format.
+     prompt = "<s>"
+     for user_prompt, bot_response in history:
+         prompt += f"[INST] {user_prompt} [/INST]"
+         prompt += f" {bot_response}</s> "
+     prompt += f"[INST] {message} [/INST]"
+     return prompt
+
+
+ def generate(prompt, history, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.2):
+     # The parameters after `history` match the order of `additional_inputs` below,
+     # since gr.ChatInterface passes the slider values positionally.
+     temperature = float(temperature)
+     if temperature < 1e-2:
+         temperature = 1e-2
+     top_p = float(top_p)
+
+     formatted_prompt = format_prompt(prompt, history)
+
+     llm = LlamaCpp(
+         model_path=model_name,
+         temperature=temperature,
+         max_tokens=int(max_new_tokens),
+         top_p=top_p,
+         repeat_penalty=float(repetition_penalty),
+         callback_manager=callback_manager,
+         verbose=True,  # Verbose is required to pass to the callback manager
+     )
+     output = llm(formatted_prompt)
+     return output
+
+
+ additional_inputs = [
+     gr.Slider(
+         label="Temperature",
+         value=0.9,
+         minimum=0.0,
+         maximum=1.0,
+         step=0.05,
+         interactive=True,
+         info="Higher values produce more diverse outputs",
+     ),
+     gr.Slider(
+         label="Max new tokens",
+         value=256,
+         minimum=0,
+         maximum=1048,
+         step=64,
+         interactive=True,
+         info="The maximum number of new tokens",
+     ),
+     gr.Slider(
+         label="Top-p (nucleus sampling)",
+         value=0.90,
+         minimum=0.0,
+         maximum=1.0,
+         step=0.05,
+         interactive=True,
+         info="Higher values sample more low-probability tokens",
+     ),
+     gr.Slider(
+         label="Repetition penalty",
+         value=1.2,
+         minimum=1.0,
+         maximum=2.0,
+         step=0.05,
+         interactive=True,
+         info="Penalize repeated tokens",
+     ),
+ ]
+
+ css = """
+ #mkd {
+     height: 500px;
+     overflow: auto;
+     border: 1px solid #ccc;
+ }
+ """
+
+ with gr.Blocks(css=css) as demo:
+     gr.HTML("<h1><center>Mistral 7B Instruct</center></h1>")
+     gr.HTML("<h3><center>In this demo, you can chat with the <a href='https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1'>Mistral-7B-Instruct</a> model. 💬</center></h3>")
+     gr.HTML("<h3><center>Learn more about the model <a href='https://huggingface.co/docs/transformers/main/model_doc/mistral'>here</a>. 📚</center></h3>")
+     gr.ChatInterface(
+         generate,
+         additional_inputs=additional_inputs,
+         examples=[["What is the secret to life?"], ["Write me a recipe for pancakes."]],
+     )
+
+ demo.queue().launch(debug=True)
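
As a quick reference, the snippet below is a minimal, self-contained sketch of the prompt string that format_prompt produces for the model; the one-turn history and the user message are invented purely for illustration:

    def format_prompt(message, history):
        prompt = "<s>"
        for user_prompt, bot_response in history:
            prompt += f"[INST] {user_prompt} [/INST]"
            prompt += f" {bot_response}</s> "
        prompt += f"[INST] {message} [/INST]"
        return prompt

    # One earlier exchange plus a new message (illustrative values only).
    history = [("Hi", "Hello! How can I help?")]
    print(format_prompt("What is 2+2?", history))
    # -> <s>[INST] Hi [/INST] Hello! How can I help?</s> [INST] What is 2+2? [/INST]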