Matias Stager committed on
Commit
24d70f7
1 Parent(s): 4a981b8
Files changed (1) hide show
  1. app.py +141 -4
app.py CHANGED
@@ -1,7 +1,144 @@
 
 
 
 
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
 
5
 
6
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- iface.launch()
 
1
+ import json
2
+ import os
3
+ import shutil
4
+ import requests
5
+
6
  import gradio as gr
7
+ from huggingface_hub import Repository, InferenceClient
8
+
9
# Access token for the hosted inference endpoint; None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")
API_URL = "https://api-inference.huggingface.co/models/tiiuae/falcon-180B-chat"
BOT_NAME = "Falcon"

# Markers that end a model turn; generate() passes them to the backend and
# also trims them from the streamed text.
STOP_SEQUENCES = ["\nUser:", "<|endoftext|>", " User:", "###"]

# Sample prompts shown under the chat box (one single-element list per example).
EXAMPLES = [
    ["Hey Falcon! Any recommendations for my holidays in Abu Dhabi?"],
    ["What's the Everett interpretation of quantum mechanics?"],
    ["Give me a list of the top 10 dive sites you would recommend around the world."],
    ["Can you tell me more about deep-water soloing?"],
    ["Can you write a short tweet about the release of our latest AI model, Falcon LLM?"],
]

# Only attach the Authorization header when a token is configured; otherwise
# the literal string "Bearer None" would be sent with every request.
client = InferenceClient(
    API_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else None,
)
27
+
28
def format_prompt(message, history, system_prompt):
    """Build the plain-text prompt for one chat request.

    The layout is an optional ``System:`` line, then one ``User:`` /
    ``Falcon:`` line pair per past turn, then the new user message followed
    by an open ``Falcon:`` cue for the model to complete.

    Args:
        message: The new user message.
        history: Iterable of ``(user_prompt, bot_response)`` pairs for the
            earlier turns; ``None`` or empty means no earlier turns.
        system_prompt: Optional system instruction; skipped when falsy.

    Returns:
        The assembled prompt string, ending with ``"Falcon:"``.
    """
    parts = []
    if system_prompt:
        parts.append(f"System: {system_prompt}\n")
    for user_prompt, bot_response in history or []:
        parts.append(f"User: {user_prompt}\n")
        # The "Falcon: " prefix is added here. A turn with no reply yet is
        # skipped rather than rendered as the literal text "None".
        if bot_response is not None:
            parts.append(f"Falcon: {bot_response}\n")
    parts.append(f"User: {message}\nFalcon:")
    return "".join(parts)
38
+
39
# Sampling seed for the next request; generate() uses it, then increments it.
seed = 42


def _strip_stop_sequences(text):
    """Trim any trailing stop sequence (and the whitespace before it) from *text*."""
    for stop_str in STOP_SEQUENCES:
        if text.endswith(stop_str):
            text = text[: -len(stop_str)].rstrip()
    return text


def generate(
    prompt, history, system_prompt="", temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0,
):
    """Stream the model's reply to *prompt*, yielding the text accumulated so far.

    Args:
        prompt: The new user message.
        history: Earlier ``(user, bot)`` turns, forwarded to ``format_prompt``.
        system_prompt: Optional system instruction.
        temperature: Sampling temperature; raised to at least 0.01.
        max_new_tokens: Generation budget; coerced to an int of at least 1.
        top_p: Nucleus-sampling mass; clamped to [0.01, 0.99].
        repetition_penalty: Penalty applied to repeated tokens.

    Yields:
        The reply so far, once per streamed token, with any trailing stop
        sequence removed.

    Returns:
        The final reply text (available as the generator's return value).
    """
    global seed

    # Sliders can deliver boundary values (0 tokens, top_p of 0 or 1) and
    # floats; normalise them the same way temperature always was.
    # NOTE(review): the bounds follow text-generation-inference's request
    # validation (temperature > 0, 0 < top_p < 1, max_new_tokens > 0) --
    # confirm against the deployed backend.
    temperature = max(float(temperature), 1e-2)
    top_p = min(max(float(top_p), 1e-2), 0.99)
    max_new_tokens = max(int(max_new_tokens), 1)
    repetition_penalty = float(repetition_penalty)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        stop_sequences=STOP_SEQUENCES,
        do_sample=True,
        seed=seed,
    )
    seed = seed + 1
    formatted_prompt = format_prompt(prompt, history, system_prompt)

    stream = client.text_generation(
        formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False
    )
    output = ""
    for response in stream:
        # Yield exactly once per token, after trimming any stop marker.
        output = _strip_stop_sequences(output + response.token.text)
        yield output
    return output
74
+
75
+
76
# Extra controls shown under the chat box. Their order must match the
# parameters of generate() after (prompt, history): system_prompt,
# temperature, max_new_tokens, top_p, repetition_penalty.
additional_inputs = [
    gr.Textbox("", label="Optional system prompt"),
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=256,
        # Starts at one step rather than 0: a zero-token request cannot
        # produce a reply. The 64-token grid is unchanged.
        minimum=64,
        maximum=8192,
        step=64,
        interactive=True,
        info="The maximum numbers of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.90,
        # Both endpoints (0 and 1) are excluded.
        # NOTE(review): text-generation-inference requires 0 < top_p < 1 --
        # confirm against the deployed backend.
        minimum=0.05,
        maximum=0.95,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.2,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    ),
]
115
+
116
+
117
# Page layout: a banner image next to the introduction text, with the chat
# interface below.
with gr.Blocks() as demo:
    with gr.Row():
        # Integer scales in the original 0.4 : 1 proportion (2 : 5).
        with gr.Column(scale=2):
            gr.Image("better_banner.jpeg", elem_id="banner-image", show_label=False)
        with gr.Column(scale=5):
            gr.Markdown(
                """# Falcon-180B Demo
                **Chat with [Falcon-180B-Chat](https://huggingface.co/tiiuae/falcon-180b-chat), brainstorm ideas, discuss your holiday plans, and more!**

                ✨ This demo is powered by [Falcon-180B](https://huggingface.co/tiiuae/falcon-180B) and finetuned on a mixture of [Ultrachat](https://huggingface.co/datasets/stingning/ultrachat), [Platypus](https://huggingface.co/datasets/garage-bAInd/Open-Platypus) and [Airoboros](https://huggingface.co/datasets/jondurbin/airoboros-2.1). [Falcon-180B](https://huggingface.co/tiiuae/falcon-180b) is a state-of-the-art large language model built by the [Technology Innovation Institute](https://www.tii.ae) in Abu Dhabi. It is trained on 3.5 trillion tokens (including [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb)) and available under the [Falcon-180B TII License](https://huggingface.co/spaces/tiiuae/falcon-180b-license/blob/main/LICENSE.txt). It currently holds the 🥇 1st place on the [🤗 Open LLM leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) for a pretrained model.

                🧪 This is only a **first experimental preview**: we intend to provide increasingly capable versions of Falcon in the future, based on improved datasets and RLHF/RLAIF.

                👀 **Learn more about Falcon LLM:** [falconllm.tii.ae](https://falconllm.tii.ae/)

                ➡️️ **Intended Use**: this demo is intended to showcase an early finetuning of [Falcon-180B](https://huggingface.co/tiiuae/falcon-180b), to illustrate the impact (and limitations) of finetuning on a dataset of conversations and instructions. We encourage the community to further build upon the base model, and to create even better instruct/chat versions!

                ⚠️ **Limitations**: the model can and will produce factually incorrect information, hallucinating facts and actions. As it has not undergone any advanced tuning/alignment, it can produce problematic outputs, especially if prompted to do so. Finally, this demo is limited to a session length of about 1,000 words.
                """
            )

    # Chat widget: generate() streams the replies, and additional_inputs
    # supplies its extra arguments.
    gr.ChatInterface(
        generate,
        examples=EXAMPLES,
        additional_inputs=additional_inputs,
    )

# Queue requests, keep the API closed, and start the server.
demo.queue(concurrency_count=100, api_open=False).launch(show_api=False)