File size: 6,233 Bytes
76f2392
 
 
 
 
 
 
 
 
51a0881
 
 
76f2392
 
 
 
 
 
67260a2
1da48e6
0f143d9
76f2392
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110c73f
76f2392
124ef84
76f2392
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed26cb6
 
 
76f2392
 
 
ed26cb6
 
 
 
51a0881
76f2392
 
 
ed26cb6
76f2392
 
 
 
 
 
 
 
 
 
 
51a0881
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import json
import os
import shutil
import requests

import gradio as gr
from huggingface_hub import Repository, InferenceClient

HF_TOKEN = os.environ.get("HF_TOKEN", None)
USER_NAME = os.environ.get("USER_NAME", None)
APP_PASSWORD = os.environ.get("APP_PASSWORD", None)

API_URL = "https://api-inference.huggingface.co/models/tiiuae/falcon-180B-chat"
BOT_NAME = "Falcon"

STOP_SEQUENCES = ["\nUser:", "<|endoftext|>", " User:", "###"]

EXAMPLES = [
    ["圖書館與家相距864 m。爸爸以0.7 m/s的平均速率從圖書館沿一直路步行回家。同一時間,子琪以0.5 m/s的平均速率從家步向圖書館。兩人會在多少分鐘後相遇?請先將問題中的數據列表出來 用詳細的步驟方式回覆 "],
    ["圖書館與家相距864 m。爸爸以0.7 m/s的平均速率從圖書館沿一直路步行回家。同一時間,子琪以0.5 m/s的平均速率從家步向圖書館。兩人會在多少分鐘後相遇?請用詳細的步驟方式回覆。"],
    ["What is the history of The Education University of Hong Kong?"],
    ["Can you tell me more about deep-water soloing?"],
    ["Can you write a short tweet about the release of our latest AI model, Falcon LLM?"]
    ]

client = InferenceClient(
    API_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}"},
)

def format_prompt(message, history, system_prompt):
  prompt = ""
  if system_prompt:
    prompt += f"System: {system_prompt}\n"
  for user_prompt, bot_response in history:
    prompt += f"User: {user_prompt}\n"
    prompt += f"Falcon: {bot_response}\n" # Response already contains "Falcon: "
  prompt += f"""User: {message}
Falcon:"""
  return prompt

seed = 42

def generate(
    prompt, history, system_prompt="", temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0,
):
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)
    global seed
    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        stop_sequences=STOP_SEQUENCES,
        do_sample=True,
        seed=seed,
    )
    seed = seed + 1
    formatted_prompt = format_prompt(prompt, history, system_prompt)

    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
    output = ""

    for response in stream:
        output += response.token.text

        for stop_str in STOP_SEQUENCES:
            if output.endswith(stop_str):
                output = output[:-len(stop_str)]
                output = output.rstrip()
                yield output
        yield output
    return output


additional_inputs=[
    gr.Textbox("", label="Optional system prompt"),
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=1088,
        minimum=0,
        maximum=8192,
        step=64,
        interactive=True,
        info="The maximum numbers of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.90,
        minimum=0.0,
        maximum=1,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.2,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    )
]


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=0.4):
            gr.Image("better_banner.jpeg", elem_id="banner-image", show_label=False)
        with gr.Column():
            gr.Markdown(
                """
                # Falcon-180B x EDUHK AIDCEC 
                💓 “Falcon-180B x EDUHK AIDCEC" is built using Falcon LLM technology from the Technology Innovation Institute. 

                **Chat with [Falcon-180B-Chat](https://huggingface.co/tiiuae/falcon-180b-chat), brainstorm ideas, discuss your holiday plans, and more!**
                
                ✨ This is powered by [Falcon-180B](https://huggingface.co/tiiuae/falcon-180B) and finetuned on a mixture of [Ultrachat](https://huggingface.co/datasets/stingning/ultrachat), [Platypus](https://huggingface.co/datasets/garage-bAInd/Open-Platypus) and [Airoboros](https://huggingface.co/datasets/jondurbin/airoboros-2.1). [Falcon-180B](https://huggingface.co/tiiuae/falcon-180b) is a state-of-the-art large language model built by the [Technology Innovation Institute](https://www.tii.ae) in Abu Dhabi. 

                💁It is trained on 3.5 trillion tokens (including [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb)) and available under the [Falcon-180B TII License](https://huggingface.co/spaces/tiiuae/falcon-180b-license/blob/main/LICENSE.txt). 

                🧪 Falcon in the futuremay improved datasets.
                
                👀 **Learn more about Falcon LLM:** [falconllm.tii.ae](https://falconllm.tii.ae/)
                
                ➡️️ **Intended Use**: this demo is intended to showcase an early finetuning of [Falcon-180B](https://huggingface.co/tiiuae/falcon-180b), to illustrate the impact (and limitations) of finetuning on a dataset of conversations and instructions. They encourage the community to further build upon the base model, and to create even better instruct/chat versions!
                
                ⚠️ **Limitations**: the model can and will produce factually incorrect information, hallucinating facts and actions. As it has not undergone any advanced tuning/alignment, it can produce problematic outputs, especially if prompted to do so. Finally, this demo is limited to a session length of about 1,000 words.
                """
            )

    gr.ChatInterface(
        generate, 
        examples=EXAMPLES,
        additional_inputs=additional_inputs,
    ) 

demo.queue(concurrency_count=100, api_open=True).launch(show_api=True, auth=(USER_NAME, APP_PASSWORD))