"""Gradio chat demo that streams replies from the zephyr-7b-alpha model."""

from huggingface_hub import InferenceClient
import gradio as gr

client = InferenceClient("HuggingFaceH4/zephyr-7b-alpha")


def format_prompt(message, history):
    """Build the Zephyr-style prompt for the next assistant reply.

    Args:
        message: The new user message to answer.
        history: Iterable of ``(user_prompt, bot_response)`` pairs for the
            earlier turns of the conversation.

    Returns:
        A single string made of the system instruction, every earlier
        user/assistant turn, the new user message, and a trailing
        ``<|assistant|>`` header so the model writes the reply itself.
    """
    system = "<|system|>When asked a question, answer only the question. Do no elaborate, or add on. Just answer the question in one to two sentences. You sentences should be at the 5th or 6th grade level.\n"
    # The system instruction was previously built but never sent; it now
    # opens the prompt.
    parts = [system]
    for user_prompt, bot_response in history:
        parts.append(f"<|user|>\n{user_prompt}\n")
        parts.append(f"<|assistant|>\n{bot_response}\n")
    parts.append(f"<|user|>\n{message}\n")
    # Open the assistant turn so generation starts inside the reply rather
    # than having the model emit the role tag as visible text.
    parts.append("<|assistant|>\n")
    return "".join(parts)


def generate(
    prompt,
    history,
    temperature=0.9,
    max_new_tokens=500,
    top_p=0.95,
    repetition_penalty=1.0,
):
    """Stream the model's reply to ``prompt``, yielding the text so far.

    Args:
        prompt: The new user message.
        history: Earlier ``(user, assistant)`` turns, passed to
            ``format_prompt``.
        temperature: Sampling temperature; values below 0.01 are raised
            to 0.01.
        max_new_tokens: Upper bound on generated tokens; values below 1
            are raised to 1.
        top_p: Nucleus-sampling probability mass.
        repetition_penalty: Penalty applied to repeated tokens.

    Yields:
        The accumulated reply text after each streamed token.
    """
    # The sliders allow 0 for both of these; clamp to the smallest usable value.
    temperature = max(float(temperature), 1e-2)
    max_new_tokens = max(int(max_new_tokens), 1)
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    formatted_prompt = format_prompt(prompt, history)
    stream = client.text_generation(
        formatted_prompt,
        **generate_kwargs,
        stream=True,
        details=True,
        return_full_text=False,
    )

    output = ""
    for response in stream:
        # Special tokens (e.g. the end-of-sequence marker) are control
        # symbols, not part of the reply shown to the user.
        if response.token.special:
            continue
        output += response.token.text
        yield output


additional_inputs = [
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=256,
        minimum=0,
        maximum=1048,
        step=64,
        interactive=True,
        info="The maximum numbers of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.90,
        minimum=0.0,
        maximum=1,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.2,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    ),
]

css = """
  #mkd {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
  }
"""

with gr.Blocks(css=css) as inf:
    # NOTE(review): the header markup was missing from the source (the string
    # literals were split across lines and did not parse); the heading tags
    # below are a reconstruction - confirm against the intended layout.
    gr.HTML("<h1><center>zephyr-7b-alpha</center></h1>")
    gr.HTML("<h3><center>In this demo, you can chat with zephyr-7b-alpha model. 💬</center></h3>")
    gr.ChatInterface(
        generate,
        additional_inputs=additional_inputs,
        examples=[["Can squirrel swims?"], ["Write a poem about squirrel."]],
    )

inf.queue().launch()