# ArmelR's picture
# Update app.py
# 572b214
# raw
# history blame
# No virus
# 9.78 kB
import json
import os
import re
import shutil
import requests
import warnings
import gradio as gr
from huggingface_hub import Repository
from text_generation import Client
from share_btn import community_icon_html, loading_icon_html, share_js, share_btn_css
# Access token for the hosted inference endpoints; None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN", None)

# One inference endpoint per selectable model version.
API_URL_G = "https://api-inference.huggingface.co/models/ArmelR/starcoder-gradio-v0"
API_URL_S = "https://api-inference.huggingface.co/models/HuggingFaceH4/starcoderbase-finetuned-oasst1"

# System prompts that can be prepended to a request. The files are read as
# UTF-8 explicitly so the result does not depend on the host's locale.
with open("./HHH_prompt_short.txt", "r", encoding="utf-8") as f:
    HHH_PROMPT = f.read() + "\n\n"
with open("./TA_prompt_v0.txt", "r", encoding="utf-8") as f:
    TA_PROMPT = f.read()
NO_PROMPT = ""

# Fill-in-the-middle marker strings.
FIM_PREFIX = "<fim_prefix>"
FIM_MIDDLE = "<fim_middle>"
FIM_SUFFIX = "<fim_suffix>"
FIM_INDICATOR = "<FILL_HERE>"
# Markdown help text rendered below the chat widgets.
# The model-card link carries an explicit scheme: a bare "hf.co/..." target is
# treated by Markdown renderers as a path relative to the current page.
FORMATS = """
# Chat mode
Chat mode prepends the custom [TA prompt](https://huggingface.co/spaces/bigcode/chat-playground/blob/main/TA_prompt_v0.txt) or the [HHH prompt](https://gist.github.com/jareddk/2509330f8ef3d787fc5aaac67aab5f11#file-hhh_prompt-txt) from Anthropic to the request which conditions the model to serve as an assistant.
⚠️ **Intended Use**: this app and its [supporting model](https://huggingface.co/bigcode) are provided for demonstration purposes; not to serve as replacement for human expertise. For more details on the model's limitations in terms of factuality and biases, see the [model card.](https://hf.co/bigcode)
"""
# Font preference list handed to the theme: a Google font first, then generic
# CSS font families.
_theme_fonts = [
    gr.themes.GoogleFont("Open Sans"),
    "ui-sans-serif",
    "system-ui",
    "sans-serif",
]

# Monochrome Gradio theme with custom hues, small corner radius and the font
# list above.
theme = gr.themes.Monochrome(
    font=_theme_fonts,
    radius_size=gr.themes.sizes.radius_sm,
    neutral_hue="slate",
    secondary_hue="blue",
    primary_hue="indigo",
)
# Only attach an Authorization header when a token is actually configured;
# formatting a missing token would send the literal string "Bearer None".
_auth_headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else None

# One text-generation client per model endpoint.
client_g = Client(API_URL_G, headers=_auth_headers)
client_s = Client(API_URL_S, headers=_auth_headers)
def wrap_html_code(text):
    """Wrap ``text`` in triple backticks when it contains a tag-like ``<...>`` span.

    The text is returned unchanged when no ``<`` ... ``>`` pair is found on a
    single line (the pattern's ``.`` does not match newlines).
    """
    has_tag = re.search(r"<.*?>", text) is not None
    return f"```{text}```" if has_tag else text
def generate(
    prompt,
    temperature=0.9,
    max_new_tokens=256,
    top_p=0.95,
    repetition_penalty=1.0,
    chat_mode="TA prompt",
    version="StarCoder-gradio",
):
    """Stream a completion for ``prompt`` and return the collected text.

    Args:
        prompt: Conversation text to complete.
        temperature: Sampling temperature; values below 0.01 are raised to 0.01.
        max_new_tokens: Forwarded to the client as ``max_new_tokens``.
        top_p: Forwarded to the client as ``top_p`` (converted to float).
        repetition_penalty: Forwarded to the client as ``repetition_penalty``.
        chat_mode: ``"HHH prompt"`` or ``"TA prompt"`` selects the matching
            system prompt; any other value prepends nothing.
        version: ``"StarCoder-gradio"`` or ``"StarChat-alpha"``; selects the
            client and the answer cue appended to the prompt.

    Returns:
        The stripped generated text, passed through ``wrap_html_code``.

    Raises:
        ValueError: If ``version`` is not one of the supported versions.
    """
    # Reject unknown versions before doing any work; previously the exception
    # object was created but never raised, which led to an unbound `stream`.
    if version not in ("StarCoder-gradio", "StarChat-alpha"):
        raise ValueError("Unsupported version of the Coding assistant")

    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        truncate=7500,
        do_sample=True,
        seed=42,
        stop_sequences=["\nHuman", "\n-----", "Question:", "Answer:"],
    )

    if chat_mode == "HHH prompt":
        base_prompt = HHH_PROMPT
    elif chat_mode == "TA prompt":
        base_prompt = TA_PROMPT
    else:
        base_prompt = NO_PROMPT

    if version == "StarCoder-gradio":
        prompt = base_prompt + prompt + "\n\nAnswer:"
        print("PROMPT : " + str(prompt))
        stream = client_g.generate_stream(prompt, **generate_kwargs)
    else:  # "StarChat-alpha" (validated above)
        prompt = base_prompt + prompt + "\n\nAssistant:"
        stream = client_s.generate_stream(prompt, **generate_kwargs)

    output = ""
    previous_token = ""
    for response in stream:
        token_text = response.token.text
        # Stop when a new speaker turn begins (a marker token right after a
        # newline or separator) or when an end-of-text token arrives.
        starts_new_turn = (
            token_text in ["Human", "-----", "Question:"]
            and previous_token in ["\n", "-----"]
        )
        if starts_new_turn or token_text in ["<|endoftext|>", "<|end|>"]:
            print("OUT = " + str(output))
            return wrap_html_code(output.strip())
        output += token_text
        previous_token = token_text

    print("Output = " + str(output))
    return wrap_html_code(output.strip())
# chatbot mode
def user(user_message, history):
    """Queue the user's message as a new, still unanswered chat turn.

    Returns an empty string (the new textbox value) and a new history list
    with ``[user_message, None]`` appended; ``history`` itself is not mutated.
    """
    pending_turn = [user_message, None]
    extended_history = history + [pending_turn]
    return "", extended_history
def bot(
    history,
    temperature=0.9,
    max_new_tokens=256,
    top_p=0.95,
    repetition_penalty=1.0,
    chat_mode=None,
    version="StarChat-alpha",
):
    """Generate the reply for the last (unanswered) turn in ``history``.

    Args:
        history: List of ``[user_message, answer]`` pairs; the last pair's
            answer slot is filled in by this call.
        temperature, max_new_tokens, top_p, repetition_penalty, chat_mode:
            Forwarded unchanged to ``generate``.
        version: Model version. ``"StarCoder-gradio"`` formats turns as
            ``Question:``/``Answer:``; anything else uses
            ``Human:``/``Assistant:``. The default names a version that
            ``generate`` accepts.

    Returns:
        ``history`` with the last answer set to the generated message. An
        empty history is returned unchanged.
    """
    # Nothing to answer; avoids indexing into an empty history.
    if not history:
        return history

    if version == "StarCoder-gradio":
        user_tag, bot_tag = "Question", "Answer"
    else:
        user_tag, bot_tag = "Human", "Assistant"

    # Concatenate the earlier prompt/answer pairs; for the last turn, whose
    # answer is still empty, only the prompt is added.
    past_turns = [
        f"{user_tag}: {question}\n\n{bot_tag}: {answer}"
        for question, answer in history[:-1]
    ]
    prompt = "\n".join(past_turns + [f"\n{user_tag}: {history[-1][0]}"])

    bot_message = generate(
        prompt,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        chat_mode=chat_mode,
        version=version,
    )
    history[-1][1] = bot_message
    return history
# Sample prompts: code stubs to complete plus one plain-language question.
examples = [
    "def print_hello_world():",
    "def fibonacci(n):",
    "class TransformerDecoder(nn.Module):",
    "class ComplexNumbers:",
    "How to install gradio",
]
def process_example(args):
    """Run ``generate`` on an example prompt and return the full generated text.

    ``generate`` returns a complete string, so it is returned directly;
    iterating over it would walk the string character by character and yield
    only the final character.
    """
    return generate(args)
# Hide elements carrying the `generating` class and append the share-button styles.
css = ".generating {visibility: hidden}" + share_btn_css

with gr.Blocks(theme=theme, analytics_enabled=False, css=css) as demo:
    with gr.Column():
        # The heading needs a space after "#" to be parsed as a Markdown heading.
        gr.Markdown(
            """\
# Gradio Assistant powered by ‍💫 StarCoder
_Note:_ this is an internal chat playground - **please do not share**. The deployment can also change and thus the space not work as we continue development.\
"""
        )
        with gr.Row():
            column_1, column_2 = gr.Column(scale=3), gr.Column(scale=1)

            # Right-hand column: generation settings.
            with column_2:
                chat_mode = gr.Dropdown(
                    ["NO prompt", "TA prompt", "HHH prompt"],
                    value="NO prompt",
                    label="Chat mode",
                    info="Use Anthropic's HHH prompt or our custom tech prompt to turn the model into an assistant.",
                )
                temperature = gr.Slider(
                    label="Temperature",
                    value=0.2,
                    minimum=0.0,
                    maximum=2.0,
                    step=0.1,
                    interactive=True,
                    info="Higher values produce more diverse outputs",
                )
                max_new_tokens = gr.Slider(
                    label="Max new tokens",
                    value=512,
                    minimum=0,
                    maximum=8192,
                    step=64,
                    interactive=True,
                    info="The maximum numbers of new tokens",
                )
                top_p = gr.Slider(
                    label="Top-p (nucleus sampling)",
                    value=0.95,
                    minimum=0.0,
                    maximum=1,
                    step=0.05,
                    interactive=True,
                    info="Higher values sample more low-probability tokens",
                )
                repetition_penalty = gr.Slider(
                    label="Repetition penalty",
                    value=1.2,
                    minimum=1.0,
                    maximum=2.0,
                    step=0.05,
                    interactive=True,
                    info="Penalize repeated tokens",
                )
                version = gr.Dropdown(
                    ["StarCoder-gradio", "StarChat-alpha"],
                    value="StarCoder-gradio",
                    label="Version",
                    info="",
                )

            # Left-hand column: conversation, prompt box and action buttons.
            with column_1:
                # output = gr.Code(elem_id="q-output")
                # add visibl=False and update if chat_mode True
                chatbot = gr.Chatbot()
                instruction = gr.Textbox(
                    placeholder="Enter your prompt here",
                    label="Prompt",
                    elem_id="q-input",
                )
                with gr.Row():
                    with gr.Column():
                        clear = gr.Button("Clear Chat")
                    with gr.Column():
                        submit = gr.Button("Generate", variant="primary")
                with gr.Group(elem_id="share-btn-container"):
                    community_icon = gr.HTML(community_icon_html, visible=True)
                    loading_icon = gr.HTML(loading_icon_html, visible=True)
                    share_button = gr.Button(
                        "Share to community", elem_id="share-btn", visible=True
                    )

        # examples of non-chat mode
        # gr.Examples(
        #     examples=examples,
        #     inputs=[instruction],
        #     cache_examples=False,
        #     fn=process_example,
        #     outputs=[output],
        # )
        gr.Markdown(FORMATS)

    # Pressing Enter in the textbox and clicking "Generate" run the same
    # two-step chain: record the user turn, then fill in the bot's answer.
    generation_inputs = [
        chatbot,
        temperature,
        max_new_tokens,
        top_p,
        repetition_penalty,
        chat_mode,
        version,
    ]
    for trigger in (instruction.submit, submit.click):
        trigger(
            user, [instruction, chatbot], [instruction, chatbot], queue=False
        ).then(
            bot,
            generation_inputs,
            chatbot,
        )

    # Clearing sets the chatbot value to None.
    clear.click(lambda: None, None, chatbot, queue=False)
    share_button.click(None, [], [], _js=share_js)

demo.queue(concurrency_count=16).launch(debug=True)