import json
import os
import re
import shutil
import requests
import warnings

import gradio as gr
from huggingface_hub import Repository
from text_generation import Client

from share_btn import community_icon_html, loading_icon_html, share_js, share_btn_css

HF_TOKEN = os.environ.get("HF_TOKEN", None)
API_URL_G = "https://api-inference.huggingface.co/models/ArmelR/starcoder-gradio-v0"
API_URL_S = "https://api-inference.huggingface.co/models/HuggingFaceH4/starcoderbase-finetuned-oasst1"

with open("./HHH_prompt_short.txt", "r") as f:
    HHH_PROMPT = f.read() + "\n\n"

with open("./TA_prompt_v0.txt", "r") as f:
    TA_PROMPT = f.read()

NO_PROMPT = ""

# StarCoder's fill-in-the-middle special tokens (unused in chat mode).
FIM_PREFIX = "<fim_prefix>"
FIM_MIDDLE = "<fim_middle>"
FIM_SUFFIX = "<fim_suffix>"
FIM_INDICATOR = "<FILL_HERE>"

FORMATS = """
# Chat mode

Chat mode prepends the custom [TA prompt](https://huggingface.co/spaces/bigcode/chat-playground/blob/main/TA_prompt_v0.txt) or the [HHH prompt](https://gist.github.com/jareddk/2509330f8ef3d787fc5aaac67aab5f11#file-hhh_prompt-txt) from Anthropic to the request, which conditions the model to act as an assistant.

⚠️ **Intended Use**: this app and its [supporting model](https://huggingface.co/bigcode) are provided for demonstration purposes only, not as a replacement for human expertise. For more details on the model's limitations with respect to factuality and biases, see the [model card](https://hf.co/bigcode).
"""

theme = gr.themes.Monochrome(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
    radius_size=gr.themes.sizes.radius_sm,
    font=[
        gr.themes.GoogleFont("Open Sans"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ],
)

client_g = Client(
    API_URL_G,
    headers={"Authorization": f"Bearer {HF_TOKEN}"},
)
client_s = Client(
    API_URL_S,
    headers={"Authorization": f"Bearer {HF_TOKEN}"},
)


def wrap_html_code(text):
    # Wrap outputs that contain HTML-like tags in a code fence so the chat UI
    # renders them literally instead of as markup.
    pattern = r"<.*?>"
    matches = re.findall(pattern, text)
    if len(matches) > 0:
        return f"```{text}```"
    else:
        return text


def generate(
    prompt,
    temperature=0.9,
    max_new_tokens=256,
    top_p=0.95,
    repetition_penalty=1.0,
    chat_mode="TA prompt",
    version="StarCoder-gradio",
):
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)
    fim_mode = False

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        truncate=7500,
        do_sample=True,
        seed=42,
        stop_sequences=["\nHuman", "\n-----", "Question:", "Answer:"],
    )

    if chat_mode == "HHH prompt":
        base_prompt = HHH_PROMPT
    elif chat_mode == "TA prompt":
        base_prompt = TA_PROMPT
    else:
        base_prompt = NO_PROMPT

    if version == "StarCoder-gradio":
        chat_prompt = prompt + "\n\nAnswer:"
        prompt = base_prompt + chat_prompt
        print("PROMPT : " + str(prompt))
        stream = client_g.generate_stream(prompt, **generate_kwargs)
    elif version == "StarChat-alpha":
        chat_prompt = prompt + "\n\nAssistant:"
        prompt = base_prompt + chat_prompt
        stream = client_s.generate_stream(prompt, **generate_kwargs)
    else:
        raise ValueError("Unsupported version of the coding assistant")

    output = ""
    previous_token = ""
    for response in stream:
        # Stop when the model starts a new turn or emits an end-of-text token.
        if (
            response.token.text in ["Human", "-----", "Question:"]
            and previous_token in ["\n", "-----"]
        ) or response.token.text in ["<|endoftext|>", "<|end|>"]:
            print("OUT = " + str(output))
            return wrap_html_code(output.strip())
        else:
            output += response.token.text
        previous_token = response.token.text
    print("Output = " + str(output))
    return wrap_html_code(output.strip())
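# Illustrative direct call (hypothetical argument values; in the app these are
# supplied by the UI controls defined below):
#
#     answer = generate(
#         "Question: How do I add a slider to a Gradio app?",
#         temperature=0.2,
#         max_new_tokens=512,
#         chat_mode="NO prompt",
#         version="StarCoder-gradio",
#     )
#
# With version="StarCoder-gradio" the prompt is suffixed with "\n\nAnswer:" and
# streaming stops at "<|endoftext|>"/"<|end|>" or at a new "Question:" turn.
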
# chatbot mode
def user(user_message, history):
    return "", history + [[user_message, None]]


def bot(
    history,
    temperature=0.9,
    max_new_tokens=256,
    top_p=0.95,
    repetition_penalty=1.0,
    chat_mode=None,
    version="StarChat-alpha",
):
    # Concatenate the history of prompts and answers, except for the last
    # (still unanswered) turn, for which only the prompt is added.
    if version == "StarCoder-gradio":
        prompt = "\n".join(
            [f"Question: {prompt}\n\nAnswer: {answer}" for prompt, answer in history[:-1]]
            + [f"\nQuestion: {history[-1][0]}"]
        )
    else:
        prompt = "\n".join(
            [f"Human: {prompt}\n\nAssistant: {answer}" for prompt, answer in history[:-1]]
            + [f"\nHuman: {history[-1][0]}"]
        )

    bot_message = generate(
        prompt,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        chat_mode=chat_mode,
        version=version,
    )
    history[-1][1] = bot_message
    return history
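# For a two-turn history such as
#     [["How to install gradio", "pip install gradio"], ["And upgrade it?", None]]
# the StarCoder-gradio branch above assembles (illustrative, not verbatim):
#
#     Question: How to install gradio
#
#     Answer: pip install gradio
#
#     Question: And upgrade it?
#
# generate() then appends "\n\nAnswer:" before querying the endpoint.
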
examples = [
    "def print_hello_world():",
    "def fibonacci(n):",
    "class TransformerDecoder(nn.Module):",
    "class ComplexNumbers:",
    "How to install gradio",
]


def process_example(args):
    # generate() returns the finished answer directly.
    return generate(args)


css = ".generating {visibility: hidden}" + share_btn_css

with gr.Blocks(theme=theme, analytics_enabled=False, css=css) as demo:
    with gr.Column():
        gr.Markdown(
            """\
# Gradio Assistant powered by 💫 StarCoder

_Note:_ this is an internal chat playground - **please do not share**. The deployment may also change as we continue development, so the space can temporarily stop working.\
"""
        )
        with gr.Row():
            column_1, column_2 = gr.Column(scale=3), gr.Column(scale=1)
            with column_2:
                chat_mode = gr.Dropdown(
                    ["NO prompt", "TA prompt", "HHH prompt"],
                    value="NO prompt",
                    label="Chat mode",
                    info="Use Anthropic's HHH prompt or our custom tech prompt to turn the model into an assistant.",
                )
                temperature = gr.Slider(
                    label="Temperature",
                    value=0.2,
                    minimum=0.0,
                    maximum=2.0,
                    step=0.1,
                    interactive=True,
                    info="Higher values produce more diverse outputs",
                )
                max_new_tokens = gr.Slider(
                    label="Max new tokens",
                    value=512,
                    minimum=0,
                    maximum=8192,
                    step=64,
                    interactive=True,
                    info="The maximum number of new tokens to generate",
                )
                top_p = gr.Slider(
                    label="Top-p (nucleus sampling)",
                    value=0.95,
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    interactive=True,
                    info="Higher values sample more low-probability tokens",
                )
                repetition_penalty = gr.Slider(
                    label="Repetition penalty",
                    value=1.2,
                    minimum=1.0,
                    maximum=2.0,
                    step=0.05,
                    interactive=True,
                    info="Penalize repeated tokens",
                )
                version = gr.Dropdown(
                    ["StarCoder-gradio", "StarChat-alpha"],
                    value="StarCoder-gradio",
                    label="Version",
                    info="",
                )
            with column_1:
                # output = gr.Code(elem_id="q-output")
                # add visible=False and update when chat_mode is True
                chatbot = gr.Chatbot()
                instruction = gr.Textbox(
                    placeholder="Enter your prompt here",
                    label="Prompt",
                    elem_id="q-input",
                )
                with gr.Row():
                    with gr.Column():
                        clear = gr.Button("Clear Chat")
                    with gr.Column():
                        submit = gr.Button("Generate", variant="primary")
                with gr.Group(elem_id="share-btn-container"):
                    community_icon = gr.HTML(community_icon_html, visible=True)
                    loading_icon = gr.HTML(loading_icon_html, visible=True)
                    share_button = gr.Button(
                        "Share to community", elem_id="share-btn", visible=True
                    )
                # examples for non-chat mode
                # gr.Examples(
                #     examples=examples,
                #     inputs=[instruction],
                #     cache_examples=False,
                #     fn=process_example,
                #     outputs=[output],
                # )
        gr.Markdown(FORMATS)

    instruction.submit(
        user, [instruction, chatbot], [instruction, chatbot], queue=False
    ).then(
        bot,
        [chatbot, temperature, max_new_tokens, top_p, repetition_penalty, chat_mode, version],
        chatbot,
    )
    submit.click(
        user, [instruction, chatbot], [instruction, chatbot], queue=False
    ).then(
        bot,
        [chatbot, temperature, max_new_tokens, top_p, repetition_penalty, chat_mode, version],
        chatbot,
    )
    clear.click(lambda: None, None, chatbot, queue=False)
    share_button.click(None, [], [], _js=share_js)

demo.queue(concurrency_count=16).launch(debug=True)
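# To try the app locally (assumes this file is saved as app.py, that the two
# prompt files sit next to it, and that HF_TOKEN grants access to both
# inference endpoints; the token value below is hypothetical):
#
#     HF_TOKEN=hf_xxx python app.py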