import os

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, TextStreamer

token = os.environ["HUGGINGFACEHUB_API_TOKEN"]

model_id = 'Deci/DeciLM-7B-instruct'

SYSTEM_PROMPT_TEMPLATE = """### System:
You are an AI assistant that follows instruction extremely well. Help as much as you can.
### User:
{instruction}
### Assistant:
"""

DESCRIPTION = """

# πŸ€– DeciLM-7B-Instruct: A Fast Instruction-Tuned Model πŸ’¨

Welcome to DeciLM-7B-Instruct! DeciLM-7B-Instruct is a 7B-parameter instruction-tuned language model released under the Apache 2.0 license. It is instruction-tuned rather than chat-tuned: prompt it with an instruction that describes a task, and it will respond with a completion of that task.
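
Under the hood, the demo wraps whatever you type in the prompt template used throughout this file (`SYSTEM_PROMPT_TEMPLATE`), so you only need to supply the instruction itself:

```
### System:
You are an AI assistant that follows instruction extremely well. Help as much as you can.
### User:
{instruction}
### Assistant:
```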

Learn more about the base model DeciLM-7B.

Experience the speed of DeciLM-7B + Infery. Check out the demo πŸ‘‰πŸ½ here.

""" bnb_config = BitsAndBytesConfig( load_in_4bit = True, bnb_4bit_compute_dtype=torch.bfloat16 ) if not torch.cuda.is_available(): DESCRIPTION += 'You need a GPU for this example. Try using colab: ' if torch.cuda.is_available(): model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", trust_remote_code=True, quantization_config=bnb_config ) else: model = None tokenizer = AutoTokenizer.from_pretrained(model_id, token=token) tokenizer.pad_token = tokenizer.eos_token # Function to construct the prompt using the new system prompt template def get_prompt_with_template(message: str) -> str: return SYSTEM_PROMPT_TEMPLATE.format(instruction=message) # Function to generate the model's response def generate_model_response(message: str) -> str: prompt = get_prompt_with_template(message) inputs = tokenizer(prompt, return_tensors='pt') streamer = TextStreamer(tokenizer) if torch.cuda.is_available(): inputs = inputs.to('cuda') # Include **generate_kwargs to include the user-defined options output = model.generate(**inputs, max_new_tokens=4096, do_sample=True, temperature=0.1, streamer=streamer ) return tokenizer.decode(output[0], skip_special_tokens=True) def extract_response_content(full_response: str) -> str: response_start_index = full_response.find("### Assistant:") if response_start_index != -1: return full_response[response_start_index + len("### Assistant:"):].strip() else: return full_response def get_response_with_template(message: str) -> str: full_response = generate_model_response(message) return extract_response_content(full_response) with gr.Blocks(css="style.css") as demo: gr.Markdown(DESCRIPTION) gr.DuplicateButton(value='Duplicate Space for private use', elem_id='duplicate-button') with gr.Group(): chatbot = gr.Textbox(label='DeciLM-7B-Instruct Output:') with gr.Row(): textbox = gr.Textbox( container=False, show_label=False, placeholder='Type an instruction...', scale=10, elem_id="textbox" ) submit_button = gr.Button( 'πŸ’¬ Submit', variant='primary', scale=1, min_width=0, elem_id="submit_button" ) # Clear button to clear the chat history clear_button = gr.Button( 'πŸ—‘οΈ Clear', variant='secondary', ) clear_button.click( fn=lambda: ('',''), outputs=[textbox, chatbot], queue=False, api_name=False, ) submit_button.click( fn=get_response_with_template, inputs=textbox, outputs= chatbot, queue=False, api_name=False, ) gr.Examples( examples=[ 'Write detailed instructions for making chocolate chip pancakes.', 'Write a 250-word article about your love of pancakes.', 'Explain the plot of Back to the Future in three sentences.', 'How do I make a trap beat?', 'A step-by-step guide to learning Python in one month.', ], inputs=textbox, outputs=chatbot, fn=get_response_with_template, cache_examples=True, elem_id="examples" ) gr.HTML(label="Keep in touch", value="Keep in touch") demo.launch()