"""Gradio playground for Python code completion with a small causal LM.

Loads the tokenizer and model once at import time, builds the Blocks UI
(prompt box, sampling controls, clickable examples) and launches the app.
"""

import gradio as gr
import torch
from gradio.themes.utils import sizes
from transformers import AutoModelForCausalLM, AutoTokenizer

import utils
from constants import END_OF_TEXT, MIN_TEMPERATURE

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(
    "BEE-spoke-data/smol_llama-101M-GQA-python",
    use_fast=False,
)
# NOTE(review): the second assignment takes precedence over the first if
# END_OF_TEXT is not the tokenizer's EOS token string - confirm they agree.
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = END_OF_TEXT

model = AutoModelForCausalLM.from_pretrained(
    "BEE-spoke-data/smol_llama-101M-GQA-python",
    device_map="auto",
)
# NOTE(review): `generate` is reached through the compiled wrapper; verify
# that the compiled forward pass is actually the one being executed.
model = torch.compile(model, mode="reduce-overhead")

# UI things
_styles = utils.get_file_as_string("styles.css")

# Loads ./README.md file & splits it into sections
readme_file_content = utils.get_file_as_string("README.md", path="./")
(
    manifest,
    description,
    disclaimer,
    base_model_info,
    formats,
) = utils.get_sections(readme_file_content, "---", up_to=5)

theme = gr.themes.Soft(
    primary_hue="yellow",
    secondary_hue="orange",
    neutral_hue="slate",
    radius_size=sizes.radius_sm,
    font=[
        gr.themes.GoogleFont("IBM Plex Sans", [400, 600]),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ],
    text_size=sizes.text_lg,
)


def run_inference(
    prompt: str,
    temperature: float,
    max_new_tokens: int,
    top_p: float,
    repetition_penalty: float,
) -> str:
    """Sample a completion for ``prompt`` and return the decoded text.

    Args:
        prompt: Source text that is tokenized and fed to the model.
        temperature: Sampling temperature; values below ``MIN_TEMPERATURE``
            are raised to ``MIN_TEMPERATURE``.
        max_new_tokens: Upper bound on the number of generated tokens.
        top_p: Nucleus-sampling probability mass.
        repetition_penalty: Penalty applied to repeated tokens.

    Returns:
        The first sequence returned by ``model.generate``, decoded with
        special tokens skipped.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        do_sample=True,
        epsilon_cutoff=1e-3,
        # UI sliders may deliver the value as a float; generate wants an int.
        max_new_tokens=int(max_new_tokens),
        min_new_tokens=2,
        no_repeat_ngram_size=6,
        renormalize_logits=True,
        repetition_penalty=repetition_penalty,
        temperature=max(temperature, MIN_TEMPERATURE),
        top_p=top_p,
    )
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return decoded[0]


# Default sampling settings shared by the sliders and every example row.
_DEFAULT_TEMPERATURE = 0.2
_DEFAULT_MAX_NEW_TOKENS = 64
_DEFAULT_TOP_P = 0.9
_DEFAULT_REPETITION_PENALTY = 1.2
_DEFAULT_SETTINGS = (
    _DEFAULT_TEMPERATURE,
    _DEFAULT_MAX_NEW_TOKENS,
    _DEFAULT_TOP_P,
    _DEFAULT_REPETITION_PENALTY,
)

_EXAMPLE_PROMPTS = [
    'def greet(name: str) -> None:\n """\n Greets the user\n """\n print(f"Hello,',
    'for i in range(5):\n """\n Loop through 0 to 4\n """\n print(i,',
    'x = 10\n"""Check if x is greater than 5"""\nif x > 5:',
    "def square(x: int) -> int:\n return",
    'import math\n"""Math operations"""\nmath.',
    'def is_even(n) -> bool:\n """\n Check if a number is even\n """\n if n % 2 == 0:',
    'while True:\n """Infinite loop example"""\n print("Infinite loop,',
    "def sum_list(lst: list[int]) -> int:\n total = 0\n for item in lst:",
    'try:\n """\n Exception handling\n """\n x = int(input("Enter a number: "))\nexcept ValueError:',
    'def divide(a: float, b: float) -> float:\n """\n Divide a by b\n """\n if b != 0:',
]

# Each row is [prompt, temperature, max_new_tokens, top_p, repetition_penalty],
# in the same order as run_inference's parameters.
examples = [[prompt, *_DEFAULT_SETTINGS] for prompt in _EXAMPLE_PROMPTS]

# Define the Gradio Blocks interface
with gr.Blocks(theme=theme, analytics_enabled=False, css=_styles) as demo:
    with gr.Column():
        gr.Markdown(description)
        with gr.Row():
            with gr.Column():
                instruction = gr.Textbox(
                    value=examples[0][0],
                    placeholder="Enter your code here",
                    label="Code",
                    elem_id="q-input",
                )
                submit = gr.Button("Generate", variant="primary")
                output = gr.Code(elem_id="q-output", language="python", lines=10)
                with gr.Row():
                    with gr.Column():
                        with gr.Accordion("Advanced settings", open=False):
                            with gr.Row():
                                column_1, column_2 = gr.Column(), gr.Column()
                                with column_1:
                                    temperature = gr.Slider(
                                        label="Temperature",
                                        value=_DEFAULT_TEMPERATURE,
                                        minimum=0.0,
                                        maximum=1.0,
                                        step=0.05,
                                        interactive=True,
                                        info="Higher values produce more diverse outputs",
                                    )
                                    max_new_tokens = gr.Slider(
                                        label="Max new tokens",
                                        value=_DEFAULT_MAX_NEW_TOKENS,
                                        minimum=32,
                                        maximum=512,
                                        step=32,
                                        interactive=True,
                                        info="Number of tokens to generate",
                                    )
                                with column_2:
                                    top_p = gr.Slider(
                                        label="Top-p (nucleus sampling)",
                                        value=_DEFAULT_TOP_P,
                                        minimum=0.0,
                                        maximum=1,
                                        step=0.05,
                                        interactive=True,
                                        info="Higher values sample more low-probability tokens",
                                    )
                                    repetition_penalty = gr.Slider(
                                        label="Repetition penalty",
                                        value=_DEFAULT_REPETITION_PENALTY,
                                        minimum=1.0,
                                        maximum=2.0,
                                        step=0.05,
                                        interactive=True,
                                        info="Penalize repeated tokens",
                                    )
                    with gr.Column():
                        # Informational only: not wired into any event handler.
                        version = gr.Dropdown(
                            [
                                "smol_llama-101M-GQA-python",
                            ],
                            value="smol_llama-101M-GQA-python",
                            label="Version",
                            info="",
                        )
                gr.Markdown(disclaimer)
                # The inputs mirror the five columns of each example row and
                # the five parameters of run_inference.
                gr.Examples(
                    examples=examples,
                    inputs=[
                        instruction,
                        temperature,
                        max_new_tokens,
                        top_p,
                        repetition_penalty,
                    ],
                    cache_examples=False,
                    fn=run_inference,
                    outputs=[output],
                )
                gr.Markdown(base_model_info)
                gr.Markdown(formats)

    submit.click(
        run_inference,
        inputs=[
            instruction,
            temperature,
            max_new_tokens,
            top_p,
            repetition_penalty,
        ],
        outputs=[output],
        show_progress=True,
    )

# NOTE: request queueing, i.e. `.queue(max_size=10, api_open=False)`, is
# currently disabled.
demo.launch(
    debug=True,
    show_api=False,
    share=utils.is_google_colab(),
)