"""Gradio chat demo for the SmolLM2-135M "bootleg instruct" fine-tune.

Importing this module loads the tokenizer and model named by ``model_id``
and builds the ``demo`` Blocks UI; running it as a script launches the UI.
"""

# ---------------------------------------------------------------------------
# Previous version, kept for reference (disabled): side-by-side comparison of
# the base model and the fine-tuned model.
# ---------------------------------------------------------------------------
# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM, LlamaConfig
# import gradio as gr
#
# # Model IDs from Hugging Face Hub
# base_model_id = "HuggingFaceTB/SmolLM2-135M"
# instruct_model_id = "MaxBlumenfeld/smollm2-135m-bootleg-instruct-01"
#
# # Load tokenizer
# base_tokenizer = AutoTokenizer.from_pretrained(base_model_id)
#
# # Load models with explicit LLaMA architecture
# base_model = LlamaForCausalLM.from_pretrained(base_model_id)
# instruct_model = LlamaForCausalLM.from_pretrained(instruct_model_id)
#
# def generate_response(model, tokenizer, message, temperature=0.5, max_length=200, system_prompt="", is_instruct=False):
#     # Prepare input based on model type
#     if is_instruct:
#         if system_prompt:
#             full_prompt = f"{system_prompt}\n\nHuman: {message}\nAssistant:"
#         else:
#             full_prompt = f"Human: {message}\nAssistant:"
#     else:
#         # For base model, use simpler prompt format
#         full_prompt = message
#
#     inputs = tokenizer(full_prompt, return_tensors="pt")
#
#     with torch.no_grad():
#         outputs = model.generate(
#             inputs.input_ids,
#             max_length=max_length,
#             do_sample=True,
#             temperature=temperature,
#             top_k=50,
#             top_p=0.95,
#             num_return_sequences=1,
#             pad_token_id=tokenizer.eos_token_id  # Add padding token
#         )
#
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#
#     if is_instruct:
#         try:
#             response = response.split("Assistant:")[-1].strip()
#         except:
#             pass
#     else:
#         response = response[len(full_prompt):].strip()
#
#     return response
#
# def chat(message, temperature, max_length, system_prompt):
#     # Generate responses from both models
#     base_response = generate_response(
#         base_model,
#         base_tokenizer,
#         message,
#         temperature,
#         max_length,
#         system_prompt,
#         is_instruct=False
#     )
#
#     instruct_response = generate_response(
#         instruct_model,
#         base_tokenizer,
#         message,
#         temperature,
#         max_length,
#         system_prompt,
#         is_instruct=True
#     )
#
#     return base_response, instruct_response
#
# # Create Gradio interface
# with gr.Blocks() as demo:
#     gr.Markdown("# SmolLM2-135M Comparison Demo")
#     gr.Markdown("Compare responses between base and fine-tuned versions of SmolLM2-135M")
#
#     with gr.Row():
#         with gr.Column():
#             message_input = gr.Textbox(label="Input Message")
#             system_prompt = gr.Textbox(
#                 label="System Prompt (Optional)",
#                 placeholder="Set context or personality for the model",
#                 lines=3
#             )
#
#         with gr.Column():
#             temperature = gr.Slider(
#                 minimum=0.1,
#                 maximum=2.0,
#                 value=0.5,
#                 label="Temperature"
#             )
#             max_length = gr.Slider(
#                 minimum=50,
#                 maximum=500,
#                 value=200,
#                 step=10,
#                 label="Max Length"
#             )
#
#     with gr.Row():
#         with gr.Column():
#             gr.Markdown("### Base Model Response")
#             base_output = gr.Textbox(label="Base Model (SmolLM2-135M)", lines=5)
#
#         with gr.Column():
#             gr.Markdown("### Bootleg Instruct Model Response")
#             instruct_output = gr.Textbox(label="Fine-tuned Model", lines=5)
#
#     submit_btn = gr.Button("Generate Responses")
#     submit_btn.click(
#         fn=chat,
#         inputs=[message_input, temperature, max_length, system_prompt],
#         outputs=[base_output, instruct_output]
#     )
#
# if __name__ == "__main__":
#     demo.launch()
# ---------------------------------------------------------------------------

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import gradio as gr

# model_id = "HuggingFaceTB/SmolLM2-135M"
model_id = "MaxBlumenfeld/smollm2-135m-bootleg-instruct01"

# Loaded once at import time and shared by every request.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)


def generate_response(message, temperature=0.7, max_length=200):
    """Generate the model's reply to a single user message.

    The message is wrapped in the ``Human: ...\\nAssistant:`` prompt format
    and the model samples a continuation of that prompt.

    Args:
        message: The user's text. Empty or whitespace-only input returns
            an empty string without running the model.
        temperature: Sampling temperature. Coerced to ``float`` because UI
            sliders may deliver whole numbers as ``int``.
        max_length: Maximum total sequence length in tokens, passed to
            ``model.generate`` as ``max_length`` (this budget includes the
            prompt tokens, not only the reply). Coerced to ``int``.

    Returns:
        The generated reply text with surrounding whitespace stripped.
    """
    if not message or not message.strip():
        return ""

    prompt = f"Human: {message}\nAssistant:"
    inputs = tokenizer(prompt, return_tensors="pt")
    prompt_token_count = inputs.input_ids.shape[1]

    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            # Pass the mask explicitly: pad_token_id is aliased to EOS below,
            # so padding cannot be told apart from real EOS tokens otherwise.
            attention_mask=inputs.attention_mask,
            max_length=int(max_length),
            temperature=float(temperature),
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the tokens produced after the prompt. Splitting the full
    # decoded text on "Assistant:" would drop the beginning of the reply
    # whenever the model itself emits that marker.
    reply_tokens = outputs[0][prompt_token_count:]
    reply = tokenizer.decode(reply_tokens, skip_special_tokens=True)
    return reply.strip()


# UI: inputs (message, sampling controls, send button) in the first column,
# the model's response in the second.
with gr.Blocks() as demo:
    gr.Markdown("# SmolLM2 Bootleg Instruct Chat")

    with gr.Row():
        with gr.Column():
            message = gr.Textbox(label="Message")
            temp = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, label="Temperature")
            max_len = gr.Slider(minimum=50, maximum=500, value=200, label="Max Length")
            submit = gr.Button("Send")

        with gr.Column():
            output = gr.Textbox(label="Response")

    submit.click(
        generate_response,
        inputs=[message, temp, max_len],
        outputs=output
    )

if __name__ == "__main__":
    demo.launch()