# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM, LlamaConfig
# import gradio as gr

# # Model IDs from Hugging Face Hub
# base_model_id = "HuggingFaceTB/SmolLM2-135M"
# instruct_model_id = "MaxBlumenfeld/smollm2-135m-bootleg-instruct-01"

# # Load tokenizer
# base_tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# # Load models with explicit LLaMA architecture
# base_model = LlamaForCausalLM.from_pretrained(base_model_id)
# instruct_model = LlamaForCausalLM.from_pretrained(instruct_model_id)

# def generate_response(model, tokenizer, message, temperature=0.5, max_length=200, system_prompt="", is_instruct=False):
#     # Prepare input based on model type
#     if is_instruct:
#         if system_prompt:
#             full_prompt = f"{system_prompt}\n\nHuman: {message}\nAssistant:"
#         else:
#             full_prompt = f"Human: {message}\nAssistant:"
#     else:
#         # For base model, use simpler prompt format
#         full_prompt = message
#
#     inputs = tokenizer(full_prompt, return_tensors="pt")
#
#     with torch.no_grad():
#         outputs = model.generate(
#             inputs.input_ids,
#             max_length=max_length,
#             do_sample=True,
#             temperature=temperature,
#             top_k=50,
#             top_p=0.95,
#             num_return_sequences=1,
#             pad_token_id=tokenizer.eos_token_id  # Add padding token
#         )
#
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#
#     if is_instruct:
#         try:
#             response = response.split("Assistant:")[-1].strip()
#         except:
#             pass
#     else:
#         response = response[len(full_prompt):].strip()
#
#     return response

# def chat(message, temperature, max_length, system_prompt):
#     # Generate responses from both models
#     base_response = generate_response(
#         base_model,
#         base_tokenizer,
#         message,
#         temperature,
#         max_length,
#         system_prompt,
#         is_instruct=False
#     )
#     instruct_response = generate_response(
#         instruct_model,
#         base_tokenizer,
#         message,
#         temperature,
#         max_length,
#         system_prompt,
#         is_instruct=True
#     )
#     return base_response, instruct_response

# # Create Gradio interface
# with gr.Blocks() as demo:
#     gr.Markdown("# SmolLM2-135M Comparison Demo")
#     gr.Markdown("Compare responses between base and fine-tuned versions of SmolLM2-135M")
#
#     with gr.Row():
#         with gr.Column():
#             message_input = gr.Textbox(label="Input Message")
#             system_prompt = gr.Textbox(
#                 label="System Prompt (Optional)",
#                 placeholder="Set context or personality for the model",
#                 lines=3
#             )
#         with gr.Column():
#             temperature = gr.Slider(
#                 minimum=0.1,
#                 maximum=2.0,
#                 value=0.5,
#                 label="Temperature"
#             )
#             max_length = gr.Slider(
#                 minimum=50,
#                 maximum=500,
#                 value=200,
#                 step=10,
#                 label="Max Length"
#             )
#
#     with gr.Row():
#         with gr.Column():
#             gr.Markdown("### Base Model Response")
#             base_output = gr.Textbox(label="Base Model (SmolLM2-135M)", lines=5)
#         with gr.Column():
#             gr.Markdown("### Bootleg Instruct Model Response")
#             instruct_output = gr.Textbox(label="Fine-tuned Model", lines=5)
#
#     submit_btn = gr.Button("Generate Responses")
#
#     submit_btn.click(
#         fn=chat,
#         inputs=[message_input, temperature, max_length, system_prompt],
#         outputs=[base_output, instruct_output]
#     )

# if __name__ == "__main__":
#     demo.launch()
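
# NOTE: The commented-out block above is an earlier version of this app that
# compared the base SmolLM2-135M model and the fine-tuned checkpoint side by
# side. The active script below serves only the fine-tuned model.
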
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import gradio as gr

# model_id = "HuggingFaceTB/SmolLM2-135M"
model_id = "MaxBlumenfeld/smollm2-135m-bootleg-instruct04"

# Load the tokenizer and the fine-tuned ("bootleg instruct") checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
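
# Optional sketch (an assumption, not part of the original app): move the model
# to a GPU when one is available. The tokenized inputs in generate_response
# would then also need to be moved to the same device before calling generate.
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = model.to(device)
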
def generate_response(message, temperature=0.7, max_length=200):
    # Wrap the user message in the Human/Assistant template used for fine-tuning
    prompt = f"Human: {message}\nAssistant:"
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=int(max_length),  # Gradio sliders pass floats
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode the full sequence and keep only the text after "Assistant:"
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.split("Assistant:")[-1].strip()
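
# Example usage outside the UI (a minimal sketch; the prompt text below is
# illustrative and not part of the original app):
#     generate_response("What is the capital of France?", temperature=0.7, max_length=120)
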
# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# SmolLM2 Bootleg Instruct Chat")

    with gr.Row():
        with gr.Column():
            message = gr.Textbox(label="Message")
            temp = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, label="Temperature")
            max_len = gr.Slider(minimum=50, maximum=500, value=200, label="Max Length")
            submit = gr.Button("Send")
        with gr.Column():
            output = gr.Textbox(label="Response")

    submit.click(
        generate_response,
        inputs=[message, temp, max_len],
        outputs=output
    )

if __name__ == "__main__":
    demo.launch()
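
# To expose a temporary public URL (a standard Gradio option, not used in the
# original app), launch with demo.launch(share=True) instead.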