# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# from peft import PeftModel, PeftConfig

# # Load tokenizer
# tokenizer = AutoTokenizer.from_pretrained(".")

# # Load base model with quantization
# bnb_config = BitsAndBytesConfig(load_in_4bit=True)
# base_model = AutoModelForCausalLM.from_pretrained(
#     "unsloth/Meta-Llama-3.1-8B-bnb-4bit",  # same base you fine-tuned
#     quantization_config=bnb_config,
#     device_map="auto"
# )

# # Load LoRA adapters
# model = PeftModel.from_pretrained(base_model, ".")

# # Create Gradio Interface
# def generate_response(prompt):
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#     outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)

# gr.Interface(
#     fn=generate_response,
#     inputs=gr.Textbox(label="Enter your instruction"),
#     outputs=gr.Textbox(label="Model response"),
#     title="LLaMA 3 - Fine-tuned Model"
# ).launch()

import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(".")

# Load base model (non-quantized); should match the base model used for fine-tuning
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",  # standard non-quantized base model
    device_map="auto"
)

# Load LoRA adapters
model = PeftModel.from_pretrained(base_model, ".")

# Create Gradio Interface
def generate_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Enter your instruction"),
    outputs=gr.Textbox(label="Model response"),
    title="LLaMA 3 - Fine-tuned Model"
).launch()
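
# Usage sketch (assumptions: this script sits in the same directory as the exported
# tokenizer and LoRA adapter files, and the base model above matches the one used
# during fine-tuning; the script name "app.py" below is illustrative):
#   pip install gradio transformers peft accelerate
#   python app.py
# Gradio then prints a local URL serving the instruction/response textboxes.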