import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

base_model = "TinyPixel/Llama-2-7B-bf16-sharded"
tuned_adapter = "newronai/llama-2-7b-QLoRA-Trial1"

# Alternative 4-bit quantization config (currently unused; 8-bit loading is used below).
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
# )

config = PeftConfig.from_pretrained(tuned_adapter)

# Load the base model in 8-bit and attach the QLoRA adapter on top of it.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    cache_dir="cache",
    load_in_8bit=True,
    # quantization_config=bnb_config,
)
model = PeftModel.from_pretrained(model, tuned_adapter)
print("Model Downloaded")

tokenizer = AutoTokenizer.from_pretrained(base_model, cache_dir="cache")
tokenizer.pad_token = tokenizer.eos_token
print("Tokenizer Ready")


def question_answer(context, question):
    # Combine the context paragraph and the question into a single prompt.
    prompt = f"{context}\n\nQuestion: {question}\nAnswer:"
    input_tokens = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
    output = model.generate(input_tokens)
    output_text = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    return output_text


gr.Interface(
    fn=question_answer,
    inputs=[
        gr.Textbox(lines=7, label="Context Paragraph"),
        gr.Textbox(lines=2, label="Question"),
    ],
    outputs=[gr.Textbox(label="Answer")],
).launch()