|
import gradio as gr |
|
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
import torch |
|
from peft import PeftModel, PeftConfig |
|
base_model = "TinyPixel/Llama-2-7B-bf16-sharded" |
|
tuned_adapter = "newronai/llama-2-7b-QLoRA-Trial1" |
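# base_model is the sharded bf16 Llama-2 checkpoint; tuned_adapter holds the QLoRA weights fine-tuned on top of it.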
|
config = PeftConfig.from_pretrained(tuned_adapter)  # adapter metadata (records which base model it was trained on)
|
# Assumed QLoRA-style loading: quantize the base model to 4-bit so the LoRA
# adapter fits on a single GPU (BitsAndBytesConfig is imported above).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    cache_dir="cache",
)
|
|
|
model = PeftModel.from_pretrained(model, tuned_adapter) |
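# model now combines the quantized base weights with the LoRA adapter weights.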
|
print("Model Downloaded") |
|
|
|
tokenizer = AutoTokenizer.from_pretrained(base_model, cache_dir="cache")
|
tokenizer.pad_token = tokenizer.eos_token |
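# Llama-2 defines no pad token, so the EOS token is reused for padding.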
|
print("Tokenizer Ready") |
|
|
|
def question_answer(context, question):
    # Build a simple prompt from the context paragraph and the question
    # (the exact template is an assumption; match it to the adapter's training format).
    prompt = f"{context}\n\nQuestion: {question}\nAnswer:"
    input_tokens = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    output = model.generate(input_tokens, max_new_tokens=200)  # cap answer length
    output_text = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    return output_text
|
|
|
|
|
gr.Interface(
    fn=question_answer,
    inputs=[
        gr.Textbox(lines=7, label="Context Paragraph"),
        gr.Textbox(lines=2, label="Question"),
    ],
    outputs=gr.Textbox(label="Answer"),
).launch()
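# launch() serves the app locally; share=True can be passed to launch() for a temporary public URL.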