import re

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Alternative: load the hosted model directly as a Gradio app.
# gr.load("models/kirankunapuli/Gemma-2B-Hinglish-LORA-v1.0").launch()

# Load the fine-tuned Gemma 2B Hinglish model and its tokenizer from the Hugging Face Hub.
tokenizer = AutoTokenizer.from_pretrained("kirankunapuli/Gemma-2B-Hinglish-LORA-v1.0")
model = AutoModelForCausalLM.from_pretrained("kirankunapuli/Gemma-2B-Hinglish-LORA-v1.0")

# Run on GPU if one is available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)
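
# Optional memory saving (an assumption, not part of the original script): on a GPU the
# model could instead be loaded in half precision, e.g.
# model = AutoModelForCausalLM.from_pretrained(
#     "kirankunapuli/Gemma-2B-Hinglish-LORA-v1.0", torch_dtype=torch.float16
# )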

# Alpaca-style prompt template used during fine-tuning; the Response field is left
# empty so the model completes it at inference time.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


def get_response(input_text: str) -> str:
    """Generate a model response for the given input text using the Alpaca prompt."""
    inputs = tokenizer(
        [
            alpaca_prompt.format(
                "Please answer the following sentence as requested",  # instruction
                input_text,  # input
                "",  # output - leave this blank for generation!
            )
        ],
        return_tensors="pt",
    ).to(device)
    outputs = model.generate(**inputs, max_new_tokens=256)
    output = tokenizer.batch_decode(outputs)[0]
    # Keep only the text between "### Response:" and the <eos> token.
    response_pattern = re.compile(r"### Response:\n(.*?)<eos>", re.DOTALL)
    response_match = response_pattern.search(output)
    if response_match:
        response = response_match.group(1).strip()
        return response
    else:
        return "Response not found"

# Build a simple text-in / text-out Gradio UI around the generation function.
interface = gr.Interface(
    fn=get_response,
    inputs="text",
    outputs="text",
    title="Gemma Hinglish Model Inference",
)
interface.launch()
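
# To expose a temporary public link when running locally (optional Gradio feature):
# interface.launch(share=True)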