# app.py — Hugging Face Space by paavansundar (commit 0ed65fe, 3.65 kB).
# The lines above were HTML page chrome from the hub viewer, not code.
import gradio as gr
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
__checkpoint = "gpt2"
__tokenizer = GPT2Tokenizer.from_pretrained(__checkpoint)
__model = GPT2LMHeadModel.from_pretrained(__checkpoint)
__model_output_path = "gpt_model"
# Create a Data collator object
__data_collator = DataCollatorForLanguageModeling(tokenizer=__tokenizer, mlm=False, return_tensors="pt")
#prepare data
def prepareData():
df=pd.read_csv("MedQuAD.csv")
df['Question']=df['Question'].replace(r'^\s*$', np.nan, regex=True)
df['Answer']=df['Answer'].replace(r'^\s*$', np.nan, regex=True)
df = df.drop_duplicates(subset=['Question', 'Answer'])
df=df.dropna()
train_ds=df.groupby('Focus').head(100)
train_ds=train_ds.groupby('Focus').head(4).reset_index(drop=True)
test_ds=train_ds.groupby('Focus').head(1).reset_index(drop=True)
train_seq=list()
for i in range(len(train_ds)):
s='<question>'+train_ds.loc[i,'Question']+'<answer>'+train_ds.loc[i,'Answer']
train_seq.append(s)
val_seq=list()
for i in range(len(test_ds)):
s='<question>'+test_ds.loc[i,'Question']+'<answer>'+test_ds.loc[i,'Answer']
val_seq.append(s)
with open("train.txt", "w") as f:
f.writelines(line+'\n' for line in train_seq)
with open("val.txt", "w") as f:
f.writelines(line+'\n' for line in val_seq)
def fine_tune_gpt():
train_dataset = TextDataset(tokenizer=__tokenizer, file_path="train.txt", block_size=128)
val_dataset = TextDataset(tokenizer=__tokenizer, file_path="val.txt", block_size=128)
training_args = TrainingArguments(
output_dir = __model_output_path,
overwrite_output_dir = True,
per_device_train_batch_size = 2, # try with 2
per_device_eval_batch_size = 2, # try with 2
num_train_epochs = 0.01,
save_steps = 1_000,
save_total_limit = 2,
logging_dir = './logs',
)
# Train the model
trainer = Trainer(
model = __model,
args = training_args,
data_collator = __data_collator,
train_dataset = train_dataset,
eval_dataset = val_dataset,
)
trainer.train()
# Save the model
trainer.save_model(model_output_path)
# Save the tokenizer
__tokenizer.save_pretrained(model_output_path)
def queryGPT(question):
return generate_response(__model, __tokenizer, question)
def generate_response(model,tokenizer, prompt, max_length=200):
input_ids = tokenizer.encode(prompt, return_tensors="pt") # 'pt' for returning pytorch tensor
#my_model = GPT2LMHeadModel.from_pretrained(model_output_path)
#my_tokenizer = GPT2Tokenizer.from_pretrained(model_output_path)
# Create the attention mask and pad token id
attention_mask = torch.ones_like(input_ids)
pad_token_id = tokenizer.eos_token_id
output = model.generate(
input_ids,
max_length=max_length,
num_return_sequences=1,
attention_mask=attention_mask,
pad_token_id=pad_token_id
)
return tokenizer.decode(output[0], skip_special_tokens=True)
with gr.Blocks() as demo:
txt_input = gr.Textbox(label="Input Question", lines=2)
txt_output = gr.Textbox(value="", label="Answer")
btn = gr.Button(value="Submit")
btn.click(queryGPT, inputs=[txt_input], outputs=[txt_output])
if __name__ == "__main__":
#prepareData()
#fine_tune_gpt()
demo.launch()