# -*- coding: utf-8 -*-
"""
Created on Thu May 19 13:22:32 2022
@author: UTKARSH
"""
import glob
import os

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer,
    AutoModelForSeq2SeqLM,
    AutoTokenizer
)

from src.clean import clean_license_text
from src.read_data import read_license_summary_data

MODEL_PATH = "models/"
MODEL_FILENAME = "t5-base.model"
MODEL_NAME = "t5-base"
TOKENIZER = None
TEXT_MAX_TOKEN_LEN = 512
SUMMARY_MAX_TOKEN_LEN = 128
N_EPOCHS = 1
BATCH_SIZE = 1
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class LicenseSummaryDataset(Dataset):
    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: T5Tokenizer,
        text_max_token_len: int = 512,
        summary_max_token_len: int = 128
    ):
        self.tokenizer = tokenizer
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]
        text = data_row["text"]
        text_encoding = self.tokenizer(
            text,
            max_length=self.text_max_token_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )
        summary_encoding = self.tokenizer(
            data_row["summary"],
            max_length=self.summary_max_token_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )
        labels = summary_encoding["input_ids"]
        # Replace pad token ids (0 for T5) with -100 so they are ignored
        # by the cross-entropy loss.
        labels[labels == 0] = -100
        return dict(
            text=text,
            summary=data_row["summary"],
            text_input_ids=text_encoding["input_ids"].flatten(),
            text_attention_mask=text_encoding["attention_mask"].flatten(),
            labels=labels.flatten(),
            labels_attention_mask=summary_encoding["attention_mask"].flatten()
        )
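

# Minimal usage sketch for the dataset (assumes a DataFrame with "text" and
# "summary" columns; the variable names below are illustrative only):
#
#     tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
#     dataset = LicenseSummaryDataset(license_df, tokenizer)
#     item = dataset[0]  # dict with text_input_ids, labels, attention masks, ...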

def prepare_dataloaders():
    """
    Helper function to load the data and create batched DataLoaders.
    Also sets the module-level TOKENIZER.

    Returns
    -------
    train_dataloader : DataLoader
        Training DataLoader.
    dev_dataloader : DataLoader
        Validation DataLoader.
    """
    global TOKENIZER
    license_summary_data = pd.DataFrame(read_license_summary_data())
    train_df, dev_df = train_test_split(license_summary_data, test_size=0.1)
    TOKENIZER = T5Tokenizer.from_pretrained(MODEL_NAME)
    train_dataset = LicenseSummaryDataset(
        train_df,
        TOKENIZER,
        TEXT_MAX_TOKEN_LEN,
        SUMMARY_MAX_TOKEN_LEN
    )
    dev_dataset = LicenseSummaryDataset(
        dev_df,
        TOKENIZER,
        TEXT_MAX_TOKEN_LEN,
        SUMMARY_MAX_TOKEN_LEN
    )
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=0
    )
    dev_dataloader = DataLoader(
        dev_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=0
    )
    return train_dataloader, dev_dataloader
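

# Sketch: inspect one batch from the training DataLoader (shapes shown are
# for the default settings above):
#
#     train_dl, dev_dl = prepare_dataloaders()
#     batch = next(iter(train_dl))
#     batch["text_input_ids"].shape  # (BATCH_SIZE, TEXT_MAX_TOKEN_LEN)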

def train(epoch, model, dataloader, optimizer):
    """
    Runs one training epoch of the given model over the given data.

    Parameters
    ----------
    epoch : int
        The epoch number being trained (used for logging only).
    model : T5ForConditionalGeneration
        The summarizer model to train.
    dataloader : torch.utils.data.DataLoader
        The dataloader on which the model is to be trained.
    optimizer : transformers.AdamW
        The optimizer used to update the weights during training.
    """
    model.train()
    total_train_loss = 0
    for batch in tqdm(dataloader):
        model.zero_grad()
        input_ids = batch["text_input_ids"].to(device, dtype=torch.long)
        attention_mask = batch["text_attention_mask"].to(device, dtype=torch.long)
        labels = batch["labels"].to(device, dtype=torch.long)
        labels_attention_mask = batch["labels_attention_mask"].to(device, dtype=torch.long)
        model_output = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=labels_attention_mask,
            labels=labels
        )
        loss = model_output.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = total_train_loss / len(dataloader)
    print(f"Epoch {epoch}: Training loss: {avg_train_loss}")

def train_and_save_model(train_dataloader, PATH):
    """
    Trains a summarizer model on the given DataLoader and saves it at the
    given path.

    Parameters
    ----------
    train_dataloader : DataLoader
        Batched training DataLoader.
    PATH : str
        Path where the trained model is to be saved.

    Returns
    -------
    model : T5ForConditionalGeneration
        Trained model.
    """
    model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True).to(device)
    optimizer = AdamW(model.parameters(), lr=3e-5)
    for epoch in range(1, N_EPOCHS + 1):
        train(epoch, model, train_dataloader, optimizer)
    torch.save(model.state_dict(), PATH)
    print("Model saved!")
    return model
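

# Sketch: train and save a model locally instead of loading the published one
# from HuggingFace (uses the paths configured at the top of this module):
#
#     train_dl, _ = prepare_dataloaders()
#     model = train_and_save_model(train_dl, MODEL_PATH + MODEL_FILENAME)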

def summarize_text_with_model(text, model, tokenizer):
    """
    Summarizes License text using the given trained T5 model. The text is
    cleaned with clean_license_text before being tokenized.

    Parameters
    ----------
    text : str
        The License text to be summarized.
    model : T5ForConditionalGeneration
        The trained model used to summarize the text.
    tokenizer : Tokenizer
        The tokenizer used to tokenize the text for the model.

    Returns
    -------
    summary : str
        Summary of the License text from the given model.
    definitions : str
        Definitions extracted from the License text.
    """
    text, definitions, _ = clean_license_text(text)
    text_encoding = tokenizer(
        text,
        max_length=TEXT_MAX_TOKEN_LEN,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt"
    )
    generated_ids = model.generate(
        input_ids=text_encoding["input_ids"].to(device, dtype=torch.long),
        attention_mask=text_encoding["attention_mask"].to(device, dtype=torch.long),
        max_length=SUMMARY_MAX_TOKEN_LEN,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True
    )
    preds = [
        tokenizer.decode(
            gen_id,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        ) for gen_id in generated_ids
    ]
    return "".join(preds), definitions

def summarize(text, load_from_huggingface=True):
    """
    Summarizes the given License text.

    Parameters
    ----------
    text : str
        Raw License text (cleaned internally before summarization).
    load_from_huggingface : bool
        Toggles whether or not to load the model from HuggingFace. If set to
        False, this will load or train the model locally.

    Returns
    -------
    summary : str
        Summary of the License text.
    definitions : str
        Definitions extracted from the License text.
    """
    global TOKENIZER
    if load_from_huggingface:
        print("Loading model from HuggingFace...")
        CUSTOM_MODEL_NAME = "utkarshsaboo45/ClearlyDefinedLicenseSummarizer"
        model = AutoModelForSeq2SeqLM.from_pretrained(CUSTOM_MODEL_NAME).to(device)
        tokenizer = AutoTokenizer.from_pretrained(CUSTOM_MODEL_NAME)
    else:
        if os.path.exists(MODEL_PATH + MODEL_FILENAME):
            print("Loading model...")
            model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True).to(device)
            TOKENIZER = T5Tokenizer.from_pretrained(MODEL_NAME)
            model.load_state_dict(torch.load(MODEL_PATH + MODEL_FILENAME))
            model.eval()
        else:
            print("Training model...")
            if not os.path.exists(MODEL_PATH):
                os.makedirs(MODEL_PATH)
            # prepare_dataloaders() also sets the global TOKENIZER.
            train_dataloader, _ = prepare_dataloaders()
            model = train_and_save_model(train_dataloader, MODEL_PATH + MODEL_FILENAME)
        tokenizer = TOKENIZER
    summary, definitions = summarize_text_with_model(text, model, tokenizer)
    return summary, definitions

def summarize_license_files(path):
    """
    Summarizes the License .txt files in the given directory and saves each
    summary next to its source file as a "__summary.txt" file.

    Parameters
    ----------
    path : str
        Path of the directory containing the License files.
    """
    paths = glob.glob(os.path.join(path, "*.txt"))
    for license_path in paths:
        # Skip summary files generated by a previous run.
        if license_path.endswith("__summary.txt"):
            continue
        with open(license_path, "r", encoding="utf-8") as f:
            summary, _ = summarize(f.read())
        with open(license_path.replace(".txt", "") + "__summary.txt", "w", encoding="utf-8") as f:
            f.write(summary)
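

if __name__ == "__main__":
    # Minimal demo sketch: summarize a short illustrative snippet (not a
    # real license) using the model published on HuggingFace.
    sample_text = (
        "Permission is hereby granted, free of charge, to any person "
        "obtaining a copy of this software, to deal in the Software "
        "without restriction, subject to the following conditions."
    )
    summary, definitions = summarize(sample_text)
    print("Summary:", summary)
    print("Definitions:", definitions)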