In [65]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForMaskedLM, AdamW
import pandas as pd
from tqdm import tqdm

class GrammarCorrectionDataset(Dataset):
 def __init__(self, sentences, corrected_sentences, tokenizer, max_length=128):
 self.sentences = sentences
 self.corrected_sentences = corrected_sentences
 self.tokenizer = tokenizer
 self.max_length = max_length

 def __len__(self):
 return len(self.sentences)
 
 def __getitem__(self, idx):
 input_sentence = self.sentences[idx]
 corrected_sentence = self.corrected_sentences[idx]

 # Tokenize the input and corrected sentences separately
 inputs = self.tokenizer(
 [input_sentence], # Pass as a list
 [corrected_sentence], # Pass as a list
 return_tensors="pt",
 padding="max_length",
 truncation=True,
 max_length=self.max_length
 )

 return {
 "input_ids": inputs["input_ids"].flatten(),
 "attention_mask": inputs["attention_mask"].flatten(),
 "labels": inputs["input_ids"].flatten() # Use input_ids as labels for MLM
 }



def pad_collate(batch):
 # Find the length of the longest sentence in the batch
 max_len = max(len(batch_item["input_ids"]) for batch_item in batch)
 
 # Pad each input to the length of the longest sentence in the batch
 for batch_item in batch:
 input_ids = batch_item["input_ids"]
 attention_mask = batch_item["attention_mask"]
 labels = batch_item["labels"]
 
 padded_input_ids = torch.nn.functional.pad(input_ids, (0, max_len - len(input_ids)), value=tokenizer.pad_token_id)
 padded_attention_mask = torch.nn.functional.pad(attention_mask, (0, max_len - len(attention_mask)), value=0)
 padded_labels = torch.nn.functional.pad(labels, (0, max_len - len(labels)), value=tokenizer.pad_token_id)
 
 batch_item["input_ids"] = padded_input_ids
 batch_item["attention_mask"] = padded_attention_mask
 batch_item["labels"] = padded_labels
 
 return {
 "input_ids": torch.stack([batch_item["input_ids"] for batch_item in batch]),
 "attention_mask": torch.stack([batch_item["attention_mask"] for batch_item in batch]),
 "labels": torch.stack([batch_item["labels"] for batch_item in batch])
 }


data = pd.read_csv(r'D:\Thesis\test_bert_data.csv')
data = data.dropna()
sentences = [str(i) for i in data['wrong'].values] # Convert Series to list
corrected_sentences = [str(i) for i in data['right1'].values]

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("jcblaise/roberta-tagalog-base")
model = AutoModelForMaskedLM.from_pretrained("jcblaise/roberta-tagalog-base")

# Create dataset and dataloader
dataset = GrammarCorrectionDataset(sentences, corrected_sentences, tokenizer)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=pad_collate)


# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3

model.train()
for epoch in range(num_epochs):
 tqdm_dataloader = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch")
 for batch in tqdm_dataloader:
 input_ids = batch["input_ids"]
 attention_mask = batch["attention_mask"]
 labels = batch["labels"]

 outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
 loss = outputs.loss

 optimizer.zero_grad()
 loss.backward()
 optimizer.step()

 # Update tqdm progress bar description with current loss
 tqdm_dataloader.set_postfix({"Loss": loss.item()})

 # Close the tqdm progress bar for the epoch
 tqdm_dataloader.close()

# Save the fine-tuned model
#model.save_pretrained("fine_tuned_model")


Epoch 1/3: 100%|██████████| 1293/1293 [35:14<00:00, 1.64s/batch, Loss=0.000204]
Epoch 2/3: 100%|██████████| 1293/1293 [35:04<00:00, 1.63s/batch, Loss=0.000154]
Epoch 3/3: 100%|██████████| 1293/1293 [34:39<00:00, 1.61s/batch, Loss=7.12e-5] 


In [66]:
model.save_pretrained("fine_tuned_model")

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load the fine-tuned model
model = AutoModelForMaskedLM.from_pretrained("fine_tuned_model")

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("fine_tuned_model")

# Example new data
new_data = [
 "Takbo takbo ang mga bata sa labas"
]

# Tokenize the new data
tokenized_data = tokenizer(new_data, return_tensors="pt", padding=True, truncation=True)

# Pass the tokenized data through the model to get predictions
with torch.no_grad():
 outputs = model(**tokenized_data)

# Decode the predicted token IDs back to text
predicted_texts = tokenizer.batch_decode(outputs.logits.argmax(dim=-1))

# Print the original sentences and their corrected versions
for original, corrected in zip(new_data, predicted_texts):
 print(f"Original: {original}")
 print(f"Corrected: {corrected}")
 print()


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Original: Takbo takbo ang mga bata sa labas
Corrected: Takbo ng takbo ang mga bata sa labas

