import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from torch.optim import AdamW  # torch's AdamW; the AdamW formerly exported by transformers is deprecated
from torch.utils.data import Dataset, DataLoader

from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity

from huggingface_hub import create_repo, upload_folder
# Hugging Face upload target
HF_USERNAME = "aarath97"
HF_REPO = "mt5-dogri-translation"

# Model and training hyperparameters
MODEL_NAME = "google/mt5-large"
BATCH_SIZE = 2
LR = 1e-5
DPO_STEPS = 100        # steps of DPO-only training (phase 1)
HGRL_STEPS = 100       # steps of hyper-gamma reward learning (phase 2)
COMBINED_STEPS = 50    # steps of the combined objective (phase 3)
GAMMA = 3.5            # sharpness of the hyper-gamma reward
ALPHA = 0.5            # weight of the DPO term in the combined loss
BETA = 0.5             # weight of the HGRL term in the combined loss
# Training data: each row pairs a Dogri source with a preferred English reference and a
# dispreferred translation (the spreadsheet column is literally named "Unpreffered").
df = pd.read_excel("dogri_train.xlsx")
train_data = list(zip(df['Dogri'], df['English'], df['Unpreffered']))
# mT5 tokenizer for the translation model, plus a small sentence-embedding model used
# only to score similarity between generated and reference translations.
tokenizer = MT5Tokenizer.from_pretrained(MODEL_NAME)
sbert = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
sbert_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
def compute_similarity(sent1, sent2):
    """Cosine similarity between mean-pooled MiniLM embeddings of two sentences."""
    # Sentences are encoded one at a time (no padding), so mean pooling over all
    # token embeddings is a reasonable sentence representation here.
    with torch.no_grad():
        emb1 = sbert(**sbert_tokenizer(sent1, return_tensors='pt')).last_hidden_state.mean(1)
        emb2 = sbert(**sbert_tokenizer(sent2, return_tensors='pt')).last_hidden_state.mean(1)
    return cosine_similarity(emb1.numpy(), emb2.numpy())[0][0]
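# Example: compute_similarity("the cat sat on the mat", "a cat was sitting on the mat")
# returns a single float in [-1, 1]; values near 1 indicate near-paraphrases.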
def hyper_gamma_reward(rho):
    """Hyper-gamma reward: rho * exp(-GAMMA * (1 - rho)); sharply penalises low similarity."""
    return rho * np.exp(-GAMMA * (1 - rho))
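# For intuition, with GAMMA = 3.5 the reward decays quickly as similarity drops:
# hyper_gamma_reward(1.0) == 1.0, hyper_gamma_reward(0.8) ~= 0.40, hyper_gamma_reward(0.5) ~= 0.09.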
class DogriDataset(Dataset):
    """Yields (dogri_source, preferred_english, dispreferred_english) tuples."""

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
# Default collation yields three sequences of strings per batch; zip(*batch) below unpacks them row-wise.
dataloader = DataLoader(DogriDataset(train_data), batch_size=BATCH_SIZE, shuffle=True)
model = MT5ForConditionalGeneration.from_pretrained(MODEL_NAME).to("cuda")
optimizer = AdamW(model.parameters(), lr=LR)

dpo_losses, hgrl_losses, final_losses = [], [], []
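# Note: the loop below is a simplified, reference-free DPO variant. Standard DPO compares the
# policy's log-probabilities against a frozen reference model; here only the current policy's
# sequence NLLs are used, which keeps memory modest at the cost of fidelity to the original
# formulation.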
# Phase 1: DPO-style preference optimisation on (preferred, dispreferred) translation pairs.
# model(...).loss is the mean token negative log-likelihood (NLL) of the label sequence,
# so the objective rewards a lower NLL for the reference than for the dispreferred output.
for step in range(DPO_STEPS):
    batch = next(iter(dataloader))  # shuffle=True, so this draws a fresh random batch each step
    loss_batch = []
    for src, ref, unpref in zip(*batch):
        input_ids = tokenizer(src, return_tensors='pt', truncation=True, padding=True).input_ids.to("cuda")
        ref_ids = tokenizer(ref, return_tensors='pt', truncation=True, padding=True).input_ids.to("cuda")
        unpref_ids = tokenizer(unpref, return_tensors='pt', truncation=True, padding=True).input_ids.to("cuda")

        ref_nll = model(input_ids=input_ids, labels=ref_ids).loss
        unpref_nll = model(input_ids=input_ids, labels=unpref_ids).loss

        # Keep the difference in the computation graph; detaching it (e.g. via .item())
        # would leave nothing for backward() to differentiate.
        beta = 1.0
        logit_diff = unpref_nll - ref_nll
        loss = -torch.log(torch.sigmoid(beta * logit_diff))
        loss_batch.append(loss)

    loss_val = torch.stack(loss_batch).mean()
    loss_val.backward()
    optimizer.step()
    optimizer.zero_grad()
    dpo_losses.append(loss_val.item())
# Phase 2: hyper-gamma reward learning (HGRL).
# Generate a translation, score its similarity to the reference with SBERT, map the
# similarity through the hyper-gamma reward, and minimise the reward-weighted NLL of
# the generated sequence (a REINFORCE-style self-training update).
for step in range(HGRL_STEPS):
    batch = next(iter(dataloader))
    loss_batch = []
    for src, ref, _ in zip(*batch):
        input_ids = tokenizer(src, return_tensors='pt').input_ids.to("cuda")
        gen_ids = model.generate(input_ids)
        gen_text = tokenizer.decode(gen_ids[0], skip_special_tokens=True)

        rho = compute_similarity(gen_text, ref)
        reward = float(hyper_gamma_reward(rho))

        # model(...).loss is the mean token NLL of the generated text; weighting it by the
        # reward and minimising raises the likelihood of high-reward generations the most.
        labels = tokenizer(gen_text, return_tensors='pt').input_ids.to("cuda")
        nll = model(input_ids=input_ids, labels=labels).loss
        loss = reward * nll
        loss_batch.append(loss)

    loss_val = torch.stack(loss_batch).mean()
    loss_val.backward()
    optimizer.step()
    optimizer.zero_grad()
    hgrl_losses.append(loss_val.item())
# Phase 3: combined objective, ALPHA * DPO loss + BETA * HGRL loss.
for step in range(COMBINED_STEPS):
    batch = next(iter(dataloader))
    loss_dpo_batch, loss_hgrl_batch = [], []
    for src, ref, unpref in zip(*batch):
        input_ids = tokenizer(src, return_tensors='pt').input_ids.to("cuda")
        ref_ids = tokenizer(ref, return_tensors='pt').input_ids.to("cuda")
        unpref_ids = tokenizer(unpref, return_tensors='pt').input_ids.to("cuda")

        # DPO term (kept in the graph so gradients flow through both forward passes)
        ref_nll = model(input_ids=input_ids, labels=ref_ids).loss
        unpref_nll = model(input_ids=input_ids, labels=unpref_ids).loss
        dpo_loss = -torch.log(torch.sigmoid(unpref_nll - ref_nll))
        loss_dpo_batch.append(dpo_loss)

        # HGRL term on the model's own generation, as in phase 2
        gen_ids = model.generate(input_ids)
        gen_text = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
        rho = compute_similarity(gen_text, ref)
        reward = float(hyper_gamma_reward(rho))

        labels = tokenizer(gen_text, return_tensors='pt').input_ids.to("cuda")
        nll = model(input_ids=input_ids, labels=labels).loss
        hgrl_loss = reward * nll
        loss_hgrl_batch.append(hgrl_loss)

    loss_dpo_mean = torch.stack(loss_dpo_batch).mean()
    loss_hgrl_mean = torch.stack(loss_hgrl_batch).mean()
    combined_loss = ALPHA * loss_dpo_mean + BETA * loss_hgrl_mean
    combined_loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    final_losses.append(combined_loss.item())
plt.plot(dpo_losses, label="DPO")
plt.plot(hgrl_losses, label="HGRL")
plt.plot(final_losses, label="Combined")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.legend()
plt.savefig("loss_curve.png")
with open("loss_report.txt", "w") as f:
    f.write("DPO Final Loss: {:.4f}\n".format(dpo_losses[-1]))
    f.write("HGRL Final Loss: {:.4f}\n".format(hgrl_losses[-1]))
    f.write("Combined Final Loss: {:.4f}\n".format(final_losses[-1]))
# Translate the evaluation sheet; the first column is taken to be the Dogri source text.
test_df = pd.read_excel("in22conv.xlsx")
test_outputs = []
for line in test_df.iloc[:, 0].tolist():
    input_ids = tokenizer(line, return_tensors='pt').input_ids.to("cuda")
    outputs = model.generate(input_ids)
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    test_outputs.append(translation)

output_df = pd.DataFrame({"Dogri": test_df.iloc[:, 0], "English": test_outputs})
output_df.to_excel("translated_output.xlsx", index=False)
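# Note: model.generate() above relies on the model's default decoding settings (greedy search
# with a short default maximum length). For longer sentences it may help to pass, for example,
# max_new_tokens=128 and num_beams=4; those values are illustrative, not tuned.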
# Save locally and push to the Hub. This assumes you are already authenticated,
# e.g. via `huggingface-cli login` or huggingface_hub.notebook_login().
model.save_pretrained("mt5-dogri")
tokenizer.save_pretrained("mt5-dogri")
create_repo(f"{HF_USERNAME}/{HF_REPO}", private=False, exist_ok=True)
upload_folder(repo_id=f"{HF_USERNAME}/{HF_REPO}", folder_path="mt5-dogri")
print("Model uploaded successfully!")
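# Once uploaded, the model can be pulled straight back from the Hub (repo id matches the
# constants above; adjust if you rename the repo):
#   model = MT5ForConditionalGeneration.from_pretrained("aarath97/mt5-dogri-translation")
#   tokenizer = MT5Tokenizer.from_pretrained("aarath97/mt5-dogri-translation")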