import torch
from torch.utils.data import DataLoader, Dataset
from transformers import MarianMTModel, MarianTokenizer

# Select a device before the model is moved to it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define dataset class
class TranslationDataset(Dataset):
    def __init__(self, source_sentences, target_sentences, tokenizer, max_length=128):
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.source_sentences)

    def __getitem__(self, idx):
        source_text = self.source_sentences[idx]
        target_text = self.target_sentences[idx]
        # Pad every example to the same fixed length so the default collate
        # function can stack them into a batch, and pass the target via
        # text_target= so it is tokenized with the decoder-side vocabulary
        encoding = self.tokenizer(
            source_text,
            text_target=target_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        labels = encoding['labels'].squeeze(0)
        # Replace padding token ids with -100 so they are ignored by the loss
        labels[labels == self.tokenizer.pad_token_id] = -100
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': labels,
        }

# Define training function; the model computes the cross-entropy loss
# internally when labels are passed, so no separate criterion is needed
def train(model, dataloader, optimizer, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch + 1}, Loss: {total_loss / len(dataloader):.4f}')

# Load tokenizer and model
tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-fr')
model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-fr').to(device)

# Prepare dataset and dataloader (source_sentences and target_sentences are
# parallel lists of English and French strings supplied by the caller)
dataset = TranslationDataset(source_sentences, target_sentences, tokenizer)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Train the model
train(model, dataloader, optimizer, num_epochs=10)

# Save the trained model
torch.save(model.state_dict(), 'translation_model.pth')
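
# A minimal inference sketch, assuming the weights saved above are reloaded
# into a fresh MarianMTModel. The file name 'translation_model.pth' matches
# the save call; the translate() helper and the example sentence are
# illustrative, not part of the original script.
model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-fr')
model.load_state_dict(torch.load('translation_model.pth', map_location=device))
model.to(device)
model.eval()

def translate(text):
    # Tokenize the source sentence and generate a translation with beam search
    inputs = tokenizer(text, return_tensors='pt', truncation=True).to(device)
    with torch.no_grad():
        generated = model.generate(**inputs, max_length=128, num_beams=4)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

print(translate('The weather is nice today.'))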