"""Fine-tune the pre-trained MarianMT English-to-Hindi model on a local TSV dataset.

Expects a tab-separated file with 'english' and 'hindi' columns. Fine-tunes
Helsinki-NLP/opus-mt-en-hi with Seq2SeqTrainer, evaluates on a held-out split,
then translates one sample sentence.
"""

import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    DataCollatorForSeq2Seq,
    MarianMTModel,
    MarianTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Token cap shared by source, target, and inference inputs.
MAX_LENGTH = 128

# ---------------------------------------------------------------- dataset ---
file_path = "hindi_dataset.tsv"  # Update with your actual file path
data = pd.read_csv(file_path, delimiter="\t")

# Convert to a Hugging Face Dataset and carve out a 20% test split.
# A fixed seed makes the train/test partition reproducible across runs.
split_dataset = Dataset.from_pandas(data).train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict(
    {
        "train": split_dataset["train"],
        "test": split_dataset["test"],
    }
)

# ---------------------------------------------------- tokenizer and model ---
model_name = "Helsinki-NLP/opus-mt-en-hi"  # Pre-trained English-to-Hindi model
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize source ('english') and target ('hindi') text for seq2seq training.

    Deliberately applies NO padding here: DataCollatorForSeq2Seq pads each
    batch dynamically and pads labels with -100, which the loss ignores.
    (Padding labels to max_length at this stage would leave real pad-token
    ids in the labels, so the model would be trained to predict padding.)

    Passing text_target= produces the 'labels' field directly and replaces
    the deprecated tokenizer.as_target_tokenizer() context manager.
    """
    return tokenizer(
        examples["english"],
        text_target=examples["hindi"],
        truncation=True,
        max_length=MAX_LENGTH,
    )


# Drop the raw text columns so the tokenized dataset holds only model inputs.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# ----------------------------------------------------------------- training ---
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,           # keep only the 3 most recent checkpoints
    predict_with_generate=True,   # use generate() for eval metrics
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
)

# Dynamic per-batch padding; labels are padded with -100 (ignored by the loss).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)


def translate_text(text):
    """Translate English text (a str or list of str) to Hindi.

    Returns a list of decoded translations. Inputs are moved to the model's
    device so this works whether training left the model on CPU or GPU, and
    generation runs under no_grad since no gradients are needed at inference.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
    ).to(model.device)
    with torch.no_grad():
        translated = model.generate(**inputs)
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]


# Quick smoke test of the fine-tuned model.
sample_text = "How are you?"
hindi_translation = translate_text(sample_text)
print(f"English: {sample_text}")
print(f"Hindi: {hindi_translation[0]}")