from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import Dataset, DatasetDict
import pandas as pd
# Load the dataset
file_path = "hindi_dataset.tsv" # Update with your actual file path
data = pd.read_csv(file_path, delimiter="\t")
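# Note: the tokenization step further down assumes the TSV has two columns named
# 'english' and 'hindi'; adjust these names if your file uses different headers.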
# Convert the dataset to Hugging Face Dataset
hf_dataset = Dataset.from_pandas(data)
# Split the dataset into train and test subsets
split_dataset = hf_dataset.train_test_split(test_size=0.2)
# Create a DatasetDict with train and test splits
dataset = DatasetDict({
    "train": split_dataset["train"],
    "test": split_dataset["test"]
})
# Load the tokenizer and model
model_name = "Helsinki-NLP/opus-mt-en-hi" # Pre-trained English-to-Hindi model
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
# Tokenize source and target text
def tokenize_function(examples):
    # Tokenize the English source; padding is left to the data collator so each
    # batch is padded dynamically instead of to a fixed max_length.
    model_inputs = tokenizer(examples['english'], truncation=True, max_length=128)
    # Switch the tokenizer to target-language mode to tokenize the Hindi references
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['hindi'], truncation=True, max_length=128)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs
# Apply tokenization to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Define the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # renamed to eval_strategy in newer transformers releases
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500
)
# Use the DataCollatorForSeq2Seq for padding
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
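# The collator pads inputs and labels to the longest sequence in each batch and
# fills label padding with -100 so padded positions are ignored by the loss.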
# Define the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator
)
# Train the model
trainer.train()
# Evaluate the model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)
# Test the model with sample inputs
def translate_text(text):
    # Move inputs to the model's device (GPU if training used one) before generating
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(model.device)
    translated = model.generate(**inputs)
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
# Test translation
sample_text = "How are you?"
hindi_translation = translate_text(sample_text)
print(f"English: {sample_text}")
print(f"Hindi: {hindi_translation[0]}")