In [3]:
!pip install transformers datasets
import torch
from datasets import load_dataset
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments
from transformers import DataCollatorForSeq2Seq

# Load dataset (replace with your actual dataset path)
dataset = load_dataset("sudeshna84/Hind-Beng-5k")

# Load BART tokenizer and model for translation
model_name = "facebook/bart-large"  # You can use a smaller variant like bart-base as well
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Preprocessing function for translation
def preprocess_function(examples):
    inputs = [f"translate Hindi to Bengali: {text}" for text in examples['hi']]  # Hindi text
    targets = examples['bn']  # Bengali text (target)
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
train_dataset = dataset["train"].map(preprocess_function, batched=True)
eval_dataset = dataset["test"].map(preprocess_function, batched=True)

# Set up data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",          # output directory for model checkpoints
    evaluation_strategy="epoch",     # evaluation at each epoch
    learning_rate=2e-5,              # initial learning rate
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size per device during evaluation
    num_train_epochs=3,              # number of training epochs
    weight_decay=0.01,               # strength of weight decay for regularization
    logging_dir="./logs",            # directory for storing logs
    logging_steps=10,                # log every 10 steps
    save_steps=500,                  # save checkpoints every 500 steps
    eval_accumulation_steps=2,       # accumulate gradients before evaluation
)

# Initialize Trainer
trainer = Trainer(
    model=model,                        # the model to train
    args=training_args,                 # training arguments
    train_dataset=train_dataset,        # training dataset
    eval_dataset=eval_dataset,          # evaluation dataset
    data_collator=data_collator,        # data collator to pad inputs
    tokenizer=tokenizer,                # tokenizer
)

# Start training
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_bart_model")
tokenizer.save_pretrained("./fine_tuned_bart_model")


Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/425 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/774k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/336k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3716 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1593 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

Map:   0%|          | 0/3716 [00:00<?, ? examples/s]



Map:   0%|          | 0/1593 [00:00<?, ? examples/s]

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33marunapriyad24[0m ([33marunapriyad24-woxsen-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,0.8405,0.804304




Epoch,Training Loss,Validation Loss
1,0.8405,0.804304
2,0.7269,0.728273
3,0.6434,0.696879


('./fine_tuned_bart_model/tokenizer_config.json',
 './fine_tuned_bart_model/special_tokens_map.json',
 './fine_tuned_bart_model/vocab.json',
 './fine_tuned_bart_model/merges.txt',
 './fine_tuned_bart_model/added_tokens.json')

In [8]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Load the fine-tuned model and tokenizer
model = BartForConditionalGeneration.from_pretrained("./fine_tuned_bart_model")
tokenizer = BartTokenizer.from_pretrained("./fine_tuned_bart_model")

# Function to translate Hindi to Bengali using the fine-tuned model
def translate_hindi_to_bengali(hindi_text):
    # Prepare the input prompt for BART
    input_text = f"translate Hindi to Bengali: {hindi_text}"

    # Tokenize the input
    inputs = tokenizer(input_text, return_tensors="pt", max_length=128, truncation=True, padding="max_length")

    # Generate translation
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=256)


    # Decode the generated tokens to Bengali text
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text

# Example sentences to translate
hindi_sentences = [
    "फिर एक बार, मन की बातें करने के लिए, आपके बीच आने का मुझे अवसर मिला है।",
    "चिल्लाकर पुकारा “अजी हो क्या!",

]

# Translate the example sentences
for hindi_sentence in hindi_sentences:
    bengali_translation = translate_hindi_to_bengali(hindi_sentence)
    print(f"Hindi: {hindi_sentence}")
    print(f"Bengali: {bengali_translation}")
    print("-" * 50)


Hindi: फिर एक बार, मन की बातें करने के लिए, आपके बीच आने का मुझे अवसर मिला है।
Bengali: একবার, মনের প্রতিবীচ কোথা লইয়া , আপনি সুরূষ এ অসব হয়।
--------------------------------------------------
Hindi: चिल्लाकर पुकारा “अजी हो क्या!
Bengali: চিল্লাকর পুকেরা “অজী হইয়া!
--------------------------------------------------


In [9]:
torch.save(model.state_dict(), 'model.pth')  # Save model weights


In [11]:
from google.colab import files
files.download('model.pth')  # Replace 'model.h5' with your model's filename


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>