In [2]:
# Check whether a GPU is attached to this runtime and show its current status.

!nvidia-smi

Sun Aug 13 13:34:56 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Installing the Required Python Packages
This cell installs the required Python libraries with the pip package manager. It installs the `datasets` and `transformers` libraries (the latter with its SentencePiece tokenizer extra), using the `-q` flag to keep pip's output quiet. Both are needed for the rest of the notebook.

In [3]:
# Installing the required Python Packages for running of my code.
!pip install datasets transformers[sentencepiece] -q


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m108.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m80.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

## Importing the Required libraries
In this cell, you import the libraries and modules needed to load the dataset and to set up and fine-tune the translation model. (Gradio, which is used for the GUI, is installed and imported later in the notebook.)

In [4]:
# Imported Required Libraries.
import os
import sys
import transformers
import tensorflow as t
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

In [5]:
# Pre-trained English -> Hindi translation checkpoint from the Helsinki-NLP repository.
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"

# "cuda" when torch can see a GPU, otherwise "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

## Loading Dataset From HuggingFace
source: https://huggingface.co/datasets/cfilt/iitb-english-hindi

Here we load the English–Hindi parallel corpus published on Hugging Face by IIT Bombay. It contains about 1.66 million training pairs, 520 validation pairs, and 2,507 test pairs.

In [6]:
# Load the English-Hindi translation dataset from the Hugging Face Hub.
raw_datasets = load_dataset("cfilt/iitb-english-hindi")
# Display the dataset so the row counts of the train, validation and test splits are visible.
raw_datasets

Downloading readme:   0%|          | 0.00/3.11k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading metadata:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/190M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/85.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/500k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1659083 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/520 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2507 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

In [7]:
# Show one training example (index 19): a dict with the English and Hindi sentence.
raw_datasets['train'][19]

{'translation': {'en': 'Event monitor', 'hi': 'घटना मानिटर'}}

## Model and Tokenizer From HuggingFace
source: https://huggingface.co/Helsinki-NLP/opus-mt-en-hi

Here, you initialize the tokenizer and sequence-to-sequence model using the AutoTokenizer and TFAutoModelForSeq2SeqLM classes, respectively. These are pre-trained models that you'll fine-tune for translation tasks.

In [8]:
# Load the tokenizer that belongs to the pre-trained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Load the TensorFlow sequence-to-sequence translation model from the same checkpoint.
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]



Downloading tf_model.h5:   0%|          | 0.00/306M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hi.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [9]:
# Tokenize an example sentence to inspect the input_ids / attention_mask the tokenizer produces.
tokenizer("Hello, My name is Bob.")

{'input_ids': [12110, 2, 633, 300, 23, 25990, 3, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

## Preprocessing the Dataset values
Define a preprocessing function named preprocess_function. This function is responsible for tokenizing the input and target sentences from the dataset, setting the maximum sequence lengths, and attaching tokenized target IDs to the model inputs as "labels." This is a crucial step before feeding the data into the model for training.

In [10]:
max_input_length = 128  # Maximum number of tokens kept per source sentence
max_target_length = 128  # Maximum number of tokens kept per target sentence

source_lang = "en"  # Key of the source (English) sentence in each "translation" dict
target_lang = "hi"  # Key of the target (Hindi) sentence in each "translation" dict


def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: A batch whose "translation" entry is a list of dicts, each
            holding one sentence under ``source_lang`` and one under ``target_lang``.

    Returns:
        The tokenizer output for the source sentences, with an extra "labels"
        entry containing the token ids of the target sentences.
    """
    translations = examples["translation"]
    inputs = [pair[source_lang] for pair in translations]
    targets = [pair[target_lang] for pair in translations]

    # Source side: truncate to max_input_length tokens.
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Target side: `text_target=` tokenizes the sentences as targets. It replaces
    # the deprecated `with tokenizer.as_target_tokenizer():` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    # The model reads the target token ids from the "labels" entry.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Sanity check: run the function on the first two training examples.
preprocess_function(raw_datasets["train"][:2])



{'input_ids': [[3872, 85, 2501, 132, 15441, 36398, 0], [32643, 28541, 36253, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]], 'labels': [[63, 2025, 18, 16155, 346, 20311, 24, 2279, 679, 0], [26618, 16155, 346, 33383, 0]]}

## Tokenize and Prepare Datasets
Here, you tokenize the raw dataset using the previously defined preprocess_function. You extract the first 10,000 samples from the training data, creating a smaller subset for demonstration and testing purposes. The new dataset is named train_data.

In [52]:
# Take the first 10,000 training pairs *before* tokenizing, so that only the rows
# actually used for fine-tuning are tokenized (the full train split has ~1.66M rows).
# The resulting train_data holds the same rows and columns as tokenizing everything
# first and selecting afterwards, at a fraction of the cost.
train_data = raw_datasets["train"].select(range(10000)).map(preprocess_function, batched=True)

# Verify the new dataset
print(train_data)

Dataset({
    features: ['translation', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 10000
})


## Set Training Parameters and Data Collator

This cell sets up the parameters for training. You define the batch size, learning rate, weight decay, and number of training epochs. Additionally, you create a DataCollatorForSeq2Seq object that will be used to process and batch the data for training.

In [57]:
# Optimisation hyper-parameters.
batch_size = 16        # examples per training batch
learning_rate = 2e-5   # optimizer step size
weight_decay = 0.01    # weight-decay (regularization) rate
# NOTE(review): the training cell passes epochs=3 to fit() directly and does not
# read this value — confirm which epoch count is intended.
num_train_epochs = 1

# Collator that assembles tokenized examples into TensorFlow-format batches.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

## Preparation of Training Datasets
Here, you use the model's built-in `prepare_tf_dataset` method (provided by the Transformers library) to build the TensorFlow dataset used for training. You pass the tokenized and preprocessed data, set the batch size, enable shuffling of the training data, and supply the data_collator for batching.

In [58]:
# Build the batched training dataset with the model's prepare_tf_dataset helper.
# train_data is the tokenized 10,000-example subset created in the previous section.
train_dataset = model.prepare_tf_dataset(
    train_data,
    collate_fn=data_collator,  # batches the examples
    batch_size=batch_size,
    shuffle=True,              # shuffle the training examples
)

## Initialize Optimizer and Compile Model
In this cell, you initialize the optimizer for training. The Adam optimizer with weight decay is created using the specified learning rate and weight decay rate. You then compile the model with the optimizer, preparing it for training.

In [59]:
# Adam optimizer with weight decay, configured from the hyper-parameters set earlier.
optimizer = AdamWeightDecay(weight_decay_rate=weight_decay, learning_rate=learning_rate)

# Compile the model with that optimizer. No loss is passed here;
# presumably the model falls back to its own loss — TODO confirm.
model.compile(optimizer=optimizer)


In [60]:
# Tokenize and batch the validation split so that fit() really evaluates on it.
# (Previously the comment promised validation but no validation data was passed.)
validation_data = raw_datasets["validation"].map(preprocess_function, batched=True)
validation_dataset = model.prepare_tf_dataset(
    validation_data,
    batch_size=batch_size,
    shuffle=False,             # evaluation does not need shuffling
    collate_fn=data_collator,
)

# Fine-tune for 3 epochs, evaluating on the validation set after each epoch.
# NOTE(review): num_train_epochs (set to 1 in the parameters cell) is not used here;
# confirm which epoch count is intended.
model.fit(train_dataset, validation_data=validation_dataset, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x79b225698f70>

## Save and Load Model
This cell saves the fine-tuned model with `model.save_pretrained()`, a Transformers method whose argument is the directory to save into. The code uses the relative path `tf_model/`; point it at a mounted Google Drive folder instead if the model should persist across sessions. The cell then reloads the tokenizer from the original model checkpoint and the model from the saved directory.

In [65]:
# Save the fine-tuned model into the tf_model/ directory (relative to the working directory).
path = 'tf_model/'
model.save_pretrained(path)
# Reload: the tokenizer comes from the original checkpoint (it is not saved to `path`),
# while the model is loaded back from the directory written above.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained(path)

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at /content/drive/MyDrive/my_model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


## Testing the Model
Here, you provide an example input sentence to test the translation capabilities of your model. The input text is tokenized with the tokenizer, and the model generates an output sequence. The generated token ids are then decoded back into text (skipping special tokens), and the translated text is printed as the output.

In [75]:
# Translate one sample sentence with the fine-tuned model and print the result.
input_text  = "How rude of you!"

# Tokenize the sentence as a batch of one and return NumPy arrays.
tokenized = tokenizer([input_text], return_tensors='np')

# Generate the output token ids (at most 128 tokens).
out = model.generate(**tokenized, max_length=128)

# Decode the generated ids back to text, skipping special tokens.
# The deprecated `as_target_tokenizer()` context manager is not needed here:
# it only changes how text is encoded, not how token ids are decoded.
decoded_output = tokenizer.decode(out[0], skip_special_tokens=True)

# Print the decoded output
print(decoded_output)

कितना कठोर!


In [68]:
! pip install gradio -q # Installing the gradio package to deploy my model as an application.

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.7/65.7 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.4/75.4 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.5/50.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.3/140.3 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.7/45.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m7.6 MB/s[0

## Create and Launch Gradio App for Translation

Finally, you create a GUI interface for interacting with your translation model using the Gradio library. The GUI allows users to input English text and receive translated Hindi text as output. The interface includes text boxes for input and output and is launched with sharing enabled.

In [None]:
import gradio as gr  # Import the Gradio library for creating a simple GUI

# Title shown at the top of the Gradio application.
title = "Text Translation(English to Hindi)"

def process_input(text):
    """Translate English text to Hindi with the loaded model.

    Args:
        text: The English text entered by the user.

    Returns:
        The decoded translation, or an empty string when ``text`` is ``None``,
        empty, or contains only whitespace (the model is not called then).
    """
    # Nothing to translate: skip tokenization and generation entirely.
    if not text or not text.strip():
        return ""

    # Tokenize the text as a batch of one and return NumPy arrays.
    tokenized = tokenizer([text], return_tensors='np')
    # Generate the output token ids (at most 128 tokens).
    out = model.generate(**tokenized, max_length=128)
    # Decode the ids back to text. The deprecated `as_target_tokenizer()` context
    # manager is not needed: it only changes encoding, not decoding.
    return tokenizer.decode(out[0], skip_special_tokens=True)

# Sample English sentences offered as ready-made examples in the GUI.
examples = [
    'If you have the time, come along with me.',
    'I can come if you want.',
    'Tom was at home alone.',
    'Wow!',
    'How rude of you!',
    "What's in your hand?",
]

# Wire the translation function to one input and one output text box.
model_gui = gr.Interface(
    fn=process_input,                              # called with the text from the input box
    inputs=gr.Textbox(lines=3, label="English"),   # where the user types English text
    outputs=gr.Textbox(lines=3, label="Hindi"),    # where the translation is shown
    title=title,
    examples=examples,
)

# Start the app with sharing enabled.
model_gui.launch(share=True)