import pickle

import gradio as gr
import numpy as np
from huggingface_hub import hf_hub_download
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import AutoTokenizer, MarianMTModel, TFAutoModelForSeq2SeqLM

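# Base Helsinki-NLP English-to-Hindi MarianMT model and its tokenizer (used without fine-tuning)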
model_name = "Helsinki-NLP/opus-mt-en-hi"

model_base_nmt = MarianMTModel.from_pretrained(model_name)
tokenizer_base_nmt = AutoTokenizer.from_pretrained(model_name)



# Define the model repository and tokenizer checkpoint
model_checkpoint = "himanishprak23/neural_machine_translation"
tokenizer_checkpoint = "Helsinki-NLP/opus-mt-en-hi"

# Load the tokenizer from Helsinki-NLP and model from Hugging Face repository
tokenizer_nmt = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
model_nmt = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Load the model, tokenizers & length variables for the trained LSTM translation model.
# The weights can also be re-downloaded from the Hub instead of using the local file:
# repo_id = "Kumarkishalaya/lstm-eng-to-hin"
# lstm_filename = "seq2seq_model.keras"
# lstm_model_path = hf_hub_download(repo_id=repo_id, filename=lstm_filename, force_download=True)

model_lstm = load_model('seq2seq_model.h5')

with open('eng_tokenizer.pkl', 'rb') as file:
    eng_tokenizer = pickle.load(file)
with open('hin_tokenizer.pkl', 'rb') as file:
    hin_tokenizer = pickle.load(file)
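# Sequence-length limits: the English input is padded to max_len_eng and the decoded Hindi output is capped at max_len_hin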
max_len_eng = 20
max_len_hin = 22

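# Translate English text to Hindi with the base (pre-trained) MarianMT model.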
def translate_text_base_nmt(input_text):
    batch = tokenizer_base_nmt([input_text], return_tensors="pt")
    generated_ids = model_base_nmt.generate(**batch)
    predicted_text = tokenizer_base_nmt.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return predicted_text

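# Translate English text to Hindi with the fine-tuned seq2seq model loaded from the Hub (TensorFlow weights).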
def translate_text_nmt(input_text):
    tokenized_input = tokenizer_nmt(input_text, return_tensors='tf', max_length=128, truncation=True)
    generated_tokens = model_nmt.generate(**tokenized_input, max_length=128)
    predicted_text = tokenizer_nmt.decode(generated_tokens[0], skip_special_tokens=True)
    return predicted_text

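# Translate English text to Hindi with the trained LSTM model using greedy, word-by-word decoding.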
def translate_text_lstm(sentence, model, eng_tokenizer, hin_tokenizer, max_len_eng, max_len_hin):
    # Tokenize and pad the input sentence
    input_seq = eng_tokenizer.texts_to_sequences([sentence])
    input_seq = pad_sequences(input_seq, maxlen=max_len_eng, padding='post')
    
    # Initialize target sequence with start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = hin_tokenizer.word_index['start']
    
    # Create reverse word index for Hindi
    reverse_word_index = {idx: word for word, idx in hin_tokenizer.word_index.items()}
    
    decoded_sentence = []
    
    for _ in range(max_len_hin):
        output = model.predict([input_seq, target_seq], verbose=0)
        sampled_token_index = np.argmax(output[0, -1, :])
        sampled_word = reverse_word_index.get(sampled_token_index, '')
        
        if sampled_word == 'end' or sampled_word == '' or len(decoded_sentence) >= max_len_hin - 1:
            break
        
        decoded_sentence.append(sampled_word)
        
        # Rebuild the decoder input: the start token followed by every word decoded so far
        target_seq = np.zeros((1, len(decoded_sentence) + 1))
        target_seq[0, 0] = hin_tokenizer.word_index['start']
        for t, word in enumerate(decoded_sentence):
            target_seq[0, t + 1] = hin_tokenizer.word_index.get(word, 0)  # use 0 for out-of-vocabulary words
    
    return ' '.join(decoded_sentence)


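# Run the input through all three models; the returned tuple maps to the three output textboxes in order.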
def translate_text(input_text):
    translation_lstm = translate_text_lstm(input_text, model_lstm, eng_tokenizer, hin_tokenizer, max_len_eng, max_len_hin)
    translation_nmt_base = translate_text_base_nmt(input_text)
    translation_nmt_finetuned = translate_text_nmt(input_text)  
    return translation_lstm, translation_nmt_base, translation_nmt_finetuned

examples = [
    ["Microservices Architecture: Containers are ideal for deploying microservices, as each service can run in its own container and be independently managed."],
    ["Kubernetes: An open-source container orchestration platform that automates the deployment, scaling, and management of containerized applications."],
    ["Machine Learning: The practice of using algorithms to parse data, learn from it, and then make a determination or prediction about something in the world."],
    ["Data Science: An interdisciplinary field that uses scientific methods, processes, algorithms, and systems to extract knowledge and insights from noisy, structured, and unstructured data."],
    ["DevOps: A set of practices that combines software development and IT operations to shorten the development lifecycle and deliver high-quality software continuously."]
]

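# Build the Gradio interface: one input textbox, three labelled output textboxes (one per model).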
iface = gr.Interface(
    fn=translate_text,
    inputs=gr.components.Textbox(lines=2, placeholder="Enter text to translate from English to Hindi..."),
    outputs=[
        gr.components.Textbox(label="Translation (LSTM Model)"),
        gr.components.Textbox(label="Translation (Base Helsinki Model)"),
        gr.components.Textbox(label="Translation (Fine-tuned Helsinki Model)")
    ],
    title="English to Hindi Translator",
    description="Enter English text and get the Hindi translation from three different models: LSTM, Base Helsinki-NLP, and Fine-tuned Helsinki-NLP. LSTM models both have a single hidden layer trained for 50 epochs, the GPT-2 has been trained for 5 epochs",
    examples=examples
)

# Launch the Gradio app
iface.launch()