import gradio as gr
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM, MarianMTModel
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import pickle
import json
import keras
from huggingface_hub import hf_hub_download
from transformers import pipeline
import torch
import os
import numpy as np
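# Three English-to-Hindi translation models are loaded below: the off-the-shelf Helsinki-NLP
# Marian model, a fine-tuned version of it from the Hugging Face Hub, and a locally trained
# LSTM seq2seq model with pickled Keras tokenizers.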
model_name = "Helsinki-NLP/opus-mt-en-hi"
model_base_nmt = MarianMTModel.from_pretrained(model_name)
tokenizer_base_nmt = AutoTokenizer.from_pretrained(model_name)
# Define the model repository and tokenizer checkpoint
model_checkpoint = "himanishprak23/neural_machine_translation"
tokenizer_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
# Load the tokenizer from Helsinki-NLP and model from Hugging Face repository
tokenizer_nmt = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
model_nmt = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# Loading models, tokenizer & variables for trained LSTM translation model.
#repo_id = "Kumarkishalaya/lstm-eng-to-hin"
#lstm_filename = "seq2seq_model.keras"
# Re-download the file
#lstm_model_path = hf_hub_download(repo_id=repo_id, filename=lstm_filename, force_download=True)
model_lstm = load_model('seq2seq_model.h5')
with open('eng_tokenizer.pkl', 'rb') as file:
    eng_tokenizer = pickle.load(file)
with open('hin_tokenizer.pkl', 'rb') as file:
    hin_tokenizer = pickle.load(file)
max_len_eng = 20
max_len_hin = 22
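# Maximum sequence lengths; assumed to match the padding lengths used when the LSTM model was trained.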
def translate_text_base_nmt(input_text):
    batch = tokenizer_base_nmt([input_text], return_tensors="pt")
    generated_ids = model_base_nmt.generate(**batch)
    predicted_text = tokenizer_base_nmt.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return predicted_text
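# Illustrative usage (assuming the Helsinki-NLP base model above loaded successfully):
# translate_text_base_nmt("How are you?") should return the Hindi translation as a single string.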
def translate_text_nmt(input_text):
    tokenized_input = tokenizer_nmt(input_text, return_tensors='tf', max_length=128, truncation=True)
    generated_tokens = model_nmt.generate(**tokenized_input, max_length=128)
    predicted_text = tokenizer_nmt.decode(generated_tokens[0], skip_special_tokens=True)
    return predicted_text
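# Note: translate_text_nmt feeds TensorFlow tensors (return_tensors='tf') because the fine-tuned
# checkpoint is loaded with TFAutoModelForSeq2SeqLM, while translate_text_base_nmt uses PyTorch
# tensors ("pt") for the MarianMTModel baseline.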
def translate_text_lstm(sentence, model, eng_tokenizer, hin_tokenizer, max_len_eng, max_len_hin):
    # Tokenize and pad the input sentence
    input_seq = eng_tokenizer.texts_to_sequences([sentence])
    input_seq = pad_sequences(input_seq, maxlen=max_len_eng, padding='post')
    # Initialize target sequence with the start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = hin_tokenizer.word_index['start']
    # Create reverse word index for Hindi
    reverse_word_index = {idx: word for word, idx in hin_tokenizer.word_index.items()}
    decoded_sentence = []
    for _ in range(max_len_hin):
        # Greedy decoding: pick the most likely next token at each step
        output = model.predict([input_seq, target_seq], verbose=0)
        sampled_token_index = np.argmax(output[0, -1, :])
        sampled_word = reverse_word_index.get(sampled_token_index, '')
        if sampled_word == 'end' or sampled_word == '' or len(decoded_sentence) >= max_len_hin - 1:
            break
        decoded_sentence.append(sampled_word)
        # Rebuild the decoder input: start token followed by every word decoded so far
        target_seq = np.zeros((1, len(decoded_sentence) + 1))
        target_seq[0, 0] = hin_tokenizer.word_index['start']
        for t, word in enumerate(decoded_sentence):
            target_seq[0, t + 1] = hin_tokenizer.word_index.get(word, 0)  # 0 for out-of-vocabulary words
    return ' '.join(decoded_sentence)
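# Illustrative usage (assumes 'start' and 'end' tokens are present in hin_tokenizer.word_index,
# as the loop above requires): translate_text_lstm("good morning", model_lstm, eng_tokenizer,
# hin_tokenizer, max_len_eng, max_len_hin) decodes greedily, one Hindi word per step.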
def translate_text(input_text):
    translation_lstm = translate_text_lstm(input_text, model_lstm, eng_tokenizer, hin_tokenizer, max_len_eng, max_len_hin)
    translation_nmt_base = translate_text_base_nmt(input_text)
    translation_nmt_finetuned = translate_text_nmt(input_text)
    return translation_lstm, translation_nmt_base, translation_nmt_finetuned
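# Quick sanity check (illustrative; left commented out so it does not run when the Space starts):
# lstm_out, base_out, finetuned_out = translate_text("How are you?")
# print(lstm_out, base_out, finetuned_out, sep="\n")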
examples = [
    ["Microservices Architecture: Containers are ideal for deploying microservices, as each service can run in its own container and be independently managed."],
    ["Kubernetes: An open-source container orchestration platform that automates the deployment, scaling, and management of containerized applications."],
    ["Machine Learning: The practice of using algorithms to parse data, learn from it, and then make a determination or prediction about something in the world."],
    ["Data Science: An interdisciplinary field that uses scientific methods, processes, algorithms, and systems to extract knowledge and insights from noisy, structured, and unstructured data."],
    ["DevOps: A set of practices that combines software development and IT operations to shorten the development lifecycle and deliver high-quality software continuously."]
]
iface = gr.Interface(
    fn=translate_text,
    inputs=gr.components.Textbox(lines=2, placeholder="Enter text to translate from English to Hindi..."),
    outputs=[
        gr.components.Textbox(label="Translation (LSTM Model)"),
        gr.components.Textbox(label="Translation (Base Helsinki Model)"),
        gr.components.Textbox(label="Translation (Fine-tuned Helsinki Model)")
    ],
    title="English to Hindi Translator",
    description="Enter English text and get Hindi translations from three models: a single-hidden-layer LSTM trained for 50 epochs, the base Helsinki-NLP model, and a Helsinki-NLP model fine-tuned for 5 epochs.",
    examples=examples
)
# Launch the Gradio app
iface.launch()