File size: 4,383 Bytes
238ef35 f1be6e3 fb302fa f18bb84 835a881 cc54661 e6693a1 e825783 cc54661 f18bb84 238ef35 fb302fa 238ef35 fb302fa f73c3a8 88a55f1 e825783 f73c3a8 e825783 835a881 f73c3a8 835a881 fb302fa cc54661 fb302fa 238ef35 fb302fa 238ef35 fb302fa 238ef35 fb302fa 238ef35 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
import json
import os
import pickle

import gradio as gr
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM, MarianMTModel, pipeline
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras
import numpy as np
from huggingface_hub import hf_hub_download
import torch
# --- Base Helsinki-NLP Marian checkpoint (PyTorch) ---------------------------
model_name = "Helsinki-NLP/opus-mt-en-hi"
# translate_text_base_nmt() calls `tokenizer_base_nmt(...)` / `.batch_decode(...)`
# and `model_base_nmt.generate(...)`, so the tokenizer name must hold the
# AutoTokenizer and the model name must hold the MarianMTModel.
tokenizer_base_nmt = AutoTokenizer.from_pretrained(model_name)
model_base_nmt = MarianMTModel.from_pretrained(model_name)

# --- Fine-tuned checkpoint (TensorFlow) --------------------------------------
# The model weights come from `model_checkpoint`; the tokenizer is taken from
# the Helsinki-NLP checkpoint named in `tokenizer_checkpoint`.
model_checkpoint = "himanishprak23/neural_machine_translation"
tokenizer_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
tokenizer_nmt = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
model_nmt = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# --- LSTM seq2seq model, its tokenizers and sequence-length settings ---------
# An earlier variant (now disabled) fetched "seq2seq_model.keras" from the
# "Kumarkishalaya/lstm-eng-to-hin" Hub repo via hf_hub_download with
# force_download=True; the model is read from a local HDF5 file instead.
model_lstm = load_model('seq2seq_model.h5')

# NOTE: unpickling executes arbitrary code, so these files must be trusted.
with open('eng_tokenizer.pkl', 'rb') as tokenizer_file:
    eng_tokenizer = pickle.load(tokenizer_file)

with open('hin_tokenizer.pkl', 'rb') as tokenizer_file:
    hin_tokenizer = pickle.load(tokenizer_file)

# Passed to translate_text_lstm(): source padding length and decode-step cap.
max_len_eng = 20
max_len_hin = 22
def translate_text_base_nmt(input_text):
    """Translate `input_text` with the module-level base tokenizer/model pair.

    `tokenizer_base_nmt` is called to encode a one-element batch as PyTorch
    tensors and to batch-decode the result; `model_base_nmt.generate` produces
    the output ids. The first decoded string is returned, with special tokens
    skipped.
    """
    encoded = tokenizer_base_nmt([input_text], return_tensors="pt")
    output_ids = model_base_nmt.generate(**encoded)
    decoded = tokenizer_base_nmt.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]
def translate_text_nmt(input_text):
    """Translate `input_text` with the module-level `tokenizer_nmt` / `model_nmt`.

    The input is encoded as TensorFlow tensors with truncation at 128 tokens,
    generation is capped at `max_length=128`, and the first generated sequence
    is decoded with special tokens skipped.
    """
    encoded = tokenizer_nmt(
        input_text,
        return_tensors='tf',
        max_length=128,
        truncation=True,
    )
    output_ids = model_nmt.generate(**encoded, max_length=128)
    return tokenizer_nmt.decode(output_ids[0], skip_special_tokens=True)
def translate_text_lstm(sentence, model, eng_tokenizer, hin_tokenizer, max_len_eng, max_len_hin):
    """Greedily decode a translation of `sentence` with a seq2seq model.

    Args:
        sentence: Source text to translate.
        model: Object whose ``predict([encoder_input, decoder_input], verbose=0)``
            returns an array indexed as ``[batch, step, token_id]``; the
            argmax over the last step is taken as the next target token id.
        eng_tokenizer: Source tokenizer exposing ``texts_to_sequences``.
        hin_tokenizer: Target tokenizer exposing a ``word_index`` mapping
            (word -> id) that contains the ``'start'`` marker.
        max_len_eng: Length the encoded source sequence is padded to.
        max_len_hin: Maximum number of decoding steps; at most
            ``max_len_hin - 1`` words are returned.

    Returns:
        The decoded words joined by single spaces (possibly an empty string).

    Raises:
        KeyError: If ``'start'`` is missing from ``hin_tokenizer.word_index``.
    """
    # Tokenize the source sentence and post-pad it to the encoder length.
    encoder_input = pad_sequences(
        eng_tokenizer.texts_to_sequences([sentence]),
        maxlen=max_len_eng,
        padding='post',
    )

    word_index = hin_tokenizer.word_index
    index_to_word = {idx: word for word, idx in word_index.items()}

    # Decoder input built so far. It always keeps the leading start marker and
    # gains one id per emitted word: [start, w1, ..., wk]. (Rebuilding it as
    # [w1, ..., wk, wk] would drop the marker and repeat the last token.)
    decoder_ids = [word_index['start']]
    decoded_words = []

    for _ in range(max_len_hin):
        # Same float dtype and (1, steps) shape that np.zeros((1, steps)) yields.
        decoder_input = np.array([decoder_ids], dtype=float)
        output = model.predict([encoder_input, decoder_input], verbose=0)

        # Greedy choice: most probable token at the last decoder step.
        next_id = int(np.argmax(output[0, -1, :]))
        next_word = index_to_word.get(next_id, '')

        # Stop on the end marker, on an id with no word (e.g. padding), or
        # once the output has reached its length limit.
        if next_word in ('end', '') or len(decoded_words) >= max_len_hin - 1:
            break

        decoded_words.append(next_word)
        decoder_ids.append(next_id)

    return ' '.join(decoded_words)
def translate_text(input_text):
    """Translate `input_text` with all three models.

    Returns a 3-tuple in the order (LSTM, base NMT, fine-tuned NMT); the
    translators are invoked in that same order.
    """
    return (
        translate_text_lstm(
            input_text,
            model_lstm,
            eng_tokenizer,
            hin_tokenizer,
            max_len_eng,
            max_len_hin,
        ),
        translate_text_base_nmt(input_text),
        translate_text_nmt(input_text),
    )
# Gradio UI: one input textbox, and one output textbox per value returned by
# translate_text (same order as its return tuple).
iface = gr.Interface(
    fn=translate_text,
    inputs=gr.components.Textbox(
        lines=2,
        placeholder="Enter text to translate from English to Hindi...",
    ),
    outputs=[
        gr.components.Textbox(label=output_label)
        for output_label in (
            "Translation (LSTM Model)",
            "Translation (Base Helsinki Model)",
            "Translation (Fine-tuned Helsinki Model)",
        )
    ],
    title="English to Hindi Translator",
    description="Enter English text and get the Hindi translation from three different models: LSTM, Base Helsinki-NLP, and Fine-tuned Helsinki-NLP.",
)

# Start the app.
iface.launch()
|