import re
import numpy as np
import pandas as pd
from string import punctuation
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import gradio as gr

total_sentences = 10000

# Load the dataset
dataset = pd.read_csv("./eng_-french.csv", nrows=total_sentences)

def clean(text):
    # Lowercase, strip punctuation/digits, and normalize whitespace
    text = text.replace("\u202f", " ")  # Replace narrow no-break space with space
    text = text.lower()
    # Delete the punctuation and the digits
    for p in punctuation + "«»" + "0123456789":
        text = text.replace(p, " ")
    text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace
    text = text.strip()
    return text

# Shuffle, then clean both columns
dataset = dataset.sample(frac=1, random_state=0)
dataset["English words/sentences"] = dataset["English words/sentences"].apply(clean)
dataset["French words/sentences"] = dataset["French words/sentences"].apply(clean)

# Select one part of the dataset
dataset = dataset.values
dataset = dataset[:total_sentences]

source_str, target_str = "French", "English"
idx_src, idx_tar = 1, 0

def create_tokenizer(lines):
    # Fit a tokenizer on the given lines
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

def max_len(lines):
    # Length (in words) of the longest sentence
    return max(len(line.split()) for line in lines)

def encode_sequences(tokenizer, length, lines):
    # Integer-encode the lines and pad them to a fixed length
    X = tokenizer.texts_to_sequences(lines)
    X = pad_sequences(X, maxlen=length, padding='post')  # pad with 0 values
    return X

def word_for_id(integer, tokenizer):
    # Map an integer back to its word in the tokenizer's vocabulary
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def predict_seq(model, tokenizer, source):
    # Generate a target sentence from an encoded source sequence
    prediction = model.predict(source, verbose=0)[0]
    integers = [np.argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:  # index 0 (padding) has no word: stop decoding
            break
        target.append(word)
    return ' '.join(target)

# Rebuild the tokenizers the model was trained with
src_tokenizer = create_tokenizer(dataset[:, idx_src])
src_vocab_size = len(src_tokenizer.word_index) + 1
src_length = max_len(dataset[:, idx_src])
tar_tokenizer = create_tokenizer(dataset[:, idx_tar])

# Load the trained model
model = load_model('./french_to_english_translator.h5')

# Function to translate French to English
def translate_french_english(french_sentence):
    # Clean the input sentence
    french_sentence = clean(french_sentence)
    # Tokenize and pad the input sentence
    input_sequence = encode_sequences(src_tokenizer, src_length, [french_sentence])
    # Generate the translation
    english_translation = predict_seq(model, tar_tokenizer, input_sequence)
    return english_translation

# Create a Gradio interface
gr.Interface(
    fn=translate_french_english,
    inputs="text",
    outputs="text",
    title="French to English Translator",
    description="Translate French sentences to English.",
).launch()
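# --- Optional: quick BLEU sanity check (a sketch, not part of the Gradio app) ---
# The original imports pull in nltk's corpus_bleu, so a minimal evaluation over
# a small sample of the loaded pairs might look like the following. The 100-pair
# sample size is an arbitrary illustrative choice; uncomment and run this block
# before .launch() if you want the score printed first.
#
# from nltk.translate.bleu_score import corpus_bleu
# references, hypotheses = [], []
# for src, tar in zip(dataset[:100, idx_src], dataset[:100, idx_tar]):
#     seq = encode_sequences(src_tokenizer, src_length, [src])
#     hypotheses.append(predict_seq(model, tar_tokenizer, seq).split())
#     references.append([tar.split()])
# print("BLEU-1: %.3f" % corpus_bleu(references, hypotheses, weights=(1.0, 0, 0, 0)))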