# Inference-only imports; the training-time imports (LSTM layers, EarlyStopping,
# BLEU scoring, plotting) are not needed to serve the saved model, and keras /
# tensorflow.keras imports are unified under tensorflow.keras.
import re
from string import punctuation

import numpy as np
import pandas as pd
import gradio as gr

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

# Number of sentence pairs to load from the parallel corpus.
total_sentences = 10000

dataset = pd.read_csv("./eng_-french.csv", nrows=total_sentences)

def clean(text):
    """Lowercase, strip punctuation and digits, and normalize whitespace."""
    text = text.replace("\u202f", " ")  # narrow no-break space, common in French text
    text = text.lower()

    # Replace punctuation, French guillemets, and digits with spaces.
    for p in punctuation + "«»" + "0123456789":
        text = text.replace(p, " ")

    text = re.sub(r"\s+", " ", text)  # collapse runs of whitespace
    return text.strip()
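
# Quick sanity check of clean(), worked out by hand from the rules above
# (punctuation and digits become spaces, runs of whitespace collapse):
assert clean("Bonjour, le monde! 123") == "bonjour le monde"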

# Shuffle reproducibly, then normalize both sides of the parallel corpus.
dataset = dataset.sample(frac=1, random_state=0)
dataset["English words/sentences"] = dataset["English words/sentences"].apply(clean)
dataset["French words/sentences"] = dataset["French words/sentences"].apply(clean)

dataset = dataset.values
dataset = dataset[:total_sentences]

# Column 1 holds French (the source language), column 0 English (the target).
source_str, target_str = "French", "English"
idx_src, idx_tar = 1, 0

def create_tokenizer(lines):
    """Fit a Keras Tokenizer on an iterable of lines."""
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer


def max_len(lines):
    """Length, in words, of the longest line."""
    return max(len(line.split()) for line in lines)


def encode_sequences(tokenizer, length, lines):
    """Map lines to integer sequences, zero-padded at the end to `length`."""
    X = tokenizer.texts_to_sequences(lines)
    X = pad_sequences(X, maxlen=length, padding='post')
    return X
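
# Hedged example: with a fitted tokenizer `tok` and length=5, two short lines
# become a (2, 5) array of word ids, e.g.
#   encode_sequences(tok, 5, ["bonjour", "merci beaucoup"])
#   -> [[17, 0, 0, 0, 0], [42, 99, 0, 0, 0]]
# (ids 17/42/99 are illustrative; actual values depend on the fitted vocabulary).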

def word_for_id(integer, tokenizer):
    """Reverse lookup: word id -> word, or None for the padding id 0."""
    return tokenizer.index_word.get(integer)


def predict_seq(model, tokenizer, source):
    """Greedily decode one encoded source sequence into a target sentence."""
    prediction = model.predict(source, verbose=0)[0]
    # Take the most likely word id at each timestep.
    integers = [np.argmax(vector) for vector in prediction]
    target = []
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            # Id 0 is reserved for padding and maps to no word; stop there.
            break
        target.append(word)
    return ' '.join(target)
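
# Hedged usage sketch (the English output is illustrative, not a guaranteed
# result of the saved model):
#   seq = encode_sequences(src_tokenizer, src_length, [clean("je t'aime")])
#   predict_seq(model, tar_tokenizer, seq)  # -> e.g. "i love you"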

# Rebuild the tokenizers exactly as at training time so that word ids line up
# with the ids the saved model was trained on.
src_tokenizer = create_tokenizer(dataset[:, idx_src])
src_vocab_size = len(src_tokenizer.word_index) + 1  # +1 for the padding id 0
src_length = max_len(dataset[:, idx_src])
tar_tokenizer = create_tokenizer(dataset[:, idx_tar])

model = load_model('./french_to_english_translator.h5')
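
# The network architecture lives in the saved .h5 file, not in this script. For
# reference, a typical Keras encoder-decoder for this setup looks like the
# commented sketch below; the 256-unit sizes and the tar_length/tar_vocab_size
# names are assumptions, not values read from the checkpoint:
#
#   model = Sequential([
#       Embedding(src_vocab_size, 256, input_length=src_length, mask_zero=True),
#       LSTM(256),                          # encoder: summarize the source sentence
#       RepeatVector(tar_length),           # repeat context once per target step
#       LSTM(256, return_sequences=True),   # decoder: one output per target step
#       TimeDistributed(Dense(tar_vocab_size, activation='softmax')),
#   ])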

def translate_french_english(french_sentence):
    """Clean, encode, and translate a French sentence into English."""
    french_sentence = clean(french_sentence)
    input_sequence = encode_sequences(src_tokenizer, src_length, [french_sentence])
    english_translation = predict_seq(model, tar_tokenizer, input_sequence)
    return english_translation

gr.Interface(
    fn=translate_french_english,
    inputs="text",
    outputs="text",
    title="French to English Translator",
    description="Translate French sentences to English.",
).launch()
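
# launch() blocks and serves the demo locally (http://127.0.0.1:7860 by
# default); pass share=True to launch() for a temporary public link.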