# -*- coding: utf-8 -*-
"""en_de_model.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/11zBErr3Hns-ddBFgb05-oWAmvMPPNpnw
"""

import tensorflow
from pickle import load
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu


def load_clean_data(filename):
    """Load a pickled, cleaned dataset of sentence pairs."""
    return load(open(filename, 'rb'))


def create_tokenizer(lines):
    """Fit a Keras Tokenizer on the given lines."""
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer


def max_length(lines):
    """Return the length (in tokens) of the longest line."""
    return max(len(line.split()) for line in lines)


def encode_sequences(tokenizer, length, lines):
    """Encode lines as integer sequences and pad them to a fixed length."""
    X = tokenizer.texts_to_sequences(lines)
    X = pad_sequences(X, maxlen=length, padding='post')
    return X


def word_for_id(integer, tokenizer):
    """Map an integer index back to its word, or None if not found."""
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None


def predict_sequence(model, tokenizer, source):
    """Generate a translation for a single encoded source sequence."""
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)


def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Translate every source sequence and report corpus BLEU scores."""
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))


# Load the cleaned English-German sentence pairs and the train/test splits.
dataset = load_clean_data('english-german-both.pkl')
train_data = load_clean_data('english-german-train.pkl')
test_data = load_clean_data('english-german-test.pkl')

# Tokenizers and maximum sentence lengths are fit on the full dataset so that
# the train and test splits share one vocabulary.
# Column 0 holds English sentences, column 1 holds German sentences.
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])

# Encode the German source sentences for evaluation.
trainX = encode_sequences(ger_tokenizer, ger_length, train_data[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test_data[:, 1])

# Load the trained German-to-English model and evaluate it on both splits.
model = load_model('en_de_model.h5')
print('train')
evaluate_model(model, eng_tokenizer, trainX, train_data)
print('test')
evaluate_model(model, eng_tokenizer, testX, test_data)


def trans(input_sentence, ger_tokenizer=ger_tokenizer, ger_length=ger_length,
          eng_tokenizer=eng_tokenizer, model=model):
    """Translate a single German sentence to English.

    The tokenizers, sequence length, and model default to the objects built
    above, so the function can also be called with just the input sentence
    (as the direct call and the Gradio interface below both do).
    """
    input_sequence = ger_tokenizer.texts_to_sequences([input_sentence])
    input_sequence = pad_sequences(input_sequence, maxlen=ger_length, padding='post')
    translation = predict_sequence(model, eng_tokenizer, input_sequence)
    return translation


input_sentence = "Ich mag deutsche Autos."
translated_sentence = trans(input_sentence)
print(f"Input: {input_sentence}")
print(f"Translation: {translated_sentence}")

# Simple Gradio demo: one text box for the German input, one for the English output.
import gradio as gr

iface = gr.Interface(
    fn=trans,
    inputs="text",
    outputs="text"
)
iface.launch()