import gradio as gr
from time import time

import torch
import os

import argparse
import random
import numpy as np
import faiss
from argparse import Namespace
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
from functools import partial
from sklearn.manifold import TSNE

from transformers import AutoTokenizer, MarianTokenizer, AutoModel, AutoModelForSeq2SeqLM, MarianMTModel

dir_path = os.path.dirname(os.path.realpath(__file__))
print(dir_path)
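
# Translation models: three Helsinki-NLP MarianMT checkpoints (en->es, en->fr, en->zh)
# are loaded eagerly at import time. Tokenizers and models are kept in dictionaries
# keyed by language pair so the UI radio button can switch direction.
# `metadata_all` is filled later by `first_function` with the reference FAISS indexes
# and vocabularies ({'input': ..., 'output': ...}).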
metadata_all = {}

model_es = "Helsinki-NLP/opus-mt-en-es"
model_fr = "Helsinki-NLP/opus-mt-en-fr"
model_zh = "Helsinki-NLP/opus-mt-en-zh"

tokenizer_es = AutoTokenizer.from_pretrained(model_es)
tokenizer_fr = AutoTokenizer.from_pretrained(model_fr)
tokenizer_zh = AutoTokenizer.from_pretrained(model_zh)

model_tr_es = MarianMTModel.from_pretrained(model_es)
model_tr_fr = MarianMTModel.from_pretrained(model_fr)
model_tr_zh = MarianMTModel.from_pretrained(model_zh)

dict_models = {
    'en-es': model_es,
    'en-fr': model_fr,
    'en-zh': model_zh,
}

dict_models_tr = {
    'en-es': model_tr_es,
    'en-fr': model_tr_fr,
    'en-zh': model_tr_zh,
}

dict_tokenizer_tr = {
    'en-es': tokenizer_es,
    'en-fr': tokenizer_fr,
    'en-zh': tokenizer_zh,
}

from faiss import write_index, read_index
import pickle
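
# translation_model: runs one source sentence through the selected MarianMT model.
# It returns the decoded translation, the full `generate` output (beam search with
# 5 beams, hidden states and scores kept), the source token ids, and the encoder /
# decoder input embeddings looked up via `embed_tokens` for later analysis.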
def translation_model(w1, model):
    inputs = dict_tokenizer_tr[model](w1, return_tensors="pt")

    input_embeddings = dict_models_tr[model].get_encoder().embed_tokens(inputs.input_ids)

    print(inputs)
    num_ret_seq = 1
    translated = dict_models_tr[model].generate(
        **inputs,
        num_beams=5,
        num_return_sequences=num_ret_seq,
        return_dict_in_generate=True,
        output_attentions=False,
        output_hidden_states=True,
        output_scores=True,
    )

    tgt_text = dict_tokenizer_tr[model].decode(translated.sequences[0], skip_special_tokens=True)

    target_embeddings = dict_models_tr[model].get_decoder().embed_tokens(translated.sequences)

    return tgt_text, translated, inputs.input_ids, input_embeddings, target_embeddings
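
# --- Vocabulary construction -------------------------------------------------
# Two granularities are kept side by side: a token-level vocabulary (one entry per
# subword id, built below) and a word-level vocabulary (subwords merged on the
# '▁' prefix, built by vocab_words_all_prefix).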
def create_vocab_multiple(embeddings_list, model):
    """Build a token-level vocabulary from a list of sentence embeddings.

    Args:
        embeddings_list (list): list of dicts with 'tokens' (per-sentence token ids)
            and 'embeddings' (matching embedding tensors).
        model (str): language-pair key into dict_tokenizer_tr (e.g. 'en-es').

    Returns:
        tuple: (vocab dict keyed by token id, list of per-sentence token id lists)
    """
    print("START VOCAB CREATION MULTIPLE \n \n ")
    vocab = {}
    sentence_tokens_text_list = []
    for embeddings in embeddings_list:
        tokens_id = embeddings['tokens']
        for sent_i, sentence in enumerate(tokens_id):
            sentence_tokens = []
            for tok_i, token in enumerate(sentence):
                sentence_tokens.append(token)
                if not (token in vocab):
                    vocab[token] = {
                        'token': token,
                        'count': 1,
                        'text': dict_tokenizer_tr[model].decode([token]),
                        'embed': embeddings['embeddings'][sent_i][tok_i],
                    }
                else:
                    vocab[token]['count'] = vocab[token]['count'] + 1

            sentence_tokens_text_list.append(sentence_tokens)
    print("END VOCAB CREATION MULTIPLE \n \n ")
    return vocab, sentence_tokens_text_list
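
# vocab_words_all_prefix: merges SentencePiece subword tokens back into words.
# A token starting with the '▁' prefix opens a new word; following tokens without
# the prefix are appended to it; special tokens (e.g. </s>) are stored on their own.
# Each word entry keeps the contributing token ids and the mean of their embeddings.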
def vocab_words_all_prefix(token_embeddings, model, sufix="@@", prefix='▁'):
    vocab = {}

    sentence_words_text_list = []
    if prefix:
        n_prefix = len(prefix)
        for input_sentences in token_embeddings:

            for sent_i, sentence in enumerate(input_sentences['tokens']):
                words_text_list = []

                word = ''
                tokens_ids = []
                embeddings = []
                ids_to_tokens = dict_tokenizer_tr[model].convert_ids_to_tokens(sentence)

                to_save = False
                for tok_i, token_text in enumerate(ids_to_tokens):
                    token_id = sentence[tok_i]
                    if token_text[:n_prefix] == prefix:
                        # A new word starts; flush the word accumulated so far.
                        if to_save:
                            vocab[word] = {
                                'word': word,
                                'text': word,
                                'count': 1,
                                'tokens_ids': tokens_ids,
                                'embed': np.mean(np.array(embeddings), 0).tolist()
                            }
                            words_text_list.append(word)

                        tokens_ids = [token_id]
                        embeddings = [input_sentences['embeddings'][sent_i][tok_i]]
                        word = token_text[n_prefix:]

                        to_save = True

                    else:
                        if (token_text in dict_tokenizer_tr[model].special_tokens_map.values()):
                            # Special token: flush the pending word, then store the
                            # special token as its own entry.
                            if to_save:
                                vocab[word] = {
                                    'word': word,
                                    'text': word,
                                    'count': 1,
                                    'tokens_ids': tokens_ids,
                                    'embed': np.mean(np.array(embeddings), 0).tolist()
                                }
                                words_text_list.append(word)

                            tokens_ids = [token_id]
                            embeddings = [input_sentences['embeddings'][sent_i][tok_i]]
                            vocab[token_text] = {
                                'word': token_text,
                                'count': 1,
                                'text': token_text,  # fixed: previously stored the preceding word's text
                                'tokens_ids': tokens_ids,
                                'embed': np.mean(np.array(embeddings), 0).tolist()
                            }
                            words_text_list.append(token_text)
                            to_save = False
                        else:
                            # Continuation subword: extend the current word.
                            to_save = True
                            word += token_text
                            tokens_ids.append(token_id)
                            embeddings.append(input_sentences['embeddings'][sent_i][tok_i])

                if to_save:
                    # Flush the last word of the sentence. (The stray
                    # `vocab[word] = tokens_ids` assignment that used to precede this
                    # check was removed: it made the membership test always true and
                    # broke the count update.)
                    if not (word in vocab):
                        vocab[word] = {
                            'word': word,
                            'count': 1,
                            'text': word,
                            'tokens_ids': tokens_ids,
                            'embed': np.mean(np.array(embeddings), 0).tolist()
                        }
                        words_text_list.append(word)
                    else:
                        vocab[word]['count'] = vocab[word]['count'] + 1
                sentence_words_text_list.append(words_text_list)

    return vocab, sentence_words_text_list
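
# --- FAISS indexes -----------------------------------------------------------
# The reference vocabularies are indexed with an IVF ("Voronoi") index: an
# IndexFlatL2 quantizer partitions the embedding space into nlist=5 cells, and
# queries are answered by scanning the closest cells.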
def create_index_voronoi(vocab):
    """
    Returns an IVF index over the token-level vocabulary and a metadata dict of ids.
    """
    nb_embds = []
    metadata = {}
    i_pos = 0
    for key_token, token_values in vocab.items():
        nb_embds.append(token_values['embed'])
        metadata[i_pos] = {'token': token_values['token'], 'text': token_values['text']}
        i_pos += 1

    xb = np.array(nb_embds).astype('float32')

    d = len(xb[0])

    nlist = 5
    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist)
    index.train(xb)
    index.add(xb)

    return index, metadata
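
# create_index_voronoi_words: same construction as above, but over the word-level
# vocabulary; the metadata additionally keeps the subword token ids of each word.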
def create_index_voronoi_words(vocab):
    """
    Returns an index of words and a metadata dict of ids.
    """
    nb_embds = []
    metadata = {}
    i_pos = 0
    for key_token, token_values in vocab.items():
        nb_embds.append(token_values['embed'])
        metadata[i_pos] = {'word': token_values['word'], 'tokens': token_values['tokens_ids'], 'text': token_values['text']}
        i_pos += 1

    xb = np.array(nb_embds).astype('float32')

    d = len(xb[0])

    nlist = 5
    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist)
    index.train(xb)
    index.add(xb)

    return index, metadata
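
# search_query_vocab / search_query_vocab_token: embed the query vocabulary as a
# float32 matrix and run index.search(xq, topk) against a reference index,
# returning the distance matrix D, the neighbour index matrix I, and metadata that
# maps query rows back to the words / tokens they came from.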
def search_query_vocab(index, vocab_queries, topk=10, limited_search=[]):
    """Search a word-level reference index with a vocabulary of query words.

    Args:
        index: FAISS index built over the reference word vocabulary.
        vocab_queries (dict): word-level vocab entries of the form
            {'word': ..., 'text': ..., 'count': ..., 'tokens_ids': [...], 'embed': [...]}.
        topk (int, optional): number of similar entries to retrieve. Defaults to 10.
        limited_search (list, optional): unused placeholder.

    Returns:
        tuple: distance matrix D, index matrix I, and metadata mapping query rows
        to their words and token ids.
    """
    nb_q_embds = []
    metadata = {}
    qi_pos = 0
    for key, token_values in vocab_queries.items():
        metadata[qi_pos] = {'word': token_values['word'], 'tokens': token_values['tokens_ids'], 'text': token_values['text']}
        qi_pos += 1
        nb_q_embds.append(token_values['embed'])

    xq = np.array(nb_q_embds).astype('float32')

    D, I = index.search(xq, topk)

    return D, I, metadata


def search_query_vocab_token(index, vocab_queries, topk=10, limited_search=[]):
    """Same as search_query_vocab, but the queries are a token-level vocabulary.

    Returns:
        tuple: distance matrix D, index matrix I, and metadata mapping query rows
        to their token ids.
    """
    nb_q_embds = []
    metadata = {}
    qi_pos = 0
    for key, token_values in vocab_queries.items():
        metadata[qi_pos] = {'token': token_values['token'], 'text': token_values['text']}
        qi_pos += 1
        nb_q_embds.append(token_values['embed'])

    xq = np.array(nb_q_embds).astype('float32')

    D, I = index.search(xq, topk)

    return D, I, metadata
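
# build_search: for a batch of query sentences, build token- and word-level query
# vocabularies, then look up their nearest neighbours in the reference indexes
# stored in `metadata_all[type]` ('input' = source side, 'output' = target side).
# The result bundles the raw FAISS output with per-query "similar_topk" lists.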
def build_search(query_embeddings, model, type="input"):
    global metadata_all

    vocab_queries, sentence_tokens_list = create_vocab_multiple(query_embeddings, model)
    words_vocab_queries, sentence_words_list = vocab_words_all_prefix(query_embeddings, model, sufix="@@", prefix="▁")

    index_vor_tokens = metadata_all[type]['tokens'][1]
    md_tokens = metadata_all[type]['tokens'][2]
    D, I, meta = search_query_vocab_token(index_vor_tokens, vocab_queries)

    qi_pos = 0
    similar_tokens = {}

    for dist, ind in zip(D, I):
        try:
            similar_tokens[str(meta[qi_pos]['token'])] = {
                'token': meta[qi_pos]['token'],
                'text': meta[qi_pos]['text'],
                "similar_topk": [md_tokens[i_index]['token'] for i_index in ind if (i_index != -1)],
                "distance": [dist[i] for (i, i_index) in enumerate(ind) if (i_index != -1)],
            }
        except:
            print("\n ERROR ", qi_pos, dist, ind)
        qi_pos += 1

    index_vor_words = metadata_all[type]['words'][1]
    md_words = metadata_all[type]['words'][2]

    Dw, Iw, metaw = search_query_vocab(index_vor_words, words_vocab_queries)

    qi_pos = 0
    similar_words = {}
    for dist, ind in zip(Dw, Iw):
        try:
            similar_words[str(metaw[qi_pos]['word'])] = {
                'word': metaw[qi_pos]['word'],
                'text': metaw[qi_pos]['word'],
                "similar_topk": [md_words[i_index]['word'] for i_index in ind if (i_index != -1)],
                "distance": [dist[i] for (i, i_index) in enumerate(ind) if (i_index != -1)],
            }
        except:
            print("\n ERROR ", qi_pos, dist, ind)
        qi_pos += 1

    return {'tokens': {'D': D, 'I': I, 'meta': meta, 'vocab_queries': vocab_queries, 'similar': similar_tokens, 'sentence_key_list': sentence_tokens_list},
            'words': {'D': Dw, 'I': Iw, 'meta': metaw, 'vocab_queries': words_vocab_queries, 'sentence_key_list': sentence_words_list, 'similar': similar_words}
            }
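
# build_reference: builds the reference side of the comparison once, from the
# "reference translation" sentences: token/word vocabularies plus their FAISS
# indexes and metadata, in the [vocab, index, metadata] layout expected elsewhere.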
def build_reference(all_embeddings, model):
    vocab, sentence_tokens = create_vocab_multiple(all_embeddings, model)
    words_vocab, sentences = vocab_words_all_prefix(all_embeddings, model, sufix="@@", prefix="▁")

    index_tokens, meta_tokens = create_index_voronoi(vocab)
    index_words, meta_words = create_index_voronoi_words(words_vocab)

    return {'tokens': [vocab, index_tokens, meta_tokens],
            'words': [words_vocab, index_words, meta_words]
            }
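
# embds_input_projection_vocab: projects every vocab embedding to 2D with t-SNE for
# the D3 plots. On failure (e.g. too few points) it falls back to (0, 0) coordinates
# so the UI still receives one row per entry: [x, y, token_or_word, text, running_index].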
def embds_input_projection_vocab(vocab, key="token"):
    t0 = time()

    nb_ids = []
    nb_embds = []
    nb_text = []
    tnse_error = []
    for _, token_values in vocab.items():
        tnse_error.append([0, 0])
        nb_ids.append(token_values[key])
        nb_text.append(token_values['text'])
        nb_embds.append(token_values['embed'])

    X = np.array(nb_embds).astype('float32')
    try:
        tsne = TSNE(random_state=0, n_iter=1000)
        tsne_results = tsne.fit_transform(X)

        tsne_results = np.c_[tsne_results, nb_ids, nb_text, range(len(nb_ids))]
    except:
        tsne_results = np.c_[tnse_error, nb_ids, nb_text, range(len(nb_ids))]

    t1 = time()
    print("t-SNE: %.2g sec" % (t1 - t0))
    print(tsne_results)

    return tsne_results.tolist()
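
# filtered_projection: restricts the t-SNE projection to the query vocabulary plus
# the reference entries that appeared in some "similar_topk" list, so the plot only
# shows points relevant to the submitted sentences. The plural 'words'/'tokens' key
# is singularized with key[:-1] before projecting.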
def filtered_projection(similar_key, vocab, type="input", key="word"):
    global metadata_all
    vocab_proj = vocab.copy()

    source_words_voc_similar = set()

    for key_i in similar_key:
        words_set = similar_key[key_i]
        source_words_voc_similar.update(words_set['similar_topk'])

    print(len(source_words_voc_similar))

    source_embeddings_filtered = {key_value: metadata_all[type][key][0][key_value] for key_value in source_words_voc_similar}
    vocab_proj.update(source_embeddings_filtered)

    try:
        result_TSNE = embds_input_projection_vocab(vocab_proj, key=key[:-1])
        dict_projected_embds_all = {str(embds[2]): [embds[0], embds[1], embds[2], embds[3], embds[4]] for embds in result_TSNE}
    except:
        print('TSNE error', type, key)
        dict_projected_embds_all = {}

    return dict_projected_embds_all
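
# first_function: callback for the "Reference Translation" button. It translates
# each input line, collects encoder/decoder embeddings, builds the reference
# vocabularies and FAISS indexes, and stores them in the global `metadata_all`.
# The returned pair feeds the hidden textbox and the JSON component used by the JS side.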
def first_function(w1, model):
    global metadata_all

    sentences = w1.split("\n")
    all_sentences = []
    translated_text = ''
    input_embeddings = []
    output_embeddings = []
    for sentence in sentences:
        params = translation_model(sentence, model)
        all_sentences.append(params)

        translated_text += params[0] + ' \n'
        input_embeddings.append({
            'embeddings': params[3].detach(),
            'tokens': params[2].tolist(),
        })
        output_embeddings.append({
            'embeddings': params[4].detach(),
            'tokens': params[1].sequences.tolist(),
        })

    result_input = build_reference(input_embeddings, model)
    result_output = build_reference(output_embeddings, model)

    metadata_all = {'input': result_input, 'output': result_output}

    return [translated_text, params]
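
# first_function_tr: callback for the "Translation" button. It translates the
# sentences to analyse, runs build_search against the reference indexes, projects
# the relevant embeddings with t-SNE, and returns a JSON payload
# (similar_queries, tnse coordinates, key_text_list) for the D3 visualisations.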
def first_function_tr(w1, model, var2={}):
    global metadata_all

    print("SEARCH -- ")
    sentences = w1.split("\n")
    all_sentences = []
    translated_text = ''
    input_embeddings = []
    output_embeddings = []
    for sentence in sentences:
        params = translation_model(sentence, model)
        all_sentences.append(params)

        translated_text += params[0] + ' \n'
        input_embeddings.append({
            'embeddings': params[3].detach(),
            'tokens': params[2].tolist(),
        })
        output_embeddings.append({
            'embeddings': params[4].detach(),
            'tokens': params[1].sequences.tolist(),
        })

    result_search = {}
    result_search['input'] = build_search(input_embeddings, model, type='input')
    result_search['output'] = build_search(output_embeddings, model, type='output')

    json_out = {'input': {'tokens': {}, 'words': {}}, 'output': {'tokens': {}, 'words': {}}}
    dict_projected = {}
    for type in ['input', 'output']:
        dict_projected[type] = {}
        for key in ['tokens', 'words']:
            similar_key = result_search[type][key]['similar']
            vocab = result_search[type][key]['vocab_queries']
            dict_projected[type][key] = filtered_projection(similar_key, vocab, type=type, key=key)
            json_out[type][key]['similar_queries'] = similar_key
            json_out[type][key]['tnse'] = dict_projected[type][key]
            json_out[type][key]['key_text_list'] = result_search[type][key]['sentence_key_list']

    return [translated_text, [json_out, json_out['output']['words'], json_out['output']['tokens']]]

from pathlib import Path

html = """
<html>
<script async src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.0/jquery.min.js"></script>
<script async data-require="d3@3.5.3" data-semver="3.5.3"
  src="//cdnjs.cloudflare.com/ajax/libs/d3/3.5.3/d3.js"></script>
<body>
  <div id="select_div">
    <select id="select_type" class="form-select" aria-label="select example" hidden>
      <option selected value="words">Words</option>
      <option value="tokens">Tokens</option>
    </select>
  </div>
  <div id="d3_embed_div">
    <div class="row">
      <div class="col-6">
        <div id="d3_embeds_input_words" class="d3_embed words"></div>
      </div>
      <div class="col-6">
        <div id="d3_embeds_output_words" class="d3_embed words"></div>
      </div>
      <div class="col-6">
        <div id="d3_embeds_input_tokens" class="d3_embed tokens"></div>
      </div>
      <div class="col-6">
        <div id="d3_embeds_output_tokens" class="d3_embed tokens"></div>
      </div>
    </div>
  </div>
  <div id="d3_graph_div">
    <div class="row">
      <div class="col-4">
        <div id="d3_graph_input_words" class="d3_graph words"></div>
      </div>
      <div class="col-4">
        <div id="similar_input_words" class=""></div>
      </div>
      <div class="col-4">
        <div id="d3_graph_output_words" class="d3_graph words"></div>
        <div id="similar_output_words" class="d3_graph words"></div>
      </div>
    </div>
    <div class="row">
      <div class="col-6">
        <div id="d3_graph_input_tokens" class="d3_graph tokens"></div>
        <div id="similar_input_tokens" class="d3_graph tokens"></div>
      </div>
      <div class="col-6">
        <div id="d3_graph_output_tokens" class="d3_graph tokens"></div>
        <div id="similar_output_tokens" class="d3_graph tokens"></div>
      </div>
    </div>
  </div>
</body>
</html>
"""

html0 = """
<html>
<script async src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.0/jquery.min.js"></script>
<script async data-require="d3@3.5.3" data-semver="3.5.3"
  src="//cdnjs.cloudflare.com/ajax/libs/d3/3.5.3/d3.js"></script>
<body>
  <div id="select_div">
    <select id="select_type" class="form-select" aria-label="select example" hidden>
      <option selected value="words">Words</option>
      <option value="tokens">Tokens</option>
    </select>
  </div>
</body>
</html>
"""

html_col1 = """
<div id="d3_graph_input_words" class="d3_graph words"></div>
<div id="d3_graph_input_tokens" class="d3_graph tokens"></div>
"""

html_col2 = """
<div id="similar_input_words" class=""></div>
<div id="similar_output_words" class=""></div>
<div id="similar_input_tokens" class=""></div>
<div id="similar_output_tokens" class=""></div>
"""

html_col3 = """
<div id="d3_graph_output_words" class="d3_graph words"></div>
<div id="d3_graph_output_tokens" class="d3_graph tokens"></div>
"""
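
# second_function: bridge callback fired when the hidden output textbox changes;
# the actual rendering happens in plotsjs.js, this just logs and acknowledges.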
def second_function(w1, j2):
    print("second_function -- after the js", w1, j2)
    return "transition to second js function finished."
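
# save_index: pickles the current `metadata_all` and writes each of the four FAISS
# indexes (input/output x tokens/words) to '<pair>_<side>_<granularity>.index'
# files, returning them as a gr.File download.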
paths = []


def save_index(model):
    names = []
    with open(model + '_metadata_ref.pkl', 'wb') as f:
        pickle.dump(metadata_all, f)
        names.append(model + '_metadata_ref.pkl')
    for type in ['tokens', 'words']:
        for kind in ['input', 'output']:
            name = model + "_" + kind + "_" + type + ".index"
            write_index(metadata_all[kind][type][1], name)
            names.append(name)
    print("in save index done")
    return gr.File(names)
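
# Gradio UI: the left column selects the language pair, the middle column submits
# the reference sentences (built once) and exposes the index files for download,
# and the right column submits the sentences to analyse. The gr.HTML blocks are
# placeholders that plotsjs.js fills with D3 scatter plots and similarity lists.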
with gr.Blocks(js="plotsjs.js") as demo:
    gr.Markdown(
        """
        # MAKE NMT Workshop \t `Embeddings representation`
        """)
    with gr.Row():
        with gr.Column(scale=1):
            model_radio_c = gr.Radio(choices=['en-es', 'en-zh', 'en-fr'], value="en-es", label='', container=False)

        with gr.Column(scale=2):
            gr.Markdown(
                """
                ### Reference Translation Sentences
                Enter at least 50 sentences to be used for comparison.
                This is submitted just once.
                """)
            in_text = gr.Textbox(lines=2, label="reference source text")
            out_text = gr.Textbox(label="reference target text", interactive=False)
            out_text2 = gr.Textbox(visible=False)
            var2 = gr.JSON(visible=False)
            btn = gr.Button("Reference Translation")

            save_index_btn = gr.Button("Generate index files to download")
            tab2_outputs = gr.File()
            input = tab2_outputs

        with gr.Column(scale=3):
            gr.Markdown(
                """
                ### Translation Sentences
                Sentences to be analysed.
                """)
            in_text_tr = gr.Textbox(lines=2, label="source text")
            out_text_tr = gr.Textbox(label="target text", interactive=False)
            out_text2_tr = gr.Textbox(visible=False)
            var2_tr = gr.JSON(visible=False)
            btn_faiss = gr.Button("Translation")
            gr.Button("Download", link="/file=en-es_input_tokens.index")

    with gr.Row():
        with gr.Column(scale=1):
            input_mic = gr.HTML(html0)
            input_html2 = gr.HTML(html_col2)

        with gr.Column(scale=2):
            input_html1 = gr.HTML(html_col1)

        with gr.Column(scale=2):
            input_html3 = gr.HTML(html_col3)

    btn.click(first_function, [in_text, model_radio_c], [out_text, var2], js="(in_text,model_radio_c) => testFn_out(in_text,model_radio_c)")
    btn_faiss.click(first_function_tr, [in_text_tr, model_radio_c], [out_text_tr, var2_tr], js="(in_text_tr,model_radio_c) => testFn_out(in_text_tr,model_radio_c)")

    out_text.change(second_function, [out_text, var2], out_text2, js="(out_text,var2) => testFn_out_json(var2)")
    out_text_tr.change(second_function, [out_text_tr, var2_tr], out_text2_tr, js="(out_text_tr,var2_tr) => testFn_out_json_tr(var2_tr)")
    save_index_btn.click(save_index, [model_radio_c], [tab2_outputs])


if __name__ == "__main__":
    demo.launch(allowed_paths=["./", ".", "/"])