Spaces:
Runtime error
Runtime error
| import streamlit as st | |
| import json | |
| from urllib.request import urlopen | |
| from thefuzz import fuzz | |
| from itertools import combinations | |
| from keras_transformer import get_model, decode | |
| #################################################################################################### | |
| # FUNCTIONS | |
def search_fit(word, data, threshold=50, fraction=2/3):
    """Find the dictionary key whose variant is most similar to ``word``.

    Every variant stored under every key of ``data`` has its spaces removed
    and is scored against ``word`` with ``fuzz.ratio``. Because the variants
    are compared without spaces, a phrase of several words can be looked up
    by passing its words concatenated without spaces.

    Args:
        word: Text to look up.
        data: Mapping from a key to a sequence of variant strings.
        threshold: Minimum score a variant needs in order to be accepted.
        fraction: Length tolerance; an accepted variant has a length between
            ``fraction * len(word)`` and ``len(word) / fraction``.

    Returns:
        A ``(target, best_score, original)`` tuple: the key that owns the
        best variant, the score it reached, and the space-stripped variant.
        When no variant qualifies the result is ``('', 0, '')``.
    """
    best = ('', 0, '')
    word_len = len(word)
    for key, variants in data.items():
        for variant in variants:
            candidate = variant.replace(' ', '')
            score = fuzz.ratio(word, candidate)
            # Only a strictly higher score replaces the current best, so the
            # first variant encountered wins ties.
            if score <= best[1] or score < threshold:
                continue
            if fraction * word_len <= len(candidate) <= word_len / fraction:
                best = (key, score, candidate)
    return best
def find_longest_phrase(data):
    """Return the word count of the longest phrase stored in ``data``.

    Args:
        data: Mapping from a key to a sequence of phrase strings.

    Returns:
        The largest number of whitespace-separated words found in any phrase,
        or 0 when ``data`` is empty or holds no phrases at all.
    """
    # default=0 keeps an empty dictionary (or keys with empty phrase lists)
    # from raising ValueError inside max().
    return max(
        (len(phrase.split()) for phrases in data.values() for phrase in phrases),
        default=0,
    )
def create_tuples(sample_list, tuple_size):
    """List every window of ``tuple_size`` consecutive indexes of a sequence.

    Args:
        sample_list: Sequence whose length bounds the indexes.
        tuple_size: Number of consecutive indexes in each window.

    Returns:
        A list of tuples ``(i, i + 1, ..., i + tuple_size - 1)``, one per
        start position ``i`` from 0 to ``len(sample_list) - tuple_size``.
        The list is empty when the sequence is shorter than ``tuple_size``.
    """
    window_count = len(sample_list) - tuple_size + 1
    return [
        tuple(range(first, first + tuple_size))
        for first in range(window_count)
    ]
# TODO: replace the combinations function with something that produces cyclic permutations.
def make_translation(transcription, data, threshold=50, fraction=2/3):
    """Replace the words of ``transcription`` with their dictionary matches.

    Each word is looked up with ``search_fit``. When the score of the best
    match is strictly greater than ``threshold`` the word is replaced by the
    matching dictionary key; otherwise the word is kept as it is.

    Args:
        transcription: Text to translate; it is split on whitespace.
        data: Mapping from a key to a sequence of variant strings, as
            expected by ``search_fit``.
        threshold: Score that a match has to exceed to be used.
        fraction: Length tolerance forwarded to ``search_fit``.

    Returns:
        The translated text with single spaces between words. Hyphens and
        ``<>`` markers are removed from the result, as they are used
        internally as placeholders.
    """
    words = transcription.split()
    # One output slot per source word, so indexes stay aligned even when a
    # match is a multi-word key.
    translated = list(words)
    # Indexes not yet consumed by a match. A set is used because 0 is a valid
    # word index and cannot double as the "already taken" marker.
    free = set(range(len(words)))

    # Window sizes to try, longest first. Only single-word windows are used,
    # exactly as in the previous hard-coded range(1, 0, -1) loop.
    # NOTE(review): widening this to
    # range(min(find_longest_phrase(data), len(words)), 0, -1) would enable
    # multi-word matches -- confirm the intent before changing it.
    window_sizes = (1,)

    for size in window_sizes:
        # TODO: the search could be improved by prioritising the highest
        # score instead of taking windows in sequential order.
        for window in create_tuples(words, size):
            if not free.issuperset(window):
                continue
            # Dictionary variants are compared without spaces, so the words
            # of the window are concatenated.
            phrase = ''.join(words[window[0]:window[-1] + 1])
            target, score, _ = search_fit(phrase, data, threshold, fraction)
            if score <= threshold:
                continue
            first, *rest = window
            # Join the words (not the characters) of a multi-word key so it
            # occupies a single slot until the final clean-up.
            translated[first] = '-'.join(target.split())
            for index in rest:
                translated[index] = '<>'
            free.difference_update(window)

    joined = ' '.join(translated).replace('-', ' ').replace('<>', '')
    # split()/join collapses any run of whitespace and trims both ends.
    return ' '.join(joined.split())
def remover(my_string = ""):
    """Return ``my_string`` without the characters missing from ``values``.

    ``values`` is the module-level collection of allowed characters; the
    relative order of the characters that are kept does not change.
    """
    return "".join(char for char in my_string if char in values)
def translate(oracion, model):
    """Translate a sentence with the transformer model.

    The sentence is split into tokens, ``'<END>'`` and ``'<PAD>'`` are
    appended, and each token is encoded with ``source_token_dict`` (tokens
    that are not in the dictionary use the ``'<UNK>'`` id). The ids are
    decoded with ``decode`` and mapped back to text through
    ``target_token_dict_inv``.

    Args:
        oracion: Sentence to translate.
        model: Model passed to ``decode``.

    Returns:
        The decoded tokens joined by single spaces.
    """
    # Dictionary-based pre-translation with make_translation is currently
    # disabled; the sentence goes to the model as given.
    # split() instead of split(' '): repeated, leading or trailing spaces
    # would otherwise create empty tokens that are encoded as '<UNK>'.
    tokens = oracion.split() + ['<END>', '<PAD>']
    tr_input = [
        source_token_dict[token]
        if token in source_token_dict
        else source_token_dict['<UNK>']
        for token in tokens
    ]
    decoded = decode(
        model,
        tr_input,
        start_token=target_token_dict['<START>'],
        end_token=target_token_dict['<END>'],
        pad_token=target_token_dict['<PAD>'],
    )
    # The first and last decoded ids are dropped (presumably the start and
    # end markers -- confirm against keras_transformer.decode).
    return ' '.join(target_token_dict_inv[token_id] for token_id in decoded[1:-1])
| #################################################################################################### | |
| # MAIN APP | |
path_dict = 'https://huggingface.co/spaces/gilesitorr/Nahuatl2Spanish/raw/main/Dictionaries/'

# Token -> id vocabulary; the same mapping is used for source and target.
# Each download is wrapped in ``with`` so the HTTP response is always closed.
with urlopen(path_dict+'uncased_tokens_pretrained.json') as response:
    source_token_dict = json.loads(response.read())
target_token_dict = source_token_dict.copy()

# Inverse vocabulary used to turn decoded ids back into tokens.
with urlopen(path_dict+'uncased_tokens_inv_pretrained.json') as response:
    target_token_dict_inv = json.loads(response.read())
# JSON object keys are strings; convert them to the integer ids that are
# looked up after decoding.
target_token_dict_inv = {int(k): v for k, v in target_token_dict_inv.items()}

# Word dictionary; translate() does not use it while the make_translation
# pre-processing step is disabled.
with urlopen(path_dict+'nah_es.json') as response:
    dictionary = json.loads(response.read())

model = get_model(
    token_num = max(len(source_token_dict), len(target_token_dict)),
    embed_dim = 256,
    encoder_num = 2,
    decoder_num = 2,
    head_num = 32,
    hidden_dim = 2048,
    dropout_rate = 0.1,
    use_same_embed = False,
)

from keras.utils.data_utils import get_file
path_model = 'https://huggingface.co/spaces/gilesitorr/Nahuatl2Spanish/resolve/main/Models/'
filename = path_model+'uncased_translator_nahuatl2espanol+hybrid.h5'
# Download the weights file and load it into the model.
weights_path = get_file(
    '.././model.h5',
    filename)
model.load_weights(weights_path)

# Characters that remover() keeps; everything else is deleted from the input.
values = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ")

text = st.text_area('Escriba una frase a traducir: ')
if text:
    # Turn newlines, tabs and repeated spaces into single spaces before
    # filtering: remover() deletes any whitespace other than ' ', which
    # would otherwise fuse the words on both sides of a line break.
    cleaned = remover(' '.join(text.lower().split()))
    out = translate(cleaned, model)
    st.text(out)