# Nahuatl2Spanish / app.py
import streamlit as st
import json
from urllib.request import urlopen
from thefuzz import fuzz
from keras_transformer import get_model, decode
from keras.utils.data_utils import get_file
####################################################################################################
# FUNCTIONS
def search_fit(word, data, threshold=50, fraction=2/3):
    # This also works for n-word phrases: just strip the spaces between
    # words before comparing.
    target = ''
    original = ''
    best_score = 0
    for item in data.keys():
        for i in range(len(data[item])):
            data_item = data[item][i].replace(' ', '')
            score = fuzz.ratio(word, data_item)
            # Keep the best-scoring match whose length stays within
            # [fraction, 1/fraction] of the query's length.
            if (score > best_score and score >= threshold
                    and fraction * len(word) <= len(data_item) <= len(word) / fraction):
                best_score = score
                target = item
                original = data_item
    return target, best_score, original
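# Example (hypothetical entries, assuming the {spanish: [nahuatl variants]}
# shape described below for nah_es.json):
#   search_fit('atl', {'agua': ['atl'], 'casa': ['calli']})
#   -> ('agua', 100, 'atl')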
def find_longest_phrase(data):
    # Word count of the longest phrase stored in the dictionary values.
    biggest_len = max([max([len(data[item][i].split()) for i in range(len(data[item]))]) for item in data.keys()])
    return biggest_len
def create_tuples(sample_list, tuple_size):
    # Index tuples for every window of `tuple_size` consecutive words.
    tuple_list = [tuple(i + j for j in range(tuple_size))
                  for i in range(len(sample_list) - tuple_size + 1)]
    return tuple_list
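# Example: create_tuples(['in', 'atl', 'in', 'tepetl'], 2)
#          -> [(0, 1), (1, 2), (2, 3)]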
# TODO: replace the combination-based search with something that generates
# cyclic permutations.
def make_translation(transcription, data, threshold=50, fraction=2/3):
    # To set limits for the comparison size
    data_len = find_longest_phrase(data)
    transcription_len = len(transcription.split())
    biggest_len = min(data_len, transcription_len)
    # To get the best translation given a phrase
    index_translation = list(range(transcription_len))
    translation_dict = {}
    translation = transcription
    transcription_split = transcription.split()
    for i in range(biggest_len, 0, -1):
        # Fuzzy-match every window of i consecutive words; spaces are
        # stripped to mirror the comparison inside search_fit.
        translation_dict.update({
            combination: search_fit(''.join(transcription_split[combination[0]:combination[-1] + 1]),
                                    data, threshold, fraction)
            for combination in create_tuples(transcription_split, i)
        })
        # Consume the matches greedily, longest windows first.
        # TODO: the search could be improved by prioritizing the highest
        # score instead of sequential order.
        for combination in create_tuples(transcription_split, i):
            # Use this window only if none of its word slots was consumed yet.
            clear_index = all(index_translation[item] is not None for item in combination)
            if clear_index and translation_dict[combination][1] > threshold:
                taken = False
                translation_split = translation.split()
                for number, word in enumerate(translation_split):
                    if number in combination:
                        if not taken:
                            # Hyphenate multi-word targets so they stay a
                            # single token across the split()/join() round
                            # trips; hyphens turn back into spaces at the end.
                            translation_split[number] = '-'.join(translation_dict[combination][0].split())
                            taken = True
                        else:
                            translation_split[number] = '<>'
                translation = ' '.join(translation_split)
                index_translation = [item if item not in combination else None
                                     for item in index_translation]
    # Restore spaces in hyphenated phrases, drop the placeholders, and
    # collapse the leftover double spaces.
    return translation.replace('-', ' ').replace('<>', '').replace('  ', ' ').replace('  ', ' ').strip()
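# Example (hypothetical dictionary entry): with
#   data = {'buenos dias': ['cualli tonalli']}
# make_translation('cualli tonalli', data) matches the space-stripped
# two-word window 'cuallitonalli' and returns 'buenos dias'.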
def remover(my_string=""):
    # Remove every character that is not in the `values` whitelist
    # (module-level, defined before the UI below).
    for item in my_string:
        if item not in values:
            my_string = my_string.replace(item, "")
    return my_string
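# Example: remover('¿atl, tepetl?') -> 'atl tepetl'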
def translate(oracion, model):
    # The hybrid dictionary pass is currently disabled:
    # sentence = make_translation(oracion.strip().lower(), dictionary, threshold=90, fraction=4/5)
    sentence = oracion[:]
    sentence_tokens = [tokens + ['<END>', '<PAD>'] for tokens in [sentence.split(' ')]]
    # Map words to ids, falling back to <UNK> for out-of-vocabulary tokens.
    tr_input = [[source_token_dict.get(x, source_token_dict['<UNK>']) for x in tokens]
                for tokens in sentence_tokens][0]
    decoded = decode(
        model,
        tr_input,
        start_token = target_token_dict['<START>'],
        end_token = target_token_dict['<END>'],
        pad_token = target_token_dict['<PAD>']
    )
    # Drop <START>/<END> and map the ids back to words.
    return ' '.join(map(lambda x: target_token_dict_inv[x], decoded[1:-1]))
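# Note: keras_transformer's decode() with its default settings amounts to
# greedy decoding and returns the predicted id sequence including the start
# and end markers, hence the decoded[1:-1] slice above.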
####################################################################################################
# MAIN APP
path_dict = 'https://huggingface.co/spaces/gilesitorr/Nahuatl2Spanish/raw/main/Dictionaries/'
response = urlopen(path_dict+'uncased_tokens_pretrained.json')
source_token_dict = json.loads(response.read())
target_token_dict = source_token_dict.copy()
response = urlopen(path_dict+'uncased_tokens_inv_pretrained.json')
target_token_dict_inv = json.loads(response.read())
target_token_dict_inv = {int(k): v for k,v in target_token_dict_inv.items()}
response = urlopen(path_dict+'nah_es.json')
dictionary = json.loads(response.read())
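# Assumed shapes of the downloaded files: the token dicts map token -> id
# (the inverse one is re-cast above because JSON object keys are always
# strings), and nah_es.json maps a Spanish phrase to a list of Nahuatl
# variants, as consumed by search_fit/make_translation.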
model = get_model(
token_num = max(len(source_token_dict),len(target_token_dict)),
embed_dim = 256,
encoder_num = 2,
decoder_num = 2,
head_num = 32,
hidden_dim = 2048,
dropout_rate = 0.1,
use_same_embed = False,
)
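# These hyperparameters have to mirror the checkpoint loaded below;
# load_weights() fails on any architecture mismatch. token_num takes the
# larger vocabulary because use_same_embed=False gives the encoder and
# decoder separate embedding tables.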
path_model = 'https://huggingface.co/spaces/gilesitorr/Nahuatl2Spanish/resolve/main/Models/'
filename = path_model + 'uncased_translator_nahuatl2espanol+hybrid.h5'
# Download the checkpoint (cached by Keras) and load it into the model.
weights_path = get_file(
    '.././model.h5',
    filename)
model.load_weights(weights_path)
values = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ")
text = st.text_area('Escriba una frase a traducir: ')  # "Enter a phrase to translate:"
if text:
    out = translate(remover(text.lower()), model)
    st.text(out)
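# To try the app locally (assuming the dependencies above are installed):
#   streamlit run app.py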