Spaces:

muhammadayman
/

data_science_content_en_to_ar

Runtime error

App Files Files

xet

Community

data_science_content_en_to_ar / app.py

muhammadayman

Update app.py

67bb639 almost 3 years ago

raw

history blame contribute delete

3.11 kB

	import sys
	import gradio as gr
	from transformers import AutoTokenizer
	import torch
	import json, re
	# Tokenizer for the "Helsinki-NLP/opus-mt-en-ar" checkpoint.
	tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
	# Load the serialized model object from disk, mapping its tensors to the CPU.
	# NOTE(review): torch.load unpickles the file, so "helsinki_fineTuned.pt"
	# must come from a trusted source.
	model = torch.load("helsinki_fineTuned.pt", map_location=torch.device('cpu'))
	# Presumably a torch.nn.Module: switch it to evaluation (inference) mode.
	model.eval()

	# Open Keywords
	# Read the keyword table from 'merged.json'. The `with` statement closes
	# the file on exit, so no explicit close() call is needed.
	with open('merged.json', encoding='utf8') as merged:
	    data = json.load(merged)

	# Index the table by lower-cased key so lookups can ignore letter case.
	keyword_map = {en.lower(): value for en, value in data.items()}

	# Look a phrase up in the keyword table loaded from the file
	def getKeywords(word):
	    """Return the mapped value for ``word`` and whether a mapping was found.

	    The lookup key is ``word`` lower-cased with trailing whitespace removed.

	    Returns:
	        ``(mapped_value, True)`` when ``keyword_map`` holds a truthy value
	        for the key, otherwise ``(word, False)`` with ``word`` unchanged.
	    """
	    normalized = word.lower().rstrip()
	    mapped = keyword_map.get(normalized)
	    if not mapped:
	        # Missing key or an empty/falsy entry: hand the input back untouched.
	        return word, False
	    return mapped, True

	# Replace keywords with the translation
	def final_output(text):
	    """Substitute known keywords in ``text`` with their mapped values.

	    Runs of Latin letters (which may also contain spaces, hyphens and
	    parentheses, and are at least two characters long) are extracted from
	    ``text`` and looked up with ``getKeywords``. A run that is found is
	    replaced by ``"<mapped> (<original run>)"``; a run that is not found is
	    put back unchanged.

	    Args:
	        text: The string to post-process.

	    Returns:
	        The pieces of ``text`` around each run, each followed by its
	        replacement, joined with single spaces.
	    """
	    # Raw string: in a plain literal "\-" is an invalid escape sequence,
	    # which Python reports with a warning. The compiled pattern is the same.
	    pattern = re.compile(r'[a-zA-Z][ \-()a-zA-Z]+')
	    keywords = pattern.findall(text)
	    # The pattern has no capture groups, so split() yields exactly
	    # len(keywords) + 1 pieces: the text before, between and after the runs.
	    pieces = pattern.split(text)
	    for i, keyword in enumerate(keywords):
	        word, found = getKeywords(keyword)
	        if found:
	            pieces[i] += word + " (" + keyword.rstrip() + ")"
	        else:
	            pieces[i] += word
	    return ' '.join(pieces)



	def translate(input):
	    """Translate ``input`` one '.'-separated segment at a time.

	    Each non-blank segment is tokenized, passed to ``model.generate``,
	    decoded, and post-processed with ``final_output``. Blank segments (for
	    example the empty string left after a trailing '.') are passed through
	    untouched instead of being sent to the model.

	    Args:
	        input: The text to translate.

	    Returns:
	        The processed segments joined back together with '.'.
	    """
	    translation = []
	    for segment in input.split('.'):
	        if not segment.strip():
	            # Nothing to translate; keep the piece so the final join puts
	            # the '.' separators back where they were.
	            translation.append(segment)
	            continue
	        # Calling the tokenizer directly replaces the deprecated
	        # prepare_seq2seq_batch helper; padding/truncation mirror that
	        # helper's defaults.
	        batch = tokenizer(
	            segment,
	            return_tensors='pt',
	            padding='longest',
	            truncation=True,
	        )
	        generated = model.generate(**batch)
	        text_ar = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
	        translation.append(final_output(text_ar))
	    return ".".join(translation)
	# Wrap up function
	# Build the Gradio interface around translate(): one multi-line text input,
	# one text output, and two example passages.
	# NOTE(review): `gr.inputs.Textbox` and a boolean `allow_flagging` look like
	# older Gradio API usage - confirm against the Gradio version this app runs on.
	translate_interface = gr.Interface(fn = translate,
	allow_flagging = True,
	# Directory passed to Gradio for flagged samples.
	flagging_dir = 'flagged/logs',
	title = 'Translating "English Data Science" content into Arabic',
	inputs=gr.inputs.Textbox(lines = 7, label = 'English content'),
	outputs="text",
	# Each example is a one-element list because translate() takes a single input.
	examples = [
	['In the last few years the RNN-based architectures have shown the best performance in machine translation problems, but still they have some problems that had to be solved. First, they have a difficulty to cope with long-range dependencies (also LSTM when it has to deal with really long sentences). Secondly, each hidden state depends on the previous one which impossible to parallelize and makes it inefficient on GPUs.']
	,
	['Data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from noisy, structured and unstructured data,[1][2] and apply knowledge and actionable insights from data across a broad range of application domains. Data science is related to data mining, machine learning and big data.']
	]
	)

	# Start the app; this runs at import time because there is no __main__ guard.
	translate_interface.launch()