Spaces:

guymorlan
/

English2ShamiDialect

Running

App Files Files Community

English2ShamiDialect / app.py

guymorlan

Update app.py

fff6c5b over 2 years ago

raw

history blame contribute delete

13.6 kB

	import gradio as gr
	from transformers import pipeline, MarianMTModel, AutoTokenizer
	import os
	import azure.cognitiveservices.speech as speechsdk
	import matplotlib.pyplot as plt
	import numpy as np

	dialects = {"Palestinian/Jordanian": "P", "Syrian": "S", "Lebanese": "L", "Egyptian": "E"}

	# translator_en2ar = pipeline(task="translation", model="guymorlan/English2Dialect")
	translator_en2ar = MarianMTModel.from_pretrained("guymorlan/English2Dialect", output_attentions=True)
	tokenizer_en2ar = AutoTokenizer.from_pretrained("guymorlan/English2Dialect")
	translator_ar2en = MarianMTModel.from_pretrained("guymorlan/Shami2English", output_attentions=True)
	tokenizer_ar2en = AutoTokenizer.from_pretrained("guymorlan/Shami2English")
	transliterator = pipeline(task="translation", model="guymorlan/DialectTransliterator")

	speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY'), region=os.environ.get('SPEECH_REGION'))

	def generate_diverging_colors(num_colors, palette='Set3'): # courtesy of ChatGPT
	# Generate a colormap with a specified number of colors
	cmap = plt.cm.get_cmap(palette, num_colors)

	# Get the RGB values of the colors in the colormap
	colors_rgb = cmap(np.arange(num_colors))

	# Convert the RGB values to hexadecimal color codes
	colors_hex = [format(int(color[0]255)<<16\|int(color[1]255)<<8\|int(color[2]*255), '06x') for color in colors_rgb]

	return colors_hex


	def align_words(outputs, tokenizer, encoder_input_ids, decoder_input_ids, threshold=0.4, skip_first_src=True):
	alignment = []
	for i, tok in enumerate(outputs.cross_attentions[2][0][7]):
	alignment.append([[i], (tok > threshold).nonzero().squeeze(-1).tolist()])

	merged = []
	for i in alignment:
	token = tokenizer.convert_ids_to_tokens([decoder_input_ids[0][i[0]]])[0]
	if token not in tokenizer.convert_tokens_to_ids(["</s>", "<pad>", "<unk>"]):
	if merged:
	tomerge = False
	# check overlap with previous entry
	for x in i[1]:
	if x in merged[-1][1]:# or tokenizer.convert_ids_to_tokens([encoder_input_ids[0][x]])[0][0] != "▁":
	tomerge = True
	break
	# if first character is not a "▁"
	if token[0] != "▁":
	tomerge = True
	if tomerge:
	merged[-1][0] += i[0]
	merged[-1][1] += i[1]
	else:
	merged.append(i)
	else:
	merged.append(i)

	colordict = {}
	ncolors = 0
	for i in merged:
	src_tok = [f"src_{x}" for x in i[0]]
	trg_tok = [f"trg_{x}" for x in i[1]]
	all_tok = src_tok + trg_tok
	# see if any tokens in entry already have associated color
	newcolor = None
	for t in all_tok:
	if t in colordict:
	newcolor = colordict[t]
	break
	if not newcolor:
	newcolor = ncolors
	ncolors += 1
	for t in all_tok:
	if t not in colordict:
	colordict[t] = newcolor

	colors = generate_diverging_colors(ncolors, palette="Set2")
	id_to_color = {i: c for i, c in enumerate(colors)}
	for k, v in colordict.items():
	colordict[k] = id_to_color[v]


	tgthtml = []
	for i, token in enumerate(decoder_input_ids[0]):
	if f"src_{i}" in colordict:
	label = f"src_{i}"
	tgthtml.append(f"<span style='color: #{colordict[label]}'>{tokenizer.convert_ids_to_tokens([token])[0]}</span>")
	else:
	tgthtml.append(f"<span style='color: --color-text-body'>{tokenizer.convert_ids_to_tokens([token])[0]}</span>")
	tgthtml = "".join(tgthtml)
	tgthtml = tgthtml.replace("▁", " ")
	tgthtml = f"<span style='font-size: 30px'>{tgthtml}</span>"

	srchtml = []
	for i, token in enumerate(encoder_input_ids[0]):
	if skip_first_src and i == 0:
	continue
	if f"trg_{i}" in colordict:
	label = f"trg_{i}"
	srchtml.append(f"<span style='color: #{colordict[label]}'>{tokenizer.convert_ids_to_tokens([token])[0]}</span>")
	else:
	srchtml.append(f"<span style='color: --color-text-body'>{tokenizer.convert_ids_to_tokens([token])[0]}</span>")
	srchtml = "".join(srchtml)
	srchtml = srchtml.replace("▁", " ")
	srchtml = f"<span style='font-size: 30px'>{srchtml}</span>"
	return srchtml, tgthtml

	def translate_english(input_text, include):
	if not input_text:
	return "", "", "", "", ""

	inputs = [f"{val} {input_text}" for val in dialects.values()]

	sy, lb, eg = "SYR" in include, "LEB" in include, "EGY" in include
	# remove 2nd element if sy is false
	if not eg:
	inputs.pop()
	if not lb:
	inputs.pop()
	if not sy:
	inputs.pop()

	input_tokens = tokenizer_en2ar(inputs, return_tensors="pt").input_ids
	# print(input_tokens)
	outputs = translator_en2ar.generate(input_tokens)
	# print(outputs)

	encoder_input_ids = input_tokens[0].unsqueeze(0)
	decoder_input_ids = outputs[0].unsqueeze(0)


	decoded = tokenizer_en2ar.batch_decode(outputs, skip_special_tokens=True)
	# print(decoded)
	pal_out = decoded[0]
	sy_out = decoded[1] if sy else ""
	lb_out = decoded[1 + sy] if lb else ""
	eg_out = decoded[1 + sy + lb] if eg else ""

	if "Colorize" in include:
	html_outputs = translator_en2ar(input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids)

	# set dynamic threshold
	# print(input_tokens, input_tokens.shape)
	if input_tokens.shape[1] < 10:
	threshold = 0.4
	elif input_tokens.shape[1] < 20:
	threshold = 0.10
	else:
	threshold = 0.05

	print("threshold", threshold)

	srchtml, tgthtml = align_words(html_outputs, tokenizer_en2ar, encoder_input_ids, decoder_input_ids, threshold)
	palhtml = f"{srchtml}<br><br><div style='direction: rtl'>{tgthtml}</div>"
	else:
	palhtml = f"<div style='font-size: 30px; direction: rtl'>{pal_out}</div>"


	return palhtml, pal_out, sy_out, lb_out, eg_out

	def translate_arabic(input_text, include=["Colorize"]):
	if not input_text:
	return ""

	input_tokens = tokenizer_ar2en(input_text, return_tensors="pt").input_ids
	# print(input_tokens)
	outputs = translator_ar2en.generate(input_tokens)
	# print(outputs)

	encoder_input_ids = input_tokens[0].unsqueeze(0)
	decoder_input_ids = outputs[0].unsqueeze(0)

	decoded = tokenizer_en2ar.batch_decode(outputs, skip_special_tokens=True)
	# print(decoded)

	print(include)
	if "Colorize" in include:
	html_outputs = translator_ar2en(input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids)

	# set dynamic threshold
	# print(input_tokens, input_tokens.shape)
	if input_tokens.shape[1] < 20:
	threshold = 0.1
	elif input_tokens.shape[1] < 30:
	threshold = 0.01
	else:
	threshold = 0.05

	print("threshold", threshold)

	srchtml, tgthtml = align_words(html_outputs, tokenizer_ar2en, encoder_input_ids, decoder_input_ids, threshold, skip_first_src=False)
	enhtml = f"<div style='direction: rtl'>{srchtml}</div><br><br><div>{tgthtml}</div>"
	else:
	enhtml = f"<div style='font-size: 30px;'>{decoded[0]}</div>"

	return enhtml

	def get_audio(input_text):
	audio_config = speechsdk.audio.AudioOutputConfig(filename=f"{input_text}.wav")
	speech_config.speech_synthesis_voice_name='ar-SY-AmanyNeural'
	speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
	speech_synthesis_result = speech_synthesizer.speak_text_async(input_text).get()
	return f"{input_text}.wav"

	def get_transliteration(input_text, include=["Translit."]):
	if "Translit." not in include:
	return ""
	result = transliterator([input_text])
	return result[0]["translation_text"]


	bla = """
	"""
	css = """
	#liter textarea, #trans textarea { font-size: 25px;}
	#trans textarea { direction: rtl; }
	#check { border-style: none !important; }
	:root {--button-secondary-background-focus: #2563eb !important;
	--button-secondary-background-base: #2563eb !important;
	--button-secondary-background-hover: linear-gradient(to bottom right, #0692e8, #5859c2);
	--button-secondary-text-color-base: white !important;
	--button-secondary-text-color-hover: white !important;
	--button-secondary-background-focus: rgb(51 122 216 / 70%) !important;
	--button-secondary-text-color-focus: white !important}
	.dark {--button-secondary-background-base: #2563eb !important;
	--button-secondary-background-focus: rgb(51 122 216 / 70%) !important;
	--button-secondary-background-hover: linear-gradient(to bottom right, #0692e8, #5859c2)}
	.feather-music { stroke: #2563eb; }
	"""

	def toggle_visibility(include):
	outs = [gr.Textbox.update(visible=True)] * 4
	if "Translit." not in include:
	outs[0] = gr.Textbox.update(visible=False)
	if "SYR" not in include:
	outs[1] = gr.Textbox.update(visible=False)
	if "LEB" not in include:
	outs[2] = gr.Textbox.update(visible=False)
	if "EGY" not in include:
	outs[3] = gr.Textbox.update(visible=False)

	return outs

	with gr.Blocks(title = "Levantine Arabic Translator", css=css, theme="default") as demo:
	gr.HTML("<h2><span style='color: #2563eb'>Levantine Arabic</span> Translator</h2>")
	with gr.Tab('En > Ar'):
	with gr.Row():
	with gr.Column():
	input_text = gr.Textbox(label="Input", placeholder="Enter English text", lines=2)
	gr.Examples(["I wanted to go to the store yesterday, but it rained", "How are you feeling today?"], input_text)
	btn = gr.Button("Translate", label="Translate")
	with gr.Row():
	include = gr.CheckboxGroup(["Translit.", "SYR", "LEB", "EGY", "Colorize"],
	label="Disable features to speed up translation",
	value=["Translit.", "EGY", "Colorize"])
	gr.Markdown("Built by [Guy Mor-Lan](mailto:guy.mor@mail.huji.ac.il). Pronunciation model is specifically tailored to urban Palestinian Arabic. Text-to-speech uses Microsoft Azure's API and may provide different result from the transliterated pronunciation.")

	with gr.Column():
	with gr.Box(label = "Palestinian"):
	gr.Markdown("Palestinian")
	with gr.Box():
	pal_html = gr.HTML("<br>", visible=True, label="Palestinian", elem_id="main")
	pal = gr.Textbox(lines=1, label="Palestinian", elem_id="trans", visible=False)
	pal_translit = gr.Textbox(lines=1, label="Palestinian Pronunciation (Urban)", elem_id="liter")
	sy = gr.Textbox(lines=1, label="Syrian", elem_id="trans", visible=False)
	lb = gr.Textbox(lines=1, label="Lebanese", elem_id="trans", visible=False)
	eg = gr.Textbox(lines=1, label="Egyptian", elem_id="trans")
	# with gr.Row():
	audio = gr.Audio(label="Audio - Palestinian", interactive=False)
	audio_button = gr.Button("Get Audio", label="Click Here to Get Audio")
	audio_button.click(get_audio, inputs=[pal], outputs=[audio])
	btn.click(translate_english,inputs=[input_text, include], outputs=[pal_html, pal, sy, lb, eg], api_name="en2ar", _js="function jump(x, y){document.getElementById('main').scrollIntoView(); return [x, y];}")
	input_text.submit(translate_english, inputs=[input_text, include], outputs=[pal_html, pal, sy, lb, eg],scroll_to_output=True)
	pal.change(get_transliteration, inputs=[pal, include], outputs=[pal_translit]);
	include.change(toggle_visibility, inputs=[include], outputs=[pal_translit, sy, lb, eg])

	with gr.Tab('Ar > En'):
	with gr.Row():
	with gr.Column():
	input_text = gr.Textbox(label="Input", placeholder="Enter Levantine Arabic text", lines=1, elem_id="trans")
	gr.Examples(["خلينا ندور على مطعم تاني", "قديش حق البندورة؟"], input_text)
	btn = gr.Button("Translate", label="Translate")
	gr.Markdown("Built by [Guy Mor-Lan](mailto:guy.mor@mail.huji.ac.il).")
	with gr.Column():
	with gr.Box(label = "English"):
	gr.Markdown("English")
	with gr.Box():
	eng = gr.HTML("<br>", label="English", elem_id="main")
	btn.click(translate_arabic,inputs=input_text, outputs=[eng], api_name = "ar2en")

	with gr.Tab("Transliterate"):
	with gr.Row():
	with gr.Column():
	input_text = gr.Textbox(label="Input", placeholder="Enter Levantine Arabic text", lines=1)
	gr.Examples(["خلينا ندور على مطعم تاني", "قديش حق البندورة؟"], input_text)
	btn = gr.Button("Transliterate", label="Transliterate")
	gr.Markdown("Built by [Guy Mor-Lan](mailto:guy.mor@mail.huji.ac.il)")
	with gr.Column():
	translit = gr.Textbox(label="Transliteration", lines=1, elem_id="liter")
	btn.click(get_transliteration, inputs=input_text, outputs=[translit])


	demo.launch()