|
import os |
|
import spaces |
|
import gradio as gr |
|
import torch |
|
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer |
|
import sentencepiece as spm |
|
import ctranslate2 |
|
import transformers |
|
from nltk import sent_tokenize |
|
|
|
|
|
sp = spm.SentencePieceProcessor() |
|
|
|
|
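# CTranslate2 model directories and SentencePiece models for the fine-tuned mBART systems (English->Myanmar and Myanmar->English).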
|
mbart_enmy_ct_model_path = "mbart25enmy_ct2/" |
|
mbart_enmy_sp_model_path = "mbart25enmy_ct2/sentence.bpe.model" |
|
|
|
mbart_myen_ct_model_path = "mbart25-myen_ct2/" |
|
mbart_myen_sp_model_path = "mbart25-myen_ct2/sentence.bpe.model" |
|
|
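# Fine-tuned mT5 model converted to CTranslate2, plus the directory holding its Hugging Face tokenizer.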
|
mt5_ct_model_path = "mt5-ct2/" |
|
mt5_sp_model_path = "mt5-base/" |
|
|
|
|
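# SentencePiece source/target models and CTranslate2 model directories for the proposed Transformer systems.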
|
trans_sp_source_enmy_path = "enmy_ctranslate2/source.model" |
|
trans_sp_target_enmy_path = "enmy_ctranslate2/target.model" |
|
|
|
trans_sp_source_myen_path = "myen_ctranslate2/source.model" |
|
trans_sp_target_myen_path = "myen_ctranslate2/target.model" |
|
|
|
trans_enmy_ct_model_path = "enmy_ctranslate2/" |
|
trans_myen_ct_model_path = "myen_ctranslate2/" |
|
|
|
|
|
|
|
|
|
|
|
|
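# Segment Myanmar text by piping it through the external myseg.py script and returning the segmented output.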
|
def segment_sentence(source): |
|
input_file = "input.txt" |
|
output_file = "output.txt" |
|
with open(input_file, "w", encoding="utf-8") as file: |
|
file.write(source) |
|
os.system("python myseg.py < input.txt > output.txt") |
|
|
|
|
|
|
|
|
|
|
|
|
|
with open(output_file, "r", encoding="utf-8") as file: |
|
segmented_content = file.read() |
|
|
|
|
|
|
|
print(segmented_content) |
|
return segmented_content |
|
|
|
|
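# Write Myanmar text to a file, segment it with myseg.py, and read the segmented result back.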
|
def write_to_file_myanmar(source): |
|
input_file = "write-input.txt" |
|
output_file = "read-output.txt" |
|
with open(input_file, "w", encoding="utf-8") as file: |
|
file.write(source) |
|
|
|
os.system("python myseg.py < write-input.txt > read-output.txt") |
|
|
|
|
|
|
|
|
|
|
|
|
|
with open(output_file, "r", encoding="utf-8") as file: |
|
segmented_contents = file.read() |
|
|
|
|
|
|
|
return segmented_contents |
|
|
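# English input needs no segmentation; this simply round-trips the text through a file.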
|
def write_to_file_english(source): |
|
input_file = "write-input.txt" |
|
|
|
with open(input_file, "w", encoding="utf-8") as file: |
|
file.write(source) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
with open(input_file, "r", encoding="utf-8") as file: |
|
segmented_contents = file.read() |
|
|
|
|
|
|
|
return segmented_contents |
|
|
|
|
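# Translate with the proposed Transformer model: write the input to a file, tokenize it with
# SentencePiece, translate the subworded sentences with CTranslate2, then detokenize and return the result.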
|
def call_model_transformer(source, direction_trans):
    # Default to an empty result so the final return is safe when no translation runs.
    segmented_contents = ""
|
if direction_trans == "English to Myanmar": |
|
ct_model_path = "enmy_ctranslate2/" |
|
sp_source_model_path = "enmy_ctranslate2/source.model" |
|
sp_target_model_path = "enmy_ctranslate2/target.model" |
|
if source == "" : |
|
gr.Warning("Please Enter English Text") |
|
else: |
|
|
|
source_file_path = "write-input.txt" |
|
target_file_path = "read-output.txt" |
|
|
|
|
|
sp = spm.SentencePieceProcessor() |
|
sp.load(sp_source_model_path) |
|
|
|
|
|
with open(source_file_path, "w", encoding="utf-8") as file: |
|
file.write(source) |
|
|
|
|
|
with open(source_file_path, "r", encoding="utf-8") as source_file:
    lines = source_file.readlines()
|
|
|
source_sents = [line.strip() for line in lines] |
|
|
|
|
|
source_sents_subworded = sp.encode_as_pieces(source_sents) |
|
|
|
|
|
translator = ctranslate2.Translator(ct_model_path, device="cpu") |
|
translations = translator.translate_batch(source_sents_subworded, batch_type="tokens", max_batch_size=4096) |
|
translations = [translation.hypotheses[0] for translation in translations] |
|
|
|
|
|
sp.load(sp_target_model_path) |
|
|
|
|
|
translations_desubword = sp.decode(translations) |
|
|
|
|
|
|
|
with open(target_file_path, "w+", encoding="utf-8") as target: |
|
for line in translations_desubword: |
|
target.write(line.strip() + "\n") |
|
|
|
|
|
|
|
with open(target_file_path, "r", encoding="utf-8") as file: |
|
segmented_contents = file.read() |
|
|
|
|
|
elif direction_trans == "Myanmar to English": |
|
ct_model_path = "myen_ctranslate2/" |
|
sp_source_model_path = "myen_ctranslate2/source.model" |
|
sp_target_model_path = "myen_ctranslate2/target.model" |
|
if source == "":
|
gr.Warning("Please Enter Myanmar Text") |
|
else: |
|
|
|
source_file_path = "write-input.txt" |
|
target_file_path = "read-output.txt" |
|
|
|
|
|
sp = spm.SentencePieceProcessor() |
|
sp.load(sp_source_model_path) |
|
|
|
|
|
with open(source_file_path, "w", encoding="utf-8") as file: |
|
file.write(source) |
|
|
|
|
|
with open(source_file_path, "r", encoding="utf-8") as source_file:
    lines = source_file.readlines()
|
|
|
source_sents = [line.strip() for line in lines] |
|
|
|
|
|
source_sents_subworded = sp.encode_as_pieces(source_sents) |
|
|
|
|
|
translator = ctranslate2.Translator(ct_model_path, device="cpu") |
|
translations = translator.translate_batch(source_sents_subworded, batch_type="tokens", max_batch_size=4096) |
|
translations = [translation.hypotheses[0] for translation in translations] |
|
|
|
|
|
sp.load(sp_target_model_path) |
|
|
|
|
|
translations_desubword = sp.decode(translations) |
|
|
|
|
|
|
|
with open(target_file_path, "w+", encoding="utf-8") as target: |
|
for line in translations_desubword: |
|
target.write(line.strip() + "\n") |
|
|
|
|
|
|
|
with open(target_file_path, "r", encoding="utf-8") as file: |
|
segmented_contents = file.read() |
|
|
|
|
|
else:
    gr.Warning("Please Select Language Direction")
|
|
|
|
|
return segmented_contents |
|
|
|
|
|
def translate_trans_myen(source, translator, sp_source_model, sp_target_model): |
|
"""Use CTranslate model to translate a sentence |
|
Args: |
|
source (str): Source sentences to translate |
|
translator (object): Object of Translator, with the CTranslate2 model |
|
tokenizer (object): Object of SentencePieceProcessor, with the SentencePiece source model |
|
Returns: |
|
Translation of the source text |
|
""" |
|
source = segment_sentence(source) |
|
source_sentences = sent_tokenize(source) |
|
source_tokenized = sp_source_model.encode(source_sentences, out_type=str) |
|
translations = translator.translate_batch(source_tokenized, replace_unknowns=True) |
|
translations = [translation[0]["tokens"] for translation in translations] |
|
translations = sp_target_model.decode(translations) |
|
|
|
return translations |
|
|
|
def translate_trans_enmy(source, translator, sp_source_model, sp_target_model): |
|
"""Use CTranslate model to translate a sentence |
|
Args: |
|
source (str): Source sentences to translate |
|
translator (object): Object of Translator, with the CTranslate2 model |
|
tokenizer (object): Object of SentencePieceProcessor, with the SentencePiece source model |
|
Returns: |
|
Translation of the source text |
|
""" |
|
source_sentences = sent_tokenize(source) |
|
source_tokenized = sp_source_model.encode(source_sentences, out_type=str) |
|
translations = translator.translate_batch(source_tokenized, replace_unknowns=True) |
|
translations = [translation[0]["tokens"] for translation in translations] |
|
translations_detokenized = sp_target_model.decode(translations) |
|
|
|
return translations_detokenized |
|
def translate_mt5_myen(source, translator, tokenizer): |
|
"""Use CTranslate model to translate a sentence |
|
Args: |
|
source (str): Source sentences to translate |
|
translator (object): Object of Translator, with the CTranslate2 model |
|
tokenizer (object): Object of SentencePieceProcessor, with the SentencePiece source model |
|
Returns: |
|
Translation of the source text |
|
""" |
|
source = segment_sentence(source) |
|
input_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(source)) |
|
results = translator.translate_batch([input_tokens]) |
|
output_tokens = results[0].hypotheses[0] |
|
translations = tokenizer.decode(tokenizer.convert_tokens_to_ids(output_tokens)) |
|
return translations |
|
|
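# Dispatch to the fine-tuned mT5 model for the selected translation direction.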
|
def call_model_mt5(source, direction_mt5):
    # Default to an empty result so the final return is safe when no translation runs.
    translation = ""
|
if direction_mt5 == "English to Myanmar": |
|
translator = ctranslate2.Translator(mt5_ct_model_path) |
|
tokenizer = transformers.AutoTokenizer.from_pretrained(mt5_sp_model_path) |
|
if source == "" : |
|
gr.Warning("Please Enter English Text") |
|
else: |
|
translation = translate_mt5_enmy(source, translator, tokenizer) |
|
elif direction_mt5 == "Myanmar to English": |
|
translator = ctranslate2.Translator(mt5_ct_model_path) |
|
tokenizer = transformers.AutoTokenizer.from_pretrained(mt5_sp_model_path) |
|
if source == "" : |
|
gr.Warning("Please Enter Myanmar Text") |
|
else: |
|
translation = translate_mt5_myen(source, translator, tokenizer) |
|
else:
    gr.Warning("Please Select Language Direction")
|
return translation |
|
|
|
def translate_mt5_enmy(source, translator, tokenizer): |
|
"""Use CTranslate model to translate a sentence |
|
Args: |
|
source (str): Source sentences to translate |
|
translator (object): Object of Translator, with the CTranslate2 model |
|
tokenizer (object): Object of SentencePieceProcessor, with the SentencePiece source model |
|
Returns: |
|
Translation of the source text |
|
""" |
|
input_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(source)) |
|
results = translator.translate_batch([input_tokens]) |
|
output_tokens = results[0].hypotheses[0] |
|
translations = tokenizer.decode(tokenizer.convert_tokens_to_ids(output_tokens)) |
|
return translations |
|
|
|
def translate_mbart_myen(source, translator, sp_model): |
|
"""Use CTranslate model to translate a sentence |
|
Args: |
|
source (str): Source sentences to translate |
|
translator (object): Object of Translator, with the CTranslate2 model |
|
sp_model (object): Object of SentencePieceProcessor, with the SentencePiece source model |
|
Returns: |
|
Translation of the source text |
|
""" |
|
source = segment_sentence(source) |
|
|
|
source_tokenized = sp_model.encode(source, out_type=str) |
|
|
|
source_tokenized = ["[my_MM]"] + source_tokenized |
|
|
|
target_prefix = ["[en_XX]"] |
|
translations = translator.translate_batch([source_tokenized],target_prefix= [target_prefix]) |
|
|
|
translations = sp_model.decode(translations[0].hypotheses[0][1:]) |
|
|
|
|
|
return translations |
|
|
|
def translate_mbart_enmy(source, translator, sp_model): |
|
"""Use CTranslate model to translate a sentence |
|
Args: |
|
source (str): Source sentences to translate |
|
translator (object): Object of Translator, with the CTranslate2 model |
|
sp_model (object): Object of SentencePieceProcessor, with the SentencePiece source model |
|
Returns: |
|
Translation of the source text |
|
""" |
|
|
|
|
|
source_tokenized = sp_model.encode(source, out_type=str) |
|
|
|
source_tokenized = ["[en_XX]"] + source_tokenized |
|
|
|
target_prefix = ["[my_MM]"] |
|
translations = translator.translate_batch([source_tokenized],target_prefix= [target_prefix]) |
|
|
|
translations = sp_model.decode(translations[0].hypotheses[0][1:]) |
|
|
|
|
|
return translations |
|
|
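# Dispatch to the fine-tuned mBART model for the selected translation direction.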
|
def call_model(source, direction):
    # Default to an empty result so the final return is safe when no translation runs.
    translation = ""
|
if direction == "English to Myanmar": |
|
translator = ctranslate2.Translator(mbart_enmy_ct_model_path) |
|
sp_model = spm.SentencePieceProcessor(mbart_enmy_sp_model_path) |
|
if source == "" : |
|
gr.Warning("Please Enter English Text") |
|
else: |
|
translation = translate_mbart_enmy(source, translator, sp_model) |
|
elif direction == "Myanmar to English": |
|
translator = ctranslate2.Translator(mbart_myen_ct_model_path) |
|
sp_model = spm.SentencePieceProcessor(mbart_myen_sp_model_path) |
|
if source == "" : |
|
gr.Warning("Please Enter Myanmar Text") |
|
else: |
|
translation = translate_mbart_myen(source, translator, sp_model) |
|
else:
    gr.Warning("Please Select Language Direction")
|
return translation |
|
|
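# Custom CSS tweaks for the Gradio interface.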
|
css = """ |
|
#warning {background-color: #FFCCCB} |
|
.feedback textarea {font-size: 24px !important} |
|
#text_button1 {background: gray;} |
|
.translate {font-size: 5px !important;} |
|
.translate {width: 150px !important;} |
|
#img .img {width: 30px; height: 400px;} |
|
#img1 input, textarea, select {font-weight: bold; color:blue !important;}
|
.tab button.selected{ |
|
font-size: 36px !important; |
|
font-weight: bold; |
|
color:blue !important; |
|
} |
|
""" |
|
def clear_mbart(): |
|
return "", "" |
|
|
|
def clear_mm_to_wa(): |
|
return "", "", "", "" |
|
import base64 |
|
|
|
theme = 'gstaff/whiteboard' |
|
|
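# Build the Gradio Blocks interface: a header row plus one tab per model.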
|
demo = gr.Blocks(css=css, theme=gr.themes.Soft(), title="Machine Translation between Myanmar and English")
|
|
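# Embed the local logo as a base64 data URI so it can be shown inside an HTML header.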
|
with open("logo.png", "rb") as image_file: |
|
encoded_string = base64.b64encode(image_file.read()).decode() |
|
width, height = 80, 80 |
|
|
|
html_content = f'<img src="data:image/png;base64,{encoded_string}" alt="NLP Logo" align="left" width="{width}" height="{height}"/>'
|
|
|
with demo: |
|
|
|
with gr.Row(equal_height=True): |
|
with gr.Column(scale=1, min_width=150): |
|
gr.HTML(html_content) |
|
with gr.Column(scale=25, min_width=150): |
|
gr.Markdown("<div style='text-align: center;font-weight: bold;font-size: 32px; color:blue'>Transformer-based Neural Machine Traslation between Myanmar and English Languages Translator</div>") |
|
with gr.Column(scale=1, min_width=150): |
|
gr.HTML("<img src='https://www.ucsy.edu.mm/img/ucsylogo.png' alt='UCSY Logo' width='100' height='100' align='right'/>") |
|
|
|
|
|
with gr.Tabs(elem_classes=["tab"]): |
|
|
|
with gr.TabItem("Proposed Transformer"): |
|
with gr.Row(): |
|
direction_trans = gr.Dropdown(["English to Myanmar", "Myanmar to English"] ,label="Select Direction") |
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
input_to_translate_trans = gr.Textbox("", label="Enter Input Text", lines=5) |
|
|
|
|
|
|
|
translate_btn = gr.Button(value="Translate") |
|
|
|
with gr.Column(): |
|
translated_text_trans = gr.Textbox(value="", label="Translated Text", lines=5) |
|
gr.ClearButton([input_to_translate_trans,translated_text_trans]) |
|
|
|
|
|
|
|
|
|
examples = gr.Examples(examples=["I went to the supermarket yesterday.", "αα¬αααΊαααΊαΈαα²ααΎα¬αα½ααΊαα»α°αα¬α‘αααΊαααΊαα―αΆαΈααΎααααΊα"],
|
inputs=[input_to_translate_trans]) |
|
translate_btn.click(call_model_transformer, inputs=[input_to_translate_trans, direction_trans], outputs=translated_text_trans, api_name="translate-transformer")
|
|
|
|
|
|
|
|
|
|
|
with gr.Tabs(elem_classes=["tab"]): |
|
|
|
with gr.TabItem("Fine-Tuned MBART"): |
|
with gr.Row(): |
|
direction = gr.Dropdown(["English to Myanmar", "Myanmar to English"] ,label="Select Direction") |
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
input_to_translate = gr.Textbox("", label="Enter Input Text", lines=5) |
|
|
|
|
|
|
|
translate_btn = gr.Button(value="Translate") |
|
|
|
with gr.Column(): |
|
translated_text = gr.Textbox(value="", label="Translated Text", lines=5) |
|
gr.ClearButton([input_to_translate,translated_text]) |
|
|
|
|
|
|
|
|
|
examples = gr.Examples(examples=["I went to the supermarket yesterday.", "αα¬αααΊαααΊαΈαα²ααΎα¬αα½ααΊαα»α°αα¬α‘αααΊαααΊαα―αΆαΈααΎααααΊα"],
|
inputs=[input_to_translate]) |
|
translate_btn.click(call_model, inputs=[input_to_translate, direction], outputs=translated_text, api_name="translate-mbart")
|
|
|
|
|
|
|
|
|
|
|
|
|
with gr.Tabs(elem_classes=["tab"]): |
|
|
|
with gr.TabItem("Fine-Tuned MT5"): |
|
with gr.Row(): |
|
direction_mt5 = gr.Dropdown(["English to Myanmar", "Myanmar to English"] ,label="Select Direction") |
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
input_to_translate_mt5 = gr.Textbox("", label="Enter Input Text", lines=5) |
|
|
|
|
|
|
|
translate_btn = gr.Button(value="Translate") |
|
|
|
with gr.Column(): |
|
translated_text_mt5 = gr.Textbox(value="", label="Translated Text", lines=5) |
|
gr.ClearButton([input_to_translate_mt5,translated_text_mt5]) |
|
|
|
|
|
|
|
|
|
examples = gr.Examples(examples=["I went to the supermarket yesterday.", "αα¬αααΊαααΊαΈαα²ααΎα¬αα½ααΊαα»α°αα¬α‘αααΊαααΊαα―αΆαΈααΎααααΊα"],
|
inputs=[input_to_translate_mt5]) |
|
translate_btn.click(call_model_mt5, inputs=[input_to_translate_mt5, direction_mt5], outputs=translated_text_mt5, api_name="translate-mt5")
|
|
|
demo.launch() |