# mT5-new / app.py
import os
import spaces
import gradio as gr #gr.load("models/NgalNgal/mT5-new").launch()
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import sentencepiece as spm
import ctranslate2
import transformers
from nltk import sent_tokenize
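# nltk.sent_tokenize needs the Punkt tokenizer data ("punkt"; newer NLTK
# releases also ship "punkt_tab"). Fetch whichever is missing at startup; this
# assumes the host can reach the NLTK download server -- pre-bundle the data
# in the repo if it cannot.
import nltk
for _pkg in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_pkg}")
    except LookupError:
        nltk.download(_pkg)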
# Load the source SentencePiece model
sp = spm.SentencePieceProcessor()
mbart_enmy_ct_model_path = "mbart25enmy_ct2/"
mbart_enmy_sp_model_path = "mbart25enmy_ct2/sentence.bpe.model"
mbart_myen_ct_model_path = "mbart25-myen_ct2/"
mbart_myen_sp_model_path = "mbart25-myen_ct2/sentence.bpe.model"
mt5_ct_model_path = "mt5-ct2/"
mt5_sp_model_path = "mt5-base/"
trans_sp_source_enmy_path = "enmy_ctranslate2/source.model"
trans_sp_target_enmy_path = "enmy_ctranslate2/target.model"
trans_sp_source_myen_path = "myen_ctranslate2/source.model"
trans_sp_target_myen_path = "myen_ctranslate2/target.model"
trans_enmy_ct_model_path = "enmy_ctranslate2/"
trans_myen_ct_model_path = "myen_ctranslate2/"
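# Layout assumption: each *_ct2 / *_ctranslate2 directory holds a converted
# CTranslate2 model together with the matching SentencePiece model(s), and
# mt5-base/ holds the Hugging Face tokenizer files for the fine-tuned mT5 model.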
# Word-segment Myanmar text by piping it through myseg.py via temp files in the
# working directory (myseg.py reads stdin and writes the segmented text to stdout).
def segment_sentence(source):
input_file = "input.txt"
output_file = "output.txt"
with open(input_file, "w", encoding="utf-8") as file:
file.write(source)
os.system("python myseg.py < input.txt > output.txt")
with open(output_file, "r", encoding="utf-8") as file:
segmented_content = file.read()
#segmented_sentence = segmented_content.replace("|", " ")
print(segmented_content)
return segmented_content
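# Optional sketch, not wired into the UI: the same myseg.py round trip without
# temp files, piping the text through the segmenter directly. Like the
# os.system calls in this file, it assumes myseg.py reads stdin and writes the
# segmented text to stdout.
def segment_sentence_pipe(source):
    import subprocess  # local import keeps the sketch self-contained
    result = subprocess.run(
        ["python", "myseg.py"],
        input=source, capture_output=True, text=True, check=True,
    )
    return result.stdout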
# Write Myanmar text to a temp file, segment it with myseg.py, and read the result back.
def write_to_file_myanmar(source):
input_file = "write-input.txt"
output_file = "read-output.txt"
with open(input_file, "w", encoding="utf-8") as file:
file.write(source)
os.system("python myseg.py < write-input.txt > read-output.txt")
with open(output_file, "r", encoding="utf-8") as file:
segmented_contents = file.read()
#source_sents = [segmented_content.strip() for segmented_content in segmented_contents]
return segmented_contents
# Write English text to a temp file and read it straight back (English needs no segmentation).
def write_to_file_english(source):
input_file = "write-input.txt"
#output_file = "read-output.txt"
with open(input_file, "w", encoding="utf-8") as file:
file.write(source)
with open(input_file, "r", encoding="utf-8") as file:
segmented_contents = file.read()
#source_sents = [segmented_content.strip() for segmented_content in segmented_contents]
return segmented_contents
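# NOTE: write_to_file_myanmar / write_to_file_english are kept for reference but
# are not called by the Gradio handlers below; call_model_transformer does its
# own file I/O.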
# Translate with the "Proposed Transformer" CTranslate2 models in either
# direction, using SentencePiece subwording and a file-based round trip.
def call_model_transformer(source, direction_trans):
    segmented_contents = ""  # returned unchanged when the input is empty or no direction is selected
if direction_trans == "English to Myanmar":
ct_model_path = "enmy_ctranslate2/"
sp_source_model_path = "enmy_ctranslate2/source.model"
sp_target_model_path = "enmy_ctranslate2/target.model"
if source == "" :
gr.Warning("Please Enter English Text")
else:
#Set file paths
source_file_path = "write-input.txt"
target_file_path = "read-output.txt"
            # Load the source SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load(sp_source_model_path)
# write source to file
with open(source_file_path, "w", encoding="utf-8") as file:
file.write(source)
# Open the source file
with open(source_file_path, "r") as source:
lines = source.readlines()
source_sents = [line.strip() for line in lines]
# Subword the source sentences
source_sents_subworded = sp.encode_as_pieces(source_sents)
# Translate the source sentences
translator = ctranslate2.Translator(ct_model_path, device="cpu") # or "cuda" for GPU
translations = translator.translate_batch(source_sents_subworded, batch_type="tokens", max_batch_size=4096)
translations = [translation.hypotheses[0] for translation in translations]
            # Load the target SentencePiece model
sp.load(sp_target_model_path)
# Desubword the target sentences
translations_desubword = sp.decode(translations)
            # Save the translations to a file
with open(target_file_path, "w+", encoding="utf-8") as target:
for line in translations_desubword:
target.write(line.strip() + "\n")
#print("Done")
with open(target_file_path, "r", encoding="utf-8") as file:
segmented_contents = file.read()
elif direction_trans == "Myanmar to English":
ct_model_path = "myen_ctranslate2/"
sp_source_model_path = "myen_ctranslate2/source.model"
sp_target_model_path = "myen_ctranslate2/target.model"
        if source == "":
gr.Warning("Please Enter Myanmar Text")
else:
#Set file paths
source_file_path = "write-input.txt"
target_file_path = "read-output.txt"
            # Load the source SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load(sp_source_model_path)
# write source to file
with open(source_file_path, "w", encoding="utf-8") as file:
file.write(source)
# Open the source file
with open(source_file_path, "r") as source:
lines = source.readlines()
source_sents = [line.strip() for line in lines]
# Subword the source sentences
source_sents_subworded = sp.encode_as_pieces(source_sents)
# Translate the source sentences
translator = ctranslate2.Translator(ct_model_path, device="cpu") # or "cuda" for GPU
translations = translator.translate_batch(source_sents_subworded, batch_type="tokens", max_batch_size=4096)
translations = [translation.hypotheses[0] for translation in translations]
            # Load the target SentencePiece model
sp.load(sp_target_model_path)
# Desubword the target sentences
translations_desubword = sp.decode(translations)
            # Save the translations to a file
with open(target_file_path, "w+", encoding="utf-8") as target:
for line in translations_desubword:
target.write(line.strip() + "\n")
#print("Done")
with open(target_file_path, "r", encoding="utf-8") as file:
segmented_contents = file.read()
else: gr.Warning("Please Select Language Direction")
return segmented_contents
def translate_trans_myen(source, translator, sp_source_model, sp_target_model):
"""Use CTranslate model to translate a sentence
Args:
source (str): Source sentences to translate
translator (object): Object of Translator, with the CTranslate2 model
tokenizer (object): Object of SentencePieceProcessor, with the SentencePiece source model
Returns:
Translation of the source text
"""
source = segment_sentence(source)
source_sentences = sent_tokenize(source) # split sentences
source_tokenized = sp_source_model.encode(source_sentences, out_type=str)
translations = translator.translate_batch(source_tokenized, replace_unknowns=True)
translations = [translation[0]["tokens"] for translation in translations]
translations = sp_target_model.decode(translations)
return translations
def translate_trans_enmy(source, translator, sp_source_model, sp_target_model):
"""Use CTranslate model to translate a sentence
Args:
source (str): Source sentences to translate
translator (object): Object of Translator, with the CTranslate2 model
tokenizer (object): Object of SentencePieceProcessor, with the SentencePiece source model
Returns:
Translation of the source text
"""
source_sentences = sent_tokenize(source) # split sentences
source_tokenized = sp_source_model.encode(source_sentences, out_type=str)
translations = translator.translate_batch(source_tokenized, replace_unknowns=True)
translations = [translation[0]["tokens"] for translation in translations]
translations_detokenized = sp_target_model.decode(translations)
return translations_detokenized
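# Usage sketch for translate_trans_enmy / translate_trans_myen (they are not
# called by the UI; call_model_transformer reimplements the same pipeline with
# files). The paths are the module-level constants defined above:
#   translator = ctranslate2.Translator(trans_enmy_ct_model_path, device="cpu")
#   sp_src = spm.SentencePieceProcessor(trans_sp_source_enmy_path)
#   sp_tgt = spm.SentencePieceProcessor(trans_sp_target_enmy_path)
#   print(translate_trans_enmy("I went to the supermarket yesterday.", translator, sp_src, sp_tgt))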
def translate_mt5_myen(source, translator, tokenizer):
"""Use CTranslate model to translate a sentence
Args:
source (str): Source sentences to translate
translator (object): Object of Translator, with the CTranslate2 model
tokenizer (object): Object of SentencePieceProcessor, with the SentencePiece source model
Returns:
Translation of the source text
"""
source = segment_sentence(source)
input_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(source))
results = translator.translate_batch([input_tokens])
output_tokens = results[0].hypotheses[0]
translations = tokenizer.decode(tokenizer.convert_tokens_to_ids(output_tokens))
return translations
# Dispatch mT5 translation (CTranslate2 backend + Hugging Face tokenizer) based
# on the selected direction.
def call_model_mt5(source, direction_mt5):
    translation = ""  # returned unchanged when the input is empty or no direction is selected
if direction_mt5 == "English to Myanmar":
translator = ctranslate2.Translator(mt5_ct_model_path)
tokenizer = transformers.AutoTokenizer.from_pretrained(mt5_sp_model_path)
if source == "" :
gr.Warning("Please Enter English Text")
else:
translation = translate_mt5_enmy(source, translator, tokenizer)
elif direction_mt5 == "Myanmar to English":
translator = ctranslate2.Translator(mt5_ct_model_path)
tokenizer = transformers.AutoTokenizer.from_pretrained(mt5_sp_model_path)
if source == "" :
gr.Warning("Please Enter Myanmar Text")
else:
translation = translate_mt5_myen(source, translator, tokenizer)
else: gr.Warning("Please Select Language Direction")
return translation
def translate_mt5_enmy(source, translator, tokenizer):
"""Use CTranslate model to translate a sentence
Args:
source (str): Source sentences to translate
translator (object): Object of Translator, with the CTranslate2 model
tokenizer (object): Object of SentencePieceProcessor, with the SentencePiece source model
Returns:
Translation of the source text
"""
input_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(source))
results = translator.translate_batch([input_tokens])
output_tokens = results[0].hypotheses[0]
translations = tokenizer.decode(tokenizer.convert_tokens_to_ids(output_tokens))
return translations
def translate_mbart_myen(source, translator, sp_model):
"""Use CTranslate model to translate a sentence
Args:
source (str): Source sentences to translate
translator (object): Object of Translator, with the CTranslate2 model
sp_model (object): Object of SentencePieceProcessor, with the SentencePiece source model
Returns:
Translation of the source text
"""
source = segment_sentence(source)
# source_sentences = sent_tokenize(source)
source_tokenized = sp_model.encode(source, out_type=str)
# print("print 1" , source_tokenized)
source_tokenized = ["[my_MM]"] + source_tokenized
# print("print " + source_tokenized)
target_prefix = ["[en_XX]"]
translations = translator.translate_batch([source_tokenized],target_prefix= [target_prefix])
# translations = [translation[0]["tokens"] for translation in translations]
translations = sp_model.decode(translations[0].hypotheses[0][1:])
# translation = " ".join(translations_detokenized)
return translations
def translate_mbart_enmy(source, translator, sp_model):
"""Use CTranslate model to translate a sentence
Args:
source (str): Source sentences to translate
translator (object): Object of Translator, with the CTranslate2 model
sp_model (object): Object of SentencePieceProcessor, with the SentencePiece source model
Returns:
Translation of the source text
"""
# source_sentences = sent_tokenize(source)
source_tokenized = sp_model.encode(source, out_type=str)
# print("print 1" , source_tokenized)
source_tokenized = ["[en_XX]"] + source_tokenized
# print("print " + source_tokenized)
target_prefix = ["[my_MM]"]
translations = translator.translate_batch([source_tokenized],target_prefix= [target_prefix])
# translations = [translation[0]["tokens"] for translation in translations]
translations = sp_model.decode(translations[0].hypotheses[0][1:])
# translation = " ".join(translations_detokenized)
return translations
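# Note on the two mBART helpers above: the source-language token ("[en_XX]" or
# "[my_MM]") is prepended to the subworded input, the target-language token is
# passed as target_prefix, and that prefix comes back as the first token of the
# hypothesis, which is why hypotheses[0][1:] is decoded.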
# Dispatch mBART translation based on the selected direction.
def call_model(source, direction):
    translation = ""  # returned unchanged when the input is empty or no direction is selected
if direction == "English to Myanmar":
translator = ctranslate2.Translator(mbart_enmy_ct_model_path)
sp_model = spm.SentencePieceProcessor(mbart_enmy_sp_model_path)
if source == "" :
gr.Warning("Please Enter English Text")
else:
translation = translate_mbart_enmy(source, translator, sp_model)
elif direction == "Myanmar to English":
translator = ctranslate2.Translator(mbart_myen_ct_model_path)
sp_model = spm.SentencePieceProcessor(mbart_myen_sp_model_path)
if source == "" :
gr.Warning("Please Enter Myanmar Text")
else:
translation = translate_mbart_myen(source, translator, sp_model)
else: gr.Warning("Please Select Language Direction")
return translation
css = """
#warning {background-color: #FFCCCB}
.feedback textarea {font-size: 24px !important}
#text_button1 {background: gray;}
.translate {font-size: 5px !important;}
.translate {width: 150px !important;}
#img .img {width: 30px; height: 400px;}
#img1 input, textarea, select {font-weight: bold; color:blue !important;}
.tab button.selected{
font-size: 36px !important;
font-weight: bold;
color:blue !important;
}
"""
def clear_mbart():
return "", ""
def clear_mm_to_wa():
return "", "", "", ""
import base64
theme = 'gstaff/whiteboard'  # currently unused; gr.themes.Soft() is passed to gr.Blocks below
demo = gr.Blocks(css=css, theme=gr.themes.Soft(), title="Machine Translation between Myanmar and English")
with open("logo.png", "rb") as image_file:
encoded_string = base64.b64encode(image_file.read()).decode()
width, height = 80, 80
html_content = f'<img src="data:image/x-icon;base64,{encoded_string}" alt="NLP Logo" align="left" width="{width}" height="{height}"/>'
with demo:
with gr.Row(equal_height=True):
with gr.Column(scale=1, min_width=150):
gr.HTML(html_content)
with gr.Column(scale=25, min_width=150):
gr.Markdown("<div style='text-align: center;font-weight: bold;font-size: 32px; color:blue'>Transformer-based Neural Machine Traslation between Myanmar and English Languages Translator</div>")
with gr.Column(scale=1, min_width=150):
gr.HTML("<img src='https://www.ucsy.edu.mm/img/ucsylogo.png' alt='UCSY Logo' width='100' height='100' align='right'/>")
####################
with gr.Tabs(elem_classes=["tab"]): #elem_classes=["tab"]
with gr.TabItem("Proposed Transformer"):
with gr.Row():
direction_trans = gr.Dropdown(["English to Myanmar", "Myanmar to English"] ,label="Select Direction")
with gr.Row():
with gr.Column():
input_to_translate_trans = gr.Textbox("", label="Enter Input Text", lines=5)
#english = gr.Textbox(label="English text")
translate_btn = gr.Button(value="Translate")
with gr.Column():
translated_text_trans = gr.Textbox(value="", label="Translated Text", lines=5)
gr.ClearButton([input_to_translate_trans,translated_text_trans])
#clear_button2 = gr.Button(value="Clear")
#myanmar = gr.Textbox(label="Myanmar Text")
#with gr.Row():
#clear_button2 = gr.Button(value="Clear", elem_id="clear_button2", elem_classes="translate")
examples = gr.Examples(examples=["I went to the supermarket yesterday.", "α€…α€¬α€žα€„α€Ία€α€”α€Ία€Έα€‘α€²α€™α€Ύα€¬α€€α€½α€”α€Ία€•α€»α€°α€α€¬α€‘α€žα€…α€Ία€α€…α€Ία€œα€―α€Άα€Έα€›α€Ύα€­α€α€šα€Ία‹"],
inputs=[input_to_translate_trans])
            translate_btn.click(call_model_transformer, inputs=[input_to_translate_trans, direction_trans], outputs=translated_text_trans, api_name="transformer-translate")
##################
###########################################################
with gr.Tabs(elem_classes=["tab"]): #elem_classes=["tab"]
with gr.TabItem("Fine-Tuned MBART"):
with gr.Row():
direction = gr.Dropdown(["English to Myanmar", "Myanmar to English"] ,label="Select Direction")
with gr.Row():
with gr.Column():
input_to_translate = gr.Textbox("", label="Enter Input Text", lines=5)
#english = gr.Textbox(label="English text")
translate_btn = gr.Button(value="Translate")
with gr.Column():
translated_text = gr.Textbox(value="", label="Translated Text", lines=5)
gr.ClearButton([input_to_translate,translated_text])
#clear_button2 = gr.Button(value="Clear")
#myanmar = gr.Textbox(label="Myanmar Text")
#with gr.Row():
#clear_button2 = gr.Button(value="Clear", elem_id="clear_button2", elem_classes="translate")
examples = gr.Examples(examples=["I went to the supermarket yesterday.", "α€…α€¬α€žα€„α€Ία€α€”α€Ία€Έα€‘α€²α€™α€Ύα€¬α€€α€½α€”α€Ία€•α€»α€°α€α€¬α€‘α€žα€…α€Ία€α€…α€Ία€œα€―α€Άα€Έα€›α€Ύα€­α€α€šα€Ία‹"],
inputs=[input_to_translate])
            translate_btn.click(call_model, inputs=[input_to_translate, direction], outputs=translated_text, api_name="mbart-translate")
#text_button2.click(translate_interface, inputs=[input_to_translate, model_choice], outputs=[segmented_text, translated_text_pivot, translated_text_combined])
#clear_button2.click(outputs=[input_to_translate,translated_text])
##########################
####################
with gr.Tabs(elem_classes=["tab"]): #elem_classes=["tab"]
with gr.TabItem("Fine-Tuned MT5"):
with gr.Row():
direction_mt5 = gr.Dropdown(["English to Myanmar", "Myanmar to English"] ,label="Select Direction")
with gr.Row():
with gr.Column():
input_to_translate_mt5 = gr.Textbox("", label="Enter Input Text", lines=5)
#english = gr.Textbox(label="English text")
translate_btn = gr.Button(value="Translate")
with gr.Column():
translated_text_mt5 = gr.Textbox(value="", label="Translated Text", lines=5)
gr.ClearButton([input_to_translate_mt5,translated_text_mt5])
#clear_button2 = gr.Button(value="Clear")
#myanmar = gr.Textbox(label="Myanmar Text")
#with gr.Row():
#clear_button2 = gr.Button(value="Clear", elem_id="clear_button2", elem_classes="translate")
examples = gr.Examples(examples=["I went to the supermarket yesterday.", "α€…α€¬α€žα€„α€Ία€α€”α€Ία€Έα€‘α€²α€™α€Ύα€¬α€€α€½α€”α€Ία€•α€»α€°α€α€¬α€‘α€žα€…α€Ία€α€…α€Ία€œα€―α€Άα€Έα€›α€Ύα€­α€α€šα€Ία‹"],
inputs=[input_to_translate_mt5])
            translate_btn.click(call_model_mt5, inputs=[input_to_translate_mt5, direction_mt5], outputs=translated_text_mt5, api_name="mt5-translate")
##################
demo.launch()