|
import os |
|
import spaces |
|
import gradio as gr |
|
import torch |
|
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer |
|
import sentencepiece as spm |
|
import ctranslate2 |
|
import transformers |
|
from nltk import sent_tokenize |
|
|
|
|
|
sp = spm.SentencePieceProcessor() |
|
|
|
|
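# CTranslate2 model directories and SentencePiece models for the fine-tuned mBART systems (English->Myanmar and Myanmar->English).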
|
mbart_enmy_ct_model_path = "mbart25enmy_ct2/" |
|
mbart_enmy_sp_model_path = "mbart25enmy_ct2/sentence.bpe.model" |
|
|
|
mbart_myen_ct_model_path = "mbart25-myen_ct2/" |
|
mbart_myen_sp_model_path = "mbart25-myen_ct2/sentence.bpe.model" |
|
|
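# Fine-tuned mT5 model converted to CTranslate2, plus the directory holding its Hugging Face tokenizer.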
|
mt5_ct_model_path = "mt5-ct2/" |
|
mt5_sp_model_path = "mt5-base/" |
|
|
|
|
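# SentencePiece source/target models and CTranslate2 model directories for the proposed Transformer systems.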
|
trans_sp_source_enmy_path = "enmy_ctranslate2/source.model" |
|
trans_sp_target_enmy_path = "enmy_ctranslate2/target.model" |
|
|
|
trans_sp_source_myen_path = "myen_ctranslate2/source.model" |
|
trans_sp_target_myen_path = "myen_ctranslate2/target.model" |
|
|
|
trans_enmy_ct_model_path = "enmy_ctranslate2/" |
|
trans_myen_ct_model_path = "myen_ctranslate2/" |
|
|
|
|
|
|
|
|
|
|
|
|
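# Segment Myanmar text by piping it through the external myseg.py script and returning the segmented output.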
|
def segment_sentence(source): |
|
input_file = "input.txt" |
|
output_file = "output.txt" |
|
with open(input_file, "w", encoding="utf-8") as file: |
|
file.write(source) |
|
os.system("python myseg.py < input.txt > output.txt") |
|
|
|
|
|
|
|
|
|
|
|
|
|
with open(output_file, "r", encoding="utf-8") as file: |
|
segmented_content = file.read() |
|
|
|
|
|
|
|
print(segmented_content) |
|
return segmented_content |
|
|
|
|
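# Write Myanmar text to a file, segment it with myseg.py, and read the segmented result back.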
|
def write_to_file_myanmar(source): |
|
input_file = "write-input.txt" |
|
output_file = "read-output.txt" |
|
with open(input_file, "w", encoding="utf-8") as file: |
|
file.write(source) |
|
|
|
os.system("python myseg.py < write-input.txt > read-output.txt") |
|
|
|
|
|
|
|
|
|
|
|
|
|
with open(output_file, "r", encoding="utf-8") as file: |
|
segmented_contents = file.read() |
|
|
|
|
|
|
|
return segmented_contents |
|
|
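# English input needs no segmentation; this simply round-trips the text through a file.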
|
def write_to_file_english(source): |
|
input_file = "write-input.txt" |
|
|
|
with open(input_file, "w", encoding="utf-8") as file: |
|
file.write(source) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
with open(input_file, "r", encoding="utf-8") as file: |
|
segmented_contents = file.read() |
|
|
|
|
|
|
|
return segmented_contents |
|
|
|
|
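# Translate with the proposed Transformer model: write the input to a file, tokenize it with
# SentencePiece, translate the subworded sentences with CTranslate2, then detokenize and return the result.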
|
def call_model_transformer(source, direction_trans):
    # Default to an empty result so the final return is safe when no translation runs.
    segmented_contents = ""
|
if direction_trans == "English to Myanmar": |
|
ct_model_path = "enmy_ctranslate2/" |
|
sp_source_model_path = "enmy_ctranslate2/source.model" |
|
sp_target_model_path = "enmy_ctranslate2/target.model" |
|
if source == "" : |
|
gr.Warning("Please Enter English Text") |
|
else: |
|
|
|
source_file_path = "write-input.txt" |
|
target_file_path = "read-output.txt" |
|
|
|
|
|
sp = spm.SentencePieceProcessor() |
|
sp.load(sp_source_model_path) |
|
|
|
|
|
with open(source_file_path, "w", encoding="utf-8") as file: |
|
file.write(source) |
|
|
|
|
|
with open(source_file_path, "r", encoding="utf-8") as source_file:
    lines = source_file.readlines()
|
|
|
source_sents = [line.strip() for line in lines] |
|
|
|
|
|
source_sents_subworded = sp.encode_as_pieces(source_sents) |
|
|
|
|
|
translator = ctranslate2.Translator(ct_model_path, device="cpu") |
|
translations = translator.translate_batch(source_sents_subworded, batch_type="tokens", max_batch_size=4096) |
|
translations = [translation.hypotheses[0] for translation in translations] |
|
|
|
|
|
sp.load(sp_target_model_path) |
|
|
|
|
|
translations_desubword = sp.decode(translations) |
|
|
|
|
|
|
|
with open(target_file_path, "w+", encoding="utf-8") as target: |
|
for line in translations_desubword: |
|
target.write(line.strip() + "\n") |
|
|
|
|
|
|
|
with open(target_file_path, "r", encoding="utf-8") as file: |
|
segmented_contents = file.read() |
|
|
|
|
|
elif direction_trans == "Myanmar to English": |
|
ct_model_path = "myen_ctranslate2/" |
|
sp_source_model_path = "myen_ctranslate2/source.model" |
|
sp_target_model_path = "myen_ctranslate2/target.model" |
|
if source == "":
|
gr.Warning("Please Enter Myanmar Text") |
|
else: |
|
|
|
source_file_path = "write-input.txt" |
|
target_file_path = "read-output.txt" |
|
|
|
|
|
sp = spm.SentencePieceProcessor() |
|
sp.load(sp_source_model_path) |
|
|
|
|
|
with open(source_file_path, "w", encoding="utf-8") as file: |
|
file.write(source) |
|
|
|
|
|
with open(source_file_path, "r", encoding="utf-8") as source_file:
    lines = source_file.readlines()
|
|
|
source_sents = [line.strip() for line in lines] |
|
|
|
|
|
source_sents_subworded = sp.encode_as_pieces(source_sents) |
|
|
|
|
|
translator = ctranslate2.Translator(ct_model_path, device="cpu") |
|
translations = translator.translate_batch(source_sents_subworded, batch_type="tokens", max_batch_size=4096) |
|
translations = [translation.hypotheses[0] for translation in translations] |
|
|
|
|
|
sp.load(sp_target_model_path) |
|
|
|
|
|
translations_desubword = sp.decode(translations) |
|
|
|
|
|
|
|
with open(target_file_path, "w+", encoding="utf-8") as target: |
|
for line in translations_desubword: |
|
target.write(line.strip() + "\n") |
|
|
|
|
|
|
|
with open(target_file_path, "r", encoding="utf-8") as file: |
|
segmented_contents = file.read() |
|
|
|
|
|
else:
    gr.Warning("Please Select Language Direction")
|
|
|
|
|
return segmented_contents |
|
|
|
|
|
def translate_trans_myen(source, translator, sp_source_model, sp_target_model): |
|
"""Use CTranslate model to translate a sentence |
|
Args: |
|
source (str): Source sentences to translate |
|
translator (object): Object of Translator, with the CTranslate2 model |
|
tokenizer (object): Object of SentencePieceProcessor, with the SentencePiece source model |
|
Returns: |
|
Translation of the source text |
|
""" |
|
source = segment_sentence(source) |
|
source_sentences = sent_tokenize(source) |
|
source_tokenized = sp_source_model.encode(source_sentences, out_type=str) |
|
translations = translator.translate_batch(source_tokenized, replace_unknowns=True) |
|
translations = [translation[0]["tokens"] for translation in translations] |
|
translations = sp_target_model.decode(translations) |
|
|
|
return translations |
|
|
|
def translate_trans_enmy(source, translator, sp_source_model, sp_target_model): |
|
"""Use CTranslate model to translate a sentence |
|
Args: |
|
source (str): Source sentences to translate |
|
translator (object): Object of Translator, with the CTranslate2 model |
|
tokenizer (object): Object of SentencePieceProcessor, with the SentencePiece source model |
|
Returns: |
|
Translation of the source text |
|
""" |
|
source_sentences = sent_tokenize(source) |
|
source_tokenized = sp_source_model.encode(source_sentences, out_type=str) |
|
translations = translator.translate_batch(source_tokenized, replace_unknowns=True) |
|
translations = [translation[0]["tokens"] for translation in translations] |
|
translations_detokenized = sp_target_model.decode(translations) |
|
|
|
return translations_detokenized |
|
def translate_mt5_myen(source, translator, tokenizer): |
|
"""Use CTranslate model to translate a sentence |
|
Args: |
|
source (str): Source sentences to translate |
|
translator (object): Object of Translator, with the CTranslate2 model |
|
tokenizer (object): Object of SentencePieceProcessor, with the SentencePiece source model |
|
Returns: |
|
Translation of the source text |
|
""" |
|
source = segment_sentence(source) |
|
input_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(source)) |
|
results = translator.translate_batch([input_tokens]) |
|
output_tokens = results[0].hypotheses[0] |
|
translations = tokenizer.decode(tokenizer.convert_tokens_to_ids(output_tokens)) |
|
return translations |
|
|
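# Dispatch to the fine-tuned mT5 model for the selected translation direction.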
|
def call_model_mt5(source, direction_mt5):
    # Default to an empty result so the final return is safe when no translation runs.
    translation = ""
|
if direction_mt5 == "English to Myanmar": |
|
translator = ctranslate2.Translator(mt5_ct_model_path) |
|
tokenizer = transformers.AutoTokenizer.from_pretrained(mt5_sp_model_path) |
|
if source == "" : |
|
gr.Warning("Please Enter English Text") |
|
else: |
|
translation = translate_mt5_enmy(source, translator, tokenizer) |
|
elif direction_mt5 == "Myanmar to English": |
|
translator = ctranslate2.Translator(mt5_ct_model_path) |
|
tokenizer = transformers.AutoTokenizer.from_pretrained(mt5_sp_model_path) |
|
if source == "" : |
|
gr.Warning("Please Enter Myanmar Text") |
|
else: |
|
translation = translate_mt5_myen(source, translator, tokenizer) |
|
else:
    gr.Warning("Please Select Language Direction")
|
return translation |
|
|
|
def translate_mt5_enmy(source, translator, tokenizer): |
|
"""Use CTranslate model to translate a sentence |
|
Args: |
|
source (str): Source sentences to translate |
|
translator (object): Object of Translator, with the CTranslate2 model |
|
tokenizer (object): Object of SentencePieceProcessor, with the SentencePiece source model |
|
Returns: |
|
Translation of the source text |
|
""" |
|
input_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(source)) |
|
results = translator.translate_batch([input_tokens]) |
|
output_tokens = results[0].hypotheses[0] |
|
translations = tokenizer.decode(tokenizer.convert_tokens_to_ids(output_tokens)) |
|
return translations |
|
|
|
def translate_mbart_myen(source, translator, sp_model): |
|
"""Use CTranslate model to translate a sentence |
|
Args: |
|
source (str): Source sentences to translate |
|
translator (object): Object of Translator, with the CTranslate2 model |
|
sp_model (object): Object of SentencePieceProcessor, with the SentencePiece source model |
|
Returns: |
|
Translation of the source text |
|
""" |
|
source = segment_sentence(source) |
|
|
|
source_tokenized = sp_model.encode(source, out_type=str) |
|
|
|
source_tokenized = ["[my_MM]"] + source_tokenized |
|
|
|
target_prefix = ["[en_XX]"] |
|
translations = translator.translate_batch([source_tokenized],target_prefix= [target_prefix]) |
|
|
|
translations = sp_model.decode(translations[0].hypotheses[0][1:]) |
|
|
|
|
|
return translations |
|
|
|
def translate_mbart_enmy(source, translator, sp_model): |
|
"""Use CTranslate model to translate a sentence |
|
Args: |
|
source (str): Source sentences to translate |
|
translator (object): Object of Translator, with the CTranslate2 model |
|
sp_model (object): Object of SentencePieceProcessor, with the SentencePiece source model |
|
Returns: |
|
Translation of the source text |
|
""" |
|
|
|
|
|
source_tokenized = sp_model.encode(source, out_type=str) |
|
|
|
source_tokenized = ["[en_XX]"] + source_tokenized |
|
|
|
target_prefix = ["[my_MM]"] |
|
translations = translator.translate_batch([source_tokenized],target_prefix= [target_prefix]) |
|
|
|
translations = sp_model.decode(translations[0].hypotheses[0][1:]) |
|
|
|
|
|
return translations |
|
|
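# Dispatch to the fine-tuned mBART model for the selected translation direction.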
|
def call_model(source, direction):
    # Default to an empty result so the final return is safe when no translation runs.
    translation = ""
|
if direction == "English to Myanmar": |
|
translator = ctranslate2.Translator(mbart_enmy_ct_model_path) |
|
sp_model = spm.SentencePieceProcessor(mbart_enmy_sp_model_path) |
|
if source == "" : |
|
gr.Warning("Please Enter English Text") |
|
else: |
|
translation = translate_mbart_enmy(source, translator, sp_model) |
|
elif direction == "Myanmar to English": |
|
translator = ctranslate2.Translator(mbart_myen_ct_model_path) |
|
sp_model = spm.SentencePieceProcessor(mbart_myen_sp_model_path) |
|
if source == "" : |
|
gr.Warning("Please Enter Myanmar Text") |
|
else: |
|
translation = translate_mbart_myen(source, translator, sp_model) |
|
else:
    gr.Warning("Please Select Language Direction")
|
return translation |
|
|
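# Custom CSS tweaks for the Gradio interface.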
|
css = """ |
|
#warning {background-color: #FFCCCB} |
|
.feedback textarea {font-size: 24px !important} |
|
#text_button1 {background: gray;} |
|
.translate {font-size: 5px !important;} |
|
.translate {width: 150px !important;} |
|
#img .img {width: 30px; height: 400px;} |
|
#img1 input, textarea, select {font-weight: bold; color:blue !important;}
|
.tab button.selected{ |
|
font-size: 36px !important; |
|
font-weight: bold; |
|
color:blue !important; |
|
} |
|
""" |
|
def clear_mbart(): |
|
return "", "" |
|
|
|
def clear_mm_to_wa(): |
|
return "", "", "", "" |
|
import base64 |
|
|
|
theme = 'gstaff/whiteboard' |
|
|
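# Build the Gradio Blocks interface: a header row plus one tab per model.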
|
demo = gr.Blocks(css=css, theme=gr.themes.Soft(), title="Machine Translation between Myanmar and English")
|
|
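# Embed the local logo as a base64 data URI so it can be shown inside an HTML header.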
|
with open("logo.png", "rb") as image_file: |
|
encoded_string = base64.b64encode(image_file.read()).decode() |
|
width, height = 80, 80 |
|
|
|
html_content = f'<img src="data:image/png;base64,{encoded_string}" alt="NLP Logo" align="left" width="{width}" height="{height}"/>'
|
|
|
with demo: |
|
|
|
with gr.Row(equal_height=True): |
|
with gr.Column(scale=1, min_width=150): |
|
gr.HTML(html_content) |
|
with gr.Column(scale=25, min_width=150): |
|
gr.Markdown("<div style='text-align: center;font-weight: bold;font-size: 32px; color:blue'>Transformer-based Neural Machine Traslation between Myanmar and English Languages Translator</div>") |
|
with gr.Column(scale=1, min_width=150): |
|
gr.HTML("<img src='https://www.ucsy.edu.mm/img/ucsylogo.png' alt='UCSY Logo' width='100' height='100' align='right'/>") |
|
|
|
|
|
with gr.Tabs(elem_classes=["tab"]): |
|
|
|
with gr.TabItem("Proposed Transformer"): |
|
with gr.Row(): |
|
direction_trans = gr.Dropdown(["English to Myanmar", "Myanmar to English"] ,label="Select Direction") |
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
input_to_translate_trans = gr.Textbox("", label="Enter Input Text", lines=5) |
|
|
|
|
|
|
|
translate_btn = gr.Button(value="Translate") |
|
|
|
with gr.Column(): |
|
translated_text_trans = gr.Textbox(value="", label="Translated Text", lines=5) |
|
gr.ClearButton([input_to_translate_trans,translated_text_trans]) |
|
|
|
|
|
|
|
|
|
examples = gr.Examples(examples=["I went to the supermarket yesterday.", "αα¬αααΊαααΊαΈαα²ααΎα¬αα½ααΊαα»α°αα¬α‘αααΊαααΊαα―αΆαΈααΎααααΊα"],
|
inputs=[input_to_translate_trans]) |
|
translate_btn.click(call_model_transformer, inputs=[input_to_translate_trans, direction_trans], outputs=translated_text_trans, api_name="translate-transformer")
|
|
|
|
|
|
|
|
|
|
|
with gr.Tabs(elem_classes=["tab"]): |
|
|
|
with gr.TabItem("Fine-Tuned MBART"): |
|
with gr.Row(): |
|
direction = gr.Dropdown(["English to Myanmar", "Myanmar to English"] ,label="Select Direction") |
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
input_to_translate = gr.Textbox("", label="Enter Input Text", lines=5) |
|
|
|
|
|
|
|
translate_btn = gr.Button(value="Translate") |
|
|
|
with gr.Column(): |
|
translated_text = gr.Textbox(value="", label="Translated Text", lines=5) |
|
gr.ClearButton([input_to_translate,translated_text]) |
|
|
|
|
|
|
|
|
|
examples = gr.Examples(examples=["I went to the supermarket yesterday.", "αα¬αααΊαααΊαΈαα²ααΎα¬αα½ααΊαα»α°αα¬α‘αααΊαααΊαα―αΆαΈααΎααααΊα"],
|
inputs=[input_to_translate]) |
|
translate_btn.click(call_model, inputs=[input_to_translate, direction], outputs=translated_text, api_name="translate-mbart")
|
|
|
|
|
|
|
|
|
|
|
|
|
with gr.Tabs(elem_classes=["tab"]): |
|
|
|
with gr.TabItem("Fine-Tuned MT5"): |
|
with gr.Row(): |
|
direction_mt5 = gr.Dropdown(["English to Myanmar", "Myanmar to English"] ,label="Select Direction") |
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
input_to_translate_mt5 = gr.Textbox("", label="Enter Input Text", lines=5) |
|
|
|
|
|
|
|
translate_btn = gr.Button(value="Translate") |
|
|
|
with gr.Column(): |
|
translated_text_mt5 = gr.Textbox(value="", label="Translated Text", lines=5) |
|
gr.ClearButton([input_to_translate_mt5,translated_text_mt5]) |
|
|
|
|
|
|
|
|
|
examples = gr.Examples(examples=["I went to the supermarket yesterday.", "αα¬αααΊαααΊαΈαα²ααΎα¬αα½ααΊαα»α°αα¬α‘αααΊαααΊαα―αΆαΈααΎααααΊα"],
|
inputs=[input_to_translate_mt5]) |
|
translate_btn.click(call_model_mt5, inputs=[input_to_translate_mt5, direction_mt5], outputs=translated_text_mt5, api_name="translate-mt5")
|
|
|
demo.launch() |