Spaces:

NgalNgal
/

mT5-new

Sleeping

File size: 20,327 Bytes

import os
import spaces
import gradio as gr #gr.load("models/NgalNgal/mT5-new").launch()
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import sentencepiece as spm
import ctranslate2
import transformers
from nltk import sent_tokenize

# Module-level SentencePiece processor, created empty (no model file is loaded here).
# NOTE(review): the functions visible in this file build their own processors, so
# this instance looks unused — confirm before removing.
sp = spm.SentencePieceProcessor()


# Fine-tuned mBART: CTranslate2 model directories and the SentencePiece
# vocabulary stored inside each of them.
mbart_enmy_ct_model_path = "mbart25enmy_ct2/"
mbart_enmy_sp_model_path = mbart_enmy_ct_model_path + "sentence.bpe.model"

mbart_myen_ct_model_path = "mbart25-myen_ct2/"
mbart_myen_sp_model_path = mbart_myen_ct_model_path + "sentence.bpe.model"

# Fine-tuned mT5: CTranslate2 model directory and the directory the tokenizer
# is loaded from.
mt5_ct_model_path = "mt5-ct2/"
mt5_sp_model_path = "mt5-base/"

# Transformer models: one CTranslate2 directory per direction, each holding
# separate source and target SentencePiece models.
trans_enmy_ct_model_path = "enmy_ctranslate2/"
trans_myen_ct_model_path = "myen_ctranslate2/"

trans_sp_source_enmy_path = trans_enmy_ct_model_path + "source.model"
trans_sp_target_enmy_path = trans_enmy_ct_model_path + "target.model"

trans_sp_source_myen_path = trans_myen_ct_model_path + "source.model"
trans_sp_target_myen_path = trans_myen_ct_model_path + "target.model"

#translator = ctranslate2.Translator(ct_model_path)
#sp_model = spm.SentencePieceProcessor(sp_model_path)

#!/usr/bin/python

def segment_sentence(source):
    """Run ``source`` through the external ``myseg.py`` segmenter script.

    The text is piped to the script's standard input and the script's
    standard output is returned. No intermediate files are used, so
    concurrent requests cannot overwrite each other's input or read each
    other's output.

    Args:
        source (str): Text to segment.

    Returns:
        str: Everything ``myseg.py`` wrote to standard output, decoded as UTF-8.

    Raises:
        subprocess.CalledProcessError: If ``myseg.py`` exits with a non-zero
            status (previously such failures were silently ignored).
    """
    import subprocess
    import sys

    # An argument list (no shell) with the current interpreter keeps the call
    # independent of PATH and of shell quoting rules.  stderr is left attached
    # to the parent process so the script's diagnostics still reach the logs.
    completed = subprocess.run(
        [sys.executable, "myseg.py"],
        input=source,
        stdout=subprocess.PIPE,
        encoding="utf-8",
        check=True,
    )
    segmented_content = completed.stdout

    print(segmented_content)
    return segmented_content


def write_to_file_myanmar(source):
    """Write ``source`` to ``write-input.txt``, segment it, and return the result.

    The text is saved as UTF-8 to ``write-input.txt``, ``myseg.py`` is run
    with that file as standard input and ``read-output.txt`` as standard
    output, and the contents of ``read-output.txt`` are returned.

    Args:
        source (str): Text to write and segment.

    Returns:
        str: The UTF-8 decoded contents of ``read-output.txt``.

    Raises:
        subprocess.CalledProcessError: If ``myseg.py`` exits with a non-zero
            status (previously such failures were silently ignored).
    """
    import subprocess
    import sys

    input_file = "write-input.txt"
    output_file = "read-output.txt"

    with open(input_file, "w", encoding="utf-8") as file:
        file.write(source)

    # Redirect through file handles instead of a shell command string: no
    # shell is involved and a failing script raises instead of going unnoticed.
    with open(input_file, "rb") as script_stdin, open(output_file, "wb") as script_stdout:
        subprocess.run(
            [sys.executable, "myseg.py"],
            stdin=script_stdin,
            stdout=script_stdout,
            check=True,
        )

    with open(output_file, "r", encoding="utf-8") as file:
        segmented_contents = file.read()

    return segmented_contents

def write_to_file_english(source):
    """Save ``source`` to ``write-input.txt`` and return the file's contents.

    No segmentation is applied: the text is written as UTF-8 and then read
    back from the same file.

    Args:
        source (str): Text to write.

    Returns:
        str: The text read back from ``write-input.txt``.
    """
    path = "write-input.txt"

    with open(path, mode="w", encoding="utf-8") as sink:
        sink.write(source)

    with open(path, mode="r", encoding="utf-8") as reader:
        return reader.read()


def call_model_transformer(source, direction_trans):
    """Translate ``source`` with the Transformer CTranslate2 model for a direction.

    Every line of ``source`` is stripped, split into subwords with the
    direction's ``source.model``, translated, and decoded with the
    direction's ``target.model``.

    Args:
        source (str): Text to translate; each line is translated separately.
        direction_trans (str): ``"English to Myanmar"`` or
            ``"Myanmar to English"``.

    Returns:
        str: The translated lines, each stripped and terminated by a newline.
        An empty string is returned after a warning is shown (no direction
        selected, or empty input).
    """
    # direction -> (CTranslate2 model directory, language named in the warning)
    directions = {
        "English to Myanmar": ("enmy_ctranslate2/", "English"),
        "Myanmar to English": ("myen_ctranslate2/", "Myanmar"),
    }
    if direction_trans not in directions:
        gr.Warning("Please Select Language Direction")
        return ""

    ct_model_path, language = directions[direction_trans]
    if not source or not source.strip():
        gr.Warning(f"Please Enter {language} Text")
        return ""

    # Split into lines in memory, matching what reading the text back from a
    # file with universal newlines produced (a trailing newline does not
    # create an extra empty line).  Avoiding the shared scratch files keeps
    # concurrent requests from overwriting each other's text.
    normalized = source.replace("\r\n", "\n").replace("\r", "\n")
    lines = normalized.split("\n")
    if lines[-1] == "":
        lines.pop()
    source_sents = [line.strip() for line in lines]

    # Subword the source sentences
    sp_source = spm.SentencePieceProcessor()
    sp_source.load(ct_model_path + "source.model")
    source_sents_subworded = sp_source.encode_as_pieces(source_sents)

    # Translate the source sentences
    translator = ctranslate2.Translator(ct_model_path, device="cpu")  # or "cuda" for GPU
    results = translator.translate_batch(
        source_sents_subworded, batch_type="tokens", max_batch_size=4096
    )
    best_hypotheses = [result.hypotheses[0] for result in results]

    # Desubword the target sentences
    sp_target = spm.SentencePieceProcessor()
    sp_target.load(ct_model_path + "target.model")
    translations_desubword = sp_target.decode(best_hypotheses)

    return "".join(line.strip() + "\n" for line in translations_desubword)


def translate_trans_myen(source, translator, sp_source_model, sp_target_model):
    """Segment, sentence-split and translate ``source`` with a CTranslate2 model.

    Args:
        source (str): Source text to translate. It is first passed through
            ``segment_sentence`` and then split with ``sent_tokenize``.
        translator (object): CTranslate2 ``Translator`` used for
            ``translate_batch``.
        sp_source_model (object): SentencePiece processor used to encode the
            source sentences into subword pieces.
        sp_target_model (object): SentencePiece processor used to decode the
            translated pieces.

    Returns:
        The result of ``sp_target_model.decode`` on the best hypothesis of
        every sentence (one decoded entry per sentence).
    """
    segmented = segment_sentence(source)
    source_sentences = sent_tokenize(segmented)  # split sentences
    source_tokenized = sp_source_model.encode(source_sentences, out_type=str)
    results = translator.translate_batch(source_tokenized, replace_unknowns=True)
    # Use the result objects' ``hypotheses`` attribute, as the other
    # translate_* helpers in this file do.
    best_hypotheses = [result.hypotheses[0] for result in results]

    return sp_target_model.decode(best_hypotheses)

def translate_trans_enmy(source, translator, sp_source_model, sp_target_model):
    """Sentence-split and translate ``source`` with a CTranslate2 model.

    Args:
        source (str): Source text to translate; it is split into sentences
            with ``sent_tokenize``.
        translator (object): CTranslate2 ``Translator`` used for
            ``translate_batch``.
        sp_source_model (object): SentencePiece processor used to encode the
            source sentences into subword pieces.
        sp_target_model (object): SentencePiece processor used to decode the
            translated pieces.

    Returns:
        The result of ``sp_target_model.decode`` on the best hypothesis of
        every sentence (one decoded entry per sentence).
    """
    source_sentences = sent_tokenize(source)  # split sentences
    source_tokenized = sp_source_model.encode(source_sentences, out_type=str)
    results = translator.translate_batch(source_tokenized, replace_unknowns=True)
    # Use the result objects' ``hypotheses`` attribute, as the other
    # translate_* helpers in this file do.
    best_hypotheses = [result.hypotheses[0] for result in results]

    return sp_target_model.decode(best_hypotheses)
def translate_mt5_myen(source, translator, tokenizer):
    """Segment ``source`` and translate it as a single sequence.

    Args:
        source (str): Source text; it is passed through ``segment_sentence``
            before tokenization.
        translator (object): Object whose ``translate_batch`` returns results
            exposing ``hypotheses`` (a CTranslate2 ``Translator`` in this file).
        tokenizer (object): Tokenizer providing ``encode``,
            ``convert_ids_to_tokens``, ``convert_tokens_to_ids`` and ``decode``.

    Returns:
        The tokenizer-decoded best hypothesis for the segmented text.
    """
    segmented = segment_sentence(source)

    # Text -> ids -> token strings, which is what translate_batch is fed.
    token_ids = tokenizer.encode(segmented)
    batch = [tokenizer.convert_ids_to_tokens(token_ids)]

    best_hypothesis = translator.translate_batch(batch)[0].hypotheses[0]

    # Token strings -> ids -> text.
    output_ids = tokenizer.convert_tokens_to_ids(best_hypothesis)
    return tokenizer.decode(output_ids)

def call_model_mt5(source, direction_mt5):
    """Translate ``source`` with the mT5 CTranslate2 model for a direction.

    Args:
        source (str): Text to translate.
        direction_mt5 (str): ``"English to Myanmar"`` or
            ``"Myanmar to English"``.

    Returns:
        The translation produced by ``translate_mt5_enmy`` or
        ``translate_mt5_myen``. An empty string is returned after a warning
        is shown (no direction selected, or empty input) instead of failing
        on an unassigned result.
    """
    if direction_mt5 == "English to Myanmar":
        language, translate = "English", translate_mt5_enmy
    elif direction_mt5 == "Myanmar to English":
        language, translate = "Myanmar", translate_mt5_myen
    else:
        gr.Warning("Please Select Language Direction")
        return ""

    # Validate before loading anything: the model and tokenizer are only
    # needed when there is text to translate.
    if not source or not source.strip():
        gr.Warning(f"Please Enter {language} Text")
        return ""

    # Both directions use the same mT5 model and tokenizer paths.
    translator = ctranslate2.Translator(mt5_ct_model_path)
    tokenizer = transformers.AutoTokenizer.from_pretrained(mt5_sp_model_path)
    return translate(source, translator, tokenizer)

def translate_mt5_enmy(source, translator, tokenizer):
    """Translate ``source`` as a single sequence.

    Args:
        source (str): Source text to translate.
        translator (object): Object whose ``translate_batch`` returns results
            exposing ``hypotheses`` (a CTranslate2 ``Translator`` in this file).
        tokenizer (object): Tokenizer providing ``encode``,
            ``convert_ids_to_tokens``, ``convert_tokens_to_ids`` and ``decode``.

    Returns:
        The tokenizer-decoded best hypothesis for ``source``.
    """
    # Text -> ids -> token strings, which is what translate_batch is fed.
    token_ids = tokenizer.encode(source)
    batch = [tokenizer.convert_ids_to_tokens(token_ids)]

    best_hypothesis = translator.translate_batch(batch)[0].hypotheses[0]

    # Token strings -> ids -> text.
    output_ids = tokenizer.convert_tokens_to_ids(best_hypothesis)
    return tokenizer.decode(output_ids)

def translate_mbart_myen(source, translator, sp_model):
    """Segment ``source`` and translate it from ``[my_MM]`` to ``[en_XX]``.

    Args:
        source (str): Source text; it is passed through ``segment_sentence``
            before being encoded.
        translator (object): Object whose ``translate_batch`` accepts a
            ``target_prefix`` and returns results exposing ``hypotheses``.
        sp_model (object): SentencePiece processor used for both encoding the
            source and decoding the output.

    Returns:
        The decoded best hypothesis with its first token (the target prefix)
        removed.
    """
    segmented = segment_sentence(source)
    pieces = sp_model.encode(segmented, out_type=str)

    # The source sequence starts with the source-language tag; decoding is
    # forced to begin with the target-language tag.
    batch = [["[my_MM]"] + pieces]
    prefixes = [["[en_XX]"]]
    results = translator.translate_batch(batch, target_prefix=prefixes)

    best_hypothesis = results[0].hypotheses[0]
    return sp_model.decode(best_hypothesis[1:])

def translate_mbart_enmy(source, translator, sp_model):
    """Translate ``source`` from ``[en_XX]`` to ``[my_MM]``.

    Args:
        source (str): Source text to translate.
        translator (object): Object whose ``translate_batch`` accepts a
            ``target_prefix`` and returns results exposing ``hypotheses``.
        sp_model (object): SentencePiece processor used for both encoding the
            source and decoding the output.

    Returns:
        The decoded best hypothesis with its first token (the target prefix)
        removed.
    """
    pieces = sp_model.encode(source, out_type=str)

    # The source sequence starts with the source-language tag; decoding is
    # forced to begin with the target-language tag.
    batch = [["[en_XX]"] + pieces]
    prefixes = [["[my_MM]"]]
    results = translator.translate_batch(batch, target_prefix=prefixes)

    best_hypothesis = results[0].hypotheses[0]
    return sp_model.decode(best_hypothesis[1:])

def call_model(source, direction):
    """Translate ``source`` with the fine-tuned mBART model for a direction.

    Args:
        source (str): Text to translate.
        direction (str): ``"English to Myanmar"`` or ``"Myanmar to English"``.

    Returns:
        The translation produced by ``translate_mbart_enmy`` or
        ``translate_mbart_myen``. An empty string is returned after a warning
        is shown (no direction selected, or empty input) instead of failing
        on an unassigned result.
    """
    if direction == "English to Myanmar":
        language = "English"
        ct_model_path = mbart_enmy_ct_model_path
        sp_model_path = mbart_enmy_sp_model_path
        translate = translate_mbart_enmy
    elif direction == "Myanmar to English":
        language = "Myanmar"
        ct_model_path = mbart_myen_ct_model_path
        sp_model_path = mbart_myen_sp_model_path
        translate = translate_mbart_myen
    else:
        gr.Warning("Please Select Language Direction")
        return ""

    # Validate before loading anything: the model and vocabulary are only
    # needed when there is text to translate.
    if not source or not source.strip():
        gr.Warning(f"Please Enter {language} Text")
        return ""

    translator = ctranslate2.Translator(ct_model_path)
    sp_model = spm.SentencePieceProcessor(sp_model_path)
    return translate(source, translator, sp_model)

# Custom stylesheet passed to gr.Blocks(css=...) when the UI is created.
# NOTE(review): the bare "#img1" line has no declaration block of its own, so
# it combines with the next line into the selector "#img1 input, textarea,
# select" — confirm that is intended.
# NOTE(review): ".translate" is declared twice (font-size, then width).
css = """
#warning {background-color: #FFCCCB}
.feedback textarea {font-size: 24px !important}
#text_button1 {background: gray;}
.translate {font-size: 5px !important;}
.translate {width: 150px !important;}
#img .img {width: 30px; height: 400px;}
#img1
input, textarea, select {font-weight: bold; color:blue !important;}
.tab button.selected{
    font-size: 36px !important;
    font-weight: bold;
    color:blue !important;
}
"""
def clear_mbart():
    """Return two empty strings, one per field to reset."""
    return ("",) * 2

def clear_mm_to_wa():
    """Return four empty strings, one per field to reset."""
    return ("",) * 4
import base64

# NOTE(review): `theme` is assigned but not referenced in the visible code;
# gr.Blocks below is given gr.themes.Soft() instead.
theme = 'gstaff/whiteboard'

# Top-level Blocks container; the UI is built inside `with demo:` below.
demo = gr.Blocks(css=css, theme=gr.themes.Soft(), title="Machine Translation between Myanmar and English Translator")

# Read logo.png and base64-encode it so it can be inlined in an <img> tag.
with open("logo.png", "rb") as image_file:
    encoded_string = base64.b64encode(image_file.read()).decode()
width, height = 80, 80

# NOTE(review): the data URI declares image/x-icon although the file is named
# logo.png — confirm the intended MIME type.
html_content = f'<img src="data:image/x-icon;base64,{encoded_string}" alt="NLP Logo" align="left" width="{width}" height="{height}"/>'

with demo:
    # Example sentences offered under every input box (one English, one Myanmar).
    example_inputs = ["I went to the supermarket yesterday.", "စာသင်ခန်းထဲမှာကွန်ပျူတာအသစ်တစ်လုံးရှိတယ်။"]

    # Page header: embedded logo on the left, title in the middle, UCSY logo on the right.
    with gr.Row(equal_height=True):
        with gr.Column(scale=1, min_width=150):
            gr.HTML(html_content)
        with gr.Column(scale=25, min_width=150):
            gr.Markdown("<div style='text-align: center;font-weight: bold;font-size: 32px; color:blue'>Transformer-based Neural Machine Translation between Myanmar and English Languages Translator</div>")
        with gr.Column(scale=1, min_width=150):
            gr.HTML("<img src='https://www.ucsy.edu.mm/img/ucsylogo.png' alt='UCSY Logo' width='100' height='100' align='right'/>")

    # NOTE(review): all three click handlers below register the same
    # api_name ("English-to-Myanmar"); it is kept unchanged so existing API
    # clients are not affected, but distinct names would be clearer.

    # ---- Transformer model -------------------------------------------------
    with gr.Tabs(elem_classes=["tab"]):
        with gr.TabItem("Proposed Transformer"):
            with gr.Row():
                direction_trans = gr.Dropdown(["English to Myanmar", "Myanmar to English"], label="Select Direction")

            with gr.Row():
                with gr.Column():
                    input_to_translate_trans = gr.Textbox("", label="Enter Input Text", lines=5)
                    translate_btn_trans = gr.Button(value="Translate")

                with gr.Column():
                    translated_text_trans = gr.Textbox(value="", label="Translated Text", lines=5)
                    gr.ClearButton([input_to_translate_trans, translated_text_trans])

            gr.Examples(examples=example_inputs, inputs=[input_to_translate_trans])
    translate_btn_trans.click(call_model_transformer, inputs=[input_to_translate_trans, direction_trans], outputs=translated_text_trans, api_name="English-to-Myanmar")

    # ---- Fine-tuned mBART --------------------------------------------------
    with gr.Tabs(elem_classes=["tab"]):
        with gr.TabItem("Fine-Tuned MBART"):
            with gr.Row():
                direction = gr.Dropdown(["English to Myanmar", "Myanmar to English"], label="Select Direction")

            with gr.Row():
                with gr.Column():
                    input_to_translate = gr.Textbox("", label="Enter Input Text", lines=5)
                    translate_btn_mbart = gr.Button(value="Translate")

                with gr.Column():
                    translated_text = gr.Textbox(value="", label="Translated Text", lines=5)
                    gr.ClearButton([input_to_translate, translated_text])

            gr.Examples(examples=example_inputs, inputs=[input_to_translate])
    translate_btn_mbart.click(call_model, inputs=[input_to_translate, direction], outputs=translated_text, api_name="English-to-Myanmar")

    # ---- Fine-tuned mT5 ----------------------------------------------------
    with gr.Tabs(elem_classes=["tab"]):
        with gr.TabItem("Fine-Tuned MT5"):
            with gr.Row():
                direction_mt5 = gr.Dropdown(["English to Myanmar", "Myanmar to English"], label="Select Direction")

            with gr.Row():
                with gr.Column():
                    input_to_translate_mt5 = gr.Textbox("", label="Enter Input Text", lines=5)
                    translate_btn_mt5 = gr.Button(value="Translate")

                with gr.Column():
                    translated_text_mt5 = gr.Textbox(value="", label="Translated Text", lines=5)
                    gr.ClearButton([input_to_translate_mt5, translated_text_mt5])

            gr.Examples(examples=example_inputs, inputs=[input_to_translate_mt5])
    translate_btn_mt5.click(call_model_mt5, inputs=[input_to_translate_mt5, direction_mt5], outputs=translated_text_mt5, api_name="English-to-Myanmar")

demo.launch()