import os import spaces import gradio as gr #gr.load("models/NgalNgal/mT5-new").launch() import torch from transformers import AutoModelForSeq2SeqLM, AutoTokenizer import sentencepiece as spm import ctranslate2 import transformers from nltk import sent_tokenize # Load the source SentecePiece model sp = spm.SentencePieceProcessor() mbart_enmy_ct_model_path = "mbart25enmy_ct2/" mbart_enmy_sp_model_path = "mbart25enmy_ct2/sentence.bpe.model" mbart_myen_ct_model_path = "mbart25myen_ct2/" mbart_myen_sp_model_path = "mbart25myen_ct2/sentence.bpe.model" mt5_ct_model_path = "mt5/mt5-ct2/" mt5_sp_model_path = "mt5/mt5-base/" trans_sp_source_enmy_path = "enmy_ctranslate2/source.model" trans_sp_target_enmy_path = "enmy_ctranslate2/target.model" trans_sp_source_myen_path = "myen_ctranslate2/source.model" trans_sp_target_myen_path = "myen_ctranslate2/target.model" trans_enmy_ct_model_path = "enmy_ctranslate2/" trans_myen_ct_model_path = "myen_ctranslate2/" #translator = ctranslate2.Translator(ct_model_path) #sp_model = spm.SentencePieceProcessor(sp_model_path) #!/usr/bin/python def segment_sentence(source): input_file = "input.txt" output_file = "output.txt" with open(input_file, "w", encoding="utf-8") as file: file.write(source) os.system("python myseg.py < input.txt > output.txt") #segmentation_command = f"python /content/drive/MyDrive/mbart-enmy/myseg.py {input_file} > {output_file}" #subprocess.run(segmentation_command, shell=True, check=True) with open(output_file, "r", encoding="utf-8") as file: segmented_content = file.read() #subprocess.run(f"rm /content/drive/MyDrive/input.txt /content/drive/MyDrive/output.txt", shell=True, check=True) #segmented_sentence = segmented_content.replace("|", " ") print(segmented_content) return segmented_content def write_to_file_myanmar(source): input_file = "write-input.txt" output_file = "read-output.txt" with open(input_file, "w", encoding="utf-8") as file: file.write(source) os.system("python myseg.py < write-input.txt > read-output.txt") #segmentation_command = f"python /content/drive/MyDrive/mbart-enmy/myseg.py {input_file} > {output_file}" #subprocess.run(segmentation_command, shell=True, check=True) with open(output_file, "r", encoding="utf-8") as file: segmented_contents = file.readlines() source_sents = [segmented_content.strip() for segmented_content in segmented_contents] return source_sents def write_to_file_english(source): input_file = "write-input.txt" #output_file = "read-output.txt" with open(input_file, "w", encoding="utf-8") as file: file.write(source) #os.system("python /content/drive/MyDrive/mbart-enmy/myseg.py < /content/drive/MyDrive/write-input.txt > /content/drive/MyDrive/read-output.txt") #segmentation_command = f"python /content/drive/MyDrive/mbart-enmy/myseg.py {input_file} > {output_file}" #subprocess.run(segmentation_command, shell=True, check=True) with open(input_file, "r", encoding="utf-8") as file: segmented_contents = file.readlines() source_sents = [segmented_content.strip() for segmented_content in segmented_contents] return source_sents def call_model_transformer(sources, direction_trans): if direction_trans == "English to Myanmar": ct_model_path = "enmy_ctranslate2/" sp_source_model_path = "enmy_ctranslate2/source.model" sp_target_model_path = "enmy_ctranslate2/target.model" if sources == "" : gr.Warning("Please Enter English Text") else: sp_source_model = sp.load("enmy_ctranslate2/source.model") sp_target_model = sp.load("enmy_ctranslate2/target.model") #translator = ctranslate2.Translator(ct_model_path) sources_seg = write_to_file_english(sources) # Subword the source sentences print(sources_seg) source_sents_subworded = sp.encode_as_pieces(sources_seg) # Translate the source sentences translator = ctranslate2.Translator(ct_model_path, device="cpu") # or "cuda" for GPU translations = translator.translate_batch(source_sents_subworded, batch_type="tokens", max_batch_size=4096) translations = [translation.hypotheses[0] for translation in translations] # Load the target SentecePiece model sp.load(sp_target_model_path) # Desubword the target sentences translations_desubword = sp.decode(translations) elif direction_trans == "Myanmar to English": ct_model_path = "myen_ctranslate2/" sp_source_model_path = "myen_ctranslate2/source.model" sp_target_model_path = "myen_ctranslate2/target.model" if source == "" : gr.Warning("Please Enter Myanmar Text") else: sp_source_model = sp.load(sp_source_model_path) sp_target_model = sp.load(sp_target_model_path) #translator = ctranslate2.Translator(ct_model_path) sources_seg = write_to_file_myanmar(sources) #Subword the source sentences source_sents_subworded = sp.encode_as_pieces(sources_seg) # Translate the source sentences translator = ctranslate2.Translator(ct_model_path, device="cpu") # or "cuda" for GPU translations = translator.translate_batch(source_sents_subworded, batch_type="tokens", max_batch_size=4096) translations = [translation.hypotheses[0] for translation in translations] # Load the target SentecePiece model sp.load(sp_target_model_path) # Desubword the target sentences translations_desubword = sp.decode(translations) else: gr.Warning("Please Select Language Direction") return translations_desubword def translate_trans_myen(source, translator, sp_source_model, sp_target_model): """Use CTranslate model to translate a sentence Args: source (str): Source sentences to translate translator (object): Object of Translator, with the CTranslate2 model tokenizer (object): Object of SentencePieceProcessor, with the SentencePiece source model Returns: Translation of the source text """ source = segment_sentence(source) source_sentences = sent_tokenize(source) # split sentences source_tokenized = sp_source_model.encode(source_sentences, out_type=str) translations = translator.translate_batch(source_tokenized, replace_unknowns=True) translations = [translation[0]["tokens"] for translation in translations] translations = sp_target_model.decode(translations) return translations def translate_trans_enmy(source, translator, sp_source_model, sp_target_model): """Use CTranslate model to translate a sentence Args: source (str): Source sentences to translate translator (object): Object of Translator, with the CTranslate2 model tokenizer (object): Object of SentencePieceProcessor, with the SentencePiece source model Returns: Translation of the source text """ source_sentences = sent_tokenize(source) # split sentences source_tokenized = sp_source_model.encode(source_sentences, out_type=str) translations = translator.translate_batch(source_tokenized, replace_unknowns=True) translations = [translation[0]["tokens"] for translation in translations] translations_detokenized = sp_target_model.decode(translations) return translations_detokenized def translate_mt5_myen(source, translator, tokenizer): """Use CTranslate model to translate a sentence Args: source (str): Source sentences to translate translator (object): Object of Translator, with the CTranslate2 model tokenizer (object): Object of SentencePieceProcessor, with the SentencePiece source model Returns: Translation of the source text """ source = segment_sentence(source) input_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(source)) results = translator.translate_batch([input_tokens]) output_tokens = results[0].hypotheses[0] translations = tokenizer.decode(tokenizer.convert_tokens_to_ids(output_tokens)) return translations def call_model_mt5(source, direction_mt5): if direction_mt5 == "English to Myanmar": translator = ctranslate2.Translator(mt5_ct_model_path) tokenizer = transformers.AutoTokenizer.from_pretrained(mt5_sp_model_path) if source == "" : gr.Warning("Please Enter English Text") else: translation = translate_mt5_enmy(source, translator, tokenizer) elif direction_mt5 == "Myanmar to English": translator = ctranslate2.Translator(mt5_ct_model_path) tokenizer = transformers.AutoTokenizer.from_pretrained(mt5_sp_model_path) if source == "" : gr.Warning("Please Enter Myanmar Text") else: translation = translate_mt5_myen(source, translator, tokenizer) else: gr.Warning("Please Select Language Direction") return translation def translate_mt5_enmy(source, translator, tokenizer): """Use CTranslate model to translate a sentence Args: source (str): Source sentences to translate translator (object): Object of Translator, with the CTranslate2 model tokenizer (object): Object of SentencePieceProcessor, with the SentencePiece source model Returns: Translation of the source text """ input_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(source)) results = translator.translate_batch([input_tokens]) output_tokens = results[0].hypotheses[0] translations = tokenizer.decode(tokenizer.convert_tokens_to_ids(output_tokens)) return translations def translate_mbart_myen(source, translator, sp_model): """Use CTranslate model to translate a sentence Args: source (str): Source sentences to translate translator (object): Object of Translator, with the CTranslate2 model sp_model (object): Object of SentencePieceProcessor, with the SentencePiece source model Returns: Translation of the source text """ source = segment_sentence(source) # source_sentences = sent_tokenize(source) source_tokenized = sp_model.encode(source, out_type=str) # print("print 1" , source_tokenized) source_tokenized = ["[my_MM]"] + source_tokenized # print("print " + source_tokenized) target_prefix = ["[en_XX]"] translations = translator.translate_batch([source_tokenized],target_prefix= [target_prefix]) # translations = [translation[0]["tokens"] for translation in translations] translations = sp_model.decode(translations[0].hypotheses[0][1:]) # translation = " ".join(translations_detokenized) return translations def translate_mbart_enmy(source, translator, sp_model): """Use CTranslate model to translate a sentence Args: source (str): Source sentences to translate translator (object): Object of Translator, with the CTranslate2 model sp_model (object): Object of SentencePieceProcessor, with the SentencePiece source model Returns: Translation of the source text """ # source_sentences = sent_tokenize(source) source_tokenized = sp_model.encode(source, out_type=str) # print("print 1" , source_tokenized) source_tokenized = ["[en_XX]"] + source_tokenized # print("print " + source_tokenized) target_prefix = ["[my_MM]"] translations = translator.translate_batch([source_tokenized],target_prefix= [target_prefix]) # translations = [translation[0]["tokens"] for translation in translations] translations = sp_model.decode(translations[0].hypotheses[0][1:]) # translation = " ".join(translations_detokenized) return translations def call_model(source, direction): if direction == "English to Myanmar": translator = ctranslate2.Translator(mbart_enmy_ct_model_path) sp_model = spm.SentencePieceProcessor(mbart_enmy_sp_model_path) if source == "" : gr.Warning("Please Enter English Text") else: translation = translate_mbart_enmy(source, translator, sp_model) elif direction == "Myanmar to English": translator = ctranslate2.Translator(mbart_myen_ct_model_path) sp_model = spm.SentencePieceProcessor(mbart_myen_sp_model_path) if source == "" : gr.Warning("Please Enter Myanmar Text") else: translation = translate_mbart_myen(source, translator, sp_model) else: gr.Warning("Please Select Language Direction") return translation css = """ #warning {background-color: #FFCCCB} .feedback textarea {font-size: 24px !important} #text_button1 {background: gray;} .translate {font-size: 5px !important;} .translate {width: 150px !important;} #img .img {width: 30px; height: 400px;} #img1 input, textarea, select {font-weight: bold; color:blue !important;} .tab button.selected{ font-size: 36px !important; font-weight: bold; color:blue !important; } """ def clear_mbart(): return "", "" def clear_mm_to_wa(): return "", "", "", "" import base64 theme = 'gstaff/whiteboard' demo = gr.Blocks(css=css, theme=gr.themes.Soft(), title="Machine Translation between Myanmar and English Translator") with open("logo.png", "rb") as image_file: encoded_string = base64.b64encode(image_file.read()).decode() width, height = 80, 80 html_content = f'' with demo: with gr.Row(equal_height=True): with gr.Column(scale=1, min_width=150): gr.HTML(html_content) with gr.Column(scale=25, min_width=150): gr.Markdown("