|
""" |
|
translation program for simple text |
|
1. detect the source language with langid
|
2. translate to target language given by user |
|
|
|
Example from |
|
https://www.thepythoncode.com/article/machine-translation-using-huggingface-transformers-in-python |
|
|
|
user_input: |
|
string: string to be translated |
|
target_lang: language to be translated to |
|
|
|
Returns: |
|
string: translated string of text |
|
|
|
try this : https://pypi.org/project/EasyNMT/ |
|
and this : https://huggingface.co/IDEA-CCNL/Randeng-Deltalm-362M-En-Zh |
|
""" |
|
from __future__ import annotations |
|
from typing import Iterable |
|
import gradio as gr |
|
from gradio.themes.base import Base |
|
from gradio.themes.utils import colors, fonts, sizes |
|
import argparse |
|
|
|
import langid |
|
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, logging |
|
from easynmt import EasyNMT |
|
|
|
|
|
# NOTE: ``logging`` here is the module imported from ``transformers`` at the
# top of the file, not the standard-library ``logging`` module.
# Set the verbosity to INFO so the ``logger.info`` calls in this file are emitted.
logging.set_verbosity_info()

# Logger shared by every function in this file.
logger = logging.get_logger("transformers")

# The EasyNMT models are created once, at import time, and reused for every
# request: ``mbart_trans`` uses the two mbart50 models and ``m2m_trans`` uses
# the m2m_100 model.
mbart_m2en_model = EasyNMT("mbart50_m2en")
mbart_en2m_model = EasyNMT("mbart50_en2m")
logger.info("mbart50 models initialized")

m2m_model = EasyNMT("m2m_100_1.2B")
logger.info("m2m_100 models initialized")
|
|
|
|
|
class myTheme(Base):
    """
    Custom Gradio theme built on ``Base``.

    All constructor arguments are keyword-only and are forwarded unchanged to
    ``Base.__init__``; a fixed set of style overrides (page background,
    primary button, slider and block styling) is then applied through ``set``.
    """

    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.red,
        secondary_hue: colors.Color | str = colors.blue,
        neutral_hue: colors.Color | str = colors.orange,
        spacing_size: sizes.Size | str = sizes.spacing_md,
        radius_size: sizes.Size | str = sizes.radius_md,
        text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("handjet"),
            "cursive",
        ),
        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"),
            "ui-monospace",
            "monospace",
        ),
    ):
        # Hues, sizes and fonts are passed straight through to the base theme.
        constructor_options = {
            "primary_hue": primary_hue,
            "secondary_hue": secondary_hue,
            "neutral_hue": neutral_hue,
            "spacing_size": spacing_size,
            "radius_size": radius_size,
            "text_size": text_size,
            "font": font,
            "font_mono": font_mono,
        }
        super().__init__(**constructor_options)

        # Values starting with "*" refer to other theme variables
        # (for example "*primary_800").
        style_overrides = {
            "body_background_fill": "repeating-linear-gradient(135deg, *primary_800, *primary_800 10px, *primary_900 10px, *primary_900 20px)",
            "button_primary_background_fill": "linear-gradient(90deg, *primary_600, *secondary_800)",
            "button_primary_background_fill_hover": "linear-gradient(45deg, *primary_200, *secondary_300)",
            "button_primary_text_color": "white",
            "slider_color": "*secondary_300",
            "slider_color_dark": "*secondary_600",
            "block_title_text_weight": "600",
            "block_border_width": "3px",
            "block_shadow": "*shadow_drop_lg",
            "button_shadow": "*shadow_drop_lg",
            "button_large_padding": "24px",
        }
        super().set(**style_overrides)
|
|
|
|
|
def detect_lang(article):
    """
    Identify the language of a piece of text with the langid library

    Args:
        article (string): text whose language should be identified

    Returns:
        string: first element of the langid classification result
            (the detected language short form)
    """
    classification = langid.classify(article)
    # The whole classification result is logged; only its first element is returned.
    logger.info(f"language detected as {classification}")
    language_code = classification[0]
    return language_code
|
|
|
|
|
def opus_trans(article, target_language):
    """
    Translation by Helsinki-NLP model

    Args:
        article (string): article that user wishes to translate
        target_language (string): language that user wishes to translate
            article into ("English" or "Chinese")

    Returns:
        string: translated piece of article based off target_language, or a
            message starting with "Error:" when the target language is
            unsupported, equals the detected language, or the model could not
            be loaded or run
    """
    language_codes = {"English": "en", "Chinese": "zh"}
    target_lang = language_codes.get(target_language)
    if target_lang is None:
        # Covers both an unknown name and no selection at all (None); without
        # this guard target_lang would be unbound further down.
        return "Error: Please select a target language (English or Chinese) and try again."

    result_lang = detect_lang(article)
    if result_lang == target_lang:
        return "Error: You chose the same language as the article detected language. Please reselect language and try again."

    # Task and checkpoint names are derived from the detected/target language pair.
    task_name = f"translation_{result_lang}_to_{target_lang}"
    model_name = f"Helsinki-NLP/opus-mt-{result_lang}-{target_lang}"
    try:
        translator = pipeline(task_name, model=model_name, tokenizer=model_name)
        return translator(article)[0]["translation_text"]
    except Exception:
        # Keep the user-facing message unchanged, but record the real cause
        # instead of silently discarding it (and let KeyboardInterrupt /
        # SystemExit propagate, which a bare ``except`` would have swallowed).
        logger.exception(f"OPUS translation failed for model {model_name}")
        return "Error: Model doesn't exist"
|
|
|
|
|
def nllb_trans(article, target_language):
    """
    Translation with module-level ``tokenizer`` and ``model`` objects

    NOTE(review): ``tokenizer`` and ``model`` are not defined anywhere in the
    visible part of this file, and ``translate`` never calls this function.
    Calling it will raise NameError unless they are created elsewhere. The
    language codes used here ("eng_Latn", "zho_Hans") look like NLLB codes;
    confirm the intended checkpoint.

    Args:
        article (string): article that user wishes to translate
        target_language (string): language that user wishes to translate
            article into ("English" or "Chinese")

    Returns:
        string: translated piece of article, or a message starting with
            "Error:" when the target language is unsupported or equals the
            detected language
    """
    # UI name -> (code passed to the tokenizer, short code compared with detect_lang)
    target_codes = {
        "English": ("eng_Latn", "en"),
        "Chinese": ("zho_Hans", "zh"),
    }
    selection = target_codes.get(target_language)
    if selection is None:
        # Covers both an unknown name and no selection at all (None); without
        # this guard target_lang would be unbound further down.
        return "Error: Please select a target language (English or Chinese) and try again."
    target_lang, target_short = selection

    result_lang = detect_lang(article)
    if result_lang == target_short:
        return "Error: You chose the same language as the article detected language. Please reselect language and try again."

    # Tokenize only once we know a translation will actually be produced.
    inputs = tokenizer(article, return_tensors="pt")
    translated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id[target_lang],
        # Generation is capped at max_length=30, as in the original code.
        max_length=30,
    )
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
|
|
|
|
|
def mbart_trans(article, target_language):
    """
    Translation by the EasyNMT mbart50 models

    Args:
        article (string): article that user wishes to translate
        target_language (string): language that user wishes to translate
            article into ("English" or "Chinese")

    Returns:
        string: translated piece of article, or a message starting with
            "Error:" when the target language is unsupported or equals the
            detected language
    """
    same_language_error = "Error: You chose the same language as the article detected language. Please reselect language and try again."

    language_codes = {"English": "en", "Chinese": "zh"}
    target_lang = language_codes.get(target_language)
    if target_lang is None:
        # Covers both an unknown name and no selection at all (None); without
        # this guard target_lang would be unbound further down.
        return "Error: Please select a target language (English or Chinese) and try again."

    result_lang = detect_lang(article)

    logger.info(f"Article to translate : {article}")
    logger.info("Chose which translation model: mbart model")
    logger.info(f"Language selected: {target_language}")

    if result_lang == target_lang:
        logger.warning(same_language_error)
        return same_language_error

    # English output uses the "m2en" model; Chinese output uses the "en2m" model.
    # NOTE(review): the "en2m" name suggests that model expects English input;
    # confirm how non-English articles translated to Chinese should be handled.
    model = mbart_m2en_model if target_lang == "en" else mbart_en2m_model
    translated = model.translate(article, target_lang=target_lang)
    logger.info(f"Translated Result: {translated}")
    return translated
|
|
|
|
|
def m2m_trans(article, target_language):
    """
    Translation by the EasyNMT m2m_100 model

    Args:
        article (string): article that user wishes to translate
        target_language (string): language that user wishes to translate
            article into ("English" or "Chinese")

    Returns:
        string: translated piece of article, or a message starting with
            "Error:" when the target language is unsupported or equals the
            detected language
    """
    same_language_error = "Error: You chose the same language as the article detected language. Please reselect language and try again."

    language_codes = {"English": "en", "Chinese": "zh"}
    target_lang = language_codes.get(target_language)
    if target_lang is None:
        # Covers both an unknown name and no selection at all (None); without
        # this guard target_lang would be unbound further down.
        return "Error: Please select a target language (English or Chinese) and try again."

    result_lang = detect_lang(article)

    logger.info(f"Article to translate : {article}")
    logger.info("Chose which translation model: m2m model")
    logger.info(f"Language selected: {target_language}")

    if result_lang == target_lang:
        logger.warning(same_language_error)
        return same_language_error

    translated = m2m_model.translate(article, target_lang)
    logger.info(f"Translation Result: {translated}")
    return translated
|
|
|
|
|
def translate(article, toolkit, target_language):
    """
    Dispatch a translation request to the model chosen by the user

    Args:
        article (string): article that user wishes to translate
        toolkit (string): translation model to use: "OPUS", "MBART" or "M2M"
        target_language (string): language that user wishes to translate
            article into

    Returns:
        string: result of the selected translation function, or a message
            starting with "Error:" when toolkit is not one of the supported
            names
    """
    if toolkit == "OPUS":
        return opus_trans(article, target_language)
    if toolkit == "MBART":
        return mbart_trans(article, target_language)
    if toolkit == "M2M":
        return m2m_trans(article, target_language)

    # Previously an unrecognised (or missing) toolkit fell through every
    # branch and raised UnboundLocalError on ``return translated``.
    return f"Error: Unknown translation model {toolkit!r}. Please select OPUS, MBART or M2M and try again."
|
|
|
|
|
# Keep the instance under its own name: rebinding ``myTheme`` to an instance
# overwrote the class, making it unreachable and breaking any re-execution of
# this module body (the instance is not callable).
theme = myTheme()

# Layout: article input, model and language selectors, result box and a button
# that sends the three inputs to ``translate`` and shows its return value.
with gr.Blocks(theme=theme) as demo:
    article = gr.Textbox(label="Article")
    toolkit_select = gr.Radio(
        ["OPUS", "MBART", "M2M"], label="Select Translation Model", value="MBART"
    )
    # No default value here, so ``translate`` can receive None for the language.
    lang_select = gr.Radio(["English", "Chinese"], label="Select Desired Language")
    result = gr.Textbox(label="Translated Result")
    trans_btn = gr.Button("Translate")
    trans_btn.click(
        fn=translate, inputs=[article, toolkit_select, lang_select], outputs=result
    )

demo.launch()
|
|