Spaces:

chinhon
/

translation_eng2ch

Running

App Files Files Community

chinhon committed on Jan 19, 2022

Commit

eee9aa8

•

1 Parent(s): ba46ea1

Upload app.py

Browse files

Files changed (1) hide show

app.py +80 -0

app.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import gradio as gr
+import nltk
+import numpy as np
+import re
+import warnings
+from nltk.tokenize import sent_tokenize
+from transformers import (
+    MarianTokenizer,
+    MarianMTModel,
+)
# Text-cleaning helpers.

# Newlines and tabs are flattened to spaces before any other processing.
_LINE_BREAK_RE = re.compile(r"[\n\t]")
# Runs of spaces are collapsed to a single space.
_MULTI_SPACE_RE = re.compile(r" +")
# Boilerplate sentence appended to CNA articles (both the "coronavirus" and
# the "COVID-19" wording). The trailing link is NOT part of this pattern:
# it is removed by _URL_RE, so the sentence is stripped regardless of the
# order in which the two substitutions run.
_CNA_FOOTER_RE = re.compile(
    r"Download our app or subscribe to our Telegram channel for the latest "
    r"updates on the (?:coronavirus|COVID-19) outbreak:"
)
_URL_RE = re.compile(r"http\S+")
_AD_MARKER_RE = re.compile(r"ADVERTISEMENT")


def clean_text(text: str) -> str:
    """Return *text* stripped of noise that should not be translated.

    The following is removed or normalised, in order:

    1. every non-ASCII character (this also drops any Chinese characters);
    2. newlines and tabs, which become spaces, after which runs of spaces
       are collapsed so that multi-word patterns match across line breaks;
    3. the CNA "Download our app or subscribe to our Telegram channel ..."
       footer sentence;
    4. anything starting with ``http`` up to the next whitespace;
    5. the literal marker ``ADVERTISEMENT``;
    6. repeated spaces and leading/trailing whitespace.

    Args:
        text: Raw English text. Must be a ``str``.

    Returns:
        The cleaned text; an empty string if nothing is left.
    """
    # Drop everything that cannot be represented in ASCII.
    text = text.encode("ascii", errors="ignore").decode("ascii")

    # Normalise whitespace first so the footer pattern below still matches
    # when the sentence was wrapped over several lines.
    text = _LINE_BREAK_RE.sub(" ", text)
    text = _MULTI_SPACE_RE.sub(" ", text)

    # The footer must not depend on its URL still being present: the URL
    # pattern would otherwise delete the link and leave the sentence behind.
    text = _CNA_FOOTER_RE.sub(" ", text)
    text = _URL_RE.sub("", text)
    text = _AD_MARKER_RE.sub(" ", text)

    # The removals above can leave double spaces and dangling edges.
    return _MULTI_SPACE_RE.sub(" ", text).strip()
# Translation.

# Hugging Face model id of the MarianMT English-to-Chinese checkpoint.
modchoice = "Helsinki-NLP/opus-mt-en-zh"

# (tokenizer, model) pairs keyed by model id. Filled on first use so the
# checkpoint is loaded once per process rather than once per request.
_marian_cache = {}


def _load_marian(model_name):
    """Return the ``(tokenizer, model)`` pair for *model_name*, loading it once."""
    if model_name not in _marian_cache:
        _marian_cache[model_name] = (
            MarianTokenizer.from_pretrained(model_name),
            MarianMTModel.from_pretrained(model_name),
        )
    return _marian_cache[model_name]


def translate(text):
    """Translate English *text* into Chinese with the ``modchoice`` model.

    The text is cleaned with ``clean_text``, split into sentences with
    ``sent_tokenize``, translated as one padded batch, and the decoded
    sentences are joined with single spaces.

    Args:
        text: English text to translate.

    Returns:
        The translated text as a ``str``. When *text* is empty/``None`` or
        nothing is left after cleaning, the one-element tuple ``("Error",)``
        is returned instead.
    """
    # Validate before doing any work: cleaning a None value would raise and
    # there is no point loading the model for an empty request.
    if not text:
        # NOTE: the tuple shape is kept as-is for backward compatibility.
        return ("Error",)

    input_text = clean_text(text)
    # Cleaning can remove everything (e.g. input with no ASCII characters);
    # an empty sentence list cannot be tokenized into a batch.
    if not input_text:
        return ("Error",)

    tokenizer, model = _load_marian(modchoice)

    # Calling the tokenizer directly replaces the deprecated
    # prepare_seq2seq_batch(); the arguments are passed through unchanged.
    batch = tokenizer(
        sent_tokenize(input_text),
        truncation=True,
        padding="longest",
        return_tensors="pt",
    )
    translated = model.generate(**batch)
    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
    return " ".join(tgt_text)
# Web UI: one multi-line text box for the English input and one text box
# for the Chinese output, wired to translate().
english_input = gr.inputs.Textbox(lines=20, label="Paste English text here")
chinese_output = gr.outputs.Textbox(label="Chinese translation")

gradio_ui = gr.Interface(
    fn=translate,
    inputs=english_input,
    outputs=chinese_output,
    title="English-to-Chinese translation",
    description="Translate English text into Chinese using MarianMT's opus-mt-en-zh model.",
    theme="huggingface",
)

# Start the app; launch() is called with enable_queue=True.
gradio_ui.launch(enable_queue=True)