import gradio as gr
import nltk
import re

from nltk.tokenize import sent_tokenize
from transformers import (
    MarianTokenizer,
    MarianMTModel,
)

# Download the sentence tokenizer used by sent_tokenize; recent NLTK
# releases may additionally need nltk.download('punkt_tab')
nltk.download('punkt')

# Define a function for text cleaning
def clean_text(text):
    # Drop non-ASCII characters (e.g. stray CJK characters in the English source)
    text = text.encode("ascii", errors="ignore").decode("ascii")
    # Replace newlines and tabs with spaces
    text = re.sub(r"[\n\t]+", " ", text)
    # Strip CNA article boilerplate before removing URLs, so the full
    # boilerplate sentence (which ends in a URL) can still be matched
    text = re.sub(
        r"Download our app or subscribe to our Telegram channel for the latest updates on the coronavirus outbreak: https://cna.asia/telegram",
        " ",
        text,
    )
    text = re.sub(
        r"Download our app or subscribe to our Telegram channel for the latest updates on the COVID-19 outbreak: https://cna.asia/telegram",
        " ",
        text,
    )
    text = re.sub(r"ADVERTISEMENT", " ", text)
    # Remove any remaining URLs
    text = re.sub(r"http\S+", "", text)
    # Collapse repeated spaces and trim
    text = re.sub(r" +", " ", text).strip()
    return text
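
# Illustrative example of the cleaning step (hypothetical input string,
# not part of the original app):
#   clean_text("Breaking news:\n\nRead more at http://example.com\tADVERTISEMENT")
#   returns "Breaking news: Read more at"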


# Model checkpoint for English-to-Chinese translation
modchoice = "Helsinki-NLP/opus-mt-en-zh"

# Load the tokenizer and model once at startup instead of on every request
tokenizer = MarianTokenizer.from_pretrained(modchoice)
model = MarianMTModel.from_pretrained(modchoice)


# Define a function for translation
def translate(text):
    input_text = clean_text(text)

    # Guard against empty input before running the model
    if not input_text:
        return "Error: no text to translate"

    # Tokenize sentence by sentence; the deprecated prepare_seq2seq_batch
    # helper is replaced by a direct tokenizer call
    batch = tokenizer(
        sent_tokenize(input_text),
        truncation=True,
        padding="longest",
        return_tensors="pt",
    )
    translated = model.generate(**batch)

    # Decode the generated token IDs back into text
    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

    return " ".join(tgt_text)
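
# Quick local sanity check (hypothetical input sentence; running it downloads
# the opus-mt-en-zh weights, so it is left commented out):
# print(translate("The new measures will take effect next Monday."))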


# Build the web UI. This uses the current Gradio component API; the original
# script targeted an older release (gr.inputs.Textbox, gr.outputs.Textbox,
# theme="huggingface", launch(enable_queue=True)).
gradio_ui = gr.Interface(
    fn=translate,
    title="English-to-Chinese translation",
    description="Translate English text into Chinese with the Helsinki-NLP/opus-mt-en-zh MarianMT model.",
    inputs=gr.Textbox(lines=20, label="Paste English text here"),
    outputs=gr.Textbox(label="Chinese translation"),
)

# Queue requests so long translations are not cut off by request timeouts
gradio_ui.queue().launch()
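
# launch() serves the UI locally; Gradio's default address is
# http://127.0.0.1:7860 unless server_name/server_port are overridden.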