"""Gradio demo: translate English text into Chinese with a MarianMT model."""

import functools
import re
import warnings

import gradio as gr
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
from transformers import (
    MarianMTModel,
    MarianTokenizer,
)

# sent_tokenize() needs the Punkt sentence-splitter data.
nltk.download('punkt')

# Footer sentence that is stripped from pasted articles. It must be removed
# BEFORE generic URL stripping, otherwise the trailing link is already gone
# and the sentence no longer matches.
_FOOTER_RE = re.compile(
    r"Download our app or subscribe to our Telegram channel for the latest "
    r"updates on the (?:coronavirus|COVID-19) outbreak: "
    r"https://cna\.asia/telegram"
)
_URL_RE = re.compile(r"http\S+")
_NEWLINE_TAB_RE = re.compile(r"[\n\t]")
_MULTI_SPACE_RE = re.compile(r" +")


# define function for text cleaning
def clean_text(text):
    """Normalise pasted article text before translation.

    Steps, in order: drop every non-ASCII character, turn newlines and tabs
    into spaces, remove the known footer sentence, remove URLs, remove the
    literal marker ``ADVERTISEMENT``, then collapse runs of spaces and strip
    surrounding whitespace.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string (possibly empty).
    """
    # Dropping non-ASCII removes Chinese characters, but also any other
    # non-ASCII symbol such as curly quotes or accented letters.
    text = text.encode("ascii", errors="ignore").decode("ascii")
    text = _NEWLINE_TAB_RE.sub(" ", text)
    text = _FOOTER_RE.sub(" ", text)
    text = _URL_RE.sub("", text)
    text = text.replace("ADVERTISEMENT", " ")
    # get rid of multiple spaces and replace with a single
    return _MULTI_SPACE_RE.sub(" ", text).strip()


# define function for translation
modchoice = "Helsinki-NLP/opus-mt-en-zh"


@functools.lru_cache(maxsize=1)
def _load_translator():
    """Load the tokenizer and model for ``modchoice`` once and reuse them."""
    tokenizer = MarianTokenizer.from_pretrained(modchoice)
    model = MarianMTModel.from_pretrained(modchoice)
    return tokenizer, model


def translate(text):
    """Translate English ``text`` into Chinese, sentence by sentence.

    Args:
        text: English text as entered in the UI; may be empty or ``None``.

    Returns:
        The translated sentences joined by single spaces, or the string
        ``"Error"`` when there is nothing to translate (empty/``None`` input,
        or input that is empty after cleaning).
    """
    # Validate before touching the model so empty requests stay cheap.
    if not text:
        return "Error"
    input_text = clean_text(text)
    if not input_text:
        return "Error"

    sentences = sent_tokenize(input_text)
    tokenizer, model = _load_translator()
    # Calling the tokenizer directly replaces the deprecated
    # prepare_seq2seq_batch() helper.
    batch = tokenizer(
        sentences,
        truncation=True,
        padding="longest",
        return_tensors="pt",
    )
    translated = model.generate(**batch)
    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
    return " ".join(tgt_text)


# NOTE(review): gr.inputs / gr.outputs and launch(enable_queue=...) belong to
# the legacy Gradio API; confirm the pinned Gradio version before upgrading.
gradio_ui = gr.Interface(
    fn=translate,
    title="English-to-Chinese translation",
    description="Translate English text into Chinese using MarianMT's opus-mt-en-zh model.",
    inputs=gr.inputs.Textbox(
        lines=20, label="Paste English text here"
    ),
    outputs=gr.outputs.Textbox(label="Chinese translation"),
    theme="huggingface",
)

gradio_ui.launch(enable_queue=True)