chinhon committed on
Commit
eee9aa8
1 Parent(s): ba46ea1

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -0
app.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import nltk
3
+ import numpy as np
4
+ import re
5
+ import warnings
6
+
7
+ from nltk.tokenize import sent_tokenize
8
+ from transformers import (
9
+ MarianTokenizer,
10
+ MarianMTModel,
11
+ )
12
+
13
# Define function for text cleaning.

# CNA's Telegram promo line (both the "coronavirus" and "COVID-19" wordings).
# It has to be matched while its URL is still present, i.e. before the
# generic URL stripping below; the dots in the URL are escaped so they only
# match literal dots.
_CNA_TELEGRAM_PROMO = re.compile(
    r"Download our app or subscribe to our Telegram channel for the latest "
    r"updates on the (?:coronavirus|COVID-19) outbreak: "
    r"https://cna\.asia/telegram\S*"
)


def clean_text(text):
    """Normalise pasted article text before it is split into sentences.

    Steps, in order:
      1. drop every non-ASCII character (this also removes Chinese text);
      2. turn newlines and tabs into spaces;
      3. remove the CNA "Download our app ... Telegram" promo line;
      4. remove anything that looks like a URL (``http`` followed by
         non-whitespace);
      5. remove the literal marker ``ADVERTISEMENT``;
      6. collapse runs of spaces and trim the ends.

    Args:
        text: the raw input string.

    Returns:
        The cleaned string; it may be empty.
    """
    # Remove non-ASCII, Chinese characters.
    text = text.encode("ascii", errors="ignore").decode("ascii")

    # One pass handles both newlines and tabs (a separate "\n\n" pass would
    # never find anything once single newlines are gone).
    text = re.sub(r"[\n\t]", " ", text)

    # Must run before URL stripping: the promo pattern includes its URL, so
    # stripping URLs first would leave the promo sentence behind.
    text = _CNA_TELEGRAM_PROMO.sub(" ", text)

    text = re.sub(r"http\S+", "", text)
    text = text.replace("ADVERTISEMENT", " ")

    # Get rid of multiple spaces and replace with a single one.
    return re.sub(" +", " ", text).strip()
38
+
39
+
40
# Define function for translation.
modchoice = "Helsinki-NLP/opus-mt-en-zh"

# Loaded (tokenizer, model) pairs keyed by checkpoint name, so the weights
# are read once per process instead of once per request.
_translators = {}


def _get_translator(name):
    """Return the cached (tokenizer, model) pair for *name*, loading it on first use."""
    if name not in _translators:
        _translators[name] = (
            MarianTokenizer.from_pretrained(name),
            MarianMTModel.from_pretrained(name),
        )
    return _translators[name]


def translate(text):
    """Translate English text with the checkpoint named by ``modchoice``.

    The input is cleaned with ``clean_text``, split into sentences with
    ``sent_tokenize``, translated as one padded batch, and the decoded
    sentences are joined with single spaces.

    Args:
        text: the text typed or pasted into the UI.

    Returns:
        The translated text as a single string, or the 1-tuple ``("Error",)``
        when there is nothing to translate (``None``, empty, or empty after
        cleaning).
    """
    # Validate first, before touching the model, and check the *cleaned*
    # text: whitespace-only or all-non-ASCII input cleans down to "" and
    # would otherwise reach the tokenizer as an empty batch.
    input_text = clean_text(text) if text else ""
    if not input_text:
        # NOTE(review): the error value is a 1-tuple, not a plain string;
        # kept as-is so the return shape seen by the UI does not change.
        return ("Error",)

    # NOTE(review): sent_tokenize presumably needs the NLTK "punkt" data to
    # be available at runtime; nothing in this file downloads it. Confirm.
    sentences = sent_tokenize(input_text)

    tokenizer, model = _get_translator(modchoice)

    # Calling the tokenizer directly replaces the deprecated
    # prepare_seq2seq_batch(); the arguments are the same.
    batch = tokenizer(
        sentences,
        truncation=True,
        padding="longest",
        return_tensors="pt",
    )
    translated = model.generate(**batch)

    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

    return " ".join(tgt_text)
67
+
68
+
69
# Web UI: one large text box for the English input, one text box for the
# Chinese output, wired to translate().
_english_input = gr.inputs.Textbox(lines=20, label="Paste English text here")
_chinese_output = gr.outputs.Textbox(label="Chinese translation")

gradio_ui = gr.Interface(
    fn=translate,
    inputs=_english_input,
    outputs=_chinese_output,
    title="English-to-Chinese translation",
    description="Translate English text into Chinese using MarianMT's opus-mt-en-zh model.",
    theme="huggingface",
)

# Start serving the interface with request queueing enabled.
gradio_ui.launch(enable_queue=True)