adrianmoses committed on
Commit
b3dc152
1 Parent(s): 0251d4d

saving changes

Browse files
Files changed (2) hide show
  1. app.py +101 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langdetect import detect
3
+ from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
4
+
5
+
6
@st.cache
def load_data():
    """Build the lookup from a bare language code to its mBART-50 language code.

    Returns:
        dict: maps the part of each supported code that precedes the
        underscore (e.g. ``'en'``) to the full code (e.g. ``'en_XX'``).
    """
    mbart_codes = (
        'ar_AR', 'cs_CZ', 'de_DE', 'en_XX', 'es_XX', 'et_EE', 'fi_FI',
        'fr_XX', 'gu_IN', 'hi_IN', 'it_IT', 'ja_XX', 'kk_KZ', 'ko_KR',
        'lt_LT', 'lv_LV', 'my_MM', 'ne_NP', 'nl_XX', 'ro_RO', 'ru_RU',
        'si_LK', 'tr_TR', 'vi_VN', 'zh_CN', 'af_ZA', 'az_AZ', 'bn_IN',
        'fa_IR', 'he_IL', 'hr_HR', 'id_ID', 'ka_GE', 'km_KH', 'mk_MK',
        'ml_IN', 'mn_MN', 'mr_IN', 'pl_PL', 'ps_AF', 'pt_XX', 'sv_SE',
        'sw_KE', 'ta_IN', 'te_IN', 'th_TH', 'tl_XX', 'uk_UA', 'ur_PK',
        'xh_ZA', 'gl_ES', 'sl_SI',
    )
    lookup = {}
    for code in mbart_codes:
        # Key on the language part only, e.g. 'fr_XX' -> 'fr'.
        prefix = code.split('_')[0]
        lookup[prefix] = code
    return lookup
63
+
64
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_model():
    """Load the pretrained mBART-50 many-to-many model and its tokenizer.

    Returns:
        tuple: ``(model, tokenizer)``, both created with ``from_pretrained``
        from the same checkpoint name.
    """
    checkpoint = "facebook/mbart-large-50-many-to-many-mmt"
    # The model is loaded first, then the tokenizer, as a pair for the caller.
    translator = MBartForConditionalGeneration.from_pretrained(checkpoint)
    return translator, MBart50TokenizerFast.from_pretrained(checkpoint)
70
+
71
# Lookup from bare language code (e.g. 'fr') to mBART-50 code (e.g. 'fr_XX').
data = load_data()


def translate_to_english(model, tokenizer, text):
    """Detect the language of *text* and translate it to English.

    Args:
        model: the translation model; its ``generate`` method is called with
            the tokenized text and the English forced-BOS token id.
        tokenizer: the matching tokenizer; its ``src_lang`` attribute is set
            to the detected source language before encoding.
        text: the text to translate.

    Returns:
        A list of decoded translation strings (one per input sequence), or
        ``None`` when the language cannot be detected or is not a key of the
        module-level ``data`` lookup.
    """
    # Imported locally so this block does not depend on a file-level change.
    from langdetect import LangDetectException

    try:
        src_lang = detect(text)
    except LangDetectException as err:
        # Raised for input with no detectable features (e.g. only digits).
        print(f"Could not detect language: {err}")
        return None

    # langdetect reports some languages with a region suffix (e.g. 'zh-cn'),
    # while `data` is keyed by the bare language code.
    mbart_code = data.get(src_lang.split('-')[0])
    if mbart_code is None:
        print(f"Language {src_lang} not found")
        return None

    # The tokenizer needs the full mBART code ('fr_XX'), not the bare 'fr'.
    tokenizer.src_lang = mbart_code
    encoded_txt = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(
        **encoded_txt,
        forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"],
    )
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
86
+
87
# --- Streamlit page: text box in, English translation out ---
st.title("Auto Translate (To English)")

# The prompt advertises how many source languages the lookup contains.
language_count = len(data)
text = st.text_input(f"Write in any (1 of {language_count}) language")

# Echo the user's input back.
st.text("What you wrote: ")
st.write(text)

st.text("English Translation: ")

# Only load the model and translate once the user has typed something.
if text:
    model, tokenizer = load_model()
    translated_text = translate_to_english(model, tokenizer, text)
    if translated_text:
        # Show the first decoded sequence.
        st.write(translated_text[0])
    else:
        st.write("No translation found")
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ torch
3
+ transformers
4
+ transformers[sentencepiece]
5
+ spacy
6
+ pycld2
7
+ langdetect