dpc committed on
Commit
8877cce
1 Parent(s): 1c69775

update model and gr

Browse files
Files changed (2) hide show
  1. README.md +3 -3
  2. app.py +140 -29
README.md CHANGED
@@ -11,9 +11,9 @@ pinned: false
11
  ## Info
12
 
13
 
14
- Using facebook/m2m100_1.2B pre-trained model
15
 
16
- facebook/m2m100_1.2B supports 100 languages.
17
 
18
  Here, this app uses/tests these languages only.
19
 
@@ -31,4 +31,4 @@ Vietnamese(vi)
31
 
32
  ## Read more:
33
 
34
- https://huggingface.co/facebook/m2m100_1.2B
 
11
  ## Info
12
 
13
 
14
+ Using facebook/m2m100-12B-avg-5-ckpt pre-trained model
15
 
16
+ facebook/m2m100-12B-avg-5-ckpt supports 100 languages.
17
 
18
  Here, this app uses/tests these languages only.
19
 
 
31
 
32
  ## Read more:
33
 
34
+ https://huggingface.co/facebook/m2m100-12B-avg-5-ckpt
app.py CHANGED
@@ -10,7 +10,7 @@ tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_1.2B")
10
 
11
 
12
  this_description = '''
13
- Using facebook/m2m100_1.2B pre-trained model. Language code:
14
  Chinese(zh)
15
  English(en)
16
  Hindi(hi)
@@ -18,16 +18,121 @@ Japanese(ja)
18
  Sinhalese(si)
19
  Thai(th)
20
  Vietnamese(vi)
 
21
  '''
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  def m2m_translate(Input_Text, from_lang, to_lang):
25
- tokenizer.src_lang = from_lang
26
 
27
  encoded_from_lang = tokenizer(Input_Text, return_tensors="pt")
28
 
29
  generated_tokens = model.generate(
30
- **encoded_from_lang, forced_bos_token_id=tokenizer.get_lang_id(to_lang))
31
 
32
  res = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
33
 
@@ -39,35 +144,41 @@ def m2m_translate(Input_Text, from_lang, to_lang):
39
  iface = gr.Interface(
40
  fn=m2m_translate,
41
 
42
- title="M2M100 Translation",
43
  description=this_description,
44
 
45
  inputs=[
46
- gr.inputs.Textbox(lines=5, placeholder="Enter text"),
47
-
48
- gr.inputs.Radio(
49
- choices=[
50
- 'zh',
51
- 'en',
52
- 'hi',
53
- 'ja',
54
- 'si',
55
- 'th',
56
- 'vi'],
57
- default='vi',
58
- label='From language'),
59
-
60
- gr.inputs.Radio(
61
- choices=[
62
- 'zh',
63
- 'en',
64
- 'hi',
65
- 'ja',
66
- 'si',
67
- 'th',
68
- 'vi'],
69
- default='en',
70
- label='To language'),
 
 
 
 
 
 
71
  ],
72
  outputs="text")
73
 
 
10
 
11
 
12
  this_description = '''
13
+ Using facebook/m2m100-12B-avg-5-ckpt pre-trained model. Language code:
14
  Chinese(zh)
15
  English(en)
16
  Hindi(hi)
 
18
  Sinhalese(si)
19
  Thai(th)
20
  Vietnamese(vi)
21
+ ...
22
  '''
23
 
24
+ # From facebook/m2m100-12B-avg-5-ckpt
25
+ lang_codes = {
26
+ "Afrikaans": "af",
27
+ "Amharic": "am",
28
+ "Arabic": "ar",
29
+ "Asturian": "ast",
30
+ "Azerbaijani": "az",
31
+ "Bashkir": "ba",
32
+ "Belarusian": "be",
33
+ "Bulgarian": "bg",
34
+ "Bengali": "bn",
35
+ "Breton": "br",
36
+ "Bosnian": "bs",
37
+ "Catalan; Valencian": "ca",
38
+ "Cebuano": "ceb",
39
+ "Czech": "cs",
40
+ "Welsh": "cy",
41
+ "Danish": "da",
42
+ "German": "de",
43
+ "Greek": "el",
44
+ "English": "en",
45
+ "Spanish": "es",
46
+ "Estonian": "et",
47
+ "Persian": "fa",
48
+ "Fulah": "ff",
49
+ "Finnish": "fi",
50
+ "French": "fr",
51
+ "Western Frisian": "fy",
52
+ "Irish": "ga",
53
+ "Gaelic; Scottish Gaelic": "gd",
54
+ "Galician": "gl",
55
+ "Gujarati": "gu",
56
+ "Hausa": "ha",
57
+ "Hebrew": "he",
58
+ "Hindi": "hi",
59
+ "Croatian": "hr",
60
+ "Haitian; Haitian Creole": "ht",
61
+ "Hungarian": "hu",
62
+ "Armenian": "hy",
63
+ "Indonesian": "id",
64
+ "Igbo": "ig",
65
+ "Iloko": "ilo",
66
+ "Icelandic": "is",
67
+ "Italian": "it",
68
+ "Japanese": "ja",
69
+ "Javanese": "jv",
70
+ "Georgian": "ka",
71
+ "Kazakh": "kk",
72
+ "Central Khmer": "km",
73
+ "Kannada": "kn",
74
+ "Korean": "ko",
75
+ "Luxembourgish; Letzeburgesch": "lb",
76
+ "Ganda": "lg",
77
+ "Lingala": "ln",
78
+ "Lao": "lo",
79
+ "Lithuanian": "lt",
80
+ "Latvian": "lv",
81
+ "Malagasy": "mg",
82
+ "Macedonian": "mk",
83
+ "Malayalam": "ml",
84
+ "Mongolian": "mn",
85
+ "Marathi": "mr",
86
+ "Malay": "ms",
87
+ "Burmese": "my",
88
+ "Nepali": "ne",
89
+ "Dutch; Flemish": "nl",
90
+ "Norwegian": "no",
91
+ "Northern Sotho": "ns",
92
+ "Occitan": "oc",
93
+ "Oriya": "or",
94
+ "Panjabi; Punjabi": "pa",
95
+ "Polish": "pl",
96
+ "Pushto": "ps",
97
+ "Portuguese": "pt",
98
+ "Romanian; Moldavian; Moldovan": "ro",
99
+ "Russian": "ru",
100
+ "Sindhi": "sd",
101
+ "Sinhala; Sinhalese": "si",
102
+ "Slovak": "sk",
103
+ "Slovenian": "sl",
104
+ "Somali": "so",
105
+ "Albanian": "sq",
106
+ "Serbian": "sr",
107
+ "Swati": "ss",
108
+ "Sundanese": "su",
109
+ "Swedish": "sv",
110
+ "Swahili": "sw",
111
+ "Tamil": "ta",
112
+ "Thai": "th",
113
+ "Tagalog": "tl",
114
+ "Tswana": "tn",
115
+ "Turkish": "tr",
116
+ "Ukrainian": "uk",
117
+ "Urdu": "ur",
118
+ "Uzbek": "uz",
119
+ "Vietnamese": "vi",
120
+ "Wolof": "wo",
121
+ "Xhosa": "xh",
122
+ "Yiddish": "yi",
123
+ "Yoruba": "yo",
124
+ "Chinese": "zh",
125
+ "Zulu": "zu"
126
+ }
127
+
128
 
129
  def m2m_translate(Input_Text, from_lang, to_lang):
130
+ tokenizer.src_lang = lang_codes[from_lang]
131
 
132
  encoded_from_lang = tokenizer(Input_Text, return_tensors="pt")
133
 
134
  generated_tokens = model.generate(
135
+ **encoded_from_lang, forced_bos_token_id=tokenizer.get_lang_id(lang_codes[to_lang]))
136
 
137
  res = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
138
 
 
144
  iface = gr.Interface(
145
  fn=m2m_translate,
146
 
147
+ title="M2M100 Text Translation",
148
  description=this_description,
149
 
150
  inputs=[
151
+ gr.Textbox(lines=5, placeholder="Enter text", label="Text input"),
152
+
153
+ gr.Radio(
154
+ choices=[
155
+ 'Burmese',
156
+ 'Chinese',
157
+ 'English',
158
+ 'Hindi',
159
+ 'Japanese',
160
+ 'Sinhala; Sinhalese',
161
+ 'Thai',
162
+ 'Vietnamese'
163
+ ],
164
+ default='Vietnamese',
165
+ label='From language'
166
+ ),
167
+
168
+ gr.Radio(
169
+ choices=[
170
+ 'Burmese',
171
+ 'Chinese',
172
+ 'English',
173
+ 'Hindi',
174
+ 'Japanese',
175
+ 'Sinhala; Sinhalese',
176
+ 'Thai',
177
+ 'Vietnamese'
178
+ ],
179
+ default='English',
180
+ label='To language'
181
+ ),
182
  ],
183
  outputs="text")
184