AreesaAshfaq committed on
Commit
a316b7b
·
verified ·
1 Parent(s): 6ac05e2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -127
app.py CHANGED
@@ -1,139 +1,131 @@
1
  import streamlit as st
2
- from transformers import MarianMTModel, MarianTokenizer
3
 
4
- # Define a comprehensive dictionary of language names, codes, and model names
5
  LANGUAGE_MODELS = {
6
- 'Afrikaans': ('af', 'Helsinki-NLP/opus-mt-en-af'),
7
- 'Albanian': ('sq', 'Helsinki-NLP/opus-mt-en-sq'),
8
- 'Arabic': ('ar', 'Helsinki-NLP/opus-mt-en-ar'),
9
- 'Armenian': ('hy', 'Helsinki-NLP/opus-mt-en-hy'),
10
- 'Catalan': ('ca', 'Helsinki-NLP/opus-mt-en-ca'),
11
- 'Czech': ('cs', 'Helsinki-NLP/opus-mt-en-cs'),
12
- 'Danish': ('da', 'Helsinki-NLP/opus-mt-en-da'),
13
- 'Dutch': ('nl', 'Helsinki-NLP/opus-mt-en-nl'),
14
- 'Esperanto': ('eo', 'Helsinki-NLP/opus-mt-en-eo'),
15
- 'Estonian': ('et', 'Helsinki-NLP/opus-mt-en-et'),
16
- 'Finnish': ('fi', 'Helsinki-NLP/opus-mt-en-fi'),
17
- 'French': ('fr', 'Helsinki-NLP/opus-mt-en-fr'),
18
- 'German': ('de', 'Helsinki-NLP/opus-mt-en-de'),
19
- 'Greek': ('el', 'Helsinki-NLP/opus-mt-en-el'),
20
- 'Gujarati': ('gu', 'Helsinki-NLP/opus-mt-en-gu'),
21
- 'Haitian Creole': ('ht', 'Helsinki-NLP/opus-mt-en-ht'),
22
- 'Hausa': ('ha', 'Helsinki-NLP/opus-mt-en-ha'),
23
- 'Hawaiian': ('haw', 'Helsinki-NLP/opus-mt-en-haw'),
24
- 'Hebrew': ('he', 'Helsinki-NLP/opus-mt-en-he'),
25
- 'Hindi': ('hi', 'Helsinki-NLP/opus-mt-en-hi'),
26
- 'Hungarian': ('hu', 'Helsinki-NLP/opus-mt-en-hu'),
27
- 'Icelandic': ('is', 'Helsinki-NLP/opus-mt-en-is'),
28
- 'Igbo': ('ig', 'Helsinki-NLP/opus-mt-en-ig'),
29
- 'Indonesian': ('id', 'Helsinki-NLP/opus-mt-en-id'),
30
- 'Irish': ('ga', 'Helsinki-NLP/opus-mt-en-ga'),
31
- 'Italian': ('it', 'Helsinki-NLP/opus-mt-en-it'),
32
- 'Japanese': ('ja', 'Helsinki-NLP/opus-mt-en-ja'),
33
- 'Javanese': ('jw', 'Helsinki-NLP/opus-mt-en-jw'),
34
- 'Kannada': ('kn', 'Helsinki-NLP/opus-mt-en-kn'),
35
- 'Khmer': ('km', 'Helsinki-NLP/opus-mt-en-km'),
36
- 'Korean': ('ko', 'Helsinki-NLP/opus-mt-en-ko'),
37
- 'Latin': ('la', 'Helsinki-NLP/opus-mt-en-la'),
38
- 'Latvian': ('lv', 'Helsinki-NLP/opus-mt-en-lv'),
39
- 'Lithuanian': ('lt', 'Helsinki-NLP/opus-mt-en-lt'),
40
- 'Luxembourgish': ('lb', 'Helsinki-NLP/opus-mt-en-lb'),
41
- 'Macedonian': ('mk', 'Helsinki-NLP/opus-mt-en-mk'),
42
- 'Malagasy': ('mg', 'Helsinki-NLP/opus-mt-en-mg'),
43
- 'Malayalam': ('ml', 'Helsinki-NLP/opus-mt-en-ml'),
44
- 'Maltese': ('mt', 'Helsinki-NLP/opus-mt-en-mt'),
45
- 'Maori': ('mi', 'Helsinki-NLP/opus-mt-en-mi'),
46
- 'Marathi': ('mr', 'Helsinki-NLP/opus-mt-en-mr'),
47
- 'Myanmar': ('my', 'Helsinki-NLP/opus-mt-en-my'),
48
- 'Nepali': ('ne', 'Helsinki-NLP/opus-mt-en-ne'),
49
- 'Norwegian': ('no', 'Helsinki-NLP/opus-mt-en-no'),
50
- 'Nyanja': ('ny', 'Helsinki-NLP/opus-mt-en-ny'),
51
- 'Odia': ('or', 'Helsinki-NLP/opus-mt-en-or'),
52
- 'Oromo': ('om', 'Helsinki-NLP/opus-mt-en-om'),
53
- 'Pashto': ('ps', 'Helsinki-NLP/opus-mt-en-ps'),
54
- 'Persian': ('fa', 'Helsinki-NLP/opus-mt-en-fa'),
55
- 'Polish': ('pl', 'Helsinki-NLP/opus-mt-en-pl'),
56
- 'Portuguese': ('pt', 'Helsinki-NLP/opus-mt-en-pt'),
57
- 'Punjabi': ('pa', 'Helsinki-NLP/opus-mt-en-pa'),
58
- 'Quechua': ('qu', 'Helsinki-NLP/opus-mt-en-qu'),
59
- 'Romanian': ('ro', 'Helsinki-NLP/opus-mt-en-ro'),
60
- 'Russian': ('ru', 'Helsinki-NLP/opus-mt-en-ru'),
61
- 'Samoan': ('sm', 'Helsinki-NLP/opus-mt-en-sm'),
62
- 'Scots Gaelic': ('gd', 'Helsinki-NLP/opus-mt-en-gd'),
63
- 'Serbian': ('sr', 'Helsinki-NLP/opus-mt-en-sr'),
64
- 'Sesotho': ('st', 'Helsinki-NLP/opus-mt-en-st'),
65
- 'Shona': ('sn', 'Helsinki-NLP/opus-mt-en-sn'),
66
- 'Sindhi': ('sd', 'Helsinki-NLP/opus-mt-en-sd'),
67
- 'Sinhala': ('si', 'Helsinki-NLP/opus-mt-en-si'),
68
- 'Slovak': ('sk', 'Helsinki-NLP/opus-mt-en-sk'),
69
- 'Slovenian': ('sl', 'Helsinki-NLP/opus-mt-en-sl'),
70
- 'Somali': ('so', 'Helsinki-NLP/opus-mt-en-so'),
71
- 'Spanish': ('es', 'Helsinki-NLP/opus-mt-en-es'),
72
- 'Sundanese': ('su', 'Helsinki-NLP/opus-mt-en-su'),
73
- 'Swahili': ('sw', 'Helsinki-NLP/opus-mt-en-sw'),
74
- 'Swedish': ('sv', 'Helsinki-NLP/opus-mt-en-sv'),
75
- 'Tajik': ('tg', 'Helsinki-NLP/opus-mt-en-tg'),
76
- 'Tamil': ('ta', 'Helsinki-NLP/opus-mt-en-ta'),
77
- 'Telugu': ('te', 'Helsinki-NLP/opus-mt-en-te'),
78
- 'Thai': ('th', 'Helsinki-NLP/opus-mt-en-th'),
79
- 'Turkmen': ('tk', 'Helsinki-NLP/opus-mt-en-tk'),
80
- 'Turkish': ('tr', 'Helsinki-NLP/opus-mt-en-tr'),
81
- 'Ukrainian': ('uk', 'Helsinki-NLP/opus-mt-en-uk'),
82
- 'Urdu': ('ur', 'Helsinki-NLP/opus-mt-en-ur'),
83
- 'Vietnamese': ('vi', 'Helsinki-NLP/opus-mt-en-vi'),
84
- 'Welsh': ('cy', 'Helsinki-NLP/opus-mt-en-cy'),
85
- 'Xhosa': ('xh', 'Helsinki-NLP/opus-mt-en-xh'),
86
- 'Yiddish': ('yi', 'Helsinki-NLP/opus-mt-en-yi'),
87
- 'Yoruba': ('yo', 'Helsinki-NLP/opus-mt-en-yo'),
88
- 'Zulu': ('zu', 'Helsinki-NLP/opus-mt-en-zu'),
 
 
 
 
89
  }
90
 
91
  @st.cache_resource
92
- def load_model(target_language):
93
- code, model_name = LANGUAGE_MODELS.get(target_language, (None, None))
94
- if not model_name:
95
- st.error(f"Model for language '{target_language}' not found.")
96
- return None, None
97
-
98
- tokenizer = MarianTokenizer.from_pretrained(model_name)
99
- model = MarianMTModel.from_pretrained(model_name)
100
  return tokenizer, model
101
 
102
- def translate_text(text, target_language):
103
- tokenizer, model = load_model(target_language)
104
- if tokenizer is None or model is None:
105
- return ""
106
-
107
- # Tokenize the input text
108
- inputs = tokenizer(text, return_tensors="pt", padding=True)
109
 
110
- # Generate translation
111
- translated = model.generate(**inputs)
112
-
113
- # Decode the translated text
114
- translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
115
- return translated_text
116
 
117
- def main():
118
- st.title("Language Translator")
119
- st.write("Translate English Text into Your Preferred Language")
 
 
120
 
121
- # Input text from the user
122
- source_text = st.text_area("Enter text in English:", "")
123
-
124
- # Select target language
125
- target_language = st.selectbox(
126
- "Select target language:",
127
- options=list(LANGUAGE_MODELS.keys())
128
- )
129
-
130
- if st.button("Translate"):
131
- if source_text:
132
- translated_text = translate_text(source_text, target_language)
133
- st.write(f"Translated text ({target_language}):")
134
- st.write(translated_text)
135
- else:
136
- st.warning("Please enter text to translate.")
137
 
138
- if __name__ == "__main__":
139
- main()
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
3
 
4
+ # Map language display names to the language codes passed to the M2M100 tokenizer
5
  LANGUAGE_MODELS = {
6
+ 'Afrikaans': 'af',
7
+ 'Albanian': 'sq',
8
+ 'Amharic': 'am',
9
+ 'Arabic': 'ar',
10
+ 'Armenian': 'hy',
11
+ 'Bengali': 'bn',
12
+ 'Bosnian': 'bs',
13
+ 'Catalan': 'ca',
14
+ 'Croatian': 'hr',
15
+ 'Czech': 'cs',
16
+ 'Danish': 'da',
17
+ 'Dutch': 'nl',
18
+ 'Esperanto': 'eo',
19
+ 'Estonian': 'et',
20
+ 'Finnish': 'fi',
21
+ 'French': 'fr',
22
+ 'German': 'de',
23
+ 'Greek': 'el',
24
+ 'Gujarati': 'gu',
25
+ 'Haitian Creole': 'ht',
26
+ 'Hausa': 'ha',
27
+ 'Hawaiian': 'haw',
28
+ 'Hebrew': 'he',
29
+ 'Hindi': 'hi',
30
+ 'Hungarian': 'hu',
31
+ 'Icelandic': 'is',
32
+ 'Igbo': 'ig',
33
+ 'Indonesian': 'id',
34
+ 'Irish': 'ga',
35
+ 'Italian': 'it',
36
+ 'Japanese': 'ja',
37
+ 'Javanese': 'jw',
38
+ 'Kannada': 'kn',
39
+ 'Khmer': 'km',
40
+ 'Korean': 'ko',
41
+ 'Latin': 'la',
42
+ 'Latvian': 'lv',
43
+ 'Lithuanian': 'lt',
44
+ 'Luxembourgish': 'lb',
45
+ 'Macedonian': 'mk',
46
+ 'Malagasy': 'mg',
47
+ 'Malayalam': 'ml',
48
+ 'Maltese': 'mt',
49
+ 'Maori': 'mi',
50
+ 'Marathi': 'mr',
51
+ 'Myanmar': 'my',
52
+ 'Nepali': 'ne',
53
+ 'Norwegian': 'no',
54
+ 'Nyanja': 'ny',
55
+ 'Odia': 'or',
56
+ 'Oromo': 'om',
57
+ 'Pashto': 'ps',
58
+ 'Persian': 'fa',
59
+ 'Polish': 'pl',
60
+ 'Portuguese': 'pt',
61
+ 'Punjabi': 'pa',
62
+ 'Quechua': 'qu',
63
+ 'Romanian': 'ro',
64
+ 'Russian': 'ru',
65
+ 'Samoan': 'sm',
66
+ 'Scots Gaelic': 'gd',
67
+ 'Serbian': 'sr',
68
+ 'Sesotho': 'st',
69
+ 'Shona': 'sn',
70
+ 'Sindhi': 'sd',
71
+ 'Sinhala': 'si',
72
+ 'Slovak': 'sk',
73
+ 'Slovenian': 'sl',
74
+ 'Somali': 'so',
75
+ 'Spanish': 'es',
76
+ 'Sundanese': 'su',
77
+ 'Swahili': 'sw',
78
+ 'Swedish': 'sv',
79
+ 'Tajik': 'tg',
80
+ 'Tamil': 'ta',
81
+ 'Telugu': 'te',
82
+ 'Thai': 'th',
83
+ 'Turkmen': 'tk',
84
+ 'Turkish': 'tr',
85
+ 'Ukrainian': 'uk',
86
+ 'Urdu': 'ur',
87
+ 'Vietnamese': 'vi',
88
+ 'Welsh': 'cy',
89
+ 'Xhosa': 'xh',
90
+ 'Yiddish': 'yi',
91
+ 'Yoruba': 'yo',
92
+ 'Zulu': 'zu',
93
  }
94
 
@st.cache_resource
def load_model():
    """Load the M2M100 tokenizer and model used for translation.

    The function is wrapped in ``st.cache_resource`` so the loaded objects
    can be reused rather than re-created on every call.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded from the
        ``facebook/m2m100_418M`` checkpoint.
    """
    # Both halves must come from the same checkpoint, so name it once.
    checkpoint = "facebook/m2m100_418M"
    m2m_tokenizer = M2M100Tokenizer.from_pretrained(checkpoint)
    m2m_model = M2M100ForConditionalGeneration.from_pretrained(checkpoint)
    return m2m_tokenizer, m2m_model
100
 
def translate(text, target_language):
    """Translate English ``text`` into ``target_language`` using M2M100.

    Args:
        text: English source text.
        target_language: Display name of the target language, expected to be
            a key of ``LANGUAGE_MODELS``.

    Returns:
        The translated string. Returns the message
        ``"Target language not supported."`` when ``target_language`` is not
        in ``LANGUAGE_MODELS`` or when its code is unknown to the tokenizer.
        Returns ``""`` when ``text`` is empty or only whitespace.
    """
    # Resolve the language first: it is a cheap dict lookup, and an unknown
    # name should not force the model to be loaded.
    target_lang_code = LANGUAGE_MODELS.get(target_language)
    if not target_lang_code:
        return "Target language not supported."

    # Nothing to translate; avoid feeding an empty prompt to the model.
    if not text or not text.strip():
        return ""

    tokenizer, model = load_model()

    # LANGUAGE_MODELS may list codes this checkpoint's tokenizer does not
    # know; get_lang_id raises KeyError for those. Report it the same way as
    # an unknown language name instead of leaking a bare KeyError.
    try:
        target_lang_id = tokenizer.get_lang_id(target_lang_code)
    except KeyError:
        return "Target language not supported."

    tokenizer.src_lang = "en"
    encoded_input = tokenizer(text, return_tensors="pt")
    # forced_bos_token_id makes generation start with the target-language token.
    generated_tokens = model.generate(
        **encoded_input,
        forced_bos_token_id=target_lang_id,
    )
    return tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
114
 
# --- Streamlit page: input box, language picker, and translate button ---
st.title('English to Any Language Translator')

text_input = st.text_area("Enter text in English:", "Hello, how are you?")

target_language = st.selectbox(
    'Select the target language:',
    list(LANGUAGE_MODELS),
)

if st.button('Translate'):
    if not text_input.strip():
        # Don't run the model on blank input; ask the user for text instead.
        st.warning("Please enter text to translate.")
    else:
        with st.spinner('Translating...'):
            # Top-level UI boundary: show any failure from model loading or
            # generation in the page.
            try:
                translation = translate(text_input, target_language)
            except Exception as e:
                st.error(f"Error: {e}")
            else:
                st.write(f'Translation ({target_language}):')
                st.write(translation)