cloghost committed on
Commit
64ce29d
1 Parent(s): fd1f73a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +210 -62
app.py CHANGED
@@ -1,6 +1,10 @@
1
  import streamlit as st
2
  import torch
3
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 
 
 
 
4
 
5
  # Page configuration
6
  st.set_page_config(
@@ -9,92 +13,236 @@ st.set_page_config(
9
  layout="wide"
10
  )
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  @st.cache_resource
13
  def load_model():
14
  """Load and cache the model and tokenizer"""
15
- model_name = "cloghost/nllb-200-distilled-600M-hin-kang-v1"
16
-
17
- # Load model and tokenizer
18
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
19
- tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- # Set device dynamically
22
- device = 0 if torch.cuda.is_available() else -1
23
 
24
- # Initialize translation pipeline
25
- translator = pipeline(
26
- "translation",
27
- model=model,
28
- tokenizer=tokenizer,
29
- src_lang="hin_Deva",
30
- tgt_lang="kang_Deva",
31
- device=device
32
- )
33
 
34
- return translator
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  def translate_text(translator, text):
37
- """Translate the input text"""
38
  try:
39
- translation = translator(text)
 
 
 
 
40
  return translation[0]['translation_text']
41
  except Exception as e:
42
  st.error(f"Translation Error: {str(e)}")
43
  return None
44
 
45
  def main():
46
- # App title and description
47
  st.title("🗣️ Hindi to Kangri Translator")
48
  st.markdown("""
49
- This application translates Hindi (Devanagari) text to Kangri language using a fine-tuned NLLB-200 model.
50
- Simply enter your Hindi text in the input box below and click 'Translate'.
51
  """)
52
 
53
- # Model loading with spinner
54
- with st.spinner("Loading translation model..."):
55
- translator = load_model()
 
56
 
57
- # Create two columns for input and output
58
- col1, col2 = st.columns(2)
59
 
60
- # Input text area
61
- with col1:
62
- st.subheader("Hindi Text (हिंदी)")
63
- input_text = st.text_area(
64
- "Enter Hindi text",
65
- height=200,
66
- help="Enter the Hindi text you want to translate to Kangri",
67
- placeholder="यहाँ हिंदी में टेक्स्ट लिखें..."
68
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
- # Add translation button
71
- if st.button("Translate to Kangri"):
72
- if input_text:
73
- with st.spinner("Translating..."):
74
- translated_text = translate_text(translator, input_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- if translated_text:
77
- with col2:
78
- st.subheader("Kangri Translation (कांगड़ी)")
79
- st.text_area(
80
- "Kangri translation",
81
- value=translated_text,
82
- height=200,
83
- disabled=True
84
- )
85
- else:
86
- st.warning("Please enter some Hindi text to translate.")
 
 
 
 
 
 
 
 
 
87
 
88
- # Add information about the model
89
- st.markdown("---")
90
- st.markdown("""
91
- ### About the Model
92
- This translator uses the `cloghost/nllb-200-distilled-600M-hin-kang-v1` model, which is a distilled version
93
- of the NLLB-200 model specifically fine-tuned for Hindi to Kangri translation. The model supports:
94
- - Source Language: Hindi (Devanagari script)
95
- - Target Language: Kangri (Devanagari script)
96
- - Maximum input length: 512 tokens
97
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
  if __name__ == "__main__":
100
  main()
 
1
  import streamlit as st
2
  import torch
3
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
4
+ import re
5
+ import pandas as pd
6
+ from io import StringIO
7
+ import time
8
 
9
  # Page configuration
10
  st.set_page_config(
 
13
  layout="wide"
14
  )
15
 
# Page-level stylesheet injected once at import time: tightens the spacing of
# alert boxes and defines an `.example-text` card style.
_CUSTOM_CSS = """
<style>
.stAlert {
    padding: 10px;
    margin: 10px 0;
}
.example-text {
    padding: 10px;
    background-color: #f0f2f6;
    border-radius: 5px;
    margin: 5px 0;
    cursor: pointer;
}
</style>
"""
# unsafe_allow_html is required for the raw <style> tag to be emitted.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# Sample Hindi sentences, keyed by the category label displayed next to each
# one in the UI.
EXAMPLE_TEXTS = {
    "General Conversation": "मैं आज बाजार जा रहा हूं। क्या आप मेरे साथ चलना चाहेंगे?",
    "Cultural": "दिवाली का त्योहार रोशनी और खुशियों का त्योहार है।",
    "Literature": "साहित्य मानव जीवन का दर्पण है। इसमें समाज की हर छवि दिखाई देती है।",
    "Tourism": "हिमाचल प्रदेश की सुंदर पहाड़ियां और हरी-भरी वादियां पर्यटकों को आकर्षित करती हैं।",
}
40
+
@st.cache_resource
def _build_translator():
    """Build the Hindi -> Kangri translation pipeline (cached by Streamlit).

    Errors are deliberately NOT caught here. ``st.cache_resource`` stores
    return values only, so letting an exception propagate keeps a failed load
    out of the cache and lets the next rerun try again. Catching the error
    inside the cached function would cache ``None`` instead.

    Returns:
        A transformers translation pipeline configured with
        ``src_lang="hin_Deva"`` and ``tgt_lang="kang_Deva"``.
    """
    model_name = "cloghost/nllb-200-distilled-600M-hin-kang-v1"

    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # First CUDA device when available, otherwise -1 (CPU).
    device = 0 if torch.cuda.is_available() else -1

    return pipeline(
        "translation",
        model=model,
        tokenizer=tokenizer,
        src_lang="hin_Deva",
        tgt_lang="kang_Deva",
        device=device,
    )


def load_model():
    """Return the cached translation pipeline, or ``None`` if loading fails.

    On failure the error is shown with ``st.error`` and ``None`` is returned.
    The failure itself is not cached, so a later rerun retries the load.
    """
    try:
        with st.spinner("Loading model and tokenizer..."):
            return _build_translator()
    except Exception as e:
        # UI boundary: surface any load failure to the user instead of crashing.
        st.error(f"Error loading model: {str(e)}")
        return None


# Keep the cache-clearing hook that the decorated function used to expose.
load_model.clear = _build_translator.clear
66
+
def preprocess_text(text):
    """Clean raw input text before it is sent to the translator.

    Steps, in order:

    1. Remove every character that is not in the Devanagari block
       (U+0900-U+097F), whitespace, or one of ``। , . ? !``.
    2. Replace the Devanagari abbreviation sign ``॰`` with ``.``.
    3. Collapse each run of whitespace to a single space and trim both ends.

    Whitespace is normalised last so that gaps left behind by removed
    characters do not survive as double, leading or trailing spaces. Input
    with nothing translatable therefore comes back as ``""``, not ``" "``.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; empty when no permitted characters remain.
    """
    # NOTE(review): this also drops ASCII digits and Latin letters - confirm
    # that losing numbers such as "5" from the source sentence is intended.
    text = re.sub(r'[^\u0900-\u097F\s।,.?!]', '', text)

    # Normalize common variations of Hindi characters
    text = text.replace('॰', '.')

    return re.sub(r'\s+', ' ', text).strip()
79
+
def _normalize_batch_item(value):
    """Return *value* as a stripped string; ``None``/NaN/``pd.NA`` become ``''``."""
    is_nan = isinstance(value, float) and value != value
    if value is None or value is pd.NA or is_nan:
        return ''
    return str(value).strip()


def batch_translate(translator, texts):
    """Translate each item of *texts* and collect the results in a DataFrame.

    Items are normalised first: missing values (``None``, NaN, ``pd.NA``)
    count as empty, anything else is converted with ``str`` and stripped, so
    empty CSV cells or a trailing ``\\r`` from CRLF files cannot break the
    batch. A failure on one item is recorded in that row and does not stop
    the remaining items.

    Args:
        translator: Callable taking a string and returning a sequence whose
            first element is a mapping with a ``'translation_text'`` key.
        texts: Iterable of items to translate.

    Returns:
        A ``pandas.DataFrame`` with columns ``Source`` and ``Translation``,
        one row per input item in input order. Empty items give two empty
        strings; failed items give ``'Error: <message>'`` as the translation.
    """
    rows = []
    for raw in texts:
        source = _normalize_batch_item(raw)
        if not source:
            # Keep a placeholder row so output rows line up with input rows.
            rows.append({'Source': '', 'Translation': ''})
            continue
        try:
            translated = translator(source)[0]['translation_text']
        except Exception as e:
            translated = f'Error: {str(e)}'
        rows.append({'Source': source, 'Translation': translated})

    # Explicit columns keep the schema stable even when *texts* is empty.
    return pd.DataFrame(rows, columns=['Source', 'Translation'])
102
 
def translate_text(translator, text):
    """Preprocess *text*, translate it, and return the translated string.

    Returns ``None`` when preprocessing leaves nothing to translate, or when
    any step raises; in the latter case the error is reported via ``st.error``.
    """
    try:
        cleaned = preprocess_text(text)
        if cleaned:
            outputs = translator(cleaned)
            return outputs[0]['translation_text']
        return None
    except Exception as e:
        st.error(f"Translation Error: {str(e)}")
        return None
115
 
def _load_example(text):
    """Button callback: copy an example sentence into the Hindi input box.

    This runs as an ``on_click`` callback, before the script reruns and
    re-creates the text area keyed ``input_text``. Streamlit rejects
    assignments to a widget's session-state key once that widget has been
    instantiated in the current run, so the value must be set here.
    """
    st.session_state.input_text = text


def main():
    """Render the app: single translation, batch translation, examples, about."""
    st.title("🗣️ Hindi to Kangri Translator")
    st.markdown("""
    An advanced translation tool for converting Hindi text to Kangri language.
    Features include single text translation, batch processing, and text preprocessing.
    """)

    # Load model; load_model() reports its own error and returns None on failure.
    translator = load_model()
    if translator is None:
        st.stop()

    # Create tabs for different features
    tabs = st.tabs(["Single Translation", "Batch Translation", "Examples", "About"])

    # Single Translation Tab
    with tabs[0]:
        col1, col2 = st.columns(2)

        with col1:
            st.subheader("Hindi Text (हिंदी)")
            input_text = st.text_area(
                "Enter Hindi text",
                height=200,
                help="Enter the Hindi text you want to translate to Kangri",
                placeholder="यहाँ हिंदी में टेक्स्ट लिखें...",
                # Keyed so the example buttons can fill it through session state.
                key="input_text",
            )

        # Preprocessing options
        with st.expander("Preprocessing Options"):
            remove_special = st.checkbox("Remove special characters", value=True)
            normalize_chars = st.checkbox("Normalize Hindi characters", value=True)

        if st.button("Translate to Kangri"):
            # Treat input that preprocesses to nothing (blank, or no permitted
            # characters) like empty input, instead of failing silently.
            processed_text = preprocess_text(input_text).strip()
            if processed_text:
                with st.spinner("Translating..."):
                    # NOTE(review): these checkboxes only control whether the
                    # preview is shown; translate_text() always preprocesses.
                    if remove_special or normalize_chars:
                        st.info("Preprocessing text...")
                        st.code(processed_text, language="text")

                    translated_text = translate_text(translator, input_text)

                if translated_text:
                    with col2:
                        st.subheader("Kangri Translation (कांगड़ी)")
                        st.text_area(
                            "Kangri translation",
                            value=translated_text,
                            height=200,
                            disabled=True,
                        )
            else:
                st.warning("Please enter some Hindi text to translate.")

    # Batch Translation Tab
    with tabs[1]:
        st.subheader("Batch Translation")
        st.markdown("""
        Upload a CSV or TXT file containing Hindi texts to translate in bulk.
        - For CSV: Include a column named 'text' containing Hindi texts
        - For TXT: Each line should contain one Hindi text to translate
        """)

        uploaded_file = st.file_uploader("Choose a file", type=['csv', 'txt'])

        if uploaded_file is not None:
            try:
                # Decide by extension: the MIME type is supplied by the browser
                # and is not always 'text/csv' for CSV files.
                if uploaded_file.name.lower().endswith('.csv'):
                    df = pd.read_csv(uploaded_file)
                    if 'text' not in df.columns:
                        raise ValueError("CSV file must contain a column named 'text'")
                    # Empty cells are read as NaN; turn them into empty strings.
                    texts = df['text'].fillna('').astype(str).tolist()
                else:  # txt file
                    # utf-8-sig drops a leading BOM; splitlines() handles CRLF.
                    content = uploaded_file.read().decode('utf-8-sig')
                    texts = content.splitlines()

                if st.button("Translate Batch"):
                    progress_bar = st.progress(0)
                    with st.spinner("Processing batch translation..."):
                        results_df = batch_translate(translator, texts)
                        progress_bar.progress(100)

                    st.success("Translation completed!")
                    st.dataframe(results_df)

                    # Download button for results
                    csv = results_df.to_csv(index=False)
                    st.download_button(
                        "Download Results",
                        csv,
                        "translation_results.csv",
                        "text/csv",
                        key='download-csv',
                    )
            except Exception as e:
                st.error(f"Error processing file: {str(e)}")

    # Examples Tab
    with tabs[2]:
        st.subheader("Example Texts")
        st.markdown("Click on any example to load it into the translator:")

        for category, text in EXAMPLE_TEXTS.items():
            st.markdown(f"**{category}:**")
            # The click itself triggers a rerun, so no explicit rerun call is
            # needed. The active tab is not changed here; the example appears
            # in the input box on the first tab.
            st.button(
                text,
                key=f"example_{category}",
                on_click=_load_example,
                args=(text,),
            )

    # About Tab
    with tabs[3]:
        st.subheader("About the Model")
        st.markdown("""
        ### Model Information
        - **Base Model**: NLLB-200 Distilled (600M parameters)
        - **Fine-tuned for**: Hindi (hin_Deva) to Kangri (kang_Deva) translation
        - **Maximum input length**: 512 tokens
        - **Model ID**: `cloghost/nllb-200-distilled-600M-hin-kang-v1`

        ### Preprocessing Features
        - Remove special characters while preserving Devanagari script
        - Normalize Hindi character variations
        - Clean extra whitespace and formatting

        ### Usage Tips
        1. For best results, input clean Hindi text in Devanagari script
        2. Use batch translation for processing multiple texts efficiently
        3. Check preprocessing options for better translation quality
        4. Refer to example texts for optimal input format
        """)


if __name__ == "__main__":
    main()