gauravchand11 committed on
Commit
8b4e117
·
verified ·
1 Parent(s): 77a6efe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +150 -87
app.py CHANGED
@@ -4,14 +4,6 @@ from PyPDF2 import PdfReader
4
  import docx
5
  import os
6
  import re
7
- from datetime import datetime
8
-
9
- # Page config
10
- st.set_page_config(
11
- page_title="Document Translator (NLLB-200)",
12
- page_icon="📄",
13
- layout="wide"
14
- )
15
 
16
  # Load NLLB model and tokenizer
17
  @st.cache_resource
@@ -27,92 +19,78 @@ def initialize_models():
27
  tokenizer, model = load_translation_model()
28
  return {"nllb": (tokenizer, model)}
29
 
30
- def split_long_sentence(sentence, max_length=200):
31
- """Split long sentences into smaller chunks at appropriate break points."""
32
- if len(sentence) <= max_length:
33
- return [sentence]
34
-
35
- chunks = []
36
- current_chunk = ""
37
- words = sentence.split()
38
-
39
- for word in words:
40
- if len(current_chunk) + len(word) + 1 <= max_length:
41
- current_chunk += (" " + word if current_chunk else word)
42
- else:
43
- chunks.append(current_chunk)
44
- current_chunk = word
45
-
46
- if current_chunk:
47
- chunks.append(current_chunk)
48
-
49
- return chunks
50
-
51
  def preprocess_idioms(text, src_lang, tgt_lang):
52
  if src_lang == "en" and tgt_lang == "hi":
53
  idiom_map = {
54
- # Common English-Hindi idiom mappings
55
  "no piece of cake": "कोई आसान काम नहीं",
 
56
  "bite the bullet": "दांतों तले उंगली दबाना",
57
- "tackle it head-on": "इसे पूरे मन से हाथ में लेना",
58
- "fell into place": "ठीक हो गया",
59
- "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखाई देना",
60
  "with a little perseverance": "थोड़े से धैर्य से",
61
- "break the ice": "बातचीत की शुरुआत करना",
62
- "on cloud nine": "सातवें आसमान पर होना",
63
- "once in a blue moon": "कभी-कभार",
 
 
 
 
64
  "beating around the bush": "इधर-उधर की बात करना",
65
- "burning the midnight oil": "रात-रात भर जागकर काम करना",
66
- "calm before the storm": "तूफान से पहले की शांति",
67
- "cost an arm and a leg": "बहुत महंगा होना",
68
- "blessing in disguise": "छुपा हुआ वरदान",
69
- "kill two birds with one stone": "एक पंथ दो काज",
70
- "a piece of cake": "बहुत आसान काम",
71
- "under the weather": "तबीयत ठीक न होना",
72
  "pull yourself together": "खुद को संभालो",
73
- "rise and shine": "जल्दी उठो और तैयार हो जाओ",
 
 
74
  "time flies": "समय पंख लगाकर उड़ता है",
75
- "actions speak louder than words": "कथनी से करनी बड़ी",
76
- "all ears": "पूरा ध्यान से सुन रहा हूं",
77
- "back to square one": "वापस शुरुआत में",
78
- "better late than never": "देर आये दुरुस्त आये",
79
  "cry over spilled milk": "बीती बात पर पछताना",
80
- "down to earth": "सरल स्वभाव का",
81
- "every cloud has a silver lining": "हर मुसीबत में कोई न कोई अच्छाई छिपी होती है",
82
- "food for thought": "सोचने वाली बात",
83
- "give someone the benefit of the doubt": "शक का फायदा देना",
84
- "hit the nail on the head": "सटीक बात कहना",
85
- "in hot water": "मुसीबत में होना"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  }
87
 
88
  # Sort idioms by length (longest first) to handle overlapping phrases
89
  sorted_idioms = sorted(idiom_map.keys(), key=len, reverse=True)
90
 
91
- # Replace idioms with their translations
92
- for idiom in sorted_idioms:
93
- pattern = r'\b' + re.escape(idiom) + r'\b'
94
- text = re.sub(pattern, idiom_map[idiom], text, flags=re.IGNORECASE)
95
-
96
- elif src_lang == "en" and tgt_lang == "mr":
97
- idiom_map = {
98
- "no piece of cake": "सोपं काम नाही",
99
- "bite the bullet": "कठीण निर्णय घेणे",
100
- "tackle it head-on": "समस्येला थेट सामोरे जाणे",
101
- "fell into place": "सगळं व्यवस्थित झालं",
102
- "see the light at the end of the tunnel": "अंधारातून प्रकाशाकडे जाणे",
103
- "with a little perseverance": "थोड्या धीराने",
104
- "break the ice": "संभाषणाची सुरुवात करणे",
105
- "on cloud nine": "आनंदात असणे",
106
- "once in a blue moon": "क्वचितच",
107
- "burning the midnight oil": "रात्रंदिवस मेहनत करणे",
108
- "better late than never": "उशीर का होईना पण योग्य वेळी"
109
- }
110
- for idiom, translation in idiom_map.items():
111
- pattern = r'\b' + re.escape(idiom) + r'\b'
112
- text = re.sub(pattern, translation, text, flags=re.IGNORECASE)
113
 
114
  return text
115
 
 
116
  def extract_text(file):
117
  ext = os.path.splitext(file.name)[1].lower()
118
 
@@ -136,6 +114,7 @@ def extract_text(file):
136
  else:
137
  raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")
138
 
 
139
  def translate_text(text, src_lang, tgt_lang, models):
140
  if src_lang == tgt_lang:
141
  return text
@@ -147,22 +126,106 @@ def translate_text(text, src_lang, tgt_lang, models):
147
  return "Error: Unsupported language combination"
148
 
149
  tgt_lang_code = lang_map[tgt_lang]
150
-
151
  tokenizer, model = models["nllb"]
152
 
153
  # Preprocess for idioms
154
  preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)
155
 
156
- # Split text into smaller chunks (sentences)
157
- sentences = re.split(r'(?<=[.!?])\s+', preprocessed_text)
158
- translated_text = []
159
 
160
- for sentence in sentences:
161
  if sentence.strip():
162
- chunks = split_long_sentence(sentence, max_length=200)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
- for chunk in chunks:
165
- try:
166
- inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
167
- translated = model.generate(
168
- **inputs
 
 
 
 
 
 
 
 
 
 
4
  import docx
5
  import os
6
  import re
 
 
 
 
 
 
 
 
7
 
8
  # Load NLLB model and tokenizer
9
  @st.cache_resource
 
19
  tokenizer, model = load_translation_model()
20
  return {"nllb": (tokenizer, model)}
21
 
22
# English -> Hindi idiom table. Keys are lowercase English phrases; values are
# the Hindi text substituted for them before the text reaches the model.
# NOTE(review): "bite the bullet" is mapped to an expression that appears to
# mean astonishment rather than facing hardship - confirm with a Hindi speaker.
_EN_HI_IDIOMS = {
    # Basic phrases
    "no piece of cake": "कोई आसान काम नहीं",
    "piece of cake": "बहुत आसान काम",
    "bite the bullet": "दांतों तले उंगली दबाना",
    "tackle it head-on": "सीधे मुकाबला करना",
    "fell into place": "सब कुछ ठीक हो गया",
    "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखना",
    "with a little perseverance": "थोड़े से धैर्य से",

    # Additional common idioms
    "break a leg": "बहुत बहुत शुभकामनाएं",
    "hit the nail on the head": "बिल्कुल सही बात कहना",
    "once in a blue moon": "बहुत कम, कभी-कभार",
    "under the weather": "तबीयत ठीक नहीं",
    "cost an arm and a leg": "बहुत महंगा",
    "beating around the bush": "इधर-उधर की बात करना",
    "call it a day": "काम समाप्त करना",
    "burn the midnight oil": "रात-रात भर जागकर काम करना",
    "get the ball rolling": "शुरुआत करना",
    "pull yourself together": "खुद को संभालो",
    "shoot yourself in the foot": "अपना ही नुकसान करना",
    "take it with a grain of salt": "संदेह से लेना",
    "the last straw": "सहनशीलता की आखिरी सीमा",
    "time flies": "समय पंख लगाकर उड़ता है",
    "wrap your head around": "समझने की कोशिश करना",
    "cut corners": "काम में छोटा रास्ता अपनाना",
    "back to square one": "फिर से शुरू से",
    "blessing in disguise": "छिपा हुआ वरदान",
    "cry over spilled milk": "बीती बात पर पछताना",
    "keep your chin up": "हिम्मत रखना",

    # Work-related idioms
    "think outside the box": "नए तरीके से सोचना",
    "raise the bar": "मानक ऊंचा करना",
    "learning curve": "सीखने की प्रक्रिया",
    "up and running": "चालू और कार्यरत",
    "back to the drawing board": "फिर से योजना बनाना",

    # Project-related phrases
    "running into issues": "समस्याओं का सामना करना",
    "iron out the bugs": "खामियां दूर करना",
    "in the pipeline": "विचाराधीन",
    "moving forward": "आगे बढ़ते हुए",
    "touch base": "संपर्क में रहना",

    # Technical phrases
    "user-friendly": "उपयोगकर्ता के अनुकूल",
    "cutting-edge": "अत्याधुनिक",
    "state of the art": "अत्याधुनिक तकनीक",
    "proof of concept": "व्यवहार्यता का प्रमाण",
    "game changer": "खेल बदलने वाला",
}


def _build_idiom_regex(idiom_map):
    """Compile one regex that matches any key of *idiom_map* as a whole phrase.

    Returns a ``(regex, replacements)`` pair. Each phrase gets its own capture
    group, so ``replacements[match.lastindex - 1]`` is the translation of
    whichever phrase matched, with no need to re-normalise the matched text.
    """
    # Longest first, so a longer phrase wins over a shorter one that could
    # start at the same position.
    phrases = sorted(idiom_map, key=len, reverse=True)
    alternatives = "|".join(f"({re.escape(phrase)})" for phrase in phrases)
    # The lookarounds keep a phrase from matching inside a longer word
    # (e.g. "piece of cake" inside "masterpiece of cakery").
    regex = re.compile(rf"(?<!\w)(?:{alternatives})(?!\w)", re.IGNORECASE)
    replacements = [idiom_map[phrase] for phrase in phrases]
    return regex, replacements


# Built once at import time instead of on every call.
_EN_HI_REGEX, _EN_HI_REPLACEMENTS = _build_idiom_regex(_EN_HI_IDIOMS)


def preprocess_idioms(text, src_lang, tgt_lang):
    """Replace known English idioms with fixed Hindi renderings.

    Only the ``"en"`` -> ``"hi"`` pair has an idiom table; for every other
    language pair, and for empty input, *text* is returned unchanged.

    Args:
        text: Source text to scan.
        src_lang: Source language code, e.g. ``"en"``.
        tgt_lang: Target language code, e.g. ``"hi"``.

    Returns:
        The text with each whole-phrase, case-insensitive idiom occurrence
        replaced by its Hindi rendering.
    """
    if src_lang != "en" or tgt_lang != "hi" or not text:
        return text

    def replace_idiom(match):
        # Look the translation up by group index rather than by
        # match.group(0).lower(): IGNORECASE can match characters (such as a
        # dotless "ı" for "i") whose lower() is not the dictionary key.
        return _EN_HI_REPLACEMENTS[match.lastindex - 1]

    return _EN_HI_REGEX.sub(replace_idiom, text)
92
 
93
+ # Function to extract text from different file types
94
  def extract_text(file):
95
  ext = os.path.splitext(file.name)[1].lower()
96
 
 
114
  else:
115
  raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")
116
 
117
+ # Translation function with improved chunking
118
  def translate_text(text, src_lang, tgt_lang, models):
119
  if src_lang == tgt_lang:
120
  return text
 
126
  return "Error: Unsupported language combination"
127
 
128
  tgt_lang_code = lang_map[tgt_lang]
 
129
  tokenizer, model = models["nllb"]
130
 
131
  # Preprocess for idioms
132
  preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)
133
 
134
+ # Improved chunking: Split by sentences while preserving context
135
+ chunks = []
136
+ current_chunk = ""
137
 
138
+ for sentence in re.split('([.!?।]+)', preprocessed_text):
139
  if sentence.strip():
140
+ if len(current_chunk) + len(sentence) < 450: # Leave room for tokenization
141
+ current_chunk += sentence
142
+ else:
143
+ if current_chunk:
144
+ chunks.append(current_chunk)
145
+ current_chunk = sentence
146
+
147
+ if current_chunk:
148
+ chunks.append(current_chunk)
149
+
150
+ translated_text = ""
151
+
152
+ for chunk in chunks:
153
+ if chunk.strip():
154
+ inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
155
+ translated = model.generate(
156
+ **inputs,
157
+ forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang_code],
158
+ max_length=512,
159
+ num_beams=5, # Improved beam search
160
+ length_penalty=1.0, # Balanced length penalty
161
+ no_repeat_ngram_size=3 # Avoid repetition
162
+ )
163
+ translated_chunk = tokenizer.decode(translated[0], skip_special_tokens=True)
164
+ translated_text += translated_chunk + " "
165
+
166
+ return translated_text.strip()
167
+
168
def save_text_to_file(text, original_filename, prefix="translated"):
    """Write *text* to a UTF-8 ``.txt`` file in the working directory.

    The file is named ``<prefix>_<basename of original_filename>.txt``.

    Args:
        text: Content to write.
        original_filename: Name (or path) of the uploaded file; only its
            final path component is used.
        prefix: Leading tag for the output name.

    Returns:
        The name of the file that was written.
    """
    source_name = os.path.basename(original_filename)
    destination = "{}_{}.txt".format(prefix, source_name)
    with open(destination, "w", encoding="utf-8") as out:
        out.write(text)
    return destination
174
+
175
def process_document(file, source_lang, target_lang, models):
    """Extract, translate and save an uploaded document.

    Args:
        file: Uploaded file object; its ``name`` attribute is used to build
            the output file name.
        source_lang: Source language code.
        target_lang: Target language code.
        models: Model bundle passed through to ``translate_text``.

    Returns:
        ``(output_file, text)``, where *text* is the translation or an
        ``"Error: ..."`` message. Error results are saved with the
        ``"error"`` prefix.
    """
    try:
        source_text = extract_text(file)
        result = translate_text(source_text, source_lang, target_lang, models)

        # translate_text reports failures in-band as "Error: ..." strings;
        # those are saved under the "error" prefix, everything else under
        # save_text_to_file's default prefix.
        save_options = {"prefix": "error"} if result.startswith("Error:") else {}
        saved_path = save_text_to_file(result, file.name, **save_options)
        return saved_path, result
    except Exception as exc:
        failure = f"Error: {str(exc)}"
        saved_path = save_text_to_file(failure, file.name, prefix="error")
        return saved_path, failure
195
+
196
def main():
    """Render the Streamlit page: upload, language pickers, translate, download."""
    st.title("Document Translator (NLLB-200)")
    st.write("Upload a document (PDF, DOCX, or TXT) and select source and target languages (English, Hindi, Marathi).")

    models = initialize_models()

    uploaded_file = st.file_uploader("Upload Document", type=["pdf", "docx", "txt"])

    # Source and target pickers side by side; defaults are English -> Hindi.
    left_column, right_column = st.columns(2)
    with left_column:
        source_lang = st.selectbox("Source Language", ["en", "hi", "mr"], index=0)
    with right_column:
        target_lang = st.selectbox("Target Language", ["en", "hi", "mr"], index=1)

    # The Translate button is only rendered once a file has been uploaded.
    if uploaded_file is None:
        return
    if not st.button("Translate"):
        return

    with st.spinner("Translating..."):
        output_file, result_text = process_document(
            uploaded_file, source_lang, target_lang, models
        )

    st.text_area("Translated Text", result_text, height=300)

    # Offer the saved output file for download.
    with open(output_file, "rb") as file:
        st.download_button(
            label="Download Translated Document",
            data=file,
            file_name=os.path.basename(output_file),
            mime="text/plain",
        )


if __name__ == "__main__":
    main()