Anupam251272 committed on
Commit
45a5021
1 Parent(s): 8c38885

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +287 -0
app.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import PyPDF2
3
+ import torch
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer
5
+ from deep_translator import GoogleTranslator # More stable than googletrans
6
+ import logging
7
+ from typing import Optional, Dict
8
+ import time
9
+ from pathlib import Path
10
+ import os
11
+ import pandas as pd
12
+
13
# Configure logging: INFO level, "timestamp - level - message" records.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)
19
+
20
# Supported output languages, keyed by language code. Each entry carries the
# display name shown in the UI, a short description, and the code handed to
# deep_translator.
LANGUAGE_MAPPING = {
    "hi": dict(
        name="Hindi - हिन्दी",
        description="Official language of India, written in Devanagari script",
        deep_translator_code="hi",
    ),
    "ta": dict(
        name="Tamil - தமிழ்",
        description="Classical language of Tamil Nadu, written in Tamil script",
        deep_translator_code="ta",
    ),
    "te": dict(
        name="Telugu - తెలుగు",
        description="Official language of Andhra Pradesh and Telangana",
        deep_translator_code="te",
    ),
    "bn": dict(
        name="Bengali - বাংলা",
        description="Official language of West Bengal and Bangladesh",
        deep_translator_code="bn",
    ),
    "mr": dict(
        name="Marathi - मराठी",
        description="Official language of Maharashtra",
        deep_translator_code="mr",
    ),
}
48
+
49
class FileQueryTranslator:
    """Answer a question about an uploaded file and translate the answer.

    Text is extracted from a PDF, CSV or XLSX file, a small causal language
    model (``facebook/opt-125m``) generates an answer from a prompt built out
    of the query and the beginning of that text, and the answer is translated
    with ``deep_translator.GoogleTranslator``.
    """

    # Maximum number of characters sent to the translator in one request
    # (kept below the Google Translate per-request limit).
    _MAX_TRANSLATION_CHUNK = 4500
    # Number of leading characters of the file text that go into the prompt.
    _MAX_CONTEXT_CHARS = 1000
    # Pause between consecutive translation requests (rate limiting), seconds.
    _TRANSLATION_PAUSE = 0.5

    def __init__(self, max_retries=3, retry_delay=1):
        """Select the compute device and load the model.

        Args:
            max_retries: Number of attempts for model loading and for
                translation. Values below 1 are treated as 1.
            retry_delay: Seconds to wait between attempts.
        """
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self.setup_device()
        self.setup_model()
        logger.info(f"Initialization complete. Using device: {self.device}")

    def setup_device(self):
        """Pick CUDA when available, otherwise CPU; fall back to CPU on error."""
        try:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            if self.device.type == "cuda":
                torch.cuda.empty_cache()
                # NOTE: the value logged is the device's total memory.
                logger.info(f"Available CUDA memory: {torch.cuda.get_device_properties(0).total_memory}")
        except Exception as e:
            logger.warning(f"Error setting up CUDA device: {e}. Falling back to CPU.")
            self.device = torch.device("cpu")

    def setup_model(self):
        """Load tokenizer and model onto ``self.device``, retrying on failure.

        Raises:
            Exception: If every attempt fails; the last underlying error is
                chained as the cause.
        """
        model_name = "facebook/opt-125m"  # Using smaller model for stability
        dtype = torch.float16 if self.device.type == "cuda" else torch.float32
        # Always try at least once, even if max_retries was given as 0.
        attempts = max(1, self.max_retries)

        for attempt in range(attempts):
            try:
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype)
                self.model = model.to(self.device)
                if self.device.type == "cuda":
                    torch.cuda.empty_cache()  # Clear CUDA cache
                logger.info(f"Model loaded successfully on {self.device}")
                return
            except Exception as e:
                logger.error(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt < attempts - 1:
                    time.sleep(self.retry_delay)
                else:
                    raise Exception("Failed to load model after maximum retries") from e

    # ------------------------------------------------------------------
    # Text extraction
    # ------------------------------------------------------------------

    def _read_pdf(self, pdf_file):
        """Return the text of every page of *pdf_file*; raises on file-level errors."""
        if not os.path.exists(pdf_file):
            raise FileNotFoundError(f"PDF file not found: {pdf_file}")

        pdf_reader = PyPDF2.PdfReader(pdf_file)
        pages = []
        # Index-based access stays inside the try so that a failure to load
        # or parse one page only replaces that page with a placeholder.
        for page_num in range(len(pdf_reader.pages)):
            try:
                # extract_text() can yield None for pages without text.
                pages.append(pdf_reader.pages[page_num].extract_text() or "")
            except Exception as e:
                logger.error(f"Error extracting text from page {page_num}: {e}")
                pages.append(f"[Error extracting page {page_num}]")
        return "\n".join(pages)

    def _read_csv(self, csv_file):
        """Return *csv_file* rendered as a plain-text table; raises on error."""
        if not os.path.exists(csv_file):
            raise FileNotFoundError(f"CSV file not found: {csv_file}")
        return pd.read_csv(csv_file).to_string(index=False)

    def _read_xlsx(self, xlsx_file):
        """Return *xlsx_file* rendered as a plain-text table; raises on error."""
        if not os.path.exists(xlsx_file):
            raise FileNotFoundError(f"XLSX file not found: {xlsx_file}")
        return pd.read_excel(xlsx_file).to_string(index=False)

    def extract_text_from_pdf(self, pdf_file: str) -> str:
        """Extract text from a PDF.

        Returns:
            The page texts joined by newlines, or a string starting with
            ``"Error processing PDF: "`` if the file could not be read.
        """
        try:
            return self._read_pdf(pdf_file)
        except Exception as e:
            logger.error(f"Error processing PDF: {str(e)}")
            return f"Error processing PDF: {str(e)}"

    def extract_text_from_csv(self, csv_file: str) -> str:
        """Extract text from a CSV.

        Returns:
            The table as text (without the index column), or a string
            starting with ``"Error processing CSV: "`` on failure.
        """
        try:
            return self._read_csv(csv_file)
        except Exception as e:
            logger.error(f"Error processing CSV: {str(e)}")
            return f"Error processing CSV: {str(e)}"

    def extract_text_from_xlsx(self, xlsx_file: str) -> str:
        """Extract text from an XLSX workbook.

        Returns:
            The table as text (without the index column), or a string
            starting with ``"Error processing XLSX: "`` on failure.
        """
        try:
            return self._read_xlsx(xlsx_file)
        except Exception as e:
            logger.error(f"Error processing XLSX: {str(e)}")
            return f"Error processing XLSX: {str(e)}"

    # ------------------------------------------------------------------
    # Translation
    # ------------------------------------------------------------------

    @staticmethod
    def _split_into_chunks(text, max_chunk_size):
        """Split *text* into pieces of at most *max_chunk_size* characters.

        A piece ends just after the last space or newline inside the window
        when there is one, so words are not cut in half; text without any
        whitespace is cut at the size limit. Concatenating the pieces yields
        the original text.
        """
        chunks = []
        start = 0
        total = len(text)
        while start < total:
            end = min(start + max_chunk_size, total)
            if end < total:
                cut = max(text.rfind(" ", start, end), text.rfind("\n", start, end))
                if cut > start:
                    end = cut + 1
            chunks.append(text[start:end])
            start = end
        return chunks

    def translate_text(self, text: str, target_lang: str) -> str:
        """Translate *text* into *target_lang* with retries.

        Long text is sent in several requests. Chunks that were already
        translated are kept when a later request fails, so a retry resumes
        where the previous attempt stopped.

        Returns:
            The translated chunks joined by single spaces, ``""`` for empty
            input, or ``"Translation error: ..."`` once all attempts failed.
        """
        if not text:
            return ""

        chunks = self._split_into_chunks(text, self._MAX_TRANSLATION_CHUNK)
        translated_chunks = []
        # Always try at least once, even if max_retries was given as 0.
        attempts = max(1, self.max_retries)

        for attempt in range(attempts):
            try:
                translator = GoogleTranslator(source='auto', target=target_lang)
                while len(translated_chunks) < len(chunks):
                    if translated_chunks:
                        # Rate limiting: pause between requests only.
                        time.sleep(self._TRANSLATION_PAUSE)
                    next_chunk = chunks[len(translated_chunks)]
                    translated_chunks.append(translator.translate(next_chunk))
                return ' '.join(translated_chunks)
            except Exception as e:
                logger.error(f"Translation attempt {attempt + 1} failed: {str(e)}")
                if attempt < attempts - 1:
                    time.sleep(self.retry_delay)
                else:
                    return f"Translation error: {str(e)}"

    # ------------------------------------------------------------------
    # Query pipeline
    # ------------------------------------------------------------------

    def process_query(self, file_path: str, file_type: str, query: str, language: str) -> str:
        """Answer *query* about the file and return the translated answer.

        Args:
            file_path: Path of the uploaded file.
            file_type: One of ``"pdf"``, ``"csv"`` or ``"xlsx"``.
            query: The user's question.
            language: A key of ``LANGUAGE_MAPPING``.

        Returns:
            The translated answer, or a human-readable message describing
            why the request could not be processed. This method does not
            raise; unexpected errors are logged and reported as text.
        """
        try:
            # Validate inputs
            if not file_path or not os.path.exists(file_path):
                return "Please provide a valid file."
            if not query or not query.strip():
                return "Please provide a valid query."
            if language not in LANGUAGE_MAPPING:
                return "Please select a valid language."

            # Extract text based on file type
            readers = {
                "pdf": (self._read_pdf, "PDF"),
                "csv": (self._read_csv, "CSV"),
                "xlsx": (self._read_xlsx, "XLSX"),
            }
            if file_type not in readers:
                return "Unsupported file type."
            reader, label = readers[file_type]
            try:
                file_text = reader(file_path)
            except Exception as e:
                # Failures are detected through the exception rather than by
                # inspecting the extracted text, so legitimate content that
                # happens to begin with "Error" is not mistaken for a failure.
                logger.error(f"Error processing {label}: {str(e)}")
                return f"Error processing {label}: {str(e)}"

            # Generate response (content is truncated to keep the prompt short)
            prompt = f"Query: {query}\n\nContent: {file_text[:self._MAX_CONTEXT_CHARS]}\n\nAnswer:"

            # Keep the whole encoding so the attention mask reaches generate().
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            prompt_length = inputs["input_ids"].shape[-1]
            with torch.no_grad():
                output = self.model.generate(
                    **inputs,
                    max_new_tokens=200,  # Use max_new_tokens instead of max_length
                    num_return_sequences=1,
                    do_sample=True,  # temperature only applies when sampling
                    temperature=0.7,
                    pad_token_id=self.tokenizer.eos_token_id,
                )
            # The generated sequence starts with the prompt tokens; decode only
            # the continuation so the answer does not echo the prompt.
            response = self.tokenizer.decode(
                output[0][prompt_length:], skip_special_tokens=True
            ).strip()

            # Translate
            target_lang = LANGUAGE_MAPPING[language]["deep_translator_code"]
            return self.translate_text(response, target_lang)

        except Exception as e:
            logger.error(f"Error in process_query: {str(e)}")
            return f"An error occurred: {str(e)}"
216
+
217
+ # Gradio interface
218
def create_interface():
    """Build the Gradio Blocks UI and return it (without launching).

    The page offers a file upload, a file-type selector, a question box and
    an output-language dropdown. Pressing the button runs the query through a
    ``FileQueryTranslator`` created here and shows the translated answer.
    """
    file_processor = FileQueryTranslator()

    # Dropdown entries look like "<code> - <display name>".
    language_choices = [
        f"{code} - {info['name']}" for code, info in LANGUAGE_MAPPING.items()
    ]

    def update_description(selected):
        # The language code is the part before the first " - " separator.
        return LANGUAGE_MAPPING[selected.split(" - ")[0]]['description']

    def process_and_translate(file_path, file_type, query, language):
        try:
            lang_code = language.split(" - ")[0]
            answer = file_processor.process_query(file_path, file_type, query, lang_code)
        except Exception as e:
            return f"Error processing request: {str(e)}"
        return answer

    with gr.Blocks() as demo:
        gr.Markdown("### File Query and Translation System")

        # Input widgets
        with gr.Row():
            with gr.Column():
                file_input = gr.File(label="Upload File (PDF, CSV, XLSX)", type="filepath")
                file_type_input = gr.Radio(
                    label="Select File Type", choices=["pdf", "csv", "xlsx"], value="pdf"
                )
                query_input = gr.Textbox(
                    label="Enter your question about the file",
                    placeholder="What would you like to know about the document?",
                )
                language_input = gr.Dropdown(
                    label="Select Output Language",
                    choices=language_choices,
                    value="hi - Hindi - हिन्दी",
                )
                language_description = gr.Textbox(
                    label="Language Information",
                    value=LANGUAGE_MAPPING['hi']['description'],
                    interactive=False,
                )

        # Output widget
        with gr.Row():
            output_text = gr.Textbox(
                label="Translated Answer",
                placeholder="Translation will appear here...",
                lines=5,
            )

        # Event handlers
        language_input.change(
            fn=update_description,
            inputs=[language_input],
            outputs=[language_description],
        )

        submit_button = gr.Button("Get Answer")
        submit_button.click(
            fn=process_and_translate,
            inputs=[file_input, file_type_input, query_input, language_input],
            outputs=output_text,
        )

    return demo
283
+
284
if __name__ == "__main__":
    # Script entry point: build the UI, turn on request queueing, then
    # start the server with share=True.
    demo = create_interface()
    demo.queue()  # Enable queueing
    demo.launch(share=True)