gmustafa413 committed
Commit edeecaa · verified · 1 Parent(s): b181eef

Update app.py

Files changed (1):
  1. app.py +152 -7
app.py CHANGED

@@ -31,11 +31,156 @@ class DocumentProcessor:
         self.chunks = []
         self.processor_pool = ThreadPoolExecutor(max_workers=WORKERS)

-    # [Keep all the original document processing methods unchanged]
-    # ... (Include all the document processing methods from previous version) ...
+    # File processing methods
+    def extract_text_from_pptx(self, file_path):
+        try:
+            prs = Presentation(file_path)
+            return " ".join([shape.text for slide in prs.slides for shape in slide.shapes if hasattr(shape, "text")])
+        except Exception as e:
+            print(f"PPTX Error: {str(e)}")
+            return ""

-    # [Include the complete DocumentProcessor class implementation]
-    # ... (Include the full class implementation from previous version) ...
+    def extract_text_from_xls_csv(self, file_path):
+        try:
+            if file_path.endswith(('.xls', '.xlsx')):
+                df = pd.read_excel(file_path)
+            else:
+                df = pd.read_csv(file_path)
+            return " ".join(df.astype(str).values.flatten())
+        except Exception as e:
+            print(f"Spreadsheet Error: {str(e)}")
+            return ""
+
+    def extract_text_from_pdf(self, file_path):
+        try:
+            doc = fitz.open(file_path)
+            return " ".join(page.get_text("text", flags=fitz.TEXT_PRESERVE_WHITESPACE) for page in doc)
+        except Exception as e:
+            print(f"PDF Error: {str(e)}")
+            return ""
+
+    def process_file(self, file):
+        try:
+            file_path = file.name
+            print(f"Processing: {file_path}")
+
+            if file_path.endswith('.pdf'):
+                text = self.extract_text_from_pdf(file_path)
+            elif file_path.endswith('.docx'):
+                text = " ".join(p.text for p in Document(file_path).paragraphs)
+            elif file_path.endswith('.txt'):
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    text = f.read()
+            elif file_path.endswith('.pptx'):
+                text = self.extract_text_from_pptx(file_path)
+            elif file_path.endswith(('.xls', '.xlsx', '.csv')):
+                text = self.extract_text_from_xls_csv(file_path)
+            else:
+                return ""
+
+            clean_text = re.sub(r'\s+', ' ', text).strip()
+            print(f"Extracted {len(clean_text)} characters from {file_path}")
+            return clean_text
+        except Exception as e:
+            print(f"Processing Error: {str(e)}")
+            return ""
+
+    def semantic_chunking(self, text):
+        words = re.findall(r'\S+\s*', text)
+        chunks = [''.join(words[i:i+CHUNK_SIZE//2]) for i in range(0, len(words), CHUNK_SIZE//2)]
+        return chunks[:1000]
+
+    def process_documents(self, files):
+        self.chunks = []
+        if not files:
+            return "No files uploaded!"
+
+        print("\n" + "="*40 + " PROCESSING DOCUMENTS " + "="*40)
+        texts = list(self.processor_pool.map(self.process_file, files))
+
+        with ThreadPoolExecutor(max_workers=WORKERS) as executor:
+            chunk_lists = list(executor.map(self.semantic_chunking, texts))
+
+        all_chunks = [chunk for chunk_list in chunk_lists for chunk in chunk_list]
+        print(f"Total chunks generated: {len(all_chunks)}")
+
+        if not all_chunks:
+            return "Error: No chunks generated from documents"
+
+        try:
+            embeddings = MODEL.encode(
+                all_chunks,
+                batch_size=256,
+                convert_to_tensor=True,
+                show_progress_bar=False
+            ).cpu().numpy().astype('float32')
+
+            self.index.reset()
+            self.index.add(embeddings)
+            self.chunks = all_chunks
+            return f"✅ Processed {len(all_chunks)} chunks from {len(files)} files"
+        except Exception as e:
+            print(f"Embedding Error: {str(e)}")
+            return f"Error: {str(e)}"
+
+    def query(self, question):
+        if not self.chunks:
+            return "Please process documents first", False
+
+        try:
+            print("\n" + "="*40 + " QUERY PROCESSING " + "="*40)
+            print(f"Question: {question}")
+
+            # Generate embedding for the question
+            question_embedding = MODEL.encode([question], convert_to_tensor=True).cpu().numpy().astype('float32')
+
+            # Search FAISS index
+            _, indices = self.index.search(question_embedding, 3)
+            print(f"Top indices: {indices}")
+
+            # Get context from top chunks
+            context = "\n".join([self.chunks[i] for i in indices[0] if i < len(self.chunks)])
+            print(f"Context length: {len(context)} characters")
+
+            # Gemini API call
+            url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?key={GEMINI_API_KEY}"
+            headers = {"Content-Type": "application/json"}
+
+            payload = {
+                "contents": [{
+                    "parts": [{
+                        "text": f"Answer concisely based on this context: {context}\n\nQuestion: {question}"
+                    }]
+                }],
+                "generationConfig": {
+                    "temperature": 0.3,
+                    "maxOutputTokens": MAX_TOKENS
+                }
+            }
+
+            response = requests.post(
+                url,
+                headers=headers,
+                json=payload,
+                timeout=20
+            )
+
+            if response.status_code != 200:
+                return f"API Error: {response.text}", False
+
+            # Parse response
+            try:
+                response_json = response.json()
+                final_answer = response_json['candidates'][0]['content']['parts'][0]['text']
+            except (KeyError, IndexError) as e:
+                print(f"Response parsing error: {str(e)}")
+                return "Error: Could not parse API response", False
+
+            return final_answer, True
+
+        except Exception as e:
+            print(f"Query Error: {str(e)}")
+            return f"Error: {str(e)}", False

 # Initialize processor
 processor = DocumentProcessor()

@@ -49,8 +194,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Document Chatbot") as app:
         files = gr.File(
             file_count="multiple",
             file_types=[".pdf", ".docx", ".txt", ".pptx", ".xls", ".xlsx", ".csv"],
-            label="Upload Documents",
-            max_size=500*1024*1024
+            label="Upload Documents"
         )
         process_btn = gr.Button("Process Documents", variant="primary")
         status = gr.Textbox(label="Processing Status")

@@ -91,4 +235,5 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Document Chatbot") as app:
     )

 if __name__ == "__main__":
-    app.launch(debug=True)
+    app.launch(debug=True)
+
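Note on the new semantic_chunking method: the slice width and the step are both CHUNK_SIZE//2, so the chunks are contiguous rather than overlapping, and each token keeps its trailing whitespace so chunks re-join cleanly. A minimal sketch of that behaviour, using an illustrative CHUNK_SIZE (the real value is defined near the top of app.py):

import re

CHUNK_SIZE = 8  # illustrative value; app.py sets its own

def semantic_chunking(text):
    words = re.findall(r'\S+\s*', text)   # tokens keep their trailing whitespace
    step = CHUNK_SIZE // 2                # window == step, so no overlap
    return [''.join(words[i:i+step]) for i in range(0, len(words), step)][:1000]

print(semantic_chunking("one two three four five six"))
# ['one two three four ', 'five six']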
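To exercise the updated pipeline outside the Gradio UI, note that process_file() only reads file.name, so any object with a name attribute pointing at a real path will do. A minimal sketch, assuming GEMINI_API_KEY and the module-level globals are configured; the file paths are hypothetical:

from types import SimpleNamespace

processor = DocumentProcessor()
files = [SimpleNamespace(name="report.pdf"), SimpleNamespace(name="notes.txt")]  # stand-ins for Gradio file objects
print(processor.process_documents(files))  # extracts text, chunks it, builds the FAISS index

answer, ok = processor.query("What does the report conclude?")
print(answer if ok else f"Failed: {answer}")  # query() returns a (text, success_flag) pair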