gmustafa413 committed
Commit 43c6974 · verified · 1 Parent(s): 159cb34

Update app.py

Files changed (1)
  1. app.py +285 -202
app.py CHANGED
@@ -1,235 +1,318 @@
- !pip install langdetect faiss-cpu transformers gradio groq sentence-transformers pypdf2 python-pptx pandas docx2txt
-
- import gradio as gr
- import fitz # PyMuPDF
import numpy as np
import requests
- import faiss
- import re
import json
- import pandas as pd
- from docx import Document
- from pptx import Presentation
from sentence_transformers import SentenceTransformer
from concurrent.futures import ThreadPoolExecutor

# Configuration
GROQ_API_KEY = "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" # Replace with your actual key
- EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2" # Proper embedding model
CHUNK_SIZE = 512
MAX_TOKENS = 4096
- WORKERS = 8

- # Initialize the embedding model
- embedding_model = SentenceTransformer(EMBEDDING_MODEL)

- class DocumentProcessor:
    def __init__(self):
-         self.index = faiss.IndexFlatIP(embedding_model.get_sentence_embedding_dimension())
        self.chunks = []
-         self.processor_pool = ThreadPoolExecutor(max_workers=WORKERS)
-
-     def extract_text_from_pptx(self, file_path):
        try:
-             prs = Presentation(file_path)
-             return " ".join([shape.text for slide in prs.slides for shape in slide.shapes if hasattr(shape, "text")])
-         except Exception as e:
-             print(f"PPTX Error: {str(e)}")
-             return ""
-
-     def extract_text_from_xls_csv(self, file_path):
-         try:
-             if file_path.endswith(('.xls', '.xlsx')):
-                 df = pd.read_excel(file_path)
-             else:
-                 df = pd.read_csv(file_path)
-             return " ".join(df.astype(str).values.flatten())
        except Exception as e:
-             print(f"Spreadsheet Error: {str(e)}")
-             return ""
-
-     def extract_text_from_pdf(self, file_path):
        try:
-             doc = fitz.open(file_path)
-             return " ".join(page.get_text("text", flags=fitz.TEXT_PRESERVE_WHITESPACE) for page in doc)
        except Exception as e:
-             print(f"PDF Error: {str(e)}")
-             return ""

-     def process_file(self, file):
-         try:
-             file_path = file.name
-             print(f"Processing: {file_path}")
-
-             if file_path.endswith('.pdf'):
-                 text = self.extract_text_from_pdf(file_path)
-             elif file_path.endswith('.docx'):
-                 text = " ".join(p.text for p in Document(file_path).paragraphs)
-             elif file_path.endswith('.txt'):
-                 with open(file_path, 'r', encoding='utf-8') as f:
-                     text = f.read()
-             elif file_path.endswith('.pptx'):
-                 text = self.extract_text_from_pptx(file_path)
-             elif file_path.endswith(('.xls', '.xlsx', '.csv')):
-                 text = self.extract_text_from_xls_csv(file_path)
-             else:
-                 return ""
-
-             clean_text = re.sub(r'\s+', ' ', text).strip()
-             print(f"Extracted {len(clean_text)} characters from {file_path}")
-             return clean_text
-         except Exception as e:
-             print(f"Processing Error: {str(e)}")
-             return ""

-     def semantic_chunking(self, text):
-         sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
-         chunks = []
-         current_chunk = ""

-         for sentence in sentences:
-             if len(current_chunk) + len(sentence) < CHUNK_SIZE:
-                 current_chunk += " " + sentence
-             else:
-                 if current_chunk:
-                     chunks.append(current_chunk.strip())
-                 current_chunk = sentence

-         if current_chunk:
-             chunks.append(current_chunk.strip())
-
-         return chunks[:1000] # Limit to 1000 chunks per document
-
-     def process_documents(self, files):
-         self.chunks = []
-         if not files:
-             return "No files uploaded!"
-
-         print("\n" + "="*40 + " PROCESSING DOCUMENTS " + "="*40)
-         texts = list(self.processor_pool.map(self.process_file, files))
-
-         with ThreadPoolExecutor(max_workers=WORKERS) as executor:
-             chunk_lists = list(executor.map(self.semantic_chunking, texts))
-
-         all_chunks = [chunk for chunk_list in chunk_lists for chunk in chunk_list]
-         print(f"Total chunks generated: {len(all_chunks)}")
-
-         if not all_chunks:
-             return "Error: No chunks generated from documents"
-
-         try:
-             embeddings = embedding_model.encode(
-                 all_chunks,
-                 batch_size=32,
-                 convert_to_tensor=True,
-                 show_progress_bar=False
-             ).cpu().numpy().astype('float32')
-
-             self.index.reset()
-             self.index.add(embeddings)
-             self.chunks = all_chunks
-             return f"Processed {len(all_chunks)} chunks from {len(files)} files"
-         except Exception as e:
-             print(f"Embedding Error: {str(e)}")
-             return f"Error: {str(e)}"
-
-     def query(self, question):
-         if not self.chunks:
-             return "Please process documents first", False
-
-         try:
-             print("\n" + "="*40 + " QUERY PROCESSING " + "="*40)
-             print(f"Question: {question}")
-
-             question_embedding = embedding_model.encode([question], convert_to_tensor=True).cpu().numpy().astype('float32')
-             _, indices = self.index.search(question_embedding, 3)
-             print(f"Top indices: {indices}")
-
-             context = "\n".join([self.chunks[i] for i in indices[0] if i < len(self.chunks)])
-             print(f"Context length: {len(context)} characters")
-
-             headers = {
-                 "Authorization": f"Bearer {GROQ_API_KEY}",
-                 "Content-Type": "application/json"
-             }
-
-             payload = {
-                 "messages": [{
-                     "role": "user",
-                     "content": f"Answer concisely based on the context: {question}\nContext: {context}"
-                 }],
-                 "model": "mixtral-8x7b-32768",
-                 "temperature": 0.3,
-                 "max_tokens": MAX_TOKENS,
-                 "stream": False # Changed to False for simpler handling
-             }
-
-             response = requests.post(
-                 "https://api.groq.com/openai/v1/chat/completions",
-                 headers=headers,
-                 json=payload,
-                 timeout=20
-             )
-
-             print(f"API Status Code: {response.status_code}")
-
-             if response.status_code != 200:
-                 return f"API Error: {response.text}", False
-
-             data = response.json()
-             final_answer = data.get("choices", [{}])[0].get("message", {}).get("content", "")
-             print(f"Final Answer: {final_answer}")
-             return final_answer, True
-
-         except Exception as e:
-             print(f"Query Error: {str(e)}")
-             return f"Error: {str(e)}", False
-
- processor = DocumentProcessor()
-
- def ask_question(question, chat_history):
-     if not question.strip():
-         return chat_history + [("", "Please enter a valid question")]
-
-     answer, success = processor.query(question)
-     return chat_history + [(question, answer)]
-
- with gr.Blocks(title="Document ChatBot") as app:
-     gr.Markdown("## 🚀 Multi-Format Document ChatBot")
-     with gr.Row():
-         files = gr.File(
-             file_count="multiple",
-             file_types=[".pdf", ".docx", ".txt", ".pptx", ".xls", ".xlsx", ".csv"],
-             label="Upload Documents"
        )
-     process_btn = gr.Button("Process Documents", variant="primary")
-     status = gr.Textbox(label="Processing Status", interactive=False)
-     chatbot = gr.Chatbot(height=500, label="Chat History")
    with gr.Row():
-         question = gr.Textbox(
-             label="Your Query",
-             placeholder="Enter your question about the documents...",
-             max_lines=3
-         )
-         ask_btn = gr.Button("Ask", variant="primary")
-         clear_btn = gr.Button("Clear Chat")
-
-     process_btn.click(
-         fn=processor.process_documents,
-         inputs=files,
-         outputs=status
    )
-
    ask_btn.click(
-         fn=ask_question,
        inputs=[question, chatbot],
-         outputs=chatbot
    ).then(lambda: "", None, question)
-
    clear_btn.click(
        fn=lambda: [],
        inputs=None,
-         outputs=chatbot
    )

- app.launch()
+ import faiss
import numpy as np
+ import gradio as gr
import requests
import json
+ import re
+ import torch
+ from transformers import AutoTokenizer
+ from langdetect import detect
from sentence_transformers import SentenceTransformer
from concurrent.futures import ThreadPoolExecutor
+ from tqdm import tqdm

# Configuration
GROQ_API_KEY = "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" # Replace with your actual key
+ MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+ DATASET_URL = "https://huggingface.co/datasets/midrees2806/7K_Dataset/resolve/main/University_of_Education_Lahore_FAQ.json"
CHUNK_SIZE = 512
MAX_TOKENS = 4096
+ WORKERS = 4
+ EMBEDDING_BATCH_SIZE = 32

+ # Load the embedding model
+ model = SentenceTransformer(MODEL_NAME)
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

+ class UniversityKnowledgeBase:
    def __init__(self):
+         self.index = faiss.IndexFlatL2(model.get_sentence_embedding_dimension())
        self.chunks = []
+         self.loaded = False
+         self.total_chunks = 0
+
+     def load_dataset(self):
+         """Loads and thoroughly processes the University dataset"""
        try:
+             print("\n" + "="*50)
+             print("Loading University of Education, Lahore dataset...")
+             print("="*50 + "\n")
+
+             # Fetch dataset with error handling
+             response = requests.get(DATASET_URL, timeout=30)
+             if response.status_code != 200:
+                 raise Exception(f"Failed to fetch dataset. HTTP Status: {response.status_code}")
+
+             # Parse JSON content
+             try:
+                 data = response.json()
+             except json.JSONDecodeError:
+                 raise Exception("Invalid JSON format in dataset")
+
+             if not isinstance(data, list):
+                 raise Exception("Dataset format is invalid. Expected a list of Q&A pairs.")
+
+             # Process all content with progress tracking
+             self.chunks = []
+             with tqdm(data, desc="Processing dataset") as progress_bar:
+                 for item in progress_bar:
+                     if isinstance(item, dict):
+                         if 'question' in item and 'answer' in item:
+                             # Create comprehensive Q&A chunks
+                             self.chunks.append(f"QUESTION: {item['question'].strip()}\nANSWER: {item['answer'].strip()}\n")
+                         elif 'text' in item:
+                             # Process text content with semantic chunking
+                             text = item['text'].strip()
+                             if len(text) > CHUNK_SIZE:
+                                 sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
+                                 current_chunk = ""
+                                 for sentence in sentences:
+                                     if len(current_chunk) + len(sentence) < CHUNK_SIZE:
+                                         current_chunk += " " + sentence
+                                     else:
+                                         if current_chunk:
+                                             self.chunks.append(current_chunk.strip())
+                                         current_chunk = sentence
+                                 if current_chunk:
+                                     self.chunks.append(current_chunk.strip())
+                             else:
+                                 self.chunks.append(text)
+
+             self.total_chunks = len(self.chunks)
+             if self.total_chunks == 0:
+                 raise Exception("No valid content found in the dataset")
+
+             print(f"\nSuccessfully processed {self.total_chunks} knowledge chunks from dataset")
+
+             # Generate embeddings in batches with progress tracking
+             print("\nGenerating embeddings...")
+             embeddings = []
+             for i in tqdm(range(0, self.total_chunks, EMBEDDING_BATCH_SIZE),
+                           desc="Creating embeddings",
+                           total=(self.total_chunks//EMBEDDING_BATCH_SIZE)+1):
+                 batch = self.chunks[i:i+EMBEDDING_BATCH_SIZE]
+                 batch_embeddings = model.encode(
+                     batch,
+                     convert_to_tensor=True,
+                     show_progress_bar=False
+                 ).cpu().numpy().astype('float32')
+                 embeddings.append(batch_embeddings)
+
+             # Combine all embeddings and build FAISS index
+             all_embeddings = np.concatenate(embeddings)
+             self.index.add(all_embeddings)
+             self.loaded = True
+
+             return f"✅ Successfully loaded {self.total_chunks} knowledge chunks from University dataset"
+
        except Exception as e:
+             import traceback
+             traceback.print_exc()
+             return f"❌ Error loading dataset: {str(e)}"
+
+     def find_relevant_context(self, query, k=5):
+         """Finds the most relevant context with enhanced retrieval"""
+         if not self.loaded or not self.chunks:
+             return None
+
        try:
+             # Generate query embedding
+             query_embedding = model.encode([query], convert_to_tensor=True).cpu().numpy().astype('float32')
+
+             # Search with higher k initially for better context
+             _, indices = self.index.search(query_embedding, k*2)
+
+             # Get unique chunks (avoid duplicates)
+             unique_indices = list(dict.fromkeys(indices[0]))
+
+             # Select top-k most relevant unique chunks
+             selected_chunks = []
+             for idx in unique_indices[:k]:
+                 if 0 <= idx < len(self.chunks):
+                     selected_chunks.append(self.chunks[idx])
+
+             return "\n\n---\n\n".join(selected_chunks) if selected_chunks else None
        except Exception as e:
+             print(f"Context retrieval error: {str(e)}")
+             return None

+ # Initialize the knowledge base
+ knowledge_base = UniversityKnowledgeBase()

+ def detect_language(text):
+     """Enhanced language detection with Urdu support"""
+     try:
+         text = text.lower().strip()

+         # Roman Urdu detection
+         roman_urdu_keywords = ['hai', 'ho', 'hain', 'ka', 'ki', 'ke', 'main', 'tum', 'ap', 'kyun', 'kya']
+         if any(keyword in text for keyword in roman_urdu_keywords):
+             return "Roman Urdu"

+         # Standard detection
+         lang = detect(text)
+         if lang == "ur":
+             return "Urdu"
+         elif lang == "hi": # Hindi/Urdu handling
+             return "Urdu" if not text.isascii() else "Roman Urdu"
+         return "English"
+     except:
+         return "English"
+
+ def get_groq_response(context, user_query, language="English"):
+     """Generates accurate responses strictly based on context"""
+     headers = {
+         "Authorization": f"Bearer {GROQ_API_KEY}",
+         "Content-Type": "application/json"
+     }
+
+     # Language-specific system prompts
+     system_prompts = {
+         "Urdu": """
+         آپ یونیورسٹی آف ایجوکیشن، لاہور کا سرکاری چیٹ بوٹ ہیں۔ درج ذیل معلومات کی بنیاد پر درست جواب دیں۔
+         اگر جواب دستیاب نہ ہو تو کہیں:
+         "معذرت، یہ معلومات دستیاب نہیں۔ براہ کرم یونیورسٹی کی ویب سائٹ دیکھیں۔"
+         """,
+         "Roman Urdu": """
+         Aap University of Education, Lahore ka chatbot hain. Diye gaye context ke hisab se jawab dein.
+         Agar jawab nahin mila to kehain:
+         "Maazrat, yeh maloomat mojood nahin. University ki website check karein."
+         """,
+         "English": """
+         You are the official chatbot of University of Education, Lahore.
+         Answer STRICTLY based on the provided context. If the answer isn't available, say:
+         "I'm sorry, this information isn't available. Please check the university website."
+         """
+     }
+
+     payload = {
+         "model": "mixtral-8x7b-32768",
+         "messages": [
+             {"role": "system", "content": system_prompts.get(language, system_prompts["English"])},
+             {"role": "user", "content": f"University Context:\n{context}\n\nQuestion: {user_query}"}
+         ],
+         "temperature": 0.1, # Low temperature for factual accuracy
+         "max_tokens": MAX_TOKENS,
+         "top_p": 0.9
+     }
+
+     try:
+         response = requests.post(
+             "https://api.groq.com/openai/v1/chat/completions",
+             headers=headers,
+             json=payload,
+             timeout=30
        )
+
+         if response.status_code != 200:
+             print(f"API Error {response.status_code}: {response.text[:200]}")
+             return None
+
+         return response.json().get("choices", [{}])[0].get("message", {}).get("content", "")
+     except Exception as e:
+         print(f"API Request Failed: {str(e)}")
+         return None
+
+ def chatbot_response(user_input, chat_history):
+     """Handles user queries with comprehensive response generation"""
+     if not user_input.strip():
+         return chat_history + [(user_input, "Please enter a valid question.")]
+
+     # Detect language
+     language = detect_language(user_input)
+
+     # Retrieve relevant context (more chunks for better accuracy)
+     context = knowledge_base.find_relevant_context(user_input, k=5)
+
+     # Handle no context found
+     if not context:
+         error_messages = {
+             "Urdu": "معذرت، یہ معلومات دستیاب نہیں۔ براہ کرم یونیورسٹی کی ویب سائٹ دیکھیں۔",
+             "Roman Urdu": "Maazrat, yeh maloomat mojood nahin. University ki website check karein.",
+             "English": "I'm sorry, this information isn't available. Please check the university website."
+         }
+         return chat_history + [(user_input, error_messages.get(language, error_messages["English"]))]
+
+     # Generate response
+     response = get_groq_response(context, user_input, language)
+
+     # Fallback if API fails
+     if not response:
+         fallback_messages = {
+             "Urdu": "معذرت، نظام میں عارضی خرابی ہے۔ بعد میں کوشش کریں۔",
+             "Roman Urdu": "Maazrat, system mein masla hai. Baad mein koshish karein.",
+             "English": "Sorry, there's a temporary system issue. Please try again later."
+         }
+         response = fallback_messages.get(language, fallback_messages["English"])
+
+     return chat_history + [(user_input, response)]
+
+ # Gradio Interface
+ with gr.Blocks(title="University of Education ChatBot", theme=gr.themes.Soft()) as app:
+     gr.Markdown("""
+     <div style='text-align: center;'>
+         <h1>University of Education, Lahore</h1>
+         <h2>Official Information ChatBot</h2>
+         <p>Ask any question about the university in English, Urdu, or Roman Urdu</p>
+     </div>
+     """)
+
+     # Initialize dataset
+     load_status = knowledge_base.load_dataset()
+
    with gr.Row():
+         with gr.Column(scale=1):
+             gr.Markdown("### Knowledge Base Status")
+             status = gr.Textbox(
+                 label="Dataset Status",
+                 value=load_status,
+                 interactive=False,
+                 lines=2
+             )
+             reload_btn = gr.Button("🔄 Reload Knowledge Base", variant="secondary")
+
+             gr.Markdown("""
+             **Note:** This chatbot answers strictly based on the official University of Education, Lahore dataset.
+             """)
+
+         with gr.Column(scale=2):
+             chatbot = gr.Chatbot(
+                 height=500,
+                 label="Conversation History",
+                 bubble_full_width=False
+             )
+             question = gr.Textbox(
+                 label="Your Question",
+                 placeholder="Type your question about the university...",
+                 lines=2,
+                 max_lines=5
+             )
+             with gr.Row():
+                 ask_btn = gr.Button("Ask Question", variant="primary")
+                 clear_btn = gr.Button("Clear Conversation", variant="secondary")
+
+     # Event handlers
+     reload_btn.click(
+         fn=lambda: knowledge_base.load_dataset(),
+         inputs=None,
+         outputs=status,
+         queue=False
    )
+
    ask_btn.click(
+         fn=chatbot_response,
        inputs=[question, chatbot],
+         outputs=chatbot,
+         queue=True
    ).then(lambda: "", None, question)
+
    clear_btn.click(
        fn=lambda: [],
        inputs=None,
+         outputs=chatbot,
+         queue=False
    )

+ # Launch the application
+ if __name__ == "__main__":
+     app.launch(server_name="0.0.0.0", server_port=7860)
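
For reference, the new load_dataset() accepts two record shapes, which can be read off its parsing branch above. The sketch below is illustrative only: the keys 'question', 'answer', and 'text' come from the committed code, while the sample values are hypothetical.

# Illustrative only: record shapes accepted by load_dataset() above.
# Keys are taken from the committed parsing code; the values are hypothetical.
sample_dataset = [
    {
        "question": "What programs does the university offer?",  # hypothetical value
        "answer": "A short answer drawn from the FAQ.",           # hypothetical value
    },
    {
        # Free-form text longer than CHUNK_SIZE is split into sentence-based
        # chunks before indexing; shorter text becomes a single chunk.
        "text": "A standalone passage about the university.",     # hypothetical value
    },
]
# Items that are not dicts, or dicts without these keys, are skipped.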