Techbite committed on
Commit
43ebacc
·
1 Parent(s): 1c90c4e

Switched translation backend from googletrans to deep-translator

Browse files
Files changed (3) hide show
  1. app.py +261 -10
  2. requirements.txt +1 -22
  3. src/data_processing.py +157 -6
app.py CHANGED
@@ -7,6 +7,7 @@ from src.data_processing import load_huggingface_faq_data, load_faq_data, prepro
7
  from src.embedding import FAQEmbedder
8
  from src.llm_response import ResponseGenerator
9
  from src.utils import time_function, format_memory_stats, evaluate_response, evaluate_retrieval, baseline_keyword_search
 
10
 
11
  # Suppress CUDA warning and Torch path errors
12
  os.environ["CUDA_VISIBLE_DEVICES"] = ""
@@ -146,10 +147,9 @@ def main():
146
 
147
  if submit_button and user_query:
148
  from src.data_processing import translate_faq
149
- from googletrans import Translator
150
- translator = Translator()
151
  if target_lang != "en":
152
- user_query_translated = translator.translate(user_query, dest="en").text
153
  else:
154
  user_query_translated = user_query
155
 
@@ -172,7 +172,7 @@ def main():
172
  generation_time = time.time() - start_time
173
 
174
  if target_lang != "en":
175
- response = translator.translate(response, dest=target_lang).text
176
 
177
  st.session_state.query_cache[user_query_translated] = (response, relevant_faqs)
178
  st.session_state.retrieval_time = retrieval_time
@@ -210,11 +210,9 @@ def main():
210
  st.session_state.user_input = question
211
  st.session_state.chat_history.append({"role": "user", "content": question})
212
 
213
- from src.data_processing import translate_faq
214
- from googletrans import Translator
215
- translator = Translator()
216
  if target_lang != "en":
217
- question_translated = translator.translate(question, dest="en").text
218
  else:
219
  question_translated = question
220
 
@@ -237,7 +235,7 @@ def main():
237
  generation_time = time.time() - start_time
238
 
239
  if target_lang != "en":
240
- response = translator.translate(response, dest=target_lang).text
241
 
242
  st.session_state.query_cache[question_translated] = (response, relevant_faqs)
243
  st.session_state.retrieval_time = retrieval_time
@@ -247,4 +245,257 @@ def main():
247
  st.session_state.chat_history.append({"role": "assistant", "content": response})
248
 
249
  if __name__ == "__main__":
250
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  from src.embedding import FAQEmbedder
8
  from src.llm_response import ResponseGenerator
9
  from src.utils import time_function, format_memory_stats, evaluate_response, evaluate_retrieval, baseline_keyword_search
10
+ from deep_translator import GoogleTranslator # Updated import
11
 
12
  # Suppress CUDA warning and Torch path errors
13
  os.environ["CUDA_VISIBLE_DEVICES"] = ""
 
147
 
148
  if submit_button and user_query:
149
  from src.data_processing import translate_faq
150
+ translator = GoogleTranslator(source='auto', target='en') # Updated translator
 
151
  if target_lang != "en":
152
+ user_query_translated = translator.translate(user_query)
153
  else:
154
  user_query_translated = user_query
155
 
 
172
  generation_time = time.time() - start_time
173
 
174
  if target_lang != "en":
175
+ response = translator.translate(response, target=target_lang)
176
 
177
  st.session_state.query_cache[user_query_translated] = (response, relevant_faqs)
178
  st.session_state.retrieval_time = retrieval_time
 
210
  st.session_state.user_input = question
211
  st.session_state.chat_history.append({"role": "user", "content": question})
212
 
213
+ translator = GoogleTranslator(source='auto', target='en') # Updated translator
 
 
214
  if target_lang != "en":
215
+ question_translated = translator.translate(question)
216
  else:
217
  question_translated = question
218
 
 
235
  generation_time = time.time() - start_time
236
 
237
  if target_lang != "en":
238
+ response = translator.translate(response, target=target_lang)
239
 
240
  st.session_state.query_cache[question_translated] = (response, relevant_faqs)
241
  st.session_state.retrieval_time = retrieval_time
 
245
  st.session_state.chat_history.append({"role": "assistant", "content": response})
246
 
247
  if __name__ == "__main__":
248
+ main()
249
+
250
+
251
+
252
+ # import streamlit as st
253
+ # import time
254
+ # import os
255
+ # import gc
256
+ # import torch
257
+ # from src.data_processing import load_huggingface_faq_data, load_faq_data, preprocess_faq, augment_faqs
258
+ # from src.embedding import FAQEmbedder
259
+ # from src.llm_response import ResponseGenerator
260
+ # from src.utils import time_function, format_memory_stats, evaluate_response, evaluate_retrieval, baseline_keyword_search
261
+
262
+ # # Suppress CUDA warning and Torch path errors
263
+ # os.environ["CUDA_VISIBLE_DEVICES"] = ""
264
+ # os.environ["TORCH_NO_PATH_CHECK"] = "1"
265
+
266
+ # st.set_page_config(page_title="E-Commerce FAQ Chatbot", layout="wide", initial_sidebar_state="expanded")
267
+
268
+ # @time_function
269
+ # def initialize_components(use_huggingface: bool = True, model_name: str = "microsoft/phi-2", enable_augmentation: bool = True):
270
+ # """
271
+ # Initialize RAG system components
272
+ # """
273
+ # try:
274
+ # if use_huggingface:
275
+ # faqs = load_huggingface_faq_data("NebulaByte/E-Commerce_FAQs")
276
+ # else:
277
+ # faqs = load_faq_data("data/faq_data.csv")
278
+
279
+ # processed_faqs = augment_faqs(preprocess_faq(faqs), enable_augmentation=enable_augmentation)
280
+ # embedder = FAQEmbedder()
281
+
282
+ # if os.path.exists("embeddings"):
283
+ # embedder.load("embeddings")
284
+ # else:
285
+ # embedder.create_embeddings(processed_faqs)
286
+ # embedder.save("embeddings")
287
+
288
+ # gc.collect()
289
+ # if torch.cuda.is_available():
290
+ # torch.cuda.empty_cache()
291
+
292
+ # response_generator = ResponseGenerator(model_name=model_name)
293
+ # response_generator.generate_response("Warmup query", [{"question": "Test", "answer": "Test"}])
294
+
295
+ # return embedder, response_generator, len(processed_faqs)
296
+ # except Exception as e:
297
+ # st.error(f"Initialization failed: {e}")
298
+ # raise
299
+
300
+ # def main():
301
+ # st.title("E-Commerce Customer Support FAQ Chatbot")
302
+ # st.subheader("Ask about orders, shipping, returns, or other e-commerce queries")
303
+
304
+ # st.sidebar.title("Configuration")
305
+ # use_huggingface = st.sidebar.checkbox("Use Hugging Face Dataset", value=True)
306
+ # enable_augmentation = st.sidebar.checkbox("Enable FAQ Augmentation", value=True, help="Generate paraphrased questions to expand dataset")
307
+ # target_lang = st.sidebar.selectbox("Language", ["en", "es", "fr"], index=0)
308
+
309
+ # model_options = {
310
+ # "Phi-2 (Recommended for 16GB RAM)": "microsoft/phi-2",
311
+ # "TinyLlama-1.1B (Fastest)": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
312
+ # "Mistral-7B (For 15GB+ GPU)": "mistralai/Mistral-7B-Instruct-v0.1"
313
+ # }
314
+ # selected_model = st.sidebar.selectbox("Select LLM Model", list(model_options.keys()), index=0)
315
+ # model_name = model_options[selected_model]
316
+
317
+ # if st.sidebar.checkbox("Show Memory Usage", value=True):
318
+ # st.sidebar.subheader("Memory Usage")
319
+ # for key, value in format_memory_stats().items():
320
+ # st.sidebar.text(f"{key}: {value}")
321
+
322
+ # if "chat_history" not in st.session_state:
323
+ # st.session_state.chat_history = []
324
+ # if "query_cache" not in st.session_state:
325
+ # st.session_state.query_cache = {}
326
+ # if "feedback" not in st.session_state:
327
+ # st.session_state.feedback = []
328
+
329
+ # if "system_initialized" not in st.session_state or st.sidebar.button("Reload System"):
330
+ # with st.spinner("Initializing system..."):
331
+ # try:
332
+ # st.session_state.embedder, st.session_state.response_generator, num_faqs = initialize_components(
333
+ # use_huggingface=use_huggingface,
334
+ # model_name=model_name,
335
+ # enable_augmentation=enable_augmentation
336
+ # )
337
+ # st.session_state.system_initialized = True
338
+ # st.sidebar.success(f"System initialized with {num_faqs} FAQs!")
339
+ # except Exception as e:
340
+ # st.error(f"System initialization failed: {e}")
341
+ # return
342
+
343
+ # col1, col2 = st.columns([2, 1])
344
+
345
+ # with col1:
346
+ # st.subheader("Conversation")
347
+ # chat_container = st.container(height=400)
348
+ # with chat_container:
349
+ # for i, message in enumerate(st.session_state.chat_history):
350
+ # if message["role"] == "user":
351
+ # st.markdown(f"**You**: {message['content']}")
352
+ # else:
353
+ # st.markdown(f"**Bot**: {message['content']}")
354
+ # if i < len(st.session_state.chat_history) - 1:
355
+ # st.markdown("---")
356
+
357
+ # with st.form(key="chat_form"):
358
+ # user_query = st.text_input("Type your question:", key="user_input", placeholder="e.g., How do I track my order?")
359
+ # submit_button = st.form_submit_button("Ask")
360
+
361
+ # if len(st.session_state.chat_history) > 0:
362
+ # with st.form(key=f"feedback_form_{len(st.session_state.chat_history)}"):
363
+ # rating = st.slider("Rate this response (1-5)", 1, 5, key=f"rating_{len(st.session_state.chat_history)}")
364
+ # comments = st.text_area("Comments", key=f"comments_{len(st.session_state.chat_history)}")
365
+ # if st.form_submit_button("Submit Feedback"):
366
+ # st.session_state.feedback.append({
367
+ # "rating": rating,
368
+ # "comments": comments,
369
+ # "response": st.session_state.chat_history[-1]["content"]
370
+ # })
371
+ # with open("feedback.json", "w") as f:
372
+ # json.dump(st.session_state.feedback, f)
373
+ # st.success("Feedback submitted!")
374
+
375
+ # with col2:
376
+ # if st.session_state.get("system_initialized", False):
377
+ # st.subheader("Retrieved Information")
378
+ # info_container = st.container(height=500)
379
+ # with info_container:
380
+ # if "current_faqs" in st.session_state:
381
+ # for i, faq in enumerate(st.session_state.current_faqs):
382
+ # st.markdown(f"**Relevant FAQ #{i+1}**")
383
+ # st.markdown(f"**Q**: {faq['question']}")
384
+ # st.markdown(f"**A**: {faq['answer'][:150]}..." if len(faq['answer']) > 150 else f"**A**: {faq['answer']}")
385
+ # st.markdown(f"*Similarity Score*: {faq['similarity']:.2f}")
386
+ # if 'category' in faq and faq['category']:
387
+ # st.markdown(f"*Category*: {faq['category']}")
388
+ # st.markdown("---")
389
+ # else:
390
+ # st.markdown("Ask a question to see relevant FAQs.")
391
+
392
+ # if "retrieval_time" in st.session_state and "generation_time" in st.session_state:
393
+ # st.sidebar.subheader("Performance Metrics")
394
+ # st.sidebar.markdown(f"Retrieval time: {st.session_state.retrieval_time:.2f} seconds")
395
+ # st.sidebar.markdown(f"Response generation: {st.session_state.generation_time:.2f} seconds")
396
+ # st.sidebar.markdown(f"Total time: {st.session_state.retrieval_time + st.session_state.generation_time:.2f} seconds")
397
+
398
+ # if submit_button and user_query:
399
+ # from src.data_processing import translate_faq
400
+ # from googletrans import Translator
401
+ # translator = Translator()
402
+ # if target_lang != "en":
403
+ # user_query_translated = translator.translate(user_query, dest="en").text
404
+ # else:
405
+ # user_query_translated = user_query
406
+
407
+ # if user_query_translated in st.session_state.query_cache:
408
+ # response, relevant_faqs = st.session_state.query_cache[user_query_translated]
409
+ # else:
410
+ # gc.collect()
411
+ # if torch.cuda.is_available():
412
+ # torch.cuda.empty_cache()
413
+
414
+ # start_time = time.time()
415
+ # relevant_faqs = st.session_state.embedder.retrieve_relevant_faqs(user_query_translated)
416
+ # retrieval_time = time.time() - start_time
417
+
418
+ # if target_lang != "en":
419
+ # relevant_faqs = [translate_faq(faq, target_lang) for faq in relevant_faqs]
420
+
421
+ # start_time = time.time()
422
+ # response = st.session_state.response_generator.generate_response(user_query_translated, relevant_faqs)
423
+ # generation_time = time.time() - start_time
424
+
425
+ # if target_lang != "en":
426
+ # response = translator.translate(response, dest=target_lang).text
427
+
428
+ # st.session_state.query_cache[user_query_translated] = (response, relevant_faqs)
429
+ # st.session_state.retrieval_time = retrieval_time
430
+ # st.session_state.generation_time = generation_time
431
+ # st.session_state.current_faqs = relevant_faqs
432
+
433
+ # st.session_state.chat_history.append({"role": "user", "content": user_query})
434
+ # st.session_state.chat_history.append({"role": "assistant", "content": response})
435
+
436
+ # if st.button("Clear Chat History"):
437
+ # st.session_state.chat_history = []
438
+ # st.session_state.query_cache = {}
439
+ # gc.collect()
440
+ # if torch.cuda.is_available():
441
+ # torch.cuda.empty_cache()
442
+
443
+ # if st.session_state.get("system_initialized", False):
444
+ # st.sidebar.subheader("Baseline Comparison")
445
+ # baseline_faqs = baseline_keyword_search(user_query_translated if 'user_query_translated' in locals() else "", st.session_state.embedder.faqs)
446
+ # st.sidebar.write(f"RAG FAQs: {[faq['question'][:50] for faq in st.session_state.get('current_faqs', [])]}")
447
+ # st.sidebar.write(f"Keyword FAQs: {[faq['question'][:50] for faq in baseline_faqs]}")
448
+
449
+ # st.subheader("Sample Questions")
450
+ # sample_questions = [
451
+ # "How do I track my order?",
452
+ # "What should I do if my delivery is delayed?",
453
+ # "How do I return a product?",
454
+ # "Can I cancel my order after placing it?",
455
+ # "How quickly will my order be delivered?"
456
+ # ]
457
+ # cols = st.columns(2)
458
+ # for i, question in enumerate(sample_questions):
459
+ # col_idx = i % 2
460
+ # if cols[col_idx].button(question, key=f"sample_{i}"):
461
+ # st.session_state.user_input = question
462
+ # st.session_state.chat_history.append({"role": "user", "content": question})
463
+
464
+ # from src.data_processing import translate_faq
465
+ # from googletrans import Translator
466
+ # translator = Translator()
467
+ # if target_lang != "en":
468
+ # question_translated = translator.translate(question, dest="en").text
469
+ # else:
470
+ # question_translated = question
471
+
472
+ # if question_translated in st.session_state.query_cache:
473
+ # response, relevant_faqs = st.session_state.query_cache[question_translated]
474
+ # else:
475
+ # gc.collect()
476
+ # if torch.cuda.is_available():
477
+ # torch.cuda.empty_cache()
478
+
479
+ # start_time = time.time()
480
+ # relevant_faqs = st.session_state.embedder.retrieve_relevant_faqs(question_translated)
481
+ # retrieval_time = time.time() - start_time
482
+
483
+ # if target_lang != "en":
484
+ # relevant_faqs = [translate_faq(faq, target_lang) for faq in relevant_faqs]
485
+
486
+ # start_time = time.time()
487
+ # response = st.session_state.response_generator.generate_response(question_translated, relevant_faqs)
488
+ # generation_time = time.time() - start_time
489
+
490
+ # if target_lang != "en":
491
+ # response = translator.translate(response, dest=target_lang).text
492
+
493
+ # st.session_state.query_cache[question_translated] = (response, relevant_faqs)
494
+ # st.session_state.retrieval_time = retrieval_time
495
+ # st.session_state.generation_time = generation_time
496
+ # st.session_state.current_faqs = relevant_faqs
497
+
498
+ # st.session_state.chat_history.append({"role": "assistant", "content": response})
499
+
500
+ # if __name__ == "__main__":
501
+ # main()
requirements.txt CHANGED
@@ -11,28 +11,7 @@ accelerate>=0.20.0
11
  evaluate>=0.4.0
12
  scikit-learn>=1.2.0
13
  nlpaug>=1.1.0
14
- googletrans==4.0.0-rc1
15
- httpx==0.23.0 # Pinned to compatible version
16
- httpcore==0.15.0 # Pinned to compatible version
17
  psutil>=5.9.0
18
  nltk>=3.8.0
19
 
20
-
21
-
22
- # torch>=2.0.0
23
- # transformers>=4.30.0
24
- # sentence-transformers>=2.2.2
25
- # faiss-cpu>=1.7.4
26
- # pandas>=1.5.0
27
- # streamlit>=1.36.0
28
- # numpy>=1.24.0
29
- # datasets>=2.10.0
30
- # bitsandbytes>=0.40.0
31
- # accelerate>=0.20.0
32
- # evaluate>=0.4.0
33
- # scikit-learn>=1.2.0
34
- # nlpaug>=1.1.0
35
- # googletrans==4.0.0-rc1
36
- # psutil>=5.9.0
37
- # nltk>=3.8.0
38
-
 
11
  evaluate>=0.4.0
12
  scikit-learn>=1.2.0
13
  nlpaug>=1.1.0
14
+ deep-translator>=1.9.0
 
 
15
  psutil>=5.9.0
16
  nltk>=3.8.0
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/data_processing.py CHANGED
@@ -5,7 +5,7 @@ import nltk
5
  from typing import List, Dict, Any
6
  from datasets import load_dataset
7
  import nlpaug.augmenter.word as naw
8
- from googletrans import Translator
9
 
10
  # Configure NLTK data path and download required resources
11
  NLTK_DATA_PATH = os.path.join(os.path.dirname(__file__), "../nltk_data")
@@ -133,15 +133,166 @@ def augment_faqs(faqs: List[Dict[str, Any]], max_faqs: int = 1000, enable_augmen
133
 
134
  def translate_faq(faq: Dict[str, Any], target_lang: str = "es") -> Dict[str, Any]:
135
  """
136
- Translate FAQ to a target language
137
  """
138
  try:
139
- translator = Translator()
140
  translated = faq.copy()
141
- translated["question"] = translator.translate(faq["question"], dest=target_lang).text
142
- translated["answer"] = translator.translate(faq["answer"], dest=target_lang).text
143
  translated["language"] = target_lang
144
  return translated
145
  except Exception as e:
146
  print(f"Translation error: {e}")
147
- return faq
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  from typing import List, Dict, Any
6
  from datasets import load_dataset
7
  import nlpaug.augmenter.word as naw
8
+ from deep_translator import GoogleTranslator # Updated import
9
 
10
  # Configure NLTK data path and download required resources
11
  NLTK_DATA_PATH = os.path.join(os.path.dirname(__file__), "../nltk_data")
 
133
 
134
  def translate_faq(faq: Dict[str, Any], target_lang: str = "es") -> Dict[str, Any]:
135
  """
136
+ Translate FAQ to a target language using deep-translator
137
  """
138
  try:
139
+ translator = GoogleTranslator(source='en', target=target_lang)
140
  translated = faq.copy()
141
+ translated["question"] = translator.translate(faq["question"])
142
+ translated["answer"] = translator.translate(faq["answer"])
143
  translated["language"] = target_lang
144
  return translated
145
  except Exception as e:
146
  print(f"Translation error: {e}")
147
+ return faq
148
+
149
+
150
+
151
+
152
+ # import pandas as pd
153
+ # import json
154
+ # import os
155
+ # import nltk
156
+ # from typing import List, Dict, Any
157
+ # from datasets import load_dataset
158
+ # import nlpaug.augmenter.word as naw
159
+ # from googletrans import Translator
160
+
161
+ # # Configure NLTK data path and download required resources
162
+ # NLTK_DATA_PATH = os.path.join(os.path.dirname(__file__), "../nltk_data")
163
+ # os.makedirs(NLTK_DATA_PATH, exist_ok=True)
164
+ # nltk.data.path.append(NLTK_DATA_PATH)
165
+
166
+ # def ensure_nltk_resources():
167
+ # """
168
+ # Ensure NLTK resources are downloaded and available
169
+ # """
170
+ # try:
171
+ # nltk.download('averaged_perceptron_tagger', download_dir=NLTK_DATA_PATH)
172
+ # nltk.download('punkt', download_dir=NLTK_DATA_PATH)
173
+ # print(f"NLTK resources downloaded to {NLTK_DATA_PATH}")
174
+ # return True
175
+ # except Exception as e:
176
+ # print(f"Failed to download NLTK resources: {e}")
177
+ # return False
178
+
179
+ # def load_huggingface_faq_data(dataset_name: str = "NebulaByte/E-Commerce_FAQs") -> List[Dict[str, Any]]:
180
+ # """
181
+ # Load FAQ data from Hugging Face datasets, cache locally
182
+ # """
183
+ # local_path = "data/ecommerce_faqs.json"
184
+ # if os.path.exists(local_path):
185
+ # print(f"Loading cached dataset from {local_path}")
186
+ # with open(local_path, 'r') as f:
187
+ # return json.load(f)
188
+
189
+ # print(f"Loading dataset {dataset_name} from Hugging Face...")
190
+ # try:
191
+ # dataset = load_dataset(dataset_name)
192
+ # faqs = [{
193
+ # "question": item["question"],
194
+ # "answer": item["answer"],
195
+ # "category": item.get("category", ""),
196
+ # "question_id": item.get("question_id", ""),
197
+ # "faq_url": item.get("faq_url", "")
198
+ # } for item in dataset["train"]]
199
+ # with open(local_path, 'w') as f:
200
+ # json.dump(faqs, f)
201
+ # print(f"Saved dataset to {local_path}, loaded {len(faqs)} FAQs")
202
+ # return faqs
203
+ # except Exception as e:
204
+ # print(f"Error loading dataset: {e}")
205
+ # print("Falling back to local data...")
206
+ # return load_faq_data("data/faq_data.csv")
207
+
208
+ # def load_faq_data(file_path: str) -> List[Dict[str, Any]]:
209
+ # """
210
+ # Load FAQ data from a local CSV or JSON file
211
+ # """
212
+ # print(f"Loading data from {file_path}")
213
+ # try:
214
+ # if file_path.endswith('.csv'):
215
+ # df = pd.read_csv(file_path)
216
+ # faqs = df.to_dict('records')
217
+ # elif file_path.endswith('.json'):
218
+ # with open(file_path, 'r') as f:
219
+ # faqs = json.load(f)
220
+ # else:
221
+ # raise ValueError(f"Unsupported file format: {file_path}")
222
+ # print(f"Loaded {len(faqs)} FAQ entries")
223
+ # return faqs
224
+ # except Exception as e:
225
+ # print(f"Error loading data: {e}")
226
+ # print("Creating sample dataset as fallback")
227
+ # sample_faqs = [
228
+ # {"question": "How do I track my order?", "answer": "You can track your order by logging into your account and visiting the Order History section."},
229
+ # {"question": "How do I reset my password?", "answer": "To reset your password, click on the 'Forgot Password' link on the login page."}
230
+ # ]
231
+ # return sample_faqs
232
+
233
+ # def preprocess_faq(faqs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
234
+ # """
235
+ # Preprocess FAQ data: clean text, handle formatting, and filter invalid entries
236
+ # """
237
+ # processed_faqs = []
238
+ # for faq in faqs:
239
+ # # Safely handle question and answer fields
240
+ # question = faq.get('question')
241
+ # answer = faq.get('answer')
242
+
243
+ # # Convert to string and strip, handling None values
244
+ # question = str(question).strip() if question is not None else ""
245
+ # answer = str(answer).strip() if answer is not None else ""
246
+
247
+ # # Update FAQ dictionary
248
+ # faq['question'] = question
249
+ # faq['answer'] = answer
250
+
251
+ # # Only include FAQs with both question and answer
252
+ # if question and answer:
253
+ # processed_faqs.append(faq)
254
+ # else:
255
+ # print(f"Skipping invalid FAQ: question='{question}', answer='{answer}'")
256
+
257
+ # print(f"After preprocessing: {len(processed_faqs)} valid FAQ entries")
258
+ # return processed_faqs
259
+
260
+ # def augment_faqs(faqs: List[Dict[str, Any]], max_faqs: int = 1000, enable_augmentation: bool = True) -> List[Dict[str, Any]]:
261
+ # """
262
+ # Augment FAQs with paraphrased questions if enabled
263
+ # """
264
+ # if not enable_augmentation:
265
+ # print("Augmentation disabled; returning original FAQs")
266
+ # return faqs
267
+
268
+ # if not ensure_nltk_resources():
269
+ # print("NLTK resources unavailable; skipping augmentation")
270
+ # return faqs
271
+
272
+ # aug = naw.SynonymAug()
273
+ # augmented = []
274
+ # for faq in faqs:
275
+ # augmented.append(faq)
276
+ # if len(augmented) < max_faqs:
277
+ # try:
278
+ # aug_question = aug.augment(faq['question'])[0]
279
+ # augmented.append({"question": aug_question, "answer": faq['answer'], "category": faq.get("category", "")})
280
+ # except Exception as e:
281
+ # print(f"Augmentation error for question '{faq['question'][:50]}...': {e}")
282
+ # print(f"Augmented to {len(augmented)} FAQs")
283
+ # return augmented
284
+
285
+ # def translate_faq(faq: Dict[str, Any], target_lang: str = "es") -> Dict[str, Any]:
286
+ # """
287
+ # Translate FAQ to a target language
288
+ # """
289
+ # try:
290
+ # translator = Translator()
291
+ # translated = faq.copy()
292
+ # translated["question"] = translator.translate(faq["question"], dest=target_lang).text
293
+ # translated["answer"] = translator.translate(faq["answer"], dest=target_lang).text
294
+ # translated["language"] = target_lang
295
+ # return translated
296
+ # except Exception as e:
297
+ # print(f"Translation error: {e}")
298
+ # return faq