import pandas as pd
import json
import os
import nltk
from typing import List, Dict, Any
from datasets import load_dataset
import nlpaug.augmenter.word as naw
from deep_translator import GoogleTranslator  # Updated import

# Configure NLTK data path and download required resources
NLTK_DATA_PATH = os.path.join(os.path.dirname(__file__), "../nltk_data")
os.makedirs(NLTK_DATA_PATH, exist_ok=True)
nltk.data.path.append(NLTK_DATA_PATH)


def ensure_nltk_resources():
    """
    Ensure NLTK resources are downloaded and available
    """
    try:
        nltk.download('averaged_perceptron_tagger', download_dir=NLTK_DATA_PATH)
        nltk.download('punkt', download_dir=NLTK_DATA_PATH)
        # WordNet data is used by nlpaug's SynonymAug for synonym lookup
        nltk.download('wordnet', download_dir=NLTK_DATA_PATH)
        nltk.download('omw-1.4', download_dir=NLTK_DATA_PATH)
        print(f"NLTK resources downloaded to {NLTK_DATA_PATH}")
        return True
    except Exception as e:
        print(f"Failed to download NLTK resources: {e}")
        return False


def load_huggingface_faq_data(dataset_name: str = "NebulaByte/E-Commerce_FAQs") -> List[Dict[str, Any]]:
    """
    Load FAQ data from Hugging Face datasets, cache locally
    """
    local_path = "data/ecommerce_faqs.json"
    if os.path.exists(local_path):
        print(f"Loading cached dataset from {local_path}")
        with open(local_path, 'r') as f:
            return json.load(f)
    print(f"Loading dataset {dataset_name} from Hugging Face...")
    try:
        dataset = load_dataset(dataset_name)
        faqs = [{
            "question": item["question"],
            "answer": item["answer"],
            "category": item.get("category", ""),
            "question_id": item.get("question_id", ""),
            "faq_url": item.get("faq_url", "")
        } for item in dataset["train"]]
        # Ensure the cache directory exists before writing
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        with open(local_path, 'w') as f:
            json.dump(faqs, f)
        print(f"Saved dataset to {local_path}, loaded {len(faqs)} FAQs")
        return faqs
    except Exception as e:
        print(f"Error loading dataset: {e}")
        print("Falling back to local data...")
        return load_faq_data("data/faq_data.csv")


def load_faq_data(file_path: str) -> List[Dict[str, Any]]:
    """
    Load FAQ data from a local CSV or JSON file
    """
    print(f"Loading data from {file_path}")
    try:
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
            faqs = df.to_dict('records')
        elif file_path.endswith('.json'):
            with open(file_path, 'r') as f:
                faqs = json.load(f)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")
        print(f"Loaded {len(faqs)} FAQ entries")
        return faqs
    except Exception as e:
        print(f"Error loading data: {e}")
        print("Creating sample dataset as fallback")
        sample_faqs = [
            {"question": "How do I track my order?",
             "answer": "You can track your order by logging into your account and visiting the Order History section."},
            {"question": "How do I reset my password?",
             "answer": "To reset your password, click on the 'Forgot Password' link on the login page."}
        ]
        return sample_faqs


def preprocess_faq(faqs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Preprocess FAQ data: clean text, handle formatting, and filter invalid entries
    """
    processed_faqs = []
    for faq in faqs:
        # Safely handle question and answer fields
        question = faq.get('question')
        answer = faq.get('answer')
        # Convert to string and strip, handling None values
        question = str(question).strip() if question is not None else ""
        answer = str(answer).strip() if answer is not None else ""
        # Update FAQ dictionary
        faq['question'] = question
        faq['answer'] = answer
        # Only include FAQs with both question and answer
        if question and answer:
            processed_faqs.append(faq)
        else:
            print(f"Skipping invalid FAQ: question='{question}', answer='{answer}'")
    print(f"After preprocessing: {len(processed_faqs)} valid FAQ entries")
    return processed_faqs


def augment_faqs(faqs: List[Dict[str, Any]], max_faqs: int = 1000, enable_augmentation: bool = True) -> List[Dict[str, Any]]:
""" Augment FAQs with paraphrased questions if enabled """ if not enable_augmentation: print("Augmentation disabled; returning original FAQs") return faqs if not ensure_nltk_resources(): print("NLTK resources unavailable; skipping augmentation") return faqs aug = naw.SynonymAug() augmented = [] for faq in faqs: augmented.append(faq) if len(augmented) < max_faqs: try: aug_question = aug.augment(faq['question'])[0] augmented.append({"question": aug_question, "answer": faq['answer'], "category": faq.get("category", "")}) except Exception as e: print(f"Augmentation error for question '{faq['question'][:50]}...': {e}") print(f"Augmented to {len(augmented)} FAQs") return augmented def translate_faq(faq: Dict[str, Any], target_lang: str = "es") -> Dict[str, Any]: """ Translate FAQ to a target language using deep-translator """ try: translator = GoogleTranslator(source='en', target=target_lang) translated = faq.copy() translated["question"] = translator.translate(faq["question"]) translated["answer"] = translator.translate(faq["answer"]) translated["language"] = target_lang return translated except Exception as e: print(f"Translation error: {e}") return faq # import pandas as pd # import json # import os # import nltk # from typing import List, Dict, Any # from datasets import load_dataset # import nlpaug.augmenter.word as naw # from googletrans import Translator # # Configure NLTK data path and download required resources # NLTK_DATA_PATH = os.path.join(os.path.dirname(__file__), "../nltk_data") # os.makedirs(NLTK_DATA_PATH, exist_ok=True) # nltk.data.path.append(NLTK_DATA_PATH) # def ensure_nltk_resources(): # """ # Ensure NLTK resources are downloaded and available # """ # try: # nltk.download('averaged_perceptron_tagger', download_dir=NLTK_DATA_PATH) # nltk.download('punkt', download_dir=NLTK_DATA_PATH) # print(f"NLTK resources downloaded to {NLTK_DATA_PATH}") # return True # except Exception as e: # print(f"Failed to download NLTK resources: {e}") # return False # def load_huggingface_faq_data(dataset_name: str = "NebulaByte/E-Commerce_FAQs") -> List[Dict[str, Any]]: # """ # Load FAQ data from Hugging Face datasets, cache locally # """ # local_path = "data/ecommerce_faqs.json" # if os.path.exists(local_path): # print(f"Loading cached dataset from {local_path}") # with open(local_path, 'r') as f: # return json.load(f) # print(f"Loading dataset {dataset_name} from Hugging Face...") # try: # dataset = load_dataset(dataset_name) # faqs = [{ # "question": item["question"], # "answer": item["answer"], # "category": item.get("category", ""), # "question_id": item.get("question_id", ""), # "faq_url": item.get("faq_url", "") # } for item in dataset["train"]] # with open(local_path, 'w') as f: # json.dump(faqs, f) # print(f"Saved dataset to {local_path}, loaded {len(faqs)} FAQs") # return faqs # except Exception as e: # print(f"Error loading dataset: {e}") # print("Falling back to local data...") # return load_faq_data("data/faq_data.csv") # def load_faq_data(file_path: str) -> List[Dict[str, Any]]: # """ # Load FAQ data from a local CSV or JSON file # """ # print(f"Loading data from {file_path}") # try: # if file_path.endswith('.csv'): # df = pd.read_csv(file_path) # faqs = df.to_dict('records') # elif file_path.endswith('.json'): # with open(file_path, 'r') as f: # faqs = json.load(f) # else: # raise ValueError(f"Unsupported file format: {file_path}") # print(f"Loaded {len(faqs)} FAQ entries") # return faqs # except Exception as e: # print(f"Error loading data: {e}") # print("Creating sample dataset 
as fallback") # sample_faqs = [ # {"question": "How do I track my order?", "answer": "You can track your order by logging into your account and visiting the Order History section."}, # {"question": "How do I reset my password?", "answer": "To reset your password, click on the 'Forgot Password' link on the login page."} # ] # return sample_faqs # def preprocess_faq(faqs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: # """ # Preprocess FAQ data: clean text, handle formatting, and filter invalid entries # """ # processed_faqs = [] # for faq in faqs: # # Safely handle question and answer fields # question = faq.get('question') # answer = faq.get('answer') # # Convert to string and strip, handling None values # question = str(question).strip() if question is not None else "" # answer = str(answer).strip() if answer is not None else "" # # Update FAQ dictionary # faq['question'] = question # faq['answer'] = answer # # Only include FAQs with both question and answer # if question and answer: # processed_faqs.append(faq) # else: # print(f"Skipping invalid FAQ: question='{question}', answer='{answer}'") # print(f"After preprocessing: {len(processed_faqs)} valid FAQ entries") # return processed_faqs # def augment_faqs(faqs: List[Dict[str, Any]], max_faqs: int = 1000, enable_augmentation: bool = True) -> List[Dict[str, Any]]: # """ # Augment FAQs with paraphrased questions if enabled # """ # if not enable_augmentation: # print("Augmentation disabled; returning original FAQs") # return faqs # if not ensure_nltk_resources(): # print("NLTK resources unavailable; skipping augmentation") # return faqs # aug = naw.SynonymAug() # augmented = [] # for faq in faqs: # augmented.append(faq) # if len(augmented) < max_faqs: # try: # aug_question = aug.augment(faq['question'])[0] # augmented.append({"question": aug_question, "answer": faq['answer'], "category": faq.get("category", "")}) # except Exception as e: # print(f"Augmentation error for question '{faq['question'][:50]}...': {e}") # print(f"Augmented to {len(augmented)} FAQs") # return augmented # def translate_faq(faq: Dict[str, Any], target_lang: str = "es") -> Dict[str, Any]: # """ # Translate FAQ to a target language # """ # try: # translator = Translator() # translated = faq.copy() # translated["question"] = translator.translate(faq["question"], dest=target_lang).text # translated["answer"] = translator.translate(faq["answer"], dest=target_lang).text # translated["language"] = target_lang # return translated # except Exception as e: # print(f"Translation error: {e}") # return faq