# faq-rag-chatbot / src/data_processing.py
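"""
Data loading, preprocessing, augmentation, and translation utilities
for the FAQ RAG chatbot.
"""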
import pandas as pd
import json
import os
import nltk
from typing import List, Dict, Any
from datasets import load_dataset
import nlpaug.augmenter.word as naw
from deep_translator import GoogleTranslator # Updated import
# Configure NLTK data path and download required resources
NLTK_DATA_PATH = os.path.join(os.path.dirname(__file__), "../nltk_data")
os.makedirs(NLTK_DATA_PATH, exist_ok=True)
nltk.data.path.append(NLTK_DATA_PATH)
def ensure_nltk_resources():
"""
Ensure NLTK resources are downloaded and available
"""
try:
        nltk.download('averaged_perceptron_tagger', download_dir=NLTK_DATA_PATH)
        nltk.download('punkt', download_dir=NLTK_DATA_PATH)
        # WordNet corpora are required by nlpaug's SynonymAug for synonym lookup
        nltk.download('wordnet', download_dir=NLTK_DATA_PATH)
        nltk.download('omw-1.4', download_dir=NLTK_DATA_PATH)
print(f"NLTK resources downloaded to {NLTK_DATA_PATH}")
return True
except Exception as e:
print(f"Failed to download NLTK resources: {e}")
return False
def load_huggingface_faq_data(dataset_name: str = "NebulaByte/E-Commerce_FAQs") -> List[Dict[str, Any]]:
"""
Load FAQ data from Hugging Face datasets, cache locally
"""
    local_path = "data/ecommerce_faqs.json"
    # Make sure the cache directory exists before reading from or writing to it
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
if os.path.exists(local_path):
print(f"Loading cached dataset from {local_path}")
with open(local_path, 'r') as f:
return json.load(f)
print(f"Loading dataset {dataset_name} from Hugging Face...")
try:
dataset = load_dataset(dataset_name)
faqs = [{
"question": item["question"],
"answer": item["answer"],
"category": item.get("category", ""),
"question_id": item.get("question_id", ""),
"faq_url": item.get("faq_url", "")
} for item in dataset["train"]]
with open(local_path, 'w') as f:
json.dump(faqs, f)
print(f"Saved dataset to {local_path}, loaded {len(faqs)} FAQs")
return faqs
except Exception as e:
print(f"Error loading dataset: {e}")
print("Falling back to local data...")
return load_faq_data("data/faq_data.csv")
def load_faq_data(file_path: str) -> List[Dict[str, Any]]:
"""
Load FAQ data from a local CSV or JSON file
"""
print(f"Loading data from {file_path}")
try:
if file_path.endswith('.csv'):
df = pd.read_csv(file_path)
faqs = df.to_dict('records')
elif file_path.endswith('.json'):
with open(file_path, 'r') as f:
faqs = json.load(f)
else:
raise ValueError(f"Unsupported file format: {file_path}")
print(f"Loaded {len(faqs)} FAQ entries")
return faqs
except Exception as e:
print(f"Error loading data: {e}")
print("Creating sample dataset as fallback")
sample_faqs = [
{"question": "How do I track my order?", "answer": "You can track your order by logging into your account and visiting the Order History section."},
{"question": "How do I reset my password?", "answer": "To reset your password, click on the 'Forgot Password' link on the login page."}
]
return sample_faqs
def preprocess_faq(faqs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Preprocess FAQ data: clean text, handle formatting, and filter invalid entries
"""
processed_faqs = []
for faq in faqs:
# Safely handle question and answer fields
question = faq.get('question')
answer = faq.get('answer')
# Convert to string and strip, handling None values
question = str(question).strip() if question is not None else ""
answer = str(answer).strip() if answer is not None else ""
# Update FAQ dictionary
faq['question'] = question
faq['answer'] = answer
# Only include FAQs with both question and answer
if question and answer:
processed_faqs.append(faq)
else:
print(f"Skipping invalid FAQ: question='{question}', answer='{answer}'")
print(f"After preprocessing: {len(processed_faqs)} valid FAQ entries")
return processed_faqs
def augment_faqs(faqs: List[Dict[str, Any]], max_faqs: int = 1000, enable_augmentation: bool = True) -> List[Dict[str, Any]]:
"""
Augment FAQs with paraphrased questions if enabled
"""
if not enable_augmentation:
print("Augmentation disabled; returning original FAQs")
return faqs
if not ensure_nltk_resources():
print("NLTK resources unavailable; skipping augmentation")
return faqs
aug = naw.SynonymAug()
augmented = []
for faq in faqs:
augmented.append(faq)
if len(augmented) < max_faqs:
try:
                # nlpaug returns a list of strings in recent versions and a
                # plain string in older ones; handle both
                result = aug.augment(faq['question'])
                aug_question = result[0] if isinstance(result, list) else result
augmented.append({"question": aug_question, "answer": faq['answer'], "category": faq.get("category", "")})
except Exception as e:
print(f"Augmentation error for question '{faq['question'][:50]}...': {e}")
print(f"Augmented to {len(augmented)} FAQs")
return augmented
def translate_faq(faq: Dict[str, Any], target_lang: str = "es") -> Dict[str, Any]:
"""
Translate FAQ to a target language using deep-translator
"""
try:
translator = GoogleTranslator(source='en', target=target_lang)
translated = faq.copy()
translated["question"] = translator.translate(faq["question"])
translated["answer"] = translator.translate(faq["answer"])
translated["language"] = target_lang
return translated
except Exception as e:
print(f"Translation error: {e}")
return faq
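

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original pipeline): load the FAQ
    # dataset, preprocess it, optionally augment it, and translate one entry.
    # The augmentation flag and target language below are illustrative choices.
    faqs = load_huggingface_faq_data()
    faqs = preprocess_faq(faqs)
    faqs = augment_faqs(faqs, max_faqs=1000, enable_augmentation=False)
    if faqs:
        print(translate_faq(faqs[0], target_lang="es"))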