import json
import os
from typing import List, Dict, Any

import nltk
import nlpaug.augmenter.word as naw
import pandas as pd
from datasets import load_dataset
from deep_translator import GoogleTranslator  # replaces the earlier googletrans-based translator

# Configure the NLTK data path so downloaded resources persist next to the project
NLTK_DATA_PATH = os.path.join(os.path.dirname(__file__), "../nltk_data")
os.makedirs(NLTK_DATA_PATH, exist_ok=True)
nltk.data.path.append(NLTK_DATA_PATH)

def ensure_nltk_resources():
    """
    Ensure NLTK resources are downloaded and available
    """
    try:
        nltk.download('averaged_perceptron_tagger', download_dir=NLTK_DATA_PATH)
        nltk.download('punkt', download_dir=NLTK_DATA_PATH)
        print(f"NLTK resources downloaded to {NLTK_DATA_PATH}")
        return True
    except Exception as e:
        print(f"Failed to download NLTK resources: {e}")
        return False

def load_huggingface_faq_data(dataset_name: str = "NebulaByte/E-Commerce_FAQs") -> List[Dict[str, Any]]:
    """
    Load FAQ data from Hugging Face datasets, cache locally
    """
    local_path = "data/ecommerce_faqs.json"
    if os.path.exists(local_path):
        print(f"Loading cached dataset from {local_path}")
        with open(local_path, 'r') as f:
            return json.load(f)
    print(f"Loading dataset {dataset_name} from Hugging Face...")
    try:
        dataset = load_dataset(dataset_name)
        faqs = [{
            "question": item["question"],
            "answer": item["answer"],
            "category": item.get("category", ""),
            "question_id": item.get("question_id", ""),
            "faq_url": item.get("faq_url", "")
        } for item in dataset["train"]]
        # Make sure the cache directory exists before writing
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        with open(local_path, 'w') as f:
            json.dump(faqs, f)
        print(f"Saved dataset to {local_path}, loaded {len(faqs)} FAQs")
        return faqs
    except Exception as e:
        print(f"Error loading dataset: {e}")
        print("Falling back to local data...")
        return load_faq_data("data/faq_data.csv")

def load_faq_data(file_path: str) -> List[Dict[str, Any]]:
    """
    Load FAQ data from a local CSV or JSON file
    """
    print(f"Loading data from {file_path}")
    try:
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
            faqs = df.to_dict('records')
        elif file_path.endswith('.json'):
            with open(file_path, 'r') as f:
                faqs = json.load(f)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")
        print(f"Loaded {len(faqs)} FAQ entries")
        return faqs
    except Exception as e:
        print(f"Error loading data: {e}")
        print("Creating sample dataset as fallback")
        sample_faqs = [
            {"question": "How do I track my order?", "answer": "You can track your order by logging into your account and visiting the Order History section."},
            {"question": "How do I reset my password?", "answer": "To reset your password, click on the 'Forgot Password' link on the login page."}
        ]
        return sample_faqs

def preprocess_faq(faqs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Preprocess FAQ data: clean text, handle formatting, and filter invalid entries
    """
    processed_faqs = []
    for faq in faqs:
        # Safely handle question and answer fields
        question = faq.get('question')
        answer = faq.get('answer')
        # Convert to string and strip, handling None values
        question = str(question).strip() if question is not None else ""
        answer = str(answer).strip() if answer is not None else ""
        # Update FAQ dictionary
        faq['question'] = question
        faq['answer'] = answer
        # Only include FAQs with both question and answer
        if question and answer:
            processed_faqs.append(faq)
        else:
            print(f"Skipping invalid FAQ: question='{question}', answer='{answer}'")
    print(f"After preprocessing: {len(processed_faqs)} valid FAQ entries")
    return processed_faqs

def augment_faqs(faqs: List[Dict[str, Any]], max_faqs: int = 1000, enable_augmentation: bool = True) -> List[Dict[str, Any]]:
    """
    Augment FAQs with paraphrased questions if enabled
    """
    if not enable_augmentation:
        print("Augmentation disabled; returning original FAQs")
        return faqs
    if not ensure_nltk_resources():
        print("NLTK resources unavailable; skipping augmentation")
        return faqs
    aug = naw.SynonymAug()
    augmented = []
    for faq in faqs:
        augmented.append(faq)
        if len(augmented) < max_faqs:
            try:
                # SynonymAug.augment returns a list of augmented strings in recent nlpaug releases
                aug_question = aug.augment(faq['question'])[0]
                augmented.append({"question": aug_question, "answer": faq['answer'], "category": faq.get("category", "")})
            except Exception as e:
                print(f"Augmentation error for question '{faq['question'][:50]}...': {e}")
    print(f"Augmented to {len(augmented)} FAQs")
    return augmented

def translate_faq(faq: Dict[str, Any], target_lang: str = "es") -> Dict[str, Any]:
    """
    Translate FAQ to a target language using deep-translator
    """
    try:
        translator = GoogleTranslator(source='en', target=target_lang)
        translated = faq.copy()
        translated["question"] = translator.translate(faq["question"])
        translated["answer"] = translator.translate(faq["answer"])
        translated["language"] = target_lang
        return translated
    except Exception as e:
        print(f"Translation error: {e}")
        return faq
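

# A minimal usage sketch chaining the helpers above: load -> preprocess -> augment -> translate.
# Assumptions (not part of the module as written): it is run as a script, the default dataset
# and cache paths are reachable, and Spanish ("es") is just an illustrative target language.
if __name__ == "__main__":
    raw_faqs = load_huggingface_faq_data()
    clean_faqs = preprocess_faq(raw_faqs)
    expanded_faqs = augment_faqs(clean_faqs, max_faqs=500, enable_augmentation=True)
    # Translate a single entry as a spot check
    if expanded_faqs:
        sample_translation = translate_faq(expanded_faqs[0], target_lang="es")
        print(sample_translation)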