import pandas as pd
import json
import os
import nltk
from typing import List, Dict, Any
from datasets import load_dataset
import nlpaug.augmenter.word as naw
from deep_translator import GoogleTranslator  # Updated import

# Configure NLTK data path and download required resources
NLTK_DATA_PATH = os.path.join(os.path.dirname(__file__), "../nltk_data")
os.makedirs(NLTK_DATA_PATH, exist_ok=True)
nltk.data.path.append(NLTK_DATA_PATH)

def ensure_nltk_resources():
    """
    Ensure NLTK resources are downloaded and available
    """
    try:
        nltk.download('averaged_perceptron_tagger', download_dir=NLTK_DATA_PATH)
        nltk.download('punkt', download_dir=NLTK_DATA_PATH)
        # SynonymAug draws replacements from WordNet, so fetch those corpora too
        nltk.download('wordnet', download_dir=NLTK_DATA_PATH)
        nltk.download('omw-1.4', download_dir=NLTK_DATA_PATH)
        print(f"NLTK resources downloaded to {NLTK_DATA_PATH}")
        return True
    except Exception as e:
        print(f"Failed to download NLTK resources: {e}")
        return False

def load_huggingface_faq_data(dataset_name: str = "NebulaByte/E-Commerce_FAQs") -> List[Dict[str, Any]]:
    """
    Load FAQ data from Hugging Face datasets, cache locally
    """
    local_path = "data/ecommerce_faqs.json"
    if os.path.exists(local_path):
        print(f"Loading cached dataset from {local_path}")
        with open(local_path, 'r') as f:
            return json.load(f)
    
    print(f"Loading dataset {dataset_name} from Hugging Face...")
    try:
        dataset = load_dataset(dataset_name)
        faqs = [{
            "question": item["question"],
            "answer": item["answer"],
            "category": item.get("category", ""),
            "question_id": item.get("question_id", ""),
            "faq_url": item.get("faq_url", "")
        } for item in dataset["train"]]
        # Make sure the cache directory exists before writing
        os.makedirs(os.path.dirname(local_path) or ".", exist_ok=True)
        with open(local_path, 'w') as f:
            json.dump(faqs, f)
        print(f"Saved dataset to {local_path}, loaded {len(faqs)} FAQs")
        return faqs
    except Exception as e:
        print(f"Error loading dataset: {e}")
        print("Falling back to local data...")
        return load_faq_data("data/faq_data.csv")

def load_faq_data(file_path: str) -> List[Dict[str, Any]]:
    """
    Load FAQ data from a local CSV or JSON file
    """
    print(f"Loading data from {file_path}")
    try:
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
            faqs = df.to_dict('records')
        elif file_path.endswith('.json'):
            with open(file_path, 'r') as f:
                faqs = json.load(f)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")
        print(f"Loaded {len(faqs)} FAQ entries")
        return faqs
    except Exception as e:
        print(f"Error loading data: {e}")
        print("Creating sample dataset as fallback")
        sample_faqs = [
            {"question": "How do I track my order?", "answer": "You can track your order by logging into your account and visiting the Order History section."},
            {"question": "How do I reset my password?", "answer": "To reset your password, click on the 'Forgot Password' link on the login page."}
        ]
        return sample_faqs

def preprocess_faq(faqs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Preprocess FAQ data: clean text, handle formatting, and filter invalid entries
    """
    processed_faqs = []
    for faq in faqs:
        # Safely handle question and answer fields
        question = faq.get('question')
        answer = faq.get('answer')
        
        # Convert to string and strip, handling None values
        question = str(question).strip() if question is not None else ""
        answer = str(answer).strip() if answer is not None else ""
        
        # Update FAQ dictionary
        faq['question'] = question
        faq['answer'] = answer
        
        # Only include FAQs with both question and answer
        if question and answer:
            processed_faqs.append(faq)
        else:
            print(f"Skipping invalid FAQ: question='{question}', answer='{answer}'")
    
    print(f"After preprocessing: {len(processed_faqs)} valid FAQ entries")
    return processed_faqs

def augment_faqs(faqs: List[Dict[str, Any]], max_faqs: int = 1000, enable_augmentation: bool = True) -> List[Dict[str, Any]]:
    """
    Augment FAQs with paraphrased questions if enabled
    """
    if not enable_augmentation:
        print("Augmentation disabled; returning original FAQs")
        return faqs
    
    if not ensure_nltk_resources():
        print("NLTK resources unavailable; skipping augmentation")
        return faqs
    
    aug = naw.SynonymAug()
    augmented = []
    for faq in faqs:
        augmented.append(faq)
        if len(augmented) < max_faqs:
            try:
                # aug.augment may return a list (newer nlpaug) or a plain string
                result = aug.augment(faq['question'])
                aug_question = result[0] if isinstance(result, list) else result
                augmented.append({"question": aug_question, "answer": faq['answer'], "category": faq.get("category", "")})
            except Exception as e:
                print(f"Augmentation error for question '{faq['question'][:50]}...': {e}")
    print(f"Augmented to {len(augmented)} FAQs")
    return augmented

def translate_faq(faq: Dict[str, Any], target_lang: str = "es") -> Dict[str, Any]:
    """
    Translate FAQ to a target language using deep-translator
    """
    try:
        translator = GoogleTranslator(source='en', target=target_lang)
        translated = faq.copy()
        translated["question"] = translator.translate(faq["question"])
        translated["answer"] = translator.translate(faq["answer"])
        translated["language"] = target_lang
        return translated
    except Exception as e:
        print(f"Translation error: {e}")
        return faq
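

# Minimal usage sketch (assumptions: run from the project root so the relative
# "data/" paths resolve, and network access is available for the Hugging Face
# dataset and Google Translate). It simply chains the loaders and transforms
# defined above and is meant as an illustration, not a fixed entry point.
if __name__ == "__main__":
    raw_faqs = load_huggingface_faq_data()
    clean_faqs = preprocess_faq(raw_faqs)
    all_faqs = augment_faqs(clean_faqs, max_faqs=1000, enable_augmentation=True)
    # Translate the first FAQ to Spanish as a quick sanity check
    if all_faqs:
        print(translate_faq(all_faqs[0], target_lang="es"))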



