| import pandas as pd
|
| import json
|
|
|
_NO_PRICE_INFO = "Price info not available"


def _value_or_default(row, column, default):
    """Return ``row[column]``, or *default* if the column is absent or the cell is missing.

    ``Series.get`` alone only falls back when the column does not exist; an
    empty CSV cell comes back as NaN and would otherwise be rendered as "nan".
    """
    value = row.get(column, default)
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return default
    return value


def _load_prices_json(raw):
    """Parse the raw ``prices`` cell as JSON; return ``None`` if it cannot be parsed."""
    # Try the cell verbatim first: collapsing '""' unconditionally would break
    # valid JSON that contains an empty string value. The collapsed form is
    # kept as a fallback for cells that still carry CSV-style doubled quotes.
    for candidate in (raw, raw.replace('""', '"')):
        try:
            return json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
    return None


def _as_finite_number(value):
    """Return *value* as a finite float, or ``None`` if it is not a usable amount."""
    if isinstance(value, bool):
        return None
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return None
    # The chained comparison is False for NaN and for +/- infinity.
    if float("-inf") < number < float("inf"):
        return number
    return None


def _format_best_price(prices_raw):
    """Build the "<amount> <currency>" string for the cheapest listed price.

    Returns the "Price info not available" placeholder when the cell is not a
    string, is not a JSON list, or contains no entry with a usable
    ``amountMin``.
    """
    if not isinstance(prices_raw, str):
        return _NO_PRICE_INFO

    prices_data = _load_prices_json(prices_raw)
    if not isinstance(prices_data, list):
        return _NO_PRICE_INFO

    priced_entries = []
    for entry in prices_data:
        if not isinstance(entry, dict):
            continue
        amount = _as_finite_number(entry.get('amountMin'))
        if amount is not None:
            priced_entries.append((amount, entry))

    if not priced_entries:
        return _NO_PRICE_INFO

    # Take the currency from the same entry that supplies the lowest amount,
    # so the amount and its currency always belong together.
    _, cheapest = min(priced_entries, key=lambda pair: pair[0])
    currency = cheapest.get('currency') or 'USD'
    return f"{cheapest['amountMin']} {currency}"


def preprocess_data(file_path):
    """Turn a product-review CSV into a list of text documents with metadata.

    Rows without a ``name`` or ``reviews.text`` value are dropped. For every
    remaining row one document is produced.

    Args:
        file_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
            It must contain the ``name`` and ``reviews.text`` columns; the
            ``brand``, ``categories``, ``reviews.title``, ``reviews.rating``
            and ``prices`` columns are optional.

    Returns:
        A list of dicts, each with a ``"content"`` string (the formatted
        product/review text) and a ``"metadata"`` dict holding ``name``,
        ``brand``, ``rating`` (as a string) and ``price``.

    Raises:
        KeyError: If the ``name`` or ``reviews.text`` column is missing.
    """
    df = pd.read_csv(file_path)
    df = df.dropna(subset=['reviews.text', 'name'])

    documents = []
    for _, row in df.iterrows():
        name = row['name']
        text = row['reviews.text']

        # Optional fields fall back to a placeholder both when the column is
        # absent and when the individual cell is empty.
        brand = _value_or_default(row, 'brand', 'Unknown')
        categories = _value_or_default(row, 'categories', 'N/A')
        title = _value_or_default(row, 'reviews.title', '')
        rating = _value_or_default(row, 'reviews.rating', 'N/A')

        price_str = _format_best_price(row.get('prices'))

        doc_content = f"Product: {name}\nBrand: {brand}\nCategories: {categories}\nPrice: {price_str}\nReview Title: {title}\nRating: {rating}\nReview Content: {text}"

        metadata = {
            "name": name,
            "brand": brand,
            "rating": str(rating),
            "price": price_str,
        }

        documents.append({"content": doc_content, "metadata": metadata})

    return documents
|
|
|
if __name__ == "__main__":
    # Script entry point: convert the raw review export into documents and
    # persist them as pretty-printed JSON next to the script.
    review_documents = preprocess_data("7817_1.csv")

    with open("preprocessed_docs.json", "w") as output_file:
        json.dump(review_documents, output_file, indent=2)

    # Report how many documents were written.
    document_count = len(review_documents)
    print(f"Preprocessed {document_count} documents.")
|
|
|