Spaces:
Running
Running
| import logging | |
| import os | |
| import sys | |
| import pandas as pd | |
# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


def prepare_100k_data(
    input_path: str = "data/raw/GoodReads_100k_books.csv",
    output_path: str = "data/raw/books_prepared.csv",
) -> None:
    """Convert the GoodReads 100k CSV to the layout expected by data_processor.

    Input columns:  author, bookformat, desc, genre, img, isbn, isbn13, link,
                    pages, rating, reviews, title, totalratings
    Output columns: title, authors, genres, description, tags, rating,
                    cover_image_url

    The ``tags`` column is a copy of ``genres`` (NaN replaced by ""). Any
    target column absent from the source is created as empty strings and a
    warning is logged. Rows without a title are dropped; remaining NaNs in
    the text columns become empty strings. ``rating`` is passed through
    unchanged.

    Args:
        input_path: Path of the source CSV. Defaults to the original
            hard-coded location.
        output_path: Path of the CSV to write. Its parent directory is
            created if it does not exist.

    Returns:
        None. If ``input_path`` does not exist an error is logged and the
        function returns without writing anything.

    Raises:
        Exception: Any error raised while reading, transforming or writing
            the data is logged and re-raised.
    """
    logger.info(f"Loading new 100k dataset from {input_path}...")

    if not os.path.exists(input_path):
        logger.error(f"Input file not found: {input_path}")
        return

    try:
        # Load data
        df = pd.read_csv(input_path)
        logger.info(f"Loaded {len(df)} books.")

        # Rename source columns to the target names ("rating" and "title"
        # already match and need no mapping).
        logger.info("Mapping columns...")
        df = df.rename(
            columns={
                "author": "authors",
                "desc": "description",
                "genre": "genres",
                "img": "cover_image_url",
            }
        )

        target_cols = [
            "title",
            "authors",
            "genres",
            "description",
            "tags",
            "rating",
            "cover_image_url",
        ]

        # Tags are derived from genres. Only do this when genres is actually
        # present; otherwise the loop below creates both columns as empty
        # strings instead of failing with a KeyError.
        if "genres" in df.columns:
            df["tags"] = df["genres"].fillna("")

        # Ensure all target columns exist
        for col in target_cols:
            if col not in df.columns:
                df[col] = ""
                logger.warning(f"Column {col} missing in source, filled with empty strings.")

        # Select and reorder
        df = df[target_cols]

        logger.info("Cleaning data...")
        # Remove rows with no title
        df = df.dropna(subset=["title"])
        # Fill NaNs in text columns. fillna with a mapping returns a new
        # frame, avoiding chained assignment on a derived DataFrame.
        text_cols = ["authors", "genres", "description", "cover_image_url"]
        df = df.fillna({col: "" for col in text_cols})

        logger.info(f"Saving prepared data to {output_path}...")
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        df.to_csv(output_path, index=False)
        logger.info(f"Successfully prepared {len(df)} books.")
    except Exception as e:
        logger.error(f"Error processing data: {e}")
        raise


# Script entry point: run the conversion with its default paths only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    prepare_100k_data()