Spaces:

nice-bill
/

deepshelf-api

Running

App Files Files Community

deepshelf-api / scripts /prepare_100k_data.py

nice-bill

initial commit

cdb73a8 3 months ago

raw

history blame contribute delete

2.43 kB

	import logging
	import os
	import sys
	import pandas as pd

	# Configure logging once for the script: timestamped, INFO level and above.
	logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
	logger = logging.getLogger(__name__)


	def prepare_100k_data(
	    input_path: str = "data/raw/GoodReads_100k_books.csv",
	    output_path: str = "data/raw/books_prepared.csv",
	) -> None:
	    """
	    Convert GoodReads_100k_books.csv format to the format expected by data_processor.

	    Input Columns: author, bookformat, desc, genre, img, isbn, isbn13, link, pages, rating, reviews, title, totalratings
	    Target Columns: title, authors, genres, description, tags, rating, cover_image_url

	    Args:
	        input_path: Path of the CSV file to read. Defaults to the location
	            this script has always used.
	        output_path: Path of the CSV file to write. Its parent directory is
	            created if it does not exist yet.

	    Returns:
	        None. When ``input_path`` does not exist an error is logged and the
	        function returns without writing anything.

	    Raises:
	        Exception: Any error raised while reading, transforming or writing
	            the data is logged (with traceback) and re-raised unchanged.
	    """
	    logger.info(f"Loading new 100k dataset from {input_path}...")

	    if not os.path.exists(input_path):
	        logger.error(f"Input file not found: {input_path}")
	        return

	    target_cols = ["title", "authors", "genres", "description", "tags", "rating", "cover_image_url"]
	    text_cols = ["authors", "genres", "description", "cover_image_url"]

	    try:
	        # Load data
	        df = pd.read_csv(input_path)
	        logger.info(f"Loaded {len(df)} books.")

	        # Rename source columns to the names data_processor expects.
	        # "rating" and "title" already have their target names.
	        logger.info("Mapping columns...")
	        df = df.rename(columns={
	            "author": "authors",
	            "desc": "description",
	            "genre": "genres",
	            "img": "cover_image_url",
	        })

	        # Backfill missing source columns BEFORE deriving tags: tags is built
	        # from genres, so a source file without a genre column would otherwise
	        # fail with KeyError instead of falling back to empty strings.
	        for col in target_cols:
	            if col != "tags" and col not in df.columns:
	                df[col] = ""
	                logger.warning(f"Column {col} missing in source, filled with empty strings.")

	        # Create tags column (using genres as base if available, else empty)
	        df["tags"] = df["genres"].fillna("")

	        # Select and reorder, then drop rows with no title. The explicit copy
	        # makes the frame independent so the in-place fill below is unambiguous.
	        logger.info("Cleaning data...")
	        df = df[target_cols].dropna(subset=["title"]).copy()

	        # Fill NaNs in text columns
	        df[text_cols] = df[text_cols].fillna("")

	        logger.info(f"Saving prepared data to {output_path}...")
	        out_dir = os.path.dirname(output_path)
	        if out_dir:
	            os.makedirs(out_dir, exist_ok=True)
	        df.to_csv(output_path, index=False)
	        logger.info(f"Successfully prepared {len(df)} books.")

	    except Exception as e:
	        # Same message as before, now with the traceback attached.
	        logger.exception("Error processing data: %s", e)
	        raise

	# Entry point: run the conversion only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == "__main__":
	    prepare_100k_data()