import pandas as pd
import numpy as np


# Function Definitions

# Load a CSV file into a DataFrame
def load_data(file):
    return pd.read_csv(file, index_col=False)


# Handle duplicate rows
def remove_duplicate_rows(df):
    n_rows_before = len(df)
    df = df.drop_duplicates()
    print("Number of removed duplicated rows:", n_rows_before - len(df))
    return df


# One-hot encode categorical columns
def onehot_encoder(df, cols):
    encoded_cols = []
    for col in cols:
        encoder = pd.get_dummies(df[col])
        encoded_cols += list(encoder.columns)
        df = df.join(encoder)
        del df[col]
    return df, encoded_cols


# Deal with NaN values in specified columns
def fillna_values(df, cols, strategy='mean'):
    for col in cols:
        if strategy == 'median':
            df[col] = df[col].fillna(df[col].median())
        elif strategy == 'mean':
            df[col] = df[col].fillna(df[col].mean())
        else:
            raise ValueError('Invalid filling strategy')
    return df


# Preprocess books dataset
def preprocess_books(books):
    # Drop duplicates
    books = remove_duplicate_rows(books)

    # Get categorical columns
    cat_cols = ['language_code']

    # One-hot encode categoricals
    books, _ = onehot_encoder(books, cat_cols)

    # Fill NAs
    fillna_cols = ['average_rating', 'ratings_count', 'work_ratings_count',
                   'work_text_reviews_count']
    books = fillna_values(books, fillna_cols, strategy='mean')

    return books


# Preprocess tags dataset
def preprocess_tags(tags):
    return tags


# Preprocess book_tags dataset
def preprocess_book_tags(book_tags, tags):
    # Map tag_id to tag_name instead of dropping the column.
    # The lookup is built from the tags dataset, which is assumed to
    # hold the tag_id -> tag_name mapping.
    tag_mapping = dict(zip(tags["tag_id"], tags["tag_name"]))
    book_tags["tag_name"] = book_tags["tag_id"].map(tag_mapping)

    # Groupby aggregate
    agg_funcs = {'count': 'sum'}  # Sum or other functions according to requirement
    book_tags = book_tags.groupby(['goodreads_book_id'], as_index=False).agg(agg_funcs)

    return book_tags


# Preprocess goodbooks-10k dataset
def preprocess_goodbooks(goodbooks):
    # Scaling/softening extreme ratings above the threshold
    scaling_threshold = 4.5
    goodbooks['scaled_rating'] = np.where(
        goodbooks['rating'] > scaling_threshold,
        scaling_threshold - 0.5
        + ((scaling_threshold - 0.5) / (5 - scaling_threshold))
        * (goodbooks['rating'] - scaling_threshold),
        goodbooks['rating']
    )
    return goodbooks


# Merge and save dataset
def merge_and_save_dataset():
    # Input files
    files = {
        'books': '../data/books.csv',
        'book_tags': '../data/book_tags.csv',
        'goodbooks': '../data/goodbooks-10k.csv',
        'ratings': '../data/ratings.csv',
        'tags': '../data/tags.csv',
        'to_read': '../data/to_read.csv'
    }

    # Load and preprocess the datasets that take part in the merge
    preprocessed_books = preprocess_books(load_data(files['books']))
    preprocessed_tags = preprocess_tags(load_data(files['tags']))
    preprocessed_book_tags = preprocess_book_tags(load_data(files['book_tags']),
                                                  preprocessed_tags)
    preprocessed_goodbooks = preprocess_goodbooks(load_data(files['goodbooks']))

    # Merge books with their aggregated tag counts, then with the goodbooks data
    merged_dataset = pd.merge(
        left=pd.merge(preprocessed_books, preprocessed_book_tags,
                      left_index=True, right_on="goodreads_book_id"),
        right=preprocessed_goodbooks,
        left_index=True, right_index=True
    )

    # Additional cleanup: drop duplicated columns introduced by the merges
    merged_dataset = merged_dataset.loc[:, ~merged_dataset.columns.duplicated()]

    # Save the final dataset
    merged_dataset.to_csv("../data/final_dataset.csv", index=False)
if __name__ == "__main__":
    merge_and_save_dataset()