Spaces:
Running
Running
File size: 1,299 Bytes
dc4b86a dc7bbeb dc4b86a abf887e dc4b86a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
from z_utils import get_dataframe
# Constants: the input dataset and the two output file names derived from it
ORIGNAL_DF = "books_summary.csv"
CLEAN_DF = f"clean_{ORIGNAL_DF}"
CLEAN_DF_UNIQUE_TITLES = f"unique_titles_{ORIGNAL_DF}"

# Load the raw dataset and report its shape before any cleaning
books_df = get_dataframe(ORIGNAL_DF)
print(f"Original Shape: {books_df.shape}")

# Keep only the required columns; every other column is discarded
req_columns = ['book_name', 'summaries', 'categories']
books_df = books_df.loc[:, req_columns]

# Report the per-column null counts, then drop every row that contains a null
null_counts = books_df.isna().sum()
print(f"\n\nNulls Count=== \n{null_counts}")
books_df = books_df.dropna(axis=0)

# Report how many fully identical rows remain, then remove them
duplicate_count = books_df.duplicated().sum()
print(f"\n\nDuplicate Records: {duplicate_count}")
books_df = books_df.drop_duplicates()

# Shape after cleaning
print(f"\n\nCleaned Shape: {books_df.shape}")

# Persist the cleaned frame; a title may still appear once per category here
print(f"Storing cleaned as (this includes same titles with diff cats: {CLEAN_DF}")
books_df.to_csv(CLEAN_DF, index=False)

# ==== Second export: title/summary pairs without the category column ====
# NOTE(review): rows are de-duplicated on the (book_name, summaries) pair, so
# a title that has differing summaries will still appear more than once --
# confirm that this is the intended meaning of "unique titles".
books_df = books_df[["book_name", "summaries"]].drop_duplicates()
print(f"\n\nDF w/ unique titles Shape: {books_df.shape}")

# Persist the title/summary-only frame
print(f"Storing dataset w/ unqiue titles & summaries only {CLEAN_DF_UNIQUE_TITLES}")
books_df.to_csv(CLEAN_DF_UNIQUE_TITLES, index=False)
|