from z_utils import get_dataframe
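# NOTE: get_dataframe comes from the project-local z_utils module, which is
# not shown here. It is assumed to be a thin wrapper around pandas.read_csv,
# roughly equivalent to this sketch:
#
#   import pandas as pd
#
#   def get_dataframe(filename: str) -> pd.DataFrame:
#       """Load a CSV file into a DataFrame."""
#       return pd.read_csv(filename)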

# Constants
ORIGINAL_DF = "books_summary.csv"
CLEAN_DF = "clean_" + ORIGINAL_DF
CLEAN_DF_UNIQUE_TITLES = "unique_titles_" + ORIGINAL_DF

# Load dataset
books_df = get_dataframe(ORIGINAL_DF)

# Original stats
print(f"Original Shape: {books_df.shape}")

# Keep only the required columns, dropping the rest
req_columns = ['book_name', 'summaries', 'categories']
books_df = books_df[req_columns]  # another way would be .drop(...), sketched below
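# The drop-based equivalent mentioned above would look roughly like this
# (a sketch; it removes every column not listed in req_columns):
#   books_df = books_df.drop(columns=books_df.columns.difference(req_columns))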

# Check for nulls
print(f"\n\nNulls Count ===\n{books_df.isna().sum()}")
# Drop null rows entirely, since their remaining attributes don't contribute
books_df.dropna(axis=0, inplace=True)


# Check for and remove duplicates
print(f"\n\nDuplicate Records: {books_df.duplicated().sum()}")
books_df.drop_duplicates(inplace=True)


# Final stats
print(f"\n\nCleaned Shape: {books_df.shape}")

# Save the cleaned DF
print("Storing cleaned dataset (this includes the same titles with different categories) as: " + CLEAN_DF)
books_df.to_csv(CLEAN_DF, index=False)

# ==== Now store the unique titles ====
books_df = books_df[["book_name", "summaries"]]
books_df.drop_duplicates(inplace=True)
print(f"\n\nDF w/ unique titles Shape: {books_df.shape}")
# Save the unique-titles DF
print("Storing dataset with unique titles & summaries only: " + CLEAN_DF_UNIQUE_TITLES)
books_df.to_csv(CLEAN_DF_UNIQUE_TITLES, index=False)
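
# Optional sanity check (a sketch, assuming pandas is importable here): reload
# the saved file and confirm no fully-duplicated rows survived the cleaning.
#   import pandas as pd
#   assert not pd.read_csv(CLEAN_DF_UNIQUE_TITLES).duplicated().any()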