# clean_yelp_data.py
import json
import logging
import os
import re
import warnings
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from loguru import logger
from scipy.stats import entropy
from textblob import TextBlob


class DataCleaner:
    """Drop unneeded columns from a DataFrame and save the result as CSV.

    The DataFrame passed in is modified in place; no copy is taken.
    """

    # Columns removed by ``dropping_unncessary_columns``. Kept in one place so
    # the list can be reviewed (or overridden in a subclass) without touching
    # the method body.
    COLUMNS_TO_DROP: Tuple[str, ...] = (
        "review_text",
        "review_date",
        "business_name",
        "address",
        "city",
        "state",
        "postal_code",
        "categories",
        "user_name",
        "yelping_since",
        "checkin_date",
        "review_useful",
        "review_funny",
        "review_cool",
        "user_useful",
        "user_funny",
        "user_cool",
        "is_open",
        "compliment_hot",
        "compliment_more",
        "compliment_profile",
        "compliment_cute",
        "compliment_list",
        "compliment_note",
        "compliment_plain",
        "compliment_cool",
        "compliment_funny",
        "compliment_writer",
        "compliment_photos",
    )

    def __init__(
        self,
        df: pd.DataFrame,
        output_path: Union[str, os.PathLike],
        filename: str = "preprocessed_cleaned.csv",
    ) -> None:
        """Store the frame to clean and the destination of the CSV output.

        Args:
            df: DataFrame to clean. It is mutated in place by the pipeline.
            output_path: Directory the CSV is written to; created on save if
                it does not exist.
            filename: Name of the CSV file inside ``output_path``.
        """
        self.df = df
        self.output_path = output_path
        self.filename = filename

    def saving_cleaned_preprocess(self) -> None:
        """Write ``self.df`` to ``<output_path>/<filename>`` without the index.

        The output directory (including missing parents) is created first.
        """
        out_dir = Path(self.output_path)
        out_dir.mkdir(parents=True, exist_ok=True)
        output_file = out_dir / self.filename
        self.df.to_csv(output_file, index=False)
        # Log only after the write has succeeded, so a failed save is never
        # reported as "saved".
        logger.info(f"Files saved in directory {output_file} as : { self.filename}")

    def dropping_unncessary_columns(self) -> None:
        """Remove every column in ``COLUMNS_TO_DROP`` from ``self.df`` in place.

        Raises:
            KeyError: If any listed column is absent. All columns are dropped
                in a single call, so the frame is left untouched in that case
                rather than partially cleaned.
        """
        self.df.drop(columns=list(self.COLUMNS_TO_DROP), inplace=True)

    def run_pipeline(self) -> None:
        """Drop the unneeded columns, report remaining nulls, and save the CSV."""
        logger.info("Dropping Unnecessary Columns")
        self.dropping_unncessary_columns()

        logger.info("Checking Again for NULL values in Columns")
        # One pass over the frame instead of recomputing the count per column.
        null_counts = self.df.isnull().sum()
        for col, count in null_counts.items():
            if count > 0:
                print(f" {col} has {count} null values")

        logger.info("Saving Cleaned and Preprocessed Data")
        self.saving_cleaned_preprocess()