# from cv2 import dft
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
import streamlit as st
# def remove_col(df, i):
#     df.drop([i], axis=1, inplace=True)
#     return df
# def column_delete(df, column_name):
#     print("deleting the column: ", column_name)
#     # new_df = (df.drop['column_name'], axis=1)
#     del df[column_name]
#     df.head()
#     return df
# def row_delete(df, row_number):
#     print("deleting the row number: ", row_number)
#     df = df.drop(df.index[row_number])  # drop() returns a copy, so keep the result
#     df.head()
#     return df
# def mean_fill(df, column_name):
#     mean_value = df[column_name].mean()
#     # fillna(..., inplace=True) returns None, so return the filled column instead
#     filled = df[column_name].fillna(value=mean_value)
#     return filled
# def median_fill(df, column_name):
#     median_value = df[column_name].median()
#     filled = df[column_name].fillna(value=median_value)
#     return filled
# def random_fill(df):
#     for i in df.columns:
#         df[i + "_imputed"] = df[i]
#         # use .loc to avoid chained assignment; fill gaps with values sampled from the observed data
#         missing = df[i].isnull()
#         df.loc[missing, i + "_imputed"] = df[i].dropna().sample(missing.sum()).values
#     return df
# def EndDistribution(df, column_name):
#     mean = df[column_name].mean()
#     std = df[column_name].std()
#     # calculating the extreme value at 3 standard deviations above the mean
#     extreme = mean + (3*std)
#     df[column_name+'_median'] = df[column_name].fillna(df[column_name].median())
#     df[column_name+'_end_distribution'] = df[column_name].fillna(extreme)
#     return df
# # knn imputer
# def impute_knn(df):
#     '''
#     function for KNN imputation of missing values in the data
#     df - dataset provided by the user
#     '''
#     from sklearn.impute import KNNImputer
#     imputer = KNNImputer(n_neighbors=5)
#     # finding only numeric columns
#     cols_num = df.select_dtypes(include=np.number).columns
#     # impute all numeric columns together so KNN can use the other features as neighbours
#     if len(cols_num) > 0:
#         df[cols_num] = imputer.fit_transform(df[cols_num])
#     for feature in df.columns:
#         # for categorical type, fall back to the mode
#         if feature not in cols_num:
#             df[feature] = df[feature].fillna(df[feature].mode().iloc[0])
#     return df
# # Z score capping
# def zScore(df):
#     cols_num = df.select_dtypes(include=np.number).columns
#     for i in cols_num:
#         max_threshold = df[i].mean() + 3*df[i].std()
#         min_threshold = df[i].mean() - 3*df[i].std()
#         # df = df[(df['cgpa'] > 8.80) | (df['cgpa'] < 5.11)]
#         df[i] = np.where(
#             df[i] > max_threshold,
#             max_threshold,
#             np.where(
#                 df[i] < min_threshold,
#                 min_threshold,
#                 df[i]
#             )
#         )
#     return df
# # zscore trimming
# def zScore_trim(df):
#     cols_num = df.select_dtypes(include=np.number).columns
#     for i in cols_num:
#         max_threshold = df[i].mean() + 3*df[i].std()
#         min_threshold = df[i].mean() - 3*df[i].std()
#         # keep only rows inside both thresholds (| would keep every row)
#         df = df[(df[i] < max_threshold) & (df[i] > min_threshold)]
#     return df
# # Outlier handling using percentiles (IQR)
# # trimming
# def percentile_trimming(df):
#     cols_num = df.select_dtypes(include=np.number).columns
#     for i in cols_num:
#         percentile25 = df[i].quantile(0.25)
#         percentile75 = df[i].quantile(0.75)
#         iqr = percentile75 - percentile25
#         max_threshold = percentile75 + 3*iqr
#         min_threshold = percentile25 - 3*iqr
#         # keep only rows inside both thresholds (| would keep every row)
#         df = df[(df[i] < max_threshold) & (df[i] > min_threshold)]
#     return df
# # capping
# def percentile_capping(df):
#     cols_num = df.select_dtypes(include=np.number).columns
#     for i in cols_num:
#         percentile25 = df[i].quantile(0.25)
#         percentile75 = df[i].quantile(0.75)
#         iqr = percentile75 - percentile25
#         max_threshold = percentile75 + 3*iqr
#         min_threshold = percentile25 - 3*iqr
#         df[i] = np.where(
#             df[i] > max_threshold,
#             max_threshold,
#             np.where(
#                 df[i] < min_threshold,
#                 min_threshold,
#                 df[i]
#             )
#         )
#     return df
# # Function to find date columns in a dataframe and convert them to datetime format
# def convert_date(df):
#     '''
#     function parameter : dataframe
#     parameter datatype : pandas.core.frame.DataFrame
#     function returns : dataframe
#     return datatype : pandas.core.frame.DataFrame
#     function definition : takes a dataframe as input and finds the date columns in the dataframe.
#                           if found, converts the columns to datetime format.
#     '''
#     df = df.apply(lambda col: pd.to_datetime(col, errors='ignore') if col.dtypes == object else col, axis=0)
#     return df
# # Function to find price-related columns in a dataframe
# def price_column(df):
#     '''
#     function parameter : dataframe
#     parameter datatype : pandas.core.frame.DataFrame
#     function returns : dataframe
#     return datatype : pandas.core.frame.DataFrame
#     function definition : takes a dataframe as input and finds the price-related columns in the dataframe.
#                           if found, renames them to price_1, price_2, ... (or just price if only one is found).
#     '''
#     numeric_cols = [col for col in df.columns if df[col].dtype in ['int64', 'float64']]
#     keywords = ('price', 'cost', 'total', 'amount', 'revenue', 'profit', 'margin', 'sales')
#     price_cols = [col for col in numeric_cols if any(k in col.lower() for k in keywords)]
#     if len(price_cols) > 1:
#         for i in range(len(price_cols)):
#             df.rename(columns={price_cols[i]: 'price_' + str(i+1)}, inplace=True)
#     elif len(price_cols) == 1:
#         df.rename(columns={price_cols[0]: 'price'}, inplace=True)
#     return df
# def data_cleaning(df):
#     import pandas as pd
#     import numpy as np
#     from sklearn.impute import KNNImputer
#     pd.set_option('display.max_rows', 100)
#     # drop columns that are more than 95% missing, then impute the rest
#     for i in df.columns:
#         if (df[i].isna().sum() / df.shape[0]) > 0.95:
#             df = remove_col(df, i)
#         else:
#             df = df.copy()
#     df = impute_knn(df)
#     return df
# class missing_df:
#     def __init__(self, df):
#         self.df = df
#         print(self.df)
# functions for handling missing values
class missing_df:
    def __init__(self, dataset):
        self.dataset = dataset

    def handle_missing_value(self):
        df = pd.read_csv("temp_data/test.csv")
        missing_count = df.isnull().sum().sum()
        if missing_count != 0:
            print(f"Found total of {missing_count} missing values.")
            # remove columns whose names start with Unnamed
            df = df.loc[:, ~df.columns.str.startswith('Unnamed')]
            # drop columns having more than 90% missing values
            for i in df.columns.to_list():
                if df[i].isna().mean().round(4) > 0.9:
                    df = df.drop(i, axis=1)
            # convert object dtype columns to integer where all values are integers
            for j in df.columns.values.tolist():
                try:
                    df[j] = df[j].astype('int')
                except (ValueError, TypeError):
                    pass
            # find date columns in the dataframe and convert them to datetime format
            try:
                df = df.apply(lambda col: pd.to_datetime(col, errors='ignore') if col.dtypes == object else col, axis=0)
            except Exception:
                pass
            # impute missing values
            imputer = KNNImputer(n_neighbors=3)
            # numeric columns are imputed together so KNN can use the other features as neighbours
            cols_num = df.select_dtypes(include=np.number).columns
            if len(cols_num) > 0:
                df[cols_num] = imputer.fit_transform(df[cols_num])
            for feature in df.columns:
                # for categorical type, fall back to the most frequent value
                if feature not in cols_num:
                    df[feature] = df[feature].fillna(df[feature].mode().iloc[0])
            # def add_binary_col(df):
            #     """
            #     Function to add binary columns which tell whether the data was missing or not
            #     """
            #     for label, content in df.items():
            #         if pd.isnull(content).sum():
            #             df["ismissing_" + label] = pd.isnull(content)
            #     return df
        # display and return the dataframe even when no missing values were found
        st.write(df)
        return df
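

# A minimal usage sketch, not part of the app's original flow: it assumes the surrounding
# Streamlit app saves the uploaded file to temp_data/test.csv before calling this class,
# which is what the hard-coded path above suggests. The toy DataFrame below is purely
# illustrative; run it with plain python (st.write only warns outside a Streamlit session)
# or via `streamlit run`.
if __name__ == "__main__":
    import os

    os.makedirs("temp_data", exist_ok=True)
    demo = pd.DataFrame({
        "age": [25, np.nan, 31, 40, np.nan],
        "salary": [50000, 62000, np.nan, 58000, 61000],
        "city": ["Kathmandu", "Pokhara", None, "Kathmandu", "Lalitpur"],
    })
    demo.to_csv("temp_data/test.csv", index=False)

    # numeric gaps are filled by KNNImputer, the categorical gap by the column mode
    cleaned = missing_df(demo).handle_missing_value()
    print(cleaned)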