Spaces:
Runtime error
Runtime error
| # from cv2 import dft | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.impute import KNNImputer | |
| import streamlit as st | |
| # def remove_col(df ,i): | |
| # df.drop([i], axis = 1,inplace = True) | |
| # return df | |
| # def column_delete(df, column_name): | |
| # print("deleting the column: ", column_name) | |
| # # new_df = (df.drop['column_name'], axis=1) | |
| # del df[column_name] | |
| # df.head() | |
| # return df | |
| # def row_delete(df, row_number): | |
| # print("deleting the row number: ", row_number) | |
| # df.drop(df.index[row_number]) | |
| # df.head() | |
| # return df | |
| # def mean_fill(df,column_name): | |
| # mean_value=df[column_name].mean() | |
| # filled = df[column_name].fillna(value=mean_value, inplace=True) | |
| # return filled | |
| # def median_fill(df,column_name): | |
| # median_value=df[column_name].median() | |
| # filled = df[column_name].fillna(value=median_value, inplace=True) | |
| # return filled | |
| # def random_fill(df): | |
| # for i in df.columns: | |
| # df[i+"_imputed"] = df[i] | |
| # df[i+"_imputed"][df[i+"_imputed"].isnull()] = df[i].dropna().sample(df[i].isnull().sum()).values | |
| # def EndDistribution(df, column_name): | |
| # mean = df[column_name].mean() | |
| # std = df[column_name].std() | |
| # #calculating extreme standard deviation | |
| # extreme = (mean + (3*std)) | |
| # df[column_name+'_median'] = df[column_name].fillna(df[column_name].median()) | |
| # df[column_name+'_end_distribution'] = df[column_name].fillna(extreme) | |
| # return df | |
| # #knn imputer | |
| # def impute_knn(df): | |
| # ''' | |
| # function for knn imputation in missing values in the data | |
| # df - dataset provided by the users | |
| # ''' | |
| # from sklearn.impute import KNNImputer | |
| # imputer =KNNImputer(n_neighbors=5) | |
| # #finding only numeric columns | |
| # cols_num = df.select_dtypes(include=np.number).columns | |
| # for feature in df.columns: | |
| # #for numeric type | |
| # if feature in cols_num: | |
| # df[feature] = pd.DataFrame(imputer.fit_transform(np.array(df[feature]).reshape(-1, 1))) | |
| # else: | |
| # #for categorical type | |
| # df[feature] = df[feature].fillna(df[feature].mode().iloc[0]) | |
| # return df | |
| # #Z score capping | |
| # def zScore(df): | |
| # cols_num = df.select_dtypes(include=np.number).columns | |
| # for i in cols_num: | |
| # max_threshold = df[i].mean() + 3*df[i].std() | |
| # min_threshold = df[i].mean() - 3*df[i].std() | |
| # # df = df[(df['cgpa'] > 8.80) | (df['cgpa'] < 5.11)] | |
| # df[i] = np.where( | |
| # df[i]>max_threshold, | |
| # max_threshold, | |
| # np.where( | |
| # df[i]<min_threshold, | |
| # min_threshold, | |
| # df[i] | |
| # ) | |
| # ) | |
| # return df | |
| # # zscore trimming | |
| # def zScore_trim(df): | |
| # cols_num = df.select_dtypes(include=np.number).columns | |
| # for i in cols_num: | |
| # max_threshold = df[i].mean() + 3*df[i].std() | |
| # min_threshold = df[i].mean() - 3*df[i].std() | |
| # df = df[(df[i] < max_threshold) | (df[i] > min_threshold)] | |
| # return df | |
| # # Outlier using Percentile | |
| # # trimming | |
| # def percentile_trimming(df): | |
| # cols_num = df.select_dtypes(include=np.number).columns | |
| # for i in cols_num: | |
| # percentile25 = df[i].quantile(0.25) | |
| # percentile75 = df[i].quantile(0.75) | |
| # iqr = percentile75 - percentile25 | |
| # max_threshold = percentile75 + 3*iqr | |
| # min_threshold = percentile25 - 3*iqr | |
| # df = df[(df[i] < max_threshold) | (df[i] > min_threshold)] | |
| # return df | |
| # #capping | |
| # def percentile_capping(df): | |
| # cols_num = df.select_dtypes(include=np.number).columns | |
| # for i in cols_num: | |
| # percentile25 = df[i].quantile(0.25) | |
| # percentile75 = df[i].quantile(0.75) | |
| # iqr = percentile75 - percentile25 | |
| # max_threshold = percentile75 + 3*iqr | |
| # min_threshold = percentile25 - 3*iqr | |
| # df[i] = np.where( | |
| # df[i]>max_threshold, | |
| # max_threshold, | |
| # np.where( | |
| # df[i]<min_threshold, | |
| # min_threshold, | |
| # df[i] | |
| # ) | |
| # ) | |
| # return df | |
| # # Function to find date column in dataframe and convert it to datetime format | |
| # def convert_date(df): | |
| # ''' | |
| # function parameter : dataframe | |
| # parameter datatype : pandas.core.frame.DataFrame | |
| # function returns : dataframe | |
| # return datatype : pandas.core.frame.DataFrame | |
| # function definition : takes dataframe as input and finds the date columns in the dataframe. | |
| # if found, converts the column to datetime format. | |
| # ''' | |
| # df = df.apply(lambda col: pd.to_datetime(col, errors='ignore') if col.dtypes == object else col, axis=0) | |
| # return df | |
| # # Function to find price column in dataframe | |
| # def price_column(df): | |
| # ''' | |
| # function parameter : dataframe | |
| # parameter datatype : pandas.core.frame.DataFrame | |
| # function returns : dataframe | |
| # return datatype : pandas.core.frame.DataFrame | |
| # function definition : takes dataframe as input and finds the price related columns in the dataframe. | |
| # if found, renames the column to price_1. | |
| # ''' | |
| # numeric_cols = [col for col in df.columns if df[col].dtype in ['int64', 'float64']] | |
| # price_cols = [col for col in numeric_cols if col.lower().find('price') != -1 or col.lower().find('cost') != -1 or | |
| # col.lower().find('total') != -1 or col.lower().find('amount') != -1 or col.lower().find('revenue') != -1 or | |
| # col.lower().find('profit') != -1 or col.lower().find('margin') != -1 or col.lower().find('sales') != -1] | |
| # if len(price_cols) > 1: | |
| # for i in range(len(price_cols)): | |
| # df.rename(columns={price_cols[i]: 'price_'+str(i+1)}, inplace=True) | |
| # elif len(price_cols) == 1: | |
| # df.rename(columns={price_cols[0]: 'price'}, inplace=True) | |
| # return df | |
| # def data_cleaning(df): | |
| # import pandas as pd | |
| # import numpy as np | |
| # from sklearn.impute import KNNImputer | |
| # pd.set_option('display.max_rows', 100) | |
| # for i in df.columns: | |
| # if ((df[i].isna().sum())/df.shape[0]) > 0.95: | |
| # df = remove_col(df,i) | |
| # else: | |
| # df = df.copy() | |
| # df = impute_knn(df) | |
| # return df | |
| # class missing_df: | |
| # def __init__(self, df): | |
| # self.df = df | |
| # print(self.df) | |
| #functions for handling missing values | |
class missing_df:
    """Clean a CSV dataset and fill in its missing values.

    An instance only stores the ``dataset`` it is given. The cleaning itself
    is done by :meth:`handle_missing_value`, which reads no instance state and
    can therefore be called on the class or on an instance.
    """

    # CSV file read when no explicit path is supplied.
    DEFAULT_CSV_PATH = "temp_data/test.csv"
    # Columns whose missing ratio (rounded to 4 decimals) exceeds this are dropped.
    MAX_MISSING_RATIO = 0.9
    # Number of neighbours used by the KNN imputer.
    KNN_NEIGHBORS = 3

    def __init__(self, dataset):
        self.dataset = dataset

    @staticmethod
    def handle_missing_value(csv_path=DEFAULT_CSV_PATH, display=True):
        """Read a CSV file, clean it and impute its missing values.

        Steps, in order:

        1. drop columns whose name starts with ``Unnamed`` and columns with
           more than 90% missing values;
        2. convert text columns to integers or datetimes when every value
           converts cleanly;
        3. impute numeric gaps with a KNN imputer and fill the remaining
           columns with their most frequent value.

        Args:
            csv_path: Path of the CSV file to load.
            display: When true, render the cleaned frame with ``st.write``.

        Returns:
            The cleaned ``pandas.DataFrame``.
        """
        df = pd.read_csv(csv_path)

        missing_count = df.isnull().sum().sum()
        if missing_count != 0:
            print(f"Found total of {missing_count} missing values.")

        df = missing_df._drop_unusable_columns(df)
        df = missing_df._convert_text_columns(df)
        df = missing_df._impute(df)

        if display:
            st.write(df)
        return df

    @staticmethod
    def _drop_unusable_columns(df):
        """Return a copy without ``Unnamed*`` columns and mostly-empty columns."""
        # str() keeps this safe for non-string column labels.
        named = [not str(label).startswith("Unnamed") for label in df.columns]
        df = df.loc[:, named]

        missing_ratio = df.isna().mean().round(4)
        too_sparse = missing_ratio[missing_ratio > missing_df.MAX_MISSING_RATIO]
        # copy() so later column assignments never write into a slice.
        return df.drop(columns=too_sparse.index).copy()

    @staticmethod
    def _convert_text_columns(df):
        """Convert text columns to int or datetime where every value allows it."""
        for label in df.columns:
            column = df[label]
            # Only text columns are candidates: casting a float column to int
            # would silently truncate its fractional part.
            is_text = (
                pd.api.types.is_object_dtype(column.dtype)
                or pd.api.types.is_string_dtype(column.dtype)
            )
            if is_text:
                df[label] = missing_df._convert_text_column(column)
        return df

    @staticmethod
    def _convert_text_column(column):
        """Return ``column`` as int, else as datetime, else unchanged."""
        for convert in (lambda values: values.astype("int"), pd.to_datetime):
            try:
                return convert(column)
            except (ValueError, TypeError, OverflowError):
                # Best effort: this conversion does not fit, try the next one.
                continue
        return column

    @staticmethod
    def _impute(df):
        """Fill numeric gaps with KNN and other gaps with the column mode."""
        numeric_cols = df.select_dtypes(include=np.number).columns
        numeric_gaps = [label for label in numeric_cols if df[label].isna().any()]
        if numeric_gaps:
            # Fit on all numeric columns together so neighbours are found via
            # the other features; a single column offers no distance to use.
            imputer = KNNImputer(n_neighbors=missing_df.KNN_NEIGHBORS)
            filled = pd.DataFrame(
                imputer.fit_transform(df[numeric_cols]),
                columns=numeric_cols,
                index=df.index,
            )
            # Write back only the columns that had gaps, so complete integer
            # columns keep their dtype.
            df[numeric_gaps] = filled[numeric_gaps]

        for label in df.columns:
            if label in numeric_cols:
                continue
            column = df[label]
            if not column.isna().any():
                continue
            modes = column.mode()
            if modes.empty:
                # Nothing observed in this column, so there is no mode to use.
                continue
            df[label] = column.fillna(modes.iloc[0])
        return df


# NOTE(review): module-level alias so a plain ``handle_missing_value()`` call
# also works if callers import the function directly; confirm against callers
# and drop if unused.
handle_missing_value = missing_df.handle_missing_value