import os
import io

import pandas as pd


def read_file(file_path):
    """
    Read a file from a given path into a pandas DataFrame.

    Supports CSV, JSON, and Excel (xls/xlsx) files up to 200MB.
    """
    # Check the size of the file
    if os.path.getsize(file_path) > 200 * 1024 * 1024:  # 200MB in bytes
        raise ValueError("File is too large (limit: 200MB)")

    # Extract the file extension (case-insensitive)
    file_extension = file_path.split('.')[-1].lower()
    if file_extension == 'csv':
        # Read CSV file
        return pd.read_csv(file_path)
    elif file_extension == 'json':
        # Read JSON file
        return pd.read_json(file_path)
    elif file_extension in ['xls', 'xlsx']:
        # Read Excel file
        return pd.read_excel(file_path, engine='openpyxl')
    else:
        raise ValueError("Unsupported file format: " + file_extension)


def read_file_from_streamlit(uploaded_file):
    """
    Read an uploaded Streamlit file into a pandas DataFrame.

    Supports CSV, JSON, and Excel (xls/xlsx) uploads up to 200MB.
    """
    # Check the size of the file
    if uploaded_file.size > 200 * 1024 * 1024:  # 200MB in bytes
        raise ValueError("File is too large (limit: 200MB)")

    # Extract the file extension (case-insensitive)
    file_extension = uploaded_file.name.split('.')[-1].lower()
    if file_extension == 'csv':
        # Read CSV file
        return pd.read_csv(uploaded_file)
    elif file_extension == 'json':
        # Read JSON file
        return pd.read_json(uploaded_file)
    elif file_extension in ['xls', 'xlsx']:
        # Read Excel file
        # Use io.BytesIO to handle the binary stream
        return pd.read_excel(io.BytesIO(uploaded_file.read()), engine='openpyxl')
    else:
        raise ValueError("Unsupported file format: " + file_extension)


def select_Y(df, Y_name):
    """
    Select the target variable from the DataFrame and return the feature matrix X and target Y.
    Returns -1 if the target column is not found.
    """
    if Y_name in df.columns:
        X = df.drop(Y_name, axis=1)
        Y = df[Y_name]
        return X, Y
    else:
        return -1


def check_all_columns_numeric(df):
    """
    Check if all columns in a DataFrame are numeric. Return True if so, False otherwise.
    """
    # 'number' covers all numeric dtypes (e.g. int32/float32), unlike [int, float]
    return df.select_dtypes(include='number').shape[1] == df.shape[1]


def non_numeric_columns_and_head(df, num_rows=20):
    """
    Identify non-numeric columns in a DataFrame and return their names and head.

    :param df: Pandas DataFrame to be examined.
    :param num_rows: Number of rows to include in the head (default is 20).
    :return: A tuple with two elements:
        1. List of column names that are not numeric (integer or float).
        2. CSV string containing the head of the non-numeric columns.
    """
    # Identify columns that are not of numeric data type
    non_numeric_cols = [col for col in df.columns if not pd.api.types.is_numeric_dtype(df[col])]

    # Get the head of the non-numeric columns
    non_numeric_head = df[non_numeric_cols].head(num_rows).to_csv()
    return non_numeric_cols, non_numeric_head


def contain_null_attributes_info(df):
    """
    Identify columns with missing values, summarize their statistics, and report their data types.

    This function checks for attributes within a DataFrame that contain null values,
    generates descriptive statistics for these attributes, and compiles information about their data types.

    :param df: A pandas DataFrame to be analyzed.
    :return: A tuple containing:
        - A list of columns that contain null values.
        - A string representation of data types for these columns.
        - A CSV-formatted string with descriptive statistics (count, mean, median, and standard deviation) for these columns.
        Returns an empty list, -1, and -1 if no columns with null values are found.
    """
    attributes = df.columns[df.isnull().any()].tolist()
    if not attributes:
        return [], -1, -1

    description_info = df[attributes].describe(percentiles=[.5])
    description_info = description_info.loc[['count', 'mean', '50%', 'std']].round(2).to_csv()

    dtypes_df = df[attributes].dtypes
    types_info = "\n".join([f"{index}:{dtype}" for index, dtype in dtypes_df.items()])

    return attributes, types_info, description_info
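

# Illustrative usage sketch of the two inspection helpers above (added for clarity;
# the DataFrame and column names are hypothetical and not part of the original module):
#
#   df = pd.DataFrame({"age": [21, None, 35], "city": ["NY", None, "LA"]})
#   non_numeric_cols, non_numeric_head = non_numeric_columns_and_head(df)
#   # non_numeric_cols -> ['city']
#   attributes, types_info, description_info = contain_null_attributes_info(df)
#   # attributes -> ['age', 'city']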


def attribute_info(df):
    """
    Obtain the attributes, types, and head information of the DataFrame.
    """
    attributes = df.columns.tolist()
    dtypes_df = df.dtypes
    types_info = "\n".join([f"{index}:{dtype}" for index, dtype in dtypes_df.items()])
    head_info = df.head(10).to_csv()
    return attributes, types_info, head_info


def get_data_overview(df):
    """
    Obtain the shape, head, nunique, and description information of the DataFrame.
    """
    shape_info = str(df.shape)
    head_info = df.head().to_csv()
    nunique_info = df.nunique().to_csv()
    description_info = df.describe(include='all').to_csv()
    return shape_info, head_info, nunique_info, description_info


def get_balance_info(df, Y_name):
    """
    Obtain the shape, description, and balance information of the DataFrame.
    """
    shape_info = df.shape
    description_info = df.describe().to_csv()
    balance_info = df[Y_name].value_counts().to_dict()
    return shape_info, description_info, balance_info


def separate_decode_list(decided_dict, Y_name):
    """
    Process the LLM response and return the lists of columns to be integer-encoded,
    one-hot encoded, or dropped.
    """
    convert_int_cols = [key for key, value in decided_dict.items() if value == 1]
    one_hot_cols = [key for key, value in decided_dict.items() if value == 2]
    drop_cols = [key for key, value in decided_dict.items() if value == 3]

    # The target column is never one-hot encoded or dropped; force integer encoding instead
    if Y_name and Y_name in one_hot_cols:
        one_hot_cols.remove(Y_name)
        convert_int_cols.append(Y_name)
    if Y_name and Y_name in drop_cols:
        drop_cols.remove(Y_name)
        convert_int_cols.append(Y_name)
    return convert_int_cols, one_hot_cols, drop_cols


def separate_fill_null_list(fill_null_dict):
    """
    Process the LLM response and return the lists of columns to be filled with the mean,
    median, mode, a new category, or interpolation.
    """
    mean_list = [key for key, value in fill_null_dict.items() if value == 1]
    median_list = [key for key, value in fill_null_dict.items() if value == 2]
    mode_list = [key for key, value in fill_null_dict.items() if value == 3]
    new_category_list = [key for key, value in fill_null_dict.items() if value == 4]
    interpolation_list = [key for key, value in fill_null_dict.items() if value == 5]
    return mean_list, median_list, mode_list, new_category_list, interpolation_list
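

# Illustrative sketch of the numeric codes the two helpers above expect from the
# LLM response (hypothetical column names, added for demonstration only):
#
#   decided_dict = {"gender": 1, "city": 2, "user_id": 3}   # 1=integer, 2=one-hot, 3=drop
#   separate_decode_list(decided_dict, "gender")             # -> (['gender'], ['city'], ['user_id'])
#
#   fill_null_dict = {"age": 1, "income": 2, "city": 3}      # 1=mean, 2=median, 3=mode,
#   separate_fill_null_list(fill_null_dict)                  # 4=new category, 5=interpolation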


def get_selected_models(model_dict):
    """
    Convert the dictionary of models to a list.
    """
    return list(model_dict.values())


def get_model_name(model_no):
    """
    Returns the name of the classification model based on the model number.
    """
    if model_no == 1:
        return "Logistic Regression"
    elif model_no == 2:
        return "SVM"
    elif model_no == 3:
        return "Naive Bayes"
    elif model_no == 4:
        return "Random Forest"
    elif model_no == 5:
        return "AdaBoost"
    elif model_no == 6:
        return "XGBoost"
    elif model_no == 7:
        return "Gradient Boosting"


def get_cluster_method_name(method):
    """
    Returns the name of the clustering method based on the method number.
    """
    if method == 1:
        return "K-Means"
    elif method == 2:
        return "DBSCAN"
    elif method == 3:
        return "Gaussian Mixture"
    elif method == 4:
        return "Agglomerative Clustering"
    elif method == 5:
        return "Spectral Clustering"


def get_balance_method_name(method):
    """
    Returns the name of the balance method based on the method number.
    """
    if method == 1:
        return "ROS"
    elif method == 2:
        return "SMOTE"
    elif method == 3:
        return "ADASYN"
    elif method == 4:
        return "None"


def get_regression_method_name(method):
    """
    Returns the name of the regression method based on the method number.
    """
    if method == 1:
        return "Linear Regression"
    elif method == 2:
        return "Ridge Regression"
    elif method == 3:
        return "Lasso Regression"
    elif method == 4:
        return "Random Forest"
    elif method == 5:
        return "Gradient Boosting"
    elif method == 6:
        return "Elastic Net"


def count_unique(df, Y):
    """
    Counts the number of unique values in a specified column of a DataFrame.
    """
    return df[Y].nunique()
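

if __name__ == "__main__":
    # Minimal smoke test, added for illustration only (not part of the original module).
    # It builds a small hypothetical DataFrame and exercises a few of the helpers above.
    demo_df = pd.DataFrame({
        "age": [25, 32, None, 41],
        "salary": [50000, 64000, 58000, None],
        "city": ["NY", "LA", None, "SF"],
        "label": [0, 1, 0, 1],
    })
    X, Y = select_Y(demo_df, "label")
    print("Feature columns:", list(X.columns))
    print("All columns numeric:", check_all_columns_numeric(demo_df))
    print("Non-numeric columns:", non_numeric_columns_and_head(demo_df)[0])
    print("Columns with nulls:", contain_null_attributes_info(demo_df)[0])
    print("Unique labels:", count_unique(demo_df, "label"))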