# Spaces: Runtime error / Runtime error
# (status banner text that precedes the script; it is not part of the program)
import streamlit as st | |
import pandas as pd | |
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier | |
from sklearn.impute import SimpleImputer | |
from sklearn.preprocessing import LabelEncoder, OneHotEncoder | |
from sklearn.metrics import r2_score, accuracy_score | |
# Page title
st.title("Automatic Data Corrector / Quality Enhancer Tool")

# Upload widgets for the train and test CSV files
train_file = st.file_uploader("Upload Your Train Data", type=["csv"])
test_file = st.file_uploader("Upload Your Test Data", type=["csv"])

if train_file and test_file:
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)

    # Remove the Segmentation column from whichever frame contains it
    train_df = train_df.drop(columns=['Segmentation'], errors='ignore')
    test_df = test_df.drop(columns=['Segmentation'], errors='ignore')

    st.write("Train Data", train_df.head())
    st.write("Test Data", test_df.head())

    # Split the columns into numeric and categorical ones.  The test is
    # "is this dtype numeric?" rather than "is it not object?", so string and
    # category dtypes are treated as categorical instead of being handed to a
    # regressor.
    numeric_columns = []
    categorical_columns = []
    for col in train_df.columns:
        if pd.api.types.is_numeric_dtype(train_df[col]):
            numeric_columns.append(col)
        else:
            categorical_columns.append(col)

# Fill in the missing values
def _encode_features(features, reference_columns=None):
    """One-hot encode ``features`` into a purely numeric frame.

    With ``reference_columns=None`` the frame is treated as the fitting data:
    the first level of every categorical column is dropped and columns without
    a single observed value are removed.  Otherwise every level is encoded and
    the result is re-indexed to ``reference_columns`` so it has exactly the
    layout the model was fitted on (absent columns are filled with 0).
    """
    if reference_columns is None:
        encoded = pd.get_dummies(features, drop_first=True, dtype=float)
        # SimpleImputer discards columns that have no observed value, which
        # would make its output narrower than the column list; drop them here.
        return encoded.dropna(axis=1, how="all")
    # Do NOT drop the first level here: the baseline level must be the one
    # chosen on the fitting data, not whichever level sorts first in this
    # (possibly smaller) subset.
    encoded = pd.get_dummies(features, dtype=float)
    return encoded.reindex(columns=reference_columns, fill_value=0)


def fill_missing_values(train_df, test_df, column, model_type):
    """Fill missing values of ``column`` with random-forest predictions.

    A model is fitted on the rows of ``train_df`` where ``column`` is known,
    using all other columns as features, and is then used to predict the
    missing entries of ``column`` in both ``train_df`` and ``test_df``.

    Args:
        train_df: Training frame; must contain ``column``.  Modified in place.
        test_df: Test frame; may lack ``column``, in which case it is left
            untouched.  Modified in place otherwise.
        column: Name of the column to complete.
        model_type: ``"numeric"`` selects a regressor scored with R^2; any
            other value selects a classifier scored with accuracy.

    Returns:
        ``(train_df, test_df, performance)``.  ``performance`` is the score of
        the model on the rows it was fitted on (an in-sample figure), or
        ``None`` when no model was fitted: nothing is missing, the column has
        no known value in ``train_df``, or there are no usable features.
    """
    is_numeric = model_type == "numeric"

    train_missing = train_df[column].isnull()
    if column in test_df.columns:
        test_missing = test_df[column].isnull()
    else:
        # The test frame does not have this column, so there is nothing to fill.
        test_missing = pd.Series(False, index=test_df.index)

    known_rows = train_df[~train_missing]
    nothing_to_fill = not (train_missing.any() or test_missing.any())
    if nothing_to_fill or known_rows.empty:
        return train_df, test_df, None

    X_train = _encode_features(known_rows.drop(columns=[column]))
    if X_train.shape[1] == 0:
        # No feature survives encoding; a model cannot be fitted.
        return train_df, test_df, None

    # Remaining gaps in the features are filled before fitting.
    imputer = SimpleImputer(strategy="mean" if is_numeric else "most_frequent")
    X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)

    if is_numeric:
        model = RandomForestRegressor()
        label_encoder = None
        y_train = known_rows[column]
    else:
        model = RandomForestClassifier()
        label_encoder = LabelEncoder()
        y_train = label_encoder.fit_transform(known_rows[column].astype(str))

    model.fit(X_train, y_train)

    fitted = model.predict(X_train)
    if is_numeric:
        performance = r2_score(y_train, fitted)
    else:
        performance = accuracy_score(y_train, fitted)

    # Predict the missing entries of both frames with the same fitted pipeline.
    for frame, missing in ((train_df, train_missing), (test_df, test_missing)):
        if not missing.any():
            continue
        X_missing = _encode_features(
            frame[missing].drop(columns=[column]), X_train.columns
        )
        X_missing = pd.DataFrame(
            imputer.transform(X_missing), columns=X_missing.columns
        )
        predictions = model.predict(X_missing)
        if label_encoder is not None:
            predictions = label_encoder.inverse_transform(predictions)
        frame.loc[missing, column] = predictions

    return train_df, test_df, performance
# Impute every column and report the results.  This only makes sense once both
# files have been uploaded and loaded.
if train_file and test_file:
    performances = []

    # Numeric columns are processed first, then the categorical ones.
    column_groups = (
        ("numeric", numeric_columns),
        ("categorical", categorical_columns),
    )
    for kind, columns in column_groups:
        for column in columns:
            train_df, test_df, performance = fill_missing_values(
                train_df, test_df, column, kind
            )
            # None means no model was needed for this column.
            if performance is not None:
                performances.append((column, kind, performance))

    st.write("Test Data with Predictions", test_df)

    if performances:
        st.write("MODEL PERFORMANCE:")
        for column, model_type, performance in performances:
            message = (
                f"Column: {column} , Type: {model_type} , R² Score: {performance:.2f}"
                if model_type == "numeric"
                else f"Column: {column} , Type: {model_type} , Accuracy Score: {performance:.2f}"
            )
            st.write(message)