"""Streamlit app that fills missing values in uploaded train/test CSV files.

For every column of the train data, a random-forest model is fitted on the
rows where the column is known and used to predict the rows where it is
missing, in both the train and the test data.
"""

import pandas as pd
import streamlit as st
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, r2_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Fixed seed so the imputed values do not change on every Streamlit rerun.
RANDOM_STATE = 42

# Label shown next to the score of each model type in the report.
METRIC_LABELS = {"numeric": "R² Score", "categorical": "Accuracy Score"}


def _learn_categories(features):
    """Return ``{column: observed levels}`` for each non-numeric column."""
    return {
        name: features[name].dropna().unique()
        for name in features.columns
        if not pd.api.types.is_numeric_dtype(features[name])
    }


def _encode_features(features, categories):
    """One-hot encode *features* using the fixed level layout in *categories*.

    Pinning the levels guarantees that every frame encoded with the same
    *categories* gets identical dummy columns, and that ``drop_first`` drops
    the same level everywhere. Levels not listed become all-zero rows.
    """
    encoded = features.copy()
    for name, levels in categories.items():
        encoded[name] = pd.Categorical(encoded[name], categories=levels)
    return pd.get_dummies(encoded, drop_first=True).astype(float)


def fill_missing_values(train_df, test_df, column, model_type):
    """Predict and fill the missing values of *column* in both data frames.

    A random forest is trained on the train rows where *column* is known,
    using all other train columns as features, and then used to fill the
    missing entries of *column* in ``train_df`` and ``test_df``.

    Args:
        train_df: Training data; must contain *column*. Modified in place.
        test_df: Test data; filled only if it contains *column*. Modified
            in place.
        column: Name of the column to fill.
        model_type: ``"numeric"`` fits a regressor; any other value fits a
            classifier on the string form of the known values.

    Returns:
        ``(train_df, test_df, performance)``. ``performance`` is the R²
        (numeric) or accuracy (categorical) of the model **on the rows it
        was trained on**, so it is an optimistic, in-sample figure. It is
        ``None`` when nothing had to be filled or no model could be trained
        (no known values, or no usable feature columns).
    """
    train_missing = train_df[column].isna()
    if column in test_df.columns:
        test_missing = test_df[column].isna()
    else:
        # The test file may lack this column entirely; nothing to fill there.
        test_missing = pd.Series(False, index=test_df.index)

    known_rows = train_df.loc[~train_missing]
    nothing_to_fill = not (train_missing.any() or test_missing.any())
    if nothing_to_fill or known_rows.empty:
        return train_df, test_df, None

    feature_columns = [name for name in train_df.columns if name != column]
    categories = _learn_categories(known_rows[feature_columns])
    # Columns that are entirely NaN carry no signal and would be silently
    # dropped by SimpleImputer, so remove them up front.
    X_known = _encode_features(
        known_rows[feature_columns], categories
    ).dropna(axis="columns", how="all")
    if X_known.shape[1] == 0:
        return train_df, test_df, None

    if model_type == "numeric":
        model = RandomForestRegressor(random_state=RANDOM_STATE)
        label_encoder = None
        y_known = known_rows[column]
        impute_strategy = "mean"
    else:
        model = RandomForestClassifier(random_state=RANDOM_STATE)
        label_encoder = LabelEncoder()
        y_known = label_encoder.fit_transform(known_rows[column].astype(str))
        impute_strategy = "most_frequent"

    # Gaps in the *feature* columns are filled with a simple statistic.
    imputer = SimpleImputer(strategy=impute_strategy)
    known_matrix = imputer.fit_transform(X_known)
    model.fit(known_matrix, y_known)

    fitted = model.predict(known_matrix)
    if model_type == "numeric":
        performance = r2_score(y_known, fitted)
    else:
        performance = accuracy_score(y_known, fitted)

    for frame, missing in ((train_df, train_missing), (test_df, test_missing)):
        if not missing.any():
            continue
        features = frame.loc[missing].reindex(columns=feature_columns)
        encoded = _encode_features(features, categories).reindex(
            columns=X_known.columns, fill_value=0
        )
        predictions = model.predict(imputer.transform(encoded))
        if label_encoder is not None:
            predictions = label_encoder.inverse_transform(predictions)
        frame.loc[missing, column] = predictions

    return train_df, test_df, performance


# Streamlit page title
st.title("Automatic Data Corrector / Quality Enhancer Tool")

# Upload the train and test data
train_file = st.file_uploader("Upload Your Train Data", type=["csv"])
test_file = st.file_uploader("Upload Your Test Data", type=["csv"])

if train_file is not None and test_file is not None:
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)

    # Remove the Segmentation column when present
    train_df = train_df.drop(columns=["Segmentation"], errors="ignore")
    test_df = test_df.drop(columns=["Segmentation"], errors="ignore")

    st.write("Train Data", train_df.head())
    st.write("Test Data", test_df.head())

    # Split the columns into numeric and categorical, based on the train data
    numeric_columns = [
        col
        for col in train_df.columns
        if pd.api.types.is_numeric_dtype(train_df[col])
    ]
    categorical_columns = [
        col for col in train_df.columns if col not in numeric_columns
    ]

    # Fill numeric columns first, then categorical ones
    performances = []
    for model_type, columns in (
        ("numeric", numeric_columns),
        ("categorical", categorical_columns),
    ):
        for column in columns:
            train_df, test_df, performance = fill_missing_values(
                train_df, test_df, column, model_type
            )
            if performance is not None:
                performances.append((column, model_type, performance))

    st.write("Test Data with Predictions", test_df)

    if performances:
        st.write("MODEL PERFORMANCE:")
        for column, model_type, performance in performances:
            label = METRIC_LABELS[model_type]
            st.write(
                f"Column: {column} , Type: {model_type} , "
                f"{label}: {performance:.2f}"
            )