Spaces:

EmreEgilmez
/

data_corrector

Runtime error

File size: 4,968 Bytes

a9b9800
 
ceb81ef
a9b9800
ceb81ef
56ebeed
a9b9800
50658f6
4e29053
a9b9800
50658f6
4e29053
 
a9b9800
50658f6
 
 
a9b9800
fcd54cb
 
 
 
 
 
50658f6
 
a9b9800
fcd54cb
 
 
 
 
 
 
 
 
a9b9800
fcd54cb
56ebeed
fcd54cb
 
ceb81ef
fcd54cb
 
 
 
ceb81ef
fcd54cb
 
 
 
a9b9800
ceb81ef
fcd54cb
 
a9b9800
50658f6
 
a9b9800
50658f6
fcd54cb
0a68ad0
 
a9b9800
fcd54cb
50658f6
fcd54cb
 
 
 
a9b9800
fcd54cb
 
 
56ebeed
 
 
 
 
 
fcd54cb
 
 
56ebeed
fcd54cb
 
 
0a68ad0
50658f6
a9b9800
fcd54cb
 
 
 
 
56ebeed
 
 
fcd54cb
 
56ebeed
 
 
fcd54cb
 
56ebeed
 
 
a9b9800
fcd54cb
56ebeed
 
4e29053
56ebeed
 
4e29053
56ebeed
4e29053

import streamlit as st
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import r2_score, accuracy_score

# Page title
st.title("Automatic Data Corrector / Quality Enhancer Tool")

# Upload widgets for the train and test CSV files
train_file = st.file_uploader("Upload Your Train Data", type=["csv"])
test_file = st.file_uploader("Upload Your Test Data", type=["csv"])

# Everything below only runs once both files have been uploaded.
if train_file and test_file:
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)

    # Remove the 'Segmentation' column from both frames when it is present
    train_df = train_df.drop(columns=['Segmentation'], errors='ignore')
    test_df = test_df.drop(columns=['Segmentation'], errors='ignore')

    st.write("Train Data", train_df.head())
    st.write("Test Data", test_df.head())

    # Split the train columns into numeric and categorical ones.
    # The dtype is inspected with is_numeric_dtype rather than compared to
    # 'object': text columns are not always stored as 'object' (e.g. the
    # pandas string dtype or 'category'), and such columns must not be
    # handed to the regressor as if they were numbers.
    numeric_flags = {
        col: pd.api.types.is_numeric_dtype(train_df[col])
        for col in train_df.columns
    }
    categorical_columns = [col for col, numeric in numeric_flags.items() if not numeric]
    numeric_columns = [col for col, numeric in numeric_flags.items() if numeric]
    # Eksik değerleri doldurma
    def fill_missing_values(train_df, test_df, column, model_type):
        """Fill the missing values of ``column`` using a random-forest model.

        A model is fitted on the training rows where ``column`` is present,
        using every other column as a feature (text features are one-hot
        encoded, remaining gaps in the features are filled by a
        ``SimpleImputer``). Its predictions are then written into the rows of
        ``train_df`` and ``test_df`` where ``column`` is missing.

        Args:
            train_df: Training frame; modified in place and also returned.
            test_df: Test frame; modified in place and also returned. It may
                lack ``column``, in which case it is left untouched.
            column: Name of the column to impute.
            model_type: ``"numeric"`` selects a ``RandomForestRegressor``
                scored with R²; any other value (normally ``"categorical"``)
                selects a ``RandomForestClassifier`` scored with accuracy.

        Returns:
            ``(train_df, test_df, performance)``. ``performance`` is the score
            of the model on the very rows it was fitted on (an in-sample
            score, so it is optimistic), or ``None`` when no model was fitted
            because nothing is missing or there is nothing to learn from.
        """
        is_numeric = model_type == "numeric"

        train_missing = train_df[column].isnull()
        # A test file without this column simply has nothing to fill.
        if column in test_df.columns:
            test_missing = test_df[column].isnull()
        else:
            test_missing = pd.Series(False, index=test_df.index)

        # Rows with a known value are the only ones the model can learn from.
        known_rows = train_df[~train_missing]
        nothing_to_fill = not train_missing.any() and not test_missing.any()
        if nothing_to_fill or known_rows.empty:
            return train_df, test_df, None

        # One-hot encode the text features of the training rows.
        X_train = pd.get_dummies(known_rows.drop(columns=[column]), drop_first=True)
        # SimpleImputer discards features without any observed value, which
        # would break the column bookkeeping below, so drop them up front.
        X_train = X_train.dropna(axis=1, how="all")
        feature_columns = X_train.columns
        if len(feature_columns) == 0:
            # No usable feature is left to train on.
            return train_df, test_df, None

        if is_numeric:
            model = RandomForestRegressor()
            label_encoder = None
            y_train = known_rows[column]
        else:
            model = RandomForestClassifier()
            label_encoder = LabelEncoder()
            y_train = label_encoder.fit_transform(known_rows[column].astype(str))

        # Fill the gaps that remain in the features themselves.
        imputer = SimpleImputer(strategy='mean' if is_numeric else 'most_frequent')
        X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=feature_columns)

        # Train the model and score it on its own training rows.
        model.fit(X_train, y_train)
        fitted = model.predict(X_train)
        if is_numeric:
            performance = r2_score(y_train, fitted)
        else:
            performance = accuracy_score(y_train, fitted)

        def predict_missing(frame, missing_mask):
            """Predict ``column`` for the rows of ``frame`` selected by the mask."""
            features = frame.loc[missing_mask].drop(columns=[column])
            # Encode every category present here and then project onto the
            # training columns. Using drop_first on this subset could drop a
            # different level than the one dropped for the training rows and
            # silently mis-encode the remaining categories.
            encoded = pd.get_dummies(features).reindex(columns=feature_columns, fill_value=0)
            encoded = pd.DataFrame(imputer.transform(encoded), columns=feature_columns)
            predicted = model.predict(encoded)
            if label_encoder is not None:
                predicted = label_encoder.inverse_transform(predicted)
            return predicted

        # Write the predictions into the gaps of both frames.
        if train_missing.any():
            train_df.loc[train_missing, column] = predict_missing(train_df, train_missing)
        if test_missing.any():
            test_df.loc[test_missing, column] = predict_missing(test_df, test_missing)

        return train_df, test_df, performance

    # Collected (column, model type, score) triples for every fitted model
    performances = []

    # Numeric columns are processed first, then the categorical ones
    imputation_plan = (
        ("numeric", numeric_columns),
        ("categorical", categorical_columns),
    )
    for model_type, columns in imputation_plan:
        for column in columns:
            train_df, test_df, performance = fill_missing_values(
                train_df, test_df, column, model_type
            )
            # A None score means no model was fitted for this column
            if performance is None:
                continue
            performances.append((column, model_type, performance))

    st.write("Test Data with Predictions", test_df)

    # Report one line per fitted model, labelled with the matching metric
    if performances:
        st.write("MODEL PERFORMANCE:")
        for column, model_type, performance in performances:
            if model_type != "numeric":
                st.write(f"Column: {column}  , Type: {model_type}  , Accuracy Score: {performance:.2f}")
                continue
            st.write(f"Column: {column}  , Type: {model_type}  , R² Score: {performance:.2f}")