Spaces:

EmreEgilmez
/

data_corrector

Runtime error

App Files Files Community

data_corrector / app.py

EmreEgilmez

Update app.py

4e29053 verified 2 months ago

raw

history blame contribute delete

No virus

4.97 kB

	import streamlit as st
	import pandas as pd
	from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
	from sklearn.impute import SimpleImputer
	from sklearn.preprocessing import LabelEncoder, OneHotEncoder
	from sklearn.metrics import r2_score, accuracy_score

	# Page title
	st.title("Automatic Data Corrector / Quality Enhancer Tool")

	# Upload widgets for the train and test CSV files
	train_file = st.file_uploader("Upload Your Train Data", type=["csv"])
	test_file = st.file_uploader("Upload Your Test Data", type=["csv"])

	# Nothing below can run until both files are provided, so halt this
	# script run here instead of nesting the rest of the app under an `if`.
	if not (train_file and test_file):
	    st.stop()

	train_df = pd.read_csv(train_file)
	test_df = pd.read_csv(test_file)

	# Remove the 'Segmentation' column from both frames when it is present
	# (errors='ignore' makes the drop a no-op when the column is absent).
	train_df = train_df.drop(columns=['Segmentation'], errors='ignore')
	test_df = test_df.drop(columns=['Segmentation'], errors='ignore')

	st.write("Train Data", train_df.head())
	st.write("Test Data", test_df.head())

	# Split the train columns by dtype: 'object' columns are treated as
	# categorical, every other dtype as numeric.
	categorical_columns = [col for col, dtype in train_df.dtypes.items() if dtype == 'object']
	numeric_columns = [col for col, dtype in train_df.dtypes.items() if dtype != 'object']

	# Model-based imputation of a single column
	def fill_missing_values(train_df, test_df, column, model_type):
	    """Fill the missing values of ``column`` with random-forest predictions.

	    A model is fitted on the train rows where ``column`` is present, using
	    all other columns as features (object columns are one-hot encoded and
	    remaining feature gaps are filled by a ``SimpleImputer``). The model
	    then predicts the missing entries of ``column`` in both frames.

	    Args:
	        train_df: Training frame; rows with a value in ``column`` are used
	            to fit the model. Modified in place.
	        test_df: Test frame; its gaps in ``column`` are filled with the
	            same fitted model. Modified in place. It may lack ``column``,
	            in which case it is left untouched.
	        column: Name of the column to impute.
	        model_type: ``"numeric"`` selects a ``RandomForestRegressor`` and
	            mean feature imputation; any other value selects a
	            ``RandomForestClassifier`` on label-encoded targets and
	            most-frequent feature imputation.

	    Returns:
	        ``(train_df, test_df, performance)``. ``performance`` is the R²
	        (numeric) or accuracy (otherwise) of the model measured on the
	        same rows it was fitted on, so it is an in-sample score. It is
	        ``None`` when no model was fitted: neither frame has gaps in
	        ``column``, there are no labelled train rows, or no usable
	        feature columns remain.
	    """
	    is_numeric = model_type == "numeric"

	    train_missing = train_df[column].isnull()
	    # The test frame is only touched when it actually carries the column.
	    test_missing = test_df[column].isnull() if column in test_df.columns else None
	    test_has_gaps = test_missing is not None and bool(test_missing.any())

	    labelled = train_df[~train_missing]
	    if labelled.empty or not (train_missing.any() or test_has_gaps):
	        return train_df, test_df, None

	    if is_numeric:
	        model = RandomForestRegressor()
	        label_encoder = None
	        y_train = labelled[column]
	    else:
	        model = RandomForestClassifier()
	        label_encoder = LabelEncoder()
	        y_train = label_encoder.fit_transform(labelled[column].astype(str))

	    # One-hot encode the categorical features; float dummies keep the
	    # feature matrix purely numeric.
	    X_train = pd.get_dummies(labelled.drop(columns=[column]), drop_first=True, dtype=float)
	    # SimpleImputer discards features that are entirely missing at fit
	    # time, which would desynchronise the column labels below; drop them
	    # up front instead.
	    X_train = X_train.dropna(axis=1, how="all")
	    feature_columns = X_train.columns
	    if len(feature_columns) == 0:
	        return train_df, test_df, None

	    # Fill remaining gaps in the features themselves
	    imputer = SimpleImputer(strategy="mean" if is_numeric else "most_frequent")
	    X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=feature_columns)

	    # Fit the model
	    model.fit(X_train, y_train)

	    def predict_for(rows):
	        # Encode without drop_first and then reindex to the training
	        # layout: the baseline category is decided by the training data
	        # only, never by whichever categories happen to occur in `rows`.
	        features = pd.get_dummies(rows.drop(columns=[column]), dtype=float)
	        features = features.reindex(columns=feature_columns, fill_value=0)
	        features = pd.DataFrame(imputer.transform(features), columns=feature_columns)
	        predicted = model.predict(features)
	        if label_encoder is not None:
	            # Map encoded class ids back to the original labels
	            predicted = label_encoder.inverse_transform(predicted)
	        return predicted

	    # Fill the gaps of the train frame with the predicted values
	    if train_missing.any():
	        train_df.loc[train_missing, column] = predict_for(train_df[train_missing])

	    # Evaluate the model on the rows it was fitted on
	    if is_numeric:
	        performance = r2_score(y_train, model.predict(X_train))
	    else:
	        performance = accuracy_score(y_train, model.predict(X_train))

	    # Apply the same model to the gaps of the test frame
	    if test_has_gaps:
	        test_df.loc[test_missing, column] = predict_for(test_df[test_missing])

	    return train_df, test_df, performance

	# (column, model type, score) for every column a model was fitted for
	performances = []

	# Impute the numeric columns first, then the categorical ones, so the
	# categorical models are fitted on frames whose numeric columns have
	# already been through imputation.
	for model_type, columns in (("numeric", numeric_columns), ("categorical", categorical_columns)):
	    for column in columns:
	        train_df, test_df, performance = fill_missing_values(train_df, test_df, column, model_type)
	        # None means no model was fitted for this column
	        if performance is not None:
	            performances.append((column, model_type, performance))

	st.write("Test Data with Predictions", test_df)

	if performances:
	    st.write("MODEL PERFORMANCE:")
	    for column, model_type, performance in performances:
	        # Numeric columns report R², all others report accuracy
	        metric = "R² Score" if model_type == "numeric" else "Accuracy Score"
	        st.write(f"Column: {column} , Type: {model_type} , {metric}: {performance:.2f}")