PPPDC_example / app.py
JUNGU's picture
Update app.py
38cbba4 verified
raw
history blame
10.2 kB
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from io import StringIO
import openpyxl
import matplotlib.font_manager as fm
from scipy import stats
# Korean font setup
def set_font():
    """Register the bundled font file with matplotlib and return rc settings.

    Returns:
        dict: Mapping with the ``font.family`` and ``axes.unicode_minus``
        keys, intended to be applied as matplotlib rc settings.
    """
    # Change this to the actual path of the font file.
    font_path = "Pretendard-Bold.ttf"
    fm.fontManager.addfont(font_path)

    rc_overrides = {}
    rc_overrides['font.family'] = 'Pretendard-Bold'
    rc_overrides['axes.unicode_minus'] = False
    return rc_overrides
# 폰트 섀정을 κ°€μ Έμ˜΅λ‹ˆλ‹€
font_settings = set_font()
# Session state initialization and management
def manage_session_state():
    """Seed ``st.session_state`` with a default for every key the app uses.

    Keys that are already present are left untouched, so values survive
    script reruns.
    """
    # Factories rather than shared literals: every call builds fresh
    # list/dict objects for the mutable defaults.
    default_factories = {
        'data': lambda: None,
        'processed_data': lambda: None,
        'numeric_columns': list,
        'categorical_columns': list,
        'x_var': lambda: None,
        'y_var': lambda: None,
        'slicers': dict,
        'analysis_performed': lambda: False,
    }
    for state_key, make_default in default_factories.items():
        if state_key not in st.session_state:
            st.session_state[state_key] = make_default()
# 데이터 λ‘œλ“œ
@st.cache_data
def load_data(file):
    """Read an uploaded file into a DataFrame based on its extension.

    Args:
        file: File-like object exposing a ``name`` attribute; the text after
            the last dot (case-insensitive) selects the parser.

    Returns:
        The DataFrame produced by ``pd.read_csv`` (``csv``) or
        ``pd.read_excel`` (``xls`` / ``xlsx``), or ``None`` after showing an
        error for any other extension.
    """
    suffix = file.name.rsplit('.', 1)[-1].lower()

    if suffix == 'csv':
        return pd.read_csv(file)
    if suffix in ('xls', 'xlsx'):
        return pd.read_excel(file)

    st.error("μ§€μ›λ˜μ§€ μ•ŠλŠ” 파일 ν˜•μ‹μž…λ‹ˆλ‹€. CSV, XLS, λ˜λŠ” XLSX νŒŒμΌμ„ μ—…λ‘œλ“œν•΄μ£Όμ„Έμš”.")
    return None
def manual_data_entry():
    """Let the user define columns and type data into an editable grid.

    Returns:
        The value returned by ``st.data_editor`` for a blank frame with the
        requested columns and row count, or ``None`` while no non-blank
        column name has been entered.
    """
    raw_names = st.text_input("μ—΄ 이름을 μ‰Όν‘œλ‘œ κ΅¬λΆ„ν•˜μ—¬ μž…λ ₯ν•˜μ„Έμš”:", key="manual_col_names")

    # Split on commas, trim whitespace, and discard blank entries.
    columns = []
    for piece in raw_names.split(','):
        cleaned = piece.strip()
        if cleaned:
            columns.append(cleaned)

    if not columns:
        return None

    row_count = st.number_input("초기 ν–‰μ˜ 수λ₯Ό μž…λ ₯ν•˜μ„Έμš”:", min_value=1, value=5, key="manual_num_rows")
    blank_frame = pd.DataFrame(columns=columns, index=range(row_count))
    return st.data_editor(blank_frame, num_rows="dynamic", key="manual_data_editor")
def preprocess_data(data):
    """Handle missing values, coerce object columns to numbers, classify columns.

    For every column that contains missing values a selectbox lets the user
    choose between dropping the affected rows and filling with the column's
    mean, median, or most frequent value. Object-dtype columns are then
    converted with ``pd.to_numeric`` when possible. Finally the numeric and
    object column names are stored in ``st.session_state.numeric_columns``
    and ``st.session_state.categorical_columns``.

    Args:
        data: Input DataFrame. It is copied first, so the caller's frame is
            not modified.

    Returns:
        The processed DataFrame.
    """
    # Work on a private copy so the caller's frame is never mutated.
    data = data.copy()

    # Missing-value handling
    if data.isnull().sum().sum() > 0:
        st.write("결츑치 처리:")
        options = ["제거", "ν‰κ· μœΌλ‘œ λŒ€μ²΄", "μ€‘μ•™κ°’μœΌλ‘œ λŒ€μ²΄", "μ΅œλΉˆκ°’μœΌλ‘œ λŒ€μ²΄"]
        # Compare against the list entries themselves so the branch labels
        # can never drift from the options shown to the user.
        drop_rows, fill_mean, fill_median, fill_mode = options
        for column in data.columns:
            if data[column].isnull().sum() > 0:
                method = st.selectbox(f"{column} μ—΄μ˜ 처리 방법 선택:",
                                      options,
                                      key=f"missing_{column}")
                if method == drop_rows:
                    # .copy() detaches the result so later column assignments
                    # do not trigger chained-assignment warnings.
                    data = data.dropna(subset=[column]).copy()
                elif method == fill_mean:
                    # Assign the result instead of fillna(inplace=True) on a
                    # column selection, which may act on a temporary object.
                    data[column] = data[column].fillna(data[column].mean())
                elif method == fill_median:
                    data[column] = data[column].fillna(data[column].median())
                elif method == fill_mode:
                    modes = data[column].mode()
                    # mode() is empty when every value is missing; there is
                    # nothing to fill with in that case.
                    if not modes.empty:
                        data[column] = data[column].fillna(modes.iloc[0])

    # Data type conversion
    for column in data.columns:
        if data[column].dtype == 'object':
            try:
                converted = pd.to_numeric(data[column])
            except (ValueError, TypeError):
                # Not parseable as numbers: keep the column as-is.
                st.write(f"{column} 열은 λ²”μ£Όν˜•μœΌλ‘œ μœ μ§€λ©λ‹ˆλ‹€.")
            else:
                data[column] = converted
                st.write(f"{column} 열을 μˆ«μžν˜•μœΌλ‘œ λ³€ν™˜ν–ˆμŠ΅λ‹ˆλ‹€.")

    # Separate numeric columns from categorical (object) columns
    st.session_state.numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
    st.session_state.categorical_columns = data.select_dtypes(include=['object']).columns.tolist()
    return data
def create_slicers(data):
    """Render a multiselect filter for each low-cardinality categorical column.

    Each widget's selection is stored in ``st.session_state.slicers`` under
    the column name. Columns with more than 10 distinct values get no widget.

    Note: a later ``create_slicers`` definition in this file rebinds the name.

    Args:
        data: DataFrame whose categorical columns supply the filter options.
    """
    slicer_columns = (
        col for col in st.session_state.categorical_columns
        if data[col].nunique() <= 10  # only build a slicer for at most 10 unique values
    )
    for col in slicer_columns:
        choices = sorted(data[col].unique())
        # Everything is selected by default, i.e. no filtering initially.
        st.session_state.slicers[col] = st.multiselect(
            f"{col} 선택",
            options=choices,
            default=list(choices),
            key=f"slicer_{col}"
        )
def apply_slicers(data):
    """Return a copy of ``data`` restricted to the current slicer selections.

    Args:
        data: DataFrame to filter.

    Returns:
        A new DataFrame keeping only rows whose value in each slicer column
        is among that slicer's selected values.
    """
    result = data.copy()
    for column, chosen in st.session_state.slicers.items():
        if not chosen:
            # An empty selection applies no filter for that column.
            continue
        keep_rows = result[column].isin(chosen)
        result = result[keep_rows]
    return result
def plot_correlation_heatmap(data):
    """Draw a heatmap of the correlations between the numeric columns.

    Args:
        data: DataFrame containing every column listed in
            ``st.session_state.numeric_columns``.
    """
    numeric_view = data[st.session_state.numeric_columns]
    heatmap = px.imshow(
        numeric_view.corr(),
        color_continuous_scale='RdBu_r',
        # Fix the colour range to the bounds of a correlation coefficient.
        zmin=-1,
        zmax=1,
    )
    heatmap.update_layout(title='상관관계 히트맡')
    st.plotly_chart(heatmap)
def plot_scatter_with_regression(data, x_var, y_var):
    """Plot ``y_var`` against ``x_var`` with a fitted least-squares line.

    The scatter is coloured by the '반' column when the frame has one. The
    fit uses only rows where both variables are present. When fewer than two
    distinct x values remain, the scatter is shown without a regression line
    and a warning is displayed instead of the statistics.

    Args:
        data: DataFrame holding both columns.
        x_var: Name of the column plotted on the x axis.
        y_var: Name of the column plotted on the y axis.
    """
    fig = px.scatter(data, x=x_var, y=y_var, color='반' if '반' in data.columns else None)

    # Fit on complete (x, y) pairs only; a single NaN would otherwise turn
    # every regression statistic into NaN.
    complete = data[x_var].notna() & data[y_var].notna()
    x = data.loc[complete, x_var]
    y = data.loc[complete, y_var]

    if x.nunique() < 2:
        # linregress raises ValueError for empty input or identical x values
        # (e.g. after the slicers narrow the data to a single row).
        fig.update_layout(xaxis_title=x_var, yaxis_title=y_var)
        st.plotly_chart(fig)
        # NOTE(review): message is in English; localize to match the rest of the UI.
        st.warning("Not enough distinct data points to fit a regression line.")
        return

    # Add the regression line
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    line_x = np.array([x.min(), x.max()])
    line_y = slope * line_x + intercept
    fig.add_trace(go.Scatter(x=line_x, y=line_y, mode='lines', name='νšŒκ·€μ„ '))

    r_squared = r_value ** 2
    fig.update_layout(
        title=f'{x_var}와 {y_var}의 관계 (R-squared: {r_squared:.4f})',
        xaxis_title=x_var,
        yaxis_title=y_var,
        annotations=[
            dict(
                x=0.5,
                y=1.05,
                xref='paper',
                yref='paper',
                text=f'R-squared: {r_squared:.4f}',
                showarrow=False,
            )
        ]
    )
    st.plotly_chart(fig)

    # Additional statistics
    st.write(f"μƒκ΄€κ³„μˆ˜: {r_value:.4f}")
    st.write(f"p-value: {p_value:.4f}")
    st.write(f"ν‘œμ€€ 였차: {std_err:.4f}")
def perform_analysis():
    """Render the exploratory analysis section for the processed data.

    Shows the slicers, summary statistics, the correlation heatmap and a
    scatter/regression plot for two user-selected numeric columns. A warning
    is shown instead when there is no processed data.

    Note: a later ``perform_analysis`` definition in this file rebinds the name.
    """
    processed = st.session_state.processed_data
    if processed is None or processed.empty:
        st.warning("뢄석할 데이터가 μ—†μŠ΅λ‹ˆλ‹€. 데이터λ₯Ό λ¨Όμ € λ‘œλ“œν•˜κ³  μ „μ²˜λ¦¬ν•΄μ£Όμ„Έμš”.")
        return

    st.header("탐색적 데이터 뢄석")

    # Create and apply the slicers
    create_slicers(processed)
    filtered_data = apply_slicers(processed)

    # Summary statistics
    st.write("μš”μ•½ 톡계:")
    st.write(filtered_data.describe())

    # Correlation heatmap
    st.subheader("상관관계 히트맡")
    plot_correlation_heatmap(filtered_data)

    # Scatter plot and regression for two user-selected variables
    st.subheader("두 λ³€μˆ˜ κ°„μ˜ 관계 뢄석")
    # The selectboxes own the 'x_var' / 'y_var' session keys. Streamlit
    # forbids assigning to a widget's key after the widget is created, so
    # the return values are kept in locals instead of being written back.
    x_var = st.selectbox("XμΆ• λ³€μˆ˜ 선택", options=st.session_state.numeric_columns, key='x_var')
    y_var = st.selectbox("YμΆ• λ³€μˆ˜ 선택", options=[col for col in st.session_state.numeric_columns if col != x_var], key='y_var')

    if x_var and y_var:
        plot_scatter_with_regression(filtered_data, x_var, y_var)

    st.session_state.analysis_performed = True
# Added to preserve state
def update_filtered_data():
    """Recompute ``st.session_state.filtered_data`` from the slicer widgets.

    Used as the ``on_change`` callback of the slicer multiselects. Callbacks
    run before the script body, so ``st.session_state.slicers`` still holds
    the previous selections at this point. The current values are therefore
    read from the widgets' own ``slicer_<column>`` keys first; otherwise the
    filter would lag one interaction behind.
    """
    slicers = st.session_state.slicers
    for col in list(slicers):
        widget_key = f"slicer_{col}"
        if widget_key in st.session_state:
            slicers[col] = st.session_state[widget_key]
    st.session_state.filtered_data = apply_slicers(st.session_state.processed_data)
def create_slicers(data):
    """Render multiselect filters that refresh the filtered data on change.

    A widget is created for each column in
    ``st.session_state.categorical_columns`` with at most 10 distinct values.
    Its selection is stored in ``st.session_state.slicers`` and
    ``update_filtered_data`` is registered as its ``on_change`` callback.

    Args:
        data: DataFrame whose categorical columns supply the filter options.
    """
    for col in st.session_state.categorical_columns:
        if data[col].nunique() > 10:
            # Too many distinct values for a usable slicer.
            continue
        choices = sorted(data[col].unique())
        selection = st.multiselect(
            f"{col} 선택",
            options=choices,
            default=list(choices),
            key=f"slicer_{col}",
            on_change=update_filtered_data
        )
        st.session_state.slicers[col] = selection
def apply_slicers(data):
    """Filter a copy of ``data`` by every non-empty slicer selection.

    Args:
        data: DataFrame to filter.

    Returns:
        A new DataFrame containing only rows whose value in each slicer
        column is among the values selected for that column.
    """
    # Slicers with nothing selected apply no filter.
    active = {
        column: chosen
        for column, chosen in st.session_state.slicers.items()
        if chosen
    }
    filtered = data.copy()
    for column, chosen in active.items():
        filtered = filtered[filtered[column].isin(chosen)]
    return filtered
def perform_analysis():
    """Render the exploratory analysis section from the session's filtered data.

    ``st.session_state.filtered_data`` is initialised from the processed data
    when missing. The function then shows the slicers, summary statistics,
    the correlation heatmap and a scatter/regression plot for two
    user-selected numeric columns. A warning is shown instead when there is
    no processed data to analyse.
    """
    processed = st.session_state.processed_data
    # Guard against missing/empty data instead of failing on `.copy()` below.
    if processed is None or processed.empty:
        st.warning("뢄석할 데이터가 μ—†μŠ΅λ‹ˆλ‹€. 데이터λ₯Ό λ¨Όμ € λ‘œλ“œν•˜κ³  μ „μ²˜λ¦¬ν•΄μ£Όμ„Έμš”.")
        return

    if st.session_state.get('filtered_data') is None:
        st.session_state.filtered_data = processed.copy()

    st.header("탐색적 데이터 뢄석")

    # Create the slicers
    create_slicers(processed)

    # Summary statistics
    st.write("μš”μ•½ 톡계:")
    st.write(st.session_state.filtered_data.describe())

    # Correlation heatmap
    st.subheader("상관관계 히트맡")
    plot_correlation_heatmap(st.session_state.filtered_data)

    # Scatter plot and regression for two user-selected variables
    st.subheader("두 λ³€μˆ˜ κ°„μ˜ 관계 뢄석")
    x_var = st.selectbox("XμΆ• λ³€μˆ˜ 선택", options=st.session_state.numeric_columns, key='x_var')
    y_var = st.selectbox("YμΆ• λ³€μˆ˜ 선택", options=[col for col in st.session_state.numeric_columns if col != x_var], key='y_var')

    if x_var and y_var:
        plot_scatter_with_regression(st.session_state.filtered_data, x_var, y_var)
def main():
    """Entry point: set up state, show the data editor, and run the analysis.

    Once ``st.session_state.data`` is available the data is shown in an
    editable grid. Pressing the start button preprocesses the edited data
    once; from then on the analysis section is rendered on every rerun.
    """
    st.title("μΈν„°λž™ν‹°λΈŒ EDA νˆ΄ν‚·")
    manage_session_state()

    if 'data' not in st.session_state or st.session_state.data is None:
        # ... (data loading part)
        # TODO(review): the data-loading code was elided here, which left this
        # `if` without a body (a syntax error). `pass` keeps the file
        # importable; presumably this branch should populate
        # st.session_state.data via load_data() / manual_data_entry() --
        # restore the original code.
        pass

    if st.session_state.data is not None:
        st.subheader("데이터 미리보기 및 μˆ˜μ •")
        st.write("데이터λ₯Ό ν™•μΈν•˜κ³  ν•„μš”ν•œ 경우 μˆ˜μ •ν•˜μ„Έμš”:")
        edited_data = st.data_editor(st.session_state.data, num_rows="dynamic", key="data_editor")

        if st.button("데이터 뢄석 μ‹œμž‘", key="start_analysis") or ('analysis_performed' in st.session_state and st.session_state.analysis_performed):
            # Preprocess only the first time; later reruns reuse the stored result.
            if 'analysis_performed' not in st.session_state or not st.session_state.analysis_performed:
                st.session_state.processed_data = preprocess_data(edited_data)
                st.session_state.analysis_performed = True
            perform_analysis()
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()