Spaces:

TRusso
/

P1G5_Set_1_Titan_Russo

Sleeping

App Files Files Community

P1G5_Set_1_Titan_Russo / EDA.py

TRusso

Upload 7 files

5a7ae2a verified 6 months ago

raw

history blame contribute delete

3.13 kB

	import streamlit as st
	import pandas as pd
	import matplotlib.pyplot as plt
	import seaborn as sns


	# Read the dataset once at import time into a module-level DataFrame.
	# The path is relative, so it is resolved against the process working
	# directory; eda() below plots from this frame.
	data = pd.read_csv(filepath_or_buffer="P1G5_Set_1_Titan_Russo.csv")


	def _show_and_close(fig):
	    """Render *fig* in the Streamlit app, then close it.

	    The figure is handed to ``st.pyplot`` explicitly rather than relying on
	    matplotlib's global "current figure", and is closed afterwards so that
	    figures do not accumulate each time the script is re-run.
	    """
	    st.pyplot(fig)
	    plt.close(fig)


	def eda():
	    """Render the exploratory-data-analysis page.

	    Reads the module-level ``data`` DataFrame and writes three sections to
	    the Streamlit page:

	    1. Scatter plots of ``limit_balance`` against ``bill_amt_1`` ..
	       ``bill_amt_6``, coloured by ``default_payment_next_month``.
	    2. A heatmap of the correlation matrix of the numeric columns.
	    3. Histograms (with a KDE overlay) for the columns listed in
	       ``cols_num``.

	    Takes no arguments and returns ``None``; all output goes through
	    Streamlit calls.
	    """
	    st.title("Exploratory Data Analysis")
	    st.write('Analyze the DataFrame for Better Understanding')
	    st.markdown(
	        "<h2><b>Limit Balance vs. Bill Amount by Default Payment Next Month</b></h2>",
	        unsafe_allow_html=True,
	    )

	    palette = ["#FF0000", "#4129E1"]  # custom colors
	    for i in range(1, 7):
	        # One figure per bill column, rendered immediately so every plot
	        # appears on the page.
	        fig, ax = plt.subplots()
	        sns.scatterplot(
	            x="limit_balance",
	            y=f"bill_amt_{i}",
	            hue="default_payment_next_month",
	            data=data,
	            palette=palette,
	            ax=ax,
	        )
	        # The y-axis is bill_amt_{i}, so the title names the bill amount.
	        ax.set_title(
	            f"Limit Balance vs. Bill Amount {i} by Default Payment Next Month")
	        _show_and_close(fig)
	    st.write("Explanation:")
	    markdown_text = """
	    These plots indicate that a higher `limit_balance` means a higher likelihood of non defaulting on payments.
	    """
	    st.markdown(markdown_text)

	    st.markdown("<h2><b>Heatmap of Correlation Matrix</b></h2>",
	                unsafe_allow_html=True)
	    # Heatmap. Restrict to numeric/bool columns first: DataFrame.corr() on a
	    # frame containing non-numeric columns raises on pandas >= 2.0.
	    corr_matrix = data.select_dtypes(include=["number", "bool"]).corr()
	    fig, ax = plt.subplots(figsize=(15, 10))
	    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
	    ax.set_title('Heatmap of Correlation Matrix')
	    _show_and_close(fig)
	    st.write("")  # Add a blank line

	    st.write("Explanation:")
	    markdown_text = """
	    From the heatmap we can see the correlation between each columns. We can see the `pay_0`, `pay_2`, `pay_3`, `pay_4`, `pay_5`, `pay_6` have correlation each others from categorical columns meanwhile we can see `bill_amt_1`, `bill_amt_2`, `bill_amt_3`, `bill_amt_4`, `bill_amt_5`, `bill_amt_6` have correlation each others from numerical columns
	    """
	    st.markdown(markdown_text)

	    st.markdown("<h2><b>Checking Distribution Data</b></h2>",
	                unsafe_allow_html=True)

	    # Columns to plot
	    cols_num = ['limit_balance', 'age', 'bill_amt_1',
	                'bill_amt_2', 'bill_amt_3', 'bill_amt_4', 'bill_amt_5', 'bill_amt_6',
	                'pay_amt_1', 'pay_amt_2', 'pay_amt_3', 'pay_amt_4', 'pay_amt_5',
	                'pay_amt_6']

	    # Size the histogram grid from the column list (ceiling division) so no
	    # fully empty row is left at the bottom of the figure.
	    n_cols = 4
	    n_rows = -(-len(cols_num) // n_cols)
	    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 15), squeeze=False)

	    # Flatten the 2-D axes array so it can be paired with the column list.
	    axes = axes.flatten()

	    # One histogram with a kernel density estimate per column.
	    for ax, col in zip(axes, cols_num):
	        sns.histplot(data[col], ax=ax, kde=True)
	        ax.set_title(f'Distribution {col}')
	        ax.set_xlabel(col)
	        ax.set_ylabel('Frequency')

	    # Remove the leftover axes that have no column assigned.
	    for unused_ax in axes[len(cols_num):]:
	        unused_ax.remove()

	    # display
	    fig.tight_layout()
	    _show_and_close(fig)
	    st.write("")  # Add a blank line

	    st.write("Explanation:")
	    st.write('Checking the distribution data from non categorical columns, we can say the data is positive skewed')