Spaces:

sufianahmad513
/

assign_1

Sleeping

App Files Files Community

assign_1 / app.py

sufianahmad513

Update app.py

2773ed8 verified about 1 year ago

raw

history blame contribute delete

11.2 kB


	# Import required packages
	import pandas as pd
	import numpy as np
	import altair as alt
	import streamlit as st
	import matplotlib.pyplot as plt
	from scipy.stats import zscore
	import streamlit as st
	import altair as alt
	from sklearn.cluster import KMeans
	from sklearn.preprocessing import MinMaxScaler
	from sklearn.metrics.pairwise import cosine_similarity

	# Load the loans dataset; every section below reads from df_new.
	df_new = pd.read_csv('Final_data.csv')

	# Page title
	st.title("Exploratory Data Analysis on Kiva Loans")
	st.sidebar.header("Filters")

	# Filter for Country
	country = df_new['country'].unique()
	selected_country = st.sidebar.selectbox("Select Country", country.tolist())
	if selected_country:
	    filtered_df = df_new[df_new['country'] == selected_country]
	else:
	    # Without a country there is nothing to analyse, so halt this script run.
	    st.warning("Please select a country from the sidebar")
	    st.stop()

	# Filter for Gender (all values are pre-selected by default)
	borrower_genders = df_new['borrower_genders'].unique()
	selected_genders = st.sidebar.multiselect(
	    "Select Gender",
	    borrower_genders.tolist(),
	    default=borrower_genders.tolist(),
	)
	filtered_df = filtered_df[filtered_df['borrower_genders'].isin(selected_genders)]

	# Filter for Loan Amount; slider bounds come from the whole dataset
	min_loan, max_loan = float(df_new['loan_amount'].min()), float(df_new['loan_amount'].max())
	selected_loan_amount = st.sidebar.slider(
	    "Select Loan Amount",
	    min_value=min_loan,
	    max_value=max_loan,
	    value=(min_loan, max_loan),
	)
	filtered_df = filtered_df[
	    (filtered_df['loan_amount'] >= selected_loan_amount[0])
	    & (filtered_df['loan_amount'] <= selected_loan_amount[1])
	]

	# Filter for Years
	# assign() returns a new frame, so the derived column is not written into a
	# boolean-filtered slice (which would trigger SettingWithCopyWarning).
	filtered_df = filtered_df.assign(year=pd.to_datetime(filtered_df['date']).dt.year)
	years = sorted(filtered_df['year'].unique())
	selected_years = st.sidebar.multiselect("Select Year(s)", years, default=years)
	filtered_df = filtered_df[filtered_df['year'].isin(selected_years)]

	# Selected filters
	# map(str, ...) keeps the join from raising if a gender value is not a string
	# (e.g. NaN). The doubled backslash emits the same "\|" text as before without
	# relying on an invalid escape sequence.
	st.caption(
	    f"Data for Country: {selected_country} \\| "
	    f"Gender: {', '.join(map(str, selected_genders))} \\| "
	    f"Loan Amount: {selected_loan_amount} \\| "
	    f"Years: {', '.join(map(str, selected_years))}"
	)


	# Distribution of Loan Sector
	st.subheader('Distribution of Loan Sector')
	# Horizontal bars: record count per sector, ordered by descending count.
	sector_count_axis = alt.X('count(sector):Q', title='Count')
	sector_name_axis = alt.Y('sector:N', sort='-x', title='Sector')
	sector_chart = (
	    alt.Chart(filtered_df)
	    .mark_bar()
	    .encode(
	        x=sector_count_axis,
	        y=sector_name_axis,
	        color=alt.Color('sector:N', legend=None),
	    )
	    .properties(width=600, height=400)
	)
	st.altair_chart(sector_chart)

	# Distribution of Loan Term
	st.subheader('Distribution of Loan Term (in Months)')
	# Histogram of loan terms, binned into at most 30 buckets.
	term_bins_axis = alt.X('term_in_months:Q', bin=alt.Bin(maxbins=30), title='Term in Months')
	term_frequency_axis = alt.Y('count():Q', title='Frequency')
	term_hist = (
	    alt.Chart(filtered_df)
	    .mark_bar()
	    .encode(
	        x=term_bins_axis,
	        y=term_frequency_axis,
	        color=alt.Color('term_in_months:Q', legend=None),
	    )
	    .properties(width=600, height=400)
	)
	st.altair_chart(term_hist)

	# Monthly Loan Amounts Over Time
	st.subheader('Monthly Loan Amounts Over Time')
	# Parse the date column once and derive every calendar field from it.
	# assign() builds a new frame, so nothing is written into a filtered slice.
	loan_dates = pd.to_datetime(filtered_df['date'])
	filtered_df = filtered_df.assign(
	    month=loan_dates.dt.month,
	    month_name=loan_dates.dt.strftime('%b'),
	    year=loan_dates.dt.year,
	)

	# Total loan amount per (year, month).
	monthly_loan_amount = (
	    filtered_df
	    .groupby(['year', 'month_name', 'month'])['loan_amount']
	    .sum()
	    .reset_index()
	)

	# Explicit calendar order for the x axis instead of alphabetical sorting.
	month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
	loan_time_series = alt.Chart(monthly_loan_amount).mark_line(point=True).encode(
	    x=alt.X('month_name:N', sort=month_order, title='Month'),
	    y=alt.Y('loan_amount:Q', title='Total Loan Amount'),
	    color=alt.Color('year:N', title='Year'),  # one line per year
	    tooltip=['year', 'month_name', 'loan_amount'],
	).properties(
	    width=700,
	    height=400,
	)

	st.altair_chart(loan_time_series)

	# Top 10 Countries with Highest Average Loan Amount
	st.subheader('Top 10 Countries with Highest Average Loan Amount')
	# Normalise country names and drop rows whose country is missing or blank.
	# This section uses the full dataset (df_new), not the sidebar-filtered frame.
	country_names = df_new['country'].str.strip()
	has_country = country_names.notna() & (country_names != '')
	df_clean = df_new.loc[has_country].assign(country=country_names[has_country])

	# Aggregate on the cleaned frame so padded spellings of one country are
	# averaged together and blank names never appear in the ranking.
	top_10_countries_avg_loan = (
	    df_clean
	    .groupby('country')['loan_amount']
	    .mean()
	    .nlargest(10)
	    .reset_index()
	)

	top_10_chart = alt.Chart(top_10_countries_avg_loan).mark_bar().encode(
	    x=alt.X('loan_amount:Q', title='Average Loan Amount'),
	    y=alt.Y('country:N', sort='-x', title='Country'),
	    color=alt.Color('country:N', legend=None),
	).properties(
	    width=600,
	    height=400,
	)
	st.altair_chart(top_10_chart)

	# Distribution of Genders
	st.subheader('Distribution of Borrower Genders')
	# Tally rows per gender label into a two-column frame for Altair.
	gender_counts = (
	    filtered_df['borrower_genders']
	    .value_counts()
	    .rename_axis('borrower_genders')
	    .reset_index(name='count')
	)

	# Doughnut: arc angle encodes the count, colour encodes the gender label.
	arc_angle = alt.Theta(field="count", type="quantitative")
	arc_colour = alt.Color(field="borrower_genders", type="nominal", title="Borrower Genders")
	arc_tooltips = [
	    alt.Tooltip('borrower_genders:N', title="Gender"),
	    alt.Tooltip('count:Q', title="Count"),
	]
	gender_doughnut_chart = (
	    alt.Chart(gender_counts)
	    .mark_arc(innerRadius=80, outerRadius=120)
	    .encode(theta=arc_angle, color=arc_colour, tooltip=arc_tooltips)
	    .properties(width=400, height=400)
	)

	# Text labels for the doughnut chart, placed outside the ring (radius 150 > 120).
	gender_doughnut_text = gender_doughnut_chart.mark_text(radius=150, size=15).encode(
	    text=alt.Text('count:Q', format='.0f'),
	)

	# Layer arcs and labels, then style the shared legend.
	final_chart = alt.layer(gender_doughnut_chart, gender_doughnut_text).configure_legend(
	    labelFontSize=12,
	    titleFontSize=14,
	)
	st.altair_chart(final_chart)


	# Dataset Summary
	st.header('Dataset Summary')
	loan_amounts = filtered_df['loan_amount']
	st.caption('Mean Loan Amount: ' + str(round(loan_amounts.mean(), 2)))
	st.caption('Median Loan Amount: ' + str(round(loan_amounts.median(), 2)))
	# mode() returns an empty Series when the filters leave no rows; indexing it
	# unconditionally would raise, so show a placeholder in that case.
	loan_modes = loan_amounts.mode()
	mode_text = str(loan_modes.iloc[0]) if not loan_modes.empty else 'N/A'
	st.caption('Mode Loan Amount: ' + mode_text)
	st.write(filtered_df.describe())

	# Filtered dataframe
	st.header("Filtered Data")
	st.dataframe(filtered_df)


	st.header('K-Means Clustering')

	# The columns we want to run k-means on
	filtered_df_reduced = filtered_df[['loan_amount', 'term_in_months']]

	# Scaling and clustering both need at least one row; stop this run cleanly
	# instead of letting the scaler raise on an empty selection.
	if filtered_df_reduced.empty:
	    st.warning("No loans match the selected filters, so clustering is skipped.")
	    st.stop()

	# Histogram of each feature, used to decide which scaler to apply.
	# One Axes per column: handing pandas a single Axes for a two-column frame
	# makes it clear the figure and emit a warning.
	fig, axes = plt.subplots(1, len(filtered_df_reduced.columns), figsize=(10, 5))
	for ax, column_name in zip(axes, filtered_df_reduced.columns):
	    ax.hist(filtered_df_reduced[column_name].dropna(), bins=100)
	    ax.set_title(column_name)
	    ax.grid(True)
	st.pyplot(fig)
	# Streamlit reruns this script on every interaction; close the figure so
	# pyplot does not keep accumulating open figures.
	plt.close(fig)

	# My chosen scaler
	scaler = MinMaxScaler()
	data_to_cluster_scaled = scaler.fit_transform(filtered_df_reduced)

	# Elbow sweep over k = 1..9. KMeans rejects n_clusters > n_samples, so the
	# sweep is shortened when fewer than nine rows are selected.
	K = range(1, min(10, data_to_cluster_scaled.shape[0] + 1))
	Sum_of_squared_distances = [
	    KMeans(n_clusters=k, n_init="auto").fit(data_to_cluster_scaled).inertia_
	    for k in K
	]

	fig, ax = plt.subplots()
	ax.plot(K, Sum_of_squared_distances, 'bx-')
	ax.set_xlabel('Number of Clusters (k)')
	ax.set_ylabel('Sum of Squared Distances')
	ax.set_title('Elbow Method For Optimal k')
	ax.grid(True)

	st.pyplot(fig)
	plt.close(fig)



	def k_means_simple(data, k, max_iters=100):
	    """Cluster the rows of ``data`` into ``k`` groups with a basic k-means loop.

	    Starting centroids are ``k`` distinct rows drawn at random. Each round
	    moves every centroid to the mean of the rows assigned to it and then
	    re-assigns rows to their nearest centroid, until the centroids stop
	    changing or ``max_iters`` rounds have run.

	    Args:
	        data: 2-D NumPy array with one sample per row.
	        k: Number of clusters; must not exceed the number of rows.
	        max_iters: Maximum number of centroid-update rounds.

	    Returns:
	        A ``(labels, centroids)`` tuple. ``labels[i]`` is the index of the
	        returned centroid nearest to row ``i``; ``centroids`` is a float
	        array of shape ``(k, n_features)``.

	    Raises:
	        ValueError: If ``k`` exceeds the number of rows (raised by
	            ``np.random.choice`` because sampling is without replacement).
	    """

	    def nearest_centroid(current_centroids):
	        # distances[c, r] is the Euclidean distance from centroid c to row r.
	        distances = np.linalg.norm(data - current_centroids[:, np.newaxis], axis=2)
	        return np.argmin(distances, axis=0)

	    # Work in float so integer input is not truncated when means are stored.
	    centroids = data[np.random.choice(data.shape[0], k, replace=False)].astype(float)
	    # Assign before looping so labels are defined even when max_iters is 0.
	    labels = nearest_centroid(centroids)

	    for _ in range(max_iters):
	        new_centroids = centroids.copy()
	        for cluster_index in range(k):
	            members = data[labels == cluster_index]
	            # A cluster can end up empty (e.g. duplicate starting rows); keep
	            # its previous centroid rather than taking the mean of nothing,
	            # which would yield NaN.
	            if len(members):
	                new_centroids[cluster_index] = members.mean(axis=0)

	        if np.array_equal(centroids, new_centroids):
	            break

	        centroids = new_centroids
	        labels = nearest_centroid(centroids)

	    return labels, centroids

	# Cluster the scaled data into (up to) five groups. The count is capped at
	# the number of rows because k_means_simple draws its starting centroids
	# without replacement and fails when asked for more clusters than rows.
	n_clusters = min(5, data_to_cluster_scaled.shape[0])
	labels, final_centroids = k_means_simple(data_to_cluster_scaled, n_clusters)

	# Distance from every row to every final centroid; the arg-min along the
	# centroid axis is the cluster each row is coloured by.
	distances = np.linalg.norm(data_to_cluster_scaled[:, np.newaxis] - final_centroids, axis=2)
	nearest_centroid_indices = np.argmin(distances, axis=1)

	# One record per loan: scaled coordinates, cluster id and tooltip fields.
	data_df = pd.DataFrame({
	    'x': data_to_cluster_scaled[:, 0],
	    'y': data_to_cluster_scaled[:, 1],
	    'centroid': nearest_centroid_indices,
	    'type': 'data',
	    'loan_amount': filtered_df['loan_amount'].values,
	    'term_in_months': filtered_df['term_in_months'].values,
	    'activity': filtered_df['activity'].values,
	    'sector': filtered_df['sector'].values,
	    'region': filtered_df['region'].values,
	})

	# One record per centroid; tooltip columns are absent here and become NaN
	# after the concat below.
	centroids_df = pd.DataFrame({
	    'x': final_centroids[:, 0],
	    'y': final_centroids[:, 1],
	    'centroid': range(final_centroids.shape[0]),
	    'type': 'centroid',
	})

	combined_df = pd.concat([data_df, centroids_df])

	scatter_plot = alt.Chart(combined_df).mark_circle(size=60).encode(
	    x='x',
	    y='y',
	    color=alt.Color('centroid:N', scale=alt.Scale(scheme='category10')),
	    # Data points are semi-transparent; centroid markers are fully opaque.
	    opacity=alt.condition(
	        alt.datum.type == 'data',
	        alt.value(0.6),
	        alt.value(1),
	    ),
	    tooltip=[
	        alt.Tooltip('loan_amount:Q', title='Loan Amount'),
	        alt.Tooltip('term_in_months:Q', title='Term (Months)'),
	        alt.Tooltip('activity:N', title='Activity'),
	        alt.Tooltip('sector:N', title='Sector'),
	        alt.Tooltip('region:N', title='Region'),
	    ],
	).properties(
	    # The plotted centroids are the ones returned after the k-means loop,
	    # not the random starting points, so the title says "Final".
	    title='Reduced Data and Final Centroids'
	)

	st.altair_chart(scatter_plot, use_container_width=True)

	# Fix session states
	# Make sure each selection key exists in session state, defaulting to None;
	# keys that are already present keep their current value.
	for state_key in ('country_selected', 'gender_selected', 'sector_selected'):
	    if state_key not in st.session_state:
	        st.session_state[state_key] = None

	# Recommendation Engine based on Country, Gender, and Sector
	st.subheader("Loan Recommendation")

	# Input for country. Missing values are dropped before sorting because
	# sorted() cannot compare NaN (a float) with country-name strings.
	country_input = st.selectbox(
	    "Select Country",
	    ["None"] + sorted(df_new['country'].dropna().unique()),
	)

	# Input for gender, narrowed to the selected country when one is chosen
	if country_input != "None":
	    filtered_genders = df_new[df_new['country'] == country_input]['borrower_genders'].dropna().unique()
	    gender_input = st.selectbox("Select Gender", ["None"] + list(filtered_genders))
	else:
	    gender_input = st.selectbox("Select Gender", ["None"] + list(df_new['borrower_genders'].dropna().unique()))

	# Input for sector, narrowed to the selected country and gender when both are chosen
	if country_input != "None" and gender_input != "None":
	    filtered_sectors = df_new[
	        (df_new['country'] == country_input)
	        & (df_new['borrower_genders'] == gender_input)
	    ]['sector'].dropna().unique()
	    sector_input = st.selectbox("Select Sector", ["None"] + list(filtered_sectors))
	else:
	    sector_input = st.selectbox("Select Sector", ["None"] + list(df_new['sector'].dropna().unique()))

	# Generate recommendations based on country, gender, and sector
	if country_input != "None" and gender_input != "None" and sector_input != "None":
	    feature_columns = ['loan_amount', 'term_in_months']

	    # Loans matching all three selections. Rows missing a feature are dropped
	    # because cosine_similarity does not accept NaN.
	    user_filtered_df = df_new[
	        (df_new['country'] == country_input)
	        & (df_new['borrower_genders'] == gender_input)
	        & (df_new['sector'] == sector_input)
	    ].dropna(subset=feature_columns).reset_index(drop=True)

	    if not user_filtered_df.empty:
	        # Scale the features of these matching loans themselves. Slicing the
	        # first N rows of the sidebar-filtered matrix would compare rows that
	        # have nothing to do with user_filtered_df.
	        user_features_scaled = MinMaxScaler().fit_transform(user_filtered_df[feature_columns])

	        # Similarity of the first matching loan (the reference) to every
	        # matching loan; only this one row of the matrix is needed.
	        similarity_to_reference = cosine_similarity(user_features_scaled[:1], user_features_scaled)[0]

	        # Rank by descending similarity and drop the reference explicitly:
	        # with ties it is not guaranteed to sort into first place.
	        ranked_indices = np.argsort(similarity_to_reference)[::-1]
	        similar_loans_indices = ranked_indices[ranked_indices != 0][:3]

	        # Display recommended loans (top 3)
	        recommendations = user_filtered_df.iloc[similar_loans_indices][
	            ['country', 'borrower_genders', 'sector', 'loan_amount', 'term_in_months']
	        ]
	        st.write("Recommended Loans:")
	        st.dataframe(recommendations)
	    else:
	        st.write("No matching loans found for the selected country, gender, and sector.")
	else:
	    st.write("Please select a country, gender, and sector.")