Spaces:
Sleeping
Sleeping
| # Import required packages | |
| import pandas as pd | |
| import numpy as np | |
| import altair as alt | |
| import streamlit as st | |
| import matplotlib.pyplot as plt | |
| from scipy.stats import zscore | |
| import streamlit as st | |
| import altair as alt | |
| from sklearn.cluster import KMeans | |
| from sklearn.preprocessing import MinMaxScaler | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| df_new= pd.read_csv('Final_data.csv') | |
| # Page title | |
| st.title("Exploratory Data Analysis on Kiva Loans") | |
| st.sidebar.header("Filters") | |
| # Filter for Country | |
| country = df_new['country'].unique() | |
| selected_country = st.sidebar.selectbox("Select Country", country.tolist()) | |
| if selected_country: | |
| filtered_df = df_new[df_new['country'] == selected_country] | |
| else: | |
| st.warning("Please select a country from the sidebar") | |
| st.stop() | |
| # Filter for Gender | |
| borrower_genders = df_new['borrower_genders'].unique() | |
| selected_genders = st.sidebar.multiselect("Select Gender", borrower_genders.tolist(), default=borrower_genders.tolist()) | |
| filtered_df = filtered_df[filtered_df['borrower_genders'].isin(selected_genders)] | |
| # Filter for Loan Amount | |
| min_loan, max_loan = float(df_new['loan_amount'].min()), float(df_new['loan_amount'].max()) | |
| selected_loan_amount = st.sidebar.slider("Select Loan Amount", min_value=min_loan, max_value=max_loan, value=(min_loan, max_loan)) | |
| filtered_df = filtered_df[(filtered_df['loan_amount'] >= selected_loan_amount[0]) & (filtered_df['loan_amount'] <= selected_loan_amount[1])] | |
| # Filter for Years | |
| filtered_df['year'] = pd.to_datetime(filtered_df['date']).dt.year | |
| years = sorted(filtered_df['year'].unique()) | |
| selected_years = st.sidebar.multiselect("Select Year(s)", years, default=years) | |
| filtered_df = filtered_df[filtered_df['year'].isin(selected_years)] | |
| # selected filters | |
| st.caption(f"Data for Country: {selected_country} | Gender: {', '.join(selected_genders)} | Loan Amount: {selected_loan_amount} | Years: {', '.join(map(str, selected_years))}") | |
| # Distribution of Loan Sector | |
| st.subheader('Distribution of Loan Sector') | |
| sector_chart = alt.Chart(filtered_df).mark_bar().encode( | |
| x=alt.X('count(sector):Q', title='Count'), | |
| y=alt.Y('sector:N', sort='-x', title='Sector'), | |
| color=alt.Color('sector:N', legend=None) | |
| ).properties( | |
| width=600, | |
| height=400 | |
| ) | |
| st.altair_chart(sector_chart) | |
| # Distribution of Loan Term | |
| st.subheader('Distribution of Loan Term (in Months)') | |
| term_hist = alt.Chart(filtered_df).mark_bar().encode( | |
| x=alt.X('term_in_months:Q', bin=alt.Bin(maxbins=30), title='Term in Months'), | |
| y=alt.Y('count():Q', title='Frequency'), | |
| color=alt.Color('term_in_months:Q', legend=None) | |
| ).properties( | |
| width=600, | |
| height=400 | |
| ) | |
| st.altair_chart(term_hist) | |
| # Monthly Loan Amounts Over Time | |
| st.subheader('Monthly Loan Amounts Over Time') | |
| filtered_df['month'] = pd.to_datetime(filtered_df['date']).dt.month | |
| filtered_df['month_name'] = pd.to_datetime(filtered_df['date']).dt.strftime('%b') | |
| filtered_df['year'] = pd.to_datetime(filtered_df['date']).dt.year | |
| monthly_loan_amount = filtered_df.groupby(['year', 'month_name', 'month'])['loan_amount'].sum().reset_index() | |
| loan_time_series = alt.Chart(monthly_loan_amount).mark_line(point=True).encode( | |
| x=alt.X('month_name:N', sort=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], title='Month'), | |
| y=alt.Y('loan_amount:Q', title='Total Loan Amount'), | |
| color=alt.Color('year:N', title='Year'), | |
| tooltip=['year', 'month_name', 'loan_amount'] | |
| ).properties( | |
| width=700, | |
| height=400 | |
| ) | |
| st.altair_chart(loan_time_series) | |
| # Top 10 Countries with Highest Average Loan Amount | |
| st.subheader('Top 10 Countries with Highest Average Loan Amount') | |
| df_clean = df_new[df_new['country'].notna() & (df_new['country'].str.strip() != '')] | |
| df_clean['country'] = df_clean['country'].str.strip() | |
| top_10_countries_avg_loan = df_new.groupby('country')['loan_amount'].mean().nlargest(10).reset_index() | |
| top_10_chart = alt.Chart(top_10_countries_avg_loan).mark_bar().encode( | |
| x=alt.X('loan_amount:Q', title='Average Loan Amount'), | |
| y=alt.Y('country:N', sort='-x', title='Country'), | |
| color=alt.Color('country:N', legend=None) | |
| ).properties( | |
| width=600, | |
| height=400 | |
| ) | |
| st.altair_chart(top_10_chart) | |
| # Distribution of Genders | |
| st.subheader('Distribution of Borrower Genders') | |
| gender_counts = filtered_df['borrower_genders'].value_counts().reset_index() | |
| gender_counts.columns = ['borrower_genders', 'count'] | |
| gender_doughnut_chart = alt.Chart(gender_counts).mark_arc(innerRadius=80, outerRadius=120).encode( | |
| theta=alt.Theta(field="count", type="quantitative"), | |
| color=alt.Color(field="borrower_genders", type="nominal", title="Borrower Genders"), | |
| tooltip=[alt.Tooltip('borrower_genders:N', title="Gender"), alt.Tooltip('count:Q', title="Count")] | |
| ).properties( | |
| width=400, | |
| height=400 | |
| ) | |
| # text labels to the doughnut chart | |
| gender_doughnut_text = gender_doughnut_chart.mark_text(radius=150, size=15).encode( | |
| text=alt.Text('count:Q', format='.0f') | |
| ) | |
| final_chart = alt.layer(gender_doughnut_chart, gender_doughnut_text).configure_legend( | |
| labelFontSize=12, | |
| titleFontSize=14 | |
| ) | |
| st.altair_chart(final_chart) | |
| # Dataset Summary | |
| st.header('Dataset Summary') | |
| st.caption('Mean Loan Amount: ' + str(round(filtered_df['loan_amount'].mean(), 2))) | |
| st.caption('Median Loan Amount: ' + str(round(filtered_df['loan_amount'].median(), 2))) | |
| st.caption('Mode Loan Amount: ' + str(filtered_df['loan_amount'].mode()[0])) | |
| st.write(filtered_df.describe()) | |
| # Filtered dataframe | |
| st.header("Filtered Data") | |
| st.dataframe(filtered_df) | |
| st.header('K-Means Clustering') | |
| #the columns we want to do kmean to | |
| filtered_df_reduced = filtered_df[['loan_amount', 'term_in_months']] | |
| #to determine scaler | |
| fig, ax = plt.subplots(figsize=(10, 5)) | |
| filtered_df_reduced.hist(bins=100, ax=ax) | |
| st.pyplot(fig) | |
| #my chosen scaler | |
| scaler = MinMaxScaler() | |
| data_to_cluster_scaled = scaler.fit_transform(filtered_df_reduced) | |
| Sum_of_squared_distances = [] | |
| K = range(1, 10) | |
| for k in K: | |
| km = KMeans(n_clusters=k, n_init = "auto") | |
| km.fit(data_to_cluster_scaled) | |
| Sum_of_squared_distances.append(km.inertia_) | |
| fig, ax = plt.subplots() | |
| ax.plot(K, Sum_of_squared_distances, 'bx-') | |
| ax.set_xlabel('Number of Clusters (k)') | |
| ax.set_ylabel('Sum of Squared Distances') | |
| ax.set_title('Elbow Method For Optimal k') | |
| ax.grid(True) | |
| st.pyplot(fig) | |
| def k_means_simple(data, k, max_iters=100): | |
| centroids = data[np.random.choice(data.shape[0], k, replace=False)] | |
| for _ in range(max_iters): | |
| distances = np.linalg.norm(data - centroids[:, np.newaxis], axis=2) | |
| labels = np.argmin(distances, axis=0) | |
| new_centroids = np.array([data[labels == i].mean(axis=0) for i in range(k)]) | |
| if np.all(centroids == new_centroids): | |
| break | |
| centroids = new_centroids | |
| return labels, centroids | |
| labels, final_centroids = k_means_simple(data_to_cluster_scaled, 5) | |
| distances = np.linalg.norm(data_to_cluster_scaled[:, np.newaxis] - final_centroids, axis=2) | |
| nearest_centroid_indices = np.argmin(distances, axis=1) | |
| data_df = pd.DataFrame({ | |
| 'x': data_to_cluster_scaled[:, 0], | |
| 'y': data_to_cluster_scaled[:, 1], | |
| 'centroid': nearest_centroid_indices | |
| }) | |
| centroids_df = pd.DataFrame({ | |
| 'x': final_centroids[:, 0], | |
| 'y': final_centroids[:, 1], | |
| 'centroid': range(final_centroids.shape[0]) | |
| }) | |
| data_df['type'] = 'data' | |
| centroids_df['type'] = 'centroid' | |
| data_df['loan_amount'] = filtered_df['loan_amount'].values | |
| data_df['term_in_months'] = filtered_df['term_in_months'].values | |
| data_df['activity'] = filtered_df['activity'].values | |
| data_df['sector'] = filtered_df['sector'].values | |
| data_df['region'] = filtered_df['region'].values | |
| combined_df = pd.concat([data_df, centroids_df]) | |
| scatter_plot = alt.Chart(combined_df).mark_circle(size=60).encode( | |
| x='x', | |
| y='y', | |
| color=alt.Color('centroid:N', scale=alt.Scale(scheme='category10')), | |
| opacity=alt.condition( | |
| alt.datum.type == 'data', | |
| alt.value(0.6), | |
| alt.value(1) | |
| ), | |
| tooltip=[ | |
| alt.Tooltip('loan_amount:Q', title='Loan Amount'), | |
| alt.Tooltip('term_in_months:Q', title='Term (Months)'), | |
| alt.Tooltip('activity:N', title='Activity'), | |
| alt.Tooltip('sector:N', title='Sector'), | |
| alt.Tooltip('region:N', title='Region') | |
| ] | |
| ).properties( | |
| title='Reduced Data and Initial Centroids' | |
| ) | |
| st.altair_chart(scatter_plot, use_container_width=True) | |
| # Fix session states | |
| if 'country_selected' not in st.session_state: | |
| st.session_state['country_selected'] = None | |
| if 'gender_selected' not in st.session_state: | |
| st.session_state['gender_selected'] = None | |
| if 'sector_selected' not in st.session_state: | |
| st.session_state['sector_selected'] = None | |
| # Recommendation Engine based on Country, Gender, and Sector | |
| st.subheader("Loan Recommendation") | |
| # Input for country | |
| country_input = st.selectbox("Select Country", ["None"] + sorted(list(df_new['country'].unique()))) | |
| if country_input != "None": | |
| # Filter gender options based on selected country | |
| filtered_genders = df_new[df_new['country'] == country_input]['borrower_genders'].unique() | |
| gender_input = st.selectbox("Select Gender", ["None"] + list(filtered_genders)) | |
| else: | |
| gender_input = st.selectbox("Select Gender", ["None"] + list(df_new['borrower_genders'].unique())) | |
| # Input for sector based on the selected country and gender | |
| if country_input != "None" and gender_input != "None": | |
| # Filter sector options based on selected country and gender | |
| filtered_sectors = df_new[(df_new['country'] == country_input) & (df_new['borrower_genders'] == gender_input)]['sector'].unique() | |
| sector_input = st.selectbox("Select Sector", ["None"] + list(filtered_sectors)) | |
| else: | |
| sector_input = st.selectbox("Select Sector", ["None"] + list(df_new['sector'].unique())) | |
| # Generate recommendations based on country, gender, and sector | |
| if country_input != "None" and gender_input != "None" and sector_input != "None": | |
| # Filter the DataFrame based on selected country, gender, and sector | |
| user_filtered_df = df_new[ | |
| (df_new['country'] == country_input) & | |
| (df_new['borrower_genders'] == gender_input) & | |
| (df_new['sector'] == sector_input) | |
| ].reset_index(drop=True) | |
| if not user_filtered_df.empty: | |
| # Align the filtered DataFrame's indices with the scaled data by resetting both | |
| data_to_cluster_scaled_filtered = data_to_cluster_scaled[:len(user_filtered_df)] | |
| # Compute similarity matrix for filtered data | |
| similarity_matrix = cosine_similarity(data_to_cluster_scaled_filtered) | |
| # Get the most similar loans (top 3) | |
| similar_loans_indices = np.argsort(similarity_matrix[0])[::-1][1:4] | |
| # Display recommended loans | |
| recommendations = user_filtered_df.iloc[similar_loans_indices][['country', 'borrower_genders', 'sector', 'loan_amount', 'term_in_months']] | |
| st.write("Recommended Loans:") | |
| st.dataframe(recommendations) | |
| else: | |
| st.write("No matching loans found for the selected country, gender, and sector.") | |
| else: | |
| st.write("Please select a country, gender, and sector.") | |