Spaces:

Mahesh9
/

CFPB-Complaint-Classifier

Sleeping

File size: 9,993 Bytes

210b96e

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Lookup table from two-letter postal abbreviation to full name.
# It holds the 50 states plus the District of Columbia; territories and
# minor outlying islands are intentionally left out.
state_mapping = {
    'FL': 'Florida',
    'CA': 'California',
    'TX': 'Texas',
    'GA': 'Georgia',
    'NY': 'New York',
    'IL': 'Illinois',
    'PA': 'Pennsylvania',
    'NC': 'North Carolina',
    'NJ': 'New Jersey',
    'MD': 'Maryland',
    'VA': 'Virginia',
    'OH': 'Ohio',
    'MI': 'Michigan',
    'SC': 'South Carolina',
    'AZ': 'Arizona',
    'TN': 'Tennessee',
    'NV': 'Nevada',
    'LA': 'Louisiana',
    'AL': 'Alabama',
    'MO': 'Missouri',
    'MA': 'Massachusetts',
    'IN': 'Indiana',
    'AR': 'Arkansas',
    'WA': 'Washington',
    'CO': 'Colorado',
    'MS': 'Mississippi',
    'CT': 'Connecticut',
    'MN': 'Minnesota',
    'WI': 'Wisconsin',
    'KY': 'Kentucky',
    'UT': 'Utah',
    'DE': 'Delaware',
    'OR': 'Oregon',
    'OK': 'Oklahoma',
    'DC': 'District of Columbia',
    'KS': 'Kansas',
    'IA': 'Iowa',
    'NM': 'New Mexico',
    'NE': 'Nebraska',
    'HI': 'Hawaii',
    'RI': 'Rhode Island',
    'ID': 'Idaho',
    'WV': 'West Virginia',
    'NH': 'New Hampshire',
    'ME': 'Maine',
    'MT': 'Montana',
    'ND': 'North Dakota',
    'AK': 'Alaska',
    'SD': 'South Dakota',
    'WY': 'Wyoming',
    'VT': 'Vermont',
}

# Function to plot the n most frequent values of a column
def plot_top_n(df, column, title, n=5, palette_name=None):
    """Build a horizontal bar chart of the ``n`` most frequent values of a column.

    Args:
        df: DataFrame holding the data.
        column: Name of the column whose values are counted.
        title: Accepted for the callers' benefit but currently not applied
            to the figure.
        n: Number of most frequent values to show.
        palette_name: Optional seaborn palette name. When given, ``n``
            colours from it are used for the bars; otherwise Plotly Express
            falls back to its default colour sequence.

    Returns:
        The Plotly figure, with one bar per value and the legend hidden.
    """
    # A custom colour list is only built when a palette was requested.
    if palette_name:
        bar_colors = sns.color_palette(palette_name, n_colors=n).as_hex()
    else:
        bar_colors = None

    # Frequency table limited to the n most common values, with the value
    # in a column named after `column` and its frequency in 'Count'.
    frequency_table = (
        df[column]
        .value_counts()
        .head(n)
        .rename_axis(column)
        .reset_index(name='Count')
    )

    # Each value gets its own colour, so the legend would only repeat the
    # axis labels; it is switched off.
    fig = px.bar(
        frequency_table,
        x='Count',
        y=column,
        orientation='h',
        color=column,
        color_discrete_sequence=bar_colors,
    )
    fig.update_layout(showlegend=False)
    return fig

# 1. Plotting top 5 most common products
def plot_top_5_products(df_new):
    """Return the bar chart of the five most frequent ``'Product'`` values.

    Delegates to ``plot_top_n`` with its default ``n`` (5) and no custom
    palette.
    """
    return plot_top_n(
        df=df_new,
        column='Product',
        title='Top 5 Most Common Products',
    )

# 2. Plotting Top 5 common issues
def plot_top_5_issues(df_new):
    """Return the bar chart of the five most frequent ``'Issue'`` values.

    Delegates to ``plot_top_n`` with its default ``n`` (5), coloured with
    the seaborn ``'plasma'`` palette.
    """
    return plot_top_n(
        df=df_new,
        column='Issue',
        title='Top 5 Most Common Issues',
        palette_name='plasma',
    )

# 3. Plotting top 5 issues in each product category
def plot_top_5_issues_in_product(df_new):
    """Plot, per product, a stacked bar of its five most frequent issues.

    Args:
        df_new: DataFrame with ``'Product'`` and ``'Issue'`` columns.

    Returns:
        A Plotly Express stacked bar figure (900x600, legend hidden) with
        one bar per product, split by issue. Products are ordered along the
        x axis by their total number of complaints, largest first.
    """
    # Step 1: count complaints for every (Product, Issue) pair.
    grouped_data = df_new.groupby(['Product', 'Issue']).size().reset_index(name='Count')

    # Products ranked by their overall complaint volume. The total covers
    # every issue of the product, not only the five that end up plotted.
    sorted_products = (
        grouped_data.groupby('Product')['Count']
        .sum()
        .reset_index(name='TotalIssues')
        .sort_values('TotalIssues', ascending=False)
    )

    # Step 2: keep the five most frequent issues of each product.
    # A stable descending sort followed by groupby.head(5) selects the same
    # rows as a per-group nlargest(5) (ties keep their original row order)
    # while leaving 'Product' as a regular column. The previous
    # groupby.apply version depended on apply operating on the grouping
    # column, which pandas has deprecated.
    top_issues_per_product = (
        grouped_data.sort_values('Count', ascending=False, kind='stable')
        .groupby('Product', sort=False)
        .head(5)
    )

    # Attach each product's total so rows can be ordered by product volume
    # first and by issue count second.
    top_issues_per_product = top_issues_per_product.merge(sorted_products, on='Product')
    top_issues_per_product = top_issues_per_product.sort_values(
        by=['TotalIssues', 'Count'], ascending=[False, False]
    )

    # Step 3: stacked bar chart; category_orders pins the product order on
    # the x axis to the ranking computed above.
    fig = px.bar(
        top_issues_per_product,
        x='Product',
        y='Count',
        color='Issue',
        labels={'Count': 'Number of Complaints'},
        category_orders={'Product': sorted_products['Product'].tolist()},
    )

    # Hide the legend and fix the figure size.
    fig.update_layout(showlegend=False, width=900, height=600)
    return fig

# 4. Companies with the most complaints in a given year (2023 by default)
def plot_top_10_companies_complaints(df_new, year=2023):
    """Plot the ten companies with the most complaints received in ``year``.

    Args:
        df_new: DataFrame with a ``'Company'`` column and a
            ``'Date received'`` column. The latter is read through the
            ``.dt`` accessor, so it must hold datetime-like values.
        year: Calendar year whose complaints are counted. Defaults to 2023,
            the year that used to be hard-coded.

    Returns:
        A Plotly Express horizontal bar figure (800x500) with one bar per
        company, coloured by complaint count and without a colour bar.
    """
    # Keep only the complaints received in the requested year.
    df_year = df_new[df_new['Date received'].dt.year == year]

    # Complaints per company, most frequent first.
    company_complaint_counts = df_year['Company'].value_counts()

    top_n = 10
    # Take the ten largest, then flip to ascending order so the bars are
    # handed to the horizontal chart smallest-first.
    top_companies = company_complaint_counts.head(top_n).sort_values(ascending=True)

    # Horizontal bars coloured by their own value on a green -> yellow -> red
    # scale. Arrays are passed directly, so Plotly Express refers to them as
    # 'x' and 'y' in `labels`.
    fig = px.bar(
        x=top_companies.values,
        y=top_companies.index,
        orientation='h',
        color=top_companies.values,
        color_continuous_scale=[(0.0, "green"),
                                (0.05, "yellow"),
                                (1.0, "red")],
        labels={'x': 'Number of Complaints', 'y': 'Company'}
    )

    fig.update_layout(
        xaxis=dict(
            title='Number of Complaints',
        ),
        yaxis=dict(
            tickfont=dict(size=10),
        ),
        height=500,
        width=800,
        # Hide the colour bar: bar length already conveys the value.
        coloraxis_showscale=False,
    )
    return fig

# 5. Top 10 States with the Most Complaints
def plot_top_10_states_most_complaints(df_new):
    """Plot the ten states with the highest number of complaints.

    Args:
        df_new: DataFrame whose ``'State'`` column holds two-letter state
            abbreviations. It is only read; no column is added to it.

    Returns:
        A Plotly Express horizontal bar figure (900x500) with one bar per
        state, coloured by complaint count and without a colour bar.
    """
    # Translate abbreviations to full names in a local Series rather than
    # writing a new column into the caller's DataFrame. Codes missing from
    # state_mapping become NaN and are dropped by value_counts below.
    state_names = df_new['State'].map(state_mapping)

    # Complaints per state, most frequent first.
    state_complaint_counts = state_names.value_counts()

    # Ten states with the highest counts.
    top_n = 10
    top_states = state_complaint_counts.head(top_n)

    # Arrays are passed directly, so Plotly Express refers to them as 'x'
    # and 'y'; category_orders on 'y' pins the order of the state bars.
    fig = px.bar(
        x=top_states.values,
        y=top_states.index,
        orientation='h',
        color=top_states.values,  # Colour each bar by its count
        color_continuous_scale='Turbo',
        labels={'x': 'Number of Complaints', 'y': 'State'},
        category_orders={'y': top_states.index.tolist()}
    )

    fig.update_layout(
        yaxis=dict(
            tickfont=dict(size=10),
        ),
        xaxis=dict(
            tickangle=0,
        ),
        height=500,
        width=900,
        # Hide the colour bar: bar length already conveys the value.
        coloraxis_showscale=False,
    )
    return fig

# 6. Top 10 States with the Least Complaints
def plot_top_10_states_least_complaints(df_new):
    """Plot the ten states with the lowest number of complaints.

    Only states that occur at least once in the data can appear, because
    the ranking is built from observed values.

    Args:
        df_new: DataFrame whose ``'State'`` column holds two-letter state
            abbreviations. It is only read; no column is added to it.

    Returns:
        A Plotly Express horizontal bar figure (900x500) with one bar per
        state, coloured by complaint count and without a colour bar.
    """
    # Translate abbreviations to full names in a local Series rather than
    # writing a new column into the caller's DataFrame. Codes missing from
    # state_mapping become NaN and are dropped by value_counts below.
    state_names = df_new['State'].map(state_mapping)

    # Complaints per state, most frequent first.
    state_complaint_counts = state_names.value_counts()

    # The counts are sorted in descending order, so the last ten entries
    # are the ten states with the fewest complaints.
    top_n = 10
    top_states = state_complaint_counts.tail(top_n)

    # Arrays are passed directly, so Plotly Express refers to them as 'x'
    # and 'y'. The states sit on the y axis, so their order has to be
    # declared under 'y'; it was previously keyed on 'x', the numeric count
    # axis, where a list of state names cannot order anything.
    fig = px.bar(
        x=top_states.values,
        y=top_states.index,
        orientation='h',
        color=top_states.values,  # Colour each bar by its count
        color_continuous_scale='Temps',
        labels={'x': 'Number of Complaints', 'y': 'State'},
        category_orders={'y': top_states.index.tolist()}
    )

    fig.update_layout(
        yaxis=dict(
            tickfont=dict(size=10),
        ),
        xaxis=dict(
            tickangle=0,
        ),
        height=500,
        width=900,
        # Hide the colour bar: bar length already conveys the value.
        coloraxis_showscale=False,
    )

    return fig

# 7. Number of Complaints by Year
def complaints_by_year(df_new):
    """Plot the number of complaints received per calendar month as a line.

    Rows received in 2024 are left out. Despite the function name and the
    'Year' axis label, counts are aggregated per month.

    Args:
        df_new: DataFrame with a ``'Date received'`` column. It is read
            through the ``.dt`` accessor, so it must hold datetime-like
            values.

    Returns:
        A Plotly Express line figure (900x400) of complaints per month.
    """
    received = df_new['Date received']

    # Drop everything received in 2024, then label each remaining row with
    # its month rendered as a string.
    kept = received[received.dt.year != 2024]
    month_labels = kept.dt.to_period('M').astype(str)

    # One row per month with the number of complaints received in it.
    per_month = (
        month_labels.to_frame(name='MonthYear')
        .groupby('MonthYear')
        .size()
        .reset_index(name="NumComplaints")
    )

    fig = px.line(
        per_month,
        x='MonthYear',
        y='NumComplaints',
        labels={'MonthYear': 'Year', 'NumComplaints': 'Number of Complaints'},
    )
    fig.update_layout(width=900, height=400)
    return fig

# 8. Number of Complaints by State
def complaints_across_states(df_new, year=2023):
    """Draw a US choropleth of the number of complaints per state in ``year``.

    Args:
        df_new: DataFrame with a ``'State'`` column of two-letter
            abbreviations and a ``'Date received'`` column. The latter is
            read through the ``.dt`` accessor, so it must hold
            datetime-like values.
        year: Calendar year whose complaints are counted. Defaults to 2023,
            the year that used to be hard-coded.

    Returns:
        A Plotly figure (1000x400) with states shaded by complaint count
        and each state's abbreviation drawn as a text label.
    """
    # Keep only the complaints received in the requested year.
    df_year = df_new[df_new['Date received'].dt.year == year]

    # Complaints per state abbreviation.
    state_complaints = df_year.groupby('State').size().reset_index(name='Num_complaints')

    # Full name for the hover label; codes that are not in state_mapping
    # fall back to the abbreviation itself.
    state_complaints['Full_state_name'] = state_complaints['State'].map(
        lambda code: state_mapping.get(code, code)
    )

    fig = px.choropleth(state_complaints,
                        locations='State',
                        locationmode='USA-states',
                        color='Num_complaints',
                        color_continuous_scale='Inferno',
                        scope="usa",
                        hover_name='Full_state_name')

    # Overlay the abbreviations as text-only markers; hover is skipped on
    # this layer so the choropleth's own hover labels are the ones shown.
    fig.add_scattergeo(
        locations=state_complaints['State'],
        locationmode='USA-states',
        text=state_complaints['State'],
        mode='text',
        hoverinfo='skip',
        textfont=dict(size = 8.5,color='white'))

    fig.update_layout(
        autosize = True,
        geo=dict(
            landcolor='rgb(217, 217, 217)',
            lakecolor='rgb(255, 255, 255)',
            bgcolor='rgb(255, 255, 255)'
        ),
        paper_bgcolor='rgb(255, 255, 255)',
        margin={"r":0,"t":50,"l":0,"b":0},
        width=1000,
        height=400
    )
    return fig