Spaces:

Niharmahesh
/

job_easz

Running

File size: 11,494 Bytes

import html
import io
import logging
import math
import re
import time
from datetime import datetime, timedelta

import pandas as pd
import plotly.express as px
import pyarrow as pa
import pyarrow.parquet as pq
import streamlit as st
from huggingface_hub import HfApi
# Set page config for a wider layout and custom theme.
# NOTE(review): Streamlit has historically required set_page_config to be the
# first Streamlit command in the script, so keep it ahead of the st.markdown
# calls below — confirm against the Streamlit version in use.
st.set_page_config(layout="wide", page_title="Job Listings Dashboard")

# Custom CSS for black background and styling.
# Injected as raw HTML, hence unsafe_allow_html=True. The rules cover the app
# container, buttons, select widgets, data frames, Plotly chart containers,
# and a `.big-font` helper class.
st.markdown("""
<style>
    .stApp {
        background-color: #000000;
        color: #FFFFFF;
    }
    .stButton>button {
        background-color: #4e79a7;
        color: white;
    }
    .stSelectbox, .stMultiSelect {
        color: #FFFFFF;
    }
    .stDataFrame {
        background-color: #1E1E1E;
    }
    .plotly-graph-div {
        background-color: #1E1E1E;
    }
    .big-font {
        font-size: 48px;
        font-weight: bold;
        text-align: center;
    }
</style>
""", unsafe_allow_html=True)

# Second style block: centre every <h1> (the page title rendered by st.title).
st.markdown("""
    <style>
    h1 {
        text-align: center;
    }
    </style>
    """, unsafe_allow_html=True)

# Hugging Face setup.
# The token and user name are read from Streamlit's secrets store; together
# with DATASET_NAME they form the "<user>/<dataset>" repo id that
# load_and_concat_data downloads from.
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
DATASET_NAME = "jobeasz"

@st.cache_data(ttl=3600)
def load_and_concat_data(year=2024):
    """Download every CSV in the Hugging Face dataset and return one cleaned frame.

    The result is cached by Streamlit for one hour (``ttl=3600``).

    Args:
        year: Calendar year of ``date_posted`` to keep. Pass ``None`` to keep
            postings from every year. Defaults to 2024.

    Returns:
        A DataFrame with the columns ``site``, ``job_url``, ``title``,
        ``company``, ``location``, ``job_type``, ``date_posted``,
        ``is_remote`` and ``company_url``. Rows are de-duplicated, rows whose
        ``date_posted`` cannot be parsed are dropped, titles and locations are
        lower-cased, and a trailing ", us" / ", usa" is stripped from
        locations. An empty DataFrame is returned when no CSV file could be
        loaded.

    Raises:
        KeyError: If the concatenated data lacks one of the expected columns.
    """
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    api = HfApi()
    # Authenticate the listing as well as the downloads so that a private
    # dataset repo can be enumerated.
    dataset_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN)
    csv_files = [file for file in dataset_files if file.endswith('.csv')]

    logger = logging.getLogger(__name__)
    all_data = []
    for file in csv_files:
        try:
            local_path = api.hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset", token=HF_TOKEN)
            all_data.append(pd.read_csv(local_path, engine='pyarrow'))
        except Exception:
            # Best effort: one unreadable file must not take the dashboard
            # down, but leave a trace instead of skipping it silently.
            logger.warning("Skipping dataset file %r: download or parse failed", file, exc_info=True)

    if not all_data:
        return pd.DataFrame()

    concatenated_df = pd.concat(all_data, ignore_index=True)

    columns_to_keep = [
        'site', 'job_url', 'title', 'company', 'location',
        'job_type', 'date_posted', 'is_remote', 'company_url'
    ]
    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')

    # Drop duplicates and rows with NaT in date_posted
    filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])

    # Keep only the requested year. `.dt.year` yields integers, so the
    # comparison must be against an int; comparing with the string '2024'
    # is False for every row and empties the frame.
    if year is not None:
        filtered_df = filtered_df[filtered_df['date_posted'].dt.year == int(year)]

    # Work on an independent copy so the column assignments below do not
    # target a slice of the previous frame.
    filtered_df = filtered_df.copy()

    # Convert titles to lowercase
    filtered_df['title'] = filtered_df['title'].str.lower()

    trailing_country = re.compile(r',\s*(us|usa)$')

    def clean_location(location):
        """Lower-case a location and strip a trailing ', us' / ', usa'."""
        if not isinstance(location, str):
            return location  # NaN / None / non-text values pass through unchanged
        return trailing_country.sub('', location.lower())

    filtered_df['location'] = filtered_df['location'].apply(clean_location)

    return filtered_df
    
@st.cache_data()
def get_unique_values(df):
    """Collect the distinct values of each filterable column.

    Returns a dict whose keys are ``companies``, ``locations``, ``job_types``,
    ``Role_Name`` and ``Date_posted``; each value is the result of
    ``Series.unique()`` on the matching column of ``df``.
    """
    # (result key, source column) pairs, in the order the keys are emitted.
    key_to_column = (
        ('companies', 'company'),
        ('locations', 'location'),
        ('job_types', 'job_type'),
        ('Role_Name', 'title'),
        ('Date_posted', 'date_posted'),
    )
    return {key: df[column].unique() for key, column in key_to_column}

def create_chart(data, _x, y, title, color_sequence):
    """Build a Plotly bar chart styled for the dark theme.

    ``data``, ``_x``, ``y``, ``title`` and ``color_sequence`` are forwarded to
    ``px.bar``; the figure is returned with transparent plot and paper
    backgrounds and white text.
    """
    dark_layout = {
        'plot_bgcolor': 'rgba(0,0,0,0)',
        'paper_bgcolor': 'rgba(0,0,0,0)',
        'font_color': '#FFFFFF',
    }
    bar_fig = px.bar(
        data,
        x=_x,
        y=y,
        title=title,
        color_discrete_sequence=color_sequence,
    )
    bar_fig.update_layout(**dark_layout)
    return bar_fig

def create_time_series(df, time_unit='day'):
    """Build a line chart of the number of postings per date.

    Args:
        df: Frame with a datetime ``date_posted`` column.
        time_unit: ``'month'`` aggregates postings per calendar month and
            labels the x-axis with one tick per month; any other value groups
            by the raw ``date_posted`` values.

    Returns:
        The Plotly figure, styled with transparent backgrounds and white text.
    """
    monthly = time_unit == 'month'

    if monthly:
        # Bucket by month period, then turn the period back into a timestamp
        # so Plotly gets a datetime axis.
        month_key = df['date_posted'].dt.to_period('M')
        counts = df.groupby(month_key).size().reset_index(name='count')
        counts['date_posted'] = counts['date_posted'].dt.to_timestamp()
    else:
        counts = df.groupby('date_posted').size().reset_index(name='count')

    line_fig = px.line(
        counts,
        x='date_posted',
        y='count',
        title="Job Postings Over Time",
        color_discrete_sequence=['#4e79a7'],
    )
    line_fig.update_layout(
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font_color='#FFFFFF',
        xaxis_title="Date",
        yaxis_title="Number of Job Postings",
    )

    if monthly:
        # One tick per month, labelled like "Jan 2024".
        line_fig.update_xaxes(dtick="M1", tickformat="%b %Y")

    return line_fig

@st.cache_data
def prepare_dashboard_data(df):
    """Pre-compute the aggregates shown on the dashboard.

    Returns a 4-tuple: the 10 most frequent companies, the 10 most frequent
    locations, the 20 most frequent titles (each a ``value_counts`` Series),
    and a frame with the number of postings per ``date_posted`` value.
    """
    def most_frequent(column, limit):
        return df[column].value_counts().head(limit)

    postings_per_date = df.groupby('date_posted').size().reset_index(name='count')

    return (
        most_frequent('company', 10),
        most_frequent('location', 10),
        most_frequent('title', 20),
        postings_per_date,
    )

def display_dashboard(df):
    """Render the Dashboard page: headline metrics, three bar charts and a monthly trend line."""
    # The per-date frame returned last is not used here; the trend chart is
    # rebuilt from df by create_time_series.
    top_companies, top_locations, top_job_titles, _ = prepare_dashboard_data(df)

    posted = df['date_posted']
    posted_today = posted.dt.date == datetime.now().date()
    jobs_today = len(df[posted_today])

    def show_bar(counts, chart_title, colour):
        """Draw one value_counts Series as a full-width bar chart."""
        bar = create_chart(counts, counts.index, counts.values, chart_title, [colour])
        st.plotly_chart(bar, use_container_width=True)

    overview_col, companies_col = st.columns(2)

    with overview_col:
        st.subheader("Job Postings Overview")
        st.metric("Total Job Postings", len(df))
        st.metric("Unique Companies", df['company'].nunique())
        st.metric("Job Postings Today", jobs_today)

        min_date = posted.min().date()
        max_date = posted.max().date()
        st.write(f"Job postings from {min_date} to {max_date}")

    with companies_col:
        show_bar(top_companies, "Top 10 Companies", '#4e79a7')

    # Job Postings Over Time Chart
    st.plotly_chart(create_time_series(df, time_unit='month'), use_container_width=True)

    locations_col, titles_col = st.columns(2)

    with locations_col:
        show_bar(top_locations, "Top 10 Locations", '#f28e2b')

    with titles_col:
        show_bar(top_job_titles, "Top 20 Job Titles", '#59a14f')
@st.cache_data
def filter_dataframe(df, companies, locations, job_types, Role_Name, Date_posted):
    """Return the rows of ``df`` that match every non-empty selection.

    Each argument after ``df`` is a collection of accepted values for one
    column (``company``, ``location``, ``job_type``, ``title``,
    ``date_posted``). An empty selection places no restriction on its column.
    """
    selections = (
        ('company', companies),
        ('location', locations),
        ('job_type', job_types),
        ('title', Role_Name),
        ('date_posted', Date_posted),
    )

    result = df
    for column, accepted in selections:
        if accepted:
            result = result[result[column].isin(accepted)]
    return result

def display_data_explorer(df):
    """Render the Data Explorer page: optional filters plus a paginated HTML table.

    In "Filtered Data" mode five multiselects (company, location, job type,
    role, date posted) narrow ``df`` through ``filter_dataframe``. The result
    is shown 15 rows per page, with ``job_url`` and ``company_url`` rendered
    as "Link" anchors.

    Args:
        df: The cleaned job listings frame produced by ``load_and_concat_data``.
    """
    st.subheader("Data Explorer")

    show_all = st.radio("Display", ("All Data", "Filtered Data"))

    if show_all == "Filtered Data":
        unique_values = get_unique_values(df)
        col1, col2, col3, col4, col5 = st.columns(5)
        with col1:
            companies = st.multiselect("Select Companies", options=unique_values['companies'])
        with col2:
            locations = st.multiselect("Select Locations", options=unique_values['locations'])
        with col3:
            job_types = st.multiselect("Select Job Types", options=unique_values['job_types'])
        with col4:
            Role_Name = st.multiselect("Select Role Types", options=unique_values['Role_Name'])
        with col5:
            Date_posted = st.multiselect("Select Date Posted", options=unique_values['Date_posted'])

        filtered_df = filter_dataframe(df, companies, locations, job_types, Role_Name, Date_posted)
    else:
        filtered_df = df

    st.write(f"Showing {len(filtered_df)} job listings")

    # Pagination
    items_per_page = 15
    # Always expose at least one page: with zero matching rows the ceil is 0,
    # and number_input rejects a max_value below its min_value of 1.
    num_pages = max(1, math.ceil(len(filtered_df) / items_per_page))

    col1, col2, col3 = st.columns([1, 3, 1])
    with col2:
        page = st.number_input("Page", min_value=1, max_value=num_pages, value=1)

    start_idx = (page - 1) * items_per_page
    end_idx = start_idx + items_per_page

    # Copy the slice so the column rewrites below never touch (or warn about)
    # the cached frame it was cut from.
    page_df = filtered_df.iloc[start_idx:end_idx].copy()

    def make_clickable(url):
        """Wrap a URL in an anchor tag; missing URLs render as an empty cell."""
        if pd.isna(url):
            return ''
        safe_url = html.escape(str(url), quote=True)
        return f'<a href="{safe_url}" target="_blank" style="color: #4e79a7;">Link</a>'

    def escape_text(value):
        """HTML-escape text values; leave dates, booleans and NaN untouched."""
        return html.escape(value) if isinstance(value, str) else value

    # The table is emitted with escape=False so the anchors survive, which
    # means every other cell must be escaped by hand: the listings are scraped
    # from third-party sites and may contain markup.
    link_columns = ('job_url', 'company_url')
    for column in page_df.columns:
        if column in link_columns:
            page_df[column] = page_df[column].apply(make_clickable)
        else:
            page_df[column] = page_df[column].map(escape_text)

    st.write(page_df.to_html(escape=False, index=False), unsafe_allow_html=True)

    col1, col2, col3 = st.columns([1, 3, 1])
    with col2:
        st.write(f"Page {page} of {num_pages}")
def display_about_page():
    """Render the About page: what the app does, how to use it, data source and contact.

    The usage text is kept in step with the controls the other pages expose
    (the Data Explorer offers five filters). The LinkedIn profile URL is
    defined once and reused for both the text link and the logo button.
    """
    linkedin_url = "https://www.linkedin.com/in/nihar-palem-1b955a183/"

    st.markdown(f"""
    ## What is this application?
    The Job Listings Dashboard is a powerful tool designed to provide insights into the job market. It offers a comprehensive view of job postings, allowing users to explore trends, top companies, locations, and job titles.
    ### Key Features:
    - **Interactive Dashboard**: Visualize job market trends with dynamic charts and graphs.
    - **Data Explorer**: Dive deep into individual job listings with advanced filtering options.
    - **Real-time Data**: Fetch the latest job data from our Hugging Face dataset.
    ## How to use this application
    ### Dashboard
    1. Navigate to the Dashboard using the sidebar.
    2. View overall statistics such as total job postings, unique companies, and today's postings.
    3. Explore interactive charts showing:
       - Top companies hiring
       - Job postings over time
       - Top locations for job opportunities
       - Most common job titles
    ### Data Explorer
    1. Switch to the Data Explorer using the sidebar.
    2. Choose between viewing all data or applying filters.
    3. Use the multi-select dropdowns to filter by:
       - Companies
       - Locations
       - Job Types
       - Role Types
       - Date Posted
    4. Browse the filtered job listings table.
    5. Click on job or company links to view more details on the original posting site.
    ## Data Source
    This application fetches data from my private dataset, which is scraped from various job hosting portals and updated daily.
    ## Contact
    For questions, feedback, or collaboration opportunities, feel free to reach out:
    - LinkedIn: [Nihar Palem]({linkedin_url})
    """)

    # Add a clickable LinkedIn button
    st.markdown(f"""
    <a href="{linkedin_url}" target="_blank">
        <img src="https://content.linkedin.com/content/dam/me/business/en-us/amp/brand-site/v2/bg/LI-Logo.svg.original.svg" width="100">
    </a>
    """, unsafe_allow_html=True)
def main():
    """App entry point: load the data, then render the page picked in the sidebar."""
    st.title("Job Easz")

    jobs = load_and_concat_data()
    if jobs.empty:
        st.error("No data available. Please check your dataset.")
        return

    # Sidebar for navigation
    st.sidebar.title("Navigation")
    selected = st.sidebar.radio("Go to", ["Dashboard", "Data Explorer","About"])

    # Map each sidebar label to the callable that draws that page.
    renderers = {
        "Dashboard": lambda: display_dashboard(jobs),
        "Data Explorer": lambda: display_data_explorer(jobs),
        "About": display_about_page,
    }
    render = renderers.get(selected)
    if render is not None:
        render()


if __name__ == "__main__":
    main()