job_easz / app.py
Niharmahesh's picture
Update app.py
44680e5 verified
raw
history blame
8.4 kB
import streamlit as st
import pandas as pd
import plotly.express as px
from huggingface_hub import HfApi
import io
from datetime import datetime, timedelta
import time
# Set page config for a wider layout and custom theme.
# Must run before any other Streamlit call in the script.
st.set_page_config(layout="wide", page_title="Job Listings Dashboard")
# Custom CSS for black background and styling of buttons, selects,
# dataframes, plotly charts and the big-font headline class.
st.markdown("""
<style>
.stApp {
background-color: #000000;
color: #FFFFFF;
}
.stButton>button {
background-color: #4e79a7;
color: white;
}
.stSelectbox, .stMultiSelect {
color: #FFFFFF;
}
.stDataFrame {
background-color: #1E1E1E;
}
.plotly-graph-div {
background-color: #1E1E1E;
}
.big-font {
font-size: 48px;
font-weight: bold;
text-align: center;
}
</style>
""", unsafe_allow_html=True)
# Hugging Face setup: token and username come from Streamlit secrets
# (.streamlit/secrets.toml or the Space's secret store).
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
# Name of the HF dataset repo holding the scraped job-listing CSVs.
DATASET_NAME = "jobeasz"
@st.cache_data(ttl=3600)
def load_and_concat_data():
    """Download every CSV from the HF dataset repo and merge them.

    Cached for one hour. Each CSV is downloaded and parsed individually;
    files that fail (corrupt, missing, auth error) are skipped so a
    single bad file cannot take down the dashboard.

    Returns:
        DataFrame with the listing columns, `date_posted` parsed to
        datetime, duplicates and unparseable dates dropped. Empty
        DataFrame when no file could be loaded.
    """
    api = HfApi()
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    dataset_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    csv_files = [file for file in dataset_files if file.endswith('.csv')]

    all_data = []
    for file in csv_files:
        try:
            file_content = api.hf_hub_download(
                repo_id=repo_id, filename=file, repo_type="dataset", token=HF_TOKEN
            )
            all_data.append(pd.read_csv(file_content))
        except Exception:
            # Deliberate best-effort: skip files that can't be
            # downloaded or parsed instead of failing the whole load.
            continue

    if not all_data:
        return pd.DataFrame()

    columns_to_keep = [
        'site', 'job_url', 'title', 'company', 'location',
        'job_type', 'date_posted', 'is_remote', 'company_url'
    ]
    concatenated_df = pd.concat(all_data, ignore_index=True)
    # reindex (instead of a plain column selection) tolerates source
    # files missing some columns: absent columns become all-NaN rather
    # than raising KeyError.
    filtered_df = concatenated_df.reindex(columns=columns_to_keep).reset_index(drop=True)
    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
    # Drop duplicates and rows with NaT in date_posted
    filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
    return filtered_df
@st.cache_data()
def get_unique_values(df):
    """Return cached distinct values for the filterable columns.

    Maps 'companies'/'locations'/'job_types' keys to the unique values
    of the corresponding DataFrame columns.
    """
    column_for_key = {
        'companies': 'company',
        'locations': 'location',
        'job_types': 'job_type',
    }
    return {key: df[column].unique() for key, column in column_for_key.items()}
def create_chart(data, _x, y, title, color_sequence):
    """Build a bar chart with a transparent background for the dark theme."""
    transparent = 'rgba(0,0,0,0)'
    chart = px.bar(data, x=_x, y=y, title=title,
                   color_discrete_sequence=color_sequence)
    chart.update_layout(plot_bgcolor=transparent,
                        paper_bgcolor=transparent,
                        font_color='#FFFFFF')
    return chart
def create_time_series(df):
    """Build a line chart of postings per `date_posted`, dark-theme styled."""
    daily_counts = df.groupby('date_posted').size().reset_index(name='count')
    chart = px.line(daily_counts, x='date_posted', y='count',
                    title="Job Postings Over Time",
                    color_discrete_sequence=['#4e79a7'])
    chart.update_layout(plot_bgcolor='rgba(0,0,0,0)',
                        paper_bgcolor='rgba(0,0,0,0)',
                        font_color='#FFFFFF')
    return chart
def parse_locations(df):
    """Normalize raw `location` strings onto a canonical US city list.

    Adds a `parsed_location` column: the first canonical entry that
    appears as a substring of the raw location, or 'Other' when nothing
    matches — including missing (NaN) or non-string locations, which
    previously raised TypeError on the `in` containment test.

    Args:
        df: DataFrame with a 'location' column.

    Returns:
        The same DataFrame with `parsed_location` added (mutated in place).
    """
    valid_locations = [
        "New York, NY", "San Francisco, CA", "Los Angeles, CA", "Chicago, IL", "Houston, TX",
        "Phoenix, AZ", "Philadelphia, PA", "San Antonio, TX", "San Diego, CA", "Dallas, TX",
        "San Jose, CA", "Austin, TX", "Jacksonville, FL", "Fort Worth, TX", "Columbus, OH",
        "San Francisco Bay Area", "Washington, D.C.", "Boston, MA", "Seattle, WA", "Denver, CO",
        "Nashville, TN", "Baltimore, MD", "Portland, OR", "Las Vegas, NV", "Milwaukee, WI",
        "Albuquerque, NM", "Tucson, AZ", "Fresno, CA", "Sacramento, CA", "Long Beach, CA",
        "Kansas City, MO", "Mesa, AZ", "Atlanta, GA", "Colorado Springs, CO", "Raleigh, NC",
        "Omaha, NE", "Miami, FL", "Oakland, CA", "Minneapolis, MN", "Tulsa, OK",
        "Cleveland, OH", "Wichita, KS", "Arlington, TX", "New Orleans, LA", "Bakersfield, CA",
        "Tampa, FL", "Honolulu, HI", "Aurora, CO", "Anaheim, CA", "Santa Ana, CA",
        "St. Louis, MO", "Riverside, CA", "Corpus Christi, TX", "Lexington, KY", "Pittsburgh, PA",
        "Anchorage, AK", "Stockton, CA", "Cincinnati, OH", "St. Paul, MN", "Toledo, OH",
        "Newark, NJ", "Greensboro, NC", "Plano, TX", "Henderson, NV", "Lincoln, NE",
        "Buffalo, NY", "Fort Wayne, IN", "Jersey City, NJ", "Chula Vista, CA", "Orlando, FL",
        "St. Petersburg, FL", "Norfolk, VA", "Chandler, AZ", "Laredo, TX", "Madison, WI",
        "Durham, NC", "Lubbock, TX", "Winston-Salem, NC", "Garland, TX", "Glendale, AZ",
        "Hialeah, FL", "Reno, NV", "Baton Rouge, LA", "Irvine, CA", "Chesapeake, VA",
        "Irving, TX", "Scottsdale, AZ", "North Las Vegas, NV", "Fremont, CA", "Gilbert, AZ",
        "San Bernardino, CA", "Boise, ID", "Birmingham, AL"
    ]

    def _canonicalize(raw):
        # Guard against NaN/None/non-string values, which would raise
        # TypeError on substring containment.
        if not isinstance(raw, str):
            return 'Other'
        return next((loc for loc in valid_locations if loc in raw), 'Other')

    df['parsed_location'] = df['location'].apply(_canonicalize)
    return df
def display_dashboard(df):
    """Render the Dashboard page: headline metrics plus four charts."""
    df = parse_locations(df)

    overview_col, companies_col = st.columns(2)
    with overview_col:
        st.subheader("Job Postings Overview")
        st.metric("Total Job Postings", len(df))
        st.metric("Unique Companies", df['company'].nunique())
        st.metric("Unique Locations", df['parsed_location'].nunique())
        earliest = df['date_posted'].min().date()
        latest = df['date_posted'].max().date()
        st.write(f"Job postings from {earliest} to {latest}")
    with companies_col:
        company_counts = df['company'].value_counts().head(10)
        st.plotly_chart(
            create_chart(company_counts, company_counts.index,
                         company_counts.values, "Top 10 Companies", ['#4e79a7']),
            use_container_width=True,
        )

    # Full-width postings-per-day trend line.
    st.plotly_chart(create_time_series(df), use_container_width=True)

    locations_col, titles_col = st.columns(2)
    with locations_col:
        location_counts = df['parsed_location'].value_counts().head(10)
        st.plotly_chart(
            create_chart(location_counts, location_counts.index,
                         location_counts.values, "Top 10 Locations", ['#f28e2b']),
            use_container_width=True,
        )
    with titles_col:
        title_counts = df['title'].value_counts().head(20)
        st.plotly_chart(
            create_chart(title_counts, title_counts.index,
                         title_counts.values, "Top 20 Job Titles", ['#59a14f']),
            use_container_width=True,
        )
@st.cache_data
def filter_dataframe(df, companies, locations, job_types):
    """Filter listings by the selected values; empty selections are no-ops.

    Locations are matched against the normalized `parsed_location`
    column, not the raw `location` strings.
    """
    result = df
    for column, chosen in (
        ('company', companies),
        ('parsed_location', locations),
        ('job_type', job_types),
    ):
        if chosen:
            result = result[result[column].isin(chosen)]
    return result
def display_data_explorer(df):
    """Render the Data Explorer page: optional filters plus an HTML table.

    Shows either the full dataset or one filtered by company, parsed
    location and job type; URL columns are rendered as clickable links.
    """
    st.subheader("Data Explorer")
    show_all = st.radio("Display", ("All Data", "Filtered Data"))
    if show_all == "Filtered Data":
        unique_values = get_unique_values(df)
        col1, col2, col3 = st.columns(3)
        with col1:
            companies = st.multiselect("Select Companies", options=unique_values['companies'])
        with col2:
            locations = st.multiselect("Select Locations", options=df['parsed_location'].unique())
        with col3:
            job_types = st.multiselect("Select Job Types", options=unique_values['job_types'])
        filtered_df = filter_dataframe(df, companies, locations, job_types)
    else:
        filtered_df = df
    st.write(f"Showing {len(filtered_df)} job listings")

    def make_clickable(url):
        return f'<a href="{url}" target="_blank" style="color: #4e79a7;">Link</a>'

    # Work on a copy: assigning to the URL columns in place would mutate
    # the shared/cached DataFrame (on the "All Data" path filtered_df
    # aliases df), wrapping the links in another anchor on every rerun.
    filtered_df = filtered_df.copy()
    filtered_df['job_url'] = filtered_df['job_url'].apply(make_clickable)
    filtered_df['company_url'] = filtered_df['company_url'].apply(make_clickable)
    st.write(filtered_df.to_html(escape=False, index=False), unsafe_allow_html=True)
def main():
    """App entry point: load the data, then route to the chosen page."""
    st.title("Job Listings Dashboard")
    df = load_and_concat_data()
    if df.empty:
        st.error("No data available. Please check your dataset.")
        return
    df = parse_locations(df)
    # Sidebar for navigation
    st.sidebar.title("Navigation")
    pages = {
        "Dashboard": display_dashboard,
        "Data Explorer": display_data_explorer,
    }
    page = st.sidebar.radio("Go to", list(pages))
    pages[page](df)

if __name__ == "__main__":
    main()