Spaces:
Running
Running
import streamlit as st | |
import pandas as pd | |
import plotly.express as px | |
from huggingface_hub import HfApi | |
import io | |
from datetime import datetime, timedelta | |
import time | |
# --- Page configuration --------------------------------------------------
# Wide layout so charts and tables can use the full browser width.
st.set_page_config(layout="wide", page_title="Job Listings Dashboard")

# --- Global styling ------------------------------------------------------
# Inject CSS for a dark theme: black app background, branded blue buttons,
# and dark panels behind dataframes and Plotly charts.
st.markdown("""
<style>
.stApp {
    background-color: #000000;
    color: #FFFFFF;
}
.stButton>button {
    background-color: #4e79a7;
    color: white;
}
.stSelectbox, .stMultiSelect {
    color: #FFFFFF;
}
.stDataFrame {
    background-color: #1E1E1E;
}
.plotly-graph-div {
    background-color: #1E1E1E;
}
.big-font {
    font-size: 48px;
    font-weight: bold;
    text-align: center;
}
</style>
""", unsafe_allow_html=True)

# --- Hugging Face setup --------------------------------------------------
# Credentials come from Streamlit secrets (.streamlit/secrets.toml or the
# Space's secret store); never hard-code the token.
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
DATASET_NAME = "jobeasz"  # dataset repo id is f"{HF_USERNAME}/{DATASET_NAME}"
def load_and_concat_data():
    """Download every CSV in the HF dataset repo and return one cleaned frame.

    Returns:
        pd.DataFrame with the dashboard's columns, duplicates removed and
        ``date_posted`` parsed to datetime (unparseable dates dropped).
        Empty DataFrame when no file could be read, so callers can guard
        with ``df.empty``.
    """
    api = HfApi()
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    # Pass the token when listing too — a private dataset 404s for an
    # anonymous listing even though the download below authenticates.
    dataset_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN)
    csv_files = [f for f in dataset_files if f.endswith('.csv')]

    all_data = []
    for file in csv_files:
        try:
            local_path = api.hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset", token=HF_TOKEN)
            all_data.append(pd.read_csv(local_path))
        except Exception:
            # Best-effort load: skip unreadable files, but surface the skip
            # instead of hiding it entirely.
            st.warning(f"Skipped unreadable dataset file: {file}")

    if not all_data:
        return pd.DataFrame()

    concatenated_df = pd.concat(all_data, ignore_index=True)
    columns_to_keep = [
        'site', 'job_url', 'title', 'company', 'location',
        'job_type', 'date_posted', 'is_remote', 'company_url'
    ]
    # reindex (not plain []-indexing) tolerates CSVs that miss a column:
    # absent columns are filled with NaN instead of raising KeyError.
    filtered_df = concatenated_df.reindex(columns=columns_to_keep)
    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
    # The same posting often appears in several daily dumps (duplicates), and
    # rows whose date failed to parse are useless for the time-series charts.
    return filtered_df.drop_duplicates().dropna(subset=['date_posted'])
def get_unique_values(df):
    """Collect the distinct values of each filterable column of *df*.

    Returns a dict keyed by filter name ('companies', 'locations',
    'job_types') mapping to the column's unique values.
    """
    filter_columns = {
        'companies': 'company',
        'locations': 'location',
        'job_types': 'job_type',
    }
    return {name: df[column].unique() for name, column in filter_columns.items()}
def create_chart(data, _x, y, title, color_sequence):
    """Build a bar chart styled for the dashboard's dark theme."""
    figure = px.bar(data, x=_x, y=y, title=title, color_discrete_sequence=color_sequence)
    # Transparent plot/paper backgrounds let the black page theme show through.
    transparent = 'rgba(0,0,0,0)'
    figure.update_layout(
        plot_bgcolor=transparent,
        paper_bgcolor=transparent,
        font_color='#FFFFFF',
    )
    return figure
def create_time_series(df):
    """Line chart of job-posting counts per posting date, dark-themed."""
    daily_counts = (
        df.groupby('date_posted')
        .size()
        .reset_index(name='count')
    )
    fig = px.line(
        daily_counts,
        x='date_posted',
        y='count',
        title="Job Postings Over Time",
        color_discrete_sequence=['#4e79a7'],
    )
    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
    return fig
def parse_locations(df):
    """Map free-text location strings onto a canonical US-city list.

    Adds a ``parsed_location`` column to *df* (in place): the first canonical
    city whose name occurs as a substring of the raw location, else 'Other'.
    Returns the same DataFrame for chaining.
    """
    valid_locations = [
        "New York, NY", "San Francisco, CA", "Los Angeles, CA", "Chicago, IL", "Houston, TX",
        "Phoenix, AZ", "Philadelphia, PA", "San Antonio, TX", "San Diego, CA", "Dallas, TX",
        "San Jose, CA", "Austin, TX", "Jacksonville, FL", "Fort Worth, TX", "Columbus, OH",
        "San Francisco Bay Area", "Washington, D.C.", "Boston, MA", "Seattle, WA", "Denver, CO",
        "Nashville, TN", "Baltimore, MD", "Portland, OR", "Las Vegas, NV", "Milwaukee, WI",
        "Albuquerque, NM", "Tucson, AZ", "Fresno, CA", "Sacramento, CA", "Long Beach, CA",
        "Kansas City, MO", "Mesa, AZ", "Atlanta, GA", "Colorado Springs, CO", "Raleigh, NC",
        "Omaha, NE", "Miami, FL", "Oakland, CA", "Minneapolis, MN", "Tulsa, OK",
        "Cleveland, OH", "Wichita, KS", "Arlington, TX", "New Orleans, LA", "Bakersfield, CA",
        "Tampa, FL", "Honolulu, HI", "Aurora, CO", "Anaheim, CA", "Santa Ana, CA",
        "St. Louis, MO", "Riverside, CA", "Corpus Christi, TX", "Lexington, KY", "Pittsburgh, PA",
        "Anchorage, AK", "Stockton, CA", "Cincinnati, OH", "St. Paul, MN", "Toledo, OH",
        "Newark, NJ", "Greensboro, NC", "Plano, TX", "Henderson, NV", "Lincoln, NE",
        "Buffalo, NY", "Fort Wayne, IN", "Jersey City, NJ", "Chula Vista, CA", "Orlando, FL",
        "St. Petersburg, FL", "Norfolk, VA", "Chandler, AZ", "Laredo, TX", "Madison, WI",
        "Durham, NC", "Lubbock, TX", "Winston-Salem, NC", "Garland, TX", "Glendale, AZ",
        "Hialeah, FL", "Reno, NV", "Baton Rouge, LA", "Irvine, CA", "Chesapeake, VA",
        "Irving, TX", "Scottsdale, AZ", "North Las Vegas, NV", "Fremont, CA", "Gilbert, AZ",
        "San Bernardino, CA", "Boise, ID", "Birmingham, AL"
    ]

    def _canonical(raw):
        # Scraped location values can be NaN (a float) or other non-strings;
        # `loc in raw` would raise TypeError on those, so map them to 'Other'.
        if not isinstance(raw, str):
            return 'Other'
        return next((loc for loc in valid_locations if loc in raw), 'Other')

    df['parsed_location'] = df['location'].apply(_canonical)
    return df
def display_dashboard(df):
    """Render the overview page: headline metrics plus four summary charts."""
    # Ensure the canonical `parsed_location` column exists for metrics/charts.
    df = parse_locations(df)

    col1, col2 = st.columns(2)
    with col1:
        st.subheader("Job Postings Overview")
        st.metric("Total Job Postings", len(df))
        st.metric("Unique Companies", df['company'].nunique())
        st.metric("Unique Locations", df['parsed_location'].nunique())
        min_date = df['date_posted'].min().date()
        max_date = df['date_posted'].max().date()
        st.write(f"Job postings from {min_date} to {max_date}")
    with col2:
        top_companies = df['company'].value_counts().head(10)
        fig = create_chart(top_companies, top_companies.index, top_companies.values, "Top 10 Companies", ['#4e79a7'])
        st.plotly_chart(fig, use_container_width=True)

    # Job postings over time — full-width line chart between the two rows.
    fig_time_series = create_time_series(df)
    st.plotly_chart(fig_time_series, use_container_width=True)

    col3, col4 = st.columns(2)
    with col3:
        top_locations = df['parsed_location'].value_counts().head(10)
        fig = create_chart(top_locations, top_locations.index, top_locations.values, "Top 10 Locations", ['#f28e2b'])
        st.plotly_chart(fig, use_container_width=True)
    with col4:
        top_job_titles = df['title'].value_counts().head(20)
        fig = create_chart(top_job_titles, top_job_titles.index, top_job_titles.values, "Top 20 Job Titles", ['#59a14f'])
        st.plotly_chart(fig, use_container_width=True)
def filter_dataframe(df, companies, locations, job_types):
    """Return the rows of *df* matching every non-empty selection list.

    Empty selection lists are treated as "no filter" for that column.
    """
    criteria = (
        ('company', companies),
        ('parsed_location', locations),
        ('job_type', job_types),
    )
    result = df
    for column, selected in criteria:
        if selected:
            result = result[result[column].isin(selected)]
    return result
def display_data_explorer(df):
    """Render an interactive table with optional company/location/job-type filters.

    Job and company URLs are rendered as clickable HTML links.
    """
    st.subheader("Data Explorer")
    show_all = st.radio("Display", ("All Data", "Filtered Data"))
    if show_all == "Filtered Data":
        unique_values = get_unique_values(df)
        col1, col2, col3 = st.columns(3)
        with col1:
            companies = st.multiselect("Select Companies", options=unique_values['companies'])
        with col2:
            locations = st.multiselect("Select Locations", options=df['parsed_location'].unique())
        with col3:
            job_types = st.multiselect("Select Job Types", options=unique_values['job_types'])
        filtered_df = filter_dataframe(df, companies, locations, job_types)
    else:
        filtered_df = df

    st.write(f"Showing {len(filtered_df)} job listings")

    def make_clickable(url):
        # Render URLs as anchor tags styled to match the theme's link blue.
        return f'<a href="{url}" target="_blank" style="color: #4e79a7;">Link</a>'

    # Work on a copy: in the "All Data" branch filtered_df aliases the shared
    # df, and writing anchor markup into it would permanently replace the raw
    # URLs for every later rerun and for the Dashboard page.
    filtered_df = filtered_df.copy()
    filtered_df['job_url'] = filtered_df['job_url'].apply(make_clickable)
    filtered_df['company_url'] = filtered_df['company_url'].apply(make_clickable)
    st.write(filtered_df.to_html(escape=False, index=False), unsafe_allow_html=True)
def main():
    """Entry point: load the dataset once, then route to the selected page."""
    st.title("Job Listings Dashboard")

    df = load_and_concat_data()
    if df.empty:
        st.error("No data available. Please check your dataset.")
        return
    df = parse_locations(df)

    # Sidebar navigation — dispatch table keeps page routing declarative.
    st.sidebar.title("Navigation")
    pages = {
        "Dashboard": display_dashboard,
        "Data Explorer": display_data_explorer,
    }
    page = st.sidebar.radio("Go to", list(pages))
    pages[page](df)


if __name__ == "__main__":
    main()