Spaces:
Running
Running
import streamlit as st | |
import pandas as pd | |
import plotly.express as px | |
from huggingface_hub import HfApi | |
import io | |
from datetime import datetime, timedelta | |
import time | |
# --- Page configuration --------------------------------------------------
# Wide layout so charts and tables can use the full browser width.
st.set_page_config(layout="wide", page_title="Job Listings Dashboard")

# --- Global styling ------------------------------------------------------
# Inject CSS for a dark theme: black app background, branded blue buttons,
# and dark panels behind dataframes and Plotly charts.
st.markdown("""
<style>
.stApp {
    background-color: #000000;
    color: #FFFFFF;
}
.stButton>button {
    background-color: #4e79a7;
    color: white;
}
.stSelectbox, .stMultiSelect {
    color: #FFFFFF;
}
.stDataFrame {
    background-color: #1E1E1E;
}
.plotly-graph-div {
    background-color: #1E1E1E;
}
.big-font {
    font-size: 48px;
    font-weight: bold;
    text-align: center;
}
</style>
""", unsafe_allow_html=True)

# --- Hugging Face setup --------------------------------------------------
# Credentials come from Streamlit secrets (.streamlit/secrets.toml or the
# Space's secret store); never hard-code the token.
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
DATASET_NAME = "jobeasz"  # dataset repo id is f"{HF_USERNAME}/{DATASET_NAME}"
def load_and_concat_data():
    """Download every CSV in the HF dataset repo and return one cleaned frame.

    Returns:
        pd.DataFrame with the dashboard's columns, duplicates removed and
        ``date_posted`` parsed to datetime (unparseable dates dropped).
        Empty DataFrame when no file could be read, so callers can guard
        with ``df.empty``.
    """
    api = HfApi()
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    # Pass the token when listing too — a private dataset 404s for an
    # anonymous listing even though the download below authenticates.
    dataset_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN)
    csv_files = [f for f in dataset_files if f.endswith('.csv')]

    all_data = []
    for file in csv_files:
        try:
            local_path = api.hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset", token=HF_TOKEN)
            all_data.append(pd.read_csv(local_path))
        except Exception:
            # Best-effort load: skip unreadable files, but surface the skip
            # instead of hiding it entirely.
            st.warning(f"Skipped unreadable dataset file: {file}")

    if not all_data:
        return pd.DataFrame()

    concatenated_df = pd.concat(all_data, ignore_index=True)
    columns_to_keep = [
        'site', 'job_url', 'title', 'company', 'location',
        'job_type', 'date_posted', 'is_remote', 'company_url'
    ]
    # reindex (not plain []-indexing) tolerates CSVs that miss a column:
    # absent columns are filled with NaN instead of raising KeyError.
    filtered_df = concatenated_df.reindex(columns=columns_to_keep)
    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
    # The same posting often appears in several daily dumps (duplicates), and
    # rows whose date failed to parse are useless for the time-series charts.
    return filtered_df.drop_duplicates().dropna(subset=['date_posted'])
def get_unique_values(df):
    """Collect the distinct values of each filterable column of *df*.

    Returns a dict keyed by filter name ('companies', 'locations',
    'job_types') mapping to the column's unique values.
    """
    filter_columns = {
        'companies': 'company',
        'locations': 'location',
        'job_types': 'job_type',
    }
    return {name: df[column].unique() for name, column in filter_columns.items()}
def create_chart(data, _x, y, title, color_sequence):
    """Build a bar chart styled for the dashboard's dark theme."""
    figure = px.bar(data, x=_x, y=y, title=title, color_discrete_sequence=color_sequence)
    # Transparent plot/paper backgrounds let the black page theme show through.
    transparent = 'rgba(0,0,0,0)'
    figure.update_layout(
        plot_bgcolor=transparent,
        paper_bgcolor=transparent,
        font_color='#FFFFFF',
    )
    return figure
def create_time_series(df):
    """Line chart of job-posting counts per posting date, dark-themed."""
    daily_counts = (
        df.groupby('date_posted')
        .size()
        .reset_index(name='count')
    )
    fig = px.line(
        daily_counts,
        x='date_posted',
        y='count',
        title="Job Postings Over Time",
        color_discrete_sequence=['#4e79a7'],
    )
    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
    return fig
def parse_locations(df):
    """Map free-text location strings onto a canonical US-city list.

    Adds a ``parsed_location`` column to *df* (in place): the first canonical
    city whose name occurs as a substring of the raw location, else 'Other'.
    Returns the same DataFrame for chaining.
    """
    valid_locations = [
        "New York, NY", "San Francisco, CA", "Los Angeles, CA", "Chicago, IL", "Houston, TX",
        "Phoenix, AZ", "Philadelphia, PA", "San Antonio, TX", "San Diego, CA", "Dallas, TX",
        "San Jose, CA", "Austin, TX", "Jacksonville, FL", "Fort Worth, TX", "Columbus, OH",
        "San Francisco Bay Area", "Washington, D.C.", "Boston, MA", "Seattle, WA", "Denver, CO",
        "Nashville, TN", "Baltimore, MD", "Portland, OR", "Las Vegas, NV", "Milwaukee, WI",
        "Albuquerque, NM", "Tucson, AZ", "Fresno, CA", "Sacramento, CA", "Long Beach, CA",
        "Kansas City, MO", "Mesa, AZ", "Atlanta, GA", "Colorado Springs, CO", "Raleigh, NC",
        "Omaha, NE", "Miami, FL", "Oakland, CA", "Minneapolis, MN", "Tulsa, OK",
        "Cleveland, OH", "Wichita, KS", "Arlington, TX", "New Orleans, LA", "Bakersfield, CA",
        "Tampa, FL", "Honolulu, HI", "Aurora, CO", "Anaheim, CA", "Santa Ana, CA",
        "St. Louis, MO", "Riverside, CA", "Corpus Christi, TX", "Lexington, KY", "Pittsburgh, PA",
        "Anchorage, AK", "Stockton, CA", "Cincinnati, OH", "St. Paul, MN", "Toledo, OH",
        "Newark, NJ", "Greensboro, NC", "Plano, TX", "Henderson, NV", "Lincoln, NE",
        "Buffalo, NY", "Fort Wayne, IN", "Jersey City, NJ", "Chula Vista, CA", "Orlando, FL",
        "St. Petersburg, FL", "Norfolk, VA", "Chandler, AZ", "Laredo, TX", "Madison, WI",
        "Durham, NC", "Lubbock, TX", "Winston-Salem, NC", "Garland, TX", "Glendale, AZ",
        "Hialeah, FL", "Reno, NV", "Baton Rouge, LA", "Irvine, CA", "Chesapeake, VA",
        "Irving, TX", "Scottsdale, AZ", "North Las Vegas, NV", "Fremont, CA", "Gilbert, AZ",
        "San Bernardino, CA", "Boise, ID", "Birmingham, AL"
    ]

    def _canonical(raw):
        # Scraped location values can be NaN (a float) or other non-strings;
        # `loc in raw` would raise TypeError on those, so map them to 'Other'.
        if not isinstance(raw, str):
            return 'Other'
        return next((loc for loc in valid_locations if loc in raw), 'Other')

    df['parsed_location'] = df['location'].apply(_canonical)
    return df
def display_dashboard(df):
    """Render the overview page: headline metrics plus four summary charts."""
    # Ensure the canonical `parsed_location` column exists for metrics/charts.
    df = parse_locations(df)

    col1, col2 = st.columns(2)
    with col1:
        st.subheader("Job Postings Overview")
        st.metric("Total Job Postings", len(df))
        st.metric("Unique Companies", df['company'].nunique())
        st.metric("Unique Locations", df['parsed_location'].nunique())
        min_date = df['date_posted'].min().date()
        max_date = df['date_posted'].max().date()
        st.write(f"Job postings from {min_date} to {max_date}")
    with col2:
        top_companies = df['company'].value_counts().head(10)
        fig = create_chart(top_companies, top_companies.index, top_companies.values, "Top 10 Companies", ['#4e79a7'])
        st.plotly_chart(fig, use_container_width=True)

    # Job postings over time — full-width line chart between the two rows.
    fig_time_series = create_time_series(df)
    st.plotly_chart(fig_time_series, use_container_width=True)

    col3, col4 = st.columns(2)
    with col3:
        top_locations = df['parsed_location'].value_counts().head(10)
        fig = create_chart(top_locations, top_locations.index, top_locations.values, "Top 10 Locations", ['#f28e2b'])
        st.plotly_chart(fig, use_container_width=True)
    with col4:
        top_job_titles = df['title'].value_counts().head(20)
        fig = create_chart(top_job_titles, top_job_titles.index, top_job_titles.values, "Top 20 Job Titles", ['#59a14f'])
        st.plotly_chart(fig, use_container_width=True)
def filter_dataframe(df, companies, locations, job_types):
    """Return the rows of *df* matching every non-empty selection list.

    Empty selection lists are treated as "no filter" for that column.
    """
    criteria = (
        ('company', companies),
        ('parsed_location', locations),
        ('job_type', job_types),
    )
    result = df
    for column, selected in criteria:
        if selected:
            result = result[result[column].isin(selected)]
    return result
def display_data_explorer(df):
    """Render an interactive table with optional company/location/job-type filters.

    Job and company URLs are rendered as clickable HTML links.
    """
    st.subheader("Data Explorer")
    show_all = st.radio("Display", ("All Data", "Filtered Data"))
    if show_all == "Filtered Data":
        unique_values = get_unique_values(df)
        col1, col2, col3 = st.columns(3)
        with col1:
            companies = st.multiselect("Select Companies", options=unique_values['companies'])
        with col2:
            locations = st.multiselect("Select Locations", options=df['parsed_location'].unique())
        with col3:
            job_types = st.multiselect("Select Job Types", options=unique_values['job_types'])
        filtered_df = filter_dataframe(df, companies, locations, job_types)
    else:
        filtered_df = df

    st.write(f"Showing {len(filtered_df)} job listings")

    def make_clickable(url):
        # Render URLs as anchor tags styled to match the theme's link blue.
        return f'<a href="{url}" target="_blank" style="color: #4e79a7;">Link</a>'

    # Work on a copy: in the "All Data" branch filtered_df aliases the shared
    # df, and writing anchor markup into it would permanently replace the raw
    # URLs for every later rerun and for the Dashboard page.
    filtered_df = filtered_df.copy()
    filtered_df['job_url'] = filtered_df['job_url'].apply(make_clickable)
    filtered_df['company_url'] = filtered_df['company_url'].apply(make_clickable)
    st.write(filtered_df.to_html(escape=False, index=False), unsafe_allow_html=True)
def main():
    """Entry point: load the dataset once, then route to the selected page."""
    st.title("Job Listings Dashboard")

    df = load_and_concat_data()
    if df.empty:
        st.error("No data available. Please check your dataset.")
        return
    df = parse_locations(df)

    # Sidebar navigation — dispatch table keeps page routing declarative.
    st.sidebar.title("Navigation")
    pages = {
        "Dashboard": display_dashboard,
        "Data Explorer": display_data_explorer,
    }
    page = st.sidebar.radio("Go to", list(pages))
    pages[page](df)


if __name__ == "__main__":
    main()