hones / app.py
LeonceNsh's picture
Upload folder using huggingface_hub
1885ec3 verified
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import duckdb
import numpy as np
from datetime import datetime
import os
# Database connection
DATABASE_PATH = "./data/h1bs_analytics.duckdb"
def get_db_connection():
"""Create a connection to the DuckDB database"""
if os.path.exists(DATABASE_PATH):
return duckdb.connect(DATABASE_PATH, read_only=True)
else:
# Create sample data if database doesn't exist
return create_sample_data()
def create_sample_data():
"""Create sample H1B facts data for demonstration"""
conn = duckdb.connect(":memory:")
# Sample fact table based on H1B schema
np.random.seed(42)
n_records = 5000
sample_facts = pd.DataFrame({
'record_id': range(1, n_records + 1),
'lottery_year': np.random.choice([2021, 2022, 2023, 2024], n_records),
'fiscal_year': np.random.choice([2021, 2022, 2023, 2024], n_records),
'country_of_birth': np.random.choice([
'INDIA', 'CHINA', 'SOUTH KOREA', 'CANADA', 'UNITED KINGDOM',
'PHILIPPINES', 'TAIWAN', 'JAPAN', 'MEXICO', 'BRAZIL'
], n_records, p=[0.4, 0.15, 0.1, 0.08, 0.07, 0.05, 0.05, 0.04, 0.03, 0.03]),
'wage_amt': np.random.lognormal(11.2, 0.5, n_records).round(0), # Log-normal for realistic wage distribution
'is_multiple_registration': np.random.choice([True, False], n_records, p=[0.3, 0.7]),
'age_at_application': np.random.normal(28, 4, n_records).round(0).clip(22, 45),
'years_since_application': np.random.choice([0, 1, 2, 3], n_records),
'full_time_ind': np.random.choice([True, False], n_records, p=[0.85, 0.15]),
'employer_worksite_same_state': np.random.choice([True, False], n_records, p=[0.7, 0.3]),
'employer_sk': [f'EMP_{i%500}' for i in range(n_records)],
'beneficiary_sk': [f'BEN_{i}' for i in range(n_records)],
'job_sk': [f'JOB_{i%300}' for i in range(n_records)]
})
conn.execute("CREATE TABLE fct_h1b_applications AS SELECT * FROM sample_facts")
return conn
# Load data
conn = get_db_connection()
def load_facts_data():
"""Load H1B applications fact table"""
try:
query = """
SELECT * FROM fct_h1b_applications
WHERE wage_amt IS NOT NULL
LIMIT 10000
"""
return conn.execute(query).df()
except Exception as e:
print(f"Error loading facts data: {e}")
return pd.DataFrame()
# Load the facts data
facts_df = load_facts_data()
# ---- FACTS TABLE VISUALIZATIONS ----
def facts_overview():
"""Overview of the facts table with key metrics"""
if facts_df.empty:
return go.Figure().update_layout(title="No facts data available")
# Key metrics
total_records = len(facts_df)
avg_wage = facts_df['wage_amt'].mean()
median_wage = facts_df['wage_amt'].median()
multiple_reg_pct = (facts_df['is_multiple_registration'].sum() / len(facts_df)) * 100
# Create metrics dashboard
fig = make_subplots(
rows=2, cols=2,
specs=[[{"type": "indicator"}, {"type": "indicator"}],
[{"type": "indicator"}, {"type": "indicator"}]],
subplot_titles=("Total Records", "Average Wage", "Median Wage", "Multiple Registration %")
)
fig.add_trace(
go.Indicator(
mode="number",
value=total_records,
number={"valueformat": ","},
title={"text": "Total Records"}
),
row=1, col=1
)
fig.add_trace(
go.Indicator(
mode="number",
value=avg_wage,
number={"prefix": "$", "valueformat": ",.0f"},
title={"text": "Average Wage"}
),
row=1, col=2
)
fig.add_trace(
go.Indicator(
mode="number",
value=median_wage,
number={"prefix": "$", "valueformat": ",.0f"},
title={"text": "Median Wage"}
),
row=2, col=1
)
fig.add_trace(
go.Indicator(
mode="number",
value=multiple_reg_pct,
number={"suffix": "%", "valueformat": ".1f"},
title={"text": "Multiple Registrations"}
),
row=2, col=2
)
fig.update_layout(
height=400,
title_text="H1B Facts Table - Key Metrics"
)
return fig
def wage_distribution():
"""Visualize wage distribution from facts table"""
if facts_df.empty:
return go.Figure().update_layout(title="No data available")
fig = make_subplots(
rows=1, cols=2,
specs=[[{"type": "histogram"}, {"type": "box"}]],
subplot_titles=("Wage Distribution", "Wage Distribution (Box Plot)")
)
# Histogram
fig.add_trace(
go.Histogram(
x=facts_df['wage_amt'],
nbinsx=50,
marker_color='skyblue',
opacity=0.7,
name='Wage Distribution'
),
row=1, col=1
)
# Box plot
fig.add_trace(
go.Box(
y=facts_df['wage_amt'],
marker_color='lightcoral',
name='Wage Box Plot'
),
row=1, col=2
)
fig.update_layout(
height=500,
title_text="Wage Analysis from Facts Table",
showlegend=False
)
fig.update_xaxes(title_text="Wage Amount ($)", row=1, col=1)
fig.update_yaxes(title_text="Frequency", row=1, col=1)
fig.update_yaxes(title_text="Wage Amount ($)", row=1, col=2)
return fig
def country_analysis():
"""Analyze country distribution from facts table"""
if facts_df.empty:
return go.Figure().update_layout(title="No data available")
# Country counts
country_counts = facts_df['country_of_birth'].value_counts().head(10)
# Average wage by country
country_wages = facts_df.groupby('country_of_birth')['wage_amt'].agg(['mean', 'count']).reset_index()
country_wages = country_wages[country_wages['count'] >= 50].nlargest(8, 'mean') # Min 50 applications
fig = make_subplots(
rows=1, cols=2,
specs=[[{"type": "bar"}, {"type": "bar"}]],
subplot_titles=("Applications by Country", "Average Wage by Country (Min 50 apps)")
)
# Applications by country
fig.add_trace(
go.Bar(
x=country_counts.index,
y=country_counts.values,
marker_color='teal',
text=country_counts.values,
textposition='auto',
name='Application Count'
),
row=1, col=1
)
# Average wage by country
fig.add_trace(
go.Bar(
x=country_wages['country_of_birth'],
y=country_wages['mean'],
marker_color='orange',
text=['$' + f"{x:,.0f}" for x in country_wages['mean']],
textposition='auto',
name='Average Wage'
),
row=1, col=2
)
fig.update_layout(
height=500,
title_text="Country Analysis from Facts Table",
showlegend=False
)
fig.update_xaxes(tickangle=45, row=1, col=1)
fig.update_xaxes(tickangle=45, row=1, col=2)
fig.update_yaxes(title_text="Number of Applications", row=1, col=1)
fig.update_yaxes(title_text="Average Wage ($)", row=1, col=2)
return fig
def temporal_analysis():
"""Analyze temporal patterns from facts table"""
if facts_df.empty:
return go.Figure().update_layout(title="No data available")
# Yearly trends
yearly_stats = facts_df.groupby('fiscal_year').agg({
'record_id': 'count',
'wage_amt': 'mean',
'is_multiple_registration': 'mean'
}).reset_index()
yearly_stats['multiple_reg_pct'] = yearly_stats['is_multiple_registration'] * 100
fig = make_subplots(
rows=2, cols=1,
specs=[[{"secondary_y": True}], [{"type": "bar"}]],
subplot_titles=("Applications and Average Wage by Year", "Multiple Registration Percentage by Year")
)
# Applications count
fig.add_trace(
go.Scatter(
x=yearly_stats['fiscal_year'],
y=yearly_stats['record_id'],
mode='lines+markers',
name='Applications',
line=dict(color='blue', width=3),
marker=dict(size=8)
),
row=1, col=1
)
# Average wage (secondary y-axis)
fig.add_trace(
go.Scatter(
x=yearly_stats['fiscal_year'],
y=yearly_stats['wage_amt'],
mode='lines+markers',
name='Average Wage',
line=dict(color='red', width=3),
marker=dict(size=8),
yaxis='y2'
),
row=1, col=1
)
# Multiple registration percentage
fig.add_trace(
go.Bar(
x=yearly_stats['fiscal_year'],
y=yearly_stats['multiple_reg_pct'],
marker_color='green',
text=[f"{x:.1f}%" for x in yearly_stats['multiple_reg_pct']],
textposition='auto',
name='Multiple Registration %'
),
row=2, col=1
)
# Update layout
fig.update_layout(
height=600,
title_text="Temporal Analysis from Facts Table"
)
# Update y-axes
fig.update_yaxes(title_text="Number of Applications", row=1, col=1)
fig.update_yaxes(title_text="Average Wage ($)", secondary_y=True, row=1, col=1)
fig.update_yaxes(title_text="Multiple Registration (%)", row=2, col=1)
fig.update_xaxes(title_text="Fiscal Year", row=2, col=1)
return fig
def demographic_analysis():
"""Analyze demographic patterns from facts table"""
if facts_df.empty:
return go.Figure().update_layout(title="No data available")
# Age distribution
age_bins = pd.cut(facts_df['age_at_application'], bins=range(20, 50, 5), right=False)
age_counts = age_bins.value_counts().sort_index()
# Full-time vs Part-time
employment_type = facts_df['full_time_ind'].value_counts()
employment_labels = ['Full-time' if x else 'Part-time' for x in employment_type.index]
# Same state employment
same_state = facts_df['employer_worksite_same_state'].value_counts()
same_state_labels = ['Same State' if x else 'Different State' for x in same_state.index]
fig = make_subplots(
rows=2, cols=2,
specs=[[{"type": "bar"}, {"type": "pie"}],
[{"type": "pie"}, {"type": "histogram"}]],
subplot_titles=("Age Distribution", "Employment Type", "Employer-Worksite Location", "Years Since Application")
)
# Age distribution
fig.add_trace(
go.Bar(
x=[str(interval) for interval in age_counts.index],
y=age_counts.values,
marker_color='lightblue',
name='Age Distribution'
),
row=1, col=1
)
# Employment type pie chart
fig.add_trace(
go.Pie(
labels=employment_labels,
values=employment_type.values,
name="Employment Type"
),
row=1, col=2
)
# Same state pie chart
fig.add_trace(
go.Pie(
labels=same_state_labels,
values=same_state.values,
name="Location"
),
row=2, col=1
)
# Years since application
years_since = facts_df['years_since_application'].value_counts().sort_index()
fig.add_trace(
go.Histogram(
x=facts_df['years_since_application'],
nbinsx=10,
marker_color='lightgreen',
name='Years Since Application'
),
row=2, col=2
)
fig.update_layout(
height=600,
title_text="Demographic Analysis from Facts Table",
showlegend=False
)
return fig
def facts_data_table():
"""Display sample of facts table data"""
if facts_df.empty:
return pd.DataFrame()
# Return first 100 rows with key columns
display_cols = [
'record_id', 'lottery_year', 'fiscal_year', 'country_of_birth',
'wage_amt', 'age_at_application', 'is_multiple_registration',
'full_time_ind', 'employer_worksite_same_state'
]
sample_data = facts_df[display_cols].head(100).copy()
# Format wage column
sample_data['wage_amt'] = sample_data['wage_amt'].apply(lambda x: f"${x:,.0f}")
return sample_data
# ---- GRADIO INTERFACE ----
with gr.Blocks(theme=gr.themes.Soft(), title="H1B Facts Table Analytics") as demo:
gr.Markdown("# πŸ“Š H1B Facts Table Analytics Dashboard")
gr.Markdown("### Comprehensive Analysis of H1B Applications Facts Data")
with gr.Tab("πŸ“ˆ Facts Overview"):
gr.Markdown("### Key Metrics from Facts Table")
facts_overview_plot = gr.Plot()
gr.Button("Load Facts Overview", variant="primary").click(
fn=facts_overview,
outputs=facts_overview_plot
)
with gr.Tab("πŸ’° Wage Analysis"):
gr.Markdown("### Wage Distribution from Facts Table")
wage_plot = gr.Plot()
gr.Button("Analyze Wages", variant="primary").click(
fn=wage_distribution,
outputs=wage_plot
)
with gr.Tab("🌍 Country Analysis"):
gr.Markdown("### Country-wise Analysis from Facts Table")
country_plot = gr.Plot()
gr.Button("Analyze Countries", variant="primary").click(
fn=country_analysis,
outputs=country_plot
)
with gr.Tab("πŸ“… Temporal Analysis"):
gr.Markdown("### Time-based Trends from Facts Table")
temporal_plot = gr.Plot()
gr.Button("Analyze Trends", variant="primary").click(
fn=temporal_analysis,
outputs=temporal_plot
)
with gr.Tab("πŸ‘₯ Demographics"):
gr.Markdown("### Demographic Patterns from Facts Table")
demo_plot = gr.Plot()
gr.Button("Analyze Demographics", variant="primary").click(
fn=demographic_analysis,
outputs=demo_plot
)
with gr.Tab("πŸ“‹ Raw Data"):
gr.Markdown("### Sample Facts Table Data (First 100 rows)")
data_table = gr.DataFrame()
gr.Button("Load Sample Data", variant="primary").click(
fn=facts_data_table,
outputs=data_table
)
# Footer
gr.Markdown("---")
gr.Markdown("### Facts Table Schema")
gr.Markdown("""
**Table**: `fct_h1b_applications`
**Key Columns**:
- `record_id`: Unique identifier for each application
- `lottery_year`, `fiscal_year`: Temporal dimensions
- `country_of_birth`: Beneficiary country
- `wage_amt`: Offered wage amount
- `age_at_application`: Beneficiary age
- `is_multiple_registration`: Multiple lottery entries flag
- `full_time_ind`: Full-time employment indicator
- `employer_worksite_same_state`: Location alignment flag
- Foreign keys: `employer_sk`, `beneficiary_sk`, `job_sk`
""")
# Launch the app
if __name__ == "__main__":
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=True,
show_error=True
)