|
import gradio as gr |
|
import pandas as pd |
|
import plotly.express as px |
|
import plotly.graph_objects as go |
|
from plotly.subplots import make_subplots |
|
import duckdb |
|
import numpy as np |
|
from datetime import datetime |
|
import os |
|
|
|
|
|
DATABASE_PATH = "./data/h1bs_analytics.duckdb" |
|
|
|
def get_db_connection(): |
|
"""Create a connection to the DuckDB database""" |
|
if os.path.exists(DATABASE_PATH): |
|
return duckdb.connect(DATABASE_PATH, read_only=True) |
|
else: |
|
|
|
return create_sample_data() |
|
|
|
def create_sample_data(): |
|
"""Create sample H1B facts data for demonstration""" |
|
conn = duckdb.connect(":memory:") |
|
|
|
|
|
np.random.seed(42) |
|
n_records = 5000 |
|
|
|
sample_facts = pd.DataFrame({ |
|
'record_id': range(1, n_records + 1), |
|
'lottery_year': np.random.choice([2021, 2022, 2023, 2024], n_records), |
|
'fiscal_year': np.random.choice([2021, 2022, 2023, 2024], n_records), |
|
'country_of_birth': np.random.choice([ |
|
'INDIA', 'CHINA', 'SOUTH KOREA', 'CANADA', 'UNITED KINGDOM', |
|
'PHILIPPINES', 'TAIWAN', 'JAPAN', 'MEXICO', 'BRAZIL' |
|
], n_records, p=[0.4, 0.15, 0.1, 0.08, 0.07, 0.05, 0.05, 0.04, 0.03, 0.03]), |
|
'wage_amt': np.random.lognormal(11.2, 0.5, n_records).round(0), |
|
'is_multiple_registration': np.random.choice([True, False], n_records, p=[0.3, 0.7]), |
|
'age_at_application': np.random.normal(28, 4, n_records).round(0).clip(22, 45), |
|
'years_since_application': np.random.choice([0, 1, 2, 3], n_records), |
|
'full_time_ind': np.random.choice([True, False], n_records, p=[0.85, 0.15]), |
|
'employer_worksite_same_state': np.random.choice([True, False], n_records, p=[0.7, 0.3]), |
|
'employer_sk': [f'EMP_{i%500}' for i in range(n_records)], |
|
'beneficiary_sk': [f'BEN_{i}' for i in range(n_records)], |
|
'job_sk': [f'JOB_{i%300}' for i in range(n_records)] |
|
}) |
|
|
|
conn.execute("CREATE TABLE fct_h1b_applications AS SELECT * FROM sample_facts") |
|
|
|
return conn |
|
|
|
|
|
conn = get_db_connection() |
|
|
|
def load_facts_data(): |
|
"""Load H1B applications fact table""" |
|
try: |
|
query = """ |
|
SELECT * FROM fct_h1b_applications |
|
WHERE wage_amt IS NOT NULL |
|
LIMIT 10000 |
|
""" |
|
return conn.execute(query).df() |
|
except Exception as e: |
|
print(f"Error loading facts data: {e}") |
|
return pd.DataFrame() |
|
|
|
|
|
facts_df = load_facts_data() |
|
|
|
|
|
|
|
def facts_overview(): |
|
"""Overview of the facts table with key metrics""" |
|
if facts_df.empty: |
|
return go.Figure().update_layout(title="No facts data available") |
|
|
|
|
|
total_records = len(facts_df) |
|
avg_wage = facts_df['wage_amt'].mean() |
|
median_wage = facts_df['wage_amt'].median() |
|
multiple_reg_pct = (facts_df['is_multiple_registration'].sum() / len(facts_df)) * 100 |
|
|
|
|
|
fig = make_subplots( |
|
rows=2, cols=2, |
|
specs=[[{"type": "indicator"}, {"type": "indicator"}], |
|
[{"type": "indicator"}, {"type": "indicator"}]], |
|
subplot_titles=("Total Records", "Average Wage", "Median Wage", "Multiple Registration %") |
|
) |
|
|
|
fig.add_trace( |
|
go.Indicator( |
|
mode="number", |
|
value=total_records, |
|
number={"valueformat": ","}, |
|
title={"text": "Total Records"} |
|
), |
|
row=1, col=1 |
|
) |
|
|
|
fig.add_trace( |
|
go.Indicator( |
|
mode="number", |
|
value=avg_wage, |
|
number={"prefix": "$", "valueformat": ",.0f"}, |
|
title={"text": "Average Wage"} |
|
), |
|
row=1, col=2 |
|
) |
|
|
|
fig.add_trace( |
|
go.Indicator( |
|
mode="number", |
|
value=median_wage, |
|
number={"prefix": "$", "valueformat": ",.0f"}, |
|
title={"text": "Median Wage"} |
|
), |
|
row=2, col=1 |
|
) |
|
|
|
fig.add_trace( |
|
go.Indicator( |
|
mode="number", |
|
value=multiple_reg_pct, |
|
number={"suffix": "%", "valueformat": ".1f"}, |
|
title={"text": "Multiple Registrations"} |
|
), |
|
row=2, col=2 |
|
) |
|
|
|
fig.update_layout( |
|
height=400, |
|
title_text="H1B Facts Table - Key Metrics" |
|
) |
|
|
|
return fig |
|
|
|
def wage_distribution(): |
|
"""Visualize wage distribution from facts table""" |
|
if facts_df.empty: |
|
return go.Figure().update_layout(title="No data available") |
|
|
|
fig = make_subplots( |
|
rows=1, cols=2, |
|
specs=[[{"type": "histogram"}, {"type": "box"}]], |
|
subplot_titles=("Wage Distribution", "Wage Distribution (Box Plot)") |
|
) |
|
|
|
|
|
fig.add_trace( |
|
go.Histogram( |
|
x=facts_df['wage_amt'], |
|
nbinsx=50, |
|
marker_color='skyblue', |
|
opacity=0.7, |
|
name='Wage Distribution' |
|
), |
|
row=1, col=1 |
|
) |
|
|
|
|
|
fig.add_trace( |
|
go.Box( |
|
y=facts_df['wage_amt'], |
|
marker_color='lightcoral', |
|
name='Wage Box Plot' |
|
), |
|
row=1, col=2 |
|
) |
|
|
|
fig.update_layout( |
|
height=500, |
|
title_text="Wage Analysis from Facts Table", |
|
showlegend=False |
|
) |
|
|
|
fig.update_xaxes(title_text="Wage Amount ($)", row=1, col=1) |
|
fig.update_yaxes(title_text="Frequency", row=1, col=1) |
|
fig.update_yaxes(title_text="Wage Amount ($)", row=1, col=2) |
|
|
|
return fig |
|
|
|
def country_analysis(): |
|
"""Analyze country distribution from facts table""" |
|
if facts_df.empty: |
|
return go.Figure().update_layout(title="No data available") |
|
|
|
|
|
country_counts = facts_df['country_of_birth'].value_counts().head(10) |
|
|
|
|
|
country_wages = facts_df.groupby('country_of_birth')['wage_amt'].agg(['mean', 'count']).reset_index() |
|
country_wages = country_wages[country_wages['count'] >= 50].nlargest(8, 'mean') |
|
|
|
fig = make_subplots( |
|
rows=1, cols=2, |
|
specs=[[{"type": "bar"}, {"type": "bar"}]], |
|
subplot_titles=("Applications by Country", "Average Wage by Country (Min 50 apps)") |
|
) |
|
|
|
|
|
fig.add_trace( |
|
go.Bar( |
|
x=country_counts.index, |
|
y=country_counts.values, |
|
marker_color='teal', |
|
text=country_counts.values, |
|
textposition='auto', |
|
name='Application Count' |
|
), |
|
row=1, col=1 |
|
) |
|
|
|
|
|
fig.add_trace( |
|
go.Bar( |
|
x=country_wages['country_of_birth'], |
|
y=country_wages['mean'], |
|
marker_color='orange', |
|
text=['$' + f"{x:,.0f}" for x in country_wages['mean']], |
|
textposition='auto', |
|
name='Average Wage' |
|
), |
|
row=1, col=2 |
|
) |
|
|
|
fig.update_layout( |
|
height=500, |
|
title_text="Country Analysis from Facts Table", |
|
showlegend=False |
|
) |
|
|
|
fig.update_xaxes(tickangle=45, row=1, col=1) |
|
fig.update_xaxes(tickangle=45, row=1, col=2) |
|
fig.update_yaxes(title_text="Number of Applications", row=1, col=1) |
|
fig.update_yaxes(title_text="Average Wage ($)", row=1, col=2) |
|
|
|
return fig |
|
|
|
def temporal_analysis(): |
|
"""Analyze temporal patterns from facts table""" |
|
if facts_df.empty: |
|
return go.Figure().update_layout(title="No data available") |
|
|
|
|
|
yearly_stats = facts_df.groupby('fiscal_year').agg({ |
|
'record_id': 'count', |
|
'wage_amt': 'mean', |
|
'is_multiple_registration': 'mean' |
|
}).reset_index() |
|
|
|
yearly_stats['multiple_reg_pct'] = yearly_stats['is_multiple_registration'] * 100 |
|
|
|
fig = make_subplots( |
|
rows=2, cols=1, |
|
specs=[[{"secondary_y": True}], [{"type": "bar"}]], |
|
subplot_titles=("Applications and Average Wage by Year", "Multiple Registration Percentage by Year") |
|
) |
|
|
|
|
|
fig.add_trace( |
|
go.Scatter( |
|
x=yearly_stats['fiscal_year'], |
|
y=yearly_stats['record_id'], |
|
mode='lines+markers', |
|
name='Applications', |
|
line=dict(color='blue', width=3), |
|
marker=dict(size=8) |
|
), |
|
row=1, col=1 |
|
) |
|
|
|
|
|
fig.add_trace( |
|
go.Scatter( |
|
x=yearly_stats['fiscal_year'], |
|
y=yearly_stats['wage_amt'], |
|
mode='lines+markers', |
|
name='Average Wage', |
|
line=dict(color='red', width=3), |
|
marker=dict(size=8), |
|
yaxis='y2' |
|
), |
|
row=1, col=1 |
|
) |
|
|
|
|
|
fig.add_trace( |
|
go.Bar( |
|
x=yearly_stats['fiscal_year'], |
|
y=yearly_stats['multiple_reg_pct'], |
|
marker_color='green', |
|
text=[f"{x:.1f}%" for x in yearly_stats['multiple_reg_pct']], |
|
textposition='auto', |
|
name='Multiple Registration %' |
|
), |
|
row=2, col=1 |
|
) |
|
|
|
|
|
fig.update_layout( |
|
height=600, |
|
title_text="Temporal Analysis from Facts Table" |
|
) |
|
|
|
|
|
fig.update_yaxes(title_text="Number of Applications", row=1, col=1) |
|
fig.update_yaxes(title_text="Average Wage ($)", secondary_y=True, row=1, col=1) |
|
fig.update_yaxes(title_text="Multiple Registration (%)", row=2, col=1) |
|
fig.update_xaxes(title_text="Fiscal Year", row=2, col=1) |
|
|
|
return fig |
|
|
|
def demographic_analysis(): |
|
"""Analyze demographic patterns from facts table""" |
|
if facts_df.empty: |
|
return go.Figure().update_layout(title="No data available") |
|
|
|
|
|
age_bins = pd.cut(facts_df['age_at_application'], bins=range(20, 50, 5), right=False) |
|
age_counts = age_bins.value_counts().sort_index() |
|
|
|
|
|
employment_type = facts_df['full_time_ind'].value_counts() |
|
employment_labels = ['Full-time' if x else 'Part-time' for x in employment_type.index] |
|
|
|
|
|
same_state = facts_df['employer_worksite_same_state'].value_counts() |
|
same_state_labels = ['Same State' if x else 'Different State' for x in same_state.index] |
|
|
|
fig = make_subplots( |
|
rows=2, cols=2, |
|
specs=[[{"type": "bar"}, {"type": "pie"}], |
|
[{"type": "pie"}, {"type": "histogram"}]], |
|
subplot_titles=("Age Distribution", "Employment Type", "Employer-Worksite Location", "Years Since Application") |
|
) |
|
|
|
|
|
fig.add_trace( |
|
go.Bar( |
|
x=[str(interval) for interval in age_counts.index], |
|
y=age_counts.values, |
|
marker_color='lightblue', |
|
name='Age Distribution' |
|
), |
|
row=1, col=1 |
|
) |
|
|
|
|
|
fig.add_trace( |
|
go.Pie( |
|
labels=employment_labels, |
|
values=employment_type.values, |
|
name="Employment Type" |
|
), |
|
row=1, col=2 |
|
) |
|
|
|
|
|
fig.add_trace( |
|
go.Pie( |
|
labels=same_state_labels, |
|
values=same_state.values, |
|
name="Location" |
|
), |
|
row=2, col=1 |
|
) |
|
|
|
|
|
years_since = facts_df['years_since_application'].value_counts().sort_index() |
|
fig.add_trace( |
|
go.Histogram( |
|
x=facts_df['years_since_application'], |
|
nbinsx=10, |
|
marker_color='lightgreen', |
|
name='Years Since Application' |
|
), |
|
row=2, col=2 |
|
) |
|
|
|
fig.update_layout( |
|
height=600, |
|
title_text="Demographic Analysis from Facts Table", |
|
showlegend=False |
|
) |
|
|
|
return fig |
|
|
|
def facts_data_table(): |
|
"""Display sample of facts table data""" |
|
if facts_df.empty: |
|
return pd.DataFrame() |
|
|
|
|
|
display_cols = [ |
|
'record_id', 'lottery_year', 'fiscal_year', 'country_of_birth', |
|
'wage_amt', 'age_at_application', 'is_multiple_registration', |
|
'full_time_ind', 'employer_worksite_same_state' |
|
] |
|
|
|
sample_data = facts_df[display_cols].head(100).copy() |
|
|
|
|
|
sample_data['wage_amt'] = sample_data['wage_amt'].apply(lambda x: f"${x:,.0f}") |
|
|
|
return sample_data |
|
|
|
|
|
|
|
with gr.Blocks(theme=gr.themes.Soft(), title="H1B Facts Table Analytics") as demo: |
|
gr.Markdown("# π H1B Facts Table Analytics Dashboard") |
|
gr.Markdown("### Comprehensive Analysis of H1B Applications Facts Data") |
|
|
|
with gr.Tab("π Facts Overview"): |
|
gr.Markdown("### Key Metrics from Facts Table") |
|
facts_overview_plot = gr.Plot() |
|
gr.Button("Load Facts Overview", variant="primary").click( |
|
fn=facts_overview, |
|
outputs=facts_overview_plot |
|
) |
|
|
|
with gr.Tab("π° Wage Analysis"): |
|
gr.Markdown("### Wage Distribution from Facts Table") |
|
wage_plot = gr.Plot() |
|
gr.Button("Analyze Wages", variant="primary").click( |
|
fn=wage_distribution, |
|
outputs=wage_plot |
|
) |
|
|
|
with gr.Tab("π Country Analysis"): |
|
gr.Markdown("### Country-wise Analysis from Facts Table") |
|
country_plot = gr.Plot() |
|
gr.Button("Analyze Countries", variant="primary").click( |
|
fn=country_analysis, |
|
outputs=country_plot |
|
) |
|
|
|
with gr.Tab("π
Temporal Analysis"): |
|
gr.Markdown("### Time-based Trends from Facts Table") |
|
temporal_plot = gr.Plot() |
|
gr.Button("Analyze Trends", variant="primary").click( |
|
fn=temporal_analysis, |
|
outputs=temporal_plot |
|
) |
|
|
|
with gr.Tab("π₯ Demographics"): |
|
gr.Markdown("### Demographic Patterns from Facts Table") |
|
demo_plot = gr.Plot() |
|
gr.Button("Analyze Demographics", variant="primary").click( |
|
fn=demographic_analysis, |
|
outputs=demo_plot |
|
) |
|
|
|
with gr.Tab("π Raw Data"): |
|
gr.Markdown("### Sample Facts Table Data (First 100 rows)") |
|
data_table = gr.DataFrame() |
|
gr.Button("Load Sample Data", variant="primary").click( |
|
fn=facts_data_table, |
|
outputs=data_table |
|
) |
|
|
|
|
|
gr.Markdown("---") |
|
gr.Markdown("### Facts Table Schema") |
|
gr.Markdown(""" |
|
**Table**: `fct_h1b_applications` |
|
|
|
**Key Columns**: |
|
- `record_id`: Unique identifier for each application |
|
- `lottery_year`, `fiscal_year`: Temporal dimensions |
|
- `country_of_birth`: Beneficiary country |
|
- `wage_amt`: Offered wage amount |
|
- `age_at_application`: Beneficiary age |
|
- `is_multiple_registration`: Multiple lottery entries flag |
|
- `full_time_ind`: Full-time employment indicator |
|
- `employer_worksite_same_state`: Location alignment flag |
|
- Foreign keys: `employer_sk`, `beneficiary_sk`, `job_sk` |
|
""") |
|
|
|
|
|
if __name__ == "__main__": |
|
demo.launch( |
|
server_name="0.0.0.0", |
|
server_port=7860, |
|
share=True, |
|
show_error=True |
|
) |