Spaces:
Sleeping
Sleeping
| """ | |
| NYC Taxi Small Dataset Loader | |
| Creates a DuckDB database with sample taxi trip data for testing. | |
| """ | |
| import duckdb | |
| import os | |
| from datetime import datetime, timedelta | |
| def create_database(db_path: str = "nyc_taxi_small.duckdb"): | |
| """Create a DuckDB database with sample taxi data.""" | |
| # Remove existing database if it exists | |
| if os.path.exists(db_path): | |
| os.remove(db_path) | |
| # Connect to DuckDB | |
| conn = duckdb.connect(db_path) | |
| # Read and execute schema | |
| schema_path = os.path.join(os.path.dirname(__file__), "schema.sql") | |
| with open(schema_path, 'r') as f: | |
| schema_sql = f.read() | |
| conn.execute(schema_sql) | |
| # Insert sample data | |
| base_time = datetime(2023, 1, 1, 8, 0, 0) | |
| # Sample trips data | |
| trips_data = [ | |
| (1, base_time, base_time + timedelta(minutes=15), 1, 2.5, -73.9857, 40.7484, -73.9881, 40.7614, 12.50, 2.50, 15.00, "Credit", "CMT"), | |
| (2, base_time + timedelta(minutes=30), base_time + timedelta(minutes=45), 2, 1.8, -73.9857, 40.7484, -73.9881, 40.7614, 8.50, 1.70, 10.20, "Cash", "VTS"), | |
| (3, base_time + timedelta(hours=1), base_time + timedelta(hours=1, minutes=20), 1, 4.2, -73.9857, 40.7484, -73.9881, 40.7614, 18.00, 3.60, 21.60, "Credit", "CMT"), | |
| (4, base_time + timedelta(hours=2), base_time + timedelta(hours=2, minutes=10), 3, 0.9, -73.9857, 40.7484, -73.9881, 40.7614, 6.00, 1.20, 7.20, "Credit", "VTS"), | |
| (5, base_time + timedelta(hours=3), base_time + timedelta(hours=3, minutes=25), 1, 3.1, -73.9857, 40.7484, -73.9881, 40.7614, 14.50, 2.90, 17.40, "Cash", "CMT"), | |
| (6, base_time + timedelta(hours=4), base_time + timedelta(hours=4, minutes=12), 2, 2.3, -73.9857, 40.7484, -73.9881, 40.7614, 11.00, 2.20, 13.20, "Credit", "VTS"), | |
| (7, base_time + timedelta(hours=5), base_time + timedelta(hours=5, minutes=18), 1, 1.5, -73.9857, 40.7484, -73.9881, 40.7614, 7.50, 1.50, 9.00, "Credit", "CMT"), | |
| (8, base_time + timedelta(hours=6), base_time + timedelta(hours=6, minutes=22), 4, 5.8, -73.9857, 40.7484, -73.9881, 40.7614, 25.00, 5.00, 30.00, "Credit", "VTS"), | |
| (9, base_time + timedelta(hours=7), base_time + timedelta(hours=7, minutes=8), 1, 0.7, -73.9857, 40.7484, -73.9881, 40.7614, 5.50, 1.10, 6.60, "Cash", "CMT"), | |
| (10, base_time + timedelta(hours=8), base_time + timedelta(hours=8, minutes=35), 2, 6.2, -73.9857, 40.7484, -73.9881, 40.7614, 28.00, 5.60, 33.60, "Credit", "VTS"), | |
| ] | |
| # Sample zones data | |
| zones_data = [ | |
| (1, "Manhattan", "Central Park", "Yellow Zone"), | |
| (2, "Manhattan", "Times Square", "Yellow Zone"), | |
| (3, "Brooklyn", "Williamsburg", "Boro Zone"), | |
| (4, "Queens", "Astoria", "Boro Zone"), | |
| (5, "Bronx", "Yankee Stadium", "Boro Zone"), | |
| (6, "Staten Island", "St. George", "Boro Zone"), | |
| ] | |
| # Insert trips data | |
| conn.executemany( | |
| "INSERT INTO trips VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", | |
| trips_data | |
| ) | |
| # Insert zones data | |
| conn.executemany( | |
| "INSERT INTO zones VALUES (?, ?, ?, ?)", | |
| zones_data | |
| ) | |
| conn.close() | |
| print(f"Created database: {db_path}") | |
| return db_path | |
| def load_data(db_path: str = "nyc_taxi_small.duckdb"): | |
| """Load data into the database - wrapper for create_database.""" | |
| return create_database(db_path) | |
| if __name__ == "__main__": | |
| create_database() | |