| import os | |
| import sys | |
| import glob | |
| import pandas as pd | |
| from process import prepare_data_df, build_sensor_index, map_pms_to_sensors | |
| index_map = { | |
| '405': 'I', | |
| '101': 'US', | |
| '101': 'US', | |
| '110': 'I', | |
| '170': 'CA', | |
| '118': 'CA', | |
| '134': 'CA', | |
| '605': 'I', | |
| '210': 'I', | |
| '5': 'I' | |
| } | |
| direction_map = { | |
| 'E': 'East', | |
| 'W': 'West', | |
| 'N': 'North', | |
| 'S': 'South' | |
| } | |
| PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) | |
| sys.path.append(PROJECT_ROOT) | |
| DATA_DIR = os.path.join(PROJECT_ROOT, "data_collection", "data") | |
| COORDINATE_DIR = os.path.join(PROJECT_ROOT,"data_collection", "coordinates") | |
| full_df = pd.DataFrame() | |
| for entry in os.scandir(DATA_DIR): | |
| if entry.is_dir(): | |
| road_number = entry.name | |
| road_name = index_map[road_number] +" "+ road_number | |
| with os.scandir(entry.path) as it: | |
| for sub in it: | |
| if sub.is_dir() and sub.name in {'E', 'W', 'N', 'S'}: | |
| direction = direction_map[sub.name] | |
| print(f"Processing {road_name} {direction}") | |
| data_dir = os.path.join(entry.path, sub.name) | |
| coordinate_dir = os.path.join(COORDINATE_DIR, f"{road_name} {direction}.xlsx") | |
| for i in range(1,32): | |
| if i <=9: | |
| raw_data_pattern = os.path.join(data_dir, f"{road_number}_{sub.name}_03*0{i}*2025.xlsx") | |
| date = f"2025-03-0{i}" | |
| else: | |
| raw_data_pattern = os.path.join(data_dir, f"{road_number}_{sub.name}_03*{i}*2025.xlsx") | |
| date = f"2025-03-{i}" | |
| matching_files = glob.glob(raw_data_pattern) | |
| if not matching_files: | |
| print(f"No data file found for {road_name} {direction} on {date}, skipping...") | |
| continue | |
| raw_data = matching_files[0] | |
| df_coord = pd.read_excel(coordinate_dir) | |
| df_data = pd.read_excel(raw_data) | |
| clean_data_df = prepare_data_df(df_data, df_coord,date) | |
| sensors = build_sensor_index(clean_data_df) | |
| enriched = map_pms_to_sensors(clean_data_df, sensors) | |
| enriched["road_name"] = road_name | |
| enriched["direction"] = direction | |
| full_df = pd.concat([full_df, enriched], ignore_index=True) | |
| print(f"finished {date}") | |
| full_df.drop(columns=["Postmile (Abs)", "Postmile (CA)", "VDS", "Time_hour", "matched_sensor_lat", "matched_sensor_lon", "distance_m"], inplace=True) | |
| desired_order = [ | |
| "Time","sensor_id", "Latitude", "Longitude", | |
| "road_name", "direction", "# Lane Points", | |
| "% Observed", "weather", "Day", "AggSpeed" | |
| ] | |
| full_df = full_df[desired_order] | |
| full_df.rename(columns={ | |
| "AggSpeed": "speed_mph", | |
| "# Lane Points": "lanes" | |
| }, inplace=True) | |
| full_df.to_csv('full_df_weather.csv',index=False) | |