TrafCast / data_process /unified.py
amitom's picture
Minimal app for HF Space
73e9c25
import os
import sys
import glob
import pandas as pd
from process import prepare_data_df, build_sensor_index, map_pms_to_sensors
index_map = {
'405': 'I',
'101': 'US',
'101': 'US',
'110': 'I',
'170': 'CA',
'118': 'CA',
'134': 'CA',
'605': 'I',
'210': 'I',
'5': 'I'
}
direction_map = {
'E': 'East',
'W': 'West',
'N': 'North',
'S': 'South'
}
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(PROJECT_ROOT)
DATA_DIR = os.path.join(PROJECT_ROOT, "data_collection", "data")
COORDINATE_DIR = os.path.join(PROJECT_ROOT,"data_collection", "coordinates")
full_df = pd.DataFrame()
for entry in os.scandir(DATA_DIR):
if entry.is_dir():
road_number = entry.name
road_name = index_map[road_number] +" "+ road_number
with os.scandir(entry.path) as it:
for sub in it:
if sub.is_dir() and sub.name in {'E', 'W', 'N', 'S'}:
direction = direction_map[sub.name]
print(f"Processing {road_name} {direction}")
data_dir = os.path.join(entry.path, sub.name)
coordinate_dir = os.path.join(COORDINATE_DIR, f"{road_name} {direction}.xlsx")
for i in range(1,32):
if i <=9:
raw_data_pattern = os.path.join(data_dir, f"{road_number}_{sub.name}_03*0{i}*2025.xlsx")
date = f"2025-03-0{i}"
else:
raw_data_pattern = os.path.join(data_dir, f"{road_number}_{sub.name}_03*{i}*2025.xlsx")
date = f"2025-03-{i}"
matching_files = glob.glob(raw_data_pattern)
if not matching_files:
print(f"No data file found for {road_name} {direction} on {date}, skipping...")
continue
raw_data = matching_files[0]
df_coord = pd.read_excel(coordinate_dir)
df_data = pd.read_excel(raw_data)
clean_data_df = prepare_data_df(df_data, df_coord,date)
sensors = build_sensor_index(clean_data_df)
enriched = map_pms_to_sensors(clean_data_df, sensors)
enriched["road_name"] = road_name
enriched["direction"] = direction
full_df = pd.concat([full_df, enriched], ignore_index=True)
print(f"finished {date}")
full_df.drop(columns=["Postmile (Abs)", "Postmile (CA)", "VDS", "Time_hour", "matched_sensor_lat", "matched_sensor_lon", "distance_m"], inplace=True)
desired_order = [
"Time","sensor_id", "Latitude", "Longitude",
"road_name", "direction", "# Lane Points",
"% Observed", "weather", "Day", "AggSpeed"
]
full_df = full_df[desired_order]
full_df.rename(columns={
"AggSpeed": "speed_mph",
"# Lane Points": "lanes"
}, inplace=True)
full_df.to_csv('full_df_weather.csv',index=False)