import os
import pickle
import datetime
import sys
from multiprocessing import Pool

import pandas as pd


def get_labeled_data_as_df(path):
    """Load all labeled trajectories for a single GeoLife user directory."""
    trajectory_frames = []

    # labels.txt lists one transportation-mode segment per line:
    # start time, end time, mode (tab-separated).
    labelfile = os.path.join(path, "labels.txt")
    _label_df = pd.read_csv(labelfile, sep="\t", header=0,
                            names=["starttime", "endtime", "mode"],
                            parse_dates=[0, 1])
    _label_df["startdate"] = _label_df["starttime"].dt.date
    _label_startdate_set = set(_label_df["startdate"])

    datapath = os.path.join(path, "Trajectory")
    for file in os.listdir(datapath):
        # GeoLife .plt files have a 6-line header; keep latitude, longitude,
        # altitude, date and time.
        df = pd.read_csv(os.path.join(datapath, file), sep=",", header=None,
                         skiprows=6, usecols=[0, 1, 3, 5, 6],
                         names=["lat", "lon", "altitude", "date", "time"])
        df["datetime"] = pd.to_datetime(df["date"] + " " + df["time"])

        # The file name starts with the recording date, e.g. 20081023025304.plt.
        date_of_traj = datetime.datetime.strptime(file[:8], "%Y%m%d").date()
        if date_of_traj in _label_startdate_set:
            labels_for_date = _label_df[_label_df["startdate"] == date_of_traj]

            def is_in(trajrow):
                # Return the transportation mode whose labeled interval contains
                # this GPS fix, or None (-> NaN) if no interval matches.
                for i, row in labels_for_date.iterrows():
                    if row["starttime"] <= trajrow["datetime"] <= row["endtime"]:
                        return row["mode"]

            df["label"] = df.apply(is_in, axis=1)
            trajectory_frames.append(df)
            print("added", datapath, file)

    return trajectory_frames


if __name__ == '__main__':
    '''if len(sys.argv) < 2:
        print("Usage: raw_data_loader.py /path/to/geolife/Data/")
        exit(-1)'''
    path = r'D:\Geolife Trajectories 1.3\Geolife Trajectories 1.3\Data'

    # Collect only the user directories that ship a labels.txt file.
    traj_with_labels_paths = []
    for file in os.listdir(path):
        currfile = os.path.join(path, file)
        if os.path.isdir(currfile):
            if "labels.txt" not in os.listdir(currfile):
                continue
            traj_with_labels_paths.append(currfile)

    # Parse the labeled users in parallel and pickle the resulting frames.
    with Pool(3) as p:
        traj_frames = p.map(get_labeled_data_as_df, traj_with_labels_paths)

    # Note: the data/ output directory must already exist.
    with open("data/raw_labeled.pkl", "wb") as f:
        pickle.dump(traj_frames, f)