{ "cells": [ { "cell_type": "code", "execution_count": 4, "id": "041c9721", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "data = pd.read_csv('/home/xj/toolAugEnv/code/toolConstraint/database/flights/Combined_Flights_2022.csv')\n", "# df.to_csv('/home/xj/toolAugEnv/code/toolConstraint/database/flights/clean_Flights_2022.csv')" ] }, { "cell_type": "code", "execution_count": 2, "id": "03d0f39e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "FlightDate 2022-03-15\n", "Airline Delta Air Lines Inc.\n", "Origin LAS\n", "Dest SLC\n", "Cancelled False\n", " ... \n", "ArrDel15 0.0\n", "ArrivalDelayGroups -2.0\n", "ArrTimeBlk 1600-1659\n", "DistanceGroup 2\n", "DivAirportLandings 0\n", "Name: 3504987, Length: 61, dtype: object" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.iloc[3504987]" ] }, { "cell_type": "code", "execution_count": 5, "id": "036418f5", "metadata": {}, "outputs": [], "source": [ "data_dict = data.to_dict(orient = 'split')" ] }, { "cell_type": "code", "execution_count": 13, "id": "ef12c4b3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "FlightDate 2022-01-29\n", "Airline Frontier Airlines Inc.\n", "Origin COS\n", "Dest LAS\n", "Cancelled False\n", "Diverted False\n", "CRSDepTime 1558\n", "DepTime 1553.0\n", "DepDelayMinutes 0.0\n", "DepDelay -5.0\n", "ArrTime 1646.0\n", "ArrDelayMinutes 0.0\n", "AirTime 91.0\n", "CRSElapsedTime 124.0\n", "ActualElapsedTime 113.0\n", "Distance 604.0\n", "Year 2022\n", "Quarter 1\n", "Month 1\n", "DayofMonth 29\n", "DayOfWeek 6\n", "Marketing_Airline_Network F9\n", "Operated_or_Branded_Code_Share_Partners F9\n", "DOT_ID_Marketing_Airline 20436\n", "IATA_Code_Marketing_Airline F9\n", "Flight_Number_Marketing_Airline 2019\n", "Operating_Airline F9\n", "DOT_ID_Operating_Airline 20436\n", "IATA_Code_Operating_Airline F9\n", "Tail_Number N235FR\n", "Flight_Number_Operating_Airline 2019\n", "OriginAirportID 11109\n", "OriginAirportSeqID 1110902\n", "OriginCityMarketID 30189\n", "OriginCityName Colorado Springs, CO\n", "OriginState CO\n", "OriginStateFips 8\n", "OriginStateName Colorado\n", "OriginWac 82\n", "DestAirportID 12889\n", "DestAirportSeqID 1288903\n", "DestCityMarketID 32211\n", "DestCityName Las Vegas, NV\n", "DestState NV\n", "DestStateFips 32\n", "DestStateName Nevada\n", "DestWac 85\n", "DepDel15 0.0\n", "DepartureDelayGroups -1.0\n", "DepTimeBlk 1500-1559\n", "TaxiOut 13.0\n", "WheelsOff 1606.0\n", "WheelsOn 1637.0\n", "TaxiIn 9.0\n", "CRSArrTime 1702\n", "ArrDelay -16.0\n", "ArrDel15 0.0\n", "ArrivalDelayGroups -2.0\n", "ArrTimeBlk 1700-1759\n", "DistanceGroup 3\n", "DivAirportLandings 0\n" ] } ], "source": [ "for idx,unit in enumerate(data_dict['columns']):\n", " print(unit, data_dict['data'][3000020][idx])" ] }, { "cell_type": "code", "execution_count": 12, "id": "372b3fd9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['2022-01-29',\n", " 'Frontier Airlines Inc.',\n", " 'COS',\n", " 'LAS',\n", " False,\n", " False,\n", " 1558,\n", " 1553.0,\n", " 0.0,\n", " -5.0,\n", " 1646.0,\n", " 0.0,\n", " 91.0,\n", " 124.0,\n", " 113.0,\n", " 604.0,\n", " 2022,\n", " 1,\n", " 1,\n", " 29,\n", " 6,\n", " 'F9',\n", " 'F9',\n", " 20436,\n", " 'F9',\n", " 2019,\n", " 'F9',\n", " 20436,\n", " 'F9',\n", " 'N235FR',\n", " 2019,\n", " 11109,\n", " 1110902,\n", " 30189,\n", " 'Colorado Springs, CO',\n", " 'CO',\n", " 8,\n", " 'Colorado',\n", " 82,\n", " 12889,\n", " 1288903,\n", " 32211,\n", " 'Las Vegas, NV',\n", " 'NV',\n", " 32,\n", " 'Nevada',\n", " 85,\n", " 0.0,\n", " -1.0,\n", " '1500-1559',\n", " 13.0,\n", " 1606.0,\n", " 1637.0,\n", " 9.0,\n", " 1702,\n", " -16.0,\n", " 0.0,\n", " -2.0,\n", " '1700-1759',\n", " 3,\n", " 0]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_dict['data'][3000020]" ] }, { "cell_type": "code", "execution_count": 11, "id": "371a85fd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4078318\n" ] } ], "source": [ "print(len(data_dict['data']))" ] }, { "cell_type": "code", "execution_count": 12, "id": "64d46483", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "FlightDate 0\n", "DepTime 7\n", "ArrTime 10\n", "ActualElapsedTime 14\n", "Distance 15\n", "OriginCityName 34\n", "DestCityName 42\n" ] } ], "source": [ "for idx,unit in enumerate(data_dict['columns']):\n", " if unit in ['FlightDate','DepTime','ArrTime','ActualElapsedTime','Distance','OriginCityName','DestCityName']:\n", " print(unit, str(idx))" ] }, { "cell_type": "code", "execution_count": 6, "id": "81047adf", "metadata": {}, "outputs": [], "source": [ "import math\n", "def convert_to_hhmm(time_float):\n", " \"\"\"\n", " Convert a float time to hh:mm format\n", " :param time_float: Time as a float. Example: 757.0\n", " :return: Time in hh:mm format. Example: \"07:57\"\n", " \"\"\"\n", " try:\n", " hours = int(time_float // 100)\n", " minutes = int(time_float % 100)\n", " return \"{:02d}:{:02d}\".format(hours, minutes)\n", " except:\n", " return time_float\n", "\n", "def minutes_to_hours_minutes(minutes):\n", " # Check for NaN and handle it\n", " if math.isnan(minutes):\n", " return \"NaN\"\n", " \n", " # Ensure minutes is an integer or rounded to the nearest integer\n", " minutes = round(minutes)\n", " \n", " hours = minutes // 60\n", " remaining_minutes = minutes % 60\n", " return f\"{hours} hours {remaining_minutes} minutes\"" ] }, { "cell_type": "code", "execution_count": 7, "id": "ee34cbde", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b60c3d13fb6d44258103c6251365272b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from tqdm.autonotebook import tqdm\n", "import random\n", "new_data = []\n", "for idx, unit in tqdm(enumerate(data_dict['data'])):\n", " tmp_dict = {k:\"\" for k in ['FlightDate','DepTime','ArrTime','ActualElapsedTime','Distance','OriginCityName','DestCityName','Price']}\n", " tmp_dict['FlightDate'] = unit[0]\n", " tmp_dict['DepTime'] = convert_to_hhmm(unit[7])\n", " tmp_dict['ArrTime'] = convert_to_hhmm(unit[10])\n", " tmp_dict['ActualElapsedTime'] = minutes_to_hours_minutes(unit[14])\n", " tmp_dict['Distance'] = unit[15]\n", " tmp_dict['OriginCityName'] = unit[34].split(',')[0].split('/')[0]\n", " tmp_dict['DestCityName'] = unit[42].split(',')[0].split('/')[0]\n", " tmp_dict['Price'] = int((unit[15]) * random.uniform(0.2,0.5))\n", " new_data.append(tmp_dict)" ] }, { "cell_type": "code", "execution_count": 11, "id": "aee3f422", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'FlightDate': '2022-01-29',\n", " 'DepTime': '15:53',\n", " 'ArrTime': '16:46',\n", " 'ActualElapsedTime': '1 hours 53 minutes',\n", " 'Distance': 604.0,\n", " 'OriginCityName': 'Colorado Springs',\n", " 'DestCityName': 'Las Vegas',\n", " 'Price': 205}" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_data[3000020]" ] }, { "cell_type": "code", "execution_count": 90, "id": "bfb243c0", "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(new_data)" ] }, { "cell_type": "code", "execution_count": 62, "id": "f152a150", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | FlightDate | \n", "DepTime | \n", "ArrTime | \n", "ActualElapsedTime | \n", "Distance | \n", "OriginCityName | \n", "DestCityName | \n", "Price | \n", "
---|---|---|---|---|---|---|---|---|
3488394 | \n", "2022-03-01 | \n", "07:24 | \n", "15:15 | \n", "4 hours 51 minutes | \n", "2422.0 | \n", "Seattle | \n", "New York | \n", "720 | \n", "
3509382 | \n", "2022-03-01 | \n", "22:29 | \n", "06:07 | \n", "4 hours 38 minutes | \n", "2422.0 | \n", "Seattle | \n", "New York | \n", "484 | \n", "
3736056 | \n", "2022-03-01 | \n", "23:33 | \n", "07:16 | \n", "4 hours 43 minutes | \n", "2422.0 | \n", "Seattle | \n", "New York | \n", "1199 | \n", "
3736260 | \n", "2022-03-01 | \n", "14:37 | \n", "22:05 | \n", "4 hours 28 minutes | \n", "2422.0 | \n", "Seattle | \n", "New York | \n", "950 | \n", "
3736313 | \n", "2022-03-01 | \n", "09:11 | \n", "17:17 | \n", "5 hours 6 minutes | \n", "2422.0 | \n", "Seattle | \n", "New York | \n", "1050 | \n", "
3776858 | \n", "2022-03-01 | \n", "21:01 | \n", "04:32 | \n", "4 hours 31 minutes | \n", "2422.0 | \n", "Seattle | \n", "New York | \n", "1146 | \n", "
3778565 | \n", "2022-03-01 | \n", "13:18 | \n", "21:08 | \n", "4 hours 50 minutes | \n", "2422.0 | \n", "Seattle | \n", "New York | \n", "578 | \n", "
\n", " | FlightDate | \n", "DepTime | \n", "ArrTime | \n", "ActualElapsedTime | \n", "Distance | \n", "OriginCityName | \n", "DestCityName | \n", "Price | \n", "Flight Number | \n", "
---|---|---|---|---|---|---|---|---|---|
4155 | \n", "2022-04-01 | \n", "10:42 | \n", "11:34 | \n", "0 hours 52 minutes | \n", "196.0 | \n", "Montrose | \n", "Denver | \n", "42 | \n", "F0004155 | \n", "
4156 | \n", "2022-04-01 | \n", "14:43 | \n", "15:32 | \n", "0 hours 49 minutes | \n", "196.0 | \n", "Montrose | \n", "Denver | \n", "64 | \n", "F0004156 | \n", "
4157 | \n", "2022-04-01 | \n", "17:38 | \n", "18:58 | \n", "1 hours 20 minutes | \n", "196.0 | \n", "Montrose | \n", "Denver | \n", "97 | \n", "F0004157 | \n", "
7439 | \n", "2022-04-02 | \n", "13:38 | \n", "16:32 | \n", "1 hours 54 minutes | \n", "733.0 | \n", "Montrose | \n", "Dallas | \n", "151 | \n", "F0007439 | \n", "
7440 | \n", "2022-04-02 | \n", "12:37 | \n", "13:29 | \n", "0 hours 52 minutes | \n", "196.0 | \n", "Montrose | \n", "Denver | \n", "54 | \n", "F0007440 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
4045139 | \n", "2022-03-27 | \n", "11:50 | \n", "12:17 | \n", "1 hours 27 minutes | \n", "419.0 | \n", "Montrose | \n", "Phoenix | \n", "133 | \n", "F4045139 | \n", "
4045140 | \n", "2022-03-28 | \n", "11:45 | \n", "12:22 | \n", "1 hours 37 minutes | \n", "419.0 | \n", "Montrose | \n", "Phoenix | \n", "188 | \n", "F4045140 | \n", "
4045141 | \n", "2022-03-29 | \n", "11:35 | \n", "12:17 | \n", "1 hours 42 minutes | \n", "419.0 | \n", "Montrose | \n", "Phoenix | \n", "144 | \n", "F4045141 | \n", "
4045142 | \n", "2022-03-30 | \n", "11:38 | \n", "12:13 | \n", "1 hours 35 minutes | \n", "419.0 | \n", "Montrose | \n", "Phoenix | \n", "125 | \n", "F4045142 | \n", "
4045143 | \n", "2022-03-31 | \n", "11:40 | \n", "12:19 | \n", "1 hours 39 minutes | \n", "419.0 | \n", "Montrose | \n", "Phoenix | \n", "129 | \n", "F4045143 | \n", "
2035 rows × 9 columns
\n", "\n", " | FlightDate | \n", "DepTime | \n", "ArrTime | \n", "ActualElapsedTime | \n", "Distance | \n", "OriginCityName | \n", "DestCityName | \n", "Price | \n", "
---|---|---|---|---|---|---|---|---|
1369 | \n", "2022-04-01 | \n", "09:04 | \n", "10:23 | \n", "1 hours 19 minutes | \n", "229.0 | \n", "Washington | \n", "New York | \n", "105 | \n", "
1370 | \n", "2022-04-01 | \n", "11:07 | \n", "12:29 | \n", "1 hours 22 minutes | \n", "229.0 | \n", "Washington | \n", "New York | \n", "56 | \n", "
1380 | \n", "2022-04-01 | \n", "13:20 | \n", "14:52 | \n", "1 hours 32 minutes | \n", "229.0 | \n", "Washington | \n", "New York | \n", "95 | \n", "
1409 | \n", "2022-04-01 | \n", "19:03 | \n", "20:23 | \n", "1 hours 20 minutes | \n", "229.0 | \n", "Washington | \n", "New York | \n", "71 | \n", "
1436 | \n", "2022-04-01 | \n", "15:32 | \n", "17:03 | \n", "1 hours 31 minutes | \n", "229.0 | \n", "Washington | \n", "New York | \n", "64 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
565444 | \n", "2022-04-01 | \n", "22:55 | \n", "07:09 | \n", "5 hours 14 minutes | \n", "2475.0 | \n", "Los Angeles | \n", "New York | \n", "621 | \n", "
565446 | \n", "2022-04-01 | \n", "11:39 | \n", "19:49 | \n", "5 hours 10 minutes | \n", "2475.0 | \n", "Los Angeles | \n", "New York | \n", "1100 | \n", "
565511 | \n", "2022-04-01 | \n", "15:48 | \n", "22:16 | \n", "4 hours 28 minutes | \n", "1620.0 | \n", "Denver | \n", "New York | \n", "575 | \n", "
565541 | \n", "2022-04-01 | \n", "17:36 | \n", "23:05 | \n", "3 hours 29 minutes | \n", "1620.0 | \n", "Denver | \n", "New York | \n", "669 | \n", "
565581 | \n", "2022-04-01 | \n", "20:52 | \n", "23:48 | \n", "1 hours 56 minutes | \n", "733.0 | \n", "Chicago | \n", "New York | \n", "338 | \n", "
889 rows × 8 columns
\n", "