{ "cells": [ { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "import os\n", "from os.path import join, dirname, exists\n", "import concurrent\n", "import pandas as pd\n", "from tqdm.notebook import tqdm\n", "import pdfplumber\n", "from datetime import datetime\n", "import requests\n", "import shutil\n", "import sys" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def check_exists(date):\n", " str_date = date.strftime('%Y%m%d')\n", " file_name = f\"AQI_Bulletin_{str_date}.pdf\"\n", " file_path = f\"AQI_data/{file_name}\"\n", " return exists(file_path), file_path, file_name\n", "\n", "def download(date):\n", " file_exists, file_path, file_name = check_exists(date)\n", " if file_exists:\n", " return file_path\n", " \n", " url = f\"https://cpcb.nic.in//upload/Downloads/{file_name}\"\n", " response = requests.get(url)\n", " if response.status_code == 200:\n", " with open(file_path, 'wb') as f:\n", " f.write(response.content)\n", " return file_path\n", " else:\n", " print(f\"Failed to download {url} with status code {response.status_code}\")\n", " return None" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',\n", " '2016-01-05', '2016-01-06', '2016-01-07', '2016-01-08',\n", " '2016-01-09', '2016-01-10',\n", " ...\n", " '2024-11-02', '2024-11-03', '2024-11-04', '2024-11-05',\n", " '2024-11-06', '2024-11-07', '2024-11-08', '2024-11-09',\n", " '2024-11-10', '2024-11-11'],\n", " dtype='datetime64[ns]', length=3238, freq='D')\n" ] }, { "data": { "text/plain": [ "(None, 3238)" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dates = pd.date_range('2016-01-01', datetime.today() - pd.Timedelta(days=1), freq='D')\n", "# dates = pd.date_range('2024-01-01', '2024-02-01', freq='D')\n", "print(dates), len(dates)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Failed to download https://cpcb.nic.in//upload/Downloads/AQI_Bulletin_20160606.pdf with status code 404\n", "Failed to download https://cpcb.nic.in//upload/Downloads/AQI_Bulletin_20171014.pdf with status code 404\n", "Failed to download https://cpcb.nic.in//upload/Downloads/AQI_Bulletin_20170618.pdf with status code 404\n" ] } ], "source": [ "with concurrent.futures.ThreadPoolExecutor(48) as executor:\n", " files = list(executor.map(download, dates))" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3238\n" ] }, { "data": { "text/plain": [ "3235" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(len(files))\n", "files = list(filter(None, files))\n", "len(files)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6b1ea587cbcb48f6838b51ebc5b2dfc3", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/3238 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "File AQI_Bulletin_20160606.pdf does not exist\n", "No tables found in AQI_data/AQI_Bulletin_20160704.pdf\n", "No tables found in AQI_data/AQI_Bulletin_20160721.pdf\n", "No tables found in AQI_data/AQI_Bulletin_20160723.pdf\n", "No tables found in AQI_data/AQI_Bulletin_20160722.pdf\n", "File AQI_Bulletin_20170618.pdf does not exist\n", "File AQI_Bulletin_20171014.pdf does not exist\n" ] } ], "source": [ "def check_valid(value):\n", " if value is None:\n", " return False\n", " if value == \"\":\n", " return False\n", " return True\n", "\n", "def process_pattern_1(table, i, key):\n", " # second line\n", " if (key is not None) and (not check_valid(table[i][0])):\n", " air_quality = table[i][2]\n", " return air_quality\n", " \n", " # first line\n", " if check_valid(table[i][0]):\n", " key = table[i][0]\n", " aqi = int(table[i][4])\n", " pollutant = table[i][5]\n", " air_quality = process_pattern_1(table, i+1, key)\n", " return {key: {\"AQI\": aqi, \"Pollutant\": pollutant, \"Air Quality\": air_quality, \"Based on number of monitoring stations\": None}}\n", " \n", "def process_pattern_2(table, i, key):\n", " # second line\n", " try:\n", " if (key is not None) and (not check_valid(table[i][0])):\n", " air_quality = table[i][2]\n", " return air_quality\n", " except Exception as e:\n", " print(table[i-1])\n", " raise e\n", " \n", " # first line\n", " if check_valid(table[i][0]):\n", " try:\n", " key = table[i][0]\n", " if check_valid(table[i][1]):\n", " air_quality = table[i][1]\n", " need_for_second_line = False\n", " else:\n", " need_for_second_line = True\n", " aqi = int(table[i][4])\n", " pollutant = table[i][5] # p2\n", " n_stations = table[i][6]\n", " except Exception as e:\n", " print(table[i])\n", " print(table)\n", " raise e\n", " if need_for_second_line:\n", " air_quality = process_pattern_2(table, i+1, key)\n", " return {key: {\"AQI\": aqi, \"Pollutant\": pollutant, \"Air Quality\": air_quality, \"Based on number of monitoring stations\": n_stations}}\n", " \n", "def process_pattern_3(table, i, key):\n", " # second line\n", " if (key is not None) and (not check_valid(table[i][0])):\n", " air_quality = table[i][2]\n", " return air_quality\n", " \n", " # first line\n", " if check_valid(table[i][0]):\n", " key = table[i][0]\n", " if check_valid(table[i][1]):\n", " air_quality = table[i][1]\n", " second_line_needed = False\n", " else:\n", " second_line_needed = True\n", " aqi = int(table[i][2])\n", " pollutant = table[i][3]\n", " n_stations = table[i][4]\n", " if second_line_needed:\n", " air_quality = process_pattern_3(table, i+1, key)\n", " return {key: {\"AQI\": aqi, \"Pollutant\": pollutant, \"Air Quality\": air_quality, \"Based on number of monitoring stations\": n_stations}}\n", "\n", "def process_pattern_4(table, i, key):\n", " # ['S.No', 'City', 'Air Quality', 'Index Value', 'Prominent Pollutant', 'Based on Number\\nof Monitoring\\nStations'], ['1', 'Agra', 'Moderate', '138', 'PM\\n2.5', '1'], [None, None, '', None, None, None], ['2', 'Ahmedabad', 'Satisfactory', '77', 'PM\\n10', '1'], [None, None, '', None, None, None], ['3', 'Aizawl', 'Satisfactory', '53', 'PM\\n2.5', '1'], [None, None, '', None, None, None], ['4', 'Ajmer', 'Satisfa\n", " # # invalid line\n", " # if (key is not None) and (not check_valid(table[i][0])):\n", " # air_quality = table[i][2]\n", " # return air_quality\n", " \n", " # first line\n", " if check_valid(table[i][0]):\n", " key = table[i][1]\n", " air_quality = table[i][2]\n", " aqi = int(table[i][3].split(\"\\n\")[0])\n", " pollutant = table[i][4]\n", " n_stations = table[i][5]\n", " return {key: {\"AQI\": aqi, \"Pollutant\": pollutant, \"Air Quality\": air_quality, \"Based on number of monitoring stations\": n_stations}}\n", " \n", "\n", "def process_table(table, start):\n", " data_dict = {}\n", " if table[0] == ['City', 'Air Quality', None, None, 'Index Value', 'Prominent\\nPollutant']:\n", " table = table[1:]\n", " for i in range(len(table)):\n", " data = process_pattern_1(table, i, None)\n", " if data:\n", " data_dict.update(process_pattern_1(table, i, None))\n", " \n", " df = pd.DataFrame(data_dict).T\n", " df.index.name = \"City\"\n", " df.reset_index(inplace=True, drop=False)\n", " return df, None\n", " elif (table[0] == ['City', 'Air Quality', None, None, 'Index Value', 'Prominent\\nPollutant', 'Based on number of\\nmonitoring stations']) or (table[0] == ['City', 'Air Quality', None, None, 'Index Value', None, None, 'Prominent\\nPollutant', 'Based on\\nnumber of\\nstations']) or (table[0] == ['City', 'Air Quality', None, None, 'Index Value', 'Prominent\\nPollutant', 'Based on number of\\nmonitoring stations']) or (table[0] == ['City', 'Air Quality', None, None, 'Index Value', 'Prominent\\nPollutant', 'Based on\\nnumber of\\nmonitoring\\nstations']) or (table[0] == ['City', 'Air Quality', None, None, None, None, 'Index Value', None, None, 'Prominent\\nPollutant', 'Based on number of\\nstations']) or (table[0] == ['City', 'Air Quality', None, None, None, None, 'Index Value', 'Prominent\\nPollutant', 'Based on\\nnumber of\\nmonitoring\\nstations']) or (table[0] == ['City', 'Air Quality', None, None, 'Index\\nValue', 'Prominent\\nPollutant', 'Based on number of\\nmonitoring stations']) or (table[0] == ['City', 'Air Quality', None, None, None, None, 'Index Value', 'Prominent\\nPollutant', 'Based on\\nnumber of\\nstations']):\n", " table = table[1:]\n", " for i in range(len(table)):\n", " data = process_pattern_2(table, i, None)\n", " if data:\n", " data_dict.update(process_pattern_2(table, i, None))\n", " \n", " df = pd.DataFrame(data_dict).T\n", " df.index.name = \"City\"\n", " df.reset_index(inplace=True, drop=False)\n", " return df, None\n", " elif table[0] == ['City', 'Air Quality', 'Index Value', 'Prominent\\nPollutant', 'Based on number of\\nmonitoring stations']:\n", " # print(\"Pattern 3\")\n", " table = table[1:]\n", " for i in range(len(table)):\n", " data = process_pattern_3(table, i, None)\n", " if data:\n", " data_dict.update(process_pattern_3(table, i, None))\n", " df = pd.DataFrame(data_dict).T\n", " df.index.name = \"City\"\n", " df.reset_index(inplace=True, drop=False)\n", " return df, None\n", " elif (table[0] == ['S.No', 'City', 'Air Quality', 'Index Value', 'Prominent Polluta\\nnt', 'Based on number\\nof monitoring\\nstations']) or (table[0] == ['S.No', 'City', 'Air Quality', 'Index Value', 'Prominent Pollutant', 'Based on Number\\nof Monitoring\\nStations']) or (table[0] == ['S.No', 'City', 'Air Quality', 'Index\\nValue', 'Prominent Pollutant', 'No. of Stations\\nParticipated/\\nTotal Stations']):\n", " # print(\"Pattern 4\")\n", " table = table[1:]\n", " for i in range(len(table)):\n", " data = process_pattern_4(table, i, None)\n", " if data:\n", " data_dict.update(process_pattern_4(table, i, None))\n", " \n", " df = pd.DataFrame(data_dict).T\n", " df.index.name = \"City\"\n", " df.reset_index(inplace=True, drop=False)\n", " return df, None\n", " elif (table[0] == ['Good', 'Minimal impact']) or (table[0] == ['Good', 'Minimal Impact']) or (table[0] == ['AQI', 'Category', 'Color Code', 'Possible Health Impacts']):\n", " # print(\"Not a data table\")\n", " return None, None\n", " else:\n", " print(table)\n", " raise ValueError(\"Table pattern not recognized\")\n", "\n", "def process_file(date):\n", " file_exists, file_path, file_name = check_exists(date)\n", " if not file_exists:\n", " print(f\"File {file_name} does not exist\")\n", " return None\n", " save_path = file_name.replace(\".pdf\",\".csv\")\n", " if exists(f\"AQI_data_csv/{save_path}\"):\n", " try:\n", " pd.read_csv(f\"AQI_data_csv/{save_path}\")\n", " # print(f\"File {save_path} already exists\")\n", " return None\n", " except Exception as e:\n", " print(f\"File {save_path} is corrupted and will be overwritten\")\n", " \n", " tables = []\n", " with pdfplumber.open(file_path) as pdf:\n", " for page in pdf.pages:\n", " table = page.extract_table()\n", " if table:\n", " tables.append(table)\n", " try:\n", " assert len(tables) > 0, f\"No tables found in {file_path}\"\n", " except AssertionError:\n", " print(f\"No tables found in {file_path}\")\n", " return None\n", "\n", " df_list = []\n", " for table in tables:\n", " try:\n", " df, _ = process_table(table, 0)\n", " if df is not None:\n", " df_list.append(df)\n", " except Exception as e:\n", " print(f\"Ignoring a table for {file_name}\")\n", " # print(table)\n", " print(\"Error message:\", e)\n", " \n", " if len(df_list) == 0:\n", " print(f\"No valid tables found in {file_name}\")\n", " return None\n", " \n", " df = pd.concat(df_list, ignore_index=True)\n", " df['Date'] = date\n", " df.to_csv(f\"AQI_data_csv/{save_path}\", index=False)\n", "\n", "from joblib import Parallel, delayed\n", "\n", "# dfs = {}\n", "# for file_path in tqdm(files[1000:]):\n", " # print(file_path)\n", " # df = process_file(file_path)\n", " # dfs[file_path] s= df\n", "# print(dates[15:16])\n", "_ = Parallel(48)(delayed(process_file)(file_path) for file_path in tqdm(dates))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Postprocessing" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "372342" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merged_df = pd.read_csv(\"AQI_data_csv/merged.csv\")\n", "len(merged_df)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Agartala', 'Agra', 'Ahmedabad', 'Ahmednagar', 'Aizawl', 'Ajmer', 'Akola', 'Alwar', 'Ambala', 'Amravati', 'Amritsar', 'Anantapur', 'Angul', 'Ankleshwar', 'Araria', 'Ariyalur', 'Arrah', 'Asansol', 'Aurangabad (Bihar)', 'Aurangabad(Maharashtra)', 'Baddi', 'Badlapur', 'Bagalkot', 'Baghpat', 'Bahadurgarh', 'Balasore', 'Ballabgarh', 'Banswara', 'Baran', 'Barbil', 'Bareilly', 'Baripada', 'Barmer', 'Barrackpore', 'Bathinda', 'Begusarai', 'Belapur', 'Belgaum', 'Bengaluru', 'Bettiah', 'Bhagalpur', 'Bharatpur', 'Bhilai', 'Bhilwara', 'Bhiwadi', 'Bhiwandi', 'Bhiwani', 'Bhopal', 'Bhubaneswar', 'Bidar', 'Bihar Sharif', 'Bikaner', 'Bilaspur', 'Bileipada', 'Boisar', 'Brajrajnagar', 'Bulandshahr', 'Bundi', 'Buxar', 'Byasanagar', 'Byrnihat', 'Chamarajanagar', 'Chandigarh', 'Chandrapur', 'Charkhi Dadri', 'Chengalpattu', 'Chennai', 'Chhal', 'Chhapra', 'Chikkaballapur', 'Chikkamagaluru', 'Chittoor', 'Chittorgarh', 'Churu', 'Coimbtore', 'Cuddalore', 'Cuttack', 'Damoh', 'Darbhanga', 'Dausa', 'Davanagere', 'Dehradun', 'Delhi', 'Dewas', 'Dhanbad', 'Dharuhera', 'Dharwad', 'Dholpur', 'Dhule', 'Dindigul', 'Dungarpur', 'Durgapur', 'Eloor', 'Ernakulam', 'Faridabad', 'Fatehabad', 'Firozabad', 'Gadag', 'Gandhinagar', 'Gangtok', 'Gaya', 'Ghaziabad', 'Gorakhpur', 'Greater_Noida', 'Gummidipoondi', 'Gurugram', 'Guwahati', 'Gwalior', 'Hajipur', 'Haldia', 'Hanumangarh', 'Hapur', 'Hassan', 'Haveri', 'Hisar', 'Hosur', 'Howrah', 'Hubballi', 'Hyderabad', 'Imphal', 'Indore', 'Jabalpur', 'Jaipur', 'Jaisalmer', 'Jalandhar', 'Jalgaon', 'Jalna', 'Jalore', 'Jhalawar', 'Jhansi', 'Jharsuguda', 'Jhunjhunu', 'Jind', 'Jodhpur', 'Jorapokhar', 'Kadapa', 'Kaithal', 'Kalaburgi', 'Kalyan', 'Kanchipuram', 'Kannur', 'Kanpur', 'Karauli', 'Karnal', 'Karur', 'Karwar', 'Kashipur', 'Katihar', 'Katni', 'Keonjhar', 'Khanna', 'Khurja', 'Kishanganj', 'Kochi', 'Kohima', 'Kolar', 'Kolhapur', 'Kolkata', 'Kollam', 'Koppal', 'Korba', 'Kota', 'Kozhikode', 'Kunjemura', 'Kurushketra', 'Latur', 'Loni_Ghaziabad', 'Lucknow', 'Ludhiana', 'Madurai', 'Mahad', 'Maihar', 'Malegaon', 'Mandi Gobindgarh', 'Mandideep', 'Mandikhera', 'Manesar', 'Mangalore', 'Manguraha', 'Medikeri', 'Meerut', 'Milupara', 'Mira-Bhayandar', 'Moradabad', 'Motihari', 'Mumbai', 'Munger', 'Muzaffarnagar', 'Muzaffarpur', 'Mysuru', 'NOIDA', 'Nagaon', 'Nagapattinam', 'Nagaur', 'Nagpur', 'Naharlagun', 'Nalbari', 'Nanded', 'Nandesari', 'Narnaul', 'Nashik', 'Navi Mumbai', 'Nayagarh', 'Noida', 'Ooty', 'Pali', 'Palkalaiperur', 'Palwal', 'Panchkula', 'Panipat', 'Parbhani', 'Pathardih', 'Patiala', 'Patna', 'Pimpri-Chinchwad', 'Pithampur', 'Pratapgarh', 'Prayagraj', 'Puducherry', 'Pudukottai', 'Pune', 'Purnia', 'Raichur', 'Raipur', 'Rairangpur', 'Rajamahendravaram', 'Rajgir', 'Rajsamand', 'Ramanagara', 'Ramanathapuram', 'Ranipet', 'Ratlam', 'Rishikesh', 'Rohtak', 'Rourkela', 'Rupnagar', 'Sagar', 'Saharsa', 'Salem', 'Samastipur', 'Sangli', 'Sasaram', 'Satna', 'Sawai Madhopur', 'Shillong', 'Shivamogga', 'Sikar', 'Silchar', 'Siliguri', 'Singrauli', 'Sirohi', 'Sirsa', 'Sivasagar', 'Siwan', 'Solapur', 'Sonipat', 'Sri Ganganagar', 'Srinagar', 'Suakati', 'Surat', 'Talcher', 'Tensa', 'Thane', 'Thanjavur', 'Thiruvananthapuram', 'Thoothukudi', 'Thrissur', 'Tiruchirappalli', 'Tirunelveli', 'Tirupati', 'Tirupur', 'Tonk', 'Tumidih', 'Udaipur', 'Udupi', 'Ujjain', 'Ulhasnagar', 'Vapi', 'Varanasi', 'Vatva', 'Vellore', 'Vijayapura', 'Vijayawada', 'Virar', 'Virudhunagar', 'Visakhapatnam', 'Vrindavan', 'Yadgir', 'Yamunanagar']\n" ] } ], "source": [ "city_mapping = {\n", " \"Amaravati\": \"Amravati\",\n", " \"Asanol\": \"Asansol\",\n", " \"Greater Noida\": \"Greater_Noida\",\n", " \"GandhiNagar\": \"Gandhinagar\",\n", " \"Gurgaon\": \"Gurugram\",\n", " \"Coimbatore\": \"Coimbtore\",\n", " \"Kalaburagi\": \"Kalaburgi\",\n", " \"Kurukshetra\": \"Kurushketra\",\n", " \"Loni_Dehat\": \"Loni_Ghaziabad\",\n", " \"Madikeri\": \"Medikeri\",\n", " \"Manglore\": \"Mangalore\",\n", " \"Pimpri Chinchwad\": \"Pimpri-Chinchwad\",\n", " \"Tumakuru\": \"Tumidih\",\n", " \"Tiruppur\": \"Tirupur\",\n", " \"Yamuna Nagar\": \"Yamunanagar\",\n", " \"vellore\": \"Vellore\" # duplicate, can map to itself or be handled separately\n", "}\n", "def replace_it(x):\n", " x = x.strip().replace(\"\\n\",\"\")\n", " if x in city_mapping:\n", " return city_mapping[x]\n", " else:\n", " return x\n", "\n", "merged_df['City'] = merged_df['City'].apply(lambda x: replace_it(x))\n", "merged_df = merged_df[merged_df.City != \"Aurangabad\"]\n", "print(merged_df['City'].value_counts().sort_index().index.tolist())" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "State\n", "Andhra Pradesh 10881\n", "Arunachal Pradesh 557\n", "Assam 4582\n", "Bihar 26391\n", "Chandigarh 1874\n", "Chhattisgarh 4674\n", "Delhi 3224\n", "Gujarat 11558\n", "Haryana 49090\n", "Himachal Pradesh 916\n", "Jammu and Kashmir 822\n", "Jharkhand 1872\n", "Karnataka 33054\n", "Kerala 11121\n", "Madhya Pradesh 29800\n", "Maharashtra 35954\n", "Manipur 724\n", "Meghalaya 1839\n", "Mizoram 1433\n", "Nagaland 1305\n", "Odisha 10816\n", "Puducherry 1330\n", "Punjab 18876\n", "Rajasthan 34231\n", "Sikkim 772\n", "Tamil Nadu 12170\n", "Telangana 3216\n", "Tripura 1349\n", "Uttar Pradesh 39710\n", "Uttarakhand 1866\n", "West Bengal 14674\n", "Name: count, dtype: int64" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "city_to_state = {\n", " 'Agartala': 'Tripura', 'Agra': 'Uttar Pradesh', 'Ahmedabad': 'Gujarat', 'Ahmednagar': 'Maharashtra',\n", " 'Aizawl': 'Mizoram', 'Ajmer': 'Rajasthan', 'Akola': 'Maharashtra', 'Alwar': 'Rajasthan', \n", " 'Ambala': 'Haryana', 'Amravati': 'Maharashtra', 'Amritsar': 'Punjab', 'Anantapur': 'Andhra Pradesh', \n", " 'Angul': 'Odisha', 'Ankleshwar': 'Gujarat', 'Araria': 'Bihar', 'Ariyalur': 'Tamil Nadu', \n", " 'Arrah': 'Bihar', 'Asansol': 'West Bengal', 'Aurangabad (Bihar)': 'Bihar', \n", " 'Aurangabad(Maharashtra)': 'Maharashtra', 'Baddi': 'Himachal Pradesh', 'Badlapur': 'Maharashtra', \n", " 'Bagalkot': 'Karnataka', 'Baghpat': 'Uttar Pradesh', 'Bahadurgarh': 'Haryana', 'Balasore': 'Odisha', \n", " 'Ballabgarh': 'Haryana', 'Banswara': 'Rajasthan', 'Baran': 'Rajasthan', 'Barbil': 'Odisha', \n", " 'Bareilly': 'Uttar Pradesh', 'Baripada': 'Odisha', 'Barmer': 'Rajasthan', 'Barrackpore': 'West Bengal', \n", " 'Bathinda': 'Punjab', 'Begusarai': 'Bihar', 'Belapur': 'Maharashtra', 'Belgaum': 'Karnataka', \n", " 'Bengaluru': 'Karnataka', 'Bettiah': 'Bihar', 'Bhagalpur': 'Bihar', 'Bharatpur': 'Rajasthan', \n", " 'Bhilai': 'Chhattisgarh', 'Bhilwara': 'Rajasthan', 'Bhiwadi': 'Rajasthan', 'Bhiwandi': 'Maharashtra', \n", " 'Bhiwani': 'Haryana', 'Bhopal': 'Madhya Pradesh', 'Bhubaneswar': 'Odisha', 'Bidar': 'Karnataka', \n", " 'Bihar Sharif': 'Bihar', 'Bikaner': 'Rajasthan', 'Bilaspur': 'Chhattisgarh', 'Bileipada': 'Odisha', \n", " 'Boisar': 'Maharashtra', 'Brajrajnagar': 'Odisha', 'Bulandshahr': 'Uttar Pradesh', 'Bundi': 'Rajasthan', \n", " 'Buxar': 'Bihar', 'Byasanagar': 'Odisha', 'Byrnihat': 'Meghalaya', 'Chamarajanagar': 'Karnataka', \n", " 'Chandigarh': 'Chandigarh', 'Chandrapur': 'Maharashtra', 'Charkhi Dadri': 'Haryana', \n", " 'Chengalpattu': 'Tamil Nadu', 'Chennai': 'Tamil Nadu', 'Chhal': 'Chhattisgarh', 'Chhapra': 'Bihar', \n", " 'Chikkaballapur': 'Karnataka', 'Chikkamagaluru': 'Karnataka', 'Chittoor': 'Andhra Pradesh', \n", " 'Chittorgarh': 'Rajasthan', 'Churu': 'Rajasthan', 'Coimbtore': 'Tamil Nadu', 'Cuddalore': 'Tamil Nadu', \n", " 'Cuttack': 'Odisha', 'Damoh': 'Madhya Pradesh', 'Darbhanga': 'Bihar', 'Dausa': 'Rajasthan', \n", " 'Davanagere': 'Karnataka', 'Dehradun': 'Uttarakhand', 'Delhi': 'Delhi', 'Dewas': 'Madhya Pradesh', \n", " 'Dhanbad': 'Jharkhand', 'Dharuhera': 'Haryana', 'Dharwad': 'Karnataka', 'Dholpur': 'Rajasthan', \n", " 'Dhule': 'Maharashtra', 'Dindigul': 'Tamil Nadu', 'Dungarpur': 'Rajasthan', 'Durgapur': 'West Bengal', \n", " 'Eloor': 'Kerala', 'Ernakulam': 'Kerala', 'Faridabad': 'Haryana', 'Fatehabad': 'Haryana', \n", " 'Firozabad': 'Uttar Pradesh', 'Gadag': 'Karnataka', 'Gandhinagar': 'Gujarat', 'Gangtok': 'Sikkim', \n", " 'Gaya': 'Bihar', 'Ghaziabad': 'Uttar Pradesh', 'Gorakhpur': 'Uttar Pradesh', 'Greater_Noida': 'Uttar Pradesh', \n", " 'Gummidipoondi': 'Tamil Nadu', 'Gurugram': 'Haryana', 'Guwahati': 'Assam', 'Gwalior': 'Madhya Pradesh', \n", " 'Hajipur': 'Bihar', 'Haldia': 'West Bengal', 'Hanumangarh': 'Rajasthan', 'Hapur': 'Uttar Pradesh', \n", " 'Hassan': 'Karnataka', 'Haveri': 'Karnataka', 'Hisar': 'Haryana', 'Hosur': 'Tamil Nadu', 'Howrah': 'West Bengal', \n", " 'Hubballi': 'Karnataka', 'Hyderabad': 'Telangana', 'Imphal': 'Manipur', 'Indore': 'Madhya Pradesh', \n", " 'Jabalpur': 'Madhya Pradesh', 'Jaipur': 'Rajasthan', 'Jaisalmer': 'Rajasthan', 'Jalandhar': 'Punjab', \n", " 'Jalgaon': 'Maharashtra', 'Jalna': 'Maharashtra', 'Jalore': 'Rajasthan', 'Jhalawar': 'Rajasthan', \n", " 'Jhansi': 'Uttar Pradesh', 'Jharsuguda': 'Odisha', 'Jhunjhunu': 'Rajasthan', 'Jind': 'Haryana', \n", " 'Jodhpur': 'Rajasthan', 'Jorapokhar': 'Jharkhand', 'Kadapa': 'Andhra Pradesh', 'Kaithal': 'Haryana', \n", " 'Kalaburgi': 'Karnataka', 'Kalyan': 'Maharashtra', 'Kanchipuram': 'Tamil Nadu', 'Kannur': 'Kerala', \n", " 'Kanpur': 'Uttar Pradesh', 'Karauli': 'Rajasthan', 'Karnal': 'Haryana', 'Karur': 'Tamil Nadu', \n", " 'Karwar': 'Karnataka', 'Kashipur': 'Uttarakhand', 'Katihar': 'Bihar', 'Katni': 'Madhya Pradesh', \n", " 'Keonjhar': 'Odisha', 'Khanna': 'Punjab', 'Khurja': 'Uttar Pradesh', 'Kishanganj': 'Bihar', \n", " 'Kochi': 'Kerala', 'Kohima': 'Nagaland', 'Kolar': 'Karnataka', 'Kolhapur': 'Maharashtra', \n", " 'Kolkata': 'West Bengal', 'Kollam': 'Kerala', 'Koppal': 'Karnataka', 'Korba': 'Chhattisgarh', \n", " 'Kota': 'Rajasthan', 'Kozhikode': 'Kerala', 'Kunjemura': 'Jharkhand', 'Kurushketra': 'Haryana', \n", " 'Latur': 'Maharashtra', 'Loni_Ghaziabad': 'Uttar Pradesh', 'Lucknow': 'Uttar Pradesh', 'Ludhiana': 'Punjab', \n", " 'Madurai': 'Tamil Nadu', 'Mahad': 'Maharashtra', 'Maihar': 'Madhya Pradesh', 'Malegaon': 'Maharashtra', \n", " 'Mandi Gobindgarh': 'Punjab', 'Mandideep': 'Madhya Pradesh', 'Mandikhera': 'Haryana', 'Manesar': 'Haryana', \n", " 'Mangalore': 'Karnataka', 'Manguraha': 'Bihar', 'Medikeri': 'Karnataka', 'Meerut': 'Uttar Pradesh', \n", " 'Milupara': 'Chhattisgarh', 'Mira-Bhayandar': 'Maharashtra', 'Moradabad': 'Uttar Pradesh', \n", " 'Motihari': 'Bihar', 'Mumbai': 'Maharashtra', 'Munger': 'Bihar', 'Muzaffarnagar': 'Uttar Pradesh', \n", " 'Muzaffarpur': 'Bihar', 'Mysuru': 'Karnataka', 'NOIDA': 'Uttar Pradesh', 'Nagaon': 'Assam', \n", " 'Nagapattinam': 'Tamil Nadu', 'Nagaur': 'Rajasthan', 'Nagpur': 'Maharashtra', 'Naharlagun': 'Arunachal Pradesh', \n", " 'Nalbari': 'Assam', 'Nanded': 'Maharashtra', 'Nandesari': 'Gujarat', 'Narnaul': 'Haryana', 'Nashik': 'Maharashtra',\n", " 'Navi Mumbai': 'Maharashtra',\n", " 'Nayagarh': 'Odisha',\n", " 'Noida': 'Uttar Pradesh',\n", " 'Ooty': 'Tamil Nadu',\n", " 'Pali': 'Rajasthan',\n", " 'Palkalaiperur': 'Tamil Nadu',\n", " 'Palwal': 'Haryana',\n", " 'Panchkula': 'Haryana',\n", " 'Panipat': 'Haryana',\n", " 'Parbhani': 'Maharashtra',\n", " 'Pathardih': 'Jharkhand',\n", " 'Patiala': 'Punjab',\n", " 'Patna': 'Bihar',\n", " 'Pimpri-Chinchwad': 'Maharashtra',\n", " 'Pithampur': 'Madhya Pradesh',\n", " 'Pratapgarh': 'Rajasthan',\n", " 'Prayagraj': 'Uttar Pradesh',\n", " 'Puducherry': 'Puducherry',\n", " 'Pudukottai': 'Tamil Nadu',\n", " 'Pune': 'Maharashtra',\n", " 'Purnia': 'Bihar',\n", " 'Raichur': 'Karnataka',\n", " 'Raipur': 'Chhattisgarh',\n", " 'Rairangpur': 'Odisha',\n", " 'Rajamahendravaram': 'Andhra Pradesh',\n", " 'Rajgir': 'Bihar',\n", " 'Rajsamand': 'Rajasthan',\n", " 'Ramanagara': 'Karnataka',\n", " 'Ramanathapuram': 'Tamil Nadu',\n", " 'Ranipet': 'Tamil Nadu',\n", " 'Ratlam': 'Madhya Pradesh',\n", " 'Rishikesh': 'Uttarakhand',\n", " 'Rohtak': 'Haryana',\n", " 'Rourkela': 'Odisha',\n", " 'Rupnagar': 'Punjab',\n", " 'Sagar': 'Madhya Pradesh',\n", " 'Saharsa': 'Bihar',\n", " 'Salem': 'Tamil Nadu',\n", " 'Samastipur': 'Bihar',\n", " 'Sangli': 'Maharashtra',\n", " 'Sasaram': 'Bihar',\n", " 'Satna': 'Madhya Pradesh',\n", " 'Sawai Madhopur': 'Rajasthan',\n", " 'Shillong': 'Meghalaya',\n", " 'Shivamogga': 'Karnataka',\n", " 'Sikar': 'Rajasthan',\n", " 'Silchar': 'Assam',\n", " 'Siliguri': 'West Bengal',\n", " 'Singrauli': 'Madhya Pradesh',\n", " 'Sirohi': 'Rajasthan',\n", " 'Sirsa': 'Haryana',\n", " 'Sivasagar': 'Assam',\n", " 'Siwan': 'Bihar',\n", " 'Solapur': 'Maharashtra',\n", " 'Sonipat': 'Haryana',\n", " 'Sri Ganganagar': 'Rajasthan',\n", " 'Srinagar': 'Jammu and Kashmir',\n", " 'Suakati': 'Odisha',\n", " 'Surat': 'Gujarat',\n", " 'Talcher': 'Odisha',\n", " 'Tensa': 'Odisha',\n", " 'Thane': 'Maharashtra',\n", " 'Thanjavur': 'Tamil Nadu',\n", " 'Thiruvananthapuram': 'Kerala',\n", " 'Thoothukudi': 'Tamil Nadu',\n", " 'Thrissur': 'Kerala',\n", " 'Tiruchirappalli': 'Tamil Nadu',\n", " 'Tirunelveli': 'Tamil Nadu',\n", " 'Tirupati': 'Andhra Pradesh',\n", " 'Tirupur': 'Tamil Nadu',\n", " 'Tonk': 'Rajasthan',\n", " 'Tumidih': 'Chhattisgarh',\n", " 'Udaipur': 'Rajasthan',\n", " 'Udupi': 'Karnataka',\n", " 'Ujjain': 'Madhya Pradesh',\n", " 'Ulhasnagar': 'Maharashtra',\n", " 'Vapi': 'Gujarat',\n", " 'Varanasi': 'Uttar Pradesh', \n", " 'Vatva': 'Gujarat', 'Vellore': 'Tamil Nadu',\n", " 'Vijayapura': 'Karnataka',\n", " 'Vijayawada': 'Andhra Pradesh',\n", " 'Virar': 'Maharashtra',\n", " 'Virudhunagar': 'Tamil Nadu',\n", " 'Visakhapatnam': 'Andhra Pradesh',\n", " 'Vrindavan': 'Uttar Pradesh',\n", " 'Yadgir': 'Karnataka',\n", " 'Yamunanagar': 'Haryana'\n", "}\n", "merged_df['State'] = merged_df['City'].apply(lambda x: city_to_state[x])\n", "merged_df['State'].value_counts().sort_index()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | City | \n", "AQI | \n", "Pollutant | \n", "Air Quality | \n", "Based on number of monitoring stations | \n", "Date | \n", "State | \n", "
---|---|---|---|---|---|---|---|
0 | \n", "Agra | \n", "417 | \n", "PM\\n2.5 | \n", "Severe | \n", "1 | \n", "2016-01-01 | \n", "Uttar Pradesh | \n", "
1 | \n", "Bengaluru | \n", "95 | \n", "PM , PM\\n2.5 10 | \n", "Satisfactory | \n", "5 | \n", "2016-01-01 | \n", "Karnataka | \n", "