{ "cells": [ { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "import os\n", "from os.path import join, dirname, exists\n", "import concurrent\n", "import pandas as pd\n", "from tqdm.notebook import tqdm\n", "import pdfplumber\n", "from datetime import datetime\n", "import requests\n", "import shutil\n", "import sys" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def check_exists(date):\n", " str_date = date.strftime('%Y%m%d')\n", " file_name = f\"AQI_Bulletin_{str_date}.pdf\"\n", " file_path = f\"AQI_data/{file_name}\"\n", " return exists(file_path), file_path, file_name\n", "\n", "def download(date):\n", " file_exists, file_path, file_name = check_exists(date)\n", " if file_exists:\n", " return file_path\n", " \n", " url = f\"https://cpcb.nic.in//upload/Downloads/{file_name}\"\n", " response = requests.get(url)\n", " if response.status_code == 200:\n", " with open(file_path, 'wb') as f:\n", " f.write(response.content)\n", " return file_path\n", " else:\n", " print(f\"Failed to download {url} with status code {response.status_code}\")\n", " return None" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',\n", " '2016-01-05', '2016-01-06', '2016-01-07', '2016-01-08',\n", " '2016-01-09', '2016-01-10',\n", " ...\n", " '2024-11-02', '2024-11-03', '2024-11-04', '2024-11-05',\n", " '2024-11-06', '2024-11-07', '2024-11-08', '2024-11-09',\n", " '2024-11-10', '2024-11-11'],\n", " dtype='datetime64[ns]', length=3238, freq='D')\n" ] }, { "data": { "text/plain": [ "(None, 3238)" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dates = pd.date_range('2016-01-01', datetime.today() - pd.Timedelta(days=1), freq='D')\n", "# dates = pd.date_range('2024-01-01', '2024-02-01', freq='D')\n", "print(dates), len(dates)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Failed to download https://cpcb.nic.in//upload/Downloads/AQI_Bulletin_20160606.pdf with status code 404\n", "Failed to download https://cpcb.nic.in//upload/Downloads/AQI_Bulletin_20171014.pdf with status code 404\n", "Failed to download https://cpcb.nic.in//upload/Downloads/AQI_Bulletin_20170618.pdf with status code 404\n" ] } ], "source": [ "with concurrent.futures.ThreadPoolExecutor(48) as executor:\n", " files = list(executor.map(download, dates))" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3238\n" ] }, { "data": { "text/plain": [ "3235" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(len(files))\n", "files = list(filter(None, files))\n", "len(files)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6b1ea587cbcb48f6838b51ebc5b2dfc3", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/3238 [00:00 0, f\"No tables found in {file_path}\"\n", " except AssertionError:\n", " print(f\"No tables found in {file_path}\")\n", " return None\n", "\n", " df_list = []\n", " for table in tables:\n", " try:\n", " df, _ = process_table(table, 0)\n", " if df is not None:\n", " df_list.append(df)\n", " except Exception as e:\n", " print(f\"Ignoring a table for {file_name}\")\n", " # print(table)\n", " print(\"Error message:\", e)\n", " \n", " if len(df_list) == 0:\n", " print(f\"No valid tables found in {file_name}\")\n", " return None\n", " \n", " df = pd.concat(df_list, ignore_index=True)\n", " df['Date'] = date\n", " df.to_csv(f\"AQI_data_csv/{save_path}\", index=False)\n", "\n", "from joblib import Parallel, delayed\n", "\n", "# dfs = {}\n", "# for file_path in tqdm(files[1000:]):\n", " # print(file_path)\n", " # df = process_file(file_path)\n", " # dfs[file_path] s= df\n", "# print(dates[15:16])\n", "_ = Parallel(48)(delayed(process_file)(file_path) for file_path in tqdm(dates))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Postprocessing" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "372342" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merged_df = pd.read_csv(\"AQI_data_csv/merged.csv\")\n", "len(merged_df)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Agartala', 'Agra', 'Ahmedabad', 'Ahmednagar', 'Aizawl', 'Ajmer', 'Akola', 'Alwar', 'Ambala', 'Amravati', 'Amritsar', 'Anantapur', 'Angul', 'Ankleshwar', 'Araria', 'Ariyalur', 'Arrah', 'Asansol', 'Aurangabad (Bihar)', 'Aurangabad(Maharashtra)', 'Baddi', 'Badlapur', 'Bagalkot', 'Baghpat', 'Bahadurgarh', 'Balasore', 'Ballabgarh', 'Banswara', 'Baran', 'Barbil', 'Bareilly', 'Baripada', 'Barmer', 'Barrackpore', 'Bathinda', 'Begusarai', 'Belapur', 'Belgaum', 'Bengaluru', 'Bettiah', 'Bhagalpur', 'Bharatpur', 'Bhilai', 'Bhilwara', 'Bhiwadi', 'Bhiwandi', 'Bhiwani', 'Bhopal', 'Bhubaneswar', 'Bidar', 'Bihar Sharif', 'Bikaner', 'Bilaspur', 'Bileipada', 'Boisar', 'Brajrajnagar', 'Bulandshahr', 'Bundi', 'Buxar', 'Byasanagar', 'Byrnihat', 'Chamarajanagar', 'Chandigarh', 'Chandrapur', 'Charkhi Dadri', 'Chengalpattu', 'Chennai', 'Chhal', 'Chhapra', 'Chikkaballapur', 'Chikkamagaluru', 'Chittoor', 'Chittorgarh', 'Churu', 'Coimbtore', 'Cuddalore', 'Cuttack', 'Damoh', 'Darbhanga', 'Dausa', 'Davanagere', 'Dehradun', 'Delhi', 'Dewas', 'Dhanbad', 'Dharuhera', 'Dharwad', 'Dholpur', 'Dhule', 'Dindigul', 'Dungarpur', 'Durgapur', 'Eloor', 'Ernakulam', 'Faridabad', 'Fatehabad', 'Firozabad', 'Gadag', 'Gandhinagar', 'Gangtok', 'Gaya', 'Ghaziabad', 'Gorakhpur', 'Greater_Noida', 'Gummidipoondi', 'Gurugram', 'Guwahati', 'Gwalior', 'Hajipur', 'Haldia', 'Hanumangarh', 'Hapur', 'Hassan', 'Haveri', 'Hisar', 'Hosur', 'Howrah', 'Hubballi', 'Hyderabad', 'Imphal', 'Indore', 'Jabalpur', 'Jaipur', 'Jaisalmer', 'Jalandhar', 'Jalgaon', 'Jalna', 'Jalore', 'Jhalawar', 'Jhansi', 'Jharsuguda', 'Jhunjhunu', 'Jind', 'Jodhpur', 'Jorapokhar', 'Kadapa', 'Kaithal', 'Kalaburgi', 'Kalyan', 'Kanchipuram', 'Kannur', 'Kanpur', 'Karauli', 'Karnal', 'Karur', 'Karwar', 'Kashipur', 'Katihar', 'Katni', 'Keonjhar', 'Khanna', 'Khurja', 'Kishanganj', 'Kochi', 'Kohima', 'Kolar', 'Kolhapur', 'Kolkata', 'Kollam', 'Koppal', 'Korba', 'Kota', 'Kozhikode', 'Kunjemura', 'Kurushketra', 'Latur', 'Loni_Ghaziabad', 'Lucknow', 'Ludhiana', 'Madurai', 'Mahad', 'Maihar', 'Malegaon', 'Mandi Gobindgarh', 'Mandideep', 'Mandikhera', 'Manesar', 'Mangalore', 'Manguraha', 'Medikeri', 'Meerut', 'Milupara', 'Mira-Bhayandar', 'Moradabad', 'Motihari', 'Mumbai', 'Munger', 'Muzaffarnagar', 'Muzaffarpur', 'Mysuru', 'NOIDA', 'Nagaon', 'Nagapattinam', 'Nagaur', 'Nagpur', 'Naharlagun', 'Nalbari', 'Nanded', 'Nandesari', 'Narnaul', 'Nashik', 'Navi Mumbai', 'Nayagarh', 'Noida', 'Ooty', 'Pali', 'Palkalaiperur', 'Palwal', 'Panchkula', 'Panipat', 'Parbhani', 'Pathardih', 'Patiala', 'Patna', 'Pimpri-Chinchwad', 'Pithampur', 'Pratapgarh', 'Prayagraj', 'Puducherry', 'Pudukottai', 'Pune', 'Purnia', 'Raichur', 'Raipur', 'Rairangpur', 'Rajamahendravaram', 'Rajgir', 'Rajsamand', 'Ramanagara', 'Ramanathapuram', 'Ranipet', 'Ratlam', 'Rishikesh', 'Rohtak', 'Rourkela', 'Rupnagar', 'Sagar', 'Saharsa', 'Salem', 'Samastipur', 'Sangli', 'Sasaram', 'Satna', 'Sawai Madhopur', 'Shillong', 'Shivamogga', 'Sikar', 'Silchar', 'Siliguri', 'Singrauli', 'Sirohi', 'Sirsa', 'Sivasagar', 'Siwan', 'Solapur', 'Sonipat', 'Sri Ganganagar', 'Srinagar', 'Suakati', 'Surat', 'Talcher', 'Tensa', 'Thane', 'Thanjavur', 'Thiruvananthapuram', 'Thoothukudi', 'Thrissur', 'Tiruchirappalli', 'Tirunelveli', 'Tirupati', 'Tirupur', 'Tonk', 'Tumidih', 'Udaipur', 'Udupi', 'Ujjain', 'Ulhasnagar', 'Vapi', 'Varanasi', 'Vatva', 'Vellore', 'Vijayapura', 'Vijayawada', 'Virar', 'Virudhunagar', 'Visakhapatnam', 'Vrindavan', 'Yadgir', 'Yamunanagar']\n" ] } ], "source": [ "city_mapping = {\n", " \"Amaravati\": \"Amravati\",\n", " \"Asanol\": \"Asansol\",\n", " \"Greater Noida\": \"Greater_Noida\",\n", " \"GandhiNagar\": \"Gandhinagar\",\n", " \"Gurgaon\": \"Gurugram\",\n", " \"Coimbatore\": \"Coimbtore\",\n", " \"Kalaburagi\": \"Kalaburgi\",\n", " \"Kurukshetra\": \"Kurushketra\",\n", " \"Loni_Dehat\": \"Loni_Ghaziabad\",\n", " \"Madikeri\": \"Medikeri\",\n", " \"Manglore\": \"Mangalore\",\n", " \"Pimpri Chinchwad\": \"Pimpri-Chinchwad\",\n", " \"Tumakuru\": \"Tumidih\",\n", " \"Tiruppur\": \"Tirupur\",\n", " \"Yamuna Nagar\": \"Yamunanagar\",\n", " \"vellore\": \"Vellore\" # duplicate, can map to itself or be handled separately\n", "}\n", "def replace_it(x):\n", " x = x.strip().replace(\"\\n\",\"\")\n", " if x in city_mapping:\n", " return city_mapping[x]\n", " else:\n", " return x\n", "\n", "merged_df['City'] = merged_df['City'].apply(lambda x: replace_it(x))\n", "merged_df = merged_df[merged_df.City != \"Aurangabad\"]\n", "print(merged_df['City'].value_counts().sort_index().index.tolist())" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "State\n", "Andhra Pradesh 10881\n", "Arunachal Pradesh 557\n", "Assam 4582\n", "Bihar 26391\n", "Chandigarh 1874\n", "Chhattisgarh 4674\n", "Delhi 3224\n", "Gujarat 11558\n", "Haryana 49090\n", "Himachal Pradesh 916\n", "Jammu and Kashmir 822\n", "Jharkhand 1872\n", "Karnataka 33054\n", "Kerala 11121\n", "Madhya Pradesh 29800\n", "Maharashtra 35954\n", "Manipur 724\n", "Meghalaya 1839\n", "Mizoram 1433\n", "Nagaland 1305\n", "Odisha 10816\n", "Puducherry 1330\n", "Punjab 18876\n", "Rajasthan 34231\n", "Sikkim 772\n", "Tamil Nadu 12170\n", "Telangana 3216\n", "Tripura 1349\n", "Uttar Pradesh 39710\n", "Uttarakhand 1866\n", "West Bengal 14674\n", "Name: count, dtype: int64" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "city_to_state = {\n", " 'Agartala': 'Tripura', 'Agra': 'Uttar Pradesh', 'Ahmedabad': 'Gujarat', 'Ahmednagar': 'Maharashtra',\n", " 'Aizawl': 'Mizoram', 'Ajmer': 'Rajasthan', 'Akola': 'Maharashtra', 'Alwar': 'Rajasthan', \n", " 'Ambala': 'Haryana', 'Amravati': 'Maharashtra', 'Amritsar': 'Punjab', 'Anantapur': 'Andhra Pradesh', \n", " 'Angul': 'Odisha', 'Ankleshwar': 'Gujarat', 'Araria': 'Bihar', 'Ariyalur': 'Tamil Nadu', \n", " 'Arrah': 'Bihar', 'Asansol': 'West Bengal', 'Aurangabad (Bihar)': 'Bihar', \n", " 'Aurangabad(Maharashtra)': 'Maharashtra', 'Baddi': 'Himachal Pradesh', 'Badlapur': 'Maharashtra', \n", " 'Bagalkot': 'Karnataka', 'Baghpat': 'Uttar Pradesh', 'Bahadurgarh': 'Haryana', 'Balasore': 'Odisha', \n", " 'Ballabgarh': 'Haryana', 'Banswara': 'Rajasthan', 'Baran': 'Rajasthan', 'Barbil': 'Odisha', \n", " 'Bareilly': 'Uttar Pradesh', 'Baripada': 'Odisha', 'Barmer': 'Rajasthan', 'Barrackpore': 'West Bengal', \n", " 'Bathinda': 'Punjab', 'Begusarai': 'Bihar', 'Belapur': 'Maharashtra', 'Belgaum': 'Karnataka', \n", " 'Bengaluru': 'Karnataka', 'Bettiah': 'Bihar', 'Bhagalpur': 'Bihar', 'Bharatpur': 'Rajasthan', \n", " 'Bhilai': 'Chhattisgarh', 'Bhilwara': 'Rajasthan', 'Bhiwadi': 'Rajasthan', 'Bhiwandi': 'Maharashtra', \n", " 'Bhiwani': 'Haryana', 'Bhopal': 'Madhya Pradesh', 'Bhubaneswar': 'Odisha', 'Bidar': 'Karnataka', \n", " 'Bihar Sharif': 'Bihar', 'Bikaner': 'Rajasthan', 'Bilaspur': 'Chhattisgarh', 'Bileipada': 'Odisha', \n", " 'Boisar': 'Maharashtra', 'Brajrajnagar': 'Odisha', 'Bulandshahr': 'Uttar Pradesh', 'Bundi': 'Rajasthan', \n", " 'Buxar': 'Bihar', 'Byasanagar': 'Odisha', 'Byrnihat': 'Meghalaya', 'Chamarajanagar': 'Karnataka', \n", " 'Chandigarh': 'Chandigarh', 'Chandrapur': 'Maharashtra', 'Charkhi Dadri': 'Haryana', \n", " 'Chengalpattu': 'Tamil Nadu', 'Chennai': 'Tamil Nadu', 'Chhal': 'Chhattisgarh', 'Chhapra': 'Bihar', \n", " 'Chikkaballapur': 'Karnataka', 'Chikkamagaluru': 'Karnataka', 'Chittoor': 'Andhra Pradesh', \n", " 'Chittorgarh': 'Rajasthan', 'Churu': 'Rajasthan', 'Coimbtore': 'Tamil Nadu', 'Cuddalore': 'Tamil Nadu', \n", " 'Cuttack': 'Odisha', 'Damoh': 'Madhya Pradesh', 'Darbhanga': 'Bihar', 'Dausa': 'Rajasthan', \n", " 'Davanagere': 'Karnataka', 'Dehradun': 'Uttarakhand', 'Delhi': 'Delhi', 'Dewas': 'Madhya Pradesh', \n", " 'Dhanbad': 'Jharkhand', 'Dharuhera': 'Haryana', 'Dharwad': 'Karnataka', 'Dholpur': 'Rajasthan', \n", " 'Dhule': 'Maharashtra', 'Dindigul': 'Tamil Nadu', 'Dungarpur': 'Rajasthan', 'Durgapur': 'West Bengal', \n", " 'Eloor': 'Kerala', 'Ernakulam': 'Kerala', 'Faridabad': 'Haryana', 'Fatehabad': 'Haryana', \n", " 'Firozabad': 'Uttar Pradesh', 'Gadag': 'Karnataka', 'Gandhinagar': 'Gujarat', 'Gangtok': 'Sikkim', \n", " 'Gaya': 'Bihar', 'Ghaziabad': 'Uttar Pradesh', 'Gorakhpur': 'Uttar Pradesh', 'Greater_Noida': 'Uttar Pradesh', \n", " 'Gummidipoondi': 'Tamil Nadu', 'Gurugram': 'Haryana', 'Guwahati': 'Assam', 'Gwalior': 'Madhya Pradesh', \n", " 'Hajipur': 'Bihar', 'Haldia': 'West Bengal', 'Hanumangarh': 'Rajasthan', 'Hapur': 'Uttar Pradesh', \n", " 'Hassan': 'Karnataka', 'Haveri': 'Karnataka', 'Hisar': 'Haryana', 'Hosur': 'Tamil Nadu', 'Howrah': 'West Bengal', \n", " 'Hubballi': 'Karnataka', 'Hyderabad': 'Telangana', 'Imphal': 'Manipur', 'Indore': 'Madhya Pradesh', \n", " 'Jabalpur': 'Madhya Pradesh', 'Jaipur': 'Rajasthan', 'Jaisalmer': 'Rajasthan', 'Jalandhar': 'Punjab', \n", " 'Jalgaon': 'Maharashtra', 'Jalna': 'Maharashtra', 'Jalore': 'Rajasthan', 'Jhalawar': 'Rajasthan', \n", " 'Jhansi': 'Uttar Pradesh', 'Jharsuguda': 'Odisha', 'Jhunjhunu': 'Rajasthan', 'Jind': 'Haryana', \n", " 'Jodhpur': 'Rajasthan', 'Jorapokhar': 'Jharkhand', 'Kadapa': 'Andhra Pradesh', 'Kaithal': 'Haryana', \n", " 'Kalaburgi': 'Karnataka', 'Kalyan': 'Maharashtra', 'Kanchipuram': 'Tamil Nadu', 'Kannur': 'Kerala', \n", " 'Kanpur': 'Uttar Pradesh', 'Karauli': 'Rajasthan', 'Karnal': 'Haryana', 'Karur': 'Tamil Nadu', \n", " 'Karwar': 'Karnataka', 'Kashipur': 'Uttarakhand', 'Katihar': 'Bihar', 'Katni': 'Madhya Pradesh', \n", " 'Keonjhar': 'Odisha', 'Khanna': 'Punjab', 'Khurja': 'Uttar Pradesh', 'Kishanganj': 'Bihar', \n", " 'Kochi': 'Kerala', 'Kohima': 'Nagaland', 'Kolar': 'Karnataka', 'Kolhapur': 'Maharashtra', \n", " 'Kolkata': 'West Bengal', 'Kollam': 'Kerala', 'Koppal': 'Karnataka', 'Korba': 'Chhattisgarh', \n", " 'Kota': 'Rajasthan', 'Kozhikode': 'Kerala', 'Kunjemura': 'Jharkhand', 'Kurushketra': 'Haryana', \n", " 'Latur': 'Maharashtra', 'Loni_Ghaziabad': 'Uttar Pradesh', 'Lucknow': 'Uttar Pradesh', 'Ludhiana': 'Punjab', \n", " 'Madurai': 'Tamil Nadu', 'Mahad': 'Maharashtra', 'Maihar': 'Madhya Pradesh', 'Malegaon': 'Maharashtra', \n", " 'Mandi Gobindgarh': 'Punjab', 'Mandideep': 'Madhya Pradesh', 'Mandikhera': 'Haryana', 'Manesar': 'Haryana', \n", " 'Mangalore': 'Karnataka', 'Manguraha': 'Bihar', 'Medikeri': 'Karnataka', 'Meerut': 'Uttar Pradesh', \n", " 'Milupara': 'Chhattisgarh', 'Mira-Bhayandar': 'Maharashtra', 'Moradabad': 'Uttar Pradesh', \n", " 'Motihari': 'Bihar', 'Mumbai': 'Maharashtra', 'Munger': 'Bihar', 'Muzaffarnagar': 'Uttar Pradesh', \n", " 'Muzaffarpur': 'Bihar', 'Mysuru': 'Karnataka', 'NOIDA': 'Uttar Pradesh', 'Nagaon': 'Assam', \n", " 'Nagapattinam': 'Tamil Nadu', 'Nagaur': 'Rajasthan', 'Nagpur': 'Maharashtra', 'Naharlagun': 'Arunachal Pradesh', \n", " 'Nalbari': 'Assam', 'Nanded': 'Maharashtra', 'Nandesari': 'Gujarat', 'Narnaul': 'Haryana', 'Nashik': 'Maharashtra',\n", " 'Navi Mumbai': 'Maharashtra',\n", " 'Nayagarh': 'Odisha',\n", " 'Noida': 'Uttar Pradesh',\n", " 'Ooty': 'Tamil Nadu',\n", " 'Pali': 'Rajasthan',\n", " 'Palkalaiperur': 'Tamil Nadu',\n", " 'Palwal': 'Haryana',\n", " 'Panchkula': 'Haryana',\n", " 'Panipat': 'Haryana',\n", " 'Parbhani': 'Maharashtra',\n", " 'Pathardih': 'Jharkhand',\n", " 'Patiala': 'Punjab',\n", " 'Patna': 'Bihar',\n", " 'Pimpri-Chinchwad': 'Maharashtra',\n", " 'Pithampur': 'Madhya Pradesh',\n", " 'Pratapgarh': 'Rajasthan',\n", " 'Prayagraj': 'Uttar Pradesh',\n", " 'Puducherry': 'Puducherry',\n", " 'Pudukottai': 'Tamil Nadu',\n", " 'Pune': 'Maharashtra',\n", " 'Purnia': 'Bihar',\n", " 'Raichur': 'Karnataka',\n", " 'Raipur': 'Chhattisgarh',\n", " 'Rairangpur': 'Odisha',\n", " 'Rajamahendravaram': 'Andhra Pradesh',\n", " 'Rajgir': 'Bihar',\n", " 'Rajsamand': 'Rajasthan',\n", " 'Ramanagara': 'Karnataka',\n", " 'Ramanathapuram': 'Tamil Nadu',\n", " 'Ranipet': 'Tamil Nadu',\n", " 'Ratlam': 'Madhya Pradesh',\n", " 'Rishikesh': 'Uttarakhand',\n", " 'Rohtak': 'Haryana',\n", " 'Rourkela': 'Odisha',\n", " 'Rupnagar': 'Punjab',\n", " 'Sagar': 'Madhya Pradesh',\n", " 'Saharsa': 'Bihar',\n", " 'Salem': 'Tamil Nadu',\n", " 'Samastipur': 'Bihar',\n", " 'Sangli': 'Maharashtra',\n", " 'Sasaram': 'Bihar',\n", " 'Satna': 'Madhya Pradesh',\n", " 'Sawai Madhopur': 'Rajasthan',\n", " 'Shillong': 'Meghalaya',\n", " 'Shivamogga': 'Karnataka',\n", " 'Sikar': 'Rajasthan',\n", " 'Silchar': 'Assam',\n", " 'Siliguri': 'West Bengal',\n", " 'Singrauli': 'Madhya Pradesh',\n", " 'Sirohi': 'Rajasthan',\n", " 'Sirsa': 'Haryana',\n", " 'Sivasagar': 'Assam',\n", " 'Siwan': 'Bihar',\n", " 'Solapur': 'Maharashtra',\n", " 'Sonipat': 'Haryana',\n", " 'Sri Ganganagar': 'Rajasthan',\n", " 'Srinagar': 'Jammu and Kashmir',\n", " 'Suakati': 'Odisha',\n", " 'Surat': 'Gujarat',\n", " 'Talcher': 'Odisha',\n", " 'Tensa': 'Odisha',\n", " 'Thane': 'Maharashtra',\n", " 'Thanjavur': 'Tamil Nadu',\n", " 'Thiruvananthapuram': 'Kerala',\n", " 'Thoothukudi': 'Tamil Nadu',\n", " 'Thrissur': 'Kerala',\n", " 'Tiruchirappalli': 'Tamil Nadu',\n", " 'Tirunelveli': 'Tamil Nadu',\n", " 'Tirupati': 'Andhra Pradesh',\n", " 'Tirupur': 'Tamil Nadu',\n", " 'Tonk': 'Rajasthan',\n", " 'Tumidih': 'Chhattisgarh',\n", " 'Udaipur': 'Rajasthan',\n", " 'Udupi': 'Karnataka',\n", " 'Ujjain': 'Madhya Pradesh',\n", " 'Ulhasnagar': 'Maharashtra',\n", " 'Vapi': 'Gujarat',\n", " 'Varanasi': 'Uttar Pradesh', \n", " 'Vatva': 'Gujarat', 'Vellore': 'Tamil Nadu',\n", " 'Vijayapura': 'Karnataka',\n", " 'Vijayawada': 'Andhra Pradesh',\n", " 'Virar': 'Maharashtra',\n", " 'Virudhunagar': 'Tamil Nadu',\n", " 'Visakhapatnam': 'Andhra Pradesh',\n", " 'Vrindavan': 'Uttar Pradesh',\n", " 'Yadgir': 'Karnataka',\n", " 'Yamunanagar': 'Haryana'\n", "}\n", "merged_df['State'] = merged_df['City'].apply(lambda x: city_to_state[x])\n", "merged_df['State'].value_counts().sort_index()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityAQIPollutantAir QualityBased on number of monitoring stationsDateState
0Agra417PM\\n2.5Severe12016-01-01Uttar Pradesh
1Bengaluru95PM , PM\\n2.5 10Satisfactory52016-01-01Karnataka
\n", "
" ], "text/plain": [ " City AQI Pollutant Air Quality \\\n", "0 Agra 417 PM\\n2.5 Severe \n", "1 Bengaluru 95 PM , PM\\n2.5 10 Satisfactory \n", "\n", " Based on number of monitoring stations Date State \n", "0 1 2016-01-01 Uttar Pradesh \n", "1 5 2016-01-01 Karnataka " ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# merged_df['Based on number of monitoring stations'] = merged_df['Based on number of monitoring stations'].apply(lambda x: int(x.split(\"/\")[0].split(\"#\")[0].split(\"\\n\")[0]))\n", "merged_df['Date'] = pd.to_datetime(merged_df['Date'])\n", "merged_df.head(2)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "370681" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(merged_df)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "370681\n", "369967\n" ] } ], "source": [ "merged_df['Date_City'] = merged_df['Date'].astype(str) + \"_\" + merged_df['City']\n", "print(len(merged_df))\n", "merged_df = merged_df.drop_duplicates(subset=['Date_City'], keep='first')\n", "print(len(merged_df))" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "merged_df.set_index([\"Date\", \"City\"]).to_xarray().to_dataframe().to_parquet(\"AQI_data.parquet\")" ] } ], "metadata": { "kernelspec": { "display_name": "zeel_py310", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.15" } }, "nbformat": 4, "nbformat_minor": 2 }