## __Data Pipelines__ 
Loading data from OpenStreetMap using overpass API

In [60]:
import requests
import pandas as pd
import re
import math
from typing import Tuple, List, Dict

In [97]:
def fetch_osm_data(lat: float, lon: float, radius: int) -> List[Dict]:
    overpass_url = "http://overpass-api.de/api/interpreter"
    overpass_query = f"""
    [out:json];
    (
      node["name"](around:{radius},{lat},{lon});
      way["name"](around:{radius},{lat},{lon});
      relation["name"](around:{radius},{lat},{lon});
    );
    out center;
    """
    
    response = requests.get(overpass_url, params={'data': overpass_query})
    data = response.json()
    return data['elements']

def determine_location_type(tags: Dict[str, str]) -> str:
    # Residential
    if 'building' in tags and tags['building'] in ['residential', 'house', 'apartments', 'detached', 'terrace', 'dormitory', 'bungalow']:
        return 'Residential'
    
    # Commercial
    if any(key in tags for key in ['shop', 'office', 'craft']):
        return 'Commercial'
    if 'building' in tags and tags['building'] in ['commercial', 'office', 'retail', 'supermarket', 'kiosk']:
        return 'Commercial'
    
    # Industrial
    if 'building' in tags and tags['building'] in ['industrial', 'warehouse', 'factory', 'manufacture']:
        return 'Industrial'
    if 'industrial' in tags or 'industry' in tags:
        return 'Industrial'
    
    # Educational
    if 'amenity' in tags and tags['amenity'] in ['school', 'university', 'college', 'library', 'kindergarten', 'language_school']:
        return 'Educational'
    
    # Healthcare
    if 'amenity' in tags and tags['amenity'] in ['hospital', 'clinic', 'doctors', 'dentist', 'pharmacy', 'veterinary']:
        return 'Healthcare'
    
    # Food & Drink
    if 'amenity' in tags and tags['amenity'] in ['restaurant', 'cafe', 'bar', 'fast_food', 'pub', 'food_court']:
        return 'Food & Drink'
    
    # Leisure & Entertainment
    if 'leisure' in tags or 'tourism' in tags:
        return 'Leisure & Entertainment'
    if 'amenity' in tags and tags['amenity'] in ['theatre', 'cinema', 'nightclub', 'arts_centre', 'community_centre']:
        return 'Leisure & Entertainment'
    
    # Transportation
    if 'amenity' in tags and tags['amenity'] in ['parking', 'bicycle_parking', 'bus_station', 'ferry_terminal']:
        return 'Transportation'
    if 'highway' in tags or 'railway' in tags or 'aeroway' in tags:
        return 'Transportation'
    
    # Religious
    if 'amenity' in tags and tags['amenity'] in ['place_of_worship', 'monastery']:
        return 'Religious'
    
    # Government & Public Services
    if 'amenity' in tags and tags['amenity'] in ['townhall', 'courthouse', 'police', 'fire_station', 'post_office']:
        return 'Government & Public Services'
    
    # Parks & Recreation
    if 'leisure' in tags and tags['leisure'] in ['park', 'playground', 'sports_centre', 'stadium', 'garden']:
        return 'Parks & Recreation'
    
    # Natural
    if 'natural' in tags:
        return 'Natural'
    
    # Landuse
    if 'landuse' in tags:
        landuse = tags['landuse'].capitalize()
        if landuse in ['Residential', 'Commercial', 'Industrial', 'Retail']:
            return landuse
        else:
            return f'Landuse: {landuse}'
    
    # If no specific category is found, return 'Other'
    return 'Other'

def parse_osm_data(elements: List[Dict]) -> pd.DataFrame:
    parsed_data = []
    for element in elements:
        tags = element.get('tags', {})
        parsed_element = {
            'ID': f"{element['type']}_{element['id']}",
            'Location Name': tags.get('name', ''),
            'Location Type': determine_location_type(tags)
        }
        parsed_data.append(parsed_element)
    if len(parsed_data) == 0:
        return pd.DataFrame(columns=['ID', 'Location Name', 'Location Type'])
    return pd.DataFrame(parsed_data)

def get_osm_data(lat: float, lon: float, radius: int) -> pd.DataFrame:
    raw_data = fetch_osm_data(lat, lon, radius)
    return parse_osm_data(raw_data)

def dms_to_decimal(coord_str):
    # Regular expression to match the coordinate format
    pattern = r'(\d+)°(\d+)\'([\d.]+)"([NS])\s*(\d+)°(\d+)\'([\d.]+)"([EW])'
    
    match = re.match(pattern, coord_str)
    if not match:
        raise ValueError("Invalid coordinate format. Expected format: 19°03'08.6\"N 72°54'06.0\"E")

    lat_deg, lat_min, lat_sec, lat_dir, lon_deg, lon_min, lon_sec, lon_dir = match.groups()

    # Convert to decimal degrees
    lat = float(lat_deg) + float(lat_min)/60 + float(lat_sec)/3600
    lon = float(lon_deg) + float(lon_min)/60 + float(lon_sec)/3600

    # Adjust sign based on direction
    if lat_dir == 'S':
        lat = -lat
    if lon_dir == 'W':
        lon = -lon

    return lat, lon

In [91]:
coord_str = '19°00\'56.9"N 72°53\'58.0"E'
radius_meters = 1000
try:
    latitude, longitude = dms_to_decimal(coord_str)
    print(f"Latitude: {latitude}")
    print(f"Longitude: {longitude}")
except ValueError as e:
    print(f"Error: {e}")

Latitude: 19.015805555555556
Longitude: 72.89944444444446


In [92]:
result_df = get_osm_data(latitude, longitude, radius_meters)

In [93]:
result_df.head(10)

Unnamed: 0,ID,Location Name,Location Type
0,node_622002639,Mahul,Other
1,node_622005407,Gowanpada,Other
2,node_1646222635,gadakary bus stop,Transportation
3,node_1646222681,vishnu nagar bus stop,Other
4,node_2932495033,Sree Dutta mandir,Religious
5,node_11954176622,Gavhanpada,Other
6,way_25587616,Bhikaji Damaji Patil Marg,Transportation
7,way_122289587,Mulund - Trombay 220 KV line,Other
8,way_151783563,Laxman Umaji Gadkari Marg,Transportation
9,way_151783570,Vishnu Nagar Road,Transportation


In [94]:
labelled_df = result_df[result_df['Location Type'] != 'Other']
labelled_df = labelled_df[labelled_df['Location Type'] != 'Religious']
labelled_df = labelled_df[labelled_df['Location Type'] != 'Transportation']
labelled_df.head(10)

Unnamed: 0,ID,Location Name,Location Type
11,way_430012316,track,Residential
12,way_430012318,Mumbai Refinery Mahul,Industrial
13,way_430012320,Mumbai Refinery,Industrial


In [95]:
## removing duplicates

loc_types = []
for row in labelled_df.iterrows():
    loc_type = (row[1]['Location Name'], row[1]['Location Type'])
    if loc_type not in loc_types:
        loc_types.append(loc_type)

labelled_df = pd.DataFrame(loc_types, columns=['Location Name', 'Location Type'])
labelled_df.head(20)

Unnamed: 0,Location Name,Location Type
0,track,Residential
1,Mumbai Refinery Mahul,Industrial
2,Mumbai Refinery,Industrial


In [58]:
row_of_dataset = ''

for row in labelled_df.iterrows():
    row_text = row[1]['Location Name'] + ' is a ' + row[1]['Location Type']
    row_of_dataset += row_text + ', '

In [59]:
row_of_dataset

'Oswal Company Trees is a Natural, Newspaper stall is a Commercial, Shiv Polyclinic and Nursing Home is a Healthcare, राजपूत मेडिकल is a Healthcare, Bhabha Atomic Research Centre - BARC is a Industrial, BPCL Sports Club is a Leisure & Entertainment, New Bharat Nagar, Banjara tanda, Hasina Nagar is a Residential, Old Bharat Nagar is a Residential, Rashtriya Chemicals & Fertilizers is a Industrial, Koyna Colony is a Residential, D is a Residential, A-2 is a Residential, flip card is a Commercial, track is a Residential, Mumbai Refinery Mahul is a Industrial, Mumbai Refinery is a Industrial, Trombay Thermal Power Station is a Industrial, Vitta Sanchay Society is a Residential, E is a Residential, Acharya Sharad Narayan Udyan is a Leisure & Entertainment, bmc park is a Leisure & Entertainment, Mysore Colony Central Garden is a Leisure & Entertainment, BMC owned trees is a Natural, BMC PARK is a Leisure & Entertainment, Mysore colony eastern park is a Leisure & Entertainment, Trees owned by

This is one row of the  dataset, now writing a function to extract all these rows from a given large map area

In [61]:
## input point is at the bottom left of the map

def calculate_distant_points(lat: float, lon: float, distance: float) -> tuple:
    # Earth's radius in meters
    R = 6371000

    # Convert latitude and longitude to radians
    lat_rad = math.radians(lat)
    lon_rad = math.radians(lon)

    # Calculate the point with the same latitude (moving east-west)
    delta_lon = distance / (R * math.cos(lat_rad))
    lon1 = lon + math.degrees(delta_lon)
    
    # Calculate the point with the same longitude (moving north-south)
    delta_lat = distance / R
    lat2 = lat + math.degrees(delta_lat)

    return ((lat, lon1), (lat2, lon))

In [66]:
if __name__ == "__main__":
    latitude = 40.7128  # New York City latitude
    longitude = -74.0060  # New York City longitude
    distance = 1000*25  # 1000 meters

    result = calculate_distant_points(latitude, longitude, distance)
    print(f"Original point: ({latitude}, {longitude})")
    print(f"Point 1000m east: ({result[0][0]:.6f}, {result[0][1]:.6f})")
    print(f"Point 1000m north: ({result[1][0]:.6f}, {result[1][1]:.6f})")

Original point: (40.7128, -74.006)
Point 1000m east: (40.712800, -73.709386)
Point 1000m north: (40.937630, -74.006000)


In [69]:
bottom_left_latitude = 40.7128
bottom_left_longitude = -74.0060

result = calculate_distant_points(bottom_left_latitude, bottom_left_longitude, 1000*25)

top_left_latitude = result[1][0]
top_left_longitude = result[1][1]

bottom_right_latitude = result[0][0]
bottom_right_longitude = result[0][1]

top_right_latitude = top_left_latitude
top_right_longitude = bottom_right_longitude

print(f"Bottom Left:    ({bottom_left_latitude}, {bottom_left_longitude})")
print(f"Top Left:       ({top_left_latitude}, {top_left_longitude})")
print(f"Bottom Right:   ({bottom_right_latitude}, {bottom_right_longitude})")
print(f"Top Right:      ({top_right_latitude}, {top_right_longitude})")

Bottom Left:    (40.7128, -74.006)
Top Left:       (40.93763040147969, -74.006)
Bottom Right:   (40.7128, -73.7093855252233)
Top Right:      (40.93763040147969, -73.7093855252233)


In [71]:
latitude_shift = top_left_latitude - bottom_left_latitude
longitude_shift = bottom_right_longitude - bottom_left_longitude

latitude_unit = latitude_shift / 25
longitude_unit = longitude_shift / 25

latitude_unit, longitude_unit

(0.008993216059187433, 0.01186457899106813)

In [73]:
## 2d map grid (0,0) --> bottom left

def create_map_grid(bottom_left: Tuple[float, float], top_right: Tuple[float, float], rows: int, cols: int) -> List[List[Tuple[float, float]]]:
    grid = []
    lat_unit = (top_right[0] - bottom_left[0]) / rows
    lon_unit = (top_right[1] - bottom_left[1]) / cols
    
    for i in range(rows):
        row = []
        for j in range(cols):
            lat = bottom_left[0] + i * lat_unit
            lon = bottom_left[1] + j * lon_unit
            lat = lat + lat_unit / 2
            lon = lon + lon_unit / 2
            row.append((lat, lon))
        grid.append(row)
    
    return grid

In [79]:
grid = create_map_grid((bottom_left_latitude, bottom_left_longitude), (top_right_latitude, top_right_longitude), 25, 25)

In [108]:
grid_dataset = []
for i, row in enumerate(grid):
    for j, point in enumerate(row):
        
        grid_row = {"row": i, "col": j, "latitude": point[0], "longitude": point[1]}
        grid_dataset.append(grid_row)

grid_df = pd.DataFrame(grid_dataset)

In [83]:
left_lat = 18.889833
left_lon = 72.779844

In [84]:
res1 =  calculate_distant_points(left_lat, left_lon, 1000*35)

right_lat = res1[1][0]
right_lon = res1[0][1]

In [85]:
grid = create_map_grid((left_lat, left_lon), (right_lat, right_lon), 35, 35)

In [None]:
grid_dataset = []
for i, row in enumerate(grid):
    for j, point in enumerate(row):
        grid_row = {"row": i, "col": j, "latitude": point[0], "longitude": point[1]}
        grid_dataset.append(grid_row)

grid_df = pd.DataFrame(grid_dataset)
grid_df.head(25)

In [106]:
## entire pipeline

left_lat = 18.889833
left_lon = 72.779844
dist = 35

res1 =  calculate_distant_points(left_lat, left_lon, 1000*dist)

right_lat = res1[1][0]
right_lon = res1[0][1]
grid = create_map_grid((left_lat, left_lon), (right_lat, right_lon), dist, dist)

grid_dataset = []
for i, row in enumerate(grid):
    for j, point in enumerate(row):
        result_df = get_osm_data(point[0], point[1], 710)
        # print(result_df.head(3))
        labelled_df = result_df[result_df['Location Type'] != 'Other']
        labelled_df = labelled_df[labelled_df['Location Type'] != 'Religious']
        labelled_df = labelled_df[labelled_df['Location Type'] != 'Transportation']
        loc_types = []
        for row in labelled_df.iterrows():
            loc_type = (row[1]['Location Name'], row[1]['Location Type'])
            if loc_type not in loc_types:
                loc_types.append(loc_type)

        labelled_df = pd.DataFrame(loc_types, columns=['Location Name', 'Location Type'])

        row_of_dataset = ''

        for row in labelled_df.iterrows():
            row_text = row[1]['Location Name'] + ' is a ' + row[1]['Location Type']
            row_of_dataset += row_text + '; '
        ## replacing any coma in the text with a blank space

        row_of_dataset = row_of_dataset.replace(',', ' ')
        
        grid_row = {"row": i, "col": j, "latitude": point[0], "longitude": point[1], "Map Data": row_of_dataset}
        grid_dataset.append(grid_row)

grid_df = pd.DataFrame(grid_dataset)
grid_df.to_csv('MMR_DATASET.csv', index=False)

In [107]:
grid_df.head(20)

Unnamed: 0,row,col,latitude,longitude,Map Data
0,0,0,18.89433,72.784597,
1,0,1,18.89433,72.794102,"Prongs Reef is a Natural,"
2,0,2,18.89433,72.803607,United Services Club Golf Course is a Leisure ...
3,0,3,18.89433,72.813112,Indian Meterological Department is a Commercia...
4,1,0,18.903323,72.784597,
5,1,1,18.903323,72.794102,
6,1,2,18.903323,72.803607,"Jagadish Canteen is a Food & Drink, Maratha St..."
7,1,3,18.903323,72.813112,Indian Meterological Department is a Commercia...
8,2,0,18.912316,72.784597,
9,2,1,18.912316,72.794102,
