| | import pandas as pd |
| | import geopandas as gpd |
| | import matplotlib.pyplot as plt |
| | import os |
| | import glob |
| | from os.path import join as pjoin |
| | import math |
| | import numpy as np |
| | import sys |
| | import time |
| |
|
| | from config import get_dataset_path, get_shapefile_from_s3, DATASET_S3_KEYS |
| |
|
| |
|
# Citation text registered for each known map layer, keyed by layer name.
_LAYER_SOURCE_CITATIONS = {
    "seismic": "UK BGS earthquake data (https://www.earthquakes.bgs.ac.uk)",
    "drilling": "UKCS daily production data",
    "licences": "UKCS licensed blocks data (https://www.arcgis.com/home/item.html?id=92b08a672721407ca90ed26e67514af8)",
    "wells": "UKCS wells data (https://www.arcgis.com/home/item.html?id=92b08a672721407ca90ed26e67514af8)",
    "pipelines": "UKCS pipeline data (https://www.arcgis.com/home/item.html?id=92b08a672721407ca90ed26e67514af8)",
    "offshore_fields": "UKCS offshore fields data (https://www.arcgis.com/home/item.html?id=92b08a672721407ca90ed26e67514af8)",
    "windfarms": "EMODNet active offshore wind farms data (https://emodnet.ec.europa.eu/)",
    "copernicus_wind": "Copernicus Marine Data Service's Global Ocean Hourly Reprocessed Sea Surface Wind and Stress from Scatterometer and Model API, https://data.marine.copernicus.eu/product/WIND_GLO_PHY_L4_MY_012_006/description.",
    "copernicus_wave": "Copernicus Marine Data Service's Global Ocean L 4 Significant Wave Height From Reprocessed Satellite Measurements API, https://data.marine.copernicus.eu/product/WAVE_GLO_PHY_SWH_L4_MY_014_007/description.",
}


def add_data_source(run_context, layer_list):
    """
    Register the data-source citation of every known layer in ``layer_list``.

    For each layer name, in order, the matching citation string is passed to
    ``run_context.deps.add_source``. Names without a known citation are
    skipped silently. Repeated names result in repeated ``add_source`` calls.

    Args:
        run_context: Object exposing ``deps.add_source(str)``.
        layer_list: Iterable of layer names (strings).
    """
    for layer_name in layer_list:
        citation = _LAYER_SOURCE_CITATIONS.get(layer_name)
        if citation is not None:
            run_context.deps.add_source(citation)
| |
|
| | |
# Layers whose only clean-up is renaming one column to "Name" and keeping a
# fixed set of extra columns: layer -> (source column, extra columns kept).
_NAME_COLUMN_LAYERS = {
    "drilling": ("Field", ["GasAvg"]),
    "pipelines": ("PIPE_NAME", []),
    "offshore_fields": ("FIELDNAME", []),
    "wells": ("WELLREGNO", ["ORIGINSTAT"]),
}


def load_data_and_process(layer_name: str) -> "gpd.GeoDataFrame":
    """
    Load the dataset for ``layer_name`` and reduce it to the columns the map needs.

    The storage key is looked up in ``DATASET_S3_KEYS``. Keys ending in
    ``.csv`` are read with pandas from the path returned by
    ``get_dataset_path``; when the CSV has ``Lat`` and ``Lon`` columns it is
    turned into a point GeoDataFrame in EPSG:4326. Keys ending in ``.shp`` are
    read with geopandas from the path returned by ``get_shapefile_from_s3``.
    (Presumably both helpers download from S3; their implementation is not
    visible here.)

    Known layers are then trimmed to ``geometry``, a ``Name`` column and, for
    some layers, one extra attribute column. Any other layer name is returned
    with its columns unchanged. Progress is printed to stdout and flushed
    after every message.

    Args:
        layer_name: Layer identifier; must be a key of ``DATASET_S3_KEYS``.

    Returns:
        A GeoDataFrame for the layer.

    Raises:
        KeyError: If ``layer_name`` is missing from ``DATASET_S3_KEYS`` or an
            expected column is absent from the loaded data.
        ValueError: If the storage key ends in neither ``.csv`` nor ``.shp``.
    """
    # NOTE(review): the "β" prefix in the log messages looks like a
    # mis-encoded symbol; it is runtime output, so it is left untouched here.
    # print(..., flush=True) already flushes stdout, so no extra flush calls.
    print(f" β load_data_and_process({layer_name})", flush=True)
    start = time.perf_counter()  # monotonic: unaffected by clock adjustments

    s3_key = DATASET_S3_KEYS[layer_name]
    print(f" β S3 key: {s3_key}", flush=True)

    if s3_key.endswith('.csv'):
        print(" β Loading CSV...", flush=True)
        local_path = get_dataset_path(layer_name)
        print(f" β Local path: {local_path}", flush=True)

        df = pd.read_csv(local_path)
        print(f" β CSV loaded: {len(df)} rows", flush=True)

        if 'Lat' in df.columns and 'Lon' in df.columns:
            print(" β Converting to GeoDataFrame...", flush=True)
            df = gpd.GeoDataFrame(
                df,
                geometry=gpd.points_from_xy(df['Lon'], df['Lat']),
                crs='EPSG:4326',
            )

    elif s3_key.endswith('.shp'):
        print(" β Loading shapefile...", flush=True)
        local_path = get_shapefile_from_s3(layer_name)
        print(f" β Local path: {local_path}", flush=True)

        df = gpd.read_file(local_path)
        print(f" β Shapefile loaded: {len(df)} rows", flush=True)

    else:
        raise ValueError(f"Unsupported file type for {layer_name}")

    print(f" β Processing {layer_name}...", flush=True)

    # Point layers: build geometry from Lon/Lat if the load step did not.
    # (The CSV branch above already does this whenever both columns exist, so
    # reaching this with a CSV means a coordinate column is missing and the
    # lookup raises KeyError, exactly as before.)
    if layer_name in ("seismic", "drilling") and 'geometry' not in df.columns:
        df["geometry"] = gpd.points_from_xy(df['Lon'], df['Lat'])

    if layer_name == "seismic":
        df["Name"] = df["Region"] + ", " + df["Comment"]
        df = df[["geometry", "Name"]]

    elif layer_name in _NAME_COLUMN_LAYERS:
        source_column, extra_columns = _NAME_COLUMN_LAYERS[layer_name]
        df = df.rename(columns={source_column: 'Name'})
        df = df[["geometry", "Name", *extra_columns]]

    elif layer_name == "licences":
        licence_name = (
            df['LICTYPE'] + df['LICNO'].astype(str)
            + '_' + df['BLOCKREF'] + '_' + df['BLOCKSUFFI']
        )
        # A missing part makes the whole concatenation NaN; store "" instead.
        # Filling before the column selection avoids assigning onto a slice.
        df['Name'] = licence_name.fillna("")
        df = df[["geometry", "Name"]]

    elif layer_name == "windfarms":
        # .copy() so that adding "Name" below writes to an independent frame
        # rather than to a filtered view (avoids SettingWithCopyWarning).
        df = df[df["STATUS"].isin(["Construction", "Production"])].copy()
        df['Name'] = df['NAME'].fillna('Unnamed') + ' (' + df['POWER_MW'].astype(str) + 'MW)'
        df = df[["geometry", "Name"]]

    # Callers always get a GeoDataFrame, even if the load step produced a
    # plain DataFrame.
    if not isinstance(df, gpd.GeoDataFrame):
        df = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')

    elapsed = time.perf_counter() - start
    print(f" β load_data_and_process({layer_name}) complete in {elapsed:.2f}s", flush=True)

    return df
| |
|
| |
|
def calculate_distance(lat1: float, lon1: float, lat2: float, lon2: float,
                       *, radius_km: float = 6371.0) -> float:
    """
    Calculate distance between two points using Haversine formula.

    Args:
        lat1: Latitude of the first point, in decimal degrees.
        lon1: Longitude of the first point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.
        radius_km: Radius of the sphere in kilometers. Defaults to 6371,
            the conventional mean Earth radius.

    Returns:
        Distance in kilometers
    """
    phi1, lam1, phi2, lam2 = map(math.radians, (lat1, lon1, lat2, lon2))

    dlat = phi2 - phi1
    dlon = lam2 - lam1

    a = (math.sin(dlat / 2) ** 2 +
         math.cos(phi1) * math.cos(phi2) * math.sin(dlon / 2) ** 2)
    # Rounding can push `a` a hair outside [0, 1] for (near-)antipodal points,
    # which would make sqrt/asin raise "math domain error"; clamp it first.
    a = min(1.0, max(0.0, a))
    c = 2 * math.asin(math.sqrt(a))

    return radius_km * c
| |
|