Spaces:
Running
Running
File size: 16,831 Bytes
a49479d 7929990 bb2a10f 7929990 bb2a10f d086442 a49479d d086442 7929990 d086442 bb2a10f a49479d bb2a10f a49479d d086442 a49479d d086442 a49479d d086442 a49479d d086442 a49479d bb2a10f a49479d d086442 a49479d 7929990 a49479d 7929990 a49479d 7929990 a49479d 7929990 a49479d d086442 a49479d 7929990 a49479d d086442 a49479d d086442 a49479d 7929990 a49479d 7929990 a49479d 7929990 a49479d d086442 a49479d d086442 a49479d 7929990 a49479d 7929990 a49479d d086442 a49479d bb2a10f a49479d bb2a10f a49479d 7929990 a49479d bb2a10f 7929990 a49479d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 |
"""
Feature engineering module for biomass prediction.
This module extracts the 99 features needed by the StableResNet model.
Author: najahpokkiri
Date: 2025-05-19
"""
import numpy as np
import logging
from datetime import datetime
# Configure logger
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Optional dependencies: the module must stay importable without them, so a
# failed import only flips the matching *_AVAILABLE flag and logs a warning.
try:
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA
    SKLEARN_AVAILABLE = True
except ImportError:
    SKLEARN_AVAILABLE = False
    logger.warning("scikit-learn not available. PCA features will be approximated.")

try:
    from skimage.filters import sobel
    from skimage.feature import local_binary_pattern
    try:
        from skimage.feature import graycomatrix, graycoprops
    except ImportError:
        # scikit-image releases before 0.19 only ship the "grey" spelling;
        # alias it so the rest of the module can use a single name.
        from skimage.feature import greycomatrix as graycomatrix
        from skimage.feature import greycoprops as graycoprops
    SKIMAGE_AVAILABLE = True
except ImportError:
    SKIMAGE_AVAILABLE = False
    logger.warning("scikit-image not available. Texture features will be approximated.")
def safe_divide(a, b, fill_value=0.0):
    """Divide ``a`` by ``b`` element-wise without producing NaN or Inf.

    Both operands are converted to float32 and sanitised before dividing:
    NaN/Inf entries of the numerator become 0, NaN entries of the denominator
    become 1e-10 and +/-Inf entries become +/-1e10. Wherever the sanitised
    denominator has magnitude below 1e-10 the result is ``fill_value``; any
    non-finite quotient is replaced by ``fill_value`` as well.

    Parameters:
        a: Numerator (scalar or array-like).
        b: Denominator (scalar or array-like); may have any shape that
            broadcasts against ``a``.
        fill_value (float): Value used where the division is not possible.

    Returns:
        float32 result with the broadcast shape of ``a`` and ``b``.
    """
    numerator = np.nan_to_num(np.asarray(a, dtype=np.float32),
                              nan=0.0, posinf=0.0, neginf=0.0)
    denominator = np.nan_to_num(np.asarray(b, dtype=np.float32),
                                nan=1e-10, posinf=1e10, neginf=-1e10)

    # Broadcast explicitly: boolean-mask indexing would fail when the two
    # operands have different (but compatible) shapes.
    numerator, denominator = np.broadcast_arrays(numerator, denominator)
    divisible = np.abs(denominator) >= 1e-10

    # Start from fill_value and only overwrite positions that can be divided.
    result = np.full(numerator.shape, fill_value, dtype=np.float32)
    np.divide(numerator, denominator, out=result, where=divisible)

    # float32 overflow in the quotient can still yield Inf; map it to fill_value.
    return np.nan_to_num(result, nan=fill_value, posinf=fill_value, neginf=fill_value)
def calculate_spectral_indices(satellite_data):
    """Calculate spectral indices from satellite bands.

    Parameters:
        satellite_data (ndarray): Array indexed as (bands, height, width).

    Returns:
        dict: Maps each of 'NDVI', 'EVI', 'SAVI', 'MSAVI2', 'NDWI', 'NDMI'
        and 'NBR' to a 2-D array. An index whose source bands are missing
        (or whose calculation failed) is a float32 zero plane, provided the
        image has non-zero height and width.
    """
    indices = {}
    n_bands = satellite_data.shape[0]

    def safe_get_band(idx):
        """Return band ``idx`` as a floating-point array, or None if absent."""
        if idx >= n_bands:
            return None
        band = satellite_data[idx]
        # Integer rasters (unsigned ones in particular) would wrap around in
        # differences such as `nir - red`, so promote them before any math.
        if not np.issubdtype(band.dtype, np.floating):
            band = band.astype(np.float32)
        return band

    # Sentinel-2 bands (assuming standard band order)
    # B2(blue), B3(green), B4(red), B8(nir), B11(swir1), B12(swir2)
    # NOTE(review): the positions below are kept from the original mapping;
    # confirm them against the actual band stack.
    try:
        blue = safe_get_band(1)
        green = safe_get_band(2)
        red = safe_get_band(3)
        nir = safe_get_band(7)
        swir1 = safe_get_band(9)
        swir2 = safe_get_band(10)

        if red is not None and nir is not None:
            # NDVI (Normalized Difference Vegetation Index)
            indices['NDVI'] = safe_divide(nir - red, nir + red)
            if blue is not None and green is not None:
                # EVI (Enhanced Vegetation Index)
                indices['EVI'] = 2.5 * safe_divide(nir - red, nir + 6*red - 7.5*blue + 1)
            # SAVI (Soil Adjusted Vegetation Index)
            indices['SAVI'] = 1.5 * safe_divide(nir - red, nir + red + 0.5)
            # MSAVI2 (Modified Soil Adjusted Vegetation Index); a negative
            # radicand yields NaN here, which is zeroed by the cleanup below.
            indices['MSAVI2'] = 0.5 * (2 * nir + 1 - np.sqrt((2 * nir + 1)**2 - 8 * (nir - red)))
        if green is not None and nir is not None:
            # NDWI (Normalized Difference Water Index)
            indices['NDWI'] = safe_divide(green - nir, green + nir)
        if swir1 is not None and nir is not None:
            # NDMI (Normalized Difference Moisture Index)
            indices['NDMI'] = safe_divide(nir - swir1, nir + swir1)
        if swir2 is not None and nir is not None:
            # NBR (Normalized Burn Ratio)
            indices['NBR'] = safe_divide(nir - swir2, nir + swir2)
    except Exception as e:
        # Best effort: indices computed before the failure are kept.
        logger.warning(f"Error calculating spectral indices: {e}")

    # Clean up None values and NaNs
    indices = {k: np.nan_to_num(v, nan=0.0) for k, v in indices.items() if v is not None}

    # Ensure we have all required indices by providing defaults
    required_indices = ['NDVI', 'EVI', 'SAVI', 'MSAVI2', 'NDWI', 'NDMI', 'NBR']
    height, width = satellite_data.shape[1], satellite_data.shape[2]
    for name in required_indices:
        if name not in indices and height > 0 and width > 0:
            indices[name] = np.zeros((height, width), dtype=np.float32)
    return indices
def extract_texture_features(satellite_data):
    """Extract texture features from satellite data.

    All features are derived from the band at index ``min(7, n_bands - 1)``:
    a Sobel edge map, a uniform Local Binary Pattern map (8 neighbours,
    radius 1) of the band rescaled to 0-255 between its 1st and 99th
    percentiles, and four GLCM statistics computed on a central patch of at
    most 128 pixels per side (distance 1, angle 0, 256 grey levels). Each
    GLCM statistic is a single number repeated over the whole image.

    Parameters:
        satellite_data (ndarray): Array indexed as (bands, height, width).

    Returns:
        dict: Maps 'Sobel_B7', 'LBP_B7', 'GLCM_contrast_B7',
        'GLCM_dissimilarity_B7', 'GLCM_homogeneity_B7' and 'GLCM_energy_B7'
        to (height, width) arrays. All six are float32 zero planes when
        scikit-image is unavailable or the extraction raises.
    """
    texture_names = ['Sobel_B7', 'LBP_B7', 'GLCM_contrast_B7', 'GLCM_dissimilarity_B7',
                     'GLCM_homogeneity_B7', 'GLCM_energy_B7']
    glcm_props = ['contrast', 'dissimilarity', 'homogeneity', 'energy']
    height, width = satellite_data.shape[1], satellite_data.shape[2]

    def zero_plane():
        """Return a fresh float32 placeholder plane."""
        return np.zeros((height, width), dtype=np.float32)

    # If scikit-image is not available, return placeholders
    if not SKIMAGE_AVAILABLE:
        return {name: zero_plane() for name in texture_names}

    texture_features = {}
    try:
        # Use NIR band (band 7) for texture features; fall back to the last
        # band when fewer than eight are present.
        b7_idx = min(7, satellite_data.shape[0] - 1)
        band = np.nan_to_num(satellite_data[b7_idx], nan=0.0)

        # 1. Sobel filter for edge detection
        texture_features['Sobel_B7'] = sobel(band)

        # 2. Local Binary Pattern on the band normalized to the 0-255 range
        band_norm = band.copy()
        if np.any(~np.isnan(band)):
            band_min, band_max = np.nanpercentile(band, [1, 99])
            if band_max > band_min:
                band_norm = np.clip((band - band_min) / (band_max - band_min + 1e-8) * 255, 0, 255).astype(np.uint8)
            else:
                # Flat band: every pixel maps to the same grey level.
                band_norm = np.zeros_like(band, dtype=np.uint8)
        texture_features['LBP_B7'] = local_binary_pattern(band_norm, 8, 1, method='uniform')

        # 3. GLCM properties, computed on a central sample patch
        sample_size = min(128, height, width)
        center_y, center_x = height // 2, width // 2
        offset = sample_size // 2
        y_start = max(0, center_y - offset)
        y_end = min(height, center_y + offset)
        x_start = max(0, center_x - offset)
        x_end = min(width, center_x + offset)
        patch = band_norm[y_start:y_end, x_start:x_end]

        if patch.size > 0:
            glcm = graycomatrix(patch, [1], [0], levels=256, symmetric=True, normed=True)
            for prop in glcm_props:
                key = f'GLCM_{prop}_B7'
                try:
                    value = float(graycoprops(glcm, prop)[0, 0])
                except Exception as e:
                    # One failing property must not discard the others.
                    logger.warning(f"Error calculating GLCM {prop}: {e}")
                    texture_features[key] = zero_plane()
                else:
                    # float32 keeps this plane consistent with the placeholders.
                    texture_features[key] = np.full((height, width), value, dtype=np.float32)
        else:
            # Create placeholder GLCM features if patch is invalid
            for prop in glcm_props:
                texture_features[f'GLCM_{prop}_B7'] = zero_plane()
    except Exception as e:
        logger.error(f"Error in texture feature extraction: {e}")
        # Provide placeholder features in case of error
        texture_features = {name: zero_plane() for name in texture_names}
    return texture_features
def _gradient_magnitude(plane):
    """Return the per-pixel gradient magnitude of a 2-D array (NaN read as 0)."""
    grad_y, grad_x = np.gradient(np.nan_to_num(plane, nan=0.0))
    return np.sqrt(grad_x**2 + grad_y**2)


def calculate_spatial_features(satellite_data, indices):
    """Calculate spatial context features like gradients.

    Parameters:
        satellite_data (ndarray): Array indexed as (bands, height, width).
        indices (dict): Spectral indices; only the 'NDVI' entry is read. A
            zero plane is used when it is absent.

    Returns:
        dict: 'Gradient_B7' (gradient magnitude of the band at index
        ``min(7, n_bands - 1)``) and 'NDVI_gradient' (gradient magnitude of
        NDVI), each of shape (height, width). A feature whose calculation
        raises is replaced by a float32 zero plane and a warning is logged.
    """
    spatial_features = {}
    height, width = satellite_data.shape[1], satellite_data.shape[2]

    # 1. Gradient of Band 7 (NIR). The band lookup lives inside the try so
    # that a zero-band array falls back to the placeholder instead of raising.
    try:
        b7_idx = min(7, satellite_data.shape[0] - 1)
        spatial_features['Gradient_B7'] = _gradient_magnitude(satellite_data[b7_idx])
    except Exception as e:
        logger.warning(f"Error calculating band gradient: {e}")
        spatial_features['Gradient_B7'] = np.zeros((height, width), dtype=np.float32)

    # 2. NDVI gradient
    try:
        if 'NDVI' in indices:
            ndvi = indices['NDVI']
        else:
            ndvi = np.zeros((height, width), dtype=np.float32)
        spatial_features['NDVI_gradient'] = _gradient_magnitude(ndvi)
    except Exception as e:
        logger.warning(f"Error calculating NDVI gradient: {e}")
        spatial_features['NDVI_gradient'] = np.zeros((height, width), dtype=np.float32)
    return spatial_features
def calculate_pca_features(satellite_data, n_components=25):
    """Calculate PCA features from satellite bands.

    Pixels are treated as samples and bands as variables. Only pixels whose
    bands are all finite are standardized and fed to the PCA; all other
    pixels receive 0 in every component. When fewer than ``n_components``
    components can be computed, the remaining ones are zero-padded.

    Parameters:
        satellite_data (ndarray): Array indexed as (bands, height, width).
        n_components (int): Number of components to return.

    Returns:
        dict: Maps 'PCA_01' ... 'PCA_{n_components:02d}' to (height, width)
        arrays. Without scikit-learn, the first components are the raw bands
        and the rest are zero planes; if the calculation raises, every
        component is a float32 zero plane.
    """
    height, width = satellite_data.shape[1], satellite_data.shape[2]
    n_bands = satellite_data.shape[0]
    component_names = [f'PCA_{i:02d}' for i in range(1, n_components + 1)]

    def zero_components():
        """Return placeholder planes for every component."""
        return {name: np.zeros((height, width), dtype=np.float32) for name in component_names}

    # If scikit-learn is not available, return placeholders
    if not SKLEARN_AVAILABLE:
        pca_features = zero_components()
        # Use band values directly for the first components
        for band_idx, name in enumerate(component_names[:n_bands]):
            pca_features[name] = satellite_data[band_idx]
        return pca_features

    try:
        # Reshape for PCA (pixels x bands)
        bands_reshaped = satellite_data.reshape(n_bands, -1).T

        # Drop pixels with NaN *or* Inf: StandardScaler rejects both, and a
        # single infinite value would otherwise zero out every component.
        valid_mask = np.all(np.isfinite(bands_reshaped), axis=1)
        bands_clean = bands_reshaped[valid_mask]
        if len(bands_clean) == 0:
            logger.warning("No valid data for PCA calculation")
            return zero_components()

        # Standardize valid data
        bands_scaled = StandardScaler().fit_transform(bands_clean)

        # PCA cannot return more components than samples or variables.
        pca = PCA(n_components=min(n_components, bands_scaled.shape[1], bands_scaled.shape[0]))
        pca_result = pca.fit_transform(bands_scaled)

        # Extend to the full component count if needed
        actual_components = pca_result.shape[1]
        if actual_components < n_components:
            logger.warning(f"Only {actual_components} PCA components calculated, padding to {n_components}")
            padding = np.zeros((pca_result.shape[0], n_components - actual_components))
            pca_result = np.hstack([pca_result, padding])

        # Map back to original pixels; invalid pixels keep 0
        pca_all = np.zeros((bands_reshaped.shape[0], n_components))
        pca_all[valid_mask] = pca_result

        # Reshape to spatial dimensions and store each component by name
        pca_spatial = pca_all.reshape(height, width, n_components)
        pca_features = {name: pca_spatial[:, :, k] for k, name in enumerate(component_names)}

        # Log PCA explained variance
        if hasattr(pca, 'explained_variance_ratio_'):
            logger.info(f"PCA explained variance: {pca.explained_variance_ratio_.sum():.3f}")
    except Exception as e:
        logger.error(f"Error calculating PCA features: {e}")
        # Create placeholder PCA features
        pca_features = zero_components()
    return pca_features
def extract_all_features(satellite_data):
    """
    Extract exactly 99 features needed by the model:
    - 59 original bands
    - 7 spectral indices
    - 6 texture features
    - 2 spatial features
    - 25 PCA components

    Inputs with fewer than 59 bands are zero-padded and bands beyond the
    59th are ignored; a warning is logged in either case.

    Parameters:
        satellite_data (ndarray): Array of shape (bands, height, width)
    Returns:
        features_array (ndarray): float32 array of shape (valid_pixels, 99)
        valid_mask (ndarray): Boolean (height, width) mask of valid pixels
        feature_names (list): List of 99 feature names, in column order
    """
    n_expected_bands = 59
    start_time = datetime.now()
    logger.info("Extracting features for biomass prediction...")
    n_bands = satellite_data.shape[0]
    height, width = satellite_data.shape[1], satellite_data.shape[2]

    if n_bands != n_expected_bands:
        # Padding/truncation changes what the model sees, so make it visible.
        logger.warning(
            f"Expected {n_expected_bands} bands but got {n_bands}; "
            "missing bands are zero-filled and extra bands are ignored"
        )

    # Create valid pixel mask (no NaN or Inf values in any band)
    valid_mask = np.all(np.isfinite(satellite_data), axis=0)
    valid_y, valid_x = np.where(valid_mask)
    n_valid = len(valid_y)
    logger.info(f"Found {n_valid} valid pixels out of {height*width}")

    # Generate all feature categories
    logger.info("Calculating spectral indices...")
    indices = calculate_spectral_indices(satellite_data)
    logger.info("Extracting texture features...")
    texture_features = extract_texture_features(satellite_data)
    logger.info("Calculating spatial features...")
    spatial_features = calculate_spatial_features(satellite_data, indices)
    logger.info("Computing PCA components...")
    pca_features = calculate_pca_features(satellite_data)

    # Ordered feature names: bands, spectral indices, texture, spatial, PCA
    band_names = [f'Band_{i:02d}' for i in range(1, n_expected_bands + 1)]
    spectral_indices = ['NDVI', 'EVI', 'SAVI', 'MSAVI2', 'NDWI', 'NDMI', 'NBR']
    texture_names = ['Sobel_B7', 'LBP_B7', 'GLCM_contrast_B7', 'GLCM_dissimilarity_B7',
                     'GLCM_homogeneity_B7', 'GLCM_energy_B7']
    spatial_names = ['Gradient_B7', 'NDVI_gradient']
    pca_names = [f'PCA_{i:02d}' for i in range(1, 26)]
    feature_names = band_names + spectral_indices + texture_names + spatial_names + pca_names

    # Create feature dictionary with all features
    all_features = {}
    for band_idx, name in enumerate(band_names):
        if band_idx < n_bands:
            all_features[name] = satellite_data[band_idx]
        else:
            # Pad with zeros if we have fewer than 59 bands
            all_features[name] = np.zeros((height, width), dtype=np.float32)
    all_features.update(indices)
    all_features.update(texture_features)
    all_features.update(spatial_features)
    all_features.update(pca_features)

    # Internal invariant on the hard-coded name lists above
    assert len(feature_names) == 99, f"Expected 99 features, but got {len(feature_names)}"

    # Extract feature values for valid pixels; columns follow feature_names
    feature_matrix = np.zeros((n_valid, len(feature_names)), dtype=np.float32)
    for col, name in enumerate(feature_names):
        if name not in all_features:
            # Column is already zero-initialised.
            logger.warning(f"Feature '{name}' not found, using zeros")
            continue
        feature_data = all_features[name]
        if feature_data.ndim == 2:
            feature_values = feature_data[valid_y, valid_x]
        else:
            # Non-2D entries are broadcast as a constant over all valid pixels
            feature_values = np.full(n_valid, feature_data)
        feature_matrix[:, col] = np.nan_to_num(feature_values, nan=0.0)

    processing_time = (datetime.now() - start_time).total_seconds()
    logger.info(f"Successfully extracted {len(feature_names)} features for {n_valid} pixels in {processing_time:.2f} seconds")
    return feature_matrix, valid_mask, feature_names
# Simple test function
def test_feature_extraction():
    """Test the feature extraction pipeline with sample data.

    Runs `extract_all_features` on a seeded random 5-band, 100x100 image,
    prints a summary and checks that the feature matrix has one row per
    valid pixel and one column per feature name.

    Returns:
        bool: True when extraction ran and the checks held, False otherwise
        (the failure and its traceback are printed).
    """
    try:
        # Create sample data (5 bands, 100x100 pixels); seeded so that a
        # failure can be reproduced.
        rng = np.random.default_rng(0)
        satellite_data = rng.random((5, 100, 100)).astype(np.float32)

        # Extract features
        feature_matrix, valid_mask, feature_names = extract_all_features(satellite_data)

        # Print summary
        print(f"Sample data shape: {satellite_data.shape}")
        print(f"Feature matrix shape: {feature_matrix.shape}")
        print(f"Number of feature names: {len(feature_names)}")
        print(f"Valid pixels: {np.sum(valid_mask)}")

        # Verify the output contract instead of only checking "did not raise".
        expected_shape = (int(np.sum(valid_mask)), len(feature_names))
        if feature_matrix.shape != expected_shape:
            raise ValueError(
                f"feature matrix shape {feature_matrix.shape} != expected {expected_shape}"
            )
        if valid_mask.shape != satellite_data.shape[1:]:
            raise ValueError(
                f"valid mask shape {valid_mask.shape} != image shape {satellite_data.shape[1:]}"
            )
        return True
    except Exception as e:
        print(f"Feature extraction test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
if __name__ == "__main__":
    # Run a simple test if this script is executed directly and report the
    # outcome through the process exit status (0 = passed, 1 = failed).
    raise SystemExit(0 if test_feature_extraction() else 1)