yvokeller committed
Commit 5b24075 · 1 Parent(s): 1eea5f1

first messis demo app version

.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ hf_cache
2
+ __pycache__
inference.py ADDED
@@ -0,0 +1,235 @@
1
+ import os
2
+ import torch
3
+ import yaml
4
+ import json
5
+ import rasterio
6
+ from rasterio.windows import Window
7
+ from rasterio.transform import rowcol
8
+ from pyproj import Transformer
9
+ from torchvision import transforms
10
+ import numpy as np
11
+ from rasterio.features import shapes
12
+ from shapely.geometry import shape
13
+ import geopandas as gpd
14
+
15
+ from messis.messis import LogConfusionMatrix
16
+
17
+ class InferenceDataLoader:
18
+ def __init__(self, features_path, labels_path, field_ids_path, stats_path, window_size=224, n_timesteps=3, fold_indices=None, debug=False):
19
+ self.features_path = features_path
20
+ self.labels_path = labels_path
21
+ self.field_ids_path = field_ids_path
22
+ self.stats_path = stats_path
23
+ self.window_size = window_size
24
+ self.n_timesteps = n_timesteps
25
+ self.fold_indices = fold_indices if fold_indices is not None else []
26
+ self.debug = debug
27
+
28
+ # Load normalization stats
29
+ self.means, self.stds = self.load_stats()
30
+
31
+ # Set up the transformer for coordinate conversion
32
+ self.transformer = Transformer.from_crs("EPSG:4326", "EPSG:32632", always_xy=True)
33
+
34
+ def load_stats(self):
35
+ """Load normalization statistics for dataset from YAML file."""
36
+ if self.debug:
37
+ print(f"Loading mean/std stats from {self.stats_path}")
38
+ assert os.path.exists(self.stats_path), f"Mean/std stats file not found at {self.stats_path}"
39
+
40
+ with open(self.stats_path, 'r') as file:
41
+ stats = yaml.safe_load(file)
42
+
43
+ mean_list, std_list, n_list = [], [], []
44
+ for fold in self.fold_indices:
45
+ key = f'fold_{fold}'
46
+ if key not in stats:
47
+ raise ValueError(f"Mean/std stats for fold {fold} not found in {self.stats_path}")
48
+ if self.debug:
49
+ print(f"Stats with selected test fold {fold}: {stats[key]} over {self.n_timesteps} timesteps.")
50
+ mean_list.append(torch.tensor(stats[key]['mean'])) # list of 6 means
51
+ std_list.append(torch.tensor(stats[key]['std'])) # list of 6 stds
52
+ n_list.append(stats[key]['n_chips']) # list of 6 ns
53
+
54
+ means, stds = [], []
55
+ for channel in range(mean_list[0].shape[0]):
56
+ means.append(torch.stack([mean_list[i][channel] for i in range(len(mean_list))]).mean())
57
+ variances = torch.stack([std_list[i][channel] ** 2 for i in range(len(std_list))])
58
+ n = torch.tensor([n_list[i] for i in range(len(n_list))], dtype=torch.float32)
59
+ combined_variance = torch.sum(variances * (n - 1)) / (torch.sum(n) - len(n_list))
60
+ stds.append(torch.sqrt(combined_variance))
61
+
62
+ return means * self.n_timesteps, stds * self.n_timesteps # repeat the per-band stats per timestep to get the flat channel list expected by Normalize
63
+
64
+ def identify_window(self, path, lon, lat):
65
+ """Identify the 224x224 window centered on the clicked coordinates (lon, lat) from the specified GeoTIFF."""
66
+ with rasterio.open(path) as src:
67
+ # Transform the coordinates from WGS84 to UTM (EPSG:32632)
68
+ utm_x, utm_y = self.transformer.transform(lon, lat)
69
+
70
+ try:
71
+ py, px = rowcol(src.transform, utm_x, utm_y) # rasterio's rowcol returns (row, col)
72
+ except ValueError:
73
+ raise ValueError("Coordinates out of bounds for this raster.")
74
+
75
+ if self.debug:
76
+ print(f"Row: {py}, Column: {px}")
77
+
78
+ half_window_size = self.window_size // 2
79
+
80
+ col_off = px - half_window_size
81
+ row_off = py - half_window_size
82
+
83
+ if col_off < 0:
84
+ col_off = 0
85
+ if row_off < 0:
86
+ row_off = 0
87
+ if col_off + self.window_size > src.width:
88
+ col_off = src.width - self.window_size
89
+ if row_off + self.window_size > src.height:
90
+ row_off = src.height - self.window_size
91
+
92
+ window = Window(col_off, row_off, self.window_size, self.window_size)
93
+ window_transform = src.window_transform(window)
94
+ crs = src.crs
95
+
96
+ return window, window_transform, crs
97
+
98
+ def extract_window(self, path, window):
99
+ """Extract data from the specified window from the GeoTIFF."""
100
+ with rasterio.open(path) as src:
101
+ window_data = src.read(window=window)
102
+
103
+ if self.debug:
104
+ print(f"Extracted window data from {path}")
105
+ print(f"Min: {window_data.min()}, Max: {window_data.max()}")
106
+
107
+ return window_data
108
+
109
+ def prepare_data_for_model(self, features_data):
110
+ """Prepare the window data for model inference."""
111
+ # Convert to tensor
112
+ features_data = torch.tensor(features_data, dtype=torch.float32)
113
+
114
+ # Normalize
115
+ normalize = transforms.Normalize(mean=self.means, std=self.stds)
116
+ features_data = normalize(features_data)
117
+
118
+ # Permute the dimensions if needed
119
+ height, width = features_data.shape[-2:]
120
+ features_data = features_data.view(self.n_timesteps, 6, height, width).permute(1, 0, 2, 3)
121
+
122
+ # Add batch dimension
123
+ features_data = features_data.unsqueeze(0)
124
+
125
+ return features_data
126
+
127
+ def get_data(self, lon, lat):
128
+ """Extract, normalize, and prepare data for inference, including labels and field IDs."""
129
+ # Identify the window and get the georeferencing information
130
+ window, features_transform, features_crs = self.identify_window(self.features_path, lon, lat)
131
+
132
+ # Extract data from the GeoTIFF, labels, and field IDs
133
+ features_data = self.extract_window(self.features_path, window)
134
+ label_data = self.extract_window(self.labels_path, window)
135
+ field_ids_data = self.extract_window(self.field_ids_path, window)
136
+
137
+ # Prepare the window data for the model
138
+ prepared_features_data = self.prepare_data_for_model(features_data)
139
+
140
+ # Convert labels and field_ids to tensors (without normalization)
141
+ label_data = torch.tensor(label_data, dtype=torch.long)
142
+ field_ids_data = torch.tensor(field_ids_data, dtype=torch.long)
143
+
144
+ # Return the prepared data along with transform and CRS
145
+ return prepared_features_data, label_data, field_ids_data, features_transform, features_crs
146
+
147
+ def crop_predictions_to_gdf(field_ids, targets, predictions, transform, crs, class_names):
148
+ """
149
+ Convert field_ids, targets, and predictions tensors to field polygons with corresponding class reference.
150
+
151
+ :param field_ids: PyTorch tensor of shape (1, 224, 224) representing individual fields
152
+ :param targets: PyTorch tensor of shape (1, 224, 224) for targets
153
+ :param predictions: PyTorch tensor of shape (1, 224, 224) for predictions
154
+ :param transform: Affine transform for georeferencing
155
+ :param crs: Coordinate reference system (CRS) of the data
156
+ :param class_names: Dictionary mapping class indices to class names
157
+ :return: GeoPandas DataFrame with polygons, prediction class labels, and target class labels
158
+ """
159
+ field_array = field_ids.squeeze().cpu().numpy().astype(np.int32)
160
+ target_array = targets.squeeze().cpu().numpy().astype(np.int8)
161
+ pred_array = predictions.squeeze().cpu().numpy().astype(np.int8)
162
+
163
+ polygons = []
164
+ field_values = []
165
+ target_values = []
166
+ pred_values = []
167
+
168
+ for geom, field_value in shapes(field_array, transform=transform):
169
+ polygons.append(shape(geom))
170
+ field_values.append(field_value)
171
+
172
+ # Get a single value from the field area for targets and predictions
173
+ target_value = target_array[field_array == field_value][0]
174
+ pred_value = pred_array[field_array == field_value][0]
175
+
176
+ target_values.append(target_value)
177
+ pred_values.append(pred_value)
178
+
179
+ gdf = gpd.GeoDataFrame({
180
+ 'geometry': polygons,
181
+ 'field_id': field_values,
182
+ 'target': target_values,
183
+ 'prediction': pred_values
184
+ }, crs=crs)
185
+
186
+ gdf['prediction_class'] = gdf['prediction'].apply(lambda x: class_names[x])
187
+ gdf['target_class'] = gdf['target'].apply(lambda x: class_names[x])
188
+
189
+ gdf['correct'] = gdf['target'] == gdf['prediction']
190
+
191
+ gdf = gdf[gdf.geometry.area > 250] # Drop small polygons (area below 250 m² in the projected CRS)
192
+
193
+ return gdf
194
+
195
+ def perform_inference(lon, lat, model, config, debug=False):
196
+ features_path = "./data/stacked_features.tif"
197
+ labels_path = "./data/labels.tif"
198
+ field_ids_path = "./data/field_ids.tif"
199
+ stats_path = "./data/chips_stats.yaml"
200
+
201
+ loader = InferenceDataLoader(features_path, labels_path, field_ids_path, stats_path, n_timesteps=9, fold_indices=[0], debug=debug)
202
+
203
+ # Coordinates must be in EPSG:4326 (lon, lat order); they are converted to the raster CRS (EPSG:32632)
204
+ satellite_data, label_data, field_ids_data, features_transform, features_crs = loader.get_data(lon, lat)
205
+
206
+ if debug:
207
+ # Print the shape of the extracted data
208
+ print(satellite_data.shape)
209
+ print(label_data.shape)
210
+ print(field_ids_data.shape)
211
+
212
+ with open('./data/dataset_info.json', 'r') as file:
213
+ dataset_info = json.load(file)
214
+ class_names = dataset_info['tier3']
215
+
216
+ tiers_dict = {k: v for k, v in config.hparams.get('heads_spec').items() if v.get('is_metrics_tier', False)}
217
+ tiers = list(tiers_dict.keys())
218
+
219
+ # Perform inference
220
+ model.eval()
221
+ with torch.no_grad():
222
+ output = model(satellite_data)['tier3_refinement_head']
223
+
224
+ pixelwise_outputs_stacked, majority_outputs_stacked = LogConfusionMatrix.get_pixelwise_and_majority_outputs(output, tiers, field_ids=field_ids_data, dataset_info=dataset_info)
225
+ majority_tier3_predictions = majority_outputs_stacked[2] # Tier 3 predictions
226
+
227
+ # Convert the predictions to a GeoDataFrame
228
+ gdf = crop_predictions_to_gdf(field_ids_data, label_data, majority_tier3_predictions, features_transform, features_crs, class_names)
229
+
230
+ # Simple GeoDataFrame with only the necessary columns
231
+ gdf = gdf[['prediction_class', 'target_class', 'correct', 'geometry']]
232
+ gdf.columns = ['Prediction', 'Target', 'Correct', 'geometry']
233
+ # gdf = gdf[gdf['Target'] != 'Background']
234
+
235
+ return gdf
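
For orientation, here is a minimal usage sketch of the `InferenceDataLoader` defined above: it pulls one 224×224 window around a coordinate and prints the tensor shapes that would be fed to the model. The paths mirror the hard-coded ones in `perform_inference`; the coordinates are arbitrary example values and must fall inside the rasters (both are assumptions).

```python
# Hedged usage sketch for inference.py: fetch one window and inspect its shapes.
from inference import InferenceDataLoader

loader = InferenceDataLoader(
    features_path="./data/stacked_features.tif",
    labels_path="./data/labels.tif",
    field_ids_path="./data/field_ids.tif",
    stats_path="./data/chips_stats.yaml",
    n_timesteps=9,       # matches perform_inference above
    fold_indices=[0],
    debug=True,
)

# Example WGS84 coordinates (lon, lat order), chosen arbitrarily for illustration.
features, labels, field_ids, transform, crs = loader.get_data(lon=8.54, lat=47.37)
print(features.shape)   # (1, 6, 9, 224, 224): batch, bands, timesteps, height, width
print(labels.shape)     # (label bands / tiers, 224, 224)
print(field_ids.shape)  # (1, 224, 224)
```
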
main.py ADDED
@@ -0,0 +1,15 @@
1
+ import streamlit as st
2
+
3
+ def main():
4
+ st.set_page_config(layout="wide", page_title="Messis 🌾 - Crop Classification 🌎")
5
+
6
+ st.title("Messis 🌾 - Crop Classification 🌎")
7
+
8
+ st.write("Welcome to the Messis Crop Classification app. Use the sidebar to navigate between selecting coordinates and performing inference.")
9
+
10
+ st.page_link("main.py", label="Home", icon="🏠")
11
+ st.page_link("pages/1_Select_Location.py", label="Select Location", icon="📍")
12
+ st.page_link("pages/2_Perform_Crop_Classification.py", label="Perform Crop Classification", icon="🔍")
13
+
14
+ if __name__ == "__main__":
15
+ main()
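
Note that this is the entry point of a Streamlit multipage app: the `st.page_link` calls above point into a `pages/` directory, and the app is launched with `streamlit run main.py`.
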
messis/README.md ADDED
@@ -0,0 +1,7 @@
1
+ # About
2
+
3
+ This package contains the code for the crop classification model Messis.
4
+
5
+ The model is available on Hugging Face at [crop-classification/messis](https://huggingface.co/crop-classification/messis).
6
+
7
+ TODO: Add more information about the model.
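
Since `Messis` (defined in `messis/messis.py`) mixes in `PyTorchModelHubMixin`, the checkpoint linked above can be pulled straight from the Hub. A minimal loading sketch, assuming the linked repo holds mixin-compatible weights and config:

```python
# Hedged loading sketch; assumes the Hub repo above was pushed via PyTorchModelHubMixin.
from messis.messis import Messis

model = Messis.from_pretrained("crop-classification/messis")  # from_pretrained() is provided by the mixin
model.eval()  # ready to be passed to perform_inference() in inference.py
```
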
messis/__init__.py ADDED
File without changes
messis/dataloader.py ADDED
@@ -0,0 +1,287 @@
1
+ import random
2
+ import torch
3
+ from torch.utils.data import Dataset, DataLoader
4
+ from torchvision import transforms
5
+ from pytorch_lightning import LightningDataModule
6
+ import os
7
+ import re
8
+ import yaml
9
+ import rasterio
10
+ import dvc.api
11
+
12
+
13
+ params = dvc.api.params_show()
14
+ N_TIMESTEPS = params['number_of_timesteps']
15
+
16
+ class ToTensorTransform(object):
17
+ def __init__(self, dtype):
18
+ self.dtype = dtype
19
+
20
+ def __call__(self, data):
21
+ return torch.tensor(data, dtype=self.dtype)
22
+
23
+ class NormalizeTransform(object):
24
+ def __init__(self, means, stds):
25
+ self.mean = means
26
+ self.std = stds
27
+
28
+ def __call__(self, data):
29
+ return transforms.Normalize(self.mean, self.std)(data)
30
+
31
+ class PermuteTransform:
32
+ def __call__(self, data):
33
+ height, width = data.shape[-2:]
34
+
35
+ # Ensure the channel dimension is as expected
36
+ if data.shape[0] != N_TIMESTEPS * 6:
37
+ raise ValueError(f"Expected {N_TIMESTEPS*6} channels, got {data.shape[0]}")
38
+
39
+ # Step 1: Reshape the data to group the N_TIMESTEPS*6 bands into N_TIMESTEPS groups of 6 bands
40
+ data = data.view(N_TIMESTEPS, 6, height, width)
41
+
42
+ # Step 2: Permute to bring the bands to the front
43
+ data = data.permute(1, 0, 2, 3) # NOTE: Prithvi wants it bands first # after this, shape is (6, N_TIMESTEPS, height, width)
44
+ return data
45
+
46
+ class RandomFlipAndJitterTransform:
47
+ """
48
+ Apply random horizontal and vertical flips, and channel jitter to the input image and corresponding mask.
49
+
50
+ Parameters:
51
+ -----------
52
+ flip_prob : float, optional (default=0.5)
53
+ Probability of applying horizontal and vertical flips to the image and mask.
54
+ Each flip (horizontal and vertical) is applied independently based on this probability.
55
+
56
+ jitter_std : float, optional (default=0.02)
57
+ Standard deviation of the Gaussian noise added to the image channels for jitter.
58
+ This value controls the intensity of the random noise applied to the image channels.
59
+
60
+ Effects of Parameters:
61
+ ----------------------
62
+ flip_prob:
63
+ - Higher flip_prob increases the likelihood of the image and mask being flipped.
64
+ - A value of 0 means no flipping, while a value of 1 means always flip.
65
+
66
+ jitter_std:
67
+ - Higher jitter_std increases the intensity of the noise added to the image channels.
68
+ - A value of 0 means no noise, while larger values add more significant noise.
69
+ """
70
+ def __init__(self, flip_prob=0.5, jitter_std=0.02):
71
+ self.flip_prob = flip_prob
72
+ self.jitter_std = jitter_std
73
+
74
+ def __call__(self, img, mask, field_ids):
75
+ # Shapes (..., H, W)| img: torch.Size([6, N_TIMESTEPS, 224, 224]), mask: torch.Size([N_TIMESTEPS, 224, 224]), field_ids: torch.Size([1, 224, 224])
76
+
77
+ # Temporarily convert field_ids to int32 for flipping (flip not implemented for uint16)
78
+ field_ids = field_ids.to(torch.int32)
79
+
80
+ # Random horizontal flip
81
+ if random.random() < self.flip_prob:
82
+ img = torch.flip(img, [2])
83
+ mask = torch.flip(mask, [1])
84
+ field_ids = torch.flip(field_ids, [1])
85
+
86
+ # Random vertical flip
87
+ if random.random() < self.flip_prob:
88
+ img = torch.flip(img, [3])
89
+ mask = torch.flip(mask, [2])
90
+ field_ids = torch.flip(field_ids, [2])
91
+
92
+ # Convert field_ids back to uint16
93
+ field_ids = field_ids.to(torch.uint16)
94
+
95
+ # Channel jitter
96
+ noise = torch.randn(img.size()) * self.jitter_std
97
+ img += noise
98
+
99
+ return img, mask, field_ids
100
+
101
+ def get_img_transforms():
102
+ return transforms.Compose([])
103
+
104
+ def get_mask_transforms():
105
+ return transforms.Compose([])
106
+
107
+ class GeospatialDataset(Dataset):
108
+ def __init__(self, data_dir, fold_indicies, transform_img=None, transform_mask=None, transform_field_ids=None, debug=False, subset_size=None, data_augmentation=None):
109
+ self.data_dir = data_dir
110
+ self.chips_dir = os.path.join(data_dir, 'chips')
111
+ self.transform_img = transform_img
112
+ self.transform_mask = transform_mask
113
+ self.transform_field_ids = transform_field_ids
114
+ self.debug = debug
115
+ self.images = []
116
+ self.masks = []
117
+ self.field_ids = []
118
+ self.data_augmentation = data_augmentation
119
+
120
+ self.means, self.stds = self.load_stats(fold_indicies, N_TIMESTEPS)
121
+ self.transform_img_load = self.get_img_load_transforms(self.means, self.stds)
122
+ self.transform_mask_load = self.get_mask_load_transforms()
123
+ self.transform_field_ids_load = self.get_field_ids_load_transforms()
124
+
125
+ # Adjust file selection based on fold
126
+ for file in os.listdir(self.chips_dir):
127
+ if re.match(f".*_fold_[{''.join([str(f) for f in fold_indicies])}]_merged.tif", file):
128
+ self.images.append(file)
129
+ mask_file = file.replace("_merged.tif", "_mask.tif")
130
+ self.masks.append(mask_file)
131
+ field_ids_file = file.replace("_merged.tif", "_field_ids.tif")
132
+ self.field_ids.append(field_ids_file)
133
+
134
+ assert len(self.images) == len(self.masks), "Number of images and masks do not match"
135
+
136
+ # If subset_size is specified, randomly select a subset of the data
137
+ if subset_size is not None and len(self.images) > subset_size:
138
+ print(f"Randomly selecting {subset_size} samples from {len(self.images)} samples.")
139
+ selected_indices = random.sample(range(len(self.images)), subset_size)
140
+ self.images = [self.images[i] for i in selected_indices]
141
+ self.masks = [self.masks[i] for i in selected_indices]
142
+ self.field_ids = [self.field_ids[i] for i in selected_indices]
143
+
144
+ def load_stats(self, fold_indicies, n_timesteps=3):
145
+ """Load normalization statistics for dataset from YAML file."""
146
+ stats_path = os.path.join(self.data_dir, 'chips_stats.yaml')
147
+ if self.debug:
148
+ print(f"Loading mean/std stats from {stats_path}")
149
+ assert os.path.exists(stats_path), f"mean/std stats file for dataset not found at {stats_path}"
150
+ with open(stats_path, 'r') as file:
151
+ stats = yaml.safe_load(file)
152
+ mean_list, std_list, n_list = [], [], []
153
+ for fold in fold_indicies:
154
+ key = f'fold_{fold}'
155
+ if key not in stats:
156
+ raise ValueError(f"mean/std stats for fold {fold} not found in {stats_path}")
157
+ if self.debug:
158
+ print(f"Stats with selected test fold {fold}: {stats[key]} over {n_timesteps} timesteps.")
159
+ mean_list.append(torch.Tensor(stats[key]['mean'])) # list of 6 means
160
+ std_list.append(torch.Tensor(stats[key]['std'])) # list of 6 stds
161
+ n_list.append(stats[key]['n_chips']) # list of 6 ns
162
+ # aggregate means and stds over all folds
163
+ means, stds = [], []
164
+ for channel in range(mean_list[0].shape[0]):
165
+ means.append(torch.stack([mean_list[i][channel] for i in range(len(mean_list))]).mean())
166
+ # stds cannot simply be averaged; combine them as a pooled standard deviation:
167
+ # \sqrt{\frac{\sum_{i=1}^{k} \sigma_i^2 (n_i - 1)}{\sum_{i=1}^{k} n_i - k}}, where k is the number of folds
168
+ variances = torch.stack([std_list[i][channel] ** 2 for i in range(len(std_list))])
169
+ n = torch.tensor([n_list[i] for i in range(len(n_list))], dtype=torch.float32)
170
+ combined_variance = torch.sum(variances * (n - 1)) / (torch.sum(n) - len(n_list))
171
+ stds.append(torch.sqrt(combined_variance))
172
+
173
+ # make means and stds into 2d arrays, as torchvision would otherwise convert it into a 3d tensor which is incompatible with our 4d temporal images
174
+ # https://github.com/pytorch/vision/blob/6e18cea3485066b7277785415bf2e0422dbdb9da/torchvision/transforms/_functional_tensor.py#L923
175
+ return means * n_timesteps, stds * n_timesteps
176
+
177
+ def get_img_load_transforms(self, means, stds):
178
+ return transforms.Compose([
179
+ ToTensorTransform(torch.float32),
180
+ NormalizeTransform(means, stds),
181
+ PermuteTransform()
182
+ ])
183
+
184
+ def get_mask_load_transforms(self):
185
+ return transforms.Compose([
186
+ ToTensorTransform(torch.uint8)
187
+ ])
188
+
189
+ def get_field_ids_load_transforms(self):
190
+ return transforms.Compose([
191
+ ToTensorTransform(torch.uint16)
192
+ ])
193
+
194
+ def __len__(self):
195
+ return len(self.images)
196
+
197
+ def __getitem__(self, idx):
198
+ img_path = os.path.join(self.chips_dir, self.images[idx])
199
+ mask_path = os.path.join(self.chips_dir, self.masks[idx])
200
+ field_ids_path = os.path.join(self.chips_dir, self.field_ids[idx])
201
+
202
+ img = rasterio.open(img_path).read().astype('uint16')
203
+ mask = rasterio.open(mask_path).read().astype('uint8')
204
+ field_ids = rasterio.open(field_ids_path).read().astype('uint16')
205
+
206
+ # Apply our base transforms
207
+ img = self.transform_img_load(img)
208
+ mask = self.transform_mask_load(mask)
209
+ field_ids = self.transform_field_ids_load(field_ids)
210
+
211
+ # Apply additional transforms passed from GeospatialDataModule if applicable
212
+ if self.transform_img is not None:
213
+ img = self.transform_img(img)
214
+ if self.transform_mask is not None:
215
+ mask = self.transform_mask(mask)
216
+ if self.transform_field_ids is not None:
217
+ field_ids = self.transform_field_ids(field_ids)
218
+
219
+ # Apply data augmentation if enabled
220
+ if self.data_augmentation is not None and self.data_augmentation.get('enabled', True):
221
+ img, mask, field_ids = RandomFlipAndJitterTransform(
222
+ flip_prob=self.data_augmentation.get('flip_prob', 0.5),
223
+ jitter_std=self.data_augmentation.get('jitter_std', 0.02)
224
+ )(img, mask, field_ids)
225
+
226
+ # Load targets for given tiers
227
+ num_tiers = mask.shape[0]
228
+ targets = ()
229
+ for i in range(num_tiers):
230
+ targets += (mask[i, :, :].type(torch.long),)
231
+
232
+ return img, (targets, field_ids)
233
+
234
+ class GeospatialDataModule(LightningDataModule):
235
+ def __init__(self, data_dir, train_folds, val_folds, test_folds, batch_size=8, num_workers=4, debug=False, subsets=None, data_augmentation=None):
236
+ super().__init__()
237
+ self.data_dir = data_dir
238
+ self.batch_size = batch_size
239
+ self.num_workers = num_workers
240
+ self.debug = debug
241
+ self.subsets = subsets if subsets is not None else {}
242
+ self.data_augmentation = data_augmentation if data_augmentation is not None else {}
243
+
244
+ GeospatialDataModule.validate_folds(train_folds, val_folds, test_folds)
245
+ self.train_folds = train_folds
246
+ self.val_folds = val_folds
247
+ self.test_folds = test_folds
248
+
249
+ # NOTE: Transforms on this level not used for now
250
+ self.transform_img = get_img_transforms()
251
+ self.transform_mask = get_mask_transforms()
252
+
253
+ @staticmethod
254
+ def validate_folds(train, val, test):
255
+ if train is None or val is None or test is None:
256
+ raise ValueError("All fold sets must be specified")
257
+
258
+ if len(set(train) & set(val)) > 0 or len(set(train) & set(test)) > 0 or len(set(val) & set(test)) > 0:
259
+ raise ValueError("Folds must be mutually exclusive")
260
+
261
+ def setup(self, stage=None):
262
+ print(f"Setting up GeospatialDataModule for stage: {stage}. Data augmentation config: {self.data_augmentation}")
263
+ common_params = {
264
+ 'data_dir': self.data_dir,
265
+ 'debug': self.debug,
266
+ 'data_augmentation': self.data_augmentation
267
+ }
268
+ common_params_val_test = { # Never augment validation or test data
269
+ **common_params,
270
+ 'data_augmentation': {
271
+ 'enabled': False
272
+ }
273
+ }
274
+ if stage in ('fit', None):
275
+ self.train_dataset = GeospatialDataset(fold_indicies=self.train_folds, subset_size=self.subsets.get('train', None), **common_params)
276
+ self.val_dataset = GeospatialDataset(fold_indicies=self.val_folds, subset_size=self.subsets.get('val', None), **common_params_val_test)
277
+ if stage in ('test', None):
278
+ self.test_dataset = GeospatialDataset(fold_indicies=self.test_folds, subset_size=self.subsets.get('test', None), **common_params_val_test)
279
+
280
+ def train_dataloader(self):
281
+ return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=self.num_workers, persistent_workers=True, shuffle=True)
282
+
283
+ def val_dataloader(self):
284
+ return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers, persistent_workers=True)
285
+
286
+ def test_dataloader(self):
287
+ return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=self.num_workers, persistent_workers=True)
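
The fold aggregation in `load_stats` above averages the per-fold channel means but pools the standard deviations using the combined-variance formula from the comment, sqrt( sum_i(std_i^2 * (n_i - 1)) / (sum_i(n_i) - k) ). A tiny numeric sketch of that aggregation, using made-up fold statistics:

```python
# Numeric sketch of the pooled-std aggregation in GeospatialDataset.load_stats();
# the per-fold statistics below are made-up illustration values.
import torch

fold_means = [torch.tensor([0.10, 0.20]), torch.tensor([0.30, 0.40])]  # per-fold channel means
fold_stds  = [torch.tensor([0.05, 0.07]), torch.tensor([0.06, 0.08])]  # per-fold channel stds
fold_n     = [100, 300]                                                 # chips per fold

means, stds = [], []
for c in range(fold_means[0].shape[0]):
    means.append(torch.stack([m[c] for m in fold_means]).mean())
    variances = torch.stack([s[c] ** 2 for s in fold_stds])
    n = torch.tensor(fold_n, dtype=torch.float32)
    pooled_var = torch.sum(variances * (n - 1)) / (torch.sum(n) - len(fold_n))
    stds.append(torch.sqrt(pooled_var))

print([round(m.item(), 4) for m in means])  # [0.2, 0.3] -- plain average of the fold means
print([round(s.item(), 4) for s in stds])   # pooled stds, weighted towards the larger fold
```
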
messis/messis.py ADDED
@@ -0,0 +1,919 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import pytorch_lightning as pl
4
+ from torchmetrics import classification
5
+ import wandb
6
+ from matplotlib import pyplot as plt
7
+ import numpy as np
8
+ import matplotlib.ticker as ticker
9
+ from matplotlib.colors import ListedColormap
10
+ from huggingface_hub import PyTorchModelHubMixin
11
+ from lion_pytorch import Lion
12
+
13
+ import json
14
+
15
+ from messis.prithvi import TemporalViTEncoder, ConvTransformerTokensToEmbeddingNeck, ConvTransformerTokensToEmbeddingBottleneckNeck
16
+
17
+
18
+ def safe_shape(x):
19
+ if isinstance(x, tuple):
20
+ # loop through tuple
21
+ shape_info = '(tuple) : '
22
+ for i in x:
23
+ shape_info += str(i.shape) + ', '
24
+ return shape_info
25
+ if isinstance(x, list):
26
+ # loop through list
27
+ shape_info = '(list) : '
28
+ for i in x:
29
+ shape_info += str(i.shape) + ', '
30
+ return shape_info
31
+ return x.shape
32
+
33
+ class ConvModule(nn.Module):
34
+ """
35
+ A simple convolutional module including Conv, BatchNorm, and ReLU layers.
36
+ """
37
+ def __init__(self, in_channels, out_channels, kernel_size, padding, dilation):
38
+ super(ConvModule, self).__init__()
39
+ self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=1, padding=padding, dilation=dilation, bias=False)
40
+ self.bn = nn.BatchNorm2d(out_channels)
41
+ self.relu = nn.ReLU(inplace=True)
42
+
43
+ def forward(self, x):
44
+ x = self.conv(x)
45
+ x = self.bn(x)
46
+ return self.relu(x)
47
+
48
+ class HierarchicalFCNHead(nn.Module):
49
+ """
50
+ Hierarchical FCN Head for semantic segmentation.
51
+ """
52
+ def __init__(self, in_channels, out_channels, num_classes, num_convs=2, kernel_size=3, dilation=1, dropout_p=0.1, debug=False):
53
+ super(HierarchicalFCNHead, self).__init__()
54
+
55
+ self.debug = debug
56
+
57
+ self.convs = nn.Sequential(*[
58
+ ConvModule(
59
+ in_channels if i == 0 else out_channels,
60
+ out_channels,
61
+ kernel_size,
62
+ padding=dilation * (kernel_size // 2),
63
+ dilation=dilation
64
+ ) for i in range(num_convs)
65
+ ])
66
+
67
+ self.conv_seg = nn.Conv2d(out_channels, num_classes, kernel_size=1)
68
+ self.dropout = nn.Dropout2d(p=dropout_p)
69
+
70
+ def forward(self, x):
71
+ if self.debug:
72
+ print('HierarchicalFCNHead forward INP: ', safe_shape(x))
73
+ x = self.convs(x)
74
+ features = self.dropout(x)
75
+ output = self.conv_seg(features)
76
+ if self.debug:
77
+ print('HierarchicalFCNHead forward features OUT: ', safe_shape(features))
78
+ print('HierarchicalFCNHead forward output OUT: ', safe_shape(output))
79
+ return output, features
80
+
81
+ class LabelRefinementHead(nn.Module):
82
+ """
83
+ Similar to the label refinement module introduced in the ZueriCrop paper, this module refines the predictions for tier 3.
84
+ It takes the raw predictions from head 1, head 2 and head 3 and refines them to produce the final prediction for tier 3.
85
+ According to ZueriCrop, this helps make the predictions more consistent across the different tiers.
86
+ """
87
+ def __init__(self, input_channels, num_classes):
88
+ super(LabelRefinementHead, self).__init__()
89
+
90
+ self.cnn_layers = nn.Sequential(
91
+ # 1x1 Convolutional layer
92
+ nn.Conv2d(in_channels=input_channels, out_channels=128, kernel_size=1, stride=1, padding=0),
93
+ nn.BatchNorm2d(128),
94
+ nn.ReLU(inplace=True),
95
+
96
+ # 3x3 Convolutional layer
97
+ nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
98
+ nn.BatchNorm2d(128),
99
+ nn.ReLU(inplace=True),
100
+ nn.Dropout(p=0.5),
101
+
102
+ # Skip connection (implemented in forward method)
103
+
104
+ # Another 3x3 Convolutional layer
105
+ nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
106
+ nn.BatchNorm2d(128),
107
+ nn.ReLU(inplace=True),
108
+
109
+ # 1x1 Convolutional layer to adjust the number of output channels to num_classes
110
+ nn.Conv2d(in_channels=128, out_channels=num_classes, kernel_size=1, stride=1, padding=0),
111
+ nn.Dropout(p=0.5)
112
+ )
113
+
114
+ def forward(self, x):
115
+ # Apply initial conv layer
116
+ y = self.cnn_layers[0:3](x)
117
+
118
+ # Save for skip connection
119
+ y_skip = y
120
+
121
+ # Apply the next two conv layers
122
+ y = self.cnn_layers[3:9](y)
123
+
124
+ # Skip connection (element-wise addition)
125
+ y = y + y_skip
126
+
127
+ # Apply the last conv layer
128
+ y = self.cnn_layers[9:](y)
129
+ return y
130
+
131
+ class HierarchicalClassifier(nn.Module):
132
+ def __init__(
133
+ self,
134
+ heads_spec,
135
+ dropout_p=0.1,
136
+ img_size=256,
137
+ patch_size=16,
138
+ num_frames=3,
139
+ bands=[0, 1, 2, 3, 4, 5],
140
+ backbone_weights_path=None,
141
+ freeze_backbone=True,
142
+ use_bottleneck_neck=False,
143
+ bottleneck_reduction_factor=4,
144
+ loss_ignore_background=False,
145
+ debug=False
146
+ ):
147
+ super(HierarchicalClassifier, self).__init__()
148
+
149
+ self.embed_dim = 768
150
+ if num_frames % 3 != 0:
151
+ raise ValueError(f"The number of frames must be a multiple of 3, but is {num_frames}")
152
+ self.num_frames = num_frames
153
+ self.hp, self.wp = img_size // patch_size, img_size // patch_size
154
+ self.heads_spec = heads_spec
155
+ self.dropout_p = dropout_p
156
+ self.loss_ignore_background = loss_ignore_background
157
+ self.debug = debug
158
+
159
+ if self.debug:
160
+ print('hp and wp: ', self.hp, self.wp)
161
+
162
+ self.prithvi = TemporalViTEncoder(
163
+ img_size=img_size,
164
+ patch_size=patch_size,
165
+ num_frames=3,
166
+ tubelet_size=1,
167
+ in_chans=len(bands),
168
+ embed_dim=self.embed_dim,
169
+ depth=12,
170
+ num_heads=8,
171
+ mlp_ratio=4.0,
172
+ norm_pix_loss=False,
173
+ pretrained=backbone_weights_path,
174
+ debug=self.debug
175
+ )
176
+
177
+ # (Un)freeze the backbone
178
+ for param in self.prithvi.parameters():
179
+ param.requires_grad = not freeze_backbone
180
+
181
+ # Neck to transform the token-based output of the transformer into a spatial feature map
182
+ number_of_necks = self.num_frames // 3
183
+ if use_bottleneck_neck:
184
+ self.necks = nn.ModuleList([ConvTransformerTokensToEmbeddingBottleneckNeck(
185
+ embed_dim=self.embed_dim * 3,
186
+ output_embed_dim=self.embed_dim * 3,
187
+ drop_cls_token=True,
188
+ Hp=self.hp,
189
+ Wp=self.wp,
190
+ bottleneck_reduction_factor=bottleneck_reduction_factor
191
+ ) for _ in range(number_of_necks)])
192
+ else:
193
+ self.necks = nn.ModuleList([ConvTransformerTokensToEmbeddingNeck(
194
+ embed_dim=self.embed_dim * 3,
195
+ output_embed_dim=self.embed_dim * 3,
196
+ drop_cls_token=True,
197
+ Hp=self.hp,
198
+ Wp=self.wp,
199
+ ) for _ in range(number_of_necks)])
200
+
201
+ # Initialize heads and loss weights based on tiers
202
+ self.heads = nn.ModuleDict()
203
+ self.loss_weights = {}
204
+ self.total_classes = 0
205
+
206
+ # Build HierarchicalFCNHeads
207
+ head_count = 0
208
+ for head_name, head_info in self.heads_spec.items():
209
+ head_type = head_info['type']
210
+ num_classes = head_info['num_classes_to_predict']
211
+ loss_weight = head_info['loss_weight']
212
+
213
+ if head_type == 'HierarchicalFCNHead':
214
+ num_classes = head_info['num_classes_to_predict']
215
+ loss_weight = head_info['loss_weight']
216
+ kernel_size = head_info.get('kernel_size', 3)
217
+ num_convs = head_info.get('num_convs', 1)
218
+ num_channels = head_info.get('num_channels', 256)
219
+ self.total_classes += num_classes
220
+
221
+ self.heads[head_name] = HierarchicalFCNHead(
222
+ in_channels=(self.embed_dim * self.num_frames) if head_count == 0 else num_channels,
223
+ out_channels=num_channels,
224
+ num_classes=num_classes,
225
+ num_convs=num_convs,
226
+ kernel_size=kernel_size,
227
+ dropout_p=self.dropout_p,
228
+ debug=self.debug
229
+ )
230
+ self.loss_weights[head_name] = loss_weight
231
+
232
+ # NOTE: LabelRefinementHead must be the last in the dict, otherwise the total_classes will be incorrect
233
+ if head_type == 'LabelRefinementHead':
234
+ self.refinement_head = LabelRefinementHead(input_channels=self.total_classes, num_classes=num_classes)
235
+ self.refinement_head_name = head_name
236
+ self.loss_weights[head_name] = loss_weight
237
+
238
+ head_count += 1
239
+
240
+ self.loss_func = nn.CrossEntropyLoss(ignore_index=-1)
241
+
242
+ def forward(self, x):
243
+ if self.debug:
244
+ print(f"Input shape: {safe_shape(x)}") # torch.Size([4, 6, 9, 224, 224])
245
+
246
+ # Extract features from the base model
247
+ if len(self.necks) == 1:
248
+ features = [x]
249
+ else:
250
+ features = torch.chunk(x, len(self.necks), dim=2)
251
+ features = [self.prithvi(x) for x in features]
252
+
253
+ if self.debug:
254
+ print(f"Features shape after base model: {', '.join([safe_shape(f) for f in features])}") # (tuple) : torch.Size([4, 589, 768]), , (tuple) : torch.Size
255
+
256
+ # Process through the neck
257
+ features = [neck(feat_) for feat_, neck in zip(features, self.necks)]
258
+
259
+ if self.debug:
260
+ print(f"Features shape after neck: {', '.join([safe_shape(f) for f in features])}") # (tuple) : torch.Size([4, 2304, 224, 224]), , (tuple) : torch.Size
261
+
262
+ # Remove from tuple
263
+ features = [feat[0] for feat in features]
264
+ # concatenate the per-neck feature maps along the channel dim, e.g. torch.Size([4, 6912, 224, 224])
265
+ features = torch.concatenate(features, dim=1)
266
+ if self.debug:
267
+ print(f"Features shape after removing tuple: {safe_shape(features)}") # torch.Size([4, 6912, 224, 224])
268
+
269
+ # Process through the heads
270
+ outputs = {}
271
+ for tier_name, head in self.heads.items():
272
+ output, features = head(features)
273
+ outputs[tier_name] = output
274
+
275
+ if self.debug:
276
+ print(f"Features shape after {tier_name} head: {safe_shape(features)}")
277
+ print(f"Output shape after {tier_name} head: {safe_shape(output)}")
278
+
279
+ # Process through the classification refinement head
280
+ output_concatenated = torch.cat(list(outputs.values()), dim=1)
281
+ output_refinement_head = self.refinement_head(output_concatenated)
282
+ outputs[self.refinement_head_name] = output_refinement_head
283
+
284
+ return outputs
285
+
286
+ def calculate_loss(self, outputs, targets):
287
+ total_loss = 0
288
+ loss_per_head = {}
289
+ for head_name, output in outputs.items():
290
+ if self.debug:
291
+ print(f"Target index for {head_name}: {self.heads_spec[head_name]['target_idx']}")
292
+ target = targets[self.heads_spec[head_name]['target_idx']]
293
+ loss_target = target
294
+ if self.loss_ignore_background:
295
+ loss_target = target.clone() # Clone as original target needed in backward pass
296
+ loss_target[loss_target == 0] = -1 # Set background class to ignore_index -1 for loss calculation
297
+ loss = self.loss_func(output, loss_target)
298
+ loss_per_head[f'{head_name}'] = loss
299
+ total_loss += loss * self.loss_weights[head_name]
300
+
301
+ return total_loss, loss_per_head
302
+
303
+ class Messis(pl.LightningModule, PyTorchModelHubMixin):
304
+ def __init__(self, hparams):
305
+ super().__init__()
306
+ self.save_hyperparameters(hparams)
307
+
308
+ self.model = HierarchicalClassifier(
309
+ heads_spec=hparams['heads_spec'],
310
+ dropout_p=hparams.get('dropout_p'),
311
+ img_size=hparams.get('img_size'),
312
+ patch_size=hparams.get('patch_size'),
313
+ num_frames=hparams.get('num_frames'),
314
+ bands=hparams.get('bands'),
315
+ backbone_weights_path=hparams.get('backbone_weights_path'),
316
+ freeze_backbone=hparams['freeze_backbone'],
317
+ use_bottleneck_neck=hparams.get('use_bottleneck_neck'),
318
+ bottleneck_reduction_factor=hparams.get('bottleneck_reduction_factor'),
319
+ loss_ignore_background=hparams.get('loss_ignore_background'),
320
+ debug=hparams.get('debug')
321
+ )
322
+
323
+ def forward(self, x):
324
+ return self.model(x)
325
+
326
+ def training_step(self, batch, batch_idx):
327
+ return self.__step(batch, batch_idx, "train")
328
+
329
+ def validation_step(self, batch, batch_idx):
330
+ return self.__step(batch, batch_idx, "val")
331
+
332
+ def test_step(self, batch, batch_idx):
333
+ return self.__step(batch, batch_idx, "test")
334
+
335
+ def configure_optimizers(self):
336
+ # select case on optimizer
337
+ match self.hparams.get('optimizer', 'Adam'):
338
+ case 'Adam':
339
+ optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.get('lr', 1e-3))
340
+ case 'AdamW':
341
+ optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.get('lr', 1e-3), weight_decay=self.hparams.get('optimizer_weight_decay', 0.01))
342
+ case 'SGD':
343
+ optimizer = torch.optim.SGD(self.parameters(), lr=self.hparams.get('lr', 1e-3), momentum=self.hparams.get('optimizer_momentum', 0.9))
344
+ case 'Lion':
345
+ # https://github.com/lucidrains/lion-pytorch | Typically lr 3-10 times lower than Adam and weight_decay 3-10 times higher
346
+ optimizer = Lion(self.parameters(), lr=self.hparams.get('lr', 1e-4), weight_decay=self.hparams.get('optimizer_weight_decay', 0.1))
347
+ case _:
348
+ raise ValueError(f"Optimizer {self.hparams.get('optimizer')} not supported")
349
+ return optimizer
350
+
351
+ def __step(self, batch, batch_idx, stage):
352
+ inputs, targets = batch
353
+ targets = torch.stack(targets[0])
354
+ outputs = self(inputs)
355
+ loss, loss_per_head = self.model.calculate_loss(outputs, targets)
356
+ loss_per_head_named = {f'{stage}_loss_{head}': loss_per_head[head] for head in loss_per_head}
357
+ loss_proportions = { f'{stage}_loss_{head}_proportion': round(loss_per_head[head].item() / loss.item(), 2) for head in loss_per_head}
358
+ loss_detail_dict = {**loss_per_head_named, **loss_proportions}
359
+
360
+ if self.hparams.get('debug'):
361
+ print(f"Step Inputs shape: {safe_shape(inputs)}")
362
+ print(f"Step Targets shape: {safe_shape(targets)}")
363
+ print(f"Step Outputs dict keys: {outputs.keys()}")
364
+
365
+ # NOTE: All metrics other than loss are tracked by callbacks (LogMessisMetrics)
366
+ self.log_dict({f'{stage}_loss': loss, **loss_detail_dict}, on_step=True, on_epoch=True, prog_bar=True, logger=True)
367
+ return {'loss': loss, 'outputs': outputs}
368
+
369
+ class LogConfusionMatrix(pl.Callback):
370
+ def __init__(self, hparams, dataset_info_file, debug=False):
371
+ super().__init__()
372
+
373
+ assert hparams.get('heads_spec') is not None, "heads_spec must be defined in the hparams"
374
+ self.tiers_dict = {k: v for k, v in hparams.get('heads_spec').items() if v.get('is_metrics_tier', False)}
375
+ self.last_tier_name = next((k for k, v in hparams.get('heads_spec').items() if v.get('is_last_tier', False)), None)
376
+ self.final_head_name = next((k for k, v in hparams.get('heads_spec').items() if v.get('is_final_head', False)), None)
377
+
378
+ assert self.last_tier_name is not None, "No tier found with 'is_last_tier' set to True"
379
+ assert self.final_head_name is not None, "No head found with 'is_final_head' set to True"
380
+
381
+ self.tiers = list(self.tiers_dict.keys())
382
+ self.phases = ['train', 'val', 'test']
383
+ self.modes = ['pixelwise', 'majority']
384
+ self.debug = debug
385
+
386
+ if debug:
387
+ print(f"Final head identified as: {self.final_head_name}")
388
+ print(f"LogConfusionMatrix Metrics over | Phases: {self.phases}, Tiers: {self.tiers}, Modes: {self.modes}")
389
+
390
+ with open(dataset_info_file, 'r') as f:
391
+ self.dataset_info = json.load(f)
392
+
393
+ # Initialize confusion matrices
394
+ self.metrics_to_compute = ['confusion_matrix']
395
+ self.metrics = {phase: {tier: {mode: self.__init_metrics(tier, phase) for mode in self.modes} for tier in self.tiers} for phase in self.phases}
396
+
397
+ def __init_metrics(self, tier, phase):
398
+ num_classes = self.tiers_dict[tier]['num_classes_to_predict']
399
+ confusion_matrix = classification.MulticlassConfusionMatrix(num_classes=num_classes)
400
+
401
+ return {
402
+ 'confusion_matrix': confusion_matrix
403
+ }
404
+
405
+ def setup(self, trainer, pl_module, stage=None):
406
+ # Move all metrics to the correct device at the start of the training/validation
407
+ device = pl_module.device
408
+ for phase_metrics in self.metrics.values():
409
+ for tier_metrics in phase_metrics.values():
410
+ for mode_metrics in tier_metrics.values():
411
+ for metric in self.metrics_to_compute:
412
+ mode_metrics[metric].to(device)
413
+
414
+ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
415
+ self.__update_confusion_matrices(trainer, pl_module, outputs, batch, batch_idx, 'train')
416
+
417
+ def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
418
+ self.__update_confusion_matrices(trainer, pl_module, outputs, batch, batch_idx, 'val')
419
+
420
+ def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
421
+ self.__update_confusion_matrices(trainer, pl_module, outputs, batch, batch_idx, 'test')
422
+
423
+ def __update_confusion_matrices(self, trainer, pl_module, outputs, batch, batch_idx, phase):
424
+ if trainer.sanity_checking:
425
+ return
426
+
427
+ targets = torch.stack(batch[1][0]) # (tiers, batch, H, W)
428
+ outputs = outputs['outputs'][self.final_head_name] # (batch, C, H, W)
429
+ field_ids = batch[1][1].permute(1, 0, 2, 3)[0]
430
+
431
+ pixelwise_outputs, majority_outputs = LogConfusionMatrix.get_pixelwise_and_majority_outputs(outputs, self.tiers, field_ids, self.dataset_info)
432
+
433
+ for preds, mode in zip([pixelwise_outputs, majority_outputs], self.modes):
434
+ # Update all metrics
435
+ assert len(preds) == len(targets), f"Number of predictions and targets do not match: {len(preds)} vs {len(targets)}"
436
+ assert len(preds) == len(self.tiers), f"Number of predictions and tiers do not match: {len(preds)} vs {len(self.tiers)}"
437
+
438
+ for pred, target, tier in zip(preds, targets, self.tiers):
439
+ if self.debug:
440
+ print(f"Updating confusion matrix for {phase} {tier} {mode}")
441
+ metrics = self.metrics[phase][tier][mode]
442
+ # flatten and remove background class if the mode is majority (such that the background class is not included in the confusion matrix)
443
+ if mode == 'majority':
444
+ pred = pred[target != 0]
445
+ target = target[target != 0]
446
+ metrics['confusion_matrix'].update(pred, target)
447
+
448
+
449
+ @staticmethod
450
+ def get_pixelwise_and_majority_outputs(refinement_head_outputs, tiers, field_ids, dataset_info):
451
+ """
452
+ Get the pixelwise and majority predictions from the model outputs.
453
+ The pixelwise tier predictions are derived from the refinement_head_outputs predictions.
454
+ The majority last tier predictions are derived from the refinement_head_outputs. And then the majority lower-tier predictions are derived from the majority highest-tier predictions.
455
+
456
+ Also sets the background to 0 in all field-majority predictions (regardless of what the model predicts for background pixels),
457
+ since this is a field classification task rather than a segmentation task: the field boundaries are known beforehand and are not of interest.
458
+
459
+ Args:
460
+ refinement_head_outputs (torch.Tensor(batch, C, H, W)): The probability outputs from the model for the refined tier.
461
+ tiers (list of str): List of tiers e.g. ['tier1', 'tier2', 'tier3'].
462
+ field_ids (torch.Tensor(batch, H, W)): The field IDs for each prediction.
463
+ dataset_info (dict): The dataset information.
464
+
465
+ Returns:
466
+ torch.Tensor(tiers, batch, H, W): The pixelwise predictions.
467
+ torch.Tensor(tiers, batch, H, W): The majority predictions.
468
+ """
469
+
470
+ # Assuming the highest tier is the last one in the list
471
+ highest_tier = tiers[-1]
472
+
473
+ pixelwise_highest_tier = torch.softmax(refinement_head_outputs, dim=1).argmax(dim=1) # (batch, H, W)
474
+ majority_highest_tier = LogConfusionMatrix.get_field_majority_preds(refinement_head_outputs, field_ids)
475
+
476
+ tier_mapping = {tier: dataset_info[f'{highest_tier}_to_{tier}'] for tier in tiers if tier != highest_tier}
477
+
478
+ pixelwise_outputs = {highest_tier: pixelwise_highest_tier}
479
+ majority_outputs = {highest_tier: majority_highest_tier}
480
+
481
+ # Initialize pixelwise and majority outputs for each tier
482
+ for tier in tiers:
483
+ if tier != highest_tier:
484
+ pixelwise_outputs[tier] = torch.zeros_like(pixelwise_highest_tier)
485
+ majority_outputs[tier] = torch.zeros_like(majority_highest_tier)
486
+
487
+ # Map the highest tier to lower tiers
488
+ for i, mappings in enumerate(zip(*tier_mapping.values())):
489
+ for j, tier in enumerate(tier_mapping.keys()):
490
+ pixelwise_outputs[tier][pixelwise_highest_tier == i] = mappings[j]
491
+ majority_outputs[tier][majority_highest_tier == i] = mappings[j]
492
+
493
+ pixelwise_outputs_stacked = torch.stack([pixelwise_outputs[tier] for tier in tiers])
494
+ majority_outputs_stacked = torch.stack([majority_outputs[tier] for tier in tiers])
495
+
496
+ # Ensure these are tensors
497
+ assert isinstance(pixelwise_outputs_stacked, torch.Tensor), "pixelwise_outputs_stacked is not a tensor"
498
+ assert isinstance(majority_outputs_stacked, torch.Tensor), "majority_outputs_stacked is not a tensor"
499
+
500
+ return pixelwise_outputs_stacked, majority_outputs_stacked
501
+
502
+
503
+ @staticmethod
504
+ def get_field_majority_preds(output, field_ids):
505
+ """
506
+ Get the majority prediction for each field in the batch. The majority excludes the background class.
507
+
508
+ Args:
509
+ output (torch.Tensor(batch, C, H, W)): The probability outputs from the model (tier3_refined)
510
+ field_ids (torch.Tensor(batch, H, W)): The field IDs for each prediction.
511
+
512
+ Returns:
513
+ torch.Tensor(batch, H, W): The majority predictions.
514
+ """
515
+ # remove the background class
516
+ pixelwise = torch.softmax(output[:, 1:, :, :], dim=1).argmax(dim=1) + 1 # (batch, H, W)
517
+ majority_preds = torch.zeros_like(pixelwise)
518
+ for batch in range(len(pixelwise)):
519
+ field_ids_batch = field_ids[batch]
520
+ for field_id in np.unique(field_ids_batch.cpu().numpy()):
521
+ if field_id == 0:
522
+ continue
523
+ field_mask = field_ids_batch == field_id
524
+ flattened_pred = pixelwise[batch][field_mask].view(-1) # Flatten the prediction
525
+ flattened_pred = flattened_pred[flattened_pred != 0] # Exclude background class
526
+ if len(flattened_pred) == 0:
527
+ continue
528
+ mode_pred, _ = torch.mode(flattened_pred) # Compute mode prediction
529
+ majority_preds[batch][field_mask] = mode_pred.item()
530
+ return majority_preds
531
+
532
+ def on_train_epoch_end(self, trainer, pl_module):
533
+ # Log and then reset the confusion matrices after training epoch
534
+ self.__log_and_reset_confusion_matrices(trainer, pl_module, 'train')
535
+
536
+ def on_validation_epoch_end(self, trainer, pl_module):
537
+ # Log and then reset the confusion matrices after validation epoch
538
+ self.__log_and_reset_confusion_matrices(trainer, pl_module, 'val')
539
+
540
+ def on_test_epoch_end(self, trainer, pl_module):
541
+ # Log and then reset the confusion matrices after test epoch
542
+ self.__log_and_reset_confusion_matrices(trainer, pl_module, 'test')
543
+
544
+ def __log_and_reset_confusion_matrices(self, trainer, pl_module, phase):
545
+ if trainer.sanity_checking:
546
+ return
547
+
548
+ for tier in self.tiers:
549
+ for mode in self.modes:
550
+ metrics = self.metrics[phase][tier][mode]
551
+ confusion_matrix = metrics['confusion_matrix']
552
+ if self.debug:
553
+ print(f"Logging and resetting confusion matrix for {phase} {tier} Update count: {confusion_matrix._update_count}")
554
+ matrix = confusion_matrix.compute() # columns are predictions and rows are targets
555
+
556
+ # Calculate percentages
557
+ matrix = matrix.float()
558
+ row_sums = matrix.sum(dim=1, keepdim=True)
559
+ matrix_percent = matrix / row_sums
560
+
561
+ # Ensure percentages sum to 1 for each row or handle NaNs
562
+ row_sum_check = matrix_percent.sum(dim=1)
563
+ valid_rows = ~torch.isnan(row_sum_check)
564
+ if valid_rows.any():
565
+ assert torch.allclose(row_sum_check[valid_rows], torch.ones_like(row_sum_check[valid_rows]), atol=1e-2), "Percentages do not sum to 1 for some valid rows"
566
+
567
+ # Sort the matrix and labels by the total number of instances
568
+ sorted_indices = row_sums.squeeze().argsort(descending=True)
569
+ matrix_percent = matrix_percent[sorted_indices, :] # sort rows
570
+ matrix_percent = matrix_percent[:, sorted_indices] # sort columns
571
+ class_labels = [self.dataset_info[tier][i] for i in sorted_indices]
572
+ row_sums_sorted = row_sums[sorted_indices]
573
+
574
+ # Check for zero rows after sorting
575
+ zero_rows = (row_sums_sorted == 0).squeeze()
576
+
577
+ fig, ax = plt.subplots(figsize=(matrix.size(0), matrix.size(0)), dpi=140)
578
+
579
+ ax.matshow(matrix_percent.cpu().numpy(), cmap='viridis')
580
+
581
+ ax.xaxis.set_major_locator(ticker.FixedLocator(range(matrix.size(1) + 1)))
582
+ ax.yaxis.set_major_locator(ticker.FixedLocator(range(matrix.size(0) + 1)))
583
+
584
+ ax.set_xticklabels(class_labels + [''], rotation=45)
585
+ ax.set_yticklabels(class_labels + [''])
586
+
587
+ # Add total number of instances to the y-axis labels
588
+ y_labels = [f'{class_labels[i]} [n={int(row_sums_sorted[i].item()):,.0f}]'.replace(',', "'") for i in range(matrix.size(0))]
589
+ ax.set_yticklabels(y_labels + [''])
590
+
591
+ ax.set_xlabel('Predictions')
592
+ ax.set_ylabel('Targets')
593
+
594
+ # Move x-axis label and ticks to the top
595
+ ax.xaxis.set_label_position('top')
596
+ ax.xaxis.set_ticks_position('top')
597
+
598
+ fig.tight_layout()
599
+
600
+ for i in range(matrix.size(0)):
601
+ for j in range(matrix.size(1)):
602
+ if zero_rows[i]:
603
+ ax.text(j, i, 'N/A', ha='center', va='center', color='black')
604
+ else:
605
+ ax.text(j, i, f'{matrix_percent[i, j]:.2f}', ha='center', va='center', color='#F88379', weight='bold') # coral red
606
+ trainer.logger.experiment.log({f"{phase}_{tier}_confusion_matrix_{mode}": wandb.Image(fig)})
607
+ plt.close()
608
+ confusion_matrix.reset()
609
+
610
+ class LogMessisMetrics(pl.Callback):
611
+ def __init__(self, hparams, dataset_info_file, debug=False):
612
+ super().__init__()
613
+
614
+ assert hparams.get('heads_spec') is not None, "heads_spec must be defined in the hparams"
615
+ self.tiers_dict = {k: v for k, v in hparams.get('heads_spec').items() if v.get('is_metrics_tier', False)}
616
+ self.last_tier_name = next((k for k, v in hparams.get('heads_spec').items() if v.get('is_last_tier', False)), None)
617
+ self.final_head_name = next((k for k, v in hparams.get('heads_spec').items() if v.get('is_final_head', False)), None)
618
+
619
+ assert self.last_tier_name is not None, "No tier found with 'is_last_tier' set to True"
620
+ assert self.final_head_name is not None, "No head found with 'is_final_head' set to True"
621
+
622
+ self.tiers = list(self.tiers_dict.keys())
623
+ self.phases = ['train', 'val', 'test']
624
+ self.modes = ['pixelwise', 'majority']
625
+ self.debug = debug
626
+
627
+ if debug:
628
+ print(f"Last tier identified as: {self.last_tier_name}")
629
+ print(f"Final head identified as: {self.final_head_name}")
630
+ print(f"LogMessisMetrics Metrics over | Phases: {self.phases}, Tiers: {self.tiers}, Modes: {self.modes}")
631
+
632
+ with open(dataset_info_file, 'r') as f:
633
+ self.dataset_info = json.load(f)
634
+
635
+ # Initialize metrics
636
+ self.metrics_to_compute = ['accuracy', 'weighted_accuracy', 'precision', 'weighted_precision', 'recall', 'weighted_recall' ,'f1', 'weighted_f1', 'cohen_kappa']
637
+ self.metrics = {phase: {tier: {mode: self.__init_metrics(tier, phase) for mode in self.modes} for tier in self.tiers} for phase in self.phases}
638
+ self.images_to_log = {phase: {mode: None for mode in self.modes} for phase in self.phases}
639
+ self.images_to_log_targets = {phase: None for phase in self.phases}
640
+ self.field_ids_to_log_targets = {phase: None for phase in self.phases}
641
+ self.inputs_to_log = {phase: None for phase in self.phases}
642
+
643
+ def __init_metrics(self, tier, phase):
644
+ num_classes = self.tiers_dict[tier]['num_classes_to_predict']
645
+
646
+ accuracy = classification.MulticlassAccuracy(num_classes=num_classes, average='macro')
647
+ weighted_accuracy = classification.MulticlassAccuracy(num_classes=num_classes, average='weighted')
648
+ per_class_accuracies = {
649
+ class_index: classification.BinaryAccuracy() for class_index in range(num_classes)
650
+ }
651
+ precision = classification.MulticlassPrecision(num_classes=num_classes, average='macro')
652
+ weighted_precision = classification.MulticlassPrecision(num_classes=num_classes, average='weighted')
653
+ recall = classification.MulticlassRecall(num_classes=num_classes, average='macro')
654
+ weighted_recall = classification.MulticlassRecall(num_classes=num_classes, average='weighted')
655
+ f1 = classification.MulticlassF1Score(num_classes=num_classes, average='macro')
656
+ weighted_f1 = classification.MulticlassF1Score(num_classes=num_classes, average='weighted')
657
+ cohen_kappa = classification.MulticlassCohenKappa(num_classes=num_classes)
658
+
659
+ return {
660
+ 'accuracy': accuracy,
661
+ 'weighted_accuracy': weighted_accuracy,
662
+ 'per_class_accuracies': per_class_accuracies,
663
+ 'precision': precision,
664
+ 'weighted_precision': weighted_precision,
665
+ 'recall': recall,
666
+ 'weighted_recall': weighted_recall,
667
+ 'f1': f1,
668
+ 'weighted_f1': weighted_f1,
669
+ 'cohen_kappa': cohen_kappa
670
+ }
671
+
672
+ def setup(self, trainer, pl_module, stage=None):
673
+ # Move all metrics to the correct device at the start of the training/validation
674
+ device = pl_module.device
675
+ for phase_metrics in self.metrics.values():
676
+ for tier_metrics in phase_metrics.values():
677
+ for mode_metrics in tier_metrics.values():
678
+ for metric in self.metrics_to_compute:
679
+ mode_metrics[metric].to(device)
680
+ for class_accuracy in mode_metrics['per_class_accuracies'].values():
681
+ class_accuracy.to(device)
682
+
683
+ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
684
+ self.__on_batch_end(trainer, pl_module, outputs, batch, batch_idx, 'train')
685
+
686
+ def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
687
+ self.__on_batch_end(trainer, pl_module, outputs, batch, batch_idx, 'val')
688
+
689
+ def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
690
+ self.__on_batch_end(trainer, pl_module, outputs, batch, batch_idx, 'test')
691
+
692
+ def __on_batch_end(self, trainer: pl.Trainer, pl_module, outputs, batch, batch_idx, phase):
693
+ if trainer.sanity_checking:
694
+ return
695
+ if self.debug:
696
+ print(f"{phase} batch ended. Updating metrics...")
697
+
698
+ targets = torch.stack(batch[1][0]) # (tiers, batch, H, W)
699
+ outputs = outputs['outputs'][self.final_head_name] # (batch, C, H, W)
700
+ field_ids = batch[1][1].permute(1, 0, 2, 3)[0]
701
+
702
+ pixelwise_outputs, majority_outputs = LogConfusionMatrix.get_pixelwise_and_majority_outputs(outputs, self.tiers, field_ids, self.dataset_info)
703
+
704
+ for preds, mode in zip([pixelwise_outputs, majority_outputs], self.modes):
705
+
706
+ # Update all metrics
707
+ assert preds.shape == targets.shape, f"Shapes of predictions and targets do not match: {preds.shape} vs {targets.shape}"
708
+ assert preds.shape[0] == len(self.tiers), f"Number of tiers in predictions and tiers do not match: {preds.shape[0]} vs {len(self.tiers)}"
709
+
710
+ self.images_to_log[phase][mode] = preds[-1]
711
+
712
+ for pred, target, tier in zip(preds, targets, self.tiers):
713
+ # flatten and remove background class if the mode is majority (such that the background class is not considered in the metrics)
714
+ if mode == 'majority':
715
+ pred = pred[target != 0]
716
+ target = target[target != 0]
717
+ metrics = self.metrics[phase][tier][mode]
718
+ for metric in self.metrics_to_compute:
719
+ metrics[metric].update(pred, target)
720
+ if self.debug:
721
+ print(f"{phase} {tier} {mode} {metric} updated. Update count: {metrics[metric]._update_count}")
722
+ self.__update_per_class_metrics(pred, target, metrics['per_class_accuracies'])
723
+
724
+ self.images_to_log_targets[phase] = targets[-1]
725
+ self.field_ids_to_log_targets[phase] = field_ids
726
+ self.inputs_to_log[phase] = batch[0]
727
+
728
+ def __update_per_class_metrics(self, preds, targets, per_class_accuracies):
729
+ for class_index, class_accuracy in per_class_accuracies.items():
730
+ if not (targets == class_index).any():
731
+ continue
732
+
733
+ if class_index == 0:
734
+ # Mask out non-background elements for background class (0)
735
+ class_mask = targets != 0
736
+ else:
737
+ # Mask out background elements for other classes
738
+ class_mask = targets == 0
739
+
740
+ preds_fields = preds[~class_mask]
741
+ targets_fields = targets[~class_mask]
742
+
743
+ # Prepare for binary classification (needs to be float)
744
+ preds_class = (preds_fields == class_index).float()
745
+ targets_class = (targets_fields == class_index).float()
746
+
747
+ class_accuracy.update(preds_class, targets_class)
748
+
749
+ if self.debug:
750
+ print(f"Shape of preds_fields: {preds_fields.shape}")
751
+ print(f"Shape of targets_fields: {targets_fields.shape}")
752
+ print(f"Unique values in preds_fields: {torch.unique(preds_fields)}")
753
+ print(f"Unique values in targets_fields: {torch.unique(targets_fields)}")
754
+ print(f"Per-class metrics for class {class_index} updated. Update count: {per_class_accuracies[class_index]._update_count}")
755
+
756
+ def on_train_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
757
+ self.__on_epoch_end(trainer, pl_module, 'train')
758
+
759
+ def on_validation_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
760
+ self.__on_epoch_end(trainer, pl_module, 'val')
761
+
762
+ def on_test_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
763
+ self.__on_epoch_end(trainer, pl_module, 'test')
764
+
765
+ def __on_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule, phase):
766
+ if trainer.sanity_checking:
767
+ return # Skip during sanity check (avoid warning about metric compute being called before update)
768
+ for tier in self.tiers:
769
+ for mode in self.modes:
770
+ metrics = self.metrics[phase][tier][mode]
771
+
772
+ # Calculate and reset in tier: Accuracy, WeightedAccuracy, Precision, Recall, F1, Cohen's Kappa
773
+ metrics_dict = {metric: metrics[metric].compute() for metric in self.metrics_to_compute}
774
+ pl_module.log_dict({f"{phase}_{metric}_{tier}_{mode}": v for metric, v in metrics_dict.items()}, on_step=False, on_epoch=True)
775
+ for metric in self.metrics_to_compute:
776
+ metrics[metric].reset()
777
+
778
+ # Per-class metrics
779
+ # NOTE: Some literature reports "per class accuracy" but what they actually mean is "per class recall".
780
+ # Using the accuracy formula per class has no value in our imbalanced multi-class setting (TNs inflate the scores!)
781
+ # We calculate all 4 metrics. This allows us to calculate any macro/micro score later if needed.
782
+ class_metrics = []
783
+ class_names_mapping = self.dataset_info[tier.split('_')[0] if '_refined' in tier else tier]
784
+ for class_index, class_accuracy in metrics['per_class_accuracies'].items():
785
+ if class_accuracy._update_count == 0:
786
+ continue # Skip if no updates have been made
787
+ tp, tn, fp, fn = class_accuracy.tp, class_accuracy.tn, class_accuracy.fp, class_accuracy.fn
788
+ recall = (tp / (tp + fn)).item() if tp + fn > 0 else 0
789
+ precision = (tp / (tp + fp)).item() if tp + fp > 0 else 0
790
+ f1 = (2 * (precision * recall) / (precision + recall)) if precision + recall > 0 else 0
791
+ n_of_class = (tp + fn).item()
792
+ class_metrics.append([class_index, class_names_mapping[class_index], precision, recall, f1, class_accuracy.compute().item(), n_of_class])
793
+ class_accuracy.reset()
794
+ wandb_table = wandb.Table(data=class_metrics, columns=["Class Index", "Class Name", "Precision", "Recall", "F1", "Accuracy", "N"])
795
+ trainer.logger.experiment.log({f"{phase}_per_class_metrics_{tier}_{mode}": wandb_table})
796
+
797
+ # use the same n_classes for all images, such that they are comparable
798
+ n_classes = max([
799
+ torch.max(self.images_to_log_targets[phase]),
800
+ torch.max(self.images_to_log[phase]["majority"]),
801
+ torch.max(self.images_to_log[phase]["pixelwise"])
802
+ ])
803
+ images = [LogMessisMetrics.process_images(self.images_to_log[phase][mode], n_classes) for mode in self.modes]
804
+ images.append(LogMessisMetrics.create_positive_negative_image(self.images_to_log[phase]["majority"], self.images_to_log_targets[phase]))
805
+ images.append(LogMessisMetrics.process_images(self.images_to_log_targets[phase], n_classes))
806
+ images.append(LogMessisMetrics.process_images(self.field_ids_to_log_targets[phase].cpu()))
807
+
808
+ examples = []
809
+ for i in range(len(images[0])):
810
+ example = np.concatenate([img[i] for img in images], axis=0)
811
+ examples.append(wandb.Image(example, caption=f"From Top to Bottom: {self.modes[0]}, {self.modes[1]}, right/wrong classifications, target, fields"))
812
+
813
+ trainer.logger.experiment.log({f"{phase}_examples": examples})
814
+
815
+ # Log segmentation masks
816
+ batch_input_data = self.inputs_to_log[phase].cpu() # shape [BS, 6, N_TIMESTEPS, 224, 224]
817
+ ground_truth_masks = self.images_to_log_targets[phase].cpu().numpy()
818
+ pixel_wise_masks = self.images_to_log[phase]["pixelwise"].cpu().numpy()
819
+ field_majority_masks = self.images_to_log[phase]["majority"].cpu().numpy()
820
+ correctness_masks = self.create_positive_negative_segmentation_mask(field_majority_masks, ground_truth_masks)
821
+ class_labels = {idx: name for idx, name in enumerate(self.dataset_info[self.last_tier_name])}
822
+
823
+ segmentation_masks = []
824
+ for input_data, ground_truth_mask, pixel_wise_mask, field_majority_mask, correctness_mask in zip(batch_input_data, ground_truth_masks, pixel_wise_masks, field_majority_masks, correctness_masks):
825
+ middle_timestep_index = input_data.shape[1] // 2 # Get the middle timestamp index
826
+ gamma = 2.5 # Gamma for brightness adjustment
827
+ rgb_image = input_data[:3, middle_timestep_index, :, :].permute(1, 2, 0).numpy() # Shape [224, 224, 3]
828
+ rgb_image = (rgb_image - rgb_image.min()) / (rgb_image.max() - rgb_image.min())
829
+ rgb_image = np.power(rgb_image, 1.0 / gamma)
830
+ rgb_image = (rgb_image * 255).astype(np.uint8)
831
+
832
+ mask_img = wandb.Image(
833
+ rgb_image,
834
+ masks={
835
+ "predictions_pixel_wise": {"mask_data": pixel_wise_mask, "class_labels": class_labels},
836
+ "predictions_field_majority": {"mask_data": field_majority_mask, "class_labels": class_labels},
837
+ "ground_truth": {"mask_data": ground_truth_mask, "class_labels": class_labels},
838
+ "correctness": {"mask_data": correctness_mask, "class_labels": { 0: "Background", 1: "Wrong", 2: "Right" }},
839
+ },
840
+ )
841
+ segmentation_masks.append(mask_img)
842
+
843
+ trainer.logger.experiment.log({f"{phase}_segmentation_mask": segmentation_masks})
844
+
845
+ if self.debug:
846
+ print(f"{phase} epoch ended. Logging & resetting metrics...", trainer.sanity_checking)
847
+
848
+ @staticmethod
849
+ def create_positive_negative_segmentation_mask(field_majority_masks, ground_truth_masks):
850
+ """
851
+ Create a tensor that shows the positive and negative classifications of the model.
852
+
853
+ Args:
854
+ field_majority_masks (np.ndarray): The field majority masks generated by the model.
855
+ ground_truth_masks (np.ndarray): The ground truth masks.
856
+
857
+ Returns:
858
+ np.ndarray: An array with values:
859
+ - 0 where the target is 0,
860
+ - 2 where the prediction matches the target,
861
+ - 1 where the prediction does not match the target.
862
+ """
863
+ correctness_mask = np.zeros_like(ground_truth_masks, dtype=int)
864
+
865
+ matches = (field_majority_masks == ground_truth_masks) & (ground_truth_masks != 0)
866
+ correctness_mask[matches] = 2
867
+
868
+ mismatches = (field_majority_masks != ground_truth_masks) & (ground_truth_masks != 0)
869
+ correctness_mask[mismatches] = 1
870
+
871
+ return correctness_mask
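For illustration, a tiny example of the encoding this helper produces (the arrays are made up):

import numpy as np

pred   = np.array([[3, 5],
                   [0, 2]])
target = np.array([[3, 4],
                   [0, 2]])
mask = LogMessisMetrics.create_positive_negative_segmentation_mask(pred, target)
# mask -> [[2, 1],
#          [0, 2]]   (0 = background, 1 = wrong, 2 = correct)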
872
+
873
+ @staticmethod
874
+ def create_positive_negative_image(generated_images, target_images):
875
+ """
876
+ Create an image that shows the positive and negative classifications of the model.
877
+
878
+ Args:
879
+ generated_images (torch.Tensor): The images generated by the model.
880
+ target_images (torch.Tensor): The target images.
881
+
882
+ Returns:
883
+ list: A list of processed images.
884
+ """
885
+ classification_masks = generated_images == target_images
886
+ processed_imgs = []
887
+ for mask, target in zip(classification_masks, target_images):
888
+ # color the background white, right classifications green, wrong classifications red
889
+ colored_img = torch.zeros((mask.shape[0], mask.shape[1], 3), dtype=torch.uint8)
890
+ mask = mask.bool() # Convert to boolean tensor
891
+ colored_img[mask] = torch.tensor([0, 255, 0], dtype=torch.uint8)
892
+ colored_img[~mask] = torch.tensor([255, 0, 0], dtype=torch.uint8)
893
+ colored_img[target == 0] = torch.tensor([0, 0, 0], dtype=torch.uint8)
894
+ processed_imgs.append(colored_img.cpu())
895
+ return processed_imgs
896
+
897
+ @staticmethod
898
+ def process_images(imgs, max=None):
899
+ """
900
+ Process a batch of images to be logged on wandb.
901
+
902
+ Args:
903
+ imgs (torch.Tensor): A batch of images with shape (B, H, W) to be processed.
904
+ max (float, optional): The maximum value to normalize the images. Defaults to None. If None, the maximum value in the batch is used.
905
+ """
906
+ if max is None:
907
+ max = np.max(imgs.cpu().numpy())
908
+ normalized_img = imgs / max
909
+ processed_imgs = []
910
+ for img in normalized_img.cpu().numpy():
911
+ if max < 60:
912
+ cmap = ListedColormap(plt.get_cmap('tab20').colors + plt.get_cmap('tab20b').colors + plt.get_cmap('tab20c').colors)
913
+ else:
914
+ cmap = plt.get_cmap('viridis')
915
+ colored_img = cmap(img)
916
+ colored_img[img == 0] = [0, 0, 0, 1]
917
+ colored_img_uint8 = (colored_img[:, :, :3] * 255).astype(np.uint8)
918
+ processed_imgs.append(colored_img_uint8)
919
+ return processed_imgs
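As a usage sketch (not part of this commit): a callback like the one above is attached to a PyTorch Lightning Trainer together with a wandb logger. Only part of the constructor is visible in this hunk, so the argument names below are assumptions.

import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger

# Hypothetical wiring; constructor arguments, model and datamodule names are illustrative.
metrics_callback = LogMessisMetrics(hparams=hparams, dataset_info_file="data/dataset_info.json", debug=False)
trainer = pl.Trainer(
    max_epochs=100,
    logger=WandbLogger(project="messis"),
    callbacks=[metrics_callback],
)
trainer.fit(model, datamodule=datamodule)  # model: Messis LightningModule, datamodule: the ZüriCrop data module (assumed)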
messis/prithvi.py ADDED
@@ -0,0 +1,555 @@
1
+ from safetensors import safe_open
2
+ import torch
3
+ import torch.nn as nn
4
+ import numpy as np
5
+
6
+ from timm.models.layers import to_2tuple
7
+ from timm.models.vision_transformer import Block
8
+
9
+ # Taken and adapted from Prithvi `geospatial_fm.py`, for the purpose of avoiding MMCV/MMSegmentation dependencies
10
+
11
+ def _convTranspose2dOutput(
12
+ input_size: int,
13
+ stride: int,
14
+ padding: int,
15
+ dilation: int,
16
+ kernel_size: int,
17
+ output_padding: int,
18
+ ):
19
+ """
20
+ Calculate the output size of a ConvTranspose2d.
21
+ Taken from: https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html
22
+ """
23
+ return (
24
+ (input_size - 1) * stride
25
+ - 2 * padding
26
+ + dilation * (kernel_size - 1)
27
+ + output_padding
28
+ + 1
29
+ )
30
+
31
+
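A quick, illustrative check of this formula with the settings used by the necks further down (kernel_size=2, stride=2, no padding): each transposed convolution doubles the spatial size, so four of them upscale a 14x14 patch map to 224x224.

size = 14
for _ in range(4):
    size = _convTranspose2dOutput(size, stride=2, padding=0, dilation=1, kernel_size=2, output_padding=0)
print(size)  # 14 -> 28 -> 56 -> 112 -> 224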
32
+ def get_1d_sincos_pos_embed_from_grid(embed_dim: int, pos: torch.Tensor):
33
+ """
34
+ embed_dim: output dimension for each position
35
+ pos: a list of positions to be encoded: size (M,)
36
+ out: (M, D)
37
+ """
38
+ assert embed_dim % 2 == 0
39
+ omega = np.arange(embed_dim // 2, dtype=np.float32)
40
+ omega /= embed_dim / 2.0
41
+ omega = 1.0 / 10000**omega # (D/2,)
42
+
43
+ pos = pos.reshape(-1) # (M,)
44
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
45
+
46
+ emb_sin = np.sin(out) # (M, D/2)
47
+ emb_cos = np.cos(out) # (M, D/2)
48
+
49
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
50
+ return emb
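A quick shape check (illustrative values):

emb = get_1d_sincos_pos_embed_from_grid(64, np.arange(10))
print(emb.shape)  # (10, 64): 32 sine + 32 cosine features per position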
51
+
52
+
53
+ def get_3d_sincos_pos_embed(embed_dim: int, grid_size: tuple, cls_token: bool = False):
54
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
55
+ # All rights reserved.
56
+
57
+ # This source code is licensed under the license found in the
58
+ # LICENSE file in the root directory of this source tree.
59
+ # --------------------------------------------------------
60
+ # Position embedding utils
61
+ # --------------------------------------------------------
62
+ """
63
+ grid_size: 3d tuple of grid size: t, h, w
64
+ return:
65
+ pos_embed: L, D
66
+ """
67
+
68
+ assert embed_dim % 16 == 0
69
+
70
+ t_size, h_size, w_size = grid_size
71
+
72
+ w_embed_dim = embed_dim // 16 * 6
73
+ h_embed_dim = embed_dim // 16 * 6
74
+ t_embed_dim = embed_dim // 16 * 4
75
+
76
+ w_pos_embed = get_1d_sincos_pos_embed_from_grid(w_embed_dim, np.arange(w_size))
77
+ h_pos_embed = get_1d_sincos_pos_embed_from_grid(h_embed_dim, np.arange(h_size))
78
+ t_pos_embed = get_1d_sincos_pos_embed_from_grid(t_embed_dim, np.arange(t_size))
79
+
80
+ w_pos_embed = np.tile(w_pos_embed, (t_size * h_size, 1))
81
+ h_pos_embed = np.tile(np.repeat(h_pos_embed, w_size, axis=0), (t_size, 1))
82
+ t_pos_embed = np.repeat(t_pos_embed, h_size * w_size, axis=0)
83
+
84
+ pos_embed = np.concatenate((w_pos_embed, h_pos_embed, t_pos_embed), axis=1)
85
+
86
+ if cls_token:
87
+ pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
88
+ return pos_embed
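For the grid shape used by this repo's temporal inputs (3 timesteps of 14x14 patches), and embed_dim=1024 purely as an example:

pos_embed = get_3d_sincos_pos_embed(1024, (3, 14, 14), cls_token=True)
print(pos_embed.shape)  # (3*14*14 + 1, 1024) == (589, 1024), including the cls-token row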
89
+
90
+
91
+ class Norm2d(nn.Module):
92
+ def __init__(self, embed_dim: int):
93
+ super().__init__()
94
+ self.ln = nn.LayerNorm(embed_dim, eps=1e-6)
95
+
96
+ def forward(self, x):
97
+ x = x.permute(0, 2, 3, 1)
98
+ x = self.ln(x)
99
+ x = x.permute(0, 3, 1, 2).contiguous()
100
+ return x
101
+
102
+
103
+ class PatchEmbed(nn.Module):
104
+ """Frames of 2D Images to Patch Embedding
105
+ The 3D version of timm.models.vision_transformer.PatchEmbed
106
+ """
107
+
108
+ def __init__(
109
+ self,
110
+ img_size: int = 224,
111
+ patch_size: int = 16,
112
+ num_frames: int = 3,
113
+ tubelet_size: int = 1,
114
+ in_chans: int = 3,
115
+ embed_dim: int = 768,
116
+ norm_layer: nn.Module = None,
117
+ flatten: bool = True,
118
+ bias: bool = True,
119
+ ):
120
+ super().__init__()
121
+ img_size = to_2tuple(img_size)
122
+ patch_size = to_2tuple(patch_size)
123
+ self.img_size = img_size
124
+ self.patch_size = patch_size
125
+ self.num_frames = num_frames
126
+ self.tubelet_size = tubelet_size
127
+ self.grid_size = (
128
+ num_frames // tubelet_size,
129
+ img_size[0] // patch_size[0],
130
+ img_size[1] // patch_size[1],
131
+ )
132
+ self.num_patches = self.grid_size[0] * self.grid_size[1] * self.grid_size[2]
133
+ self.flatten = flatten
134
+
135
+ self.proj = nn.Conv3d(
136
+ in_chans,
137
+ embed_dim,
138
+ kernel_size=(tubelet_size, patch_size[0], patch_size[1]),
139
+ stride=(tubelet_size, patch_size[0], patch_size[1]),
140
+ bias=bias,
141
+ )
142
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
143
+
144
+ def forward(self, x):
145
+ B, C, T, H, W = x.shape
146
+ assert (
147
+ H == self.img_size[0]
148
+ ), f"Input image height ({H}) doesn't match model ({self.img_size[0]})."
149
+ assert (
150
+ W == self.img_size[1]
151
+ ), f"Input image width ({W}) doesn't match model ({self.img_size[1]})."
152
+ x = self.proj(x)
153
+ Hp, Wp = x.shape[3], x.shape[4]
154
+ if self.flatten:
155
+ x = x.flatten(2).transpose(1, 2) # B,C,T,H,W -> B,C,L -> B,L,C
156
+ x = self.norm(x)
157
+ return x, Hp, Wp
158
+
159
+
160
+ class ConvTransformerTokensToEmbeddingNeck(nn.Module):
161
+ """
162
+ Neck that transforms the token-based output of the transformer into a single embedding suitable for processing with standard layers.
163
+ Performs 4 ConvTranspose2d operations on the rearranged input with kernel_size=2 and stride=2
164
+ """
165
+
166
+ def __init__(
167
+ self,
168
+ embed_dim: int,
169
+ output_embed_dim: int,
170
+ # num_frames: int = 1,
171
+ Hp: int = 14,
172
+ Wp: int = 14,
173
+ drop_cls_token: bool = True,
174
+ ):
175
+ """
176
+
177
+ Args:
178
+ embed_dim (int): Input embedding dimension
179
+ output_embed_dim (int): Output embedding dimension
180
+ Hp (int, optional): Height (in patches) of embedding to be upscaled. Defaults to 14.
181
+ Wp (int, optional): Width (in patches) of embedding to be upscaled. Defaults to 14.
182
+ drop_cls_token (bool, optional): Whether there is a cls_token, which should be dropped. This assumes the cls token is the first token. Defaults to True.
183
+ """
184
+ super().__init__()
185
+ self.drop_cls_token = drop_cls_token
186
+ self.Hp = Hp
187
+ self.Wp = Wp
188
+ self.H_out = Hp
189
+ self.W_out = Wp
190
+ # self.num_frames = num_frames
191
+
192
+ kernel_size = 2
193
+ stride = 2
194
+ dilation = 1
195
+ padding = 0
196
+ output_padding = 0
197
+ for _ in range(4):
198
+ self.H_out = _convTranspose2dOutput(
199
+ self.H_out, stride, padding, dilation, kernel_size, output_padding
200
+ )
201
+ self.W_out = _convTranspose2dOutput(
202
+ self.W_out, stride, padding, dilation, kernel_size, output_padding
203
+ )
204
+
205
+ self.embed_dim = embed_dim
206
+ self.output_embed_dim = output_embed_dim
207
+ self.fpn1 = nn.Sequential(
208
+ nn.ConvTranspose2d(
209
+ self.embed_dim,
210
+ self.output_embed_dim,
211
+ kernel_size=kernel_size,
212
+ stride=stride,
213
+ dilation=dilation,
214
+ padding=padding,
215
+ output_padding=output_padding,
216
+ ),
217
+ Norm2d(self.output_embed_dim),
218
+ nn.GELU(),
219
+ nn.ConvTranspose2d(
220
+ self.output_embed_dim,
221
+ self.output_embed_dim,
222
+ kernel_size=kernel_size,
223
+ stride=stride,
224
+ dilation=dilation,
225
+ padding=padding,
226
+ output_padding=output_padding,
227
+ ),
228
+ )
229
+ self.fpn2 = nn.Sequential(
230
+ nn.ConvTranspose2d(
231
+ self.output_embed_dim,
232
+ self.output_embed_dim,
233
+ kernel_size=kernel_size,
234
+ stride=stride,
235
+ dilation=dilation,
236
+ padding=padding,
237
+ output_padding=output_padding,
238
+ ),
239
+ Norm2d(self.output_embed_dim),
240
+ nn.GELU(),
241
+ nn.ConvTranspose2d(
242
+ self.output_embed_dim,
243
+ self.output_embed_dim,
244
+ kernel_size=kernel_size,
245
+ stride=stride,
246
+ dilation=dilation,
247
+ padding=padding,
248
+ output_padding=output_padding,
249
+ ),
250
+ )
251
+
252
+ def forward(self, x):
253
+ x = x[0]
254
+ if self.drop_cls_token:
255
+ x = x[:, 1:, :]
256
+ x = x.permute(0, 2, 1).reshape(x.shape[0], -1, self.Hp, self.Wp)
257
+
258
+ x = self.fpn1(x)
259
+ x = self.fpn2(x)
260
+
261
+ x = x.reshape((-1, self.output_embed_dim, self.H_out, self.W_out))
262
+ out = tuple([x])
263
+ return out
264
+
265
+ class ConvTransformerTokensToEmbeddingBottleneckNeck(nn.Module):
266
+ """
267
+ Neck that transforms the token-based output of the transformer into a single embedding suitable for processing with standard layers.
268
+ Performs ConvTranspose2d operations with bottleneck layers to reduce channels.
269
+ """
270
+
271
+ def __init__(
272
+ self,
273
+ embed_dim: int,
274
+ output_embed_dim: int,
275
+ Hp: int = 14,
276
+ Wp: int = 14,
277
+ drop_cls_token: bool = True,
278
+ bottleneck_reduction_factor: int = 4,
279
+ ):
280
+ """
281
+ Args:
282
+ embed_dim (int): Input embedding dimension
283
+ output_embed_dim (int): Output embedding dimension
284
+ Hp (int, optional): Height (in patches) of embedding to be upscaled. Defaults to 14.
285
+ Wp (int, optional): Width (in patches) of embedding to be upscaled. Defaults to 14.
286
+ drop_cls_token (bool, optional): Whether there is a cls_token, which should be dropped. Defaults to True.
287
+ bottleneck_reduction_factor (int, optional): Factor by which the channel count is reduced in the bottleneck layers. Defaults to 4.
288
+ """
289
+ super().__init__()
290
+ self.drop_cls_token = drop_cls_token
291
+ self.Hp = Hp
292
+ self.Wp = Wp
293
+ self.H_out = Hp
294
+ self.W_out = Wp
295
+
296
+ kernel_size = 2
297
+ stride = 2
298
+ dilation = 1
299
+ padding = 0
300
+ output_padding = 0
301
+ for _ in range(4):
302
+ self.H_out = _convTranspose2dOutput(
303
+ self.H_out, stride, padding, dilation, kernel_size, output_padding
304
+ )
305
+ self.W_out = _convTranspose2dOutput(
306
+ self.W_out, stride, padding, dilation, kernel_size, output_padding
307
+ )
308
+
309
+ self.embed_dim = embed_dim
310
+ self.output_embed_dim = output_embed_dim
311
+ bottleneck_dim = self.embed_dim // bottleneck_reduction_factor
312
+
313
+ self.fpn1 = nn.Sequential(
314
+ nn.Conv2d(
315
+ self.embed_dim,
316
+ bottleneck_dim,
317
+ kernel_size=1
318
+ ),
319
+ Norm2d(bottleneck_dim),
320
+ nn.GELU(),
321
+ nn.ConvTranspose2d(
322
+ bottleneck_dim,
323
+ bottleneck_dim,
324
+ kernel_size=kernel_size,
325
+ stride=stride,
326
+ padding=padding,
327
+ output_padding=output_padding
328
+ ),
329
+ Norm2d(bottleneck_dim),
330
+ nn.GELU(),
331
+ nn.ConvTranspose2d(
332
+ bottleneck_dim,
333
+ bottleneck_dim,
334
+ kernel_size=kernel_size,
335
+ stride=stride,
336
+ padding=padding,
337
+ output_padding=output_padding
338
+ ),
339
+ Norm2d(bottleneck_dim),
340
+ nn.GELU(),
341
+ nn.Conv2d(
342
+ bottleneck_dim,
343
+ self.output_embed_dim,
344
+ kernel_size=1
345
+ ),
346
+ Norm2d(self.output_embed_dim),
347
+ nn.GELU(),
348
+ )
349
+
350
+ self.fpn2 = nn.Sequential(
351
+ nn.Conv2d(
352
+ self.output_embed_dim,
353
+ bottleneck_dim,
354
+ kernel_size=1
355
+ ),
356
+ Norm2d(bottleneck_dim),
357
+ nn.GELU(),
358
+ nn.ConvTranspose2d(
359
+ bottleneck_dim,
360
+ bottleneck_dim,
361
+ kernel_size=kernel_size,
362
+ stride=stride,
363
+ padding=padding,
364
+ output_padding=output_padding
365
+ ),
366
+ Norm2d(bottleneck_dim),
367
+ nn.GELU(),
368
+ nn.ConvTranspose2d(
369
+ bottleneck_dim,
370
+ bottleneck_dim,
371
+ kernel_size=kernel_size,
372
+ stride=stride,
373
+ padding=padding,
374
+ output_padding=output_padding
375
+ ),
376
+ Norm2d(bottleneck_dim),
377
+ nn.GELU(),
378
+ nn.Conv2d(
379
+ bottleneck_dim,
380
+ self.output_embed_dim,
381
+ kernel_size=1
382
+ ),
383
+ Norm2d(self.output_embed_dim),
384
+ nn.GELU(),
385
+ )
386
+
387
+ def forward(self, x):
388
+ x = x[0]
389
+ if self.drop_cls_token:
390
+ x = x[:, 1:, :]
391
+ x = x.permute(0, 2, 1).reshape(x.shape[0], -1, self.Hp, self.Wp)
392
+
393
+ x = self.fpn1(x)
394
+ x = self.fpn2(x)
395
+
396
+ x = x.reshape((-1, self.output_embed_dim, self.H_out, self.W_out))
397
+ out = tuple([x])
398
+ return out
399
+
400
+ class TemporalViTEncoder(nn.Module):
401
+ """Encoder from a ViT with the capability to take in temporal input.
402
+
403
+ This class defines an encoder taken from a ViT architecture.
404
+ """
405
+
406
+ def __init__(
407
+ self,
408
+ img_size: int = 224,
409
+ patch_size: int = 16,
410
+ num_frames: int = 1,
411
+ tubelet_size: int = 1,
412
+ in_chans: int = 3,
413
+ embed_dim: int = 1024,
414
+ depth: int = 24,
415
+ num_heads: int = 16,
416
+ mlp_ratio: float = 4.0,
417
+ norm_layer: nn.Module = nn.LayerNorm,
418
+ norm_pix_loss: bool = False,
419
+ pretrained: str = None,
420
+ debug=False
421
+ ):
422
+ """
423
+
424
+ Args:
425
+ img_size (int, optional): Input image size. Defaults to 224.
426
+ patch_size (int, optional): Patch size to be used by the transformer. Defaults to 16.
427
+ num_frames (int, optional): Number of frames (temporal dimension) to be input to the encoder. Defaults to 1.
428
+ tubelet_size (int, optional): Tubelet size used in patch embedding. Defaults to 1.
429
+ in_chans (int, optional): Number of input channels. Defaults to 3.
430
+ embed_dim (int, optional): Embedding dimension. Defaults to 1024.
431
+ depth (int, optional): Encoder depth. Defaults to 24.
432
+ num_heads (int, optional): Number of heads used in the encoder blocks. Defaults to 16.
433
+ mlp_ratio (float, optional): Ratio to be used for the size of the MLP in encoder blocks. Defaults to 4.0.
434
+ norm_layer (nn.Module, optional): Norm layer to be used. Defaults to nn.LayerNorm.
435
+ norm_pix_loss (bool, optional): Whether to use Norm Pix Loss. Defaults to False.
436
+ pretrained (str, optional): Path to pretrained encoder weights. Defaults to None.
437
+ """
438
+ super().__init__()
439
+
440
+ # --------------------------------------------------------------------------
441
+ # MAE encoder specifics
442
+ self.embed_dim = embed_dim
443
+ self.patch_embed = PatchEmbed(
444
+ img_size, patch_size, num_frames, tubelet_size, in_chans, embed_dim
445
+ )
446
+ num_patches = self.patch_embed.num_patches
447
+ self.num_frames = num_frames
448
+
449
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
450
+ self.pos_embed = nn.Parameter(
451
+ torch.zeros(1, num_patches + 1, embed_dim), requires_grad=False
452
+ ) # fixed sin-cos embedding
453
+
454
+ self.blocks = nn.ModuleList(
455
+ [
456
+ Block(
457
+ embed_dim,
458
+ num_heads,
459
+ mlp_ratio,
460
+ qkv_bias=True,
461
+ norm_layer=norm_layer,
462
+ )
463
+ for _ in range(depth)
464
+ ]
465
+ )
466
+ self.norm = norm_layer(embed_dim)
467
+
468
+ self.norm_pix_loss = norm_pix_loss
469
+ self.pretrained = pretrained
470
+ self.debug = debug
471
+
472
+ self.initialize_weights()
473
+
474
+ def initialize_weights(self):
475
+ # initialize (and freeze) pos_embed by sin-cos embedding
476
+ pos_embed = get_3d_sincos_pos_embed(
477
+ self.pos_embed.shape[-1], self.patch_embed.grid_size, cls_token=True
478
+ )
479
+ self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
480
+
481
+ # initialize patch_embed like nn.Linear (instead of nn.Conv2d)
482
+ w = self.patch_embed.proj.weight.data
483
+ torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
484
+
485
+ # TODO: FIX huggingface config
486
+ # load pretrained weights
487
+ # if self.pretrained:
488
+ # if self.pretrained.endswith('.safetensors'):
489
+ # self._load_safetensors_weights()
490
+ # elif self.pretrained == 'huggingface':
491
+ # print("TemporalViTEncoder | Using HuggingFace pretrained weights.")
492
+ # else:
493
+ # self._load_pt_weights()
494
+ # else:
495
+ # self.apply(self._init_weights)
496
+
497
+ def _load_safetensors_weights(self):
498
+ with safe_open(self.pretrained, framework='pt', device='cpu') as f:
499
+ checkpoint_state_dict = {k: f.get_tensor(k) for k in f.keys()}  # safe_open exposes keys()/get_tensor(), not items()
500
+ missing_keys, unexpected_keys = self.load_state_dict(checkpoint_state_dict, strict=False)
501
+ if missing_keys:
502
+ print("TemporalViTEncoder | Warning: Missing keys in the state dict:", missing_keys)
503
+ if unexpected_keys:
504
+ print("TemporalViTEncoder | Warning: Unexpected keys in the state dict:", unexpected_keys)
505
+ print(f"TemporalViTEncoder | Loaded pretrained weights from '{self.pretrained}' (safetensors).")
506
+
507
+ def _load_pt_weights(self):
508
+ checkpoint = torch.load(self.pretrained, map_location='cpu')
509
+ checkpoint_state_dict = checkpoint.get('state_dict', checkpoint)
510
+ missing_keys, unexpected_keys = self.load_state_dict(checkpoint_state_dict, strict=False)
511
+ if missing_keys:
512
+ print("TemporalViTEncoder | Warning: Missing keys in the state dict:", missing_keys)
513
+ if unexpected_keys:
514
+ print("TemporalViTEncoder | Warning: Unexpected keys in the state dict:", unexpected_keys)
515
+ print(f"TemporalViTEncoder | Loaded pretrained weights from '{self.pretrained}' (pt file).")
516
+
517
+ def _init_weights(self, m):
518
+ print("TemporalViTEncoder | Newly Initializing weights...")
519
+ if isinstance(m, nn.Linear):
520
+ # we use xavier_uniform following official JAX ViT:
521
+ torch.nn.init.xavier_uniform_(m.weight)
522
+ if isinstance(m, nn.Linear) and m.bias is not None:
523
+ nn.init.constant_(m.bias, 0)
524
+ elif isinstance(m, nn.LayerNorm):
525
+ nn.init.constant_(m.bias, 0)
526
+ nn.init.constant_(m.weight, 1.0)
527
+
528
+ def forward(self, x):
529
+ if self.debug:
530
+ print('TemporalViTEncoder IN:', x.shape)
531
+
532
+ # embed patches
533
+ x, _, _ = self.patch_embed(x)
534
+
535
+ if self.debug:
536
+ print('TemporalViTEncoder EMBED:', x.shape)
537
+
538
+ # add pos embed w/o cls token
539
+ x = x + self.pos_embed[:, 1:, :]
540
+
541
+ # append cls token
542
+ cls_token = self.cls_token + self.pos_embed[:, :1, :]
543
+ cls_tokens = cls_token.expand(x.shape[0], -1, -1)
544
+ x = torch.cat((cls_tokens, x), dim=1)
545
+
546
+ # apply Transformer blocks
547
+ for blk in self.blocks:
548
+ x = blk(x)
549
+
550
+ x = self.norm(x)
551
+
552
+ if self.debug:
553
+ print('TemporalViTEncoder OUT:', x.shape)
554
+
555
+ return tuple([x])
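A minimal shape walk-through of the two modules above (hyperparameters are illustrative, not the configuration of the released Messis checkpoint):

import torch

encoder = TemporalViTEncoder(img_size=224, patch_size=16, num_frames=3, tubelet_size=1,
                             in_chans=6, embed_dim=768, depth=2, num_heads=8)
neck = ConvTransformerTokensToEmbeddingNeck(embed_dim=768 * 3, output_embed_dim=128, Hp=14, Wp=14)

x = torch.randn(1, 6, 3, 224, 224)   # (B, C, T, H, W): 6 bands, 3 timesteps
tokens = encoder(x)                   # tuple holding (1, 1 + 3*14*14, 768) tokens
features = neck(tokens)[0]            # (1, 128, 224, 224) after four 2x upsamplings
print(tokens[0].shape, features.shape)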
pages/1_Select_Location.py ADDED
@@ -0,0 +1,78 @@
1
+ import streamlit as st
2
+ from streamlit_folium import st_folium
3
+ import folium
4
+ from geopy.geocoders import Nominatim
5
+
6
+ # Define the bounding box
7
+ ZUERICH_BBOX = [8.364, 47.240, 9.0405, 47.69894]
8
+
9
+ def within_bbox(lat, lon, bbox):
10
+ """Check if a point is within the given bounding box."""
11
+ return bbox[1] <= lat <= bbox[3] and bbox[0] <= lon <= bbox[2]
12
+
13
+ def select_coordinates():
14
+ st.title("Step 1: Select Location")
15
+
16
+ instructions = """
17
+ 1. Choose a crop classification location. Search for a location or click on the map.
18
+ 2. Proceed to the "Perform Crop Classification" step.
19
+
20
+ _Note:_ The location must be within the green ZüriCrop area.
21
+ """
22
+ st.sidebar.header("Instructions")
23
+ st.sidebar.markdown(instructions)
24
+
25
+ # Initialize a map centered around the midpoint of the bounding box
26
+ midpoint_lat = (ZUERICH_BBOX[1] + ZUERICH_BBOX[3]) / 2
27
+ midpoint_lon = (ZUERICH_BBOX[0] + ZUERICH_BBOX[2]) / 2
28
+ m = folium.Map(location=[midpoint_lat, midpoint_lon], zoom_start=9)
29
+
30
+ # Add the bounding box to the map as a rectangle
31
+ folium.Rectangle(
32
+ bounds=[[ZUERICH_BBOX[1], ZUERICH_BBOX[0]], [ZUERICH_BBOX[3], ZUERICH_BBOX[2]]],
33
+ color="green",
34
+ fill=True,
35
+ fill_opacity=0.1
36
+ ).add_to(m)
37
+
38
+ # Search for a location
39
+ geolocator = Nominatim(user_agent="streamlit-app")
40
+ location_query = st.text_input("Search for a location:")
41
+
42
+ if location_query:
43
+ location = geolocator.geocode(location_query)
44
+ if location:
45
+ lat, lon = location.latitude, location.longitude
46
+ folium.Marker([lat, lon], tooltip=location.address).add_to(m)
47
+ m.location = [lat, lon]
48
+ m.zoom_start = 12
49
+
50
+ if within_bbox(lat, lon, ZUERICH_BBOX):
51
+ st.success(f"Location found: {location.address}. It is within the bounding box.")
52
+ st.session_state["selected_location"] = (lat, lon)
53
+ else:
54
+ st.error(f"Location found: {location.address}. It is outside the bounding box.")
55
+ else:
56
+ st.error("Location not found. Please try again.")
57
+
58
+ # Add a click event listener to capture coordinates
59
+ m.add_child(folium.LatLngPopup())
60
+
61
+ # Display the map using streamlit-folium
62
+ st_data = st_folium(m, height=500, width=800)
63
+
64
+ # Check if the user clicked within the bounding box
65
+ if st_data["last_clicked"]:
66
+ lat, lon = st_data["last_clicked"]["lat"], st_data["last_clicked"]["lng"]
67
+ if within_bbox(lat, lon, ZUERICH_BBOX):
68
+ st.success(f"Selected Location: Latitude {lat}, Longitude {lon}")
69
+ st.session_state["selected_location"] = (lat, lon)
70
+ else:
71
+ st.error(f"Selected Location is outside the allowed area. Please select a location within the bounding box.")
72
+
73
+ # Proceed to the next step
74
+ link_disabled = "selected_location" not in st.session_state
75
+ st.sidebar.page_link("pages/2_Perform_Crop_Classification.py", label="Proceed to Crop Classification", icon="🌾", disabled=link_disabled)
76
+
77
+ if __name__ == "__main__":
78
+ select_coordinates()
pages/2_Perform_Crop_Classification.py ADDED
@@ -0,0 +1,99 @@
1
+ import streamlit as st
2
+ import leafmap.foliumap as leafmap
3
+ from transformers import PretrainedConfig
4
+ from folium import Icon
5
+
6
+ from messis.messis import Messis
7
+ from inference import perform_inference
8
+
9
+ st.set_page_config(layout="wide")
10
+
11
+ GEOTIFF_PATH = "./data/stacked_features.tif"
12
+
13
+ # Load the model
14
+ @st.cache_resource
15
+ def load_model():
16
+ config = PretrainedConfig.from_pretrained('crop-classification/messis', revision='47d9ca4')
17
+ model = Messis.from_pretrained('crop-classification/messis', cache_dir='./hf_cache/', revision='47d9ca4')
18
+ return model, config
19
+ model, config = load_model()
20
+
21
+ def perform_inference_step():
22
+ st.title("Step 2: Perform Crop Classification")
23
+
24
+ if "selected_location" not in st.session_state:
25
+ st.error("No location selected. Please select a location first.")
26
+ st.page_link("pages/1_Select_Location.py", label="Select Location", icon="📍")
27
+ return
28
+
29
+ lat, lon = st.session_state["selected_location"]
30
+
31
+ # Sidebar
32
+ st.sidebar.header("Settings")
33
+
34
+ # Timestep Slider
35
+ timestep = st.sidebar.slider("Select Timestep", 1, 9, 5)
36
+
37
+ # Band Dropdown
38
+ band_options = {
39
+ "RGB": [1, 2, 3], # Adjust indices based on the actual bands in your GeoTIFF
40
+ "NIR": [4],
41
+ "SWIR1": [5],
42
+ "SWIR2": [6]
43
+ }
44
+ vmin_vmax = {
45
+ "RGB": (89, 1878),
46
+ "NIR": (165, 5468),
47
+ "SWIR1": (120, 3361),
48
+ "SWIR2": (94, 2700)
49
+ }
50
+ selected_band = st.sidebar.selectbox("Select Satellite Band to Display", options=list(band_options.keys()), index=0)
51
+
52
+ # Calculate the band indices based on the selected timestep
53
+ selected_bands = [band + (timestep - 1) * 6 for band in band_options[selected_band]]
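(For example, timestep 5 with the "RGB" option [1, 2, 3] resolves to bands [25, 26, 27], since the stacked GeoTIFF holds 6 bands per timestep.)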
54
+
55
+ instructions = """
56
+ Click the button "Perform Crop Classification".
57
+
58
+ _Note:_
59
+ - Messis will classify the crop types for the fields in your selected location.
60
+ - Hover over the fields to see the predicted and true crop type.
61
+ - The satellite images might take a few seconds to load.
62
+ """
63
+ st.sidebar.header("Instructions")
64
+ st.sidebar.markdown(instructions)
65
+
66
+ # Initialize the map
67
+ m = leafmap.Map(center=(lat, lon), zoom=10, draw_control=False)
68
+
69
+ # Perform inference
70
+ if st.sidebar.button("Perform Crop Classification", type="primary"):
71
+ predictions = perform_inference(lon, lat, model, config, debug=True)
72
+
73
+ m.add_data(predictions,
74
+ layer_name = "Predictions",
75
+ column="Correct",
76
+ add_legend=False,
77
+ style_function=lambda x: {"fillColor": "green" if x["properties"]["Correct"] else "red", "color": "black", "weight": 0, "fillOpacity": 0.25},
78
+ )
79
+ st.success("Inference completed!")
80
+
81
+ # GeoTIFF Satellite Imagery with selected timestep and band
82
+ m.add_raster(
83
+ GEOTIFF_PATH,
84
+ layer_name="Satellite Image",
85
+ bands=selected_bands,
86
+ fit_bounds=True,
87
+ vmin=vmin_vmax[selected_band][0],
88
+ vmax=vmin_vmax[selected_band][1],
89
+ )
90
+
91
+ # Show the POI on the map
92
+ poi_icon = Icon(color="green", prefix="fa", icon="crosshairs")
93
+ m.add_marker(location=[lat, lon], popup="Selected Location", layer_name="POI", icon=poi_icon)
94
+
95
+ # Display the map in the Streamlit app
96
+ m.to_streamlit()
97
+
98
+ if __name__ == "__main__":
99
+ perform_inference_step()
requirements.txt ADDED
@@ -0,0 +1,23 @@
1
+ torch==2.3.0
2
+ PyYAML==6.0.1
3
+ rasterio==1.3.10
4
+ torchvision==0.18.0
5
+ shapely==2.0.4
6
+ geopandas==0.14.4
7
+ pytorch-lightning==2.2.3
8
+ dvc==3.50.1
9
+ streamlit==1.37.0
10
+ leafmap==0.36.6
11
+ transformers==4.41.2
12
+ folium==0.17.0
13
+ streamlit-folium==0.22.0
14
+ geopy==2.4.1
15
+ localtileserver==0.10.3
16
+ xarray==2024.7.0
17
+ scipy==1.14.0
18
+ mapclassify==2.8.0
19
+ wandb==0.16.6
20
+ numpy==1.26.4
21
+ lion-pytorch==0.2.2
22
+ timm==0.9.16
23
+ pyproj