import pickle
from collections import OrderedDict
from multiprocessing import Pool

import numpy as np
from batchgenerators.utilities.file_and_folder_operations import *
# connected-component labelling lives in skimage.measure; the old
# skimage.morphology.label alias is deprecated/removed in newer scikit-image
from skimage.measure import label

from nnunet.configuration import default_num_threads
from nnunet.paths import nnUNet_raw_data, nnUNet_cropped_data
from nnunet.preprocessing.cropping import get_patient_identifiers_from_cropped_files


class DatasetAnalyzer(object):
    def __init__(self, folder_with_cropped_data, overwrite=True, num_processes=default_num_threads):
        """
        :param folder_with_cropped_data: folder with the cropped npz/pkl files produced by nnunet.preprocessing.cropping
        :param overwrite: If True, precomputed values are not used and everything is recomputed from the data.
        If False, precomputed values are loaded when available. This can be dangerous if the code of this class
        has changed in the meantime, therefore the default is True.
        """
        self.num_processes = num_processes
        self.overwrite = overwrite
        self.folder_with_cropped_data = folder_with_cropped_data
        self.sizes = self.spacings = None
        self.patient_identifiers = get_patient_identifiers_from_cropped_files(self.folder_with_cropped_data)
        assert isfile(join(self.folder_with_cropped_data, "dataset.json")), \
            "dataset.json needs to be in folder_with_cropped_data"
        self.props_per_case_file = join(self.folder_with_cropped_data, "props_per_case.pkl")
        self.intensityproperties_file = join(self.folder_with_cropped_data, "intensityproperties.pkl")

    def load_properties_of_cropped(self, case_identifier):
        with open(join(self.folder_with_cropped_data, "%s.pkl" % case_identifier), 'rb') as f:
            properties = pickle.load(f)
        return properties

    @staticmethod
    def _check_if_all_in_one_region(seg, regions):
        # for each region (a collection of label ids), check whether the union of
        # those labels forms a single connected component in the segmentation
        res = OrderedDict()
        for r in regions:
            new_seg = np.zeros(seg.shape)
            for c in r:
                new_seg[seg == c] = 1
            labelmap, numlabels = label(new_seg, return_num=True)
            res[tuple(r)] = numlabels == 1
        return res

    @staticmethod
    def _collect_class_and_region_sizes(seg, all_classes, vol_per_voxel):
        # total volume per class plus the volume of each connected component of that class
        volume_per_class = OrderedDict()
        region_volume_per_class = OrderedDict()
        for c in all_classes:
            region_volume_per_class[c] = []
            volume_per_class[c] = np.sum(seg == c) * vol_per_voxel
            labelmap, numregions = label(seg == c, return_num=True)
            for l in range(1, numregions + 1):
                region_volume_per_class[c].append(np.sum(labelmap == l) * vol_per_voxel)
        return volume_per_class, region_volume_per_class
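
    # Illustrative example (comment added, not from the original code): for a 2D slice
    # seg = [[1, 1], [0, 2]] with all_classes = [1, 2] and vol_per_voxel = 1.0,
    # _collect_class_and_region_sizes returns volume_per_class = {1: 2.0, 2: 1.0}
    # and region_volume_per_class = {1: [2.0], 2: [1.0]} (one connected region each).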

    def _get_unique_labels(self, patient_identifier):
        # the segmentation is stored as the last channel of the cropped npz
        seg = np.load(join(self.folder_with_cropped_data, patient_identifier) + ".npz")['data'][-1]
        unique_classes = np.unique(seg)
        return unique_classes

    def _load_seg_analyze_classes(self, patient_identifier, all_classes):
        """
        1) which classes are present in this training case?
        2) what is the size distribution for each class?
        3) what is the region size of each class?
        4) are all voxels of a class (or group of classes) in one connected region?
        :return:
        """
        seg = np.load(join(self.folder_with_cropped_data, patient_identifier) + ".npz")['data'][-1]
        pkl = load_pickle(join(self.folder_with_cropped_data, patient_identifier) + ".pkl")
        vol_per_voxel = np.prod(pkl['itk_spacing'])

        # which classes are present in this case?
        unique_classes = np.unique(seg)

        # regions of interest: all classes together, then each class on its own
        regions = list()
        regions.append(list(all_classes))
        for c in all_classes:
            regions.append((c, ))

        all_in_one_region = self._check_if_all_in_one_region(seg, regions)

        volume_per_class, region_sizes = self._collect_class_and_region_sizes(seg, all_classes, vol_per_voxel)

        return unique_classes, all_in_one_region, volume_per_class, region_sizes
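
    # Note (added for context): _load_seg_analyze_classes is not called by
    # analyse_segmentations below, which only records the unique labels per case
    # ('has_classes'); the fuller region analysis is available but unused in this
    # class's default path.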

    def get_classes(self):
        datasetjson = load_json(join(self.folder_with_cropped_data, "dataset.json"))
        return datasetjson['labels']

    def analyse_segmentations(self):
        class_dct = self.get_classes()

        if self.overwrite or not isfile(self.props_per_case_file):
            p = Pool(self.num_processes)
            res = p.map(self._get_unique_labels, self.patient_identifiers)
            p.close()
            p.join()

            props_per_patient = OrderedDict()
            # 'pat' instead of 'p' so the Pool variable is not shadowed
            for pat, unique_classes in zip(self.patient_identifiers, res):
                props = dict()
                props['has_classes'] = unique_classes
                props_per_patient[pat] = props

            save_pickle(props_per_patient, self.props_per_case_file)
        else:
            props_per_patient = load_pickle(self.props_per_case_file)
        return class_dct, props_per_patient

    def get_sizes_and_spacings_after_cropping(self):
        sizes = []
        spacings = []
        for c in self.patient_identifiers:
            properties = self.load_properties_of_cropped(c)
            sizes.append(properties["size_after_cropping"])
            spacings.append(properties["original_spacing"])
        return sizes, spacings

    def get_modalities(self):
        datasetjson = load_json(join(self.folder_with_cropped_data, "dataset.json"))
        modalities = datasetjson["modality"]
        modalities = {int(k): v for k, v in modalities.items()}
        return modalities

    def get_size_reduction_by_cropping(self):
        # fraction of the original volume that remains after cropping, per case
        size_reduction = OrderedDict()
        for p in self.patient_identifiers:
            props = self.load_properties_of_cropped(p)
            shape_before_crop = props["original_size_of_raw_data"]
            shape_after_crop = props['size_after_cropping']
            size_red = np.prod(shape_after_crop) / np.prod(shape_before_crop)
            size_reduction[p] = size_red
        return size_reduction

    def _get_voxels_in_foreground(self, patient_identifier, modality_id):
        all_data = np.load(join(self.folder_with_cropped_data, patient_identifier) + ".npz")['data']
        modality = all_data[modality_id]
        # voxels with label > 0 are foreground; keep only every 10th voxel to save memory
        mask = all_data[-1] > 0
        voxels = list(modality[mask][::10])
        return voxels

    @staticmethod
    def _compute_stats(voxels):
        if len(voxels) == 0:
            return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan
        median = np.median(voxels)
        mean = np.mean(voxels)
        sd = np.std(voxels)
        mn = np.min(voxels)
        mx = np.max(voxels)
        percentile_99_5 = np.percentile(voxels, 99.5)
        percentile_00_5 = np.percentile(voxels, 0.5)
        return median, mean, sd, mn, mx, percentile_99_5, percentile_00_5
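
    # Context note (added, not original code): these statistics feed nnU-Net's
    # intensity normalization; for CT, images are typically clipped to the pooled
    # foreground 0.5/99.5 percentiles and z-scored with the pooled mean/sd that
    # collect_intensity_properties gathers below.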

    def collect_intensity_properties(self, num_modalities):
        if self.overwrite or not isfile(self.intensityproperties_file):
            p = Pool(self.num_processes)

            results = OrderedDict()
            for mod_id in range(num_modalities):
                results[mod_id] = OrderedDict()
                v = p.starmap(self._get_voxels_in_foreground, zip(self.patient_identifiers,
                                                                  [mod_id] * len(self.patient_identifiers)))

                # pool the foreground voxels of all cases for the global statistics
                w = []
                for iv in v:
                    w += iv

                median, mean, sd, mn, mx, percentile_99_5, percentile_00_5 = self._compute_stats(w)

                # the same statistics, computed per case
                local_props = p.map(self._compute_stats, v)
                props_per_case = OrderedDict()
                for i, pat in enumerate(self.patient_identifiers):
                    props_per_case[pat] = OrderedDict()
                    props_per_case[pat]['median'] = local_props[i][0]
                    props_per_case[pat]['mean'] = local_props[i][1]
                    props_per_case[pat]['sd'] = local_props[i][2]
                    props_per_case[pat]['mn'] = local_props[i][3]
                    props_per_case[pat]['mx'] = local_props[i][4]
                    props_per_case[pat]['percentile_99_5'] = local_props[i][5]
                    props_per_case[pat]['percentile_00_5'] = local_props[i][6]

                results[mod_id]['local_props'] = props_per_case
                results[mod_id]['median'] = median
                results[mod_id]['mean'] = mean
                results[mod_id]['sd'] = sd
                results[mod_id]['mn'] = mn
                results[mod_id]['mx'] = mx
                results[mod_id]['percentile_99_5'] = percentile_99_5
                results[mod_id]['percentile_00_5'] = percentile_00_5

            p.close()
            p.join()
            save_pickle(results, self.intensityproperties_file)
        else:
            results = load_pickle(self.intensityproperties_file)
        return results
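
    # Shape of the returned dict (derived from the code above): results[mod_id]
    # holds the pooled 'median', 'mean', 'sd', 'mn', 'mx', 'percentile_99_5' and
    # 'percentile_00_5' for that modality, plus 'local_props' mapping each patient
    # identifier to the same statistics computed per case.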

    def analyze_dataset(self, collect_intensityproperties=True):
        # sizes and spacings of all cases after cropping
        sizes, spacings = self.get_sizes_and_spacings_after_cropping()

        # all label ids except background (0); dataset.json maps label id strings
        # to label names, so the class ids are the keys
        classes = self.get_classes()
        all_classes = [int(i) for i in classes.keys() if int(i) > 0]

        # modalities
        modalities = self.get_modalities()

        # foreground intensity statistics (global and per case)
        if collect_intensityproperties:
            intensityproperties = self.collect_intensity_properties(len(modalities))
        else:
            intensityproperties = None

        size_reductions = self.get_size_reduction_by_cropping()

        dataset_properties = dict()
        dataset_properties['all_sizes'] = sizes
        dataset_properties['all_spacings'] = spacings
        dataset_properties['all_classes'] = all_classes
        dataset_properties['modalities'] = modalities
        dataset_properties['intensityproperties'] = intensityproperties
        dataset_properties['size_reductions'] = size_reductions

        save_pickle(dataset_properties, join(self.folder_with_cropped_data, "dataset_properties.pkl"))
        return dataset_properties
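

# Minimal usage sketch (added for illustration, not part of the original module).
# "Task002_Heart" is a placeholder; any task folder under nnUNet_cropped_data that
# was produced by nnunet.preprocessing.cropping should work the same way.
if __name__ == "__main__":
    cropped_out_dir = join(nnUNet_cropped_data, "Task002_Heart")
    analyzer = DatasetAnalyzer(cropped_out_dir, overwrite=False, num_processes=default_num_threads)
    props = analyzer.analyze_dataset(collect_intensityproperties=True)
    # dataset_properties.pkl has now been written next to the cropped data
    print(props['all_classes'], props['modalities'])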