maxmo2009 committed
Commit da9fb1e · verified · 1 Parent(s): 48c2317

Initial upload: data cleanup pipeline for 12 medical imaging datasets


Data engineering scripts that standardize CT/MRI/PET datasets into unified NIfTI format. Excludes raw datasets, run logs, and intermediate test outputs. Includes per-dataset dataclean_*.py, shared util.py, config_format.json schemas, and small demo/sample NIfTI files.
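For orientation, a sketch of the processed layout the scripts below converge on (paths and case IDs are illustrative; AbdomenAtlas cases use BDMAP_* IDs):

    DATASETS_processed/<Dataset>/
        nifti_mappings.json                      # per-case metadata keyed by output image path
        failed_files.json                        # inputs that failed loading or size checks
        <case_id>/<case_id>.nii.gz               # clamped, isotropically resampled image
        <case_id>/segmentation/<tissue>.nii.gz   # one resampled mask per organ label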

This view is limited to 50 files because it contains too many changes.

Files changed (50)
  1. AbdomenAtlas/config_format.json +125 -0
  2. AbdomenAtlas/dataclean_abdomen_atlas.py +415 -0
  3. AbdomenAtlas/dataclean_abdomen_atlas_update_json.py +501 -0
  4. AbdomenAtlas/dataclean_abdomen_atlas_v2.py +477 -0
  5. AbdomenAtlas/util.py +410 -0
  6. AbdomenAtlas/xx_update.py +518 -0
  7. AbdomenCT1k/config_format.json +125 -0
  8. AbdomenCT1k/dataclean_abdomen_ct_1k.py +365 -0
  9. AbdomenCT1k/util.py +410 -0
  10. CLAUDE.md +71 -0
  11. MnM2_clean/config_format.json +124 -0
  12. MnM2_clean/dataclean_MnM2.py +427 -0
  13. MnM2_clean/dataclean_MnM2_v2.py +432 -0
  14. MnM2_clean/dataclean_MnM2_v3.py +451 -0
  15. MnM2_clean/util.py +406 -0
  16. MnMs_clean/config_format.json +124 -0
  17. MnMs_clean/dataclean_MnMs.py +484 -0
  18. MnMs_clean/util.py +406 -0
  19. OAISIS_clean/config_format.json +125 -0
  20. OAISIS_clean/dataclean_OASIS_1_CS_Sectional.py +358 -0
  21. OAISIS_clean/dataclean_OASIS_1_CS_Sectional_Unmask.py +359 -0
  22. OAISIS_clean/dataclean_OASIS_1_CS_Sectional_raw.py +280 -0
  23. OAISIS_clean/dataclean_OASIS_2_Longitudinal_raw.py +283 -0
  24. OAISIS_clean/dataclean_OASIS_2_Longitudinal_raw_v2.py +345 -0
  25. OAISIS_clean/oasis2_longitudinal_demographics.csv +374 -0
  26. OAISIS_clean/oasis_cross-sectional-5708aa0a98d82080.csv +437 -0
  27. OAISIS_clean/util.py +410 -0
  28. OAI_ZIB_clean/config_format.json +125 -0
  29. OAI_ZIB_clean/dataclean_OAI_ZIB.py +309 -0
  30. OAI_ZIB_clean/util.py +413 -0
  31. PSMA_clean/config_format.json +125 -0
  32. PSMA_clean/dataclean_PSMA_Longitudinal.py +380 -0
  33. PSMA_clean/dataclean_PSMA_Longitudinal_v2.py +450 -0
  34. PSMA_clean/dataclean_PSMA_petct.py +525 -0
  35. PSMA_clean/dataclean_PSMA_petct_v2.py +423 -0
  36. PSMA_clean/dataclean_PSMA_petct_v2_json.py +425 -0
  37. PSMA_clean/demo.py +451 -0
  38. PSMA_clean/demo/inputsTr/9c838d2e45.csv +11 -0
  39. PSMA_clean/demo/inputsTr/9c838d2e45_BL_00.json +90 -0
  40. PSMA_clean/demo/inputsTr/9c838d2e45_BL_img_BL_img_00.nii.gz +3 -0
  41. PSMA_clean/demo/inputsTr/9c838d2e45_BL_mask_BL_img_00.nii.gz +3 -0
  42. PSMA_clean/demo/inputsTr/9c838d2e45_FU_00.json +74 -0
  43. PSMA_clean/demo/inputsTr/9c838d2e45_FU_01.json +26 -0
  44. PSMA_clean/demo/inputsTr/9c838d2e45_FU_img_FU_img_00.nii.gz +3 -0
  45. PSMA_clean/demo/inputsTr/9c838d2e45_FU_img_FU_img_01.nii.gz +3 -0
  46. PSMA_clean/demo/targetsTr/9c838d2e45_FU_mask_FU_img_00.nii.gz +3 -0
  47. PSMA_clean/demo/targetsTr/9c838d2e45_FU_mask_FU_img_01.nii.gz +3 -0
  48. PSMA_clean/sample/9c838d2e45/9c838d2e45_BL_00.nii.gz +3 -0
  49. PSMA_clean/sample/9c838d2e45/segmentation/9c838d2e45_BL_00.nii.gz +3 -0
  50. PSMA_clean/sample/failed_files.json +1 -0
AbdomenAtlas/config_format.json ADDED
@@ -0,0 +1,125 @@
+ {
+     "Modality": {
+         "type": "option",
+         "required": true,
+         "options": [
+             "CT",
+             "MRI",
+             "T1",
+             "T2",
+             "X-ray",
+             "Fluoroscopy",
+             "US",
+             "PET"
+         ]
+     },
+     "OriImg_path": {
+         "type": "string",
+         "required": true
+     },
+     "Label_path": {
+         "type": "dict",
+         "required": false,
+         "keys": [
+             "classification",
+             "segmentation",
+             "regression",
+             "detection",
+             "localization",
+             "registration",
+             "other"
+         ],
+         "value": {
+             "type": "dict",
+             "required": false,
+             "keys": [
+                 "lung",
+                 "liver",
+                 "heart",
+                 "brain",
+                 "kidney"
+             ],
+             "value": {
+                 "type": "string",
+                 "required": false
+             }
+         }
+     },
+     "ROI": {
+         "type": "option",
+         "required": false,
+         "options": [
+             "chest-abdomen",
+             "abdomen-pelvis",
+             "head",
+             "neck",
+             "skeleton",
+             "chest",
+             "abdomen",
+             "shoulder",
+             "leg",
+             "arm",
+             "hand",
+             "foot",
+             "pelvis"
+         ]
+     },
+     "Label_tissue": {
+         "type": "list",
+         "required": false,
+         "items": {
+             "type": "option",
+             "required": true,
+             "options": [
+                 "lung",
+                 "liver",
+                 "heart",
+                 "brain",
+                 "kidney",
+                 "spleen",
+                 "pancreas",
+                 "stomach",
+                 "intestine",
+                 "muscle",
+                 "bone"
+             ]
+         }
+     },
+     "Task": {
+         "type": "list",
+         "required": false,
+         "items": {
+             "type": "option",
+             "required": true,
+             "options": [
+                 "classification",
+                 "segmentation"
+             ]
+         }
+     },
+     "Spacing_mm": {
+         "type": "float",
+         "required": true
+     },
+     "Size": {
+         "type": "list",
+         "required": true,
+         "items": {
+             "type": "int",
+             "required": true
+         }
+     },
+     "Dataset_name": {
+         "type": "string",
+         "required": true
+     },
+
+     "Sub_modality": {
+         "type": "dict",
+         "required": false
+     },
+     "Label_Dict": {
+         "type": "dict",
+         "required": false
+     }
+ }
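
The schema above is declarative ("type", "required", "options", nested "keys"/"value"). A minimal validator sketch, assuming only this structure; validate_field and validate_meta are hypothetical helpers written for illustration, not functions shipped in this repo:

    import json

    def validate_field(value, spec):
        """Check one metadata value against a config_format.json field spec."""
        t = spec["type"]
        if t == "option":
            return value in spec["options"]
        if t == "string":
            return isinstance(value, str)
        if t == "float":
            return isinstance(value, (int, float))
        if t == "int":
            return isinstance(value, int)
        if t == "list":
            return isinstance(value, list) and all(validate_field(v, spec["items"]) for v in value)
        if t == "dict":
            if not isinstance(value, dict):
                return False
            keys_ok = all(k in spec.get("keys", value.keys()) for k in value)
            vals_ok = all(validate_field(v, spec["value"]) for v in value.values()) if "value" in spec else True
            return keys_ok and vals_ok
        return False

    def validate_meta(meta, schema):
        """Return a list of fields that are missing or malformed."""
        errors = []
        for name, spec in schema.items():
            if name not in meta:
                if spec.get("required"):
                    errors.append(f"missing required field: {name}")
                continue
            if not validate_field(meta[name], spec):
                errors.append(f"invalid value for field: {name}")
        return errors

    schema = json.load(open("AbdomenAtlas/config_format.json"))
    entry = {"Modality": "CT", "OriImg_path": "/data/ct.nii.gz", "Spacing_mm": 0.8,
             "Size": [512, 512, 750], "Dataset_name": "AbdomenAtlas"}
    print(validate_meta(entry, schema))  # -> [] when the entry satisfies the schema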
AbdomenAtlas/dataclean_abdomen_atlas.py ADDED
@@ -0,0 +1,415 @@
+ #coding:utf-8
+ '''
+ written by ygq
+ created on 2025-8-18
+ AbdomenAtlas 3.0 data clean update
+
+ https://arxiv.org/pdf/2407.16697
+ https://zhuanlan.zhihu.com/p/19339643417
+
+ AbdomenAtlas 3.0 is currently the largest public abdominal CT image-text paired dataset, built to tackle tumor detection and report generation in medical imaging.
+ It contains 9,262 3D CT scans from 88 medical institutions across 19 countries, and is the first public dataset to provide per-voxel annotations, detailed tumor reports, and tumor staging information.
+ The CT scans are stored in standard medical imaging formats (NIfTI and DICOM) with clinical information such as voxel spacing and HU values. AbdomenAtlas 3.0 consolidates and re-annotates 17 public datasets; reviewed by 12 radiologists, it annotates 8,562 tumor instances in total, including 3,036 liver tumors, 354 pancreatic tumors, and 4,239 kidney tumors. The dataset also contains 2,947 tumor reports, of which 948 cover early-stage tumors (≤2 cm) and 260 provide T staging (T1-T4) for pancreatic tumors; it is the first to publish per-voxel annotations of 8 liver sub-segments and 3 pancreatic sub-segments, along with annotations of tumor contact with key vessels (e.g. SMA, CA).
+ Structured and narrative reports generated automatically by RadGPT describe tumor size, shape, location, volume, and interaction with surrounding vessels and organs. The accuracy of the generated reports has been validated: for detecting small tumors (≤2 cm), RadGPT's sensitivity/specificity clearly outperforms existing methods (e.g. liver: 80%/73%, pancreas: 77%/77%). The dataset also includes 240 "human-AI fusion reports" combining radiologists' clinical notes with precise AI quantification. The significance of AbdomenAtlas 3.0 is that it is the first comprehensive abdominal CT image-text paired dataset, filling the gap in publicly available abdominal tumor detection data and laying the groundwork for automated tumor detection, staging, and report generation in medical imaging. Beyond its scale and diversity, it combines AI with radiologists' expertise to provide high-quality annotations and diagnostic support, which should improve the practical clinical applicability of AI models for medical image analysis.
+
+ Dataset statistics
+ Total volume:
+     9,262 3D CT scans from 88 medical institutions across 19 countries.
+     8,562 tumor instances:
+         liver tumors: 3,036 instances (929 reports)
+         pancreatic tumors: 354 instances (344 reports)
+         kidney tumors: 4,239 instances (1,674 reports)
+     6,061 tumor-free reports (control group)
+ Small tumors (≤2 cm):
+     943 small-tumor reports:
+         liver: 347 instances (37.4% of liver tumors)
+         pancreas: 83 instances (24.1% of pancreatic tumors)
+         kidney: 466 instances (27.8% of kidney tumors)
+ Tumor staging and anatomy:
+     260 pancreatic tumor staging reports (T1-T4)
+     per-voxel segmentation of 8 liver sub-segments and 3 pancreatic sub-segments (head, body, tail)
+     annotated contact angles between tumors and key vessels (e.g. SMA, CA, CHA)
+ Image-text pairing:
+     1.8M text tokens across three report types:
+         structured reports: template-generated, with quantitative information (tumor volume, location, etc.)
+         narrative reports: converted by an LLM to mimic the target hospital's reporting style
+         human-AI fusion reports: 240, combining clinical notes with AI-generated content
+
+ In the AbdomenAtlas dataset, each case's segmentations folder contains annotation files for 25 organs/tissues, together with a combined_labels.nii.gz file (whose values, including background, span 0-25):
+     1  aorta
+     2  gall_bladder
+     3  kidney_left
+     4  kidney_right
+     5  liver
+     6  pancreas
+     7  postcava
+     8  spleen
+     9  stomach
+     10 adrenal_gland_left
+     11 adrenal_gland_right
+     12 bladder
+     13 celiac_trunk
+     14 colon
+     15 duodenum
+     16 esophagus
+     17 femur_left
+     18 femur_right
+     19 hepatic_vessel
+     20 intestine
+     21 lung_left
+     22 lung_right
+     23 portal_vein_and_splenic_vein
+     24 prostate
+     25 rectum
+
+ Following TotalSegmentator, store the processed label data for each of the 25 organs separately.
+ '''
+ import os
+ import glob
+ import pandas as pd
+ import SimpleITK as sitk
+ import argparse
+ import json
+ from tqdm import tqdm
+ from util import meta_data
+ import util
+ import numpy as np
+ # from bert_helper import *
+
+ # model_name = "bert-large-uncased"
+ # reduce_method = 'mean'
+ # max_words_num = 32 # max number of words in the caption > 2
+
+ # embeder, tokenizer = get_frozen_embeder(model_name)
+
+ # string1 = "modality: ct, gender: female, age: 51, roi: abdomen"
+ # embeder_output1 = str2emb(string1, max_words_num, embeder, tokenizer, reduce_method=reduce_method)
+
+ # string2 = "modality: ct, gender: female, age: 50, roi: head"
+
+ # embeder_output2 = str2emb(string2, max_words_num, embeder, tokenizer, reduce_method=reduce_method)
+
+ # input_size = embeder.config.vocab_size
+ # in_size = embeder.config.hidden_size
+
+ # print(embeder, input_size, in_size)
+ # print(tokenizer)
+
+ # print(embeder_output1)
+ # print(embeder_output1.shape) # torch.Size([1, 8, 768])
+
+ # print(embeder_output2)
+ # print(embeder_output2.shape) # torch.Size([1, 8, 768])
+
+ # error = torch.abs(embeder_output1 - embeder_output2)
+ # print(error)
+ # print("Embedding distance between the two sentences: ")
+ # print(f"String1: {string1}")
+ # print(f"String2: {string2}")
+ # print(torch.mean(error))
+
+ # exit()
+
+ # meta_id_name='Patient'
+ # meta_weeks_name='Weeks'
+ # meta_fvc_name='FVC'
+ # meta_percent_name='Percent'
+ # meta_age_name='Age'
+ # meta_sex_name='Sex'
+ # meta_status_name='SmokingStatus'
+
+ TASK_VALUE = "segmentation"
+ CLAMP_RANGE_CT = [-300, 300]
+ CLAMP_RANGE_MRI = [-1, 0]  # MRI intensity threshold placeholder, TBC...
+
+ LABEL_DICT = {
+     "0": "background",
+     "1": "aorta",
+     "2": "gall_bladder",
+     "3": "kidney_left",
+     "4": "kidney_right",
+     "5": "liver",
+     "6": "pancreas",
+     "7": "postcava",
+     "8": "spleen",
+     "9": "stomach",
+     "10": "adrenal_gland_left",
+     "11": "adrenal_gland_right",
+     "12": "bladder",
+     "13": "celiac_trunk",
+     "14": "colon",
+     "15": "duodenum",
+     "16": "esophagus",
+     "17": "femur_left",
+     "18": "femur_right",
+     "19": "hepatic_vessel",
+     "20": "intestine",
+     "21": "lung_left",
+     "22": "lung_right",
+     "23": "portal_vein_and_splenic_vein",
+     "24": "prostate",
+     "25": "rectum"
+ }
+
+ # def find_metadata_files(path):
+ #     # for Cancer Image Archive (TCIA) dataset
+ #     search_pattern = os.path.join(path, '**', 'metadata.csv')
+ #     return glob.glob(search_pattern, recursive=True)
+
+ def find_metadata_files(path):
+     # for Cancer Image Archive (TCIA) dataset
+     search_pattern = os.path.join(path, '*.csv')
+     return glob.glob(search_pattern, recursive=True)
+
+ ## added by yanguoqing on 20250527
+ def find_image_dirs(path):
+     return os.listdir(path)
+
+ ## modified by yanguoqing on 20250527
+ def load_dicom_images(folder_path):
+     reader = sitk.ImageSeriesReader()
+     dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
+     reader.SetFileNames(dicom_names)
+     image = reader.Execute()
+     return dicom_names, image
+
+ ## added by yanguoqing on 20250527
+ def load_dicom_tag(imgs):
+     reader = sitk.ImageFileReader()
+     # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
+     reader.SetFileName(imgs)
+     reader.ReadImageInformation()  # read only the meta information, without loading pixel data
+     # metadata_keys = reader.GetMetaDataKeys()
+     tag = reader.Execute()
+     return tag
+
+ def load_nrrd(fp):
+     return sitk.ReadImage(fp)
+
+ def save_nifti(image, output_path, folder_path):
+     output_dirpath = os.path.dirname(output_path)
+     if not os.path.exists(output_dirpath):
+         print(f"Creating directory {output_dirpath}")
+         os.makedirs(output_dirpath)
+     # Set metadata in the NIfTI file's header
+     image.SetMetaData("FolderPath", folder_path)
+     sitk.WriteImage(image, output_path)
+
+ ## modified by yanguoqing on 20250527
+ def convert_windows_to_linux_path(windows_path):
+     # Replace backslashes with forward slashes and remove the drive letter
+     # Some meta files have windows paths, but the data is stored on a linux server
+     linux_path = windows_path.replace('\\', '/')
+     if ':' in linux_path:
+         linux_path = linux_path.split(':', 1)[1]
+     return linux_path
+
+ def main(target_path, output_dir):
+     metadata_files = find_metadata_files(target_path)
+     pid_dirs = find_image_dirs(target_path)
+     failed_files = []
+     if not os.path.isdir(output_dir):
+         os.makedirs(output_dir)
+     json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
+     failed_files_path = os.path.join(output_dir, 'failed_files.json')
+     meta = meta_data()
+
+     # Initialize the JSON file
+     if not os.path.exists(json_output_path):
+         with open(json_output_path, 'w') as json_file:
+             json.dump({}, json_file)
+
+     if pid_dirs:
+         for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
+             if not os.path.isdir(os.path.join(target_path, pid_dir)):
+                 continue
+             if not pid_dir.startswith("BDMAP_"):
+                 continue
+
+             meta_file = os.path.join(target_path, '%s.csv' % pid_dir)
+             if os.path.isfile(meta_file):
+                 mf_flag = True
+                 df_meta = pd.read_csv(meta_file, sep=',')
+             else:
+                 mf_flag = False
+
+             full_path = os.path.join(target_path, pid_dir, "ct.nii.gz")
+
+             if not os.path.isfile(full_path):
+                 continue
+             try:
+                 print(full_path)
+
+                 dicom_image = util.load_nifti(full_path)
+                 spacing_info = dicom_image.GetSpacing()
+                 print('SPACING INFO:', spacing_info)
+
+                 # metadata_keys = dicom_image.GetMetaDataKeys()
+
+                 # dtag = load_dicom_tag(dicom_fp[0])
+                 # uid = dtag.GetMetaData('0020|000e')  ## Series Instance UID
+                 # modality = dtag.GetMetaData('0008|0060')  ## Modality
+                 uid = pid_dir
+                 modality = "CT"
+                 study = 'AbdomenAtlas'  ## Dataset_name
+                 CIA_other_info = {
+                     'Study_UID': uid,
+                     'metadata_file': ''
+                     # 'Series_Description': series_desc
+                 }
+                 CIA_other_info['split'] = "train"
+                 if mf_flag:
+                     CIA_other_info['metadata_file'] = meta_file
+
+                 size = list(dicom_image.GetSize())
+                 resampler = util.get_unisize_resampler(dicom_image, interpolator='linear', spacing=spacing_info, size=size)
+
+                 # resize the image
+                 if resampler is not None:
+                     proces_image = resampler.Execute(dicom_image)
+                     print('SPACING INFO AFTER', proces_image.GetSpacing())
+                     CIA_other_info['Resample'] = True
+                 else:
+                     proces_image = dicom_image
+                     CIA_other_info['Resample'] = False
+
+                 ##
+                 # CIA_other_info['Image_id'] = meta_image_id
+                 # CIA_other_info['Weeks'] = str(meta_weeks)
+                 # CIA_other_info['FVC'] = str(meta_fvc)
+                 # CIA_other_info['Percent'] = str(meta_percent)
+                 # CIA_other_info['Age'] = str(meta_age)
+                 # CIA_other_info['Sex'] = meta_sex
+                 # CIA_other_info['Smoke_Status'] = meta_status
+                 # threshold the image
+                 if 'CT' in modality:
+                     proces_image = util.clamp_image(proces_image, CLAMP_RANGE_CT)
+                 else:
+                     pass
+
+                 output_path = os.path.join(output_dir, uid, f"{uid}.nii.gz")
+                 # output_path = convert_windows_to_linux_path(output_path)
+                 save_nifti(proces_image, output_path, full_path)
+                 print(f"Saved NIfTI file to {output_path}")
+
+                 ## segment
+                 label_path_dict = {}
+                 label_flag = True
+
+                 label_paths = os.path.join(target_path, pid_dir, 'segmentations')
+                 label_files = glob.glob("%s/*.nii.gz" % (label_paths))
+                 # print(label_paths, label_files)
+                 if len(label_files) > 0:
+                     for lf in label_files:
+                         lf_name = os.path.basename(lf)
+
+                         lf_tissue = lf_name.replace(".nii.gz", "")
+                         label_image = load_nrrd(lf)
+                         resampler = util.get_unisize_resampler(label_image, interpolator='nearest', spacing=spacing_info, size=size)
+                         if resampler is not None:
+                             proces_label = resampler.Execute(label_image)
+                         else:
+                             proces_label = label_image
+
+                         # print(proces_image.GetSize(), proces_label.GetSize())
+                         try:
+                             assert proces_image.GetSize() == proces_label.GetSize()
+                         except Exception as e:
+                             failed_files.append(lf)
+                             continue
+
+                         label_output_path = os.path.join(output_dir, uid, TASK_VALUE, f"{lf_tissue}.nii.gz")
+
+                         label_path_dict[lf_tissue] = label_output_path
+                         util.save_nifti(proces_label, label_output_path, lf)
+                         print(f"Saved Label Segment NIfTI file to {label_output_path}")
+
+                 else:
+                     label_flag = False
+             except RuntimeError:
+                 failed_files.append(full_path)
+                 print(f"Failed to load DICOM images from {full_path}")
+                 continue
+
+             '''
+             meta.add_keyvalue('Image_id', meta_image_id)
+             meta.add_keyvalue('Weeks', meta_weeks)
+             meta.add_keyvalue('FVC', meta_fvc)
+             meta.add_keyvalue('Percent', meta_percent)
+             meta.add_keyvalue('Age', meta_age)
+             meta.add_keyvalue('Sex', meta_sex)
+             meta.add_keyvalue('Smoke_Status', meta_status)
+             '''
+
+             size_processed = list(proces_image.GetSize())
+
+             meta_image_id = uid
+             # meta.add_keyvalue('Image_id', meta_image_id)
+             meta.add_keyvalue('Spacing_mm', min(spacing_info))
+             meta.add_keyvalue('OriImg_path', full_path)
+             meta.add_keyvalue('Size', size_processed)  # use the post-processing size here -- YH Jachin
+             meta.add_keyvalue('Modality', modality)
+             meta.add_keyvalue('Dataset_name', study)
+             meta.add_keyvalue('ROI', 'abdomen')
+
+             if label_flag:
+                 # print(label_path_dict.keys())
+                 meta.add_keyvalue('Task', TASK_VALUE)
+                 # meta.add_keyvalue('Label_tissue', list(label_path_dict.keys()))
+                 meta.add_keyvalue('Label_path', {TASK_VALUE: label_path_dict})
+
+             # meta.add_keyvalue('Label_Dict', LABEL_DICT)
+
+             meta.add_extra_keyvalue('Metadata', CIA_other_info)
+
+             # Write the mapping to the JSON file on the fly
+             with open(json_output_path, 'r+') as json_file:
+                 existing_mappings = json.load(json_file)
+                 existing_mappings[output_path] = meta.get_meta_data()
+                 json_file.seek(0)
+                 json.dump(existing_mappings, json_file, indent=4)
+                 json_file.truncate()
+     else:
+         print("No metadata.csv files found.")
+
+     with open(failed_files_path, "w") as json_file:
+         json.dump(failed_files, json_file)
+
+     print(f"The list has been written to {failed_files_path}")
+     print(f"Saved NIfTI mappings to {json_output_path}")
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Process NIIGZ files and save as NIfTI.")
+     parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/AbdomenAtlas/uncompressed2")
+     parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/AbdomenAtlas/")
+     args = parser.parse_args()
+     print(args.target_path, args.output_dir)
+     main(args.target_path, args.output_dir)
+
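
A hedged usage sketch: invoked as `python dataclean_abdomen_atlas.py --target_path ... --output_dir ...` (or with the defaults above), the script appends one entry per case to nifti_mappings.json. An illustrative entry, with field values invented for the example but keys taken from the meta.add_keyvalue calls above:

    {
      ".../AbdomenAtlas/BDMAP_00000001/BDMAP_00000001.nii.gz": {
        "Spacing_mm": 0.8,
        "OriImg_path": ".../uncompressed2/BDMAP_00000001/ct.nii.gz",
        "Size": [512, 512, 750],
        "Modality": "CT",
        "Dataset_name": "AbdomenAtlas",
        "ROI": "abdomen",
        "Task": "segmentation",
        "Label_path": {"segmentation": {"liver": ".../BDMAP_00000001/segmentation/liver.nii.gz"}},
        "Metadata": {"Study_UID": "BDMAP_00000001", "metadata_file": "", "split": "train", "Resample": true}
      }
    }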
AbdomenAtlas/dataclean_abdomen_atlas_update_json.py ADDED
@@ -0,0 +1,501 @@
+ #coding:utf-8
+ '''
+ written by ygq
+ created on 2025-8-18
+ AbdomenAtlas 3.0 data clean update
+
+ https://arxiv.org/pdf/2407.16697
+ https://zhuanlan.zhihu.com/p/19339643417
+
+ AbdomenAtlas 3.0 is currently the largest public abdominal CT image-text paired dataset, built to tackle tumor detection and report generation in medical imaging.
+ It contains 9,262 3D CT scans from 88 medical institutions across 19 countries, and is the first public dataset to provide per-voxel annotations, detailed tumor reports, and tumor staging information.
+ The CT scans are stored in standard medical imaging formats (NIfTI and DICOM) with clinical information such as voxel spacing and HU values. AbdomenAtlas 3.0 consolidates and re-annotates 17 public datasets; reviewed by 12 radiologists, it annotates 8,562 tumor instances in total, including 3,036 liver tumors, 354 pancreatic tumors, and 4,239 kidney tumors. The dataset also contains 2,947 tumor reports, of which 948 cover early-stage tumors (≤2 cm) and 260 provide T staging (T1-T4) for pancreatic tumors; it is the first to publish per-voxel annotations of 8 liver sub-segments and 3 pancreatic sub-segments, along with annotations of tumor contact with key vessels (e.g. SMA, CA).
+ Structured and narrative reports generated automatically by RadGPT describe tumor size, shape, location, volume, and interaction with surrounding vessels and organs. The accuracy of the generated reports has been validated: for detecting small tumors (≤2 cm), RadGPT's sensitivity/specificity clearly outperforms existing methods (e.g. liver: 80%/73%, pancreas: 77%/77%). The dataset also includes 240 "human-AI fusion reports" combining radiologists' clinical notes with precise AI quantification. The significance of AbdomenAtlas 3.0 is that it is the first comprehensive abdominal CT image-text paired dataset, filling the gap in publicly available abdominal tumor detection data and laying the groundwork for automated tumor detection, staging, and report generation in medical imaging. Beyond its scale and diversity, it combines AI with radiologists' expertise to provide high-quality annotations and diagnostic support, which should improve the practical clinical applicability of AI models for medical image analysis.
+
+ Dataset statistics
+ Total volume:
+     9,262 3D CT scans from 88 medical institutions across 19 countries.
+     8,562 tumor instances:
+         liver tumors: 3,036 instances (929 reports)
+         pancreatic tumors: 354 instances (344 reports)
+         kidney tumors: 4,239 instances (1,674 reports)
+     6,061 tumor-free reports (control group)
+ Small tumors (≤2 cm):
+     943 small-tumor reports:
+         liver: 347 instances (37.4% of liver tumors)
+         pancreas: 83 instances (24.1% of pancreatic tumors)
+         kidney: 466 instances (27.8% of kidney tumors)
+ Tumor staging and anatomy:
+     260 pancreatic tumor staging reports (T1-T4)
+     per-voxel segmentation of 8 liver sub-segments and 3 pancreatic sub-segments (head, body, tail)
+     annotated contact angles between tumors and key vessels (e.g. SMA, CA, CHA)
+ Image-text pairing:
+     1.8M text tokens across three report types:
+         structured reports: template-generated, with quantitative information (tumor volume, location, etc.)
+         narrative reports: converted by an LLM to mimic the target hospital's reporting style
+         human-AI fusion reports: 240, combining clinical notes with AI-generated content
+
+ In the AbdomenAtlas dataset, each case's segmentations folder contains annotation files for 25 organs/tissues, together with a combined_labels.nii.gz file (whose values, including background, span 0-25):
+     1  aorta
+     2  gall_bladder
+     3  kidney_left
+     4  kidney_right
+     5  liver
+     6  pancreas
+     7  postcava
+     8  spleen
+     9  stomach
+     10 adrenal_gland_left
+     11 adrenal_gland_right
+     12 bladder
+     13 celiac_trunk
+     14 colon
+     15 duodenum
+     16 esophagus
+     17 femur_left
+     18 femur_right
+     19 hepatic_vessel
+     20 intestine
+     21 lung_left
+     22 lung_right
+     23 portal_vein_and_splenic_vein
+     24 prostate
+     25 rectum
+
+ Following TotalSegmentator, store the processed label data for each of the 25 organs separately.
+ '''
+ import os
+ import glob
+ import pandas as pd
+ import SimpleITK as sitk
+ import argparse
+ import json
+ from tqdm import tqdm
+ from util import meta_data
+ import util
+ import numpy as np
+ # from bert_helper import *
+
+ # model_name = "bert-large-uncased"
+ # reduce_method = 'mean'
+ # max_words_num = 32 # max number of words in the caption > 2
+
+ # embeder, tokenizer = get_frozen_embeder(model_name)
+
+ # string1 = "modality: ct, gender: female, age: 51, roi: abdomen"
+ # embeder_output1 = str2emb(string1, max_words_num, embeder, tokenizer, reduce_method=reduce_method)
+
+ # string2 = "modality: ct, gender: female, age: 50, roi: head"
+
+ # embeder_output2 = str2emb(string2, max_words_num, embeder, tokenizer, reduce_method=reduce_method)
+
+ # input_size = embeder.config.vocab_size
+ # in_size = embeder.config.hidden_size
+
+ # print(embeder, input_size, in_size)
+ # print(tokenizer)
+
+ # print(embeder_output1)
+ # print(embeder_output1.shape) # torch.Size([1, 8, 768])
+
+ # print(embeder_output2)
+ # print(embeder_output2.shape) # torch.Size([1, 8, 768])
+
+ # error = torch.abs(embeder_output1 - embeder_output2)
+ # print(error)
+ # print("Embedding distance between the two sentences: ")
+ # print(f"String1: {string1}")
+ # print(f"String2: {string2}")
+ # print(torch.mean(error))
+
+ # exit()
+
+ # meta_id_name='Patient'
+ # meta_weeks_name='Weeks'
+ # meta_fvc_name='FVC'
+ # meta_percent_name='Percent'
+ # meta_age_name='Age'
+ # meta_sex_name='Sex'
+ # meta_status_name='SmokingStatus'
+
+ TASK_VALUE = "segmentation"
+ CLAMP_RANGE_CT = [-300, 300]
+ CLAMP_RANGE_MRI = [-1, 0]  # MRI intensity threshold placeholder, TBC...
+
+ ## organ volume thresholds (ml) for deciding whether a region is validly covered by the scan
+ LUNG_VOL_THRESH = 1000
+ FEMUR_VOL_THRESH = 80
+ KIDNEY_VOL_THRESH = 100
+ ROI = "abdomen"
+
+ PROCESS_FLAG = True
+
+ LABEL_DICT = {
+     "0": "background",
+     "1": "aorta",
+     "2": "gall_bladder",
+     "3": "kidney_left",
+     "4": "kidney_right",
+     "5": "liver",
+     "6": "pancreas",
+     "7": "postcava",
+     "8": "spleen",
+     "9": "stomach",
+     "10": "adrenal_gland_left",
+     "11": "adrenal_gland_right",
+     "12": "bladder",
+     "13": "celiac_trunk",
+     "14": "colon",
+     "15": "duodenum",
+     "16": "esophagus",
+     "17": "femur_left",
+     "18": "femur_right",
+     "19": "hepatic_vessel",
+     "20": "intestine",
+     "21": "lung_left",
+     "22": "lung_right",
+     "23": "portal_vein_and_splenic_vein",
+     "24": "prostate",
+     "25": "rectum"
+ }
+
+ # def find_metadata_files(path):
+ #     # for Cancer Image Archive (TCIA) dataset
+ #     search_pattern = os.path.join(path, '**', 'metadata.csv')
+ #     return glob.glob(search_pattern, recursive=True)
+
+ def find_metadata_files(path):
+     # for Cancer Image Archive (TCIA) dataset
+     search_pattern = os.path.join(path, '*.csv')
+     return glob.glob(search_pattern, recursive=True)
+
+ ## added by yanguoqing on 20250527
+ def find_image_dirs(path):
+     return os.listdir(path)
+
+ ## modified by yanguoqing on 20250527
+ def load_dicom_images(folder_path):
+     reader = sitk.ImageSeriesReader()
+     dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
+     reader.SetFileNames(dicom_names)
+     image = reader.Execute()
+     return dicom_names, image
+
+ ## added by yanguoqing on 20250527
+ def load_dicom_tag(imgs):
+     reader = sitk.ImageFileReader()
+     # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
+     reader.SetFileName(imgs)
+     reader.ReadImageInformation()  # read only the meta information, without loading pixel data
+     # metadata_keys = reader.GetMetaDataKeys()
+     tag = reader.Execute()
+     return tag
+
+ def load_nrrd(fp):
+     return sitk.ReadImage(fp)
+
+ def save_nifti(image, output_path, folder_path):
+     output_dirpath = os.path.dirname(output_path)
+     if not os.path.exists(output_dirpath):
+         print(f"Creating directory {output_dirpath}")
+         os.makedirs(output_dirpath)
+     # Set metadata in the NIfTI file's header
+     image.SetMetaData("FolderPath", folder_path)
+     sitk.WriteImage(image, output_path)
+
+ ## modified by yanguoqing on 20250527
+ def convert_windows_to_linux_path(windows_path):
+     # Replace backslashes with forward slashes and remove the drive letter
+     # Some meta files have windows paths, but the data is stored on a linux server
+     linux_path = windows_path.replace('\\', '/')
+     if ':' in linux_path:
+         linux_path = linux_path.split(':', 1)[1]
+     return linux_path
+
+ def simpleitk_volume_calculation(image_path):
+     """
+     Use SimpleITK to streamline the volume computation: compute the lung volume, and
+     treat the thorax as validly covered if the left or right lung exceeds 400 (ml)
+     """
+     image = util.load_nifti(image_path)
+     # get the voxel dimensions
+     spacing = image.GetSpacing()
+     voxel_volume = spacing[0] * spacing[1] * spacing[2]  # mm³
+
+     # print(f"image size: {image.GetSize()}")
+     # print(f"voxel spacing: {spacing}")
+     # print(f"single voxel volume: {voxel_volume:.6f} mm³")
+     ## count the valid voxels
+     image_array2 = sitk.GetArrayFromImage(image)
+     valid_pixels = image_array2[image_array2 == 1].sum()
+     if valid_pixels < 10:
+         return 0
+     # simple threshold segmentation (the threshold may need adjusting for the data at hand)
+     segmented = sitk.BinaryThreshold(image, lowerThreshold=1, upperThreshold=1)
+
+     # count the voxels
+     statistics = sitk.LabelShapeStatisticsImageFilter()
+     statistics.Execute(segmented)
+
+     voxel_count = statistics.GetNumberOfPixels(1)
+     volume_mm3 = voxel_count * voxel_volume
+     volume_ml = volume_mm3 / 1000.0
+
+     # print(f"voxel count: {voxel_count}")
+     # print(f"organ volume: {volume_ml:.2f} mL")
+
+     return volume_ml
+
+ def main(target_path, output_dir):
+     metadata_files = find_metadata_files(target_path)
+     pid_dirs = find_image_dirs(target_path)
+     failed_files = []
+     if not os.path.isdir(output_dir):
+         os.makedirs(output_dir)
+     json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
+     failed_files_path = os.path.join(output_dir, 'failed_files.json')
+     # meta = meta_data()
+     with open(json_output_path, 'r') as fi:
+         fj = json.load(fi)
+     '''
+     # Initialize the JSON file
+     if not os.path.exists(json_output_path):
+         with open(json_output_path, 'w') as json_file:
+             json.dump({}, json_file)
+     '''
+     if pid_dirs:
+         for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
+             if not os.path.isdir(os.path.join(target_path, pid_dir)):
+                 continue
+             if not pid_dir.startswith("BDMAP_"):
+                 continue
+
+             meta_file = os.path.join(target_path, '%s.csv' % pid_dir)
+             if os.path.isfile(meta_file):
+                 mf_flag = True
+                 # df_meta = pd.read_csv(meta_file, sep=',')
+             else:
+                 mf_flag = False
+
+             full_path = os.path.join(target_path, pid_dir, "ct.nii.gz")
+
+             try:
+                 '''
+                 dicom_image = util.load_nifti(full_path)
+                 spacing_info = dicom_image.GetSpacing()
+                 print('SPACING INFO:', spacing_info)
+
+                 # metadata_keys = dicom_image.GetMetaDataKeys()
+
+                 # dtag = load_dicom_tag(dicom_fp[0])
+                 # uid = dtag.GetMetaData('0020|000e')  ## Series Instance UID
+                 # modality = dtag.GetMetaData('0008|0060')  ## Modality
+                 uid = pid_dir
+                 modality = "CT"
+                 study = 'AbdomenAtlas'  ## Dataset_name
+                 CIA_other_info = {
+                     'Study_UID': uid,
+                     'metadata_file': ''
+                     # 'Series_Description': series_desc
+                 }
+                 CIA_other_info['split'] = "train"
+                 if mf_flag:
+                     CIA_other_info['metadata_file'] = meta_file
+
+                 size = list(dicom_image.GetSize())
+                 resampler = util.get_unisize_resampler(dicom_image, interpolator='linear', spacing=spacing_info, size=size)
+
+                 # resize the image
+                 if resampler is not None:
+                     proces_image = resampler.Execute(dicom_image)
+                     print('SPACING INFO AFTER', proces_image.GetSpacing())
+                     CIA_other_info['Resample'] = True
+                 else:
+                     proces_image = dicom_image
+                     CIA_other_info['Resample'] = False
+
+                 # threshold the image
+                 if 'CT' in modality:
+                     proces_image = util.clamp_image(proces_image, CLAMP_RANGE_CT)
+                 else:
+                     pass
+
+                 output_path = os.path.join(output_dir, uid, f"{uid}.nii.gz")
+                 # output_path = convert_windows_to_linux_path(output_path)
+                 save_nifti(proces_image, output_path, full_path)
+                 print(f"Saved NIfTI file to {output_path}")
+                 '''
+                 ## segment
+                 label_path_dict = {}
+                 label_flag = True
+
+                 label_paths = os.path.join(target_path, pid_dir, 'segmentations')
+                 label_files = glob.glob("%s/*.nii.gz" % (label_paths))
+                 # print(label_paths, label_files)
+                 pelvis_flag = False
+                 thorax_flag = False
+                 kidney_flag = False
+                 if len(label_files) > 0:
+                     for lf in label_files:
+                         lf_name = os.path.basename(lf)
+
+                         lf_tissue = lf_name.replace(".nii.gz", "")
+
+                         if 'femur' in lf_tissue:
+                             vol_femur = simpleitk_volume_calculation(lf)
+                             print(lf_tissue, vol_femur)
+                             if vol_femur >= FEMUR_VOL_THRESH:
+                                 pelvis_flag = True
+                         if 'lung' in lf_tissue:
+                             vol_lung = simpleitk_volume_calculation(lf)
+                             print(lf_tissue, vol_lung)
+                             if vol_lung >= LUNG_VOL_THRESH:
+                                 thorax_flag = True
+                         if 'kidney_right' in lf_tissue:
+                             vol_kidney = simpleitk_volume_calculation(lf)
+                             print(lf_tissue, vol_kidney)
+                             if vol_kidney >= KIDNEY_VOL_THRESH:
+                                 kidney_flag = True
+
+                         '''
+                         label_image = load_nrrd(lf)
+                         resampler = util.get_unisize_resampler(label_image, interpolator='nearest', spacing=spacing_info, size=size)
+                         if resampler is not None:
+                             proces_label = resampler.Execute(label_image)
+                         else:
+                             proces_label = label_image
+
+                         # print(proces_image.GetSize(), proces_label.GetSize())
+                         try:
+                             assert proces_image.GetSize() == proces_label.GetSize()
+                         except Exception as e:
+                             failed_files.append(lf)
+                             continue
+
+                         label_output_path = os.path.join(output_dir, uid, TASK_VALUE, f"{lf_tissue}.nii.gz")
+
+                         label_path_dict[lf_tissue] = label_output_path
+                         util.save_nifti(proces_label, label_output_path, lf)
+                         print(f"Saved Label Segment NIfTI file to {label_output_path}")
+                         '''
+                 else:
+                     label_flag = False
+             except RuntimeError:
+                 failed_files.append(full_path)
+                 print(f"Failed to load DICOM images from {full_path}")
+                 continue
+
+             '''
+             meta.add_keyvalue('Image_id', meta_image_id)
+             meta.add_keyvalue('Weeks', meta_weeks)
+             meta.add_keyvalue('FVC', meta_fvc)
+             meta.add_keyvalue('Percent', meta_percent)
+             meta.add_keyvalue('Age', meta_age)
+             meta.add_keyvalue('Sex', meta_sex)
+             meta.add_keyvalue('Smoke_Status', meta_status)
+
+             size_processed = list(proces_image.GetSize())
+
+             meta_image_id = uid
+             # meta.add_keyvalue('Image_id', meta_image_id)
+             meta.add_keyvalue('Spacing_mm', min(spacing_info))
+             meta.add_keyvalue('OriImg_path', full_path)
+             meta.add_keyvalue('Size', size_processed)  # use the post-processing size here -- YH Jachin
+             meta.add_keyvalue('Modality', modality)
+             meta.add_keyvalue('Dataset_name', study)
+             '''
+             roi = 'abdomen'
+
+             if thorax_flag and kidney_flag:
+                 roi = 'thorax-' + roi
+             if thorax_flag and not kidney_flag:
+                 roi = 'thorax'
+             if pelvis_flag and kidney_flag:
+                 roi = roi + "-pelvis"
+             if pelvis_flag and not kidney_flag:
+                 roi = 'pelvis'
+             print(pid_dir, roi)
+             # meta.add_keyvalue('ROI', roi)
+             for ik in fj.keys():
+                 fi = fj[ik]
+                 jid = fi['Metadata']['Study_UID']
+                 if jid == pid_dir:
+                     # physical extent (m) along the longest axis; very long
+                     # thorax-abdomen-pelvis scans are relabeled whole-body
+                     max_length = fi['Spacing_mm'] * max(fi['Size']) * 0.001
+                     if roi == 'thorax-abdomen-pelvis' and max_length > 1.2:
+                         roi = 'whole-body'
+                     fj[ik]['ROI'] = roi
+                     print(jid, max_length, roi)
+                     break
+             else:
+                 continue
+
+             '''
+             if label_flag:
+                 # print(label_path_dict.keys())
+                 meta.add_keyvalue('Task', TASK_VALUE)
+                 # meta.add_keyvalue('Label_tissue', list(label_path_dict.keys()))
+                 meta.add_keyvalue('Label_path', {TASK_VALUE: label_path_dict})
+
+             # meta.add_keyvalue('Label_Dict', LABEL_DICT)
+
+             meta.add_extra_keyvalue('Metadata', CIA_other_info)
+
+             # Write the mapping to the JSON file on the fly
+             with open(json_output_path, 'r+') as json_file:
+                 existing_mappings = json.load(json_file)
+                 existing_mappings[output_path] = meta.get_meta_data()
+                 json_file.seek(0)
+                 json.dump(existing_mappings, json_file, indent=4)
+                 json_file.truncate()
+             '''
+     else:
+         print("No metadata.csv files found.")
+
+     with open(failed_files_path, "w") as json_file:
+         json.dump(failed_files, json_file)
+
+     with open(json_output_path, 'w') as fi:
+         json.dump(fj, fi)
+     print(f"The list has been written to {failed_files_path}")
+     print(f"Saved NIfTI mappings to {json_output_path}")
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Process NIIGZ files and save as NIfTI.")
+     parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/AbdomenAtlas/uncompressed2")
+     parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/AbdomenAtlas_v2/")
+     args = parser.parse_args()
+     print(args.target_path, args.output_dir)
+     main(args.target_path, args.output_dir)
+
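
The ROI relabeling above reduces to a small decision: organ-mask volumes set thorax/pelvis/kidney flags, and the stored Spacing_mm and Size give a physical extent check. A minimal sketch of that logic in isolation, with thresholds copied from the constants above; infer_roi is a hypothetical condensation written for illustration, not a function in the repo:

    LUNG_VOL_THRESH, FEMUR_VOL_THRESH, KIDNEY_VOL_THRESH = 1000, 80, 100  # ml

    def infer_roi(vol_lung_ml, vol_femur_ml, vol_kidney_ml, spacing_mm, size):
        thorax = vol_lung_ml >= LUNG_VOL_THRESH
        pelvis = vol_femur_ml >= FEMUR_VOL_THRESH
        kidney = vol_kidney_ml >= KIDNEY_VOL_THRESH
        roi = "abdomen"
        if thorax and kidney:
            roi = "thorax-" + roi
        if thorax and not kidney:
            roi = "thorax"
        if pelvis and kidney:
            roi = roi + "-pelvis"
        if pelvis and not kidney:
            roi = "pelvis"
        # physical extent along the longest axis, in meters
        max_length_m = spacing_mm * max(size) * 0.001
        if roi == "thorax-abdomen-pelvis" and max_length_m > 1.2:
            roi = "whole-body"
        return roi

    # e.g. lungs 2500 ml, femurs 120 ml, kidney 150 ml, 0.8 mm spacing, 1900 slices:
    # 0.8 * 1900 * 0.001 = 1.52 m > 1.2 m, so 'thorax-abdomen-pelvis' becomes 'whole-body'
    print(infer_roi(2500, 120, 150, 0.8, [512, 512, 1900]))  # -> whole-body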
AbdomenAtlas/dataclean_abdomen_atlas_v2.py ADDED
@@ -0,0 +1,477 @@
+ #coding:utf-8
+ '''
+ written by ygq
+ created on 2025-8-18
+ AbdomenAtlas 3.0 data clean update
+
+ https://arxiv.org/pdf/2407.16697
+ https://zhuanlan.zhihu.com/p/19339643417
+
+ AbdomenAtlas 3.0 is currently the largest public abdominal CT image-text paired dataset, built to tackle tumor detection and report generation in medical imaging.
+ It contains 9,262 3D CT scans from 88 medical institutions across 19 countries, and is the first public dataset to provide per-voxel annotations, detailed tumor reports, and tumor staging information.
+ The CT scans are stored in standard medical imaging formats (NIfTI and DICOM) with clinical information such as voxel spacing and HU values. AbdomenAtlas 3.0 consolidates and re-annotates 17 public datasets; reviewed by 12 radiologists, it annotates 8,562 tumor instances in total, including 3,036 liver tumors, 354 pancreatic tumors, and 4,239 kidney tumors. The dataset also contains 2,947 tumor reports, of which 948 cover early-stage tumors (≤2 cm) and 260 provide T staging (T1-T4) for pancreatic tumors; it is the first to publish per-voxel annotations of 8 liver sub-segments and 3 pancreatic sub-segments, along with annotations of tumor contact with key vessels (e.g. SMA, CA).
+ Structured and narrative reports generated automatically by RadGPT describe tumor size, shape, location, volume, and interaction with surrounding vessels and organs. The accuracy of the generated reports has been validated: for detecting small tumors (≤2 cm), RadGPT's sensitivity/specificity clearly outperforms existing methods (e.g. liver: 80%/73%, pancreas: 77%/77%). The dataset also includes 240 "human-AI fusion reports" combining radiologists' clinical notes with precise AI quantification. The significance of AbdomenAtlas 3.0 is that it is the first comprehensive abdominal CT image-text paired dataset, filling the gap in publicly available abdominal tumor detection data and laying the groundwork for automated tumor detection, staging, and report generation in medical imaging. Beyond its scale and diversity, it combines AI with radiologists' expertise to provide high-quality annotations and diagnostic support, which should improve the practical clinical applicability of AI models for medical image analysis.
+
+ Dataset statistics
+ Total volume:
+     9,262 3D CT scans from 88 medical institutions across 19 countries.
+     8,562 tumor instances:
+         liver tumors: 3,036 instances (929 reports)
+         pancreatic tumors: 354 instances (344 reports)
+         kidney tumors: 4,239 instances (1,674 reports)
+     6,061 tumor-free reports (control group)
+ Small tumors (≤2 cm):
+     943 small-tumor reports:
+         liver: 347 instances (37.4% of liver tumors)
+         pancreas: 83 instances (24.1% of pancreatic tumors)
+         kidney: 466 instances (27.8% of kidney tumors)
+ Tumor staging and anatomy:
+     260 pancreatic tumor staging reports (T1-T4)
+     per-voxel segmentation of 8 liver sub-segments and 3 pancreatic sub-segments (head, body, tail)
+     annotated contact angles between tumors and key vessels (e.g. SMA, CA, CHA)
+ Image-text pairing:
+     1.8M text tokens across three report types:
+         structured reports: template-generated, with quantitative information (tumor volume, location, etc.)
+         narrative reports: converted by an LLM to mimic the target hospital's reporting style
+         human-AI fusion reports: 240, combining clinical notes with AI-generated content
+
+ In the AbdomenAtlas dataset, each case's segmentations folder contains annotation files for 25 organs/tissues, together with a combined_labels.nii.gz file (whose values, including background, span 0-25):
+     1  aorta
+     2  gall_bladder
+     3  kidney_left
+     4  kidney_right
+     5  liver
+     6  pancreas
+     7  postcava
+     8  spleen
+     9  stomach
+     10 adrenal_gland_left
+     11 adrenal_gland_right
+     12 bladder
+     13 celiac_trunk
+     14 colon
+     15 duodenum
+     16 esophagus
+     17 femur_left
+     18 femur_right
+     19 hepatic_vessel
+     20 intestine
+     21 lung_left
+     22 lung_right
+     23 portal_vein_and_splenic_vein
+     24 prostate
+     25 rectum
+
+ Following TotalSegmentator, store the processed label data for each of the 25 organs separately.
+ '''
+ import os
+ import glob
+ import pandas as pd
+ import SimpleITK as sitk
+ import argparse
+ import json
+ from tqdm import tqdm
+ from util import meta_data
+ import util
+ import numpy as np
+ # from bert_helper import *
+
+ # model_name = "bert-large-uncased"
+ # reduce_method = 'mean'
+ # max_words_num = 32 # max number of words in the caption > 2
+
+ # embeder, tokenizer = get_frozen_embeder(model_name)
+
+ # string1 = "modality: ct, gender: female, age: 51, roi: abdomen"
+ # embeder_output1 = str2emb(string1, max_words_num, embeder, tokenizer, reduce_method=reduce_method)
+
+ # string2 = "modality: ct, gender: female, age: 50, roi: head"
+
+ # embeder_output2 = str2emb(string2, max_words_num, embeder, tokenizer, reduce_method=reduce_method)
+
+ # input_size = embeder.config.vocab_size
+ # in_size = embeder.config.hidden_size
+
+ # print(embeder, input_size, in_size)
+ # print(tokenizer)
+
+ # print(embeder_output1)
+ # print(embeder_output1.shape) # torch.Size([1, 8, 768])
+
+ # print(embeder_output2)
+ # print(embeder_output2.shape) # torch.Size([1, 8, 768])
+
+ # error = torch.abs(embeder_output1 - embeder_output2)
+ # print(error)
+ # print("Embedding distance between the two sentences: ")
+ # print(f"String1: {string1}")
+ # print(f"String2: {string2}")
+ # print(torch.mean(error))
+
+ # exit()
+
+ # meta_id_name='Patient'
+ # meta_weeks_name='Weeks'
+ # meta_fvc_name='FVC'
+ # meta_percent_name='Percent'
+ # meta_age_name='Age'
+ # meta_sex_name='Sex'
+ # meta_status_name='SmokingStatus'
+
+ TASK_VALUE = "segmentation"
+ CLAMP_RANGE_CT = [-300, 300]
+ CLAMP_RANGE_MRI = [-1, 0]  # MRI intensity threshold placeholder, TBC...
+
+ ## organ volume thresholds (ml) for deciding whether a region is validly covered by the scan
+ LUNG_VOL_THRESH = 1000
+ FEMUR_VOL_THRESH = 80
+ ROI = "abdomen"
+
+ LABEL_DICT = {
+     "0": "background",
+     "1": "aorta",
+     "2": "gall_bladder",
+     "3": "kidney_left",
+     "4": "kidney_right",
+     "5": "liver",
+     "6": "pancreas",
+     "7": "postcava",
+     "8": "spleen",
+     "9": "stomach",
+     "10": "adrenal_gland_left",
+     "11": "adrenal_gland_right",
+     "12": "bladder",
+     "13": "celiac_trunk",
+     "14": "colon",
+     "15": "duodenum",
+     "16": "esophagus",
+     "17": "femur_left",
+     "18": "femur_right",
+     "19": "hepatic_vessel",
+     "20": "intestine",
+     "21": "lung_left",
+     "22": "lung_right",
+     "23": "portal_vein_and_splenic_vein",
+     "24": "prostate",
+     "25": "rectum"
+ }
+
+ # def find_metadata_files(path):
+ #     # for Cancer Image Archive (TCIA) dataset
+ #     search_pattern = os.path.join(path, '**', 'metadata.csv')
+ #     return glob.glob(search_pattern, recursive=True)
+
+ def find_metadata_files(path):
+     # for Cancer Image Archive (TCIA) dataset
+     search_pattern = os.path.join(path, '*.csv')
+     return glob.glob(search_pattern, recursive=True)
+
+ ## added by yanguoqing on 20250527
+ def find_image_dirs(path):
+     return os.listdir(path)
+
+ ## modified by yanguoqing on 20250527
+ def load_dicom_images(folder_path):
+     reader = sitk.ImageSeriesReader()
+     dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
+     reader.SetFileNames(dicom_names)
+     image = reader.Execute()
+     return dicom_names, image
+
+ ## added by yanguoqing on 20250527
+ def load_dicom_tag(imgs):
+     reader = sitk.ImageFileReader()
+     # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
+     reader.SetFileName(imgs)
+     reader.ReadImageInformation()  # read only the meta information, without loading pixel data
+     # metadata_keys = reader.GetMetaDataKeys()
+     tag = reader.Execute()
+     return tag
+
+ def load_nrrd(fp):
+     return sitk.ReadImage(fp)
+
+ def save_nifti(image, output_path, folder_path):
+     output_dirpath = os.path.dirname(output_path)
+     if not os.path.exists(output_dirpath):
+         print(f"Creating directory {output_dirpath}")
+         os.makedirs(output_dirpath)
+     # Set metadata in the NIfTI file's header
+     image.SetMetaData("FolderPath", folder_path)
+     sitk.WriteImage(image, output_path)
+
+ ## modified by yanguoqing on 20250527
+ def convert_windows_to_linux_path(windows_path):
+     # Replace backslashes with forward slashes and remove the drive letter
+     # Some meta files have windows paths, but the data is stored on a linux server
+     linux_path = windows_path.replace('\\', '/')
+     if ':' in linux_path:
+         linux_path = linux_path.split(':', 1)[1]
+     return linux_path
+
+ def simpleitk_volume_calculation(image_path):
+     """
+     Use SimpleITK to streamline the volume computation: compute the lung volume, and
+     treat the thorax as validly covered if the left or right lung exceeds 400 (ml)
+     """
+     image = util.load_nifti(image_path)
+     # get the voxel dimensions
+     spacing = image.GetSpacing()
+     voxel_volume = spacing[0] * spacing[1] * spacing[2]  # mm³
+
+     # print(f"image size: {image.GetSize()}")
+     # print(f"voxel spacing: {spacing}")
+     # print(f"single voxel volume: {voxel_volume:.6f} mm³")
+     ## count the valid voxels
+     image_array2 = sitk.GetArrayFromImage(image)
+     valid_pixels = image_array2[image_array2 == 1].sum()
+     if valid_pixels < 10:
+         return 0
+     # simple threshold segmentation (the threshold may need adjusting for the data at hand)
+     segmented = sitk.BinaryThreshold(image, lowerThreshold=1, upperThreshold=1)
+
+     # count the voxels
+     statistics = sitk.LabelShapeStatisticsImageFilter()
+     statistics.Execute(segmented)
+
+     voxel_count = statistics.GetNumberOfPixels(1)
+     volume_mm3 = voxel_count * voxel_volume
+     volume_ml = volume_mm3 / 1000.0
+
+     # print(f"voxel count: {voxel_count}")
+     # print(f"organ volume: {volume_ml:.2f} mL")
+
+     return volume_ml
+
+ def main(target_path, output_dir):
+     metadata_files = find_metadata_files(target_path)
+     pid_dirs = find_image_dirs(target_path)
+     failed_files = []
+     if not os.path.isdir(output_dir):
+         os.makedirs(output_dir)
+     json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
+     failed_files_path = os.path.join(output_dir, 'failed_files.json')
+     meta = meta_data()
+
+     # Initialize the JSON file
+     if not os.path.exists(json_output_path):
+         with open(json_output_path, 'w') as json_file:
+             json.dump({}, json_file)
+
+     if pid_dirs:
+         for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
+             if not os.path.isdir(os.path.join(target_path, pid_dir)):
+                 continue
+             if not pid_dir.startswith("BDMAP_"):
+                 continue
+
+             meta_file = os.path.join(target_path, '%s.csv' % pid_dir)
+             if os.path.isfile(meta_file):
+                 mf_flag = True
+                 df_meta = pd.read_csv(meta_file, sep=',')
+             else:
+                 mf_flag = False
+
+             full_path = os.path.join(target_path, pid_dir, "ct.nii.gz")
+
+             if not os.path.isfile(full_path):
+                 continue
+             try:
+                 print(full_path)
+
+                 dicom_image = util.load_nifti(full_path)
+                 spacing_info = dicom_image.GetSpacing()
+                 print('SPACING INFO:', spacing_info)
+
+                 # metadata_keys = dicom_image.GetMetaDataKeys()
+
+                 # dtag = load_dicom_tag(dicom_fp[0])
+                 # uid = dtag.GetMetaData('0020|000e')  ## Series Instance UID
+                 # modality = dtag.GetMetaData('0008|0060')  ## Modality
+                 uid = pid_dir
+                 modality = "CT"
+                 study = 'AbdomenAtlas'  ## Dataset_name
+                 CIA_other_info = {
+                     'Study_UID': uid,
+                     'metadata_file': ''
+                     # 'Series_Description': series_desc
+                 }
+                 CIA_other_info['split'] = "train"
+                 if mf_flag:
+                     CIA_other_info['metadata_file'] = meta_file
+
+                 size = list(dicom_image.GetSize())
+                 resampler = util.get_unisize_resampler(dicom_image, interpolator='linear', spacing=spacing_info, size=size)
+
+                 # resize the image
+                 if resampler is not None:
+                     proces_image = resampler.Execute(dicom_image)
+                     print('SPACING INFO AFTER', proces_image.GetSpacing())
+                     CIA_other_info['Resample'] = True
+                 else:
+                     proces_image = dicom_image
+                     CIA_other_info['Resample'] = False
+
+                 ##
+                 # CIA_other_info['Image_id'] = meta_image_id
+                 # CIA_other_info['Weeks'] = str(meta_weeks)
+                 # CIA_other_info['FVC'] = str(meta_fvc)
+                 # CIA_other_info['Percent'] = str(meta_percent)
+                 # CIA_other_info['Age'] = str(meta_age)
+                 # CIA_other_info['Sex'] = meta_sex
+                 # CIA_other_info['Smoke_Status'] = meta_status
+                 # threshold the image
+                 if 'CT' in modality:
+                     proces_image = util.clamp_image(proces_image, CLAMP_RANGE_CT)
+                 else:
+                     pass
+
+                 output_path = os.path.join(output_dir, uid, f"{uid}.nii.gz")
+                 # output_path = convert_windows_to_linux_path(output_path)
+                 save_nifti(proces_image, output_path, full_path)
+                 print(f"Saved NIfTI file to {output_path}")
+
+                 ## segment
+                 label_path_dict = {}
+                 label_flag = True
+
+                 label_paths = os.path.join(target_path, pid_dir, 'segmentations')
+                 label_files = glob.glob("%s/*.nii.gz" % (label_paths))
+                 # print(label_paths, label_files)
+                 pelvis_flag = False
+                 thorax_flag = False
+                 if len(label_files) > 0:
+                     for lf in label_files:
+                         lf_name = os.path.basename(lf)
+
+                         lf_tissue = lf_name.replace(".nii.gz", "")
+
+                         if 'femur' in lf_tissue:
+                             vol_femur = simpleitk_volume_calculation(lf)
+                             print(lf_tissue, vol_femur)
+                             if vol_femur >= FEMUR_VOL_THRESH:
+                                 pelvis_flag = True
+                         if 'lung' in lf_tissue:
+                             vol_lung = simpleitk_volume_calculation(lf)
+                             print(lf_tissue, vol_lung)
+                             if vol_lung >= LUNG_VOL_THRESH:
+                                 thorax_flag = True
+
+                         label_image = load_nrrd(lf)
+                         resampler = util.get_unisize_resampler(label_image, interpolator='nearest', spacing=spacing_info, size=size)
+                         if resampler is not None:
+                             proces_label = resampler.Execute(label_image)
+                         else:
+                             proces_label = label_image
+
+                         # print(proces_image.GetSize(), proces_label.GetSize())
+                         try:
+                             assert proces_image.GetSize() == proces_label.GetSize()
+                         except Exception as e:
+                             failed_files.append(lf)
+                             continue
+
+                         label_output_path = os.path.join(output_dir, uid, TASK_VALUE, f"{lf_tissue}.nii.gz")
+
+                         label_path_dict[lf_tissue] = label_output_path
+                         util.save_nifti(proces_label, label_output_path, lf)
+                         print(f"Saved Label Segment NIfTI file to {label_output_path}")
+
+                 else:
+                     label_flag = False
+             except RuntimeError:
+                 failed_files.append(full_path)
+                 print(f"Failed to load DICOM images from {full_path}")
+                 continue
+
+             '''
+             meta.add_keyvalue('Image_id', meta_image_id)
+             meta.add_keyvalue('Weeks', meta_weeks)
+             meta.add_keyvalue('FVC', meta_fvc)
+             meta.add_keyvalue('Percent', meta_percent)
+             meta.add_keyvalue('Age', meta_age)
+             meta.add_keyvalue('Sex', meta_sex)
+             meta.add_keyvalue('Smoke_Status', meta_status)
+             '''
+
+             size_processed = list(proces_image.GetSize())
+
+             meta_image_id = uid
+             # meta.add_keyvalue('Image_id', meta_image_id)
+             meta.add_keyvalue('Spacing_mm', min(spacing_info))
+             meta.add_keyvalue('OriImg_path', full_path)
+             meta.add_keyvalue('Size', size_processed)  # use the post-processing size here -- YH Jachin
+             meta.add_keyvalue('Modality', modality)
+             meta.add_keyvalue('Dataset_name', study)
+
+             roi = 'abdomen'
+
+             if thorax_flag:
+                 roi = 'thorax-' + roi
+
+             if pelvis_flag:
+                 roi = roi + "-pelvis"
+
+             meta.add_keyvalue('ROI', roi)
+
+             if label_flag:
+                 # print(label_path_dict.keys())
+                 meta.add_keyvalue('Task', TASK_VALUE)
+                 # meta.add_keyvalue('Label_tissue', list(label_path_dict.keys()))
+                 meta.add_keyvalue('Label_path', {TASK_VALUE: label_path_dict})
+
+             # meta.add_keyvalue('Label_Dict', LABEL_DICT)
+
+             meta.add_extra_keyvalue('Metadata', CIA_other_info)
+
+             # Write the mapping to the JSON file on the fly
+             with open(json_output_path, 'r+') as json_file:
+                 existing_mappings = json.load(json_file)
+                 existing_mappings[output_path] = meta.get_meta_data()
+                 json_file.seek(0)
+                 json.dump(existing_mappings, json_file, indent=4)
+                 json_file.truncate()
+     else:
+         print("No metadata.csv files found.")
+
+     with open(failed_files_path, "w") as json_file:
+         json.dump(failed_files, json_file)
+
+     print(f"The list has been written to {failed_files_path}")
+     print(f"Saved NIfTI mappings to {json_output_path}")
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Process NIIGZ files and save as NIfTI.")
+     parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/AbdomenAtlas/uncompressed2")
+     parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/AbdomenAtlas_v3/")
+     args = parser.parse_args()
+     print(args.target_path, args.output_dir)
+     main(args.target_path, args.output_dir)
+
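
All three scripts clamp CT intensities to CLAMP_RANGE_CT = [-300, 300] HU via util.clamp_image before saving. A minimal standalone equivalent using the same SimpleITK filter (input/output paths are illustrative):

    import SimpleITK as sitk

    img = sitk.ReadImage("ct.nii.gz")    # illustrative input
    clamp = sitk.ClampImageFilter()
    clamp.SetLowerBound(-300)            # soft-tissue range used by this pipeline
    clamp.SetUpperBound(300)
    sitk.WriteImage(clamp.Execute(img), "ct_clamped.nii.gz")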
AbdomenAtlas/util.py ADDED
@@ -0,0 +1,410 @@
1
+ import os
2
+ import json
3
+ import SimpleITK as sitk
4
+ import glob
5
+ import pandas as pd
6
+
7
+ def load_dicom_images(folder_path):
8
+ reader = sitk.ImageSeriesReader()
9
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
10
+ reader.SetFileNames(dicom_names)
11
+ image = reader.Execute()
12
+ return image
13
+
14
+ def convert_windows_to_linux_path(windows_path):
15
+ # Replace backslashes with forward slashes and remove the drive letter
16
+ # Some meta files have windows paths, but the data is stored on a linux server
17
+ linux_path = windows_path.replace('\\', '/')
18
+ if ':' in linux_path:
19
+ linux_path = linux_path.split(':', 1)[1]
20
+ return linux_path
21
+
22
+ # =============================================================================
23
+ # ========================developed with TotalSegmentor========================
24
+ # =============================================================================
25
+
26
+ def read_table(file_path, split_str=';'):
27
+ try:
28
+ df = pd.read_excel(file_path, engine='openpyxl')
29
+ except:
30
+ df = pd.read_csv(file_path, sep=split_str)
31
+ return df
32
+
33
+ def load_nifti(image_path):
34
+ return sitk.ReadImage(image_path)
35
+
36
+ def save_nifti(image, output_path, folder_path):
37
+ output_dirpath = os.path.dirname(output_path)
38
+ if not os.path.exists(output_dirpath):
39
+ print(f"Creating directory {output_dirpath}")
40
+ os.makedirs(output_dirpath)
41
+ # Set metadata in the NIfTI file's header
42
+ image.SetMetaData("FolderPath", folder_path)
43
+ sitk.WriteImage(image, output_path)
44
+
45
+ def find_metadata_files(path, file_name='*meta*'):
46
+ # for TotalSegmentor dataset
47
+ search_pattern = os.path.join(path, '**', file_name)
48
+ return glob.glob(search_pattern, recursive=True)
49
+
50
+ def get_img_path_from_folder(folder_path, img_type='.nii.gz', include_str=None, exclude_str='segmentation', is_sorted=True):
51
+ img_path = []
52
+ for root, dirs, files in os.walk(folder_path):
53
+ for file in files:
54
+ if file.endswith(img_type) and (include_str is None or include_str in file) and (exclude_str is None or exclude_str not in file):
55
+ img_path.append(os.path.join(root, file))
56
+ if is_sorted:
57
+ img_path.sort()
58
+ return img_path
59
+
60
+ def get_unisize_resampler(ref_img, interpolator='linear', spacing=None, size=None):
61
+ '''
62
+ Resample the image to have isotropic spacing, following the steps:
63
+ 1. Find the minimum spacing
64
+ 2. Resample the image to have the minimum spacing
65
+ 3. Set the interpolator (linear for images, nearest for segmentation masks)
66
+ 4. Set the output spacing
67
+ 5. Return the resampler for resampling
68
+ For example, if the input image has spacing [0.1, 0.1, 0.3], the output image will have spacing [0.1, 0.1, 0.1]
69
+ '''
70
+ # discuss why this function was rewritten!!!
71
+ if size is None:
72
+ size = ref_img.GetSize()
73
+ if spacing is None:
74
+ spacing = ref_img.GetSpacing()
75
+ min_spacing = min(spacing)
76
+ if all([spc == min_spacing for spc in spacing]):
77
+ return None
78
+ else:
79
+ # if 1:
80
+ if interpolator == 'nearest':
81
+ interpolator = sitk.sitkNearestNeighbor
82
+ elif interpolator == 'linear':
83
+ interpolator = sitk.sitkLinear
84
+ resampler = sitk.ResampleImageFilter()
85
+ # new_spacing = [max_spacing] * len(spacing)
86
+ # print(size)
87
+ new_size = [int(round(old_sz * old_spc / min_spacing)) for old_sz, old_spc in zip(size, spacing)]
88
+ new_size_xy=[new_size[0],new_size[1],new_size[2]]
89
+ # discuss why this function was rewritten!!! --- YHM Jachin
90
+ new_size_spacing=[min_spacing,min_spacing,min_spacing]
91
+ # discuss why this function was rewritten!!! --- YHM Jachin
92
+ # resampler.SetSize(new_size)
93
+ # resampler.SetOutputSpacing([min_spacing] * len(spacing))
94
+ resampler.SetSize(new_size_xy)
95
+ resampler.SetOutputSpacing(new_size_spacing)
96
+
97
+ # print(new_size,new_size_xy)
98
+ resampler.SetOutputOrigin(ref_img.GetOrigin())
99
+ resampler.SetOutputDirection(ref_img.GetDirection())
100
+ resampler.SetInterpolator(interpolator)
101
+ resampler.SetDefaultPixelValue(ref_img.GetPixelIDValue())
102
+ resampler.SetOutputPixelType(ref_img.GetPixelID())
103
+ return resampler
104
+
105
+ def clamp_image(in_img,clamp_range):
106
+ '''
107
+ Clamp the image to the specified range
108
+ '''
109
+ clamp_filter = sitk.ClampImageFilter()
110
+ clamp_filter.SetLowerBound(clamp_range[0])
111
+ clamp_filter.SetUpperBound(clamp_range[1])
112
+ return clamp_filter.Execute(in_img)
113
+
114
+ def get_synonyms_dict(dict_type='ROI'):
115
+ '''
116
+ Get the dictionary of synonyms for the specified dictionary type
117
+ '''
118
+ if dict_type == 'ROI':
119
+ dict_synonyms = {
120
+ 'whole-body': ['whole-body', 'whole body', 'wholebody', 'whole body', 'whole-body', 'whole body', 'wholebody','polytrauma','head-neck-thorax-abdomen-pelvis-leg','head-neck-thorax-abdomen-pelvis'],
121
+ 'neck-thorax-abdomen-pelvis-leg': ['neck-thorax-abdomen-pelvis-leg','neck-thx-abd-pelvis-leg', 'angiography neck-thx-abd-pelvis-leg', 'neck thorax abdomen pelvis leg', 'neck and thorax and abdomen and pelvis and leg', 'neck, thorax, abdomen, pelvis & leg', 'neck/thorax/abdomen/pelvis/leg', 'neck, thorax, abdomen, pelvis and leg', 'neck thorax abdomen pelvis leg'],
122
+ 'neck-thorax-abdomen-pelvis': ['neck-thorax-abdomen-pelvis', 'neck-thx-abd-pelvis', 'neck thorax abdomen pelvis', 'neck and thorax and abdomen and pelvis', 'neck, thorax, abdomen & pelvis', 'neck/thorax/abdomen/pelvis', 'neck, thorax, abdomen and pelvis', 'neck thorax abdomen & pelvis'],
123
+ 'thorax-abdomen-pelvis-leg': ['thorax-abdomen-pelvis-leg','thx-abd-pelvis-leg', 'angiography thx-abd-pelvis-leg', 'thorax abdomen pelvis leg', 'thorax and abdomen and pelvis and leg', 'thorax, abdomen, pelvis & leg', 'thorax/abdomen/pelvis/leg', 'thorax, abdomen, pelvis and leg', 'thorax abdomen pelvis leg'],
124
+ 'neck-thorax-abdomen': ['neck-thorax-abdomen', 'neck-thorax-abdomen', 'neck thorax abdomen', 'neck and thorax and abdomen', 'neck, thorax, abdomen', 'neck/thorax/abdomen', 'neck, thorax, abdomen', 'neck thorax abdomen'],
125
+ 'head-neck-thorax-abdomen': ['head-neck-thorax-abdomen', 'head-neck-thorax-abdomen', 'head neck thorax abdomen', 'head and neck and thorax and abdomen', 'head, neck, thorax, abdomen', 'head/thorax/abdomen', 'head, thorax, abdomen', 'head thorax abdomen'],
126
+ 'head-neck-thorax': ['head-neck-thorax', 'head neck thorax', 'head and neck and thorax', 'head, neck, thorax', 'head/thorax', 'head, thorax', 'head thorax'],
127
+ 'thorax-abdomen-pelvis': ['thorax-abdomen-pelvis', 'thx-abd-pelvis', 'polytrauma', 'thorax abdomen pelvis', 'thorax and abdomen and pelvis', 'thorax, abdomen & pelvis', 'thorax/abdomen/pelvis', 'thorax, abdomen and pelvis', 'thorax abdomen & pelvis'],
128
+ 'abdomen-pelvis-leg': ['abdomen-pelvis-leg', 'angiography abdomen-pelvis-leg', 'abd-pelvis-leg', 'abdomen pelvis leg', 'abdomen and pelvis and leg', 'abdomen, pelvis & leg', 'abdomen/pelvis/leg', 'abdomen, pelvis, leg', 'abdomen pelvis leg'],
129
+ 'neck-thorax': ['neck-thorax', 'neck thorax', 'neck and thorax', 'neck, thorax', 'thorax-neck', 'thorax neck', 'thorax and neck', 'thorax, neck','thorax/neck'],
130
+ 'thorax-abdomen': ['thorax-abdomen', 'thorax abdomen', 'thorax and abdomen', 'thorax, abdomen'],
131
+ 'abdomen-pelvis': ['abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis', 'abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis'],
132
+ 'pelvis-leg': ['pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg', 'pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg'],
133
+ 'head-neck': ['head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck', 'head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck'],
134
+ 'abdomen': ['abdomen', 'abdominal', 'belly', 'stomach', 'tummy', 'gut', 'guts', 'viscera', 'bowels', 'intestines', 'gastrointestinal', 'digestive', 'peritoneum','gastric', 'liver', 'spleen', 'pancreas','kidney','lumbar','renal','hepatic','splenic','pancreatic','intervention'],
135
+ 'thorax': ['chest', 'thorax', 'breast', 'lung', 'heart','heart-thorakale aorta', 'heart-thorakale', 'mediastinum', 'pleura', 'bronchus', 'bronchi', 'trachea', 'esophagus', 'diaphragm', 'rib', 'sternum', 'clavicle', 'scapula', 'axilla', 'armpit','breast biopsy','thoracic','mammary','caeiothoracic','mediastinal','pleural','bronchial','bronchial tree','tracheal','esophageal','diaphragmatic','costal','sternal','clavicular','scapular','axillary','axillar','cardiac','pericardial','pericardiac','pericardium'],
136
+ 'head': ['head', 'headbasis', 'brain', 'skull', 'face','nose','ear','eye','mouth','jaw','cheek','chin','forehead','temporal','parietal','occipital','frontal','mandible','maxilla','mandibular','maxillary','nasal','orbital','orbita','ocular','auricular','otic','oral','buccal','labial','lingual','palatal'],
137
+ 'neck': ['neck', 'throat', 'cervical', 'thyroid', 'trachea', 'larynx', 'pharynx', 'esophagus','pharyngeal','laryngeal','cervical','thyroid','trachea','esophagus','carotid','jugular'],
138
+ 'hand': ['hand', 'finger', 'thumb', 'palm', 'wrist', 'knuckle', 'fingernail', 'phalanx', 'metacarpal', 'carpal', 'radius'],
139
+ 'arm': ['arm', 'forearm', 'upper arm', 'bicep', 'tricep', 'brachium', 'brachial', 'humerus', 'radius', 'ulna', 'elbow', 'shoulder', 'armpit', 'clavicle', 'scapula', 'acromion', 'acromioclavicular'],
140
+ 'leg': ['leg', 'felsenleg','thigh', 'calf', 'shin', 'knee', 'foot', 'ankle', 'toe', 'heel', 'sole', 'arch', 'instep', 'metatarsal', 'phalanx', 'tibia', 'fibula', 'femur', 'patella', 'kneecap','achilles tendon','achilles'],
141
+ 'pelvis': ['pelvis', 'hip', 'groin', 'buttock', 'gluteus', 'gluteal', 'ischium', 'pubis', 'sacrum', 'coccyx', 'acetabulum', 'iliac', 'iliac crest', 'iliac spine', 'iliac wing', 'sacroiliac', 'sacroiliac joint', 'sacroiliac ligament', 'sacroiliac spine', 'ureter', 'bladder', 'urethra', 'prostate', 'testicle', 'ovary', 'uterus',],
142
+ 'skeleton': ['skeleton','bone','spine', 'back', 'vertebra', 'sacrum', 'coccyx'],
143
+ }
144
+ elif dict_type == 'Label_tissue':
145
+ dict_synonyms = {
146
+ 'liver': ['liver','hepatic'],
147
+ 'spleen': ['spleen','splenic'],
148
+ 'kidney': ['kidney','renal'],
149
+ 'pancreas': ['pancreas','pancreatic'],
150
+ 'stomach': ['stomach','gastric'],
151
+ 'intestine': ['large intestine', 'small intestine','large bowel','small bowel'],
152
+ 'gallbladder': ['gallbladder'],
153
+ 'adrenal_gland': ['adrenal_gland','adrenal gland'],
154
+ 'bladder': ['bladder'],
155
+ 'prostate': ['prostate'],
156
+ 'uterus': ['uterus'],
157
+ 'ovary': ['ovary'],
158
+ 'testicle': ['testicle'],
159
+ 'lymph_node': ['lymph_node','lymph node'],
160
+ 'bone': ['bone'],
161
+ 'lung': ['lung'],
162
+ 'heart': ['heart'],
163
+ 'esophagus': ['esophagus'],
164
+ 'muscle': ['muscle'],
165
+ 'fat': ['fat'],
166
+ 'skin': ['skin'],
167
+ 'vessel': ['vessel'],
168
+ 'tumor': ['tumor'],
169
+ 'other': ['other']
170
+ }
171
+ elif dict_type == 'Task':
172
+ dict_synonyms = {
173
+ 'segmentation': ['segmentation', 'seg', 'mask'],
174
+ 'classification': ['classification', 'class', 'diagnosis','identify','identification'],
175
+ 'localization': ['localization', 'locate', 'location', 'position'],
176
+ 'registration': ['registration', 'register', 'align', 'alignment'],
177
+ 'detection': ['detection', 'detect', 'find', 'locate'],
178
+ 'quantification': ['quantification', 'quantify', 'measure', 'measurement'],
179
+ }
180
+ elif dict_type == 'Modality':
181
+ dict_synonyms = {
182
+ 'CT': ['CT', 'computed tomography'],
183
+ 'MRI': ['MRI', 'MR', 'magnetic resonance imaging'],
184
+ 'PET': ['PET', 'positron emission tomography'],
185
+ 'US': ['US', 'ultrasound'],
186
+ 'X-ray': ['X-ray', 'radiography'],
187
+ 'SPECT': ['SPECT', 'single-photon emission computed tomography'],
188
+ }
189
+ else:
190
+ raise ValueError(f"dict_type {dict_type} is not valid")
191
+ return dict_synonyms
192
+
193
+ def replace_synonyms(text, dict_synonyms):
194
+ '''
195
+ Replace the synonyms in the text with the standard term
196
+ '''
197
+ if isinstance(text,str):
198
+ for key, value in dict_synonyms.items():
199
+ for v in value:
200
+ if v.lower() in text.lower():
201
+ return key
202
+ Warning(f"Value {text} is not in the correct format")
203
+ elif isinstance(text,list):
204
+ text = [replace_synonyms(t, dict_synonyms) for t in text]
205
+ elif isinstance(text,dict):
206
+ for key in list(text.keys()):
207
+ # replace values in dict
208
+ text[key] = replace_synonyms(text[key], dict_synonyms)
209
+ # replace each key in the dict with its standard term, if it maps to one
210
+ new_key = replace_synonyms(key, dict_synonyms)
211
+ if isinstance(new_key, str) and new_key != key: text[new_key] = text.pop(key)
212
+ return text
213
+
214
+ # =============================================================================
215
+
216
+ class meta_data(object):
217
+ '''
218
+ This class is used to store the metadata of the dataset
219
+ '''
220
+ def __init__(self):
221
+ self.config_format_path = os.path.join(os.path.dirname(__file__),'config_format.json')
222
+ with open(self.config_format_path, 'r') as file:
223
+ self.config_format = json.load(file)
224
+ self.config = {}
225
+ for key in self.config_format.keys():
226
+ if self.config_format[key]['required'] == True:
227
+ self.config[key] = {}
228
+ self.keytypes = self.find_all_keys_with_type()
229
+ self.keytypes_flatten = self.flatten_json()
230
+ self.ambiguity_keys = ['ROI', 'Label_tissue', 'Task', 'Modality']
231
+ for key in self.ambiguity_keys:
232
+ ambiguity_dict = get_synonyms_dict(key)
233
+ self.config_format[key]['options'] = list(ambiguity_dict.keys())
234
+
235
+ def get_keytypes(self):
236
+ return self.keytypes
237
+
238
+ def get_keytypes_flatten(self):
239
+ return self.keytypes_flatten
240
+
241
+ def find_all_keys_with_type(self, data=None, parent_key=''):
242
+ if data is None:
243
+ data = self.config_format
244
+ keys_with_type = {}
245
+ if isinstance(data, dict):
246
+ for key, value in data.items():
247
+ full_key = f"{parent_key}.{key}" if parent_key else key
248
+ if isinstance(value, dict) and 'type' in value:
249
+ keys_with_type[full_key] = value['type']
250
+ keys_with_type.update(self.find_all_keys_with_type(value, full_key))
251
+ elif isinstance(data, list):
252
+ for index, item in enumerate(data):
253
+ full_key = f"{parent_key}[{index}]"
254
+ keys_with_type.update(self.find_all_keys_with_type(item, full_key))
255
+ return keys_with_type
256
+
257
+ def flatten_json(self, data=None, parent_key='', sep='.'):
258
+ if data is None:
259
+ data = self.config_format
260
+ items = {}
261
+ if isinstance(data, dict):
262
+ for key, value in data.items():
263
+ new_key = f"{parent_key}{sep}{key}" if parent_key else key
264
+ if isinstance(value, dict):
265
+ items.update(self.flatten_json(value, new_key, sep=sep))
266
+ elif isinstance(value, list):
267
+ for i, item in enumerate(value):
268
+ items.update(self.flatten_json(item, f"{new_key}[{i}]", sep=sep))
269
+ else:
270
+ items[new_key] = value
271
+ elif isinstance(data, list):
272
+ for i, item in enumerate(data):
273
+ items.update(self.flatten_json(item, f"{parent_key}[{i}]", sep=sep))
274
+ return items
275
+
276
+ def req_check(self):
277
+ self.unfilled_keys = []
278
+ for key in self.config.keys():
279
+ if self.config[key] == {}:
280
+ self.unfilled_keys.append(key)
281
+ if len(self.unfilled_keys) == 0:
282
+ return True
283
+ else:
284
+ return False
285
+
286
+ def type_check(self, key, value):
287
+ if key not in self.config_format.keys():
288
+ print(key, "is not a valid key")
289
+ return False
290
+
291
+ if key == 'Modality':
292
+ if value not in self.config_format[key]['options']:
293
+ return False
294
+ else:
295
+ return True
296
+
297
+ elif key == 'OriImg_path':
298
+ if isinstance(value, str):
299
+ return True
300
+ else:
301
+ return False
302
+
303
+ elif key == 'Label_path' and isinstance(value, dict):
304
+ for skey in value.keys():
305
+ if skey in self.config_format[key]['keys']:
306
+ for kk in value[skey]:
307
+ if isinstance(value[skey][kk],str):
308
+ pass
309
+ # if kk in self.config_format[key]['value']['keys']:
310
+ # if isinstance(value[skey][kk],str):
311
+ # pass
312
+ # else:
313
+ # return False
314
+ else:
315
+ return False
316
+ return True
317
+
318
+ elif key == 'ROI':
319
+ if value not in self.config_format[key]['options']:
320
+ return False
321
+ else:
322
+ return True
323
+
324
+ elif key == 'Label_tissue' and isinstance(value, list):
325
+ for i in value:
326
+ if i not in self.config_format[key]['items']['options']:
327
+ return False
328
+ return True
329
+
330
+ elif key =='Task' and isinstance(value, list):
331
+ for i in value:
332
+ if i not in self.config_format[key]['items']['options']:
333
+ return False
334
+ return True
335
+
336
+ elif key == 'Spacing_mm':
337
+ if isinstance(value, float):
338
+ return True
339
+ else:
340
+ return False
341
+
342
+ # elif key == 'Size' and isinstance(value, list) and len(value) == 3 :
343
+ elif key == 'Size' and isinstance(value, list) and len(value) >= 3 :
344
+ return all(isinstance(item, int) for item in value)
345
+
346
+ elif key == 'Dataset_name':
347
+ if isinstance(value, str):
348
+ return True
349
+ else:
350
+ return False
351
+ ##added by yanguoqing on 2025-08-08
352
+ elif key == 'Sub_modality':
353
+
354
+ if isinstance(value, dict):
355
+ return True
356
+ else:
357
+ return False
358
+ elif key == 'Label_Dict':
359
+
360
+ if isinstance(value, dict):
361
+ return True
362
+ else:
363
+ return False
364
+ def add_extra_keyvalue(self, key, value):
365
+ self.config[key] = value
366
+ return True
367
+
368
+ def add_keyvalue(self, key, value):
369
+ if key in self.ambiguity_keys:
370
+ value = replace_synonyms(value, get_synonyms_dict(key))
371
+ # print(key, value)
372
+ if self.type_check(key, value):
373
+
374
+ self.config[key] = value
375
+ return True
376
+ else:
377
+ Warning(f"Value {value} is not in the correct format for key {key}")
378
+ pass
379
+ # print(f"Value {value} is not in the correct format for key {key}")
380
+
381
+ def get_meta_data(self):
382
+ if self.req_check():
383
+ return self.config
384
+ else:
385
+ print("Not all required keys are filled", self.unfilled_keys)
386
+ return False
387
+
388
+
389
+
390
+ if __name__ == '__main__':
391
+ meta = meta_data()
392
+ print(meta.get_keytypes_flatten())
393
+ print(meta.get_keytypes())
394
+ meta.add_keyvalue('Modality', 'CT')
395
+ meta.add_keyvalue('OriImg_path', 'C:/Users/jzheng/Desktop/CT')
396
+ meta.add_keyvalue('Label_path', {'ROI': {'1': 'C:/Users/jzheng/Desktop/CT/1'}, 'Tissue': {'1': 'C:/Users/jzheng/Desktop/CT/1'}})
397
+ meta.add_keyvalue('Spacing_mm', 1.5)
398
+ meta.add_keyvalue('Size', [512, 512, 100])
399
+ meta.add_keyvalue('Dataset_name', 'CT')
400
+ meta.add_keyvalue('Label_tissue', ['1', '2', '3'])
401
+ meta.add_keyvalue('Task', ['1', '2', '3'])
402
+ print(meta.get_meta_data())
403
+ meta.add_extra_keyvalue('extra', 'extra')
404
+ print(meta.get_meta_data())
405
+ print(meta.get_keytypes())
406
+ print(meta.get_keytypes_flatten())
407
+
408
+ org_data_folder_path = '/home/jachin/data/Github/data/data_gen_def/DATASETS/TotalSegmentorCT_MRI/TS_CT'
409
+ img_paths = get_img_path_from_folder(org_data_folder_path, img_type='.nii.gz', include_str='ct', exclude_str='segmentation')
410
+ print(img_paths)
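The __main__ block above doubles as a smoke test for meta_data; the resampling helpers are used the same way everywhere. A minimal sketch of the load, resample, clamp, save chain the dataclean_*.py scripts build from these utilities (paths are hypothetical):

    import util

    img = util.load_nifti("/data/case_0001/ct.nii.gz")  # hypothetical input path
    resampler = util.get_unisize_resampler(
        img, interpolator='linear', spacing=img.GetSpacing(), size=list(img.GetSize()))
    if resampler is not None:         # None means the image is already isotropic
        img = resampler.Execute(img)
    img = util.clamp_image(img, [-300, 300])  # the CT HU window the scripts use
    util.save_nifti(img, "/out/case_0001/case_0001.nii.gz",
                    "/data/case_0001/ct.nii.gz")  # source path is stored in the header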
AbdomenAtlas/xx_update.py ADDED
@@ -0,0 +1,518 @@
1
+ #coding:utf-8
2
+ '''
3
+ written by ygq
4
+ created on 2025-8-18
5
+ AbdomenAtlas 3.0 data cleaning update
6
+
7
+ https://arxiv.org/pdf/2407.16697
8
+ https://zhuanlan.zhihu.com/p/19339643417
9
+
10
+ AbdomenAtlas 3.0 is currently the largest publicly available abdominal CT image-text paired dataset, built to address tumor detection and report generation in medical imaging.
11
+ It contains 9,262 3D CT scans from 88 medical institutions across 19 countries, and it is the first public dataset to provide per-voxel annotations, detailed tumor reports, and tumor staging information.
12
+ The CT scans are stored in standard medical imaging formats (NIfTI and DICOM) with clinical information such as voxel spacing and HU values. AbdomenAtlas 3.0 consolidates and re-annotates 17 public datasets; reviewed by 12 radiologists, it annotates 8,562 tumor instances in total, including 3,036 liver tumors, 354 pancreatic tumors, and 4,239 kidney tumors. It also includes 2,947 tumor reports, of which 948 cover early-stage tumors (≤2 cm) and 260 provide T staging (T1-T4) for pancreatic tumors, and it is the first to release per-voxel annotations of 8 liver sub-segments and 3 pancreas sub-segments, plus annotations of tumor contact with key vessels (e.g. SMA, CA).
13
+ Structured and narrative reports generated automatically by RadGPT describe tumor size, shape, location, and volume, as well as interactions with surrounding vessels and organs. The accuracy of these generated reports has been validated: for detecting small tumors (≤2 cm), RadGPT's sensitivity/specificity clearly exceeds existing methods (e.g. liver: 80%/73%, pancreas: 77%/77%). The dataset additionally contains 240 "human-AI fusion reports" combining radiologists' clinical notes with the AI's precise quantitative results. AbdomenAtlas 3.0 matters because it is the first comprehensive abdominal CT image-text paired dataset: it fills the gap in public abdominal tumor detection data and lays the groundwork for automated tumor detection, staging, and report generation in medical imaging. Leading in scale and diversity, and pairing AI with radiologists' expertise, it offers high-quality annotations and diagnostic support that should strengthen the clinical applicability of AI models in medical image analysis.
14
+
15
+ Dataset statistics
16
+ Total data volume:
17
+ 9,262 3D CT scans from 88 medical institutions across 19 countries.
18
+ 8,562 annotated tumor instances:
19
+ Liver tumors: 3,036 instances (929 reports)
20
+ Pancreatic tumors: 354 instances (344 reports)
21
+ Kidney tumors: 4,239 instances (1,674 reports)
22
+ 6,061 tumor-free reports (as a control group)
23
+ Small tumors (≤2 cm):
24
+ 943 reports related to small tumors:
25
+ Liver: 347 instances (37.4% of liver tumors)
26
+ Pancreas: 83 instances (24.1% of pancreatic tumors)
27
+ Kidney: 466 instances (27.8% of kidney tumors)
28
+ Tumor staging and anatomical structures:
29
+ 260 pancreatic tumor staging reports (T1-T4)
30
+ Per-voxel segmentation of 8 liver sub-segments and 3 pancreas sub-segments (head, body, tail)
31
+ Annotated contact angles between tumors and key vessels (e.g. SMA, CA, CHA)
32
+ Image-text pairing:
33
+ 1.8M text tokens across three report types:
34
+ Structured reports: template-generated, providing quantitative information (e.g. tumor volume, location)
35
+ Narrative reports: converted by an LLM to mimic the reporting style of the target hospital
36
+ Human-AI fusion reports: 240 reports combining clinical notes with AI-generated content
37
+
38
+ In the AbdomenAtlas dataset, each case's segmentations directory contains annotation files for 25 organs/tissues, plus a combined_labels.nii.gz file (which, counting the background value, holds the integers 0-25):
39
+ 1 aorta
40
+ 2 gall_bladder
41
+ 3 kidney_left
42
+ 4 kidney_right
43
+ 5 liver
44
+ 6 pancreas
45
+ 7 postcava
46
+ 8 spleen
47
+ 9 stomach
48
+ 10 adrenal_gland_left
49
+ 11 adrenal_gland_right
50
+ 12 bladder
51
+ 13 celiac_trunk
52
+ 14 colon
53
+ 15 duodenum
54
+ 16 esophagus
55
+ 17 femur_left
56
+ 18 femur_right
57
+ 19 hepatic_vessel
58
+ 20 intestine
59
+ 21 lung_left
60
+ 22 lung_right
61
+ 23 portal_vein_and_splenic_vein
62
+ 24 prostate
63
+ 25 rectum
64
+
65
+
66
+ Following TotalSegmentator, store the processed data as one label file per organ for the 25 organs.
67
+ '''
68
+ import os
69
+ import glob
70
+ import pandas as pd
71
+ import SimpleITK as sitk
72
+ import argparse
73
+ import json
74
+ from tqdm import tqdm
75
+ from util import meta_data
76
+ import util
77
+ import numpy as np
78
+ # from bert_helper import *
79
+
80
+ # model_name = "bert-large-uncased"
81
+ # reduce_method = 'mean'
82
+ # max_words_num = 32 # max number of words in the caption > 2
83
+
84
+ # embeder, tokenizer = get_frozen_embeder(model_name)
85
+
86
+ # string1 = "modality: ct, gender: female, age: 51, roi: abdomen"
87
+ # embeder_output1 = str2emb(string1, max_words_num, embeder, tokenizer, reduce_method=reduce_method)
88
+
89
+ # string2 = "modality: ct, gender: female, age: 50, roi: head"
90
+
91
+ # embeder_output2 = str2emb(string2, max_words_num, embeder, tokenizer, reduce_method=reduce_method)
92
+
93
+ # input_size = embeder.config.vocab_size
94
+ # in_size = embeder.config.hidden_size
95
+
96
+ # print(embeder, input_size, in_size)
97
+ # print(tokenizer)
98
+
99
+
100
+ # print(embeder_output1)
101
+ # print(embeder_output1.shape) # torch.Size([1, 8, 768])
102
+
103
+
104
+ # print(embeder_output2)
105
+ # print(embeder_output2.shape) # torch.Size([1, 8, 768])
106
+
107
+
108
+ # error = torch.abs(embeder_output1 - embeder_output2)
109
+ # print(error)
110
+ # print("Embedding distance between the two sentences: ")
111
+ # print(f"String1: {string1}")
112
+ # print(f"String2: {string2}")
113
+ # print(torch.mean(error))
114
+
115
+
116
+ # exit()
117
+
118
+
119
+ # meta_id_name='Patient'
120
+ # meta_weeks_name='Weeks'
121
+ # meta_fvc_name='FVC'
122
+ # meta_percent_name='Percent'
123
+ # meta_age_name='Age'
124
+ # meta_sex_name='Sex'
125
+ # meta_status_name='SmokingStatus'
126
+
127
+ TASK_VALUE="segmentation"
128
+ CLAMP_RANGE_CT = [-300,300]
129
+ CLAMP_RANGE_MRI = [-1,0] # MRI images threshold placeholder TBC...
130
+
131
+ ## lung volume threshold (mL) for deciding whether the scan validly covers the thorax
132
+ LUNG_VOL_THRESH=1000
133
+ FEMUR_VOL_THRESH=80
134
+ KIDNEY_VOL_THRESH=100
135
+ gall_bladder_VOL_THRESH=12
136
+ ROI="abdomen"
137
+
138
+ PROCESS_FLAG=True
139
+
140
+ LABEL_DICT={
141
+ "0":"backgroud",
142
+ "1":"aorta",
143
+ "2":"gall_bladder",
144
+ "3":"kidney_left",
145
+ "4":"kidney_right",
146
+ "5":"liver",
147
+ "6":"pancreas",
148
+ "7":"postcava",
149
+ "8":"spleen",
150
+ "9":"stomach",
151
+ "10":"adrenal_gland_left",
152
+ "11":"adrenal_gland_right",
153
+ "12":"bladder",
154
+ "13":"celiac_trunk",
155
+ "14":"colon",
156
+ "15":"duodenum",
157
+ "16":"esophagus",
158
+ "17":"femur_left",
159
+ "18":"femur_right",
160
+ "19":"hepatic_vessel",
161
+ "20":"intestine",
162
+ "21":"lung_left",
163
+ "22":"lung_right",
164
+ "23":"portal_vein_and_splenic_vein",
165
+ "24":"prostate",
166
+ "25":"rectum"
167
+ }
168
+ # def find_metadata_files(path):
169
+ # # for Cancer Image Archive (TCIA) dataset
170
+ # search_pattern = os.path.join(path, '**', 'metadata.csv')
171
+ # return glob.glob(search_pattern, recursive=True)
172
+
173
+ def find_metadata_files(path):
174
+ # for Cancer Image Archive (TCIA) dataset
175
+ search_pattern = os.path.join(path, '*.csv')
176
+ return glob.glob(search_pattern, recursive=True)
177
+ ##added by yanguoqing on 20250527
178
+ def find_image_dirs(path):
179
+ return os.listdir(path)
180
+
181
+ ##modify by yanguoqing on 20250527
182
+ def load_dicom_images(folder_path):
183
+ reader = sitk.ImageSeriesReader()
184
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
185
+ reader.SetFileNames(dicom_names)
186
+ image = reader.Execute()
187
+ return dicom_names,image
188
+
189
+ ##added by yanguoqing on 20250527
190
+ def load_dicom_tag(imgs):
191
+ reader = sitk.ImageFileReader()
192
+ # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
193
+ reader.SetFileName(imgs)
194
+ reader.ReadImageInformation() # read only the metadata, without loading pixel data
195
+ # metadata_keys = reader.GetMetaDataKeys()
196
+ tag=reader.Execute()
197
+ return tag
198
+
199
+ def load_nrrd(fp):
200
+ return sitk.ReadImage(fp)
201
+
202
+ def save_nifti(image, output_path, folder_path):
203
+ # Set metadata in the NIfTI file's header
204
+ output_dirpath = os.path.dirname(output_path)
205
+ if not os.path.exists(output_dirpath):
206
+ print(f"Creating directory {output_dirpath}")
207
+ os.makedirs(output_dirpath)
208
+ # Set metadata in the NIfTI file's header
209
+ image.SetMetaData("FolderPath", folder_path)
210
+ sitk.WriteImage(image, output_path)
211
+
212
+ ##modify by yanguoqing on 20250527
213
+ def convert_windows_to_linux_path(windows_path):
214
+ # Replace backslashes with forward slashes and remove the drive letter
215
+ # Some meta files have windows paths, but the data is stored on a linux server
216
+ linux_path = windows_path.replace('\\', '/')
217
+ if ':' in linux_path:
218
+ linux_path = linux_path.split(':', 1)[1]
219
+ return linux_path
220
+
221
+ def simpleitk_volume_calculation(image_path):
222
+ """
223
+ Compute an organ's volume (mL) with SimpleITK; e.g. the scan is considered to validly cover the thorax when the left or right lung volume exceeds the LUNG_VOL_THRESH threshold
224
+ """
225
+
226
+ image=util.load_nifti(image_path)
227
+ # get the voxel size (spacing)
228
+ spacing = image.GetSpacing()
229
+ voxel_volume = spacing[0] * spacing[1] * spacing[2] # mm³
230
+
231
+ # print(f"图像尺寸: {image.GetSize()}")
232
+ # print(f"体素间距: {spacing}")
233
+ # print(f"单个体素体积: {voxel_volume:.6f} mm³")
234
+ ##计算有效像元数量
235
+ image_array2 = sitk.GetArrayFromImage(image)
236
+ valid_pixels=image_array2[image_array2==1].sum()
237
+ if valid_pixels<10:
238
+ return 0
239
+ # simple threshold segmentation (the threshold may need adjusting for the actual data)
240
+ segmented = sitk.BinaryThreshold(image, lowerThreshold=1, upperThreshold=1)
241
+
242
+ # count the voxels
243
+ statistics = sitk.LabelShapeStatisticsImageFilter()
244
+ statistics.Execute(segmented)
245
+
246
+ voxel_count = statistics.GetNumberOfPixels(1)
247
+ volume_mm3 = voxel_count * voxel_volume
248
+ volume_ml = volume_mm3 / 1000.0
249
+
250
+ # print(f"体素数量: {voxel_count}")
251
+ # print(f"器官体积: {volume_ml:.2f} mL")
252
+
253
+ return volume_ml
254
+
255
+ def main(target_path, output_dir):
256
+ metadata_files = find_metadata_files(target_path)
257
+ pid_dirs=find_image_dirs(target_path)
258
+ failed_files = []
259
+ label_dict={}
260
+ if not os.path.isdir(output_dir):
261
+ os.makedirs(output_dir)
262
+ json_output_path = os.path.join(output_dir, 'xx.json')
263
+ failed_files_path = os.path.join(output_dir, 'yy.json')
264
+ #meta = meta_data()
265
+ with open(json_output_path,'r') as fi:
266
+ fj=json.load(fi)
267
+ '''
268
+ # Initialize the JSON file
269
+ if not os.path.exists(json_output_path):
270
+ with open(json_output_path, 'w') as json_file:
271
+ json.dump({}, json_file)
272
+ '''
273
+ if pid_dirs:
274
+ for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
275
+ if not os.path.isdir(os.path.join(target_path,pid_dir)):
276
+ continue
277
+ if not pid_dir.startswith("BDMAP_"):
278
+ continue
279
+
280
+ meta_file=os.path.join(target_path,'%s.csv'%pid_dir)
281
+ if os.path.isfile(meta_file):
282
+ mf_flag=True
283
+ # df_meta=pd.read_csv(meta_file,sep=',')
284
+ else:
285
+ mf_flag=False
286
+
287
+ full_path=os.path.join(target_path,pid_dir,"ct.nii.gz")
288
+
289
+
290
+ try:
291
+ '''
292
+ dicom_image=util.load_nifti(full_path)
293
+ spacing_info = dicom_image.GetSpacing()
294
+ print('SPACING INFO:', spacing_info)
295
+
296
+ # metadata_keys = dicom_image.GetMetaDataKeys()
297
+
298
+ # dtag=load_dicom_tag(dicom_fp[0])
299
+ # uid=dtag.GetMetaData('0020|000e') ##Series Instance UID
300
+ # modality=dtag.GetMetaData('0008|0060')##Modality
301
+ uid=pid_dir
302
+ modality="CT"
303
+ study='AbdomenAtlas'##Dataset_name
304
+ CIA_other_info = {
305
+ 'Study_UID':uid,
306
+ 'metadata_file':''
307
+ # 'Series_Description':serise_desc
308
+ }
309
+ CIA_other_info['split'] = "train"
310
+ if mf_flag:
311
+ CIA_other_info['metadata_file']=meta_file
312
+
313
+ size = list(dicom_image.GetSize())
314
+ resampler =util.get_unisize_resampler(dicom_image, interpolator='linear', spacing=spacing_info, size=size)
315
+
316
+ # resize the image
317
+ if resampler is not None:
318
+ proces_image = resampler.Execute(dicom_image)
319
+ print('SPACING INFO AFTER', proces_image.GetSpacing())
320
+ CIA_other_info['Resample'] = True
321
+ else:
322
+ proces_image = dicom_image
323
+ CIA_other_info['Resample'] = False
324
+
325
+ ##
326
+ # CIA_other_info['Image_id']=meta_image_id
327
+ # CIA_other_info['Weeks']=str(meta_weeks)
328
+ # CIA_other_info['FVC']=str(meta_fvc)
329
+ # CIA_other_info['Percent']=str(meta_percent)
330
+ # CIA_other_info['Age']=str(meta_age)
331
+ # CIA_other_info['Sex']=meta_sex
332
+ # CIA_other_info['Smoke_Status']=meta_status
333
+ # threshold the image
334
+ if 'CT' in modality:
335
+ proces_image = util.clamp_image(proces_image, CLAMP_RANGE_CT)
336
+ else:
337
+ pass
338
+
339
+ output_path = os.path.join(output_dir,uid, f"{uid}.nii.gz")
340
+ # output_path=convert_windows_to_linux_path(output_path)
341
+ save_nifti(proces_image, output_path, full_path)
342
+ print(f"Saved NIfTI file to {output_path}")
343
+ '''
344
+ ##segment
345
+ label_path_dict = {}
346
+ label_flag=True
347
+
348
+ label_paths = os.path.join(target_path,pid_dir, 'segmentations')
349
+ label_files=glob.glob("%s/*.nii.gz"%(label_paths))
350
+ #print(label_paths,label_files)
351
+ pelvis_flag=False
352
+ thorax_flag=False
353
+ lung_min=0
354
+ lung_max=0
355
+ kidney_flag=False
356
+ gall_bladder_flag=False
357
+ if len(label_files)>0:
358
+ for lf in label_files:
359
+ lf_name=os.path.basename(lf)
360
+
361
+ lf_tissue=lf_name.replace(".nii.gz","")
362
+
363
+ if 'femur' in lf_tissue:
364
+ vol_femur=simpleitk_volume_calculation(lf)
365
+ print(lf_tissue,vol_femur)
366
+ if vol_femur>=FEMUR_VOL_THRESH:
367
+ pelvis_flag=True
368
+ if 'lung' in lf_tissue:
369
+ vol_lung=simpleitk_volume_calculation(lf)
370
+ print(lf_tissue,vol_lung)
371
+ lung_max=max(lung_max,vol_lung)
372
+ if lung_min==0:
373
+ lung_min=vol_lung
374
+ else:
375
+ lung_min=min(lung_min,vol_lung)
376
+ if lung_min>=LUNG_VOL_THRESH:
377
+ thorax_flag=True
378
+ if 'kidney_right' in lf_tissue:
379
+ vol_kidney=simpleitk_volume_calculation(lf)
380
+ print(lf_tissue,vol_kidney)
381
+ if vol_kidney>=KIDNEY_VOL_THRESH:
382
+ kidney_flag=True
383
+
384
+ if 'gall_bladder' in lf_tissue:
385
+ vol_gall_bladder=simpleitk_volume_calculation(lf)
386
+ print(lf_tissue,vol_gall_bladder)
387
+ if vol_gall_bladder>=gall_bladder_VOL_THRESH:
388
+ gall_bladder_flag=True
389
+ '''
390
+ label_image=load_nrrd(lf)
391
+ resampler =util.get_unisize_resampler(label_image, interpolator='nearest', spacing=spacing_info, size=size)
392
+ if resampler is not None:
393
+ proces_label = resampler.Execute(label_image)
394
+ else:
395
+ proces_label = label_image
396
+
397
+
398
+ # print(proces_image.GetSize(),proces_label.GetSize())
399
+ try:
400
+ assert proces_image.GetSize() == proces_label.GetSize()
401
+ except Exception as e:
402
+ failed_files.append(lf)
403
+ continue
404
+
405
+ label_output_path = os.path.join(output_dir, uid, TASK_VALUE, f"{lf_tissue}.nii.gz")
406
+
407
+ label_path_dict[lf_tissue] = label_output_path
408
+ util.save_nifti(proces_label, label_output_path, lf)
409
+ print(f"Saved Label Segment NIfTI file to {label_output_path}")
410
+ '''
411
+ else:
412
+ label_flag=False
413
+ except RuntimeError:
414
+ failed_files.append(full_path)
415
+ print(f"Failed to load DICOM images from {full_path}")
416
+ continue
417
+
418
+ '''
419
+ meta.add_keyvalue('Image_id',meta_image_id)
420
+ meta.add_keyvalue('Weeks',meta_weeks)
421
+ meta.add_keyvalue('FVC',meta_fvc)
422
+ meta.add_keyvalue('Percent',meta_percent)
423
+ meta.add_keyvalue('Age',meta_age)
424
+ meta.add_keyvalue('Sex',meta_sex)
425
+ meta.add_keyvalue('Smoke_Status',meta_status)
426
+
427
+
428
+ size_processed = list(proces_image.GetSize())
429
+
430
+ meta_image_id=uid
431
+ # meta.add_keyvalue('Image_id',meta_image_id)
432
+ meta.add_keyvalue('Spacing_mm',min(spacing_info))
433
+ meta.add_keyvalue('OriImg_path',full_path)
434
+ meta.add_keyvalue('Size',size_processed) # use the size after processing here -- YH Jachin
435
+ meta.add_keyvalue('Modality',modality)
436
+ meta.add_keyvalue('Dataset_name',study)
437
+ '''
438
+ roi='abdomen'
439
+ if thorax_flag and gall_bladder_flag:
440
+ roi='thorax-'+roi
441
+ if thorax_flag and not gall_bladder_flag:
442
+ roi='thorax'
443
+ if pelvis_flag and gall_bladder_flag:
444
+ roi=roi+"-pelvis"
445
+ if pelvis_flag and not gall_bladder_flag:
446
+ roi='pelvis'
447
+ if lung_min>0 and lung_max/lung_min>3:
448
+ label_dict[pid_dir]=[lung_max,lung_min]
449
+
450
+ print(pid_dir,roi)
451
+ #meta.add_keyvalue('ROI',roi)
452
+ for ik in fj.keys():
453
+ fi=fj[ik]
454
+ jid=fi['Metadata']['Study_UID']
455
+ max_length=fi['Spacing_mm']*max(fi['Size'])*0.001
456
+ print(max_length,max_length>1.2)
457
+ if jid==pid_dir:
458
+ if roi=='thorax-abdomen-pelvis' and max_length>1.2:
459
+ roi='whole-body'
460
+ fj[ik]['ROI']=roi
461
+ print(jid,max_length,roi)
462
+ break
463
+ else:
464
+ continue
465
+
466
+ '''
467
+ if label_flag:
468
+ # print(label_path_dict.keys())
469
+ meta.add_keyvalue('Task',TASK_VALUE)
470
+ # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys()))
471
+ meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
472
+
473
+ # meta.add_keyvalue('Label_Dict',LABEL_DICT)
474
+
475
+ meta.add_extra_keyvalue('Metadata',CIA_other_info)
476
+
477
+
478
+
479
+
480
+ # Write the mapping to the JSON file on the fly
481
+ with open(json_output_path, 'r+') as json_file:
482
+ existing_mappings = json.load(json_file)
483
+ existing_mappings[output_path] = meta.get_meta_data()
484
+ json_file.seek(0)
485
+ json.dump(existing_mappings, json_file, indent=4)
486
+ json_file.truncate()
487
+ '''
488
+ else:
489
+ print("No metadata.csv files found.")
490
+
491
+
492
+ with open(json_output_path,'w') as fi:
493
+ json.dump(fj,fi)
494
+ print(f"The list has been written to {failed_files_path}")
495
+ print(f"Saved NIfTI mappings to {json_output_path}")
496
+ #print(label_dict)
497
+ if __name__ == "__main__":
498
+ parser = argparse.ArgumentParser(description="Process NIIGZ files and save as NIfTI.")
499
+ parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/AbdomenAtlas/uncompressed2")
500
+ parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/AbdomenAtlas_v2/")
501
+ args = parser.parse_args()
502
+ print(args.target_path, args.output_dir)
503
+ main(args.target_path, args.output_dir)
504
+
505
+
506
+
507
+
508
+
509
+
510
+
511
+
512
+
513
+
514
+
515
+
516
+
517
+
518
+
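xx_update.py derives each case's ROI from organ-mask volumes and then widens it by physical extent. The promotion rule from main() above, restated as a standalone sketch (entry is one record of the mappings JSON):

    def promote_roi(entry, roi):
        # With isotropic spacing, Spacing_mm * max(Size) * 0.001 is the scan's
        # longest physical extent in metres; a thorax-abdomen-pelvis scan
        # longer than 1.2 m is relabelled as whole-body.
        max_length_m = entry['Spacing_mm'] * max(entry['Size']) * 0.001
        if roi == 'thorax-abdomen-pelvis' and max_length_m > 1.2:
            return 'whole-body'
        return roi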
AbdomenCT1k/config_format.json ADDED
@@ -0,0 +1,125 @@
1
+ {
2
+ "Modality": {
3
+ "type": "option",
4
+ "required": true,
5
+ "options": [
6
+ "CT",
7
+ "MRI",
8
+ "T1",
9
+ "T2",
10
+ "X-ray",
11
+ "Fluoroscopy",
12
+ "US",
13
+ "PET"
14
+ ]
15
+ },
16
+ "OriImg_path": {
17
+ "type": "string",
18
+ "required": true
19
+ },
20
+ "Label_path": {
21
+ "type": "dict",
22
+ "required": false,
23
+ "keys": [
24
+ "classification",
25
+ "segmentation",
26
+ "regression",
27
+ "detection",
28
+ "localization",
29
+ "registration",
30
+ "other"
31
+ ],
32
+ "value": {
33
+ "type": "dict",
34
+ "required": false,
35
+ "keys": [
36
+ "lung",
37
+ "liver",
38
+ "heart",
39
+ "brain",
40
+ "kidney"
41
+ ],
42
+ "value": {
43
+ "type": "string",
44
+ "required": false
45
+ }
46
+ }
47
+ },
48
+ "ROI": {
49
+ "type": "option",
50
+ "required": false,
51
+ "options": [
52
+ "chest-abdomen",
53
+ "abdomen-pelvis",
54
+ "head",
55
+ "neck",
56
+ "skeleton",
57
+ "chest",
58
+ "abdomen",
59
+ "shoulder",
60
+ "leg",
61
+ "arm",
62
+ "hand",
63
+ "foot",
64
+ "pelvis"
65
+ ]
66
+ },
67
+ "Label_tissue": {
68
+ "type": "list",
69
+ "required": false,
70
+ "items": {
71
+ "type": "option",
72
+ "required": true,
73
+ "options": [
74
+ "lung",
75
+ "liver",
76
+ "heart",
77
+ "brain",
78
+ "kidney",
79
+ "spleen",
80
+ "pancreas",
81
+ "stomach",
82
+ "intestine",
83
+ "muscle",
84
+ "bone"
85
+ ]
86
+ }
87
+ },
88
+ "Task": {
89
+ "type": "list",
90
+ "required": false,
91
+ "items": {
92
+ "type": "option",
93
+ "required": true,
94
+ "options": [
95
+ "classification",
96
+ "segmentation"
97
+ ]
98
+ }
99
+ },
100
+ "Spacing_mm": {
101
+ "type": "float",
102
+ "required": true
103
+ },
104
+ "Size": {
105
+ "type": "list",
106
+ "required": true,
107
+ "items": {
108
+ "type": "int",
109
+ "required": true
110
+ }
111
+ },
112
+ "Dataset_name": {
113
+ "type": "string",
114
+ "required": true
115
+ },
116
+
117
+ "Sub_modality": {
118
+ "type": "dict",
119
+ "required": false
120
+ },
121
+ "Label_Dict": {
122
+ "type": "dict",
123
+ "required": false
124
+ }
125
+ }
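For reference, a record shaped by this schema as the cleanup script writes it into nifti_mappings.json; the values below are illustrative, not taken from the dataset:

    example_entry = {
        "Modality": "CT",
        "OriImg_path": "/data/AbdomenCT1k/AbdomenCT-1K-ImagePart1/Case_00001_0000.nii.gz",  # hypothetical
        "ROI": "abdomen",
        "Spacing_mm": 0.8,
        "Size": [512, 512, 600],
        "Dataset_name": "AbdomenCT1K",
        "Task": "segmentation",
        "Label_path": {"segmentation": {"abdomen": "/out/Case_00001/segmentation/Case_00001.nii.gz"}},  # hypothetical
        "Label_Dict": {"0": "background", "1": "liver", "2": "kidney", "3": "spleen", "4": "pancreas"},
    }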
AbdomenCT1k/dataclean_abdomen_ct_1k.py ADDED
@@ -0,0 +1,365 @@
1
+ #coding:utf-8
2
+ '''
3
+ written by ygq
4
+ created on 2025-8-18
5
+ AbdomenCT1K data cleaning update
6
+
7
+ https://github.com/JunMa11/AbdomenCT-1K
8
+ liver (label 1), kidney (label 2), spleen (label 3), and pancreas (label 4).
9
+ AbdomenCT-1K is a large-scale abdominal CT dataset of 1,112 CT scans for segmenting 4 abdominal organs: liver, kidney, spleen, and pancreas. Note that the kidneys are not split into left and right.
10
+ The data come mainly from 6 datasets, 5 of them public: LiTS (201 cases), KiTS19 (300 cases), MSD Spleen (61 cases), MSD Pancreas (420 cases), and NIH Pancreas (80 cases).
11
+ The remaining one is a new dataset from Nanjing University with 50 CT scans. Most of the original public datasets annotated only one specific organ, whereas in AbdomenCT-1K every CT scan is fully annotated for all four organs.
12
+
13
+
14
+ '''
15
+ import os
16
+ import glob
17
+ import pandas as pd
18
+ import SimpleITK as sitk
19
+ import argparse
20
+ import json
21
+ from tqdm import tqdm
22
+ from util import meta_data
23
+ import util
24
+ import numpy as np
25
+ # from bert_helper import *
26
+
27
+ # model_name = "bert-large-uncased"
28
+ # reduce_method = 'mean'
29
+ # max_words_num = 32 # max number of words in the caption > 2
30
+
31
+ # embeder, tokenizer = get_frozen_embeder(model_name)
32
+
33
+ # string1 = "modality: ct, gender: female, age: 51, roi: abdomen"
34
+ # embeder_output1 = str2emb(string1, max_words_num, embeder, tokenizer, reduce_method=reduce_method)
35
+
36
+ # string2 = "modality: ct, gender: female, age: 50, roi: head"
37
+
38
+ # embeder_output2 = str2emb(string2, max_words_num, embeder, tokenizer, reduce_method=reduce_method)
39
+
40
+ # input_size = embeder.config.vocab_size
41
+ # in_size = embeder.config.hidden_size
42
+
43
+ # print(embeder, input_size, in_size)
44
+ # print(tokenizer)
45
+
46
+
47
+ # print(embeder_output1)
48
+ # print(embeder_output1.shape) # torch.Size([1, 8, 768])
49
+
50
+
51
+ # print(embeder_output2)
52
+ # print(embeder_output2.shape) # torch.Size([1, 8, 768])
53
+
54
+
55
+ # error = torch.abs(embeder_output1 - embeder_output2)
56
+ # print(error)
57
+ # print("Embedding distance between the two sentences: ")
58
+ # print(f"String1: {string1}")
59
+ # print(f"String2: {string2}")
60
+ # print(torch.mean(error))
61
+
62
+
63
+ # exit()
64
+
65
+
66
+ # meta_id_name='Patient'
67
+ # meta_weeks_name='Weeks'
68
+ # meta_fvc_name='FVC'
69
+ # meta_percent_name='Percent'
70
+ # meta_age_name='Age'
71
+ # meta_sex_name='Sex'
72
+ # meta_status_name='SmokingStatus'
73
+
74
+ TASK_VALUE="segmentation"
75
+ CLAMP_RANGE_CT = [-300,300]
76
+ CLAMP_RANGE_MRI = [-1,0] # MRI images threshold placeholder TBC...
77
+
78
+
79
+ LABEL_DICT={
80
+ "0":"backgroud",
81
+ "1":"liver",
82
+ "2":"kidney",
83
+ "3":"spleen",
84
+ "4":"pancreas"
85
+ }
86
+
87
+ # def find_metadata_files(path):
88
+ # # for Cancer Image Archive (TCIA) dataset
89
+ # search_pattern = os.path.join(path, '**', 'metadata.csv')
90
+ # return glob.glob(search_pattern, recursive=True)
91
+
92
+ def find_metadata_files(path):
93
+ # for Cancer Image Archive (TCIA) dataset
94
+ search_pattern = os.path.join(path, '*.csv')
95
+ return glob.glob(search_pattern, recursive=True)
96
+ ##added by yanguoqing on 20250527
97
+ def find_image_dirs(path):
98
+ return os.listdir(path)
99
+
100
+ ##modify by yanguoqing on 20250527
101
+ def load_dicom_images(folder_path):
102
+ reader = sitk.ImageSeriesReader()
103
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
104
+ reader.SetFileNames(dicom_names)
105
+ image = reader.Execute()
106
+ return dicom_names,image
107
+
108
+ ##added by yanguoqing on 20250527
109
+ def load_dicom_tag(imgs):
110
+ reader = sitk.ImageFileReader()
111
+ # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
112
+ reader.SetFileName(imgs)
113
+ reader.ReadImageInformation() # read only the metadata, without loading pixel data
114
+ # metadata_keys = reader.GetMetaDataKeys()
115
+ tag=reader.Execute()
116
+ return tag
117
+
118
+ def load_nrrd(fp):
119
+ return sitk.ReadImage(fp)
120
+
121
+ def save_nifti(image, output_path, folder_path):
122
+ # Set metadata in the NIfTI file's header
123
+ output_dirpath = os.path.dirname(output_path)
124
+ if not os.path.exists(output_dirpath):
125
+ print(f"Creating directory {output_dirpath}")
126
+ os.makedirs(output_dirpath)
127
+ # Set metadata in the NIfTI file's header
128
+ image.SetMetaData("FolderPath", folder_path)
129
+ sitk.WriteImage(image, output_path)
130
+
131
+ ##modify by yanguoqing on 20250527
132
+ def convert_windows_to_linux_path(windows_path):
133
+ # Replace backslashes with forward slashes and remove the drive letter
134
+ # Some meta files have windows paths, but the data is stored on a linux server
135
+ linux_path = windows_path.replace('\\', '/')
136
+ if ':' in linux_path:
137
+ linux_path = linux_path.split(':', 1)[1]
138
+ return linux_path
139
+
140
+ def main(target_path, output_dir):
141
+ metadata_files = find_metadata_files(target_path)
142
+ pid_dirs=find_image_dirs(target_path)
143
+ failed_files = []
144
+ if not os.path.isdir(output_dir):
145
+ os.makedirs(output_dir)
146
+ json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
147
+ failed_files_path = os.path.join(output_dir, 'failed_files.json')
148
+ meta = meta_data()
149
+
150
+ # Initialize the JSON file
151
+ if not os.path.exists(json_output_path):
152
+ with open(json_output_path, 'w') as json_file:
153
+ json.dump({}, json_file)
154
+
155
+ if pid_dirs:
156
+ for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
157
+ if not os.path.isdir(os.path.join(target_path,pid_dir)):
158
+ continue
159
+ if not "AbdomenCT-1K-ImagePart" in pid_dir:
160
+ continue
161
+ meta_file=os.path.join(target_path,'%s.csv'%pid_dir)
162
+ if os.path.isfile(meta_file):
163
+ mf_flag=True
164
+ df_meta=pd.read_csv(meta_file,sep=',')
165
+ else:
166
+ mf_flag=False
167
+
168
+ image_dirs=find_image_dirs(os.path.join(target_path,pid_dir))
169
+ for data_dir in tqdm(image_dirs, desc="Processing images files"):
170
+
171
+
172
+
173
+ full_path=os.path.join(target_path,pid_dir,data_dir)
174
+ # data_info_row=df_meta[df_meta[meta_id_name]==data_dir]
175
+
176
+ # if data_info_row.shape[0]>0:
177
+ # data_info_row=data_info_row.reset_index()
178
+ # #print(data_info_row[meta_id_name])
179
+ # meta_image_id=data_info_row[meta_id_name][0]
180
+ # meta_weeks=data_info_row[meta_weeks_name][0]
181
+ # meta_fvc=data_info_row[meta_fvc_name][0]
182
+ # meta_percent=data_info_row[meta_percent_name][0]
183
+ # meta_age=data_info_row[meta_age_name][0]
184
+ # meta_sex=data_info_row[meta_sex_name][0]
185
+ # meta_status=data_info_row[meta_status_name][0]
186
+ # else:
187
+ # meta_image_id=data_dir
188
+ # meta_weeks=''
189
+ # meta_fvc=''
190
+ # meta_percent=''
191
+ # meta_age=''
192
+ # meta_sex=''
193
+ # meta_status=''
194
+ # full_path = convert_windows_to_linux_path(full_path)
195
+ if not os.path.isfile(full_path):
196
+ continue
197
+ if not data_dir.endswith(".nii.gz"):
198
+ continue
199
+ try:
200
+ print(full_path)
201
+
202
+ dicom_image=util.load_nifti(full_path)
203
+ spacing_info = dicom_image.GetSpacing()
204
+ print('SPACING INFO:', spacing_info)
205
+
206
+ # metadata_keys = dicom_image.GetMetaDataKeys()
207
+
208
+ # dtag=load_dicom_tag(dicom_fp[0])
209
+ # uid=dtag.GetMetaData('0020|000e') ##Series Instance UID
210
+ # modality=dtag.GetMetaData('0008|0060')##Modality
211
+ uid=data_dir[:10]
212
+ modality="CT"
213
+ study='AbdomenCT1K'##Dataset_name
214
+ CIA_other_info = {
215
+ 'Study_UID':uid,
216
+ 'metadata_file':''
217
+ # 'Series_Description':serise_desc
218
+ }
219
+ CIA_other_info['split'] = "train"
220
+ if mf_flag:
221
+ CIA_other_info['metadata_file']=meta_file
222
+
223
+ size = list(dicom_image.GetSize())
224
+ resampler =util.get_unisize_resampler(dicom_image, interpolator='linear', spacing=spacing_info, size=size)
225
+
226
+ # resize the image
227
+ if resampler is not None:
228
+ proces_image = resampler.Execute(dicom_image)
229
+ print('SPACING INFO AFTER', proces_image.GetSpacing())
230
+ CIA_other_info['Resample'] = True
231
+ else:
232
+ proces_image = dicom_image
233
+ CIA_other_info['Resample'] = False
234
+
235
+ ##
236
+ # CIA_other_info['Image_id']=meta_image_id
237
+ # CIA_other_info['Weeks']=str(meta_weeks)
238
+ # CIA_other_info['FVC']=str(meta_fvc)
239
+ # CIA_other_info['Percent']=str(meta_percent)
240
+ # CIA_other_info['Age']=str(meta_age)
241
+ # CIA_other_info['Sex']=meta_sex
242
+ # CIA_other_info['Smoke_Status']=meta_status
243
+ # threshold the image
244
+ if 'CT' in modality:
245
+ proces_image = util.clamp_image(proces_image, CLAMP_RANGE_CT)
246
+ else:
247
+ pass
248
+
249
+ output_path = os.path.join(output_dir,uid, f"{uid}.nii.gz")
250
+ # output_path=convert_windows_to_linux_path(output_path)
251
+ save_nifti(proces_image, output_path, full_path)
252
+ print(f"Saved NIfTI file to {output_path}")
253
+
254
+ ##segment
255
+ label_path_dict = {}
256
+ label_flag=True
257
+
258
+ label_paths = os.path.join(target_path, 'Mask')
259
+ label_files=glob.glob("%s/%s.nii.gz"%(label_paths,uid))
260
+ #print(label_paths,label_files)
261
+ if len(label_files)>0:
262
+ lf=label_files[0]
263
+ lf_name=os.path.basename(lf)
264
+ lf_id=lf_name.split("_")[0]
265
+ lf_tissue="abdomen"
266
+ label_image=load_nrrd(lf)
267
+ resampler =util.get_unisize_resampler(label_image, interpolator='nearest', spacing=spacing_info, size=size)
268
+ if resampler is not None:
269
+ proces_label = resampler.Execute(label_image)
270
+ else:
271
+ proces_label = label_image
272
+
273
+ label_output_path = os.path.join(output_dir, uid, TASK_VALUE, f"{uid}.nii.gz")
274
+
275
+ label_path_dict[lf_tissue] = label_output_path
276
+ util.save_nifti(proces_label, label_output_path, lf)
277
+ print(f"Saved Label Segment NIfTI file to {label_output_path}")
278
+
279
+ else:
280
+ label_flag=False
281
+ except RuntimeError:
282
+ failed_files.append(full_path)
283
+ print(f"Failed to load DICOM images from {full_path}")
284
+ continue
285
+
286
+ '''
287
+ meta.add_keyvalue('Image_id',meta_image_id)
288
+ meta.add_keyvalue('Weeks',meta_weeks)
289
+ meta.add_keyvalue('FVC',meta_fvc)
290
+ meta.add_keyvalue('Percent',meta_percent)
291
+ meta.add_keyvalue('Age',meta_age)
292
+ meta.add_keyvalue('Sex',meta_sex)
293
+ meta.add_keyvalue('Smoke_Status',meta_status)
294
+ '''
295
+ if label_flag:
296
+ print(proces_image.GetSize(),proces_label.GetSize())
297
+ try:
298
+ assert proces_image.GetSize() == proces_label.GetSize()
299
+ except Exception as e:
300
+ failed_files.append(full_path)
301
+ continue
302
+ size_processed = list(proces_image.GetSize())
303
+
304
+ meta_image_id=uid
305
+ # meta.add_keyvalue('Image_id',meta_image_id)
306
+ meta.add_keyvalue('Spacing_mm',min(spacing_info))
307
+ meta.add_keyvalue('OriImg_path',full_path)
308
+ meta.add_keyvalue('Size',size_processed) # use the size after processing here -- YH Jachin
309
+ meta.add_keyvalue('Modality',modality)
310
+ meta.add_keyvalue('Dataset_name',study)
311
+ meta.add_keyvalue('ROI','abdomen')
312
+
313
+
314
+
315
+ if label_flag:
316
+ print(label_path_dict.keys())
317
+ meta.add_keyvalue('Task',TASK_VALUE)
318
+ # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys()))
319
+ meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
320
+
321
+ meta.add_keyvalue('Label_Dict',LABEL_DICT)
322
+
323
+ meta.add_extra_keyvalue('Metadata',CIA_other_info)
324
+
325
+
326
+
327
+
328
+ # Write the mapping to the JSON file on the fly
329
+ with open(json_output_path, 'r+') as json_file:
330
+ existing_mappings = json.load(json_file)
331
+ existing_mappings[output_path] = meta.get_meta_data()
332
+ json_file.seek(0)
333
+ json.dump(existing_mappings, json_file, indent=4)
334
+ json_file.truncate()
335
+ else:
336
+ print("No metadata.csv files found.")
337
+
338
+ with open(failed_files_path, "w") as json_file:
339
+ json.dump(failed_files, json_file)
340
+
341
+ print(f"The list has been written to {failed_files_path}")
342
+ print(f"Saved NIfTI mappings to {json_output_path}")
343
+
344
+ if __name__ == "__main__":
345
+ parser = argparse.ArgumentParser(description="Process NIIGZ files and save as NIfTI.")
346
+ parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/AbdomenCT1k")
347
+ parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/AbdomenCT1k/")
348
+ args = parser.parse_args()
349
+ print(args.target_path, args.output_dir)
350
+ main(args.target_path, args.output_dir)
351
+
352
+
353
+
354
+
355
+
356
+
357
+
358
+
359
+
360
+
361
+
362
+
363
+
364
+
365
+
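The script expects the raw AbdomenCT-1K layout (AbdomenCT-1K-ImagePart* folders plus a top-level Mask directory) under --target_path and is run with the defaults above:

    python AbdomenCT1k/dataclean_abdomen_ct_1k.py --target_path /home/data/Github/data/data_gen_def/DATASETS/AbdomenCT1k --output_dir /home/data/Github/data/data_gen_def/DATASETS_processed/AbdomenCT1k/

Per case it writes one isotropic, HU-clamped NIfTI plus a resampled segmentation mask, and maintains nifti_mappings.json and failed_files.json at the output root.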
AbdomenCT1k/util.py ADDED
@@ -0,0 +1,410 @@
1
+ import os
2
+ import json
3
+ import SimpleITK as sitk
4
+ import glob
5
+ import pandas as pd
6
+
7
+ def load_dicom_images(folder_path):
8
+ reader = sitk.ImageSeriesReader()
9
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
10
+ reader.SetFileNames(dicom_names)
11
+ image = reader.Execute()
12
+ return image
13
+
14
+ def convert_windows_to_linux_path(windows_path):
15
+ # Replace backslashes with forward slashes and remove the drive letter
16
+ # Some meta files have windows paths, but the data is stored on a linux server
17
+ linux_path = windows_path.replace('\\', '/')
18
+ if ':' in linux_path:
19
+ linux_path = linux_path.split(':', 1)[1]
20
+ return linux_path
21
+
22
+ # =============================================================================
23
+ # ========================developed with TotalSegmentor========================
24
+ # =============================================================================
25
+
26
+ def read_table(file_path, split_str=';'):
27
+ try:
28
+ df = pd.read_excel(file_path, engine='openpyxl')
29
+ except:
30
+ df = pd.read_csv(file_path, sep=split_str)
31
+ return df
32
+
33
+ def load_nifti(image_path):
34
+ return sitk.ReadImage(image_path)
35
+
36
+ def save_nifti(image, output_path, folder_path):
37
+ output_dirpath = os.path.dirname(output_path)
38
+ if not os.path.exists(output_dirpath):
39
+ print(f"Creating directory {output_dirpath}")
40
+ os.makedirs(output_dirpath)
41
+ # Set metadata in the NIfTI file's header
42
+ image.SetMetaData("FolderPath", folder_path)
43
+ sitk.WriteImage(image, output_path)
44
+
45
+ def find_metadata_files(path, file_name='*meta*'):
46
+ # for TotalSegmentor dataset
47
+ search_pattern = os.path.join(path, '**', file_name)
48
+ return glob.glob(search_pattern, recursive=True)
49
+
50
+ def get_img_path_from_folder(folder_path, img_type='.nii.gz', include_str=None, exclude_str='segmentation', is_sorted=True):
51
+ img_path = []
52
+ for root, dirs, files in os.walk(folder_path):
53
+ for file in files:
54
+ if file.endswith(img_type) and (include_str is None or include_str in file) and (exclude_str is None or exclude_str not in file):
55
+ img_path.append(os.path.join(root, file))
56
+ if is_sorted:
57
+ img_path.sort()
58
+ return img_path
59
+
60
+ def get_unisize_resampler(ref_img, interpolator='linear', spacing=None, size=None):
61
+ '''
62
+ Resample the image to have isotropic spacing, following the steps:
63
+ 1. Find the minimum spacing
64
+ 2. Resample the image to have the minimum spacing
65
+ 3. Set the interpolator (linear for images, nearest for segmentation masks)
66
+ 4. Set the output spacing
67
+ 5. Return the resampler for resampling
68
+ For example, if the input image has spacing [0.1, 0.1, 0.3], the output image will have spacing [0.1, 0.1, 0.1]
69
+ '''
70
+ # discuss why this function was rewritten!!!
71
+ if size is None:
72
+ size = ref_img.GetSize()
73
+ if spacing is None:
74
+ spacing = ref_img.GetSpacing()
75
+ min_spacing = min(spacing)
76
+ if all([spc == min_spacing for spc in spacing]):
77
+ return None
78
+ else:
79
+ # if 1:
80
+ if interpolator == 'nearest':
81
+ interpolator = sitk.sitkNearestNeighbor
82
+ elif interpolator == 'linear':
83
+ interpolator = sitk.sitkLinear
84
+ resampler = sitk.ResampleImageFilter()
85
+ # new_spacing = [max_spacing] * len(spacing)
86
+ # print(size)
87
+ new_size = [int(round(old_sz * old_spc / min_spacing)) for old_sz, old_spc in zip(size, spacing)]
88
+ new_size_xy=[new_size[0],new_size[1],new_size[2]]
89
+ # discuss why this function was rewritten!!! --- YHM Jachin
90
+ new_size_spacing=[min_spacing,min_spacing,min_spacing]
91
+ # discuss why this function was rewritten!!! --- YHM Jachin
92
+ # resampler.SetSize(new_size)
93
+ # resampler.SetOutputSpacing([min_spacing] * len(spacing))
94
+ resampler.SetSize(new_size_xy)
95
+ resampler.SetOutputSpacing(new_size_spacing)
96
+
97
+ # print(new_size,new_size_xy)
98
+ resampler.SetOutputOrigin(ref_img.GetOrigin())
99
+ resampler.SetOutputDirection(ref_img.GetDirection())
100
+ resampler.SetInterpolator(interpolator)
101
+ resampler.SetDefaultPixelValue(ref_img.GetPixelIDValue())
102
+ resampler.SetOutputPixelType(ref_img.GetPixelID())
103
+ return resampler
104
+
105
+ def clamp_image(in_img,clamp_range):
106
+ '''
107
+ Clamp the image to the specified range
108
+ '''
109
+ clamp_filter = sitk.ClampImageFilter()
110
+ clamp_filter.SetLowerBound(clamp_range[0])
111
+ clamp_filter.SetUpperBound(clamp_range[1])
112
+ return clamp_filter.Execute(in_img)
113
+
114
+ def get_synonyms_dict(dict_type='ROI'):
115
+ '''
116
+ Get the dictionary of synonyms for the specified dictionary type
117
+ '''
118
+ if dict_type == 'ROI':
119
+ dict_synonyms = {
120
+ 'whole-body': ['whole-body', 'whole body', 'wholebody', 'whole body', 'whole-body', 'whole body', 'wholebody','polytrauma','head-neck-thorax-abdomen-pelvis-leg','head-neck-thorax-abdomen-pelvis'],
121
+ 'neck-thorax-abdomen-pelvis-leg': ['neck-thorax-abdomen-pelvis-leg','neck-thx-abd-pelvis-leg', 'angiography neck-thx-abd-pelvis-leg', 'neck thorax abdomen pelvis leg', 'neck and thorax and abdomen and pelvis and leg', 'neck, thorax, abdomen, pelvis & leg', 'neck/thorax/abdomen/pelvis/leg', 'neck, thorax, abdomen, pelvis and leg', 'neck thorax abdomen pelvis leg'],
122
+ 'neck-thorax-abdomen-pelvis': ['neck-thorax-abdomen-pelvis', 'neck-thx-abd-pelvis', 'neck thorax abdomen pelvis', 'neck and thorax and abdomen and pelvis', 'neck, thorax, abdomen & pelvis', 'neck/thorax/abdomen/pelvis', 'neck, thorax, abdomen and pelvis', 'neck thorax abdomen & pelvis'],
123
+ 'thorax-abdomen-pelvis-leg': ['thorax-abdomen-pelvis-leg','thx-abd-pelvis-leg', 'angiography thx-abd-pelvis-leg', 'thorax abdomen pelvis leg', 'thorax and abdomen and pelvis and leg', 'thorax, abdomen, pelvis & leg', 'thorax/abdomen/pelvis/leg', 'thorax, abdomen, pelvis and leg', 'thorax abdomen pelvis leg'],
124
+ 'neck-thorax-abdomen': ['neck-thorax-abdomen', 'neck-thorax-abdomen', 'neck thorax abdomen', 'neck and thorax and abdomen', 'neck, thorax, abdomen', 'neck/thorax/abdomen', 'neck, thorax, abdomen', 'neck thorax abdomen'],
125
+ 'head-neck-thorax-abdomen': ['head-neck-thorax-abdomen', 'head-neck-thorax-abdomen', 'head neck thorax abdomen', 'head and neck and thorax and abdomen', 'head, neck, thorax, abdomen', 'head/thorax/abdomen', 'head, thorax, abdomen', 'head thorax abdomen'],
126
+ 'head-neck-thorax': ['head-neck-thorax', 'head neck thorax', 'head and neck and thorax', 'head, neck, thorax', 'head/thorax', 'head, thorax', 'head thorax'],
127
+ 'thorax-abdomen-pelvis': ['thorax-abdomen-pelvis', 'thx-abd-pelvis', 'polytrauma', 'thorax abdomen pelvis', 'thorax and abdomen and pelvis', 'thorax, abdomen & pelvis', 'thorax/abdomen/pelvis', 'thorax, abdomen and pelvis', 'thorax abdomen & pelvis'],
128
+ 'abdomen-pelvis-leg': ['abdomen-pelvis-leg', 'angiography abdomen-pelvis-leg', 'abd-pelvis-leg', 'abdomen pelvis leg', 'abdomen and pelvis and leg', 'abdomen, pelvis & leg', 'abdomen/pelvis/leg', 'abdomen, pelvis, leg', 'abdomen pelvis leg'],
129
+ 'neck-thorax': ['neck-thorax', 'neck thorax', 'neck and thorax', 'neck, thorax', 'thorax-neck', 'thorax neck', 'thorax and neck', 'thorax, neck','thorax/neck'],
130
+ 'thorax-abdomen': ['thorax-abdomen', 'thorax abdomen', 'thorax and abdomen', 'thorax, abdomen'],
131
+ 'abdomen-pelvis': ['abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis', 'abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis'],
132
+ 'pelvis-leg': ['pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg', 'pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg'],
133
+ 'head-neck': ['head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck', 'head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck'],
134
+ 'abdomen': ['abdomen', 'abdominal', 'belly', 'stomach', 'tummy', 'gut', 'guts', 'viscera', 'bowels', 'intestines', 'gastrointestinal', 'digestive', 'peritoneum','gastric', 'liver', 'spleen', 'pancreas','kidney','lumbar','renal','hepatic','splenic','pancreatic','intervention'],
135
+ 'thorax': ['chest', 'thorax', 'breast', 'lung', 'heart','heart-thorakale aorta', 'heart-thorakale', 'mediastinum', 'pleura', 'bronchus', 'bronchi', 'trachea', 'esophagus', 'diaphragm', 'rib', 'sternum', 'clavicle', 'scapula', 'axilla', 'armpit','breast biopsy','thoracic','mammary','cardiothoracic','mediastinal','pleural','bronchial','bronchial tree','tracheal','esophageal','diaphragmatic','costal','sternal','clavicular','scapular','axillary','axillar','cardiac','pericardial','pericardiac','pericardium'],
136
+ 'head': ['head', 'headbasis', 'brain', 'skull', 'face','nose','ear','eye','mouth','jaw','cheek','chin','forehead','temporal','parietal','occipital','frontal','mandible','maxilla','mandibular','maxillary','nasal','orbital','orbita','ocular','auricular','otic','oral','buccal','labial','lingual','palatal'],
137
+ 'neck': ['neck', 'throat', 'cervical', 'thyroid', 'trachea', 'larynx', 'pharynx', 'esophagus','pharyngeal','laryngeal','cervical','thyroid','trachea','esophagus','carotid','jugular'],
138
+ 'hand': ['hand', 'finger', 'thumb', 'palm', 'wrist', 'knuckle', 'fingernail', 'phalanx', 'metacarpal', 'carpal', 'radius'],
139
+ 'arm': ['arm', 'forearm', 'upper arm', 'bicep', 'tricep', 'brachium', 'brachial', 'humerus', 'radius', 'ulna', 'elbow', 'shoulder', 'armpit', 'clavicle', 'scapula', 'acromion', 'acromioclavicular'],
140
+ 'leg': ['leg', 'felsenleg','thigh', 'calf', 'shin', 'knee', 'foot', 'ankle', 'toe', 'heel', 'sole', 'arch', 'instep', 'metatarsal', 'phalanx', 'tibia', 'fibula', 'femur', 'patella', 'kneecap','achilles tendon','achilles'],
141
+ 'pelvis': ['pelvis', 'hip', 'groin', 'buttock', 'gluteus', 'gluteal', 'ischium', 'pubis', 'sacrum', 'coccyx', 'acetabulum', 'iliac', 'iliac crest', 'iliac spine', 'iliac wing', 'sacroiliac', 'sacroiliac joint', 'sacroiliac ligament', 'sacroiliac spine', 'ureter', 'bladder', 'urethra', 'prostate', 'testicle', 'ovary', 'uterus',],
142
+ 'skeleton': ['skeleton','bone','spine', 'back', 'vertebra', 'sacrum', 'coccyx'],
143
+ }
144
+ elif dict_type == 'Label_tissue':
145
+ dict_synonyms = {
146
+ 'liver': ['liver','hepatic'],
147
+ 'spleen': ['spleen','splenic'],
148
+ 'kidney': ['kidney','renal'],
149
+ 'pancreas': ['pancreas','pancreatic'],
150
+ 'stomach': ['stomach','gastric'],
151
+ 'intestine': ['large intestine', 'small intestine','large bowel','small bowel'],
152
+ 'gallbladder': ['gallbladder'],
153
+ 'adrenal_gland': ['adrenal_gland','adrenal gland'],
154
+ 'bladder': ['bladder'],
155
+ 'prostate': ['prostate'],
156
+ 'uterus': ['uterus'],
157
+ 'ovary': ['ovary'],
158
+ 'testicle': ['testicle'],
159
+ 'lymph_node': ['lymph_node','lymph node'],
160
+ 'bone': ['bone'],
161
+ 'lung': ['lung'],
162
+ 'heart': ['heart'],
163
+ 'esophagus': ['esophagus'],
164
+ 'muscle': ['muscle'],
165
+ 'fat': ['fat'],
166
+ 'skin': ['skin'],
167
+ 'vessel': ['vessel'],
168
+ 'tumor': ['tumor'],
169
+ 'other': ['other']
170
+ }
171
+ elif dict_type == 'Task':
172
+ dict_synonyms = {
173
+ 'segmentation': ['segmentation', 'seg', 'mask'],
174
+ 'classification': ['classification', 'class', 'diagnosis','identify','identification'],
175
+ 'localization': ['localization', 'locate', 'location', 'position'],
176
+ 'registration': ['registration', 'register', 'align', 'alignment'],
177
+ 'detection': ['detection', 'detect', 'find', 'locate'],
178
+ 'quantification': ['quantification', 'quantify', 'measure', 'measurement'],
179
+ }
180
+ elif dict_type == 'Modality':
181
+ dict_synonyms = {
182
+ 'CT': ['CT', 'computed tomography'],
183
+ 'MRI': ['MRI', 'MR', 'magnetic resonance imaging'],
184
+ 'PET': ['PET', 'positron emission tomography'],
185
+ 'US': ['US', 'ultrasound'],
186
+ 'X-ray': ['X-ray', 'radiography'],
187
+ 'SPECT': ['SPECT', 'single-photon emission computed tomography'],
188
+ }
189
+ else:
190
+ raise ValueError(f"dict_type {dict_type} is not valid")
191
+ return dict_synonyms
192
+
193
+ def replace_synonyms(text, dict_synonyms):
194
+     '''
195
+     Replace the synonyms in the text with the standard term
196
+     '''
197
+     if isinstance(text, str):
198
+         for key, value in dict_synonyms.items():
199
+             for v in value:
200
+                 if v.lower() in text.lower():
201
+                     return key
202
+         print(f"Warning: value {text} is not in the correct format; keeping it unchanged")
203
+         return text
204
+     elif isinstance(text, list):
205
+         text = [replace_synonyms(t, dict_synonyms) for t in text]
206
+     elif isinstance(text, dict):
207
+         for key in list(text.keys()):
208
+             # replace values in dict
209
+             new_value = replace_synonyms(text.pop(key), dict_synonyms)
210
+             # replace keys in dict with their standard term
211
+             text[replace_synonyms(key, dict_synonyms)] = new_value
212
+     return text
213
+
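+ # Usage sketch: 'chest' normalizes to 'thorax' via the ROI dictionary, 'seg' to
+ # 'segmentation' via the Task dictionary:
+ #   replace_synonyms('chest', get_synonyms_dict('ROI'))   # -> 'thorax'
+ #   replace_synonyms('seg', get_synonyms_dict('Task'))    # -> 'segmentation'
+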
214
+ # =============================================================================
215
+
216
+ class meta_data(object):
217
+ '''
218
+ This class is used to store the metadata of the dataset
219
+ '''
220
+ def __init__(self):
221
+ self.config_format_path = os.path.join(os.path.dirname(__file__),'config_format.json')
222
+ with open(self.config_format_path, 'r') as file:
223
+ self.config_format = json.load(file)
224
+ self.config = {}
225
+ for key in self.config_format.keys():
226
+ if self.config_format[key]['required'] == True:
227
+ self.config[key] = {}
228
+ self.keytypes = self.find_all_keys_with_type()
229
+ self.keytypes_flatten = self.flatten_json()
230
+ self.ambiguity_keys = ['ROI', 'Label_tissue', 'Task', 'Modality']
231
+ for key in self.ambiguity_keys:
232
+ ambiguity_dict = get_synonyms_dict(key)
233
+ self.config_format[key]['options'] = list(ambiguity_dict.keys())
234
+
235
+ def get_keytypes(self):
236
+ return self.keytypes
237
+
238
+ def get_keytypes_flatten(self):
239
+ return self.keytypes_flatten
240
+
241
+ def find_all_keys_with_type(self, data=None, parent_key=''):
242
+ if data is None:
243
+ data = self.config_format
244
+ keys_with_type = {}
245
+ if isinstance(data, dict):
246
+ for key, value in data.items():
247
+ full_key = f"{parent_key}.{key}" if parent_key else key
248
+ if isinstance(value, dict) and 'type' in value:
249
+ keys_with_type[full_key] = value['type']
250
+ keys_with_type.update(self.find_all_keys_with_type(value, full_key))
251
+ elif isinstance(data, list):
252
+ for index, item in enumerate(data):
253
+ full_key = f"{parent_key}[{index}]"
254
+ keys_with_type.update(self.find_all_keys_with_type(item, full_key))
255
+ return keys_with_type
256
+
257
+ def flatten_json(self, data=None, parent_key='', sep='.'):
258
+ if data is None:
259
+ data = self.config_format
260
+ items = {}
261
+ if isinstance(data, dict):
262
+ for key, value in data.items():
263
+ new_key = f"{parent_key}{sep}{key}" if parent_key else key
264
+ if isinstance(value, dict):
265
+ items.update(self.flatten_json(value, new_key, sep=sep))
266
+ elif isinstance(value, list):
267
+ for i, item in enumerate(value):
268
+ items.update(self.flatten_json(item, f"{new_key}[{i}]", sep=sep))
269
+ else:
270
+ items[new_key] = value
271
+ elif isinstance(data, list):
272
+ for i, item in enumerate(data):
273
+ items.update(self.flatten_json(item, f"{parent_key}[{i}]", sep=sep))
274
+ return items
275
+
276
+ def req_check(self):
277
+ self.unfilled_keys = []
278
+ for key in self.config.keys():
279
+ if self.config[key] == {}:
280
+ self.unfilled_keys.append(key)
281
+ if len(self.unfilled_keys) == 0:
282
+ return True
283
+ else:
284
+ return False
285
+
286
+ def type_check(self, key, value):
287
+ if key not in self.config_format.keys():
288
+ print(key, "is not a valid key")
289
+ return False
290
+
291
+ if key == 'Modality':
292
+ if value not in self.config_format[key]['options']:
293
+ return False
294
+ else:
295
+ return True
296
+
297
+ elif key == 'OriImg_path':
298
+ if isinstance(value, str):
299
+ return True
300
+ else:
301
+ return False
302
+
303
+ elif key == 'Label_path' and isinstance(value, dict):
304
+ for skey in value.keys():
305
+ if skey in self.config_format[key]['keys']:
306
+ for kk in value[skey]:
307
+ if isinstance(value[skey][kk],str):
308
+ pass
309
+ # if kk in self.config_format[key]['value']['keys']:
310
+ # if isinstance(value[skey][kk],str):
311
+ # pass
312
+ # else:
313
+ # return False
314
+ else:
315
+ return False
316
+ return True
317
+
318
+ elif key == 'ROI':
319
+ if value not in self.config_format[key]['options']:
320
+ return False
321
+ else:
322
+ return True
323
+
324
+ elif key == 'Label_tissue' and isinstance(value, list):
325
+ for i in value:
326
+ if i not in self.config_format[key]['items']['options']:
327
+ return False
328
+ return True
329
+
330
+ elif key =='Task' and isinstance(value, list):
331
+ for i in value:
332
+ if i not in self.config_format[key]['items']['options']:
333
+ return False
334
+ return True
335
+
336
+ elif key == 'Spacing_mm':
337
+ if isinstance(value, float):
338
+ return True
339
+ else:
340
+ return False
341
+
342
+ # elif key == 'Size' and isinstance(value, list) and len(value) == 3 :
343
+ elif key == 'Size' and isinstance(value, list) and len(value) >= 3 :
344
+ return all(isinstance(item, int) for item in value)
345
+
346
+ elif key == 'Dataset_name':
347
+ if isinstance(value, str):
348
+ return True
349
+ else:
350
+ return False
351
+ ##added by yanguoqing on 2025-08-08
352
+ elif key == 'Sub_modality':
353
+
354
+ if isinstance(value, dict):
355
+ return True
356
+ else:
357
+ return False
358
+ elif key == 'Label_Dict':
359
+
360
+ if isinstance(value, dict):
361
+ return True
362
+ else:
363
+ return False
364
+ def add_extra_keyvalue(self, key, value):
365
+ self.config[key] = value
366
+ return True
367
+
368
+ def add_keyvalue(self, key, value):
369
+ if key in self.ambiguity_keys:
370
+ value = replace_synonyms(value, get_synonyms_dict(key))
371
+ # print(key, value)
372
+ if self.type_check(key, value):
373
+
374
+ self.config[key] = value
375
+ return True
376
+ else:
377
+ print(f"Warning: value {value} is not in the correct format for key {key}")
378
+ return False
379
+
380
+
381
+ def get_meta_data(self):
382
+ if self.req_check():
383
+ return self.config
384
+ else:
385
+ print("Not all required keys are filled", self.unfilled_keys)
386
+ return False
387
+
388
+
389
+
390
+ if __name__ == '__main__':
391
+ meta = meta_data()
392
+ print(meta.get_keytypes_flatten())
393
+ print(meta.get_keytypes())
394
+ meta.add_keyvalue('Modality', 'CT')
395
+ meta.add_keyvalue('OriImg_path', 'C:/Users/jzheng/Desktop/CT')
396
+ meta.add_keyvalue('Label_path', {'ROI': {'1': 'C:/Users/jzheng/Desktop/CT/1'}, 'Tissue': {'1': 'C:/Users/jzheng/Desktop/CT/1'}})
397
+ meta.add_keyvalue('Spacing_mm', 1.5)
398
+ meta.add_keyvalue('Size', [512, 512, 100])
399
+ meta.add_keyvalue('Dataset_name', 'CT')
400
+ meta.add_keyvalue('Label_tissue', ['1', '2', '3'])
401
+ meta.add_keyvalue('Task', ['1', '2', '3'])
402
+ print(meta.get_meta_data())
403
+ meta.add_extra_keyvalue('extra', 'extra')
404
+ print(meta.get_meta_data())
405
+ print(meta.get_keytypes())
406
+ print(meta.get_keytypes_flatten())
407
+
408
+ org_data_folder_path = '/home/jachin/data/Github/data/data_gen_def/DATASETS/TotalSegmentorCT_MRI/TS_CT'
409
+ img_paths = get_img_path_from_folder(org_data_folder_path, img_type='.nii.gz', include_str='ct', exclude_str='segmentation')
410
+ print(img_paths)
CLAUDE.md ADDED
@@ -0,0 +1,71 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ Medical imaging data engineering pipeline for standardizing diverse datasets (CT, MRI, PET) into a unified NIfTI format with consistent JSON metadata. Each subdirectory handles one dataset (AbdomenAtlas, BRATS, MnM2, OASIS, OAI_ZIB, PSMA, Kaggle OSIC, etc.).
8
+
9
+ ## Running Data Cleaning Scripts
10
+
11
+ Each dataset has its own `dataclean_*.py` script. Run from the dataset's subdirectory:
12
+
13
+ ```bash
14
+ python dataclean_abdomen_atlas.py --target_path /path/to/raw/data --output_dir /path/to/output
15
+ ```
16
+
17
+ All scripts follow the same `--target_path` / `--output_dir` argument pattern. Versioned scripts (e.g., `_v2.py`, `_v3.py`) represent iterative improvements; use the highest version unless investigating regressions.
18
+
19
+ ## Dependencies
20
+
21
+ Python 3 with: `SimpleITK`, `pandas`, `numpy`, `tqdm`, `openpyxl` (for Excel metadata). No requirements.txt exists — install manually.
22
+
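+ A one-shot install (the pip package names match the imports):
+
+ ```bash
+ pip install SimpleITK pandas numpy tqdm openpyxl
+ ```
+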
23
+ ## Architecture
24
+
25
+ ### Processing Pipeline (per dataset)
26
+
27
+ 1. **Load** raw data (DICOM via `sitk.ImageSeriesReader`, NIfTI via `sitk.ReadImage`, or NRRD)
28
+ 2. **Extract metadata** from headers, CSV files, or DICOM tags
29
+ 3. **Resample** to isotropic spacing using minimum voxel spacing (`get_unisize_resampler`)
30
+ 4. **Clamp intensities** — CT: `[-300, 300]` HU; MRI: varies per dataset
31
+ 5. **Process segmentation labels** with identical resampling (nearest-neighbor interpolation)
32
+ 6. **Validate** image/label dimension alignment via `assert` on `GetSize()`
33
+ 7. **Write** standardized NIfTI (`.nii.gz`) + append to `nifti_mappings.json`
34
+
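+ A minimal sketch of one image's pass through these steps, using the `util.py` helpers described below (paths are hypothetical):
+
+ ```python
+ import SimpleITK as sitk
+ import util
+
+ img = sitk.ReadImage("/data/raw/case_001.nii.gz")        # 1. load
+ resampler = util.get_unisize_resampler(img, "linear")    # 3. isotropic resampler (None if already isotropic)
+ if resampler is not None:
+     img = resampler.Execute(img)
+ img = util.clamp_image(img, [-300, 300])                 # 4. CT intensity clamp
+ util.save_nifti(img, "/out/case_001/case_001.nii.gz",    # 7. write, embedding FolderPath metadata
+                 "/data/raw/case_001.nii.gz")
+ ```
+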
35
+ ### Key Shared Components
36
+
37
+ **`util.py`** (copied into each dataset directory — not a shared import):
38
+ - `meta_data` class — validates metadata against `config_format.json` schema, enforces required fields (Modality, OriImg_path, Spacing_mm, Size, Dataset_name), normalizes ambiguous terminology via synonym dictionaries
39
+ - `get_unisize_resampler()` — builds a SimpleITK resampler for isotropic spacing; returns `None` if spacing is already isotropic
40
+ - `clamp_image()` — HU/intensity clamping via `sitk.ClampImageFilter`
41
+ - `get_synonyms_dict()` / `replace_synonyms()` — canonical mapping for ROI names, tissue labels, modalities, and task types
42
+ - `load_nifti()`, `load_dicom_images()`, `save_nifti()` — I/O wrappers that embed `FolderPath` metadata in NIfTI headers
43
+
44
+ **`config_format.json`** (per dataset directory): defines the metadata schema — field types, required flags, and allowed option values.
45
+
46
+ ### Output Structure
47
+
48
+ ```
49
+ {output_dir}/{patient_id}/{patient_id}.nii.gz # processed image
50
+ {output_dir}/{patient_id}/{task}/{tissue}.nii.gz # segmentation labels
51
+ {output_dir}/nifti_mappings.json # metadata keyed by output path
52
+ {output_dir}/failed_files.json # files that failed processing
53
+ ```
54
+
55
+ ### Dataset-Specific Notes
56
+
57
+ - **AbdomenAtlas**: 25-organ segmentation labels stored as individual NIfTI files per organ; also has `combined_labels.nii.gz` (values 0-25)
58
+ - **BRATS (2019/2020/2021)**: Multi-modal MRI (FLAIR, T1, T1ce, T2) — each modality processed as a separate sub-modality entry
59
+ - **MnM2/MnMs**: Cardiac MRI with vendor metadata (Siemens, Philips, GE, Canon)
60
+ - **OASIS**: Both cross-sectional and longitudinal variants; includes clinical scores (MMSE, CDR)
61
+ - **OAI_ZIB**: Knee MRI with 6-structure segmentation and clinical grading (WOMAC)
62
+ - **PSMA**: Dual-tracer PET/CT (PSMA & FDG); has longitudinal variant
63
+
64
+ ## Important Conventions
65
+
66
+ - Resampling uses the **minimum** of the original spacing values to create isotropic voxels
67
+ - Labels are resampled with **nearest-neighbor** interpolation; images use **linear**
68
+ - The `meta_data` class normalizes terminology automatically — e.g., "chest" maps to "thorax", "seg" maps to "segmentation"
69
+ - `util.py` is duplicated across directories (not shared via import) — changes must be propagated manually
70
+ - Code comments and docstrings are frequently in Chinese
71
+ - Log files (`*.log`) in each directory contain processing run history — these can be large (up to 23 MB)
MnM2_clean/config_format.json ADDED
@@ -0,0 +1,124 @@
1
+ {
2
+ "Modality": {
3
+ "type": "option",
4
+ "required": true,
5
+ "options": [
6
+ "CT",
7
+ "MRI",
8
+ "T1",
9
+ "T2",
10
+ "X-ray",
11
+ "Fluoroscopy",
12
+ "US",
13
+ "PET"
14
+ ]
15
+ },
16
+ "OriImg_path": {
17
+ "type": "string",
18
+ "required": true
19
+ },
20
+ "Label_path": {
21
+ "type": "dict",
22
+ "required": false,
23
+ "keys": [
24
+ "classification",
25
+ "segmentation",
26
+ "regression",
27
+ "detection",
28
+ "localization",
29
+ "registration",
30
+ "other"
31
+ ],
32
+ "value": {
33
+ "type": "dict",
34
+ "required": false,
35
+ "keys": [
36
+ "lung",
37
+ "liver",
38
+ "heart",
39
+ "brain",
40
+ "kidney"
41
+ ],
42
+ "value": {
43
+ "type": "string",
44
+ "required": false
45
+ }
46
+ }
47
+ },
48
+ "ROI": {
49
+ "type": "option",
50
+ "required": false,
51
+ "options": [
52
+ "chest-abdomen",
53
+ "abdomen-pelvis",
54
+ "head",
55
+ "neck",
56
+ "skeleton",
57
+ "chest",
58
+ "abdomen",
59
+ "shoulder",
60
+ "leg",
61
+ "arm",
62
+ "hand",
63
+ "foot",
64
+ "pelvis"
65
+ ]
66
+ },
67
+ "Label_tissue": {
68
+ "type": "list",
69
+ "required": false,
70
+ "items": {
71
+ "type": "option",
72
+ "required": true,
73
+ "options": [
74
+ "lung",
75
+ "liver",
76
+ "heart",
77
+ "brain",
78
+ "kidney",
79
+ "spleen",
80
+ "pancreas",
81
+ "stomach",
82
+ "intestine",
83
+ "muscle",
84
+ "bone"
85
+ ]
86
+ }
87
+ },
88
+ "Task": {
89
+ "type": "list",
90
+ "required": false,
91
+ "items": {
92
+ "type": "option",
93
+ "required": true,
94
+ "options": [
95
+ "classification",
96
+ "segmentation"
97
+ ]
98
+ }
99
+ },
100
+ "Spacing_mm": {
101
+ "type": "float",
102
+ "required": true
103
+ },
104
+ "Size": {
105
+ "type": "list",
106
+ "required": true,
107
+ "items": {
108
+ "type": "int",
109
+ "required": true
110
+ }
111
+ },
112
+ "Dataset_name": {
113
+ "type": "string",
114
+ "required": true
115
+ },
116
+ "ImgDict": {
117
+ "type": "dict",
118
+ "required": false
119
+ },
120
+ "Label_Dict": {
121
+ "type": "dict",
122
+ "required": false
123
+ }
124
+ }
MnM2_clean/dataclean_MnM2.py ADDED
@@ -0,0 +1,427 @@
1
+ #coding:utf-8
2
+ '''
3
+ written by ygq
4
+ created on 2025-08-26
5
+ MnM2 data-clean update
6
+
7
+ Processing logic for the MnM2 dataset (my own understanding; this script is written along these lines):
8
+ 1. LA and SA series must be stored and processed separately.
9
+ 2. ED/ES are the end-diastole/end-systole images, i.e. single frames of the corresponding CINE (LA or SA); no header information was found, so the exact frame index is unknown.
10
+ 3. This does not appear to be the original MnM2 dataset but a pre-processed version; again, no header information was found.
11
+ 4. Files with a _gt suffix are label files containing the values 0,1,2,3 [0: background, 1: left ventricle cavity (LV), 2: right ventricle cavity (RV), 3: left ventricular myocardium (Myo)] -- please help confirm.
12
+
13
+ a. The reprocessed LA-CINE and SA-CINE files must be saved separately.
14
+ b. The reprocessed LA-ED/LA-ES and SA-ED/SA-ES files (same spacing and size as the CINE) and their label files must also be handled separately.
15
+
16
+ ## LA-ED/ES are kept separate for now; the ED/ES frame could be identified by computing the mean intensity of each CINE time frame [experimentally feasible]. -- 20250825
17
+ Segmentation labels: NIfTI format, label values:
18
+
19
+ 0: background
20
+
21
+ 1: left ventricle cavity (LV)
22
+
23
+ 2: right ventricle cavity (RV)
24
+
25
+ 3: left ventricular myocardium (Myo)
26
+
27
+ The current version has no metadata file.
28
+
29
+ '''
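+ # Sketch of the ED/ES-frame idea from the note above (an assumption; not wired into the pipeline):
+ # in cine MRI the blood pool is bright, so the end-diastolic frame (largest blood volume) tends to
+ # have the highest mean intensity and the end-systolic frame the lowest:
+ #   arr = sitk.GetArrayFromImage(cine_4d)                # shape (t, z, y, x)
+ #   frame_means = arr.reshape(arr.shape[0], -1).mean(axis=1)
+ #   ed_idx, es_idx = int(np.argmax(frame_means)), int(np.argmin(frame_means))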
30
+ import os
31
+ import glob
32
+ import pandas as pd
33
+ import SimpleITK as sitk
34
+ import argparse
35
+ import json
36
+ from tqdm import tqdm
37
+ from util import meta_data
38
+ import util
39
+ import numpy as np
40
+ # from bert_helper import *
41
+
42
+
43
+
44
+
45
+ TASK_VALUE="segmentation"
46
+ CLAMP_RANGE_CT = [-300,300]
47
+ CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
48
+ TARGET_VOXEL_SPACING=None
49
+
50
+ LABEL_DICT={
51
+ "0":"background",
52
+ "1":"LV",# left ventricle blood pool
53
+ "3":"MYO",# left ventricular myocardium
54
+ "2":"RV"# right ventricle blood pool
55
+ }
56
+
57
+ # def find_metadata_files(path):
58
+ # # for Cancer Image Archive (TCIA) dataset
59
+ # search_pattern = os.path.join(path, '**', 'metadata.csv')
60
+ # return glob.glob(search_pattern, recursive=True)
61
+
62
+ def find_metadata_files(path):
63
+ # for Cancer Image Archive (TCIA) dataset
64
+ search_pattern = os.path.join(path, '*.csv')
65
+ return glob.glob(search_pattern, recursive=True)
66
+ ##added by yanguoqing on 20250527
67
+ def find_image_dirs(path):
68
+ return os.listdir(path)
69
+
70
+ ##modify by yanguoqing on 20250527
71
+ def load_dicom_images(folder_path):
72
+ reader = sitk.ImageSeriesReader()
73
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
74
+ reader.SetFileNames(dicom_names)
75
+ image = reader.Execute()
76
+ return dicom_names,image
77
+
78
+ ##added by yanguoqing on 20250527
79
+ def load_dicom_tag(imgs):
80
+ reader = sitk.ImageFileReader()
81
+ # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
82
+ reader.SetFileName(imgs)
83
+ reader.ReadImageInformation() # read only the metadata; do not load pixel data
84
+ # metadata_keys = reader.GetMetaDataKeys()
85
+ tag=reader.Execute()
86
+ return tag
87
+
88
+ def load_nrrd(fp):
89
+ return sitk.ReadImage(fp)
90
+
91
+ def save_nifti(image, output_path, folder_path):
92
+ # Set metadata in the NIfTI file's header
93
+ output_dirpath = os.path.dirname(output_path)
94
+ if not os.path.exists(output_dirpath):
95
+ print(f"Creating directory {output_dirpath}")
96
+ os.makedirs(output_dirpath)
97
+ # Set metadata in the NIfTI file's header
98
+ image.SetMetaData("FolderPath", folder_path)
99
+ sitk.WriteImage(image, output_path)
100
+
101
+ ##modify by yanguoqing on 20250527
102
+ def convert_windows_to_linux_path(windows_path):
103
+ # Replace backslashes with forward slashes and remove the drive letter
104
+ # Some meta files have windows paths, but the data is stored on a linux server
105
+ linux_path = windows_path.replace('\\', '/')
106
+ if ':' in linux_path:
107
+ linux_path = linux_path.split(':', 1)[1]
108
+ return linux_path
109
+
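+ # Example (hypothetical input): convert_windows_to_linux_path('D:\\data\\img.nii.gz') -> '/data/img.nii.gz'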
110
+ def main(target_path, output_dir):
111
+ metadata_files = find_metadata_files(target_path)
112
+ pid_dirs=find_image_dirs(target_path)
113
+ # pid_dirs=["Training","Testing","Validation"]
114
+ failed_files = []
115
+ if not os.path.isdir(output_dir):
116
+ os.makedirs(output_dir)
117
+ json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
118
+ failed_files_path = os.path.join(output_dir, 'failed_files.json')
119
+ meta = meta_data()
120
+
121
+ # Initialize the JSON file
122
+ if not os.path.exists(json_output_path):
123
+ with open(json_output_path, 'w') as json_file:
124
+ json.dump({}, json_file)
125
+ meta_file=os.path.join(target_path,'211230_M&Ms_Dataset_information_diagnosis_opendataset.csv')
126
+ if os.path.isfile(meta_file):
127
+ mf_flag=True
128
+ df_meta=pd.read_csv(meta_file,sep=',')
129
+ else:
130
+ mf_flag=False
131
+
132
+ if pid_dirs:
133
+ for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
134
+ if not os.path.isdir(os.path.join(target_path,pid_dir)):
135
+ continue
136
+ meta_image_id=pid_dir
137
+
138
+ modality="MRI"
139
+ study='MnM2'##Dataset_name
140
+
141
+ full_dir=os.path.join(target_path,pid_dir)
142
+ dfs=find_image_dirs(full_dir)##list all nii.gz files
143
+
144
+
145
+ if len(dfs)>0:
146
+ for df in dfs:
147
+ ## loop over the files to find the SA/LA CINE and ES/ED images and their matching gt files
148
+ if "CINE" in df:
149
+ ## regular processing
150
+ label_flag=False
151
+ if "_LA_" in df:
152
+ la_flag=True
153
+ else:
154
+ la_flag=False
155
+
156
+ elif "ES.nii.gz" in df:
157
+ if "_LA_" in df:
158
+ la_flag=True
159
+ else:
160
+ la_flag=False
161
+ if os.path.isfile(os.path.join(full_dir,df.replace(".nii.gz","_gt.nii.gz"))):
162
+ label_flag=True
163
+ else:
164
+ label_flag=False
165
+ else:
166
+ continue
167
+ try:
168
+ ## process the data
169
+ full_path_image=os.path.join(full_dir,df)
170
+
171
+ sitk_img_original = util.load_nifti(full_path_image)
172
+ if sitk_img_original is None:
173
+ print(f" Failed to load image: {full_path_image}")
174
+ continue
175
+
176
+ original_spacing = list(sitk_img_original.GetSpacing())
177
+ original_size = list(sitk_img_original.GetSize())
178
+ sitk_img_processed = sitk_img_original
179
+ # is_4d_image = msd_dataset_info.get("tensorImageSize", "3D").upper() == "4D" or sitk_img_original.GetDimension() == 4
180
+ is_4d_image = sitk_img_original.GetDimension() == 4
181
+
182
+ frame_flag=False
183
+ # --- Resampling Logic (Revised for 4D) ---
184
+ if is_4d_image:
185
+
186
+ # Always process 4D images channel-wise for resampling
187
+ # logging.info(f" Processing 4D image channel-wise: {original_img_full_path}") # Keep log for errors only
188
+ channels = []
189
+ num_channels = original_size[3] if len(original_size) == 4 and sitk_img_original.GetDimension() == 4 else 1
190
+ channel_target_spacing = TARGET_VOXEL_SPACING if TARGET_VOXEL_SPACING else original_spacing[:3] # Use 3D spacing
191
+
192
+
193
+ for i in range(num_channels):
194
+ extractor = sitk.ExtractImageFilter()
195
+ current_3d_channel_size = original_size[:3]
196
+
197
+ if sitk_img_original.GetDimension() == 4:
198
+ extractor.SetSize([current_3d_channel_size[0], current_3d_channel_size[1], current_3d_channel_size[2], 0])
199
+ extractor.SetIndex([0,0,0,i])
200
+ channel_3d_img = extractor.Execute(sitk_img_original)
201
+ else:
202
+ channel_3d_img = sitk_img_original
203
+ if i > 0: break
204
+
205
+ channel_resampler = util.get_unisize_resampler(
206
+ channel_3d_img, 'linear',
207
+ spacing=channel_target_spacing, size=current_3d_channel_size
208
+ )
209
+ if channel_resampler:
210
+ channels.append(channel_resampler.Execute(channel_3d_img))
211
+ else:
212
+ channels.append(channel_3d_img)
213
+
214
+ if channels:
215
+ if len(channels) > 1: # Only join if there are multiple channels
216
+ sitk_img_processed = sitk.JoinSeriesImageFilter().Execute(channels)
217
+ ## added by yanguoqing on 2025-08-11
218
+ frame_flag=True
219
+ # imgDict={}
220
+ # for kf_idx in range(num_channels):
221
+ # imgDict[str(kf_idx)]='none'
222
+ # if str(meta_ed):imgDict[str(meta_ed)]='ed'
223
+ # if str(meta_es):imgDict[str(meta_es)]='es'
224
+ # meta.add_keyvalue('ImgDict',imgDict)
225
+ elif len(channels) == 1: # If only one channel resulted (e.g. original was 3D misidentified as 4D by tensorImageSize)
226
+ sitk_img_processed = channels[0]
227
+ elif TARGET_VOXEL_SPACING: # 3D image with target spacing
228
+ img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',
229
+ spacing=TARGET_VOXEL_SPACING, size=original_size)
230
+ if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original)
231
+ else: # 3D image, no TARGET_VOXEL_SPACING
232
+ img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',
233
+ spacing=original_spacing, size=original_size)
234
+ if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original)
235
+
236
+
237
+ CIA_other_info = {
238
+ 'metadata_file':''
239
+ # 'Series_Description':serise_desc
240
+ }
241
+ CIA_other_info['split'] = "train"
242
+ CIA_other_info['Image_id']=meta_image_id
243
+ if mf_flag:
244
+ CIA_other_info['metadata_file']=meta_file
245
+
246
+ is_processed_4d = sitk_img_processed.GetDimension() == 4
247
+ clamp_range_to_use=None
248
+ if clamp_range_to_use and is_processed_4d:
249
+ clamped_channels_final = []
250
+ num_channels_final = sitk_img_processed.GetSize()[3] if len(sitk_img_processed.GetSize()) == 4 else 1
251
+ for i in range(num_channels_final):
252
+ extractor = sitk.ExtractImageFilter()
253
+ proc_size_final = sitk_img_processed.GetSize()
254
+ extractor.SetSize([proc_size_final[0], proc_size_final[1], proc_size_final[2], 0])
255
+ extractor.SetIndex([0,0,0,i])
256
+ channel_3d_img_to_clamp = extractor.Execute(sitk_img_processed)
257
+ clamped_channels_final.append(util.clamp_image(channel_3d_img_to_clamp, clamp_range_to_use))
258
+ if clamped_channels_final:
259
+ if len(clamped_channels_final) > 1:
260
+ sitk_img_processed = sitk.JoinSeriesImageFilter().Execute(clamped_channels_final)
261
+ elif len(clamped_channels_final) == 1:
262
+ sitk_img_processed = clamped_channels_final[0]
263
+ elif clamp_range_to_use: # 3D image
264
+ sitk_img_processed = util.clamp_image(sitk_img_processed, clamp_range_to_use)
265
+
266
+
267
+ output_path = os.path.join(output_dir,pid_dir, f"{df}")
268
+ # output_path=convert_windows_to_linux_path(output_path)
269
+ save_nifti(sitk_img_processed, output_path, full_path_image)
270
+ print(f"Saved NIfTI file to {output_path}")
271
+
272
+
273
+
274
+ label_path_dict = {}
275
+
276
+ if label_flag:
277
+ processed_lbl_full_path = os.path.join(output_dir, pid_dir, TASK_VALUE, f"{df}")
278
+ full_path_label=os.path.join(full_dir,df.replace(".nii.gz","_gt.nii.gz"))
279
+
280
+ sitk_lbl_original = util.load_nifti(full_path_label)
281
+ if not sitk_lbl_original:
282
+ print(f" Failed to load label: {full_path_label}")
283
+ processed_lbl_full_path = None
284
+ continue
285
+ if sitk_lbl_original:
286
+ label_resampler = sitk.ResampleImageFilter()
287
+ reference_for_label = sitk_img_processed # Default to processed image
288
+
289
+ if sitk_img_processed.GetDimension() == 4:
290
+ num_comp_proc = sitk_img_processed.GetSize()[3] if len(sitk_img_processed.GetSize()) == 4 else 1
291
+ if num_comp_proc > 0:
292
+ extractor = sitk.ExtractImageFilter()
293
+ proc_img_size_for_lbl_ref = sitk_img_processed.GetSize()
294
+ extractor.SetSize([proc_img_size_for_lbl_ref[0], proc_img_size_for_lbl_ref[1], proc_img_size_for_lbl_ref[2], 0])
295
+ extractor.SetIndex([0,0,0,0])
296
+ try:
297
+ reference_for_label = extractor.Execute(sitk_img_processed)
298
+ except Exception as ref_err:
299
+ print(f" Failed to extract 3D reference from 4D image: {output_path} for label alignment.")
300
+ # print(traceback.format_exc())
301
+ reference_for_label = None
302
+ else: # Fallback if extraction fails
303
+ print(f" Could not extract 3D reference for label from 4D image {output_path}. Label may not be correctly resampled.")
304
+ reference_for_label = None # This will cause an issue below if not handled
305
+
306
+ sitk_lbl_processed = None
307
+
308
+ if reference_for_label and reference_for_label.GetDimension() > 0:
309
+ label_resampler.SetInterpolator(sitk.sitkNearestNeighbor)
310
+ label_resampler.SetOutputPixelType(sitk_lbl_original.GetPixelID())
311
+
312
+ if sitk_lbl_original.GetDimension() == 4:
313
+ lbl_channels = []
314
+ lbl_size = list(sitk_lbl_original.GetSize())
315
+ for i in range(lbl_size[3]):
316
+ extractor = sitk.ExtractImageFilter()
317
+ extractor.SetSize([lbl_size[0], lbl_size[1], lbl_size[2], 0])
318
+ extractor.SetIndex([0, 0, 0, i])
319
+ single_channel = extractor.Execute(sitk_lbl_original)
320
+
321
+ label_resampler.SetReferenceImage(reference_for_label)
322
+ resampled_channel = label_resampler.Execute(single_channel)
323
+ lbl_channels.append(resampled_channel)
324
+
325
+ if len(lbl_channels) > 1:
326
+ sitk_lbl_processed = sitk.JoinSeriesImageFilter().Execute(lbl_channels)
327
+ elif len(lbl_channels) == 1:
328
+ sitk_lbl_processed = lbl_channels[0]
329
+ else:
330
+ label_resampler.SetReferenceImage(reference_for_label)
331
+ sitk_lbl_processed = label_resampler.Execute(sitk_lbl_original)
332
+ if processed_lbl_full_path:
333
+ if sitk_img_processed.GetSize()[:3] != sitk_lbl_processed.GetSize()[:3]:
334
+ print(f" Mismatch between image and label size (ignoring channels):")
335
+ print(f" Image size: {sitk_img_processed.GetSize()}")
336
+ print(f" Label size: {sitk_lbl_processed.GetSize()}")
337
+ util.save_nifti(sitk_lbl_processed, processed_lbl_full_path, full_path_label)
338
+ else:
339
+ print(f" Failed to set reference image for label resampling for {full_path_label}. Saving original label.")
340
+ util.save_nifti(sitk_lbl_original, processed_lbl_full_path, full_path_label) # Save original
341
+ # processed_lbl_full_path should still point to this saved original label
342
+ sitk_lbl_processed=sitk_lbl_original
343
+ else:
344
+ processed_lbl_full_path = None
345
+ else:
346
+ processed_lbl_full_path = None
347
+
348
+ if processed_lbl_full_path:
349
+ label_path_dict['heart'] = processed_lbl_full_path
350
+
351
+ print('compare image and label size',sitk_img_original.GetSize(),sitk_lbl_original.GetSize())
352
+ print('compare image and label size',sitk_img_processed.GetSize(),sitk_lbl_processed.GetSize())
353
+ try:
354
+ assert sitk_img_processed.GetSize() == sitk_lbl_processed.GetSize()
355
+
356
+ except Exception as e:
357
+ failed_files.append(full_path_label)
358
+ continue
359
+ except RuntimeError:
360
+ failed_files.append(full_path_image)
361
+ print(f"Failed to load MnMs images from {full_path_image}")
362
+ continue
363
+
364
+ size_processed = list(sitk_img_processed.GetSize())
365
+ print('size_processed',size_processed,original_size)
366
+
367
+ # meta.add_keyvalue('Image_id',meta_image_id)
368
+ meta.add_keyvalue('Spacing_mm',min(original_spacing[:3]))## keep the minimum spacing of the first three axes (x, y, z)
369
+ meta.add_keyvalue('OriImg_path',full_path_image)
370
+ meta.add_keyvalue('Size',size_processed) # use the processed size here -- YH Jachin
371
+ meta.add_keyvalue('Modality',modality)
372
+ meta.add_keyvalue('Dataset_name',study)
373
+ meta.add_keyvalue('ROI','chest')
374
+
375
+
376
+ if processed_lbl_full_path:
377
+ print(label_path_dict.keys())
378
+ meta.add_keyvalue('Task',[TASK_VALUE]) # type_check expects a list for 'Task'
379
+ # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys()))
380
+ meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
381
+ meta.add_keyvalue('Label_Dict',LABEL_DICT)
382
+ meta.add_extra_keyvalue('Metadata',CIA_other_info)
383
+
384
+
385
+
386
+
387
+ # Write the mapping to the JSON file on the fly
388
+ with open(json_output_path, 'r+') as json_file:
389
+ existing_mappings = json.load(json_file)
390
+ existing_mappings[output_path] = meta.get_meta_data()
391
+ json_file.seek(0)
392
+ print(existing_mappings)
393
+ json.dump(existing_mappings, json_file, indent=4)
394
+ json_file.truncate()
395
+ else:
396
+ continue
397
+
398
+
399
+
400
+ with open(failed_files_path, "w") as json_file:
401
+ json.dump(failed_files, json_file)
402
+
403
+ print(f"The list has been written to {failed_files_path}")
404
+ print(f"Saved NIfTI mappings to {json_output_path}")
405
+
406
+ if __name__ == "__main__":
407
+ parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
408
+ parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/MnM2/MnM2/dataset/")
409
+ parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/MnM2/")
410
+ args = parser.parse_args()
411
+ print(args.target_path, args.output_dir)
412
+ main(args.target_path, args.output_dir)
413
+
414
+
415
+
416
+
417
+
418
+
419
+
420
+
421
+
422
+
423
+
424
+
425
+
426
+
427
+
MnM2_clean/dataclean_MnM2_v2.py ADDED
@@ -0,0 +1,432 @@
1
+ #coding:utf-8
2
+ '''
3
+ written by ygq
4
+ created on 2025-08-26
5
+ MnM2 data-clean update
6
+
7
+ Processing logic for the MnM2 dataset (my own understanding; this script is written along these lines):
8
+ 1. LA and SA series must be stored and processed separately.
9
+ 2. ED/ES are the end-diastole/end-systole images, i.e. single frames of the corresponding CINE (LA or SA); no header information was found, so the exact frame index is unknown.
10
+ 3. This does not appear to be the original MnM2 dataset but a pre-processed version; again, no header information was found.
11
+ 4. Files with a _gt suffix are label files containing the values 0,1,2,3 [0: background, 1: left ventricle cavity (LV), 2: right ventricle cavity (RV), 3: left ventricular myocardium (Myo)] -- please help confirm.
12
+
13
+ a. The reprocessed LA-CINE and SA-CINE files must be saved separately.
14
+ b. The reprocessed LA-ED/LA-ES and SA-ED/SA-ES files (same spacing and size as the CINE) and their label files must also be handled separately.
15
+
16
+ ## LA-ED/ES are kept separate for now; the ED/ES frame could be identified by computing the mean intensity of each CINE time frame [experimentally feasible]. -- 20250825
17
+ Segmentation labels: NIfTI format, label values:
18
+
19
+ 0: background
20
+
21
+ 1: left ventricle cavity (LV)
22
+
23
+ 2: right ventricle cavity (RV)
24
+
25
+ 3: left ventricular myocardium (Myo)
26
+
27
+ The current version has no metadata file.
28
+
29
+ '''
30
+ import os
31
+ import glob
32
+ import pandas as pd
33
+ import SimpleITK as sitk
34
+ import argparse
35
+ import json
36
+ from tqdm import tqdm
37
+ from util import meta_data
38
+ import util
39
+ import numpy as np
40
+ # from bert_helper import *
41
+
42
+
43
+
44
+
45
+ TASK_VALUE="segmentation"
46
+ CLAMP_RANGE_CT = [-300,300]
47
+ CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
48
+ TARGET_VOXEL_SPACING=None
49
+
50
+ LABEL_DICT={
51
+ "0":"background",
52
+ "1":"LV",# left ventricle blood pool
53
+ "3":"MYO",# left ventricular myocardium
54
+ "2":"RV"# right ventricle blood pool
55
+ }
56
+
57
+ # def find_metadata_files(path):
58
+ # # for Cancer Image Archive (TCIA) dataset
59
+ # search_pattern = os.path.join(path, '**', 'metadata.csv')
60
+ # return glob.glob(search_pattern, recursive=True)
61
+
62
+ def find_metadata_files(path):
63
+ # for Cancer Image Archive (TCIA) dataset
64
+ search_pattern = os.path.join(path, '*.csv')
65
+ return glob.glob(search_pattern, recursive=True)
66
+ ##added by yanguoqing on 20250527
67
+ def find_image_dirs(path):
68
+ return os.listdir(path)
69
+
70
+ ##modify by yanguoqing on 20250527
71
+ def load_dicom_images(folder_path):
72
+ reader = sitk.ImageSeriesReader()
73
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
74
+ reader.SetFileNames(dicom_names)
75
+ image = reader.Execute()
76
+ return dicom_names,image
77
+
78
+ ##added by yanguoqing on 20250527
79
+ def load_dicom_tag(imgs):
80
+ reader = sitk.ImageFileReader()
81
+ # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
82
+ reader.SetFileName(imgs)
83
+ reader.ReadImageInformation() # read only the metadata; do not load pixel data
84
+ # metadata_keys = reader.GetMetaDataKeys()
85
+ tag=reader.Execute()
86
+ return tag
87
+
88
+ def load_nrrd(fp):
89
+ return sitk.ReadImage(fp)
90
+
91
+ def save_nifti(image, output_path, folder_path):
92
+ # Set metadata in the NIfTI file's header
93
+ output_dirpath = os.path.dirname(output_path)
94
+ if not os.path.exists(output_dirpath):
95
+ print(f"Creating directory {output_dirpath}")
96
+ os.makedirs(output_dirpath)
97
+ # Set metadata in the NIfTI file's header
98
+ image.SetMetaData("FolderPath", folder_path)
99
+ sitk.WriteImage(image, output_path)
100
+
101
+ ##modify by yanguoqing on 20250527
102
+ def convert_windows_to_linux_path(windows_path):
103
+ # Replace backslashes with forward slashes and remove the drive letter
104
+ # Some meta files have windows paths, but the data is stored on a linux server
105
+ linux_path = windows_path.replace('\\', '/')
106
+ if ':' in linux_path:
107
+ linux_path = linux_path.split(':', 1)[1]
108
+ return linux_path
109
+
110
+ def main(target_path, output_dir):
111
+ metadata_files = find_metadata_files(target_path)
112
+ pid_dirs=find_image_dirs(target_path)
113
+ # pid_dirs=["Training","Testing","Validation"]
114
+ failed_files = []
115
+ if not os.path.isdir(output_dir):
116
+ os.makedirs(output_dir)
117
+ json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
118
+ failed_files_path = os.path.join(output_dir, 'failed_files.json')
119
+ meta = meta_data()
120
+
121
+ # Initialize the JSON file
122
+ if not os.path.exists(json_output_path):
123
+ with open(json_output_path, 'w') as json_file:
124
+ json.dump({}, json_file)
125
+ meta_file=os.path.join(target_path,'211230_M&Ms_Dataset_information_diagnosis_opendataset.csv')
126
+ if os.path.isfile(meta_file):
127
+ mf_flag=True
128
+ df_meta=pd.read_csv(meta_file,sep=',')
129
+ else:
130
+ mf_flag=False
131
+
132
+ if pid_dirs:
133
+ for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
134
+ if not os.path.isdir(os.path.join(target_path,pid_dir)):
135
+ continue
136
+ meta_image_id=pid_dir
137
+
138
+ modality="MRI"
139
+ study='MnM2'##Dataset_name
140
+
141
+ full_dir=os.path.join(target_path,pid_dir)
142
+ dfs=find_image_dirs(full_dir)##list all nii.gz files
143
+
144
+
145
+ if len(dfs)>0:
146
+ for df in dfs:
147
+ ## loop over the files to find the SA/LA CINE and ES/ED images and their matching gt files
148
+ if "CINE" in df:
149
+ ## regular processing
150
+ label_flag=False
151
+ if "_LA_" in df:
152
+ la_flag=True
153
+ else:
154
+ la_flag=False
155
+
156
+ elif "ES.nii.gz" in df:
157
+ if "_LA_" in df:
158
+ la_flag=True
159
+ else:
160
+ la_flag=False
161
+ if os.path.isfile(os.path.join(full_dir,df.replace(".nii.gz","_gt.nii.gz"))):
162
+ label_flag=True
163
+ else:
164
+ label_flag=False
165
+ else:
166
+ continue
167
+ try:
168
+ ## process the data
169
+ full_path_image=os.path.join(full_dir,df)
170
+
171
+ sitk_img_original = util.load_nifti(full_path_image)
172
+ if sitk_img_original is None:
173
+ print(f" Failed to load image: {full_path_image}")
174
+ continue
175
+
176
+ original_spacing = list(sitk_img_original.GetSpacing())
177
+ original_size = list(sitk_img_original.GetSize())
178
+ sitk_img_processed = sitk_img_original
179
+ # is_4d_image = msd_dataset_info.get("tensorImageSize", "3D").upper() == "4D" or sitk_img_original.GetDimension() == 4
180
+ is_4d_image = sitk_img_original.GetDimension() == 4
181
+
182
+ frame_flag=False
183
+ # --- Resampling Logic (Revised for 4D) ---
184
+ if is_4d_image:
185
+
186
+ # Always process 4D images channel-wise for resampling
187
+ # logging.info(f" Processing 4D image channel-wise: {original_img_full_path}") # Keep log for errors only
188
+ channels = []
189
+ num_channels = original_size[3] if len(original_size) == 4 and sitk_img_original.GetDimension() == 4 else 1
190
+ channel_target_spacing = TARGET_VOXEL_SPACING if TARGET_VOXEL_SPACING else original_spacing[:3] # Use 3D spacing
191
+
192
+
193
+ for i in range(num_channels):
194
+ extractor = sitk.ExtractImageFilter()
195
+ current_3d_channel_size = original_size[:3]
196
+
197
+ if sitk_img_original.GetDimension() == 4:
198
+ extractor.SetSize([current_3d_channel_size[0], current_3d_channel_size[1], current_3d_channel_size[2], 0])
199
+ extractor.SetIndex([0,0,0,i])
200
+ channel_3d_img = extractor.Execute(sitk_img_original)
201
+ else:
202
+ channel_3d_img = sitk_img_original
203
+ if i > 0: break
204
+
205
+ channel_resampler = util.get_unisize_resampler(
206
+ channel_3d_img, 'linear',
207
+ spacing=channel_target_spacing, size=current_3d_channel_size
208
+ )
209
+ if channel_resampler:
210
+ channels.append(channel_resampler.Execute(channel_3d_img))
211
+ else:
212
+ channels.append(channel_3d_img)
213
+
214
+ if channels:
215
+ if len(channels) > 1: # Only join if there are multiple channels
216
+ sitk_img_processed = sitk.JoinSeriesImageFilter().Execute(channels)
217
+ ## added by yanguoqing on 2025-08-11
218
+ frame_flag=True
219
+ # imgDict={}
220
+ # for kf_idx in range(num_channels):
221
+ # imgDict[str(kf_idx)]='none'
222
+ # if str(meta_ed):imgDict[str(meta_ed)]='ed'
223
+ # if str(meta_es):imgDict[str(meta_es)]='es'
224
+ # meta.add_keyvalue('ImgDict',imgDict)
225
+ elif len(channels) == 1: # If only one channel resulted (e.g. original was 3D misidentified as 4D by tensorImageSize)
226
+ sitk_img_processed = channels[0]
227
+ elif TARGET_VOXEL_SPACING: # 3D image with target spacing
228
+ img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',
229
+ spacing=TARGET_VOXEL_SPACING, size=original_size)
230
+ if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original)
231
+ else: # 3D image, no TARGET_VOXEL_SPACING
232
+ img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',
233
+ spacing=original_spacing, size=original_size)
234
+ if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original)
235
+
236
+
237
+ CIA_other_info = {
238
+ 'metadata_file':''
239
+ # 'Series_Description':serise_desc
240
+ }
241
+ CIA_other_info['split'] = "train"
242
+ CIA_other_info['Image_id']=meta_image_id
243
+ if mf_flag:
244
+ CIA_other_info['metadata_file']=meta_file
245
+
246
+ is_processed_4d = sitk_img_processed.GetDimension() == 4
247
+ clamp_range_to_use=None
248
+ if clamp_range_to_use and is_processed_4d:
249
+ clamped_channels_final = []
250
+ num_channels_final = sitk_img_processed.GetSize()[3] if len(sitk_img_processed.GetSize()) == 4 else 1
251
+ for i in range(num_channels_final):
252
+ extractor = sitk.ExtractImageFilter()
253
+ proc_size_final = sitk_img_processed.GetSize()
254
+ extractor.SetSize([proc_size_final[0], proc_size_final[1], proc_size_final[2], 0])
255
+ extractor.SetIndex([0,0,0,i])
256
+ channel_3d_img_to_clamp = extractor.Execute(sitk_img_processed)
257
+ clamped_channels_final.append(util.clamp_image(channel_3d_img_to_clamp, clamp_range_to_use))
258
+ if clamped_channels_final:
259
+ if len(clamped_channels_final) > 1:
260
+ sitk_img_processed = sitk.JoinSeriesImageFilter().Execute(clamped_channels_final)
261
+ elif len(clamped_channels_final) == 1:
262
+ sitk_img_processed = clamped_channels_final[0]
263
+ elif clamp_range_to_use: # 3D image
264
+ sitk_img_processed = util.clamp_image(sitk_img_processed, clamp_range_to_use)
265
+
266
+
267
+ output_path = os.path.join(output_dir,pid_dir, f"{df}")
268
+ # output_path=convert_windows_to_linux_path(output_path)
269
+ save_nifti(sitk_img_processed, output_path, full_path_image)
270
+ print(f"Saved NIfTI file to {output_path}")
271
+
272
+
273
+
274
+ label_path_dict = {}
275
+
276
+ if label_flag:
277
+ processed_lbl_full_path = os.path.join(output_dir, pid_dir, TASK_VALUE, f"{df}")
278
+ full_path_label=os.path.join(full_dir,df.replace(".nii.gz","_gt.nii.gz"))
279
+
280
+ sitk_lbl_original = util.load_nifti(full_path_label)
281
+ if not sitk_lbl_original:
282
+ print(f" Failed to load label: {full_path_label}")
283
+ processed_lbl_full_path = None
284
+ continue
285
+ if sitk_lbl_original:
286
+ label_resampler = sitk.ResampleImageFilter()
287
+ reference_for_label = sitk_img_processed # Default to processed image
288
+ print(sitk_img_processed.GetDimension(),sitk_img_processed.GetSize())
289
+ if sitk_img_processed.GetDimension() == 4:
290
+ num_comp_proc = sitk_img_processed.GetSize()[3] if len(sitk_img_processed.GetSize()) == 4 else 1
291
+ if num_comp_proc > 0:
292
+ extractor = sitk.ExtractImageFilter()
293
+ proc_img_size_for_lbl_ref = sitk_img_processed.GetSize()
294
+ extractor.SetSize([proc_img_size_for_lbl_ref[0], proc_img_size_for_lbl_ref[1], proc_img_size_for_lbl_ref[2], 0])
295
+ extractor.SetIndex([0,0,0,0])
296
+ try:
297
+ reference_for_label = extractor.Execute(sitk_img_processed)
298
+ except Exception as ref_err:
299
+ print(f" Failed to extract 3D reference from 4D image: {output_path} for label alignment.")
300
+ # print(traceback.format_exc())
301
+ reference_for_label = None
302
+ else: # Fallback if extraction fails
303
+ print(f" Could not extract 3D reference for label from 4D image {output_path}. Label may not be correctly resampled.")
304
+ reference_for_label = None # This will cause an issue below if not handled
305
+
306
+ sitk_lbl_processed = None
307
+
308
+ if reference_for_label and reference_for_label.GetDimension() > 0:
309
+ label_resampler.SetInterpolator(sitk.sitkNearestNeighbor)
310
+ label_resampler.SetOutputPixelType(sitk_lbl_original.GetPixelID())
311
+
312
+ if sitk_lbl_original.GetDimension() == 4:
313
+ lbl_channels = []
314
+ lbl_size = list(sitk_lbl_original.GetSize())
315
+ for i in range(lbl_size[3]):
316
+ extractor = sitk.ExtractImageFilter()
317
+ extractor.SetSize([lbl_size[0], lbl_size[1], lbl_size[2], 0])
318
+ extractor.SetIndex([0, 0, 0, i])
319
+ single_channel = extractor.Execute(sitk_lbl_original)
320
+
321
+ label_resampler.SetReferenceImage(reference_for_label)
322
+ resampled_channel = label_resampler.Execute(single_channel)
323
+ lbl_channels.append(resampled_channel)
324
+
325
+ if len(lbl_channels) > 1:
326
+ sitk_lbl_processed = sitk.JoinSeriesImageFilter().Execute(lbl_channels)
327
+ elif len(lbl_channels) == 1:
328
+ sitk_lbl_processed = lbl_channels[0]
329
+ else:
330
+ label_resampler.SetReferenceImage(reference_for_label)
331
+ sitk_lbl_processed = label_resampler.Execute(sitk_lbl_original)
332
+ if processed_lbl_full_path:
333
+ if sitk_img_processed.GetSize()[:3] != sitk_lbl_processed.GetSize()[:3]:
334
+ print(f" Mismatch between image and label size (ignoring channels):")
335
+ print(f" Image size: {sitk_img_processed.GetSize()}")
336
+ print(f" Label size: {sitk_lbl_processed.GetSize()}")
337
+ util.save_nifti(sitk_lbl_processed, processed_lbl_full_path, full_path_label)
338
+ else:
339
+ print(f" Failed to set reference image for label resampling for {full_path_label}. Saving original label.")
340
+ img_resampler_obj = util.get_unisize_resampler(sitk_lbl_original, 'nearest',
341
+ spacing=original_spacing, size=original_size)
342
+ if img_resampler_obj:
343
+ sitk_lbl_processed = img_resampler_obj.Execute(sitk_lbl_original)
344
+ util.save_nifti(sitk_lbl_original, processed_lbl_full_path, full_path_label) # Save original
345
+ # processed_lbl_full_path should still point to this saved original label
346
+
347
+
348
+ else:
349
+ processed_lbl_full_path = None
350
+ else:
351
+ processed_lbl_full_path = None
352
+
353
+ if processed_lbl_full_path:
354
+ label_path_dict['heart'] = processed_lbl_full_path
355
+
356
+ print('compare image and label size',sitk_img_original.GetSize(),sitk_lbl_original.GetSize())
357
+ print('compare image and label size',sitk_img_processed.GetSize(),sitk_lbl_processed.GetSize())
358
+ try:
359
+ assert sitk_img_processed.GetSize() == sitk_lbl_processed.GetSize()
360
+
361
+ except Exception as e:
362
+ failed_files.append(full_path_label)
363
+ continue
364
+ except RuntimeError:
365
+ failed_files.append(full_path_image)
366
+ print(f"Failed to load MnMs images from {full_path_image}")
367
+ continue
368
+
369
+ size_processed = list(sitk_img_processed.GetSize())
370
+ print('size_processed',size_processed,original_size)
371
+
372
+ # meta.add_keyvalue('Image_id',meta_image_id)
373
+ meta.add_keyvalue('Spacing_mm',min(original_spacing[:3]))## keep the minimum spacing of the first three axes (x, y, z)
374
+ meta.add_keyvalue('OriImg_path',full_path_image)
375
+ meta.add_keyvalue('Size',size_processed) # use the processed size here -- YH Jachin
376
+ meta.add_keyvalue('Modality',modality)
377
+ meta.add_keyvalue('Dataset_name',study)
378
+ meta.add_keyvalue('ROI','chest')
379
+
380
+
381
+ if processed_lbl_full_path:
382
+ print(label_path_dict.keys())
383
+ meta.add_keyvalue('Task',[TASK_VALUE]) # type_check expects a list for 'Task'
384
+ # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys()))
385
+ meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
386
+ meta.add_keyvalue('Label_Dict',LABEL_DICT)
387
+ meta.add_extra_keyvalue('Metadata',CIA_other_info)
388
+
389
+
390
+
391
+
392
+ # Write the mapping to the JSON file on the fly
393
+ with open(json_output_path, 'r+') as json_file:
394
+ existing_mappings = json.load(json_file)
395
+ existing_mappings[output_path] = meta.get_meta_data()
396
+ json_file.seek(0)
397
+ print(existing_mappings)
398
+ json.dump(existing_mappings, json_file, indent=4)
399
+ json_file.truncate()
400
+ else:
401
+ continue
402
+
403
+
404
+
405
+ with open(failed_files_path, "w") as json_file:
406
+ json.dump(failed_files, json_file)
407
+
408
+ print(f"The list has been written to {failed_files_path}")
409
+ print(f"Saved NIfTI mappings to {json_output_path}")
410
+
411
+ if __name__ == "__main__":
412
+ parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
413
+ parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/MnM2/MnM2/dataset/")
414
+ parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/MnM2/V2")
415
+ args = parser.parse_args()
416
+ print(args.target_path, args.output_dir)
417
+ main(args.target_path, args.output_dir)
418
+
419
+
420
+
421
+
422
+
423
+
424
+
425
+
426
+
427
+
428
+
429
+
430
+
431
+
432
+
MnM2_clean/dataclean_MnM2_v3.py ADDED
@@ -0,0 +1,451 @@
1
+ #coding:utf-8
2
+ '''
3
+ written by ygq
4
+ created on 2025-08-26
5
+ MnM2 data clean update
6
+ 
7
+ Processing logic for the MnM2 dataset (my understanding; the script currently follows this approach):
8
+ 1. LA and SA series need to be stored and processed separately.
9
+ 2. ED/ES are, as I understand it, the end-diastolic / end-systolic states, each corresponding to one frame of the CINE (LA or SA); since no header information was found, the exact frame index is unknown.
10
+ 3. This dataset does not appear to be the original MnM2 release; it looks post-processed, and no corresponding header information was found.
11
+ 4. Files with a "gt" suffix are label files containing values 0,1,2,3 [0: background, 1: left ventricle cavity (LV), 2: right ventricle cavity (RV), 3: left ventricular myocardium (Myo)] -- needs confirmation.
12
+ 
13
+ a. The reprocessed LA-CINE and SA-CINE files need to be saved separately.
14
+ b. The reprocessed LA-ED, LA-ES, SA-ED and SA-ES files [spacing and size matching the CINE] also need to be handled separately, along with their label files.
15
+ 
16
+ ## For now LA-ED/ES are kept separate; computing the mean intensity of each frame of a cine can be used to determine which frames correspond to ED/ES [verified feasible in experiments]. --20250825
17
+ Segmentation labels: NIfTI format, label values:
18
+ 
19
+ 0: background
20
+ 
21
+ 1: left ventricle cavity (LV)
22
+ 
23
+ 2: right ventricle cavity (RV)
24
+ 
25
+ 3: left ventricular myocardium (Myo)
26
+ 
27
+ This version has no metadata file.
28
+
29
+ '''
30
+ import os
31
+ import glob
32
+ import pandas as pd
33
+ import SimpleITK as sitk
34
+ import argparse
35
+ import json
36
+ from tqdm import tqdm
37
+ from util import meta_data
38
+ import util
39
+ import numpy as np
40
+ # from bert_helper import *
41
+
42
+
43
+
44
+
45
+ TASK_VALUE="segmentation"
46
+ CLAMP_RANGE_CT = [-300,300]
47
+ CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
48
+ TARGET_VOXEL_SPACING=None
49
+
50
+ LABEL_DICT={
51
+ "0":"backgroud",
52
+ "1":"LV",#左心室 Blood Pools
53
+ "3":"MYO",#左心室心肌
54
+ "2":"RV"#右心室 Blood Pools
55
+ }
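+ 
+ # A quick verification sketch for the "needs confirmation" note in the docstring
+ # (not called by the pipeline): report which label values actually occur in a
+ # *_gt.nii.gz file so they can be compared against LABEL_DICT.
+ def check_label_values(gt_path):
+ arr = sitk.GetArrayFromImage(sitk.ReadImage(gt_path))
+ found = sorted(int(v) for v in np.unique(arr))
+ unexpected = [v for v in found if str(v) not in LABEL_DICT]
+ return found, unexpected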
56
+
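+ # A minimal sketch (not wired into main below) of the per-frame mean-intensity
+ # heuristic mentioned in the docstring for locating candidate ED/ES frames in a
+ # 4D cine; treating the brightest-mean frame as ED and the darkest as ES is an
+ # assumption that should be checked against known ED/ES annotations.
+ def estimate_ed_es_frames(cine_path):
+ arr = sitk.GetArrayFromImage(sitk.ReadImage(cine_path)) # 4D image -> array of shape (frame, z, y, x)
+ frame_means = arr.reshape(arr.shape[0], -1).mean(axis=1)
+ return int(np.argmax(frame_means)), int(np.argmin(frame_means)) # (ED candidate, ES candidate)
+ 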
57
+ # def find_metadata_files(path):
58
+ # # for Cancer Image Archive (TCIA) dataset
59
+ # search_pattern = os.path.join(path, '**', 'metadata.csv')
60
+ # return glob.glob(search_pattern, recursive=True)
61
+
62
+ def find_metadata_files(path):
63
+ # for Cancer Image Archive (TCIA) dataset
64
+ search_pattern = os.path.join(path, '*.csv')
65
+ return glob.glob(search_pattern, recursive=True)
66
+ ##added by yanguoqing on 20250527
67
+ def find_image_dirs(path):
68
+ return os.listdir(path)
69
+
70
+ ##modify by yanguoqing on 20250527
71
+ def load_dicom_images(folder_path):
72
+ reader = sitk.ImageSeriesReader()
73
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
74
+ reader.SetFileNames(dicom_names)
75
+ image = reader.Execute()
76
+ return dicom_names,image
77
+
78
+ ##added by yanguoqing on 20250527
79
+ def load_dicom_tag(imgs):
80
+ reader = sitk.ImageFileReader()
81
+ # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
82
+ reader.SetFileName(imgs)
83
+ reader.ReadImageInformation() # read only the metadata, without loading pixel data
84
+ # metadata_keys = reader.GetMetaDataKeys()
85
+ tag=reader.Execute()
86
+ return tag
87
+
88
+ def load_nrrd(fp):
89
+ return sitk.ReadImage(fp)
90
+
91
+ def save_nifti(image, output_path, folder_path):
92
+ # Set metadata in the NIfTI file's header
93
+ output_dirpath = os.path.dirname(output_path)
94
+ if not os.path.exists(output_dirpath):
95
+ print(f"Creating directory {output_dirpath}")
96
+ os.makedirs(output_dirpath)
97
+ # Set metadata in the NIfTI file's header
98
+ image.SetMetaData("FolderPath", folder_path)
99
+ sitk.WriteImage(image, output_path)
100
+
101
+ ##modify by yanguoqing on 20250527
102
+ def convert_windows_to_linux_path(windows_path):
103
+ # Replace backslashes with forward slashes and remove the drive letter
104
+ # Some meta files have windows paths, but the data is stored on a linux server
105
+ linux_path = windows_path.replace('\\', '/')
106
+ if ':' in linux_path:
107
+ linux_path = linux_path.split(':', 1)[1]
108
+ return linux_path
109
+
110
+ def main(target_path, output_dir):
111
+ metadata_files = find_metadata_files(target_path)
112
+ pid_dirs=find_image_dirs(target_path)
113
+ # pid_dirs=["Training","Testing","Validation"]
114
+ failed_files = []
115
+ if not os.path.isdir(output_dir):
116
+ os.makedirs(output_dir)
117
+ json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
118
+ failed_files_path = os.path.join(output_dir, 'failed_files.json')
119
+ meta = meta_data()
120
+
121
+ # Initialize the JSON file
122
+ if not os.path.exists(json_output_path):
123
+ with open(json_output_path, 'w') as json_file:
124
+ json.dump({}, json_file)
125
+ meta_file=os.path.join(target_path,'211230_M&Ms_Dataset_information_diagnosis_opendataset.csv')
126
+ if os.path.isfile(meta_file):
127
+ mf_flag=True
128
+ df_meta=pd.read_csv(meta_file,sep=',')
129
+ else:
130
+ mf_flag=False
131
+
132
+ if pid_dirs:
133
+ for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
134
+ if not os.path.isdir(os.path.join(target_path,pid_dir)):
135
+ continue
136
+ meta_image_id=pid_dir
137
+
138
+ modality="MRI"
139
+ study='MnM2'##Dataset_name
140
+
141
+ full_dir=os.path.join(target_path,pid_dir)
142
+ dfs=find_image_dirs(full_dir)##list all nii.gz files
143
+
144
+ print(">>>>",meta_image_id)
145
+ if len(dfs)>0:
146
+ for df in dfs:
147
+ ## loop over the files to find the SA/LA CINE, the ES/ED frames, and the corresponding gt files
148
+ if "CINE" in df:
149
+ ## normal processing
150
+ label_flag=False
151
+ if "_LA_" in df:
152
+ la_flag=True
153
+ else:
154
+ la_flag=False
155
+ continue
156
+ elif "ES.nii.gz" in df:
157
+ if "_LA_" in df:
158
+ la_flag=True
159
+ else:
160
+ la_flag=False
161
+ if os.path.isfile(os.path.join(full_dir,df.replace(".nii.gz","_gt.nii.gz"))):
162
+ label_flag=True
163
+ else:
164
+ label_flag=False
165
+ elif "ED.nii.gz" in df :##ED.nii.gz
166
+ if "_LA_" in df:
167
+ la_flag=True
168
+ else:
169
+ la_flag=False
170
+ if os.path.isfile(os.path.join(full_dir,df.replace(".nii.gz","_gt.nii.gz"))):
171
+ label_flag=True
172
+ else:
173
+ label_flag=False
174
+ else:
175
+ continue
176
+ try:
177
+ ## process the data
178
+ full_path_image=os.path.join(full_dir,df)
179
+ print("orig_file:",full_path_image)
180
+ sitk_img_original = util.load_nifti(full_path_image)
181
+ if sitk_img_original is None:
182
+ print(f" Failed to load image: {full_path_image}")
183
+ continue
184
+
185
+ original_spacing = list(sitk_img_original.GetSpacing())
186
+ original_size = list(sitk_img_original.GetSize())
187
+ sitk_img_processed = sitk_img_original
188
+ # is_4d_image = msd_dataset_info.get("tensorImageSize", "3D").upper() == "4D" or sitk_img_original.GetDimension() == 4
189
+ is_4d_image = sitk_img_original.GetDimension() == 4
190
+
191
+ frame_flag=False
192
+ # --- Resampling Logic (Revised for 4D) ---
193
+ if is_4d_image:
194
+
195
+ # Always process 4D images channel-wise for resampling
196
+ # logging.info(f" Processing 4D image channel-wise: {original_img_full_path}") # Keep log for errors only
197
+ channels = []
198
+ num_channels = original_size[3] if len(original_size) == 4 and sitk_img_original.GetDimension() == 4 else 1
199
+ channel_target_spacing = TARGET_VOXEL_SPACING if TARGET_VOXEL_SPACING else original_spacing[:3] # Use 3D spacing
200
+
201
+
202
+ for i in range(num_channels):
203
+ extractor = sitk.ExtractImageFilter()
204
+ current_3d_channel_size = original_size[:3]
205
+
206
+ if sitk_img_original.GetDimension() == 4:
207
+ extractor.SetSize([current_3d_channel_size[0], current_3d_channel_size[1], current_3d_channel_size[2], 0])
208
+ extractor.SetIndex([0,0,0,i])
209
+ channel_3d_img = extractor.Execute(sitk_img_original)
210
+ else:
211
+ channel_3d_img = sitk_img_original
212
+ if i > 0: break
213
+
214
+ channel_resampler = util.get_unisize_resampler(
215
+ channel_3d_img, 'linear',
216
+ spacing=channel_target_spacing, size=current_3d_channel_size
217
+ )
218
+ if channel_resampler:
219
+ channels.append(channel_resampler.Execute(channel_3d_img))
220
+ else:
221
+ channels.append(channel_3d_img)
222
+
223
+ if channels:
224
+ if len(channels) > 1: # Only join if there are multiple channels
225
+ sitk_img_processed = sitk.JoinSeriesImageFilter().Execute(channels)
226
+ ## added by yanguoqing on 2025-08-11
227
+ frame_flag=True
228
+ # imgDict={}
229
+ # for kf_idx in range(num_channels):
230
+ # imgDict[str(kf_idx)]='none'
231
+ # if str(meta_ed):imgDict[str(meta_ed)]='ed'
232
+ # if str(meta_es):imgDict[str(meta_es)]='es'
233
+ # meta.add_keyvalue('ImgDict',imgDict)
234
+ elif len(channels) == 1: # If only one channel resulted (e.g. original was 3D misidentified as 4D by tensorImageSize)
235
+ sitk_img_processed = channels[0]
236
+ elif TARGET_VOXEL_SPACING: # 3D image with target spacing
237
+ img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',
238
+ spacing=TARGET_VOXEL_SPACING, size=original_size)
239
+ if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original)
240
+ else: # 3D image, no TARGET_VOXEL_SPACING
241
+ img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',
242
+ spacing=original_spacing, size=original_size)
243
+ if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original)
244
+
245
+
246
+ CIA_other_info = {
247
+ 'metadata_file':''
248
+ # 'Series_Description':serise_desc
249
+ }
250
+ CIA_other_info['split'] = "train"
251
+ CIA_other_info['Image_id']=meta_image_id
252
+ if mf_flag:
253
+ CIA_other_info['metadata_file']=meta_file
254
+
255
+ is_processed_4d = sitk_img_processed.GetDimension() == 4
256
+ clamp_range_to_use=None
257
+ if clamp_range_to_use and is_processed_4d:
258
+ clamped_channels_final = []
259
+ num_channels_final = sitk_img_processed.GetSize()[3] if len(sitk_img_processed.GetSize()) == 4 else 1
260
+ for i in range(num_channels_final):
261
+ extractor = sitk.ExtractImageFilter()
262
+ proc_size_final = sitk_img_processed.GetSize()
263
+ extractor.SetSize([proc_size_final[0], proc_size_final[1], proc_size_final[2], 0])
264
+ extractor.SetIndex([0,0,0,i])
265
+ channel_3d_img_to_clamp = extractor.Execute(sitk_img_processed)
266
+ clamped_channels_final.append(util.clamp_image(channel_3d_img_to_clamp, clamp_range_to_use))
267
+ if clamped_channels_final:
268
+ if len(clamped_channels_final) > 1:
269
+ sitk_img_processed = sitk.JoinSeriesImageFilter().Execute(clamped_channels_final)
270
+ elif len(clamped_channels_final) == 1:
271
+ sitk_img_processed = clamped_channels_final[0]
272
+ elif clamp_range_to_use: # 3D image
273
+ sitk_img_processed = util.clamp_image(sitk_img_processed, clamp_range_to_use)
274
+
275
+
276
+ output_path = os.path.join(output_dir,pid_dir, f"{df}")
277
+ # output_path=convert_windows_to_linux_path(output_path)
278
+ save_nifti(sitk_img_processed, output_path, full_path_image)
279
+ print(f"Saved NIfTI file to {output_path}")
280
+
281
+
282
+
283
+ label_path_dict = {}
284
+
285
+ if label_flag:
286
+ processed_lbl_full_path = os.path.join(output_dir, pid_dir, TASK_VALUE, f"{df}")
287
+ full_path_label=os.path.join(full_dir,df.replace(".nii.gz","_gt.nii.gz"))
288
+ print("label_file",full_path_label)
289
+ sitk_lbl_original = util.load_nifti(full_path_label)
290
+ if not sitk_lbl_original:
291
+ print(f" Failed to load label: {full_path_label}")
292
+ processed_lbl_full_path = None
293
+ continue
294
+ if sitk_lbl_original:
295
+ label_resampler = sitk.ResampleImageFilter()
296
+ reference_for_label = sitk_img_processed # Default to processed image
297
+ print(sitk_img_processed.GetDimension(),sitk_img_processed.GetSize())
298
+ if sitk_img_processed.GetDimension() == 4:
299
+ print("frame label match")
300
+ num_comp_proc = sitk_img_processed.GetSize()[3] if len(sitk_img_processed.GetSize()) == 4 else 1
301
+ if num_comp_proc > 0:
302
+ extractor = sitk.ExtractImageFilter()
303
+ proc_img_size_for_lbl_ref = sitk_img_processed.GetSize()
304
+ extractor.SetSize([proc_img_size_for_lbl_ref[0], proc_img_size_for_lbl_ref[1], proc_img_size_for_lbl_ref[2], 0])
305
+ extractor.SetIndex([0,0,0,0])
306
+ try:
307
+ reference_for_label = extractor.Execute(sitk_img_processed)
308
+ except Exception as ref_err:
309
+ print(f" Failed to extract 3D reference from 4D image: {output_path} for label alignment.")
310
+ # print(traceback.format_exc())
311
+ reference_for_label = None
312
+ else: # Fallback if extraction fails
313
+ print(f" Could not extract 3D reference for label from 4D image {output_path}. Label may not be correctly resampled.")
314
+ reference_for_label = None # This will cause an issue below if not handled
315
+
316
+ sitk_lbl_processed = None
317
+
318
+ if reference_for_label and reference_for_label.GetDimension() > 0:
319
+ label_resampler.SetInterpolator(sitk.sitkNearestNeighbor)
320
+ label_resampler.SetOutputPixelType(sitk_lbl_original.GetPixelID())
321
+
322
+ if sitk_lbl_original.GetDimension() == 4:
323
+ lbl_channels = []
324
+ lbl_size = list(sitk_lbl_original.GetSize())
325
+ for i in range(lbl_size[3]):
326
+ extractor = sitk.ExtractImageFilter()
327
+ extractor.SetSize([lbl_size[0], lbl_size[1], lbl_size[2], 0])
328
+ extractor.SetIndex([0, 0, 0, i])
329
+ single_channel = extractor.Execute(sitk_lbl_original)
330
+
331
+ label_resampler.SetReferenceImage(reference_for_label)
332
+ resampled_channel = label_resampler.Execute(single_channel)
333
+ lbl_channels.append(resampled_channel)
334
+
335
+ if len(lbl_channels) > 1:
336
+ sitk_lbl_processed = sitk.JoinSeriesImageFilter().Execute(lbl_channels)
337
+ elif len(lbl_channels) == 1:
338
+ sitk_lbl_processed = lbl_channels[0]
339
+ else:
340
+ label_resampler.SetReferenceImage(reference_for_label)
341
+ sitk_lbl_processed = label_resampler.Execute(sitk_lbl_original)
342
+ # if processed_lbl_full_path:
343
+ # if sitk_img_processed.GetSize()[:3] != sitk_lbl_processed.GetSize()[:3]:
344
+ # print(f" Mismatch between image and label size (ignoring channels):")
345
+ # print(f" Image size: {sitk_img_processed.GetSize()}")
346
+ # print(f" Label size: {sitk_lbl_processed.GetSize()}")
347
+ util.save_nifti(sitk_lbl_processed, processed_lbl_full_path, full_path_label)
348
+ else:
349
+ # print(f" Failed to set reference image for label resampling for {full_path_label}. Saving original label.")
350
+ print("no frame label match")
351
+ # original_spacing = list(reference_for_label.GetSpacing())
352
+ # original_size = list(reference_for_label.GetSize())
353
+ print(original_spacing,original_size)
354
+ img_resampler_obj = util.get_unisize_resampler(sitk_lbl_original, 'nearest',
355
+ spacing=original_spacing, size=original_size)
356
+ if img_resampler_obj:
357
+ sitk_lbl_processed = img_resampler_obj.Execute(sitk_lbl_original)
358
+ util.save_nifti(sitk_lbl_processed, processed_lbl_full_path, full_path_label) # Save original
359
+ else:
360
+ print('failed to resample label')
361
+ # processed_lbl_full_path should still point to this saved original label
362
+
363
+
364
+ else:
365
+ processed_lbl_full_path = None
366
+ else:
367
+ processed_lbl_full_path = None
368
+
369
+ if processed_lbl_full_path:
370
+
371
+
372
+ print('compare original image and label size',sitk_img_original.GetSize(),sitk_lbl_original.GetSize())
373
+ print('compare processed image and label size',sitk_img_processed.GetSize(),sitk_lbl_processed.GetSize())
374
+ try:
375
+ assert sitk_img_processed.GetSize() == sitk_lbl_processed.GetSize()
376
+ label_path_dict['heart'] = processed_lbl_full_path
377
+ print("process label path:", processed_lbl_full_path)
378
+ except Exception as e:
379
+ failed_files.append(full_path_label)
380
+ continue
381
+
382
+
383
+ except RuntimeError:
384
+ failed_files.append(full_path_image)
385
+ print(f"Failed to load MnMs images from {full_path_image}")
386
+ continue
387
+
388
+ size_processed = list(sitk_img_processed.GetSize())
389
+ print('size_processed',size_processed,original_size)
390
+
391
+ # meta.add_keyvalue('Image_id',meta_image_id)
392
+ meta.add_keyvalue('Spacing_mm',min(original_spacing[:3])) ## keep the minimum spacing among the first three (x, y, z) axes
393
+ meta.add_keyvalue('OriImg_path',full_path_image)
394
+ meta.add_keyvalue('Size',size_processed) # use the post-processing size here -- YH Jachin
395
+ meta.add_keyvalue('Modality',modality)
396
+ meta.add_keyvalue('Dataset_name',study)
397
+ meta.add_keyvalue('ROI','chest')
398
+
399
+
400
+ if processed_lbl_full_path:
401
+ # print(label_path_dict.keys())
402
+ meta.add_keyvalue('Task',TASK_VALUE)
403
+ # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys()))
404
+ meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
405
+ meta.add_keyvalue('Label_Dict',LABEL_DICT)
406
+ meta.add_extra_keyvalue('Metadata',CIA_other_info)
407
+
408
+
409
+ print(meta.get_meta_data())
410
+
411
+ # Write the mapping to the JSON file on the fly
412
+ with open(json_output_path, 'r+') as json_file:
413
+ existing_mappings = json.load(json_file)
414
+ existing_mappings[output_path] = meta.get_meta_data()
415
+ json_file.seek(0)
416
+ # print(existing_mappings)
417
+ json.dump(existing_mappings, json_file, indent=4)
418
+ json_file.truncate()
419
+ else:
420
+ continue
421
+
422
+
423
+
424
+ with open(failed_files_path, "w") as json_file:
425
+ json.dump(failed_files, json_file)
426
+
427
+ print(f"The list has been written to {failed_files_path}")
428
+ print(f"Saved NIfTI mappings to {json_output_path}")
429
+
430
+ if __name__ == "__main__":
431
+ parser = argparse.ArgumentParser(description="Process MnM2 NIfTI files and save standardized NIfTI outputs.")
432
+ parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/MnM2/MnM2/dataset/")
433
+ parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/ygq/Data_Engineering/MnM2_clean/test")
434
+ args = parser.parse_args()
435
+ print(args.target_path, args.output_dir)
436
+ main(args.target_path, args.output_dir)
437
+
438
+
439
+
440
+
441
+
442
+
443
+
444
+
445
+
446
+
447
+
448
+
449
+
450
+
451
+
MnM2_clean/util.py ADDED
@@ -0,0 +1,406 @@
1
+ import os
2
+ import json
3
+ import SimpleITK as sitk
4
+ import glob
5
+ import pandas as pd
6
+
7
+ def load_dicom_images(folder_path):
8
+ reader = sitk.ImageSeriesReader()
9
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
10
+ reader.SetFileNames(dicom_names)
11
+ image = reader.Execute()
12
+ return image
13
+
14
+ def convert_windows_to_linux_path(windows_path):
15
+ # Replace backslashes with forward slashes and remove the drive letter
16
+ # Some meta files have windows paths, but the data is stored on a linux server
17
+ linux_path = windows_path.replace('\\', '/')
18
+ if ':' in linux_path:
19
+ linux_path = linux_path.split(':', 1)[1]
20
+ return linux_path
21
+
22
+ # =============================================================================
23
+ # ========================developed with TotalSegmentor========================
24
+ # =============================================================================
25
+
26
+ def read_table(file_path, split_str=';'):
27
+ try:
28
+ df = pd.read_excel(file_path, engine='openpyxl')
29
+ except:
30
+ df = pd.read_csv(file_path, sep=split_str)
31
+ return df
32
+
33
+ def load_nifti(image_path):
34
+ return sitk.ReadImage(image_path)
35
+
36
+ def save_nifti(image, output_path, folder_path):
37
+ output_dirpath = os.path.dirname(output_path)
38
+ if not os.path.exists(output_dirpath):
39
+ print(f"Creating directory {output_dirpath}")
40
+ os.makedirs(output_dirpath)
41
+ # Set metadata in the NIfTI file's header
42
+ image.SetMetaData("FolderPath", folder_path)
43
+ sitk.WriteImage(image, output_path)
44
+
45
+ def find_metadata_files(path, file_name='*meta*'):
46
+ # for TotalSegmentor dataset
47
+ search_pattern = os.path.join(path, '**', file_name)
48
+ return glob.glob(search_pattern, recursive=True)
49
+
50
+ def get_img_path_from_folder(folder_path, img_type='.nii.gz', include_str=None, exclude_str='segmentation', is_sorted=True):
51
+ img_path = []
52
+ for root, dirs, files in os.walk(folder_path):
53
+ for file in files:
54
+ if file.endswith(img_type) and (include_str is None or include_str in file) and (exclude_str is None or exclude_str not in file):
55
+ img_path.append(os.path.join(root, file))
56
+ if is_sorted:
57
+ img_path.sort()
58
+ return img_path
59
+
60
+ def get_unisize_resampler(ref_img, interpolator='linear', spacing=None, size=None):
61
+ '''
62
+ Resample the image to have isotropic spacing, following the steps:
63
+ 1. Find the minimum spacing
64
+ 2. Resample the image to have the minimum spacing
65
+ 3. Set the interpolator (linear for images, nearest for segmentation masks)
66
+ 4. Set the output spacing
67
+ 5. Return the resampler for resampling
68
+ For example, if the input image has spacing [0.1, 0.1, 0.3], the output image will have spacing [0.1, 0.1, 0.1]
69
+ '''
70
+ # Discuss why this function was rewritten!!!
71
+ if size is None:
72
+ size = ref_img.GetSize()
73
+ if spacing is None:
74
+ spacing = ref_img.GetSpacing()
75
+ min_spacing = min(spacing)
76
+ if all([spc == min_spacing for spc in spacing]):
77
+ return None
78
+ else:
79
+ # if 1:
80
+ if interpolator == 'nearest':
81
+ interpolator = sitk.sitkNearestNeighbor
82
+ elif interpolator == 'linear':
83
+ interpolator = sitk.sitkLinear
84
+ resampler = sitk.ResampleImageFilter()
85
+ # new_spacing = [max_spacing] * len(spacing)
86
+ # print(size)
87
+ new_size = [int(round(old_sz * old_spc / min_spacing)) for old_sz, old_spc in zip(size, spacing)]
88
+ new_size_xy=[new_size[0],new_size[1],new_size[2]]
89
+ # Discuss why this function was rewritten!!! --- YHM Jachin
90
+ new_size_spacing=[min_spacing,min_spacing,min_spacing]
91
+ # Discuss why this function was rewritten!!! --- YHM Jachin
92
+ # resampler.SetSize(new_size)
93
+ # resampler.SetOutputSpacing([min_spacing] * len(spacing))
94
+ resampler.SetSize(new_size_xy)
95
+ resampler.SetOutputSpacing(new_size_spacing)
96
+
97
+ # print(new_size,new_size_xy)
98
+ resampler.SetOutputOrigin(ref_img.GetOrigin())
99
+ resampler.SetOutputDirection(ref_img.GetDirection())
100
+ resampler.SetInterpolator(interpolator)
101
+ resampler.SetDefaultPixelValue(0) # intensity used for voxels mapped from outside the input image
102
+ resampler.SetOutputPixelType(ref_img.GetPixelID())
103
+ return resampler
104
+
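+ # Minimal usage sketch for the resampler above (path illustrative): the function
+ # returns None when the spacing is already isotropic, so callers must fall back
+ # to the original image in that case.
+ # img = load_nifti('/path/to/volume.nii.gz')
+ # resampler = get_unisize_resampler(img, 'linear')
+ # img_iso = resampler.Execute(img) if resampler else img
+ 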
105
+ def clamp_image(in_img,clamp_range):
106
+ '''
107
+ Clamp the image to the specified range
108
+ '''
109
+ clamp_filter = sitk.ClampImageFilter()
110
+ clamp_filter.SetLowerBound(clamp_range[0])
111
+ clamp_filter.SetUpperBound(clamp_range[1])
112
+ return clamp_filter.Execute(in_img)
113
+
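+ # Usage sketch: the CT cleaning scripts in this repo use a [-300, 300] HU window,
+ # i.e. clamped = clamp_image(img_iso, [-300, 300]); MRI intensities are left unclamped.
+ 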
114
+ def get_synonyms_dict(dict_type='ROI'):
115
+ '''
116
+ Get the dictionary of synonyms for the specified dictionary type
117
+ '''
118
+ if dict_type == 'ROI':
119
+ dict_synonyms = {
120
+ 'whole-body': ['whole-body', 'whole body', 'wholebody', 'whole body', 'whole-body', 'whole body', 'wholebody','polytrauma','head-neck-thorax-abdomen-pelvis-leg','head-neck-thorax-abdomen-pelvis'],
121
+ 'neck-thorax-abdomen-pelvis-leg': ['neck-thorax-abdomen-pelvis-leg','neck-thx-abd-pelvis-leg', 'angiography neck-thx-abd-pelvis-leg', 'neck thorax abdomen pelvis leg', 'neck and thorax and abdomen and pelvis and leg', 'neck, thorax, abdomen, pelvis & leg', 'neck/thorax/abdomen/pelvis/leg', 'neck, thorax, abdomen, pelvis and leg', 'neck thorax abdomen pelvis leg'],
122
+ 'neck-thorax-abdomen-pelvis': ['neck-thorax-abdomen-pelvis', 'neck-thx-abd-pelvis', 'neck thorax abdomen pelvis', 'neck and thorax and abdomen and pelvis', 'neck, thorax, abdomen & pelvis', 'neck/thorax/abdomen/pelvis', 'neck, thorax, abdomen and pelvis', 'neck thorax abdomen & pelvis'],
123
+ 'thorax-abdomen-pelvis-leg': ['thorax-abdomen-pelvis-leg','thx-abd-pelvis-leg', 'angiography thx-abd-pelvis-leg', 'thorax abdomen pelvis leg', 'thorax and abdomen and pelvis and leg', 'thorax, abdomen, pelvis & leg', 'thorax/abdomen/pelvis/leg', 'thorax, abdomen, pelvis and leg', 'thorax abdomen pelvis leg'],
124
+ 'neck-thorax-abdomen': ['neck-thorax-abdomen', 'neck-thorax-abdomen', 'neck thorax abdomen', 'neck and thorax and abdomen', 'neck, thorax, abdomen', 'neck/thorax/abdomen', 'neck, thorax, abdomen', 'neck thorax abdomen'],
125
+ 'head-neck-thorax-abdomen': ['head-neck-thorax-abdomen', 'head-neck-thorax-abdomen', 'head neck thorax abdomen', 'head and neck and thorax and abdomen', 'head, neck, thorax, abdomen', 'head/thorax/abdomen', 'head, thorax, abdomen', 'head thorax abdomen'],
126
+ 'head-neck-thorax': ['head-neck-thorax', 'head neck thorax', 'head and neck and thorax', 'head, neck, thorax', 'head/thorax', 'head, thorax', 'head thorax'],
127
+ 'thorax-abdomen-pelvis': ['thorax-abdomen-pelvis', 'thx-abd-pelvis', 'polytrauma', 'thorax abdomen pelvis', 'thorax and abdomen and pelvis', 'thorax, abdomen & pelvis', 'thorax/abdomen/pelvis', 'thorax, abdomen and pelvis', 'thorax abdomen & pelvis'],
128
+ 'abdomen-pelvis-leg': ['abdomen-pelvis-leg', 'angiography abdomen-pelvis-leg', 'abd-pelvis-leg', 'abdomen pelvis leg', 'abdomen and pelvis and leg', 'abdomen, pelvis & leg', 'abdomen/pelvis/leg', 'abdomen, pelvis, leg', 'abdomen pelvis leg'],
129
+ 'neck-thorax': ['neck-thorax', 'neck thorax', 'neck and thorax', 'neck, thorax', 'thorax-neck', 'thorax neck', 'thorax and neck', 'thorax, neck','thorax/neck'],
130
+ 'thorax-abdomen': ['thorax-abdomen', 'thorax abdomen', 'thorax and abdomen', 'thorax, abdomen'],
131
+ 'abdomen-pelvis': ['abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis', 'abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis'],
132
+ 'pelvis-leg': ['pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg', 'pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg'],
133
+ 'head-neck': ['head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck', 'head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck'],
134
+ 'abdomen': ['abdomen', 'abdominal', 'belly', 'stomach', 'tummy', 'gut', 'guts', 'viscera', 'bowels', 'intestines', 'gastrointestinal', 'digestive', 'peritoneum','gastric', 'liver', 'spleen', 'pancreas','kidney','lumbar','renal','hepatic','splenic','pancreatic','intervention'],
135
+ 'thorax': ['chest', 'thorax', 'breast', 'lung', 'heart','heart-thorakale aorta', 'heart-thorakale', 'mediastinum', 'pleura', 'bronchus', 'bronchi', 'trachea', 'esophagus', 'diaphragm', 'rib', 'sternum', 'clavicle', 'scapula', 'axilla', 'armpit','breast biopsy','thoracic','mammary','caeiothoracic','mediastinal','pleural','bronchial','bronchial tree','tracheal','esophageal','diaphragmatic','costal','sternal','clavicular','scapular','axillary','axillar','cardiac','pericardial','pericardiac','pericardium'],
136
+ 'head': ['head', 'headbasis', 'brain', 'skull', 'face','nose','ear','eye','mouth','jaw','cheek','chin','forehead','temporal','parietal','occipital','frontal','mandible','maxilla','mandibular','maxillary','nasal','orbital','orbita','ocular','auricular','otic','oral','buccal','labial','lingual','palatal'],
137
+ 'neck': ['neck', 'throat', 'cervical', 'thyroid', 'trachea', 'larynx', 'pharynx', 'esophagus','pharyngeal','laryngeal','cervical','thyroid','trachea','esophagus','carotid','jugular'],
138
+ 'hand': ['hand', 'finger', 'thumb', 'palm', 'wrist', 'knuckle', 'fingernail', 'phalanx', 'metacarpal', 'carpal', 'radius'],
139
+ 'arm': ['arm', 'forearm', 'upper arm', 'bicep', 'tricep', 'brachium', 'brachial', 'humerus', 'radius', 'ulna', 'elbow', 'shoulder', 'armpit', 'clavicle', 'scapula', 'acromion', 'acromioclavicular'],
140
+ 'leg': ['leg', 'felsenleg','thigh', 'calf', 'shin', 'knee', 'foot', 'ankle', 'toe', 'heel', 'sole', 'arch', 'instep', 'metatarsal', 'phalanx', 'tibia', 'fibula', 'femur', 'patella', 'kneecap','achilles tendon','achilles'],
141
+ 'pelvis': ['pelvis', 'hip', 'groin', 'buttock', 'gluteus', 'gluteal', 'ischium', 'pubis', 'sacrum', 'coccyx', 'acetabulum', 'iliac', 'iliac crest', 'iliac spine', 'iliac wing', 'sacroiliac', 'sacroiliac joint', 'sacroiliac ligament', 'sacroiliac spine', 'ureter', 'bladder', 'urethra', 'prostate', 'testicle', 'ovary', 'uterus',],
142
+ 'skeleton': ['skeleton','bone','spine', 'back', 'vertebra', 'sacrum', 'coccyx'],
143
+ }
144
+ elif dict_type == 'Label_tissue':
145
+ dict_synonyms = {
146
+ 'liver': ['liver','hepatic'],
147
+ 'spleen': ['spleen','splenic'],
148
+ 'kidney': ['kidney','renal'],
149
+ 'pancreas': ['pancreas','pancreatic'],
150
+ 'stomach': ['stomach','gastric'],
151
+ 'intestine': ['large intestine', 'small intestine','large bowel','small bowel'],
152
+ 'gallbladder': ['gallbladder'],
153
+ 'adrenal_gland': ['adrenal_gland','adrenal gland'],
154
+ 'bladder': ['bladder'],
155
+ 'prostate': ['prostate'],
156
+ 'uterus': ['uterus'],
157
+ 'ovary': ['ovary'],
158
+ 'testicle': ['testicle'],
159
+ 'lymph_node': ['lymph_node','lymph node'],
160
+ 'bone': ['bone'],
161
+ 'lung': ['lung'],
162
+ 'heart': ['heart'],
163
+ 'esophagus': ['esophagus'],
164
+ 'muscle': ['muscle'],
165
+ 'fat': ['fat'],
166
+ 'skin': ['skin'],
167
+ 'vessel': ['vessel'],
168
+ 'tumor': ['tumor'],
169
+ 'other': ['other']
170
+ }
171
+ elif dict_type == 'Task':
172
+ dict_synonyms = {
173
+ 'segmentation': ['segmentation', 'seg', 'mask'],
174
+ 'classification': ['classification', 'class', 'diagnosis','identify','identification'],
175
+ 'localization': ['localization', 'locate', 'location', 'position'],
176
+ 'registration': ['registration', 'register', 'align', 'alignment'],
177
+ 'detection': ['detection', 'detect', 'find', 'locate'],
178
+ 'quantification': ['quantification', 'quantify', 'measure', 'measurement'],
179
+ }
180
+ elif dict_type == 'Modality':
181
+ dict_synonyms = {
182
+ 'CT': ['CT', 'computed tomography'],
183
+ 'MRI': ['MRI', 'MR', 'magnetic resonance imaging'],
184
+ 'PET': ['PET', 'positron emission tomography'],
185
+ 'US': ['US', 'ultrasound'],
186
+ 'X-ray': ['X-ray', 'radiography'],
187
+ 'SPECT': ['SPECT', 'single-photon emission computed tomography'],
188
+ }
189
+ else:
190
+ raise ValueError(f"dict_type {dict_type} is not valid")
191
+ return dict_synonyms
192
+
193
+ def replace_synonyms(text, dict_synonyms):
194
+ '''
195
+ Replace the synonyms in the text with the standard term
196
+ '''
197
+ if isinstance(text,str):
198
+ for key, value in dict_synonyms.items():
199
+ for v in value:
200
+ if v.lower() in text.lower():
201
+ return key
202
+ Warning(f"Value {text} is not in the correct format")
203
+ elif isinstance(text,list):
204
+ text = [replace_synonyms(t, dict_synonyms) for t in text]
205
+ elif isinstance(text,dict):
206
+ for key in text.keys():
207
+ # replace values in dict
208
+ text[key] = replace_synonyms(text[key], dict_synonyms)
209
+ # replace keys in dict
210
+ for k in dict_synonyms.keys():
211
+ text[dict_synonyms[k]] = text.pop(key)
212
+ return text
213
+
214
+ # =============================================================================
215
+
216
+ class meta_data(object):
217
+ '''
218
+ This class is used to store the metadata of the dataset
219
+ '''
220
+ def __init__(self):
221
+ self.config_format_path = os.path.join(os.path.dirname(__file__),'config_format.json')
222
+ with open(self.config_format_path, 'r') as file:
223
+ self.config_format = json.load(file)
224
+ self.config = {}
225
+ for key in self.config_format.keys():
226
+ if self.config_format[key]['required'] == True:
227
+ self.config[key] = {}
228
+ self.keytypes = self.find_all_keys_with_type()
229
+ self.keytypes_flatten = self.flatten_json()
230
+ self.ambiguity_keys = ['ROI', 'Label_tissue', 'Task', 'Modality']
231
+ for key in self.ambiguity_keys:
232
+ ambiguity_dict = get_synonyms_dict(key)
233
+ self.config_format[key]['options'] = list(ambiguity_dict.keys())
234
+
235
+ def get_ketytypes(self):
236
+ return self.keytypes
237
+
238
+ def get_keytypes_flatten(self):
239
+ return self.keytypes_flatten
240
+
241
+ def find_all_keys_with_type(self, data=None, parent_key=''):
242
+ if data is None:
243
+ data = self.config_format
244
+ keys_with_type = {}
245
+ if isinstance(data, dict):
246
+ for key, value in data.items():
247
+ full_key = f"{parent_key}.{key}" if parent_key else key
248
+ if isinstance(value, dict) and 'type' in value:
249
+ keys_with_type[full_key] = value['type']
250
+ keys_with_type.update(self.find_all_keys_with_type(value, full_key))
251
+ elif isinstance(data, list):
252
+ for index, item in enumerate(data):
253
+ full_key = f"{parent_key}[{index}]"
254
+ keys_with_type.update(self.find_all_keys_with_type(item, full_key))
255
+ return keys_with_type
256
+
257
+ def flatten_json(self, data=None, parent_key='', sep='.'):
258
+ if data is None:
259
+ data = self.config_format
260
+ items = {}
261
+ if isinstance(data, dict):
262
+ for key, value in data.items():
263
+ new_key = f"{parent_key}{sep}{key}" if parent_key else key
264
+ if isinstance(value, dict):
265
+ items.update(self.flatten_json(value, new_key, sep=sep))
266
+ elif isinstance(value, list):
267
+ for i, item in enumerate(value):
268
+ items.update(self.flatten_json(item, f"{new_key}[{i}]", sep=sep))
269
+ else:
270
+ items[new_key] = value
271
+ elif isinstance(data, list):
272
+ for i, item in enumerate(data):
273
+ items.update(self.flatten_json(item, f"{parent_key}[{i}]", sep=sep))
274
+ return items
275
+
276
+ def req_check(self):
277
+ self.unfilled_keys = []
278
+ for key in self.config.keys():
279
+ if self.config[key] == {}:
280
+ self.unfilled_keys.append(key)
281
+ if len(self.unfilled_keys) == 0:
282
+ return True
283
+ else:
284
+ return False
285
+
286
+ def type_check(self, key, value):
287
+ if key not in self.config_format.keys():
288
+ print(key, "is not a valid key")
289
+ return False
290
+
291
+ if key == 'Modality':
292
+ if value not in self.config_format[key]['options']:
293
+ return False
294
+ else:
295
+ return True
296
+
297
+ elif key == 'OriImg_path':
298
+ if isinstance(value, str):
299
+ return True
300
+ else:
301
+ return False
302
+
303
+ elif key == 'Label_path' and isinstance(value, dict):
304
+ for skey in value.keys():
305
+ if skey in self.config_format[key]['keys']:
306
+ for kk in value[skey]:
307
+ if isinstance(value[skey][kk],str):
308
+ pass
309
+ # if kk in self.config_format[key]['value']['keys']:
310
+ # if isinstance(value[skey][kk],str):
311
+ # pass
312
+ # else:
313
+ # return False
314
+ else:
315
+ return False
316
+ return True
317
+
318
+ elif key == 'ROI':
319
+ if value not in self.config_format[key]['options']:
320
+ return False
321
+ else:
322
+ return True
323
+
324
+ elif key == 'Label_tissue' and isinstance(value, list):
325
+ for i in value:
326
+ if i not in self.config_format[key]['items']['options']:
327
+ return False
328
+ return True
329
+
330
+ elif key =='Task' and isinstance(value, (list, str)):
331
+ for i in ([value] if isinstance(value, str) else value): # the cleaning scripts pass a single task string
332
+ if i not in self.config_format[key]['items']['options']:
333
+ return False
334
+ return True
335
+
336
+ elif key == 'Spacing_mm':
337
+ if isinstance(value, float):
338
+ return True
339
+ else:
340
+ False
341
+
342
+ # elif key == 'Size' and isinstance(value, list) and len(value) == 3 :
343
+ elif key == 'Size' and isinstance(value, list) and len(value) >= 3 :
344
+ return all(isinstance(item, int) for item in value)
345
+
346
+ elif key == 'Dataset_name':
347
+ if isinstance(value, str):
348
+ return True
349
+ else:
350
+ return False
351
+ elif key == 'ImgDict':
352
+ if isinstance(value, dict):
353
+ return True
354
+ else:
355
+ return False
356
+ elif key == 'Label_Dict':
357
+ if isinstance(value, dict):
358
+ return True
359
+ else:
360
+ return False
361
+ def add_extra_keyvalue(self, key, value):
362
+ self.config[key] = value
363
+ return True
364
+
365
+ def add_keyvalue(self, key, value):
366
+ if key in self.ambiguity_keys:
367
+ value = replace_synonyms(value, get_synonyms_dict(key))
368
+ # print(key, value)
369
+ if self.type_check(key, value):
370
+ self.config[key] = value
371
+ return True
372
+ else:
373
+ Warning(f"Value {value} is not in the correct format for key {key}")
374
+ pass
375
+ # print(f"Value {value} is not in the correct format for key {key}")
376
+
377
+ def get_meta_data(self):
378
+ if self.req_check():
379
+ return self.config
380
+ else:
381
+ print("Not all required keys are filled", self.unfilled_keys)
382
+ return False
383
+
384
+
385
+
386
+ if __name__ == '__main__':
387
+ meta = meta_data()
388
+ print(meta.get_keytypes_flatten())
389
+ print(meta.get_keytypes())
390
+ meta.add_keyvalue('Modality', 'CT')
391
+ meta.add_keyvalue('OriImg_path', 'C:/Users/jzheng/Desktop/CT')
392
+ meta.add_keyvalue('Label_path', {'ROI': {'1': 'C:/Users/jzheng/Desktop/CT/1'}, 'Tissue': {'1': 'C:/Users/jzheng/Desktop/CT/1'}})
393
+ meta.add_keyvalue('Spacing_mm', 1.5)
394
+ meta.add_keyvalue('Size', [512, 512, 100])
395
+ meta.add_keyvalue('Dataset_name', 'CT')
396
+ meta.add_keyvalue('Label_tissue', ['1', '2', '3'])
397
+ meta.add_keyvalue('Task', ['1', '2', '3'])
398
+ print(meta.get_meta_data())
399
+ meta.add_extra_keyvalue('extra', 'extra')
400
+ print(meta.get_meta_data())
401
+ print(meta.get_keytypes())
402
+ print(meta.get_keytypes_flatten())
403
+
404
+ org_data_folder_path = '/home/jachin/data/Github/data/data_gen_def/DATASETS/TotalSegmentorCT_MRI/TS_CT'
405
+ img_paths = get_img_path_from_folder(org_data_folder_path, img_type='.nii.gz', include_str='ct', exclude_str='segmentation')
406
+ print(img_paths)
MnMs_clean/config_format.json ADDED
@@ -0,0 +1,124 @@
1
+ {
2
+ "Modality": {
3
+ "type": "option",
4
+ "required": true,
5
+ "options": [
6
+ "CT",
7
+ "MRI",
8
+ "T1",
9
+ "T2",
10
+ "X-ray",
11
+ "Fluoroscopy",
12
+ "US",
13
+ "PET"
14
+ ]
15
+ },
16
+ "OriImg_path": {
17
+ "type": "string",
18
+ "required": true
19
+ },
20
+ "Label_path": {
21
+ "type": "dict",
22
+ "required": false,
23
+ "keys": [
24
+ "classification",
25
+ "segmentation",
26
+ "regression",
27
+ "detection",
28
+ "localization",
29
+ "registration",
30
+ "other"
31
+ ],
32
+ "value": {
33
+ "type": "dict",
34
+ "required": false,
35
+ "keys": [
36
+ "lung",
37
+ "liver",
38
+ "heart",
39
+ "brain",
40
+ "kidney"
41
+ ],
42
+ "value": {
43
+ "type": "string",
44
+ "required": false
45
+ }
46
+ }
47
+ },
48
+ "ROI": {
49
+ "type": "option",
50
+ "required": false,
51
+ "options": [
52
+ "chest-abdomen",
53
+ "abdomen-pelvis",
54
+ "head",
55
+ "neck",
56
+ "skeleton",
57
+ "chest",
58
+ "abdomen",
59
+ "shoulder",
60
+ "leg",
61
+ "arm",
62
+ "hand",
63
+ "foot",
64
+ "pelvis"
65
+ ]
66
+ },
67
+ "Label_tissue": {
68
+ "type": "list",
69
+ "required": false,
70
+ "items": {
71
+ "type": "option",
72
+ "required": true,
73
+ "options": [
74
+ "lung",
75
+ "liver",
76
+ "heart",
77
+ "brain",
78
+ "kidney",
79
+ "spleen",
80
+ "pancreas",
81
+ "stomach",
82
+ "intestine",
83
+ "muscle",
84
+ "bone"
85
+ ]
86
+ }
87
+ },
88
+ "Task": {
89
+ "type": "list",
90
+ "required": false,
91
+ "items": {
92
+ "type": "option",
93
+ "required": true,
94
+ "options": [
95
+ "classification",
96
+ "segmentation"
97
+ ]
98
+ }
99
+ },
100
+ "Spacing_mm": {
101
+ "type": "float",
102
+ "required": true
103
+ },
104
+ "Size": {
105
+ "type": "list",
106
+ "required": true,
107
+ "items": {
108
+ "type": "int",
109
+ "required": true
110
+ }
111
+ },
112
+ "Dataset_name": {
113
+ "type": "string",
114
+ "required": true
115
+ },
116
+ "ImgDict": {
117
+ "type": "dict",
118
+ "required": false
119
+ },
120
+ "Label_Dict": {
121
+ "type": "dict",
122
+ "required": false
123
+ }
124
+ }
MnMs_clean/dataclean_MnMs.py ADDED
@@ -0,0 +1,484 @@
1
+ #coding:utf-8
2
+ '''
3
+ written by ygq
4
+ created on 2025-07-24
5
+ MnMs data clean update
6
+ https://github.com/openmedlab/Awesome-Medical-Dataset/blob/main/resources/M&Ms.md
7
+ https://zhuanlan.zhihu.com/p/694831343
8
+
9
+ CMR data from 340 subjects across 6 international medical centres.
10
+ Covers 4 major MRI vendors (Siemens, Philips, GE, Canon).
11
+ The dataset file structure is shown below: three top-level directories (Training, Validation, Testing), with the training set further split into labeled and unlabeled subdirectories. Each labeled subdirectory contains a patient's imaging file and the corresponding annotation.
12
+ M&Ms
13
+ ├── Training
14
+ │ ├── Labeled
15
+ │ │ ├── A0S9V9
16
+ │ │ │ ├── A0S9V9_sa.nii.gz
17
+ │ │ │ └── A0S9V9_sa_gt.nii.gz
18
+ │ │ ├── A1D0Q7
19
+ │ │ ├── A1D9Z7
20
+ │ │ └── ...
21
+ │ └── Unlabeled
22
+ ├── Validation
23
+ ├── Testing
24
+ └── 211230_M&Ms_Dataset_information_diagnosis_opendataset.csv
25
+
26
+ Image size statistics were computed over the 150 labeled training cases; size has the format (x,y,z,frame).
27
+ Experienced clinicians segmented the cardiac MR (CMR) images following the ACDC annotation standard, delineating the left ventricle (LV) and right ventricle (RV) blood pools and the left ventricular myocardium (MYO), with labels 1 (LV), 2 (MYO) and 3 (RV), respectively.
28
+
29
+ '''
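+ 
+ # A minimal sketch of how labeled training cases pair image and ground truth
+ # (directory layout as in the tree above; the root path is illustrative):
+ # import glob, os
+ # for gt in glob.glob('/path/to/M&Ms/Training/Labeled/*/*_sa_gt.nii.gz'):
+ #     img = gt.replace('_sa_gt.nii.gz', '_sa.nii.gz')
+ #     assert os.path.isfile(img)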
30
+ import os
31
+ import glob
32
+ import pandas as pd
33
+ import SimpleITK as sitk
34
+ import argparse
35
+ import json
36
+ from tqdm import tqdm
37
+ from util import meta_data
38
+ import util
39
+ import numpy as np
40
+ # from bert_helper import *
41
+
42
+
43
+
44
+ meta_id_name='External code'
45
+ meta_vendor_name='VendorName'
46
+ meta_centre_name='Centre'
47
+ meta_pathology_name='Pathology'
48
+ meta_ed_name='ED'
49
+ meta_es_name='ES'
50
+ meta_age_name='Age'
51
+ meta_sex_name='Sex'
52
+ meta_height_name='Height'
53
+ meta_weight_name='Weight'
54
+
55
+ TASK_VALUE="segmentation"
56
+ CLAMP_RANGE_CT = [-300,300]
57
+ CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
58
+ TARGET_VOXEL_SPACING=None
59
+
60
+ LABEL_DICT={
61
+ "0":"backgroud",
62
+ "1":"LV",#左心室 Blood Pools
63
+ "2":"MYO",#左心室心肌
64
+ "3":"RV"#右心室 Blood Pools
65
+ }
66
+
67
+ # def find_metadata_files(path):
68
+ # # for Cancer Image Archive (TCIA) dataset
69
+ # search_pattern = os.path.join(path, '**', 'metadata.csv')
70
+ # return glob.glob(search_pattern, recursive=True)
71
+
72
+ def find_metadata_files(path):
73
+ # for Cancer Image Archive (TCIA) dataset
74
+ search_pattern = os.path.join(path, '*.csv')
75
+ return glob.glob(search_pattern, recursive=True)
76
+ ##added by yanguoqing on 20250527
77
+ def find_image_dirs(path):
78
+ return os.listdir(path)
79
+
80
+ ##modify by yanguoqing on 20250527
81
+ def load_dicom_images(folder_path):
82
+ reader = sitk.ImageSeriesReader()
83
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
84
+ reader.SetFileNames(dicom_names)
85
+ image = reader.Execute()
86
+ return dicom_names,image
87
+
88
+ ##added by yanguoqing on 20250527
89
+ def load_dicom_tag(imgs):
90
+ reader = sitk.ImageFileReader()
91
+ # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
92
+ reader.SetFileName(imgs)
93
+ reader.ReadImageInformation() # 仅读取元信息,不加载像素数据
94
+ # metadata_keys = reader.GetMetaDataKeys()
95
+ tag=reader.Execute()
96
+ return tag
97
+
98
+ def load_nrrd(fp):
99
+ return sitk.ReadImage(fp)
100
+
101
+ def save_nifti(image, output_path, folder_path):
102
+ # Set metadata in the NIfTI file's header
103
+ output_dirpath = os.path.dirname(output_path)
104
+ if not os.path.exists(output_dirpath):
105
+ print(f"Creating directory {output_dirpath}")
106
+ os.makedirs(output_dirpath)
107
+ # Set metadata in the NIfTI file's header
108
+ image.SetMetaData("FolderPath", folder_path)
109
+ sitk.WriteImage(image, output_path)
110
+
111
+ ##modify by yanguoqing on 20250527
112
+ def convert_windows_to_linux_path(windows_path):
113
+ # Replace backslashes with forward slashes and remove the drive letter
114
+ # Some meta files have windows paths, but the data is stored on a linux server
115
+ linux_path = windows_path.replace('\\', '/')
116
+ if ':' in linux_path:
117
+ linux_path = linux_path.split(':', 1)[1]
118
+ return linux_path
119
+
120
+ def main(target_path, output_dir):
121
+ metadata_files = find_metadata_files(target_path)
122
+ pid_dirs=find_image_dirs(target_path)
123
+ pid_dirs=["Training","Testing","Validation"]
124
+ failed_files = []
125
+ if not os.path.isdir(output_dir):
126
+ os.makedirs(output_dir)
127
+ json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
128
+ failed_files_path = os.path.join(output_dir, 'failed_files.json')
129
+ meta = meta_data()
130
+
131
+ # Initialize the JSON file
132
+ if not os.path.exists(json_output_path):
133
+ with open(json_output_path, 'w') as json_file:
134
+ json.dump({}, json_file)
135
+ meta_file=os.path.join(target_path,'211230_M&Ms_Dataset_information_diagnosis_opendataset.csv')
136
+ if os.path.isfile(meta_file):
137
+ mf_flag=True
138
+ df_meta=pd.read_csv(meta_file,sep=',')
139
+ else:
140
+ mf_flag=False
141
+
142
+ if pid_dirs:
143
+ for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
144
+ if not os.path.isdir(os.path.join(target_path,pid_dir)):
145
+ continue
146
+ if pid_dir =="Training":
147
+ tr_flag=True
148
+ else:
149
+ tr_flag=False
150
+ label_flag=False
151
+
152
+ if not tr_flag:
153
+ image_dirs=find_image_dirs(os.path.join(target_path,pid_dir))
154
+ unlabeled_list=image_dirs
155
+ else:
156
+ image_dir_1=find_image_dirs(os.path.join(target_path,pid_dir,'Labeled'))
157
+ image_dir_2=find_image_dirs(os.path.join(target_path,pid_dir,'Unlabeled'))
158
+ unlabeled_list=image_dir_2
159
+ image_dirs=image_dir_1+image_dir_2
160
+ for data_dir in tqdm(image_dirs, desc="Processing images files"):
161
+
162
+ location=data_dir
163
+ if not tr_flag:
164
+ full_path=os.path.join(target_path,pid_dir,data_dir)
165
+ else:
166
+ if data_dir in unlabeled_list:
167
+ full_path=os.path.join(target_path,pid_dir,"Unlabeled",data_dir)
168
+ else:
169
+ full_path=os.path.join(target_path,pid_dir,"Labeled",data_dir)
170
+ label_flag=True
171
+ data_info_row=df_meta[df_meta[meta_id_name]==data_dir]
172
+
173
+ if data_info_row.shape[0]>0:
174
+ data_info_row=data_info_row.reset_index()
175
+ #print(data_info_row[meta_id_name])
176
+ meta_image_id=data_info_row[meta_id_name][0]
177
+ meta_vendor=data_info_row[meta_vendor_name][0]
178
+ meta_centre=data_info_row[meta_centre_name][0]
179
+ meta_pathology=data_info_row[meta_pathology_name][0]
180
+ meta_age=data_info_row[meta_age_name][0]
181
+ meta_sex=data_info_row[meta_sex_name][0]
182
+ meta_height=data_info_row[meta_height_name][0]
183
+ meta_weigth=data_info_row[meta_weight_name][0]
184
+ meta_ed=data_info_row[meta_ed_name][0]
185
+ meta_es=data_info_row[meta_es_name][0]
186
+ else:
187
+ meta_image_id=data_dir
188
+ meta_vendor=''
189
+ meta_centre=''
190
+ meta_pathology=''
191
+ meta_age=''
192
+ meta_sex=''
193
+ meta_height=''
194
+ meta_weigth=''
195
+ meta_ed=''
196
+ meta_es=''
197
+ # full_path = convert_windows_to_linux_path(full_path)
198
+ if not os.path.isdir(full_path):
199
+ continue
200
+ try:
201
+ print(full_path)
202
+ full_path_image=os.path.join(full_path,"%s_sa.nii.gz"%data_dir)
203
+
204
+ if label_flag:
205
+ full_path_label=os.path.join(full_path,"%s_sa_gt.nii.gz"%data_dir)
206
+ if not os.path.isfile(full_path_label):
207
+ full_path_label=None
208
+ else:
209
+ full_path_label=None
210
+
211
+ sitk_img_original = util.load_nifti(full_path_image)
212
+ if sitk_img_original is None:
213
+ print(f" Failed to load image: {full_path_image}")
214
+ continue
215
+
216
+ modality="MRI"
217
+ study='MnMs'##Dataset_name
218
+ CIA_other_info = {
219
+ 'metadata_file':''
220
+ # 'Series_Description':serise_desc
221
+ }
222
+ CIA_other_info['split'] = pid_dir
223
+ if mf_flag:
224
+ CIA_other_info['metadata_file']=meta_file
225
+
226
+ original_spacing = list(sitk_img_original.GetSpacing())
227
+ original_size = list(sitk_img_original.GetSize())
228
+ sitk_img_processed = sitk_img_original
229
+ # is_4d_image = msd_dataset_info.get("tensorImageSize", "3D").upper() == "4D" or sitk_img_original.GetDimension() == 4
230
+ is_4d_image = sitk_img_original.GetDimension() == 4
231
+
232
+ frame_flag=False
233
+ # --- Resampling Logic (Revised for 4D) ---
234
+ if is_4d_image:
235
+
236
+
237
+ # Always process 4D images channel-wise for resampling
238
+ # logging.info(f" Processing 4D image channel-wise: {original_img_full_path}") # Keep log for errors only
239
+ channels = []
240
+ num_channels = original_size[3] if len(original_size) == 4 and sitk_img_original.GetDimension() == 4 else 1
241
+ channel_target_spacing = TARGET_VOXEL_SPACING if TARGET_VOXEL_SPACING else original_spacing[:3] # Use 3D spacing
242
+
243
+
244
+ for i in range(num_channels):
245
+ extractor = sitk.ExtractImageFilter()
246
+ current_3d_channel_size = original_size[:3]
247
+
248
+ if sitk_img_original.GetDimension() == 4:
249
+ extractor.SetSize([current_3d_channel_size[0], current_3d_channel_size[1], current_3d_channel_size[2], 0])
250
+ extractor.SetIndex([0,0,0,i])
251
+ channel_3d_img = extractor.Execute(sitk_img_original)
252
+ else:
253
+ channel_3d_img = sitk_img_original
254
+ if i > 0: break
255
+
256
+ channel_resampler = util.get_unisize_resampler(
257
+ channel_3d_img, 'linear',
258
+ spacing=channel_target_spacing, size=current_3d_channel_size
259
+ )
260
+ if channel_resampler:
261
+ channels.append(channel_resampler.Execute(channel_3d_img))
262
+ else:
263
+ channels.append(channel_3d_img)
264
+
265
+ if channels:
266
+ if len(channels) > 1: # Only join if there are multiple channels
267
+ sitk_img_processed = sitk.JoinSeriesImageFilter().Execute(channels)
268
+ ##aded by yanguoqing on 2025-08-11
269
+ frame_flag=True
270
+ imgDict={}
271
+ for kf_idx in range(num_channels):
272
+ imgDict[str(kf_idx)]='none'
273
+ if str(meta_ed):imgDict[str(meta_ed)]='ed'
274
+ if str(meta_es):imgDict[str(meta_es)]='es'
275
+ meta.add_keyvalue('ImgDict',imgDict)
276
+ elif len(channels) == 1: # If only one channel resulted (e.g. original was 3D misidentified as 4D by tensorImageSize)
277
+ sitk_img_processed = channels[0]
278
+ elif TARGET_VOXEL_SPACING: # 3D image with target spacing
279
+ img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',
280
+ spacing=TARGET_VOXEL_SPACING, size=original_size)
281
+ if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original)
282
+ else: # 3D image, no TARGET_VOXEL_SPACING
283
+ img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',
284
+ spacing=original_spacing, size=original_size)
285
+ if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original)
286
+
287
+
288
+
289
+ ##
290
+ CIA_other_info['Image_id']=meta_image_id
291
+ CIA_other_info['Vendor']=meta_vendor
292
+ CIA_other_info['Centre']=str(meta_centre)
293
+ CIA_other_info['Pathology']=str(meta_pathology)
294
+ CIA_other_info['Age']=str(meta_age)
295
+ CIA_other_info['Sex']=meta_sex
296
+ CIA_other_info['Height']=str(meta_height)
297
+ CIA_other_info['Weight']=str(meta_weight)
298
+ CIA_other_info['ED']=str(meta_ed)
299
+ CIA_other_info['ES']=str(meta_es)
300
+
301
+
302
+
303
+ # --- End Resampling Logic ---
304
+
305
+ is_processed_4d = sitk_img_processed.GetDimension() == 4
306
+ clamp_range_to_use=None
307
+ if clamp_range_to_use and is_processed_4d:
308
+ clamped_channels_final = []
309
+ num_channels_final = sitk_img_processed.GetSize()[3] if len(sitk_img_processed.GetSize()) == 4 else 1
310
+ for i in range(num_channels_final):
311
+ extractor = sitk.ExtractImageFilter()
312
+ proc_size_final = sitk_img_processed.GetSize()
313
+ extractor.SetSize([proc_size_final[0], proc_size_final[1], proc_size_final[2], 0])
314
+ extractor.SetIndex([0,0,0,i])
315
+ channel_3d_img_to_clamp = extractor.Execute(sitk_img_processed)
316
+ clamped_channels_final.append(util.clamp_image(channel_3d_img_to_clamp, clamp_range_to_use))
317
+ if clamped_channels_final:
318
+ if len(clamped_channels_final) > 1:
319
+ sitk_img_processed = sitk.JoinSeriesImageFilter().Execute(clamped_channels_final)
320
+ elif len(clamped_channels_final) == 1:
321
+ sitk_img_processed = clamped_channels_final[0]
322
+ elif clamp_range_to_use: # 3D image
323
+ sitk_img_processed = util.clamp_image(sitk_img_processed, clamp_range_to_use)
324
+
325
+
326
+ output_path = os.path.join(output_dir,data_dir, f"{data_dir}.nii.gz")
327
+ # output_path=convert_windows_to_linux_path(output_path)
328
+ save_nifti(sitk_img_processed, output_path, full_path_image)
329
+ print(f"Saved NIfTI file to {output_path}")
330
+
331
+ label_path_dict = {}
332
+
333
+ processed_lbl_full_path = os.path.join(output_dir, data_dir, TASK_VALUE, f"{data_dir}.nii.gz")
334
+ print(processed_lbl_full_path,full_path_label,tr_flag,label_flag)
335
+ if tr_flag and label_flag and full_path_label and os.path.exists(full_path_label):
336
+ sitk_lbl_original = util.load_nifti(full_path_label)
337
+ if not sitk_lbl_original:
338
+ print(f" Failed to load label: {full_path_label}")
339
+ processed_lbl_full_path = None
340
+ continue
341
+ if sitk_lbl_original:
342
+ label_resampler = sitk.ResampleImageFilter()
343
+ reference_for_label = sitk_img_processed # Default to processed image
344
+
345
+ if sitk_img_processed.GetDimension() == 4:
346
+ num_comp_proc = sitk_img_processed.GetSize()[3] if len(sitk_img_processed.GetSize()) == 4 else 1
347
+ if num_comp_proc > 0:
348
+ extractor = sitk.ExtractImageFilter()
349
+ proc_img_size_for_lbl_ref = sitk_img_processed.GetSize()
350
+ extractor.SetSize([proc_img_size_for_lbl_ref[0], proc_img_size_for_lbl_ref[1], proc_img_size_for_lbl_ref[2], 0])
351
+ extractor.SetIndex([0,0,0,0])
352
+ try:
353
+ reference_for_label = extractor.Execute(sitk_img_processed)
354
+ except Exception as ref_err:
355
+ print(f" Failed to extract 3D reference from 4D image: {output_path} for label alignment.")
356
+ # print(traceback.format_exc())
357
+ reference_for_label = None
358
+ else: # Fallback if extraction fails
359
+ print(f" Could not extract 3D reference for label from 4D image {output_path}. Label may not be correctly resampled.")
360
+ reference_for_label = None # This will cause an issue below if not handled
361
+
362
+ sitk_lbl_processed = None
363
+
364
+ if reference_for_label and reference_for_label.GetDimension() > 0:
365
+ label_resampler.SetInterpolator(sitk.sitkNearestNeighbor)
366
+ label_resampler.SetOutputPixelType(sitk_lbl_original.GetPixelID())
367
+
368
+ if sitk_lbl_original.GetDimension() == 4:
369
+ lbl_channels = []
370
+ lbl_size = list(sitk_lbl_original.GetSize())
371
+ for i in range(lbl_size[3]):
372
+ extractor = sitk.ExtractImageFilter()
373
+ extractor.SetSize([lbl_size[0], lbl_size[1], lbl_size[2], 0])
374
+ extractor.SetIndex([0, 0, 0, i])
375
+ single_channel = extractor.Execute(sitk_lbl_original)
376
+
377
+ label_resampler.SetReferenceImage(reference_for_label)
378
+ resampled_channel = label_resampler.Execute(single_channel)
379
+ lbl_channels.append(resampled_channel)
380
+
381
+ if len(lbl_channels) > 1:
382
+ sitk_lbl_processed = sitk.JoinSeriesImageFilter().Execute(lbl_channels)
383
+ elif len(lbl_channels) == 1:
384
+ sitk_lbl_processed = lbl_channels[0]
385
+ else:
386
+ label_resampler.SetReferenceImage(reference_for_label)
387
+ sitk_lbl_processed = label_resampler.Execute(sitk_lbl_original)
388
+ if processed_lbl_full_path:
389
+ if sitk_img_processed.GetSize()[:3] != sitk_lbl_processed.GetSize()[:3]:
390
+ print(f" Mismatch between image and label size (ignoring channels):")
391
+ print(f" Image size: {sitk_img_processed.GetSize()}")
392
+ print(f" Label size: {sitk_lbl_processed.GetSize()}")
393
+ util.save_nifti(sitk_lbl_processed, processed_lbl_full_path, full_path_label)
394
+ else:
395
+ print(f" Failed to set reference image for label resampling for {full_path_label}. Saving original label.")
396
+ util.save_nifti(sitk_lbl_original, processed_lbl_full_path, full_path_label) # Save original
397
+ # processed_lbl_full_path should still point to this saved original label
398
+ else:
399
+ processed_lbl_full_path = None
400
+ else:
401
+ processed_lbl_full_path = None
402
+
403
+ if processed_lbl_full_path:
404
+ label_path_dict['heart'] = processed_lbl_full_path
405
+
406
+ print('compare image and label size',sitk_img_original.GetSize(),sitk_lbl_original.GetSize())
407
+ print('compare image and label size',sitk_img_processed.GetSize(),sitk_lbl_processed.GetSize())
408
+ try:
409
+ assert sitk_img_processed.GetSize() == sitk_lbl_processed.GetSize()
410
+
411
+ except Exception as e:
412
+ failed_files.append(full_path_label)
413
+ continue
414
+
415
+ except RuntimeError:
416
+ failed_files.append(full_path_image)
417
+ print(f"Failed to load MnMs images from {full_path_image}")
418
+ continue
419
+
420
+
421
+
422
+
423
+ size_processed = list(sitk_img_processed.GetSize())
424
+ print('size_processed',size_processed)
425
+
426
+ # meta.add_keyvalue('Image_id',meta_image_id)
427
+ meta.add_keyvalue('Spacing_mm',min(original_spacing[:3]))## keep the smallest spacing among the first three axes (x, y, z)
428
+ meta.add_keyvalue('OriImg_path',full_path_image)
429
+ meta.add_keyvalue('Size',size_processed) # the processed size is used here -- YH Jachin
430
+ meta.add_keyvalue('Modality',modality)
431
+ meta.add_keyvalue('Dataset_name',study)
432
+ meta.add_keyvalue('ROI','chest')
433
+
434
+
435
+ if processed_lbl_full_path:
436
+ print(label_path_dict.keys())
437
+ meta.add_keyvalue('Task',TASK_VALUE)
438
+ # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys()))
439
+ meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
440
+ meta.add_keyvalue('Label_Dict',LABEL_DICT)
441
+ meta.add_extra_keyvalue('Metadata',CIA_other_info)
442
+
443
+
444
+
445
+
446
+ # Write the mapping to the JSON file on the fly
447
+ with open(json_output_path, 'r+') as json_file:
448
+ existing_mappings = json.load(json_file)
449
+ existing_mappings[output_path] = meta.get_meta_data()
450
+ json_file.seek(0)
451
+ print(existing_mappings)
452
+ json.dump(existing_mappings, json_file, indent=4)
453
+ json_file.truncate()
454
+ # else:
455
+ # print("No metadata.csv files found.")
456
+
457
+ with open(failed_files_path, "w") as json_file:
458
+ json.dump(failed_files, json_file)
459
+
460
+ print(f"The list has been written to {failed_files_path}")
461
+ print(f"Saved NIfTI mappings to {json_output_path}")
462
+
463
+ if __name__ == "__main__":
464
+ parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
465
+ parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/MnMs/OpenDataset/")
466
+ parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/MnMs/")
467
+ args = parser.parse_args()
468
+ print(args.target_path, args.output_dir)
469
+ main(args.target_path, args.output_dir)
470
+
471
+
472
+
473
+
474
+
475
+
476
+
477
+
478
+
479
+
480
+
481
+
482
+
483
+
484
+
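For orientation, a minimal consumer-side sketch: assuming the script has been run with its default --output_dir (hypothetical on any other machine), this walks the nifti_mappings.json the run produces.

import json
import os

# Matches the script's --output_dir default above; adjust for your setup.
output_dir = "/home/data/Github/data/data_gen_def/DATASETS_processed/MnMs/"

with open(os.path.join(output_dir, "nifti_mappings.json")) as f:
    mappings = json.load(f)

# Each key is a processed NIfTI path; each value is its meta_data record.
for nifti_path, record in mappings.items():
    print(nifti_path, record.get("Modality"), record.get("Size"), record.get("Spacing_mm"))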
MnMs_clean/util.py ADDED
@@ -0,0 +1,406 @@
1
+ import os
2
+ import json
3
+ import SimpleITK as sitk
4
+ import glob
5
+ import pandas as pd
+ import warnings
6
+
7
+ def load_dicom_images(folder_path):
8
+ reader = sitk.ImageSeriesReader()
9
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
10
+ reader.SetFileNames(dicom_names)
11
+ image = reader.Execute()
12
+ return image
13
+
14
+ def convert_windows_to_linux_path(windows_path):
15
+ # Replace backslashes with forward slashes and remove the drive letter
16
+ # Some meta files have windows paths, but the data is stored on a linux server
17
+ linux_path = windows_path.replace('\\', '/')
18
+ if ':' in linux_path:
19
+ linux_path = linux_path.split(':', 1)[1]
20
+ return linux_path
21
+
22
+ # =============================================================================
23
+ # ========================developed with TotalSegmentor========================
24
+ # =============================================================================
25
+
26
+ def read_table(file_path, split_str=';'):
27
+ try:
28
+ df = pd.read_excel(file_path, engine='openpyxl')
29
+ except:
30
+ df = pd.read_csv(file_path, sep=split_str)
31
+ return df
32
+
33
+ def load_nifti(image_path):
34
+ return sitk.ReadImage(image_path)
35
+
36
+ def save_nifti(image, output_path, folder_path):
37
+ output_dirpath = os.path.dirname(output_path)
38
+ if not os.path.exists(output_dirpath):
39
+ print(f"Creating directory {output_dirpath}")
40
+ os.makedirs(output_dirpath)
41
+ # Set metadata in the NIfTI file's header
42
+ image.SetMetaData("FolderPath", folder_path)
43
+ sitk.WriteImage(image, output_path)
44
+
45
+ def find_metadata_files(path, file_name='*meta*'):
46
+ # for TotalSegmentor dataset
47
+ search_pattern = os.path.join(path, '**', file_name)
48
+ return glob.glob(search_pattern, recursive=True)
49
+
50
+ def get_img_path_from_folder(folder_path, img_type='.nii.gz', include_str=None, exclude_str='segmentation', is_sorted=True):
51
+ img_path = []
52
+ for root, dirs, files in os.walk(folder_path):
53
+ for file in files:
54
+ if file.endswith(img_type) and (include_str is None or include_str in file) and (exclude_str is None or exclude_str not in file):
55
+ img_path.append(os.path.join(root, file))
56
+ if is_sorted:
57
+ img_path.sort()
58
+ return img_path
59
+
60
+ def get_unisize_resampler(ref_img, interpolator='linear', spacing=None, size=None):
61
+ '''
62
+ Resample the image to have isotropic spacing, following the steps:
63
+ 1. Find the minimum spacing
64
+ 2. Resample the image to have the minimum spacing
65
+ 3. Set the interpolator (linear for images, nearest for segmentation masks)
66
+ 4. Set the output spacing
67
+ 5. Return the resampler for resampling
68
+ For example, if the input image has spacing [0.1, 0.1, 0.3], the output image will have spacing [0.1, 0.1, 0.1]
69
+ '''
70
+ # discuss why this function was rewritten!!!
71
+ if size is None:
72
+ size = ref_img.GetSize()
73
+ if spacing is None:
74
+ spacing = ref_img.GetSpacing()
75
+ min_spacing = min(spacing)
76
+ if all([spc == min_spacing for spc in spacing]):
77
+ return None
78
+ else:
79
+ # if 1:
80
+ if interpolator == 'nearest':
81
+ interpolator = sitk.sitkNearestNeighbor
82
+ elif interpolator == 'linear':
83
+ interpolator = sitk.sitkLinear
84
+ resampler = sitk.ResampleImageFilter()
85
+ # new_spacing = [max_spacing] * len(spacing)
86
+ # print(size)
87
+ new_size = [int(round(old_sz * old_spc / min_spacing)) for old_sz, old_spc in zip(size, spacing)]
88
+ new_size_xy=[new_size[0],new_size[1],new_size[2]]
89
+ # discuss why this function was rewritten!!! --- YHM Jachin
90
+ new_size_spacing=[min_spacing,min_spacing,min_spacing]
91
+ # discuss why this function was rewritten!!! --- YHM Jachin
92
+ # resampler.SetSize(new_size)
93
+ # resampler.SetOutputSpacing([min_spacing] * len(spacing))
94
+ resampler.SetSize(new_size_xy)
95
+ resampler.SetOutputSpacing(new_size_spacing)
96
+
97
+ # print(new_size,new_size_xy)
98
+ resampler.SetOutputOrigin(ref_img.GetOrigin())
99
+ resampler.SetOutputDirection(ref_img.GetDirection())
100
+ resampler.SetInterpolator(interpolator)
101
+ resampler.SetDefaultPixelValue(ref_img.GetPixelIDValue()) # caution: this uses the pixel type ID as the fill value; a constant such as 0 is likely intended
102
+ resampler.SetOutputPixelType(ref_img.GetPixelID())
103
+ return resampler
104
+
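# A minimal usage sketch with hypothetical inputs: for an anisotropic 3D volume,
# e.g. size (256, 256, 100) with spacing (1.0, 1.0, 2.5) mm, the minimum spacing
# is 1.0 mm, so the resampled size is
# [round(256*1.0/1.0), round(256*1.0/1.0), round(100*2.5/1.0)] = [256, 256, 250]
# at (1.0, 1.0, 1.0) mm. A hypothetical call site:
#
#     img = sitk.ReadImage('volume.nii.gz')  # hypothetical input path
#     resampler = get_unisize_resampler(img, interpolator='linear')
#     iso_img = resampler.Execute(img) if resampler is not None else img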
105
+ def clamp_image(in_img,clamp_range):
106
+ '''
107
+ Clamp the image to the specified range
108
+ '''
109
+ clamp_filter = sitk.ClampImageFilter()
110
+ clamp_filter.SetLowerBound(clamp_range[0])
111
+ clamp_filter.SetUpperBound(clamp_range[1])
112
+ return clamp_filter.Execute(in_img)
113
+
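# A minimal usage sketch: the dataclean scripts clamp CT intensities to a fixed
# window before saving, e.g. with CLAMP_RANGE_CT = [-300, 300] (Hounsfield units):
#
#     clamped = clamp_image(ct_img, [-300, 300])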
114
+ def get_synonyms_dict(dict_type='ROI'):
115
+ '''
116
+ Get the dictionary of synonyms for the specified dictionary type
117
+ '''
118
+ if dict_type == 'ROI':
119
+ dict_synonyms = {
120
+ 'whole-body': ['whole-body', 'whole body', 'wholebody', 'whole body', 'whole-body', 'whole body', 'wholebody','polytrauma','head-neck-thorax-abdomen-pelvis-leg','head-neck-thorax-abdomen-pelvis'],
121
+ 'neck-thorax-abdomen-pelvis-leg': ['neck-thorax-abdomen-pelvis-leg','neck-thx-abd-pelvis-leg', 'angiography neck-thx-abd-pelvis-leg', 'neck thorax abdomen pelvis leg', 'neck and thorax and abdomen and pelvis and leg', 'neck, thorax, abdomen, pelvis & leg', 'neck/thorax/abdomen/pelvis/leg', 'neck, thorax, abdomen, pelvis and leg', 'neck thorax abdomen pelvis leg'],
122
+ 'neck-thorax-abdomen-pelvis': ['neck-thorax-abdomen-pelvis', 'neck-thx-abd-pelvis', 'neck thorax abdomen pelvis', 'neck and thorax and abdomen and pelvis', 'neck, thorax, abdomen & pelvis', 'neck/thorax/abdomen/pelvis', 'neck, thorax, abdomen and pelvis', 'neck thorax abdomen & pelvis'],
123
+ 'thorax-abdomen-pelvis-leg': ['thorax-abdomen-pelvis-leg','thx-abd-pelvis-leg', 'angiography thx-abd-pelvis-leg', 'thorax abdomen pelvis leg', 'thorax and abdomen and pelvis and leg', 'thorax, abdomen, pelvis & leg', 'thorax/abdomen/pelvis/leg', 'thorax, abdomen, pelvis and leg', 'thorax abdomen pelvis leg'],
124
+ 'neck-thorax-abdomen': ['neck-thorax-abdomen', 'neck-thorax-abdomen', 'neck thorax abdomen', 'neck and thorax and abdomen', 'neck, thorax, abdomen', 'neck/thorax/abdomen', 'neck, thorax, abdomen', 'neck thorax abdomen'],
125
+ 'head-neck-thorax-abdomen': ['head-neck-thorax-abdomen', 'head-neck-thorax-abdomen', 'head neck thorax abdomen', 'head and neck and thorax and abdomen', 'head, neck, thorax, abdomen', 'head/thorax/abdomen', 'head, thorax, abdomen', 'head thorax abdomen'],
126
+ 'head-neck-thorax': ['head-neck-thorax', 'head neck thorax', 'head and neck and thorax', 'head, neck, thorax', 'head/thorax', 'head, thorax', 'head thorax'],
127
+ 'thorax-abdomen-pelvis': ['thorax-abdomen-pelvis', 'thx-abd-pelvis', 'polytrauma', 'thorax abdomen pelvis', 'thorax and abdomen and pelvis', 'thorax, abdomen & pelvis', 'thorax/abdomen/pelvis', 'thorax, abdomen and pelvis', 'thorax abdomen & pelvis'],
128
+ 'abdomen-pelvis-leg': ['abdomen-pelvis-leg', 'angiography abdomen-pelvis-leg', 'abd-pelvis-leg', 'abdomen pelvis leg', 'abdomen and pelvis and leg', 'abdomen, pelvis & leg', 'abdomen/pelvis/leg', 'abdomen, pelvis, leg', 'abdomen pelvis leg'],
129
+ 'neck-thorax': ['neck-thorax', 'neck thorax', 'neck and thorax', 'neck, thorax', 'thorax-neck', 'thorax neck', 'thorax and neck', 'thorax, neck','thorax/neck'],
130
+ 'thorax-abdomen': ['thorax-abdomen', 'thorax abdomen', 'thorax and abdomen', 'thorax, abdomen'],
131
+ 'abdomen-pelvis': ['abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis', 'abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis'],
132
+ 'pelvis-leg': ['pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg', 'pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg'],
133
+ 'head-neck': ['head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck', 'head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck'],
134
+ 'abdomen': ['abdomen', 'abdominal', 'belly', 'stomach', 'tummy', 'gut', 'guts', 'viscera', 'bowels', 'intestines', 'gastrointestinal', 'digestive', 'peritoneum','gastric', 'liver', 'spleen', 'pancreas','kidney','lumbar','renal','hepatic','splenic','pancreatic','intervention'],
135
+ 'thorax': ['chest', 'thorax', 'breast', 'lung', 'heart','heart-thorakale aorta', 'heart-thorakale', 'mediastinum', 'pleura', 'bronchus', 'bronchi', 'trachea', 'esophagus', 'diaphragm', 'rib', 'sternum', 'clavicle', 'scapula', 'axilla', 'armpit','breast biopsy','thoracic','mammary','caeiothoracic','mediastinal','pleural','bronchial','bronchial tree','tracheal','esophageal','diaphragmatic','costal','sternal','clavicular','scapular','axillary','axillar','cardiac','pericardial','pericardiac','pericardium'],
136
+ 'head': ['head', 'headbasis', 'brain', 'skull', 'face','nose','ear','eye','mouth','jaw','cheek','chin','forehead','temporal','parietal','occipital','frontal','mandible','maxilla','mandibular','maxillary','nasal','orbital','orbita','ocular','auricular','otic','oral','buccal','labial','lingual','palatal'],
137
+ 'neck': ['neck', 'throat', 'cervical', 'thyroid', 'trachea', 'larynx', 'pharynx', 'esophagus','pharyngeal','laryngeal','cervical','thyroid','trachea','esophagus','carotid','jugular'],
138
+ 'hand': ['hand', 'finger', 'thumb', 'palm', 'wrist', 'knuckle', 'fingernail', 'phalanx', 'metacarpal', 'carpal', 'radius'],
139
+ 'arm': ['arm', 'forearm', 'upper arm', 'bicep', 'tricep', 'brachium', 'brachial', 'humerus', 'radius', 'ulna', 'elbow', 'shoulder', 'armpit', 'clavicle', 'scapula', 'acromion', 'acromioclavicular'],
140
+ 'leg': ['leg', 'felsenleg','thigh', 'calf', 'shin', 'knee', 'foot', 'ankle', 'toe', 'heel', 'sole', 'arch', 'instep', 'metatarsal', 'phalanx', 'tibia', 'fibula', 'femur', 'patella', 'kneecap','achilles tendon','achilles'],
141
+ 'pelvis': ['pelvis', 'hip', 'groin', 'buttock', 'gluteus', 'gluteal', 'ischium', 'pubis', 'sacrum', 'coccyx', 'acetabulum', 'iliac', 'iliac crest', 'iliac spine', 'iliac wing', 'sacroiliac', 'sacroiliac joint', 'sacroiliac ligament', 'sacroiliac spine', 'ureter', 'bladder', 'urethra', 'prostate', 'testicle', 'ovary', 'uterus',],
142
+ 'skeleton': ['skeleton','bone','spine', 'back', 'vertebra', 'sacrum', 'coccyx'],
143
+ }
144
+ elif dict_type == 'Label_tissue':
145
+ dict_synonyms = {
146
+ 'liver': ['liver','hepatic'],
147
+ 'spleen': ['spleen','splenic'],
148
+ 'kidney': ['kidney','renal'],
149
+ 'pancreas': ['pancreas','pancreatic'],
150
+ 'stomach': ['stomach','gastric'],
151
+ 'intestine': ['large intestine', 'small intestine','large bowel','small bowel'],
152
+ 'gallbladder': ['gallbladder'],
153
+ 'adrenal_gland': ['adrenal_gland','adrenal gland'],
154
+ 'bladder': ['bladder'],
155
+ 'prostate': ['prostate'],
156
+ 'uterus': ['uterus'],
157
+ 'ovary': ['ovary'],
158
+ 'testicle': ['testicle'],
159
+ 'lymph_node': ['lymph_node','lymph node'],
160
+ 'bone': ['bone'],
161
+ 'lung': ['lung'],
162
+ 'heart': ['heart'],
163
+ 'esophagus': ['esophagus'],
164
+ 'muscle': ['muscle'],
165
+ 'fat': ['fat'],
166
+ 'skin': ['skin'],
167
+ 'vessel': ['vessel'],
168
+ 'tumor': ['tumor'],
169
+ 'other': ['other']
170
+ }
171
+ elif dict_type == 'Task':
172
+ dict_synonyms = {
173
+ 'segmentation': ['segmentation', 'seg', 'mask'],
174
+ 'classification': ['classification', 'class', 'diagnosis','identify','identification'],
175
+ 'localization': ['localization', 'locate', 'location', 'position'],
176
+ 'registration': ['registration', 'register', 'align', 'alignment'],
177
+ 'detection': ['detection', 'detect', 'find', 'locate'],
178
+ 'quantification': ['quantification', 'quantify', 'measure', 'measurement'],
179
+ }
180
+ elif dict_type == 'Modality':
181
+ dict_synonyms = {
182
+ 'CT': ['CT', 'computed tomography'],
183
+ 'MRI': ['MRI', 'MR', 'magnetic resonance imaging'],
184
+ 'PET': ['PET', 'positron emission tomography'],
185
+ 'US': ['US', 'ultrasound'],
186
+ 'X-ray': ['X-ray', 'radiography'],
187
+ 'SPECT': ['SPECT', 'single-photon emission computed tomography'],
188
+ }
189
+ else:
190
+ raise ValueError(f"dict_type {dict_type} is not valid")
191
+ return dict_synonyms
192
+
193
+ def replace_synonyms(text, dict_synonyms):
194
+ '''
195
+ Replace the synonyms in the text with the standard term
196
+ '''
197
+ if isinstance(text,str):
198
+ for key, value in dict_synonyms.items():
199
+ for v in value:
200
+ if v.lower() in text.lower():
201
+ return key
202
+ warnings.warn(f"Value {text} is not in the correct format")
203
+ elif isinstance(text,list):
204
+ text = [replace_synonyms(t, dict_synonyms) for t in text]
205
+ elif isinstance(text,dict):
206
+ for key in list(text.keys()):
207
+ # replace values in dict
208
+ text[key] = replace_synonyms(text[key], dict_synonyms)
209
+ # replace keys in dict with their standard terms
210
+ for key in list(text.keys()):
211
+ new_key = replace_synonyms(key, dict_synonyms)
+ if isinstance(new_key, str) and new_key != key:
+ text[new_key] = text.pop(key)
212
+ return text
213
+
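# A minimal usage sketch: matching is a case-insensitive substring test, and
# dict insertion order matters (multi-region keys are listed before single
# regions), so for example:
#
#     roi = replace_synonyms('Thorax/Abdomen/Pelvis', get_synonyms_dict('ROI'))
#     # roi == 'thorax-abdomen-pelvis'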
214
+ # =============================================================================
215
+
216
+ class meta_data(object):
217
+ '''
218
+ This class is used to store the metadata of the dataset
219
+ '''
220
+ def __init__(self):
221
+ self.config_format_path = os.path.join(os.path.dirname(__file__),'config_format.json')
222
+ with open(self.config_format_path, 'r') as file:
223
+ self.config_format = json.load(file)
224
+ self.config = {}
225
+ for key in self.config_format.keys():
226
+ if self.config_format[key]['required'] == True:
227
+ self.config[key] = {}
228
+ self.keytypes = self.find_all_keys_with_type()
229
+ self.keytypes_flatten = self.flatten_json()
230
+ self.ambiguity_keys = ['ROI', 'Label_tissue', 'Task', 'Modality']
231
+ for key in self.ambiguity_keys:
232
+ ambiguity_dict = get_synonyms_dict(key)
233
+ self.config_format[key]['options'] = list(ambiguity_dict.keys())
234
+
235
+ def get_ketytypes(self):
236
+ return self.keytypes
237
+
238
+ def get_keytypes_flatten(self):
239
+ return self.keytypes_flatten
240
+
241
+ def find_all_keys_with_type(self, data=None, parent_key=''):
242
+ if data is None:
243
+ data = self.config_format
244
+ keys_with_type = {}
245
+ if isinstance(data, dict):
246
+ for key, value in data.items():
247
+ full_key = f"{parent_key}.{key}" if parent_key else key
248
+ if isinstance(value, dict) and 'type' in value:
249
+ keys_with_type[full_key] = value['type']
250
+ keys_with_type.update(self.find_all_keys_with_type(value, full_key))
251
+ elif isinstance(data, list):
252
+ for index, item in enumerate(data):
253
+ full_key = f"{parent_key}[{index}]"
254
+ keys_with_type.update(self.find_all_keys_with_type(item, full_key))
255
+ return keys_with_type
256
+
257
+ def flatten_json(self, data=None, parent_key='', sep='.'):
258
+ if data is None:
259
+ data = self.config_format
260
+ items = {}
261
+ if isinstance(data, dict):
262
+ for key, value in data.items():
263
+ new_key = f"{parent_key}{sep}{key}" if parent_key else key
264
+ if isinstance(value, dict):
265
+ items.update(self.flatten_json(value, new_key, sep=sep))
266
+ elif isinstance(value, list):
267
+ for i, item in enumerate(value):
268
+ items.update(self.flatten_json(item, f"{new_key}[{i}]", sep=sep))
269
+ else:
270
+ items[new_key] = value
271
+ elif isinstance(data, list):
272
+ for i, item in enumerate(data):
273
+ items.update(self.flatten_json(item, f"{parent_key}[{i}]", sep=sep))
274
+ return items
275
+
276
+ def req_check(self):
277
+ self.unfilled_keys = []
278
+ for key in self.config.keys():
279
+ if self.config[key] == {}:
280
+ self.unfilled_keys.append(key)
281
+ if len(self.unfilled_keys) == 0:
282
+ return True
283
+ else:
284
+ return False
285
+
286
+ def type_check(self, key, value):
287
+ if key not in self.config_format.keys():
288
+ print(key, "is not a valid key")
289
+ return False
290
+
291
+ if key == 'Modality':
292
+ if value not in self.config_format[key]['options']:
293
+ return False
294
+ else:
295
+ return True
296
+
297
+ elif key == 'OriImg_path':
298
+ if isinstance(value, str):
299
+ return True
300
+ else:
301
+ return False
302
+
303
+ elif key == 'Label_path' and isinstance(value, dict):
304
+ for skey in value.keys():
305
+ if skey in self.config_format[key]['keys']:
306
+ for kk in value[skey]:
307
+ if isinstance(value[skey][kk],str):
308
+ pass
309
+ # if kk in self.config_format[key]['value']['keys']:
310
+ # if isinstance(value[skey][kk],str):
311
+ # pass
312
+ # else:
313
+ # return False
314
+ else:
315
+ return False
316
+ return True
317
+
318
+ elif key == 'ROI':
319
+ if value not in self.config_format[key]['options']:
320
+ return False
321
+ else:
322
+ return True
323
+
324
+ elif key == 'Label_tissue' and isinstance(value, list):
325
+ for i in value:
326
+ if i not in self.config_format[key]['items']['options']:
327
+ return False
328
+ return True
329
+
330
+ elif key =='Task' and isinstance(value, list):
331
+ for i in value:
332
+ if i not in self.config_format[key]['items']['options']:
333
+ return False
334
+ return True
335
+
336
+ elif key == 'Spacing_mm':
337
+ if isinstance(value, float):
338
+ return True
339
+ else:
340
+ return False
341
+
342
+ # elif key == 'Size' and isinstance(value, list) and len(value) == 3 :
343
+ elif key == 'Size' and isinstance(value, list) and len(value) >= 3 :
344
+ return all(isinstance(item, int) for item in value)
345
+
346
+ elif key == 'Dataset_name':
347
+ if isinstance(value, str):
348
+ return True
349
+ else:
350
+ return False
351
+ elif key == 'ImgDict':
352
+ if isinstance(value, dict):
353
+ return True
354
+ else:
355
+ return False
356
+ elif key == 'Label_Dict':
357
+ if isinstance(value, dict):
358
+ return True
359
+ else:
360
+ return False
361
+ def add_extra_keyvalue(self, key, value):
362
+ self.config[key] = value
363
+ return True
364
+
365
+ def add_keyvalue(self, key, value):
366
+ if key in self.ambiguity_keys:
367
+ value = replace_synonyms(value, get_synonyms_dict(key))
368
+ # print(key, value)
369
+ if self.type_check(key, value):
370
+ self.config[key] = value
371
+ return True
372
+ else:
373
+ warnings.warn(f"Value {value} is not in the correct format for key {key}")
374
+ pass
375
+ # print(f"Value {value} is not in the correct format for key {key}")
376
+
377
+ def get_meta_data(self):
378
+ if self.req_check():
379
+ return self.config
380
+ else:
381
+ print("Not all required keys are filled", self.unfilled_keys)
382
+ return False
383
+
384
+
385
+
386
+ if __name__ == '__main__':
387
+ meta = meta_data()
388
+ print(meta.get_keytypes_flatten())
389
+ print(meta.get_ketytypes())
390
+ meta.add_keyvalue('Modality', 'CT')
391
+ meta.add_keyvalue('OriImg_path', 'C:/Users/jzheng/Desktop/CT')
392
+ meta.add_keyvalue('Label_path', {'ROI': {'1': 'C:/Users/jzheng/Desktop/CT/1'}, 'Tissue': {'1': 'C:/Users/jzheng/Desktop/CT/1'}})
393
+ meta.add_keyvalue('Spacing_mm', 1.5)
394
+ meta.add_keyvalue('Size', [512, 512, 100])
395
+ meta.add_keyvalue('Dataset_name', 'CT')
396
+ meta.add_keyvalue('Label_tissue', ['1', '2', '3'])
397
+ meta.add_keyvalue('Task', ['1', '2', '3'])
398
+ print(meta.get_meta_data())
399
+ meta.add_extra_keyvalue('extra', 'extra')
400
+ print(meta.get_meta_data())
401
+ print(meta.get_ketytypes())
402
+ print(meta.get_keytypes_flatten())
403
+
404
+ org_data_folder_path = '/home/jachin/data/Github/data/data_gen_def/DATASETS/TotalSegmentorCT_MRI/TS_CT'
405
+ img_paths = get_img_path_from_folder(org_data_folder_path, img_type='.nii.gz', include_str='ct', exclude_str='segmentation')
406
+ print(img_paths)
OAISIS_clean/config_format.json ADDED
@@ -0,0 +1,125 @@
1
+ {
2
+ "Modality": {
3
+ "type": "option",
4
+ "required": true,
5
+ "options": [
6
+ "CT",
7
+ "MRI",
8
+ "T1",
9
+ "T2",
10
+ "X-ray",
11
+ "Fluoroscopy",
12
+ "US",
13
+ "PET"
14
+ ]
15
+ },
16
+ "OriImg_path": {
17
+ "type": "string",
18
+ "required": true
19
+ },
20
+ "Label_path": {
21
+ "type": "dict",
22
+ "required": false,
23
+ "keys": [
24
+ "classification",
25
+ "segmentation",
26
+ "regression",
27
+ "detection",
28
+ "localization",
29
+ "registration",
30
+ "other"
31
+ ],
32
+ "value": {
33
+ "type": "dict",
34
+ "required": false,
35
+ "keys": [
36
+ "lung",
37
+ "liver",
38
+ "heart",
39
+ "brain",
40
+ "kidney"
41
+ ],
42
+ "value": {
43
+ "type": "string",
44
+ "required": false
45
+ }
46
+ }
47
+ },
48
+ "ROI": {
49
+ "type": "option",
50
+ "required": false,
51
+ "options": [
52
+ "chest-abdomen",
53
+ "abdomen-pelvis",
54
+ "head",
55
+ "neck",
56
+ "skeleton",
57
+ "chest",
58
+ "abdomen",
59
+ "shoulder",
60
+ "leg",
61
+ "arm",
62
+ "hand",
63
+ "foot",
64
+ "pelvis"
65
+ ]
66
+ },
67
+ "Label_tissue": {
68
+ "type": "list",
69
+ "required": false,
70
+ "items": {
71
+ "type": "option",
72
+ "required": true,
73
+ "options": [
74
+ "lung",
75
+ "liver",
76
+ "heart",
77
+ "brain",
78
+ "kidney",
79
+ "spleen",
80
+ "pancreas",
81
+ "stomach",
82
+ "intestine",
83
+ "muscle",
84
+ "bone"
85
+ ]
86
+ }
87
+ },
88
+ "Task": {
89
+ "type": "list",
90
+ "required": false,
91
+ "items": {
92
+ "type": "option",
93
+ "required": true,
94
+ "options": [
95
+ "classification",
96
+ "segmentation"
97
+ ]
98
+ }
99
+ },
100
+ "Spacing_mm": {
101
+ "type": "float",
102
+ "required": true
103
+ },
104
+ "Size": {
105
+ "type": "list",
106
+ "required": true,
107
+ "items": {
108
+ "type": "int",
109
+ "required": true
110
+ }
111
+ },
112
+ "Dataset_name": {
113
+ "type": "string",
114
+ "required": true
115
+ },
116
+
117
+ "Sub_modality": {
118
+ "type": "dict",
119
+ "required": false
120
+ },
121
+ "Label_Dict": {
122
+ "type": "dict",
123
+ "required": false
124
+ }
125
+ }
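This schema is what util.meta_data loads to drive its required-key and type checks. A minimal standalone sketch of the same idea, assuming it is run from the repo root with a hypothetical record:

import json

with open("OAISIS_clean/config_format.json") as f:
    schema = json.load(f)

# The required keys every nifti_mappings.json record must carry.
required = [k for k, spec in schema.items() if spec.get("required")]
print(required)  # ['Modality', 'OriImg_path', 'Spacing_mm', 'Size', 'Dataset_name']

record = {"Modality": "MRI", "OriImg_path": "/data/x.img",
          "Spacing_mm": 1.0, "Size": [176, 208, 176], "Dataset_name": "OASIS_1"}
missing = [k for k in required if k not in record]
assert not missing, f"missing required keys: {missing}"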
OAISIS_clean/dataclean_OASIS_1_CS_Sectional.py ADDED
@@ -0,0 +1,358 @@
1
+ #coding:utf-8
2
+ '''
3
+ written by ygq
4
+ created on 2025-09-01
5
+
6
+ OASIS (Open Access Series of Imaging Studies) is a project that makes brain MRI data freely available to the research community. This Cross-Sectional dataset is its first release, published in 2007.
7
+ OASIS-1 is cross-sectional, meaning it cannot capture how an individual changes over time. For studying disease progression, the later OASIS-2 and OASIS-3 (longitudinal datasets) are better choices.
8
+
9
+ 1. Directory and file naming conventions
10
+ Folders are created under the root directory per subject session ID.
11
+ Subject ID format: OAS1_xxxx (e.g. OAS1_0012)
12
+ Session ID format: OAS1_xxxx_MRy (e.g. OAS1_0012_MR1, where y is the imaging visit number)
13
+ OAS1_xxxx_MRy/
14
+
15
+ ├── OAS1_xxxx_MRy.xml # XML metadata file with acquisition details and anatomical measures
16
+ ├── OAS1_xxxx_MRy.txt # text file with the same content as the XML (easier to view)
17
+ ├── RAW/ # raw scan images (DICOM or Analyze format)
18
+ ├── PROCESSED/ # preprocessed images
19
+ │ ├── SUBJ_111/ # averaged, co-registered images in native space (isotropic 1 mm³)
20
+ │ └── T88_111/ # images in atlas-registered space
21
+ │ ├── t4_files/ # registration transform matrix files
22
+ │ └── ... # registered image files
23
+ └── FSL_SEG/ # brain tissue segmentation derived from the atlas-registered image (gray matter 2 / white matter 3 / CSF 1)
24
+
25
+
26
+ All images are stored in Analyze 7.5 format, consisting of:
27
+ an image file (.img)
28
+ a header file (.hdr)
29
+ stored as 16-bit big-endian
30
+
31
+ OAS1_xxxx_MRy_mpr-z_anon single raw scan 256x256x128 1x1x1.25 mm sagittal
32
+ OAS1_xxxx_MRy_mpr_ni_anon_sbj_111 average of multiple co-registered scans 256x256x160 1x1x1 mm sagittal
33
+ OAS1_xxxx_MRy_mpr_ni_anon_111_t88_gfc gain-field-corrected atlas-registered image 176x208x176 1x1x1 mm axial
34
+ OAS1_xxxx_MRy_mpr_ni_anon_111_t88_masked_gfc masked image with non-brain tissue removed 176x208x176 1x1x1 mm axial
35
+ OAS1_xxxx_MRy_mpr_ni_anon_111_t88_masked_gfc_fseg brain tissue segmentation (gray/white/CSF) 176x208x176 1x1x1 mm axial
36
+
37
+ 1. Demographics
38
+ Sex (M/F)
39
+ Handedness (Hand) (all right-handed)
40
+ Age
41
+ Education level (Educ) (levels 1-5)
42
+ Socioeconomic status (SES)
43
+
44
+ 2. Clinical assessments
45
+ MMSE (Mini-Mental State Examination)
46
+ CDR (Clinical Dementia Rating: 0=normal, 0.5=very mild, 1=mild, 2=moderate)
47
+
48
+ 3. Derived anatomical measures
49
+ eTIV: estimated total intracranial volume
50
+ ASF: atlas scaling factor
51
+ nWBV: normalized whole-brain volume
52
+
53
+
54
+ The FreeSurfer-processed version of the OASIS Cross-Sectional dataset is usually called the OASIS Cross-Sectional FreeSurfer Processed dataset.
55
+ After FreeSurfer processing, each subject's data is stored in its own directory following FreeSurfer's standard output layout.
56
+ ├── sub-OASIS10001/ # FreeSurfer output directory for subject 1
57
+ │ ├── mri/ # volume-based data
58
+ │ │ ├── orig.mgz # original image (converted to FreeSurfer format)
59
+ │ │ ├── nu.mgz # intensity-normalized image
60
+ │ │ ├── T1.mgz # image used for segmentation
61
+ │ │ ├── aseg.mgz # automatic subcortical segmentation
62
+ │ │ ├── aparc+aseg.mgz # combined cortical + subcortical segmentation
63
+ │ │ ├── brain.mgz # image with non-brain tissue removed
64
+ │ │ ├── brainmask.mgz # brain mask
65
+ │ │ └── ... (other files)
66
+ │ ├── surf/ # surface-based data
67
+ │ │ ├── lh.pial # left-hemisphere pial surface
68
+ │ │ ├── lh.white # left-hemisphere white-matter surface
69
+ │ │ ├── rh.pial # right-hemisphere pial surface
70
+ │ │ ├── rh.white # right-hemisphere white-matter surface
71
+ │ │ ├── lh.thickness # left-hemisphere cortical thickness map
72
+ │ │ └── ... (other files)
73
+ │ ├── stats/ # statistics (text files)
74
+ │ │ ├── aseg.stats # subcortical structure volume statistics
75
+ │ │ ├── lh.aparc.stats # left-hemisphere cortical region thickness/area statistics
76
+ │ │ └── rh.aparc.stats # right-hemisphere cortical region thickness/area statistics
77
+ │ └── label/ # label files
78
+ │ └── ...
79
+ '''
80
+ import os
81
+ import glob
82
+ import pandas as pd
83
+ import SimpleITK as sitk
84
+ import argparse
85
+ import json
86
+ from tqdm import tqdm
87
+ from util import meta_data
88
+ import util
89
+ import numpy as np
90
+ # from bert_helper import *
91
+
92
+ import shutil
93
+
94
+ import warnings
95
+ warnings.filterwarnings("ignore")
96
+ meta_id_name='ID'
97
+ ## Sex (M/F), Handedness (Hand) (all right-handed), Age, Education level (Educ, levels 1-5), Socioeconomic status (SES), MMSE (Mini-Mental State Examination), CDR (Clinical Dementia Rating: 0=normal, 0.5=very mild, 1=mild, 2=moderate), eTIV: estimated total intracranial volume, ASF: atlas scaling factor, nWBV: normalized whole-brain volume
98
+ META_COLUMN=['ID', 'M/F', 'Hand', 'Age', 'Educ', 'SES', 'MMSE', 'CDR', 'eTIV','nWBV', 'ASF', 'Delay']
99
+
100
+ TASK_VALUE="segmentation"
101
+ CLAMP_RANGE_CT = [-300,300]
102
+ CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
103
+ TARGET_VOXEL_SPACING=None
104
+
105
+ ## sub-modality descriptions, following the MSD convention
106
+ SUB_MODALITY=["FLAIR","T1w","t1gd","T2w"]
107
+ ## sort order corresponding to the file names
108
+ SERIES_ORDER=["flair","t1","t1ce","t2"]
109
+
110
+ LABEL_DICT={
111
+ "0":"background",
112
+ "1":"cerebrospinal fluid",#CSF
113
+ "2":"gray matter",#GM
114
+ "3":"white matter"#WM
115
+ }
116
+ # def find_metadata_files(path):
117
+ # # for Cancer Image Archive (TCIA) dataset
118
+ # search_pattern = os.path.join(path, '**', 'metadata.csv')
119
+ # return glob.glob(search_pattern, recursive=True)
120
+
121
+ def find_metadata_files(path):
122
+ # for Cancer Image Archive (TCIA) dataset
123
+ search_pattern = os.path.join(path, '*.csv')
124
+ return glob.glob(search_pattern, recursive=True)
125
+ ##added by yanguoqing on 20250527
126
+ def find_image_dirs(path):
127
+ return os.listdir(path)
128
+
129
+ ##modify by yanguoqing on 20250527
130
+ def load_dicom_images(folder_path):
131
+ reader = sitk.ImageSeriesReader()
132
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
133
+ reader.SetFileNames(dicom_names)
134
+ image = reader.Execute()
135
+ return dicom_names,image
136
+
137
+ ##added by yanguoqing on 20250527
138
+ def load_dicom_tag(imgs):
139
+ reader = sitk.ImageFileReader()
140
+ # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
141
+ reader.SetFileName(imgs)
142
+ reader.ReadImageInformation() # read only the header metadata here (note: the Execute() call below does load the pixel data)
143
+ # metadata_keys = reader.GetMetaDataKeys()
144
+ tag=reader.Execute()
145
+ return tag
146
+
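# A minimal usage sketch: after ReadImageInformation(), individual DICOM tags
# are available without loading pixel data, e.g. the modality tag (0008,0060):
#
#     reader = sitk.ImageFileReader()
#     reader.SetFileName('slice0001.dcm')     # hypothetical DICOM file
#     reader.ReadImageInformation()
#     print(reader.GetMetaData('0008|0060'))  # e.g. 'MR'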
147
+ def load_nrrd(fp):
148
+ return sitk.ReadImage(fp)
149
+
150
+ ##modify by yanguoqing on 20250805
151
+ def load_brtas_images(series_files):
152
+ '''
153
+ Each case contains 3D MRI scans of four different sequences (all preprocessed: registered, resampled to 1 mm³ isotropic, skull-stripped).
154
+ The separate modalities are merged along a 4th array dimension, ordered FLAIR, T1, T1CE, T2.
155
+ '''
156
+ reader = sitk.ImageSeriesReader()
157
+ reader.SetFileNames(series_files)
158
+ image = reader.Execute()
159
+ return image
160
+
161
+ def save_nifti(image, output_path, folder_path):
162
+ # Set metadata in the NIfTI file's header
163
+ output_dirpath = os.path.dirname(output_path)
164
+ if not os.path.exists(output_dirpath):
165
+ print(f"Creating directory {output_dirpath}")
166
+ os.makedirs(output_dirpath)
167
+ # Set metadata in the NIfTI file's header
168
+ image.SetMetaData("FolderPath", folder_path)
169
+ sitk.WriteImage(image, output_path)
170
+
171
+ ##modify by yanguoqing on 20250527
172
+ def convert_windows_to_linux_path(windows_path):
173
+ # Replace backslashes with forward slashes and remove the drive letter
174
+ # Some meta files have windows paths, but the data is stored on a linux server
175
+ linux_path = windows_path.replace('\\', '/')
176
+ if ':' in linux_path:
177
+ linux_path = linux_path.split(':', 1)[1]
178
+ return linux_path
179
+
180
+ def main(target_path, output_dir):
181
+
182
+ pid_dirs=find_image_dirs(target_path)
183
+ failed_files = []
184
+ if not os.path.isdir(output_dir):
185
+ os.makedirs(output_dir)
186
+ json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
187
+ failed_files_path = os.path.join(output_dir, 'failed_files.json')
188
+ meta = meta_data()
189
+
190
+ # Initialize the JSON file
191
+ if not os.path.exists(json_output_path):
192
+ with open(json_output_path, 'w') as json_file:
193
+ json.dump({}, json_file)
194
+ ## the metadata sheet was converted to CSV to make parsing easier
195
+ meta_file=os.path.join(os.path.dirname(os.path.realpath(__file__)),'oasis_cross-sectional-5708aa0a98d82080.csv')
196
+ meta_file_ori=os.path.join(target_path,'oasis_cross-sectional-5708aa0a98d82080.xlsx')
197
+ if os.path.isfile(meta_file):
198
+ mf_flag=True
199
+ df_meta=pd.read_csv(meta_file,sep=',')
200
+ else:
201
+ mf_flag=False
202
+
203
+
204
+ if pid_dirs:
205
+ for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
206
+ if not os.path.isdir(os.path.join(target_path,pid_dir)):
207
+ continue
208
+
209
+ ## iterate over the case data under every directory
210
+ image_dirs=find_image_dirs(os.path.join(target_path,pid_dir))
211
+
212
+ for data_dir in tqdm(image_dirs, desc="Processing images files"):
213
+ ## data_dir is the case ID
214
+ full_path=os.path.join(target_path,pid_dir,data_dir)
215
+
216
+ modality="MRI"
217
+ study='OASIS_1'##Dataset_name
218
+ CIA_other_info = {'metadata_file':''}
219
+ CIA_other_info['split'] = "train"
220
+ CIA_other_info['metadata_file']=meta_file_ori
221
+ data_info_row=df_meta[df_meta[meta_id_name]==data_dir]
222
+
223
+ if data_info_row.shape[0]>0:
224
+ data_info_row=data_info_row.reset_index()
225
+ #print(data_info_row[meta_id_name])
226
+ for keyname in META_COLUMN[1:]:
227
+ CIA_other_info[keyname]=str(data_info_row[keyname][0])
228
+
229
+ CIA_other_info['Image_id']=data_dir
230
+
231
+
232
+ else:
233
+ meta_image_id=data_dir
234
+ for keyname in META_COLUMN[1:]:
235
+ CIA_other_info[keyname]=''
236
+
237
+
238
+
239
+ try:
240
+ ## load the skull-stripped .img (brain tissue only)
241
+ #\PROCESSED\MPRAGE\T88_111\OAS1_0001_MR1_mpr_n4_anon_111_t88_masked_gfc.img
242
+ full_file=glob.glob("%s/PROCESSED/MPRAGE/T88_111/%s_*_anon_111_t88_masked_gfc.img"%(full_path,data_dir))[0]
243
+ # full_file=os.path.join(full_path,"PROCESSED/MPRAGE/T88_111","%s_mpr_n4_anon_111_t88_masked_gfc.img"%data_dir)
244
+
245
+ if os.path.isfile(full_file):
246
+ ## valid MRI image data exists; proceed with processing
247
+ sitk_img_original=util.load_nifti(full_file)
248
+ else:
249
+ print("Case data %s is empty" % data_dir)
250
+ continue
251
+
252
+
253
+ original_spacing = list(sitk_img_original.GetSpacing())
254
+ original_size = list(sitk_img_original.GetSize())
255
+
256
+
257
+
258
+ meta.add_keyvalue('Spacing_mm',min(original_spacing))
259
+ meta.add_keyvalue('OriImg_path',full_file)
260
+ meta.add_keyvalue('Size',original_size) # no resampling is done here, so the original size is stored
261
+ meta.add_keyvalue('Modality',modality)
262
+ meta.add_keyvalue('Dataset_name',study)
263
+ meta.add_keyvalue('ROI','head')
264
+
265
+ meta.add_keyvalue('Label_Dict',LABEL_DICT)
266
+
267
+ output_image_file = os.path.join(output_dir,data_dir, f"{data_dir}.nii.gz")
268
+ # output_path=convert_windows_to_linux_path(output_path)
269
+ ##
270
+ save_nifti(sitk_img_original, output_image_file, full_path)
271
+ print(f"Saved NIfTI file to {output_image_file}")
272
+ ##Label processing
273
+
274
+ label_path_dict={}
275
+ #OAS1_0001_MR1_mpr_n4_anon_111_t88_masked_gfc_fseg.img
276
+ full_label_file=glob.glob("%s/FSL_SEG/%s_*_anon_111_t88_masked_gfc_fseg.img"%(full_path,data_dir))[0]
277
+
278
+
279
+ process_label_path=os.path.join(output_dir,data_dir,'segmentation')
280
+
281
+ processed_lbl_full_path=os.path.join(process_label_path, f"{data_dir}.nii.gz")
282
+
283
+ if not os.path.isdir(process_label_path):
284
+ os.makedirs(process_label_path,exist_ok=True)
285
+
286
+ if not os.path.isfile(full_label_file):
287
+ label_flag=False
288
+ else:
289
+ sitk_lbl_original = util.load_nifti(full_label_file)
290
+ util.save_nifti(sitk_lbl_original, processed_lbl_full_path, os.path.dirname(full_label_file)) # Save original
291
+ print(f"Saved segmentation NIfTI file to {processed_lbl_full_path}")
292
+
293
+ label_path_dict['head'] = processed_lbl_full_path
294
+ label_flag=True
295
+
296
+ if label_flag:
297
+ meta.add_keyvalue('Task',TASK_VALUE)
298
+ meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
299
+
300
+
301
+
302
+ # try:
303
+ # assert sitk_img_processed.GetSize() == sitk_lbl_processed.GetSize()
304
+ # except Exception as e:
305
+ # failed_files.append(full_path_label)
306
+ # continue
307
+ print(sitk_img_original.GetSize(),sitk_lbl_original.GetSize())
308
+
309
+ except Exception as e:
310
+ print(e)
311
+ failed_files.append(data_dir)
312
+ print(f"Failed to load OASIS images from {data_dir}")
313
+ continue
314
+
315
+
316
+
317
+ meta.add_extra_keyvalue('Metadata',CIA_other_info)
318
+
319
+
320
+ # Write the mapping to the JSON file on the fly
321
+ with open(json_output_path, 'r+') as json_file:
322
+ existing_mappings = json.load(json_file)
323
+ existing_mappings[output_image_file] = meta.get_meta_data()
324
+ json_file.seek(0)
325
+ # print(existing_mappings)
326
+ json.dump(existing_mappings, json_file, indent=4)
327
+ json_file.truncate()
328
+ # else:
329
+ # print("No metadata.csv files found.")
330
+
331
+ with open(failed_files_path, "w") as json_file:
332
+ json.dump(failed_files, json_file)
333
+
334
+ print(f"The list has been written to {failed_files_path}")
335
+ print(f"Saved NIfTI mappings to {json_output_path}")
336
+
337
+ if __name__ == "__main__":
338
+ parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
339
+ parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/OASIS/OASIS_1/oasis_cs_sectional/")
340
+ parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/OASIS/OASIS_1/CS_SECTIONAL")
341
+ args = parser.parse_args()
342
+ print(args.target_path, args.output_dir)
343
+ main(args.target_path, args.output_dir)
344
+
345
+
346
+
347
+
348
+
349
+
350
+
351
+
352
+
353
+
354
+
355
+
356
+
357
+
358
+
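The conversion above reduces to SimpleITK's Analyze 7.5 reader plus a NIfTI writer. A minimal sketch, assuming the session layout documented in the docstring (hypothetical case ID):

import SimpleITK as sitk

img_path = ("OAS1_0001_MR1/PROCESSED/MPRAGE/T88_111/"
            "OAS1_0001_MR1_mpr_n4_anon_111_t88_masked_gfc.img")

image = sitk.ReadImage(img_path)  # the paired .hdr file is picked up automatically
print(image.GetSize(), image.GetSpacing())  # per the docstring: (176, 208, 176), 1x1x1 mm
sitk.WriteImage(image, "OAS1_0001_MR1.nii.gz")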
OAISIS_clean/dataclean_OASIS_1_CS_Sectional_Unmask.py ADDED
@@ -0,0 +1,359 @@
1
+ #coding:utf-8
2
+ '''
3
+ written by ygq
4
+ created on 2025-09-01
5
+
6
+ OASIS (Open Access Series of Imaging Studies) is a project that makes brain MRI data freely available to the research community. This Cross-Sectional dataset is its first release, published in 2007.
7
+ OASIS-1 is cross-sectional, meaning it cannot capture how an individual changes over time. For studying disease progression, the later OASIS-2 and OASIS-3 (longitudinal datasets) are better choices.
8
+
9
+ 1. Directory and file naming conventions
10
+ Folders are created under the root directory per subject session ID.
11
+ Subject ID format: OAS1_xxxx (e.g. OAS1_0012)
12
+ Session ID format: OAS1_xxxx_MRy (e.g. OAS1_0012_MR1, where y is the imaging visit number)
13
+ OAS1_xxxx_MRy/
14
+
15
+ ├── OAS1_xxxx_MRy.xml # XML metadata file with acquisition details and anatomical measures
16
+ ├── OAS1_xxxx_MRy.txt # text file with the same content as the XML (easier to view)
17
+ ├── RAW/ # raw scan images (DICOM or Analyze format)
18
+ ├── PROCESSED/ # preprocessed images
19
+ │ ├── SUBJ_111/ # averaged, co-registered images in native space (isotropic 1 mm³)
20
+ │ └── T88_111/ # images in atlas-registered space
21
+ │ ├── t4_files/ # registration transform matrix files
22
+ │ └── ... # registered image files
23
+ └── FSL_SEG/ # brain tissue segmentation derived from the atlas-registered image (gray matter 2 / white matter 3 / CSF 1)
24
+
25
+
26
+ All images are stored in Analyze 7.5 format, consisting of:
27
+ an image file (.img)
28
+ a header file (.hdr)
29
+ stored as 16-bit big-endian
30
+
31
+ OAS1_xxxx_MRy_mpr-z_anon single raw scan 256x256x128 1x1x1.25 mm sagittal
32
+ OAS1_xxxx_MRy_mpr_ni_anon_sbj_111 average of multiple co-registered scans 256x256x160 1x1x1 mm sagittal
33
+ OAS1_xxxx_MRy_mpr_ni_anon_111_t88_gfc gain-field-corrected atlas-registered image 176x208x176 1x1x1 mm axial
34
+ OAS1_xxxx_MRy_mpr_ni_anon_111_t88_masked_gfc masked image with non-brain tissue removed 176x208x176 1x1x1 mm axial
35
+ OAS1_xxxx_MRy_mpr_ni_anon_111_t88_masked_gfc_fseg brain tissue segmentation (gray/white/CSF) 176x208x176 1x1x1 mm axial
36
+
37
+ 1. Demographics
38
+ Sex (M/F)
39
+ Handedness (Hand) (all right-handed)
40
+ Age
41
+ Education level (Educ) (levels 1-5)
42
+ Socioeconomic status (SES)
43
+
44
+ 2. Clinical assessments
45
+ MMSE (Mini-Mental State Examination)
46
+ CDR (Clinical Dementia Rating: 0=normal, 0.5=very mild, 1=mild, 2=moderate)
47
+
48
+ 3. Derived anatomical measures
49
+ eTIV: estimated total intracranial volume
50
+ ASF: atlas scaling factor
51
+ nWBV: normalized whole-brain volume
52
+
53
+
54
+ The FreeSurfer-processed version of the OASIS Cross-Sectional dataset is usually called the OASIS Cross-Sectional FreeSurfer Processed dataset.
55
+ After FreeSurfer processing, each subject's data is stored in its own directory following FreeSurfer's standard output layout.
56
+ ├── sub-OASIS10001/ # FreeSurfer output directory for subject 1
57
+ │ ├── mri/ # volume-based data
58
+ │ │ ├── orig.mgz # original image (converted to FreeSurfer format)
59
+ │ │ ├── nu.mgz # intensity-normalized image
60
+ │ │ ├── T1.mgz # image used for segmentation
61
+ │ │ ├── aseg.mgz # automatic subcortical segmentation
62
+ │ │ ├── aparc+aseg.mgz # combined cortical + subcortical segmentation
63
+ │ │ ├── brain.mgz # image with non-brain tissue removed
64
+ │ │ ├── brainmask.mgz # brain mask
65
+ │ │ └── ... (other files)
66
+ │ ├── surf/ # surface-based data
67
+ │ │ ├── lh.pial # left-hemisphere pial surface
68
+ │ │ ├── lh.white # left-hemisphere white-matter surface
69
+ │ │ ├── rh.pial # right-hemisphere pial surface
70
+ │ │ ├── rh.white # right-hemisphere white-matter surface
71
+ │ │ ├── lh.thickness # left-hemisphere cortical thickness map
72
+ │ │ └── ... (other files)
73
+ │ ├── stats/ # statistics (text files)
74
+ │ │ ├── aseg.stats # subcortical structure volume statistics
75
+ │ │ ├── lh.aparc.stats # left-hemisphere cortical region thickness/area statistics
76
+ │ │ └── rh.aparc.stats # right-hemisphere cortical region thickness/area statistics
77
+ │ └── label/ # label files
78
+ │ └── ...
79
+ '''
80
+ import os
81
+ import glob
82
+ import pandas as pd
83
+ import SimpleITK as sitk
84
+ import argparse
85
+ import json
86
+ from tqdm import tqdm
87
+ from util import meta_data
88
+ import util
89
+ import numpy as np
90
+ # from bert_helper import *
91
+
92
+ import shutil
93
+
94
+ import warnings
95
+ warnings.filterwarnings("ignore")
96
+ meta_id_name='ID'
97
+ ## Sex (M/F), Handedness (Hand) (all right-handed), Age, Education level (Educ, levels 1-5), Socioeconomic status (SES), MMSE (Mini-Mental State Examination), CDR (Clinical Dementia Rating: 0=normal, 0.5=very mild, 1=mild, 2=moderate), eTIV: estimated total intracranial volume, ASF: atlas scaling factor, nWBV: normalized whole-brain volume
98
+ META_COLUMN=['ID', 'M/F', 'Hand', 'Age', 'Educ', 'SES', 'MMSE', 'CDR', 'eTIV','nWBV', 'ASF', 'Delay']
99
+
100
+ TASK_VALUE="segmentation"
101
+ CLAMP_RANGE_CT = [-300,300]
102
+ CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
103
+ TARGET_VOXEL_SPACING=None
104
+
105
+ ## sub-modality descriptions, following the MSD convention
106
+ SUB_MODALITY=["FLAIR","T1w","t1gd","T2w"]
107
+ ## sort order corresponding to the file names
108
+ SERIES_ORDER=["flair","t1","t1ce","t2"]
109
+
110
+ LABEL_DICT={
111
+ "0":"background",
112
+ "1":"cerebrospinal fluid",#CSF
113
+ "2":"gray matter",#GM
114
+ "3":"white matter"#WM
115
+ }
116
+ # def find_metadata_files(path):
117
+ # # for Cancer Image Archive (TCIA) dataset
118
+ # search_pattern = os.path.join(path, '**', 'metadata.csv')
119
+ # return glob.glob(search_pattern, recursive=True)
120
+
121
+ def find_metadata_files(path):
122
+ # for Cancer Image Archive (TCIA) dataset
123
+ search_pattern = os.path.join(path, '*.csv')
124
+ return glob.glob(search_pattern, recursive=True)
125
+ ##added by yanguoqing on 20250527
126
+ def find_image_dirs(path):
127
+ return os.listdir(path)
128
+
129
+ ##modify by yanguoqing on 20250527
130
+ def load_dicom_images(folder_path):
131
+ reader = sitk.ImageSeriesReader()
132
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
133
+ reader.SetFileNames(dicom_names)
134
+ image = reader.Execute()
135
+ return dicom_names,image
136
+
137
+ ##added by yanguoqing on 20250527
138
+ def load_dicom_tag(imgs):
139
+ reader = sitk.ImageFileReader()
140
+ # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
141
+ reader.SetFileName(imgs)
142
+ reader.ReadImageInformation() # read only the header metadata here (note: the Execute() call below does load the pixel data)
143
+ # metadata_keys = reader.GetMetaDataKeys()
144
+ tag=reader.Execute()
145
+ return tag
146
+
147
+ def load_nrrd(fp):
148
+ return sitk.ReadImage(fp)
149
+
150
+ ##modify by yanguoqing on 20250805
151
+ def load_brtas_images(series_files):
152
+ '''
153
+ Each case contains 3D MRI scans of four different sequences (all preprocessed: registered, resampled to 1 mm³ isotropic, skull-stripped).
154
+ The separate modalities are merged along a 4th array dimension, ordered FLAIR, T1, T1CE, T2.
155
+ '''
156
+ reader = sitk.ImageSeriesReader()
157
+ reader.SetFileNames(series_files)
158
+ image = reader.Execute()
159
+ return image
160
+
161
+ def save_nifti(image, output_path, folder_path):
162
+ # Set metadata in the NIfTI file's header
163
+ output_dirpath = os.path.dirname(output_path)
164
+ if not os.path.exists(output_dirpath):
165
+ print(f"Creating directory {output_dirpath}")
166
+ os.makedirs(output_dirpath)
167
+ # Set metadata in the NIfTI file's header
168
+ image.SetMetaData("FolderPath", folder_path)
169
+ sitk.WriteImage(image, output_path)
170
+
171
+ ##modify by yanguoqing on 20250527
172
+ def convert_windows_to_linux_path(windows_path):
173
+ # Replace backslashes with forward slashes and remove the drive letter
174
+ # Some meta files have windows paths, but the data is stored on a linux server
175
+ linux_path = windows_path.replace('\\', '/')
176
+ if ':' in linux_path:
177
+ linux_path = linux_path.split(':', 1)[1]
178
+ return linux_path
179
+
180
+ def main(target_path, output_dir):
181
+
182
+ pid_dirs=find_image_dirs(target_path)
183
+ failed_files = []
184
+ if not os.path.isdir(output_dir):
185
+ os.makedirs(output_dir)
186
+ json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
187
+ failed_files_path = os.path.join(output_dir, 'failed_files.json')
188
+ meta = meta_data()
189
+
190
+ # Initialize the JSON file
191
+ if not os.path.exists(json_output_path):
192
+ with open(json_output_path, 'w') as json_file:
193
+ json.dump({}, json_file)
194
+ ## the metadata sheet was converted to CSV to make parsing easier
195
+ meta_file=os.path.join(os.path.dirname(os.path.realpath(__file__)),'oasis_cross-sectional-5708aa0a98d82080.csv')
196
+ meta_file_ori=os.path.join(target_path,'oasis_cross-sectional-5708aa0a98d82080.xlsx')
197
+ if os.path.isfile(meta_file):
198
+ mf_flag=True
199
+ df_meta=pd.read_csv(meta_file,sep=',')
200
+ else:
201
+ mf_flag=False
202
+
203
+
204
+ if pid_dirs:
205
+ for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
206
+ if not os.path.isdir(os.path.join(target_path,pid_dir)):
207
+ continue
208
+
209
+ ## iterate over the case data under every directory
210
+ image_dirs=find_image_dirs(os.path.join(target_path,pid_dir))
211
+
212
+ for data_dir in tqdm(image_dirs, desc="Processing images files"):
213
+ ## data_dir is the case ID
214
+ full_path=os.path.join(target_path,pid_dir,data_dir)
215
+
216
+ modality="MRI"
217
+ study='OASIS_1'##Dataset_name
218
+ CIA_other_info = {'metadata_file':''}
219
+ CIA_other_info['split'] = "train"
220
+ CIA_other_info['metadata_file']=meta_file_ori
221
+ data_info_row=df_meta[df_meta[meta_id_name]==data_dir]
222
+
223
+ if data_info_row.shape[0]>0:
224
+ data_info_row=data_info_row.reset_index()
225
+ #print(data_info_row[meta_id_name])
226
+ for keyname in META_COLUMN[1:]:
227
+ CIA_other_info[keyname]=str(data_info_row[keyname][0])
228
+
229
+ CIA_other_info['Image_id']=data_dir
230
+
231
+
232
+ else:
233
+ meta_image_id=data_dir
234
+ for keyname in META_COLUMN[1:]:
235
+ CIA_other_info[keyname]=''
236
+
237
+
238
+
239
+ try:
240
+ ## load the fully registered image (without the brain mask applied)
241
+ #\PROCESSED\MPRAGE\T88_111\OAS1_0001_MR1_mpr_n4_anon_111_t88_masked_gfc.img
242
+ ##OAS1_0001_MR1_mpr_n4_anon_111_t88_gfc
243
+ full_file=glob.glob("%s/PROCESSED/MPRAGE/T88_111/%s_*_anon_111_t88_gfc.img"%(full_path,data_dir))[0]
244
+ # full_file=os.path.join(full_path,"PROCESSED/MPRAGE/T88_111","%s_mpr_n4_anon_111_t88_masked_gfc.img"%data_dir)
245
+
246
+ if os.path.isfile(full_file):
247
+ ## valid MRI image data exists; proceed with processing
248
+ sitk_img_original=util.load_nifti(full_file)
249
+ else:
250
+ print("Case data %s is empty" % data_dir)
251
+ continue
252
+
253
+
254
+ original_spacing = list(sitk_img_original.GetSpacing())
255
+ original_size = list(sitk_img_original.GetSize())
256
+
257
+
258
+
259
+ meta.add_keyvalue('Spacing_mm',min(original_spacing))
260
+ meta.add_keyvalue('OriImg_path',full_file)
261
+ meta.add_keyvalue('Size',original_size) # no resampling is done here, so the original size is stored
262
+ meta.add_keyvalue('Modality',modality)
263
+ meta.add_keyvalue('Dataset_name',study)
264
+ meta.add_keyvalue('ROI','head')
265
+
266
+ meta.add_keyvalue('Label_Dict',LABEL_DICT)
267
+
268
+ output_image_file = os.path.join(output_dir,data_dir, f"{data_dir}.nii.gz")
269
+ # output_path=convert_windows_to_linux_path(output_path)
270
+ ##
271
+ save_nifti(sitk_img_original, output_image_file, full_path)
272
+ print(f"Saved NIfTI file to {output_image_file}")
273
+ ##Label processing
274
+
275
+ label_path_dict={}
276
+ #OAS1_0001_MR1_mpr_n4_anon_111_t88_masked_gfc_fseg.img
277
+ full_label_file=glob.glob("%s/FSL_SEG/%s_*_anon_111_t88_masked_gfc_fseg.img"%(full_path,data_dir))[0]
278
+
279
+
280
+ process_label_path=os.path.join(output_dir,data_dir,'segmentation')
281
+
282
+ processed_lbl_full_path=os.path.join(process_label_path, f"{data_dir}.nii.gz")
283
+
284
+ if not os.path.isdir(process_label_path):
285
+ os.makedirs(process_label_path,exist_ok=True)
286
+
287
+ if not os.path.isfile(full_label_file):
288
+ label_flag=False
289
+ else:
290
+ sitk_lbl_original = util.load_nifti(full_label_file)
291
+ util.save_nifti(sitk_lbl_original, processed_lbl_full_path, os.path.dirname(full_label_file)) # Save original
292
+ print(f"Saved segmentation NIfTI file to {processed_lbl_full_path}")
293
+
294
+ label_path_dict['head'] = processed_lbl_full_path
295
+ label_flag=True
296
+
297
+ if label_flag:
298
+ meta.add_keyvalue('Task',TASK_VALUE)
299
+ meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
300
+
301
+
302
+
303
+ # try:
304
+ # assert sitk_img_processed.GetSize() == sitk_lbl_processed.GetSize()
305
+ # except Exception as e:
306
+ # failed_files.append(full_path_label)
307
+ # continue
308
+ print(sitk_img_original.GetSize(),sitk_lbl_original.GetSize())
309
+
310
+ except Exception as e:
311
+ print(e)
312
+ failed_files.append(data_dir)
313
+ print(f"Failed to load OASIS images from {data_dir}")
314
+ continue
315
+
316
+
317
+
318
+ meta.add_extra_keyvalue('Metadata',CIA_other_info)
319
+
320
+
321
+ # Write the mapping to the JSON file on the fly
322
+ with open(json_output_path, 'r+') as json_file:
323
+ existing_mappings = json.load(json_file)
324
+ existing_mappings[output_image_file] = meta.get_meta_data()
325
+ json_file.seek(0)
326
+ # print(existing_mappings)
327
+ json.dump(existing_mappings, json_file, indent=4)
328
+ json_file.truncate()
329
+ # else:
330
+ # print("No metadata.csv files found.")
331
+
332
+ with open(failed_files_path, "w") as json_file:
333
+ json.dump(failed_files, json_file)
334
+
335
+ print(f"The list has been written to {failed_files_path}")
336
+ print(f"Saved NIfTI mappings to {json_output_path}")
337
+
338
+ if __name__ == "__main__":
339
+ parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
340
+ parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/OASIS/OASIS_1/oasis_cs_sectional/")
341
+ parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/OASIS/OASIS_1/CS_SECTIONAL_UNMASK")
342
+ args = parser.parse_args()
343
+ print(args.target_path, args.output_dir)
344
+ main(args.target_path, args.output_dir)
345
+
346
+
347
+
348
+
349
+
350
+
351
+
352
+
353
+
354
+
355
+
356
+
357
+
358
+
359
+
OAISIS_clean/dataclean_OASIS_1_CS_Sectional_raw.py ADDED
@@ -0,0 +1,280 @@
1
+ #coding:utf-8
2
+ '''
3
+ written by ygq
4
+ created on 2025-09-04
5
+
6
+ OASIS (Open Access Series of Imaging Studies) is a project that makes brain MRI data freely available to the research community. This Cross-Sectional dataset is its first release, published in 2007.
7
+ OASIS-1 is cross-sectional, meaning it cannot capture how an individual changes over time. For studying disease progression, the later OASIS-2 and OASIS-3 (longitudinal datasets) are better choices.
8
+
9
+ 1. Directory and file naming conventions
10
+ Folders are created under the root directory per subject session ID.
11
+ Subject ID format: OAS1_xxxx (e.g. OAS1_0012)
12
+ Session ID format: OAS1_xxxx_MRy (e.g. OAS1_0012_MR1, where y is the imaging visit number)
13
+ OAS1_xxxx_MRy/
14
+
15
+ ├── OAS1_xxxx_MRy.xml # XML metadata file with acquisition details and anatomical measures
16
+ ├── OAS1_xxxx_MRy.txt # text file with the same content as the XML (easier to view)
17
+ ├── RAW/ # raw scan images (DICOM or Analyze format)
18
+ ├── PROCESSED/ # preprocessed images
19
+ │ ├── SUBJ_111/ # averaged, co-registered images in native space (isotropic 1 mm³)
20
+ │ └── T88_111/ # images in atlas-registered space
21
+ │ ├── t4_files/ # registration transform matrix files
22
+ │ └── ... # registered image files
23
+ └── FSL_SEG/ # brain tissue segmentation derived from the atlas-registered image (gray matter 2 / white matter 3 / CSF 1)
24
+
25
+
26
+ All images are stored in Analyze 7.5 format, consisting of:
27
+ an image file (.img)
28
+ a header file (.hdr)
29
+ stored as 16-bit big-endian
30
+
31
+ OAS1_xxxx_MRy_mpr-z_anon single raw scan 256x256x128 1x1x1.25 mm sagittal
32
+ OAS1_xxxx_MRy_mpr_ni_anon_sbj_111 average of multiple co-registered scans 256x256x160 1x1x1 mm sagittal
33
+ OAS1_xxxx_MRy_mpr_ni_anon_111_t88_gfc gain-field-corrected atlas-registered image 176x208x176 1x1x1 mm axial
34
+ OAS1_xxxx_MRy_mpr_ni_anon_111_t88_masked_gfc masked image with non-brain tissue removed 176x208x176 1x1x1 mm axial
35
+ OAS1_xxxx_MRy_mpr_ni_anon_111_t88_masked_gfc_fseg brain tissue segmentation (gray/white/CSF) 176x208x176 1x1x1 mm axial
36
+
37
+ 1. Demographics
38
+ Sex (M/F)
39
+ Handedness (Hand) (all right-handed)
40
+ Age
41
+ Education level (Educ) (levels 1-5)
42
+ Socioeconomic status (SES)
43
+
44
+ 2. Clinical assessments
45
+ MMSE (Mini-Mental State Examination)
46
+ CDR (Clinical Dementia Rating: 0=normal, 0.5=very mild, 1=mild, 2=moderate)
47
+
48
+ 3. Derived anatomical measures
49
+ eTIV: estimated total intracranial volume
50
+ ASF: atlas scaling factor
51
+ nWBV: normalized whole-brain volume
52
+ '''
53
+ import os
54
+ import glob,re
55
+ import pandas as pd
56
+ import SimpleITK as sitk
57
+ import argparse
58
+ import json
59
+ from tqdm import tqdm
60
+ from util import meta_data
61
+ import util
62
+ import numpy as np
63
+ # from bert_helper import *
64
+
65
+ import shutil
66
+ ##dataset_meta
67
+ import warnings
68
+ warnings.filterwarnings("ignore")
69
+ meta_id_name='ID'
70
+ ## Sex (M/F), Handedness (Hand) (all right-handed), Age, Education level (Educ, levels 1-5), Socioeconomic status (SES), MMSE (Mini-Mental State Examination), CDR (Clinical Dementia Rating: 0=normal, 0.5=very mild, 1=mild, 2=moderate), eTIV: estimated total intracranial volume, ASF: atlas scaling factor, nWBV: normalized whole-brain volume
71
+ META_COLUMN=['ID', 'M/F', 'Hand', 'Age', 'Educ', 'SES', 'MMSE', 'CDR', 'eTIV','nWBV', 'ASF', 'Delay']
72
+
73
+
74
+ TASK_VALUE="segmentation"
75
+ CLAMP_RANGE_CT = [-300,300]
76
+ CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
77
+ TARGET_VOXEL_SPACING=None
78
+
79
+
80
+ # def find_metadata_files(path):
81
+ # # for Cancer Image Archive (TCIA) dataset
82
+ # search_pattern = os.path.join(path, '**', 'metadata.csv')
83
+ # return glob.glob(search_pattern, recursive=True)
84
+
85
+ def find_metadata_files(path):
86
+ # for Cancer Image Archive (TCIA) dataset
87
+ search_pattern = os.path.join(path, '*.csv')
88
+ return glob.glob(search_pattern, recursive=True)
89
+ ##added by yanguoqing on 20250527
90
+ def find_image_dirs(path):
91
+ return os.listdir(path)
92
+
93
+ ##modify by yanguoqing on 20250527
94
+ def load_dicom_images(folder_path):
95
+ reader = sitk.ImageSeriesReader()
96
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
97
+ reader.SetFileNames(dicom_names)
98
+ image = reader.Execute()
99
+ return dicom_names,image
100
+
101
+ ##added by yanguoqing on 20250527
102
+ def load_dicom_tag(imgs):
103
+ reader = sitk.ImageFileReader()
104
+ # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
105
+ reader.SetFileName(imgs)
106
+ reader.ReadImageInformation() # read only the header metadata here (note: the Execute() call below does load the pixel data)
107
+ # metadata_keys = reader.GetMetaDataKeys()
108
+ tag=reader.Execute()
109
+ return tag
110
+
111
+ def load_nrrd(fp):
112
+ return sitk.ReadImage(fp)
113
+
114
+ ##modify by yanguoqing on 20250904
115
+ def load_raw_images(series_files):
116
+ '''
117
+ Each case contains 3 to 4 raw single-acquisition MR scans.
118
+ The separate scans are merged along a 4th array dimension, ordered MPR-1, MPR-2, ...
119
+ '''
120
+ reader = sitk.ImageSeriesReader()
121
+ reader.SetFileNames(series_files)
122
+ image = reader.Execute()
123
+ return image
124
+
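# A minimal usage sketch: reading N matching 3D .img files through
# ImageSeriesReader yields a 4D volume whose last axis indexes the individual
# scans, e.g. for a hypothetical case:
#
#     series_files = sorted(glob.glob('OAS1_0001_MR1/RAW/OAS1_0001_MR1_mpr-*.img'))
#     img = load_raw_images(series_files)
#     print(img.GetDimension(), img.GetSize())  # 4, e.g. (256, 256, 128, 4)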
125
+ def save_nifti(image, output_path, folder_path):
126
+ # Set metadata in the NIfTI file's header
127
+ output_dirpath = os.path.dirname(output_path)
128
+ if not os.path.exists(output_dirpath):
129
+ print(f"Creating directory {output_dirpath}")
130
+ os.makedirs(output_dirpath)
131
+ # Set metadata in the NIfTI file's header
132
+ image.SetMetaData("FolderPath", folder_path)
133
+ sitk.WriteImage(image, output_path)
134
+
135
+ ##modify by yanguoqing on 20250527
136
+ def convert_windows_to_linux_path(windows_path):
137
+ # Replace backslashes with forward slashes and remove the drive letter
138
+ # Some meta files have windows paths, but the data is stored on a linux server
139
+ linux_path = windows_path.replace('\\', '/')
140
+ if ':' in linux_path:
141
+ linux_path = linux_path.split(':', 1)[1]
142
+ return linux_path
143
+
144
+ def main(target_path, output_dir):
145
+ pid_dirs=find_image_dirs(target_path)
146
+ failed_files = []
147
+ if not os.path.isdir(output_dir):
148
+ os.makedirs(output_dir)
149
+ json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
150
+ failed_files_path = os.path.join(output_dir, 'failed_files.json')
151
+ meta = meta_data()
152
+
153
+ # Initialize the JSON file
154
+ if not os.path.exists(json_output_path):
155
+ with open(json_output_path, 'w') as json_file:
156
+ json.dump({}, json_file)
157
+ ## converted to CSV so the metadata is easier to parse
158
+ meta_file=os.path.join(os.path.dirname(os.path.realpath(__file__)),'oasis_cross-sectional-5708aa0a98d82080.csv')
159
+ meta_file_ori=os.path.join(target_path,'oasis_cross-sectional-5708aa0a98d82080.xlsx')
160
+ if os.path.isfile(meta_file):
161
+ mf_flag=True
162
+ df_meta=pd.read_csv(meta_file,sep=',')
163
+ else:
164
+ mf_flag=False
165
+
166
+
167
+ if pid_dirs:
168
+ for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
169
+ if not os.path.isdir(os.path.join(target_path,pid_dir)):
170
+ continue
171
+
172
+ ## iterate over the case data under every directory
173
+ image_dirs=find_image_dirs(os.path.join(target_path,pid_dir))
174
+
175
+ for data_dir in tqdm(image_dirs, desc="Processing images files"):
176
+ ## data_dir is the case ID
177
+ full_path=os.path.join(target_path,pid_dir,data_dir)
178
+
179
+ modality="MRI"
180
+ study='OASIS_1'##Dataset_name
181
+ CIA_other_info = {'metadata_file':''}
182
+ CIA_other_info['split'] = "train"
183
+ CIA_other_info['metadata_file']=meta_file_ori
184
+ data_info_row=df_meta[df_meta[meta_id_name]==data_dir]
185
+
186
+ if data_info_row.shape[0]>0:
187
+ data_info_row=data_info_row.reset_index()
188
+ #print(data_info_row[meta_id_name])
189
+ for keyname in META_COLUMN[1:]:
190
+ CIA_other_info[keyname]=str(data_info_row[keyname][0])
191
+
192
+ CIA_other_info['Image_id']=data_dir
193
+
194
+
195
+ else:
196
+ meta_image_id=data_dir
197
+ for keyname in META_COLUMN[1:]:
198
+ CIA_other_info[keyname]=''
199
+
200
+
201
+
202
+ try:
203
+ ## read the multiple single-scan .img files under the original RAW directory
204
+ #\RAW\OAS1_0001_MR1_mpr-1_anon.img
205
+ series_files=glob.glob("%s/RAW/%s_mpr-*.img"%(full_path,data_dir))
206
+ series_files.sort()
207
+
208
+ if len(series_files)>0:
209
+ ## valid MRI image data found; continue processing
210
+ sitk_img_original=load_raw_images(series_files)
211
+ submodality=[re.search(r"mpr-\d{1}",os.path.basename(fp)).group(0) for fp in series_files]
212
+ sub_modality_dict={}
213
+ for idx,value in enumerate(submodality):
214
+ sub_modality_dict[idx]=value
215
+
216
+ meta.add_keyvalue('Sub_modality',sub_modality_dict)
217
+
218
+ else:
219
+ print("病例数据%s为空"%data_dir)
220
+ continue
221
+
222
+
223
+ original_spacing = list(sitk_img_original.GetSpacing())
224
+ original_size = list(sitk_img_original.GetSize())
225
+
226
+
227
+
228
+ meta.add_keyvalue('Spacing_mm',min(original_spacing))
229
+ meta.add_keyvalue('OriImg_path',",".join(series_files))
230
+ meta.add_keyvalue('Size',original_size) # use the processed size here -- YH Jachin
231
+ meta.add_keyvalue('Modality',modality)
232
+ meta.add_keyvalue('Dataset_name',study)
233
+ meta.add_keyvalue('ROI','head')
234
+
235
+
236
+
237
+ output_image_file = os.path.join(output_dir,data_dir, f"{data_dir}.nii.gz")
238
+ # output_path=convert_windows_to_linux_path(output_path)
239
+ ##
240
+ save_nifti(sitk_img_original, output_image_file, full_path)
241
+ print(f"Saved NIfTI file to {output_image_file}")
242
+ ##Label processing
243
+
244
+
245
+
246
+ except Exception as e:
247
+ print(e)
248
+ failed_files.append(data_dir)
249
+ print(f"Failed to load BRATS images from {data_dir}")
250
+ continue
251
+
252
+
253
+
254
+ meta.add_extra_keyvalue('Metadata',CIA_other_info)
255
+
256
+
257
+ # Write the mapping to the JSON file on the fly
258
+ with open(json_output_path, 'r+') as json_file:
259
+ existing_mappings = json.load(json_file)
260
+ existing_mappings[output_image_file] = meta.get_meta_data()
261
+ json_file.seek(0)
262
+ # print(existing_mappings)
263
+ json.dump(existing_mappings, json_file, indent=4)
264
+ json_file.truncate()
265
+ # else:
266
+ # print("No metadata.csv files found.")
267
+
268
+ with open(failed_files_path, "w") as json_file:
269
+ json.dump(failed_files, json_file)
270
+
271
+ print(f"The list has been written to {failed_files_path}")
272
+ print(f"Saved NIfTI mappings to {json_output_path}")
273
+
274
+ if __name__ == "__main__":
275
+ parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
276
+ parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/OASIS/OASIS_1/oasis_cs_sectional/")
277
+ parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/OASIS/OASIS_1/CS_SECTIONAL_RAW")
278
+ args = parser.parse_args()
279
+ print(args.target_path, args.output_dir)
280
+ main(args.target_path, args.output_dir)
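
The script above stacks each subject's 3-4 single-acquisition `mpr-*.img` scans into one 4D NIfTI via SimpleITK's series reader. A minimal, self-contained sketch of that load-and-stack path, assuming a hypothetical case directory laid out like the glob pattern used above:

```python
import glob
import SimpleITK as sitk

# Hypothetical case directory; real data follows <target_path>/<disc>/<case>/RAW/.
case_dir = "/data/OASIS_1/disc1/OAS1_0001_MR1"
case_id = "OAS1_0001_MR1"

# Each acquisition is an Analyze .img/.hdr pair; sorting keeps the 4th
# dimension ordered mpr-1, mpr-2, ...
series_files = sorted(glob.glob(f"{case_dir}/RAW/{case_id}_mpr-*.img"))

reader = sitk.ImageSeriesReader()
reader.SetFileNames(series_files)   # N 3D volumes ...
image = reader.Execute()            # ... stacked into one 4D image

sitk.WriteImage(image, f"{case_id}.nii.gz")
print(image.GetSize(), image.GetSpacing())
```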
OAISIS_clean/dataclean_OASIS_2_Longitudinal_raw.py ADDED
@@ -0,0 +1,283 @@
1
+ #coding:utf-8
2
+ '''
3
+ written by ygq
4
+ created on 2025-09-04
5
+
6
+ OASIS (Open Access Series of Imaging Studies) is a project that provides brain MRI data to the research community free of charge. Its Cross-Sectional dataset was the first release, published in 2007.
7
+ OASIS-1 is cross-sectional, meaning it cannot capture how an individual changes over time. For studying disease progression, the later longitudinal datasets OASIS-2 and OASIS-3 are better choices.
8
+
9
+ OASIS-2, in full "Longitudinal Multimodal Neuroimaging: Principal 150 Subjects", is the second core dataset released by the OASIS project. As the name suggests, its defining characteristic is that it is longitudinal.
10
+
11
+ Core goal:
12
+ Study how brain structure changes over time in normal aging and in Alzheimer's disease (AD).
13
+ Study design:
14
+ Longitudinal study. The same cohort of subjects was scanned and assessed repeatedly over several years.
15
+ Sample size:
16
+ 150 subjects aged 60 to 96.
17
+ Cohort composition:
18
+ The cohort includes subjects who were cognitively normal (CDR = 0) at their first scan as well as subjects already diagnosed as demented at their initial visit.
19
+ During the study, some subjects remained cognitively normal while others progressed to dementia (clinically diagnosed as probable Alzheimer's disease).
20
+ Data acquisition:
21
+ Each subject completed at least 2 visit sessions, and up to 5.
22
+ The average interval between visits was about 2.2 years, with the longest study span exceeding 7 years.
23
+ Each visit included 3-4 T1-weighted MRI scans (acquired in a single session, for averaging to improve SNR) and a detailed clinical neuropsychological assessment.
24
+ Data contents:
25
+ Similar to OASIS-1: raw DICOM images, preprocessed Analyze-format images, and comprehensive clinical and cognitive assessment data.
26
+
27
+
28
+ Detailed explanation of the key differences
29
+ Cross-Sectional vs. Longitudinal:
30
+ OASIS-1 is like photographing everyone in a city on the same day. You can compare young and old, healthy and ill, but you cannot see how any individual ages or becomes ill.
31
+ OASIS-2 is like photographing the same 150 elderly subjects every year for several years, so you can watch some of them change gradually and eventually fall ill. That is essential for understanding the disease process.
32
+ Difference in subject populations:
33
+ OASIS-1 includes patients already diagnosed with AD, making it well suited for training a model to learn what an "AD brain" looks like.
34
+ OASIS-2 tracks its subjects over time, which makes it a valuable resource for studying the prodromal phase of the disease (before clinical symptoms appear): you can examine whether the brains of those who eventually converted already showed subtle, detectable differences years earlier.
35
+ Difference in analysis methods:
36
+ OASIS-1 is typically analyzed with cross-sectional, between-subject comparisons, e.g. comparing mean hippocampal volume between the AD group and normal controls.
37
+ OASIS-2 analyses focus on within-subject change over time, e.g. computing an annualized brain-atrophy rate per subject and comparing atrophy rates between the stable-normal group and the converter group. This requires more sophisticated longitudinal statistical models.
38
+
39
+
40
+ 1. Demographics
41
+ Sex (M/F)
42
+ Handedness (Hand) (all right-handed)
43
+ Age
44
+ Education level (Educ) (levels 1-5)
45
+ Socioeconomic status (SES)
46
+
47
+ 2. Clinical assessments
48
+ MMSE (Mini-Mental State Examination)
49
+ CDR (Clinical Dementia Rating: 0=normal, 0.5=very mild, 1=mild, 2=moderate)
50
+
51
+ 3. Derived anatomical measures
52
+ eTIV: estimated total intracranial volume
53
+ ASF: atlas scaling factor
54
+ nWBV: normalized whole-brain volume
55
+ '''
56
+ import os
57
+ import glob,re
58
+ import pandas as pd
59
+ import SimpleITK as sitk
60
+ import argparse
61
+ import json
62
+ from tqdm import tqdm
63
+ from util import meta_data
64
+ import util
65
+ import numpy as np
66
+ # from bert_helper import *
67
+
68
+ import shutil
69
+ ##dataset_meta
70
+ import warnings
71
+ warnings.filterwarnings("ignore")
72
+ meta_id_name='MRI ID'
73
+ ## Sex (M/F), Handedness (Hand) (all right-handed), Age, Education level (Educ, levels 1-5), Socioeconomic status (SES), MMSE (Mini-Mental State Examination), CDR (Clinical Dementia Rating: 0=normal, 0.5=very mild, 1=mild, 2=moderate), eTIV: estimated total intracranial volume, ASF: atlas scaling factor, nWBV: normalized whole-brain volume
74
+ #META_COLUMN=['ID', 'M/F', 'Hand', 'Age', 'Educ', 'SES', 'MMSE', 'CDR', 'eTIV','nWBV', 'ASF', 'Delay']
75
+ META_COLUMN=['Subject ID', 'Group', 'Visit', 'MR Delay', 'M/F', 'Hand','Age', 'EDUC', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF']
76
+
77
+ TASK_VALUE="segmentation"
78
+ CLAMP_RANGE_CT = [-300,300]
79
+ CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
80
+ TARGET_VOXEL_SPACING=None
81
+
82
+
83
+ # def find_metadata_files(path):
84
+ # # for Cancer Image Archive (TCIA) dataset
85
+ # search_pattern = os.path.join(path, '**', 'metadata.csv')
86
+ # return glob.glob(search_pattern, recursive=True)
87
+
88
+ def find_metadata_files(path):
89
+ # for Cancer Image Archive (TCIA) dataset
90
+ search_pattern = os.path.join(path, '*.csv')
91
+ return glob.glob(search_pattern, recursive=True)
92
+ ##added by yanguoqing on 20250527
93
+ def find_image_dirs(path):
94
+ return os.listdir(path)
95
+
96
+ ##modify by yanguoqing on 20250527
97
+ def load_dicom_images(folder_path):
98
+ reader = sitk.ImageSeriesReader()
99
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
100
+ reader.SetFileNames(dicom_names)
101
+ image = reader.Execute()
102
+ return dicom_names,image
103
+
104
+ ##added by yanguoqing on 20250527
105
+ def load_dicom_tag(imgs):
106
+ reader = sitk.ImageFileReader()
107
+ # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
108
+ reader.SetFileName(imgs)
109
+ reader.ReadImageInformation() # read the metadata only, without loading pixel data
110
+ # metadata_keys = reader.GetMetaDataKeys()
111
+ tag=reader.Execute()
112
+ return tag
113
+
114
+ def load_nrrd(fp):
115
+ return sitk.ReadImage(fp)
116
+
117
+ ##modify by yanguoqing on 20250904
118
+ def load_raw_images(series_files):
119
+ '''
120
+ Each case contains 3 to 4 single non-contrast RAW MR scans.
120
+ Merge the separate acquisitions into a fourth array dimension, stored in MPR-1, MPR-2, ... order.
122
+ '''
123
+ reader = sitk.ImageSeriesReader()
124
+ reader.SetFileNames(series_files)
125
+ image = reader.Execute()
126
+ return image
127
+
128
+ def save_nifti(image, output_path, folder_path):
129
+ # Create the output directory if it does not exist
130
+ output_dirpath = os.path.dirname(output_path)
131
+ if not os.path.exists(output_dirpath):
132
+ print(f"Creating directory {output_dirpath}")
133
+ os.makedirs(output_dirpath)
134
+ # Set metadata in the NIfTI file's header
135
+ image.SetMetaData("FolderPath", folder_path)
136
+ sitk.WriteImage(image, output_path)
137
+
138
+ ##modify by yanguoqing on 20250527
139
+ def convert_windows_to_linux_path(windows_path):
140
+ # Replace backslashes with forward slashes and remove the drive letter
141
+ # Some meta files have windows paths, but the data is stored on a linux server
142
+ linux_path = windows_path.replace('\\', '/')
143
+ if ':' in linux_path:
144
+ linux_path = linux_path.split(':', 1)[1]
145
+ return linux_path
146
+
147
+ def main(target_path, output_dir):
148
+ pid_dirs=find_image_dirs(target_path)
149
+ failed_files = []
150
+ if not os.path.isdir(output_dir):
151
+ os.makedirs(output_dir)
152
+ json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
153
+ failed_files_path = os.path.join(output_dir, 'failed_files.json')
154
+ meta = meta_data()
155
+
156
+ # Initialize the JSON file
157
+ if not os.path.exists(json_output_path):
158
+ with open(json_output_path, 'w') as json_file:
159
+ json.dump({}, json_file)
160
+ ## converted to CSV so the metadata is easier to parse
161
+ meta_file=os.path.join(os.path.dirname(os.path.realpath(__file__)),'oasis2_longitudinal_demographics.csv')
162
+ meta_file_ori=os.path.join(target_path,'oasis_longitudinal_demographics-8d83e569fa2e2d30 (1).xlsx')
163
+ if os.path.isfile(meta_file):
164
+ mf_flag=True
165
+ df_meta=pd.read_csv(meta_file,sep=',')
166
+ else:
167
+ mf_flag=False
168
+
169
+
170
+ if pid_dirs:
171
+ for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
172
+ if not os.path.isdir(os.path.join(target_path,pid_dir)):
173
+ continue
174
+
175
+ ## iterate over the case data under every directory
176
+ image_dirs=find_image_dirs(os.path.join(target_path,pid_dir))
177
+
178
+ for data_dir in tqdm(image_dirs, desc="Processing images files"):
179
+ ## data_dir is the case ID
180
+ full_path=os.path.join(target_path,pid_dir,data_dir)
181
+
182
+ modality="MRI"
183
+ study='OASIS_2'##Dataset_name
184
+ CIA_other_info = {'metadata_file':''}
185
+ CIA_other_info['split'] = "train"
186
+ CIA_other_info['metadata_file']=meta_file_ori
187
+ data_info_row=df_meta[df_meta[meta_id_name]==data_dir]
188
+
189
+ if data_info_row.shape[0]>0:
190
+ data_info_row=data_info_row.reset_index()
191
+ #print(data_info_row[meta_id_name])
192
+ for keyname in META_COLUMN[:]:
193
+ CIA_other_info[keyname]=str(data_info_row[keyname][0])
194
+
195
+ CIA_other_info['Image_id']=data_dir
196
+
197
+
198
+ else:
199
+ meta_image_id=data_dir
200
+ for keyname in META_COLUMN[1:]:
201
+ CIA_other_info[keyname]=''
202
+
203
+
204
+
205
+ try:
206
+ ## read the multiple single-scan .img files under the original RAW directory
207
+ #\RAW\OAS1_0001_MR1_mpr-1_anon.img
208
+ series_files=glob.glob("%s/RAW/mpr-*.img"%(full_path))
209
+ series_files.sort()
210
+
211
+ if len(series_files)>0:
212
+ ## valid MRI image data found; continue processing
213
+ sitk_img_original=load_raw_images(series_files)
214
+ submodality=[re.search(r"mpr-\d{1}",os.path.basename(fp)).group(0) for fp in series_files]
215
+ sub_modality_dict={}
216
+ for idx,value in enumerate(submodality):
217
+ sub_modality_dict[idx]=value
218
+
219
+ meta.add_keyvalue('Sub_modality',sub_modality_dict)
220
+
221
+ else:
222
+ print("病例数据%s为空"%data_dir)
223
+ continue
224
+
225
+
226
+ original_spacing = list(sitk_img_original.GetSpacing())
227
+ original_size = list(sitk_img_original.GetSize())
228
+
229
+
230
+
231
+ meta.add_keyvalue('Spacing_mm',min(original_spacing))
232
+ meta.add_keyvalue('OriImg_path',",".join(series_files))
233
+ meta.add_keyvalue('Size',original_size) # use the processed size here -- YH Jachin
234
+ meta.add_keyvalue('Modality',modality)
235
+ meta.add_keyvalue('Dataset_name',study)
236
+ meta.add_keyvalue('ROI','head')
237
+
238
+
239
+
240
+ output_image_file = os.path.join(output_dir,data_dir, f"{data_dir}.nii.gz")
241
+ # output_path=convert_windows_to_linux_path(output_path)
242
+ ##
243
+ save_nifti(sitk_img_original, output_image_file, full_path)
244
+ print(f"Saved NIfTI file to {output_image_file}")
245
+ ##Label processing
246
+
247
+
248
+
249
+ except Exception as e:
250
+ print(e)
251
+ failed_files.append(data_dir)
252
+ print(f"Failed to load OASIS images from {data_dir}")
253
+ continue
254
+
255
+
256
+
257
+ meta.add_extra_keyvalue('Metadata',CIA_other_info)
258
+
259
+
260
+ # Write the mapping to the JSON file on the fly
261
+ with open(json_output_path, 'r+') as json_file:
262
+ existing_mappings = json.load(json_file)
263
+ existing_mappings[output_image_file] = meta.get_meta_data()
264
+ json_file.seek(0)
265
+ # print(existing_mappings)
266
+ json.dump(existing_mappings, json_file, indent=4)
267
+ json_file.truncate()
268
+ # else:
269
+ # print("No metadata.csv files found.")
270
+
271
+ with open(failed_files_path, "w") as json_file:
272
+ json.dump(failed_files, json_file)
273
+
274
+ print(f"The list has been written to {failed_files_path}")
275
+ print(f"Saved NIfTI mappings to {json_output_path}")
276
+
277
+ if __name__ == "__main__":
278
+ parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
279
+ parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/OASIS/OASIS_2/OAS2_RAW//")
280
+ parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/OASIS/OASIS_2/RAW")
281
+ args = parser.parse_args()
282
+ print(args.target_path, args.output_dir)
283
+ main(args.target_path, args.output_dir)
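
Both OASIS-2 scripts attach clinical metadata by matching the scan directory name against the `MRI ID` column of the bundled `oasis2_longitudinal_demographics.csv`. A minimal sketch of that lookup, assuming pandas and the META_COLUMN names used above:

```python
import pandas as pd

META_COLUMN = ['Subject ID', 'Group', 'Visit', 'MR Delay', 'M/F', 'Hand',
               'Age', 'EDUC', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF']

df_meta = pd.read_csv('OAISIS_clean/oasis2_longitudinal_demographics.csv')

def lookup(mri_id):
    """Return one scan's demographic row, or empty strings when it is missing."""
    row = df_meta[df_meta['MRI ID'] == mri_id]
    if row.empty:
        return {k: '' for k in META_COLUMN}
    row = row.reset_index()
    return {k: str(row[k][0]) for k in META_COLUMN}

# e.g. {'Subject ID': 'OAS2_0001', 'Group': 'Nondemented', 'Visit': '1', ...}
print(lookup('OAS2_0001_MR1'))
```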
OAISIS_clean/dataclean_OASIS_2_Longitudinal_raw_v2.py ADDED
@@ -0,0 +1,345 @@
1
+ #coding:utf-8
2
+ '''
3
+ written by ygq
4
+ created on 2025-09-04
5
+
6
+ OASIS (Open Access Series of Imaging Studies) is a project that provides brain MRI data to the research community free of charge. Its Cross-Sectional dataset was the first release, published in 2007.
7
+ OASIS-1 is cross-sectional, meaning it cannot capture how an individual changes over time. For studying disease progression, the later longitudinal datasets OASIS-2 and OASIS-3 are better choices.
8
+
9
+ OASIS-2, in full "Longitudinal Multimodal Neuroimaging: Principal 150 Subjects", is the second core dataset released by the OASIS project. As the name suggests, its defining characteristic is that it is longitudinal.
10
+
11
+ Core goal:
12
+ Study how brain structure changes over time in normal aging and in Alzheimer's disease (AD).
13
+ Study design:
14
+ Longitudinal study. The same cohort of subjects was scanned and assessed repeatedly over several years.
15
+ Sample size:
16
+ 150 subjects aged 60 to 96.
17
+ Cohort composition:
18
+ The cohort includes subjects who were cognitively normal (CDR = 0) at their first scan as well as subjects already diagnosed as demented at their initial visit.
19
+ During the study, some subjects remained cognitively normal while others progressed to dementia (clinically diagnosed as probable Alzheimer's disease).
20
+ Data acquisition:
21
+ Each subject completed at least 2 visit sessions, and up to 5.
22
+ The average interval between visits was about 2.2 years, with the longest study span exceeding 7 years.
23
+ Each visit included 3-4 T1-weighted MRI scans (acquired in a single session, for averaging to improve SNR) and a detailed clinical neuropsychological assessment.
24
+ Data contents:
25
+ Similar to OASIS-1: raw DICOM images, preprocessed Analyze-format images, and comprehensive clinical and cognitive assessment data.
26
+
27
+
28
+ Detailed explanation of the key differences
29
+ Cross-Sectional vs. Longitudinal:
30
+ OASIS-1 is like photographing everyone in a city on the same day. You can compare young and old, healthy and ill, but you cannot see how any individual ages or becomes ill.
31
+ OASIS-2 is like photographing the same 150 elderly subjects every year for several years, so you can watch some of them change gradually and eventually fall ill. That is essential for understanding the disease process.
32
+ Difference in subject populations:
33
+ OASIS-1 includes patients already diagnosed with AD, making it well suited for training a model to learn what an "AD brain" looks like.
34
+ OASIS-2 tracks its subjects over time, which makes it a valuable resource for studying the prodromal phase of the disease (before clinical symptoms appear): you can examine whether the brains of those who eventually converted already showed subtle, detectable differences years earlier.
35
+ Difference in analysis methods:
36
+ OASIS-1 is typically analyzed with cross-sectional, between-subject comparisons, e.g. comparing mean hippocampal volume between the AD group and normal controls.
37
+ OASIS-2 analyses focus on within-subject change over time, e.g. computing an annualized brain-atrophy rate per subject and comparing atrophy rates between the stable-normal group and the converter group. This requires more sophisticated longitudinal statistical models.
38
+
39
+
40
+ 1. Demographics
41
+ Sex (M/F)
42
+ Handedness (Hand) (all right-handed)
43
+ Age
44
+ Education level (Educ) (levels 1-5)
45
+ Socioeconomic status (SES)
46
+
47
+ 2. Clinical assessments
48
+ MMSE (Mini-Mental State Examination)
49
+ CDR (Clinical Dementia Rating: 0=normal, 0.5=very mild, 1=mild, 2=moderate)
50
+
51
+ 3. Derived anatomical measures
52
+ eTIV: estimated total intracranial volume
53
+ ASF: atlas scaling factor
54
+ nWBV: normalized whole-brain volume
55
+ '''
56
+ import os
57
+ import glob,re
58
+ import pandas as pd
59
+ import SimpleITK as sitk
60
+ import argparse
61
+ import json
62
+ from tqdm import tqdm
63
+ from util import meta_data
64
+ import util
65
+ import numpy as np
66
+ # from bert_helper import *
67
+
68
+ import shutil
69
+ ##dataset_meta
70
+ import warnings
71
+ warnings.filterwarnings("ignore")
72
+ meta_id_name='MRI ID'
73
+ ## Sex (M/F), Handedness (Hand) (all right-handed), Age, Education level (Educ, levels 1-5), Socioeconomic status (SES), MMSE (Mini-Mental State Examination), CDR (Clinical Dementia Rating: 0=normal, 0.5=very mild, 1=mild, 2=moderate), eTIV: estimated total intracranial volume, ASF: atlas scaling factor, nWBV: normalized whole-brain volume
74
+ #META_COLUMN=['ID', 'M/F', 'Hand', 'Age', 'Educ', 'SES', 'MMSE', 'CDR', 'eTIV','nWBV', 'ASF', 'Delay']
75
+ META_COLUMN=['Subject ID', 'Group', 'Visit', 'MR Delay', 'M/F', 'Hand','Age', 'EDUC', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF']
76
+
77
+ TASK_VALUE="segmentation"
78
+ CLAMP_RANGE_CT = [-300,300]
79
+ CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
80
+ TARGET_VOXEL_SPACING=None
81
+
82
+ # def find_metadata_files(path):
83
+ # # for Cancer Image Archive (TCIA) dataset
84
+ # search_pattern = os.path.join(path, '**', 'metadata.csv')
85
+ # return glob.glob(search_pattern, recursive=True)
86
+
87
+ def find_metadata_files(path):
88
+ # for Cancer Image Archive (TCIA) dataset
89
+ search_pattern = os.path.join(path, '*.csv')
90
+ return glob.glob(search_pattern, recursive=True)
91
+ ##added by yanguoqing on 20250527
92
+ def find_image_dirs(path):
93
+ return os.listdir(path)
94
+
95
+ ##modify by yanguoqing on 20250527
96
+ def load_dicom_images(folder_path):
97
+ reader = sitk.ImageSeriesReader()
98
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
99
+ reader.SetFileNames(dicom_names)
100
+ image = reader.Execute()
101
+ return dicom_names,image
102
+
103
+ ##added by yanguoqing on 20250527
104
+ def load_dicom_tag(imgs):
105
+ reader = sitk.ImageFileReader()
106
+ # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
107
+ reader.SetFileName(imgs)
108
+ reader.ReadImageInformation() # read the metadata only, without loading pixel data
109
+ # metadata_keys = reader.GetMetaDataKeys()
110
+ tag=reader.Execute()
111
+ return tag
112
+
113
+ def load_nrrd(fp):
114
+ return sitk.ReadImage(fp)
115
+
116
+ ##modify by yanguoqing on 20250904
117
+ def load_raw_images(series_files):
118
+ '''
119
+ Each case contains 3 to 4 single non-contrast RAW MR scans.
120
+ Merge the separate acquisitions into a fourth array dimension, stored in MPR-1, MPR-2, ... order.
121
+ '''
122
+ reader = sitk.ImageSeriesReader()
123
+ reader.SetFileNames(series_files)
124
+ image = reader.Execute()
125
+ return image
126
+
127
+ def save_nifti(image, output_path, folder_path):
128
+ # Create the output directory if it does not exist
129
+ output_dirpath = os.path.dirname(output_path)
130
+ if not os.path.exists(output_dirpath):
131
+ print(f"Creating directory {output_dirpath}")
132
+ os.makedirs(output_dirpath)
133
+ # Set metadata in the NIfTI file's header
134
+ image.SetMetaData("FolderPath", folder_path)
135
+ sitk.WriteImage(image, output_path)
136
+
137
+ ##modify by yanguoqing on 20250527
138
+ def convert_windows_to_linux_path(windows_path):
139
+ # Replace backslashes with forward slashes and remove the drive letter
140
+ # Some meta files have windows paths, but the data is stored on a linux server
141
+ linux_path = windows_path.replace('\\', '/')
142
+ if ':' in linux_path:
143
+ linux_path = linux_path.split(':', 1)[1]
144
+ return linux_path
145
+
146
+ def main(target_path, output_dir):
147
+ pid_dirs=find_image_dirs(target_path)
148
+ failed_files = []
149
+ if not os.path.isdir(output_dir):
150
+ os.makedirs(output_dir)
151
+ json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
152
+ failed_files_path = os.path.join(output_dir, 'failed_files.json')
153
+ meta = meta_data()
154
+
155
+ # Initialize the JSON file
156
+ if not os.path.exists(json_output_path):
157
+ with open(json_output_path, 'w') as json_file:
158
+ json.dump({}, json_file)
159
+ ## converted to CSV so the metadata is easier to parse
160
+ meta_file=os.path.join(os.path.dirname(os.path.realpath(__file__)),'oasis2_longitudinal_demographics.csv')
161
+ meta_file_ori=os.path.join(target_path,'oasis_longitudinal_demographics-8d83e569fa2e2d30 (1).xlsx')
162
+ if os.path.isfile(meta_file):
163
+ mf_flag=True
164
+ df_meta=pd.read_csv(meta_file,sep=',')
165
+ else:
166
+ mf_flag=False
167
+
168
+
169
+ if pid_dirs:
170
+ for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
171
+ if not os.path.isdir(os.path.join(target_path,pid_dir)):
172
+ continue
173
+
174
+ ## iterate over the case data under every directory
175
+ image_dirs=find_image_dirs(os.path.join(target_path,pid_dir))
176
+
177
+ for data_dir in tqdm(image_dirs, desc="Processing images files"):
178
+ ## data_dir is the case ID
179
+ full_path=os.path.join(target_path,pid_dir,data_dir)
180
+
181
+ modality="MRI"
182
+ study='OASIS_2'##Dataset_name
183
+ CIA_other_info = {'metadata_file':''}
184
+ CIA_other_info['split'] = "train"
185
+ CIA_other_info['metadata_file']=meta_file_ori
186
+ data_info_row=df_meta[df_meta[meta_id_name]==data_dir]
187
+
188
+ if data_info_row.shape[0]>0:
189
+ data_info_row=data_info_row.reset_index()
190
+ #print(data_info_row[meta_id_name])
191
+ for keyname in META_COLUMN[:]:
192
+ CIA_other_info[keyname]=str(data_info_row[keyname][0])
193
+
194
+ CIA_other_info['Image_id']=data_dir
195
+
196
+
197
+ else:
198
+ meta_image_id=data_dir
199
+ for keyname in META_COLUMN[1:]:
200
+ CIA_other_info[keyname]=''
201
+
202
+
203
+
204
+ try:
205
+ ## read the multiple single-scan .img files under the original RAW directory
206
+ #\RAW\OAS1_0001_MR1_mpr-1_anon.img
207
+ series_files=glob.glob("%s/RAW/mpr-*.img"%(full_path))
208
+ series_files.sort()
209
+
210
+ if len(series_files)>0:
211
+ ## valid MRI image data found; continue processing
212
+ sitk_img_original=load_raw_images(series_files)
213
+ submodality=[re.search(r"mpr-\d{1}",os.path.basename(fp)).group(0) for fp in series_files]
214
+ sub_modality_dict={}
215
+ for idx,value in enumerate(submodality):
216
+ sub_modality_dict[idx]=value
217
+
218
+ meta.add_keyvalue('Sub_modality',sub_modality_dict)
219
+
220
+ else:
221
+ print("病例数据%s为空"%data_dir)
222
+ continue
223
+
224
+ original_spacing = list(sitk_img_original.GetSpacing())
225
+ original_size = list(sitk_img_original.GetSize())
226
+ print(original_spacing)
227
+ is_4d_image = sitk_img_original.GetDimension() == 4
228
+
229
+ frame_flag=False
230
+ # --- Resampling Logic (Revised for 4D) ---
231
+ if is_4d_image:
232
+
233
+ # Always process 4D images channel-wise for resampling
234
+ # logging.info(f" Processing 4D image channel-wise: {original_img_full_path}") # Keep log for errors only
235
+ channels = []
236
+ num_channels = original_size[3] if len(original_size) == 4 and sitk_img_original.GetDimension() == 4 else 1
237
+ channel_target_spacing = TARGET_VOXEL_SPACING if TARGET_VOXEL_SPACING else original_spacing[:3] # Use 3D spacing
238
+
239
+
240
+ for i in range(num_channels):
241
+ extractor = sitk.ExtractImageFilter()
242
+ current_3d_channel_size = original_size[:3]
243
+
244
+ if sitk_img_original.GetDimension() == 4:
245
+ extractor.SetSize([current_3d_channel_size[0], current_3d_channel_size[1], current_3d_channel_size[2], 0])
246
+ extractor.SetIndex([0,0,0,i])
247
+ channel_3d_img = extractor.Execute(sitk_img_original)
248
+ else:
249
+ channel_3d_img = sitk_img_original
250
+ if i > 0: break
251
+
252
+ channel_resampler = util.get_unisize_resampler(
253
+ channel_3d_img, 'linear',
254
+ spacing=channel_target_spacing, size=current_3d_channel_size
255
+ )
256
+ if channel_resampler:
257
+ channels.append(channel_resampler.Execute(channel_3d_img))
258
+ else:
259
+ channels.append(channel_3d_img)
260
+
261
+ if channels:
262
+ if len(channels) > 1: # Only join if there are multiple channels
263
+ sitk_img_processed = sitk.JoinSeriesImageFilter().Execute(channels)
264
+ ## added by yanguoqing on 2025-08-11
265
+ frame_flag=True
266
+ # imgDict={}
267
+ # for kf_idx in range(num_channels):
268
+ # imgDict[str(kf_idx)]='none'
269
+ # if str(meta_ed):imgDict[str(meta_ed)]='ed'
270
+ # if str(meta_es):imgDict[str(meta_es)]='es'
271
+ # meta.add_keyvalue('ImgDict',imgDict)
272
+ elif len(channels) == 1: # If only one channel resulted (e.g. original was 3D misidentified as 4D by tensorImageSize)
273
+ sitk_img_processed = channels[0]
274
+ elif TARGET_VOXEL_SPACING: # 3D image with target spacing
275
+ img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',
276
+ spacing=TARGET_VOXEL_SPACING, size=original_size)
277
+ if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original)
278
+ else: # 3D image, no TARGET_VOXEL_SPACING
279
+ img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',
280
+ spacing=original_spacing, size=original_size)
281
+ if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original)
282
+
283
+
284
+
285
+
286
+
287
+ original_spacing = list(sitk_img_original.GetSpacing())
288
+ original_size = list(sitk_img_original.GetSize())
289
+ size_processed = list(sitk_img_processed.GetSize())
290
+ print('size_processed',size_processed,original_size)
291
+
292
+
293
+ meta.add_keyvalue('Spacing_mm',min(original_spacing[:3]))
294
+ meta.add_keyvalue('OriImg_path',",".join(series_files))
295
+ meta.add_keyvalue('Size',size_processed) # use the processed size here -- YH Jachin
296
+ meta.add_keyvalue('Modality',modality)
297
+ meta.add_keyvalue('Dataset_name',study)
298
+ meta.add_keyvalue('ROI','head')
299
+
300
+
301
+
302
+ output_image_file = os.path.join(output_dir,data_dir, f"{data_dir}.nii.gz")
303
+ # output_path=convert_windows_to_linux_path(output_path)
304
+ ##
305
+ save_nifti(sitk_img_processed, output_image_file, full_path)
306
+ print(f"Saved NIfTI file to {output_image_file}")
307
+ ##Label processing
308
+
309
+
310
+
311
+ except Exception as e:
312
+ print(e)
313
+ failed_files.append(data_dir)
314
+ print(f"Failed to load OASIS images from {data_dir}")
315
+ continue
316
+
317
+
318
+
319
+ meta.add_extra_keyvalue('Metadata',CIA_other_info)
320
+
321
+
322
+ # Write the mapping to the JSON file on the fly
323
+ with open(json_output_path, 'r+') as json_file:
324
+ existing_mappings = json.load(json_file)
325
+ existing_mappings[output_image_file] = meta.get_meta_data()
326
+ json_file.seek(0)
327
+ # print(existing_mappings)
328
+ json.dump(existing_mappings, json_file, indent=4)
329
+ json_file.truncate()
330
+ # else:
331
+ # print("No metadata.csv files found.")
332
+
333
+ with open(failed_files_path, "w") as json_file:
334
+ json.dump(failed_files, json_file)
335
+
336
+ print(f"The list has been written to {failed_files_path}")
337
+ print(f"Saved NIfTI mappings to {json_output_path}")
338
+
339
+ if __name__ == "__main__":
340
+ parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
341
+ parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/OASIS/OASIS_2/OAS2_RAW//")
342
+ parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/OASIS/OASIS_2/RAW_V2")
343
+ args = parser.parse_args()
344
+ print(args.target_path, args.output_dir)
345
+ main(args.target_path, args.output_dir)
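
The v2 variant differs from the raw one mainly in its channel-wise handling of 4D volumes: each 3D acquisition is extracted from the 4D stack, resampled on its own, and the channels are re-joined. A sketch of that extract/resample/join loop, with a plain `sitk.Resample` standing in for the repo's `util.get_unisize_resampler` helper (whose exact behavior is not shown in this commit):

```python
import SimpleITK as sitk

def resample_channels(img4d, target_spacing):
    """Extract each 3D channel of a 4D image, resample it, then re-join to 4D."""
    size = list(img4d.GetSize())                           # (x, y, z, t)
    channels = []
    for i in range(size[3]):
        extractor = sitk.ExtractImageFilter()
        extractor.SetSize([size[0], size[1], size[2], 0])  # 0 collapses dim 4
        extractor.SetIndex([0, 0, 0, i])
        ch = extractor.Execute(img4d)                      # 3D volume for channel i

        # Output size that keeps the physical extent at the new spacing.
        new_size = [int(round(s * sp / ts)) for s, sp, ts in
                    zip(ch.GetSize(), ch.GetSpacing(), target_spacing)]
        ch = sitk.Resample(ch, new_size, sitk.Transform(), sitk.sitkLinear,
                           ch.GetOrigin(), target_spacing, ch.GetDirection(),
                           0.0, ch.GetPixelID())
        channels.append(ch)
    return sitk.JoinSeries(channels)                       # back to 4D
```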
OAISIS_clean/oasis2_longitudinal_demographics.csv ADDED
@@ -0,0 +1,374 @@
1
+ Subject ID,MRI ID,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
2
+ OAS2_0001,OAS2_0001_MR1,Nondemented,1,0,M,R,87,14,2,27,0,1987,0.696,0.883
3
+ OAS2_0001,OAS2_0001_MR2,Nondemented,2,457,M,R,88,14,2,30,0,2004,0.681,0.876
4
+ OAS2_0002,OAS2_0002_MR1,Demented,1,0,M,R,75,12,,23,0.5,1678,0.736,1.046
5
+ OAS2_0002,OAS2_0002_MR2,Demented,2,560,M,R,76,12,,28,0.5,1738,0.713,1.010
6
+ OAS2_0002,OAS2_0002_MR3,Demented,3,1895,M,R,80,12,,22,0.5,1698,0.701,1.034
7
+ OAS2_0004,OAS2_0004_MR1,Nondemented,1,0,F,R,88,18,3,28,0,1215,0.710,1.444
8
+ OAS2_0004,OAS2_0004_MR2,Nondemented,2,538,F,R,90,18,3,27,0,1200,0.718,1.462
9
+ OAS2_0005,OAS2_0005_MR1,Nondemented,1,0,M,R,80,12,4,28,0,1689,0.712,1.039
10
+ OAS2_0005,OAS2_0005_MR2,Nondemented,2,1010,M,R,83,12,4,29,0.5,1701,0.711,1.032
11
+ OAS2_0005,OAS2_0005_MR3,Nondemented,3,1603,M,R,85,12,4,30,0,1699,0.705,1.033
12
+ OAS2_0007,OAS2_0007_MR1,Demented,1,0,M,R,71,16,,28,0.5,1357,0.748,1.293
13
+ OAS2_0007,OAS2_0007_MR3,Demented,3,518,M,R,73,16,,27,1,1365,0.727,1.286
14
+ OAS2_0007,OAS2_0007_MR4,Demented,4,1281,M,R,75,16,,27,1,1372,0.710,1.279
15
+ OAS2_0008,OAS2_0008_MR1,Nondemented,1,0,F,R,93,14,2,30,0,1272,0.698,1.380
16
+ OAS2_0008,OAS2_0008_MR2,Nondemented,2,742,F,R,95,14,2,29,0,1257,0.703,1.396
17
+ OAS2_0009,OAS2_0009_MR1,Demented,1,0,M,R,68,12,2,27,0.5,1457,0.806,1.205
18
+ OAS2_0009,OAS2_0009_MR2,Demented,2,576,M,R,69,12,2,24,0.5,1480,0.791,1.186
19
+ OAS2_0010,OAS2_0010_MR1,Demented,1,0,F,R,66,12,3,30,0.5,1447,0.769,1.213
20
+ OAS2_0010,OAS2_0010_MR2,Demented,2,854,F,R,68,12,3,29,0.5,1482,0.752,1.184
21
+ OAS2_0012,OAS2_0012_MR1,Nondemented,1,0,F,R,78,16,2,29,0,1333,0.748,1.316
22
+ OAS2_0012,OAS2_0012_MR2,Nondemented,2,730,F,R,80,16,2,29,0,1323,0.738,1.326
23
+ OAS2_0012,OAS2_0012_MR3,Nondemented,3,1598,F,R,83,16,2,29,0,1323,0.718,1.327
24
+ OAS2_0013,OAS2_0013_MR1,Nondemented,1,0,F,R,81,12,4,30,0,1230,0.715,1.427
25
+ OAS2_0013,OAS2_0013_MR2,Nondemented,2,643,F,R,82,12,4,30,0,1212,0.720,1.448
26
+ OAS2_0013,OAS2_0013_MR3,Nondemented,3,1456,F,R,85,12,4,29,0,1225,0.710,1.433
27
+ OAS2_0014,OAS2_0014_MR1,Demented,1,0,M,R,76,16,3,21,0.5,1602,0.697,1.096
28
+ OAS2_0014,OAS2_0014_MR2,Demented,2,504,M,R,77,16,3,16,1,1590,0.696,1.104
29
+ OAS2_0016,OAS2_0016_MR1,Demented,1,0,M,R,88,8,4,25,0.5,1651,0.660,1.063
30
+ OAS2_0016,OAS2_0016_MR2,Demented,2,707,M,R,90,8,4,23,0.5,1668,0.646,1.052
31
+ OAS2_0017,OAS2_0017_MR1,Nondemented,1,0,M,R,80,12,3,29,0,1783,0.752,0.985
32
+ OAS2_0017,OAS2_0017_MR3,Nondemented,3,617,M,R,81,12,3,27,0.5,1814,0.759,0.968
33
+ OAS2_0017,OAS2_0017_MR4,Nondemented,4,1861,M,R,85,12,3,30,0,1820,0.755,0.964
34
+ OAS2_0017,OAS2_0017_MR5,Nondemented,5,2400,M,R,86,12,3,27,0,1813,0.761,0.968
35
+ OAS2_0018,OAS2_0018_MR1,Converted,1,0,F,R,87,14,1,30,0,1406,0.715,1.248
36
+ OAS2_0018,OAS2_0018_MR3,Converted,3,489,F,R,88,14,1,29,0,1398,0.713,1.255
37
+ OAS2_0018,OAS2_0018_MR4,Converted,4,1933,F,R,92,14,1,27,0.5,1423,0.696,1.234
38
+ OAS2_0020,OAS2_0020_MR1,Converted,1,0,M,R,80,20,1,29,0,1587,0.693,1.106
39
+ OAS2_0020,OAS2_0020_MR2,Converted,2,756,M,R,82,20,1,28,0.5,1606,0.677,1.093
40
+ OAS2_0020,OAS2_0020_MR3,Converted,3,1563,M,R,84,20,1,26,0.5,1597,0.666,1.099
41
+ OAS2_0021,OAS2_0021_MR1,Demented,1,0,M,R,72,20,1,26,0.5,1911,0.719,0.919
42
+ OAS2_0021,OAS2_0021_MR2,Demented,2,1164,M,R,76,20,1,25,0.5,1926,0.736,0.911
43
+ OAS2_0022,OAS2_0022_MR1,Nondemented,1,0,F,R,61,16,3,30,0,1313,0.805,1.337
44
+ OAS2_0022,OAS2_0022_MR2,Nondemented,2,828,F,R,64,16,3,29,0,1316,0.796,1.333
45
+ OAS2_0023,OAS2_0023_MR1,Demented,1,0,F,R,86,12,4,21,0.5,1247,0.662,1.407
46
+ OAS2_0023,OAS2_0023_MR2,Demented,2,578,F,R,87,12,4,21,0.5,1250,0.652,1.405
47
+ OAS2_0026,OAS2_0026_MR1,Demented,1,0,M,R,82,12,3,27,0.5,1420,0.713,1.236
48
+ OAS2_0026,OAS2_0026_MR2,Demented,2,673,M,R,84,12,3,27,0.5,1445,0.695,1.214
49
+ OAS2_0027,OAS2_0027_MR1,Nondemented,1,0,F,R,69,12,3,29,0,1365,0.783,1.286
50
+ OAS2_0027,OAS2_0027_MR2,Nondemented,2,609,F,R,71,12,3,30,0,1360,0.782,1.291
51
+ OAS2_0027,OAS2_0027_MR3,Nondemented,3,1234,F,R,73,12,3,30,0,1358,0.775,1.293
52
+ OAS2_0027,OAS2_0027_MR4,Nondemented,4,1779,F,R,74,12,3,30,0,1353,0.772,1.297
53
+ OAS2_0028,OAS2_0028_MR1,Demented,1,0,M,R,64,18,2,22,0.5,1547,0.737,1.134
54
+ OAS2_0028,OAS2_0028_MR2,Demented,2,610,M,R,66,18,2,21,1,1562,0.717,1.124
55
+ OAS2_0029,OAS2_0029_MR1,Nondemented,1,0,F,R,77,12,4,29,0,1377,0.734,1.275
56
+ OAS2_0029,OAS2_0029_MR2,Nondemented,2,1099,F,R,80,12,4,30,0,1390,0.735,1.263
57
+ OAS2_0030,OAS2_0030_MR1,Nondemented,1,0,F,R,60,18,1,30,0,1402,0.822,1.252
58
+ OAS2_0030,OAS2_0030_MR2,Nondemented,2,932,F,R,62,18,1,30,0,1392,0.817,1.261
59
+ OAS2_0031,OAS2_0031_MR1,Converted,1,0,F,R,86,12,3,30,0,1430,0.718,1.227
60
+ OAS2_0031,OAS2_0031_MR2,Converted,2,446,F,R,88,12,3,30,0,1445,0.719,1.215
61
+ OAS2_0031,OAS2_0031_MR3,Converted,3,1588,F,R,91,12,3,28,0.5,1463,0.696,1.199
62
+ OAS2_0032,OAS2_0032_MR1,Demented,1,0,M,R,90,12,3,21,0.5,1307,0.679,1.342
63
+ OAS2_0032,OAS2_0032_MR2,Demented,2,642,M,R,92,12,3,24,0.5,1311,0.676,1.339
64
+ OAS2_0034,OAS2_0034_MR1,Nondemented,1,0,F,R,79,16,1,29,0,1466,0.703,1.197
65
+ OAS2_0034,OAS2_0034_MR2,Nondemented,2,489,F,R,80,16,1,30,0,1450,0.698,1.210
66
+ OAS2_0034,OAS2_0034_MR3,Nondemented,3,1287,F,R,82,16,1,30,0,1460,0.695,1.202
67
+ OAS2_0034,OAS2_0034_MR4,Nondemented,4,1884,F,R,84,16,1,30,0,1453,0.684,1.208
68
+ OAS2_0035,OAS2_0035_MR1,Nondemented,1,0,F,R,88,12,4,30,0,1336,0.738,1.313
69
+ OAS2_0035,OAS2_0035_MR2,Nondemented,2,405,F,R,89,12,4,27,0,1329,0.733,1.320
70
+ OAS2_0036,OAS2_0036_MR1,Nondemented,1,0,F,R,69,13,4,30,0,1359,0.789,1.291
71
+ OAS2_0036,OAS2_0036_MR3,Nondemented,3,713,F,R,70,13,4,30,0,1361,0.783,1.290
72
+ OAS2_0036,OAS2_0036_MR4,Nondemented,4,1770,F,R,73,13,4,30,0,1360,0.773,1.291
73
+ OAS2_0036,OAS2_0036_MR5,Nondemented,5,2369,F,R,75,13,4,29,0,1349,0.778,1.301
74
+ OAS2_0037,OAS2_0037_MR1,Demented,1,0,M,R,82,12,4,27,0.5,1477,0.729,1.188
75
+ OAS2_0037,OAS2_0037_MR2,Demented,2,1123,M,R,85,12,4,29,0.5,1487,0.717,1.180
76
+ OAS2_0037,OAS2_0037_MR3,Demented,3,2029,M,R,88,12,4,26,0.5,1483,0.709,1.184
77
+ OAS2_0037,OAS2_0037_MR4,Demented,4,2508,M,R,89,12,4,26,0.5,1485,0.706,1.181
78
+ OAS2_0039,OAS2_0039_MR1,Demented,1,0,F,R,81,18,2,26,0.5,1174,0.742,1.495
79
+ OAS2_0039,OAS2_0039_MR2,Demented,2,486,F,R,83,18,2,25,0.5,1179,0.733,1.488
80
+ OAS2_0040,OAS2_0040_MR1,Demented,1,0,M,R,84,6,4,25,0.5,1310,0.727,1.339
81
+ OAS2_0040,OAS2_0040_MR2,Demented,2,567,M,R,86,6,4,27,0.5,1320,0.724,1.329
82
+ OAS2_0040,OAS2_0040_MR3,Demented,3,1204,M,R,88,6,4,23,0.5,1348,0.713,1.302
83
+ OAS2_0041,OAS2_0041_MR1,Converted,1,0,F,R,71,16,1,27,0,1289,0.771,1.362
84
+ OAS2_0041,OAS2_0041_MR2,Converted,2,756,F,R,73,16,1,28,0,1295,0.768,1.356
85
+ OAS2_0041,OAS2_0041_MR3,Converted,3,1331,F,R,75,16,1,28,0.5,1314,0.760,1.335
86
+ OAS2_0042,OAS2_0042_MR1,Nondemented,1,0,F,R,70,17,3,29,0,1640,0.766,1.070
87
+ OAS2_0042,OAS2_0042_MR2,Nondemented,2,1008,F,R,73,17,3,29,0,1665,0.748,1.054
88
+ OAS2_0043,OAS2_0043_MR1,Demented,1,0,F,R,72,12,4,26,0.5,1453,0.777,1.208
89
+ OAS2_0043,OAS2_0043_MR2,Demented,2,491,F,R,73,12,4,26,0.5,1451,0.757,1.210
90
+ OAS2_0044,OAS2_0044_MR1,Demented,1,0,M,R,68,14,4,21,1,1333,0.685,1.317
91
+ OAS2_0044,OAS2_0044_MR2,Demented,2,352,M,R,69,14,4,15,1,1331,0.678,1.318
92
+ OAS2_0044,OAS2_0044_MR3,Demented,3,866,M,R,71,14,4,22,1,1332,0.679,1.317
93
+ OAS2_0045,OAS2_0045_MR1,Nondemented,1,0,F,R,75,18,1,30,0,1317,0.737,1.332
94
+ OAS2_0045,OAS2_0045_MR2,Nondemented,2,689,F,R,77,18,1,29,0,1322,0.731,1.327
95
+ OAS2_0046,OAS2_0046_MR1,Demented,1,0,F,R,83,15,2,20,0.5,1476,0.750,1.189
96
+ OAS2_0046,OAS2_0046_MR2,Demented,2,575,F,R,85,15,2,22,1,1483,0.748,1.183
97
+ OAS2_0047,OAS2_0047_MR1,Nondemented,1,0,F,R,77,16,2,29,0,1433,0.723,1.225
98
+ OAS2_0047,OAS2_0047_MR2,Nondemented,2,486,F,R,78,16,2,27,0,1414,0.727,1.242
99
+ OAS2_0048,OAS2_0048_MR1,Demented,1,0,M,R,66,16,1,19,1,1695,0.711,1.036
100
+ OAS2_0048,OAS2_0048_MR2,Demented,2,248,M,R,66,16,1,21,1,1708,0.703,1.028
101
+ OAS2_0048,OAS2_0048_MR3,Demented,3,647,M,R,68,16,1,19,1,1712,0.691,1.025
102
+ OAS2_0048,OAS2_0048_MR4,Demented,4,970,M,R,68,16,1,7,1,1714,0.682,1.024
103
+ OAS2_0048,OAS2_0048_MR5,Demented,5,1233,M,R,69,16,1,4,1,1701,0.676,1.032
104
+ OAS2_0049,OAS2_0049_MR1,Nondemented,1,0,F,R,69,16,3,30,0,1491,0.794,1.177
105
+ OAS2_0049,OAS2_0049_MR2,Nondemented,2,395,F,R,70,16,3,30,0,1505,0.791,1.166
106
+ OAS2_0049,OAS2_0049_MR3,Nondemented,3,687,F,R,71,16,3,30,0,1503,0.788,1.168
107
+ OAS2_0050,OAS2_0050_MR1,Demented,1,0,M,R,71,12,4,20,0.5,1461,0.724,1.202
108
+ OAS2_0050,OAS2_0050_MR2,Demented,2,538,M,R,72,12,4,17,1,1483,0.695,1.184
109
+ OAS2_0051,OAS2_0051_MR1,Nondemented,1,0,F,R,92,23,1,29,0,1454,0.701,1.207
110
+ OAS2_0051,OAS2_0051_MR2,Nondemented,2,457,F,R,94,23,1,29,0,1474,0.696,1.190
111
+ OAS2_0051,OAS2_0051_MR3,Nondemented,3,1526,F,R,97,23,1,30,0,1483,0.689,1.184
112
+ OAS2_0052,OAS2_0052_MR1,Nondemented,1,0,M,R,74,18,2,29,0,1463,0.737,1.199
113
+ OAS2_0052,OAS2_0052_MR2,Nondemented,2,1510,M,R,78,18,2,30,0,1484,0.703,1.183
114
+ OAS2_0053,OAS2_0053_MR1,Nondemented,1,0,F,R,82,16,3,29,0,1484,0.760,1.183
115
+ OAS2_0053,OAS2_0053_MR2,Nondemented,2,842,F,R,84,16,3,28,0,1500,0.744,1.170
116
+ OAS2_0054,OAS2_0054_MR1,Converted,1,0,F,R,85,18,1,29,0,1264,0.701,1.388
117
+ OAS2_0054,OAS2_0054_MR2,Converted,2,846,F,R,87,18,1,24,0.5,1275,0.683,1.376
118
+ OAS2_0055,OAS2_0055_MR1,Nondemented,1,0,M,R,65,13,3,29,0,1362,0.837,1.289
119
+ OAS2_0055,OAS2_0055_MR2,Nondemented,2,726,M,R,67,13,3,27,0,1365,0.827,1.285
120
+ OAS2_0056,OAS2_0056_MR1,Nondemented,1,0,F,R,71,14,2,28,0,1461,0.756,1.202
121
+ OAS2_0056,OAS2_0056_MR2,Nondemented,2,622,F,R,73,14,2,30,0,1456,0.739,1.205
122
+ OAS2_0057,OAS2_0057_MR1,Nondemented,1,0,F,R,81,12,2,30,0,1599,0.755,1.098
123
+ OAS2_0057,OAS2_0057_MR2,Nondemented,2,640,F,R,83,12,2,29,0,1569,0.757,1.118
124
+ OAS2_0057,OAS2_0057_MR3,Nondemented,3,1340,F,R,85,12,2,30,0,1580,0.739,1.111
125
+ OAS2_0058,OAS2_0058_MR1,Demented,1,0,M,R,78,14,3,30,0.5,1315,0.707,1.335
126
+ OAS2_0058,OAS2_0058_MR2,Demented,2,212,M,R,79,14,3,26,0.5,1308,0.706,1.341
127
+ OAS2_0058,OAS2_0058_MR3,Demented,3,764,M,R,80,14,3,29,0.5,1324,0.695,1.326
128
+ OAS2_0060,OAS2_0060_MR1,Demented,1,0,M,R,75,13,4,29,0.5,1416,0.766,1.239
129
+ OAS2_0060,OAS2_0060_MR2,Demented,2,1290,M,R,78,13,4,28,0.5,1408,0.757,1.247
130
+ OAS2_0061,OAS2_0061_MR1,Nondemented,1,0,M,R,68,18,1,30,0,1654,0.747,1.061
131
+ OAS2_0061,OAS2_0061_MR2,Nondemented,2,873,M,R,70,18,1,30,0,1660,0.738,1.057
132
+ OAS2_0061,OAS2_0061_MR3,Nondemented,3,1651,M,R,72,18,1,30,0,1681,0.729,1.044
133
+ OAS2_0062,OAS2_0062_MR1,Nondemented,1,0,F,R,79,18,2,29,0,1641,0.695,1.069
134
+ OAS2_0062,OAS2_0062_MR2,Nondemented,2,723,F,R,81,18,2,30,0,1664,0.677,1.055
135
+ OAS2_0062,OAS2_0062_MR3,Nondemented,3,1351,F,R,83,18,2,29,0,1667,0.688,1.053
136
+ OAS2_0063,OAS2_0063_MR1,Demented,1,0,F,R,80,12,,30,0.5,1430,0.737,1.228
137
+ OAS2_0063,OAS2_0063_MR2,Demented,2,490,F,R,81,12,,27,0.5,1453,0.721,1.208
138
+ OAS2_0064,OAS2_0064_MR1,Demented,1,0,F,R,78,8,5,23,1,1462,0.691,1.200
139
+ OAS2_0064,OAS2_0064_MR2,Demented,2,830,F,R,81,8,5,26,0.5,1459,0.694,1.203
140
+ OAS2_0064,OAS2_0064_MR3,Demented,3,1282,F,R,82,8,5,18,0.5,1464,0.682,1.199
141
+ OAS2_0066,OAS2_0066_MR1,Demented,1,0,M,R,61,18,1,30,1,1957,0.734,0.897
142
+ OAS2_0066,OAS2_0066_MR2,Demented,2,497,M,R,62,18,1,30,0.5,1928,0.731,0.910
143
+ OAS2_0067,OAS2_0067_MR1,Nondemented,1,0,M,R,67,12,4,30,0,1440,0.727,1.219
144
+ OAS2_0067,OAS2_0067_MR2,Nondemented,2,451,M,R,68,12,4,29,0,1438,0.738,1.220
145
+ OAS2_0067,OAS2_0067_MR3,Nondemented,3,1438,M,R,71,12,4,29,0,1455,0.724,1.206
146
+ OAS2_0067,OAS2_0067_MR4,Nondemented,4,2163,M,R,73,12,4,28,0,1444,0.722,1.215
147
+ OAS2_0068,OAS2_0068_MR1,Nondemented,1,0,F,R,88,12,3,30,0,1428,0.700,1.229
148
+ OAS2_0068,OAS2_0068_MR2,Nondemented,2,743,F,R,90,12,3,29,0,1475,0.676,1.190
149
+ OAS2_0069,OAS2_0069_MR1,Nondemented,1,0,F,R,81,18,2,29,0,1470,0.687,1.194
150
+ OAS2_0069,OAS2_0069_MR2,Nondemented,2,432,F,R,82,18,2,30,0,1471,0.690,1.193
151
+ OAS2_0070,OAS2_0070_MR1,Nondemented,1,0,M,R,80,17,1,28,0,1660,0.728,1.057
152
+ OAS2_0070,OAS2_0070_MR2,Nondemented,2,672,M,R,82,17,1,29,0,1692,0.723,1.037
153
+ OAS2_0070,OAS2_0070_MR3,Nondemented,3,1415,M,R,84,17,1,29,0,1707,0.717,1.028
154
+ OAS2_0070,OAS2_0070_MR4,Nondemented,4,1870,M,R,85,17,1,30,0,1724,0.704,1.018
155
+ OAS2_0070,OAS2_0070_MR5,Nondemented,5,2386,M,R,86,17,1,30,0,1720,0.705,1.020
156
+ OAS2_0071,OAS2_0071_MR1,Demented,1,0,F,R,83,13,2,27,1,1391,0.705,1.262
157
+ OAS2_0071,OAS2_0071_MR2,Demented,2,365,F,R,84,13,2,28,1,1402,0.695,1.252
158
+ OAS2_0073,OAS2_0073_MR1,Nondemented,1,0,F,R,70,14,3,29,0,1524,0.787,1.151
159
+ OAS2_0073,OAS2_0073_MR2,Nondemented,2,580,F,R,72,14,3,28,0,1512,0.777,1.161
160
+ OAS2_0073,OAS2_0073_MR3,Nondemented,3,1705,F,R,75,14,3,28,0,1507,0.782,1.164
161
+ OAS2_0073,OAS2_0073_MR4,Nondemented,4,2288,F,R,76,14,3,29,0,1490,0.774,1.178
162
+ OAS2_0073,OAS2_0073_MR5,Nondemented,5,2517,F,R,77,14,3,29,0,1504,0.769,1.167
163
+ OAS2_0075,OAS2_0075_MR1,Demented,1,0,F,R,73,8,5,25,0.5,1151,0.743,1.525
164
+ OAS2_0075,OAS2_0075_MR2,Demented,2,567,F,R,75,8,5,22,0.5,1143,0.741,1.535
165
+ OAS2_0076,OAS2_0076_MR1,Nondemented,1,0,F,R,66,18,2,30,0,1504,0.725,1.167
166
+ OAS2_0076,OAS2_0076_MR2,Nondemented,2,956,F,R,69,18,2,29,0,1536,0.719,1.143
167
+ OAS2_0076,OAS2_0076_MR3,Nondemented,3,1663,F,R,71,18,2,30,0,1520,0.718,1.155
168
+ OAS2_0077,OAS2_0077_MR1,Nondemented,1,0,M,R,69,16,2,28,0,1848,0.737,0.950
169
+ OAS2_0077,OAS2_0077_MR2,Nondemented,2,1393,M,R,73,16,2,29,0,1931,0.722,0.909
170
+ OAS2_0078,OAS2_0078_MR1,Nondemented,1,0,M,R,89,16,1,28,0,1631,0.674,1.076
171
+ OAS2_0078,OAS2_0078_MR2,Nondemented,2,441,M,R,91,16,1,28,0,1640,0.670,1.070
172
+ OAS2_0078,OAS2_0078_MR3,Nondemented,3,1019,M,R,92,16,1,30,0,1662,0.682,1.056
173
+ OAS2_0079,OAS2_0079_MR1,Demented,1,0,F,R,69,12,4,23,0.5,1447,0.759,1.213
174
+ OAS2_0079,OAS2_0079_MR2,Demented,2,584,F,R,71,12,4,16,1,1492,0.725,1.176
175
+ OAS2_0079,OAS2_0079_MR3,Demented,3,1435,F,R,73,12,4,16,1,1478,0.696,1.188
176
+ OAS2_0080,OAS2_0080_MR1,Demented,1,0,M,R,66,15,2,25,0.5,1548,0.727,1.134
177
+ OAS2_0080,OAS2_0080_MR2,Demented,2,580,M,R,68,15,2,30,0.5,1556,0.713,1.128
178
+ OAS2_0080,OAS2_0080_MR3,Demented,3,1209,M,R,69,15,2,28,0.5,1546,0.724,1.135
179
+ OAS2_0081,OAS2_0081_MR1,Demented,1,0,F,R,82,12,4,26,0.5,1271,0.695,1.381
180
+ OAS2_0081,OAS2_0081_MR2,Demented,2,659,F,R,84,12,4,26,0.5,1273,0.686,1.378
181
+ OAS2_0085,OAS2_0085_MR1,Nondemented,1,0,F,R,78,8,5,29,0,1383,0.756,1.269
182
+ OAS2_0085,OAS2_0085_MR2,Nondemented,2,670,F,R,80,8,5,27,0,1381,0.751,1.270
183
+ OAS2_0086,OAS2_0086_MR1,Nondemented,1,0,F,R,63,15,2,28,0,1544,0.805,1.136
184
+ OAS2_0086,OAS2_0086_MR2,Nondemented,2,802,F,R,65,15,2,28,0,1542,0.792,1.138
185
+ OAS2_0087,OAS2_0087_MR1,Demented,1,0,F,R,96,17,1,26,1,1465,0.683,1.198
186
+ OAS2_0087,OAS2_0087_MR2,Demented,2,754,F,R,98,17,1,21,2,1503,0.660,1.168
187
+ OAS2_0088,OAS2_0088_MR1,Demented,1,0,M,R,78,12,4,21,1,1477,0.672,1.188
188
+ OAS2_0088,OAS2_0088_MR2,Demented,2,751,M,R,80,12,4,20,1,1494,0.661,1.175
189
+ OAS2_0089,OAS2_0089_MR1,Demented,1,0,M,R,70,12,2,29,0.5,1432,0.692,1.225
190
+ OAS2_0089,OAS2_0089_MR3,Demented,3,563,M,R,72,12,2,27,1,1432,0.684,1.226
191
+ OAS2_0090,OAS2_0090_MR1,Nondemented,1,0,M,R,73,18,2,29,0,1548,0.773,1.134
192
+ OAS2_0090,OAS2_0090_MR2,Nondemented,2,680,M,R,75,18,2,29,0,1534,0.772,1.144
193
+ OAS2_0090,OAS2_0090_MR3,Nondemented,3,1345,M,R,76,18,2,30,0,1550,0.758,1.133
194
+ OAS2_0091,OAS2_0091_MR1,Nondemented,1,0,M,R,75,12,4,28,0,1511,0.739,1.162
195
+ OAS2_0091,OAS2_0091_MR2,Nondemented,2,1047,M,R,78,12,4,29,0,1506,0.715,1.166
196
+ OAS2_0092,OAS2_0092_MR1,Converted,1,0,F,R,83,12,2,28,0,1383,0.748,1.269
197
+ OAS2_0092,OAS2_0092_MR2,Converted,2,706,F,R,84,12,2,27,0.5,1390,0.728,1.263
198
+ OAS2_0094,OAS2_0094_MR1,Nondemented,1,0,F,R,61,16,1,30,0,1513,0.771,1.160
199
+ OAS2_0094,OAS2_0094_MR2,Nondemented,2,817,F,R,63,16,1,30,0,1449,0.774,1.212
200
+ OAS2_0095,OAS2_0095_MR1,Nondemented,1,0,M,R,71,18,1,30,0,1769,0.699,0.992
201
+ OAS2_0095,OAS2_0095_MR2,Nondemented,2,673,M,R,72,18,1,29,0,1785,0.687,0.983
202
+ OAS2_0095,OAS2_0095_MR3,Nondemented,3,1412,M,R,74,18,1,29,0,1814,0.679,0.967
203
+ OAS2_0096,OAS2_0096_MR1,Nondemented,1,0,F,R,89,13,3,29,0,1154,0.750,1.521
204
+ OAS2_0096,OAS2_0096_MR2,Nondemented,2,778,F,R,91,13,3,28,0,1165,0.736,1.506
205
+ OAS2_0097,OAS2_0097_MR1,Nondemented,1,0,M,R,74,16,2,30,0,1611,0.729,1.089
206
+ OAS2_0097,OAS2_0097_MR2,Nondemented,2,1024,M,R,77,16,2,30,0,1628,0.709,1.078
207
+ OAS2_0098,OAS2_0098_MR1,Demented,1,0,M,R,66,12,4,30,0.5,1446,0.780,1.214
208
+ OAS2_0098,OAS2_0098_MR2,Demented,2,661,M,R,67,12,4,28,0.5,1412,0.783,1.243
209
+ OAS2_0099,OAS2_0099_MR1,Demented,1,0,F,R,80,12,,27,0.5,1475,0.762,1.190
210
+ OAS2_0099,OAS2_0099_MR2,Demented,2,807,F,R,83,12,,23,0.5,1484,0.750,1.183
211
+ OAS2_0100,OAS2_0100_MR1,Nondemented,1,0,F,R,77,11,4,29,0,1583,0.777,1.108
212
+ OAS2_0100,OAS2_0100_MR2,Nondemented,2,1218,F,R,80,11,4,30,0,1586,0.757,1.107
213
+ OAS2_0100,OAS2_0100_MR3,Nondemented,3,1752,F,R,82,11,4,30,0,1590,0.760,1.104
214
+ OAS2_0101,OAS2_0101_MR1,Nondemented,1,0,F,R,71,18,2,30,0,1371,0.769,1.280
215
+ OAS2_0101,OAS2_0101_MR2,Nondemented,2,952,F,R,74,18,2,30,0,1400,0.752,1.254
216
+ OAS2_0101,OAS2_0101_MR3,Nondemented,3,1631,F,R,76,18,2,30,0,1379,0.757,1.273
217
+ OAS2_0102,OAS2_0102_MR1,Demented,1,0,M,R,82,15,3,29,0.5,1499,0.689,1.171
218
+ OAS2_0102,OAS2_0102_MR2,Demented,2,610,M,R,84,15,3,29,0.5,1497,0.686,1.172
219
+ OAS2_0102,OAS2_0102_MR3,Demented,3,1387,M,R,86,15,3,30,0.5,1498,0.681,1.171
220
+ OAS2_0103,OAS2_0103_MR1,Converted,1,0,F,R,69,16,1,30,0,1404,0.750,1.250
221
+ OAS2_0103,OAS2_0103_MR2,Converted,2,1554,F,R,74,16,1,30,0.5,1423,0.722,1.233
222
+ OAS2_0103,OAS2_0103_MR3,Converted,3,2002,F,R,75,16,1,30,0.5,1419,0.731,1.236
223
+ OAS2_0104,OAS2_0104_MR1,Demented,1,0,M,R,70,16,1,25,0.5,1568,0.696,1.119
224
+ OAS2_0104,OAS2_0104_MR2,Demented,2,465,M,R,71,16,1,17,1,1562,0.685,1.123
225
+ OAS2_0105,OAS2_0105_MR1,Nondemented,1,0,M,R,86,12,4,29,0,1783,0.703,0.984
226
+ OAS2_0105,OAS2_0105_MR2,Nondemented,2,675,M,R,87,12,4,30,0,1762,0.718,0.996
227
+ OAS2_0106,OAS2_0106_MR1,Demented,1,0,F,R,70,11,4,22,1,1445,0.722,1.214
228
+ OAS2_0106,OAS2_0106_MR2,Demented,2,729,F,R,72,11,4,21,1,1489,0.686,1.179
229
+ OAS2_0108,OAS2_0108_MR1,Demented,1,0,M,R,77,18,1,25,0.5,1604,0.781,1.094
230
+ OAS2_0108,OAS2_0108_MR2,Demented,2,883,M,R,79,18,1,27,0.5,1569,0.781,1.118
231
+ OAS2_0109,OAS2_0109_MR1,Nondemented,1,0,M,R,81,11,4,28,0,1750,0.670,1.003
232
+ OAS2_0109,OAS2_0109_MR2,Nondemented,2,766,M,R,83,11,4,29,0,1744,0.670,1.006
233
+ OAS2_0111,OAS2_0111_MR1,Demented,1,0,M,R,62,12,4,17,0.5,1525,0.732,1.151
234
+ OAS2_0111,OAS2_0111_MR2,Demented,2,881,M,R,65,12,4,17,0.5,1520,0.699,1.155
235
+ OAS2_0112,OAS2_0112_MR1,Demented,1,0,F,R,76,12,3,27,0.5,1315,0.698,1.335
236
+ OAS2_0112,OAS2_0112_MR2,Demented,2,558,F,R,78,12,3,20,0.5,1339,0.689,1.311
237
+ OAS2_0113,OAS2_0113_MR1,Demented,1,0,F,R,73,13,2,23,0.5,1536,0.725,1.142
238
+ OAS2_0113,OAS2_0113_MR2,Demented,2,504,F,R,75,13,2,28,0.5,1520,0.708,1.155
239
+ OAS2_0114,OAS2_0114_MR1,Demented,1,0,F,R,76,12,,27,0.5,1316,0.727,1.333
240
+ OAS2_0114,OAS2_0114_MR2,Demented,2,570,F,R,78,12,,27,1,1309,0.709,1.341
241
+ OAS2_0116,OAS2_0116_MR1,Demented,1,0,F,R,73,12,3,27,0.5,1425,0.769,1.232
242
+ OAS2_0116,OAS2_0116_MR2,Demented,2,616,F,R,75,12,3,28,0.5,1407,0.770,1.247
243
+ OAS2_0117,OAS2_0117_MR1,Nondemented,1,0,M,R,73,20,2,30,0,1842,0.758,0.953
244
+ OAS2_0117,OAS2_0117_MR2,Nondemented,2,576,M,R,74,20,2,30,0,1806,0.759,0.972
245
+ OAS2_0117,OAS2_0117_MR3,Nondemented,3,1345,M,R,76,20,2,30,0,1823,0.739,0.963
246
+ OAS2_0117,OAS2_0117_MR4,Nondemented,4,1927,M,R,78,20,2,29,0,1826,0.734,0.961
247
+ OAS2_0118,OAS2_0118_MR1,Converted,1,0,F,R,67,14,4,30,0,1508,0.794,1.164
248
+ OAS2_0118,OAS2_0118_MR2,Converted,2,1422,F,R,71,14,4,26,0.5,1529,0.788,1.147
249
+ OAS2_0119,OAS2_0119_MR1,Nondemented,1,0,F,R,81,15,2,28,0,1486,0.754,1.181
250
+ OAS2_0119,OAS2_0119_MR2,Nondemented,2,733,F,R,83,15,2,29,0,1482,0.751,1.184
251
+ OAS2_0119,OAS2_0119_MR3,Nondemented,3,1713,F,R,85,15,2,30,0,1488,0.741,1.180
252
+ OAS2_0120,OAS2_0120_MR1,Demented,1,0,F,R,76,14,3,25,1,1409,0.715,1.246
253
+ OAS2_0120,OAS2_0120_MR2,Demented,2,595,F,R,78,14,3,15,2,1401,0.700,1.253
254
+ OAS2_0121,OAS2_0121_MR1,Nondemented,1,0,F,R,73,11,4,30,0,1475,0.726,1.190
255
+ OAS2_0121,OAS2_0121_MR2,Nondemented,2,647,F,R,74,11,4,30,0,1517,0.705,1.157
256
+ OAS2_0122,OAS2_0122_MR1,Nondemented,1,0,F,R,86,16,3,30,0,1293,0.747,1.357
257
+ OAS2_0122,OAS2_0122_MR2,Nondemented,2,597,F,R,88,16,3,30,0,1295,0.744,1.355
258
+ OAS2_0124,OAS2_0124_MR1,Demented,1,0,M,R,70,16,3,29,0.5,1463,0.749,1.200
259
+ OAS2_0124,OAS2_0124_MR2,Demented,2,472,M,R,71,16,3,27,0.5,1479,0.750,1.187
260
+ OAS2_0126,OAS2_0126_MR1,Nondemented,1,0,F,R,74,12,3,29,0,1344,0.739,1.306
261
+ OAS2_0126,OAS2_0126_MR2,Nondemented,2,472,F,R,75,12,3,29,0,1338,0.747,1.312
262
+ OAS2_0126,OAS2_0126_MR3,Nondemented,3,1192,F,R,77,12,3,29,0,1344,0.740,1.306
263
+ OAS2_0127,OAS2_0127_MR1,Converted,1,0,M,R,79,18,1,29,0,1644,0.729,1.067
264
+ OAS2_0127,OAS2_0127_MR2,Converted,2,851,M,R,81,18,1,29,0.5,1654,0.720,1.061
265
+ OAS2_0127,OAS2_0127_MR3,Converted,3,1042,M,R,81,18,1,29,0.5,1647,0.717,1.066
266
+ OAS2_0127,OAS2_0127_MR4,Converted,4,2153,M,R,84,18,1,29,0.5,1668,0.694,1.052
267
+ OAS2_0127,OAS2_0127_MR5,Converted,5,2639,M,R,86,18,1,30,0.5,1670,0.669,1.051
268
+ OAS2_0128,OAS2_0128_MR1,Nondemented,1,0,F,R,76,16,1,28,0,1346,0.762,1.304
269
+ OAS2_0128,OAS2_0128_MR2,Nondemented,2,1140,F,R,79,16,1,29,0,1354,0.739,1.297
270
+ OAS2_0129,OAS2_0129_MR1,Nondemented,1,0,F,R,78,18,1,30,0,1440,0.666,1.219
271
+ OAS2_0129,OAS2_0129_MR2,Nondemented,2,737,F,R,80,18,1,30,0,1436,0.663,1.222
272
+ OAS2_0129,OAS2_0129_MR3,Nondemented,3,1591,F,R,82,18,1,29,0,1442,0.644,1.217
273
+ OAS2_0131,OAS2_0131_MR1,Converted,1,0,F,R,65,12,2,30,0.5,1340,0.754,1.309
274
+ OAS2_0131,OAS2_0131_MR2,Converted,2,679,F,R,67,12,2,25,0,1331,0.761,1.318
275
+ OAS2_0133,OAS2_0133_MR1,Converted,1,0,F,R,78,12,3,29,0,1475,0.731,1.190
276
+ OAS2_0133,OAS2_0133_MR3,Converted,3,1006,F,R,81,12,3,28,0.5,1495,0.687,1.174
277
+ OAS2_0134,OAS2_0134_MR1,Demented,1,0,F,R,70,11,4,29,0.5,1295,0.748,1.355
278
+ OAS2_0134,OAS2_0134_MR2,Demented,2,539,F,R,71,11,4,28,0.5,1284,0.741,1.367
279
+ OAS2_0135,OAS2_0135_MR1,Nondemented,1,0,M,R,74,18,2,30,0,1636,0.680,1.073
280
+ OAS2_0135,OAS2_0135_MR2,Nondemented,2,1146,M,R,78,18,2,27,0,1645,0.663,1.067
281
+ OAS2_0137,OAS2_0137_MR1,Demented,1,0,M,R,74,18,2,28,0.5,1659,0.739,1.058
282
+ OAS2_0137,OAS2_0137_MR2,Demented,2,636,M,R,75,18,2,30,0.5,1651,0.737,1.063
283
+ OAS2_0138,OAS2_0138_MR1,Nondemented,1,0,F,R,73,16,2,29,0,1123,0.786,1.563
284
+ OAS2_0138,OAS2_0138_MR2,Nondemented,2,846,F,R,75,16,2,28,0,1106,0.767,1.587
285
+ OAS2_0139,OAS2_0139_MR1,Demented,1,0,F,R,67,16,1,29,0.5,1337,0.766,1.312
286
+ OAS2_0139,OAS2_0139_MR2,Demented,2,403,F,R,68,16,1,29,0.5,1344,0.733,1.305
287
+ OAS2_0140,OAS2_0140_MR1,Demented,1,0,F,R,76,16,3,26,0.5,1391,0.705,1.262
288
+ OAS2_0140,OAS2_0140_MR2,Demented,2,793,F,R,78,16,3,27,0.5,1393,0.690,1.260
289
+ OAS2_0140,OAS2_0140_MR3,Demented,3,1655,F,R,81,16,3,25,0.5,1396,0.687,1.257
290
+ OAS2_0141,OAS2_0141_MR1,Nondemented,1,0,F,R,65,18,2,30,0,1277,0.812,1.374
291
+ OAS2_0141,OAS2_0141_MR2,Nondemented,2,1022,F,R,68,18,2,29,0,1290,0.795,1.361
292
+ OAS2_0142,OAS2_0142_MR1,Nondemented,1,0,F,R,69,16,3,29,0,1380,0.819,1.272
293
+ OAS2_0142,OAS2_0142_MR2,Nondemented,2,665,F,R,71,16,3,28,0,1390,0.810,1.262
294
+ OAS2_0143,OAS2_0143_MR1,Nondemented,1,0,F,R,89,18,2,30,0,1715,0.746,1.023
295
+ OAS2_0143,OAS2_0143_MR2,Nondemented,2,561,F,R,91,18,2,30,0,1714,0.741,1.024
296
+ OAS2_0143,OAS2_0143_MR3,Nondemented,3,1553,F,R,93,18,2,29,0,1744,0.723,1.006
297
+ OAS2_0144,OAS2_0144_MR1,Converted,1,0,M,R,77,16,1,30,0,1704,0.716,1.030
298
+ OAS2_0144,OAS2_0144_MR2,Converted,2,683,M,R,79,16,1,30,0.5,1722,0.708,1.019
299
+ OAS2_0145,OAS2_0145_MR1,Converted,1,0,F,R,68,16,3,30,0,1298,0.799,1.352
300
+ OAS2_0145,OAS2_0145_MR2,Converted,2,1707,F,R,73,16,3,29,0.5,1287,0.771,1.364
301
+ OAS2_0146,OAS2_0146_MR1,Demented,1,0,F,R,80,15,2,20,1,1732,0.685,1.013
302
+ OAS2_0146,OAS2_0146_MR2,Demented,2,525,F,R,82,15,2,20,1,1729,0.698,1.015
303
+ OAS2_0147,OAS2_0147_MR1,Nondemented,1,0,F,R,77,13,2,29,0,1351,0.769,1.299
304
+ OAS2_0147,OAS2_0147_MR2,Nondemented,2,440,F,R,78,13,2,29,0,1334,0.769,1.316
305
+ OAS2_0147,OAS2_0147_MR3,Nondemented,3,1204,F,R,80,13,2,28,0,1337,0.762,1.313
306
+ OAS2_0147,OAS2_0147_MR4,Nondemented,4,1806,F,R,82,13,2,30,0,1342,0.747,1.307
307
+ OAS2_0149,OAS2_0149_MR1,Nondemented,1,0,F,R,81,13,2,29,0,1345,0.737,1.305
308
+ OAS2_0149,OAS2_0149_MR2,Nondemented,2,674,F,R,83,13,2,30,0,1335,0.732,1.314
309
+ OAS2_0150,OAS2_0150_MR1,Demented,1,0,F,R,73,12,3,30,0.5,1343,0.720,1.306
310
+ OAS2_0150,OAS2_0150_MR2,Demented,2,518,F,R,75,12,3,27,1,1357,0.714,1.293
311
+ OAS2_0152,OAS2_0152_MR1,Nondemented,1,0,F,R,66,18,2,29,0,1191,0.785,1.474
312
+ OAS2_0152,OAS2_0152_MR2,Nondemented,2,790,F,R,68,18,2,29,0,1194,0.772,1.469
313
+ OAS2_0152,OAS2_0152_MR3,Nondemented,3,1329,F,R,69,18,2,29,0,1202,0.770,1.461
314
+ OAS2_0154,OAS2_0154_MR1,Nondemented,1,0,F,R,75,18,1,29,0,1436,0.750,1.222
315
+ OAS2_0154,OAS2_0154_MR2,Nondemented,2,791,F,R,77,18,1,28,0,1559,0.713,1.125
316
+ OAS2_0156,OAS2_0156_MR1,Nondemented,1,0,F,R,78,18,1,30,0,1243,0.748,1.412
317
+ OAS2_0156,OAS2_0156_MR2,Nondemented,2,777,F,R,81,18,1,30,0,1256,0.739,1.398
318
+ OAS2_0157,OAS2_0157_MR1,Demented,1,0,F,R,73,12,2,19,1,1274,0.728,1.377
319
+ OAS2_0157,OAS2_0157_MR2,Demented,2,764,F,R,75,12,2,18,1,1479,0.657,1.187
320
+ OAS2_0158,OAS2_0158_MR1,Nondemented,1,0,F,R,73,15,4,29,0,1272,0.697,1.380
321
+ OAS2_0158,OAS2_0158_MR2,Nondemented,2,1399,F,R,76,15,4,29,0,1281,0.680,1.370
322
+ OAS2_0159,OAS2_0159_MR1,Demented,1,0,F,R,73,14,3,29,0.5,1238,0.757,1.418
323
+ OAS2_0159,OAS2_0159_MR2,Demented,2,759,F,R,76,14,3,28,0.5,1236,0.764,1.419
324
+ OAS2_0160,OAS2_0160_MR1,Demented,1,0,M,R,76,12,,27,0.5,1557,0.705,1.127
325
+ OAS2_0160,OAS2_0160_MR2,Demented,2,552,M,R,78,12,,29,1,1569,0.704,1.119
326
+ OAS2_0161,OAS2_0161_MR1,Nondemented,1,0,M,R,77,16,1,29,0,1818,0.734,0.965
327
+ OAS2_0161,OAS2_0161_MR2,Nondemented,2,454,M,R,79,16,1,30,0,1817,0.736,0.966
328
+ OAS2_0161,OAS2_0161_MR3,Nondemented,3,1033,M,R,80,16,1,29,0,1830,0.724,0.959
329
+ OAS2_0162,OAS2_0162_MR1,Demented,1,0,M,R,82,14,2,23,0.5,1514,0.678,1.159
330
+ OAS2_0162,OAS2_0162_MR2,Demented,2,621,M,R,84,14,2,22,0.5,1550,0.665,1.132
331
+ OAS2_0164,OAS2_0164_MR1,Demented,1,0,M,R,77,20,1,23,1,1713,0.756,1.024
332
+ OAS2_0164,OAS2_0164_MR2,Demented,2,580,M,R,79,20,1,25,2,1710,0.760,1.026
333
+ OAS2_0165,OAS2_0165_MR1,Demented,1,0,M,R,78,12,3,23,1,1491,0.710,1.177
334
+ OAS2_0165,OAS2_0165_MR2,Demented,2,736,M,R,80,12,3,17,1,1755,0.696,1.000
335
+ OAS2_0169,OAS2_0169_MR1,Nondemented,1,0,F,R,71,18,1,30,0,1426,0.731,1.231
336
+ OAS2_0169,OAS2_0169_MR2,Nondemented,2,691,F,R,73,18,1,30,0,1414,0.739,1.241
337
+ OAS2_0171,OAS2_0171_MR1,Nondemented,1,0,M,R,76,16,3,30,0,1832,0.769,0.958
338
+ OAS2_0171,OAS2_0171_MR2,Nondemented,2,493,M,R,77,16,3,30,0,1820,0.768,0.964
339
+ OAS2_0171,OAS2_0171_MR3,Nondemented,3,1695,M,R,81,16,3,30,0,1836,0.744,0.956
340
+ OAS2_0172,OAS2_0172_MR1,Demented,1,0,M,R,75,16,1,30,0.5,1891,0.709,0.928
341
+ OAS2_0172,OAS2_0172_MR2,Demented,2,1212,M,R,79,16,1,29,0.5,1899,0.700,0.924
342
+ OAS2_0174,OAS2_0174_MR1,Nondemented,1,0,M,R,60,12,4,30,0,1379,0.806,1.273
343
+ OAS2_0174,OAS2_0174_MR2,Nondemented,2,695,M,R,62,12,4,30,0,1378,0.795,1.274
344
+ OAS2_0174,OAS2_0174_MR3,Nondemented,3,1555,M,R,64,12,4,30,0,1370,0.794,1.281
345
+ OAS2_0175,OAS2_0175_MR1,Demented,1,0,M,R,70,16,4,26,0.5,1796,0.742,0.977
346
+ OAS2_0175,OAS2_0175_MR2,Demented,2,700,M,R,72,16,4,28,0.5,1796,0.732,0.977
347
+ OAS2_0175,OAS2_0175_MR3,Demented,3,1343,M,R,73,16,4,28,0.5,1803,0.731,0.973
348
+ OAS2_0176,OAS2_0176_MR1,Converted,1,0,M,R,84,16,2,30,0,1404,0.710,1.250
349
+ OAS2_0176,OAS2_0176_MR2,Converted,2,774,M,R,87,16,2,30,0,1398,0.696,1.255
350
+ OAS2_0176,OAS2_0176_MR3,Converted,3,1631,M,R,89,16,2,30,0.5,1408,0.679,1.246
351
+ OAS2_0177,OAS2_0177_MR1,Nondemented,1,0,M,R,68,14,3,26,0,1444,0.778,1.216
352
+ OAS2_0177,OAS2_0177_MR2,Nondemented,2,665,M,R,70,14,3,28,0,1510,0.770,1.162
353
+ OAS2_0178,OAS2_0178_MR1,Nondemented,1,0,F,R,89,14,2,29,0,1509,0.756,1.163
354
+ OAS2_0178,OAS2_0178_MR2,Nondemented,2,600,F,R,90,14,2,28,0,1495,0.746,1.174
355
+ OAS2_0178,OAS2_0178_MR3,Nondemented,3,1447,F,R,93,14,2,30,0,1488,0.735,1.179
356
+ OAS2_0179,OAS2_0179_MR1,Demented,1,0,M,R,79,20,1,26,0.5,1548,0.711,1.134
357
+ OAS2_0179,OAS2_0179_MR2,Demented,2,652,M,R,81,20,1,26,0.5,1556,0.691,1.128
358
+ OAS2_0181,OAS2_0181_MR1,Demented,1,0,F,R,74,12,,26,0.5,1171,0.733,1.499
359
+ OAS2_0181,OAS2_0181_MR2,Demented,2,539,F,R,75,12,,,1,1169,0.742,1.501
360
+ OAS2_0181,OAS2_0181_MR3,Demented,3,1107,F,R,77,12,,,1,1159,0.733,1.515
361
+ OAS2_0182,OAS2_0182_MR1,Demented,1,0,M,R,73,12,,23,0.5,1661,0.698,1.056
362
+ OAS2_0182,OAS2_0182_MR2,Demented,2,776,M,R,75,12,,20,0.5,1654,0.696,1.061
363
+ OAS2_0183,OAS2_0183_MR1,Nondemented,1,0,F,R,66,13,2,30,0,1495,0.746,1.174
364
+ OAS2_0183,OAS2_0183_MR2,Nondemented,2,182,F,R,66,13,2,30,0,1506,0.740,1.165
365
+ OAS2_0183,OAS2_0183_MR3,Nondemented,3,732,F,R,68,13,2,30,0,1506,0.740,1.165
366
+ OAS2_0183,OAS2_0183_MR4,Nondemented,4,2107,F,R,72,13,2,30,0,1510,0.723,1.162
367
+ OAS2_0184,OAS2_0184_MR1,Demented,1,0,F,R,72,16,3,24,0.5,1354,0.733,1.296
368
+ OAS2_0184,OAS2_0184_MR2,Demented,2,553,F,R,73,16,3,21,1,1351,0.708,1.299
369
+ OAS2_0185,OAS2_0185_MR1,Demented,1,0,M,R,80,16,1,28,0.5,1704,0.711,1.030
370
+ OAS2_0185,OAS2_0185_MR2,Demented,2,842,M,R,82,16,1,28,0.5,1693,0.694,1.037
371
+ OAS2_0185,OAS2_0185_MR3,Demented,3,2297,M,R,86,16,1,26,0.5,1688,0.675,1.040
372
+ OAS2_0186,OAS2_0186_MR1,Nondemented,1,0,F,R,61,13,2,30,0,1319,0.801,1.331
373
+ OAS2_0186,OAS2_0186_MR2,Nondemented,2,763,F,R,63,13,2,30,0,1327,0.796,1.323
374
+ OAS2_0186,OAS2_0186_MR3,Nondemented,3,1608,F,R,65,13,2,30,0,1333,0.801,1.317
OAISIS_clean/oasis_cross-sectional-5708aa0a98d82080.csv ADDED
@@ -0,0 +1,437 @@
1
+ ID,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay
2
+ OAS1_0001_MR1,F,R,74,2,3,29,0,1344,0.743,1.306,N/A
3
+ OAS1_0002_MR1,F,R,55,4,1,29,0,1147,0.81,1.531,N/A
4
+ OAS1_0003_MR1,F,R,73,4,3,27,0.5,1454,0.708,1.207,N/A
5
+ OAS1_0004_MR1,M,R,28,,,,,1588,0.803,1.105,N/A
6
+ OAS1_0005_MR1,M,R,18,,,,,1737,0.848,1.01,N/A
7
+ OAS1_0006_MR1,F,R,24,,,,,1131,0.862,1.551,N/A
8
+ OAS1_0007_MR1,M,R,21,,,,,1516,0.83,1.157,N/A
9
+ OAS1_0009_MR1,F,R,20,,,,,1505,0.843,1.166,N/A
10
+ OAS1_0010_MR1,M,R,74,5,2,30,0,1636,0.689,1.073,N/A
11
+ OAS1_0011_MR1,F,R,52,3,2,30,0,1321,0.827,1.329,N/A
12
+ OAS1_0012_MR1,M,R,30,,,,,1574,0.842,1.115,N/A
13
+ OAS1_0013_MR1,F,R,81,5,2,30,0,1664,0.679,1.055,N/A
14
+ OAS1_0014_MR1,F,R,19,,,,,1525,0.856,1.151,N/A
15
+ OAS1_0015_MR1,M,R,76,2,,28,0.5,1738,0.719,1.01,N/A
16
+ OAS1_0016_MR1,M,R,82,2,4,27,0.5,1477,0.739,1.188,N/A
17
+ OAS1_0017_MR1,M,R,21,,,,,1689,0.845,1.039,N/A
18
+ OAS1_0018_MR1,M,R,39,3,4,28,0,1636,0.813,1.073,N/A
19
+ OAS1_0019_MR1,F,R,89,5,1,30,0,1536,0.715,1.142,N/A
20
+ OAS1_0020_MR1,F,R,48,5,2,29,0,1326,0.785,1.323,N/A
21
+ OAS1_0021_MR1,F,R,80,3,3,23,0.5,1794,0.765,0.978,N/A
22
+ OAS1_0022_MR1,F,R,69,2,4,23,0.5,1447,0.757,1.213,N/A
23
+ OAS1_0023_MR1,M,R,82,2,3,27,0.5,1420,0.71,1.236,N/A
24
+ OAS1_0025_MR1,F,R,24,,,,,1240,0.893,1.415,N/A
25
+ OAS1_0026_MR1,F,R,58,5,1,30,0,1235,0.82,1.421,N/A
26
+ OAS1_0027_MR1,F,R,43,,,,,1194,0.834,1.47,N/A
27
+ OAS1_0028_MR1,F,R,86,2,4,27,1,1449,0.738,1.211,N/A
28
+ OAS1_0029_MR1,M,R,21,,,,,1653,0.858,1.062,N/A
29
+ OAS1_0030_MR1,F,R,65,2,3,29,0,1392,0.764,1.261,N/A
30
+ OAS1_0031_MR1,M,R,88,1,4,26,1,1419,0.674,1.236,N/A
31
+ OAS1_0032_MR1,M,R,89,4,1,28,0,1631,0.682,1.076,N/A
32
+ OAS1_0033_MR1,F,R,80,4,2,29,0,1323,0.735,1.326,N/A
33
+ OAS1_0034_MR1,M,R,51,5,1,29,0,1538,0.831,1.141,N/A
34
+ OAS1_0035_MR1,F,R,84,3,2,28,1,1402,0.695,1.252,N/A
35
+ OAS1_0037_MR1,M,R,27,,,,,1313,0.842,1.336,N/A
36
+ OAS1_0038_MR1,F,R,23,,,,,1443,0.839,1.216,N/A
37
+ OAS1_0039_MR1,M,R,70,4,3,29,0.5,1463,0.772,1.2,N/A
38
+ OAS1_0040_MR1,F,R,38,,,,,1244,0.824,1.411,N/A
39
+ OAS1_0041_MR1,F,R,62,2,,28,0.5,1350,0.758,1.3,N/A
40
+ OAS1_0042_MR1,M,R,80,4,2,29,0.5,1854,0.709,0.947,N/A
41
+ OAS1_0043_MR1,M,R,21,,,,,1511,0.846,1.162,N/A
42
+ OAS1_0044_MR1,F,R,47,4,2,30,0,1346,0.829,1.304,N/A
43
+ OAS1_0045_MR1,M,R,29,,,,,1590,0.829,1.104,N/A
44
+ OAS1_0046_MR1,M,R,64,2,,22,0.5,1351,0.787,1.299,N/A
45
+ OAS1_0047_MR1,F,R,57,,,,,1408,0.784,1.247,N/A
46
+ OAS1_0049_MR1,F,R,20,,,,,1329,0.887,1.321,N/A
47
+ OAS1_0050_MR1,F,R,48,,,,,1358,0.841,1.293,N/A
48
+ OAS1_0051_MR1,F,R,24,,,,,1567,0.835,1.12,N/A
49
+ OAS1_0052_MR1,F,R,78,1,5,23,1,1462,0.697,1.2,N/A
50
+ OAS1_0053_MR1,F,R,83,1,4,21,1,1384,0.699,1.268,N/A
51
+ OAS1_0054_MR1,F,R,21,,,,,1567,0.848,1.12,N/A
52
+ OAS1_0055_MR1,F,R,20,,,,,1432,0.831,1.226,N/A
53
+ OAS1_0056_MR1,F,R,72,3,3,15,1,1324,0.668,1.325,N/A
54
+ OAS1_0057_MR1,F,R,21,,,,,1333,0.862,1.317,N/A
55
+ OAS1_0058_MR1,F,R,46,5,1,30,0,1585,0.817,1.107,N/A
56
+ OAS1_0059_MR1,F,R,20,,,,,1396,0.827,1.257,N/A
57
+ OAS1_0060_MR1,M,R,79,4,,29,0.5,1564,0.734,1.122,N/A
58
+ OAS1_0061_MR1,F,R,20,,,,,1749,0.84,1.441,N/A
59
+ OAS1_0062_MR1,F,R,73,3,2,30,0,1456,0.754,1.205,N/A
60
+ OAS1_0063_MR1,M,R,48,,,,,1675,0.818,1.048,N/A
61
+ OAS1_0064_MR1,F,R,77,1,4,29,0,1583,0.767,1.108,N/A
62
+ OAS1_0065_MR1,M,R,90,2,3,25,0,1301,0.645,1.349,N/A
63
+ OAS1_0066_MR1,F,R,66,1,4,28,0.5,1309,0.765,1.341,N/A
64
+ OAS1_0067_MR1,F,R,71,4,1,27,1,1549,0.73,1.133,N/A
65
+ OAS1_0068_MR1,F,R,67,3,4,30,0,1508,0.805,1.164,N/A
66
+ OAS1_0069_MR1,M,R,33,4,1,30,0,1709,0.784,1.027,N/A
67
+ OAS1_0070_MR1,F,R,63,3,2,30,0,1327,0.801,1.323,N/A
68
+ OAS1_0071_MR1,F,R,49,5,1,30,0,1459,0.808,1.203,N/A
69
+ OAS1_0072_MR1,F,R,60,5,1,30,0,1402,0.823,1.252,N/A
70
+ OAS1_0073_MR1,F,R,69,2,4,21,1,1495,0.655,1.174,N/A
71
+ OAS1_0074_MR1,M,R,43,4,,30,0,1547,0.847,1.134,N/A
72
+ OAS1_0075_MR1,F,R,83,3,2,30,0,1335,0.72,1.314,N/A
73
+ OAS1_0076_MR1,F,R,18,,,,,1501,0.839,1.169,N/A
74
+ OAS1_0077_MR1,F,R,20,,,,,1537,0.852,1.142,N/A
75
+ OAS1_0078_MR1,F,R,64,3,2,30,0,1395,0.809,1.258,N/A
76
+ OAS1_0079_MR1,F,R,25,,,,,1522,0.826,1.153,N/A
77
+ OAS1_0080_MR1,F,R,25,,,,,1628,0.857,1.078,N/A
78
+ OAS1_0081_MR1,F,R,18,,,,,1309,0.857,1.341,N/A
79
+ OAS1_0082_MR1,F,R,75,2,3,28,0.5,1407,0.776,1.247,N/A
80
+ OAS1_0083_MR1,F,R,90,5,3,27,0,1200,0.727,1.462,N/A
81
+ OAS1_0084_MR1,F,R,81,2,,27,0.5,1453,0.727,1.208,N/A
82
+ OAS1_0085_MR1,F,R,70,2,3,29,0,1283,0.791,1.368,N/A
83
+ OAS1_0086_MR1,F,R,47,4,1,30,0,1311,0.835,1.339,N/A
84
+ OAS1_0087_MR1,F,R,21,,,,,1507,0.845,1.165,N/A
85
+ OAS1_0088_MR1,F,R,40,,,,,1557,0.865,1.127,N/A
86
+ OAS1_0090_MR1,M,R,20,,,,,1728,0.862,1.016,N/A
87
+ OAS1_0091_MR1,F,R,18,,,,,1701,0.834,1.032,N/A
88
+ OAS1_0092_MR1,M,R,22,,,,,1442,0.834,1.217,N/A
89
+ OAS1_0094_MR1,F,R,66,2,3,30,0.5,1447,0.772,1.213,N/A
90
+ OAS1_0095_MR1,M,R,28,,,,,1578,0.856,1.112,N/A
91
+ OAS1_0096_MR1,F,R,47,5,2,29,0,1357,0.809,1.294,N/A
92
+ OAS1_0097_MR1,M,R,23,,,,,1568,0.816,1.119,N/A
93
+ OAS1_0098_MR1,F,R,67,2,,18,0.5,1653,0.693,1.062,N/A
94
+ OAS1_0099_MR1,F,R,19,,,,,1484,0.878,1.183,N/A
95
+ OAS1_0101_MR1,M,R,29,,,,,1486,0.84,1.181,N/A
96
+ OAS1_0102_MR1,M,R,18,,,,,1542,0.85,1.138,N/A
97
+ OAS1_0103_MR1,F,R,19,,,,,1499,0.85,1.17,N/A
98
+ OAS1_0104_MR1,F,R,24,,,,,1447,0.841,1.213,N/A
99
+ OAS1_0105_MR1,M,R,20,,,,,1512,0.839,1.161,N/A
100
+ OAS1_0106_MR1,F,R,81,2,4,30,0,1230,0.717,1.427,N/A
101
+ OAS1_0107_MR1,M,R,20,,,,,1733,0.853,1.013,N/A
102
+ OAS1_0108_MR1,M,R,25,,,,,1825,0.854,0.962,N/A
103
+ OAS1_0109_MR1,F,R,61,4,3,30,0,1313,0.813,1.337,N/A
104
+ OAS1_0110_MR1,M,R,84,3,4,28,0,1483,0.697,1.183,N/A
105
+ OAS1_0111_MR1,M,R,23,,,,,1711,0.855,1.025,N/A
106
+ OAS1_0112_MR1,F,R,69,5,2,29,0,1536,0.733,1.143,N/A
107
+ OAS1_0113_MR1,F,R,83,2,2,29,0,1569,0.768,1.118,N/A
108
+ OAS1_0114_MR1,M,R,62,2,4,30,0,1378,0.804,1.274,N/A
109
+ OAS1_0115_MR1,M,R,72,5,1,26,0.5,1911,0.726,0.919,N/A
110
+ OAS1_0116_MR1,F,R,52,5,1,30,0,1373,0.784,1.279,N/A
111
+ OAS1_0117_MR1,M,R,25,,,,,1759,0.783,0.998,N/A
112
+ OAS1_0119_MR1,M,R,19,,,,,1502,0.838,1.169,N/A
113
+ OAS1_0120_MR1,M,R,70,4,4,26,0.5,1796,0.736,0.977,N/A
114
+ OAS1_0121_MR1,M,R,26,,,,,1684,0.82,1.042,N/A
115
+ OAS1_0122_MR1,F,R,83,5,2,22,1,1377,0.715,1.274,N/A
116
+ OAS1_0123_MR1,F,R,83,3,4,24,0.5,1282,0.797,1.369,N/A
117
+ OAS1_0124_MR1,M,R,73,2,,23,0.5,1661,0.709,1.056,N/A
118
+ OAS1_0125_MR1,F,R,22,,,,,1537,0.832,1.142,N/A
119
+ OAS1_0126_MR1,M,R,21,,,,,1582,0.885,1.11,N/A
120
+ OAS1_0127_MR1,M,R,30,,,,,1538,0.862,1.141,N/A
121
+ OAS1_0129_MR1,M,R,18,,,,,1514,0.846,1.159,N/A
122
+ OAS1_0130_MR1,M,R,68,3,3,26,0,1444,0.789,1.216,N/A
123
+ OAS1_0131_MR1,M,R,24,,,,,1637,0.824,1.072,N/A
124
+ OAS1_0132_MR1,M,R,22,,,,,1596,0.85,1.099,N/A
125
+ OAS1_0133_MR1,F,R,65,5,2,30,0,1277,0.814,1.374,N/A
126
+ OAS1_0134_MR1,M,R,80,2,4,20,1,1494,0.665,1.175,N/A
127
+ OAS1_0135_MR1,M,R,64,2,4,29,0,1561,0.801,1.124,N/A
128
+ OAS1_0136_MR1,F,R,24,,,,,1178,0.873,1.489,N/A
129
+ OAS1_0137_MR1,F,R,87,2,3,22,1,1499,0.672,1.171,N/A
130
+ OAS1_0138_MR1,M,R,80,2,4,28,0,1689,0.706,1.039,N/A
131
+ OAS1_0139_MR1,F,R,72,3,3,28,0,1512,0.779,1.161,N/A
132
+ OAS1_0140_MR1,F,R,23,,,,,1375,0.872,1.277,N/A
133
+ OAS1_0141_MR1,M,R,24,,,,,1523,0.846,1.152,N/A
134
+ OAS1_0142_MR1,M,R,70,4,1,27,0.5,1581,0.695,1.11,N/A
135
+ OAS1_0143_MR1,M,R,66,2,4,30,0.5,1446,0.784,1.214,N/A
136
+ OAS1_0144_MR1,M,R,22,,,,,1799,0.865,0.975,N/A
137
+ OAS1_0145_MR1,M,R,34,,,,,1653,0.831,1.062,N/A
138
+ OAS1_0146_MR1,F,R,82,5,1,28,0,1513,0.742,1.16,N/A
139
+ OAS1_0147_MR1,F,R,25,,,,,1663,0.845,1.055,N/A
140
+ OAS1_0148_MR1,M,R,23,,,,,1497,0.811,1.172,N/A
141
+ OAS1_0150_MR1,F,R,20,,,,,1510,0.875,1.162,N/A
142
+ OAS1_0151_MR1,F,R,25,,,,,1439,0.856,1.22,N/A
143
+ OAS1_0152_MR1,F,R,23,,,,,1471,0.83,1.193,N/A
144
+ OAS1_0153_MR1,M,R,23,,,,,1662,0.85,1.056,N/A
145
+ OAS1_0155_MR1,M,R,71,4,,28,0.5,1359,0.753,1.291,N/A
146
+ OAS1_0156_MR1,F,R,20,,,,,1591,0.834,1.103,N/A
147
+ OAS1_0157_MR1,F,R,86,4,3,30,0,1293,0.756,1.357,N/A
148
+ OAS1_0158_MR1,M,R,81,5,1,26,0.5,1556,0.689,1.128,N/A
149
+ OAS1_0159_MR1,F,R,40,,,,,1437,0.821,1.221,N/A
150
+ OAS1_0160_MR1,M,R,57,,,,,1745,0.813,1.006,N/A
151
+ OAS1_0161_MR1,F,R,84,2,2,27,0.5,1390,0.727,1.263,N/A
152
+ OAS1_0162_MR1,F,R,20,,,,,1219,0.872,1.44,N/A
153
+ OAS1_0163_MR1,F,R,18,,,,,1633,0.859,1.075,N/A
154
+ OAS1_0164_MR1,F,R,81,2,3,28,0.5,1495,0.687,1.174,N/A
155
+ OAS1_0165_MR1,F,R,74,2,3,29,0,1395,0.787,1.258,N/A
156
+ OAS1_0166_MR1,F,R,80,2,,27,0.5,1475,0.771,1.19,N/A
157
+ OAS1_0167_MR1,F,R,41,,,,,1361,0.849,1.289,N/A
158
+ OAS1_0168_MR1,F,R,50,,,,,1411,0.846,1.244,N/A
159
+ OAS1_0169_MR1,F,R,88,2,3,30,0,1445,0.718,1.215,N/A
160
+ OAS1_0170_MR1,M,R,71,2,4,29,0,1455,0.725,1.206,N/A
161
+ OAS1_0173_MR1,M,R,35,,,,,1475,0.829,1.19,N/A
162
+ OAS1_0174_MR1,M,R,23,,,,,1415,0.865,1.241,N/A
163
+ OAS1_0176_MR1,F,R,88,3,1,29,0,1398,0.712,1.255,N/A
164
+ OAS1_0177_MR1,F,R,54,4,1,30,0,1494,0.838,1.174,N/A
165
+ OAS1_0178_MR1,F,R,44,,,,,1272,0.853,1.38,N/A
166
+ OAS1_0179_MR1,F,R,87,2,4,21,0.5,1250,0.653,1.405,N/A
167
+ OAS1_0180_MR1,F,R,80,4,2,30,0,1496,0.745,1.173,N/A
168
+ OAS1_0181_MR1,F,R,49,4,2,30,0,1316,0.82,1.334,N/A
169
+ OAS1_0182_MR1,M,R,48,,,,,1561,0.816,1.124,N/A
170
+ OAS1_0183_MR1,M,R,44,,,,,1908,0.816,0.92,N/A
171
+ OAS1_0184_MR1,F,R,65,2,,16,1,1521,0.669,1.154,N/A
172
+ OAS1_0185_MR1,F,R,78,2,4,17,1,1314,0.739,1.336,N/A
173
+ OAS1_0186_MR1,M,R,84,5,1,29,0,1707,0.731,1.028,N/A
174
+ OAS1_0188_MR1,M,R,48,4,2,30,0,1464,0.79,1.199,N/A
175
+ OAS1_0189_MR1,M,R,22,,,,,1628,0.853,1.078,N/A
176
+ OAS1_0190_MR1,M,R,43,,,,,1561,0.813,1.124,N/A
177
+ OAS1_0191_MR1,F,R,21,,,,,1421,0.835,1.235,N/A
178
+ OAS1_0192_MR1,F,R,31,,,,,1294,0.839,1.357,N/A
179
+ OAS1_0193_MR1,F,R,23,,,,,1546,0.831,1.135,N/A
180
+ OAS1_0195_MR1,F,R,76,4,1,28,0,1346,0.766,1.304,N/A
181
+ OAS1_0197_MR1,F,R,89,3,3,29,0,1154,0.747,1.521,N/A
182
+ OAS1_0198_MR1,F,R,21,,,,,1332,0.852,1.317,N/A
183
+ OAS1_0199_MR1,M,R,69,5,1,30,0,1601,0.784,1.096,N/A
184
+ OAS1_0200_MR1,F,R,60,2,4,30,0,1366,0.807,1.285,N/A
185
+ OAS1_0201_MR1,F,R,85,4,1,26,0,1460,0.754,1.202,N/A
186
+ OAS1_0202_MR1,F,R,23,,,,,1574,0.865,1.115,N/A
187
+ OAS1_0203_MR1,F,R,71,2,3,30,0,1360,0.779,1.291,N/A
188
+ OAS1_0204_MR1,M,R,48,4,1,29,0,1430,0.797,1.227,N/A
189
+ OAS1_0205_MR1,M,R,75,4,1,30,0.5,1891,0.716,0.928,N/A
190
+ OAS1_0206_MR1,F,R,78,5,1,30,0,1243,0.747,1.412,N/A
191
+ OAS1_0207_MR1,M,R,51,5,2,29,0,1714,0.819,1.024,N/A
192
+ OAS1_0208_MR1,F,R,55,5,1,29,0,1368,0.823,1.283,N/A
193
+ OAS1_0209_MR1,F,R,22,,,,,1328,0.842,1.321,N/A
194
+ OAS1_0210_MR1,F,R,73,4,1,28,0.5,1676,0.722,1.047,N/A
195
+ OAS1_0211_MR1,M,R,20,,,,,1657,0.849,1.059,N/A
196
+ OAS1_0212_MR1,F,R,74,3,,28,0,1614,0.697,1.087,N/A
197
+ OAS1_0213_MR1,F,R,48,,,,,1332,0.801,1.318,N/A
198
+ OAS1_0214_MR1,M,R,18,,,,,1854,0.87,0.947,N/A
199
+ OAS1_0216_MR1,F,R,71,4,3,30,0,1503,0.792,1.168,N/A
200
+ OAS1_0217_MR1,F,R,78,4,3,27,0.5,1393,0.692,1.26,N/A
201
+ OAS1_0218_MR1,F,R,26,,,,,1291,0.843,1.36,N/A
202
+ OAS1_0220_MR1,F,R,75,5,1,30,0,1317,0.742,1.332,N/A
203
+ OAS1_0221_MR1,F,R,94,5,1,29,0,1474,0.696,1.19,N/A
204
+ OAS1_0222_MR1,F,R,49,,,,,1164,0.805,1.508,N/A
205
+ OAS1_0223_MR1,M,R,84,2,,20,1,1641,0.703,1.07,N/A
206
+ OAS1_0224_MR1,F,R,22,,,,,1378,0.852,1.274,N/A
207
+ OAS1_0226_MR1,M,R,90,1,4,23,0.5,1668,0.644,1.052,N/A
208
+ OAS1_0227_MR1,F,R,26,,,,,1288,0.777,1.362,N/A
209
+ OAS1_0228_MR1,F,R,81,3,2,28,0,1486,0.759,1.181,N/A
210
+ OAS1_0229_MR1,F,R,55,3,3,30,0,1327,0.832,1.323,N/A
211
+ OAS1_0230_MR1,F,R,19,,,,,1584,0.846,1.108,N/A
212
+ OAS1_0231_MR1,F,R,20,,,,,1429,0.852,1.228,N/A
213
+ OAS1_0232_MR1,M,R,22,,,,,1582,0.857,1.11,N/A
214
+ OAS1_0233_MR1,F,R,77,1,4,20,0.5,1376,0.701,1.275,N/A
215
+ OAS1_0234_MR1,M,R,75,5,2,29,0,1534,0.771,1.144,N/A
216
+ OAS1_0235_MR1,M,R,37,,,,,1407,0.842,1.247,N/A
217
+ OAS1_0236_MR1,F,R,20,,,,,1218,0.876,1.441,N/A
218
+ OAS1_0237_MR1,F,R,72,2,2,27,0,1322,0.764,1.328,N/A
219
+ OAS1_0238_MR1,F,R,77,2,3,28,0.5,1484,0.786,1.182,N/A
220
+ OAS1_0239_MR1,F,R,29,,,,,1439,0.823,1.22,N/A
221
+ OAS1_0240_MR1,F,R,74,2,,26,0.5,1171,0.736,1.499,N/A
222
+ OAS1_0241_MR1,F,R,74,5,2,30,0,1400,0.754,1.254,N/A
223
+ OAS1_0243_MR1,M,R,64,5,2,22,0.5,1547,0.742,1.134,N/A
224
+ OAS1_0244_MR1,F,R,80,4,2,29,0,1341,0.737,1.309,N/A
225
+ OAS1_0246_MR1,F,R,22,,,,,1522,0.841,1.153,N/A
226
+ OAS1_0247_MR1,M,R,90,2,3,21,0.5,1307,0.689,1.342,N/A
227
+ OAS1_0249_MR1,F,R,28,,,,,1217,0.871,1.443,N/A
228
+ OAS1_0250_MR1,M,R,21,,,,,1500,0.837,1.17,N/A
229
+ OAS1_0253_MR1,F,R,20,,,,,1751,0.852,1.002,N/A
230
+ OAS1_0254_MR1,F,R,85,5,1,29,0,1264,0.705,1.388,N/A
231
+ OAS1_0255_MR1,F,R,71,5,1,30,0,1426,0.737,1.231,N/A
232
+ OAS1_0256_MR1,M,R,70,5,1,30,0,1660,0.739,1.057,N/A
233
+ OAS1_0258_MR1,F,R,21,,,,,1516,0.87,1.158,N/A
234
+ OAS1_0259_MR1,F,R,78,3,2,29,0,1334,0.773,1.316,N/A
235
+ OAS1_0260_MR1,M,R,87,2,4,30,0,1762,0.719,0.996,N/A
236
+ OAS1_0261_MR1,M,R,28,,,,,1417,0.845,1.238,N/A
237
+ OAS1_0262_MR1,M,R,46,2,3,30,0,1604,0.784,1.094,N/A
238
+ OAS1_0263_MR1,M,R,79,4,1,30,0.5,1722,0.709,1.019,N/A
239
+ OAS1_0264_MR1,M,R,24,,,,,1591,0.849,1.103,N/A
240
+ OAS1_0265_MR1,F,R,54,,,,,1410,0.813,1.245,N/A
241
+ OAS1_0266_MR1,M,R,51,5,1,30,0,1793,0.834,0.979,N/A
242
+ OAS1_0267_MR1,M,R,80,5,2,28,0.5,1506,0.679,1.166,N/A
243
+ OAS1_0268_MR1,M,R,78,2,3,23,1,1491,0.715,1.177,N/A
244
+ OAS1_0269_MR1,F,R,72,1,4,21,1,1489,0.683,1.179,N/A
245
+ OAS1_0270_MR1,F,R,93,3,2,30,0,1272,0.703,1.38,N/A
246
+ OAS1_0271_MR1,F,R,89,2,4,27,0,1329,0.74,1.32,N/A
247
+ OAS1_0272_MR1,F,R,75,3,2,26,0.5,1355,0.745,1.296,N/A
248
+ OAS1_0273_MR1,F,R,89,1,4,18,0.5,1480,0.676,1.186,N/A
249
+ OAS1_0274_MR1,F,R,58,3,3,30,0,1373,0.815,1.278,N/A
250
+ OAS1_0275_MR1,M,R,50,,,,,1635,0.802,1.073,N/A
251
+ OAS1_0277_MR1,M,R,22,,,,,1913,0.841,0.917,N/A
252
+ OAS1_0278_MR1,F,R,96,5,1,26,1,1465,0.684,1.198,N/A
253
+ OAS1_0279_MR1,F,R,73,1,4,30,0,1475,0.721,1.19,N/A
254
+ OAS1_0280_MR1,F,R,78,5,1,30,0,1440,0.67,1.219,N/A
255
+ OAS1_0281_MR1,M,R,28,,,,,1538,0.835,1.141,N/A
256
+ OAS1_0282_MR1,F,R,45,,,,,1478,0.819,1.188,N/A
257
+ OAS1_0283_MR1,F,R,18,,,,,1578,0.836,1.112,N/A
258
+ OAS1_0284_MR1,F,R,91,5,2,30,0,1714,0.746,1.024,N/A
259
+ OAS1_0285_MR1,M,R,20,,,,,1470,0.843,1.194,N/A
260
+ OAS1_0286_MR1,F,R,83,3,2,20,0.5,1476,0.751,1.189,N/A
261
+ OAS1_0287_MR1,F,R,78,3,3,21,0.5,1194,0.694,1.47,N/A
262
+ OAS1_0288_MR1,M,R,71,2,4,20,0.5,1461,0.727,1.202,N/A
263
+ OAS1_0289_MR1,F,R,59,3,2,28,0,1334,0.767,1.316,N/A
264
+ OAS1_0290_MR1,M,R,83,3,2,26,0.5,1992,0.706,0.881,N/A
265
+ OAS1_0291_MR1,F,R,73,2,2,19,1,1274,0.745,1.377,N/A
266
+ OAS1_0292_MR1,F,R,64,4,2,30,0,1415,0.766,1.24,N/A
267
+ OAS1_0293_MR1,F,R,69,1,2,26,0,1384,0.783,1.268,N/A
268
+ OAS1_0294_MR1,F,R,20,,,,,1439,0.841,1.22,N/A
269
+ OAS1_0295_MR1,F,R,20,,,,,1412,0.803,1.243,N/A
270
+ OAS1_0296_MR1,F,R,28,,,,,1428,0.869,1.229,N/A
271
+ OAS1_0298_MR1,F,R,72,4,3,24,0.5,1354,0.738,1.296,N/A
272
+ OAS1_0299_MR1,F,R,90,2,3,29,0,1475,0.671,1.19,N/A
273
+ OAS1_0300_MR1,M,R,68,3,2,30,0.5,1556,0.723,1.128,N/A
274
+ OAS1_0301_MR1,F,R,90,3,2,28,0,1495,0.761,1.174,N/A
275
+ OAS1_0302_MR1,M,R,22,,,,,1570,0.831,1.118,N/A
276
+ OAS1_0303_MR1,F,R,67,2,4,30,0,1221,0.831,1.438,N/A
277
+ OAS1_0304_MR1,M,R,84,3,3,29,0.5,1497,0.693,1.172,N/A
278
+ OAS1_0305_MR1,M,R,48,,,,,1454,0.85,1.207,N/A
279
+ OAS1_0307_MR1,M,R,67,4,2,23,0.5,1399,0.735,1.255,N/A
280
+ OAS1_0308_MR1,F,R,78,3,3,15,2,1401,0.703,1.253,N/A
281
+ OAS1_0309_MR1,F,R,54,2,2,30,0,1441,0.786,1.218,N/A
282
+ OAS1_0310_MR1,F,R,20,,,,,1388,0.863,1.265,N/A
283
+ OAS1_0311_MR1,F,R,22,,,,,1366,0.83,1.285,N/A
284
+ OAS1_0312_MR1,F,R,73,3,,26,0.5,1311,0.756,1.339,N/A
285
+ OAS1_0313_MR1,F,R,20,,,,,1516,0.838,1.158,N/A
286
+ OAS1_0314_MR1,M,R,27,,,,,1720,0.84,1.02,N/A
287
+ OAS1_0315_MR1,M,R,77,5,1,25,0.5,1604,0.773,1.094,N/A
288
+ OAS1_0316_MR1,F,R,72,4,2,22,1,1493,0.69,1.176,N/A
289
+ OAS1_0317_MR1,M,R,86,4,1,26,0,1501,0.702,1.169,N/A
290
+ OAS1_0318_MR1,M,R,33,,,,,1634,0.836,1.074,N/A
291
+ OAS1_0319_MR1,M,R,31,,,,,1527,0.821,1.149,N/A
292
+ OAS1_0321_MR1,M,R,19,,,,,1478,0.843,1.187,N/A
293
+ OAS1_0322_MR1,F,R,65,3,4,29,0,1335,0.776,1.315,N/A
294
+ OAS1_0323_MR1,F,R,50,5,1,30,0,1370,0.826,1.281,N/A
295
+ OAS1_0325_MR1,F,R,27,,,,,1422,0.869,1.234,N/A
296
+ OAS1_0326_MR1,F,R,73,3,4,29,0,1272,0.7,1.38,N/A
297
+ OAS1_0327_MR1,M,R,50,,,,,1740,0.794,1.008,N/A
298
+ OAS1_0328_MR1,M,R,19,,,,,1453,0.878,1.208,N/A
299
+ OAS1_0329_MR1,F,R,80,2,3,29,0.5,1209,0.76,1.451,N/A
300
+ OAS1_0330_MR1,F,R,80,1,5,27,0,1381,0.752,1.27,N/A
301
+ OAS1_0331_MR1,F,R,54,,,,,1467,0.821,1.196,N/A
302
+ OAS1_0332_MR1,M,R,72,1,3,29,0,1734,0.762,1.012,N/A
303
+ OAS1_0333_MR1,M,R,26,,,,,1607,0.85,1.092,N/A
304
+ OAS1_0335_MR1,F,R,80,1,4,27,0.5,1654,0.678,1.061,N/A
305
+ OAS1_0336_MR1,F,R,41,,,,,1528,0.852,1.149,N/A
306
+ OAS1_0337_MR1,M,R,81,1,4,28,0,1750,0.676,1.003,N/A
307
+ OAS1_0338_MR1,M,R,77,4,1,29,0,1818,0.736,0.965,N/A
308
+ OAS1_0339_MR1,F,R,79,2,,24,0.5,1211,0.694,1.449,N/A
309
+ OAS1_0340_MR1,M,R,19,,,,,1650,0.853,1.063,N/A
310
+ OAS1_0341_MR1,F,R,71,2,4,30,0,1479,0.772,1.187,N/A
311
+ OAS1_0342_MR1,F,R,88,2,3,28,0,1370,0.765,1.281,N/A
312
+ OAS1_0343_MR1,M,R,68,3,3,30,0,1441,0.811,1.217,N/A
313
+ OAS1_0344_MR1,M,R,20,,,,,1510,0.851,1.163,N/A
314
+ OAS1_0345_MR1,F,R,54,4,2,30,0,1389,0.831,1.264,N/A
315
+ OAS1_0346_MR1,M,R,23,,,,,1485,0.843,1.181,N/A
316
+ OAS1_0348_MR1,F,R,22,,,,,1473,0.841,1.191,N/A
317
+ OAS1_0349_MR1,F,R,43,,,,,1227,0.858,1.43,N/A
318
+ OAS1_0350_MR1,M,R,21,,,,,1577,0.869,1.113,N/A
319
+ OAS1_0351_MR1,M,R,86,1,4,15,2,1512,0.665,1.161,N/A
320
+ OAS1_0352_MR1,F,R,81,5,2,26,0.5,1174,0.743,1.495,N/A
321
+ OAS1_0353_MR1,M,R,22,,,,,1680,0.8,1.044,N/A
322
+ OAS1_0354_MR1,M,R,74,1,3,26,0,1367,0.776,1.284,N/A
323
+ OAS1_0355_MR1,F,R,73,4,2,29,0,1123,0.79,1.563,N/A
324
+ OAS1_0356_MR1,F,R,68,3,2,30,0,1506,0.74,1.165,N/A
325
+ OAS1_0357_MR1,F,R,55,4,3,30,0,1450,0.82,1.21,N/A
326
+ OAS1_0358_MR1,M,R,65,3,3,29,0,1362,0.839,1.289,N/A
327
+ OAS1_0359_MR1,M,R,21,,,,,1714,0.864,1.024,N/A
328
+ OAS1_0361_MR1,M,R,20,,,,,1485,0.842,1.182,N/A
329
+ OAS1_0362_MR1,F,R,63,3,,14,0.5,1439,0.716,1.219,N/A
330
+ OAS1_0363_MR1,M,R,87,4,2,30,0,1398,0.702,1.255,N/A
331
+ OAS1_0365_MR1,M,R,74,5,2,30,0,1806,0.754,0.972,N/A
332
+ OAS1_0366_MR1,F,R,45,5,2,29,0,1549,0.813,1.133,N/A
333
+ OAS1_0367_MR1,F,R,46,2,2,28,0,1161,0.841,1.511,N/A
334
+ OAS1_0368_MR1,M,R,22,,,,,1572,0.856,1.116,N/A
335
+ OAS1_0369_MR1,F,R,73,4,1,28,0,1295,0.772,1.356,N/A
336
+ OAS1_0370_MR1,M,R,23,,,,,1734,0.847,1.012,N/A
337
+ OAS1_0371_MR1,F,R,70,3,4,30,0,1361,0.783,1.29,N/A
338
+ OAS1_0372_MR1,M,R,59,3,2,29,0,1596,0.817,1.1,N/A
339
+ OAS1_0373_MR1,F,R,80,3,2,20,1,1732,0.692,1.013,N/A
340
+ OAS1_0374_MR1,F,R,73,3,3,29,0.5,1238,0.76,1.418,N/A
341
+ OAS1_0375_MR1,M,R,46,,,,,1617,0.775,1.086,N/A
342
+ OAS1_0376_MR1,M,R,31,,,,,1579,0.817,1.111,N/A
343
+ OAS1_0377_MR1,M,R,25,,,,,1567,0.831,1.12,N/A
344
+ OAS1_0378_MR1,F,R,58,2,2,30,0,1418,0.821,1.238,N/A
345
+ OAS1_0379_MR1,F,R,20,,,,,1255,0.866,1.398,N/A
346
+ OAS1_0380_MR1,F,R,83,1,5,18,0.5,1313,0.705,1.337,N/A
347
+ OAS1_0381_MR1,M,R,59,4,2,29,0,1795,0.809,0.978,N/A
348
+ OAS1_0382_MR1,F,R,67,4,,15,1,1288,0.763,1.362,N/A
349
+ OAS1_0383_MR1,M,R,58,,,,,1590,0.746,1.104,N/A
350
+ OAS1_0384_MR1,F,R,38,,,,,1562,0.844,1.123,N/A
351
+ OAS1_0385_MR1,M,R,22,,,,,1643,0.841,1.068,N/A
352
+ OAS1_0386_MR1,F,R,26,,,,,1490,0.838,1.178,N/A
353
+ OAS1_0387_MR1,F,R,26,,,,,1149,0.851,1.527,N/A
354
+ OAS1_0388_MR1,F,R,77,2,4,22,1,1350,0.736,1.3,N/A
355
+ OAS1_0389_MR1,M,R,55,,,,,1678,0.782,1.046,N/A
356
+ OAS1_0390_MR1,M,R,69,2,2,24,0.5,1480,0.794,1.186,N/A
357
+ OAS1_0392_MR1,F,R,24,,,,,1441,0.848,1.218,N/A
358
+ OAS1_0394_MR1,F,R,22,,,,,1343,0.847,1.307,N/A
359
+ OAS1_0395_MR1,F,R,26,,,,,1295,0.834,1.356,N/A
360
+ OAS1_0396_MR1,M,R,25,,,,,1674,0.832,1.048,N/A
361
+ OAS1_0397_MR1,F,R,20,,,,,1265,0.846,1.387,N/A
362
+ OAS1_0398_MR1,M,R,71,5,1,30,0,1769,0.716,0.992,N/A
363
+ OAS1_0399_MR1,M,R,78,2,,29,1,1569,0.706,1.119,N/A
364
+ OAS1_0400_MR1,F,R,92,5,1,25,0.5,1774,0.644,0.989,N/A
365
+ OAS1_0401_MR1,F,R,54,4,3,29,0,1287,0.827,1.364,N/A
366
+ OAS1_0402_MR1,F,R,76,3,2,30,0.5,1350,0.763,1.3,N/A
367
+ OAS1_0403_MR1,M,R,19,,,,,1592,0.833,1.102,N/A
368
+ OAS1_0404_MR1,F,R,73,2,2,29,0,1465,0.776,1.198,N/A
369
+ OAS1_0405_MR1,M,R,77,5,1,23,1,1713,0.761,1.024,N/A
370
+ OAS1_0406_MR1,F,R,25,,,,,1346,0.855,1.303,N/A
371
+ OAS1_0407_MR1,F,R,55,,,,,1434,0.807,1.224,N/A
372
+ OAS1_0408_MR1,F,R,22,,,,,1518,0.861,1.156,N/A
373
+ OAS1_0409_MR1,M,R,34,,,,,1569,0.798,1.118,N/A
374
+ OAS1_0410_MR1,F,R,23,,,,,1507,0.87,1.165,N/A
375
+ OAS1_0411_MR1,F,R,71,5,1,29,0.5,1346,0.742,1.304,N/A
376
+ OAS1_0413_MR1,F,R,25,,,,,1447,0.866,1.213,N/A
377
+ OAS1_0415_MR1,F,R,21,,,,,1542,0.859,1.138,N/A
378
+ OAS1_0416_MR1,F,R,23,,,,,1567,0.852,1.12,N/A
379
+ OAS1_0417_MR1,F,R,30,,,,,1551,0.855,1.132,N/A
380
+ OAS1_0418_MR1,M,R,74,5,2,28,0.5,1659,0.747,1.058,N/A
381
+ OAS1_0419_MR1,F,R,21,,,,,1473,0.862,1.191,N/A
382
+ OAS1_0420_MR1,F,R,22,,,,,1732,0.848,1.013,N/A
383
+ OAS1_0421_MR1,F,R,22,,,,,1655,0.847,1.061,N/A
384
+ OAS1_0422_MR1,F,R,69,4,3,29,0,1380,0.809,1.272,N/A
385
+ OAS1_0423_MR1,M,R,75,2,4,28,0,1511,0.749,1.162,N/A
386
+ OAS1_0424_MR1,M,R,75,4,1,20,1,1613,0.715,1.088,N/A
387
+ OAS1_0425_MR1,F,R,78,1,4,23,1,1461,0.715,1.201,N/A
388
+ OAS1_0426_MR1,F,R,82,5,2,29,0,1316,0.791,1.334,N/A
389
+ OAS1_0428_MR1,F,R,84,4,3,28,0,1500,0.751,1.17,N/A
390
+ OAS1_0429_MR1,F,R,45,,,,,1385,0.808,1.267,N/A
391
+ OAS1_0430_MR1,M,R,71,4,1,17,1,1562,0.687,1.123,N/A
392
+ OAS1_0431_MR1,F,R,22,,,,,1405,0.822,1.249,N/A
393
+ OAS1_0432_MR1,F,R,72,2,4,26,0.5,1453,0.773,1.208,N/A
394
+ OAS1_0433_MR1,M,R,58,4,1,27,0,1606,0.779,1.093,N/A
395
+ OAS1_0434_MR1,F,R,50,,,,,1385,0.819,1.267,N/A
396
+ OAS1_0435_MR1,M,R,23,,,,,1766,0.82,0.994,N/A
397
+ OAS1_0437_MR1,F,R,22,,,,,1444,0.853,1.216,N/A
398
+ OAS1_0438_MR1,F,R,66,5,2,29,0,1191,0.787,1.474,N/A
399
+ OAS1_0439_MR1,M,R,21,,,,,1438,0.844,1.221,N/A
400
+ OAS1_0440_MR1,M,R,86,1,4,27,0.5,1320,0.723,1.329,N/A
401
+ OAS1_0441_MR1,M,R,81,5,1,29,0.5,1647,0.721,1.066,N/A
402
+ OAS1_0442_MR1,F,R,23,,,,,1431,0.847,1.227,N/A
403
+ OAS1_0443_MR1,F,R,52,3,3,30,0,1431,0.814,1.226,N/A
404
+ OAS1_0444_MR1,F,R,30,,,,,1250,0.86,1.404,N/A
405
+ OAS1_0445_MR1,F,R,90,1,2,29,0,1362,0.673,1.289,N/A
406
+ OAS1_0446_MR1,F,R,80,2,4,30,0,1390,0.748,1.263,N/A
407
+ OAS1_0447_MR1,F,R,92,4,1,24,0.5,1388,0.739,1.264,N/A
408
+ OAS1_0448_MR1,F,R,22,,,,,1524,0.858,1.152,N/A
409
+ OAS1_0449_MR1,F,R,71,3,4,29,0,1264,0.818,1.388,N/A
410
+ OAS1_0450_MR1,M,R,19,,,,,1478,0.88,1.188,N/A
411
+ OAS1_0451_MR1,M,R,73,5,3,27,0.5,1687,0.728,1.04,N/A
412
+ OAS1_0452_MR1,M,R,75,1,4,22,1,1656,0.762,1.06,N/A
413
+ OAS1_0453_MR1,F,R,70,1,4,29,0.5,1295,0.748,1.355,N/A
414
+ OAS1_0454_MR1,F,R,73,3,2,23,0.5,1536,0.73,1.142,N/A
415
+ OAS1_0455_MR1,F,R,61,2,4,28,0,1354,0.825,1.297,N/A
416
+ OAS1_0456_MR1,M,R,61,5,2,30,0,1637,0.78,1.072,N/A
417
+ OAS1_0457_MR1,F,R,62,3,3,26,0,1372,0.766,1.279,N/A
418
+ OAS1_0061_MR2,F,R,20,,,,,1757,0.845,0.999,1
419
+ OAS1_0080_MR2,F,R,25,,,,,1605,0.841,1.093,20
420
+ OAS1_0092_MR2,M,R,22,,,,,1457,0.835,1.205,5
421
+ OAS1_0101_MR2,M,R,29,,,,,1501,0.835,1.169,64
422
+ OAS1_0111_MR2,M,R,23,,,,,1714,0.861,1.024,2
423
+ OAS1_0117_MR2,M,R,25,,,,,1753,0.782,1.001,5
424
+ OAS1_0145_MR2,M,R,34,,,,,1654,0.832,1.061,10
425
+ OAS1_0150_MR2,F,R,20,,,,,1506,0.876,1.165,1
426
+ OAS1_0156_MR2,F,R,20,,,,,1577,0.832,1.113,12
427
+ OAS1_0191_MR2,F,R,21,,,,,1416,0.837,1.239,28
428
+ OAS1_0202_MR2,F,R,23,,,,,1548,0.861,1.134,21
429
+ OAS1_0230_MR2,F,R,19,,,,,1577,0.849,1.113,24
430
+ OAS1_0236_MR2,F,R,20,,,,,1222,0.872,1.436,3
431
+ OAS1_0239_MR2,F,R,29,,,,,1438,0.822,1.221,40
432
+ OAS1_0249_MR2,F,R,28,,,,,1215,0.865,1.444,3
433
+ OAS1_0285_MR2,M,R,20,,,,,1469,0.847,1.195,2
434
+ OAS1_0353_MR2,M,R,22,,,,,1684,0.79,1.042,40
435
+ OAS1_0368_MR2,M,R,22,,,,,1580,0.856,1.111,89
436
+ OAS1_0379_MR2,F,R,20,,,,,1262,0.861,1.39,2
437
+ OAS1_0395_MR2,F,R,26,,,,,1283,0.834,1.368,39
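A minimal sketch of loading the cross-sectional table above with pandas, presumably the way the dataclean_OASIS_* scripts in this folder consume it (the relative path is an assumption):

import pandas as pd

cs = pd.read_csv('OAISIS_clean/oasis_cross-sectional-5708aa0a98d82080.csv')
# Blank Educ/SES/MMSE/CDR fields (the young controls) and the literal 'N/A'
# in the Delay column are both read as NaN by pandas' default na_values.
clinical = cs.dropna(subset=['CDR'])  # keep only subjects with a CDR score
print(len(cs), len(clinical))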
OAISIS_clean/util.py ADDED
@@ -0,0 +1,410 @@
1
+ import os
2
+ import json
3
+ import SimpleITK as sitk
4
+ import glob
5
+ import pandas as pd
6
+
7
+ def load_dicom_images(folder_path):
8
+ reader = sitk.ImageSeriesReader()
9
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
10
+ reader.SetFileNames(dicom_names)
11
+ image = reader.Execute()
12
+ return image
13
+
14
+ def convert_windows_to_linux_path(windows_path):
15
+ # Replace backslashes with forward slashes and remove the drive letter
16
+ # Some meta files have windows paths, but the data is stored on a linux server
17
+ linux_path = windows_path.replace('\\', '/')
18
+ if ':' in linux_path:
19
+ linux_path = linux_path.split(':', 1)[1]
20
+ return linux_path
21
+
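For illustration, convert_windows_to_linux_path just swaps the slashes and drops the drive prefix; the path below is a made-up example:

win_path = r'D:\data\OASIS\disc1\OAS1_0001_MR1'  # hypothetical Windows path
print(convert_windows_to_linux_path(win_path))
# -> /data/OASIS/disc1/OAS1_0001_MR1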
22
+ # =============================================================================
23
+ # ========================developed with TotalSegmentor========================
24
+ # =============================================================================
25
+
26
+ def read_table(file_path, split_str=';'):
27
+ try:
28
+ df = pd.read_excel(file_path, engine='openpyxl')
29
+ except Exception:  # fall back to CSV if the file is not an Excel workbook
30
+ df = pd.read_csv(file_path, sep=split_str)
31
+ return df
32
+
33
+ def load_nifti(image_path):
34
+ return sitk.ReadImage(image_path)
35
+
36
+ def save_nifti(image, output_path, folder_path):
37
+ output_dirpath = os.path.dirname(output_path)
38
+ if not os.path.exists(output_dirpath):
39
+ print(f"Creating directory {output_dirpath}")
40
+ os.makedirs(output_dirpath)
41
+ # Set metadata in the NIfTI file's header
42
+ image.SetMetaData("FolderPath", folder_path)
43
+ sitk.WriteImage(image, output_path)
44
+
45
+ def find_metadata_files(path, file_name='*meta*'):
46
+ # for TotalSegmentor dataset
47
+ search_pattern = os.path.join(path, '**', file_name)
48
+ return glob.glob(search_pattern, recursive=True)
49
+
50
+ def get_img_path_from_folder(folder_path, img_type='.nii.gz', include_str=None, exclude_str='segmentation', is_sorted=True):
51
+ img_path = []
52
+ for root, dirs, files in os.walk(folder_path):
53
+ for file in files:
54
+ if file.endswith(img_type) and (include_str is None or include_str in file) and (exclude_str is None or exclude_str not in file):
55
+ img_path.append(os.path.join(root, file))
56
+ if is_sorted:
57
+ img_path.sort()
58
+ return img_path
59
+
60
+ def get_unisize_resampler(ref_img, interpolator='linear', spacing=None, size=None):
61
+ '''
62
+ Resample the image to have isotropic spacing, following the steps:
63
+ 1. Find the minimum spacing
64
+ 2. Resample the image to have the minimum spacing
65
+ 3. Set the interpolator (linear for images, nearest for segmentation masks)
66
+ 4. Set the output spacing
67
+ 5. Return the resampler for resampling
68
+ For example, if the input image has spacing [0.1, 0.1, 0.3], the output image will have spacing [0.1, 0.1, 0.1]
69
+ '''
70
+ # Discuss why this function was rewritten!!!
71
+ if size is None:
72
+ size = ref_img.GetSize()
73
+ if spacing is None:
74
+ spacing = ref_img.GetSpacing()
75
+ min_spacing = min(spacing)
76
+ if all([spc == min_spacing for spc in spacing]):
77
+ return None
78
+ else:
79
+ # if 1:
80
+ if interpolator == 'nearest':
81
+ interpolator = sitk.sitkNearestNeighbor
82
+ elif interpolator == 'linear':
83
+ interpolator = sitk.sitkLinear
84
+ resampler = sitk.ResampleImageFilter()
85
+ # new_spacing = [max_spacing] * len(spacing)
86
+ # print(size)
87
+ new_size = [int(round(old_sz * old_spc / min_spacing)) for old_sz, old_spc in zip(size, spacing)]
88
+ new_size_xy=[new_size[0],new_size[1],new_size[2]]
89
+ # Discuss why this function was rewritten!!! --- YHM Jachin
90
+ new_size_spacing=[min_spacing,min_spacing,min_spacing]
91
+ # Discuss why this function was rewritten!!! --- YHM Jachin
92
+ # resampler.SetSize(new_size)
93
+ # resampler.SetOutputSpacing([min_spacing] * len(spacing))
94
+ resampler.SetSize(new_size_xy)
95
+ resampler.SetOutputSpacing(new_size_spacing)
96
+
97
+ # print(new_size,new_size_xy)
98
+ resampler.SetOutputOrigin(ref_img.GetOrigin())
99
+ resampler.SetOutputDirection(ref_img.GetDirection())
100
+ resampler.SetInterpolator(interpolator)
101
+ resampler.SetDefaultPixelValue(ref_img.GetPixelIDValue())
102
+ resampler.SetOutputPixelType(ref_img.GetPixelID())
103
+ return resampler
104
+
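A short usage sketch for the resampler above ('scan.nii.gz' is a hypothetical file name):

img = load_nifti('scan.nii.gz')
resampler = get_unisize_resampler(img, interpolator='linear')
if resampler is not None:  # None means the spacing is already isotropic
    img = resampler.Execute(img)
print(img.GetSpacing())  # all axes now at the minimum of the original spacings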
105
+ def clamp_image(in_img,clamp_range):
106
+ '''
107
+ Clamp the image to the specified range
108
+ '''
109
+ clamp_filter = sitk.ClampImageFilter()
110
+ clamp_filter.SetLowerBound(clamp_range[0])
111
+ clamp_filter.SetUpperBound(clamp_range[1])
112
+ return clamp_filter.Execute(in_img)
113
+
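For example, to clip CT intensities to a fixed HU window before normalization (file name and window values are assumptions, not taken from the scripts):

ct = load_nifti('ct_scan.nii.gz')      # hypothetical CT volume
ct = clamp_image(ct, (-1000, 1000))    # clip to a plausible HU window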
114
+ def get_synonyms_dict(dict_type='ROI'):
115
+ '''
116
+ Get the dictionary of synonyms for the specified dictionary type
117
+ '''
118
+ if dict_type == 'ROI':
119
+ dict_synonyms = {
120
+ 'whole-body': ['whole-body', 'whole body', 'wholebody', 'whole body', 'whole-body', 'whole body', 'wholebody','polytrauma','head-neck-thorax-abdomen-pelvis-leg','head-neck-thorax-abdomen-pelvis'],
121
+ 'neck-thorax-abdomen-pelvis-leg': ['neck-thorax-abdomen-pelvis-leg','neck-thx-abd-pelvis-leg', 'angiography neck-thx-abd-pelvis-leg', 'neck thorax abdomen pelvis leg', 'neck and thorax and abdomen and pelvis and leg', 'neck, thorax, abdomen, pelvis & leg', 'neck/thorax/abdomen/pelvis/leg', 'neck, thorax, abdomen, pelvis and leg', 'neck thorax abdomen pelvis leg'],
122
+ 'neck-thorax-abdomen-pelvis': ['neck-thorax-abdomen-pelvis', 'neck-thx-abd-pelvis', 'neck thorax abdomen pelvis', 'neck and thorax and abdomen and pelvis', 'neck, thorax, abdomen & pelvis', 'neck/thorax/abdomen/pelvis', 'neck, thorax, abdomen and pelvis', 'neck thorax abdomen & pelvis'],
123
+ 'thorax-abdomen-pelvis-leg': ['thorax-abdomen-pelvis-leg','thx-abd-pelvis-leg', 'angiography thx-abd-pelvis-leg', 'thorax abdomen pelvis leg', 'thorax and abdomen and pelvis and leg', 'thorax, abdomen, pelvis & leg', 'thorax/abdomen/pelvis/leg', 'thorax, abdomen, pelvis and leg', 'thorax abdomen pelvis leg'],
124
+ 'neck-thorax-abdomen': ['neck-thorax-abdomen', 'neck-thorax-abdomen', 'neck thorax abdomen', 'neck and thorax and abdomen', 'neck, thorax, abdomen', 'neck/thorax/abdomen', 'neck, thorax, abdomen', 'neck thorax abdomen'],
125
+ 'head-neck-thorax-abdomen': ['head-neck-thorax-abdomen', 'head-neck-thorax-abdomen', 'head neck thorax abdomen', 'head and neck and thorax and abdomen', 'head, neck, thorax, abdomen', 'head/thorax/abdomen', 'head, thorax, abdomen', 'head thorax abdomen'],
126
+ 'head-neck-thorax': ['head-neck-thorax', 'head neck thorax', 'head and neck and thorax', 'head, neck, thorax', 'head/thorax', 'head, thorax', 'head thorax'],
127
+ 'thorax-abdomen-pelvis': ['thorax-abdomen-pelvis', 'thx-abd-pelvis', 'polytrauma', 'thorax abdomen pelvis', 'thorax and abdomen and pelvis', 'thorax, abdomen & pelvis', 'thorax/abdomen/pelvis', 'thorax, abdomen and pelvis', 'thorax abdomen & pelvis'],
128
+ 'abdomen-pelvis-leg': ['abdomen-pelvis-leg', 'angiography abdomen-pelvis-leg', 'abd-pelvis-leg', 'abdomen pelvis leg', 'abdomen and pelvis and leg', 'abdomen, pelvis & leg', 'abdomen/pelvis/leg', 'abdomen, pelvis, leg', 'abdomen pelvis leg'],
129
+ 'neck-thorax': ['neck-thorax', 'neck thorax', 'neck and thorax', 'neck, thorax', 'thorax-neck', 'thorax neck', 'thorax and neck', 'thorax, neck','thorax/neck'],
130
+ 'thorax-abdomen': ['thorax-abdomen', 'thorax abdomen', 'thorax and abdomen', 'thorax, abdomen'],
131
+ 'abdomen-pelvis': ['abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis', 'abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis'],
132
+ 'pelvis-leg': ['pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg', 'pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg'],
133
+ 'head-neck': ['head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck', 'head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck'],
134
+ 'abdomen': ['abdomen', 'abdominal', 'belly', 'stomach', 'tummy', 'gut', 'guts', 'viscera', 'bowels', 'intestines', 'gastrointestinal', 'digestive', 'peritoneum','gastric', 'liver', 'spleen', 'pancreas','kidney','lumbar','renal','hepatic','splenic','pancreatic','intervention'],
135
+ 'thorax': ['chest', 'thorax', 'breast', 'lung', 'heart','heart-thorakale aorta', 'heart-thorakale', 'mediastinum', 'pleura', 'bronchus', 'bronchi', 'trachea', 'esophagus', 'diaphragm', 'rib', 'sternum', 'clavicle', 'scapula', 'axilla', 'armpit','breast biopsy','thoracic','mammary','caeiothoracic','mediastinal','pleural','bronchial','bronchial tree','tracheal','esophageal','diaphragmatic','costal','sternal','clavicular','scapular','axillary','axillar','cardiac','pericardial','pericardiac','pericardium'],
136
+ 'head': ['head', 'headbasis', 'brain', 'skull', 'face','nose','ear','eye','mouth','jaw','cheek','chin','forehead','temporal','parietal','occipital','frontal','mandible','maxilla','mandibular','maxillary','nasal','orbital','orbita','ocular','auricular','otic','oral','buccal','labial','lingual','palatal'],
137
+ 'neck': ['neck', 'throat', 'cervical', 'thyroid', 'trachea', 'larynx', 'pharynx', 'esophagus','pharyngeal','laryngeal','cervical','thyroid','trachea','esophagus','carotid','jugular'],
138
+ 'hand': ['hand', 'finger', 'thumb', 'palm', 'wrist', 'knuckle', 'fingernail', 'phalanx', 'metacarpal', 'carpal', 'radius'],
139
+ 'arm': ['arm', 'forearm', 'upper arm', 'bicep', 'tricep', 'brachium', 'brachial', 'humerus', 'radius', 'ulna', 'elbow', 'shoulder', 'armpit', 'clavicle', 'scapula', 'acromion', 'acromioclavicular'],
140
+ 'leg': ['leg', 'felsenleg','thigh', 'calf', 'shin', 'knee', 'foot', 'ankle', 'toe', 'heel', 'sole', 'arch', 'instep', 'metatarsal', 'phalanx', 'tibia', 'fibula', 'femur', 'patella', 'kneecap','achilles tendon','achilles'],
141
+ 'pelvis': ['pelvis', 'hip', 'groin', 'buttock', 'gluteus', 'gluteal', 'ischium', 'pubis', 'sacrum', 'coccyx', 'acetabulum', 'iliac', 'iliac crest', 'iliac spine', 'iliac wing', 'sacroiliac', 'sacroiliac joint', 'sacroiliac ligament', 'sacroiliac spine', 'ureter', 'bladder', 'urethra', 'prostate', 'testicle', 'ovary', 'uterus',],
142
+ 'skeleton': ['skeleton','bone','spine', 'back', 'vertebra', 'sacrum', 'coccyx'],
143
+ }
144
+ elif dict_type == 'Label_tissue':
145
+ dict_synonyms = {
146
+ 'liver': ['liver','hepatic'],
147
+ 'spleen': ['spleen','splenic'],
148
+ 'kidney': ['kidney','renal'],
149
+ 'pancreas': ['pancreas','pancreatic'],
150
+ 'stomach': ['stomach','gastric'],
151
+ 'intestine': ['large intestine', 'small intestine','large bowel','small bowel'],
152
+ 'gallbladder': ['gallbladder'],
153
+ 'adrenal_gland': ['adrenal_gland','adrenal gland'],
154
+ 'bladder': ['bladder'],
155
+ 'prostate': ['prostate'],
156
+ 'uterus': ['uterus'],
157
+ 'ovary': ['ovary'],
158
+ 'testicle': ['testicle'],
159
+ 'lymph_node': ['lymph_node','lymph node'],
160
+ 'bone': ['bone'],
161
+ 'lung': ['lung'],
162
+ 'heart': ['heart'],
163
+ 'esophagus': ['esophagus'],
164
+ 'muscle': ['muscle'],
165
+ 'fat': ['fat'],
166
+ 'skin': ['skin'],
167
+ 'vessel': ['vessel'],
168
+ 'tumor': ['tumor'],
169
+ 'other': ['other']
170
+ }
171
+ elif dict_type == 'Task':
172
+ dict_synonyms = {
173
+ 'segmentation': ['segmentation', 'seg', 'mask'],
174
+ 'classification': ['classification', 'class', 'diagnosis','identify','identification'],
175
+ 'localization': ['localization', 'locate', 'location', 'position'],
176
+ 'registration': ['registration', 'register', 'align', 'alignment'],
177
+ 'detection': ['detection', 'detect', 'find', 'locate'],
178
+ 'quantification': ['quantification', 'quantify', 'measure', 'measurement'],
179
+ }
180
+ elif dict_type == 'Modality':
181
+ dict_synonyms = {
182
+ 'CT': ['CT', 'computed tomography'],
183
+ 'MRI': ['MRI', 'MR', 'magnetic resonance imaging'],
184
+ 'PET': ['PET', 'positron emission tomography'],
185
+ 'US': ['US', 'ultrasound'],
186
+ 'X-ray': ['X-ray', 'radiography'],
187
+ 'SPECT': ['SPECT', 'single-photon emission computed tomography'],
188
+ }
189
+ else:
190
+ raise ValueError(f"dict_type {dict_type} is not valid")
191
+ return dict_synonyms
192
+
193
+ def replace_synonyms(text, dict_synonyms):
194
+ '''
195
+ Replace the synonyms in the text with the standard term
196
+ '''
197
+ if isinstance(text, str):
198
+ for key, value in dict_synonyms.items():
199
+ for v in value:
200
+ if v.lower() in text.lower():
201
+ return key
202
+ # no synonym matched: warn and return the text unchanged
203
+ print(f"Warning: value {text} does not match any known synonym")
204
+ return text
205
+ elif isinstance(text, list):
206
+ text = [replace_synonyms(t, dict_synonyms) for t in text]
207
+ elif isinstance(text, dict):
208
+ # replace values in dict
209
+ text = {k: replace_synonyms(v, dict_synonyms) for k, v in text.items()}
210
+ # replace keys in dict with their standard terms
211
+ text = {replace_synonyms(k, dict_synonyms): v for k, v in text.items()}
212
+ return text
213
+
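A quick check of the normalization above using the ROI dictionary; both expected outputs were traced against the synonym lists in get_synonyms_dict:

roi_map = get_synonyms_dict('ROI')
print(replace_synonyms('Thorax/Abdomen/Pelvis', roi_map))  # -> 'thorax-abdomen-pelvis'
print(replace_synonyms('liver', roi_map))  # -> 'abdomen' (organ terms map to a body region)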
214
+ # =============================================================================
215
+
216
+ class meta_data(object):
217
+ '''
218
+ This class is used to store the metadata of the dataset
219
+ '''
220
+ def __init__(self):
221
+ self.config_format_path = os.path.join(os.path.dirname(__file__),'config_format.json')
222
+ with open(self.config_format_path, 'r') as file:
223
+ self.config_format = json.load(file)
224
+ self.config = {}
225
+ for key in self.config_format.keys():
226
+ if self.config_format[key]['required'] == True:
227
+ self.config[key] = {}
228
+ self.keytypes = self.find_all_keys_with_type()
229
+ self.keytypes_flatten = self.flatten_json()
230
+ self.ambiguity_keys = ['ROI', 'Label_tissue', 'Task', 'Modality']
231
+ for key in self.ambiguity_keys:
232
+ ambiguity_dict = get_synonyms_dict(key)
233
+ self.config_format[key]['options'] = list(ambiguity_dict.keys())
234
+
235
+ def get_keytypes(self):
236
+ return self.keytypes
237
+
238
+ def get_keytypes_flatten(self):
239
+ return self.keytypes_flatten
240
+
241
+ def find_all_keys_with_type(self, data=None, parent_key=''):
242
+ if data is None:
243
+ data = self.config_format
244
+ keys_with_type = {}
245
+ if isinstance(data, dict):
246
+ for key, value in data.items():
247
+ full_key = f"{parent_key}.{key}" if parent_key else key
248
+ if isinstance(value, dict) and 'type' in value:
249
+ keys_with_type[full_key] = value['type']
250
+ keys_with_type.update(self.find_all_keys_with_type(value, full_key))
251
+ elif isinstance(data, list):
252
+ for index, item in enumerate(data):
253
+ full_key = f"{parent_key}[{index}]"
254
+ keys_with_type.update(self.find_all_keys_with_type(item, full_key))
255
+ return keys_with_type
256
+
257
+ def flatten_json(self, data=None, parent_key='', sep='.'):
258
+ if data is None:
259
+ data = self.config_format
260
+ items = {}
261
+ if isinstance(data, dict):
262
+ for key, value in data.items():
263
+ new_key = f"{parent_key}{sep}{key}" if parent_key else key
264
+ if isinstance(value, dict):
265
+ items.update(self.flatten_json(value, new_key, sep=sep))
266
+ elif isinstance(value, list):
267
+ for i, item in enumerate(value):
268
+ items.update(self.flatten_json(item, f"{new_key}[{i}]", sep=sep))
269
+ else:
270
+ items[new_key] = value
271
+ elif isinstance(data, list):
272
+ for i, item in enumerate(data):
273
+ items.update(self.flatten_json(item, f"{parent_key}[{i}]", sep=sep))
274
+ return items
275
+
276
+ def req_check(self):
277
+ self.unfilled_keys = []
278
+ for key in self.config.keys():
279
+ if self.config[key] == {}:
280
+ self.unfilled_keys.append(key)
281
+ if len(self.unfilled_keys) == 0:
282
+ return True
283
+ else:
284
+ return False
285
+
286
+ def type_check(self, key, value):
287
+ if key not in self.config_format.keys():
288
+ print(key, "is not a valid key")
289
+ return False
290
+
291
+ if key == 'Modality':
292
+ if value not in self.config_format[key]['options']:
293
+ return False
294
+ else:
295
+ return True
296
+
297
+ elif key == 'OriImg_path':
298
+ if isinstance(value, str):
299
+ return True
300
+ else:
301
+ return False
302
+
303
+ elif key == 'Label_path' and isinstance(value, dict):
304
+ for skey in value.keys():
305
+ if skey in self.config_format[key]['keys']:
306
+ for kk in value[skey]:
307
+ if isinstance(value[skey][kk],str):
308
+ pass
309
+ # if kk in self.config_format[key]['value']['keys']:
310
+ # if isinstance(value[skey][kk],str):
311
+ # pass
312
+ # else:
313
+ # return False
314
+ else:
315
+ return False
316
+ return True
317
+
318
+ elif key == 'ROI':
319
+ if value not in self.config_format[key]['options']:
320
+ return False
321
+ else:
322
+ return True
323
+
324
+ elif key == 'Label_tissue' and isinstance(value, list):
325
+ for i in value:
326
+ if i not in self.config_format[key]['items']['options']:
327
+ return False
328
+ return True
329
+
330
+ elif key =='Task' and isinstance(value, list):
331
+ for i in value:
332
+ if i not in self.config_format[key]['items']['options']:
333
+ return False
334
+ return True
335
+
336
+ elif key == 'Spacing_mm':
337
+ if isinstance(value, float):
338
+ return True
339
+ else:
340
+ return False
341
+
342
+ # elif key == 'Size' and isinstance(value, list) and len(value) == 3 :
343
+ elif key == 'Size' and isinstance(value, list) and len(value) >= 3 :
344
+ return all(isinstance(item, int) for item in value)
345
+
346
+ elif key == 'Dataset_name':
347
+ if isinstance(value, str):
348
+ return True
349
+ else:
350
+ return False
351
+ ##added by yanguoiqng on 2025-08-08
352
+ elif key == 'Sub_modality':
353
+
354
+ if isinstance(value, dict):
355
+ return True
356
+ else:
357
+ return False
358
+ elif key == 'Label_Dict':
359
+
360
+ if isinstance(value, dict):
361
+ return True
362
+ else:
363
+ return False
364
+ def add_extra_keyvalue(self, key, value):
365
+ self.config[key] = value
366
+ return True
367
+
368
+ def add_keyvalue(self, key, value):
369
+ if key in self.ambiguity_keys:
370
+ value = replace_synonyms(value, get_synonyms_dict(key))
371
+ # print(key, value)
372
+ if self.type_check(key, value):
373
+
374
+ self.config[key] = value
375
+ return True
376
+ else:
377
+ # invalid values are not stored, so req_check() can flag missing keys later
378
+ print(f"Warning: value {value} is not in the correct format for key {key}")
379
+ return False
380
+
381
+ def get_meta_data(self):
382
+ if self.req_check():
383
+ return self.config
384
+ else:
385
+ print("Not all required keys are filled", self.unfilled_keys)
386
+ return False
387
+
388
+
389
+
390
+ if __name__ == '__main__':
391
+ meta = meta_data()
392
+ print(meta.get_keytypes_flatten())
393
+ print(meta.get_keytypes())
394
+ meta.add_keyvalue('Modality', 'CT')
395
+ meta.add_keyvalue('OriImg_path', 'C:/Users/jzheng/Desktop/CT')
396
+ meta.add_keyvalue('Label_path', {'ROI': {'1': 'C:/Users/jzheng/Desktop/CT/1'}, 'Tissue': {'1': 'C:/Users/jzheng/Desktop/CT/1'}})
397
+ meta.add_keyvalue('Spacing_mm', 1.5)
398
+ meta.add_keyvalue('Size', [512, 512, 100])
399
+ meta.add_keyvalue('Dataset_name', 'CT')
400
+ meta.add_keyvalue('Label_tissue', ['1', '2', '3'])
401
+ meta.add_keyvalue('Task', ['1', '2', '3'])
402
+ print(meta.get_meta_data())
403
+ meta.add_extra_keyvalue('extra', 'extra')
404
+ print(meta.get_meta_data())
405
+ print(meta.get_keytypes())
406
+ print(meta.get_keytypes_flatten())
407
+
408
+ org_data_folder_path = '/home/jachin/data/Github/data/data_gen_def/DATASETS/TotalSegmentorCT_MRI/TS_CT'
409
+ img_paths = get_img_path_from_folder(org_data_folder_path, img_type='.nii.gz', include_str='ct', exclude_str='segmentation')
410
+ print(img_paths)
OAI_ZIB_clean/config_format.json ADDED
@@ -0,0 +1,125 @@
1
+ {
2
+ "Modality": {
3
+ "type": "option",
4
+ "required": true,
5
+ "options": [
6
+ "CT",
7
+ "MRI",
8
+ "T1",
9
+ "T2",
10
+ "X-ray",
11
+ "Fluoroscopy",
12
+ "US",
13
+ "PET"
14
+ ]
15
+ },
16
+ "OriImg_path": {
17
+ "type": "string",
18
+ "required": true
19
+ },
20
+ "Label_path": {
21
+ "type": "dict",
22
+ "required": false,
23
+ "keys": [
24
+ "classification",
25
+ "segmentation",
26
+ "regression",
27
+ "detection",
28
+ "localization",
29
+ "registration",
30
+ "other"
31
+ ],
32
+ "value": {
33
+ "type": "dict",
34
+ "required": false,
35
+ "keys": [
36
+ "lung",
37
+ "liver",
38
+ "heart",
39
+ "brain",
40
+ "kidney"
41
+ ],
42
+ "value": {
43
+ "type": "string",
44
+ "required": false
45
+ }
46
+ }
47
+ },
48
+ "ROI": {
49
+ "type": "option",
50
+ "required": false,
51
+ "options": [
52
+ "chest-abdomen",
53
+ "abdomen-pelvis",
54
+ "head",
55
+ "neck",
56
+ "skeleton",
57
+ "chest",
58
+ "abdomen",
59
+ "shoulder",
60
+ "leg",
61
+ "arm",
62
+ "hand",
63
+ "foot",
64
+ "pelvis"
65
+ ]
66
+ },
67
+ "Label_tissue": {
68
+ "type": "list",
69
+ "required": false,
70
+ "items": {
71
+ "type": "option",
72
+ "required": true,
73
+ "options": [
74
+ "lung",
75
+ "liver",
76
+ "heart",
77
+ "brain",
78
+ "kidney",
79
+ "spleen",
80
+ "pancreas",
81
+ "stomach",
82
+ "intestine",
83
+ "muscle",
84
+ "bone"
85
+ ]
86
+ }
87
+ },
88
+ "Task": {
89
+ "type": "list",
90
+ "required": false,
91
+ "items": {
92
+ "type": "option",
93
+ "required": true,
94
+ "options": [
95
+ "classification",
96
+ "segmentation"
97
+ ]
98
+ }
99
+ },
100
+ "Spacing_mm": {
101
+ "type": "float",
102
+ "required": true
103
+ },
104
+ "Size": {
105
+ "type": "list",
106
+ "required": true,
107
+ "items": {
108
+ "type": "int",
109
+ "required": true
110
+ }
111
+ },
112
+ "Dataset_name": {
113
+ "type": "string",
114
+ "required": true
115
+ },
116
+
117
+ "Sub_modality": {
118
+ "type": "dict",
119
+ "required": false
120
+ },
121
+ "Label_Dict": {
122
+ "type": "dict",
123
+ "required": false
124
+ }
125
+ }
OAI_ZIB_clean/dataclean_OAI_ZIB.py ADDED
@@ -0,0 +1,309 @@
1
+ #coding:utf-8
2
+ '''
3
+ OAI-ZIB Dataset Processing Script
4
+ created on 2026-03-05
5
+
6
+ OAI-ZIB: Osteoarthritis Initiative dataset curated by ZIB (Zuse Institute Berlin).
7
+ Contains RIGHT knee MRI scans and corresponding segmentation labelmaps for 507
8
+ subjects, split into train (253) and test (254) sets.
9
+
10
+ All images are RIGHT knee (confirmed via OAIZIB-CM kneeSideInfo.csv).
11
+
12
+ Label values:
13
+ 0: background
14
+ 1: femur
15
+ 2: femoral cartilage
16
+ 3: tibia
17
+ 4: medial tibial cartilage
18
+ 5: lateral tibial cartilage
19
+
20
+ Nonimaging metadata extracted per subject (baseline visit V00, right knee):
21
+ - enrollee01.txt: age, gender, race, ethnicity, cohort
22
+ - oscf01.txt: BMI, height, weight
23
+ - kxrsq01.txt: KL grade (right knee, Kellgren-Lawrence OA severity 0-4)
24
+ - womac01.txt: WOMAC scores (right knee: pain, ADL, stiffness)
25
+ '''
26
+ import os
27
+ import glob
28
+ import csv
29
+ import argparse
30
+ import json
31
+ import SimpleITK as sitk
32
+ from tqdm import tqdm
33
+ from util import meta_data
34
+ import util
35
+
36
+
37
+ TASK_VALUE = "segmentation"
38
+ TARGET_SPACING = [0.36, 0.36, 0.36] # isotropic resampling target (mm)
39
+
40
+
41
+ def resample_to_isotropic(sitk_img, target_spacing=TARGET_SPACING, interpolator=sitk.sitkLinear):
42
+ """Resample a SimpleITK image to isotropic spacing."""
43
+ original_spacing = sitk_img.GetSpacing()
44
+ original_size = sitk_img.GetSize()
45
+
46
+ new_size = [
47
+ int(round(osz * osp / tsp))
48
+ for osz, osp, tsp in zip(original_size, original_spacing, target_spacing)
49
+ ]
50
+
51
+ resampler = sitk.ResampleImageFilter()
52
+ resampler.SetOutputSpacing(target_spacing)
53
+ resampler.SetSize(new_size)
54
+ resampler.SetOutputDirection(sitk_img.GetDirection())
55
+ resampler.SetOutputOrigin(sitk_img.GetOrigin())
56
+ resampler.SetInterpolator(interpolator)
57
+ resampler.SetDefaultPixelValue(0)
58
+ resampler.SetTransform(sitk.Transform())
59
+
60
+ return resampler.Execute(sitk_img)
61
+
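A minimal sketch of applying the resampler to an image/label pair; the paths follow the 9002817 example from the docstring but are assumptions:

img = sitk.ReadImage('images/train/9002817.nii.gz')
img_iso = resample_to_isotropic(img)  # defaults: 0.36 mm isotropic, linear
msk = sitk.ReadImage('labels/train/9002817.nii.gz')
msk_iso = resample_to_isotropic(msk, interpolator=sitk.sitkNearestNeighbor)  # labelmaps must not be linearly interpolated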
62
+ LABEL_DICT = {
63
+ "0": "background",
64
+ "1": "femur",
65
+ "2": "femoral cartilage",
66
+ "3": "tibia",
67
+ "4": "medial tibial cartilage",
68
+ "5": "lateral tibial cartilage"
69
+ }
70
+
71
+
72
+ def load_nonimaging_table(filepath):
73
+ """Load a tab-delimited nonimaging .txt file, skipping the description row (row 2)."""
74
+ rows = []
75
+ with open(filepath, 'r') as f:
76
+ reader = csv.DictReader(f, delimiter='\t', quotechar='"')
77
+ for i, row in enumerate(reader):
78
+ if i == 0:
79
+ # Row 0 after header is the description row — skip it
80
+ continue
81
+ rows.append(row)
82
+ return rows
83
+
84
+
85
+ def build_subject_lookup(rows, key='src_subject_id', visit_filter=None):
86
+ """Build a dict keyed by subject ID. If visit_filter is set, only keep rows with that visit."""
87
+ lookup = {}
88
+ for row in rows:
89
+ sid = row.get(key, '').strip('"')
90
+ visit = row.get('visit', '').strip('"')
91
+ if visit_filter and visit != visit_filter:
92
+ continue
93
+ if sid not in lookup:
94
+ lookup[sid] = row
95
+ return lookup
96
+
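Together the two helpers give a per-subject baseline lookup; a sketch, where the path is an assumption and 'womkpr' is the right-knee WOMAC pain column used further below:

rows = load_nonimaging_table('nonimaging/NonImaging/womac01.txt')
womac = build_subject_lookup(rows, visit_filter='V00')  # one baseline row per subject
print(womac.get('9002817', {}).get('womkpr'))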
97
+
98
+ def load_all_nonimaging(nonimaging_dir):
99
+ """Load and index all relevant nonimaging tables by subject ID (baseline V00)."""
100
+ tables = {}
101
+
102
+ # enrollee01: demographics (use V00 baseline)
103
+ fp = os.path.join(nonimaging_dir, 'enrollee01.txt')
104
+ if os.path.isfile(fp):
105
+ tables['enrollee'] = build_subject_lookup(load_nonimaging_table(fp), visit_filter='V00')
106
+
107
+ # oscf01: BMI, height, weight (prefer V00, fallback to any visit with BMI)
108
+ fp = os.path.join(nonimaging_dir, 'oscf01.txt')
109
+ if os.path.isfile(fp):
110
+ rows = load_nonimaging_table(fp)
111
+ oscf_lookup = {}
112
+ for row in rows:
113
+ sid = row.get('src_subject_id', '').strip('"')
114
+ bmi = row.get('bmi', '').strip('"')
115
+ visit = row.get('visit', '').strip('"')
116
+ if not bmi:
117
+ continue
118
+ # Prefer V00, otherwise keep first available
119
+ if sid not in oscf_lookup or visit == 'V00':
120
+ oscf_lookup[sid] = row
121
+ tables['oscf'] = oscf_lookup
122
+
123
+ # kxrsq01: KL grade (use V00, RIGHT knee only: side=1)
124
+ fp = os.path.join(nonimaging_dir, 'kxrsq01.txt')
125
+ if os.path.isfile(fp):
126
+ rows = load_nonimaging_table(fp)
127
+ kl_lookup = {}
128
+ for row in rows:
129
+ sid = row.get('src_subject_id', '').strip('"')
130
+ visit = row.get('visit', '').strip('"')
131
+ side = row.get('side', '').strip('"')
132
+ if visit != 'V00' or side != '1': # side=1 is RIGHT
133
+ continue
134
+ kl = row.get('xrkl', '').strip('"')
135
+ if sid not in kl_lookup:
136
+ kl_lookup[sid] = kl
137
+ tables['kl_grade'] = kl_lookup
138
+
139
+ # womac01: WOMAC scores (use V00)
140
+ fp = os.path.join(nonimaging_dir, 'womac01.txt')
141
+ if os.path.isfile(fp):
142
+ tables['womac'] = build_subject_lookup(load_nonimaging_table(fp), visit_filter='V00')
143
+
144
+ return tables
145
+
146
+
147
+ def get_subject_metadata(subject_id, tables):
148
+ """Extract relevant metadata for a subject from preloaded tables."""
149
+ info = {}
150
+ info['Knee_Side'] = 'right'
151
+
152
+ # Demographics from enrollee01
153
+ enrollee = tables.get('enrollee', {}).get(subject_id, {})
154
+ if enrollee:
155
+ info['Age'] = enrollee.get('ageyears', '').strip('"')
156
+ info['Gender'] = enrollee.get('gender', '').strip('"')
157
+ info['Race'] = enrollee.get('race', '').strip('"')
158
+ info['Ethnicity'] = enrollee.get('ethnicity', '').strip('"')
159
+ info['Cohort'] = enrollee.get('e_cohort', '').strip('"')
160
+
161
+ # BMI from oscf01
162
+ oscf = tables.get('oscf', {}).get(subject_id, {})
163
+ if oscf:
164
+ info['BMI'] = oscf.get('bmi', '').strip('"')
165
+ info['Height_mm'] = oscf.get('height_av', '').strip('"')
166
+ info['Weight_kg'] = oscf.get('weight_met', '').strip('"')
167
+
168
+ # KL grade from kxrsq01 (right knee only)
169
+ kl = tables.get('kl_grade', {}).get(subject_id)
170
+ if kl is not None:
171
+ info['KL_Grade'] = kl
172
+
173
+ # WOMAC scores from womac01 (right knee only)
174
+ womac = tables.get('womac', {}).get(subject_id, {})
175
+ if womac:
176
+ info['WOMAC_Pain'] = womac.get('womkpr', '').strip('"')
177
+ info['WOMAC_ADL'] = womac.get('womadlr', '').strip('"')
178
+ info['WOMAC_Stiffness'] = womac.get('womtsr', '').strip('"')
179
+
180
+ return info
181
+
182
+
183
+ def main(target_path, output_dir):
184
+ if not os.path.isdir(output_dir):
185
+ os.makedirs(output_dir)
186
+
187
+ failed_files = []
188
+
189
+ # Load nonimaging metadata
190
+ nonimaging_dir = os.path.join(target_path, 'nonimaging', 'NonImaging')
191
+ print("Loading nonimaging metadata...")
192
+ tables = load_all_nonimaging(nonimaging_dir)
193
+ print(f" enrollee: {len(tables.get('enrollee', {}))} subjects")
194
+ print(f" oscf (BMI): {len(tables.get('oscf', {}))} subjects")
195
+ print(f" kl_grade (right): {len(tables.get('kl_grade', {}))} subjects")
196
+ print(f" womac: {len(tables.get('womac', {}))} subjects")
197
+
198
+ # Process train and test splits into separate folders
199
+ for split in ['train', 'test']:
200
+ image_dir = os.path.join(target_path, 'images', split)
201
+ label_dir = os.path.join(target_path, 'labels', split)
202
+
203
+ if not os.path.isdir(image_dir):
204
+ print(f"Image directory not found: {image_dir}")
205
+ continue
206
+
207
+ split_output_dir = os.path.join(output_dir, split)
208
+ os.makedirs(split_output_dir, exist_ok=True)
209
+
210
+ json_output_path = os.path.join(split_output_dir, 'nifti_mappings.json')
211
+ # Initialize the JSON file fresh
212
+ with open(json_output_path, 'w') as json_file:
213
+ json.dump({}, json_file)
214
+
215
+ image_files = sorted(glob.glob(os.path.join(image_dir, '*.nii.gz')))
216
+ print(f"\nProcessing {split} split: {len(image_files)} subjects -> {split_output_dir}")
217
+
218
+ for image_path in tqdm(image_files, desc=f"Processing {split}"):
219
+ filename = os.path.basename(image_path) # e.g. 9002817.nii.gz
220
+ subject_id = filename.replace('.nii.gz', '')
221
+
222
+ try:
223
+ # Read original image
224
+ sitk_img = sitk.ReadImage(image_path)
225
+ original_size = list(sitk_img.GetSize())
226
+ original_spacing = list(sitk_img.GetSpacing())
227
+
228
+ # Resample to isotropic
229
+ sitk_img_iso = resample_to_isotropic(sitk_img, TARGET_SPACING, sitk.sitkLinear)
230
+ resampled_size = list(sitk_img_iso.GetSize())
231
+ resampled_spacing = list(sitk_img_iso.GetSpacing())
232
+
233
+ # Build metadata (use resampled size/spacing)
234
+ meta = meta_data()
235
+ meta.add_keyvalue('Modality', 'MRI')
236
+ meta.add_keyvalue('OriImg_path', image_path)
237
+ meta.add_keyvalue('Spacing_mm', min(resampled_spacing))
238
+ meta.add_keyvalue('Size', resampled_size)
239
+ meta.add_keyvalue('Dataset_name', 'OAI_ZIB')
240
+ meta.add_keyvalue('ROI', 'leg')
241
+ meta.add_keyvalue('Label_Dict', LABEL_DICT)
242
+
243
+ # Output paths
244
+ output_subject_dir = os.path.join(split_output_dir, subject_id)
245
+ output_image_file = os.path.join(output_subject_dir, f"{subject_id}.nii.gz")
246
+
247
+ # Save resampled image
248
+ util.save_nifti(sitk_img_iso, output_image_file, image_path)
249
+
250
+ # Process label (use nearest-neighbor interpolation to preserve discrete labels)
251
+ label_path = os.path.join(label_dir, filename)
252
+ if os.path.isfile(label_path):
253
+ sitk_lbl = sitk.ReadImage(label_path)
254
+ sitk_lbl_iso = resample_to_isotropic(sitk_lbl, TARGET_SPACING, sitk.sitkNearestNeighbor)
255
+ process_label_dir = os.path.join(output_subject_dir, 'segmentation')
256
+ processed_lbl_path = os.path.join(process_label_dir, f"{subject_id}.nii.gz")
257
+ os.makedirs(process_label_dir, exist_ok=True)
258
+ util.save_nifti(sitk_lbl_iso, processed_lbl_path, label_path)
259
+
260
+ label_path_dict = {'knee': processed_lbl_path}
261
+ meta.add_keyvalue('Task', TASK_VALUE)
262
+ meta.add_keyvalue('Label_path', {TASK_VALUE: label_path_dict})
263
+
264
+ print(f" {subject_id}: {original_size} @ {[f'{s:.3f}' for s in original_spacing]} -> {resampled_size} @ {[f'{s:.3f}' for s in resampled_spacing]}")
265
+
266
+ # Build extra metadata from nonimaging
267
+ extra_info = {
268
+ 'split': split,
269
+ 'Image_id': subject_id,
270
+ 'nonimaging_dir': nonimaging_dir,
271
+ }
272
+ subject_meta = get_subject_metadata(subject_id, tables)
273
+ extra_info.update(subject_meta)
274
+
275
+ meta.add_extra_keyvalue('Metadata', extra_info)
276
+
277
+ # Write mapping
278
+ with open(json_output_path, 'r+') as json_file:
279
+ existing_mappings = json.load(json_file)
280
+ existing_mappings[output_image_file] = meta.get_meta_data()
281
+ json_file.seek(0)
282
+ json.dump(existing_mappings, json_file, indent=4)
283
+ json_file.truncate()
284
+
285
+ except Exception as e:
286
+ print(f" Failed {subject_id}: {e}")
287
+ failed_files.append(subject_id)
288
+ continue
289
+
290
+ # Save failed files
291
+ failed_files_path = os.path.join(output_dir, 'failed_files.json')
292
+ with open(failed_files_path, "w") as json_file:
293
+ json.dump(failed_files, json_file)
294
+
295
+ print(f"\nDone. Failed files ({len(failed_files)}): {failed_files_path}")
296
+
297
+
298
+ if __name__ == "__main__":
299
+ parser = argparse.ArgumentParser(description="Process OAI-ZIB dataset and save as processed NIfTI with mappings.")
300
+ parser.add_argument("--target_path", type=str,
301
+ default="/home/dn-zhen2/rds/rds-airr-p51-TWhPgQVLKbA/Data/DATASETS/OAI_ZIB",
302
+ help="Path to raw OAI-ZIB dataset directory.")
303
+ parser.add_argument("--output_dir", type=str,
304
+ default="/home/dn-zhen2/rds/rds-airr-p51-TWhPgQVLKbA/Data/Omini3D/DATASETS_processed/OAI_ZIB",
305
+ help="Directory to save processed NIfTI files and mappings.")
306
+ args = parser.parse_args()
307
+ print(f"Input: {args.target_path}")
308
+ print(f"Output: {args.output_dir}")
309
+ main(args.target_path, args.output_dir)
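
A minimal sketch of the isotropic resampling used above (assuming SimpleITK is installed; "img.nii.gz" and "mask.nii.gz" are hypothetical local files, and 1.0 mm stands in for TARGET_SPACING):

    import SimpleITK as sitk

    def resample_iso(img, spacing=1.0, nearest=False):
        # New size preserves the physical extent: round(old_size * old_spacing / new_spacing)
        new_size = [int(round(sz * sp / spacing))
                    for sz, sp in zip(img.GetSize(), img.GetSpacing())]
        r = sitk.ResampleImageFilter()
        r.SetOutputSpacing([spacing] * img.GetDimension())
        r.SetSize(new_size)
        r.SetOutputDirection(img.GetDirection())
        r.SetOutputOrigin(img.GetOrigin())
        # Linear for intensity images; nearest-neighbor keeps segmentation labels discrete
        r.SetInterpolator(sitk.sitkNearestNeighbor if nearest else sitk.sitkLinear)
        return r.Execute(img)

    img = resample_iso(sitk.ReadImage("img.nii.gz"))
    msk = resample_iso(sitk.ReadImage("mask.nii.gz"), nearest=True)
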
OAI_ZIB_clean/util.py ADDED
@@ -0,0 +1,413 @@
1
+ import os
2
+ import json
3
+ import SimpleITK as sitk
4
+ import glob
5
+ try:
6
+ import pandas as pd
7
+ except ImportError:
8
+ pd = None
9
+
10
+ def load_dicom_images(folder_path):
11
+ reader = sitk.ImageSeriesReader()
12
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
13
+ reader.SetFileNames(dicom_names)
14
+ image = reader.Execute()
15
+ return image
16
+
17
+ def convert_windows_to_linux_path(windows_path):
18
+ # Replace backslashes with forward slashes and remove the drive letter
19
+ # Some meta files have windows paths, but the data is stored on a linux server
20
+ linux_path = windows_path.replace('\\', '/')
21
+ if ':' in linux_path:
22
+ linux_path = linux_path.split(':', 1)[1]
23
+ return linux_path
24
+
25
+ # =============================================================================
26
+ # ========================developed with TotalSegmentor========================
27
+ # =============================================================================
28
+
29
+ def read_table(file_path, split_str=';'):
30
+ try:
31
+ df = pd.read_excel(file_path, engine='openpyxl')
32
+ except:
33
+ df = pd.read_csv(file_path, sep=split_str)
34
+ return df
35
+
36
+ def load_nifti(image_path):
37
+ return sitk.ReadImage(image_path)
38
+
39
+ def save_nifti(image, output_path, folder_path):
40
+ output_dirpath = os.path.dirname(output_path)
41
+ if not os.path.exists(output_dirpath):
42
+ print(f"Creating directory {output_dirpath}")
43
+ os.makedirs(output_dirpath)
44
+ # Set metadata in the NIfTI file's header
45
+ image.SetMetaData("FolderPath", folder_path)
46
+ sitk.WriteImage(image, output_path)
47
+
48
+ def find_metadata_files(path, file_name='*meta*'):
49
+ # for TotalSegmentor dataset
50
+ search_pattern = os.path.join(path, '**', file_name)
51
+ return glob.glob(search_pattern, recursive=True)
52
+
53
+ def get_img_path_from_folder(folder_path, img_type='.nii.gz', include_str=None, exclude_str='segmentation', is_sorted=True):
54
+ img_path = []
55
+ for root, dirs, files in os.walk(folder_path):
56
+ for file in files:
57
+ if file.endswith(img_type) and (include_str is None or include_str in file) and (exclude_str is None or exclude_str not in file):
58
+ img_path.append(os.path.join(root, file))
59
+ if is_sorted:
60
+ img_path.sort()
61
+ return img_path
62
+
63
+ def get_unisize_resampler(ref_img, interpolator='linear', spacing=None, size=None):
64
+ '''
65
+ Resample the image to have isotropic spacing, following the steps:
66
+ 1. Find the minimum spacing
67
+ 2. Resample the image to have the minimum spacing
68
+ 3. Set the interpolator (linear for images, nearest for segmentation masks)
69
+ 4. Set the output spacing
70
+ 5. Return the resampler for resampling
71
+ For example, if the input image has spacing [0.1, 0.1, 0.3], the output image will have spacing [0.1, 0.1, 0.1]
72
+ '''
73
+ # TODO: discuss why this function was rewritten!!!
74
+ if size is None:
75
+ size = ref_img.GetSize()
76
+ if spacing is None:
77
+ spacing = ref_img.GetSpacing()
78
+ min_spacing = min(spacing)
79
+ if all([spc == min_spacing for spc in spacing]):
80
+ return None
81
+ else:
82
+ # if 1:
83
+ if interpolator == 'nearest':
84
+ interpolator = sitk.sitkNearestNeighbor
85
+ elif interpolator == 'linear':
86
+ interpolator = sitk.sitkLinear
87
+ resampler = sitk.ResampleImageFilter()
88
+ # new_spacing = [max_spacing] * len(spacing)
89
+ # print(size)
90
+ new_size = [int(round(old_sz * old_spc / min_spacing)) for old_sz, old_spc in zip(size, spacing)]
91
+ new_size_xy=[new_size[0],new_size[1],new_size[2]]
92
+ # TODO: discuss why this function was rewritten!!! --- YHM Jachin
93
+ new_size_spacing=[min_spacing,min_spacing,min_spacing]
94
+ # TODO: discuss why this function was rewritten!!! --- YHM Jachin
95
+ # resampler.SetSize(new_size)
96
+ # resampler.SetOutputSpacing([min_spacing] * len(spacing))
97
+ resampler.SetSize(new_size_xy)
98
+ resampler.SetOutputSpacing(new_size_spacing)
99
+
100
+ # print(new_size,new_size_xy)
101
+ resampler.SetOutputOrigin(ref_img.GetOrigin())
102
+ resampler.SetOutputDirection(ref_img.GetDirection())
103
+ resampler.SetInterpolator(interpolator)
104
+ resampler.SetDefaultPixelValue(ref_img.GetPixelIDValue())
105
+ resampler.SetOutputPixelType(ref_img.GetPixelID())
106
+ return resampler
107
+
108
+ def clamp_image(in_img,clamp_range):
109
+ '''
110
+ Clamp the image to the specified range
111
+ '''
112
+ clamp_filter = sitk.ClampImageFilter()
113
+ clamp_filter.SetLowerBound(clamp_range[0])
114
+ clamp_filter.SetUpperBound(clamp_range[1])
115
+ return clamp_filter.Execute(in_img)
116
+
117
+ def get_synonyms_dict(dict_type='ROI'):
118
+ '''
119
+ Get the dictionary of synonyms for the specified dictionary type
120
+ '''
121
+ if dict_type == 'ROI':
122
+ dict_synonyms = {
123
+ 'whole-body': ['whole-body', 'whole body', 'wholebody', 'whole body', 'whole-body', 'whole body', 'wholebody','polytrauma','head-neck-thorax-abdomen-pelvis-leg','head-neck-thorax-abdomen-pelvis'],
124
+ 'neck-thorax-abdomen-pelvis-leg': ['neck-thorax-abdomen-pelvis-leg','neck-thx-abd-pelvis-leg', 'angiography neck-thx-abd-pelvis-leg', 'neck thorax abdomen pelvis leg', 'neck and thorax and abdomen and pelvis and leg', 'neck, thorax, abdomen, pelvis & leg', 'neck/thorax/abdomen/pelvis/leg', 'neck, thorax, abdomen, pelvis and leg', 'neck thorax abdomen pelvis leg'],
125
+ 'neck-thorax-abdomen-pelvis': ['neck-thorax-abdomen-pelvis', 'neck-thx-abd-pelvis', 'neck thorax abdomen pelvis', 'neck and thorax and abdomen and pelvis', 'neck, thorax, abdomen & pelvis', 'neck/thorax/abdomen/pelvis', 'neck, thorax, abdomen and pelvis', 'neck thorax abdomen & pelvis'],
126
+ 'thorax-abdomen-pelvis-leg': ['thorax-abdomen-pelvis-leg','thx-abd-pelvis-leg', 'angiography thx-abd-pelvis-leg', 'thorax abdomen pelvis leg', 'thorax and abdomen and pelvis and leg', 'thorax, abdomen, pelvis & leg', 'thorax/abdomen/pelvis/leg', 'thorax, abdomen, pelvis and leg', 'thorax abdomen pelvis leg'],
127
+ 'neck-thorax-abdomen': ['neck-thorax-abdomen', 'neck-thorax-abdomen', 'neck thorax abdomen', 'neck and thorax and abdomen', 'neck, thorax, abdomen', 'neck/thorax/abdomen', 'neck, thorax, abdomen', 'neck thorax abdomen'],
128
+ 'head-neck-thorax-abdomen': ['head-neck-thorax-abdomen', 'head-neck-thorax-abdomen', 'head neck thorax abdomen', 'head and neck and thorax and abdomen', 'head, neck, thorax, abdomen', 'head/thorax/abdomen', 'head, thorax, abdomen', 'head thorax abdomen'],
129
+ 'head-neck-thorax': ['head-neck-thorax', 'head neck thorax', 'head and neck and thorax', 'head, neck, thorax', 'head/thorax', 'head, thorax', 'head thorax'],
130
+ 'thorax-abdomen-pelvis': ['thorax-abdomen-pelvis', 'thx-abd-pelvis', 'polytrauma', 'thorax abdomen pelvis', 'thorax and abdomen and pelvis', 'thorax, abdomen & pelvis', 'thorax/abdomen/pelvis', 'thorax, abdomen and pelvis', 'thorax abdomen & pelvis'],
131
+ 'abdomen-pelvis-leg': ['abdomen-pelvis-leg', 'angiography abdomen-pelvis-leg', 'abd-pelvis-leg', 'abdomen pelvis leg', 'abdomen and pelvis and leg', 'abdomen, pelvis & leg', 'abdomen/pelvis/leg', 'abdomen, pelvis, leg', 'abdomen pelvis leg'],
132
+ 'neck-thorax': ['neck-thorax', 'neck thorax', 'neck and thorax', 'neck, thorax', 'thorax-neck', 'thorax neck', 'thorax and neck', 'thorax, neck','thorax/neck'],
133
+ 'thorax-abdomen': ['thorax-abdomen', 'thorax abdomen', 'thorax and abdomen', 'thorax, abdomen'],
134
+ 'abdomen-pelvis': ['abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis', 'abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis'],
135
+ 'pelvis-leg': ['pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg', 'pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg'],
136
+ 'head-neck': ['head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck', 'head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck'],
137
+ 'abdomen': ['abdomen', 'abdominal', 'belly', 'stomach', 'tummy', 'gut', 'guts', 'viscera', 'bowels', 'intestines', 'gastrointestinal', 'digestive', 'peritoneum','gastric', 'liver', 'spleen', 'pancreas','kidney','lumbar','renal','hepatic','splenic','pancreatic','intervention'],
138
+ 'thorax': ['chest', 'thorax', 'breast', 'lung', 'heart','heart-thorakale aorta', 'heart-thorakale', 'mediastinum', 'pleura', 'bronchus', 'bronchi', 'trachea', 'esophagus', 'diaphragm', 'rib', 'sternum', 'clavicle', 'scapula', 'axilla', 'armpit','breast biopsy','thoracic','mammary','caeiothoracic','mediastinal','pleural','bronchial','bronchial tree','tracheal','esophageal','diaphragmatic','costal','sternal','clavicular','scapular','axillary','axillar','cardiac','pericardial','pericardiac','pericardium'],
139
+ 'head': ['head', 'headbasis', 'brain', 'skull', 'face','nose','ear','eye','mouth','jaw','cheek','chin','forehead','temporal','parietal','occipital','frontal','mandible','maxilla','mandibular','maxillary','nasal','orbital','orbita','ocular','auricular','otic','oral','buccal','labial','lingual','palatal'],
140
+ 'neck': ['neck', 'throat', 'cervical', 'thyroid', 'trachea', 'larynx', 'pharynx', 'esophagus','pharyngeal','laryngeal','cervical','thyroid','trachea','esophagus','carotid','jugular'],
141
+ 'hand': ['hand', 'finger', 'thumb', 'palm', 'wrist', 'knuckle', 'fingernail', 'phalanx', 'metacarpal', 'carpal', 'radius'],
142
+ 'arm': ['arm', 'forearm', 'upper arm', 'bicep', 'tricep', 'brachium', 'brachial', 'humerus', 'radius', 'ulna', 'elbow', 'shoulder', 'armpit', 'clavicle', 'scapula', 'acromion', 'acromioclavicular'],
143
+ 'leg': ['leg', 'felsenleg','thigh', 'calf', 'shin', 'knee', 'foot', 'ankle', 'toe', 'heel', 'sole', 'arch', 'instep', 'metatarsal', 'phalanx', 'tibia', 'fibula', 'femur', 'patella', 'kneecap','achilles tendon','achilles'],
144
+ 'pelvis': ['pelvis', 'hip', 'groin', 'buttock', 'gluteus', 'gluteal', 'ischium', 'pubis', 'sacrum', 'coccyx', 'acetabulum', 'iliac', 'iliac crest', 'iliac spine', 'iliac wing', 'sacroiliac', 'sacroiliac joint', 'sacroiliac ligament', 'sacroiliac spine', 'ureter', 'bladder', 'urethra', 'prostate', 'testicle', 'ovary', 'uterus',],
145
+ 'skeleton': ['skeleton','bone','spine', 'back', 'vertebra', 'sacrum', 'coccyx'],
146
+ }
147
+ elif dict_type == 'Label_tissue':
148
+ dict_synonyms = {
149
+ 'liver': ['liver','hepatic'],
150
+ 'spleen': ['spleen','splenic'],
151
+ 'kidney': ['kidney','renal'],
152
+ 'pancreas': ['pancreas','pancreatic'],
153
+ 'stomach': ['stomach','gastric'],
154
+ 'intestine': ['large intestine', 'small intestine','large bowel','small bowel'],
155
+ 'gallbladder': ['gallbladder'],
156
+ 'adrenal_gland': ['adrenal_gland','adrenal gland'],
157
+ 'bladder': ['bladder'],
158
+ 'prostate': ['prostate'],
159
+ 'uterus': ['uterus'],
160
+ 'ovary': ['ovary'],
161
+ 'testicle': ['testicle'],
162
+ 'lymph_node': ['lymph_node','lymph node'],
163
+ 'bone': ['bone'],
164
+ 'lung': ['lung'],
165
+ 'heart': ['heart'],
166
+ 'esophagus': ['esophagus'],
167
+ 'muscle': ['muscle'],
168
+ 'fat': ['fat'],
169
+ 'skin': ['skin'],
170
+ 'vessel': ['vessel'],
171
+ 'tumor': ['tumor'],
172
+ 'other': ['other']
173
+ }
174
+ elif dict_type == 'Task':
175
+ dict_synonyms = {
176
+ 'segmentation': ['segmentation', 'seg', 'mask'],
177
+ 'classification': ['classification', 'class', 'diagnosis','identify','identification'],
178
+ 'localization': ['localization', 'locate', 'location', 'position'],
179
+ 'registration': ['registration', 'register', 'align', 'alignment'],
180
+ 'detection': ['detection', 'detect', 'find', 'locate'],
181
+ 'quantification': ['quantification', 'quantify', 'measure', 'measurement'],
182
+ }
183
+ elif dict_type == 'Modality':
184
+ dict_synonyms = {
185
+ 'CT': ['CT', 'computed tomography'],
186
+ 'MRI': ['MRI', 'MR', 'magnetic resonance imaging'],
187
+ 'PET': ['PET', 'positron emission tomography'],
188
+ 'US': ['US', 'ultrasound'],
189
+ 'X-ray': ['X-ray', 'radiography'],
190
+ 'SPECT': ['SPECT', 'single-photon emission computed tomography'],
191
+ }
192
+ else:
193
+ raise ValueError(f"dict_type {dict_type} is not valid")
194
+ return dict_synonyms
195
+
196
+ def replace_synonyms(text, dict_synonyms):
197
+ '''
198
+ Replace the synonyms in the text with the standard term
199
+ '''
200
+ if isinstance(text,str):
+ for key, value in dict_synonyms.items():
+ for v in value:
+ if v.lower() in text.lower():
+ return key
+ print(f"Warning: value {text} does not match any known synonym")
+ elif isinstance(text,list):
+ text = [replace_synonyms(t, dict_synonyms) for t in text]
+ elif isinstance(text,dict):
+ # replace values in dict
+ for key in list(text.keys()):
+ text[key] = replace_synonyms(text[key], dict_synonyms)
+ # replace keys in dict (a key with no matching synonym is returned unchanged)
+ text = {replace_synonyms(k, dict_synonyms): v for k, v in text.items()}
215
+ return text
216
+
217
+ # =============================================================================
218
+
219
+ class meta_data(object):
220
+ '''
221
+ This class is used to store the metadata of the dataset
222
+ '''
223
+ def __init__(self):
224
+ self.config_format_path = os.path.join(os.path.dirname(__file__),'config_format.json')
225
+ with open(self.config_format_path, 'r') as file:
226
+ self.config_format = json.load(file)
227
+ self.config = {}
228
+ for key in self.config_format.keys():
229
+ if self.config_format[key]['required'] == True:
230
+ self.config[key] = {}
231
+ self.keytypes = self.find_all_keys_with_type()
232
+ self.keytypes_flatten = self.flatten_json()
233
+ self.ambiguity_keys = ['ROI', 'Label_tissue', 'Task', 'Modality']
234
+ for key in self.ambiguity_keys:
235
+ ambiguity_dict = get_synonyms_dict(key)
236
+ self.config_format[key]['options'] = list(ambiguity_dict.keys())
237
+
238
+ def get_ketytypes(self):
239
+ return self.keytypes
240
+
241
+ def get_keytypes_flatten(self):
242
+ return self.keytypes_flatten
243
+
244
+ def find_all_keys_with_type(self, data=None, parent_key=''):
245
+ if data is None:
246
+ data = self.config_format
247
+ keys_with_type = {}
248
+ if isinstance(data, dict):
249
+ for key, value in data.items():
250
+ full_key = f"{parent_key}.{key}" if parent_key else key
251
+ if isinstance(value, dict) and 'type' in value:
252
+ keys_with_type[full_key] = value['type']
253
+ keys_with_type.update(self.find_all_keys_with_type(value, full_key))
254
+ elif isinstance(data, list):
255
+ for index, item in enumerate(data):
256
+ full_key = f"{parent_key}[{index}]"
257
+ keys_with_type.update(self.find_all_keys_with_type(item, full_key))
258
+ return keys_with_type
259
+
260
+ def flatten_json(self, data=None, parent_key='', sep='.'):
261
+ if data is None:
262
+ data = self.config_format
263
+ items = {}
264
+ if isinstance(data, dict):
265
+ for key, value in data.items():
266
+ new_key = f"{parent_key}{sep}{key}" if parent_key else key
267
+ if isinstance(value, dict):
268
+ items.update(self.flatten_json(value, new_key, sep=sep))
269
+ elif isinstance(value, list):
270
+ for i, item in enumerate(value):
271
+ items.update(self.flatten_json(item, f"{new_key}[{i}]", sep=sep))
272
+ else:
273
+ items[new_key] = value
274
+ elif isinstance(data, list):
275
+ for i, item in enumerate(data):
276
+ items.update(self.flatten_json(item, f"{parent_key}[{i}]", sep=sep))
277
+ return items
278
+
279
+ def req_check(self):
280
+ self.unfilled_keys = []
281
+ for key in self.config.keys():
282
+ if self.config[key] == {}:
283
+ self.unfilled_keys.append(key)
284
+ if len(self.unfilled_keys) == 0:
285
+ return True
286
+ else:
287
+ return False
288
+
289
+ def type_check(self, key, value):
290
+ if key not in self.config_format.keys():
291
+ print(key, "is not a valid key")
292
+ return False
293
+
294
+ if key == 'Modality':
295
+ if value not in self.config_format[key]['options']:
296
+ return False
297
+ else:
298
+ return True
299
+
300
+ elif key == 'OriImg_path':
301
+ if isinstance(value, str):
302
+ return True
303
+ else:
304
+ return False
305
+
306
+ elif key == 'Label_path' and isinstance(value, dict):
307
+ for skey in value.keys():
308
+ if skey in self.config_format[key]['keys']:
309
+ for kk in value[skey]:
310
+ if isinstance(value[skey][kk],str):
311
+ pass
312
+ # if kk in self.config_format[key]['value']['keys']:
313
+ # if isinstance(value[skey][kk],str):
314
+ # pass
315
+ # else:
316
+ # return False
317
+ else:
318
+ return False
319
+ return True
320
+
321
+ elif key == 'ROI':
322
+ if value not in self.config_format[key]['options']:
323
+ return False
324
+ else:
325
+ return True
326
+
327
+ elif key == 'Label_tissue' and isinstance(value, list):
328
+ for i in value:
329
+ if i not in self.config_format[key]['items']['options']:
330
+ return False
331
+ return True
332
+
333
+ elif key =='Task' and isinstance(value, list):
334
+ for i in value:
335
+ if i not in self.config_format[key]['items']['options']:
336
+ return False
337
+ return True
338
+
339
+ elif key == 'Spacing_mm':
340
+ if isinstance(value, float):
341
+ return True
342
+ else:
343
+ return False
344
+
345
+ # elif key == 'Size' and isinstance(value, list) and len(value) == 3 :
346
+ elif key == 'Size' and isinstance(value, list) and len(value) >= 3 :
347
+ return all(isinstance(item, int) for item in value)
348
+
349
+ elif key == 'Dataset_name':
350
+ if isinstance(value, str):
351
+ return True
352
+ else:
353
+ return False
354
+ ##added by yanguoiqng on 2025-08-08
355
+ elif key == 'Sub_modality':
356
+
357
+ if isinstance(value, dict):
358
+ return True
359
+ else:
360
+ return False
361
+ elif key == 'Label_Dict':
362
+
363
+ if isinstance(value, dict):
364
+ return True
365
+ else:
366
+ return False
367
+ def add_extra_keyvalue(self, key, value):
368
+ self.config[key] = value
369
+ return True
370
+
371
+ def add_keyvalue(self, key, value):
372
+ if key in self.ambiguity_keys:
373
+ value = replace_synonyms(value, get_synonyms_dict(key))
374
+ # print(key, value)
375
+ if self.type_check(key, value):
376
+
377
+ self.config[key] = value
378
+ return True
379
+ else:
380
+ Warning(f"Value {value} is not in the correct format for key {key}")
381
+ pass
382
+ # print(f"Value {value} is not in the correct format for key {key}")
383
+
384
+ def get_meta_data(self):
385
+ if self.req_check():
386
+ return self.config
387
+ else:
388
+ print("Not all required keys are filled", self.unfilled_keys)
389
+ return False
390
+
391
+
392
+
393
+ if __name__ == '__main__':
394
+ meta = meta_data()
395
+ print(meta.get_keytypes_flatten())
396
+ print(meta.get_ketytypes())
397
+ meta.add_keyvalue('Modality', 'CT')
398
+ meta.add_keyvalue('OriImg_path', 'C:/Users/jzheng/Desktop/CT')
399
+ meta.add_keyvalue('Label_path', {'ROI': {'1': 'C:/Users/jzheng/Desktop/CT/1'}, 'Tissue': {'1': 'C:/Users/jzheng/Desktop/CT/1'}})
400
+ meta.add_keyvalue('Spacing_mm', 1.5)
401
+ meta.add_keyvalue('Size', [512, 512, 100])
402
+ meta.add_keyvalue('Dataset_name', 'CT')
403
+ meta.add_keyvalue('Label_tissue', ['1', '2', '3'])
404
+ meta.add_keyvalue('Task', ['1', '2', '3'])
405
+ print(meta.get_meta_data())
406
+ meta.add_extra_keyvalue('extra', 'extra')
407
+ print(meta.get_meta_data())
408
+ print(meta.get_ketytypes())
409
+ print(meta.get_keytypes_flatten())
410
+
411
+ org_data_foler_path = '/home/jachin/data/Github/data/data_gen_def/DATASETS/TotalSegmentorCT_MRI/TS_CT'
412
+ img_paths = get_img_path_from_folder(org_data_foler_path, img_type='.nii.gz', include_str='ct', exclude_str='segmentation')
413
+ print(img_paths)
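
The synonym tables above normalize free-text ROI/modality strings onto canonical keys by case-insensitive substring matching. A small usage sketch (the input strings are hypothetical; run next to util.py):

    from util import get_synonyms_dict, replace_synonyms

    roi = get_synonyms_dict('ROI')
    # 'heart' is listed as a thorax synonym, so this should normalize to 'thorax'
    print(replace_synonyms('Heart-Thorakale Aorta', roi))
    # lists are normalized element-wise; 'abd-pelvis-leg' maps to 'abdomen-pelvis-leg'
    print(replace_synonyms(['CT angiography abd-pelvis-leg'], roi))
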
PSMA_clean/config_format.json ADDED
@@ -0,0 +1,125 @@
1
+ {
2
+ "Modality": {
3
+ "type": "option",
4
+ "required": true,
5
+ "options": [
6
+ "CT",
7
+ "MRI",
8
+ "T1",
9
+ "T2",
10
+ "X-ray",
11
+ "Fluoroscopy",
12
+ "US",
13
+ "PET"
14
+ ]
15
+ },
16
+ "OriImg_path": {
17
+ "type": "string",
18
+ "required": true
19
+ },
20
+ "Label_path": {
21
+ "type": "dict",
22
+ "required": false,
23
+ "keys": [
24
+ "classification",
25
+ "segmentation",
26
+ "regression",
27
+ "detection",
28
+ "localization",
29
+ "registration",
30
+ "other"
31
+ ],
32
+ "value": {
33
+ "type": "dict",
34
+ "required": false,
35
+ "keys": [
36
+ "lung",
37
+ "liver",
38
+ "heart",
39
+ "brain",
40
+ "kidney"
41
+ ],
42
+ "value": {
43
+ "type": "string",
44
+ "required": false
45
+ }
46
+ }
47
+ },
48
+ "ROI": {
49
+ "type": "option",
50
+ "required": false,
51
+ "options": [
52
+ "chest-abdomen",
53
+ "abdomen-pelvis",
54
+ "head",
55
+ "neck",
56
+ "skeleton",
57
+ "chest",
58
+ "abdomen",
59
+ "shoulder",
60
+ "leg",
61
+ "arm",
62
+ "hand",
63
+ "foot",
64
+ "pelvis"
65
+ ]
66
+ },
67
+ "Label_tissue": {
68
+ "type": "list",
69
+ "required": false,
70
+ "items": {
71
+ "type": "option",
72
+ "required": true,
73
+ "options": [
74
+ "lung",
75
+ "liver",
76
+ "heart",
77
+ "brain",
78
+ "kidney",
79
+ "spleen",
80
+ "pancreas",
81
+ "stomach",
82
+ "intestine",
83
+ "muscle",
84
+ "bone"
85
+ ]
86
+ }
87
+ },
88
+ "Task": {
89
+ "type": "list",
90
+ "required": false,
91
+ "items": {
92
+ "type": "option",
93
+ "required": true,
94
+ "options": [
95
+ "classification",
96
+ "segmentation"
97
+ ]
98
+ }
99
+ },
100
+ "Spacing_mm": {
101
+ "type": "float",
102
+ "required": true
103
+ },
104
+ "Size": {
105
+ "type": "list",
106
+ "required": true,
107
+ "items": {
108
+ "type": "int",
109
+ "required": true
110
+ }
111
+ },
112
+ "Dataset_name": {
113
+ "type": "string",
114
+ "required": true
115
+ },
116
+
117
+ "Sub_modality": {
118
+ "type": "dict",
119
+ "required": false
120
+ },
121
+ "Label_Dict": {
122
+ "type": "dict",
123
+ "required": false
124
+ }
125
+ }
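
config_format.json is consumed by util.meta_data, but the same checks can be done stand-alone. A minimal validation sketch against the schema above (the record is hypothetical):

    import json

    with open("config_format.json") as f:
        cfg = json.load(f)

    record = {"Modality": "PET", "OriImg_path": "/tmp/x.nii.gz",
              "Spacing_mm": 1.0, "Size": [128, 128, 64], "Dataset_name": "demo"}

    # required keys must be present; "option" keys must take one of the listed values
    missing = [k for k, v in cfg.items() if v.get("required") and k not in record]
    bad_options = [k for k, v in cfg.items()
                   if v.get("type") == "option" and k in record
                   and record[k] not in v["options"]]
    print("missing:", missing, "bad options:", bad_options)  # expect two empty lists
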
PSMA_clean/dataclean_PSMA_Longitudinal.py ADDED
@@ -0,0 +1,380 @@
1
+ #coding:utf-8
2
+ '''
+ Written by ygq, created on 2025-08-30.
+
+
+ BL = Baseline
+ FU = Follow-up
+
+ 1. Baseline (BL)
+ Meaning: the first scan (CT, MRI, X-ray, ...) acquired at disease onset, before treatment, or at some reference time point.
+ Purpose: it is the starting line against which disease severity and later change are judged; future scans are compared to the baseline to assess evolution.
+ 2. Follow-up (FU)
+ Meaning: any scan acquired after the baseline, either on schedule or as clinically required.
+ Purpose: to assess treatment response (e.g. tumor shrinkage), monitor progression (enlarging or new lesions), or track post-operative recovery.
+ Use of "BL FU" in reports:
+ when a radiologist writes "BL FU" or "compare to BL FU", it means the current scan must be compared against the earlier baseline scan to assess change.
+
+ Examples:
+ Oncology: a lung-cancer patient has a CT before chemotherapy (baseline, BL) and another after two cycles (follow-up, FU); the new report compares the two, e.g. "compared with the baseline CT of <date> (BL FU), the right lower-lobe mass is clearly smaller."
+ Chronic disease: conditions needing long-term monitoring (pneumonia, cirrhosis, multiple sclerosis, ...) are judged improved, stable, or worse by comparing baseline and follow-up scans.
+
+ label:
+ 0: background, 1-N: tumor; the exact values must be read from the matching JSON file.
+
+ Case ID: a 10-character hexadecimal ID; each ID has one CSV file and one or more BL and FU studies, each with its own JSON file and mask label file.
+ Note: the CSV holds all label information and IDs; to store labels per tissue, the mask can be split by lesion_type using the CSV/JSON information, keeping a label_dict per tissue.
+ BL images and their masks both live under inputsTr.
+ Naming scheme:
+ 93dd4de5cd_BL_img_BL_img_00.nii.gz
+ 93dd4de5cd_BL_mask_BL_img_00.nii.gz
+ 93dd4de5cd_BL_00.json
+
+ FU images live under inputsTr; the corresponding masks live under targetsTr.
+ Naming scheme:
+ c6f057b865_FU_img_FU_img_00.nii.gz
+ c6f057b865_FU_mask_FU_img_00.nii.gz
+ c6f057b865_FU_img_FU_img_01.nii.gz
+ c6f057b865_FU_mask_FU_img_01.nii.gz
+ c6f057b865_FU_00.json
+ c6f057b865_FU_01.json
+
+
+ Metadata CSV (lesion / cancer information): baseline lesion position and image id, propagated position, follow-up position and image id, and lesion type:
+ lesion_id,cog_bl,img_id_bl,cog_propagated,cog_fu,img_id_fu,lesion_type
+ 1,84.9530896759608 273.525433308214 148.780708364732,00,108.78432777048911 320.7355032513338 543.6178096475021,116.270833333333 317.46130952381 548.446428571429,00,Lung
+ 2,206.307026476578 258.39816700611 177.256619144603,00,202.79674663210054 297.81536880017677 566.3173808142716,197.325938566553 300.598976109215 565.804607508532,00,Lymph node
+
+ Sample JSON:
+ {
+ "name": "Points of interest",
+ "points": [
+ {
+ "name": "1",
+ "point": [
+ 84.9530896759608,
+ 273.525433308214,
+ 148.780708364732
+ ]
+ },
+ {
+ "name": "2",
+ "point": [
+ 206.307026476578,
+ 258.39816700611,
+ 177.256619144603
+ ]
+ }
+ ],
+ "type": "Multiple points",
+ "version": {
+ "major": 1,
+ "minor": 0
+ }
+ }
+
+ '''
79
+ import os
80
+ import glob
81
+ import pandas as pd
82
+ import SimpleITK as sitk
83
+ import argparse
84
+ import json
85
+ from tqdm import tqdm
86
+ from util import meta_data
87
+ import util
88
+ import numpy as np
89
+ # from bert_helper import *
90
+
91
+ import shutil
92
+
93
+
94
+
95
+ TASK_VALUE="segmentation"
96
+ CLAMP_RANGE_CT = [-300,300]
97
+ CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
98
+ TARGET_VOXEL_SPACING=None
99
+
100
+ # ## sub_modality description following the MSD convention
+ # SUB_MODALITY=["CT","PET"]
+ # ## series order encoded in the file names
+ # SERIES_ORDER=["0000","0001"]
104
+
105
+ ## values 1-N are filled in from the matching JSON file
106
+ LABEL_DICT={
107
+ "0":"backgroud",
108
+ }
109
+ META_COLUMN=['lesion_id', 'cog_bl', 'img_id_bl', 'cog_propagated', 'cog_fu','img_id_fu', 'lesion_type']
110
+
111
+ # def find_metadata_files(path):
112
+ # # for Cancer Image Archive (TCIA) dataset
113
+ # search_pattern = os.path.join(path, '**', 'metadata.csv')
114
+ # return glob.glob(search_pattern, recursive=True)
115
+
116
+ def find_metadata_files(path):
117
+ # for Cancer Image Archive (TCIA) dataset
118
+ search_pattern = os.path.join(path, '*.csv')
119
+ return glob.glob(search_pattern, recursive=True)
120
+ ##added by yanguoqing on 20250527
121
+ def find_image_dirs(path):
122
+ return os.listdir(path)
123
+
124
+ ##modify by yanguoqing on 20250527
125
+ def load_dicom_images(folder_path):
126
+ reader = sitk.ImageSeriesReader()
127
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
128
+ reader.SetFileNames(dicom_names)
129
+ image = reader.Execute()
130
+ return dicom_names,image
131
+
132
+ ##added by yanguoqing on 20250527
133
+ def load_dicom_tag(imgs):
134
+ reader = sitk.ImageFileReader()
135
+ # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
136
+ reader.SetFileName(imgs)
137
+ reader.ReadImageInformation() # read header information only; pixel data is not loaded
138
+ # metadata_keys = reader.GetMetaDataKeys()
139
+ tag=reader.Execute()
140
+ return tag
141
+
142
+ def load_nrrd(fp):
143
+ return sitk.ReadImage(fp)
144
+
145
+ ##modify by yanguoqing on 20250830
146
+ def merge_images(series_files):
147
+ '''
148
+ Each case contains two different series: CT/PET -- 0000/0001.
+ Merge the separate modalities along a fourth array dimension, stored in CT, PET order.
150
+ '''
151
+ reader = sitk.ImageSeriesReader()
152
+ reader.SetFileNames(series_files)
153
+ image = reader.Execute()
154
+ return image
155
+
156
+ def save_nifti(image, output_path, folder_path):
157
+ # Set metadata in the NIfTI file's header
158
+ output_dirpath = os.path.dirname(output_path)
159
+ if not os.path.exists(output_dirpath):
160
+ print(f"Creating directory {output_dirpath}")
161
+ os.makedirs(output_dirpath)
162
+ # Set metadata in the NIfTI file's header
163
+ image.SetMetaData("FolderPath", folder_path)
164
+ sitk.WriteImage(image, output_path)
165
+
166
+ ##modify by yanguoqing on 20250527
167
+ def convert_windows_to_linux_path(windows_path):
168
+ # Replace backslashes with forward slashes and remove the drive letter
169
+ # Some meta files have windows paths, but the data is stored on a linux server
170
+ linux_path = windows_path.replace('\\', '/')
171
+ if ':' in linux_path:
172
+ linux_path = linux_path.split(':', 1)[1]
173
+ return linux_path
174
+ ##added by yanguoqing on 2025-08-31
175
+ ## list every CSV file under the directory; the file names give the IDs of all cases
176
+ def get_filename_list(fp_dir):
177
+ all_file_list=glob.glob("%s/*.csv"%fp_dir)
178
+
179
+
180
+ return all_file_list
181
+ ## extract study_id and study_date from a file name
182
+ def check_fname(fname):
183
+ if fname.startswith("fdg"):
184
+ sid=fname[:14]
185
+ sdate=fname[15:25]
186
+ else:
187
+ sid=fname[:21]
188
+ sdate=fname[22:]
189
+ return sid,sdate
190
+ def main(target_path, output_dir):
191
+
192
+ pid_dirs=["inputsTr"]
193
+ failed_files = []
194
+ if not os.path.isdir(output_dir):
195
+ os.makedirs(output_dir)
196
+ json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
197
+ failed_files_path = os.path.join(output_dir, 'failed_files.json')
198
+ meta = meta_data()
199
+
200
+ # Initialize the JSON file
201
+ if not os.path.exists(json_output_path):
202
+ with open(json_output_path, 'w') as json_file:
203
+ json.dump({}, json_file)
204
+
205
+
206
+ input_dir=os.path.join(target_path,'inputsTr')
207
+ target_dir=os.path.join(target_path,'targetsTr')
208
+
209
+ fp_files=get_filename_list(input_dir)
210
+ ## the auxiliary files give all 1614 case names; each case has two 3D volumes (0000, 0001) that are merged in order
211
+ if pid_dirs:
212
+ for pid_dir in tqdm(pid_dirs, desc="Processing all dataset"):
213
+ for fp_file in tqdm(fp_files, desc="Processing all dataset"):
214
+ meta_file=fp_file
215
+ df_meta=pd.read_csv(meta_file)
216
+ fp_name=os.path.basename(fp_file)[:-4]
217
+ ## look up all BL and FU images and their corresponding masks in turn
218
+ for sub_mod in ['BL','FU']:
219
+
220
+ bl_fps=glob.glob("%s/%s_%s*.json"%(input_dir,fp_name,sub_mod))
221
+ if len(bl_fps)>0:
222
+ for bl_fp in bl_fps:
223
+ basename=os.path.basename(bl_fp)[:-5]
224
+ bl_fp_name=os.path.basename(bl_fp).replace("_BL_","_BL_img_BL_img_").replace(".json",".nii.gz")
225
+ bl_fp_img=os.path.join(input_dir,bl_fp_name)
226
+
227
+ if os.path.isfile(bl_fp_img):
228
+ ## the image file exists, so proceed with normal processing
229
+
230
+
231
+ bl_mask_name=os.path.basename(bl_fp).replace("_BL_","_BL_mask_BL_img_").replace(".json",".nii.gz")
232
+
233
+ bl_fp_mask=os.path.join(input_dir,bl_mask_name)
234
+ if os.path.isfile(bl_fp_mask):
235
+ label_fp=bl_fp_mask
236
+ label_flag=True
237
+ else:
238
+ bl_fp_mask=os.path.join(target_dir,bl_mask_name)
239
+ if os.path.isfile(bl_fp_mask):
240
+ label_fp=bl_fp_mask
241
+ label_flag=True
242
+ else:
243
+ label_fp=None
244
+ label_flag=False
245
+
246
+
247
+ modality="CT"
248
+ study='PSMA_Longitudinal_CT'##Dataset_name
249
+ CIA_other_info = {
250
+ 'Image_id':basename,
251
+ 'metadata_file':''
252
+ # 'Series_Description':serise_desc
253
+ }
254
+ CIA_other_info['split'] = "train"
255
+
256
+ CIA_other_info['metadata_file']=meta_file
257
+ stk_image=util.load_nifti(bl_fp_img)
258
+ spacing_info = stk_image.GetSpacing()
259
+ size = list(stk_image.GetSize())
260
+ resampler =util.get_unisize_resampler(stk_image, interpolator='linear', spacing=spacing_info, size=size)
261
+ if resampler is not None:
262
+ proces_image = resampler.Execute(stk_image)
263
+ print('SPACING INFO AFTER', proces_image.GetSpacing())
264
+ CIA_other_info['Resample'] = True
265
+ else:
266
+ proces_image = stk_image
267
+ CIA_other_info['Resample'] = False
268
+
269
+ output_path = os.path.join(output_dir,fp_name, f"{basename}.nii.gz")
270
+ # output_path=convert_windows_to_linux_path(output_path)
271
+ save_nifti(proces_image, output_path, input_dir)
272
+ print(f"Saved NIfTI file to {output_path}")
273
+
274
+ with open(bl_fp,'r') as fi:
275
+ json_info=json.load(fi)
276
+
277
+ label_dict={
278
+ "0":"backgroud"
279
+ }
280
+ for lesion_info in json_info['points']:
281
+ df_row=df_meta['lesion_type'][df_meta['lesion_id']==int(lesion_info['name'])]
282
+ df_row=df_row.reset_index()
283
+ lesion_type=df_row['lesion_type'][0]
284
+ label_dict[lesion_info['name']]=lesion_type
285
+
286
+
287
+
288
+ if label_flag:
289
+ label_path_dict = {}
290
+ label_stk_img=util.load_nifti(label_fp)
291
+ resampler =util.get_unisize_resampler(label_stk_img, interpolator='nearest', spacing=spacing_info, size=size)
292
+ if resampler is not None:
293
+ proces_label = resampler.Execute(label_stk_img)
294
+ else:
295
+ proces_label = label_stk_img
296
+
297
+ # print(proces_image.GetSize(),proces_label.GetSize())
298
+ try:
299
+ assert proces_image.GetSize() == proces_label.GetSize()
300
+ except Exception as e:
301
+ failed_files.append(label_fp)
302
+ continue
303
+
304
+ label_output_path = os.path.join(output_dir, fp_name, TASK_VALUE, f"{basename}.nii.gz")
305
+
306
+ label_path_dict['tumor'] = label_output_path
307
+ util.save_nifti(proces_label, label_output_path, label_fp)
308
+ print(f"Saved Label Segment NIfTI file to {label_output_path}")
309
+ else:
310
+ continue
311
+
312
+
313
+
314
+
315
+ size_processed = list(proces_image.GetSize())
316
+ print('size_processed',size_processed,size)
317
+
318
+ # meta.add_keyvalue('Image_id',meta_image_id)
319
+ meta.add_keyvalue('Spacing_mm',min(spacing_info[:3])) ## keep the minimum of the first three (x, y, z) spacings
320
+ meta.add_keyvalue('OriImg_path',bl_fp_img)
321
+ meta.add_keyvalue('Size',size_processed) # use the post-processing size here -- YH Jachin
322
+ meta.add_keyvalue('Modality',modality)
323
+ meta.add_keyvalue('Dataset_name',study)
324
+ meta.add_keyvalue('ROI','whole-body')
325
+
326
+
327
+ if label_flag:
328
+ # print(label_path_dict.keys())
329
+ meta.add_keyvalue('Task',TASK_VALUE)
330
+ # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys()))
331
+ meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
332
+
333
+ meta.add_keyvalue('Label_Dict',label_dict)
334
+
335
+ meta.add_extra_keyvalue('Metadata',CIA_other_info)
336
+
337
+
338
+
339
+
340
+
341
+
342
+
343
+ # Write the mapping to the JSON file on the fly
344
+ with open(json_output_path, 'r+') as json_file:
345
+ existing_mappings = json.load(json_file)
346
+ existing_mappings[output_path] = meta.get_meta_data()
347
+ json_file.seek(0)
348
+ # print(existing_mappings)
349
+ json.dump(existing_mappings, json_file, indent=4)
350
+ json_file.truncate()
351
+ # else:
352
+ # print("No metadata.csv files found.")
353
+
354
+ with open(failed_files_path, "w") as json_file:
355
+ json.dump(failed_files, json_file)
356
+
357
+ print(f"The list has been written to {failed_files_path}")
358
+ print(f"Saved NIfTI mappings to {json_output_path}")
359
+
360
+ if __name__ == "__main__":
361
+ parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
362
+ parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/PSMA/Longitudinal-CT//")
363
+ parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/PSMA/Longitudinal-CT/")
364
+ args = parser.parse_args()
365
+ print(args.target_path, args.output_dir)
366
+ main(args.target_path, args.output_dir)
367
+
368
+
369
+
370
+
371
+
372
+
373
+
374
+
375
+
376
+
377
+
378
+
379
+
380
+
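
The BL/FU lookup above is a mechanical rename from the per-study JSON file to its image and mask files. A short sketch of that mapping (names taken from the naming scheme documented in the header):

    def names_from_json(json_name, sub_mod):
        # '9c838d2e45_BL_00.json' -> ('9c838d2e45_BL_img_BL_img_00.nii.gz',
        #                             '9c838d2e45_BL_mask_BL_img_00.nii.gz')
        img = json_name.replace(f"_{sub_mod}_", f"_{sub_mod}_img_{sub_mod}_img_").replace(".json", ".nii.gz")
        msk = json_name.replace(f"_{sub_mod}_", f"_{sub_mod}_mask_{sub_mod}_img_").replace(".json", ".nii.gz")
        return img, msk

    print(names_from_json("9c838d2e45_BL_00.json", "BL"))
    print(names_from_json("c6f057b865_FU_01.json", "FU"))
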
PSMA_clean/dataclean_PSMA_Longitudinal_v2.py ADDED
@@ -0,0 +1,450 @@
1
+ #coding:utf-8
2
+ '''
+ Written by ygq, created on 2025-08-30.
+
+
+ BL = Baseline
+ FU = Follow-up
+
+ 1. Baseline (BL)
+ Meaning: the first scan (CT, MRI, X-ray, ...) acquired at disease onset, before treatment, or at some reference time point.
+ Purpose: it is the starting line against which disease severity and later change are judged; future scans are compared to the baseline to assess evolution.
+ 2. Follow-up (FU)
+ Meaning: any scan acquired after the baseline, either on schedule or as clinically required.
+ Purpose: to assess treatment response (e.g. tumor shrinkage), monitor progression (enlarging or new lesions), or track post-operative recovery.
+ Use of "BL FU" in reports:
+ when a radiologist writes "BL FU" or "compare to BL FU", it means the current scan must be compared against the earlier baseline scan to assess change.
+
+ Examples:
+ Oncology: a lung-cancer patient has a CT before chemotherapy (baseline, BL) and another after two cycles (follow-up, FU); the new report compares the two, e.g. "compared with the baseline CT of <date> (BL FU), the right lower-lobe mass is clearly smaller."
+ Chronic disease: conditions needing long-term monitoring (pneumonia, cirrhosis, multiple sclerosis, ...) are judged improved, stable, or worse by comparing baseline and follow-up scans.
+
+ label:
+ 0: background, 1-N: tumor; the exact values must be read from the matching JSON file.
+
+ Case ID: a 10-character hexadecimal ID; each ID has one CSV file and one or more BL and FU studies, each with its own JSON file and mask label file.
+ Note: the CSV holds all label information and IDs; to store labels per tissue, the mask can be split by lesion_type using the CSV/JSON information, keeping a label_dict per tissue.
+ BL images and their masks both live under inputsTr.
+ Naming scheme:
+ 93dd4de5cd_BL_img_BL_img_00.nii.gz
+ 93dd4de5cd_BL_mask_BL_img_00.nii.gz
+ 93dd4de5cd_BL_00.json
+
+ FU images live under inputsTr; the corresponding masks live under targetsTr.
+ Naming scheme:
+ c6f057b865_FU_img_FU_img_00.nii.gz
+ c6f057b865_FU_mask_FU_img_00.nii.gz
+ c6f057b865_FU_img_FU_img_01.nii.gz
+ c6f057b865_FU_mask_FU_img_01.nii.gz
+ c6f057b865_FU_00.json
+ c6f057b865_FU_01.json
+
+
+ Metadata CSV (lesion / cancer information): baseline lesion position and image id, propagated position, follow-up position and image id, and lesion type:
+ lesion_id,cog_bl,img_id_bl,cog_propagated,cog_fu,img_id_fu,lesion_type
+ 1,84.9530896759608 273.525433308214 148.780708364732,00,108.78432777048911 320.7355032513338 543.6178096475021,116.270833333333 317.46130952381 548.446428571429,00,Lung
+ 2,206.307026476578 258.39816700611 177.256619144603,00,202.79674663210054 297.81536880017677 566.3173808142716,197.325938566553 300.598976109215 565.804607508532,00,Lymph node
+
+ Sample JSON:
+ {
+ "name": "Points of interest",
+ "points": [
+ {
+ "name": "1",
+ "point": [
+ 84.9530896759608,
+ 273.525433308214,
+ 148.780708364732
+ ]
+ },
+ {
+ "name": "2",
+ "point": [
+ 206.307026476578,
+ 258.39816700611,
+ 177.256619144603
+ ]
+ }
+ ],
+ "type": "Multiple points",
+ "version": {
+ "major": 1,
+ "minor": 0
+ }
+ }
+
+ Addendum 2025-11-01: lesion ids are now merged by lesion type (like terms combined);
+ note that the geometry and metadata of the original image are preserved after processing.
+
+
+ '''
83
+ import os
84
+ import glob
85
+ import pandas as pd
86
+ import SimpleITK as sitk
87
+ import argparse
88
+ import json
89
+ from tqdm import tqdm
90
+ from util import meta_data
91
+ import util
92
+ import numpy as np
93
+ # from bert_helper import *
94
+
95
+ import shutil
96
+
97
+
98
+ ## unified label encoding (lesion_type -> label id)
99
+ label_id_lut={'background': 0,
100
+ 'Lymph node': 1,
101
+ 'Lung': 2,
102
+ 'Soft tissue / Skin': 3,
103
+ 'Liver': 4,
104
+ 'Skeleton': 5,
105
+ 'Adrenals': 6,
106
+ 'Spleen': 7,
107
+ 'CNS': 8,
108
+ 'Kidney': 9,
109
+ 'Heart': 10,
110
+ 'Others': 11,
111
+ 'unclear': 12,
112
+ }
113
+
114
+
115
+ TASK_VALUE="segmentation"
116
+ CLAMP_RANGE_CT = [-300,300]
117
+ CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
118
+ TARGET_VOXEL_SPACING=None
119
+
120
+ # ## sub_modality description following the MSD convention
+ # SUB_MODALITY=["CT","PET"]
+ # ## series order encoded in the file names
+ # SERIES_ORDER=["0000","0001"]
124
+
125
+ ## values 1-N are filled in from the matching JSON file
126
+ LABEL_DICT={
127
+ "0":"backgroud",
128
+ }
129
+ META_COLUMN=['lesion_id', 'cog_bl', 'img_id_bl', 'cog_propagated', 'cog_fu','img_id_fu', 'lesion_type']
130
+
131
+ # def find_metadata_files(path):
132
+ # # for Cancer Image Archive (TCIA) dataset
133
+ # search_pattern = os.path.join(path, '**', 'metadata.csv')
134
+ # return glob.glob(search_pattern, recursive=True)
135
+
136
+ def find_metadata_files(path):
137
+ # for Cancer Image Archive (TCIA) dataset
138
+ search_pattern = os.path.join(path, '*.csv')
139
+ return glob.glob(search_pattern, recursive=True)
140
+ ##added by yanguoqing on 20250527
141
+ def find_image_dirs(path):
142
+ return os.listdir(path)
143
+
144
+ ##modify by yanguoqing on 20250527
145
+ def load_dicom_images(folder_path):
146
+ reader = sitk.ImageSeriesReader()
147
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
148
+ reader.SetFileNames(dicom_names)
149
+ image = reader.Execute()
150
+ return dicom_names,image
151
+
152
+ ##added by yanguoqing on 20250527
153
+ def load_dicom_tag(imgs):
154
+ reader = sitk.ImageFileReader()
155
+ # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
156
+ reader.SetFileName(imgs)
157
+ reader.ReadImageInformation() # read header information only; pixel data is not loaded
158
+ # metadata_keys = reader.GetMetaDataKeys()
159
+ tag=reader.Execute()
160
+ return tag
161
+
162
+ def load_nrrd(fp):
163
+ return sitk.ReadImage(fp)
164
+
165
+ ##modify by yanguoqing on 20250830
166
+ def merge_images(series_files):
167
+ '''
168
+ Each case contains two different series: CT/PET -- 0000/0001.
+ Merge the separate modalities along a fourth array dimension, stored in CT, PET order.
170
+ '''
171
+ reader = sitk.ImageSeriesReader()
172
+ reader.SetFileNames(series_files)
173
+ image = reader.Execute()
174
+ return image
175
+
176
+ def save_nifti(image, output_path, folder_path):
177
+ # Set metadata in the NIfTI file's header
178
+ output_dirpath = os.path.dirname(output_path)
179
+ if not os.path.exists(output_dirpath):
180
+ print(f"Creating directory {output_dirpath}")
181
+ os.makedirs(output_dirpath)
182
+ # Set metadata in the NIfTI file's header
183
+ image.SetMetaData("FolderPath", folder_path)
184
+ sitk.WriteImage(image, output_path)
185
+
186
+ ##modify by yanguoqing on 20250527
187
+ def convert_windows_to_linux_path(windows_path):
188
+ # Replace backslashes with forward slashes and remove the drive letter
189
+ # Some meta files have windows paths, but the data is stored on a linux server
190
+ linux_path = windows_path.replace('\\', '/')
191
+ if ':' in linux_path:
192
+ linux_path = linux_path.split(':', 1)[1]
193
+ return linux_path
194
+ ##added by yanguoqing on 2025-08-31
195
+ ## list every CSV file under the directory; the file names give the IDs of all cases
196
+ def get_filename_list(fp_dir):
197
+ all_file_list=glob.glob("%s/*.csv"%fp_dir)
198
+
199
+
200
+ return all_file_list
201
+ ## extract study_id and study_date from a file name
202
+ def check_fname(fname):
203
+ if fname.startswith("fdg"):
204
+ sid=fname[:14]
205
+ sdate=fname[15:25]
206
+ else:
207
+ sid=fname[:21]
208
+ sdate=fname[22:]
209
+ return sid,sdate
210
+ def main(target_path, output_dir):
211
+
212
+ pid_dirs=["inputsTr"]
213
+ failed_files = []
214
+ if not os.path.isdir(output_dir):
215
+ os.makedirs(output_dir)
216
+ json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
217
+ failed_files_path = os.path.join(output_dir, 'failed_files.json')
218
+ meta = meta_data()
219
+
220
+ # Initialize the JSON file
221
+ if not os.path.exists(json_output_path):
222
+ with open(json_output_path, 'w') as json_file:
223
+ json.dump({}, json_file)
224
+
225
+
226
+ input_dir=os.path.join(target_path,'inputsTr')
227
+ target_dir=os.path.join(target_path,'targetsTr')
228
+
229
+ fp_files=get_filename_list(input_dir)
230
+ ## the auxiliary files give all 1614 case names; each case has two 3D volumes (0000, 0001) that are merged in order
231
+ if pid_dirs:
232
+ for pid_dir in tqdm(pid_dirs, desc="Processing all dataset"):
233
+ for fp_file in tqdm(fp_files, desc="Processing all dataset"):
234
+ meta_file=fp_file
235
+ df_meta=pd.read_csv(meta_file)
236
+
237
+ fp_name=os.path.basename(fp_file)[:-4]
238
+ ## look up all BL and FU images and their corresponding masks in turn
239
+ for sub_mod in ['BL','FU']:
240
+
241
+ bl_fps=glob.glob("%s/%s_%s*.json"%(input_dir,fp_name,sub_mod))
242
+ if len(bl_fps)>0:
243
+ for bl_fp in bl_fps:
244
+ basename=os.path.basename(bl_fp)[:-5]
245
+ bl_fp_name=os.path.basename(bl_fp).replace("_BL_","_BL_img_BL_img_").replace(".json",".nii.gz")
246
+ bl_fp_img=os.path.join(input_dir,bl_fp_name)
247
+
248
+ if os.path.isfile(bl_fp_img):
249
+ ## the image file exists, so proceed with normal processing
250
+
251
+
252
+ bl_mask_name=os.path.basename(bl_fp).replace("_BL_","_BL_mask_BL_img_").replace(".json",".nii.gz")
253
+
254
+ bl_fp_mask=os.path.join(input_dir,bl_mask_name)
255
+ if os.path.isfile(bl_fp_mask):
256
+ label_fp=bl_fp_mask
257
+ label_flag=True
258
+ else:
259
+ bl_fp_mask=os.path.join(target_dir,bl_mask_name)
260
+ if os.path.isfile(bl_fp_mask):
261
+ label_fp=bl_fp_mask
262
+ label_flag=True
263
+ else:
264
+ label_fp=None
265
+ label_flag=False
266
+
267
+
268
+ modality="CT"
269
+ study='PSMA_Longitudinal_CT'##Dataset_name
270
+ CIA_other_info = {
271
+ 'Image_id':basename,
272
+ 'metadata_file':''
273
+ # 'Series_Description':serise_desc
274
+ }
275
+ CIA_other_info['split'] = "train"
276
+
277
+ CIA_other_info['metadata_file']=meta_file
278
+ stk_image=util.load_nifti(bl_fp_img)
279
+ spacing_info = stk_image.GetSpacing()
280
+ size = list(stk_image.GetSize())
281
+ resampler =util.get_unisize_resampler(stk_image, interpolator='linear', spacing=spacing_info, size=size)
282
+ if resampler is not None:
283
+ proces_image = resampler.Execute(stk_image)
284
+ print('SPACING INFO AFTER', proces_image.GetSpacing())
285
+ CIA_other_info['Resample'] = True
286
+ else:
287
+ proces_image = stk_image
288
+ CIA_other_info['Resample'] = False
289
+
290
+ output_path = os.path.join(output_dir,fp_name, f"{basename}.nii.gz")
291
+ # output_path=convert_windows_to_linux_path(output_path)
292
+ save_nifti(proces_image, output_path, input_dir)
293
+ print(f"Saved NIfTI file to {output_path}")
294
+
295
+
296
+
297
+
298
+ if label_flag:
299
+ label_path_dict = {}
300
+ label_stk_img=util.load_nifti(label_fp)
301
+
302
+ image_array = sitk.GetArrayFromImage(label_stk_img)
303
+ ## remap the label values (merging ids of the same lesion type) and restore the original image's geometry and metadata
304
+ with open(bl_fp,'r') as fi:
305
+ json_info=json.load(fi)
306
+
307
+ label_dict={
308
+ "0":"backgroud"
309
+ }
310
+
311
+ update_image_array=np.copy(image_array)
312
+ ## group lesion ids by lesion type to build the merged label layout
313
+ group_meta=df_meta.groupby('lesion_type')['lesion_id']
314
+ for name,group in group_meta:
315
+ ## group name plus all lesion_ids belonging to that group
316
+ ids=group_meta.get_group(name)
317
+ target_id=label_id_lut[name]
318
+ # ## (old approach) assign the minimum lesion_id of each group
319
+ # ids_min=ids.min()
320
+ # label_dict[str(ids_min)]=name
321
+ label_dict[str(target_id)]=name
322
+ ## remap every lesion id in this group to the unified code
323
+ for v in ids.tolist():
324
+ update_image_array[image_array==v]=target_id
325
+
326
+ image_array=None
327
+ label_stk_img_update=sitk.GetImageFromArray(update_image_array)
328
+ label_stk_img_update.CopyInformation(label_stk_img)
329
+ # manually copy all metadata
+ # get the metadata keys
331
+ meta_keys = label_stk_img.GetMetaDataKeys()
332
+ for key in meta_keys:
333
+ value = label_stk_img.GetMetaData(key)
334
+ label_stk_img_update.SetMetaData(key, value)
335
+
336
+ # for lesion_info in json_info['points']:
337
+ # df_row=df_meta['lesion_type'][df_meta['lesion_id']==int(lesion_info['name'])]
338
+ # df_row=df_row.reset_index()
339
+ # lesion_type=df_row['lesion_type'][0]
340
+ # label_dict[lesion_info['name']]=lesion_type
341
+
342
+ resampler =util.get_unisize_resampler(label_stk_img_update, interpolator='nearest', spacing=spacing_info, size=size)
343
+ if resampler is not None:
344
+ proces_label = resampler.Execute(label_stk_img_update)
345
+
346
+ ary_process_label=sitk.GetArrayFromImage(proces_label)
347
+
348
+ if ary_process_label[-1,:,:].mean()==ary_process_label[-1,0,0] and ary_process_label[-1,0,0]>0:
349
+ print('suspicious constant non-zero last slice, zeroing it:', ary_process_label[-1,0,0])
350
+ ary_process_label[-1,:,:]=0
351
+
352
+ label_stk_img_process=sitk.GetImageFromArray(ary_process_label)
353
+ label_stk_img_process.CopyInformation(proces_label)
354
+ meta_keys = proces_label.GetMetaDataKeys()
355
+ for key in meta_keys:
356
+ value = proces_label.GetMetaData(key)
357
+ label_stk_img_process.SetMetaData(key, value)
358
+
359
+
360
+
361
+ else:
362
+ label_stk_img_process = label_stk_img_update
363
+
364
+ # print(proces_image.GetSize(),proces_label.GetSize())
365
+ try:
366
+ assert proces_image.GetSize() == label_stk_img_process.GetSize()
367
+ except Exception as e:
368
+ failed_files.append(label_fp)
369
+ continue
370
+
371
+ label_output_path = os.path.join(output_dir, fp_name, TASK_VALUE, f"{basename}.nii.gz")
372
+
373
+ label_path_dict['tumor'] = label_output_path
374
+ util.save_nifti(label_stk_img_process, label_output_path, label_fp)
375
+ print(f"Saved Label Segment NIfTI file to {label_output_path}")
376
+
377
+
378
+
379
+ else:
380
+ continue
381
+
382
+
383
+
384
+
385
+ size_processed = list(proces_image.GetSize())
386
+ print('size_processed',size_processed,size)
387
+
388
+ # meta.add_keyvalue('Image_id',meta_image_id)
389
+ meta.add_keyvalue('Spacing_mm',min(spacing_info[:3])) ## keep the minimum of the first three (x, y, z) spacings
390
+ meta.add_keyvalue('OriImg_path',bl_fp_img)
391
+ meta.add_keyvalue('Size',size_processed) # use the post-processing size here -- YH Jachin
392
+ meta.add_keyvalue('Modality',modality)
393
+ meta.add_keyvalue('Dataset_name',study)
394
+ meta.add_keyvalue('ROI','whole-body')
395
+
396
+
397
+ if label_flag:
398
+ # print(label_path_dict.keys())
399
+ meta.add_keyvalue('Task',TASK_VALUE)
400
+ # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys()))
401
+ meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
402
+
403
+ meta.add_keyvalue('Label_Dict',label_dict)
404
+
405
+ meta.add_extra_keyvalue('Metadata',CIA_other_info)
406
+
407
+
408
+
409
+
410
+
411
+
412
+
413
+ # Write the mapping to the JSON file on the fly
414
+ with open(json_output_path, 'r+') as json_file:
415
+ existing_mappings = json.load(json_file)
416
+ existing_mappings[output_path] = meta.get_meta_data()
417
+ json_file.seek(0)
418
+ # print(existing_mappings)
419
+ json.dump(existing_mappings, json_file, indent=4)
420
+ json_file.truncate()
421
+ # else:
422
+ # print("No metadata.csv files found.")
423
+
424
+ with open(failed_files_path, "w") as json_file:
425
+ json.dump(failed_files, json_file)
426
+
427
+ print(f"The list has been written to {failed_files_path}")
428
+ print(f"Saved NIfTI mappings to {json_output_path}")
429
+
430
+ if __name__ == "__main__":
431
+ parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
432
+ parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/PSMA/Longitudinal-CT//")
433
+ parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/PSMA/Longitudinal-CT/")
434
+ args = parser.parse_args()
435
+ print(args.target_path, args.output_dir)
436
+ main(args.target_path, args.output_dir)
437
+
438
+
439
+
440
+
441
+
442
+
443
+
444
+
445
+
446
+
447
+
448
+
449
+
450
+
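+
+ # Editorial sketch (hypothetical helper, not part of the original pipeline): the
+ # remapping above collapses every lesion_id of one lesion_type onto that type's
+ # unified id from label_id_lut. The same idea on toy data:
+ def _demo_remap_lesion_ids():
+     import numpy as np
+     import pandas as pd
+     df = pd.DataFrame({'lesion_id': [1, 2, 3],
+                        'lesion_type': ['Lung', 'Lung', 'Liver']})
+     lut = {'Lung': 2, 'Liver': 4}
+     arr = np.array([0, 1, 2, 3])   # voxel values are per-lesion ids
+     out = arr.copy()
+     for name, ids in df.groupby('lesion_type')['lesion_id']:
+         # write the unified type id over every lesion_id in this group
+         out[np.isin(arr, ids.to_numpy())] = lut[name]
+     return out                      # -> array([0, 2, 2, 4])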
PSMA_clean/dataclean_PSMA_petct.py ADDED
@@ -0,0 +1,525 @@
1
+ #coding:utf-8
2
+ '''
3
+ written by ygq
4
+ created on 2025-08-30
5
+ PSMA PET/CT is essentially still a PET/CT scan; it differs from conventional 18F-FDG PET/CT only in its tracer. The PSMA PET/CT tracers most widely used internationally are 68Ga-PSMA and 18F-PSMA, where 68Ga and 18F are radionuclides that provide the imaging signal, while PSMA (prostate-specific membrane antigen) provides the targeting, guiding the tracer to cluster more accurately around prostate cancer cells and thereby greatly increasing the sensitivity of PSMA PET/CT for detecting prostate cancer.
6
+
7
+ PSMA, prostate-specific membrane antigen, is a protein closely associated with prostate cancer. It is an integral membrane protein of prostate epithelial cells, strongly expressed on the surface of prostate cancer cells and expressed at comparatively low levels in normal prostate and non-prostate tissue; expression is 100-1000 times that of normal prostate cells and correlates positively with prostate cancer grade and stage. This strong, highly specific expression makes PSMA an important target for prostate cancer diagnosis and treatment.
8
+ PSMA PET/CT is in effect a form of targeted imaging: a PSMA ligand labeled with a radionuclide (commonly 68Ga or 18F) serves as the tracer, is injected intravenously, distributes and accumulates in lesions, and is then scanned with PET/CT to complete the imaging. Guided by PSMA, the radionuclide concentrates more precisely in prostate cancer cells, and combining positron emission tomography (PET) with computed tomography (CT) enables precise detection of prostate cancer.
9
+
10
+
11
+ FDG PET/CT and PSMA PET/CT are like a "tumor detective" using different investigative tools: each has its strengths and they complement each other. FDG and PSMA are two different PET tracers whose imaging principles differ, so they track different "target molecules".
12
+ FDG PET/CT
13
+ Target: the glucose consumed by malignant tumor cells (roughly, measuring the tumor's "appetite")
14
+ Principle: malignant tumor cells grow and metabolize rapidly and take up large amounts of the tracer FDG (a glucose analog); tumors are located by detecting "high energy consumption" regions
15
+ Strengths: a broad-spectrum tumor tracer, mature and widely used; reflects tumor malignancy and can reveal malignancies at other sites at the same time
16
+ Limitations: when tumor cells are few or of low-grade malignancy, their glucose uptake often drops and the PET image shows low metabolism, so lesions are easily missed
17
+
18
+ PSMA PET/CT
19
+ Target: prostate-specific membrane antigen (a special "badge" worn by prostate cancer cells)
20
+ Principle: about 90% of prostate cancer cells wear this "badge"; PSMA tracks and binds to it, precisely locking onto prostate cancer lesions; wherever lights up, there is tumor
21
+ Strengths: highly specific; can detect tiny lesions early, even flagging disease while other examinations still look normal
22
+ Limitations: some normal or diseased cells in the body (e.g. ganglia, neural tissue, granulomatous lesions, renal cancer, lung cancer) also express PSMA highly and can produce false positives; in addition, roughly 10% of prostate cancer cells do not wear this "badge", leading to missed diagnoses
23
+
24
+
25
+ The PSMA-FDG-PET-CT-Lesion dataset contains both PSMA-PET and FDG-PET scans (each with its corresponding CT) together with lesion annotations.
26
+ Such datasets are extremely valuable in prostate cancer research because they allow researchers to directly compare and analyze the molecular expression characteristics of different lesions within the same patient.
27
+ Prostate cancer lesions are molecularly heterogeneous: not all lesions express the same biomarkers.
28
+ PSMA (prostate-specific membrane antigen): overexpressed on the surface of most prostate cancer cells and a relatively prostate-cancer-specific target; PSMA-PET detects prostate-cancer-specific lesions.
29
+ FDG (fluorodeoxyglucose): reflects cellular glucose metabolism; highly aggressive, poorly differentiated tumors usually show very high FDG uptake.
30
+
31
+
32
+
33
+ PSMA-FDG-PET/CT:
34
+
35
+ https://autopet-iii.grand-challenge.org/
36
+ "channel_names": {
37
+ "0": "CT",
38
+ "1": "CT" (note: this channel is actually PET)
39
+ },
40
+ "labels": {
41
+ "background": 0,
42
+ "tumor": 1
43
+ },
44
+ Each case spans two volumes, 0000 and 0001, representing CT and PET respectively; they are merged into the fourth dimension as SUB_MODALITY.
45
+
46
+ label:
47
+ 0: background  1: tumor
48
+
49
+ FDG metadata fields
50
+ 'Series UID', 'Collection', '3rd Party Analysis',
51
+ 'Data Description URI', 'Subject ID', 'Study UID', 'Study Description',
52
+ 'Study Date', 'Series Description', 'Manufacturer', 'Modality',
53
+ 'SOP Class Name', 'SOP Class UID', 'Number of Images', 'File Size',
54
+ 'File Location', 'Download Timestamp', 'diagnosis', 'age', 'sex'
55
+ Subject ID together with Modality identifies the unique description row, from which Study Description, Study Date, Series Description, Manufacturer, diagnosis, age and sex are taken. [Only the single row for the CT modality is needed.]
56
+ FDG file name layout: fdg_b2f82ed4b9_04-17-2003-NA-PET-CT Ganzkoerper primaer mit KM-26753_[0000].nii.gz
57
+ Subject ID[PETCT_b2f82ed4b9] && Modality[CT]
58
+
59
+
60
+ PSMA metadata fields
61
+ 'Subject ID', 'Study Date', 'age', 'manufacturer_model_name',
62
+ 'pet_radionuclide', 'ct_contrast_agent'
63
+ 'Subject ID' and 'Study Date' are both needed to identify a unique record; the same subject_id can appear with different dates, and such cases are treated as separate data points.
64
+ PSMA file name layout: psma_d5b636ea4da7638b_2019-03-15_[0000].nii.gz
65
+ Subject ID[psma_d5b636ea4da7638b]&&Study Date[2019-03-15]
66
+
67
+ In summary: the ID is defined as subject_id + study_date, which together identify a unique case.
68
+
69
+ Processing pipeline:
70
+ 1. Find all IDs;
71
+ 2. For each ID, find the two channel images and the corresponding label;
72
+ 3. Merge the two channel images into one 4D volume;
73
+ 4. Resample/interpolate following the usual 4D convention (the fourth dimension takes no part in the computation; use the minimum spacing of the first 3 axes); likewise for the label
74
+ 5. Save
75
+
76
+ '''
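+ # Editorial sketch: the fixed-width slicing in check_fname() below decomposes the two
+ # file-name layouts documented above (sample values taken from the docstring):
+ #   "fdg_b2f82ed4b9_04-17-2003-..."[:14]   -> "fdg_b2f82ed4b9"         (subject id)
+ #   "fdg_b2f82ed4b9_04-17-2003-..."[15:25] -> "04-17-2003"             (study date)
+ #   "psma_d5b636ea4da7638b_2019-03-15"[:21]  -> "psma_d5b636ea4da7638b"
+ #   "psma_d5b636ea4da7638b_2019-03-15"[22:]  -> "2019-03-15"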
77
+ import os
78
+ import glob
79
+ import pandas as pd
80
+ import SimpleITK as sitk
81
+ import argparse
82
+ import json
83
+ from tqdm import tqdm
84
+ from util import meta_data
85
+ import util
86
+ import numpy as np
87
+ # from bert_helper import *
88
+
89
+ import shutil
90
+ ##dataset_meta
91
+ meta_id_name='BraTS_2019_subject_ID'
92
+ meta_grade_name='Grade'
93
+
94
+ ##HGG_survival_info
95
+ survival_id_name='BraTS19ID'
96
+ meta_age_name='Age'
97
+ meta_survival_name='Survival'
98
+ meta_status_name='ResectionStatus'
99
+
100
+
101
+ TASK_VALUE="segmentation"
102
+ CLAMP_RANGE_CT = [-300,300]
103
+ CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
104
+ TARGET_VOXEL_SPACING=None
105
+
106
+ ## Follows the MSD sub_modality description convention
107
+ SUB_MODALITY=["CT","PET"]
108
+ ## Order of the series suffixes in the file names
109
+ SERIES_ORDER=["0000","0001"]
110
+
111
+ LABEL_DICT={
112
+ "0":"backgroud",
113
+ "1":"tumor",
114
+ }
115
+ PSMA_META_COLUMN=['Subject ID', 'Study Date', 'age', 'manufacturer_model_name','pet_radionuclide', 'ct_contrast_agent']
116
+ FDG_META_COLUMN=['Subject ID', 'Study Description','Study Date', 'Series Description', 'Manufacturer', 'Modality','diagnosis', 'age', 'sex']
117
+ # def find_metadata_files(path):
118
+ # # for Cancer Image Archive (TCIA) dataset
119
+ # search_pattern = os.path.join(path, '**', 'metadata.csv')
120
+ # return glob.glob(search_pattern, recursive=True)
121
+
122
+ def find_metadata_files(path):
123
+ # for Cancer Image Archive (TCIA) dataset
124
+ search_pattern = os.path.join(path, '*.csv')
125
+ return glob.glob(search_pattern, recursive=True)
126
+ ##added by yanguoqing on 20250527
127
+ def find_image_dirs(path):
128
+ return os.listdir(path)
129
+
130
+ ##modify by yanguoqing on 20250527
131
+ def load_dicom_images(folder_path):
132
+ reader = sitk.ImageSeriesReader()
133
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
134
+ reader.SetFileNames(dicom_names)
135
+ image = reader.Execute()
136
+ return dicom_names,image
137
+
138
+ ##added by yanguoqing on 20250527
139
+ def load_dicom_tag(imgs):
140
+ reader = sitk.ImageFileReader()
141
+ # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
142
+ reader.SetFileName(imgs)
143
+ reader.ReadImageInformation() # read only the header metadata, without loading pixel data
144
+ # metadata_keys = reader.GetMetaDataKeys()
145
+ tag=reader.Execute()
146
+ return tag
147
+
148
+ def load_nrrd(fp):
149
+ return sitk.ReadImage(fp)
150
+
151
+ ##modify by yanguoqing on 20250830
152
+ def merge_images(series_files):
153
+ '''
154
+ Each case contains two different series: CT/PET -- 0000/0001.
155
+ Merge the separate modalities into a 4D array, with channels stored in CT, PET order.
156
+ '''
157
+ reader = sitk.ImageSeriesReader()
158
+ reader.SetFileNames(series_files)
159
+ image = reader.Execute()
160
+ return image
161
+
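+ # Editorial note: given two aligned 3D volumes, the ImageSeriesReader above yields one
+ # 4D image with the inputs stacked along the 4th axis. An equivalent in-memory sketch
+ # (paths hypothetical):
+ #   ct  = sitk.ReadImage("case_0000.nii.gz")   # channel 0: CT
+ #   pet = sitk.ReadImage("case_0001.nii.gz")   # channel 1: PET
+ #   four_d = sitk.JoinSeries([ct, pet])        # size becomes (x, y, z, 2)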
162
+ def save_nifti(image, output_path, folder_path):
163
+ # Set metadata in the NIfTI file's header
164
+ output_dirpath = os.path.dirname(output_path)
165
+ if not os.path.exists(output_dirpath):
166
+ print(f"Creating directory {output_dirpath}")
167
+ os.makedirs(output_dirpath)
168
+ # Set metadata in the NIfTI file's header
169
+ image.SetMetaData("FolderPath", folder_path)
170
+ sitk.WriteImage(image, output_path)
171
+
172
+ ##modify by yanguoqing on 20250527
173
+ def convert_windows_to_linux_path(windows_path):
174
+ # Replace backslashes with forward slashes and remove the drive letter
175
+ # Some meta files have windows paths, but the data is stored on a linux server
176
+ linux_path = windows_path.replace('\\', '/')
177
+ if ':' in linux_path:
178
+ linux_path = linux_path.split(':', 1)[1]
179
+ return linux_path
180
+ ##added by yanguoqing on 2025-08-30
181
+ ## Get the names of the 1614 PSMA-PET-CT cases
182
+ def get_filename_list(fp):
183
+ with open(fp,'r') as fi:
184
+ fls=json.load(fi)
185
+ filename_list=fls[0]['train']+fls[0]['val']
186
+
187
+ return filename_list
188
+ ## Extract study_id and study_date from the file name
189
+ def check_fname(fname):
190
+ if fname.startswith("fdg"):
191
+ sid=fname[:14]
192
+ sdate=fname[15:25]
193
+ else:
194
+ sid=fname[:21]
195
+ sdate=fname[22:]
196
+ return sid,sdate
197
+ def main(target_path, output_dir):
198
+ # metadata_files = find_metadata_files(target_path)
199
+ # pid_dirs=find_image_dirs(target_path)
200
+ fdg_meta="fdg_metadata.csv"
201
+ psma_meta="psma_metadata.csv"
202
+ filename_meta="splits_final.json" ##包含所有1614个数据的名称列表信息
203
+ # pid_dirs=["imagesTr","labelsTr"]
204
+ pid_dirs=["imagesTr"]
205
+ failed_files = []
206
+ if not os.path.isdir(output_dir):
207
+ os.makedirs(output_dir)
208
+ json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
209
+ failed_files_path = os.path.join(output_dir, 'failed_files.json')
210
+ meta = meta_data()
211
+
212
+ # Initialize the JSON file
213
+ if not os.path.exists(json_output_path):
214
+ with open(json_output_path, 'w') as json_file:
215
+ json.dump({}, json_file)
216
+ psma_meta_file=os.path.join(target_path,psma_meta)
217
+ fdg_meta_file=os.path.join(target_path,fdg_meta)
218
+
219
+ filename_file=os.path.join(target_path,filename_meta)
220
+
221
+ pdf_meta=pd.read_csv(psma_meta_file)
222
+ fdf_meta=pd.read_csv(fdg_meta_file)
223
+
224
+ fp_names=get_filename_list(filename_file)
225
+ ##From the auxiliary split file, collect all 1614 case names; each case has two 3D volumes (0000, 0001), merged in order;
226
+ if pid_dirs:
227
+ for pid_dir in tqdm(pid_dirs, desc="Processing all dataset"):
228
+ for fp_name in tqdm(fp_names, desc="Processing all dataset"):
229
+
230
+ ct_fp=os.path.join(target_path,pid_dir,fp_name+"_0000.nii.gz")
231
+ pet_fp=os.path.join(target_path,pid_dir,fp_name+"_0001.nii.gz")
232
+ label_fp=os.path.join(target_path,'labelsTr',fp_name+".nii.gz")
233
+
234
+ modality="CT"
235
+ study='PSMA-FDG-PET-CT-LESION'##Dataset_name
236
+ CIA_other_info = {'metadata_file':''}
237
+ CIA_other_info['split'] = "train"
238
+
239
+
240
+ if fp_name.startswith("fdg"):
241
+ CIA_other_info['metadata_file']=fdg_meta_file
242
+ df_meta=fdf_meta
243
+ sid,sdate=check_fname(fp_name)
244
+ study_id=sid.replace("fdg","PETCT")
245
+ data_info_row=df_meta[np.logical_and(df_meta['Subject ID']==study_id,df_meta['Modality']=='CT')]
246
+ data_info_row=data_info_row.reset_index()
247
+ for keyname in FDG_META_COLUMN:
248
+ CIA_other_info[keyname]=str(data_info_row[keyname][0])
249
+
250
+ CIA_other_info['Image_id']=fp_name
251
+
252
+ else:
253
+ CIA_other_info['metadata_file']=psma_meta_file
254
+ df_meta=pdf_meta
255
+ sid,sdate=check_fname(fp_name)
256
+ study_id=sid.replace("psma","PSMA")
257
+ # print('>>',study_id,sdate)
258
+ data_info_row=df_meta[np.logical_and(df_meta['Subject ID']==study_id,df_meta['Study Date']==sdate)]
259
+ data_info_row=data_info_row.reset_index()
260
+ # print(data_info_row.columns)
261
+ for keyname in PSMA_META_COLUMN:
262
+ print(keyname)
263
+ print(data_info_row[keyname][0])
264
+ CIA_other_info[keyname]=str(data_info_row[keyname][0])
265
+
266
+ CIA_other_info['Image_id']=fp_name
267
+
268
+
269
+
270
+
271
+ try:
272
+ ##(inherited template comment) read the four MRI files and stack them in flair, t1, t1ce, t2 order; seg is excluded for now
273
+
274
+
275
+ series_files=[ct_fp,pet_fp]
276
+ sub_modality=['CT','PET']
277
+ if len(series_files)>0:
278
+ ##valid image data present; proceed with further processing
279
+ sitk_img_original=merge_images(series_files)
280
+
281
+
282
+
283
+
284
+ original_spacing = list(sitk_img_original.GetSpacing())
285
+ original_size = list(sitk_img_original.GetSize())
286
+
287
+ is_4d_image = sitk_img_original.GetDimension() == 4
288
+ frame_flag=False
289
+ # --- Resampling Logic (Revised for 4D) ---
290
+ if is_4d_image:
291
+
292
+ # Always process 4D images channel-wise for resampling
293
+ # logging.info(f" Processing 4D image channel-wise: {original_img_full_path}") # Keep log for errors only
294
+ channels = []
295
+ num_channels = original_size[3] if len(original_size) == 4 and sitk_img_original.GetDimension() == 4 else 1
296
+ channel_target_spacing = TARGET_VOXEL_SPACING if TARGET_VOXEL_SPACING else original_spacing[:3] # Use 3D spacing
297
+
298
+
299
+ for i in range(num_channels):
300
+ extractor = sitk.ExtractImageFilter()
301
+ current_3d_channel_size = original_size[:3]
302
+
303
+ if sitk_img_original.GetDimension() == 4:
304
+ extractor.SetSize([current_3d_channel_size[0], current_3d_channel_size[1], current_3d_channel_size[2], 0])
305
+ extractor.SetIndex([0,0,0,i])
306
+ channel_3d_img = extractor.Execute(sitk_img_original)
307
+ else:
308
+ channel_3d_img = sitk_img_original
309
+ if i > 0: break
310
+
311
+ channel_resampler = util.get_unisize_resampler(
312
+ channel_3d_img, 'linear',
313
+ spacing=channel_target_spacing, size=current_3d_channel_size
314
+ )
315
+ if channel_resampler:
316
+ channels.append(channel_resampler.Execute(channel_3d_img))
317
+ else:
318
+ channels.append(channel_3d_img)
319
+
320
+ if channels:
321
+ if len(channels) > 1: # Only join if there are multiple channels
322
+ sitk_img_processed = sitk.JoinSeriesImageFilter().Execute(channels)
323
+ ##added by yanguoqing on 2025-08-11
324
+ frame_flag=True
325
+ # imgDict={}
326
+ # for kf_idx in range(num_channels):
327
+ # imgDict[str(kf_idx)]='none'
328
+ # if str(meta_ed):imgDict[str(meta_ed)]='ed'
329
+ # if str(meta_es):imgDict[str(meta_es)]='es'
330
+ # meta.add_keyvalue('ImgDict',imgDict)
331
+ elif len(channels) == 1: # If only one channel resulted (e.g. original was 3D misidentified as 4D by tensorImageSize)
332
+ sitk_img_processed = channels[0]
333
+ elif TARGET_VOXEL_SPACING: # 3D image with target spacing
334
+ img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',
335
+ spacing=TARGET_VOXEL_SPACING, size=original_size)
336
+ if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original)
337
+ else: # 3D image, no TARGET_VOXEL_SPACING
338
+ img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',
339
+ spacing=original_spacing, size=original_size)
340
+ if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original)
341
+
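+ # Editorial sketch of the channel-wise 4D handling above (mirrors, not replaces, the
+ # logic): each 3D channel is pulled out with ExtractImageFilter (a size of 0 on axis 3
+ # collapses that axis), resampled on its own, then re-joined along a new 4th axis:
+ #   ext = sitk.ExtractImageFilter()
+ #   ext.SetSize([sx, sy, sz, 0]); ext.SetIndex([0, 0, 0, i])
+ #   ch3d = ext.Execute(img4d)                    # i-th 3D channel
+ #   ch3d = resampler.Execute(ch3d)               # same target spacing for every channel
+ #   img4d_new = sitk.JoinSeries(channels)        # after looping over all channels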
342
+
343
+ output_path = os.path.join(output_dir,fp_name,fp_name+".nii.gz")
344
+ # output_path=convert_windows_to_linux_path(output_path)
345
+ save_nifti(sitk_img_processed, output_path, os.path.dirname(ct_fp))
346
+ print(f"Saved NIfTI file to {output_path}")
347
+
348
+
349
+ size_processed = list(sitk_img_processed.GetSize())
350
+ print('size_processed',size_processed,original_size)
351
+
352
+ # meta.add_keyvalue('Image_id',meta_image_id)
353
+ meta.add_keyvalue('Spacing_mm',min(original_spacing[:3]))##keep the minimum spacing over the first three (x, y, z) axes
354
+ meta.add_keyvalue('OriImg_path',",".join(series_files))
355
+ meta.add_keyvalue('Size',size_processed) # use the post-processing size here -- YH Jachin
356
+ meta.add_keyvalue('Modality',modality)
357
+ meta.add_keyvalue('Dataset_name',study)
358
+ meta.add_keyvalue('ROI','whole-body')
359
+
360
+
361
+ sub_modality_dict={}
362
+ for idx,value in enumerate(sub_modality):
363
+ if value:
364
+ sub_modality_dict[str(idx)]=SUB_MODALITY[idx]
365
+
366
+ meta.add_keyvalue('Sub_modality',sub_modality_dict)
367
+
368
+ meta.add_keyvalue('Label_Dict',LABEL_DICT)
369
+
370
+
371
+ ##Label processing
372
+
373
+ label_path_dict={}
374
+ full_label_file=label_fp
375
+ full_path_label=os.path.dirname(full_label_file)
376
+ process_label_path=os.path.join(output_dir,fp_name,'segmentation')
377
+
378
+ processed_lbl_full_path=os.path.join(process_label_path, f"{fp_name}.nii.gz")
379
+
380
+ if not os.path.isdir(process_label_path):
381
+ os.makedirs(process_label_path,exist_ok=True)
382
+
383
+ if not os.path.isfile(full_label_file):
384
+ pass
385
+ label_flag=False
386
+ else:
387
+ sitk_lbl_original = util.load_nifti(full_label_file)
388
+
389
+ if sitk_lbl_original:
390
+ label_resampler = sitk.ResampleImageFilter()
391
+ reference_for_label = sitk_img_processed # Default to processed image
392
+
393
+ if sitk_img_processed.GetDimension() == 4:
394
+ num_comp_proc = sitk_img_processed.GetSize()[3] if len(sitk_img_processed.GetSize()) == 4 else 1
395
+ if num_comp_proc > 0:
396
+ extractor = sitk.ExtractImageFilter()
397
+ proc_img_size_for_lbl_ref = sitk_img_processed.GetSize()
398
+ extractor.SetSize([proc_img_size_for_lbl_ref[0], proc_img_size_for_lbl_ref[1], proc_img_size_for_lbl_ref[2], 0])
399
+ extractor.SetIndex([0,0,0,0])
400
+ try:
401
+ reference_for_label = extractor.Execute(sitk_img_processed)
402
+ except Exception as ref_err:
403
+ print(f" Failed to extract 3D reference from 4D image: {output_path} for label alignment.")
404
+ # print(traceback.format_exc())
405
+ reference_for_label = None
406
+ else: # Fallback if extraction fails
407
+ print(f" Could not extract 3D reference for label from 4D image {output_path}. Label may not be correctly resampled.")
408
+ reference_for_label = None # This will cause an issue below if not handled
409
+
410
+ sitk_lbl_processed = None
411
+
412
+ if reference_for_label and reference_for_label.GetDimension() > 0:
413
+ label_resampler.SetInterpolator(sitk.sitkNearestNeighbor)
414
+ label_resampler.SetOutputPixelType(sitk_lbl_original.GetPixelID())
415
+
416
+ if sitk_lbl_original.GetDimension() == 4:
417
+ lbl_channels = []
418
+ lbl_size = list(sitk_lbl_original.GetSize())
419
+ for i in range(lbl_size[3]):
420
+ extractor = sitk.ExtractImageFilter()
421
+ extractor.SetSize([lbl_size[0], lbl_size[1], lbl_size[2], 0])
422
+ extractor.SetIndex([0, 0, 0, i])
423
+ single_channel = extractor.Execute(sitk_lbl_original)
424
+
425
+ label_resampler.SetReferenceImage(reference_for_label)
426
+ resampled_channel = label_resampler.Execute(single_channel)
427
+ lbl_channels.append(resampled_channel)
428
+
429
+ if len(lbl_channels) > 1:
430
+ sitk_lbl_processed = sitk.JoinSeriesImageFilter().Execute(lbl_channels)
431
+ elif len(lbl_channels) == 1:
432
+ sitk_lbl_processed = lbl_channels[0]
433
+ else:
434
+ label_resampler.SetReferenceImage(reference_for_label)
435
+ sitk_lbl_processed = label_resampler.Execute(sitk_lbl_original)
436
+ if processed_lbl_full_path:
437
+ if sitk_img_processed.GetSize()[:3] != sitk_lbl_processed.GetSize()[:3]:
438
+ print(f" Mismatch between image and label size (ignoring channels):")
439
+ print(f" Image size: {sitk_img_processed.GetSize()}")
440
+ print(f" Label size: {sitk_lbl_processed.GetSize()}")
441
+ util.save_nifti(sitk_lbl_processed, processed_lbl_full_path, full_path_label)
442
+ else:
443
+ print(f" Failed to set reference image for label resampling for {full_path_label}. Saving original label.")
444
+ util.save_nifti(sitk_lbl_original, processed_lbl_full_path, full_path_label) # Save original
445
+ # processed_lbl_full_path should still point to this saved original label
446
+ sitk_lbl_processed=sitk_lbl_original
447
+ else:
448
+ processed_lbl_full_path = None
449
+
450
+
451
+ if processed_lbl_full_path: util.save_nifti(sitk_lbl_original, processed_lbl_full_path, full_label_file) # Save original (skipped when no output path was set above)
452
+ print(f"Saved Segemention NIfTI file to {processed_lbl_full_path}")
453
+
454
+
455
+
456
+
457
+ if processed_lbl_full_path:
458
+ label_path_dict['tumor'] = processed_lbl_full_path
459
+ print(label_path_dict.keys())
460
+ meta.add_keyvalue('Task',TASK_VALUE)
461
+ # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys()))
462
+ meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
463
+ meta.add_keyvalue('Label_Dict',LABEL_DICT)
464
+ meta.add_extra_keyvalue('Metadata',CIA_other_info)
465
+
466
+
467
+
468
+
469
+
470
+ # try:
471
+ # assert sitk_img_processed.GetSize() == sitk_lbl_processed.GetSize()
472
+ # except Exception as e:
473
+ # failed_files.append(full_path_label)
474
+ # continue
475
+ print(sitk_img_original.GetSize(),sitk_lbl_original.GetSize())
476
+
477
+ except Exception as e:
478
+ print(e)
479
+ failed_files.append(ct_fp)
480
+ print(f"Failed to load PSMA images from {ct_fp}")
481
+ continue
482
+
483
+
484
+
485
+ meta.add_extra_keyvalue('Metadata',CIA_other_info)
486
+
487
+
488
+ # Write the mapping to the JSON file on the fly
489
+ with open(json_output_path, 'r+') as json_file:
490
+ existing_mappings = json.load(json_file)
491
+ existing_mappings[output_path] = meta.get_meta_data()
492
+ json_file.seek(0)
493
+ # print(existing_mappings)
494
+ json.dump(existing_mappings, json_file, indent=4)
495
+ json_file.truncate()
496
+ # else:
497
+ # print("No metadata.csv files found.")
498
+
499
+ with open(failed_files_path, "w") as json_file:
500
+ json.dump(failed_files, json_file)
501
+
502
+ print(f"The list has been written to {failed_files_path}")
503
+ print(f"Saved NIfTI mappings to {json_output_path}")
504
+
505
+ if __name__ == "__main__":
506
+ parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
507
+ parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/PSMA/psma-fdg-pet-ct-lesion/")
508
+ parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/PSMA/PSMA-FDG-PET-CT-LESION/")
509
+ args = parser.parse_args()
510
+ print(args.target_path, args.output_dir)
511
+ main(args.target_path, args.output_dir)
512
+
513
+
514
+
515
+
516
+
517
+
518
+
519
+
520
+
521
+
522
+
523
+
524
+
525
+
PSMA_clean/dataclean_PSMA_petct_v2.py ADDED
@@ -0,0 +1,423 @@
1
+ #coding:utf-8
2
+ '''
3
+ written by ygq
4
+ created on 2025-08-30
5
+ PSMA PET/CT is essentially still a PET/CT scan; it differs from conventional 18F-FDG PET/CT only in its tracer. The PSMA PET/CT tracers most widely used internationally are 68Ga-PSMA and 18F-PSMA, where 68Ga and 18F are radionuclides that provide the imaging signal, while PSMA (prostate-specific membrane antigen) provides the targeting, guiding the tracer to cluster more accurately around prostate cancer cells and thereby greatly increasing the sensitivity of PSMA PET/CT for detecting prostate cancer.
6
+
7
+ PSMA, prostate-specific membrane antigen, is a protein closely associated with prostate cancer. It is an integral membrane protein of prostate epithelial cells, strongly expressed on the surface of prostate cancer cells and expressed at comparatively low levels in normal prostate and non-prostate tissue; expression is 100-1000 times that of normal prostate cells and correlates positively with prostate cancer grade and stage. This strong, highly specific expression makes PSMA an important target for prostate cancer diagnosis and treatment.
8
+ PSMA PET/CT is in effect a form of targeted imaging: a PSMA ligand labeled with a radionuclide (commonly 68Ga or 18F) serves as the tracer, is injected intravenously, distributes and accumulates in lesions, and is then scanned with PET/CT to complete the imaging. Guided by PSMA, the radionuclide concentrates more precisely in prostate cancer cells, and combining positron emission tomography (PET) with computed tomography (CT) enables precise detection of prostate cancer.
9
+
10
+
11
+ FDG PET/CT and PSMA PET/CT are like a "tumor detective" using different investigative tools: each has its strengths and they complement each other. FDG and PSMA are two different PET tracers whose imaging principles differ, so they track different "target molecules".
12
+ FDG PET/CT
13
+ Target: the glucose consumed by malignant tumor cells (roughly, measuring the tumor's "appetite")
14
+ Principle: malignant tumor cells grow and metabolize rapidly and take up large amounts of the tracer FDG (a glucose analog); tumors are located by detecting "high energy consumption" regions
15
+ Strengths: a broad-spectrum tumor tracer, mature and widely used; reflects tumor malignancy and can reveal malignancies at other sites at the same time
16
+ Limitations: when tumor cells are few or of low-grade malignancy, their glucose uptake often drops and the PET image shows low metabolism, so lesions are easily missed
17
+
18
+ PSMA PET/CT
19
+ Target: prostate-specific membrane antigen (a special "badge" worn by prostate cancer cells)
20
+ Principle: about 90% of prostate cancer cells wear this "badge"; PSMA tracks and binds to it, precisely locking onto prostate cancer lesions; wherever lights up, there is tumor
21
+ Strengths: highly specific; can detect tiny lesions early, even flagging disease while other examinations still look normal
22
+ Limitations: some normal or diseased cells in the body (e.g. ganglia, neural tissue, granulomatous lesions, renal cancer, lung cancer) also express PSMA highly and can produce false positives; in addition, roughly 10% of prostate cancer cells do not wear this "badge", leading to missed diagnoses
23
+
24
+
25
+ The PSMA-FDG-PET-CT-Lesion dataset contains both PSMA-PET and FDG-PET scans (each with its corresponding CT) together with lesion annotations.
26
+ Such datasets are extremely valuable in prostate cancer research because they allow researchers to directly compare and analyze the molecular expression characteristics of different lesions within the same patient.
27
+ Prostate cancer lesions are molecularly heterogeneous: not all lesions express the same biomarkers.
28
+ PSMA (prostate-specific membrane antigen): overexpressed on the surface of most prostate cancer cells and a relatively prostate-cancer-specific target; PSMA-PET detects prostate-cancer-specific lesions.
29
+ FDG (fluorodeoxyglucose): reflects cellular glucose metabolism; highly aggressive, poorly differentiated tumors usually show very high FDG uptake.
30
+
31
+
32
+
33
+ PSMA-FDG-PET/CT:
34
+
35
+ https://autopet-iii.grand-challenge.org/
36
+ "channel_names": {
37
+ "0": "CT",
38
+ "1": "CT" (note: this channel is actually PET)
39
+ },
40
+ "labels": {
41
+ "background": 0,
42
+ "tumor": 1
43
+ },
44
+ Each case spans two volumes, 0000 and 0001, representing CT and PET respectively; they are merged into the fourth dimension as SUB_MODALITY.
45
+
46
+ label:
47
+ 0: background  1: tumor
48
+
49
+ FDG metadata fields
50
+ 'Series UID', 'Collection', '3rd Party Analysis',
51
+ 'Data Description URI', 'Subject ID', 'Study UID', 'Study Description',
52
+ 'Study Date', 'Series Description', 'Manufacturer', 'Modality',
53
+ 'SOP Class Name', 'SOP Class UID', 'Number of Images', 'File Size',
54
+ 'File Location', 'Download Timestamp', 'diagnosis', 'age', 'sex'
55
+ Subject ID together with Modality identifies the unique description row, from which Study Description, Study Date, Series Description, Manufacturer, diagnosis, age and sex are taken. [Only the single row for the CT modality is needed.]
56
+ FDG file name layout: fdg_b2f82ed4b9_04-17-2003-NA-PET-CT Ganzkoerper primaer mit KM-26753_[0000].nii.gz
57
+ Subject ID[PETCT_b2f82ed4b9] && Modality[CT]
58
+
59
+
60
+ PSMA metadata fields
61
+ 'Subject ID', 'Study Date', 'age', 'manufacturer_model_name',
62
+ 'pet_radionuclide', 'ct_contrast_agent'
63
+ 'Subject ID' and 'Study Date' are both needed to identify a unique record; the same subject_id can appear with different dates, and such cases are treated as separate data points.
64
+ PSMA file name layout: psma_d5b636ea4da7638b_2019-03-15_[0000].nii.gz
65
+ Subject ID[psma_d5b636ea4da7638b]&&Study Date[2019-03-15]
66
+
67
+ In summary: the ID is defined as subject_id + study_date, which together identify a unique case.
68
+
69
+ Processing pipeline:
70
+ 1. Find all IDs;
71
+ 2. For each ID, find the two channel images and the corresponding label;
72
+ 3. Merge the two channel images into one 4D volume;
73
+ 4. Resample/interpolate following the usual 4D convention (the fourth dimension takes no part in the computation; use the minimum spacing of the first 3 axes); likewise for the label
74
+ 5. Save
75
+
76
+ '''
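+ # Editorial sketch: v2's check_fname() below additionally normalizes the FDG date from
+ # the MM-DD-YYYY form embedded in the file name to the YYYY-MM-DD form used by the
+ # 'Study Date' column of fdg_metadata.csv (sample value from the docstring):
+ #   "04-17-2003".split("-") -> ["04", "17", "2003"] -> "2003" + "-" + "04" + "-" + "17"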
77
+ import os
78
+ import glob
79
+ import pandas as pd
80
+ import SimpleITK as sitk
81
+ import argparse
82
+ import json
83
+ from tqdm import tqdm
84
+ from util import meta_data
85
+ import util
86
+ import numpy as np
87
+ # from bert_helper import *
88
+
89
+ import shutil
90
+ ##dataset_meta
91
+ meta_id_name='BraTS_2019_subject_ID'
92
+ meta_grade_name='Grade'
93
+
94
+ ##HGG_survival_info
95
+ survival_id_name='BraTS19ID'
96
+ meta_age_name='Age'
97
+ meta_survival_name='Survival'
98
+ meta_status_name='ResectionStatus'
99
+
100
+
101
+ TASK_VALUE="segmentation"
102
+ CLAMP_RANGE_CT = [-400,400]
103
+ CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
104
+ TARGET_VOXEL_SPACING=None
105
+
106
+ ## Follows the MSD sub_modality description convention
107
+ SUB_MODALITY=["CT","PET"]
108
+ ## Order of the series suffixes in the file names
109
+ SERIES_ORDER=["0000","0001"]
110
+
111
+ LABEL_DICT={
112
+ "0":"backgroud",
113
+ "1":"tumor",
114
+ }
115
+ PSMA_META_COLUMN=['Subject ID', 'Study Date', 'age', 'manufacturer_model_name','pet_radionuclide', 'ct_contrast_agent']
116
+ FDG_META_COLUMN=['Subject ID', 'Study Description','Study Date', 'Series Description', 'Manufacturer', 'Modality','diagnosis', 'age', 'sex']
117
+ # def find_metadata_files(path):
118
+ # # for Cancer Image Archive (TCIA) dataset
119
+ # search_pattern = os.path.join(path, '**', 'metadata.csv')
120
+ # return glob.glob(search_pattern, recursive=True)
121
+
122
+ def find_metadata_files(path):
123
+ # for Cancer Image Archive (TCIA) dataset
124
+ search_pattern = os.path.join(path, '*.csv')
125
+ return glob.glob(search_pattern, recursive=True)
126
+ ##added by yanguoqing on 20250527
127
+ def find_image_dirs(path):
128
+ return os.listdir(path)
129
+
130
+ ##modify by yanguoqing on 20250527
131
+ def load_dicom_images(folder_path):
132
+ reader = sitk.ImageSeriesReader()
133
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
134
+ reader.SetFileNames(dicom_names)
135
+ image = reader.Execute()
136
+ return dicom_names,image
137
+
138
+ ##added by yanguoqing on 20250527
139
+ def load_dicom_tag(imgs):
140
+ reader = sitk.ImageFileReader()
141
+ # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
142
+ reader.SetFileName(imgs)
143
+ reader.ReadImageInformation() # read only the header metadata, without loading pixel data
144
+ # metadata_keys = reader.GetMetaDataKeys()
145
+ tag=reader.Execute()
146
+ return tag
147
+
148
+ def load_nrrd(fp):
149
+ return sitk.ReadImage(fp)
150
+
151
+ ##modify by yanguoqing on 20250830
152
+ def merge_images(series_files):
153
+ '''
154
+ Each case contains two different series: CT/PET -- 0000/0001.
155
+ Merge the separate modalities into a 4D array, with channels stored in CT, PET order.
156
+ '''
157
+ reader = sitk.ImageSeriesReader()
158
+ reader.SetFileNames(series_files)
159
+ image = reader.Execute()
160
+ return image
161
+
162
+ def save_nifti(image, output_path, folder_path):
163
+ # Set metadata in the NIfTI file's header
164
+ output_dirpath = os.path.dirname(output_path)
165
+ if not os.path.exists(output_dirpath):
166
+ print(f"Creating directory {output_dirpath}")
167
+ os.makedirs(output_dirpath)
168
+ # Set metadata in the NIfTI file's header
169
+ image.SetMetaData("FolderPath", folder_path)
170
+ sitk.WriteImage(image, output_path)
171
+
172
+ ##modify by yanguoqing on 20250527
173
+ def convert_windows_to_linux_path(windows_path):
174
+ # Replace backslashes with forward slashes and remove the drive letter
175
+ # Some meta files have windows paths, but the data is stored on a linux server
176
+ linux_path = windows_path.replace('\\', '/')
177
+ if ':' in linux_path:
178
+ linux_path = linux_path.split(':', 1)[1]
179
+ return linux_path
180
+ ##added by yanguoqing on 2025-08-30
181
+ ## Get the names of the 1614 PSMA-PET-CT cases
182
+ def get_filename_list(fp):
183
+ with open(fp,'r') as fi:
184
+ fls=json.load(fi)
185
+ filename_list=fls[0]['train']+fls[0]['val']
186
+
187
+ return filename_list
188
+ ## Extract study_id and study_date from the file name
189
+ def check_fname(fname):
190
+ if fname.startswith("fdg"):
191
+ sid=fname[:14]
192
+ sdate=fname[15:25]
193
+ sdate=sdate.split("-")
194
+ sdate=sdate[-1]+"-"+sdate[0]+"-"+sdate[1]
195
+ else:
196
+ sid=fname[:21]
197
+ sdate=fname[22:]
198
+ return sid,sdate
199
+ def main(target_path, output_dir):
200
+ # metadata_files = find_metadata_files(target_path)
201
+ # pid_dirs=find_image_dirs(target_path)
202
+ fdg_meta="fdg_metadata.csv"
203
+ psma_meta="psma_metadata.csv"
204
+ filename_meta="splits_final.json" ##包含所有1614个数据的名称列表信息
205
+ # pid_dirs=["imagesTr","labelsTr"]
206
+ pid_dirs=["imagesTr"]
207
+ failed_files = []
208
+ if not os.path.isdir(output_dir):
209
+ os.makedirs(output_dir)
210
+ json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
211
+ failed_files_path = os.path.join(output_dir, 'failed_files.json')
212
+ meta = meta_data()
213
+
214
+ # Initialize the JSON file
215
+ if not os.path.exists(json_output_path):
216
+ with open(json_output_path, 'w') as json_file:
217
+ json.dump({}, json_file)
218
+ psma_meta_file=os.path.join(target_path,psma_meta)
219
+ fdg_meta_file=os.path.join(target_path,fdg_meta)
220
+
221
+ filename_file=os.path.join(target_path,filename_meta)
222
+
223
+ pdf_meta=pd.read_csv(psma_meta_file)
224
+ fdf_meta=pd.read_csv(fdg_meta_file)
225
+
226
+ fp_names=get_filename_list(filename_file)
227
+ ##From the auxiliary split file, collect all 1614 case names; each case has two 3D volumes (0000, 0001), merged in order;
228
+ if pid_dirs:
229
+ for pid_dir in tqdm(pid_dirs, desc="Processing all dataset"):
230
+ for fp_name in tqdm(fp_names, desc="Processing all dataset"):
231
+
232
+ ct_fp=os.path.join(target_path,pid_dir,fp_name+"_0000.nii.gz")
233
+ pet_fp=os.path.join(target_path,pid_dir,fp_name+"_0001.nii.gz")
234
+ label_fp=os.path.join(target_path,'labelsTr',fp_name+".nii.gz")
235
+
236
+ modality="CT"
237
+ study='PSMA-FDG-PET-CT-LESION'##Dataset_name
238
+ CIA_other_info = {'metadata_file':''}
239
+ CIA_other_info['split'] = "train"
240
+
241
+
242
+ if fp_name.startswith("fdg"):
243
+ CIA_other_info['metadata_file']=fdg_meta_file
244
+ df_meta=fdf_meta
245
+ sid,sdate=check_fname(fp_name)
246
+ study_id=sid.replace("fdg","PETCT")
247
+ data_info_row=df_meta[np.logical_and(df_meta['Subject ID']==study_id,df_meta['Modality']=='CT')]
248
+ data_info_row=data_info_row.reset_index()
249
+ for keyname in FDG_META_COLUMN:
250
+ CIA_other_info[keyname]=str(data_info_row[keyname][0])
251
+
252
+ CIA_other_info['Image_id']=sid+"_"+sdate
253
+ CIA_other_info['patientid']=sid
254
+ CIA_other_info['datetime']=sdate
255
+
256
+ else:
257
+ CIA_other_info['metadata_file']=psma_meta_file
258
+ df_meta=pdf_meta
259
+ sid,sdate=check_fname(fp_name)
260
+ study_id=sid.replace("psma","PSMA")
261
+ # print('>>',study_id,sdate)
262
+ data_info_row=df_meta[np.logical_and(df_meta['Subject ID']==study_id,df_meta['Study Date']==sdate)]
263
+ data_info_row=data_info_row.reset_index()
264
+ # print(data_info_row.columns)
265
+ for keyname in PSMA_META_COLUMN:
266
+ print(keyname)
267
+ print(data_info_row[keyname][0])
268
+ CIA_other_info[keyname]=str(data_info_row[keyname][0])
269
+
270
+ CIA_other_info['Image_id']=sid+"_"+sdate
271
+ CIA_other_info['patientid']=sid
272
+ CIA_other_info['datetime']=sdate
273
+
274
+
275
+
276
+
277
+
278
+
279
+
280
+ series_files=[ct_fp,pet_fp]
281
+ sub_modality=['CT','PET']
282
+ # if len(series_files)>0:
283
+ # ## valid image data present; proceed with further processing
284
+ # sitk_img_original=merge_images(series_files)
285
+
286
+
287
+ for fpidex,fp in enumerate(series_files):
288
+
289
+ try:
290
+ sitk_img_original=util.load_nifti(fp)
291
+
292
+ original_spacing = list(sitk_img_original.GetSpacing())
293
+ original_size = list(sitk_img_original.GetSize())
294
+
295
+
296
+ img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',spacing=original_spacing, size=original_size)
297
+
298
+ sitk_img_processed = img_resampler_obj.Execute(sitk_img_original)
299
+ ##CLAMP PET_CT
300
+
301
+ sitk_img_processed = util.clamp_image(sitk_img_processed, CLAMP_RANGE_CT)
302
+
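+ # Editorial note: CLAMP_RANGE_CT is applied to both entries of series_files here, so
+ # the PET channel is clamped to [-400, 400] as well. A minimal sketch of intensity
+ # clamping with plain SimpleITK (util.clamp_image is assumed to behave similarly):
+ #   clamped = sitk.Clamp(sitk_img_processed, lowerBound=-400.0, upperBound=400.0)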
303
+
304
+ output_path = os.path.join(output_dir,sid+"_"+sdate,sid+"_"+sdate+"_%s.nii.gz"%sub_modality[fpidex])
305
+ # output_path=convert_windows_to_linux_path(output_path)
306
+ save_nifti(sitk_img_processed, output_path, os.path.dirname(fp))
307
+ print(f"Saved NIfTI file to {output_path}")
308
+
309
+
310
+ size_processed = list(sitk_img_processed.GetSize())
311
+ print('size_processed',size_processed,original_size)
312
+
313
+ # meta.add_keyvalue('Image_id',meta_image_id)
314
+ meta.add_keyvalue('Spacing_mm',min(original_spacing))##keep the minimum spacing across the x, y, z axes
315
+ meta.add_keyvalue('OriImg_path',fp)
316
+ meta.add_keyvalue('Size',size_processed) # use the post-processing size here -- YH Jachin
317
+ meta.add_keyvalue('Modality',sub_modality[fpidex])
318
+ meta.add_keyvalue('Dataset_name',study)
319
+ meta.add_keyvalue('ROI','whole-body')
320
+
321
+
322
+ # sub_modality_dict={}
323
+ # for idx,value in enumerate(sub_modality):
324
+ # if value:
325
+ # sub_modality_dict[str(idx)]=SUB_MODALITY[idx]
326
+
327
+ # meta.add_keyvalue('Sub_modality',sub_modality_dict)
328
+
329
+ meta.add_keyvalue('Label_Dict',LABEL_DICT)
330
+
331
+
332
+ ##Label processing
333
+
334
+ label_path_dict={}
335
+ full_label_file=label_fp
336
+ full_path_label=os.path.dirname(full_label_file)
337
+ process_label_path=os.path.join(output_dir,sid+"_"+sdate,'segmentation')
338
+
339
+ processed_lbl_full_path=os.path.join(process_label_path, "%s_%s.nii.gz"%(sid+"_"+sdate,sub_modality[fpidex]))
340
+
341
+ if not os.path.isdir(process_label_path):
342
+ os.makedirs(process_label_path,exist_ok=True)
343
+
344
+ if not os.path.isfile(full_label_file):
345
+ pass
346
+ label_flag=False
347
+ else:
348
+ sitk_lbl_original = util.load_nifti(full_label_file)
349
+
350
+ if sitk_lbl_original:
351
+ resampler =util.get_unisize_resampler(sitk_lbl_original, interpolator='nearest', spacing=original_spacing, size=original_size)
352
+ if resampler is not None:
353
+ proces_label = resampler.Execute(sitk_lbl_original)
354
+ else:
355
+ proces_label = sitk_lbl_original
356
+
357
+
358
+ try:
359
+ assert sitk_img_processed.GetSize() == proces_label.GetSize()
360
+ except Exception as e:
361
+ failed_files.append(full_path_label)
362
+ continue
363
+
364
+
365
+
366
+ util.save_nifti(proces_label, processed_lbl_full_path, os.path.dirname(full_label_file)) # Save the resampled label
367
+ print(f"Saved Segemention NIfTI file to {processed_lbl_full_path}")
368
+
369
+ label_path_dict['tumor'] = processed_lbl_full_path
370
+ print(label_path_dict.keys())
371
+ meta.add_keyvalue('Task',TASK_VALUE)
372
+ # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys()))
373
+ meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
374
+ meta.add_keyvalue('Label_Dict',LABEL_DICT)
375
+ meta.add_extra_keyvalue('Metadata',CIA_other_info)
376
+
377
+
378
+
379
+ # Write the mapping to the JSON file on the fly
380
+ with open(json_output_path, 'r+') as json_file:
381
+ existing_mappings = json.load(json_file)
382
+ existing_mappings[output_path] = meta.get_meta_data()
383
+ json_file.seek(0)
384
+ # print(existing_mappings)
385
+ json.dump(existing_mappings, json_file, indent=4)
386
+ json_file.truncate()
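+ # Editorial note: the r+/seek(0)/truncate() pattern above rewrites the mapping file in
+ # place after every case. A crash-safer alternative (a sketch, not what this script
+ # does) writes to a temporary file and atomically swaps it in:
+ #   tmp = json_output_path + ".tmp"
+ #   with open(tmp, "w") as f: json.dump(existing_mappings, f, indent=4)
+ #   os.replace(tmp, json_output_path)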
387
+ # else:
388
+ except Exception as e:
389
+ print(e)
390
+ failed_files.append(fp)
391
+ print(f"Failed to load PSMA images from {fp}")
392
+ continue
393
+
394
+
395
+ # print("No metadata.csv files found.")
396
+
397
+ with open(failed_files_path, "w") as json_file:
398
+ json.dump(failed_files, json_file)
399
+
400
+ print(f"The list has been written to {failed_files_path}")
401
+ print(f"Saved NIfTI mappings to {json_output_path}")
402
+
403
+ if __name__ == "__main__":
404
+ parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
405
+ parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/PSMA/psma-fdg-pet-ct-lesion/")
406
+ parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/PSMA/PSMA-FDG-PET-CT-LESION/V2/")
407
+ args = parser.parse_args()
408
+ print(args.target_path, args.output_dir)
409
+ main(args.target_path, args.output_dir)
410
+
411
+
412
+
413
+
414
+
415
+
416
+
417
+
418
+
419
+
420
+
421
+
422
+
423
+
PSMA_clean/dataclean_PSMA_petct_v2_json.py ADDED
@@ -0,0 +1,425 @@
1
+ #coding:utf-8
2
+ '''
3
+ written by ygq
4
+ created on 2025-08-30
5
+ PSMA PET/CT is essentially still a PET/CT scan; it differs from conventional 18F-FDG PET/CT only in its tracer. The PSMA PET/CT tracers most widely used internationally are 68Ga-PSMA and 18F-PSMA, where 68Ga and 18F are radionuclides that provide the imaging signal, while PSMA (prostate-specific membrane antigen) provides the targeting, guiding the tracer to cluster more accurately around prostate cancer cells and thereby greatly increasing the sensitivity of PSMA PET/CT for detecting prostate cancer.
6
+
7
+ PSMA, prostate-specific membrane antigen, is a protein closely associated with prostate cancer. It is an integral membrane protein of prostate epithelial cells, strongly expressed on the surface of prostate cancer cells and expressed at comparatively low levels in normal prostate and non-prostate tissue; expression is 100-1000 times that of normal prostate cells and correlates positively with prostate cancer grade and stage. This strong, highly specific expression makes PSMA an important target for prostate cancer diagnosis and treatment.
8
+ PSMA PET/CT is in effect a form of targeted imaging: a PSMA ligand labeled with a radionuclide (commonly 68Ga or 18F) serves as the tracer, is injected intravenously, distributes and accumulates in lesions, and is then scanned with PET/CT to complete the imaging. Guided by PSMA, the radionuclide concentrates more precisely in prostate cancer cells, and combining positron emission tomography (PET) with computed tomography (CT) enables precise detection of prostate cancer.
9
+
10
+
11
+ FDG PET/CT and PSMA PET/CT are like a "tumor detective" using different investigative tools: each has its strengths and they complement each other. FDG and PSMA are two different PET tracers whose imaging principles differ, so they track different "target molecules".
12
+ FDG PET/CT
13
+ Target: the glucose consumed by malignant tumor cells (roughly, measuring the tumor's "appetite")
14
+ Principle: malignant tumor cells grow and metabolize rapidly and take up large amounts of the tracer FDG (a glucose analog); tumors are located by detecting "high energy consumption" regions
15
+ Strengths: a broad-spectrum tumor tracer, mature and widely used; reflects tumor malignancy and can reveal malignancies at other sites at the same time
16
+ Limitations: when tumor cells are few or of low-grade malignancy, their glucose uptake often drops and the PET image shows low metabolism, so lesions are easily missed
17
+
18
+ PSMA PET/CT
19
+ Target: prostate-specific membrane antigen (a special "badge" worn by prostate cancer cells)
20
+ Principle: about 90% of prostate cancer cells wear this "badge"; PSMA tracks and binds to it, precisely locking onto prostate cancer lesions; wherever lights up, there is tumor
21
+ Strengths: highly specific; can detect tiny lesions early, even flagging disease while other examinations still look normal
22
+ Limitations: some normal or diseased cells in the body (e.g. ganglia, neural tissue, granulomatous lesions, renal cancer, lung cancer) also express PSMA highly and can produce false positives; in addition, roughly 10% of prostate cancer cells do not wear this "badge", leading to missed diagnoses
23
+
24
+
25
+ The PSMA-FDG-PET-CT-Lesion dataset contains both PSMA-PET and FDG-PET scans (each with its corresponding CT) together with lesion annotations.
26
+ Such datasets are extremely valuable in prostate cancer research because they allow researchers to directly compare and analyze the molecular expression characteristics of different lesions within the same patient.
27
+ Prostate cancer lesions are molecularly heterogeneous: not all lesions express the same biomarkers.
28
+ PSMA (prostate-specific membrane antigen): overexpressed on the surface of most prostate cancer cells and a relatively prostate-cancer-specific target; PSMA-PET detects prostate-cancer-specific lesions.
29
+ FDG (fluorodeoxyglucose): reflects cellular glucose metabolism; highly aggressive, poorly differentiated tumors usually show very high FDG uptake.
30
+
31
+
32
+
33
+ PSMA-FDG-PET/CT:
34
+
35
+ https://autopet-iii.grand-challenge.org/
36
+ "channel_names": {
37
+ "0": "CT",
38
+ "1": "CT" (note: this channel is actually PET)
39
+ },
40
+ "labels": {
41
+ "background": 0,
42
+ "tumor": 1
43
+ },
44
+ Each case spans two volumes, 0000 and 0001, representing CT and PET respectively; they are merged into the fourth dimension as SUB_MODALITY.
45
+
46
+ label:
47
+ 0: background  1: tumor
48
+
49
+ FDG metadata fields
50
+ 'Series UID', 'Collection', '3rd Party Analysis',
51
+ 'Data Description URI', 'Subject ID', 'Study UID', 'Study Description',
52
+ 'Study Date', 'Series Description', 'Manufacturer', 'Modality',
53
+ 'SOP Class Name', 'SOP Class UID', 'Number of Images', 'File Size',
54
+ 'File Location', 'Download Timestamp', 'diagnosis', 'age', 'sex'
55
+ Subject ID together with Modality identifies the unique description row, from which Study Description, Study Date, Series Description, Manufacturer, diagnosis, age and sex are taken. [Only the single row for the CT modality is needed.]
56
+ FDG file name layout: fdg_b2f82ed4b9_04-17-2003-NA-PET-CT Ganzkoerper primaer mit KM-26753_[0000].nii.gz
57
+ Subject ID[PETCT_b2f82ed4b9] && Modality[CT]
58
+
59
+
60
+ PSMA metadata fields
61
+ 'Subject ID', 'Study Date', 'age', 'manufacturer_model_name',
62
+ 'pet_radionuclide', 'ct_contrast_agent'
63
+ 'Subject ID' and 'Study Date' are both needed to identify a unique record; the same subject_id can appear with different dates, and such cases are treated as separate data points.
64
+ PSMA file name layout: psma_d5b636ea4da7638b_2019-03-15_[0000].nii.gz
65
+ Subject ID[psma_d5b636ea4da7638b]&&Study Date[2019-03-15]
66
+
67
+ In summary: the ID is defined as subject_id + study_date, which together identify a unique case.
68
+
69
+ Processing pipeline:
70
+ 1. Find all IDs;
71
+ 2. For each ID, find the two channel images and the corresponding label;
72
+ 3. Merge the two channel images into one 4D volume;
73
+ 4. Resample/interpolate following the usual 4D convention (the fourth dimension takes no part in the computation; use the minimum spacing of the first 3 axes); likewise for the label
74
+ 5. Save
75
+
76
+ '''
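+ # Editorial sketch of the unique-row lookup described above (column names from the
+ # metadata csv, data values hypothetical): a boolean AND over two columns selects the
+ # single matching row, which is then re-indexed so position 0 can be read:
+ #   row = df_meta[(df_meta['Subject ID'] == 'PSMA_d5b636ea4da7638b') &
+ #                 (df_meta['Study Date'] == '2019-03-15')].reset_index()
+ #   age = str(row['age'][0])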
77
+ import os
78
+ import glob
79
+ import pandas as pd
80
+ import SimpleITK as sitk
81
+ import argparse
82
+ import json
83
+ from tqdm import tqdm
84
+ from util import meta_data
85
+ import util
86
+ import numpy as np
87
+ # from bert_helper import *
88
+
89
+ import shutil
90
+ ##dataset_meta
91
+ meta_id_name='BraTS_2019_subject_ID'
92
+ meta_grade_name='Grade'
93
+
94
+ ##HGG_survival_info
95
+ survival_id_name='BraTS19ID'
96
+ meta_age_name='Age'
97
+ meta_survival_name='Survival'
98
+ meta_status_name='ResectionStatus'
99
+
100
+
101
+ TASK_VALUE="segmentation"
102
+ CLAMP_RANGE_CT = [-400,400]
103
+ CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
104
+ TARGET_VOXEL_SPACING=None
105
+
106
+ ## Follows the MSD sub_modality description convention
107
+ SUB_MODALITY=["CT","PET"]
108
+ ## Order of the series suffixes in the file names
109
+ SERIES_ORDER=["0000","0001"]
110
+
111
+ LABEL_DICT={
112
+ "0":"backgroud",
113
+ "1":"tumor",
114
+ }
115
+ PSMA_META_COLUMN=['Subject ID', 'Study Date', 'age', 'manufacturer_model_name','pet_radionuclide', 'ct_contrast_agent']
116
+ FDG_META_COLUMN=['Subject ID', 'Study Description','Study Date', 'Series Description', 'Manufacturer', 'Modality','diagnosis', 'age', 'sex']
117
+ # def find_metadata_files(path):
118
+ # # for Cancer Image Archive (TCIA) dataset
119
+ # search_pattern = os.path.join(path, '**', 'metadata.csv')
120
+ # return glob.glob(search_pattern, recursive=True)
121
+
122
+ def find_metadata_files(path):
123
+ # for Cancer Image Archive (TCIA) dataset
124
+ search_pattern = os.path.join(path, '*.csv')
125
+ return glob.glob(search_pattern, recursive=True)
126
+ ##added by yanguoqing on 20250527
127
+ def find_image_dirs(path):
128
+ return os.listdir(path)
129
+
130
+ ##modify by yanguoqing on 20250527
131
+ def load_dicom_images(folder_path):
132
+ reader = sitk.ImageSeriesReader()
133
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
134
+ reader.SetFileNames(dicom_names)
135
+ image = reader.Execute()
136
+ return dicom_names,image
137
+
138
+ ##added by yanguoqing on 20250527
139
+ def load_dicom_tag(imgs):
140
+ reader = sitk.ImageFileReader()
141
+ # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
142
+ reader.SetFileName(imgs)
143
+ reader.ReadImageInformation() # read only the header metadata, without loading pixel data
144
+ # metadata_keys = reader.GetMetaDataKeys()
145
+ tag=reader.Execute()
146
+ return tag
147
+
148
+ def load_nrrd(fp):
149
+ return sitk.ReadImage(fp)
150
+
151
+ ##modify by yanguoqing on 20250830
152
+ def merge_images(series_files):
153
+ '''
154
+ Each case contains two different series: CT/PET -- 0000/0001.
155
+ Merge the separate modalities into a 4D array, with channels stored in CT, PET order.
156
+ '''
157
+ reader = sitk.ImageSeriesReader()
158
+ reader.SetFileNames(series_files)
159
+ image = reader.Execute()
160
+ return image
161
+
162
+ def save_nifti(image, output_path, folder_path):
163
+ # Set metadata in the NIfTI file's header
164
+ output_dirpath = os.path.dirname(output_path)
165
+ if not os.path.exists(output_dirpath):
166
+ print(f"Creating directory {output_dirpath}")
167
+ os.makedirs(output_dirpath)
168
+ # Set metadata in the NIfTI file's header
169
+ image.SetMetaData("FolderPath", folder_path)
170
+ sitk.WriteImage(image, output_path)
171
+
172
+ ##modify by yanguoqing on 20250527
173
+ def convert_windows_to_linux_path(windows_path):
174
+ # Replace backslashes with forward slashes and remove the drive letter
175
+ # Some meta files have windows paths, but the data is stored on a linux server
176
+ linux_path = windows_path.replace('\\', '/')
177
+ if ':' in linux_path:
178
+ linux_path = linux_path.split(':', 1)[1]
179
+ return linux_path
180
+ ##added by yanguoqing on 2025-08-30
181
+ ## Get the names of the 1614 PSMA-PET-CT cases
182
+ def get_filename_list(fp):
183
+ with open(fp,'r') as fi:
184
+ fls=json.load(fi)
185
+ filename_list=fls[0]['train']+fls[0]['val']
186
+
187
+ return filename_list
188
+ ## Extract study_id and study_date from the file name
189
+ def check_fname(fname):
190
+ if fname.startswith("fdg"):
191
+ sid=fname[:14]
192
+ sdate=fname[15:25]
193
+ sdate=sdate.split("-")
194
+ sdate=sdate[-1]+"-"+sdate[0]+"-"+sdate[1]
195
+ else:
196
+ sid=fname[:21]
197
+ sdate=fname[22:]
198
+ return sid,sdate
199
+ def main(target_path, output_dir):
200
+ # metadata_files = find_metadata_files(target_path)
201
+ # pid_dirs=find_image_dirs(target_path)
202
+ fdg_meta="fdg_metadata.csv"
203
+ psma_meta="psma_metadata.csv"
204
+ filename_meta="splits_final.json" ##包含所有1614个数据的名称列表信息
205
+ # pid_dirs=["imagesTr","labelsTr"]
206
+ pid_dirs=["imagesTr"]
207
+ failed_files = []
208
+ if not os.path.isdir(output_dir):
209
+ os.makedirs(output_dir)
210
+ json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
211
+ failed_files_path = os.path.join(output_dir, 'failed_files.json')
212
+ meta = meta_data()
213
+
214
+ # Initialize the JSON file
215
+ if not os.path.exists(json_output_path):
216
+ with open(json_output_path, 'w') as json_file:
217
+ json.dump({}, json_file)
218
+ psma_meta_file=os.path.join(target_path,psma_meta)
219
+ fdg_meta_file=os.path.join(target_path,fdg_meta)
220
+
221
+ filename_file=os.path.join(target_path,filename_meta)
222
+
223
+ pdf_meta=pd.read_csv(psma_meta_file)
224
+ fdf_meta=pd.read_csv(fdg_meta_file)
225
+
226
+ fp_names=get_filename_list(filename_file)
227
+ ##From the auxiliary split file, collect all 1614 case names; each case has two 3D volumes (0000, 0001), merged in order;
228
+ if pid_dirs:
229
+ for pid_dir in tqdm(pid_dirs, desc="Processing all dataset"):
230
+ for fp_name in tqdm(fp_names, desc="Processing all dataset"):
231
+
232
+ ct_fp=os.path.join(target_path,pid_dir,fp_name+"_0000.nii.gz")
233
+ pet_fp=os.path.join(target_path,pid_dir,fp_name+"_0001.nii.gz")
234
+ label_fp=os.path.join(target_path,'labelsTr',fp_name+".nii.gz")
235
+
236
+ modality="CT"
237
+ study='PSMA-FDG-PET-CT-LESION'##Dataset_name
238
+ CIA_other_info = {'metadata_file':''}
239
+ CIA_other_info['split'] = "train"
240
+
241
+
242
+ if fp_name.startswith("fdg"):
243
+ CIA_other_info['metadata_file']=fdg_meta_file
244
+ df_meta=fdf_meta
245
+ sid,sdate=check_fname(fp_name)
246
+ study_id=sid.replace("fdg","PETCT")
247
+ data_info_row=df_meta[np.logical_and(df_meta['Subject ID']==study_id,df_meta['Modality']=='CT')]
248
+ data_info_row=data_info_row.reset_index()
249
+ for keyname in FDG_META_COLUMN:
250
+ CIA_other_info[keyname]=str(data_info_row[keyname][0])
251
+
252
+ CIA_other_info['Image_id']=sid+"_"+sdate
253
+ CIA_other_info['patientid']=sid
254
+ CIA_other_info['datetime']=sdate
255
+
256
+ else:
257
+ CIA_other_info['metadata_file']=psma_meta_file
258
+ df_meta=pdf_meta
259
+ sid,sdate=check_fname(fp_name)
260
+ study_id=sid.replace("psma","PSMA")
261
+ # print('>>',study_id,sdate)
262
+ data_info_row=df_meta[np.logical_and(df_meta['Subject ID']==study_id,df_meta['Study Date']==sdate)]
263
+ data_info_row=data_info_row.reset_index()
264
+ # print(data_info_row.columns)
265
+ for keyname in PSMA_META_COLUMN:
266
+ try:
267
+ print(keyname)
268
+ print(data_info_row[keyname][0])
269
+ CIA_other_info[keyname]=str(data_info_row[keyname][0])
270
+ except Exception as e:
271
+ continue
272
+ CIA_other_info['Image_id']=sid+"_"+sdate
273
+ CIA_other_info['patientid']=sid
274
+ CIA_other_info['datetime']=sdate
275
+
276
+
277
+
278
+
279
+
280
+
281
+
282
+ series_files=[ct_fp,pet_fp]
283
+ sub_modality=['CT','PET']
284
+ # if len(series_files)>0:
285
+ # ## valid image data present; proceed with further processing
286
+ # sitk_img_original=merge_images(series_files)
287
+
288
+
289
+ for fpidex,fp in enumerate(series_files):
290
+
291
+ try:
292
+ sitk_img_original=util.load_nifti(fp)
293
+
294
+ original_spacing = list(sitk_img_original.GetSpacing())
295
+ original_size = list(sitk_img_original.GetSize())
296
+
297
+
298
+ img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',spacing=original_spacing, size=original_size)
299
+
300
+ sitk_img_processed = img_resampler_obj.Execute(sitk_img_original)
301
+ ##CLAMP PET_CT
302
+
303
+ sitk_img_processed = util.clamp_image(sitk_img_processed, CLAMP_RANGE_CT)
304
+
305
+
306
+ output_path = os.path.join(output_dir,sid+"_"+sdate,sid+"_"+sdate+"_%s.nii.gz"%sub_modality[fpidex])
307
+ # output_path=convert_windows_to_linux_path(output_path)
308
+ #save_nifti(sitk_img_processed, output_path, os.path.dirname(fp))
309
+ print(f"Saved NIfTI file to {output_path}")
310
+
311
+
312
+ size_processed = list(sitk_img_processed.GetSize())
313
+ print('size_processed',size_processed,original_size)
314
+
315
+ # meta.add_keyvalue('Image_id',meta_image_id)
316
+ meta.add_keyvalue('Spacing_mm',min(original_spacing))##keep the minimum spacing across the x, y, z axes
317
+ meta.add_keyvalue('OriImg_path',fp)
318
+ meta.add_keyvalue('Size',size_processed) # use the post-processing size here -- YH Jachin
319
+ meta.add_keyvalue('Modality',sub_modality[fpidex])
320
+ meta.add_keyvalue('Dataset_name',study)
321
+ meta.add_keyvalue('ROI','whole-body')
322
+
323
+
324
+ # sub_modality_dict={}
325
+ # for idx,value in enumerate(sub_modality):
326
+ # if value:
327
+ # sub_modality_dict[str(idx)]=SUB_MODALITY[idx]
328
+
329
+ # meta.add_keyvalue('Sub_modality',sub_modality_dict)
330
+
331
+ meta.add_keyvalue('Label_Dict',LABEL_DICT)
332
+
333
+
334
+ ##Label processing
335
+
336
+ label_path_dict={}
337
+ full_label_file=label_fp
338
+ full_path_label=os.path.dirname(full_label_file)
339
+ process_label_path=os.path.join(output_dir,sid+"_"+sdate,'segmentation')
340
+
341
+ processed_lbl_full_path=os.path.join(process_label_path, "%s_%s.nii.gz"%(sid+"_"+sdate,sub_modality[fpidex]))
342
+
343
+ if not os.path.isdir(process_label_path):
344
+ os.makedirs(process_label_path,exist_ok=True)
345
+
346
+ if not os.path.isfile(full_label_file):
347
+ pass
348
+ label_flag=False
349
+ else:
350
+ sitk_lbl_original = util.load_nifti(full_label_file)
351
+
352
+ if sitk_lbl_original:
353
+ resampler =util.get_unisize_resampler(sitk_lbl_original, interpolator='nearest', spacing=original_spacing, size=original_size)
354
+ if resampler is not None:
355
+ proces_label = resampler.Execute(sitk_lbl_original)
356
+ else:
357
+ proces_label = sitk_lbl_original
358
+
359
+
360
+ try:
361
+ assert sitk_img_processed.GetSize() == proces_label.GetSize()
362
+ except Exception as e:
363
+ failed_files.append(full_path_label)
364
+ continue
365
+
366
+
367
+
368
+ #util.save_nifti(proces_label, processed_lbl_full_path, os.path.dirname(full_label_file)) # Save original
369
+ print(f"Saved Segemention NIfTI file to {processed_lbl_full_path}")
370
+
371
+ label_path_dict['tumor'] = processed_lbl_full_path
372
+ print(label_path_dict.keys())
373
+ meta.add_keyvalue('Task',TASK_VALUE)
374
+ # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys()))
375
+ meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
376
+ meta.add_keyvalue('Label_Dict',LABEL_DICT)
377
+ meta.add_extra_keyvalue('Metadata',CIA_other_info)
378
+
379
+
380
+
381
+ # Write the mapping to the JSON file on the fly
382
+ with open(json_output_path, 'r+') as json_file:
383
+ existing_mappings = json.load(json_file)
384
+ existing_mappings[output_path] = meta.get_meta_data()
385
+ json_file.seek(0)
386
+ # print(existing_mappings)
387
+ json.dump(existing_mappings, json_file, indent=4)
388
+ json_file.truncate()
389
+ # else:
390
+ except Exception as e:
391
+ print(e)
392
+ failed_files.append(fp)
393
+ print(f"Failed to load PSMA images from {fp}")
394
+ continue
395
+
396
+
397
+ # print("No metadata.csv files found.")
398
+
399
+ with open(failed_files_path, "w") as json_file:
400
+ json.dump(failed_files, json_file)
401
+
402
+ print(f"The list has been written to {failed_files_path}")
403
+ print(f"Saved NIfTI mappings to {json_output_path}")
404
+
405
+ if __name__ == "__main__":
406
+ parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
407
+ parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/PSMA/psma-fdg-pet-ct-lesion/")
408
+ parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/PSMA/PSMA-FDG-PET-CT-LESION/V2/")
409
+ args = parser.parse_args()
410
+ print(args.target_path, args.output_dir)
411
+ main(args.target_path, args.output_dir)
412
+
413
+
414
+
415
+
416
+
417
+
418
+
419
+
420
+
421
+
422
+
423
+
424
+
425
+
PSMA_clean/demo.py ADDED
@@ -0,0 +1,451 @@
1
+ #coding:utf-8
2
+ '''
3
+ written by ygq
4
+ created on 2025-08-30
5
+
6
+
7
+ BL = Baseline
8
+ FU = Follow-up
9
+
10
+ 1. Baseline
11
+ Meaning: the first image (CT, MRI, X-ray, etc.) acquired early in the disease, before treatment, or at a specific reference time point.
12
+ Role: this image is the "starting line" or reference for judging disease severity and later change; doctors compare future images against the baseline to assess how the disease has evolved.
13
+ 2. Follow-up
14
+ Meaning: an image acquired after the baseline, either on schedule or as the condition requires.
15
+ Role: used to assess treatment response (e.g. whether a tumor has shrunk), monitor disease progression (e.g. whether lesions have grown or multiplied), or observe post-operative recovery.
16
+ How "BL FU" appears in reports:
17
+ When a radiologist writes "BL FU" or "compare to BL FU" in a report, they mean:
18
+ "The current imaging findings should be compared with the previously acquired baseline images to assess change."
19
+
20
+ For example:
21
+ Oncology patient: a lung cancer patient has a CT before chemotherapy (the baseline, BL) and another CT after 2 cycles of chemotherapy (the follow-up, FU). The radiologist compares the two images and reports: "Compared with the baseline CT (BL FU) of 20XX-XX-XX, the mass in the right lower lobe has clearly shrunk."
22
+ Chronic disease patient: for diseases requiring long-term monitoring, such as pneumonia, cirrhosis, or multiple sclerosis, doctors compare baseline and follow-up images to judge precisely whether the condition has improved, stabilized, or worsened.
23
+
24
+ label:
25
+ 0: background  1-N: tumor, where the exact values must be read from the corresponding json file
26
+
27
+ Case ID: a 10-character hexadecimal id; each corresponds to one csv file and to one or more BL and FU scans, each with its own json file and mask label file.
28
+ Note: the CSV holds all label info and ids; to store labels per tissue, the mask file can be combined with the csv/json info to extract lesions of the same lesion_type into separately stored label_dict entries.
29
+ BL images and their corresponding MASKs both live under the inputsTr directory
30
+ 命名形式:
31
+ 93dd4de5cd_BL_img_BL_img_00.nii.gz
32
+ 93dd4de5cd_BL_mask_BL_img_00.nii.gz
33
+ 93dd4de5cd_BL_00.json
34
+
35
+ FU在inputsTr目录下面,对应的mask在targetsTr力猛
36
+ 命名形式:
37
+ c6f057b865_FU_img_FU_img_00.nii.gz
38
+ c6f057b865_FU_mask_FU_img_00.nii.gz
39
+ c6f057b865_FU_img_FU_img_01.nii.gz
40
+ c6f057b865_FU_mask_FU_img_01.nii.gz
41
+ c6f057b865_FU_00.json
42
+ c6f057b865_FU_01.json
43
+
44
+
45
+ 元数据信息CSV-病灶或者癌症信息--对应基线的位置,对应的基线影像编号,位置,以及对应的随访位置编号以及病灶位置
46
+ lesion_id,cog_bl,img_id_bl,cog_propagated,cog_fu,img_id_fu,lesion_type
47
+ 1,84.9530896759608 273.525433308214 148.780708364732,00,108.78432777048911 320.7355032513338 543.6178096475021,116.270833333333 317.46130952381 548.446428571429,00,Lung
48
+ 2,206.307026476578 258.39816700611 177.256619144603,00,202.79674663210054 297.81536880017677 566.3173808142716,197.325938566553 300.598976109215 565.804607508532,00,Lymph node
49
+
50
+ json格式样例
51
+ {
52
+ "name": "Points of interest",
53
+ "points": [
54
+ {
55
+ "name": "1",
56
+ "point": [
57
+ 84.9530896759608,
58
+ 273.525433308214,
59
+ 148.780708364732
60
+ ]
61
+ },
62
+ {
63
+ "name": "2",
64
+ "point": [
65
+ 206.307026476578,
66
+ 258.39816700611,
67
+ 177.256619144603
68
+ ]
69
+ }
70
+ ],
71
+ "type": "Multiple points",
72
+ "version": {
73
+ "major": 1,
74
+ "minor": 0
75
+ }
76
+ }
77
+
78
+ 20251101补充增加,将病灶编号进行合并同类项目,
79
+ 注意处理完成后保留原影像的几何空间信息以及元数据文件信息
80
+
81
+
82
+ '''
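+ # A minimal sketch (not used by the pipeline) of the naming convention documented
+ # above: given a case-level JSON name, derive the paired image and mask file names.
+ # The helper name _paths_for_case_json is hypothetical.
+ def _paths_for_case_json(json_name):
+     # e.g. "c6f057b865_FU_00.json" -> stem "c6f057b865_FU_00"
+     stem = json_name[:-len(".json")]
+     cid, phase, idx = stem.split("_")                      # ("c6f057b865", "FU", "00")
+     img = f"{cid}_{phase}_img_{phase}_img_{idx}.nii.gz"    # always in inputsTr
+     mask = f"{cid}_{phase}_mask_{phase}_img_{idx}.nii.gz"  # BL: inputsTr, FU: targetsTr
+     return img, mask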
83
+ import os
84
+ import glob
85
+ import pandas as pd
86
+ import SimpleITK as sitk
87
+ import argparse
88
+ import json
89
+ from tqdm import tqdm
90
+ from util import meta_data
91
+ import util
92
+ import numpy as np
93
+ # from bert_helper import *
94
+
95
+ import shutil
96
+
97
+
98
+ ## Unified label encoding (lesion_type -> class id)
100
+ label_id_lut={'background': 0,
100
+ 'Lymph node': 1,
101
+ 'Lung': 2,
102
+ 'Soft tissue / Skin': 3,
103
+ 'Liver': 4,
104
+ 'Skeleton': 5,
105
+ 'Adrenals': 6,
106
+ 'Spleen': 7,
107
+ 'CNS': 8,
108
+ 'Kidney': 9,
109
+ 'Heart': 10,
110
+ 'Others': 11,
111
+ 'unclear': 12,
112
+ }
113
+
114
+
115
+ TASK_VALUE="segmentation"
116
+ CLAMP_RANGE_CT = [-300,300]
117
+ CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
118
+ TARGET_VOXEL_SPACING=None
119
+
120
+ # ## sub_modality description, following the MSD convention
121
+ # SUB_MODALITY=["CT","PET"]
122
+ # ## series order implied by the file name suffixes
123
+ # SERIES_ORDER=["0000","0001"]
124
+
125
+ ## Values 1-N are filled in from the corresponding JSON information
126
+ LABEL_DICT={
127
+ "0":"background",
128
+ }
129
+ META_COLUMN=['lesion_id', 'cog_bl', 'img_id_bl', 'cog_propagated', 'cog_fu','img_id_fu', 'lesion_type']
130
+
131
+ # def find_metadata_files(path):
132
+ # # for Cancer Image Archive (TCIA) dataset
133
+ # search_pattern = os.path.join(path, '**', 'metadata.csv')
134
+ # return glob.glob(search_pattern, recursive=True)
135
+
136
+ def find_metadata_files(path):
137
+ # for Cancer Image Archive (TCIA) dataset
138
+ search_pattern = os.path.join(path, '*.csv')
139
+ return glob.glob(search_pattern, recursive=True)
140
+ ##added by yanguoqing on 20250527
141
+ def find_image_dirs(path):
142
+ return os.listdir(path)
143
+
144
+ ## modified by yanguoqing on 20250527
145
+ def load_dicom_images(folder_path):
146
+ reader = sitk.ImageSeriesReader()
147
+ dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
148
+ reader.SetFileNames(dicom_names)
149
+ image = reader.Execute()
150
+ return dicom_names,image
151
+
152
+ ##added by yanguoqing on 20250527
153
+ def load_dicom_tag(imgs):
154
+ reader = sitk.ImageFileReader()
155
+ # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
156
+ reader.SetFileName(imgs)
157
+ reader.ReadImageInformation() # read metadata only, without loading pixel data
158
+ # metadata_keys = reader.GetMetaDataKeys()
159
+ tag=reader.Execute()
160
+ return tag
161
+
162
+ def load_nrrd(fp):
163
+ return sitk.ReadImage(fp)
164
+
165
+ ## modified by yanguoqing on 20250830
166
+ def merge_images(series_files):
167
+ '''
168
+ Each case contains two different series (CT/PET, suffixes 0000/0001);
169
+ merge the separate modalities into a fourth array dimension, stored in CT, PET order.
170
+ '''
171
+ reader = sitk.ImageSeriesReader()
172
+ reader.SetFileNames(series_files)
173
+ image = reader.Execute()
174
+ return image
175
+
176
+ def save_nifti(image, output_path, folder_path):
177
+ # Set metadata in the NIfTI file's header
178
+ output_dirpath = os.path.dirname(output_path)
179
+ if not os.path.exists(output_dirpath):
180
+ print(f"Creating directory {output_dirpath}")
181
+ os.makedirs(output_dirpath)
182
+ # Set metadata in the NIfTI file's header
183
+ image.SetMetaData("FolderPath", folder_path)
184
+ sitk.WriteImage(image, output_path)
185
+
186
+ ## modified by yanguoqing on 20250527
187
+ def convert_windows_to_linux_path(windows_path):
188
+ # Replace backslashes with forward slashes and remove the drive letter
189
+ # Some meta files have windows paths, but the data is stored on a linux server
190
+ linux_path = windows_path.replace('\\', '/')
191
+ if ':' in linux_path:
192
+ linux_path = linux_path.split(':', 1)[1]
193
+ return linux_path
194
+ ##added by yanguoqing on 2025-08-31
195
+ ## Return all CSV file names in the directory, i.e. one entry per case ID
196
+ def get_filename_list(fp_dir):
197
+ all_file_list=glob.glob("%s/*.csv"%fp_dir)
198
+
199
+
200
+ return all_file_list
201
+ ## Extract study_id and study_date from a file name
202
+ def check_fname(fname):
203
+ if fname.startswith("fdg"):
204
+ sid=fname[:14]
205
+ sdate=fname[15:25]
206
+ else:
207
+ sid=fname[:21]
208
+ sdate=fname[22:]
209
+ return sid,sdate
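+ # Usage sketch for check_fname (file names are hypothetical, inferred only from
+ # the slice offsets above):
+ #   check_fname("fdg_0af7ffe12a_2019-04-01")        -> ("fdg_0af7ffe12a", "2019-04-01")
+ #   check_fname("psma_95b833d46f8eae96_2017-11-18") -> ("psma_95b833d46f8eae96", "2017-11-18")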
210
+ def main(target_path, output_dir):
211
+
212
+ pid_dirs=["inputsTr"]
213
+ failed_files = []
214
+ if not os.path.isdir(output_dir):
215
+ os.makedirs(output_dir)
216
+ json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
217
+ failed_files_path = os.path.join(output_dir, 'failed_files.json')
218
+ meta = meta_data()
219
+
220
+ # Initialize the JSON file
221
+ if not os.path.exists(json_output_path):
222
+ with open(json_output_path, 'w') as json_file:
223
+ json.dump({}, json_file)
224
+
225
+
226
+ input_dir=os.path.join(target_path,'inputsTr')
227
+ target_dir=os.path.join(target_path,'targetsTr')
228
+
229
+ fp_files=get_filename_list(input_dir)
230
+ ## Get all 1614 case names from the auxiliary files; each case has two 3D volumes (0000, 0001) that are merged in order;
231
+ if pid_dirs:
232
+ for pid_dir in tqdm(pid_dirs, desc="Processing all datasets"):
233
+ for fp_file in tqdm(fp_files, desc="Processing case CSV files"):
234
+ meta_file=fp_file
235
+ df_meta=pd.read_csv(meta_file)
236
+
237
+ fp_name=os.path.basename(fp_file)[:-4]
238
+ ## Look up all BL and FU images and their corresponding masks in turn
239
+ for sub_mod in ['BL','FU']:
240
+
241
+ bl_fps=glob.glob("%s/%s_%s*.json"%(input_dir,fp_name,sub_mod))
242
+ if len(bl_fps)>0:
243
+ for bl_fp in bl_fps:
244
+ basename=os.path.basename(bl_fp)[:-5]
245
+ bl_fp_name=os.path.basename(bl_fp).replace("_%s_"%sub_mod,"_%s_img_%s_img_"%(sub_mod,sub_mod)).replace(".json",".nii.gz") # keyed on sub_mod so FU studies are matched too, not only BL
246
+ bl_fp_img=os.path.join(input_dir,bl_fp_name)
247
+
248
+ if os.path.isfile(bl_fp_img):
249
+ ## the image file exists, proceed with normal processing
250
+
251
+
252
+ bl_mask_name=os.path.basename(bl_fp).replace("_%s_"%sub_mod,"_%s_mask_%s_img_"%(sub_mod,sub_mod)).replace(".json",".nii.gz") # likewise keyed on sub_mod
253
+
254
+ bl_fp_mask=os.path.join(input_dir,bl_mask_name)
255
+ if os.path.isfile(bl_fp_mask):
256
+ label_fp=bl_fp_mask
257
+ label_flag=True
258
+ else:
259
+ bl_fp_mask=os.path.join(target_dir,bl_mask_name)
260
+ if os.path.isfile(bl_fp_mask):
261
+ label_fp=bl_fp_mask
262
+ label_flag=True
263
+ else:
264
+ label_fp=None
265
+ label_flag=False
266
+
267
+
268
+ modality="CT"
269
+ study='PSMA_Longitudinal_CT'##Dataset_name
270
+ CIA_other_info = {
271
+ 'Image_id':basename,
272
+ 'metadata_file':''
273
+ # 'Series_Description':serise_desc
274
+ }
275
+ CIA_other_info['split'] = "train"
276
+
277
+ CIA_other_info['metadata_file']=meta_file
278
+ stk_image=util.load_nifti(bl_fp_img)
279
+ spacing_info = stk_image.GetSpacing()
280
+ size = list(stk_image.GetSize())
281
+ resampler =util.get_unisize_resampler(stk_image, interpolator='linear', spacing=spacing_info, size=size)
282
+ if resampler is not None:
283
+ proces_image = resampler.Execute(stk_image)
284
+ print('SPACING INFO AFTER', proces_image.GetSpacing())
285
+ CIA_other_info['Resample'] = True
286
+ else:
287
+ proces_image = stk_image
288
+ CIA_other_info['Resample'] = False
289
+
290
+ output_path = os.path.join(output_dir,fp_name, f"{basename}.nii.gz")
291
+ # output_path=convert_windows_to_linux_path(output_path)
292
+ save_nifti(proces_image, output_path, input_dir)
293
+ print(f"Saved NIfTI file to {output_path}")
294
+
295
+
296
+
297
+
298
+ if label_flag:
299
+ label_path_dict = {}
300
+ label_stk_img=util.load_nifti(label_fp)
301
+
302
+ image_array = sitk.GetArrayFromImage(label_stk_img)
303
+ ## Reassign the label values, restore the original image's basic information, and merge same-type lesion IDs
304
+ with open(bl_fp,'r') as fi:
305
+ json_info=json.load(fi)
306
+
307
+ label_dict={
308
+ "0":"backgroud"
309
+ }
310
+
311
+ update_image_array=np.copy(image_array)
312
+ ## Group by lesion_type to obtain the merged label information
313
+ group_meta=df_meta.groupby('lesion_type')['lesion_id']
314
+ for name,group in group_meta:
315
+ ## group name and all lesion_ids within the group
316
+ ids=group_meta.get_group(name)
317
+ target_id=label_id_lut[name]
318
+ # ## (old approach) assign the smallest lesion_id of each group
319
+ # ids_min=ids.min()
320
+ # label_dict[str(ids_min)]=name
321
+ label_dict[str(target_id)]=name
322
+ ## remap every lesion_id in this group to the unified class id
323
+ for v in ids.tolist():
324
+ print(name,v,target_id)
325
+ update_image_array[image_array==v]=target_id
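+ # e.g. in the demo CSV, lesion_ids 1, 3 and 9 are all "Lymph node", so raw
+ # mask values 1, 3 and 9 all collapse to label_id_lut["Lymph node"] == 1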
326
+ print(np.where(update_image_array==10))
327
+ image_array=None
328
+ label_stk_img_update=sitk.GetImageFromArray(update_image_array)
329
+ label_stk_img_update.CopyInformation(label_stk_img)
330
+ # manually copy all metadata
331
+ # get the metadata keys
332
+ meta_keys = label_stk_img.GetMetaDataKeys()
333
+ for key in meta_keys:
334
+ value = label_stk_img.GetMetaData(key)
335
+ label_stk_img_update.SetMetaData(key, value)
336
+
337
+ # for lesion_info in json_info['points']:
338
+ # df_row=df_meta['lesion_type'][df_meta['lesion_id']==int(lesion_info['name'])]
339
+ # df_row=df_row.reset_index()
340
+ # lesion_type=df_row['lesion_type'][0]
341
+ # label_dict[lesion_info['name']]=lesion_type
342
+
343
+ resampler =util.get_unisize_resampler(label_stk_img_update, interpolator='nearest', spacing=spacing_info, size=size)
344
+ if resampler is not None:
345
+ proces_label = resampler.Execute(label_stk_img_update)
346
+
347
+ ary_process_label=sitk.GetArrayFromImage(proces_label)
348
+
349
+ if ary_process_label[-1,:,:].mean()==ary_process_label[-1,0,0] and ary_process_label[-1,0,0]>0:
350
+ print('unexpected uniform non-zero last slice:',ary_process_label[-1,0,0])
351
+ ary_process_label[-1,:,:]=0
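+ # (guard above: if the last slice resampled to a uniform non-zero value,
+ # treat it as a resampling edge artifact and zero it out)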
352
+
353
+ label_stk_img_process=sitk.GetImageFromArray(ary_process_label)
354
+ label_stk_img_process.CopyInformation(proces_label)
355
+ meta_keys = proces_label.GetMetaDataKeys()
356
+ for key in meta_keys:
357
+ value = proces_label.GetMetaData(key)
358
+ label_stk_img_process.SetMetaData(key, value)
359
+
360
+
361
+
362
+ else:
363
+ label_stk_img_process = label_stk_img_update
364
+
365
+ # print(proces_image.GetSize(),proces_label.GetSize())
366
+ try:
367
+ assert proces_image.GetSize() == label_stk_img_process.GetSize()
368
+ except Exception as e:
369
+ failed_files.append(label_fp)
370
+ continue
371
+
372
+ label_output_path = os.path.join(output_dir, fp_name, TASK_VALUE, f"{basename}.nii.gz")
373
+
374
+ label_path_dict['tumor'] = label_output_path
375
+ util.save_nifti(label_stk_img_process, label_output_path, label_fp)
376
+ print(f"Saved Label Segment NIfTI file to {label_output_path}")
377
+
378
+
379
+
380
+ else:
381
+ continue
382
+
383
+
384
+
385
+
386
+ size_processed = list(proces_image.GetSize())
387
+ print('size_processed',size_processed,size)
388
+
389
+ # meta.add_keyvalue('Image_id',meta_image_id)
390
+ meta.add_keyvalue('Spacing_mm',min(spacing_info[:3]))##keep the minimum spacing over the first three (x, y, z) axes
391
+ meta.add_keyvalue('OriImg_path',bl_fp_img)
392
+ meta.add_keyvalue('Size',size_processed) # use the post-processing size here -- YH Jachin
393
+ meta.add_keyvalue('Modality',modality)
394
+ meta.add_keyvalue('Dataset_name',study)
395
+ meta.add_keyvalue('ROI','whole-body')
396
+
397
+
398
+ if label_flag:
399
+ # print(label_path_dict.keys())
400
+ meta.add_keyvalue('Task',TASK_VALUE)
401
+ # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys()))
402
+ meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
403
+
404
+ meta.add_keyvalue('Label_Dict',label_dict)
405
+
406
+ meta.add_extra_keyvalue('Metadata',CIA_other_info)
407
+
408
+
409
+
410
+
411
+
412
+
413
+
414
+ # Write the mapping to the JSON file on the fly
415
+ with open(json_output_path, 'r+') as json_file:
416
+ existing_mappings = json.load(json_file)
417
+ existing_mappings[output_path] = meta.get_meta_data()
418
+ json_file.seek(0)
419
+ # print(existing_mappings)
420
+ json.dump(existing_mappings, json_file, indent=4)
421
+ json_file.truncate()
422
+ # else:
423
+ # print("No metadata.csv files found.")
424
+
425
+ with open(failed_files_path, "w") as json_file:
426
+ json.dump(failed_files, json_file)
427
+
428
+ print(f"The list has been written to {failed_files_path}")
429
+ print(f"Saved NIfTI mappings to {json_output_path}")
430
+
431
+ if __name__ == "__main__":
432
+ parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
433
+ parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/ygq/Data_Engineering/PSMA_clean/demo")
434
+ parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="//home/data/ygq/Data_Engineering/PSMA_clean/sample/")
435
+ args = parser.parse_args()
436
+ print(args.target_path, args.output_dir)
437
+ main(args.target_path, args.output_dir)
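+ # Example invocation (using the defaults above):
+ #   python demo.py --target_path /home/data/ygq/Data_Engineering/PSMA_clean/demo \
+ #                  --output_dir /home/data/ygq/Data_Engineering/PSMA_clean/sample/
+ # Expected outputs: sample/<case_id>/<case_id>_BL_00.nii.gz, the remapped mask
+ # under sample/<case_id>/segmentation/, plus nifti_mappings.json and failed_files.json.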
438
+
439
+
440
+
441
+
442
+
443
+
444
+
445
+
446
+
447
+
448
+
449
+
450
+
451
+
PSMA_clean/demo/inputsTr/9c838d2e45.csv ADDED
@@ -0,0 +1,11 @@
1
+ lesion_id,cog_bl,img_id_bl,cog_propagated,cog_fu,img_id_fu,lesion_type
2
+ 1,173.652542372881 319.652542372881 579.387005649717,00,189.53247388079484 287.2536900264025 222.17010423686224,188.202349869452 289.144908616188 223.849869451697,00,Lymph node
3
+ 2,281.307079646018 277.819469026549 501.061061946903,00,302.00463138533826 240.3165855091747 192.69462680091806,341.600891156759 241.744290345559 191.096086469827,00,Others
4
+ 3,114.172695951766 270.197674418605 687.132213608958,00,130.76397678397964 228.70035929316566 266.4885911692181,130.707082371055 231.155119322556 268.416089299461,00,Lymph node
5
+ 4,356.415238954013 301.567628494139 451.692064923354,00,376.87428084515284 263.45722698112957 174.4720429146538,381.853823088456 257.722888555722 174.580459770115,00,Others
6
+ 5,208.050684931507 195.371232876712 444.935616438356,00,209.83459805091746 149.68124304981973 169.57017028347764,206.659624413146 144.776995305164 172.457746478873,00,Liver
7
+ 6,164.686585470452 328.149644051805 690.955545072476,00,152.33141797244215 282.595521811701 123.90492553416982,145.783406214666 282.353392728214 124.015291457218,01,Lung
8
+ 7,228.565467266367 187.678910544728 492.744877561219,00,237.67185465810823 144.09238609967332 190.15033170294646,235.767605633803 148.153923541247 191.01509054326,00,Liver
9
+ 8,138.372302158273 304.519184652278 553.549560351719,00,109.44264837721721 246.66039498848312 73.57523491828391,106.450819672131 251.254098360656 74.091654247392,01,Lung
10
+ 9,173.506048387097 306.191532258065 599.612903225806,00,189.86575749680912 272.81057573235324 230.3317260920997,191.972929936306 273.027070063694 230.562101910828,00,Lymph node
11
+ 10,250.798805601318 219.040568369028 340.824341021417,00,264.02473463342807 186.36201422295846 126.59140444149511,260.173992673993 183.694139194139 125.826007326007,00,Soft tissue / Skin
PSMA_clean/demo/inputsTr/9c838d2e45_BL_00.json ADDED
@@ -0,0 +1,90 @@
1
+ {
2
+ "name": "Points of interest",
3
+ "points": [
4
+ {
5
+ "name": "1",
6
+ "point": [
7
+ 173.652542372881,
8
+ 319.652542372881,
9
+ 579.387005649717
10
+ ]
11
+ },
12
+ {
13
+ "name": "2",
14
+ "point": [
15
+ 281.307079646018,
16
+ 277.819469026549,
17
+ 501.061061946903
18
+ ]
19
+ },
20
+ {
21
+ "name": "3",
22
+ "point": [
23
+ 114.172695951766,
24
+ 270.197674418605,
25
+ 687.132213608958
26
+ ]
27
+ },
28
+ {
29
+ "name": "4",
30
+ "point": [
31
+ 356.415238954013,
32
+ 301.567628494139,
33
+ 451.692064923354
34
+ ]
35
+ },
36
+ {
37
+ "name": "5",
38
+ "point": [
39
+ 208.050684931507,
40
+ 195.371232876712,
41
+ 444.935616438356
42
+ ]
43
+ },
44
+ {
45
+ "name": "6",
46
+ "point": [
47
+ 164.686585470452,
48
+ 328.149644051805,
49
+ 690.955545072476
50
+ ]
51
+ },
52
+ {
53
+ "name": "7",
54
+ "point": [
55
+ 228.565467266367,
56
+ 187.678910544728,
57
+ 492.744877561219
58
+ ]
59
+ },
60
+ {
61
+ "name": "8",
62
+ "point": [
63
+ 138.372302158273,
64
+ 304.519184652278,
65
+ 553.549560351719
66
+ ]
67
+ },
68
+ {
69
+ "name": "9",
70
+ "point": [
71
+ 173.506048387097,
72
+ 306.191532258065,
73
+ 599.612903225806
74
+ ]
75
+ },
76
+ {
77
+ "name": "10",
78
+ "point": [
79
+ 250.798805601318,
80
+ 219.040568369028,
81
+ 340.824341021417
82
+ ]
83
+ }
84
+ ],
85
+ "type": "Multiple points",
86
+ "version": {
87
+ "major": 1,
88
+ "minor": 0
89
+ }
90
+ }
PSMA_clean/demo/inputsTr/9c838d2e45_BL_img_BL_img_00.nii.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff6d270fea41f22b006922b6dbe8ad63451f8fa44ed9d99bc29034b7a2431a1b
3
+ size 23855104
PSMA_clean/demo/inputsTr/9c838d2e45_BL_mask_BL_img_00.nii.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:316c5372a73b5d1bcb221494ae24a91b8f1c4ba405f1d271a264f77a622a7973
3
+ size 259615
PSMA_clean/demo/inputsTr/9c838d2e45_FU_00.json ADDED
@@ -0,0 +1,74 @@
1
+ {
2
+ "name": "Points of interest",
3
+ "points": [
4
+ {
5
+ "name": "1",
6
+ "point": [
7
+ 189.53247388079484,
8
+ 287.2536900264025,
9
+ 222.17010423686224
10
+ ]
11
+ },
12
+ {
13
+ "name": "2",
14
+ "point": [
15
+ 302.00463138533826,
16
+ 240.3165855091747,
17
+ 192.69462680091806
18
+ ]
19
+ },
20
+ {
21
+ "name": "3",
22
+ "point": [
23
+ 130.76397678397964,
24
+ 228.70035929316566,
25
+ 266.4885911692181
26
+ ]
27
+ },
28
+ {
29
+ "name": "4",
30
+ "point": [
31
+ 376.87428084515284,
32
+ 263.45722698112957,
33
+ 174.4720429146538
34
+ ]
35
+ },
36
+ {
37
+ "name": "5",
38
+ "point": [
39
+ 209.83459805091746,
40
+ 149.68124304981973,
41
+ 169.57017028347764
42
+ ]
43
+ },
44
+ {
45
+ "name": "7",
46
+ "point": [
47
+ 237.67185465810823,
48
+ 144.09238609967332,
49
+ 190.15033170294646
50
+ ]
51
+ },
52
+ {
53
+ "name": "9",
54
+ "point": [
55
+ 189.86575749680912,
56
+ 272.81057573235324,
57
+ 230.3317260920997
58
+ ]
59
+ },
60
+ {
61
+ "name": "10",
62
+ "point": [
63
+ 264.02473463342807,
64
+ 186.36201422295846,
65
+ 126.59140444149511
66
+ ]
67
+ }
68
+ ],
69
+ "type": "Multiple points",
70
+ "version": {
71
+ "major": 1,
72
+ "minor": 0
73
+ }
74
+ }
PSMA_clean/demo/inputsTr/9c838d2e45_FU_01.json ADDED
@@ -0,0 +1,26 @@
1
+ {
2
+ "name": "Points of interest",
3
+ "points": [
4
+ {
5
+ "name": "6",
6
+ "point": [
7
+ 152.33141797244215,
8
+ 282.595521811701,
9
+ 123.90492553416982
10
+ ]
11
+ },
12
+ {
13
+ "name": "8",
14
+ "point": [
15
+ 109.44264837721721,
16
+ 246.66039498848312,
17
+ 73.57523491828391
18
+ ]
19
+ }
20
+ ],
21
+ "type": "Multiple points",
22
+ "version": {
23
+ "major": 1,
24
+ "minor": 0
25
+ }
26
+ }
PSMA_clean/demo/inputsTr/9c838d2e45_FU_img_FU_img_00.nii.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a80f03265f11541538ede2ad76b3d7958e3c87f99899ebd2ee336db7ddcbf3e
3
+ size 23855104
PSMA_clean/demo/inputsTr/9c838d2e45_FU_img_FU_img_01.nii.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b9e3c4feaf49e3007d738026fecd44f7bd1126c9450ae2458e8ff17f43f5b77
3
+ size 23855104
PSMA_clean/demo/targetsTr/9c838d2e45_FU_mask_FU_img_00.nii.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0892f4358c271c3737c0bf92970d2d93d1b4ba4ae7338c82a45ff4f8dba8fdb2
3
+ size 106508
PSMA_clean/demo/targetsTr/9c838d2e45_FU_mask_FU_img_01.nii.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0852db3ee8137210401c145e64c129107ce21ece610ef29926f061b18d38aeb
3
+ size 44672
PSMA_clean/sample/9c838d2e45/9c838d2e45_BL_00.nii.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f1678de5358e3d66a3421a3a19cac69e8fd766930c21f42a509c3d16c2dfbd0
3
+ size 23855104
PSMA_clean/sample/9c838d2e45/segmentation/9c838d2e45_BL_00.nii.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0629e3f73e040123553a005021789f59f82a70a1ea40a4a621f3ca24d538052c
3
+ size 282175
PSMA_clean/sample/failed_files.json ADDED
@@ -0,0 +1 @@
1
+ []