Data_Engineering / PSMA_clean /dataclean_PSMA_Longitudinal_v2.py
maxmo2009's picture
Initial upload: data cleanup pipeline for 12 medical imaging datasets
da9fb1e verified
#coding:utf-8
'''
written by ygq
created on 2025-08-30
BL = Baseline(基线)
FU = Follow-up(随访)
1. Baseline (基线)
含义:指的是在疾病初期、治疗前或某个特定时间点第一次拍摄的影像(如CT、MRI、X光)。
作用:这份影像作为评估病情严重程度和后续变化的“起跑线”或“参照物”。医生通过将未来的影像与基线影像进行比较,来判断病情的变化。
2. Follow-up (随访)
含义:指的是在基线影像之后,按计划或根据病情需要再次拍摄的影像。
作用:用于评估治疗效果(如肿瘤是否缩小)、监测疾病进展(如病灶是否增大或增多)、或观察术后恢复情况。
“BL FU” 在报告中的应用场景:
当放射科医生在报告中写下“BL FU”或“compare to BL FU”时,他们的意思是:
“本次的影像检查结果,需要与之前拍摄的基线影像进行对比,以评估变化。”
例如:
肿瘤患者:一位肺癌患者在化疗前做了一次CT(作为基线BL),化疗2个周期后又做了一次CT(作为随访FU)。放射科医生会在新报告中将两次影像进行对比,并描述:“与20XX年X月X日的基线CT(BL FU) 相比,右肺下叶肿块明显缩小。”
慢性病患者:如肺炎、肝硬化、多发性硬化等需要长期监测的疾病,医生都会通过对比基线片和随访片来精确判断病情是好转、稳定还是恶化。
label:
0:backgroud 1-N: tumor,其中具体多少数值需要读取对应json文件信息
编号ID:10位的16进制编号,每一个对应一个csv文件,对应一个或多个BL和FU。每个对应相应的json文件和mask标签文件--
备注:CSV包含所有的label信息和编号,如果考虑按照tissue进行分别存储,可以考虑对mask文件结合csv/json信息进行提取相同的lesion_type分别存储label_dict
BL的以及对应的MASK都是inputsTr目录下面
命名形式:
93dd4de5cd_BL_img_BL_img_00.nii.gz
93dd4de5cd_BL_mask_BL_img_00.nii.gz
93dd4de5cd_BL_00.json
FU在inputsTr目录下面,对应的mask在targetsTr目录下面
命名形式:
c6f057b865_FU_img_FU_img_00.nii.gz
c6f057b865_FU_mask_FU_img_00.nii.gz
c6f057b865_FU_img_FU_img_01.nii.gz
c6f057b865_FU_mask_FU_img_01.nii.gz
c6f057b865_FU_00.json
c6f057b865_FU_01.json
元数据信息CSV-病灶或者癌症信息--对应基线的位置,对应的基线影像编号,位置,以及对应的随访位置编号以及病灶位置
lesion_id,cog_bl,img_id_bl,cog_propagated,cog_fu,img_id_fu,lesion_type
1,84.9530896759608 273.525433308214 148.780708364732,00,108.78432777048911 320.7355032513338 543.6178096475021,116.270833333333 317.46130952381 548.446428571429,00,Lung
2,206.307026476578 258.39816700611 177.256619144603,00,202.79674663210054 297.81536880017677 566.3173808142716,197.325938566553 300.598976109215 565.804607508532,00,Lymph node
json格式样例
{
"name": "Points of interest",
"points": [
{
"name": "1",
"point": [
84.9530896759608,
273.525433308214,
148.780708364732
]
},
{
"name": "2",
"point": [
206.307026476578,
258.39816700611,
177.256619144603
]
}
],
"type": "Multiple points",
"version": {
"major": 1,
"minor": 0
}
}
20251101补充增加,将病灶编号进行合并同类项目,
注意处理完成后保留原影像的几何空间信息以及元数据文件信息
'''
import os
import glob
import pandas as pd
import SimpleITK as sitk
import argparse
import json
from tqdm import tqdm
from util import meta_data
import util
import numpy as np
# from bert_helper import *
import shutil
## Unified label encoding: lesion_type name (exactly as it appears in the metadata CSV)
## -> integer label id written into the output masks.
# NOTE(review): 'backgroud' is a misspelling of 'background', but the identical spelling
# is used as a runtime dict key below (LABEL_DICT and in main); keep it as-is.
label_id_lut={'backgroud': 0,
              'Lymph node': 1,
              'Lung': 2,
              'Soft tissue / Skin': 3,
              'Liver': 4,
              'Skeleton': 5,
              'Adrenals': 6,
              'Spleen': 7,
              'CNS': 8,
              'Kidney': 9,
              'Heart': 10,
              'Others': 11,
              'unclear': 12,
              }
# Task tag stored in the output metadata; also used as the label sub-directory name in main().
TASK_VALUE="segmentation"
# CT intensity clamp window (HU); not referenced elsewhere in this view — presumably
# consumed by a later pipeline stage. TODO confirm.
CLAMP_RANGE_CT = [-300,300]
CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
# Target voxel spacing for resampling; None keeps each image's original spacing.
TARGET_VOXEL_SPACING=None
# ## sub_modality description following the MSD convention
# SUB_MODALITY=["CT","PET"]
# ## file-name sort order for the series
# SERIES_ORDER=["0000","0001"]
## Base label dictionary; entries 1..N are filled in per case from the JSON/CSV info.
LABEL_DICT={
    "0":"backgroud",
}
# Expected column layout of the per-case metadata CSV.
META_COLUMN=['lesion_id', 'cog_bl', 'img_id_bl', 'cog_propagated', 'cog_fu','img_id_fu', 'lesion_type']
# def find_metadata_files(path):
# # for Cancer Image Archive (TCIA) dataset
# search_pattern = os.path.join(path, '**', 'metadata.csv')
# return glob.glob(search_pattern, recursive=True)
def find_metadata_files(path):
    """Return the paths of all CSV metadata files directly under *path* (TCIA layout)."""
    # recursive=True has no effect without a '**' wildcard, but is kept for parity.
    return glob.glob(os.path.join(path, '*.csv'), recursive=True)
##added by yanguoqing on 20250527
def find_image_dirs(path):
    """List the entry names (files and sub-directories) directly under *path*."""
    entries = os.listdir(path)
    return entries
##modify by yanguoqing on 20250527
def load_dicom_images(folder_path):
    """Read a DICOM series from *folder_path*; return (series file names, 3D image)."""
    series_reader = sitk.ImageSeriesReader()
    file_names = series_reader.GetGDCMSeriesFileNames(folder_path)
    series_reader.SetFileNames(file_names)
    volume = series_reader.Execute()
    return file_names, volume
##added by yanguoqing on 20250527
def load_dicom_tag(imgs):
    """Read a single DICOM file and return the resulting sitk image (carrying its tags)."""
    file_reader = sitk.ImageFileReader()
    file_reader.SetFileName(imgs)
    # Read the header/meta information first (no pixel data at this point)...
    file_reader.ReadImageInformation()
    # ...then Execute() loads the full image; DICOM tags travel in its metadata.
    return file_reader.Execute()
def load_nrrd(fp):
    """Load an image file (NRRD/NIfTI/...) via SimpleITK."""
    image = sitk.ReadImage(fp)
    return image
##modify by yanguoqing on 20250830
def merge_images(series_files):
    """
    Each case holds two CT-aligned series (CT/PET -- 0000/0001).
    Read the given files as one series so the separate modalities are
    stacked along a fourth dimension in CT-then-PET order.
    """
    series_reader = sitk.ImageSeriesReader()
    series_reader.SetFileNames(series_files)
    return series_reader.Execute()
def save_nifti(image, output_path, folder_path):
    """Write *image* to *output_path* as NIfTI, recording *folder_path* in the header.

    Creates the destination directory when needed.

    Args:
        image: SimpleITK image to write (mutated: a 'FolderPath' header field is set).
        output_path: destination file path (e.g. '.../case.nii.gz').
        folder_path: originating folder, stored in the NIfTI header for traceability.
    """
    output_dirpath = os.path.dirname(output_path)
    # Guard against an empty dirname (bare file name => write to cwd), and use
    # exist_ok=True instead of the original check-then-create race.
    if output_dirpath and not os.path.isdir(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        os.makedirs(output_dirpath, exist_ok=True)
    # Set metadata in the NIfTI file's header
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)
##modify by yanguoqing on 20250527
def convert_windows_to_linux_path(windows_path):
    """Normalize a Windows-style path so it resolves on a Linux server.

    Backslashes become forward slashes, and any drive prefix (everything up
    to and including the first ':') is dropped. Some metadata files carry
    Windows paths while the data lives on Linux storage.
    """
    normalized = windows_path.replace('\\', '/')
    _, sep, remainder = normalized.partition(':')
    return remainder if sep else normalized
##added by yanguoqing on 2025-08-31
## Return every per-case CSV file under fp_dir; the CSV names carry the data ids.
def get_filename_list(fp_dir):
    """Glob all '*.csv' files directly inside *fp_dir*."""
    return glob.glob(f"{fp_dir}/*.csv")
## Extract the study_id and study_date from a file name.
def check_fname(fname):
    """Split *fname* into (study_id, study_date) at fixed-width positions.

    Names starting with 'fdg' carry a 14-char id followed by a 10-char date;
    every other name carries a 21-char id with the date from index 22 on.
    """
    if fname.startswith("fdg"):
        return fname[:14], fname[15:25]
    return fname[:21], fname[22:]
def main(target_path: str, output_dir: str) -> None:
    """Convert the PSMA longitudinal CT dataset into resampled NIfTI volumes with merged-label masks.

    Walks every per-case CSV under ``<target_path>/inputsTr``, locates the BL/FU
    images and masks named after each case id, remaps mask voxels so lesions of
    the same ``lesion_type`` share one label value (via ``label_id_lut``),
    resamples image and mask onto a common grid, and writes the results plus a
    running ``nifti_mappings.json`` and ``failed_files.json`` under *output_dir*.

    Args:
        target_path: dataset root containing 'inputsTr' and 'targetsTr'.
        output_dir: destination root for converted NIfTI files and metadata JSON.
    """
    pid_dirs=["inputsTr"]
    failed_files = []
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')
    meta = meta_data()
    # Initialize the JSON file
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)
    input_dir=os.path.join(target_path,'inputsTr')
    target_dir=os.path.join(target_path,'targetsTr')
    fp_files=get_filename_list(input_dir)
    ## NOTE(review): original comment says "get all 1614 case names; each case has
    ## 0000/0001 volumes merged in order" — that looks inherited from a PET/CT
    ## pipeline; here the loop simply walks the per-case CSV files. TODO confirm.
    if pid_dirs:
        # Outer loop is a single-element placeholder over ["inputsTr"].
        for pid_dir in tqdm(pid_dirs, desc="Processing all dataset"):
            for fp_file in tqdm(fp_files, desc="Processing all dataset"):
                meta_file=fp_file
                df_meta=pd.read_csv(meta_file)
                # case id = CSV file name without the '.csv' suffix
                fp_name=os.path.basename(fp_file)[:-4]
                ## Look up all BL and FU images and their corresponding masks in turn
                for sub_mod in ['BL','FU']:
                    bl_fps=glob.glob("%s/%s_%s*.json"%(input_dir,fp_name,sub_mod))
                    if len(bl_fps)>0:
                        for bl_fp in bl_fps:
                            # e.g. '93dd4de5cd_BL_00' (JSON name minus '.json')
                            basename=os.path.basename(bl_fp)[:-5]
                            # Derive the image file name from the JSON name
                            # (only rewrites '_BL_'; FU names already match the
                            # '<id>_FU_img_FU_img_NN' pattern after this replace —
                            # NOTE(review): '_FU_' is not rewritten here, verify).
                            bl_fp_name=os.path.basename(bl_fp).replace("_BL_","_BL_img_BL_img_").replace(".json",".nii.gz")
                            bl_fp_img=os.path.join(input_dir,bl_fp_name)
                            if os.path.isfile(bl_fp_img):
                                ## Image exists: proceed with normal processing.
                                bl_mask_name=os.path.basename(bl_fp).replace("_BL_","_BL_mask_BL_img_").replace(".json",".nii.gz")
                                bl_fp_mask=os.path.join(input_dir,bl_mask_name)
                                if os.path.isfile(bl_fp_mask):
                                    label_fp=bl_fp_mask
                                    label_flag=True
                                else:
                                    # FU masks live under targetsTr rather than inputsTr
                                    bl_fp_mask=os.path.join(target_dir,bl_mask_name)
                                    if os.path.isfile(bl_fp_mask):
                                        label_fp=bl_fp_mask
                                        label_flag=True
                                    else:
                                        label_fp=None
                                        label_flag=False
                                modality="CT"
                                study='PSMA_Longitudinal_CT'##Dataset_name
                                CIA_other_info = {
                                    'Image_id':basename,
                                    'metadata_file':''
                                    # 'Series_Description':serise_desc
                                }
                                CIA_other_info['split'] = "train"
                                CIA_other_info['metadata_file']=meta_file
                                stk_image=util.load_nifti(bl_fp_img)
                                spacing_info = stk_image.GetSpacing()
                                size = list(stk_image.GetSize())
                                # Resampler may be None when no resampling is required.
                                resampler = util.get_unisize_resampler(stk_image, interpolator='linear', spacing=spacing_info, size=size)
                                if resampler is not None:
                                    proces_image = resampler.Execute(stk_image)
                                    print('SPACIE INFO AFTER', proces_image.GetSpacing())
                                    CIA_other_info['Resample'] = True
                                else:
                                    proces_image = stk_image
                                    CIA_other_info['Resample'] = False
                                output_path = os.path.join(output_dir,fp_name, f"{basename}.nii.gz")
                                # output_path=convert_windows_to_linux_path(output_path)
                                save_nifti(proces_image, output_path, input_dir)
                                print(f"Saved NIfTI file to {output_path}")
                                if label_flag:
                                    label_path_dict = {}
                                    label_stk_img=util.load_nifti(label_fp)
                                    image_array = sitk.GetArrayFromImage(label_stk_img)
                                    ## Re-assign the label values, restore the original image's
                                    ## geometric/metadata info, and merge same-type lesions.
                                    with open(bl_fp,'r') as fi:
                                        json_info=json.load(fi)
                                    label_dict={
                                        "0":"backgroud"
                                    }
                                    update_image_array=np.copy(image_array)
                                    ## Group lesion ids by lesion_type for the merge.
                                    group_meta=df_meta.groupby('lesion_type')['lesion_id']
                                    for name,group in group_meta:
                                        ## group name plus all lesion_ids inside this group
                                        ids=group_meta.get_group(name)
                                        target_id=label_id_lut[name]
                                        # ## (previous scheme) assign each group's minimum lesion_id
                                        # ids_min=ids.min()
                                        # label_dict[str(ids_min)]=name
                                        label_dict[str(target_id)]=name
                                        ## Remap every original lesion id of this group to the merged value.
                                        for v in ids.tolist():
                                            update_image_array[image_array==v]=target_id
                                    # Release the source array; the merged copy is kept.
                                    image_array=None
                                    label_stk_img_update=sitk.GetImageFromArray(update_image_array)
                                    label_stk_img_update.CopyInformation(label_stk_img)
                                    # Manually copy all the metadata:
                                    # fetch the metadata keys, then copy key by key.
                                    meta_keys = label_stk_img.GetMetaDataKeys()
                                    for key in meta_keys:
                                        value = label_stk_img.GetMetaData(key)
                                        label_stk_img_update.SetMetaData(key, value)
                                    # for lesion_info in json_info['points']:
                                    #     df_row=df_meta['lesion_type'][df_meta['lesion_id']==int(lesion_info['name'])]
                                    #     df_row=df_row.reset_index()
                                    #     lesion_type=df_row['lesion_type'][0]
                                    #     label_dict[lesion_info['name']]=lesion_type
                                    # Nearest-neighbour resampling keeps label values discrete.
                                    resampler = util.get_unisize_resampler(label_stk_img_update, interpolator='nearest', spacing=spacing_info, size=size)
                                    if resampler is not None:
                                        proces_label = resampler.Execute(label_stk_img_update)
                                        ary_process_label=sitk.GetArrayFromImage(proces_label)
                                        # Heuristic: if the last slice is a uniform non-zero value,
                                        # zero it out — presumably a resampling padding artifact
                                        # (TODO confirm; 'momingqimiao' = "inexplicable").
                                        if ary_process_label[-1,:,:].mean()==ary_process_label[-1,0,0] and ary_process_label[-1,0,0]>0:
                                            print('momingqimiao',ary_process_label[-1,0,0])
                                            ary_process_label[-1,:,:]=0
                                        label_stk_img_process=sitk.GetImageFromArray(ary_process_label)
                                        label_stk_img_process.CopyInformation(proces_label)
                                        meta_keys = proces_label.GetMetaDataKeys()
                                        for key in meta_keys:
                                            value = proces_label.GetMetaData(key)
                                            label_stk_img_process.SetMetaData(key, value)
                                    else:
                                        label_stk_img_process = label_stk_img_update
                                    # print(proces_image.GetSize(),proces_label.GetSize())
                                    try:
                                        # Image and mask must align voxel-for-voxel.
                                        assert proces_image.GetSize() == label_stk_img_process.GetSize()
                                    except Exception as e:
                                        failed_files.append(label_fp)
                                        continue
                                    label_output_path = os.path.join(output_dir, fp_name, TASK_VALUE, f"{basename}.nii.gz")
                                    label_path_dict['tumor'] = label_output_path
                                    util.save_nifti(label_stk_img_process, label_output_path, label_fp)
                                    print(f"Saved Label Segment NIfTI file to {label_output_path}")
                            else:
                                # No image file for this JSON: skip the case.
                                continue
                            size_processed = list(proces_image.GetSize())
                            print('size_processed',size_processed,size)
                            # meta.add_keyvalue('Image_id',meta_image_id)
                            meta.add_keyvalue('Spacing_mm',min(spacing_info[:3]))## keep the smallest of the first three (x, y, z) spacings
                            meta.add_keyvalue('OriImg_path',bl_fp_img)
                            meta.add_keyvalue('Size',size_processed) # use the post-processing size here -- YH Jachin
                            meta.add_keyvalue('Modality',modality)
                            meta.add_keyvalue('Dataset_name',study)
                            meta.add_keyvalue('ROI','whole-body')
                            if label_flag:
                                # print(label_path_dict.keys())
                                meta.add_keyvalue('Task',TASK_VALUE)
                                # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys()))
                                meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
                                meta.add_keyvalue('Label_Dict',label_dict)
                            # NOTE(review): `meta` is shared across iterations; label keys set
                            # on a previous case may persist when label_flag is False — verify
                            # meta_data() semantics in util.
                            meta.add_extra_keyvalue('Metadata',CIA_other_info)
                            # Write the mapping to the JSON file on the fly
                            with open(json_output_path, 'r+') as json_file:
                                existing_mappings = json.load(json_file)
                                existing_mappings[output_path] = meta.get_meta_data()
                                json_file.seek(0)
                                # print(existing_mappings)
                                json.dump(existing_mappings, json_file, indent=4)
                                json_file.truncate()
    # else:
    #     print("No metadata.csv files found.")
    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)
    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")
if __name__ == "__main__":
    # CLI entry point: parse the source/destination roots, echo them, run the pipeline.
    arg_parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
    arg_parser.add_argument(
        "--target_path",
        type=str,
        default="/home/data/Github/data/data_gen_def/DATASETS/PSMA/Longitudinal-CT//",
        help="Path to the target directory containing metadata files.",
    )
    arg_parser.add_argument(
        "--output_dir",
        type=str,
        default="/home/data/Github/data/data_gen_def/DATASETS_processed/PSMA/Longitudinal-CT/",
        help="Directory to save the NIfTI files.",
    )
    cli_args = arg_parser.parse_args()
    print(cli_args.target_path, cli_args.output_dir)
    main(cli_args.target_path, cli_args.output_dir)