Data_Engineering / PSMA_clean /dataclean_PSMA_Longitudinal_v2.py
maxmo2009's picture
Initial upload: data cleanup pipeline for 12 medical imaging datasets
da9fb1e verified
#coding:utf-8
'''
written by ygq
created on 2025-08-30
BL = Baseline(基线)
FU = Follow-up(随访)
1. Baseline (基线)
含义:指的是在疾病初期、治疗前或某个特定时间点第一次拍摄的影像(如CT、MRI、X光)。
作用:这份影像作为评估病情严重程度和后续变化的“起跑线”或“参照物”。医生通过将未来的影像与基线影像进行比较,来判断病情的变化。
2. Follow-up (随访)
含义:指的是在基线影像之后,按计划或根据病情需要再次拍摄的影像。
作用:用于评估治疗效果(如肿瘤是否缩小)、监测疾病进展(如病灶是否增大或增多)、或观察术后恢复情况。
“BL FU” 在报告中的应用场景:
当放射科医生在报告中写下“BL FU”或“compare to BL FU”时,他们的意思是:
“本次的影像检查结果,需要与之前拍摄的基线影像进行对比,以评估变化。”
例如:
肿瘤患者:一位肺癌患者在化疗前做了一次CT(作为基线BL),化疗2个周期后又做了一次CT(作为随访FU)。放射科医生会在新报告中将两次影像进行对比,并描述:“与20XX年X月X日的基线CT(BL FU) 相比,右肺下叶肿块明显缩小。”
慢性病患者:如肺炎、肝硬化、多发性硬化等需要长期监测的疾病,医生都会通过对比基线片和随访片来精确判断病情是好转、稳定还是恶化。
label:
0:backgroud 1-N: tumor,其中具体多少数值需要读取对应json文件信息
编号ID:10位的16进制编号,每一个对应一个csv文件,对应一个或多个BL和FU。每个对应相应的json文件和mask标签文件--
备注:CSV包含所有的label信息和编号,如果考虑按照tissue进行分别存储,可以考虑对mask文件结合csv/json信息进行提取相同的lesion_type分别存储label_dict
BL的以及对应的MASK都是inputsTr目录下面
命名形式:
93dd4de5cd_BL_img_BL_img_00.nii.gz
93dd4de5cd_BL_mask_BL_img_00.nii.gz
93dd4de5cd_BL_00.json
FU在inputsTr目录下面,对应的mask在targetsTr目录下面
命名形式:
c6f057b865_FU_img_FU_img_00.nii.gz
c6f057b865_FU_mask_FU_img_00.nii.gz
c6f057b865_FU_img_FU_img_01.nii.gz
c6f057b865_FU_mask_FU_img_01.nii.gz
c6f057b865_FU_00.json
c6f057b865_FU_01.json
元数据信息CSV-病灶或者癌症信息--对应基线的位置,对应的基线影像编号,位置,以及对应的随访位置编号以及病灶位置
lesion_id,cog_bl,img_id_bl,cog_propagated,cog_fu,img_id_fu,lesion_type
1,84.9530896759608 273.525433308214 148.780708364732,00,108.78432777048911 320.7355032513338 543.6178096475021,116.270833333333 317.46130952381 548.446428571429,00,Lung
2,206.307026476578 258.39816700611 177.256619144603,00,202.79674663210054 297.81536880017677 566.3173808142716,197.325938566553 300.598976109215 565.804607508532,00,Lymph node
json格式样例
{
"name": "Points of interest",
"points": [
{
"name": "1",
"point": [
84.9530896759608,
273.525433308214,
148.780708364732
]
},
{
"name": "2",
"point": [
206.307026476578,
258.39816700611,
177.256619144603
]
}
],
"type": "Multiple points",
"version": {
"major": 1,
"minor": 0
}
}
20251101补充增加,将病灶编号进行合并同类项目,
注意处理完成后保留原影像的几何空间信息以及元数据文件信息
'''
import os
import glob
import pandas as pd
import SimpleITK as sitk
import argparse
import json
from tqdm import tqdm
from util import meta_data
import util
import numpy as np
# from bert_helper import *
import shutil
## Unified label encoding: lesion_type name (exactly as it appears in the metadata CSV)
## -> integer label id written into the output masks.
# NOTE(review): 'backgroud' is a misspelling of 'background', but the identical spelling
# is used as a runtime dict key below (LABEL_DICT and in main); keep it as-is.
label_id_lut={'backgroud': 0,
              'Lymph node': 1,
              'Lung': 2,
              'Soft tissue / Skin': 3,
              'Liver': 4,
              'Skeleton': 5,
              'Adrenals': 6,
              'Spleen': 7,
              'CNS': 8,
              'Kidney': 9,
              'Heart': 10,
              'Others': 11,
              'unclear': 12,
              }
# Task tag stored in the output metadata; also used as the label sub-directory name in main().
TASK_VALUE="segmentation"
# CT intensity clamp window (HU); not referenced elsewhere in this view — presumably
# consumed by a later pipeline stage. TODO confirm.
CLAMP_RANGE_CT = [-300,300]
CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
# Target voxel spacing for resampling; None keeps each image's original spacing.
TARGET_VOXEL_SPACING=None
# ## sub_modality description following the MSD convention
# SUB_MODALITY=["CT","PET"]
# ## file-name sort order for the series
# SERIES_ORDER=["0000","0001"]
## Base label dictionary; entries 1..N are filled in per case from the JSON/CSV info.
LABEL_DICT={
    "0":"backgroud",
}
# Expected column layout of the per-case metadata CSV.
META_COLUMN=['lesion_id', 'cog_bl', 'img_id_bl', 'cog_propagated', 'cog_fu','img_id_fu', 'lesion_type']
# def find_metadata_files(path):
# # for Cancer Image Archive (TCIA) dataset
# search_pattern = os.path.join(path, '**', 'metadata.csv')
# return glob.glob(search_pattern, recursive=True)
def find_metadata_files(path):
    """Return the paths of all CSV metadata files directly under *path* (TCIA layout)."""
    # recursive=True has no effect without a '**' wildcard, but is kept for parity.
    return glob.glob(os.path.join(path, '*.csv'), recursive=True)
##added by yanguoqing on 20250527
def find_image_dirs(path):
    """List the entry names (files and sub-directories) directly under *path*."""
    entries = os.listdir(path)
    return entries
##modify by yanguoqing on 20250527
def load_dicom_images(folder_path):
    """Read a DICOM series from *folder_path*; return (series file names, 3D image)."""
    series_reader = sitk.ImageSeriesReader()
    file_names = series_reader.GetGDCMSeriesFileNames(folder_path)
    series_reader.SetFileNames(file_names)
    volume = series_reader.Execute()
    return file_names, volume
##added by yanguoqing on 20250527
def load_dicom_tag(imgs):
    """Read a single DICOM file and return the resulting sitk image (carrying its tags)."""
    file_reader = sitk.ImageFileReader()
    file_reader.SetFileName(imgs)
    # Read the header/meta information first (no pixel data at this point)...
    file_reader.ReadImageInformation()
    # ...then Execute() loads the full image; DICOM tags travel in its metadata.
    return file_reader.Execute()
def load_nrrd(fp):
    """Load an image file (NRRD/NIfTI/...) via SimpleITK."""
    image = sitk.ReadImage(fp)
    return image
##modify by yanguoqing on 20250830
def merge_images(series_files):
    """
    Each case holds two CT-aligned series (CT/PET -- 0000/0001).
    Read the given files as one series so the separate modalities are
    stacked along a fourth dimension in CT-then-PET order.
    """
    series_reader = sitk.ImageSeriesReader()
    series_reader.SetFileNames(series_files)
    return series_reader.Execute()
def save_nifti(image, output_path, folder_path):
    """Write *image* to *output_path* as NIfTI, recording *folder_path* in the header.

    Creates the destination directory when needed.

    Args:
        image: SimpleITK image to write (mutated: a 'FolderPath' header field is set).
        output_path: destination file path (e.g. '.../case.nii.gz').
        folder_path: originating folder, stored in the NIfTI header for traceability.
    """
    output_dirpath = os.path.dirname(output_path)
    # Guard against an empty dirname (bare file name => write to cwd), and use
    # exist_ok=True instead of the original check-then-create race.
    if output_dirpath and not os.path.isdir(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        os.makedirs(output_dirpath, exist_ok=True)
    # Set metadata in the NIfTI file's header
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)
##modify by yanguoqing on 20250527
def convert_windows_to_linux_path(windows_path):
    """Normalize a Windows-style path so it resolves on a Linux server.

    Backslashes become forward slashes, and any drive prefix (everything up
    to and including the first ':') is dropped. Some metadata files carry
    Windows paths while the data lives on Linux storage.
    """
    normalized = windows_path.replace('\\', '/')
    _, sep, remainder = normalized.partition(':')
    return remainder if sep else normalized
##added by yanguoqing on 2025-08-31
## Return every per-case CSV file under fp_dir; the CSV names carry the data ids.
def get_filename_list(fp_dir):
    """Glob all '*.csv' files directly inside *fp_dir*."""
    return glob.glob(f"{fp_dir}/*.csv")
## Extract the study_id and study_date from a file name.
def check_fname(fname):
    """Split *fname* into (study_id, study_date) at fixed-width positions.

    Names starting with 'fdg' carry a 14-char id followed by a 10-char date;
    every other name carries a 21-char id with the date from index 22 on.
    """
    if fname.startswith("fdg"):
        return fname[:14], fname[15:25]
    return fname[:21], fname[22:]
def main(target_path: str, output_dir: str) -> None:
    """Convert the PSMA longitudinal CT dataset into resampled NIfTI volumes with merged-label masks.

    Walks every per-case CSV under ``<target_path>/inputsTr``, locates the BL/FU
    images and masks named after each case id, remaps mask voxels so lesions of
    the same ``lesion_type`` share one label value (via ``label_id_lut``),
    resamples image and mask onto a common grid, and writes the results plus a
    running ``nifti_mappings.json`` and ``failed_files.json`` under *output_dir*.

    Args:
        target_path: dataset root containing 'inputsTr' and 'targetsTr'.
        output_dir: destination root for converted NIfTI files and metadata JSON.
    """
    pid_dirs=["inputsTr"]
    failed_files = []
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')
    meta = meta_data()
    # Initialize the JSON file
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)
    input_dir=os.path.join(target_path,'inputsTr')
    target_dir=os.path.join(target_path,'targetsTr')
    fp_files=get_filename_list(input_dir)
    ## NOTE(review): original comment says "get all 1614 case names; each case has
    ## 0000/0001 volumes merged in order" — that looks inherited from a PET/CT
    ## pipeline; here the loop simply walks the per-case CSV files. TODO confirm.
    if pid_dirs:
        # Outer loop is a single-element placeholder over ["inputsTr"].
        for pid_dir in tqdm(pid_dirs, desc="Processing all dataset"):
            for fp_file in tqdm(fp_files, desc="Processing all dataset"):
                meta_file=fp_file
                df_meta=pd.read_csv(meta_file)
                # case id = CSV file name without the '.csv' suffix
                fp_name=os.path.basename(fp_file)[:-4]
                ## Look up all BL and FU images and their corresponding masks in turn
                for sub_mod in ['BL','FU']:
                    bl_fps=glob.glob("%s/%s_%s*.json"%(input_dir,fp_name,sub_mod))
                    if len(bl_fps)>0:
                        for bl_fp in bl_fps:
                            # e.g. '93dd4de5cd_BL_00' (JSON name minus '.json')
                            basename=os.path.basename(bl_fp)[:-5]
                            # Derive the image file name from the JSON name
                            # (only rewrites '_BL_'; FU names already match the
                            # '<id>_FU_img_FU_img_NN' pattern after this replace —
                            # NOTE(review): '_FU_' is not rewritten here, verify).
                            bl_fp_name=os.path.basename(bl_fp).replace("_BL_","_BL_img_BL_img_").replace(".json",".nii.gz")
                            bl_fp_img=os.path.join(input_dir,bl_fp_name)
                            if os.path.isfile(bl_fp_img):
                                ## Image exists: proceed with normal processing.
                                bl_mask_name=os.path.basename(bl_fp).replace("_BL_","_BL_mask_BL_img_").replace(".json",".nii.gz")
                                bl_fp_mask=os.path.join(input_dir,bl_mask_name)
                                if os.path.isfile(bl_fp_mask):
                                    label_fp=bl_fp_mask
                                    label_flag=True
                                else:
                                    # FU masks live under targetsTr rather than inputsTr
                                    bl_fp_mask=os.path.join(target_dir,bl_mask_name)
                                    if os.path.isfile(bl_fp_mask):
                                        label_fp=bl_fp_mask
                                        label_flag=True
                                    else:
                                        label_fp=None
                                        label_flag=False
                                modality="CT"
                                study='PSMA_Longitudinal_CT'##Dataset_name
                                CIA_other_info = {
                                    'Image_id':basename,
                                    'metadata_file':''
                                    # 'Series_Description':serise_desc
                                }
                                CIA_other_info['split'] = "train"
                                CIA_other_info['metadata_file']=meta_file
                                stk_image=util.load_nifti(bl_fp_img)
                                spacing_info = stk_image.GetSpacing()
                                size = list(stk_image.GetSize())
                                # Resampler may be None when no resampling is required.
                                resampler = util.get_unisize_resampler(stk_image, interpolator='linear', spacing=spacing_info, size=size)
                                if resampler is not None:
                                    proces_image = resampler.Execute(stk_image)
                                    print('SPACIE INFO AFTER', proces_image.GetSpacing())
                                    CIA_other_info['Resample'] = True
                                else:
                                    proces_image = stk_image
                                    CIA_other_info['Resample'] = False
                                output_path = os.path.join(output_dir,fp_name, f"{basename}.nii.gz")
                                # output_path=convert_windows_to_linux_path(output_path)
                                save_nifti(proces_image, output_path, input_dir)
                                print(f"Saved NIfTI file to {output_path}")
                                if label_flag:
                                    label_path_dict = {}
                                    label_stk_img=util.load_nifti(label_fp)
                                    image_array = sitk.GetArrayFromImage(label_stk_img)
                                    ## Re-assign the label values, restore the original image's
                                    ## geometric/metadata info, and merge same-type lesions.
                                    with open(bl_fp,'r') as fi:
                                        json_info=json.load(fi)
                                    label_dict={
                                        "0":"backgroud"
                                    }
                                    update_image_array=np.copy(image_array)
                                    ## Group lesion ids by lesion_type for the merge.
                                    group_meta=df_meta.groupby('lesion_type')['lesion_id']
                                    for name,group in group_meta:
                                        ## group name plus all lesion_ids inside this group
                                        ids=group_meta.get_group(name)
                                        target_id=label_id_lut[name]
                                        # ## (previous scheme) assign each group's minimum lesion_id
                                        # ids_min=ids.min()
                                        # label_dict[str(ids_min)]=name
                                        label_dict[str(target_id)]=name
                                        ## Remap every original lesion id of this group to the merged value.
                                        for v in ids.tolist():
                                            update_image_array[image_array==v]=target_id
                                    # Release the source array; the merged copy is kept.
                                    image_array=None
                                    label_stk_img_update=sitk.GetImageFromArray(update_image_array)
                                    label_stk_img_update.CopyInformation(label_stk_img)
                                    # Manually copy all the metadata:
                                    # fetch the metadata keys, then copy key by key.
                                    meta_keys = label_stk_img.GetMetaDataKeys()
                                    for key in meta_keys:
                                        value = label_stk_img.GetMetaData(key)
                                        label_stk_img_update.SetMetaData(key, value)
                                    # for lesion_info in json_info['points']:
                                    #     df_row=df_meta['lesion_type'][df_meta['lesion_id']==int(lesion_info['name'])]
                                    #     df_row=df_row.reset_index()
                                    #     lesion_type=df_row['lesion_type'][0]
                                    #     label_dict[lesion_info['name']]=lesion_type
                                    # Nearest-neighbour resampling keeps label values discrete.
                                    resampler = util.get_unisize_resampler(label_stk_img_update, interpolator='nearest', spacing=spacing_info, size=size)
                                    if resampler is not None:
                                        proces_label = resampler.Execute(label_stk_img_update)
                                        ary_process_label=sitk.GetArrayFromImage(proces_label)
                                        # Heuristic: if the last slice is a uniform non-zero value,
                                        # zero it out — presumably a resampling padding artifact
                                        # (TODO confirm; 'momingqimiao' = "inexplicable").
                                        if ary_process_label[-1,:,:].mean()==ary_process_label[-1,0,0] and ary_process_label[-1,0,0]>0:
                                            print('momingqimiao',ary_process_label[-1,0,0])
                                            ary_process_label[-1,:,:]=0
                                        label_stk_img_process=sitk.GetImageFromArray(ary_process_label)
                                        label_stk_img_process.CopyInformation(proces_label)
                                        meta_keys = proces_label.GetMetaDataKeys()
                                        for key in meta_keys:
                                            value = proces_label.GetMetaData(key)
                                            label_stk_img_process.SetMetaData(key, value)
                                    else:
                                        label_stk_img_process = label_stk_img_update
                                    # print(proces_image.GetSize(),proces_label.GetSize())
                                    try:
                                        # Image and mask must align voxel-for-voxel.
                                        assert proces_image.GetSize() == label_stk_img_process.GetSize()
                                    except Exception as e:
                                        failed_files.append(label_fp)
                                        continue
                                    label_output_path = os.path.join(output_dir, fp_name, TASK_VALUE, f"{basename}.nii.gz")
                                    label_path_dict['tumor'] = label_output_path
                                    util.save_nifti(label_stk_img_process, label_output_path, label_fp)
                                    print(f"Saved Label Segment NIfTI file to {label_output_path}")
                            else:
                                # No image file for this JSON: skip the case.
                                continue
                            size_processed = list(proces_image.GetSize())
                            print('size_processed',size_processed,size)
                            # meta.add_keyvalue('Image_id',meta_image_id)
                            meta.add_keyvalue('Spacing_mm',min(spacing_info[:3]))## keep the smallest of the first three (x, y, z) spacings
                            meta.add_keyvalue('OriImg_path',bl_fp_img)
                            meta.add_keyvalue('Size',size_processed) # use the post-processing size here -- YH Jachin
                            meta.add_keyvalue('Modality',modality)
                            meta.add_keyvalue('Dataset_name',study)
                            meta.add_keyvalue('ROI','whole-body')
                            if label_flag:
                                # print(label_path_dict.keys())
                                meta.add_keyvalue('Task',TASK_VALUE)
                                # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys()))
                                meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
                                meta.add_keyvalue('Label_Dict',label_dict)
                            # NOTE(review): `meta` is shared across iterations; label keys set
                            # on a previous case may persist when label_flag is False — verify
                            # meta_data() semantics in util.
                            meta.add_extra_keyvalue('Metadata',CIA_other_info)
                            # Write the mapping to the JSON file on the fly
                            with open(json_output_path, 'r+') as json_file:
                                existing_mappings = json.load(json_file)
                                existing_mappings[output_path] = meta.get_meta_data()
                                json_file.seek(0)
                                # print(existing_mappings)
                                json.dump(existing_mappings, json_file, indent=4)
                                json_file.truncate()
    # else:
    #     print("No metadata.csv files found.")
    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)
    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")
if __name__ == "__main__":
    # CLI entry point: parse the source/destination roots, echo them, run the pipeline.
    arg_parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
    arg_parser.add_argument(
        "--target_path",
        type=str,
        default="/home/data/Github/data/data_gen_def/DATASETS/PSMA/Longitudinal-CT//",
        help="Path to the target directory containing metadata files.",
    )
    arg_parser.add_argument(
        "--output_dir",
        type=str,
        default="/home/data/Github/data/data_gen_def/DATASETS_processed/PSMA/Longitudinal-CT/",
        help="Directory to save the NIfTI files.",
    )
    cli_args = arg_parser.parse_args()
    print(cli_args.target_path, cli_args.output_dir)
    main(cli_args.target_path, cli_args.output_dir)