# This code reads the most important DICOM tags and stores them in a JSON file and an Excel file. It identifies the studies within a given directory using the folder name, the DICOM Study ID, and the study date. For each study it creates a dictionary containing the most useful information, along with a nested dictionary for the DICOM series (each DICOM study has many DICOM images, and each image belongs to a series). The full code, with descriptions, follows. You can add or remove tags to collect the information you want.
#FINAL 20231216
#My context: I coded this on Windows 11 with an RTX 3080 Ti, a 12th-gen Core i9 and 32 GB of RAM. I am coding in VS Code and using a Jupyter notebook.
#Your requirements: It doesn't need any exceptional hardware; you can run it on an average PC/laptop.
import json  # for storing as json
import os  # for looping through system directories

import pandas as pd  # for turning the dict into excel; first we transform it into a pandas dataframe
import pydicom as pm  # for reading dicoms
from IPython.display import HTML, display  # so you can click on the stored excel and json and open them from the jupyter notebook
from pydicom.multival import MultiValue  # for reading dicom metadata
from pydicom.valuerep import PersonName  # since turning the dictionary into json raised an error, you should use this
from tqdm.notebook import tqdm  # for that fancy loop progress, I like it though
def get_dicom_tag_value(dicom_file, tag, default=None):
    """Return the value stored under ``tag`` in ``dicom_file``.

    Args:
        dicom_file: Object exposing ``get(tag, default)``; the caller in this
            file passes the dataset returned by ``pydicom.dcmread``.
        tag: Tag to look up, e.g. ``(0x0010, 0x0010)``.
        default: Value returned when the tag is absent.

    Returns:
        ``default`` when the tag is missing; a plain ``list`` when the value
        is a ``MultiValue``; otherwise the value unchanged.
    """
    element = dicom_file.get(tag, None)
    if element is None:
        return default
    # A lookup by tag yields the data element rather than its value, so the
    # MultiValue check must be made on ``element.value``. Objects without a
    # ``value`` attribute (e.g. a value returned directly) are used as-is.
    value = getattr(element, "value", element)
    if isinstance(value, MultiValue):
        return list(value)  # Convert MultiValue to list
    return value
def get_path_to_first_subfolder(full_path, first_subfolder):
    """Truncate ``full_path`` right after its ``first_subfolder`` component.

    Args:
        full_path: Path string whose components are separated by ``os.sep``.
        first_subfolder: Name of the component to cut after.

    Returns:
        The path up to and including the first component equal to
        ``first_subfolder``. ``full_path`` is returned unchanged when the
        name is empty or does not occur in the path.
    """
    # An empty name would match the empty leading component produced by
    # splitting an absolute path, truncating the result to ''.
    if not first_subfolder:
        return full_path
    path_parts = full_path.split(os.sep)
    try:
        subfolder_index = path_parts.index(first_subfolder)
    except ValueError:
        return full_path
    return os.sep.join(path_parts[:subfolder_index + 1])
def count_subfolders(directory):
    """Count the folders and files found anywhere below ``directory``.

    Returns:
        A ``(folder_count, file_count)`` tuple accumulated over every level
        visited by ``os.walk``.
    """
    per_level = [
        (len(subdirs), len(filenames))
        for _, subdirs, filenames in os.walk(directory)
    ]
    folder_count = sum(n_dirs for n_dirs, _ in per_level)
    file_count = sum(n_files for _, n_files in per_level)
    return folder_count, file_count
class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder used to dump the multi-level DICOM dictionary.

    Adds handling for two types the stock encoder rejects: ``PersonName``
    objects are written as strings and ``MultiValue`` objects as lists.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``."""
        if isinstance(obj, PersonName):
            return str(obj)  # PersonName -> string
        if isinstance(obj, MultiValue):
            return list(obj)  # MultiValue -> list
        # Anything else is left to the base class, which raises TypeError.
        return super().default(obj)
def ensure_json_extension(directory):
    """Return a path that is guaranteed to point at a ``.json`` file.

    Args:
        directory: Target path as a string.

    Returns:
        ``directory`` unchanged when it already ends with ``.json`` (in any
        letter case); otherwise ``directory`` with the file name
        ``JSON.json`` joined onto it.
    """
    if directory.lower().endswith(".json"):
        return directory
    # os.path.join uses the platform's separator instead of a hard-coded
    # backslash and does not double a trailing separator.
    return os.path.join(directory, "JSON.json")
def ensure_excel_extension(directory):
    """Return a path that is guaranteed to point at an ``.xlsx`` file.

    Args:
        directory: Target path as a string.

    Returns:
        ``directory`` unchanged when it already ends with ``.xlsx`` (in any
        letter case); otherwise ``directory`` with the file name
        ``excel.xlsx`` joined onto it.
    """
    if directory.lower().endswith(".xlsx"):
        return directory
    # os.path.join uses the platform's separator instead of a hard-coded
    # backslash and does not double a trailing separator.
    return os.path.join(directory, "excel.xlsx")
def create_clickable_dir_path(dir_path):
    """Wrap ``dir_path`` in an HTML anchor that opens in a new tab.

    The path is used verbatim both as the link target and as the link text;
    no URL conversion or escaping is applied.
    """
    link_target = f"{dir_path}"
    anchor_markup = f'<a href="{link_target}" target="_blank">{dir_path}</a>'
    return HTML(anchor_markup)
def _first_level_subfolder(root, dicom_dir):
    """Return the name of the folder directly under ``dicom_dir`` that holds ``root`` ('' for ``dicom_dir`` itself)."""
    relative = os.path.relpath(root, dicom_dir)
    if relative == os.curdir:
        return ""
    return relative.split(os.sep)[0]


def _build_study_record(dicom_file, study_id, study_date, dir_to_root):
    """Collect the study-level fields (plus an empty series dictionary) from one DICOM file."""
    private_info = {
        'name': get_dicom_tag_value(dicom_file, (0x0010, 0x0010)),
        'institute': get_dicom_tag_value(dicom_file, (0x0008, 0x0080)),
        'patient_id': get_dicom_tag_value(dicom_file, (0x0010, 0x0020)),
        'accession_number': get_dicom_tag_value(dicom_file, (0x0008, 0x0050)),
    }
    return {
        'dir_to_root': dir_to_root,
        'study_description': get_dicom_tag_value(dicom_file, (0x0008, 0x1030)),
        'date': study_date,
        'age': get_dicom_tag_value(dicom_file, (0x0010, 0x1010)),
        'sex': get_dicom_tag_value(dicom_file, (0x0010, 0x0040)),
        'manufacture_model': get_dicom_tag_value(dicom_file, (0x0008, 0x1090)),
        'manufacture_brand': get_dicom_tag_value(dicom_file, (0x0008, 0x0070)),
        'protocol': get_dicom_tag_value(dicom_file, (0x0018, 0x1030)),
        'study_id': study_id,
        'patient_weight': get_dicom_tag_value(dicom_file, (0x0010, 0x1030)),
        'Image_type': get_dicom_tag_value(dicom_file, (0x0008, 0x0008)),
        'body_part': get_dicom_tag_value(dicom_file, (0x0018, 0x0015)),
        # Modality is tag (0008,0060); (0008,0050) is the Accession Number,
        # which is already stored under private_info. The key spelling is
        # kept so existing consumers of the output keep working.
        'modalitty': get_dicom_tag_value(dicom_file, (0x0008, 0x0060)),
        'private_info': private_info,
        'image_dicom_data_list': {},
    }


def _build_series_record(dicom_file):
    """Collect the series-level fields from one DICOM file."""
    return {
        'dicom_data_description': get_dicom_tag_value(dicom_file, (0x0008, 0x103E)),
        'body_part': get_dicom_tag_value(dicom_file, (0x0018, 0x0015)),
        'slice_thickness': get_dicom_tag_value(dicom_file, (0x0018, 0x0050)),
        'Image_comment': get_dicom_tag_value(dicom_file, (0x0020, 0x4000)),
        'kvp': get_dicom_tag_value(dicom_file, (0x0018, 0x0060)),
        'exposure': get_dicom_tag_value(dicom_file, (0x0018, 0x1152)),
        'exposure_time': get_dicom_tag_value(dicom_file, (0x0018, 0x1150)),
    }


def _store_dicom_data_as_json(dicom_data, store_as_json_dir):
    """Write ``dicom_data`` to a JSON file and show a clickable link; failures are printed, not raised."""
    try:
        json_read = json.dumps(dicom_data, indent=4, cls=CustomJSONEncoder)
        json_path = ensure_json_extension(str(store_as_json_dir))
        with open(json_path, 'w') as json_file:
            json_file.write(json_read)
        print(f"""Json stored at :::""")
        display(create_clickable_dir_path(json_path))
    except Exception as e:  # best effort: the caller still gets the dictionary
        print(f"""Error storing the json ::: {e} \n """)


def _store_dicom_data_as_excel(dicom_data, store_as_excel_dir):
    """Write ``dicom_data`` to an Excel file and show a clickable link; failures are printed, not raised."""
    try:
        dataframes = []
        for key, value in dicom_data.items():
            # One column per study, one row per field.
            df = pd.DataFrame(value)
            # Adds the top-level key as an extra column; after the transpose
            # below it shows up as an extra row labelled 'Key'.
            df['Key'] = key
            dataframes.append(df)
        # Concatenate and transpose so that each study becomes a row.
        df2 = pd.concat(dataframes).T
        excel_path = ensure_excel_extension(str(store_as_excel_dir))
        df2.to_excel(excel_path)
        print(f"""Excel stored at :::""")
        display(create_clickable_dir_path(excel_path))
    except Exception as e:  # best effort: the caller still gets the dictionary
        print(f"""Error storing the excel ::: {e} \n """)


def get_dicomdir_give_dicomdicom_datadic(dicom_dir,  # directory to read; DICOM studies should usually each be in one folder, preferably named with the patient's unique id/name
                                         dicom_validation=True,  # check whether each file is a DICOM file before reading it (slower, but recommended)
                                         folder_list_name_indicomdir=None,  # optional list of first-level folder names to read; all other folders are skipped
                                         store_as_json_dir=None,  # where to store the dictionary as JSON, if wanted
                                         store_as_excel_dir=None  # where to store the dictionary as Excel, if wanted
                                         ):
    """Build a multi-level dictionary of DICOM metadata for everything under ``dicom_dir``.

    The top level has a single key: the last component of ``dicom_dir``.
    Below it there is one entry per study, keyed by
    ``f'{first_subfolder}_{study_id}_{study_date}'``. Each study entry holds
    the study-level fields, a ``private_info`` dictionary (name, institute,
    patient id, accession number) and ``image_dicom_data_list``, a dictionary
    keyed by series number that holds the series-level fields. When several
    files share a series number, the fields of the last file read are kept.

    Files are only recorded when study id, study date and series number are
    all present. Per-file read errors are printed and the file is skipped.

    Args:
        dicom_dir: Directory to scan recursively. Best practice is to place
            each study in its own subfolder directly under ``dicom_dir``.
        dicom_validation: If True, files that ``pydicom.misc.is_dicom``
            rejects are skipped. This is slower, but protects the loop from
            non-DICOM files (DICOM files often have no extension).
        folder_list_name_indicomdir: Optional collection of first-level
            folder names. When given, only those folders (and everything
            below them) are read; files directly in ``dicom_dir`` are
            skipped too. Only first-level names are matched, not subfolders
            of subfolders.
        store_as_json_dir: Optional path for a JSON dump of the dictionary.
        store_as_excel_dir: Optional path for an Excel dump of the dictionary.

    Returns:
        The nested dictionary described above.

    Note:
        Keying studies by the first-level folder name alone would be faster,
        at the cost of merging different studies stored in the same folder.
    """
    total_subfolder, total_files = count_subfolders(dicom_dir)
    print(f'your direcotry contains {total_subfolder} folders and {total_files} files')
    last_dir_name = os.path.basename(os.path.normpath(dicom_dir))
    studies = {}
    dicom_data = {last_dir_name: studies}
    # os.walk yields dicom_dir itself in addition to every subfolder, hence +1.
    for root, dirs, files in tqdm(os.walk(dicom_dir), desc="Processing directories",
                                  total=total_subfolder + 1, unit='folder'):
        first_subfolder = _first_level_subfolder(root, dicom_dir)
        if folder_list_name_indicomdir and first_subfolder not in folder_list_name_indicomdir:
            print(f"""The folder {first_subfolder} was not in your definied list.""")
            continue  # Skip if the first subfolder is not in the user-defined list
        for file in files:
            file_path = os.path.join(root, file)
            try:
                if dicom_validation and not pm.misc.is_dicom(file_path):
                    continue  # Skip files that are not DICOM
                # Only metadata is needed, so the pixel data is not loaded.
                dicom_file = pm.dcmread(file_path, stop_before_pixels=True)
                study_id = get_dicom_tag_value(dicom_file, (0x0020, 0x0010))
                dicom_data_number = get_dicom_tag_value(dicom_file, (0x0020, 0x0011))
                study_date = get_dicom_tag_value(dicom_file, (0x0008, 0x0020))
                # The series number is compared explicitly so that a valid
                # series number of 0 is not discarded as "missing".
                if not study_id or not study_date or dicom_data_number in (None, ""):
                    continue
                # You can change this to study_unique = first_subfolder for speed.
                study_unique = f'{first_subfolder}_{study_id}_{study_date}'
                if study_unique not in studies:
                    dir_to_root = (get_path_to_first_subfolder(root, first_subfolder)
                                   if first_subfolder else root)
                    studies[study_unique] = _build_study_record(
                        dicom_file, study_id, study_date, dir_to_root)
                studies[study_unique]['image_dicom_data_list'][dicom_data_number] = \
                    _build_series_record(dicom_file)
            except Exception as e:  # keep scanning when a single file is unreadable
                print(f"""Error reading for {file}::: {e} \n """)
                continue
    if store_as_json_dir is not None:
        _store_dicom_data_as_json(dicom_data, store_as_json_dir)
    if store_as_excel_dir is not None:
        _store_dicom_data_as_excel(dicom_data, store_as_excel_dir)
    return dicom_data
#example of running code
# Guarded so that importing this file does not start a scan of the
# hard-coded example paths; run as a script or notebook cell it behaves as before.
if __name__ == "__main__":
    dicom_dir = r"F:\Data\Big Pancreas (CT, EUS)\Raw Data Hospital\Dr Radmard\Valid Case"
    save_dir_json = r'F:\Data\Big Pancreas (CT, EUS)\Raw Data Hospital\Radmard_all_dcm.json'
    save_dir_xlsx = r'F:\Data\Big Pancreas (CT, EUS)\Raw Data Hospital\Radmard_all_dcm.xlsx'
    dicom_dic = get_dicomdir_give_dicomdicom_datadic(
        dicom_dir,  # directory to read; DICOM studies should usually each be in one folder, preferably named with the patient's unique id/name
        dicom_validation=True,  # check whether each file is a DICOM file before reading it (slower, but recommended)
        folder_list_name_indicomdir=None,  # optional list of first-level folder names to read; other folders are skipped
        store_as_json_dir=save_dir_json,  # where to store the dictionary as JSON
        store_as_excel_dir=save_dir_xlsx,  # where to store the dictionary as Excel
    )