Floating point exception (core dumped) error when training the model using multi-GPU


I am trying to train a 3D medical image segmentation model using TensorFlow and Keras:

Model.py:

import time
import logging
import os
import datetime

os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"
os.environ["KERAS_BACKEND"] = "tensorflow"  # choose any: 'tensorflow', 'torch', 'jax'
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
#os.environ['TF_CPP_MAX_VLOG_LEVEL'] = '0'

import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import keras
from keras import ops
from keras import layers
from keras import mixed_precision

from medicai.models import UNETRPlusPlus
from medicai.metrics import BinaryDiceMetric
from medicai.losses import BinaryDiceCELoss
from medicai.utils.inference import SlidingWindowInference
from medicai.callbacks import SlidingWindowInferenceCallback

import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from src.experiment_config import ExperimentConfig
from src.data_pipeline.data_loader import data_loader

import pandas as pd
import numpy as np

os.environ["KERAS_BACKEND"] = "tensorflow"


class TFCheckpointCallback(keras.callbacks.Callback):
    """Save model + optimizer + epoch using TF checkpointing."""

    def __init__(self, ckpt, ckpt_manager):
        super().__init__()
        self.ckpt = ckpt
        self.ckpt_manager = ckpt_manager

    def on_epoch_end(self, epoch, logs=None):
        # Update epoch variable and save checkpoint
        self.ckpt.epoch.assign_add(1)  # increment epoch counter
        save_path = self.ckpt_manager.save()
        print(f"Saved checkpoint: {save_path} (epoch {int(self.ckpt.epoch.numpy())})")


def get_model():
    model = UNETRPlusPlus(
        encoder_name="unetr_plusplus_encoder",
        input_shape=ExperimentConfig.input_shape,
        num_classes=ExperimentConfig.num_classes,
        classifier_activation=None,
    )
    model.compile(
        optimizer=keras.optimizers.AdamW(
            learning_rate=ExperimentConfig.lr,
            weight_decay=ExperimentConfig.weight_decay,
        ),
        loss=BinaryDiceCELoss(
            from_logits=True,
            dice_weight=1.0,
            ce_weight=1.0,
            reduction="mean",
            num_classes=ExperimentConfig.num_classes,
        ),
        metrics=[
            BinaryDiceMetric(
                from_logits=True,
                ignore_empty=True,
                num_classes=ExperimentConfig.num_classes,
                name='dice',
            ),
            BinaryDiceMetric(
                from_logits=True,
                ignore_empty=True,
                target_class_ids=[0],
                num_classes=ExperimentConfig.num_classes,
                name='dice_tc',
            ),
            BinaryDiceMetric(
                from_logits=True,
                ignore_empty=True,
                target_class_ids=[1],
                num_classes=ExperimentConfig.num_classes,
                name='dice_wt',
            ),
            BinaryDiceMetric(
                from_logits=True,
                ignore_empty=True,
                target_class_ids=[2],
                num_classes=ExperimentConfig.num_classes,
                name='dice_et',
            )
        ],
    )
    return model


def get_inference_metric():
    swi_callback_metric = BinaryDiceMetric(
        from_logits=True,
        ignore_empty=True,
        num_classes=ExperimentConfig.num_classes,
        name='val_dice',
    )
    return swi_callback_metric


def run_sliding_window_inference_per_class_average(model, ds, roi_size, sw_batch_size, overlap, metrics_list):
    """
    Run sliding window inference on a dataset and compute all metrics (average + per class)
    """
    for metric in metrics_list:
        metric.reset_states()

    swi = SlidingWindowInference(
        model,
        num_classes=metrics_list[0].num_classes,
        roi_size=roi_size,
        sw_batch_size=sw_batch_size,
        overlap=overlap
    )

    for x, y in ds:
        y_pred = swi(x)
        for metric in metrics_list:
            metric.update_state(ops.convert_to_tensor(y), ops.convert_to_tensor(y_pred))

    # Gather results
    results = {}
    for metric in metrics_list:
        results[metric.name] = float(ops.convert_to_numpy(metric.result()))
    return results


def main():
    print(
        f"keras backend: {keras.config.backend()}\n"
        f"keras version: {keras.version()}\n"
        f"tensorflow version: {tf.__version__}\n"
    )

    # get keras backend
    keras_backend = keras.config.backend()

    strategy = tf.distribute.MirroredStrategy()
    total_device = strategy.num_replicas_in_sync
    print('Keras backend ', keras_backend)
    print('Total device found ', total_device)

    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    base_save_path = os.path.join(project_root, "experiments", "msd_brain")
    unetrplusplus_path = os.path.join(base_save_path, "SwinUnetr")
    os.makedirs(unetrplusplus_path, exist_ok=True)

    # Subfolders
    logs_path = os.path.join(unetrplusplus_path, "logs")
    history_path = os.path.join(unetrplusplus_path, "history")
    plots_path = os.path.join(unetrplusplus_path, "plots")
    os.makedirs(logs_path, exist_ok=True)
    os.makedirs(history_path, exist_ok=True)
    os.makedirs(plots_path, exist_ok=True)

    # Timestamp
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    # Save path for best model weights
    save_path = os.path.join(unetrplusplus_path, f"best_model_weights_{timestamp}.weights.h5")

    # Load datasets
    tfrecord_pattern = os.path.join(project_root, "data", "msd_brain", "tfrecords", "{}_shard_*.tfrec")

    # batch size for training
    train_batch = ExperimentConfig.batch_size_train * total_device

    train_ds = data_loader(
        tfrecord_pattern.format("training"),
        batch_size=train_batch,
        shuffle=True
    )
    val_ds = data_loader(
        tfrecord_pattern.format("validation"),
        batch_size=ExperimentConfig.batch_size_val,
        shuffle=False
    )
    test_ds = data_loader(
        tfrecord_pattern.format("test"),
        batch_size=ExperimentConfig.batch_size_val,
        shuffle=False
    )

    with strategy.scope():
        model = get_model()

    checkpoint_dir = os.path.join(unetrplusplus_path, "checkpoints")
    os.makedirs(checkpoint_dir, exist_ok=True)

    with strategy.scope():
        ckpt = tf.train.Checkpoint(
            epoch=tf.Variable(0),       # epoch counter — saved as part of checkpoint
            optimizer=model.optimizer,  # optimizer state
            model=model                 # model weights
        )
        ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_dir, max_to_keep=3)

        # Validation with sliding window callback
        swi_callback_metric = get_inference_metric()

    # Create checkpoint callback
    tf_ckpt_callback = TFCheckpointCallback(ckpt, ckpt_manager)

    # Create SWI callback
    swi_callback = SlidingWindowInferenceCallback(
        model,
        dataset=val_ds,
        metrics=swi_callback_metric,
        num_classes=ExperimentConfig.num_classes,
        interval=ExperimentConfig.sliding_window_interval,
        overlap=ExperimentConfig.sliding_window_overlap,
        roi_size=(ExperimentConfig.input_shape[0], ExperimentConfig.input_shape[1], ExperimentConfig.input_shape[2]),
        sw_batch_size=ExperimentConfig.sw_batch_size * total_device,
        save_path=save_path
    )

    print(f"Model size: {model.count_params() / 1e6:.2f} M")

    start_time = time.time()
    history = model.fit(
        train_ds,
        epochs=ExperimentConfig.epochs,
        callbacks=[
            swi_callback,
            tf_ckpt_callback
        ])
    end_time = time.time()

    training_time = end_time - start_time
    print(f"Total training time (seconds): {training_time:.2f}")

    # Save training time to a file
    with open(os.path.join(logs_path, f"training_time_{timestamp}.txt"), "w") as f:
        f.write(f"Total training time (seconds): {training_time:.2f}\n")

    # Save history to CSV
    history_file = os.path.join(history_path, f"training_history_{timestamp}.csv")
    pd.DataFrame(history.history).to_csv(history_file, index=False)

    # Plot training loss
    plt.figure(figsize=(10, 5))
    plt.plot(history.history['loss'], label='train_loss')
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training Loss")
    plt.legend()
    plt.grid()
    plt.savefig(os.path.join(plots_path, f"loss_curve_{timestamp}.png"))
    plt.close()

    # Plot average Dice
    if 'dice' in history.history:
        plt.figure(figsize=(10, 5))
        plt.plot(history.history['dice'], label='train_dice')
        plt.xlabel("Epoch")
        plt.ylabel("Average Dice")
        plt.title("Training Average Dice")
        plt.legend()
        plt.grid()
        plt.savefig(os.path.join(plots_path, f"dice_curve_{timestamp}.png"))
        plt.close()

    print("Training and saving plots finished successfully.")

dataloader.py:

import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from src.deserialization.full_deserialization import parse_tfrecord_fn
from src.data_pipeline.transformations import rearrange_shape
from src.data_pipeline.transformations import train_transformation, val_transformation

import tensorflow as tf


def data_loader(tfrecord_pattern, batch_size=1, shuffle=True):
    """Create the data loader.

    This function builds a `tf.data.Dataset` pipeline that reads serialized TFRecords
    from one or more files (using a glob pattern), applies parsing, reshaping, and
    transformations, and returns batches ready for training or validation.

    Args:
        tfrecord_pattern (str): File path pattern matching one or more TFRecord files.
        batch_size (int, optional): Number of samples per batch. Defaults to 1.
        shuffle (bool, optional): Whether to shuffle the dataset. Defaults to True.

    Returns:
        tf.data.Dataset: A batched and prefetched dataset yielding tuples of
        (image, label, image_affine, label_affine, image_pixdim, label_pixdim)
        after parsing, reshaping, and transformations.
    """
    num_parallel_calls = tf.data.AUTOTUNE

    # dataset is a tf.data.Dataset where each element is one serialized tf.train.Example
    dataset = tf.data.TFRecordDataset(tf.io.gfile.glob(tfrecord_pattern))
    dataset = dataset.shuffle(buffer_size=16) if shuffle else dataset
    dataset = dataset.map(parse_tfrecord_fn, num_parallel_calls=num_parallel_calls)
    dataset = dataset.map(rearrange_shape, num_parallel_calls=num_parallel_calls)

    if shuffle:
        dataset = dataset.map(train_transformation, num_parallel_calls=num_parallel_calls)
    else:
        dataset = dataset.map(val_transformation, num_parallel_calls=num_parallel_calls)

    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

experiment_config.py:

class ExperimentConfig:
    """
    Configuration class with hard-coded hyperparameters for training and validation.
    """
    # Hard-coded hyperparameters
    batch_size_train = 8          # 8
    batch_size_val = 1            # 2
    epochs = 2                    # 1000
    lr = 1e-4
    weight_decay = 1e-4
    sliding_window_interval = 2   # 20
    sliding_window_overlap = 0.5
    input_shape = (128, 128, 128, 4)
    num_classes = 3
    PROB = 0.5
    sw_batch_size = 2             # 2

    def __repr__(self):
        return (
            f"ExperimentConfig(batch_size_train={self.batch_size_train}, "
            f"batch_size_val={self.batch_size_val}, "
            f"epochs={self.epochs}, "
            f"lr={self.lr}, "
            f"weight_decay={self.weight_decay}, "
            f"sliding_window_interval={self.sliding_window_interval}, "
            f"sliding_window_overlap={self.sliding_window_overlap}, "
            f"input_shape={self.input_shape}, "
            f"num_classes={self.num_classes}, "
            f"PROB={self.PROB}, "
            f"sw_batch_size={self.sw_batch_size})"
        )

When I train with a single GPU, the training always finishes successfully. However, when I train on multiple GPUs (I tried 2, 3, and 4 GPUs), I always get Floating point exception (core dumped):

2025-12-17 12:28:31.666487: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
keras backend: tensorflow
keras version: 3.12.0
tensorflow version: 2.20.0
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
I0000 00:00:1765970958.724694 160312 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1765970958.725422 160312 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38670 MB memory: -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:87:00.0, compute capability: 8.0
I0000 00:00:1765970958.725885 160312 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 1
I0000 00:00:1765970958.726444 160312 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 38670 MB memory: -> device: 1, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:c7:00.0, compute capability: 8.0
Keras backend tensorflow
Total device found 2
WARNING:tensorflow:From /net/pr2/projects/plgrid/plggneural/3d_seg_project/3D-Medical-Image-Segmentation/venv/lib/python3.10/site-packages/tensorflow/python/util/deprecation.py:660: calling map_fn_v2 (from tensorflow.python.ops.map_fn) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Use fn_output_signature instead
Model size: 42.66 M
2025-12-17 12:29:26.512602: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:390] TFRecordDataset `buffer_size` is unspecified, default to 262144
2025-12-17 12:29:43.247925: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled
     [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2025-12-17 12:29:43.248224: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled
     [[{{node MultiDeviceIteratorGetNextFromShard}}]]
     [[RemoteCall]]
2025-12-17 12:29:43.249328: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled
     [[{{node MultiDeviceIteratorGetNextFromShard}}]]
     [[RemoteCall]]
Epoch 1/2
2025-12-17 12:31:05.058981: I external/local_xla/xla/service/service.cc:163] XLA service 0x146328c01540 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-12-17 12:31:05.059020: I external/local_xla/xla/service/service.cc:171] StreamExecutor device (0): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2025-12-17 12:31:05.059283: I external/local_xla/xla/service/service.cc:171] StreamExecutor device (1): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2025-12-17 12:31:05.137238: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91700
2025-12-17 12:31:07.623407: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:07.805392: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
I0000 00:00:1765971068.804270 160357 device_compiler.h:196] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.
2025-12-17 12:31:09.734860: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:09.912544: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:32.318966: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=0,k13=2,k14=3,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:32.344476: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.025559828s
Trying algorithm eng23{k2=0,k13=2,k14=3,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:34.156184: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:35.306164: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 2.15004513s
Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:36.744870: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:36.927667: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:37.081083: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:37.253482: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:37.430320: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:42.279943: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:43.257783: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.977915871s
Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:46.830807: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91700
48/Unknown 450s 6s/step - dice: 0.0777 - dice_et: 0.0201 - dice_tc: 0.0344 - dice_wt: 0.1787 - loss: 3.0380
2025-12-17 12:37:14.868497: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
     [[{{node MultiDeviceIteratorGetNextFromShard}}]]
     [[RemoteCall]]
2025-12-17 12:37:15.919125: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:16.093522: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:17.873295: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:18.032921: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:27.808009: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:37:28.394435: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.586189512s
Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:37:28.948297: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:29.110080: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:32.859598: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:37:33.324957: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.465430048s
Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
Floating point exception (core dumped)

I tried different values of ExperimentConfig.batch_size_train, but I still get the same issue.
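
One thing I noticed in the log is that, right after the OUT_OF_RANGE: End of sequence message, the conv shapes have a batch dimension of 3 instead of 4, so the last batch of the epoch seems to be smaller than the others and is split unevenly across the replicas. I don't know whether that is related to the crash, but a hypothetical tweak to data_loader to rule it out would be:

# Hypothetical change inside data_loader (not what I currently run):
# drop_remainder=True makes every global batch exactly `batch_size` samples,
# so each replica always receives an equal, non-empty share.
dataset = dataset.batch(batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)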

Here are the framework versions:

TensorFlow version: 2.20.0
Keras version: 3.12.0
CUDA version: 12.5.1
cuDNN version: 9

So my questions are:

Why am I always getting Floating point exception (core dumped) when using multiple GPUs?

I always get UserWarning: Your input ran out of data; interrupting training. Make sure that your dataset or generator can generate at least steps_per_epoch * epochs batches. You may need to use the .repeat() function when building your dataset.

I didn't specify steps_per_epoch or use repeat(), so is it safe to ignore this warning? My intention is to train the model until the dataset is exhausted each epoch.
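
If it is not safe to ignore, my understanding is that the usual pattern is to repeat the dataset and pass an explicit steps_per_epoch, roughly like the sketch below (num_train_samples is a placeholder for my actual training-set size; it is not computed anywhere in my code):

import math

# Sketch only: repeat the training data and tell Keras how many steps make one epoch.
# num_train_samples is hypothetical; train_batch is the global batch size from Model.py.
steps_per_epoch = math.ceil(num_train_samples / train_batch)
train_ds = train_ds.repeat()  # avoids running out of data mid-epoch

history = model.fit(
    train_ds,
    epochs=ExperimentConfig.epochs,
    steps_per_epoch=steps_per_epoch,
    callbacks=[swi_callback, tf_ckpt_callback],
)

Is something like this required with MirroredStrategy, or can I keep relying on the dataset ending the epoch on its own?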

This is my first time using distributed training with TensorFlow. Am I using it correctly? I used

with strategy.scope():

only when I define the model, the swi_callback_metric, the ckpt (tf.train.Checkpoint), and the ckpt_manager, but I didn't use it when creating the tf_ckpt_callback and swi_callback. Please see Model.py above.
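
For reference, my understanding of the documented MirroredStrategy pattern is that everything that creates variables (model, optimizer, metrics, checkpoint counters) should be built inside the scope, while plain Python callbacks only hold references and do not need to be. A condensed version of what that would look like with my script is below; I have not verified that this detail matters for the crash:

strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Everything that creates tf.Variables is built under the scope.
    model = get_model()                      # model + optimizer variables
    swi_callback_metric = get_inference_metric()
    ckpt = tf.train.Checkpoint(
        epoch=tf.Variable(0),
        optimizer=model.optimizer,
        model=model,
    )
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_dir, max_to_keep=3)

# Callbacks only hold references to the objects above, so (as far as I understand)
# they can be created outside the scope.
tf_ckpt_callback = TFCheckpointCallback(ckpt, ckpt_manager)

Is this split correct, or should anything else (for example the dataset creation or the callbacks) also go under strategy.scope()?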
