Exception: "process 0 terminated with exit code 1" when using torch.multiprocessing.spawn on GPUs

import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP


def example(rank, world_size):
    """Run one DDP training step in the worker process identified by *rank*.

    Args:
        rank: Index of this worker, in ``range(world_size)``. When enough
            CUDA devices are present it is also used as the CUDA device index.
        world_size: Total number of worker processes in the group.
    """
    # The default env:// rendezvous reads these variables. Set fallbacks here
    # so every spawned worker has them, without overriding caller-set values.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")

    # create default process group
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    try:
        # Use one GPU per rank only when every rank can get one; otherwise
        # (e.g. a machine without CUDA) all ranks stay on the CPU so the
        # group is homogeneous.
        use_cuda = (
            torch.cuda.is_available()
            and torch.cuda.device_count() >= world_size
        )
        device = torch.device("cuda", rank) if use_cuda else torch.device("cpu")

        # create local model
        model = nn.Linear(10, 10).to(device)
        # construct DDP model; device_ids must be None for CPU modules
        ddp_model = DDP(model, device_ids=[rank] if use_cuda else None)

        # define loss function and optimizer
        loss_fn = nn.MSELoss()
        optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

        # forward pass
        outputs = ddp_model(torch.randn(20, 10).to(device))
        labels = torch.randn(20, 10).to(device)
        # backward pass
        loss_fn(outputs, labels).backward()
        # update parameters
        optimizer.step()
    finally:
        # Release the group's resources even if the training step failed.
        dist.destroy_process_group()


def main(world_size=2):
    """Spawn *world_size* worker processes, each running :func:`example`.

    NOTE: ``mp.spawn`` uses the "spawn" start method, so the child processes
    must be able to import ``example`` from a module. Run this as a script
    (``python file.py``); functions defined only in an interactive session or
    notebook cell may not be loadable by the children.
    """
    mp.spawn(example, args=(world_size,), nprocs=world_size, join=True)


if __name__ == "__main__":
    main()

I encounter the same error. I am on macOS with an M3 Max. Below is the traceback output:

ProcessExitedException: process 0 terminated with exit code 1
Input In [13], in <cell line: 1>()
      5 os.environ["MASTER_ADDR"] = "localhost"
      6 os.environ["MASTER_PORT"] = "29500"
----> 7 main()
Hide Traceback
Input In [12], in main()
      1 def main():
      2     world_size = 2
----> 3     mp.spawn(example,
      4         args=(world_size,),
      5         nprocs=world_size,
      6         join=True)

File /opt/anaconda3/envs/biomedgpt/lib/python3.9/site-packages/torch/multiprocessing/spawn.py:241, in spawn(fn, args, nprocs, join, daemon, start_method)
    235     msg = (
    236         "This method only supports start_method=spawn (got: %s).\n"
    237         "To use a different start_method use:\n\t\t"
    238         " torch.multiprocessing.start_processes(...)" % start_method
    239     )
    240     warnings.warn(msg)
--> 241 return start_processes(fn, args, nprocs, join, daemon, start_method="spawn")

File /opt/anaconda3/envs/biomedgpt/lib/python3.9/site-packages/torch/multiprocessing/spawn.py:197, in start_processes(fn, args, nprocs, join, daemon, start_method)
    194     return context
    196 # Loop on join until it returns True or raises an exception.
--> 197 while not context.join():
    198     pass

File /opt/anaconda3/envs/biomedgpt/lib/python3.9/site-packages/torch/multiprocessing/spawn.py:148, in ProcessContext.join(self, timeout)
    140         raise ProcessExitedException(
    141             "process %d terminated with signal %s" % (error_index, name),
    142             error_index=error_index,
   (...)
    145             signal_name=name,
    146         )
    147     else:
--> 148         raise ProcessExitedException(
    149             "process %d terminated with exit code %d" % (error_index, exitcode),
    150             error_index=error_index,
    151             error_pid=failed_process.pid,
    152             exit_code=exitcode,
    153         )
    155 original_trace = self.error_queues[error_index].get()
    156 msg = "\n\n-- Process %d terminated with the following error:\n" % error_index

Recommended topics

Hot tags