> loading shuffle-idx mapping from ../../data/index-cache/91eec3f91588e8e2e456859caa3dcd56_shuffle_idx.npy
loaded indexed file in 0.000 seconds
total number of samples: 369
total number of epochs: 1
> finished creating GPT datasets ...
[after dataloaders are built] datetime: 2025-01-13 03:27:29
done with setup ...
training ...
[before the start of training step] datetime: 2025-01-13 03:27:29
Traceback (most recent call last):
File "/workspace/Megatron-DeepSpeed_new/Megatron-DeepSpeed/pretrain_gpt.py", line 360, in <module>
pretrain(train_valid_test_datasets_provider,
File "/workspace/Megatron-DeepSpeed_new/Megatron-DeepSpeed/megatron/training.py", line 235, in pretrain
iteration = train(forward_step_func,
File "/workspace/Megatron-DeepSpeed_new/Megatron-DeepSpeed/megatron/training.py", line 1269, in train
train_step(forward_step_func,
File "/workspace/Megatron-DeepSpeed_new/Megatron-DeepSpeed/megatron/training.py", line 709, in train_step
losses_reduced = forward_backward_func(
File "/workspace/Megatron-DeepSpeed_new/Megatron-DeepSpeed/megatron/core/pipeline_parallel/schedules.py", line 349, in forward_backward_no_pipelining
output_tensor = forward_step(forward_step_func, data_iterator, model, num_microbatches,
File "/workspace/Megatron-DeepSpeed_new/Megatron-DeepSpeed/megatron/core/pipeline_parallel/schedules.py", line 199, in forward_step
output_tensor, loss_func = forward_step_func(data_iterator, model)
File "/workspace/Megatron-DeepSpeed_new/Megatron-DeepSpeed/pretrain_gpt.py", line 286, in forward_step
output_tensor, other_losses = model(tokens, position_ids, attention_mask,
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/workspace/Megatron-DeepSpeed_new/Megatron-DeepSpeed/megatron/model/distributed.py", line 58, in forward
return self.module(*inputs, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/workspace/Megatron-DeepSpeed_new/Megatron-DeepSpeed/megatron/model/module.py", line 191, in forward
outputs = self.module(*inputs, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/workspace/Megatron-DeepSpeed_new/Megatron-DeepSpeed/megatron/model/gpt_model.py", line 258, in forward
lm_output = post_language_model_processing(
File "/workspace/Megatron-DeepSpeed_new/Megatron-DeepSpeed/megatron/model/gpt_model.py", line 68, in post_language_model_processing
loss = cross_entropy(output.float(), labels)
File "/workspace/Megatron-DeepSpeed_new/Megatron-DeepSpeed/megatron/core/tensor_parallel/cross_entropy.py", line 142, in vocab_parallel_cross_entropy
return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target, label_smoothing)
File "/usr/local/lib/python3.10/dist-packages/torch/autograd/function.py", line 539, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/workspace/Megatron-DeepSpeed_new/Megatron-DeepSpeed/megatron/core/tensor_parallel/cross_entropy.py", line 47, in forward
predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
IndexError: shape mismatch: indexing tensors could not be broadcast together with shapes [2048], [4096]
[2025-01-13 03:27:34,178] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 20947) of binary: /usr/bin/python
Traceback (most recent call last):
File "/usr/local/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==2.1.0a0+32f93b1', 'console_scripts', 'torchrun')())
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
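For context (not part of the original log): the IndexError is raised by PyTorch advanced indexing inside `_VocabParallelCrossEntropy.forward`, where `arange_1d` (one index per row of the flattened logits, 2048 here) and `masked_target_1d` (the flattened labels, 4096 here) must have the same length. In other words, the labels cover twice as many token positions as the model output. A minimal standalone sketch (plain PyTorch, not Megatron code) that reproduces the same error:

```python
# Minimal sketch reproducing the IndexError above: PyTorch advanced indexing
# requires both index tensors to have the same (broadcastable) shape.
import torch

vocab_size = 32
logits_2d = torch.randn(2048, vocab_size)              # flattened logits: 2048 rows (batch * seq_len)
arange_1d = torch.arange(logits_2d.size(0))            # 2048 row indices
masked_target_1d = torch.randint(vocab_size, (4096,))  # 4096 flattened labels: twice as many positions

try:
    # Mirrors megatron/core/tensor_parallel/cross_entropy.py, line 47 in the traceback
    predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
except IndexError as e:
    print(e)  # shape mismatch: indexing tensors could not be broadcast together with shapes [2048], [4096]
```

The factor of two between 2048 and 4096 is usually the thing to track down, for example a `--seq-length` that does not match how the data was packed, or a parallelism setting that splits the logits but not the labels.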
Hardware:
2 nodes, 1 RTX 4090 per node

The master node config:

The client node config is the same, with only NODE_RANK different.

Start training; then the error above occurred.

Output of some variables:
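One way to confirm where the mismatch enters (a hypothetical debugging aid, not something from the original report) is to print the shapes that reach the loss on each rank, e.g. from `forward_step` in `pretrain_gpt.py`:

```python
# Hypothetical helper (names assumed): print the shapes of the tensors feeding
# the loss on every rank, to see where the 2048-vs-4096 mismatch is introduced.
import torch
import torch.distributed as dist

def log_shapes(tokens, labels, attention_mask, position_ids):
    rank = dist.get_rank() if dist.is_initialized() else 0
    print(f"[rank {rank}] tokens: {tuple(tokens.shape)} "
          f"labels: {tuple(labels.shape)} "
          f"attention_mask: {tuple(attention_mask.shape)} "
          f"position_ids: {tuple(position_ids.shape)}",
          flush=True)
```

If `labels` shows twice the sequence length of `tokens` (or of the returned logits), the mismatch is introduced before the loss, e.g. by the data pipeline or by a split applied to one tensor but not the other.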