You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I have an issue:
The mamba2 model trains fine on the first batch, but on the second batch the following error occurs during backpropagation through the MambaSplitConv1dScanCombinedFn method in the ssd_combined.py file. Based on the tensor shape reported in the error ([768, 4]), the problematic tensor is likely conv1d_weight. Can anyone help me?
/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/autograd/init.py:251: UserWarning: Error detected in MambaSplitConv1dScanCombinedFnBackward. Traceback of forward call that caused the error:
File "/root/miniconda3/envs/py310/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/root/miniconda3/envs/py310/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/autodl-fs/data/20241122v1/videomamba2/trainvm2.py", line 113, in
outputs = myModule(imgs.to(torch.float32))
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/autodl-fs/data/20241122v1/videomamba2/models/videomamba_pretrain.py", line 737, in forward
hidden_states, residual = layer(
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/autodl-fs/data/20241122v1/videomamba2/models/videomamba_pretrain.py", line 94, in forward
hidden_states = self.mixer(hidden_states, inference_params=inference_params)
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/autodl-fs/data/20241122v1/mamba/mamba_ssm/modules/videomamba2.py", line 242, in forward
out = mamba_split_conv1d_scan_combined(
File "/autodl-fs/data/20241122v1/mamba/mamba_ssm/ops/triton/ssd_combined.py", line 931, in mamba_split_conv1d_scan_combined
return MambaSplitConv1dScanCombinedFn.apply(zxbcdt, conv1d_weight, conv1d_bias, dt_bias, A, D, chunk_size, initial_states, seq_idx, dt_limit, return_final_states, activation, rmsnorm_weight, rmsnorm_eps, outproj_weight, outproj_bias, headdim, ngroups, norm_before_gate)
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/autograd/function.py", line 539, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
(Triggered internally at ../torch/csrc/autograd/python_anomaly_mode.cpp:114.)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
Traceback (most recent call last):
File "/root/miniconda3/envs/py310/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/root/miniconda3/envs/py310/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/autodl-fs/data/20241122v1/videomamba2/trainvm2.py", line 119, in
loss.backward(retain_graph=True)
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
torch.autograd.backward(
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
return user_fn(self, *args)
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 140, in decorate_bwd
return bwd(*args, **kwargs)
File "/autodl-fs/data/20241122v1/mamba/mamba_ssm/ops/triton/ssd_combined.py", line 838, in backward
zxbcdt, conv1d_weight, conv1d_bias, out, A, D, dt_bias, initial_states, seq_idx, rmsnorm_weight, rstd, outproj_weight, outproj_bias = ctx.saved_tensors
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [768, 4]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
The text was updated successfully, but these errors were encountered:
I have an issue:
The mamba2 model trains fine on the first batch, but on the second batch the following error occurs during backpropagation through the MambaSplitConv1dScanCombinedFn method in the ssd_combined.py file. Based on the tensor shape reported in the error ([768, 4]), the problematic tensor is likely conv1d_weight. Can anyone help me?
/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/autograd/init.py:251: UserWarning: Error detected in MambaSplitConv1dScanCombinedFnBackward. Traceback of forward call that caused the error:
File "/root/miniconda3/envs/py310/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/root/miniconda3/envs/py310/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/autodl-fs/data/20241122v1/videomamba2/trainvm2.py", line 113, in
outputs = myModule(imgs.to(torch.float32))
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/autodl-fs/data/20241122v1/videomamba2/models/videomamba_pretrain.py", line 737, in forward
hidden_states, residual = layer(
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/autodl-fs/data/20241122v1/videomamba2/models/videomamba_pretrain.py", line 94, in forward
hidden_states = self.mixer(hidden_states, inference_params=inference_params)
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/autodl-fs/data/20241122v1/mamba/mamba_ssm/modules/videomamba2.py", line 242, in forward
out = mamba_split_conv1d_scan_combined(
File "/autodl-fs/data/20241122v1/mamba/mamba_ssm/ops/triton/ssd_combined.py", line 931, in mamba_split_conv1d_scan_combined
return MambaSplitConv1dScanCombinedFn.apply(zxbcdt, conv1d_weight, conv1d_bias, dt_bias, A, D, chunk_size, initial_states, seq_idx, dt_limit, return_final_states, activation, rmsnorm_weight, rmsnorm_eps, outproj_weight, outproj_bias, headdim, ngroups, norm_before_gate)
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/autograd/function.py", line 539, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
(Triggered internally at ../torch/csrc/autograd/python_anomaly_mode.cpp:114.)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
Traceback (most recent call last):
File "/root/miniconda3/envs/py310/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/root/miniconda3/envs/py310/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/autodl-fs/data/20241122v1/videomamba2/trainvm2.py", line 119, in
loss.backward(retain_graph=True)
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
torch.autograd.backward(
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
return user_fn(self, *args)
File "/root/miniconda3/envs/py310/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 140, in decorate_bwd
return bwd(*args, **kwargs)
File "/autodl-fs/data/20241122v1/mamba/mamba_ssm/ops/triton/ssd_combined.py", line 838, in backward
zxbcdt, conv1d_weight, conv1d_bias, out, A, D, dt_bias, initial_states, seq_idx, rmsnorm_weight, rstd, outproj_weight, outproj_bias = ctx.saved_tensors
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [768, 4]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
The text was updated successfully, but these errors were encountered: