- Notifications
You must be signed in to change notification settings - Fork376
Open
Description
Bug Description
from typing import Optional, Tuplefrom contextlib import nullcontextimport numpy as npimport torchimport torch.nn as nnimport torch_tensorrtclass CosmosLearnablePositionalEmbed(nn.Module): def __init__( self, hidden_size: int, max_size: Tuple[int, int, int], patch_size: Tuple[int, int, int], eps: float = 1e-6, ) -> None: super().__init__() self.max_size = [size // patch for size, patch in zip(max_size, patch_size)] self.patch_size = patch_size self.eps = eps self.pos_emb_t = nn.Parameter(torch.zeros(self.max_size[0], hidden_size)) self.pos_emb_h = nn.Parameter(torch.zeros(self.max_size[1], hidden_size)) self.pos_emb_w = nn.Parameter(torch.zeros(self.max_size[2], hidden_size)) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: batch_size, num_channels, num_frames, height, width = hidden_states.shape pe_size = [num_frames // self.patch_size[0], height // self.patch_size[1], width // self.patch_size[2]] # Use expand() instead of repeat() - torch_tensorrt compatible # expand() creates a view without copying data, better for dynamic shapes emb_t = self.pos_emb_t[: pe_size[0]][None, :, None, None, :].expand(batch_size, -1, pe_size[1], pe_size[2], -1) emb_h = self.pos_emb_h[: pe_size[1]][None, None, :, None, :].expand(batch_size, pe_size[0], -1, pe_size[2], -1) emb_w = self.pos_emb_w[: pe_size[2]][None, None, None, :, :].expand(batch_size, pe_size[0], pe_size[1], -1, -1) emb = emb_t + emb_h + emb_w emb = emb.flatten(1, 3) norm = torch.linalg.vector_norm(emb, dim=-1, keepdim=True, dtype=torch.float32) norm = torch.add(self.eps, norm, alpha=np.sqrt(norm.numel() / emb.numel())) # # Use torch operations instead of np.sqrt to support dynamic shapes in torch.export # # Compute the scale factor: sqrt(norm.numel() / emb.numel()) # alpha = (norm.numel() / emb.numel()) ** 0.5 # norm = torch.add(self.eps, norm, alpha=alpha) out = (emb / norm).type_as(hidden_states) return outdef export_attention(model, hidden_states): with torch.no_grad(): # Only mark sequence length as dynamic, like run_llm.py does # Don't mark batch dimension as dynamic to avoid constraint violations seq_len = torch.export.Dim("seq_len", min=1, max=16) print("Trying to export the model using torch.export.export()..") # strict=False only enables autograd tracing and excludes dynamo. # Use tuple format like export_llm - only mark sequence length (dim 1) as dynamic ep = torch.export.export( model, args=(hidden_states,), kwargs={}, dynamic_shapes=({2: seq_len},), strict=False, ) return epdef compile_torchtrt(model, hidden_states, min_block_size, debug): ep = export_attention(model, hidden_states) # Set precision specific flags use_fp32_acc = False use_explicit_typing = False enabled_precisions = {torch.bfloat16} use_fp32_acc = False with torch_tensorrt.logging.debug() if debug else nullcontext(): trt_model = torch_tensorrt.dynamo.compile( ep, inputs=[hidden_states], enabled_precisions=enabled_precisions, # truncate_double=True, use_explicit_typing=use_explicit_typing, use_fp32_acc=use_fp32_acc, disable_tf32=True, use_python_runtime=True, debug=debug, offload_module_to_cpu=False, min_block_size=min_block_size, ) return trt_modelif __name__ == "__main__": precision = "BF16" min_block_size = 1 batch_size = 1 seq_len = 28160 num_attention_heads = 32 attention_head_dim = 128 enable_pytorch_run = True debug = False device = "cuda" hidden_size = num_attention_heads * attention_head_dim with torch.inference_mode(): model = CosmosLearnablePositionalEmbed( hidden_size=hidden_size, max_size=(128, 240, 240), patch_size=(1, 2, 2), ).to(device) # Convert model to the appropriate precision model = model.to(torch.bfloat16) input_dtype = torch.bfloat16 # Prepare input for benchmarking or evaluation hidden_states = torch.randn( 1, 17, 16, 88, 160, dtype=input_dtype ).to(device) # Pyt pyt_output = model(hidden_states) print("PyTorch output shape:", pyt_output.shape) print("Pytorch output:", pyt_output.flatten()) # Compile the model with Torch-TensorRT trt_model = compile_torchtrt(model, hidden_states, min_block_size, debug) # trt_model = torch.compile( # model, # backend="torch_tensorrt", # options={ # "enabled_precisions": {input_dtype}, # "use_python_runtime": True, # "min_block_size": min_block_size, # }, # dynamic=None, # ) trt_model = trt_model.to(device) trt_output = trt_model(hidden_states) print("TensorRT output shape:", trt_output.shape) print("TensorRT output:", trt_output.flatten()) # Verify results match diff = (pyt_output - trt_output).abs().max().item() print(f"Max difference between PyTorch and TRT: {diff}") # Check if results are close enough tolerance = 0.01 if diff < tolerance: print(f"✅ Results match! (difference: {diff} < {tolerance})") else: print(f"⚠️ Results differ! (difference: {diff} >= {tolerance})")Here is the error message:
AttributeError: 'SymFloat' object has no attribute 'sqrt'The above exception was the direct cause of the following exception:Traceback (most recent call last): File "/home/scratch.zhaoyuanh_coreai/torch-trt/tools/llm/minimal_reproducer.py", line 215, in <module> trt_model = compile_torchtrt(model, hidden_states, min_block_size, debug) File "/home/scratch.zhaoyuanh_coreai/torch-trt/tools/llm/minimal_reproducer.py", line 146, in compile_torchtrt ep = export_attention(model, hidden_states) File "/home/scratch.zhaoyuanh_coreai/torch-trt/tools/llm/minimal_reproducer.py", line 134, in export_attention ep = torch.export.export( File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/export/__init__.py", line 311, in export raise e File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/export/__init__.py", line 277, in export return _export( File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/export/_trace.py", line 1229, in wrapper raise e File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/export/_trace.py", line 1195, in wrapper ep = fn(*args, **kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/export/exported_program.py", line 124, in wrapper return fn(*args, **kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/export/_trace.py", line 2336, in _export ep = _export_for_training( File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/export/_trace.py", line 1229, in wrapper raise e File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/export/_trace.py", line 1195, in wrapper ep = fn(*args, **kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/export/exported_program.py", line 124, in wrapper return fn(*args, **kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/export/_trace.py", line 2144, in _export_for_training export_artifact = export_func( File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/export/_trace.py", line 2075, in _non_strict_export aten_export_artifact = _to_aten_func( File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/export/_trace.py", line 1863, in _export_to_aten_ir_make_fx gm, graph_signature = transform(_make_fx_helper)( File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/export/_trace.py", line 1993, in _aot_export_non_strict gm, sig = aot_export(stack, wrapped_mod, args, kwargs=kwargs, **flags) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/export/_trace.py", line 1775, in _make_fx_helper gm = make_fx( File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/fx/experimental/proxy_tensor.py", line 2581, in wrapped return make_fx_tracer.trace(f, *args) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/fx/experimental/proxy_tensor.py", line 2508, in trace return self._trace_inner(f, *args) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/fx/experimental/proxy_tensor.py", line 2470, in _trace_inner t = dispatch_trace( File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/_compile.py", line 54, in inner return disable_fn(*args, **kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1129, in _fn return fn(*args, **kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/fx/experimental/proxy_tensor.py", line 1443, in dispatch_trace graph = tracer.trace(root, concrete_args) # type: ignore[arg-type] File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/fx/experimental/proxy_tensor.py", line 2062, in trace res = super().trace(root, concrete_args) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1129, in _fn return fn(*args, **kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/fx/_symbolic_trace.py", line 869, in trace (self.create_arg(fn(*args)),), File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/fx/experimental/proxy_tensor.py", line 1501, in wrapped out = f(*tensors) # type:ignore[call-arg] File "<string>", line 1, in <lambda> File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/export/_trace.py", line 1662, in wrapped_fn return tuple(flat_fn(*args)) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 192, in flat_fn tree_out = fn(*args, **kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/graph_capture_wrappers.py", line 1369, in functional_call out = mod(*args[params_len:], **kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/fx/_symbolic_trace.py", line 844, in module_call_wrapper return self.call_module(mod, forward, args, kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/fx/experimental/proxy_tensor.py", line 2149, in call_module return Tracer.call_module(self, m, forward, args, kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/fx/_symbolic_trace.py", line 560, in call_module ret_val = forward(*args, **kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/fx/_symbolic_trace.py", line 837, in forward return _orig_module_call(mod, *args, **kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1783, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1794, in _call_impl return forward_call(*args, **kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/export/_trace.py", line 1977, in forward tree_out = mod(*args, **kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/fx/_symbolic_trace.py", line 844, in module_call_wrapper return self.call_module(mod, forward, args, kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/fx/experimental/proxy_tensor.py", line 2149, in call_module return Tracer.call_module(self, m, forward, args, kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/fx/_symbolic_trace.py", line 560, in call_module ret_val = forward(*args, **kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/fx/_symbolic_trace.py", line 837, in forward return _orig_module_call(mod, *args, **kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1783, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/home/zhaoyuanh/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1794, in _call_impl return forward_call(*args, **kwargs) File "/home/scratch.zhaoyuanh_coreai/torch-trt/tools/llm/minimal_reproducer.py", line 117, in forward norm = torch.add(self.eps, norm, alpha=np.sqrt(norm.numel() / emb.numel()))TypeError: loop of ufunc does not support argument 0 of type SymFloat which has no callable sqrt methodTo Reproduce
Steps to reproduce the behavior:
- Run Python script above
Expected behavior
Passed and the Torch-TRT output matches the Torch output.
Environment
Build information about Torch-TensorRT can be found by turning on debug messages
- Torch-TensorRT Version (e.g. 1.0.0): 2.10.0.dev0
- PyTorch Version (e.g. 1.0): 2.9.0
- CPU Architecture: x86_64
- OS (e.g., Linux): Linux
- How you installed PyTorch (
conda,pip,libtorch, source): pip - Build command you used (if compiling from source): PYTHON_ONLY=1 pip install -e .
- Are you using local sources or building from archives: No
- Python version: 3.10
- CUDA version: 12.9
- GPU models and configuration: Nvidia B200
- Any other relevant information: None
Additional context
The WAR is to use a torch tensor instead of numpy