Address review comments: dtypes, stale cache, stream_ptr, sync notes

leofang · claude · leofang · commit 6b0dffec2059 · 2026-04-24T04:10:38.000Z
- Add uint16/uint32/uint64 to AOTI dtype and itemsize maps (fixes
  regression where these torch dtypes would raise TypeError instead
  of being handled by the bridge)
- Clear buf._dtype when repopulating a reused StridedMemoryView to
  prevent returning a stale cached dtype
- Reject stream_ptr=None for CUDA tensors with BufferError (matches
  DLPack semantics where None is ambiguous)
- Add "keep in sync" comments to aoti_shim.h and aoti_shim.def
  per rwgk's review suggestion

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/cuda_core/cuda/core/_include/aoti_shim.def b/cuda_core/cuda/core/_include/aoti_shim.def
@@ -2,6 +2,11 @@
 ; Used on Windows only: 'lib /DEF:aoti_shim.def /OUT:aoti_shim.lib /MACHINE:X64'
 ; generates a minimal import library that satisfies the MSVC linker.
 ; At runtime the symbols resolve from torch_cpu.dll (loaded by 'import torch').
+;
+; IMPORTANT: Keep this export list in sync with the AOTI_SHIM_API declarations
+; in aoti_shim.h. build_hooks.py turns this file into the stub import library
+; that MSVC uses to link _tensor_bridge, so any added/removed/renamed AOTI
+; symbol must be updated in both files.
 LIBRARY torch_cpu.dll
 EXPORTS
     aoti_torch_get_data_ptr
@@ -14,6 +19,9 @@ EXPORTS
     aoti_torch_dtype_float64
     aoti_torch_dtype_bfloat16
     aoti_torch_dtype_uint8
+    aoti_torch_dtype_uint16
+    aoti_torch_dtype_uint32
+    aoti_torch_dtype_uint64
     aoti_torch_dtype_int8
     aoti_torch_dtype_int16
     aoti_torch_dtype_int32
diff --git a/cuda_core/cuda/core/_include/aoti_shim.h b/cuda_core/cuda/core/_include/aoti_shim.h
@@ -50,6 +50,14 @@ typedef int32_t AOTITorchError;
 struct AtenTensorOpaque;
 typedef struct AtenTensorOpaque* AtenTensorHandle;
 
+/*
+ * IMPORTANT: Keep the AOTI_SHIM_API declaration list below in sync with
+ * aoti_shim.def. On Windows, build_hooks.py turns that .def file into the
+ * stub import library that MSVC needs to link _tensor_bridge without making
+ * PyTorch a build-time dependency. If you add, remove, or rename an imported
+ * AOTI symbol here, update aoti_shim.def in the same change.
+ */
+
 /* ---- tensor metadata --------------------------------------------------- */
 
 AOTI_SHIM_API AOTITorchError aoti_torch_get_data_ptr(
@@ -74,6 +82,9 @@ AOTI_SHIM_API int32_t aoti_torch_dtype_float32(void);
 AOTI_SHIM_API int32_t aoti_torch_dtype_float64(void);
 AOTI_SHIM_API int32_t aoti_torch_dtype_bfloat16(void);
 AOTI_SHIM_API int32_t aoti_torch_dtype_uint8(void);
+AOTI_SHIM_API int32_t aoti_torch_dtype_uint16(void);
+AOTI_SHIM_API int32_t aoti_torch_dtype_uint32(void);
+AOTI_SHIM_API int32_t aoti_torch_dtype_uint64(void);
 AOTI_SHIM_API int32_t aoti_torch_dtype_int8(void);
 AOTI_SHIM_API int32_t aoti_torch_dtype_int16(void);
 AOTI_SHIM_API int32_t aoti_torch_dtype_int32(void);
diff --git a/cuda_core/cuda/core/_tensor_bridge.pyx b/cuda_core/cuda/core/_tensor_bridge.pyx
@@ -85,6 +85,9 @@ cdef extern from "_include/aoti_shim.h":
     int32_t aoti_torch_dtype_float64()
     int32_t aoti_torch_dtype_bfloat16()
     int32_t aoti_torch_dtype_uint8()
+    int32_t aoti_torch_dtype_uint16()
+    int32_t aoti_torch_dtype_uint32()
+    int32_t aoti_torch_dtype_uint64()
     int32_t aoti_torch_dtype_int8()
     int32_t aoti_torch_dtype_int16()
     int32_t aoti_torch_dtype_int32()
@@ -196,6 +199,9 @@ cdef dict _build_dtype_map():
         aoti_torch_dtype_float32(): numpy.dtype(numpy.float32),
         aoti_torch_dtype_float64(): numpy.dtype(numpy.float64),
         aoti_torch_dtype_uint8(): numpy.dtype(numpy.uint8),
+        aoti_torch_dtype_uint16(): numpy.dtype(numpy.uint16),
+        aoti_torch_dtype_uint32(): numpy.dtype(numpy.uint32),
+        aoti_torch_dtype_uint64(): numpy.dtype(numpy.uint64),
         aoti_torch_dtype_int8(): numpy.dtype(numpy.int8),
         aoti_torch_dtype_int16(): numpy.dtype(numpy.int16),
         aoti_torch_dtype_int32(): numpy.dtype(numpy.int32),
@@ -228,6 +234,9 @@ cdef dict _build_itemsize_map():
     return {
         aoti_torch_dtype_bool():       sizeof(uint8_t),
         aoti_torch_dtype_uint8():      sizeof(uint8_t),
+        aoti_torch_dtype_uint16():     sizeof(int16_t),
+        aoti_torch_dtype_uint32():     sizeof(int32_t),
+        aoti_torch_dtype_uint64():     sizeof(int64_t),
         aoti_torch_dtype_int8():       sizeof(int8_t),
         aoti_torch_dtype_float16():    sizeof(int16_t),    # no C float16
         aoti_torch_dtype_bfloat16():   sizeof(int16_t),    # no C bfloat16
@@ -344,6 +353,7 @@ def view_as_torch_tensor(object obj, object stream_ptr, view=None):
         buf = StridedMemoryView.__new__(StridedMemoryView)
 
     buf.ptr = <intptr_t>data_ptr
+    buf._dtype = None  # clear cached dtype (view may be reused)
     # PyTorch always reports tensors as writable via both DLPack
     # (flags=0, no DLPACK_FLAG_BITMASK_READ_ONLY) and CAI
     # (__cuda_array_interface__["data"] = (ptr, False)).  Tensors that
@@ -364,10 +374,16 @@ def view_as_torch_tensor(object obj, object stream_ptr, view=None):
         buf.is_device_accessible = True
 
         # -- stream ordering (matches the DLPack contract) --
-        if stream_ptr is not None:
-            _stream_ptr_int = int(stream_ptr)
-            if _stream_ptr_int != -1:
-                sync_torch_stream(device_index, _stream_ptr_int)
+        # stream_ptr=None is ambiguous for CUDA tensors — the caller must
+        # explicitly choose -1 (no sync) or a valid stream pointer.
+        if stream_ptr is None:
+            raise BufferError(
+                "stream_ptr=None is ambiguous for CUDA tensors; "
+                "pass stream_ptr=-1 to opt out of synchronization, "
+                "or pass a valid stream pointer")
+        _stream_ptr_int = int(stream_ptr)
+        if _stream_ptr_int != -1:
+            sync_torch_stream(device_index, _stream_ptr_int)
     else:
         raise BufferError(
             f"Unsupported device type from torch tensor "