Examples
Memory Fences
Use @fence to ensure memory operations are visible across threads before proceeding:
using KernelIntrinsics
using KernelAbstractions, CUDA
@kernel function fence_kernel(X, Flag)
    X[1] = 10
    @fence # Ensure X[1] = 10 is visible to all threads before the following operations
    Flag[1] = 1
end
X = cu([1])
Flag = cu([0])
fence_kernel(CUDABackend())(X, Flag; ndrange=1)

The @fence macro generates fence.acq_rel.gpu instructions in PTX assembly, ensuring proper memory ordering across the GPU.
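After the launch, a quick host-side readback confirms both writes (a minimal sketch; KernelAbstractions.synchronize waits for the asynchronous kernel to complete before copying back):

KernelAbstractions.synchronize(CUDABackend())
Array(X)    # [10]
Array(Flag) # [1]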
Ordered Memory Access
The @access macro provides acquire/release semantics for fine-grained memory ordering:
@kernel function access_kernel(X, Flag)
    if @index(Global, Linear) == 1
        X[1] = 10
        @access Flag[1] = 1 # Release store
    end
    # Other threads wait for Flag[1] == 1
    while (@access Acquire Flag[1]) != 1 # Acquire load
    end
    # Safely use X[1] here
end
X = cu([i for i in 1:1000])
Flag = cu([0])
access_kernel(CUDABackend())(X, Flag; ndrange=1000)

This generates st.release.gpu and ld.acquire.gpu instructions, providing lock-free synchronization patterns.
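To confirm this on your own hardware, the PTX-inspection pattern from the Inspecting Generated Code section below can be reused here (a sketch; the exact instruction forms may vary with GPU architecture and toolkit version):

buf = IOBuffer()
CUDA.@device_code_ptx io = buf access_kernel(CUDABackend())(X, Flag; ndrange=1000)
asm = String(take!(buf))
occursin("st.release.gpu", asm), occursin("ld.acquire.gpu", asm) # expected: (true, true)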
Warp Operations
Shuffle Operations
Exchange values between threads within a warp:
@kernel function shfl_kernel(dst, src)
    I = @index(Global, Linear)
    val = src[I]
    offset = 1
    shuffled_val = @shfl(Up, val, offset) # Default: warpsize=32, full mask
    dst[I] = shuffled_val
end
src = cu(Int32.(1:32))
dst = cu(zeros(Int32, 32))
shfl_kernel(CUDABackend())(dst, src; ndrange=32)
# dst = [1, 1, 2, 3, 4, ..., 31]
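The offset is not limited to 1. The sketch below changes only the offset value; the expected output assumes the usual shuffle-up convention, confirmed by the offset-1 example above, in which the first offset lanes keep their own values:

@kernel function shfl_up4_kernel(dst, src)
    I = @index(Global, Linear)
    val = src[I]
    offset = 4
    dst[I] = @shfl(Up, val, offset)
end

dst4 = cu(zeros(Int32, 32))
shfl_up4_kernel(CUDABackend())(dst4, src; ndrange=32)
# expected: dst4 = [1, 2, 3, 4, 1, 2, ..., 28]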
Shuffle with Custom Types

Unlike CUDA.jl, KernelIntrinsics.jl supports shuffle operations on arbitrary user-defined bitstype structs, including nested and composite types, as well as NTuples:
struct Sub
    a::Float16
    b::UInt8
end

struct ComplexType
    x::Int32
    y::Sub
    z::Float64
end
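Both types are plain bits types, which is what makes them shuffleable; a quick host-side check with Base.isbitstype verifies this:

isbitstype(Sub)         # true
isbitstype(ComplexType) # true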
@kernel function shfl_custom_kernel(dst, src)
    I = @index(Global, Linear)
    val = src[I]
    offset = 1
    shuffled_val = @shfl(Up, val, offset)
    dst[I] = shuffled_val
end
# Nested structs
src = cu([ComplexType(i, Sub(i, i), i) for i in 1:32])
dst = cu([ComplexType(0, Sub(0, 0), 0) for i in 1:32])
shfl_custom_kernel(CUDABackend())(dst, src; ndrange=32)
# NTuples
src = cu([(Int32(i), Int32(i + 100)) for i in 1:32])
dst = cu([(Int32(0), Int32(0)) for _ in 1:32])
shfl_custom_kernel(CUDABackend())(dst, src; ndrange=32)

Warp Reduce (Inclusive Scan)
Perform an inclusive prefix sum within a warp:
@kernel function warpreduce_kernel(dst, src)
    I = @index(Global, Linear)
    val = src[I]
    @warpreduce(val, +)
    dst[I] = val
end
src = cu(Int32.(1:32))
dst = cu(zeros(Int32, 32))
warpreduce_kernel(CUDABackend())(dst, src; ndrange=32)
# dst = [1, 3, 6, 10, ..., 528] (cumulative sum)
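A host-side reference check (a sketch; the inclusive scan should match Base.cumsum applied to the copied-back input):

KernelAbstractions.synchronize(CUDABackend())
Array(dst) == cumsum(Array(src)) # expected: true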
Warp Fold (Reduction)

Reduce all values in a warp to a single result:
@kernel function warpfold_kernel(dst, src)
    I = @index(Global, Linear)
    val = src[I]
    @warpfold(val, +)
    dst[I] = val
end
src = cu(Int32.(1:32))
dst = cu(zeros(Int32, 32))
warpfold_kernel(CUDABackend())(dst, src; ndrange=32)
# dst[1] = 528 (sum of 1:32), rest are undefined

Vectorized Memory Access
Basic Vectorized Access
Use vload and vstore! for aligned vectorized operations:
@kernel function vectorized_kernel(dst, src, i)
    # Load 4 elements with rebase (i=2 → loads from indices 5,6,7,8)
    values = vload(src, i, Val(4), Val(true))
    # Store 4 elements with rebase
    vstore!(dst, i, values, Val(true))
end
src = cu(Int32.(1:32))
dst = cu(zeros(Int32, 32))
vectorized_kernel(CUDABackend())(dst, src, 2; ndrange=1)
# dst[5:8] = [5, 6, 7, 8]

This generates efficient ld.global.v4 and st.global.v4 PTX instructions. The vector width depends on the element type (v4 for Int32/Float32, v2 for Int64/Float64).
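For the two-element case, a sketch with Float64 data (assuming the rebase offset scales with the vector width in the same way as the four-element example above; the kernel name here is illustrative):

@kernel function vectorized64_kernel(dst, src, i)
    # Load/store 2 Float64 elements with rebase (i=2 → indices 3,4 under the assumed rule)
    values = vload(src, i, Val(2), Val(true))
    vstore!(dst, i, values, Val(true))
end

src64 = cu(Float64.(1:16))
dst64 = cu(zeros(Float64, 16))
vectorized64_kernel(CUDABackend())(dst64, src64, 2; ndrange=1)
# expected: dst64[3:4] == [3.0, 4.0], with ld.global.v2 / st.global.v2 in the PTX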
Dynamic Alignment with vload_multi / vstore_multi!
When the starting index is not known at compile time, alignment cannot be guaranteed. vload_multi and vstore_multi! handle this by:
- Computing mod = (i - 1) % N + 1 at runtime, where N is the vector width (illustrated below)
- Using a switch table to dispatch to the appropriate statically compiled function with Val(mod)
- Emitting a mix of vectorized instructions to maximize throughput
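As a host-side illustration of the residue computation alone (the switch table and device-side dispatch are internal to vload_multi / vstore_multi!):

N = 4
[(i, (i - 1) % N + 1) for i in 1:6]
# [(1, 1), (2, 2), (3, 3), (4, 4), (5, 1), (6, 2)]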
@kernel function dynamic_load_kernel(dst, src, i, ::Val{N}) where {N}
    # i can be any runtime value; alignment is handled automatically
    values = vload_multi(src, i, Val(N))
    for j in 1:N
        dst[j] = values[j]
    end
end
src = cu(Int32.(1:100))
dst = cu(zeros(Int32, 16))
# Works for any starting index
dynamic_load_kernel(CUDABackend())(dst, src, 7, Val(16); ndrange=1)
# dst = [7, 8, 9, ..., 22]

The generated PTX will contain a mix of ld.global.v4, ld.global.v2, and scalar loads depending on the runtime alignment, maximizing memory throughput while handling arbitrary offsets.
@kernel function dynamic_store_kernel(dst, i)
    values = (Int32(10), Int32(20), Int32(30), Int32(40))
    vstore_multi!(dst, i, values)
end
dst = cu(zeros(Int32, 100))
dynamic_store_kernel(CUDABackend())(dst, 3; ndrange=1)
# dst[3:6] = [10, 20, 30, 40]

Pattern-Based Access
For custom access patterns, use vload_pattern and vstore_pattern!:
@kernel function pattern_kernel(dst, src, i)
    # Pattern (1, 2, 1) means: load 1, then 2, then 1 element
    values = vload_pattern(src, i, Val((1, 2, 1)))
    vstore_pattern!(dst, i, values, Val((1, 2, 1)))
end
src = cu(Int32.(1:16))
dst = cu(zeros(Int32, 16))
pattern_kernel(CUDABackend())(dst, src, 2; ndrange=1)
# dst[2:5] = src[2:5]

Inspecting Generated Code
You can inspect the generated PTX assembly to confirm that the expected instructions are emitted:
buf = IOBuffer()
CUDA.@device_code_ptx io = buf fence_kernel(CUDABackend())(X, Flag; ndrange=1)
asm = String(take!(buf))
occursin("fence.acq_rel.gpu", asm) # trueExample: verifying vectorized instructions are generated:
@kernel function test_vload_kernel(a, b, i)
    y = vload(a, i, Val(4))
    b[1] = sum(y)
end
a = cu(Int32.(1:16))
b = cu(zeros(Int32, 4))
buf = IOBuffer()
CUDA.@device_code_ptx io = buf test_vload_kernel(CUDABackend())(a, b, 2; ndrange=1)
asm = String(take!(buf))
occursin("ld.global.v4", asm) # true