For the complete documentation index, see llms.txt. Markdown versions of all pages are available by appending .md to any URL (e.g. /max/get-started.md).

Mojo struct

AMDPingPongMatmul

struct AMDPingPongMatmul[a_type: DType, b_type: DType, c_type: DType, config: KernelConfig, /, enable_swizzle: Bool, elementwise_lambda_fn: Optional[def[dtype: DType, width: SIMDSize, *, alignment: Int = Int(1)](IndexList[Int(2)], SIMD[dtype, width]) capturing -> None] = None]

Structured ping-pong matmul for AMD MI355X.

8-warp double-buffered kernel with register-based DRAM→SMEM path.

Parameters

a_type (DType): Input A element type.
b_type (DType): Input B element type.
c_type (DType): Output C element type.
config (KernelConfig): KernelConfig with block/warp/mma shapes.
enable_swizzle (Bool): Enable LDS bank conflict avoidance.
elementwise_lambda_fn (Optional[def[dtype: DType, width: SIMDSize, *, alignment: Int = Int(1)](IndexList[Int(2)], SIMD[dtype, width]) capturing -> None]): Optional epilogue.

Implemented traits

AnyType, ImplicitlyDeletable

`comptime` members

`accum_dtype`

comptime accum_dtype = get_accum_type[c_type]()

`accum_width`

comptime accum_width = (Int((mul config.mma_shape[Int(0)], config.mma_shape[Int(1)])) // _resolve_warp_size())

`BK`

comptime BK = config.block_shape[Int(2)]

`BM`

comptime BM = config.block_shape[Int(0)]

`BN`

comptime BN = config.block_shape[Int(1)]

`byte_swizzle`

comptime byte_swizzle = Optional(Swizzle(Int((add log2_floor((config.mma_shape[Int(2)] // Int(32))), 1)), log2_floor(Int((mul size_of[a_type](), Int(16) if (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 73) if (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 73) else (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 74) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 75) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 76) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 77) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 78) and (eq config.mma_shape[Int(0)], 16) and (eq config.mma_shape[Int(2)], 128) else (Int((mul config.mma_shape[Int(0)], config.mma_shape[Int(2)])) // _resolve_warp_size())))) if (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 73) if (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 73) else (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 74) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 75) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 76) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 77) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 78) 
else Int((add log2_floor((Int((mul simd_width_of[a_type](), 4)) // Int(2))), log2_floor(size_of[a_type]()))), Int(4))) if enable_swizzle else Optional()

`c_frag_size`

comptime c_frag_size = (Int((mul config.mma_shape[Int(0)], config.mma_shape[Int(1)])) // _resolve_warp_size())

`half_BM`

comptime half_BM = AMDPingPongMatmul[a_type, b_type, c_type, config, enable_swizzle, elementwise_lambda_fn].WM

`half_BN`

comptime half_BN = (config.block_shape[Int(1)] // Int(2))

`in_type`

comptime in_type = a_type

`LGKM_PER_LOAD_A`

comptime LGKM_PER_LOAD_A = (Int((mul (config.block_shape[Int(2)] // config.mma_shape[Int(2)]), ((config.warp_shape[Int(0)] // config.mma_shape[Int(0)]) // Int(2)), ((Int((mul config.mma_shape[Int(0)], config.mma_shape[Int(2)])) // _resolve_warp_size()) // Int(16) if (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 73) if (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 73) else (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 74) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 75) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 76) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 77) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 78) and (eq config.mma_shape[Int(0)], 16) and (eq config.mma_shape[Int(2)], 128) else (Int((mul config.mma_shape[Int(0)], config.mma_shape[Int(2)])) // _resolve_warp_size())))) * ceildiv(Int((mul size_of[a_type](), Int(16) if (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 73) if (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 73) else (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 74) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 75) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 76) or (eq 
#pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 77) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 78) and (eq config.mma_shape[Int(0)], 16) and (eq config.mma_shape[Int(2)], 128) else (Int((mul config.mma_shape[Int(0)], config.mma_shape[Int(2)])) // _resolve_warp_size()))), Int(16)))

`LGKM_PER_LOAD_B`

comptime LGKM_PER_LOAD_B = (Int((mul (config.block_shape[Int(2)] // config.mma_shape[Int(2)]), ((config.warp_shape[Int(1)] // config.mma_shape[Int(1)]) // Int(2)), ((Int((mul config.mma_shape[Int(0)], config.mma_shape[Int(2)])) // _resolve_warp_size()) // Int(16) if (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 73) if (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 73) else (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 74) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 75) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 76) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 77) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 78) and (eq config.mma_shape[Int(0)], 16) and (eq config.mma_shape[Int(2)], 128) else (Int((mul config.mma_shape[Int(0)], config.mma_shape[Int(2)])) // _resolve_warp_size())))) * ceildiv(Int((mul size_of[a_type](), Int(16) if (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 73) if (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 73) else (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 74) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 75) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 76) or (eq 
#pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 77) or (eq #pop.dtype_to_ui8<#lit.struct.extract<:!lit.struct<@std::@builtin::@dtype::@DType> a_type, "_mlir_value">>, 78) and (eq config.mma_shape[Int(0)], 16) and (eq config.mma_shape[Int(2)], 128) else (Int((mul config.mma_shape[Int(0)], config.mma_shape[Int(2)])) // _resolve_warp_size()))), Int(16)))

`loads_per_row`

comptime loads_per_row = (config.block_shape[Int(2)] // simd_width_of[a_type]())

`MMA_K`

comptime MMA_K = config.mma_shape[Int(2)]

`MMA_M`

comptime MMA_M = config.mma_shape[Int(0)]

`MMA_N`

comptime MMA_N = config.mma_shape[Int(1)]

`mma_swizzle`

comptime mma_swizzle = Optional(AMDPingPongMatmul.make_mma_swizzle()) if enable_swizzle else Optional()

`mma_tile_m`

comptime mma_tile_m = (config.warp_shape[Int(0)] // Int(2))

`mma_tile_n`

comptime mma_tile_n = (config.warp_shape[Int(1)] // Int(2))

`num_k_mmas`

comptime num_k_mmas = (config.block_shape[Int(2)] // config.mma_shape[Int(2)])

`num_m_mmas`

comptime num_m_mmas = (config.warp_shape[Int(0)] // config.mma_shape[Int(0)])

`num_n_mmas`

comptime num_n_mmas = (config.warp_shape[Int(1)] // config.mma_shape[Int(1)])

`num_warps_m`

comptime num_warps_m = (config.block_shape[Int(0)] // config.warp_shape[Int(0)])

`num_warps_n`

comptime num_warps_n = (config.block_shape[Int(1)] // config.warp_shape[Int(1)])

`quadrant_m_mmas`

comptime quadrant_m_mmas = ((config.warp_shape[Int(0)] // config.mma_shape[Int(0)]) // Int(2))

`quadrant_n_mmas`

comptime quadrant_n_mmas = ((config.warp_shape[Int(1)] // config.mma_shape[Int(1)]) // Int(2))

`rows_per_iter_8warp`

comptime rows_per_iter_8warp = (Int((mul _resolve_warp_size(), 8)) // (config.block_shape[Int(2)] // simd_width_of[a_type]()))

`simd_width`

comptime simd_width = simd_width_of[AMDPingPongMatmul[a_type, b_type, c_type, config, enable_swizzle, elementwise_lambda_fn].in_type]()

`total_warps`

comptime total_warps = ((config.block_shape[Int(0)] // config.warp_shape[Int(0)]) * (config.block_shape[Int(1)] // config.warp_shape[Int(1)]))

`VMCNT_PER_LOAD_A`

comptime VMCNT_PER_LOAD_A = (config.warp_shape[Int(0)] // (Int((mul _resolve_warp_size(), 8)) // (config.block_shape[Int(2)] // simd_width_of[a_type]())))

`VMCNT_PER_LOAD_B`

comptime VMCNT_PER_LOAD_B = ((config.block_shape[Int(1)] // Int(2)) // (Int((mul _resolve_warp_size(), 8)) // (config.block_shape[Int(2)] // simd_width_of[a_type]())))

`WM`

comptime WM = config.warp_shape[Int(0)]

`WN`

comptime WN = config.warp_shape[Int(1)]

Methods

`make_mma_swizzle`

static def make_mma_swizzle() -> Swizzle

Consumer swizzle for MMA LDS reads (element-space).

The AMD MI355X has 64 LDS banks of 4 bytes each. Without swizzling, the MMA thread access pattern causes 4-way bank conflicts. The swizzle XORs high-order address bits into the bank selection bits to distribute accesses across banks.

Swizzle parameters:

log_tile: Number of bits to XOR, scales with MMA_K.
base: Log2 of read granularity in bytes (lds_frag_width * elem_size).
shift: Fixed at 4 for AMD LDS bank geometry.

Configuration examples:

BF16 16x16x32: lds_frag=8, bytes=16 -> Swizzle(1, 4, 4)
FP8 16x16x128: lds_frag=16, bytes=16 -> Swizzle(3, 4, 4)
FP8 32x32x64: lds_frag=32, bytes=32 -> Swizzle(2, 5, 4)

Returns:

Swizzle: Swizzle pattern for bank-conflict-free LDS access.

`validate_config`

static def validate_config()

`run`

static def run[a_layout: TensorLayout, b_layout: TensorLayout, c_layout: TensorLayout](a: TileTensor[a_type, a_layout, ImmutAnyOrigin], b: TileTensor[b_type, b_layout, ImmutAnyOrigin], c: TileTensor[c_type, c_layout, MutAnyOrigin])

Structured ping-pong GEMM kernel entry point.

Parameters​

Implemented traits​

comptime members​

accum_dtype​

accum_width​

BK​

BM​

BN​

byte_swizzle​

c_frag_size​

half_BM​

half_BN​

in_type​

LGKM_PER_LOAD_A​

LGKM_PER_LOAD_B​

loads_per_row​

MMA_K​

MMA_M​

MMA_N​

mma_swizzle​

mma_tile_m​

mma_tile_n​

num_k_mmas​

num_m_mmas​

num_n_mmas​

num_warps_m​

num_warps_n​

quadrant_m_mmas​

quadrant_n_mmas​

rows_per_iter_8warp​

simd_width​

total_warps​

VMCNT_PER_LOAD_A​

VMCNT_PER_LOAD_B​

WM​

WN​

Methods​

make_mma_swizzle​

validate_config​

run​