For the complete documentation index, see llms.txt. Markdown versions of all pages are available by appending .md to any URL (e.g. /max/get-started.md).

Mojo struct

MLAPrefillSparse

struct MLAPrefillSparse[KVLUTType: MHAOperand, output_dtype: DType, config: MLASparseConfig[config.qkv_dtype, config.b_topk_, config.num_mbars_, config.q_smem_depth_, config.q_tmem_depth_, config.cta_group_]]

Implemented traits

AnyType, Copyable, ImplicitlyCopyable, ImplicitlyDeletable, Movable, RegisterPassable, TrivialRegisterPassable

`comptime` members

`accum_dtype`

comptime accum_dtype = DType.float32

`B_TOPK_PER_CTA`

comptime B_TOPK_PER_CTA = (config.B_TOPK // config.cta_group)

`Common`

comptime Common = MLAPrefillSparseCommon[KVLUTType, output_dtype, config]

`CTA_MASK`

comptime CTA_MASK = UInt16(3) if (config.cta_group == Int(2)) else UInt16(1)

`FP8_K_SWIZZLE`

comptime FP8_K_SWIZZLE = TensorMapSwizzle.SWIZZLE_NONE

`FP8_V_SWIZZLE`

comptime FP8_V_SWIZZLE = TensorMapSwizzle.SWIZZLE_NONE

`FULL_Q_TYPE`

comptime FULL_Q_TYPE = MLAPrefillSparse[KVLUTType, output_dtype, config].SMemType.FULL_Q_TYPE

`k_desc_shape`

comptime k_desc_shape = Index[Int, Int](Int(1), _gather4_box_width[qkv_dtype, config.qk_depth, TensorMapSwizzle(Int32(3))]())

`k_gather_box`

comptime k_gather_box = _gather4_box_width[MLAPrefillSparse[KVLUTType, output_dtype, config].qkv_dtype, config.qk_depth, MLAPrefillSparse[KVLUTType, output_dtype, config].k_swizzle_mode]()

`k_swizzle_mode`

comptime k_swizzle_mode = config.k_swizzle_mode

`k_tile_height`

comptime k_tile_height = MLAPrefillSparse[KVLUTType, output_dtype, config].B_TOPK_PER_CTA

`k_tile_shape`

comptime k_tile_shape = Index[Int, Int](MLAPrefillSparse[KVLUTType, output_dtype, config].k_tile_height, _gather4_box_width[qkv_dtype, config.qk_depth, TensorMapSwizzle(Int32(3))]())

`k_tile_width`

comptime k_tile_width = config.qk_depth

`k_tma_desc_shape_fp8`

comptime k_tma_desc_shape_fp8 = Index[Int, Int](Int(1), _gather4_box_width[DType.int64, (config // Int(8)), TensorMapSwizzle(Int32(0))]())

`k_tma_dtype_fp8`

comptime k_tma_dtype_fp8 = DType.int64

`k_tma_gather_box_fp8`

comptime k_tma_gather_box_fp8 = _gather4_box_width[DType.int64, (config // Int(8)), MLAPrefillSparse[KVLUTType, output_dtype, config].k_tma_swizzle_fp8]()

`k_tma_swizzle_fp8`

comptime k_tma_swizzle_fp8 = MLAPrefillSparse[KVLUTType, output_dtype, config].FP8_K_SWIZZLE

`k_tma_tile_shape_fp8`

comptime k_tma_tile_shape_fp8 = Index[Int, Int](MLAPrefillSparse[KVLUTType, output_dtype, config].k_tile_height, _gather4_box_width[DType.int64, (config // Int(8)), TensorMapSwizzle(Int32(0))]())

`k_tma_tile_width_fp8`

comptime k_tma_tile_width_fp8 = (config // Int(8))

`NUM_Q_HEADS_PER_CTA`

comptime NUM_Q_HEADS_PER_CTA = (config // config.cta_group)

`NUM_SV_ATOMS`

comptime NUM_SV_ATOMS = 2

`O_ATOM_PHYS_COLS`

comptime O_ATOM_PHYS_COLS = ((config // Int(2)) // Int(2))

`o_desc_shape`

comptime o_desc_shape = Index[Int, Int]((config // cta_group_), Int(64))

`o_tile_shape`

comptime o_tile_shape = Index[Int, Int]((config // cta_group_), config)

`O_TMEM_ADDR`

comptime O_TMEM_ADDR = 0

`O_TMEM_ADDR_ATOM2`

comptime O_TMEM_ADDR_ATOM2 = (Int(0) + ((config // Int(2)) // Int(2)))

`O_TYPE`

comptime O_TYPE = MLAPrefillSparse[KVLUTType, output_dtype, config].SMemType.O_TYPE

`P_TMEM_ADDR`

comptime P_TMEM_ADDR = 256

`PADDED_HEADS_PER_CTA`

comptime PADDED_HEADS_PER_CTA = (config // config.cta_group)

`q_desc_shape`

comptime q_desc_shape = _default_desc_shape[Int(3), MLAPrefillSparse[KVLUTType, output_dtype, config].qkv_dtype, MLAPrefillSparse[KVLUTType, output_dtype, config].q_tile_shape, config.q_swizzle_mode]()

`q_smem_depth`

comptime q_smem_depth = config.q_smem_depth

`q_tile_shape`

comptime q_tile_shape = Index[Int, Int, Int](Int(1), (config // cta_group_), config)

`Q_TMEM_ADDR`

comptime Q_TMEM_ADDR = (Int(512) - (MLAPrefillSparse[KVLUTType, output_dtype, config].q_tmem_depth // Int(2)))

`q_tmem_depth`

comptime q_tmem_depth = config.q_tmem_depth

`QKMMAOpType`

comptime QKMMAOpType = QKMMAOp[MLAPrefillSparse[KVLUTType, output_dtype, config].qkv_dtype, DType.float32, config]

`qkv_dtype`

comptime qkv_dtype

`qkv_dtype_size`

comptime qkv_dtype_size = size_of[MLAPrefillSparse[KVLUTType, output_dtype, config].qkv_dtype]()

`SHARED_QKV_TYPE`

comptime SHARED_QKV_TYPE = MLAPrefillSparse[KVLUTType, output_dtype, config].SMemType.SHARED_QKV_TYPE

`SMemType`

comptime SMemType = MLASparseSharedMemory[config]

`SV_ATOM_MMA_N`

comptime SV_ATOM_MMA_N = (config // Int(2))

`SVMMAType`

comptime SVMMAType = SVMMAType[MLAPrefillSparse[KVLUTType, output_dtype, config].qkv_dtype, DType.float32, config]

`V_BMN_PER_ATOM`

comptime V_BMN_PER_ATOM = ((config // Int(2)) // config.cta_group)

`V_DEPTH_PER_CTA`

comptime V_DEPTH_PER_CTA = MLAPrefillSparse[KVLUTType, output_dtype, config].SV_ATOM_MMA_N

`v_desc_shape`

comptime v_desc_shape = Index[Int, Int](Int(1), _gather4_box_width[qkv_dtype, (((config // Int(2)) // cta_group_) * Int(2)), TensorMapSwizzle(Int32(3))]())

`v_gather_box`

comptime v_gather_box = _gather4_box_width[MLAPrefillSparse[KVLUTType, output_dtype, config].qkv_dtype, (((config // Int(2)) // cta_group_) * Int(2)), MLAPrefillSparse[KVLUTType, output_dtype, config].v_swizzle_mode]()

`V_SMEM_COLS_PER_CTA`

comptime V_SMEM_COLS_PER_CTA = (((config // Int(2)) // cta_group_) * Int(2))

`v_swizzle_mode`

comptime v_swizzle_mode = TensorMapSwizzle.SWIZZLE_128B

`v_tile_height`

comptime v_tile_height = (config.B_TOPK // Int(2))

`v_tile_shape`

comptime v_tile_shape = Index[Int, Int](MLAPrefillSparse[KVLUTType, output_dtype, config].v_tile_height, _gather4_box_width[qkv_dtype, (((config // Int(2)) // cta_group_) * Int(2)), TensorMapSwizzle(Int32(3))]())

`v_tile_width`

comptime v_tile_width = MLAPrefillSparse[KVLUTType, output_dtype, config].V_SMEM_COLS_PER_CTA

`v_tma_desc_shape_fp8`

comptime v_tma_desc_shape_fp8 = Index[Int, Int](Int(1), _gather4_box_width[DType.int64, (((config // Int(2)) // cta_group_) // Int(8)), TensorMapSwizzle(Int32(0))]())

`v_tma_dtype_fp8`

comptime v_tma_dtype_fp8 = DType.int64

`v_tma_gather_box_fp8`

comptime v_tma_gather_box_fp8 = _gather4_box_width[DType.int64, (((config // Int(2)) // cta_group_) // Int(8)), MLAPrefillSparse[KVLUTType, output_dtype, config].v_tma_swizzle_fp8]()

`v_tma_swizzle_fp8`

comptime v_tma_swizzle_fp8 = MLAPrefillSparse[KVLUTType, output_dtype, config].FP8_V_SWIZZLE

`v_tma_tile_height_fp8`

comptime v_tma_tile_height_fp8 = config.B_TOPK

`v_tma_tile_shape_fp8`

comptime v_tma_tile_shape_fp8 = Index[Int, Int](MLAPrefillSparse[KVLUTType, output_dtype, config].v_tma_tile_height_fp8, _gather4_box_width[DType.int64, (((config // Int(2)) // cta_group_) // Int(8)), TensorMapSwizzle(Int32(0))]())

`v_tma_tile_width_fp8`

comptime v_tma_tile_width_fp8 = (((config // Int(2)) // cta_group_) // Int(8))

Methods

`kernel`

static def kernel[TopKLengthLayout: TensorLayout, IndicesLayout: TensorLayout](q_tma_op: TMATensorTile[Self.qkv_dtype, Int(3), Self.q_tile_shape, Self.q_desc_shape], k_tma_op: TMATensorTile[Self.qkv_dtype, Int(2), Self.k_tile_shape, Self.k_desc_shape], v_tma_op: TMATensorTile[Self.qkv_dtype, Int(2), Self.v_tile_shape, Self.v_desc_shape], o_tma_op: TMATensorTile[output_dtype, Int(2), Self.o_tile_shape, Self.o_desc_shape], topk_lengths: TileTensor[DType.uint32, TopKLengthLayout, MutAnyOrigin], indices: TileTensor[DType.uint32, IndicesLayout, MutAnyOrigin], kv_lut: KVLUTType, scale: Float32, attn_sink_ptr: Optional[Pointer[Float32, ImmutAnyOrigin, _safe=False]], indices_stride: Int32, output_gmem_ptr: Pointer[Scalar[output_dtype], MutAnyOrigin, _safe=False]) where (Int(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(IndicesLayout.__shape_types), [idx: __mlir_type.index] IndicesLayout.__shape_types[idx]._ParamListType))), [idx: __mlir_type.index] #kgen.param_list.concat(#kgen.param_list.tabulate(len(IndicesLayout.__shape_types), [idx: __mlir_type.index] IndicesLayout.__shape_types[idx]._ParamListType))[idx]._ParamListType))), [idx: __mlir_type.index] #kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(IndicesLayout.__shape_types), [idx: __mlir_type.index] IndicesLayout.__shape_types[idx]._ParamListType))), [idx: __mlir_type.index] #kgen.param_list.concat(#kgen.param_list.tabulate(len(IndicesLayout.__shape_types), [idx: __mlir_type.index] IndicesLayout.__shape_types[idx]._ParamListType))[idx]._ParamListType))[idx]._ParamListType))), [idx: __mlir_type.index] #kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(IndicesLayout.__shape_types), 
[idx: __mlir_type.index] IndicesLayout.__shape_types[idx]._ParamListType))), [idx: __mlir_type.index] #kgen.param_list.concat(#kgen.param_list.tabulate(len(IndicesLayout.__shape_types), [idx: __mlir_type.index] IndicesLayout.__shape_types[idx]._ParamListType))[idx]._ParamListType))), [idx: __mlir_type.index] #kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(IndicesLayout.__shape_types), [idx: __mlir_type.index] IndicesLayout.__shape_types[idx]._ParamListType))), [idx: __mlir_type.index] #kgen.param_list.concat(#kgen.param_list.tabulate(len(IndicesLayout.__shape_types), [idx: __mlir_type.index] IndicesLayout.__shape_types[idx]._ParamListType))[idx]._ParamListType))[idx]._ParamListType))[idx]._ParamListType)))) == Int(1)) if (Int(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(TopKLengthLayout.__shape_types), [idx: __mlir_type.index] TopKLengthLayout.__shape_types[idx]._ParamListType))), [idx: __mlir_type.index] #kgen.param_list.concat(#kgen.param_list.tabulate(len(TopKLengthLayout.__shape_types), [idx: __mlir_type.index] TopKLengthLayout.__shape_types[idx]._ParamListType))[idx]._ParamListType))), [idx: __mlir_type.index] #kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(TopKLengthLayout.__shape_types), [idx: __mlir_type.index] TopKLengthLayout.__shape_types[idx]._ParamListType))), [idx: __mlir_type.index] #kgen.param_list.concat(#kgen.param_list.tabulate(len(TopKLengthLayout.__shape_types), [idx: __mlir_type.index] TopKLengthLayout.__shape_types[idx]._ParamListType))[idx]._ParamListType))[idx]._ParamListType))), [idx: __mlir_type.index] 
#kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(TopKLengthLayout.__shape_types), [idx: __mlir_type.index] TopKLengthLayout.__shape_types[idx]._ParamListType))), [idx: __mlir_type.index] #kgen.param_list.concat(#kgen.param_list.tabulate(len(TopKLengthLayout.__shape_types), [idx: __mlir_type.index] TopKLengthLayout.__shape_types[idx]._ParamListType))[idx]._ParamListType))), [idx: __mlir_type.index] #kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(TopKLengthLayout.__shape_types), [idx: __mlir_type.index] TopKLengthLayout.__shape_types[idx]._ParamListType))), [idx: __mlir_type.index] #kgen.param_list.concat(#kgen.param_list.tabulate(len(TopKLengthLayout.__shape_types), [idx: __mlir_type.index] TopKLengthLayout.__shape_types[idx]._ParamListType))[idx]._ParamListType))[idx]._ParamListType))[idx]._ParamListType)))) == Int(1)) else (Int(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(TopKLengthLayout.__shape_types), [idx: __mlir_type.index] TopKLengthLayout.__shape_types[idx]._ParamListType))), [idx: __mlir_type.index] #kgen.param_list.concat(#kgen.param_list.tabulate(len(TopKLengthLayout.__shape_types), [idx: __mlir_type.index] TopKLengthLayout.__shape_types[idx]._ParamListType))[idx]._ParamListType))), [idx: __mlir_type.index] #kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(TopKLengthLayout.__shape_types), [idx: __mlir_type.index] TopKLengthLayout.__shape_types[idx]._ParamListType))), [idx: __mlir_type.index] #kgen.param_list.concat(#kgen.param_list.tabulate(len(TopKLengthLayout.__shape_types), [idx: __mlir_type.index] 
TopKLengthLayout.__shape_types[idx]._ParamListType))[idx]._ParamListType))[idx]._ParamListType))), [idx: __mlir_type.index] #kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(TopKLengthLayout.__shape_types), [idx: __mlir_type.index] TopKLengthLayout.__shape_types[idx]._ParamListType))), [idx: __mlir_type.index] #kgen.param_list.concat(#kgen.param_list.tabulate(len(TopKLengthLayout.__shape_types), [idx: __mlir_type.index] TopKLengthLayout.__shape_types[idx]._ParamListType))[idx]._ParamListType))), [idx: __mlir_type.index] #kgen.param_list.concat(#kgen.param_list.tabulate(len(#kgen.param_list.concat(#kgen.param_list.tabulate(len(TopKLengthLayout.__shape_types), [idx: __mlir_type.index] TopKLengthLayout.__shape_types[idx]._ParamListType))), [idx: __mlir_type.index] #kgen.param_list.concat(#kgen.param_list.tabulate(len(TopKLengthLayout.__shape_types), [idx: __mlir_type.index] TopKLengthLayout.__shape_types[idx]._ParamListType))[idx]._ParamListType))[idx]._ParamListType))[idx]._ParamListType)))) == Int(1))

`k_tma_gather4_load`

static def k_tma_gather4_load[col_range: Tuple[UInt32, UInt32], num_rows: Int](tma_op: TMATensorTile[Self.qkv_dtype, Int(2)], smem_barrier: Pointer[SharedMemBarrier, address_space=AddressSpace.SHARED, _safe=False], smem_tensor: TileTensor[Self.qkv_dtype, Storage=smem_tensor.Storage, address_space=AddressSpace.SHARED, linear_idx_type=smem_tensor.linear_idx_type], local_indices: Array[SIMD[DType.int32, SIMDLength(4)], num_rows], warp_idx: UInt32)

`v_tma_gather4_load`

static def v_tma_gather4_load[local_row_range: Tuple[Int, Int]](tma_op: TMATensorTile[Self.qkv_dtype, Int(2)], smem_barrier: Pointer[SharedMemBarrier, address_space=AddressSpace.SHARED, _safe=False], smem_tensor: TileTensor[Self.qkv_dtype, Storage=smem_tensor.Storage, address_space=AddressSpace.SHARED, linear_idx_type=smem_tensor.linear_idx_type], indices: TileTensor[DType.uint32, Storage=indices.Storage, linear_idx_type=indices.linear_idx_type], kv_lut: KVLUTType, warp_idx: UInt32, k: UInt32, cta_id: UInt32, indices_base: UInt32)

`load_k`

static def load_k(k_tma_op: TMATensorTile[Self.qkv_dtype, Int(2), Self.k_tile_shape, Self.k_desc_shape], indices: TileTensor[DType.uint32, Storage=indices.Storage, linear_idx_type=indices.linear_idx_type], kv_lut: KVLUTType, k_smem_ptr: Pointer[Scalar[Self.qkv_dtype], address_space=AddressSpace.SHARED, _safe=False], qk_ss_done: Pointer[SharedMemBarrier, address_space=AddressSpace.SHARED, _safe=False], qk_ts_done: Pointer[SharedMemBarrier, address_space=AddressSpace.SHARED, _safe=False], k_p0_ready: Pointer[SharedMemBarrier, address_space=AddressSpace.SHARED, _safe=False], k_p1_ready: Pointer[SharedMemBarrier, address_space=AddressSpace.SHARED, _safe=False], k: UInt32, cta_id: UInt32, warp_idx: UInt32, num_kv_rows: Int32, indices_base: UInt32)

`load_v`

static def load_v(v_tma_op: TMATensorTile[Self.qkv_dtype, Int(2), Self.v_tile_shape, Self.v_desc_shape], v_smem_ptr: Pointer[Scalar[Self.qkv_dtype], address_space=AddressSpace.SHARED, _safe=False], sv_p0_done: Pointer[SharedMemBarrier, address_space=AddressSpace.SHARED, _safe=False], sv_p1_done: Pointer[SharedMemBarrier, address_space=AddressSpace.SHARED, _safe=False], v_p0_ready: Pointer[SharedMemBarrier, address_space=AddressSpace.SHARED, _safe=False], v_p1_ready: Pointer[SharedMemBarrier, address_space=AddressSpace.SHARED, _safe=False], indices: TileTensor[DType.uint32, Storage=indices.Storage, linear_idx_type=indices.linear_idx_type], kv_lut: KVLUTType, k: UInt32, warp_idx: UInt32, cta_id: UInt32, indices_base: UInt32)

Implemented traits
comptime members
Methods

Implemented traits

comptime members

accum_dtype

B_TOPK_PER_CTA

Common

CTA_MASK

FP8_K_SWIZZLE

FP8_V_SWIZZLE

FULL_Q_TYPE

k_desc_shape

k_gather_box

k_swizzle_mode

k_tile_height

k_tile_shape

k_tile_width

k_tma_desc_shape_fp8

k_tma_dtype_fp8

k_tma_gather_box_fp8

k_tma_swizzle_fp8

k_tma_tile_shape_fp8

k_tma_tile_width_fp8

NUM_Q_HEADS_PER_CTA

NUM_SV_ATOMS

O_ATOM_PHYS_COLS

o_desc_shape

o_tile_shape

O_TMEM_ADDR

O_TMEM_ADDR_ATOM2

O_TYPE

P_TMEM_ADDR

PADDED_HEADS_PER_CTA

q_desc_shape

q_smem_depth

q_tile_shape

Q_TMEM_ADDR

q_tmem_depth

QKMMAOpType

qkv_dtype

qkv_dtype_size

SHARED_QKV_TYPE

SMemType

SV_ATOM_MMA_N

SVMMAType

V_BMN_PER_ATOM

V_DEPTH_PER_CTA

v_desc_shape

v_gather_box

V_SMEM_COLS_PER_CTA

v_swizzle_mode

v_tile_height

v_tile_shape

v_tile_width

v_tma_desc_shape_fp8

v_tma_dtype_fp8

v_tma_gather_box_fp8

v_tma_swizzle_fp8

v_tma_tile_height_fp8

v_tma_tile_shape_fp8

v_tma_tile_width_fp8

Methods

kernel

k_tma_gather4_load

v_tma_gather4_load

load_k

load_v