Mojo struct

MatmulTileWriter

@register_passable(trivial) struct MatmulTileWriter[dtype: DType, layout: Layout, address_space: AddressSpace, element_layout: Layout, layout_int_type: DType, linear_idx_type: DType, masked: Bool, alignment: Int, smem_tile_layout: Layout, //, *, BM: Int, BN: Int, swizzle: TensorMapSwizzle, wgmma_shape: IndexList[3], num_consumer: Int = 1, use_tma_store: Bool = False, elementwise_lambda_fn: Optional[elementwise_epilogue_type] = None, elementwise_compute_lambda_fn: Optional[elementwise_compute_lambda_type] = None, swapAB: Bool = False]

Fields

tensor (MatmulTileWriter[BM=BM, BN=BN, swizzle=swizzle, wgmma_shape=wgmma_shape, num_consumer=num_consumer, use_tma_store=use_tma_store, elementwise_lambda_fn=elementwise_lambda_fn, elementwise_compute_lambda_fn=elementwise_compute_lambda_fn, swapAB=swapAB].CTensorType):
smem_tile (LayoutTensor[dtype, smem_tile_layout, MutAnyOrigin, address_space=AddressSpace.SHARED, alignment=128]):
warp_group_thread_idx (UInt):
local_warp_group_idx (UInt):
local_thread_idx (UInt):
block_y (Int):
block_x (Int):

Implemented traits

AnyType, Copyable, ImplicitlyCopyable, ImplicitlyDestructible, Movable, RegisterPassable, TrivialRegisterPassable

`comptime` members

`__copyinit__is_trivial`

comptime __copyinit__is_trivial = True

`__del__is_trivial`

comptime __del__is_trivial = True

`__moveinit__is_trivial`

comptime __moveinit__is_trivial = True

`CTensorType`

comptime CTensorType = LayoutTensor[dtype, layout, MutAnyOrigin, address_space=address_space, element_layout=element_layout, layout_int_type=layout_int_type, linear_idx_type=linear_idx_type, masked=masked, alignment=alignment]

`frag_size`

comptime frag_size = ((wgmma_shape.__getitem__[Int](0) * wgmma_shape.__getitem__[Int](1)) // WARPGROUP_SIZE)

`lambda_type`

comptime lambda_type = fn[dtype: DType, width: Int, *, alignment: Int = 1](IndexList[2], mut SIMD[dtype, width]) capturing -> None

`N`

comptime N = layout.shape[1].value()

`num_consumer_threads`

comptime num_consumer_threads = (num_consumer * WARPGROUP_SIZE)

`num_m_mmas`

comptime num_m_mmas = ((BM // wgmma_shape.__getitem__[Int](0)) // num_consumer)

`num_n_mmas`

comptime num_n_mmas = (BN // wgmma_shape.__getitem__[Int](1))

`simd_size`

comptime simd_size = simd_width_of[dtype]()

`WG_BM`

comptime WG_BM = smem_tile_layout.shape[0].value()

`WG_BN`

comptime WG_BN = smem_tile_layout.shape[1].value()

Methods

`__init__`

__init__(tensor: LayoutTensor[dtype, layout, MutAnyOrigin, address_space=address_space, element_layout=element_layout, layout_int_type=layout_int_type, linear_idx_type=linear_idx_type, masked=masked, alignment=alignment], smem_tile: LayoutTensor[dtype, smem_tile_layout, MutAnyOrigin, address_space=AddressSpace.SHARED, alignment=128], warp_group_thread_idx: Scalar[DType.uint], local_warp_group_idx: Scalar[DType.uint], local_thread_idx: Scalar[DType.uint], block_y: Int, block_x: Int) -> Self

`write_tile`

write_tile[tma_layout: Layout, desc_layout: Layout, accum_type: DType, reg_tile_layout: Layout, //](self, tma_op: TMATensorTile[dtype, tma_layout, desc_layout], reg_tile: LayoutTensor[accum_type, reg_tile_layout, MutAnyOrigin, address_space=AddressSpace.LOCAL])

Write output from registers to global memory.

Selects the optimized `st.matrix` path for bf16 when its constraints are met; otherwise, uses the general register-to-global path.

Fields

Implemented traits

comptime members

__copyinit__is_trivial

__del__is_trivial

__moveinit__is_trivial

CTensorType

frag_size

lambda_type

N

num_consumer_threads

num_m_mmas

num_n_mmas

simd_size

WG_BM

WG_BN

Methods

__init__

write_tile