Mojo struct

BlockScaledSmem

struct BlockScaledSmem[a_type: DType, b_type: DType, c_type: DType, sfa_dtype: DType, sfb_dtype: DType, transpose_b: Bool, *, config: BlockScaledMatmulConfig[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b]]

Shared-memory (SMEM) struct containing the A/B tiles, the A/B scaling-factor tiles (SFA/SFB), the C output tiles, and the barriers.

Fields

a_tiles_storage (BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].ATileArray.Storage):
b_tiles_storage (BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].BTileArray.Storage):
c_tiles_storage (BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].CTileArray.Storage):
sfa_tiles_storage (BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].SFATileArray.Storage):
sfb_tiles_storage (BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].SFBTileArray.Storage):
tma_mma_mbars_storage (BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].InputBarriers.Storage):
accum_mbars_storage (BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].AccumBarriers.Storage):
clc_mbars_full_storage (BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].ClcBarriers.Storage):
clc_mbars_empty_storage (BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].ClcBarriers.Storage):
clc_throttle_mbars_storage (BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].ClcThrottleBarriers.Storage):
clc_response_storage (BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].ClcResponse.Storage):
tmem_dealloc_mbar_storage (BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].TmemDealloc.Storage):
tmem_addr_storage (BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].TmemAddr.Storage):

Implemented traits

AnyType, ImplicitlyDestructible

`comptime` members

`__del__is_trivial`

comptime __del__is_trivial = True

`a_smem_layout`

comptime a_smem_layout = tile_layout_k_major[a_type, BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].BM, BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].BK, config.a_swizzle]()

`AccumBarriers`

comptime AccumBarriers = SMemArray[SharedMemBarrier, (BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].num_accum_pipeline_stages * 2)]

`ATileArray`

comptime ATileArray = SMemTileArray[a_type, BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].a_smem_layout, BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].num_pipeline_stages, 128]

`b_smem_layout`

comptime b_smem_layout = tile_layout_k_major[b_type, BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].BN, BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].BK, config.b_swizzle]() if transpose_b else tile_layout_mn_major[b_type, BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].BN, BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].BK, config.b_swizzle]()

`BK`

comptime BK = config.block_tile_shape.__getitem__[3, DType.int64, Int](2)

`BM`

comptime BM = config.block_tile_shape.__getitem__[3, DType.int64, Int](0)

`BN`

comptime BN = config.block_tile_shape.__getitem__[3, DType.int64, Int](1)

`BTileArray`

comptime BTileArray = SMemTileArray[b_type, BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].b_smem_layout, BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].num_pipeline_stages, 128]

`c_smem_layout`

comptime c_smem_layout = Layout.row_major(BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].OutputM, BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].OutputN)

`ClcBarriers`

comptime ClcBarriers = SMemArray[SharedMemBarrier, BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].num_clc_pipeline_stages]

`ClcResponse`

comptime ClcResponse = SMemArray[UInt128, BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].num_clc_pipeline_stages]

`ClcThrottleBarriers`

comptime ClcThrottleBarriers = SMemArray[SharedMemBarrier, (BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].num_clc_pipeline_stages * 2)]

`CTileArray`

comptime CTileArray = SMemTileArray[c_type, BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].c_smem_layout, BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].num_output_stages, 128]

`InputBarriers`

comptime InputBarriers = SMemArray[SharedMemBarrier, (BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].num_group_pipeline_stages * 2)]

`MMA_M`

comptime MMA_M = config.mma_shape.__getitem__[3, DType.int64, Int](0)

`MMA_N`

comptime MMA_N = config.mma_shape.__getitem__[3, DType.int64, Int](1)

`num_accum_pipeline_stages`

comptime num_accum_pipeline_stages = config.num_accum_pipeline_stages

`num_clc_pipeline_stages`

comptime num_clc_pipeline_stages = config.num_clc_pipeline_stages

`num_group_pipeline_stages`

comptime num_group_pipeline_stages = (BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].num_pipeline_stages // config)

`num_output_stages`

comptime num_output_stages = config.num_output_stages

`num_pipeline_stages`

comptime num_pipeline_stages = config.num_pipeline_stages

`OutputM`

comptime OutputM = config.output_tile_shape.__getitem__[2, DType.int64, Int](0)

`OutputN`

comptime OutputN = config.output_tile_shape.__getitem__[2, DType.int64, Int](1)

`SF_K_GROUP_SIZE`

comptime SF_K_GROUP_SIZE = (4 * config)

`sfa_smem_layout`

comptime sfa_smem_layout = tile_sf_layout_k_major[BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].BM, (BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].SF_K_GROUP_SIZE * config), config.vec_sf_size]()

`SFATileArray`

comptime SFATileArray = SMemTileArray[sfa_dtype, BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].sfa_smem_layout, BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].num_pipeline_stages, 128]

`sfb_smem_layout`

comptime sfb_smem_layout = tile_sf_layout_k_major[BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].MMA_N, (BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].SF_K_GROUP_SIZE * config), config.vec_sf_size]()

`SFBTileArray`

comptime SFBTileArray = SMemTileArray[sfb_dtype, BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].sfb_smem_layout, BlockScaledSmem[a_type, b_type, c_type, sfa_dtype, sfb_dtype, transpose_b, config=config].num_pipeline_stages, 128]

`TmemAddr`

comptime TmemAddr = SMemArray[UInt32, 1]

`TmemDealloc`

comptime TmemDealloc = SMemArray[SharedMemBarrier, 1]

Methods

Int

View source

Was this page helpful?

Thank you! We'll create more content like this.

Thank you for helping us improve!

Fields​

Implemented traits​

comptime members​

__del__is_trivial​

a_smem_layout​

AccumBarriers​

ATileArray​

b_smem_layout​

BK​

BM​

BN​

BTileArray​

c_smem_layout​

ClcBarriers​

ClcResponse​

ClcThrottleBarriers​

CTileArray​

InputBarriers​

MMA_M​

MMA_N​

num_accum_pipeline_stages​

num_clc_pipeline_stages​

num_group_pipeline_stages​

num_output_stages​

num_pipeline_stages​

OutputM​

OutputN​

SF_K_GROUP_SIZE​

sfa_smem_layout​

SFATileArray​

sfb_smem_layout​

SFBTileArray​

TmemAddr​

TmemDealloc​

Methods​

a_tiles​

b_tiles​

c_tiles​

sfa_tiles​

sfb_tiles​

tma_mma_mbars​

accum_mbars​

clc_mbars_full​

clc_mbars_empty​

clc_throttle_mbars​

clc_response​

tmem_dealloc_mbar​

tmem_addr​

input_barriers​

accum_barriers​

tmem_dealloc​

ab_pipeline_size​

sf_pipeline_size​

c_output_size​

total_tile_size​

Fields

Implemented traits

`comptime` members

`__del__is_trivial`

`a_smem_layout`

`AccumBarriers`

`ATileArray`

`b_smem_layout`

`BK`

`BM`

`BN`

`BTileArray`

`c_smem_layout`

`ClcBarriers`

`ClcResponse`

`ClcThrottleBarriers`

`CTileArray`

`InputBarriers`

`MMA_M`

`MMA_N`

`num_accum_pipeline_stages`

`num_clc_pipeline_stages`

`num_group_pipeline_stages`

`num_output_stages`

`num_pipeline_stages`

`OutputM`

`OutputN`

`SF_K_GROUP_SIZE`

`sfa_smem_layout`

`SFATileArray`

`sfb_smem_layout`

`SFBTileArray`

`TmemAddr`

`TmemDealloc`

Methods

`a_tiles`

`b_tiles`

`c_tiles`

`sfa_tiles`

`sfb_tiles`

`tma_mma_mbars`

`accum_mbars`

`clc_mbars_full`

`clc_mbars_empty`

`clc_throttle_mbars`

`clc_response`

`tmem_dealloc_mbar`

`tmem_addr`

`input_barriers`

`accum_barriers`

`tmem_dealloc`

`ab_pipeline_size`

`sf_pipeline_size`

`c_output_size`

`total_tile_size`