Mojo struct
B200MatmulSmem
struct B200MatmulSmem[a_type: DType, b_type: DType, c_type: DType, transpose_b: Bool, *, config: MatmulConfig[a_type, b_type, c_type, transpose_b]]
Shared memory layout for B200 SM100 matrix multiplication kernel.
This struct manages the shared memory allocation for:
- Input tiles (A and B matrices) with multi-stage pipelining
- Output tile (C matrix) for accumulation
- Synchronization barriers for producer-consumer coordination
- CLC (Cluster Launch Control) barriers and response storage
- TMEM (Tensor Memory) address and deallocation barrier
The memory is organized to support asynchronous TMA loads and efficient bank-conflict-free access patterns for tensor core operations.
Type aliases are provided for tile types (ATile, BTile, CTile) to enable cleaner function signatures without verbose LayoutTensor declarations.
Fields
- `a_smem` (`InlineArray[Self.AType, Self.a_smem_size]`): Shared-memory storage for A-matrix input tiles, sized for all pipeline stages.
- `b_smem` (`InlineArray[Self.BType, Self.b_smem_size]`): Shared-memory storage for B-matrix input tiles, sized for all pipeline stages.
- `c_smem` (`InlineArray[Self.CType, Self.c_smem_size]`): Shared-memory storage for C-matrix output tiles, sized for all output stages.
- `tma_mma_mbars` (`InlineArray[SharedMemBarrier, Self.num_group_pipeline_stages * 2]`): Barriers (two per group pipeline stage) coordinating producer TMA loads with consumer MMA operations.
- `accum_mbars` (`InlineArray[SharedMemBarrier, Self.num_accum_pipeline_stages * 2]`): Barriers (two per accumulation pipeline stage) for the accumulation pipeline.
- `clc_mbars_full` (`InlineArray[SharedMemBarrier, Self.num_clc_pipeline_stages]`): CLC (Cluster Launch Control) "full" barriers, one per CLC pipeline stage.
- `clc_mbars_empty` (`InlineArray[SharedMemBarrier, Self.num_clc_pipeline_stages]`): CLC "empty" barriers, one per CLC pipeline stage.
- `clc_throttle_mbars` (`InlineArray[SharedMemBarrier, Self.num_clc_pipeline_stages * 2]`): CLC throttle barriers (two per CLC pipeline stage).
- `clc_response` (`InlineArray[UInt128, Self.num_clc_pipeline_stages]`): Storage for CLC response data, one entry per CLC pipeline stage.
- `tmem_dealloc_mbar` (`InlineArray[SharedMemBarrier, 1]`): Barrier coordinating TMEM (Tensor Memory) deallocation.
- `tmem_addr` (`InlineArray[UInt32, 1]`): Storage for the TMEM base address.

(Note: `Self` abbreviates `B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config]`.)
Implemented traits
AnyType,
UnknownDestructibility
comptime members
__del__is_trivial
comptime __del__is_trivial = True
a_smem_layout
comptime a_smem_layout = tile_layout_k_major[a_type, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BM, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BK, config.a_swizzle]()
a_smem_size
comptime a_smem_size = ((B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BM * B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BK) * B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].num_pipeline_stages)
ATile
comptime ATile = LayoutTensor[a_type, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].a_smem_layout, MutAnyOrigin, address_space=AddressSpace.SHARED, alignment=NVIDIASharedMemoryBasePtr.alignment]
ATileArray
comptime ATileArray = SMemTileArrayType[a_type, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].a_smem_layout, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].num_pipeline_stages, 128]
AType
comptime AType = Scalar[a_type]
b_smem_layout
comptime b_smem_layout = tile_layout_k_major[b_type, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BN, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BK, config.b_swizzle]() if transpose_b else tile_layout_mn_major[b_type, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BN, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BK, config.b_swizzle]()
b_smem_size
comptime b_smem_size = ((B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BN * B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BK) * B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].num_pipeline_stages)
BK
comptime BK = config.block_tile_shape.__getitem__[3, DType.int64, Int](2)
BM
comptime BM = config.block_tile_shape.__getitem__[3, DType.int64, Int](0)
BN
comptime BN = config.block_tile_shape.__getitem__[3, DType.int64, Int](1)
BTile
comptime BTile = LayoutTensor[b_type, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].b_smem_layout, MutAnyOrigin, address_space=AddressSpace.SHARED, alignment=NVIDIASharedMemoryBasePtr.alignment]
BTileArray
comptime BTileArray = SMemTileArrayType[b_type, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].b_smem_layout, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].num_pipeline_stages, 128]
BType
comptime BType = Scalar[b_type]
c_smem_layout
comptime c_smem_layout = Layout.row_major(B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].OutputM, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].OutputN)
c_smem_size
comptime c_smem_size = ((B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].OutputM * B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].OutputN) * B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].num_output_stages)
CTile
comptime CTile = LayoutTensor[c_type, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].c_smem_layout, MutAnyOrigin, address_space=AddressSpace.SHARED, alignment=NVIDIASharedMemoryBasePtr.alignment]
CTileArray
comptime CTileArray = SMemTileArrayType[c_type, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].c_smem_layout, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].num_output_stages, 128]
CType
comptime CType = Scalar[c_type]
num_accum_pipeline_stages
comptime num_accum_pipeline_stages = Int(config.num_accum_pipeline_stages)
num_clc_pipeline_stages
comptime num_clc_pipeline_stages = Int(config.num_clc_pipeline_stages)
num_group_pipeline_stages
comptime num_group_pipeline_stages = (B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].num_pipeline_stages // Int(config))
num_output_stages
comptime num_output_stages = Int(config.num_output_stages)
num_pipeline_stages
comptime num_pipeline_stages = Int(config.num_pipeline_stages)
OutputM
comptime OutputM = config.output_tile_shape.__getitem__[2, DType.int64, Int](0)
OutputN
comptime OutputN = config.output_tile_shape.__getitem__[2, DType.int64, Int](1)
SMM
comptime SMM = SharedMemoryManager[NVIDIASharedMemoryBasePtr]
Methods
ab_pipeline_size

`static ab_pipeline_size() -> Int`

Calculate the total size of A+B tiles for all pipeline stages.

Returns: The combined size of the A and B tile buffers across all pipeline stages.

c_output_size

(Signature and description not captured in this extraction.)

total_tile_size

(Signature and description not captured in this extraction.)
Was this page helpful?
Thank you! We'll create more content like this.
Thank you for helping us improve!