Skip to main content

Mojo struct

B200MatmulSmem

struct B200MatmulSmem[a_type: DType, b_type: DType, c_type: DType, transpose_b: Bool, *, config: MatmulConfig[a_type, b_type, c_type, transpose_b]]

Shared memory layout for B200 SM100 matrix multiplication kernel.

This struct manages the shared memory allocation for:

  • Input tiles (A and B matrices) with multi-stage pipelining
  • Output tile (C matrix) for accumulation
  • Synchronization barriers for producer-consumer coordination
  • CLC (Cluster Launch Control) barriers and response storage
  • TMEM (Tensor Memory) address and deallocation barrier

The memory is organized to support asynchronous TMA loads and efficient bank-conflict-free access patterns for tensor core operations.

Type aliases are provided for tile types (ATile, BTile, CTile) to enable cleaner function signatures without verbose LayoutTensor declarations.

Fields

  • a_smem (InlineArray[B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].AType, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].a_smem_size]): Shared-memory storage for A-matrix input tiles across all pipeline stages.
  • b_smem (InlineArray[B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BType, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].b_smem_size]): Shared-memory storage for B-matrix input tiles across all pipeline stages.
  • c_smem (InlineArray[B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].CType, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].c_smem_size]): Shared-memory storage for C-matrix output tiles across all output stages.
  • tma_mma_mbars (InlineArray[SharedMemBarrier, (B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].num_group_pipeline_stages * 2)]): Barriers coordinating TMA loads with MMA consumption (full/empty pair per group pipeline stage).
  • accum_mbars (InlineArray[SharedMemBarrier, (B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].num_accum_pipeline_stages * 2)]): Barriers for the accumulation pipeline (full/empty pair per accumulation stage).
  • clc_mbars_full (InlineArray[SharedMemBarrier, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].num_clc_pipeline_stages]): "Full" barriers for the CLC (Cluster Launch Control) pipeline.
  • clc_mbars_empty (InlineArray[SharedMemBarrier, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].num_clc_pipeline_stages]): "Empty" barriers for the CLC pipeline.
  • clc_throttle_mbars (InlineArray[SharedMemBarrier, (B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].num_clc_pipeline_stages * 2)]): Barriers throttling CLC request issue (a pair per CLC pipeline stage).
  • clc_response (InlineArray[UInt128, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].num_clc_pipeline_stages]): Storage for CLC response payloads, one per CLC pipeline stage.
  • tmem_dealloc_mbar (InlineArray[SharedMemBarrier, 1]): Barrier guarding TMEM (Tensor Memory) deallocation.
  • tmem_addr (InlineArray[UInt32, 1]): Storage for the allocated TMEM base address.

Implemented traits

AnyType, UnknownDestructibility

comptime members

__del__is_trivial

comptime __del__is_trivial = True

a_smem_layout

comptime a_smem_layout = tile_layout_k_major[a_type, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BM, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BK, config.a_swizzle]()

a_smem_size

comptime a_smem_size = ((B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BM * B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BK) * B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].num_pipeline_stages)

ATile

comptime ATile = LayoutTensor[a_type, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].a_smem_layout, MutAnyOrigin, address_space=AddressSpace.SHARED, alignment=NVIDIASharedMemoryBasePtr.alignment]

ATileArray

comptime ATileArray = SMemTileArrayType[a_type, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].a_smem_layout, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].num_pipeline_stages, 128]

AType

comptime AType = Scalar[a_type]

b_smem_layout

comptime b_smem_layout = tile_layout_k_major[b_type, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BN, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BK, config.b_swizzle]() if transpose_b else tile_layout_mn_major[b_type, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BN, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BK, config.b_swizzle]()

b_smem_size

comptime b_smem_size = ((B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BN * B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].BK) * B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].num_pipeline_stages)

BK

comptime BK = config.block_tile_shape.__getitem__[3, DType.int64, Int](2)

BM

comptime BM = config.block_tile_shape.__getitem__[3, DType.int64, Int](0)

BN

comptime BN = config.block_tile_shape.__getitem__[3, DType.int64, Int](1)

BTile

comptime BTile = LayoutTensor[b_type, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].b_smem_layout, MutAnyOrigin, address_space=AddressSpace.SHARED, alignment=NVIDIASharedMemoryBasePtr.alignment]

BTileArray

comptime BTileArray = SMemTileArrayType[b_type, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].b_smem_layout, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].num_pipeline_stages, 128]

BType

comptime BType = Scalar[b_type]

c_smem_layout

comptime c_smem_layout = Layout.row_major(B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].OutputM, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].OutputN)

c_smem_size

comptime c_smem_size = ((B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].OutputM * B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].OutputN) * B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].num_output_stages)

CTile

comptime CTile = LayoutTensor[c_type, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].c_smem_layout, MutAnyOrigin, address_space=AddressSpace.SHARED, alignment=NVIDIASharedMemoryBasePtr.alignment]

CTileArray

comptime CTileArray = SMemTileArrayType[c_type, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].c_smem_layout, B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].num_output_stages, 128]

CType

comptime CType = Scalar[c_type]

num_accum_pipeline_stages

comptime num_accum_pipeline_stages = Int(config.num_accum_pipeline_stages)

num_clc_pipeline_stages

comptime num_clc_pipeline_stages = Int(config.num_clc_pipeline_stages)

num_group_pipeline_stages

comptime num_group_pipeline_stages = (B200MatmulSmem[a_type, b_type, c_type, transpose_b, config=config].num_pipeline_stages // Int(config))

num_output_stages

comptime num_output_stages = Int(config.num_output_stages)

num_pipeline_stages

comptime num_pipeline_stages = Int(config.num_pipeline_stages)

OutputM

comptime OutputM = config.output_tile_shape.__getitem__[2, DType.int64, Int](0)

OutputN

comptime OutputN = config.output_tile_shape.__getitem__[2, DType.int64, Int](1)

SMM

comptime SMM = SharedMemoryManager[NVIDIASharedMemoryBasePtr]

Methods

ab_pipeline_size

static ab_pipeline_size() -> Int

Calculate the total size of A+B tiles for all pipeline stages.

Returns:

Int

c_output_size

static c_output_size() -> Int

Calculate the size of C tiles for all output stages.

Returns:

Int

total_tile_size

static total_tile_size() -> Int

Calculate the total tile storage size (A+B+C).

Returns:

Int

Was this page helpful?