Skip to main content

Mojo struct

SM100TensorAccumulatorSS

@register_passable(trivial)
struct SM100TensorAccumulatorSS[operand_type: DType, accum_type: DType, MMA_M: Int, MMA_N: Int, BK: Int, *, swizzle_a: TensorMapSwizzle = TensorMapSwizzle.SWIZZLE_128B, swizzle_b: TensorMapSwizzle = TensorMapSwizzle.SWIZZLE_128B, transpose_b: Bool = True, cta_group: Int = 1, num_stages: Int = 1]

Implemented traits

AnyType, Copyable, ImplicitlyCopyable, Movable, UnknownDestructibility

Aliases

__copyinit__is_trivial

comptime __copyinit__is_trivial = True

__del__is_trivial

comptime __del__is_trivial = True

__moveinit__is_trivial

comptime __moveinit__is_trivial = True

a_layout

comptime a_layout = tile_layout_k_major[SM100TensorAccumulatorSS[operand_type, accum_type, MMA_M, MMA_N, BK, swizzle_a=swizzle_a, swizzle_b=swizzle_b, transpose_b=transpose_b, cta_group=cta_group, num_stages=num_stages].operand_t, align_up(MMA_M, 8), SM100TensorAccumulatorSS[operand_type, accum_type, MMA_M, MMA_N, BK, swizzle_a=swizzle_a, swizzle_b=swizzle_b, transpose_b=transpose_b, cta_group=cta_group, num_stages=num_stages].padded_BK, swizzle_a]()

accum_t

comptime accum_t = accum_type

AType

comptime AType = MMASmemDescriptorPair

b_layout

comptime b_layout = tile_layout_k_major[SM100TensorAccumulatorSS[operand_type, accum_type, MMA_M, MMA_N, BK, swizzle_a=swizzle_a, swizzle_b=swizzle_b, transpose_b=transpose_b, cta_group=cta_group, num_stages=num_stages].operand_t, MMA_N, SM100TensorAccumulatorSS[operand_type, accum_type, MMA_M, MMA_N, BK, swizzle_a=swizzle_a, swizzle_b=swizzle_b, transpose_b=transpose_b, cta_group=cta_group, num_stages=num_stages].padded_BK, swizzle_b]() if transpose_b else tile_layout_mn_major[SM100TensorAccumulatorSS[operand_type, accum_type, MMA_M, MMA_N, BK, swizzle_a=swizzle_a, swizzle_b=swizzle_b, transpose_b=transpose_b, cta_group=cta_group, num_stages=num_stages].operand_t, MMA_N, SM100TensorAccumulatorSS[operand_type, accum_type, MMA_M, MMA_N, BK, swizzle_a=swizzle_a, swizzle_b=swizzle_b, transpose_b=transpose_b, cta_group=cta_group, num_stages=num_stages].padded_BK, swizzle_b]()

BType

comptime BType = MMASmemDescriptorPair

CType

comptime CType = TMemTile[SM100TensorAccumulatorSS[operand_type, accum_type, MMA_M, MMA_N, BK, swizzle_a=swizzle_a, swizzle_b=swizzle_b, transpose_b=transpose_b, cta_group=cta_group, num_stages=num_stages].accum_t, MMA_M, MMA_N]

idesc

comptime idesc = UMMAInsDescriptor.create[UMMAKind.KIND_F16, SM100TensorAccumulatorSS[operand_type, accum_type, MMA_M, MMA_N, BK, swizzle_a=swizzle_a, swizzle_b=swizzle_b, transpose_b=transpose_b, cta_group=cta_group, num_stages=num_stages].accum_t, SM100TensorAccumulatorSS[operand_type, accum_type, MMA_M, MMA_N, BK, swizzle_a=swizzle_a, swizzle_b=swizzle_b, transpose_b=transpose_b, cta_group=cta_group, num_stages=num_stages].operand_t, SM100TensorAccumulatorSS[operand_type, accum_type, MMA_M, MMA_N, BK, swizzle_a=swizzle_a, swizzle_b=swizzle_b, transpose_b=transpose_b, cta_group=cta_group, num_stages=num_stages].operand_t, Index[dtype=DType.uint32](MMA_M, MMA_N), transpose_b=transpose_b]()

MMA_K

comptime MMA_K = 16

num_k_blocks

comptime num_k_blocks = (SM100TensorAccumulatorSS[operand_type, accum_type, MMA_M, MMA_N, BK, swizzle_a=swizzle_a, swizzle_b=swizzle_b, transpose_b=transpose_b, cta_group=cta_group, num_stages=num_stages].padded_BK // 16)

num_k_blocks_per_stage

comptime num_k_blocks_per_stage = (SM100TensorAccumulatorSS[operand_type, accum_type, MMA_M, MMA_N, BK, swizzle_a=swizzle_a, swizzle_b=swizzle_b, transpose_b=transpose_b, cta_group=cta_group, num_stages=num_stages].num_k_blocks // num_stages)

num_k_mmas

comptime num_k_mmas = (BK // 16)

operand_size

comptime operand_size = size_of[SM100TensorAccumulatorSS[operand_type, accum_type, MMA_M, MMA_N, BK, swizzle_a=swizzle_a, swizzle_b=swizzle_b, transpose_b=transpose_b, cta_group=cta_group, num_stages=num_stages].operand_t]()

operand_t

comptime operand_t = operand_type

padded_BK

comptime padded_BK = align_up(BK, SM100TensorAccumulatorSS[operand_type, accum_type, MMA_M, MMA_N, BK, swizzle_a=swizzle_a, swizzle_b=swizzle_b, transpose_b=transpose_b, cta_group=cta_group, num_stages=num_stages].swizzle_granularity)

swizzle_granularity

comptime swizzle_granularity = (max(swizzle_a.bytes(), swizzle_b.bytes()) // size_of[SM100TensorAccumulatorSS[operand_type, accum_type, MMA_M, MMA_N, BK, swizzle_a=swizzle_a, swizzle_b=swizzle_b, transpose_b=transpose_b, cta_group=cta_group, num_stages=num_stages].operand_t]())

Methods

mma

static mma[*, stage_idx: Int = 0](a: MMASmemDescriptorPair, b: MMASmemDescriptorPair, c: UInt32, *, c_scale: UInt32, elect: Int32)

Was this page helpful?