Mojo struct: `SM100TensorAccumulatorTS`
@register_passable(trivial)
struct SM100TensorAccumulatorTS[operand_type: DType, accum_type: DType, MMA_M: Int, MMA_N: Int, BM: Int, BN: Int, BK: Int, num_softmax_threads: Int, swizzle_b: TensorMapSwizzle = 3, transpose_b: Bool = True, cta_group: Int = 1]
Fields
- `mbar` (`UnsafePointer[SharedMemBarrier, address_space=AddressSpace(3)]`)
- `phase` (`UInt32`)
Implemented traits
AnyType
,
Copyable
,
ImplicitlyCopyable
,
Movable
,
UnknownDestructibility
Aliases
__copyinit__is_trivial
alias __copyinit__is_trivial = UInt32.__copyinit__is_trivial if UnsafePointer[SharedMemBarrier, address_space=AddressSpace(3)].__copyinit__is_trivial else UnsafePointer[SharedMemBarrier, address_space=AddressSpace(3)].__copyinit__is_trivial
__del__is_trivial
alias __del__is_trivial = UInt32.__del__is_trivial if UnsafePointer[SharedMemBarrier, address_space=AddressSpace(3)].__del__is_trivial else UnsafePointer[SharedMemBarrier, address_space=AddressSpace(3)].__del__is_trivial
__moveinit__is_trivial
alias __moveinit__is_trivial = UInt32.__moveinit__is_trivial if UnsafePointer[SharedMemBarrier, address_space=AddressSpace(3)].__moveinit__is_trivial else UnsafePointer[SharedMemBarrier, address_space=AddressSpace(3)].__moveinit__is_trivial
a_frag_size
alias a_frag_size = ((MMA_M * 16) // num_softmax_threads)
a_t
alias a_t = TMemOperand[operand_type, ((BM * 2) // num_softmax_threads), (BN // MMA_N), (BM // ((BM * 2) // num_softmax_threads)), BK, 16, num_softmax_threads]
ab_t
alias ab_t = UMMADescriptorTS[operand_type, ((BM * 2) // num_softmax_threads), (BN // MMA_N), MMA_M=(BM // ((BM * 2) // num_softmax_threads)), MMA_N=BK, MMA_K=16, consumer_group_size=num_softmax_threads]
accum_t
alias accum_t = accum_type
b_offset
alias b_offset = MMAOperandOffsetFn[operand_type, BN, BK, swizzle_b, transpose_b, MMA_N, 16]()
b_t
alias b_t = MMASmemDescriptor
c_frag_size
alias c_frag_size = ((MMA_M * MMA_N) // num_softmax_threads)
c_t
alias c_t = TMemAccumulator[accum_type, (BM // ((BM * 2) // num_softmax_threads)), MMA_N, ((BM * 2) // num_softmax_threads), (BN // MMA_N), num_softmax_threads]
idesc
alias idesc = UMMAInsDescriptor.create[UMMAKind(2), accum_type, operand_type, operand_type, Index[dtype=DType.uint32](MMA_M, MMA_N), transpose_b=transpose_b]()
MMA_K
alias MMA_K = 16
num_k_mmas
alias num_k_mmas = (BK // 16)
num_m_blocks_per_warp
alias num_m_blocks_per_warp = ((BM * 2) // num_softmax_threads)
num_m_mmas
alias num_m_mmas = (BM // MMA_M)
num_n_mmas
alias num_n_mmas = (BN // MMA_N)
operand_t
alias operand_t = operand_type
smem_ptr_t
alias smem_ptr_t = UnsafePointer[Scalar[operand_type], address_space=AddressSpace(3)]
Methods
__init__
__init__(smem: UnsafePointer[SharedMemBarrier, address_space=AddressSpace(3)]) -> Self
check_constraints
static check_constraints()
init
init(self)
a_mma_descriptor
static a_mma_descriptor(a_tmem: UInt32) -> TMemOperand[operand_type, ((BM * 2) // num_softmax_threads), (BN // MMA_N), (BM // ((BM * 2) // num_softmax_threads)), BK, 16, num_softmax_threads]
Returns: a `TMemOperand[operand_type, ...]` descriptor for the A operand held in tensor memory at `a_tmem`.
b_mma_descriptor
static b_mma_descriptor[dtype_b: DType](p_b: UnsafePointer[Scalar[dtype_b], address_space=AddressSpace(3)]) -> MMASmemDescriptor
Returns: an `MMASmemDescriptor` for the B operand located in shared memory at `p_b`.
mma
mma(self, a: TMemOperand[operand_type, ((BM * 2) // num_softmax_threads), (BN // MMA_N), (BM // ((BM * 2) // num_softmax_threads)), BK, 16, num_softmax_threads], b: MMASmemDescriptor, c: TMemAccumulator[accum_type, (BM // ((BM * 2) // num_softmax_threads)), MMA_N, ((BM * 2) // num_softmax_threads), (BN // MMA_N), num_softmax_threads], c_scale: UInt32)
wait
wait(mut self, idx: UInt32)
wait_for_mma
wait_for_mma(mut self)
Wait for the mma to be complete.
wait_for_tmem
wait_for_tmem(mut self)
Wait for the output and A tmem to be ready.
tmem_arrive
tmem_arrive(self)
Indicate that the accumulator and the tensor memory arguments are ready for the MMA to begin.
Was this page helpful?
Thank you! We'll create more content like this.
Thank you for helping us improve!