Skip to main content

Mojo struct

MLA_SM100_Decode_Common

@register_passable(trivial) struct MLA_SM100_Decode_Common[q_type: DType, KVLUTType: MHAOperand, output_type: DType, MaskType: MHAMask, ScoreModType: ScoreModTrait, config: MLA_SM100_Decode_Config, use_score_mod: Bool, ValidLengthType: OptionalPointer, _is_cache_length_accurate: Bool = False, ragged: Bool = False]

Implemented traits

AnyType, Copyable, ImplicitlyCopyable, ImplicitlyDestructible, Movable, TrivialRegisterType

comptime members

__copyinit__is_trivial

comptime __copyinit__is_trivial = True

__del__is_trivial

comptime __del__is_trivial = True

__moveinit__is_trivial

comptime __moveinit__is_trivial = True

AccumType

comptime AccumType = get_accum_type[q_type]()

BlockElems

comptime BlockElems = (config * config)

bytes_per_element

comptime bytes_per_element = size_of[q_type]()

kv_type

comptime kv_type = KVLUTType.dtype

KVStageElems

comptime KVStageElems = (MLA_SM100_Decode_Common[q_type, KVLUTType, output_type, MaskType, ScoreModType, config, use_score_mod, ValidLengthType, _is_cache_length_accurate, ragged].NumQKBlocks * MLA_SM100_Decode_Common[q_type, KVLUTType, output_type, MaskType, ScoreModType, config, use_score_mod, ValidLengthType, _is_cache_length_accurate, ragged].BlockElems)

NumQKBlocks

comptime NumQKBlocks = (config // config)

NumVOBlocks

comptime NumVOBlocks = (config // config)

O_M

comptime O_M = (config * 2)

O_N

comptime O_N = (config // 2)

OTMemTile

comptime OTMemTile = TMemTile[MLA_SM100_Decode_Common[q_type, KVLUTType, output_type, MaskType, ScoreModType, config, use_score_mod, ValidLengthType, _is_cache_length_accurate, ragged].AccumType, MLA_SM100_Decode_Common[q_type, KVLUTType, output_type, MaskType, ScoreModType, config, use_score_mod, ValidLengthType, _is_cache_length_accurate, ragged].O_M, MLA_SM100_Decode_Common[q_type, KVLUTType, output_type, MaskType, ScoreModType, config, use_score_mod, ValidLengthType, _is_cache_length_accurate, ragged].O_N]

output_tile_width

comptime output_tile_width = ((config // 2) * (4 // size_of[output_type]()))

S_M

comptime S_M = (config * 2)

S_N

comptime S_N = (config // 2)

STMemTile

comptime STMemTile = TMemTile[MLA_SM100_Decode_Common[q_type, KVLUTType, output_type, MaskType, ScoreModType, config, use_score_mod, ValidLengthType, _is_cache_length_accurate, ragged].AccumType, MLA_SM100_Decode_Common[q_type, KVLUTType, output_type, MaskType, ScoreModType, config, use_score_mod, ValidLengthType, _is_cache_length_accurate, ragged].S_M, MLA_SM100_Decode_Common[q_type, KVLUTType, output_type, MaskType, ScoreModType, config, use_score_mod, ValidLengthType, _is_cache_length_accurate, ragged].S_N]

UMMAPVSS

comptime UMMAPVSS = DecodeSM100PVSS[q_type, MLA_SM100_Decode_Common[q_type, KVLUTType, output_type, MaskType, ScoreModType, config, use_score_mod, ValidLengthType, _is_cache_length_accurate, ragged].AccumType, config=config]

UMMAQKTSS

comptime UMMAQKTSS = DecodeSM100QKTSS[q_type, MLA_SM100_Decode_Common[q_type, KVLUTType, output_type, MaskType, ScoreModType, config, use_score_mod, ValidLengthType, _is_cache_length_accurate, ragged].AccumType, config=config]

Methods

apply_mask

static apply_mask[half_load: Int, NonCausalMask: Bool, CausalMask: Bool](tiles_done: Int, col0: Int, num_keys: Int, s_row: LayoutTensor[MLA_SM100_Decode_Common[q_type, KVLUTType, output_type, MaskType, ScoreModType, config, use_score_mod, ValidLengthType, _is_cache_length_accurate, ragged].AccumType, Layout.row_major(half_load), MutAnyOrigin, address_space=AddressSpace.LOCAL], mask: MaskType, score_mod: ScoreModType, prompt_idx: UInt32, q_head_idx: UInt32, score_row: UInt32, max_seq_len: UInt32, cache_len: Int, start_pos: UInt32, cache_start_pos: UInt32) -> Scalar[MLA_SM100_Decode_Common[q_type, KVLUTType, output_type, MaskType, ScoreModType, config, use_score_mod, ValidLengthType, _is_cache_length_accurate, ragged].AccumType]

Returns:

Scalar[AccumType] — a scalar of this struct's `AccumType` (`get_accum_type[q_type]()`).

Softmax

static Softmax(tmem_addr: UInt32, s_bars: DecodeSM100MiscMBars[2, 1, WARPGROUP_SIZE], p_bars: DecodeSM100MiscMBars[2, WARPGROUP_SIZE, 1], kv_smem: UnsafePointer[Scalar[q_type], MutAnyOrigin, address_space=AddressSpace.SHARED], max_smem: UnsafePointer[Scalar[MLA_SM100_Decode_Common[q_type, KVLUTType, output_type, MaskType, ScoreModType, config, use_score_mod, ValidLengthType, _is_cache_length_accurate, ragged].AccumType], MutAnyOrigin, address_space=AddressSpace.SHARED], li_smem: UnsafePointer[Scalar[MLA_SM100_Decode_Common[q_type, KVLUTType, output_type, MaskType, ScoreModType, config, use_score_mod, ValidLengthType, _is_cache_length_accurate, ragged].AccumType], MutAnyOrigin, address_space=AddressSpace.SHARED], out_smem: UnsafePointer[Scalar[output_type], MutAnyOrigin, address_space=AddressSpace.SHARED], c_bars: DecodeSM100MiscMBars[1, WARPGROUP_SIZE, WARPGROUP_SIZE], corr_done_bars: DecodeSM100MiscMBars[1, WARPGROUP_SIZE, WARPGROUP_SIZE], out_pipeline: OutPipeline[DecodeOutProducer[output_type, config].num_out_stages, WARPGROUP_SIZE, 1], num_k_tiles: Int, offset_position: OffsetPosition[config, KVLUTType, ragged, _is_cache_length_accurate, ValidLengthType], scale: Float32, mask: MaskType, score_mod: ScoreModType, prompt_idx: UInt32, max_seq_len: UInt32)

Correction

static Correction(tmem_addr: UInt32, o_bars: DecodeSM100MiscMBars[1, 1, WARPGROUP_SIZE], c_bars: DecodeSM100MiscMBars[1, WARPGROUP_SIZE, WARPGROUP_SIZE], corr_done_bars: DecodeSM100MiscMBars[1, WARPGROUP_SIZE, WARPGROUP_SIZE], num_k_tiles: Int)

store

static store(out_pipeline: OutPipeline[DecodeOutProducer[output_type, config].num_out_stages, WARPGROUP_SIZE, 1], out_smem: UnsafePointer[Scalar[output_type], MutAnyOrigin, address_space=AddressSpace.SHARED], o_tma: TMATensorTile[output_type, tile_layout_k_major[output_type, config.out_rows, config.BN, config.swizzle_mode](), _tma_desc_tile_layout[output_type, 2, IndexList[2, DType.int64](config.out_rows, config.BN, Tuple[]()), config.swizzle_mode]()], offset_position: OffsetPosition[config, KVLUTType, ragged, _is_cache_length_accurate, ValidLengthType])

Was this page helpful?