Skip to main content

Mojo function

mla_decode_sm100_sink_split_k

mla_decode_sm100_sink_split_k[
    q_type: DType,
    k_t: MHAOperand,
    output_type: DType,
    mask_t: MHAMask,
    *,
    config: MHAConfig[config.dtype],
    depth: Int,
    num_heads: Int,
    SplitAccumType: OptionalPointer,
    group: Int,
    ragged: Bool,
    _is_cache_length_accurate: Bool,
    decoding_warp_split_k: Bool,
    split_page_size: Int = 128,
    per_token_scale_rope_aware: Bool = False,
    has_attn_sink: Bool = False,
    sparse: Bool = False
](
    q: TileTensor[q_type, q.LayoutType, q.origin, linear_idx_type=q.linear_idx_type, element_size=q.element_size],
    k: k_t,
    output: TileTensor[output.dtype, output.LayoutType, output.origin, linear_idx_type=output.linear_idx_type, element_size=output.element_size],
    lse_accum_split_ptr: SplitAccumType,
    scale: Float32,
    batch_size: Int,
    block_z: Int,
    num_partitions: Int,
    q_max_seq_len: Int,
    valid_length: TileTensor[DType.uint32, valid_length.LayoutType, valid_length.origin, linear_idx_type=valid_length.linear_idx_type, element_size=valid_length.element_size],
    mask: mask_t,
    scales_ptr: UnsafePointer[Float32, MutAnyOrigin],
    scalar_args_buf: TileTensor[DType.int64, scalar_args_buf.LayoutType, scalar_args_buf.origin, linear_idx_type=scalar_args_buf.linear_idx_type, element_size=scalar_args_buf.element_size],
    ctx: DeviceContext,
    q_scale_ptr: UnsafePointer[Float32, MutAnyOrigin] = UnsafePointer(_unsafe_null=Tuple()),
    d_indices: UnsafePointer[Int32, MutAnyOrigin] = UnsafePointer(_unsafe_null=Tuple()),
    indices_stride: Int = 0,
    topk_lengths: UnsafePointer[Int32, MutAnyOrigin] = UnsafePointer(_unsafe_null=Tuple()),
    attn_sink_ptr: UnsafePointer[Float32, MutAnyOrigin] = UnsafePointer(_unsafe_null=Tuple()),
    extra_k: OptionalReg[k_t] = None,
    extra_d_indices: UnsafePointer[Int32, MutAnyOrigin] = UnsafePointer(_unsafe_null=Tuple()),
    extra_indices_stride: Int = 0,
    extra_topk_lengths: UnsafePointer[Int32, MutAnyOrigin] = UnsafePointer(_unsafe_null=Tuple()),
    extra_scales_ptr: UnsafePointer[Float32, MutAnyOrigin] = UnsafePointer(_unsafe_null=Tuple())
)

Was this page helpful?