For the complete documentation index, see `llms.txt`. Markdown versions of all pages are available by appending `.md` to any URL (e.g. `/max/get-started.md`).

Mojo function

mha_sm90_dispatch

```mojo
def mha_sm90_dispatch[q_type: DType, KVType: MHAOperand, MaskType: MHAMask, output_type: DType, MaxPromptLenType: OptionallyStaticInt, PartitionType: MHAPartitionScheme, //, config: MHAConfig[config.dtype], group: Int, ragged: Bool, sink: Bool, _is_cache_length_accurate: Bool](output: DeviceBuffer[output_type], q_arg: DeviceBuffer[q_type], k: KVType, v: KVType, num_rows_q: Int, mask_functor: MaskType, valid_length: DeviceBuffer[DType.uint32], max_prompt_len_arg: MaxPromptLenType, max_cache_valid_length_arg: Int, scale: Float32, kv_input_row_offsets: OptionalReg[LayoutTensor[DType.uint32, Layout.row_major(Int(-1)), ImmutAnyOrigin]], batch_size_arg: Int, partition: PartitionType, ctx: DeviceContext, sink_weights: OptionalReg[LayoutTensor[q_type, Layout.row_major(Int(-1)), ImmutAnyOrigin]])
```