Mojo function

mha_decoding

mha_decoding[
    q_type: DType,
    k_t: MHAOperand,
    v_t: MHAOperand,
    output_type: DType,
    mask_t: MHAMask,
    score_mod_t: ScoreModTrait,
    BM: UInt,
    BN: UInt,
    BK: UInt,
    WM: UInt,
    WN: UInt,
    depth: UInt,
    num_heads: UInt,
    num_threads: UInt,
    num_pipeline_stages: UInt,
    group: UInt = UInt(1),
    use_score_mod: Bool = False,
    ragged: Bool = False,
    is_shared_kv: Bool = False,
    _use_valid_length: Bool = False,
    _is_cache_length_accurate: Bool = False,
    decoding_warp_split_k: Bool = False
](
    q_ptr: UnsafePointer[SIMD[q_type, 1]],
    k: k_t,
    v: v_t,
    output_ptr: UnsafePointer[SIMD[output_type, 1]],
    exp_sum_ptr: UnsafePointer[SIMD[get_accum_type[::DType,::DType](), 1]],
    qk_max_ptr: UnsafePointer[SIMD[get_accum_type[::DType,::DType](), 1]],
    scale: SIMD[float32, 1],
    batch_size: Int,
    num_partitions: Int,
    max_cache_valid_length: Int,
    valid_length: NDBuffer[uint32, 1, MutableAnyOrigin],
    mask: mask_t,
    score_mod: score_mod_t
)