Skip to main content

Mojo function (four overloads)

mha_gpu_naive

mha_gpu_naive[output_type: DType, k_t: MHAOperand, v_t: MHAOperand, mask_t: MHAMask, //, ragged: Bool = False, sink: Bool = False, _use_valid_length: Bool = False, _is_cache_length_accurate: Bool = False](q: LayoutTensor[q.dtype, q.layout, q.origin, element_layout=q.element_layout, layout_int_type=q.layout_int_type, linear_idx_type=q.linear_idx_type, masked=q.masked, alignment=q.alignment], k: k_t, v: v_t, mask_functor: mask_t, output: LayoutTensor[output_type, output.layout, output.origin, element_layout=output.element_layout, layout_int_type=output.layout_int_type, linear_idx_type=output.linear_idx_type, masked=output.masked, alignment=output.alignment], valid_length: LayoutTensor[DType.uint32, valid_length.layout, valid_length.origin, element_layout=valid_length.element_layout, layout_int_type=valid_length.layout_int_type, linear_idx_type=valid_length.linear_idx_type, masked=valid_length.masked, alignment=valid_length.alignment], scale: Float32, batch_size: Int, max_prompt_len: Int, max_cache_size: Int, num_heads: Int, depth: Int, group: Int, ctx: DeviceContext, sink_weights: OptionalReg[LayoutTensor[q.dtype, Layout.row_major(VariadicList(-1)), ImmutAnyOrigin]] = None)

mha_gpu_naive[q_type: DType, k_type: DType, v_type: DType, output_type: DType, mask_type: DType, //, sink: Bool = False](q: LayoutTensor[q_type, q.layout, q.origin, element_layout=q.element_layout, layout_int_type=q.layout_int_type, linear_idx_type=q.linear_idx_type, masked=q.masked, alignment=q.alignment], k: LayoutTensor[k_type, k.layout, k.origin, element_layout=k.element_layout, layout_int_type=k.layout_int_type, linear_idx_type=k.linear_idx_type, masked=k.masked, alignment=k.alignment], v: LayoutTensor[v_type, v.layout, v.origin, element_layout=v.element_layout, layout_int_type=v.layout_int_type, linear_idx_type=v.linear_idx_type, masked=v.masked, alignment=v.alignment], mask: LayoutTensor[mask_type, mask.layout, mask.origin, element_layout=mask.element_layout, layout_int_type=mask.layout_int_type, linear_idx_type=mask.linear_idx_type, masked=mask.masked, alignment=mask.alignment], output: LayoutTensor[output_type, output.layout, output.origin, element_layout=output.element_layout, layout_int_type=output.layout_int_type, linear_idx_type=output.linear_idx_type, masked=output.masked, alignment=output.alignment], scale: Float32, batch_size: Int, seq_len: Int, num_keys: Int, num_heads: Int, depth: Int, group: Int, ctx: DeviceContext, sink_weights: OptionalReg[LayoutTensor[q_type, Layout.row_major(VariadicList(-1)), ImmutAnyOrigin]] = None)

mha_gpu_naive[q_type: DType, k_type: DType, v_type: DType, output_type: DType, MaskType: MHAMask, //, sink: Bool = False](q: LayoutTensor[q_type, q.layout, q.origin, element_layout=q.element_layout, layout_int_type=q.layout_int_type, linear_idx_type=q.linear_idx_type, masked=q.masked, alignment=q.alignment], k: LayoutTensor[k_type, k.layout, k.origin, element_layout=k.element_layout, layout_int_type=k.layout_int_type, linear_idx_type=k.linear_idx_type, masked=k.masked, alignment=k.alignment], v: LayoutTensor[v_type, v.layout, v.origin, element_layout=v.element_layout, layout_int_type=v.layout_int_type, linear_idx_type=v.linear_idx_type, masked=v.masked, alignment=v.alignment], mask: MaskType, output: LayoutTensor[output_type, output.layout, output.origin, element_layout=output.element_layout, layout_int_type=output.layout_int_type, linear_idx_type=output.linear_idx_type, masked=output.masked, alignment=output.alignment], scale: Float32, batch_size: Int, seq_len: Int, num_keys: Int, num_heads: Int, depth: Int, group: Int, ctx: DeviceContext, sink_weights: OptionalReg[LayoutTensor[q_type, Layout.row_major(VariadicList(-1)), ImmutAnyOrigin]] = None)

mha_gpu_naive[q_type: DType, output_type: DType, cache_t: KVCacheT, mask_t: MHAMask, //, ragged: Bool = False, sink: Bool = False](q: LayoutTensor[q_type, q.layout, q.origin, element_layout=q.element_layout, layout_int_type=q.layout_int_type, linear_idx_type=q.linear_idx_type, masked=q.masked, alignment=q.alignment], k: cache_t, v: cache_t, mask_functor: mask_t, output: LayoutTensor[output_type, output.layout, output.origin, element_layout=output.element_layout, layout_int_type=output.layout_int_type, linear_idx_type=output.linear_idx_type, masked=output.masked, alignment=output.alignment], valid_length: LayoutTensor[DType.uint32, valid_length.layout, valid_length.origin, element_layout=valid_length.element_layout, layout_int_type=valid_length.layout_int_type, linear_idx_type=valid_length.linear_idx_type, masked=valid_length.masked, alignment=valid_length.alignment], scale: Float32, batch_size: Int, max_prompt_len: Int, max_cache_size: Int, num_heads: Int, depth: Int, group: Int, ctx: DeviceContext, sink_weights: OptionalReg[LayoutTensor[q_type, Layout.row_major(VariadicList(-1)), ImmutAnyOrigin]] = None)

Was this page helpful?