Mojo function

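mha_gpu_naive

`mha_gpu_naive` is overloaded; the three signatures listed below differ in how the key/value inputs and the mask are passed. The first takes `k` and `v` as generic `MHAOperand` types (`k_t`, `v_t`) with a `mask_functor` of type `mask_t: MHAMask`. The second takes `k`, `v`, and `mask` as `NDBuffer` arguments and uses `seq_len` and `num_keys` in place of `valid_length`, `max_prompt_len`, and `max_cache_size`. The third takes both `k` and `v` as the same `cache_t: KVCacheT` type with a `mask_functor` of type `mask_t: MHAMask`.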

mha_gpu_naive[output_type: DType, k_t: MHAOperand, v_t: MHAOperand, mask_t: MHAMask, rank: Int, //, ragged: Bool = False, _use_valid_length: Bool = False, _is_cache_length_accurate: Bool = False](q: NDBuffer[dtype, rank, origin, shape, strides], k: k_t, v: v_t, mask_functor: mask_t, output: NDBuffer[output_type, rank, origin, shape, strides], valid_length: ManagedTensorSlice[io_spec, static_spec=static_spec], scale: SIMD[float32, 1], batch_size: Int, max_prompt_len: Int, max_cache_size: Int, num_heads: Int, depth: Int, group: Int, ctx: DeviceContext)

mha_gpu_naive[q_type: DType, k_type: DType, v_type: DType, output_type: DType, rank: Int, mask_type: DType, mask_rank: Int, //](q: NDBuffer[q_type, rank, origin, shape, strides], k: NDBuffer[k_type, rank, origin, shape, strides], v: NDBuffer[v_type, rank, origin, shape, strides], mask: NDBuffer[mask_type, mask_rank, origin, shape, strides, alignment=alignment, address_space=address_space, exclusive=exclusive], output: NDBuffer[output_type, rank, origin, shape, strides], scale: SIMD[float32, 1], batch_size: Int, seq_len: Int, num_keys: Int, num_heads: Int, depth: Int, group: Int, ctx: DeviceContext)

mha_gpu_naive[q_type: DType, output_type: DType, cache_t: KVCacheT, mask_t: MHAMask, rank: Int, //, ragged: Bool = False](q: NDBuffer[q_type, rank, origin, shape, strides], k: cache_t, v: cache_t, mask_functor: mask_t, output: NDBuffer[output_type, rank, origin, shape, strides], valid_length: ManagedTensorSlice[io_spec, static_spec=static_spec], scale: SIMD[float32, 1], batch_size: Int, max_prompt_len: Int, max_cache_size: Int, num_heads: Int, depth: Int, group: Int, ctx: DeviceContext)