Mojo function

mla_sm100_prefill_blockscale

mla_sm100_prefill_blockscale[
    output_dtype: DType,
    q_type: DType,
    KVType: MHAOperand,
    KRopeType: MHAOperand,
    MaskType: MHAMask,
    MaxPromptLenType: OptionallyStaticInt,
    //,
    config: MHAConfig[config.dtype],
    group: Int,
    q_depth: Int,
    cache_depth: Int,
    _ndbuffer_mha_operand: Bool,
    blockwise_scale: Int = 0
](
    output: TileTensor[output_dtype, output.LayoutType, output.origin, linear_idx_type=output.linear_idx_type, element_size=output.element_size],
    q: TileTensor[q_type, q.LayoutType, q.origin, linear_idx_type=q.linear_idx_type, element_size=q.element_size],
    k: KVType,
    v: KVType,
    k_rope: KRopeType,
    mask_functor: MaskType,
    valid_length: TileTensor[DType.uint32, valid_length.LayoutType, valid_length.origin, linear_idx_type=valid_length.linear_idx_type, element_size=valid_length.element_size],
    max_prompt_len: MaxPromptLenType,
    scale: Float32,
    batch_size: Int,
    ctx: DeviceContext
)