Mojo function
batched_quantize_dynamic_scaled_fp8
```mojo
batched_quantize_dynamic_scaled_fp8[
    out_dtype: DType,
    in_dtype: DType,
    scales_dtype: DType, //,
    input_fn: def[width: Int, alignment: Int](batch: Int, row: Int, col: Int) capturing -> SIMD[in_dtype, width],
    group_size_or_per_token: Int,
    num_cols: Int,
    pdl_level: PDLLevel = PDLLevel(),
](
    scaled_output: TileTensor[out_dtype, scaled_output.LayoutType, scaled_output.origin, address_space=scaled_output.address_space, linear_idx_type=scaled_output.linear_idx_type, element_size=scaled_output.element_size],
    scales: TileTensor[scales_dtype, scales.LayoutType, scales.origin, address_space=scales.address_space, linear_idx_type=scales.linear_idx_type, element_size=scales.element_size],
    scale_ub: Float32,
    ctx: DeviceContext,
    num_rows: Int,
    batch_size: Int,
)
```
The primary TileTensor-based implementation of batched dynamic scaled FP8 quantization: each row of the input is quantized to `out_dtype` using scales computed dynamically from the data (per group of `group_size_or_per_token` columns, or per token), with the per-group maximum clamped by `scale_ub`.
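As a rough sketch of the math this kernel performs, the following NumPy snippet shows per-group dynamic scaling: for each group of columns, the scale is derived from the group's maximum absolute value (clamped by `scale_ub`), and values are divided by that scale into the FP8 range. This is an illustration, not the Mojo API; the FP8 max of 448 assumes an e4m3-style format, and all names here are hypothetical.

```python
import numpy as np

# Assumed largest finite value of an e4m3-style FP8 format (hypothetical choice).
FP8_E4M3_MAX = 448.0

def quantize_dynamic_scaled_fp8(x, group_size, scale_ub):
    """Illustrative per-group dynamic FP8 quantization.

    Returns the quantized values (still stored as float here) and the
    per-group scales. When group_size equals the row width this degenerates
    to per-token (per-row) scaling.
    """
    rows, cols = x.shape
    assert cols % group_size == 0
    groups = x.reshape(rows, cols // group_size, group_size)
    amax = np.abs(groups).max(axis=-1)               # per-group max magnitude
    amax = np.minimum(amax, scale_ub)                # clamp by the scale upper bound
    scales = np.maximum(amax, 1e-12) / FP8_E4M3_MAX  # guard against divide-by-zero
    q = np.clip(groups / scales[..., None], -FP8_E4M3_MAX, FP8_E4M3_MAX)
    return q.reshape(rows, cols), scales
```

Dequantization multiplies each group back by its scale, which is why the kernel emits the `scales` tensor alongside the quantized output.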