Mojo function

grouped_matmul_dynamic_scaled_fp8

grouped_matmul_dynamic_scaled_fp8[
    c_type: DType,
    a_type: DType,
    b_type: DType,
    a_scales_type: DType,
    b_scales_type: DType,
    a_offsets_type: DType,
    expert_ids_type: DType,
    //,
    input_scale_granularity: StringSlice[StaticConstantOrigin],
    weight_scale_granularity: StringSlice[StaticConstantOrigin],
    m_scale_granularity: Int,
    n_scale_granularity: Int,
    k_scale_granularity: Int,
    transpose_b: Bool = False,
    tokens_padded_per_expert: Bool = False,
    target: StringSlice[StaticConstantOrigin] = "cpu"
](
    c: NDBuffer[c_type, 2, MutAnyOrigin, c.shape],
    a: NDBuffer[a_type, 2, MutAnyOrigin, a.shape],
    b: NDBuffer[b_type, 3, MutAnyOrigin, b.shape],
    a_scales: NDBuffer[a_scales_type, 2, MutAnyOrigin, a_scales.shape],
    b_scales: NDBuffer[b_scales_type, 3, MutAnyOrigin, b_scales.shape],
    a_offsets: NDBuffer[a_offsets_type, 1, MutAnyOrigin, a_offsets.shape],
    expert_ids: NDBuffer[expert_ids_type, 1, MutAnyOrigin, expert_ids.shape],
    max_num_tokens_per_expert: Int,
    num_active_experts: Int,
    ctx: DeviceContext
)