Mojo function

promote_accumulators

promote_accumulators[
    pipeline_stages: UInt,
    num_accum_pipeline_stages: UInt,
    accum_type: DType,
    accum_layout: Layout,
    a_scales_type: DType,
    b_scales_type: DType,
    b_scales_layout: Layout,
    a_scales_smem_layout: Layout,
    /,
    *,
    block_tile_shape: IndexList[3],
    mma_shape: IndexList[3],
    cta_group: Int,
    CLUSTER_SIZE: Int32
](
    b_scales: LayoutTensor[b_scales_type, b_scales_layout, MutableAnyOrigin],
    a_scales_smem_iter: LayoutTensorIter[a_scales_type, a_scales_smem_layout, MutableAnyOrigin, address_space=AddressSpace(3), alignment=128],
    c_upper_main_tile: LayoutTensor[accum_type, accum_layout, MutableAnyOrigin, address_space=AddressSpace(5), element_layout=element_layout, layout_int_type=layout_int_type, linear_idx_type=linear_idx_type, masked=masked, alignment=alignment],
    c_lower_main_tile: LayoutTensor[accum_type, accum_layout, MutableAnyOrigin, address_space=AddressSpace(5), element_layout=element_layout, layout_int_type=layout_int_type, linear_idx_type=linear_idx_type, masked=masked, alignment=alignment],
    accum_pipeline_consumer_state: PipelineState[num_accum_pipeline_stages],
    accum_full_mbar: UnsafePointer[SharedMemBarrier, address_space=AddressSpace(3), alignment=16],
    accum_empty_mbar: UnsafePointer[SharedMemBarrier, address_space=AddressSpace(3), alignment=16],
    tmem_addr: UInt32,
    mma_mbar: UnsafePointer[SharedMemBarrier, address_space=AddressSpace(3), alignment=16],
    consumer_phase: PipelineState[pipeline_stages],
    work_tile_coord: Tuple[UInt, UInt],
    elect_one_warp: Bool,
    stage_stride_cols: UInt,
    k_iter: UInt,
    problem_shape: StaticTuple[Int32, 3]
)