Mojo function

consumer_main_loop

consumer_main_loop[accum_type: DType, c_type: DType, a_type: DType, b_type: DType, sfa_dtype: DType, sfb_dtype: DType, a_smem_layout: Layout, b_smem_layout: Layout, sfa_smem_layout: Layout, sfb_smem_layout: Layout, a_swizzle: TensorMapSwizzle, b_swizzle: TensorMapSwizzle, transpose_b: Bool, pipeline_stages: Int, scaling_kind: UMMAKind, /, *, block_tile_shape: IndexList[3], mma_shape: IndexList[3], SFA_NUM_COLS: Int, SFB_NUM_COLS: Int, cta_group: Int = 1, cluster_shape: IndexList[3] = Index(1, 1, 1), k_group_size: Scalar[DType.uint] = 1](tmem_addr: UInt32, sfa_tmem: UInt32, sfb_tmem: UInt32, a_smem_iter: LayoutTensorIter[a_type, a_smem_layout, MutAnyOrigin, address_space=AddressSpace.SHARED, alignment=128], b_smem_iter: LayoutTensorIter[b_type, b_smem_layout, MutAnyOrigin, address_space=AddressSpace.SHARED, alignment=128], sfa_smem_iter: LayoutTensorIter[sfa_dtype, sfa_smem_layout, MutAnyOrigin, address_space=AddressSpace.SHARED, alignment=128], sfb_smem_iter: LayoutTensorIter[sfb_dtype, sfb_smem_layout, MutAnyOrigin, address_space=AddressSpace.SHARED, alignment=128], load_mma_pipeline: ProducerConsumerPipeline[pipeline_stages], mma_op: MmaOpSM100_BlockScaled_SS[c_type, a_type, b_type, sfa_dtype, sfb_dtype, scaling_kind, block_tile_shape, mma_shape, accum_type=accum_type, cta_group=cta_group, cluster_shape=cluster_shape, a_swizzle=a_swizzle, b_swizzle=b_swizzle, transpose_b=transpose_b], elect_one_warp: Bool, iter_idx: UInt32, k_start: UInt32, work_tile_coord: Tuple[UInt, UInt])