Mojo function

load_AB

load_AB[
    a_type: DType,
    b_type: DType,
    a_layout: Layout,
    b_layout: Layout,
    a_desc_layout: Layout,
    b_desc_layout: Layout,
    a_smem_layout: Layout,
    b_smem_layout: Layout,
    num_pipeline_stages: UInt,
    /, *,
    block_tile_shape: IndexList[3],
    mma_shape: IndexList[3],
    cta_group: Int = 1
](
    a_tma_op: TMATensorTile[a_type, a_layout, a_desc_layout],
    b_tma_op: TMATensorTile[b_type, b_layout, b_desc_layout],
    a_smem: LayoutTensorIter[a_type, a_smem_layout, MutableAnyOrigin, address_space=AddressSpace(3), alignment=128],
    b_smem: LayoutTensorIter[b_type, b_smem_layout, MutableAnyOrigin, address_space=AddressSpace(3), alignment=128],
    mma_mbar: UnsafePointer[SharedMemBarrier, address_space=AddressSpace(3), alignment=16],
    tma_mbar: UnsafePointer[SharedMemBarrier, address_space=AddressSpace(3), alignment=16],
    producer_phase: PipelineState[num_pipeline_stages],
    peer_cta_coord: Tuple[UInt, UInt, UInt],
    work_tile_coord: Tuple[UInt, UInt],
    a_multicast_mask: UInt16,
    b_multicast_mask: UInt16,
    iter_idx: UInt,
    elect_one_cta: Bool
)