Mojo function

rms_norm_fused_residual_add_gpu

```mojo
rms_norm_fused_residual_add_gpu[
    dtype: DType,
    rank: Int,
    //,
    input_fn: fn[width: Int, rank: Int](IndexList[rank]) capturing -> SIMD[dtype, width],
    residual_input_fn: fn[width: Int, rank: Int](IndexList[rank]) capturing -> SIMD[dtype, width],
    output_residual_fn: fn[width: Int, alignment: Int](IndexList[rank], SIMD[dtype, width]) capturing -> None,
    output_fn: fn[width: Int, alignment: Int](IndexList[rank], SIMD[dtype, width]) capturing -> None,
    multiply_before_cast: Bool
](
    shape: IndexList[rank, element_type=shape.element_type],
    gamma1: TileTensor[dtype, gamma1.LayoutType, gamma1.origin, address_space=gamma1.address_space, linear_idx_type=gamma1.linear_idx_type, element_shape_types=gamma1.element_shape_types],
    epsilon1: Scalar[dtype],
    weight_offset1: Scalar[dtype],
    gamma2: TileTensor[dtype, gamma2.LayoutType, gamma2.origin, address_space=gamma2.address_space, linear_idx_type=gamma2.linear_idx_type, element_shape_types=gamma2.element_shape_types],
    epsilon2: Scalar[dtype],
    weight_offset2: Scalar[dtype],
    ctx: DeviceContext
)
```