Mojo function

rms_norm_gpu

rms_norm_gpu[
    dtype: DType,
    rank: Int, //,
    input_fn: def[width: Int, rank: Int](IndexList[rank]) capturing -> SIMD[dtype, width],
    output_fn: def[width: Int, alignment: Int](IndexList[rank], SIMD[dtype, width]) capturing -> None,
    multiply_before_cast: Bool
](
    shape: IndexList[rank, element_type=shape.element_type],
    gamma: TileTensor[dtype, gamma.LayoutType, gamma.origin, address_space=gamma.address_space, linear_idx_type=gamma.linear_idx_type, element_size=gamma.element_size],
    epsilon: Scalar[dtype],
    weight_offset: Scalar[dtype],
    ctx: DeviceContext
)