Mojo struct
ConvDirectNHWC
struct ConvDirectNHWC[
    input_mut: Bool,
    filter_mut: Bool,
    conv_attr_rank: Int, //,
    input_layout: Layout,
    filter_layout: Layout,
    output_layout: Layout,
    input_origin: Origin[input_mut],
    filter_origin: Origin[filter_mut],
    output_origin: MutOrigin,
    input_type: DType,
    filter_type: DType,
    output_type: DType,
    filter_packed: Bool,
    conv_attr: ConvInfoStatic[conv_attr_rank],
    elementwise_epilogue: OptionalReg[fn[rank: Int](coords: IndexList[rank], f_size: Int) capturing -> None] = None,
]
Implements the outer loops for direct convolution. N, HO, and WO are collapsed into a single dimension, n_ho_wo, and then n_ho_wo, C, and F are tiled. The tile factors for C and F are chosen by a heuristic that prioritizes C; n_ho_wo is tiled by the micro kernel's height.
If n_ho_wo is large enough to spill the LLC, it may additionally need to be tiled as the outermost loop, with a factor that fits in the LLC.
Assumes F is at least divisible by simd_size.
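For orientation, here is a minimal, self-contained sketch of that loop nest. Every name in it is a hypothetical stand-in, not the kernel's actual code: the real implementation picks `c_tile` and `f_tile` via its heuristic, and the micro kernel body is elided.

```mojo
# Hypothetical sketch of the outer loop structure described above.
# `c_tile` and `f_tile` stand in for the heuristic's choices; none of
# these names come from the library.
fn outer_loops_sketch(
    n_ho_wo: Int,            # N, HO, WO collapsed into one dimension
    c: Int,                  # input channels
    f: Int,                  # filters; assumed divisible by simd_size
    c_tile: Int,
    f_tile: Int,
    micro_kernel_height: Int,
):
    for c_ofs in range(0, c, c_tile):
        # C is a reduction dimension: each C tile accumulates partial
        # sums into the same output elements.
        for f_ofs in range(0, f, f_tile):
            # n_ho_wo is stepped by the micro kernel's height.
            for i in range(0, n_ho_wo, micro_kernel_height):
                # A micro kernel would update an output tile of shape
                # micro_kernel_height x f_tile here.
                pass


fn main():
    # Illustrative sizes only.
    outer_loops_sketch(32 * 56 * 56, 64, 128, 16, 64, 6)
```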
Fields
- output (LayoutTensor[output_type, output_layout, output_origin]):
- input (LayoutTensor[input_type, input_layout, input_origin]):
- filter (LayoutTensor[filter_type, filter_layout, filter_origin]):
- conv_shape (ConvShape[conv_attr_rank]):
- partition (ConvPartition):
- cf_tile_size (IndexList[2]):
Implemented traits
AnyType,
Copyable,
ImplicitlyCopyable,
Movable,
UnknownDestructibility
Aliases
__copyinit__is_trivial
alias __copyinit__is_trivial = True
__del__is_trivial
alias __del__is_trivial = True
__moveinit__is_trivial
alias __moveinit__is_trivial = True
packed_and_fully_static
alias packed_and_fully_static = conv_attr.all_known[conv_attr_rank]() and input_layout.shape.all_known[1, input_layout.rank()]() and output_layout.shape.all_known[1, output_layout.rank()]() and filter_layout.shape.all_known() and filter_packed
Methods
run
static run(output: LayoutTensor[output_type, output_layout, output_origin], input: LayoutTensor[input_type, input_layout, input_origin], filter: LayoutTensor[filter_type, filter_layout, filter_origin], conv_shape: ConvShape[conv_attr_rank])
is_new_c_accum
update_output_tile_no_padding
update_output_tile_no_padding[micro_kernel_height: Int, micro_kernel_width: Int, c_fully_cached: Bool, has_residual: Bool, last_c_tile: Bool](self, n: Int, f_tile_offset: Int, f_tile_size: Int, c_tile_offset: Int, c_tile_size: Int, output_flat_coord: Int)
output_space_flat_loop
output_space_flat_loop[micro_kernel_f_size: Int, has_residual: Bool, last_c_tile: Bool](self, n: Int, f_tile_offset: Int, f_tile_size: Int, c_tile_offset: Int, c_tile_size: Int)
output_space_loop
output_space_loop[micro_kernel_height: Int, micro_kernel_width: Int, has_residual: Bool, last_c_tile: Bool](self, n: Int, f_tile_offset: Int, f_tile_size: Int, c_tile_offset: Int, c_tile_size: Int)
output_space_loop_1d
output_space_loop_1d[micro_kernel_height: Int, micro_kernel_width: Int, has_residual: Bool, last_c_tile: Bool, output_dt: DType, input_dt: DType, filter_dt: DType](self, output: LegacyUnsafePointer[Scalar[output_dt]], input: LegacyUnsafePointer[Scalar[input_dt]], filter: LegacyUnsafePointer[Scalar[filter_dt]], n: Int, first_c_tile_in_group: Bool, c_tile_size: Int, f_tile_offset: Int, f_tile_size: Int, left_pad_impact_end: Int, right_pad_impact_start: Int)
output_space_loop_2d
output_space_loop_2d[micro_kernel_height: Int, micro_kernel_width: Int, has_residual: Bool, last_c_tile: Bool, output_dt: DType, input_dt: DType, filter_dt: DType](self, output: LegacyUnsafePointer[Scalar[output_dt]], input: LegacyUnsafePointer[Scalar[input_dt]], filter: LegacyUnsafePointer[Scalar[filter_dt]], n: Int, first_c_tile_in_group: Bool, c_tile_size: Int, f_tile_offset: Int, f_tile_size: Int, left_pad_impact_end: Int, right_pad_impact_start: Int)
output_space_loop_3d
output_space_loop_3d[micro_kernel_height: Int, micro_kernel_width: Int, has_residual: Bool, last_c_tile: Bool, output_dt: DType, input_dt: DType, filter_dt: DType](self, output: LegacyUnsafePointer[Scalar[output_dt]], input: LegacyUnsafePointer[Scalar[input_dt]], filter: LegacyUnsafePointer[Scalar[filter_dt]], n: Int, first_c_tile_in_group: Bool, c_tile_size: Int, f_tile_offset: Int, f_tile_size: Int, left_pad_impact_end: Int, right_pad_impact_start: Int)
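The `left_pad_impact_end` and `right_pad_impact_start` arguments above are not described on this page. A plausible reading, which is an assumption rather than something this reference states, is that they bracket the output positions whose input window overlaps the left or right padding, letting the loop bodies take a padding-free fast path in between. Under that assumption, the bounds for one spatial dimension would be computed roughly like this:

```mojo
from math import ceildiv


# Hypothetical helpers (assumptions, not part of this API). For one
# spatial dimension, output index `o` reads input positions
# o*stride - pad_left + s*dilation for s in [0, S).
fn left_pad_impact_end(pad_left: Int, stride: Int) -> Int:
    # First output index whose window no longer touches the left
    # padding, i.e. the smallest o with o*stride - pad_left >= 0.
    return ceildiv(pad_left, stride)


fn right_pad_impact_start(
    W: Int, S: Int, stride: Int, dilation: Int, pad_left: Int
) -> Int:
    # First output index whose window reaches into the right padding,
    # i.e. the smallest o with o*stride - pad_left + (S-1)*dilation >= W.
    return ceildiv(W + pad_left - (S - 1) * dilation, stride)


fn main():
    # Example: W=56, S=3, stride=1, dilation=1, pad_left=1.
    print(left_pad_impact_end(1, 1))               # 1
    print(right_pad_impact_start(56, 3, 1, 1, 1))  # 55
```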