Mojo struct
FA4Config
@register_passable(trivial)
struct FA4Config
Fields
- MMA_M (Int)
- BM (Int)
- BN (Int)
- BK0 (Int)
- BK1 (Int)
- depth (Int)
- padded_depth (Int)
- group (Int)
- num_q_heads (Int)
- num_kv_heads (Int)
- TMEM_S1 (Int)
- TMEM_O0 (Int)
- TMEM_O1 (Int)
- TMEM_P0 (Int)
- TMEM_P1 (Int)
- TMEM_C0 (Int)
- TMEM_C1 (Int)
- tmem_used (Int)
- num_kv_stages (Int)
- num_mma_stages (Int)
- smem_used (Int)
- dtype_size (Int)
- split_m (Bool)
- swizzle_mode (TensorMapSwizzle)
Implemented traits
AnyType,
Copyable,
ImplicitlyCopyable,
Movable,
UnknownDestructibility
Aliases
__copyinit__is_trivial
alias __copyinit__is_trivial = True
__del__is_trivial
alias __del__is_trivial = True
__moveinit__is_trivial
alias __moveinit__is_trivial = True
mbar_size
alias mbar_size = DType.int64.size_of()
MMA_K
alias MMA_K = 16
num_correction_cols
alias num_correction_cols = 1
num_threads
alias num_threads = 512
sm100_smem_carveout
alias sm100_smem_carveout = (GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "B200", Vendor(2), "cuda", "blackwell", 10, "sm_100a", 148) - 1024)
sm100_tmem_cols
alias sm100_tmem_cols = 512
TMEM_S0
alias TMEM_S0 = 0
Methods
__init__
__init__(*, num_q_heads: Int, group: Int, depth: Int, dtype_size: Int, swizzle_mode: TensorMapSwizzle) -> Self
num_qo
supported
use_tmem_for_correction
correction_smem_elements
num_active_warps_per_group
num_active_threads_per_group
Was this page helpful?
Thank you! We'll create more content like this.
Thank you for helping us improve!