Skip to main content

Mojo module

dispatch

Aliases

DISPATCH_HIT

alias DISPATCH_HIT = 1

DISPATCH_MISS

alias DISPATCH_MISS = 0

llama_405b_fp8_list

alias llama_405b_fp8_list = List[TuningConfigSM90](TuningConfigSM90(64, 16384, 2048, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(64, 128, 128), 8, Index(1, 1, 1), 1, False, OptionalReg[IndexList[2]](Index(128, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(128, 16384, 2048, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(1, 1, 1), 2, True, OptionalReg[IndexList[2]](Index(GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132).sm_count, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(256, 16384, 2048, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(1, 1, 1), 2, True, OptionalReg[IndexList[2]](Index(GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132).sm_count, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(512, 16384, 2048, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(1, 1, 1), 2, True, OptionalReg[IndexList[2]](Index(GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132).sm_count, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(1024, 16384, 2048, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(1, 1, 1), 2, True, OptionalReg[IndexList[2]](Index(GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132).sm_count, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(Int.__init__[Scalar[DType.index]](SIMD[DType.index, 1](max_or_inf[DType.index]())), 16384, 
2048, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(2, 1, 1), 2, True, OptionalReg[IndexList[2]](Index(8, (GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132) // 8))), MatmulSchedule(2), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(64, 2304, 16384, IndexList[3, DType.int64](64, 48, 32, Tuple[]()), Index(64, 48, 128), 8, Index(1, 1, 1), 1, False, OptionalReg[IndexList[2]](Index(GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132).sm_count, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(128, 2304, 16384, IndexList[3, DType.int64](64, 48, 32, Tuple[]()), Index(64, 48, 128), 8, Index(1, 1, 1), 1, False, OptionalReg[IndexList[2]](Index(GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132).sm_count, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(256, 2304, 16384, IndexList[3, DType.int64](64, 96, 32, Tuple[]()), Index(64, 96, 128), 4, Index(1, 1, 1), 1, False, OptionalReg[IndexList[2]](Index(GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132).sm_count, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(512, 2304, 16384, IndexList[3, DType.int64](64, 144, 32, Tuple[]()), Index(128, 144, 128), 4, Index(1, 1, 1), 2, False, OptionalReg[IndexList[2]](Index(GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132).sm_count, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(1024, 2304, 16384, IndexList[3, 
DType.int64](64, 144, 32, Tuple[]()), Index(128, 144, 128), 4, Index(1, 1, 1), 2, False, OptionalReg[IndexList[2]](Index(GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132).sm_count, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(2048, 2304, 16384, IndexList[3, DType.int64](64, 144, 32, Tuple[]()), Index(128, 144, 128), 4, Index(2, 1, 1), 2, True, OptionalReg[IndexList[2]](Index(16, 8)), MatmulSchedule(2), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(Int.__init__[Scalar[DType.index]](SIMD[DType.index, 1](max_or_inf[DType.index]())), 2304, 16384, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(2, 1, 1), 2, True, OptionalReg[IndexList[2]](None), MatmulSchedule(2), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(64, 13312, 16384, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(64, 128, 128), 8, Index(1, 1, 1), 1, False, OptionalReg[IndexList[2]](Index(128, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(128, 13312, 16384, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(1, 1, 1), 2, True, OptionalReg[IndexList[2]](None), MatmulSchedule(0), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(256, 13312, 16384, IndexList[3, DType.int64](64, 208, 32, Tuple[]()), Index(128, 208, 128), 4, Index(1, 2, 1), 2, True, OptionalReg[IndexList[2]](None), MatmulSchedule(0), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(512, 13312, 16384, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(1, 1, 1), 2, True, OptionalReg[IndexList[2]](None), MatmulSchedule(0), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(1024, 13312, 16384, IndexList[3, DType.int64](64, 128, 32, 
Tuple[]()), Index(128, 128, 128), 4, Index(1, 1, 1), 2, True, OptionalReg[IndexList[2]](None), MatmulSchedule(0), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(Int.__init__[Scalar[DType.index]](SIMD[DType.index, 1](max_or_inf[DType.index]())), 13312, 16384, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(2, 1, 1), 2, True, OptionalReg[IndexList[2]](Index(8, (GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132) // 8))), MatmulSchedule(2), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(64, 16384, 6656, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(64, 128, 128), 8, Index(1, 1, 1), 1, False, OptionalReg[IndexList[2]](Index(128, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(1024, 16384, 6656, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(1, 1, 1), 2, True, OptionalReg[IndexList[2]](None), MatmulSchedule(0), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(Int.__init__[Scalar[DType.index]](SIMD[DType.index, 1](max_or_inf[DType.index]())), 16384, 6656, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(2, 1, 1), 2, True, OptionalReg[IndexList[2]](Index(8, (GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132) // 8))), MatmulSchedule(2), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), Tuple[]())

llama_405b_fp8_table

alias llama_405b_fp8_table = Table[TuningConfigSM90](List[TuningConfigSM90](TuningConfigSM90(64, 16384, 2048, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(64, 128, 128), 8, Index(1, 1, 1), 1, False, OptionalReg[IndexList[2]](Index(128, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(128, 16384, 2048, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(1, 1, 1), 2, True, OptionalReg[IndexList[2]](Index(GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132).sm_count, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(256, 16384, 2048, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(1, 1, 1), 2, True, OptionalReg[IndexList[2]](Index(GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132).sm_count, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(512, 16384, 2048, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(1, 1, 1), 2, True, OptionalReg[IndexList[2]](Index(GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132).sm_count, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(1024, 16384, 2048, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(1, 1, 1), 2, True, OptionalReg[IndexList[2]](Index(GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132).sm_count, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(Int.__init__[Scalar[DType.index]](SIMD[DType.index, 
1](max_or_inf[DType.index]())), 16384, 2048, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(2, 1, 1), 2, True, OptionalReg[IndexList[2]](Index(8, (GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132) // 8))), MatmulSchedule(2), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(64, 2304, 16384, IndexList[3, DType.int64](64, 48, 32, Tuple[]()), Index(64, 48, 128), 8, Index(1, 1, 1), 1, False, OptionalReg[IndexList[2]](Index(GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132).sm_count, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(128, 2304, 16384, IndexList[3, DType.int64](64, 48, 32, Tuple[]()), Index(64, 48, 128), 8, Index(1, 1, 1), 1, False, OptionalReg[IndexList[2]](Index(GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132).sm_count, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(256, 2304, 16384, IndexList[3, DType.int64](64, 96, 32, Tuple[]()), Index(64, 96, 128), 4, Index(1, 1, 1), 1, False, OptionalReg[IndexList[2]](Index(GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132).sm_count, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(512, 2304, 16384, IndexList[3, DType.int64](64, 144, 32, Tuple[]()), Index(128, 144, 128), 4, Index(1, 1, 1), 2, False, OptionalReg[IndexList[2]](Index(GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132).sm_count, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), 
TuningConfigSM90(1024, 2304, 16384, IndexList[3, DType.int64](64, 144, 32, Tuple[]()), Index(128, 144, 128), 4, Index(1, 1, 1), 2, False, OptionalReg[IndexList[2]](Index(GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132).sm_count, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(2048, 2304, 16384, IndexList[3, DType.int64](64, 144, 32, Tuple[]()), Index(128, 144, 128), 4, Index(2, 1, 1), 2, True, OptionalReg[IndexList[2]](Index(16, 8)), MatmulSchedule(2), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(Int.__init__[Scalar[DType.index]](SIMD[DType.index, 1](max_or_inf[DType.index]())), 2304, 16384, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(2, 1, 1), 2, True, OptionalReg[IndexList[2]](None), MatmulSchedule(2), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(64, 13312, 16384, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(64, 128, 128), 8, Index(1, 1, 1), 1, False, OptionalReg[IndexList[2]](Index(128, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(128, 13312, 16384, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(1, 1, 1), 2, True, OptionalReg[IndexList[2]](None), MatmulSchedule(0), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(256, 13312, 16384, IndexList[3, DType.int64](64, 208, 32, Tuple[]()), Index(128, 208, 128), 4, Index(1, 2, 1), 2, True, OptionalReg[IndexList[2]](None), MatmulSchedule(0), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(512, 13312, 16384, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(1, 1, 1), 2, True, OptionalReg[IndexList[2]](None), MatmulSchedule(0), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(1024, 13312, 
16384, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(1, 1, 1), 2, True, OptionalReg[IndexList[2]](None), MatmulSchedule(0), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(Int.__init__[Scalar[DType.index]](SIMD[DType.index, 1](max_or_inf[DType.index]())), 13312, 16384, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(2, 1, 1), 2, True, OptionalReg[IndexList[2]](Index(8, (GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132) // 8))), MatmulSchedule(2), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(64, 16384, 6656, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(64, 128, 128), 8, Index(1, 1, 1), 1, False, OptionalReg[IndexList[2]](Index(128, 1)), MatmulSchedule(3), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(1024, 16384, 6656, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(1, 1, 1), 2, True, OptionalReg[IndexList[2]](None), MatmulSchedule(0), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(Int.__init__[Scalar[DType.index]](SIMD[DType.index, 1](max_or_inf[DType.index]())), 16384, 6656, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 4, Index(2, 1, 1), 2, True, OptionalReg[IndexList[2]](Index(8, (GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132) // 8))), MatmulSchedule(2), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), Tuple[]()), "llama_405b_fp8")

llama_8b_fp8_list

alias llama_8b_fp8_list = List[TuningConfigSM90](TuningConfigSM90(128, -1, -1, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(64, 128, 128), 8, Index(1, 1, 1), 1, True, OptionalReg[IndexList[2]](None), MatmulSchedule(0), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(1024, -1, -1, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 6, Index(1, 1, 1), 2, True, OptionalReg[IndexList[2]](None), MatmulSchedule(0), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(Int.__init__[Scalar[DType.index]](SIMD[DType.index, 1](max_or_inf[DType.index]())), -1, -1, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 6, Index(2, 1, 1), 2, True, OptionalReg[IndexList[2]](Index(8, (GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132).sm_count // 8))), MatmulSchedule(2), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), Tuple[]())

llama_8b_fp8_table

alias llama_8b_fp8_table = Table[TuningConfigSM90](List[TuningConfigSM90](TuningConfigSM90(128, -1, -1, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(64, 128, 128), 8, Index(1, 1, 1), 1, True, OptionalReg[IndexList[2]](None), MatmulSchedule(0), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(1024, -1, -1, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 6, Index(1, 1, 1), 2, True, OptionalReg[IndexList[2]](None), MatmulSchedule(0), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), TuningConfigSM90(Int.__init__[Scalar[DType.index]](SIMD[DType.index, 1](max_or_inf[DType.index]())), -1, -1, IndexList[3, DType.int64](64, 128, 32, Tuple[]()), Index(128, 128, 128), 6, Index(2, 1, 1), 2, True, OptionalReg[IndexList[2]](Index(8, (GPUInfo.from_family(AcceleratorArchitectureFamily(32, 2048, 233472, 65536, 1024), "H100", Vendor(2), "cuda", "hopper", 9, "sm_90a", 132).sm_count // 8))), MatmulSchedule(2), OptionalReg[Int](None), OptionalReg[RasterOrder](None)), Tuple[]()), "llama_8b_fp8")

MAX_M

alias MAX_M = Int.__init__[Scalar[DType.index]](SIMD[DType.index, 1](max_or_inf[DType.index]()))

Structs

Functions

Was this page helpful?