Mojo struct
PagedKVCacheCollection
struct PagedKVCacheCollection[dtype_: DType, kv_params_: KVCacheStaticParams, page_size: Int, scale_dtype_: DType = DType.invalid, quantization_granularity_: Int = 1]
Fields
- scales (OptionalReg[TileTensor[PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].scale_dtype, Layout[*?, *?], MutAnyOrigin]])
- kv_cache_scales_dynamic_shape (IndexList[4])
- kv_cache_scales_dynamic_strides (IndexList[4])
- blocks (PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].blocks_tt_type)
- cache_lengths (PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].CacheType.cache_lengths_tt_type)
- lookup_table (PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].CacheType.lookup_table_tt_type)
- max_seq_length (UInt32)
- max_cache_length (UInt32)
- kv_cache_dynamic_shape (IndexList[4])
- kv_cache_dynamic_strides (IndexList[4])
Implemented traits
AnyType,
Copyable,
ImplicitlyCopyable,
ImplicitlyDestructible,
KVCollectionT,
Movable
comptime members
blocks_layout
comptime blocks_layout = Layout.row_major(PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].blocks_shape)
blocks_shape
comptime blocks_shape = IntTuple(-1, 2 if not PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].kv_params.is_mla.__bool__() else 1, -1, page_size, PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].kv_params, PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].kv_params)
blocks_tt_layout
comptime blocks_tt_layout = Layout[*?, *?]
blocks_tt_type
comptime blocks_tt_type = TileTensor[PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].dtype, Layout[*?, *?], MutAnyOrigin]
CacheType
comptime CacheType = PagedKVCache[PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].dtype, PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].kv_params, page_size, PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].scale_dtype, quantization_granularity_]
dtype
comptime dtype = dtype_
head_dim_granularity
comptime head_dim_granularity = ceildiv(PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].kv_params.head_size, PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].CacheType.quantization_granularity)
kv_params
comptime kv_params = kv_params_
name_str
comptime name_str = "paged"
scale_dtype
comptime scale_dtype = scale_dtype_
scales_layout
comptime scales_layout = Layout.row_major(PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].scales_shape)
scales_shape
comptime scales_shape = IntTuple(-1, 2 if not PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].kv_params.is_mla.__bool__() else 1, -1, page_size, PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].kv_params, PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].head_dim_granularity)
scales_tt_layout
comptime scales_tt_layout = Layout[*?, *?]
scales_tt_type
comptime scales_tt_type = TileTensor[PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].scale_dtype, Layout[*?, *?], MutAnyOrigin]
Methods
__init__
__init__(out self, blocks: LayoutTensor[PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].dtype, Layout.row_major[6](), MutAnyOrigin], cache_lengths: LayoutTensor[DType.uint32, Layout(IntTuple(-1)), ImmutAnyOrigin], lookup_table: LayoutTensor[DType.uint32, Layout.row_major[2](), ImmutAnyOrigin], max_seq_length: UInt32, max_cache_length: UInt32, scales: OptionalReg[LayoutTensor[PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].scale_dtype, Layout.row_major[6](), MutAnyOrigin]] = None)
Construct from LayoutTensor params (MOGG boundary).
__init__(out self, blocks: TileTensor[PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].dtype, Layout[*?, *?], MutAnyOrigin], cache_lengths: TileTensor[DType.uint32, Layout[*?, *?], ImmutAnyOrigin], lookup_table: TileTensor[DType.uint32, Layout[*?, *?], ImmutAnyOrigin], max_seq_length: UInt32, max_cache_length: UInt32, scales: OptionalReg[TileTensor[PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].scale_dtype, Layout[*?, *?], MutAnyOrigin]] = None)
Construct from TileTensor fields directly.
get_key_cache
get_key_cache(self, layer_idx: Int) -> PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].CacheType
Returns:
PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].CacheType
get_value_cache
get_value_cache(self, layer_idx: Int) -> PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].CacheType
Returns:
PagedKVCacheCollection[dtype_, kv_params_, page_size, scale_dtype_, quantization_granularity_].CacheType
cache_length
Was this page helpful?
Thank you! We'll create more content like this.
Thank you for helping us improve!