Python class
KVCacheMetrics
KVCacheMetrics
class max.nn.kv_cache.KVCacheMetrics(input_tokens=0, cache_tokens=0, h2d_blocks_copied=0, d2h_blocks_copied=0, disk_blocks_written=0, disk_blocks_read=0, nixl_read_blocks=0, nixl_write_blocks=0, nixl_read_latency_total_ms=0.0, nixl_read_latency_count=0, nixl_write_latency_total_ms=0.0, nixl_write_latency_count=0, rpc_acquire_latency_total_ms=0.0, rpc_acquire_latency_count=0, rpc_read_latency_total_ms=0.0, rpc_read_latency_count=0, nixl_read_bytes=0, nixl_write_bytes=0, nixl_read_blocks_local=0, nixl_read_blocks_remote=0)
Bases: object
Metrics for the KV cache.
Tracks token usage and block transfer statistics for KV cache operations.
Parameters:
- input_tokens (int)
- cache_tokens (int)
- h2d_blocks_copied (int)
- d2h_blocks_copied (int)
- disk_blocks_written (int)
- disk_blocks_read (int)
- nixl_read_blocks (int)
- nixl_write_blocks (int)
- nixl_read_latency_total_ms (float)
- nixl_read_latency_count (int)
- nixl_write_latency_total_ms (float)
- nixl_write_latency_count (int)
- rpc_acquire_latency_total_ms (float)
- rpc_acquire_latency_count (int)
- rpc_read_latency_total_ms (float)
- rpc_read_latency_count (int)
- nixl_read_bytes (int)
- nixl_write_bytes (int)
- nixl_read_blocks_local (int)
- nixl_read_blocks_remote (int)
cache_hit_rate
property cache_hit_rate: float
Proportion of prompt tokens that were retrieved from cache.
Returns:
Ratio of cache_tokens to prompt_tokens (input_tokens + cache_tokens), or 0.0 if no tokens were processed.
cache_tokens
cache_tokens: int = 0
Number of tokens retrieved from cache (cache hits).
d2h_blocks_copied
d2h_blocks_copied: int = 0
Number of cache blocks copied from device to host.
disk_blocks_read
disk_blocks_read: int = 0
Number of cache blocks read from disk.
disk_blocks_written
disk_blocks_written: int = 0
Number of cache blocks written to disk.
h2d_blocks_copied
h2d_blocks_copied: int = 0
Number of cache blocks copied from host to device.
input_tokens
input_tokens: int = 0
Number of tokens processed as new input (cache misses).
nixl_read_blocks
nixl_read_blocks: int = 0
Number of cache blocks read via NIXL (dKV GET).
nixl_read_blocks_local
nixl_read_blocks_local: int = 0
Number of cache blocks read via NIXL from the co-located (default) block store.
nixl_read_blocks_remote
nixl_read_blocks_remote: int = 0
Number of cache blocks read via NIXL from non-default (remote) block stores.
nixl_read_bytes
nixl_read_bytes: int = 0
Total bytes transferred via NIXL READ.
nixl_read_gib_per_s
property nixl_read_gib_per_s: float
NIXL READ throughput in GiB/s.
nixl_read_latency_avg_ms
property nixl_read_latency_avg_ms: float
Average NIXL READ transfer latency in milliseconds.
nixl_read_latency_count
nixl_read_latency_count: int = 0
Number of NIXL READ transfer completions.
nixl_read_latency_total_ms
nixl_read_latency_total_ms: float = 0.0
Cumulative NIXL READ transfer latency in milliseconds.
nixl_write_blocks
nixl_write_blocks: int = 0
Number of cache blocks written via NIXL (dKV PUT).
nixl_write_bytes
nixl_write_bytes: int = 0
Total bytes transferred via NIXL WRITE.
nixl_write_gib_per_s
property nixl_write_gib_per_s: float
NIXL WRITE throughput in GiB/s.
nixl_write_latency_avg_ms
property nixl_write_latency_avg_ms: float
Average NIXL WRITE transfer latency in milliseconds.
nixl_write_latency_count
nixl_write_latency_count: int = 0
Number of NIXL WRITE transfer completions.
nixl_write_latency_total_ms
nixl_write_latency_total_ms: float = 0.0
Cumulative NIXL WRITE transfer latency in milliseconds.
prompt_tokens
property prompt_tokens: int
Total number of prompt tokens (input + cached).
Returns:
Sum of input_tokens and cache_tokens.
remote_read_ratio
property remote_read_ratio: float
Fraction of NIXL reads hitting non-default (remote) block stores.
rpc_acquire_latency_avg_ms
property rpc_acquire_latency_avg_ms: float
Average dKV acquire_blocks RPC latency in milliseconds.
rpc_acquire_latency_count
rpc_acquire_latency_count: int = 0
Number of acquire_blocks RPC calls.
rpc_acquire_latency_total_ms
rpc_acquire_latency_total_ms: float = 0.0
Cumulative dKV acquire_blocks RPC latency in milliseconds.
rpc_read_latency_avg_ms
property rpc_read_latency_avg_ms: float
Average dKV read_blocks RPC latency in milliseconds.
rpc_read_latency_count
rpc_read_latency_count: int = 0
Number of read_blocks RPC calls.
rpc_read_latency_total_ms
rpc_read_latency_total_ms: float = 0.0
Cumulative dKV read_blocks RPC latency in milliseconds.
Was this page helpful?
Thank you! We'll create more content like this.
Thank you for helping us improve!