Python module

config

Standardized config for Pipeline Inference.

HuggingFaceRepo

class max.pipelines.config.HuggingFaceRepo(repo_id: 'str', trust_remote_code: 'bool' = False, repo_type: 'Optional[RepoType]' = None)

download()

download(filename: str, force_download: bool = False) → Path

encoding_for_file()

encoding_for_file(file: str | Path) → SupportedEncoding

file_exists()

file_exists(filename: str) → bool

files_for_encoding()

files_for_encoding(encoding: SupportedEncoding, weights_format: WeightsFormat | None = None, alternate_encoding: SupportedEncoding | None = None) → dict[max.pipelines.config.WeightsFormat, list[pathlib.Path]]

formats_available

property formats_available: list[max.pipelines.config.WeightsFormat]

info

property info: ModelInfo

repo_id

repo_id: str

repo_type

repo_type: RepoType | None = None

size_of()

size_of(filename: str) → int | None

supported_encodings

property supported_encodings: list[max.pipelines.config.SupportedEncoding]

trust_remote_code

trust_remote_code: bool = False

weight_files

property weight_files: dict[max.pipelines.config.WeightsFormat, list[str]]
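
A minimal usage sketch of the members above; the repo ID is illustrative, and an online repository is assumed:

```python
from max.pipelines.config import HuggingFaceRepo, SupportedEncoding

# Point at a Hugging Face repository (the repo ID here is only illustrative).
repo = HuggingFaceRepo(repo_id="modularai/llama-3.1", trust_remote_code=False)

# Inspect what the repository offers before downloading anything.
print(repo.formats_available)    # e.g. [WeightsFormat.safetensors, ...]
print(repo.supported_encodings)  # e.g. [SupportedEncoding.bfloat16, ...]

# Check for a specific file and fetch it into the local cache.
if repo.file_exists("config.json"):
    local_path = repo.download("config.json")
    print(local_path, repo.size_of("config.json"))

# Map an encoding to the weight files that provide it.
for fmt, paths in repo.files_for_encoding(SupportedEncoding.bfloat16).items():
    print(fmt, paths)
```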

PipelineConfig

class max.pipelines.config.PipelineConfig(model_path: 'str' = '', huggingface_repo_id: 'str' = '', engine: 'Optional[PipelineEngine]' = None, architecture: 'Optional[str]' = None, weight_path: 'list[Path]' = <factory>, device_specs: 'list[DeviceSpec]' = <factory>, quantization_encoding: 'Optional[SupportedEncoding]' = None, serialized_model_path: 'Optional[str]' = None, save_to_serialized_model_path: 'Optional[str]' = None, max_length: 'Optional[int]' = None, max_new_tokens: 'int' = -1, max_batch_size: 'Optional[int]' = None, max_ce_batch_size: 'int' = 32, enable_chunked_prefill: 'bool' = True, enable_in_flight_batching: 'bool' = False, cache_strategy: 'KVCacheStrategy' = model_default, max_num_steps: 'int' = -1, pad_to_multiple_of: 'int' = 2, kv_cache_page_size: 'int' = 128, enable_prefix_caching: 'bool' = False, device_memory_utilization: 'float' = 0.9, target_num_new_tokens: 'Optional[int]' = None, top_k: 'int' = 1, enable_structured_output: 'bool' = False, trust_remote_code: 'bool' = False, force_download: 'bool' = False, enable_echo: 'bool' = False, rope_type: 'Optional[RopeType]' = None, pool_embeddings: 'bool' = True, _huggingface_config: 'Optional[AutoConfig]' = None, _devices: 'list[Device]' = <factory>, _weights_converter: 'Optional[type[WeightsConverter]]' = None, _weights_repo_id: 'Optional[str]' = None, _available_cache_memory: 'Optional[int]' = None, _quant_config: 'Optional[QuantizationConfig]' = None, max_cache_batch_size: 'Optional[int]' = None, gpu_profiling: 'str' = 'false', use_experimental_kernels: 'str' = 'false')
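
A construction sketch using keyword arguments from the signature above; the model path is illustrative:

```python
from max.pipelines.config import PipelineConfig, SupportedEncoding

# Build a config programmatically. Unlisted fields keep their defaults.
config = PipelineConfig(
    model_path="modularai/llama-3.1",             # illustrative repo ID
    quantization_encoding=SupportedEncoding.bfloat16,
    max_batch_size=8,
    max_length=4096,
    top_k=1,                                      # greedy sampling
    enable_chunked_prefill=True,
)
```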

architecture

architecture: str | None = None

Model architecture to run.

cache_dtype

property cache_dtype: DType

cache_strategy

cache_strategy: KVCacheStrategy = 'model_default'

The cache strategy to use. This defaults to model_default, which will set the cache strategy based on the default strategy for the architecture requested.

You can also force the engine to use a specific caching strategy: naive | continuous | paged.

device_memory_utilization

device_memory_utilization: float = 0.9

The fraction of available device memory that the process should consume.

This is used to inform the size of the KVCache workspace: kv_cache_workspace = (total_free_memory * device_memory_utilization) - model_weights_size
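
For a rough sense of the arithmetic, with assumed numbers (24 GiB free device memory, 16 GiB of weights, the default utilization of 0.9):

```python
# Illustrative numbers only.
total_free_memory = 24 * 1024**3          # 24 GiB free on the device
model_weights_size = 16 * 1024**3         # 16 GiB of weights
device_memory_utilization = 0.9           # the default

kv_cache_workspace = int(total_free_memory * device_memory_utilization) - model_weights_size
print(kv_cache_workspace / 1024**3)       # ~5.6 GiB left for the KVCache
```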

device_specs

device_specs: list[max.driver.driver.DeviceSpec]

Devices to run inference upon. This option is not documented in help() as it shouldn’t be used directly via the CLI entrypoint.

devices

property devices: list[max._core.driver.Device]

Initialize and return a list of devices, given a list of device specs.

download_weights()

download_weights() → None

dtype

property dtype: DType

enable_chunked_prefill

enable_chunked_prefill: bool = True

Enable chunked prefill to split context encoding requests into multiple chunks based on ‘target_num_new_tokens’.

enable_echo

enable_echo: bool = False

Whether the model should be built with echo capabilities.

enable_in_flight_batching

enable_in_flight_batching: bool = False

When enabled, prioritizes token generation by batching it with context encoding requests. Requires chunked prefill.

enable_prefix_caching

enable_prefix_caching: bool = False

Whether to enable prefix caching for the paged attention KVCache.

enable_structured_output

enable_structured_output: bool = False

Enable structured generation/guided decoding for the server. This allows the user to pass a JSON schema in the response_format field, which the LLM will adhere to.
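
As a hedged sketch, a client payload might look like the following, assuming an OpenAI-style response_format field; the exact schema accepted by the server may differ:

```python
# Assumed request shape when enable_structured_output is on; the model name
# and the response_format contract here are illustrative, not definitive.
payload = {
    "model": "modularai/llama-3.1",
    "messages": [{"role": "user", "content": "Name a city and its country."}],
    "response_format": {
        "type": "json_schema",
        "json_schema": {
            "schema": {
                "type": "object",
                "properties": {
                    "city": {"type": "string"},
                    "country": {"type": "string"},
                },
                "required": ["city", "country"],
            }
        },
    },
}
```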

engine

engine: PipelineEngine | None = None

Engine backend to use for serving: 'max' for the MAX engine, or 'huggingface' as a fallback option for improved model coverage.

finalize_encoding_config()

finalize_encoding_config()

Depending on the encoding picked, retrieves additional parameters from the Hugging Face config.

force_download

force_download: bool = False

Whether to force download a given file if it’s not already present in the local cache.

gpu_profiling

gpu_profiling: str = 'false'

Whether to enable GPU profiling of the model.

graph_quantization_encoding

property graph_quantization_encoding: QuantizationEncoding | None

Converts the CLI encoding to a MAX graph quantization encoding.

  • Returns:

    The graph quantization encoding corresponding to the CLI encoding.

  • Raises:

    ValueError – If no CLI encoding was specified.
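
A small sketch of guarding against the unset case:

```python
from max.pipelines.config import PipelineConfig

config = PipelineConfig(model_path="modularai/llama-3.1")  # illustrative repo ID

try:
    encoding = config.graph_quantization_encoding
except ValueError:
    # No CLI encoding was specified, so there is nothing to convert.
    encoding = None
```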

help()

static help() → dict[str, str]

huggingface_config

property huggingface_config: AutoConfig

Given the model_path, return the Hugging Face Config.

huggingface_repo_id

huggingface_repo_id: str = ''

DEPRECATED: repo_id of a Hugging Face model repository to use. Use model_path instead.

huggingface_weights_repo()

huggingface_weights_repo() → HuggingFaceRepo

kv_cache_page_size

kv_cache_page_size: int = 128

The number of tokens in a single page in the paged KVCache.

load_weights()

load_weights() → Weights
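
A sketch of the download-then-load flow (repo ID illustrative):

```python
from max.pipelines.config import PipelineConfig

config = PipelineConfig(model_path="modularai/llama-3.1")  # illustrative repo ID

config.download_weights()        # fetch the weight files into the local cache
weights = config.load_weights()  # returns a Weights object for graph construction
```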

max_batch_size

max_batch_size: int | None = None

Maximum batch size to execute with the model. This is set to 1 to minimize memory consumption for the base case of running a local server to try out MAX. For server deployments, this value should be set higher based on server capacity.

max_cache_batch_size

max_cache_batch_size: int | None = None

DEPRECATED: The maximum cache batch size to use for the model. Use max_batch_size instead.

max_ce_batch_size

max_ce_batch_size: int = 32

Maximum cache size to reserve for a single context encoding batch. The actual limit is the lesser of this and max_batch_size.

max_length

max_length: int | None = None

Maximum sequence length of the model.

max_new_tokens

max_new_tokens: int = -1

Maximum number of new tokens to generate during a single inference pass of the model.

max_num_steps

max_num_steps: int = -1

The number of steps to run for multi-step scheduling. -1 specifies a default value based on configuration and platform. Ignored for models which are not auto-regressive (e.g. embedding models).

model_path

model_path: str = ''

repo_id of a Hugging Face model repository to use.

pad_to_multiple_of

pad_to_multiple_of: int = 2

Pad input tensors to a multiple of the provided value.

pool_embeddings

pool_embeddings: bool = True

Whether to pool embedding outputs.

quantization_encoding

quantization_encoding: SupportedEncoding | None = None

Weight encoding type.

rope_type

rope_type: RopeType | None = None

Force using a specific rope type: none | normal | neox. Only matters for GGUF weights.

sampling_params

property sampling_params: SamplingParams

save_to_serialized_model_path

save_to_serialized_model_path: str | None = None

If specified, tries to save a serialized model to this path.

serialized_model_path

serialized_model_path: str | None = None

If specified, tries to load a serialized model from this path.

target_num_new_tokens

target_num_new_tokens: int | None = None

The target number of un-encoded tokens to include in each batch. If not set, this will be set to a best-guess optimal value based on model, hardware, and available memory.

top_k

top_k: int = 1

Limits the sampling to the K most probable tokens. This defaults to 1, which enables greedy sampling.

trust_remote_code

trust_remote_code: bool = False

Whether to allow custom modeling files from Hugging Face.

update_architecture()

update_architecture() → None

use_experimental_kernels

use_experimental_kernels: str = 'false'

weight_path

weight_path: list[pathlib.Path]

Optional path or URL of the model weights to use.

weights_format

property weights_format: WeightsFormat

Identify which format the weights are expected in.

weights_size()

weights_size() → int
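
A sketch combining weights_format and weights_size(); the repo ID is illustrative, and weights_size() is assumed to report the combined size of the weight files in bytes:

```python
from max.pipelines.config import PipelineConfig, WeightsFormat

config = PipelineConfig(model_path="modularai/llama-3.1")  # illustrative repo ID

if config.weights_format is WeightsFormat.gguf:
    print("Expecting GGUF weight files.")

print(config.weights_size())  # assumed: total bytes across the weight files
```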

PipelineEngine

class max.pipelines.config.PipelineEngine(value, names=<not given>, *values, module=None, qualname=None, type=None, start=1, boundary=None)

HUGGINGFACE

HUGGINGFACE = 'huggingface'

MAX

MAX = 'max'

RepoType

class max.pipelines.config.RepoType(value, names=<not given>, *values, module=None, qualname=None, type=None, start=1, boundary=None)

local

local = 'local'

online

online = 'online'

RopeType

class max.pipelines.config.RopeType(value, names=<not given>, *values, module=None, qualname=None, type=None, start=1, boundary=None)

neox

neox = 'neox'

none

none = 'none'

normal

normal = 'normal'

SamplingParams

class max.pipelines.config.SamplingParams(top_k: 'int', enable_structured_output: 'bool', in_dtype: 'DType', out_dtype: 'DType')

enable_structured_output

enable_structured_output: bool

in_dtype

in_dtype: DType

out_dtype

out_dtype: DType

top_k

top_k: int
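
A minimal construction sketch, assuming DType is importable from max.dtype:

```python
from max.dtype import DType  # assumed import path for DType
from max.pipelines.config import SamplingParams

# Greedy decoding with float32 logits in and out.
params = SamplingParams(
    top_k=1,
    enable_structured_output=False,
    in_dtype=DType.float32,
    out_dtype=DType.float32,
)
```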

SupportedEncoding

class max.pipelines.config.SupportedEncoding(value, names=<not given>, *values, module=None, qualname=None, type=None, start=1, boundary=None)

All possible encodings which may be supported by a particular model.

bfloat16

bfloat16 = 'bfloat16'

cache_dtype

property cache_dtype: DType

The dtype that must be used in the kvcache for correctness.

dtype

property dtype: DType

The underlying model dtype associated with a quantization_encoding.

float32

float32 = 'float32'

gptq

gptq = 'gptq'

parse_from_file_name()

classmethod parse_from_file_name(name: str)

q4_0

q4_0 = 'q4_0'

q4_k

q4_k = 'q4_k'

q6_k

q6_k = 'q6_k'

quantization_encoding

property quantization_encoding: QuantizationEncoding | None

supported_on()

supported_on(device_spec: DeviceSpec) → bool

Returns whether this quantization encoding is supported on a device.
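
A compatibility-check sketch, assuming DeviceSpec is importable from max.driver and exposes cpu()/accelerator() helper constructors:

```python
from max.driver import DeviceSpec  # assumed import, per the DeviceSpec annotations above
from max.pipelines.config import SupportedEncoding

encoding = SupportedEncoding.bfloat16

# cpu()/accelerator() are assumed helpers; adjust to however DeviceSpec
# instances are created in your MAX version.
for spec in (DeviceSpec.cpu(), DeviceSpec.accelerator(id=0)):
    print(spec, encoding.supported_on(spec))

# Related dtype information derived from the encoding.
print(encoding.dtype, encoding.cache_dtype, encoding.quantization_encoding)
```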

WeightsFormat

class max.pipelines.config.WeightsFormat(value, names=<not given>, *values, module=None, qualname=None, type=None, start=1, boundary=None)

gguf

gguf = 'gguf'

pytorch

pytorch = 'pytorch'

safetensors

safetensors = 'safetensors'