Skip to main content
Log in

Python module

config

Standardized config for Pipeline Inference.

HuggingFaceRepo

class max.pipelines.config.HuggingFaceRepo(repo_id: 'str', trust_remote_code: 'bool' = False, repo_type: 'Optional[RepoType]' = None)

download()

download(filename: str, force_download: bool = False) → Path

encoding_for_file()

encoding_for_file(file: str | Path) → SupportedEncoding

file_exists()

file_exists(filename: str) → bool

files_for_encoding()

files_for_encoding(encoding: SupportedEncoding, weights_format: WeightsFormat | None = None, alternate_encoding: SupportedEncoding | None = None) → dict[max.pipelines.config.WeightsFormat, list[pathlib.Path]]

formats_available

property formats_available*: list[max.pipelines.config.WeightsFormat]*

info

property info*: ModelInfo*

repo_id

repo_id*: str*

repo_type

repo_type*: RepoType | None* = None

size_of()

size_of(filename: str) → int | None

supported_encodings

property supported_encodings*: list[max.pipelines.config.SupportedEncoding]*

trust_remote_code

trust_remote_code*: bool* = False

weight_files

property weight_files*: dict[max.pipelines.config.WeightsFormat, list[str]]*

PipelineConfig

class max.pipelines.config.PipelineConfig(huggingface_repo_id: 'str', engine: 'Optional[PipelineEngine]' = None, architecture: 'Optional[str]' = None, weight_path: 'list[Path]' = <factory>, device_specs: 'list[DeviceSpec]' = <factory>, quantization_encoding: 'Optional[SupportedEncoding]' = None, serialized_model_path: 'Optional[str]' = None, save_to_serialized_model_path: 'Optional[str]' = None, max_length: 'int' = 512, max_new_tokens: 'int' = -1, max_cache_batch_size: 'int' = 1, max_ce_batch_size: 'int' = 32, cache_strategy: 'KVCacheStrategy' = continuous, max_num_steps: 'int' = 1, pad_to_multiple_of: 'int' = 2, kv_cache_page_size: 'int' = 512, enable_prefix_caching: 'bool' = False, gpu_memory_utilization: 'float' = 0.9, top_k: 'Optional[int]' = None, trust_remote_code: 'bool' = False, force_download: 'bool' = False, enable_echo: 'bool' = False, rope_type: 'Optional[RopeType]' = None, _huggingface_config: 'Optional[AutoConfig]' = None, _devices: 'list[Device]' = <factory>, _weights_converter: 'Optional[type[WeightsConverter]]' = None, _weights_repo_id: 'Optional[str]' = None)

architecture

architecture*: str | None* = None

Model architecture to run.

cache_strategy

cache_strategy*: KVCacheStrategy* = 'continuous'

Force using a specific cache strategy, ‘naive’ or ‘continuous’.

device

property device*: Device*

Initialize and return a single device, given a single device spec.

device_specs

device_specs*: list[max.driver.driver.DeviceSpec]*

Devices to run inference upon.

devices

property devices*: list[max.driver.driver.Device]*

Initialize and return a list of devices, given a list of device specs.

download_weights()

download_weights() → None

dtype

property dtype*: DType*

enable_echo

enable_echo*: bool* = False

Whether the model should be built with echo capabilities.

enable_prefix_caching

enable_prefix_caching*: bool* = False

Whether to enable prefix caching for the paged attention KVCache.

engine

engine*: PipelineEngine | None* = None

Engine backend to use for serving: ‘max’ for the MAX engine, or ‘huggingface’ as a fallback option for improved model coverage.

force_download

force_download*: bool* = False

Whether to force download a given file even if it’s already present in the local cache.

gpu_memory_utilization

gpu_memory_utilization*: float* = 0.9

The fraction of available device memory that the process should consume.

This is used to inform the size of the KVCache workspace: `kv_cache_workspace = (total_free_memory * gpu_memory_utilization) - model_weights_size`

help()

static help() → dict[str, str]

huggingface_config

property huggingface_config*: AutoConfig*

Given the huggingface_repo_id, return the HuggingFace Config.

huggingface_repo_id

huggingface_repo_id*: str*

repo_id of a huggingface model repository to use.

huggingface_weights_repo()

huggingface_weights_repo() → HuggingFaceRepo

kv_cache_page_size

kv_cache_page_size*: int* = 512

The number of tokens in a single page in the paged KVCache.

load_weights()

load_weights() → Weights

max_cache_batch_size

max_cache_batch_size*: int* = 1

Maximum cache size to reserve for a single batch. This defaults to 1 to minimize memory consumption in the base case, in which a person is running a local server to test out MAX. For users launching in a server scenario, this value should be set higher based on server capacity.

max_ce_batch_size

max_ce_batch_size*: int* = 32

Maximum cache size to reserve for a single context encoding batch. The actual limit is the lesser of this and max_cache_batch_size.

max_length

max_length*: int* = 512

Maximum sequence length of the model.

max_new_tokens

max_new_tokens*: int* = -1

Maximum number of new tokens to generate during a single inference pass of the model.

max_num_steps

max_num_steps*: int* = 1

The number of steps to run for multi-step scheduling.

pad_to_multiple_of

pad_to_multiple_of*: int* = 2

Pad input tensors to be a multiple of value provided.

quantization_encoding

quantization_encoding*: SupportedEncoding | None* = None

Weight encoding type.

rope_type

rope_type*: RopeType | None* = None

Force using a specific rope type, ‘none’, ‘normal’, or ‘neox’. Only matters for GGUF weights.

save_to_serialized_model_path

save_to_serialized_model_path*: str | None* = None

If specified, tries to save a serialized model to this path.

serialized_model_path

serialized_model_path*: str | None* = None

If specified, tries to load a serialized model from this path.

short_name

property short_name*: str*

Returns a short name for the model defined by this PipelineConfig.

top_k

top_k*: int | None* = None

Limits the sampling to the K most probable tokens. If None, will default to greedy sampling.

trust_remote_code

trust_remote_code*: bool* = False

Whether or not to allow custom modeling files from a Hugging Face repository.

update_architecture()

update_architecture() → None

weight_path

weight_path*: list[pathlib.Path]*

Optional path or URL of the model weights to use.

weights_format

property weights_format*: WeightsFormat*

Identify which format our weights are expected in.

weights_size()

weights_size() → int | None

PipelineEngine

class max.pipelines.config.PipelineEngine(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)

HUGGINGFACE

HUGGINGFACE = 'huggingface'

MAX

MAX = 'max'

RepoType

class max.pipelines.config.RepoType(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)

local

local = 'local'

online

online = 'online'

RopeType

class max.pipelines.config.RopeType(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)

neox

neox = 'neox'

none

none = 'none'

normal

normal = 'normal'

SupportedEncoding

class max.pipelines.config.SupportedEncoding(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)

All possible encodings which may be supported by a particular model.

bfloat16

bfloat16 = 'bfloat16'

dtype

property dtype*: DType*

The underlying model dtype associated with a quantization_encoding.

float32

float32 = 'float32'

parse_from_file_name()

classmethod parse_from_file_name(name: str)

q4_0

q4_0 = 'q4_0'

q4_k

q4_k = 'q4_k'

q6_k

q6_k = 'q6_k'

quantization_encoding

property quantization_encoding*: QuantizationEncoding | None*

WeightsFormat

class max.pipelines.config.WeightsFormat(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)

gguf

gguf = 'gguf'

pytorch

pytorch = 'pytorch'

safetensors

safetensors = 'safetensors'