Python module

config

Standardized config for Pipeline Inference.

PipelineConfig

class max.pipelines.config.PipelineConfig(engine: Optional[max.pipelines.config.PipelineEngine] = None, architecture: Optional[str] = None, version: Optional[str] = None, weight_path: list[pathlib.Path] = <factory>, huggingface_repo_id: Optional[str] = None, device_spec: max.driver.driver.DeviceSpec = DeviceSpec(id=-1, device_type='cpu'), quantization_encoding: Optional[max.pipelines.config.SupportedEncoding] = None, serialized_model_path: Optional[str] = None, save_to_serialized_model_path: Optional[str] = None, max_length: int = 512, max_new_tokens: int = -1, max_cache_batch_size: int = 1, max_ce_batch_size: int = 32, cache_strategy: max.pipelines.kv_cache.cache_params.KVCacheStrategy = continuous, max_num_steps: int = 1, pad_to_multiple_of: int = 2, top_k: Optional[int] = None, trust_remote_code: bool = False, force_download: bool = False, _huggingface_config: Optional[transformers.models.auto.configuration_auto.AutoConfig] = None, _device: Optional[max.driver.driver.Device] = None, _weights_converter: Optional[type[max.graph.weights.weights.WeightsConverter]] = None, enable_echo: bool = False)
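
For illustration, a minimal sketch of constructing a config from the fields documented below; the repository id shown is a placeholder, not a value taken from this page:

from max.pipelines.config import PipelineConfig, PipelineEngine

# Hypothetical example: run a Hugging Face hosted model with the MAX engine,
# keeping the documented defaults for everything else.
config = PipelineConfig(
    engine=PipelineEngine.MAX,
    huggingface_repo_id="org/model-name",  # placeholder repo id
    max_length=512,
    max_new_tokens=128,
)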

architecture

architecture: str | None = None

Model architecture to run.

cache_strategy

cache_strategy: KVCacheStrategy = 'continuous'

Force the use of a specific cache strategy: 'naive' or 'continuous'.
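
A hedged sketch of selecting a strategy (assuming KVCacheStrategy is a standard Enum whose members carry the string values shown above, so value lookup works):

from max.pipelines.config import PipelineConfig
from max.pipelines.kv_cache.cache_params import KVCacheStrategy

# Force the naive cache strategy instead of the default 'continuous'.
config = PipelineConfig(cache_strategy=KVCacheStrategy("naive"))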

device

property device: Device

Initialize and return a device, given the provided device spec.

device_spec

device_spec: DeviceSpec = DeviceSpec(id=-1, device_type='cpu')

Device to run inference upon.
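
A short sketch tying device_spec to the device property; the CPU spec below mirrors the default shown in the class signature, and the import path is taken from the type annotation there:

from max.driver.driver import DeviceSpec
from max.pipelines.config import PipelineConfig

# Explicitly request the default CPU device (id=-1, device_type='cpu').
config = PipelineConfig(device_spec=DeviceSpec(id=-1, device_type="cpu"))
device = config.device  # initializes and returns a Device from the spec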

download_weights()

download_weights() → None

dtype

property dtype: DType

enable_echo

enable_echo: bool = False

Whether the model should be built with echo capabilities.

engine

engine: PipelineEngine | None = None

Engine backend to use for serving: 'max' for the MAX engine, or 'huggingface' as a fallback option for improved model coverage.

force_download

force_download: bool = False

Whether to force a download of the given file if it is not already present in the local cache.

help()

static help() → dict[str, str]

huggingface_config

property huggingface_config: AutoConfig

Given the huggingface_repo_id, return the Hugging Face config.

huggingface_repo_id

huggingface_repo_id: str | None = None

Optional repo_id of a Hugging Face model repository to use.

load_weights()

load_weights() → Weights

max_cache_batch_size

max_cache_batch_size: int = 1

Maximum cache size to reserve for a single batch. This defaults to 1 to minimize memory consumption for the base case of running a local server to try out MAX. When launching in a server scenario, this value should be set higher based on server capacity.
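
For example, a server deployment sized for more concurrent requests might raise this limit (the value 16 below is an arbitrary illustration, not a recommendation from this page):

from max.pipelines.config import PipelineConfig

# Reserve cache space for up to 16 sequences in a single batch.
config = PipelineConfig(max_cache_batch_size=16)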

max_ce_batch_size

max_ce_batch_size: int = 32

Maximum cache size to reserve for a single context encoding batch. The actual limit is the lesser of this and max_cache_batch_size.
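
In other words, the effective context-encoding batch limit can be sketched as:

# Effective limit on a context encoding batch (illustrative only):
effective_ce_batch = min(config.max_ce_batch_size, config.max_cache_batch_size)
# With the defaults above: min(32, 1) == 1.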

max_length

max_length: int = 512

Maximum sequence length of the model.

max_new_tokens

max_new_tokens: int = -1

Maximum number of new tokens to generate during a single inference pass of the model.

max_num_steps

max_num_steps: int = 1

The number of steps to run for multi-step scheduling.

pad_to_multiple_of

pad_to_multiple_of: int = 2

Pad input tensors to a multiple of the value provided.

quantization_encoding

quantization_encoding: SupportedEncoding | None = None

Weight encoding type.

save_to_serialized_model_path

save_to_serialized_model_path: str | None = None

If specified, tries to save a serialized model to this path.

serialized_model_path

serialized_model_path: str | None = None

If specified, tries to load a serialized model from this path.
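
A hedged sketch combining the two serialization fields; the file path is a placeholder:

from max.pipelines.config import PipelineConfig

# First run: compile the model and save the serialized form.
config = PipelineConfig(save_to_serialized_model_path="model.serialized")

# Later runs: load the previously serialized model instead of recompiling.
config = PipelineConfig(serialized_model_path="model.serialized")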

short_name

property short_name: str

Returns a short name for the model defined by this PipelineConfig.

top_k

top_k: int | None = None

Limits sampling to the K most probable tokens. If None, defaults to greedy sampling.
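
To illustrate the sampling behavior this field controls (a generic sketch, not the pipeline's internal sampler):

import numpy as np

def sample_next_token(logits: np.ndarray, top_k: int | None) -> int:
    """Greedy when top_k is None, otherwise sample from the K most probable tokens."""
    if top_k is None:
        return int(np.argmax(logits))
    top_indices = np.argsort(logits)[-top_k:]  # K most probable token ids
    top_logits = logits[top_indices]
    probs = np.exp(top_logits - top_logits.max())
    probs /= probs.sum()
    return int(np.random.choice(top_indices, p=probs))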

trust_remote_code

trust_remote_code: bool = False

Whether to allow custom modeling files hosted on Hugging Face.

version

version: str | None = None

Name of the model version to run.

weight_path

weight_path: list[pathlib.Path]

Optional path or URL of the model weights to use.

weights_format

property weights_format: WeightsFormat

Identify the format in which the weights are expected.
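
As an assumption-heavy sketch (whether the format is inferred from the weight file suffix is not stated on this page), the property can be read from a config that points at local weights:

from pathlib import Path
from max.pipelines.config import PipelineConfig, WeightsFormat

# Hypothetical local GGUF weights; the path is a placeholder.
config = PipelineConfig(weight_path=[Path("model.Q4_K_M.gguf")])
print(config.weights_format)  # expected to be WeightsFormat.gguf for a .gguf file (assumption)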

weights_size()

weights_size() → int | None

PipelineEngine

class max.pipelines.config.PipelineEngine(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)

HUGGINGFACE

HUGGINGFACE = 'huggingface'

MAX

MAX = 'max'
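
A small sketch of selecting an engine on the config; leaving engine as None presumably lets the pipeline choose a backend itself:

from max.pipelines.config import PipelineConfig, PipelineEngine

# Prefer the MAX engine, or explicitly request the Hugging Face fallback.
max_config = PipelineConfig(engine=PipelineEngine.MAX)
hf_config = PipelineConfig(engine=PipelineEngine.HUGGINGFACE)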

SupportedEncoding

class max.pipelines.config.SupportedEncoding(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)

All possible encodings which may be supported by a particular model.

bfloat16

bfloat16 = 'bfloat16'

dtype

property dtype: DType

The underlying model dtype associated with a quantization_encoding.

float32

float32 = 'float32'

q4_0

q4_0 = 'q4_0'

q4_k

q4_k = 'q4_k'

q6_k

q6_k = 'q6_k'

quantization_encoding

property quantization_encoding: QuantizationEncoding | None
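
A hedged sketch of inspecting an encoding; the exact values returned are not listed on this page, so the comments below are assumptions:

from max.pipelines.config import SupportedEncoding

encoding = SupportedEncoding.bfloat16
print(encoding.dtype)                  # presumably the bfloat16 DType
print(encoding.quantization_encoding)  # presumably None for unquantized encodings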

WeightsFormat

class max.pipelines.config.WeightsFormat(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)

gguf

gguf = 'gguf'

pytorch

pytorch = 'pytorch'

safetensors

safetensors = 'safetensors'