Python module
config
Standardized config for Pipeline Inference.
HuggingFaceRepo
class max.pipelines.config.HuggingFaceRepo(repo_id: str, _info: Optional[huggingface_hub.hf_api.ModelInfo] = None, _formats_available: list[max.pipelines.config.WeightsFormat] = <factory>, _supported_encodings: list[max.pipelines.config.SupportedEncoding] = <factory>, _gguf_architecture: Optional[str] = None, _files: Iterable[str] = <factory>, _safetensors_metadata: Optional[huggingface_hub.utils._safetensors.SafetensorsRepoMetadata] = None)
download()
download(filename: str, force_download: bool = False) → Path
encoding_for_file()
encoding_for_file(file: str | Path) → SupportedEncoding
file_exists()
files
files_for_encoding()
files_for_encoding(encoding: SupportedEncoding, weights_format: WeightsFormat | None = None) → dict[max.pipelines.config.WeightsFormat, list[pathlib.Path]]
formats_available
property formats_available*: list[max.pipelines.config.WeightsFormat]*
gguf_architecture
info
property info*: ModelInfo*
repo_id
repo_id*: str*
safetensors_metadata
property safetensors_metadata*: SafetensorsRepoMetadata | None*
size_of()
supported_encodings
property supported_encodings*: list[max.pipelines.config.SupportedEncoding]*
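A minimal usage sketch for this class, based only on the signatures above; the repo id is a placeholder, and the formats, encodings, and files returned depend on what that repo actually contains:

```python
from max.pipelines.config import HuggingFaceRepo, SupportedEncoding

repo = HuggingFaceRepo(repo_id="org/model")  # placeholder repo id

# Inspect what the repo offers before committing to an encoding.
print(repo.formats_available)
print(repo.supported_encodings)

# Resolve the weight files for one encoding, then download each one locally.
files = repo.files_for_encoding(SupportedEncoding.bfloat16)
for fmt, paths in files.items():
    for path in paths:
        local_path = repo.download(str(path))
        print(fmt, local_path)
```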
PipelineConfig
class max.pipelines.config.PipelineConfig(huggingface_repo_id: str, engine: Optional[max.pipelines.config.PipelineEngine] = None, architecture: Optional[str] = None, version: Optional[str] = None, weight_path: list[pathlib.Path] = <factory>, device_specs: list[max.driver.driver.DeviceSpec] = <factory>, quantization_encoding: Optional[max.pipelines.config.SupportedEncoding] = None, serialized_model_path: Optional[str] = None, save_to_serialized_model_path: Optional[str] = None, max_length: int = 512, max_new_tokens: int = -1, max_cache_batch_size: int = 1, max_ce_batch_size: int = 32, cache_strategy: max.pipelines.kv_cache.cache_params.KVCacheStrategy = continuous, max_num_steps: int = 1, pad_to_multiple_of: int = 2, kv_cache_page_size: int = 512, gpu_memory_utilization: float = 0.9, top_k: Optional[int] = None, trust_remote_code: bool = False, force_download: bool = False, _huggingface_config: Optional[transformers.models.auto.configuration_auto.AutoConfig] = None, _devices: list[max.driver.driver.Device] = <factory>, _weights_converter: Optional[type[max.graph.weights.weights.WeightsConverter]] = None, enable_echo: bool = False)
architecture
Model architecture to run.
cache_strategy
cache_strategy*: KVCacheStrategy* = 'continuous'
Force the use of a specific cache strategy: ‘naive’ or ‘continuous’.
device
property device*: Device*
Initialize and return a single device, given a single device spec.
device_specs
device_specs*: list[max.driver.driver.DeviceSpec]*
Devices to run inference upon.
devices
property devices*: list[max.driver.driver.Device]*
Initialize and return a list of devices, given a list of device specs.
download_weights()
download_weights() → None
dtype
property dtype*: DType*
enable_echo
enable_echo*: bool* = False
Whether the model should be built with echo capabilities.
engine
engine*: PipelineEngine | None* = None
Engine backend to use for serving: ‘max’ for the MAX engine, or ‘huggingface’ as a fallback option for improved model coverage.
force_download
force_download*: bool* = False
Whether to force download a given file if it’s not already present in the local cache.
gpu_memory_utilization
gpu_memory_utilization*: float* = 0.9
The fraction of available device memory that our process should consume.
This is used to inform the size of the KVCache workspace:
kv_cache_workspace = (total_free_memory * gpu_memory_utilization) - model_weights_size
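As a rough worked example with illustrative numbers (not measurements), on a device with 24 GiB free and 16 GiB of model weights, the default value of 0.9 leaves about 5.6 GiB for the KVCache workspace:

```python
GiB = 1024**3
total_free_memory = 24 * GiB      # illustrative free device memory
model_weights_size = 16 * GiB     # illustrative size of the model weights
gpu_memory_utilization = 0.9      # default value

kv_cache_workspace = total_free_memory * gpu_memory_utilization - model_weights_size
print(kv_cache_workspace / GiB)   # 5.6
```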
help()
huggingface_config
property huggingface_config*: AutoConfig*
Given the huggingface_repo_id, return the Hugging Face AutoConfig.
huggingface_repo_id
huggingface_repo_id*: str*
The repo_id of a Hugging Face model repository to use.
kv_cache_page_size
kv_cache_page_size*: int* = 512
The number of tokens in a single page in the KVCache.
load_weights()
load_weights() → Weights
max_cache_batch_size
max_cache_batch_size*: int* = 1
Maximum cache size to reserve for a single batch. This defaults to 1 to minimize memory consumption in the base case, where someone is running a local server to try out MAX. Users launching in a production server scenario should set this value higher based on server capacity.
max_ce_batch_size
max_ce_batch_size*: int* = 32
Maximum cache size to reserve for a single context encoding batch. The actual limit is the lesser of this and max_cache_batch_size.
max_length
max_length*: int* = 512
Maximum sequence length of the model.
max_new_tokens
max_new_tokens*: int* = -1
Maximum number of new tokens to generate during a single inference pass of the model.
max_num_steps
max_num_steps*: int* = 1
The number of steps to run for multi-step scheduling.
pad_to_multiple_of
pad_to_multiple_of*: int* = 2
Pad input tensors to a multiple of the value provided.
quantization_encoding
quantization_encoding*: SupportedEncoding | None* = None
Weight encoding type.
save_to_serialized_model_path
If specified, tries to save a serialized model to this path.
serialized_model_path
If specified, tries to load a serialized model from this path.
short_name
property short_name*: str*
Returns a short name for the model defined by this PipelineConfig.
top_k
Limits sampling to the K most probable tokens. If None, greedy sampling is used.
trust_remote_code
trust_remote_code*: bool* = False
Whether to allow custom modeling files from Hugging Face.
version
Name of the model version to run.
weight_path
weight_path*: list[pathlib.Path]*
Optional path or URL of the model weights to use.
weights_format
property weights_format*: WeightsFormat*
Identify which format our weights are expected in.
weights_size()
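A minimal configuration sketch using only fields and methods from the class above; the repo id is a placeholder and the values shown are illustrative rather than recommended defaults:

```python
from max.pipelines.config import PipelineConfig, PipelineEngine, SupportedEncoding

config = PipelineConfig(
    huggingface_repo_id="org/model",                   # placeholder repo id
    engine=PipelineEngine.MAX,
    quantization_encoding=SupportedEncoding.bfloat16,
    max_length=2048,
    max_cache_batch_size=16,
)

config.download_weights()        # fetch the weight files into the local cache
weights = config.load_weights()  # returns a Weights object
print(config.short_name, config.dtype, config.weights_format)
```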
PipelineEngine
class max.pipelines.config.PipelineEngine(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)
HUGGINGFACE
HUGGINGFACE = 'huggingface'
MAX
MAX = 'max'
SupportedEncoding
class max.pipelines.config.SupportedEncoding(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)
All possible encodings which may be supported by a particular model.
bfloat16
bfloat16 = 'bfloat16'
dtype
property dtype*: DType*
The underlying model dtype associated with a quantization_encoding.
float32
float32 = 'float32'
parse_from_file_name()
classmethod parse_from_file_name(name: str)
q4_0
q4_0 = 'q4_0'
q4_k
q4_k = 'q4_k'
q6_k
q6_k = 'q6_k'
quantization_encoding
property quantization_encoding*: QuantizationEncoding | None*
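A short sketch of the encoding helpers above; the exact matching rules of parse_from_file_name are an assumption here, so treat the file name as a hypothetical example:

```python
from max.pipelines.config import SupportedEncoding

enc = SupportedEncoding.q4_k
print(enc.dtype)                  # underlying model dtype for this encoding
print(enc.quantization_encoding)  # graph-level QuantizationEncoding, or None

# Hypothetical weight file name; parse_from_file_name infers the encoding from it.
print(SupportedEncoding.parse_from_file_name("model-q4_k.gguf"))
```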
WeightsFormat
class max.pipelines.config.WeightsFormat(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)
gguf
gguf = 'gguf'
pytorch
pytorch = 'pytorch'
safetensors
safetensors = 'safetensors'