Python module
config
Standardized config for Pipeline Inference.
HuggingFaceRepo
class max.pipelines.config.HuggingFaceRepo(repo_id: str, _info: Optional[huggingface_hub.hf_api.ModelInfo] = None, _formats_available: list[max.pipelines.config.WeightsFormat] = <factory>, _supported_encodings: list[max.pipelines.config.SupportedEncoding] = <factory>, _gguf_architecture: Optional[str] = None, _files: Iterable[str] = <factory>, _safetensors_metadata: Optional[huggingface_hub.utils._safetensors.SafetensorsRepoMetadata] = None)
download()
download(filename: str, force_download: bool = False) → Path
encoding_for_file()
encoding_for_file(file: str | Path) → SupportedEncoding
file_exists()
files
files_for_encoding()
files_for_encoding(encoding: SupportedEncoding, weights_format: WeightsFormat | None = None) → dict[max.pipelines.config.WeightsFormat, list[pathlib.Path]]
formats_available
property formats_available*: list[max.pipelines.config.WeightsFormat]*
gguf_architecture
info
property info*: ModelInfo*
repo_id
repo_id*: str*
safetensors_metadata
property safetensors_metadata*: SafetensorsRepoMetadata | None*
size_of()
supported_encodings
property supported_encodings*: list[max.pipelines.config.SupportedEncoding]*
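A minimal usage sketch for this class, based only on the signatures above; the repo id is a placeholder, and the formats, encodings, and files returned depend on what that repo actually contains:

```python
from max.pipelines.config import HuggingFaceRepo, SupportedEncoding

repo = HuggingFaceRepo(repo_id="org/model")  # placeholder repo id

# Inspect what the repo offers before committing to an encoding.
print(repo.formats_available)
print(repo.supported_encodings)

# Resolve the weight files for one encoding, then download each one locally.
files = repo.files_for_encoding(SupportedEncoding.bfloat16)
for fmt, paths in files.items():
    for path in paths:
        local_path = repo.download(str(path))
        print(fmt, local_path)
```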
PipelineConfig
class max.pipelines.config.PipelineConfig(huggingface_repo_id: str, engine: Optional[max.pipelines.config.PipelineEngine] = None, architecture: Optional[str] = None, version: Optional[str] = None, weight_path: list[pathlib.Path] = <factory>, device_specs: list[max.driver.driver.DeviceSpec] = <factory>, quantization_encoding: Optional[max.pipelines.config.SupportedEncoding] = None, serialized_model_path: Optional[str] = None, save_to_serialized_model_path: Optional[str] = None, max_length: int = 512, max_new_tokens: int = -1, max_cache_batch_size: int = 1, max_ce_batch_size: int = 32, cache_strategy: max.pipelines.kv_cache.cache_params.KVCacheStrategy = continuous, max_num_steps: int = 1, pad_to_multiple_of: int = 2, kv_cache_page_size: int = 512, gpu_memory_utilization: float = 0.9, top_k: Optional[int] = None, trust_remote_code: bool = False, force_download: bool = False, _huggingface_config: Optional[transformers.models.auto.configuration_auto.AutoConfig] = None, _devices: list[max.driver.driver.Device] = <factory>, _weights_converter: Optional[type[max.graph.weights.weights.WeightsConverter]] = None, enable_echo: bool = False)
architecture
Model architecture to run.
cache_strategy
cache_strategy*: KVCacheStrategy* = 'continuous'
Force the use of a specific cache strategy: ‘naive’ or ‘continuous’.
device
property device*: Device*
Initialize and return a single device, given a single device spec.
device_specs
device_specs*: list[max.driver.driver.DeviceSpec]*
Devices to run inference upon.
devices
property devices*: list[max.driver.driver.Device]*
Initialize and return a list of devices, given a list of device specs.
download_weights()
download_weights() → None
dtype
property dtype*: DType*
enable_echo
enable_echo*: bool* = False
Whether the model should be built with echo capabilities.
engine
engine*: PipelineEngine | None* = None
Engine backend to use for serving: ‘max’ for the MAX engine, or ‘huggingface’ as a fallback option for improved model coverage.
force_download
force_download*: bool* = False
Whether to force download a given file if it’s not already present in the local cache.
gpu_memory_utilization
gpu_memory_utilization*: float* = 0.9
The fraction of available device memory that our process should consume.
This is used to inform the size of the KVCache workspace:
kv_cache_workspace = (total_free_memory * gpu_memory_utilization) - model_weights_size
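As a rough worked example with illustrative numbers (not measurements), on a device with 24 GiB free and 16 GiB of model weights, the default value of 0.9 leaves about 5.6 GiB for the KVCache workspace:

```python
GiB = 1024**3
total_free_memory = 24 * GiB      # illustrative free device memory
model_weights_size = 16 * GiB     # illustrative size of the model weights
gpu_memory_utilization = 0.9      # default value

kv_cache_workspace = total_free_memory * gpu_memory_utilization - model_weights_size
print(kv_cache_workspace / GiB)   # 5.6
```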
help()
huggingface_config
property huggingface_config*: AutoConfig*
Given the huggingface_repo_id, return the Hugging Face AutoConfig.
huggingface_repo_id
huggingface_repo_id*: str*
The repo_id of a Hugging Face model repository to use.
kv_cache_page_size
kv_cache_page_size*: int* = 512
The number of tokens in a single page in the KVCache.
load_weights()
load_weights() → Weights
max_cache_batch_size
max_cache_batch_size*: int* = 1
Maximum cache size to reserve for a single batch. This defaults to 1 to minimize memory consumption in the base case, where someone is running a local server to try out MAX. Users launching in a production server scenario should set this value higher based on server capacity.
max_ce_batch_size
max_ce_batch_size*: int* = 32
Maximum cache size to reserve for a single context encoding batch. The actual limit is the lesser of this and max_cache_batch_size.
max_length
max_length*: int* = 512
Maximum sequence length of the model.
max_new_tokens
max_new_tokens*: int* = -1
Maximum number of new tokens to generate during a single inference pass of the model.
max_num_steps
max_num_steps*: int* = 1
The number of steps to run for multi-step scheduling.
pad_to_multiple_of
pad_to_multiple_of*: int* = 2
Pad input tensors to a multiple of the value provided.
quantization_encoding
quantization_encoding*: SupportedEncoding | None* = None
Weight encoding type.
save_to_serialized_model_path
If specified, tries to save a serialized model to this path.
serialized_model_path
If specified, tries to load a serialized model from this path.
short_name
property short_name*: str*
Returns a short name for the model defined by this PipelineConfig.
top_k
Limits sampling to the K most probable tokens. If None, greedy sampling is used.
trust_remote_code
trust_remote_code*: bool* = False
Whether to allow custom modeling files from Hugging Face.
version
Name of the model version to run.
weight_path
weight_path*: list[pathlib.Path]*
Optional path or URL of the model weights to use.
weights_format
property weights_format*: WeightsFormat*
Identify which format our weights are expected in.
weights_size()
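A minimal configuration sketch using only fields and methods from the class above; the repo id is a placeholder and the values shown are illustrative rather than recommended defaults:

```python
from max.pipelines.config import PipelineConfig, PipelineEngine, SupportedEncoding

config = PipelineConfig(
    huggingface_repo_id="org/model",                   # placeholder repo id
    engine=PipelineEngine.MAX,
    quantization_encoding=SupportedEncoding.bfloat16,
    max_length=2048,
    max_cache_batch_size=16,
)

config.download_weights()        # fetch the weight files into the local cache
weights = config.load_weights()  # returns a Weights object
print(config.short_name, config.dtype, config.weights_format)
```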
PipelineEngine
class max.pipelines.config.PipelineEngine(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)
HUGGINGFACE
HUGGINGFACE = 'huggingface'
MAX
MAX = 'max'
SupportedEncoding
class max.pipelines.config.SupportedEncoding(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)
All possible encodings which may be supported by a particular model.
bfloat16
bfloat16 = 'bfloat16'
dtype
property dtype*: DType*
The underlying model dtype associated with a quantization_encoding.
float32
float32 = 'float32'
parse_from_file_name()
classmethod parse_from_file_name(name: str)
q4_0
q4_0 = 'q4_0'
q4_k
q4_k = 'q4_k'
q6_k
q6_k = 'q6_k'
quantization_encoding
property quantization_encoding*: QuantizationEncoding | None*
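A short sketch of the encoding helpers above; the exact matching rules of parse_from_file_name are an assumption here, so treat the file name as a hypothetical example:

```python
from max.pipelines.config import SupportedEncoding

enc = SupportedEncoding.q4_k
print(enc.dtype)                  # underlying model dtype for this encoding
print(enc.quantization_encoding)  # graph-level QuantizationEncoding, or None

# Hypothetical weight file name; parse_from_file_name infers the encoding from it.
print(SupportedEncoding.parse_from_file_name("model-q4_k.gguf"))
```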
WeightsFormat
class max.pipelines.config.WeightsFormat(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)
gguf
gguf = 'gguf'
pytorch
pytorch = 'pytorch'
safetensors
safetensors = 'safetensors'