#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2024 Apple Inc. All Rights Reserved.
#

"""Implements HF OpenELMConfig based on PretrainedConfig"""
|
||
|
from numbers import Number
|
||
|
from typing import List, Optional, Union
|
||
|
|
||
|
import numpy as np
|
||
|
from transformers import PretrainedConfig
|
||
|
|
||
|
|
||
|
def make_divisible(
    v: Union[float, int],
    divisor: Optional[int] = 8,
    min_value: Optional[Union[float, int]] = None,
) -> Union[float, int]:
    """
    This function is taken from the original TensorFlow repo. It ensures that
    all layers have a channel number that is divisible by `divisor`. See:
    https://github.com/tensorflow/models/blob/2cfc99eff5e5eb729c6793d2f3d03aa1c9be2b15/research/slim/nets/mobilenet/mobilenet.py#L62

    Args:
        v: Input value.
        divisor: The divisor; defaults to 8.
        min_value: Minimum allowed value; defaults to `divisor` when None.

    Returns:
        new_v: The input rounded to the nearest multiple of `divisor`.
    """
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that rounding down does not reduce the value by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v
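
# A few sanity-check values for make_divisible (illustrative, not part of the
# original file):
#   make_divisible(100, divisor=8)  # -> 104 (nearest multiple of 8)
#   make_divisible(23, divisor=8)   # -> 24
#   make_divisible(2, divisor=8)    # -> 8 (clamped to min_value == divisor)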


def compute_heads(model_dim: int, head_dim: int) -> int:
    """Compute the number of attention heads.

    Args:
        model_dim: Model dimension.
        head_dim: Head dimension.

    Returns:
        An integer denoting the number of heads in multi-head attention.

    Raises:
        ValueError: If the model dimension is not divisible by the head dimension.
    """
    if model_dim % head_dim == 0:
        return model_dim // head_dim
    else:
        raise ValueError(
            f"Model dimension should be divisible by head dimension. Got: {model_dim} and {head_dim}."
        )
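
# For example, compute_heads(model_dim=1280, head_dim=64) returns 20, while
# compute_heads(model_dim=1280, head_dim=96) raises a ValueError.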


OpenELM_CONFIGS = {
    "OpenELM-270M": dict(
        num_transformer_layers=16,
        model_dim=1280,
        head_dim=64,
        num_gqa_groups=4,
        normalize_qk_projections=True,
        share_input_output_layers=True,
        # Vary the FFN and QKV multipliers to create variable FFN and attention layers respectively.
        ffn_multipliers=(0.5, 4.0),
        qkv_multipliers=(0.5, 1.0),
    ),
    "OpenELM-450M": dict(
        num_transformer_layers=20,
        model_dim=1536,
        head_dim=64,
        num_gqa_groups=4,
        normalize_qk_projections=True,
        share_input_output_layers=True,
        # Vary the FFN and QKV multipliers to create variable FFN and attention layers respectively.
        ffn_multipliers=(0.5, 4.0),
        qkv_multipliers=(0.5, 1.0),
    ),
    "OpenELM-1_1B": dict(
        num_transformer_layers=28,
        model_dim=2048,
        head_dim=64,
        num_gqa_groups=4,
        normalize_qk_projections=True,
        share_input_output_layers=True,
        # Vary the FFN and QKV multipliers to create variable FFN and attention layers respectively.
        ffn_multipliers=(0.5, 4.0),
        qkv_multipliers=(0.5, 1.0),
    ),
    "OpenELM-3B": dict(
        num_transformer_layers=36,
        model_dim=3072,
        head_dim=128,
        num_gqa_groups=4,
        normalize_qk_projections=True,
        share_input_output_layers=True,
        # Vary the FFN and QKV multipliers to create variable FFN and attention layers respectively.
        ffn_multipliers=(0.5, 4.0),
        qkv_multipliers=(0.5, 1.0),
    ),
}
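
# These presets map directly onto the OpenELMConfig constructor below. A
# minimal usage sketch:
#
#   config = OpenELMConfig(**OpenELM_CONFIGS["OpenELM-270M"])
#
# The per-layer head counts and multipliers are then derived in
# OpenELMConfig.__post_init__; see the worked example at the end of this file.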


class OpenELMConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`OpenELMModel`]. It is used to instantiate an OpenELM model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the OpenELM model.
        max_context_length (`int`, *optional*, defaults to 2048):
            Maximum number of input tokens.
        num_transformer_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer decoder.
        model_dim (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        head_dim (`int`, *optional*, defaults to 128):
            The attention head dimension.
        qkv_multipliers (`Union[Number, List[Number]]`, *optional*, defaults to 1.0):
            If qkv_multipliers is a Number, then all attention layers have the same latent dimensions,
            resulting in uniform allocation of parameters.
            If qkv_multipliers is a List of Numbers, then each attention layer has different latent dimensions,
            assuming qkv_multipliers[0] != qkv_multipliers[1]. This results in variable allocation of parameters
            in the attention layers.
            This scaling is known as layer-wise or block-wise scaling: https://arxiv.org/abs/2008.00623
        num_query_heads (`Union[int, None]`, *optional*, defaults to None):
            The number of query heads. If None, it is computed as `compute_heads(model_dim=model_dim, head_dim=head_dim)`.
        num_gqa_groups (`int`, *optional*, defaults to 1):
            This variable allows switching between multi-head attention, group query attention, and multi-query attention.
            When num_gqa_groups == 1, it is multi-head attention.
            When 1 < num_gqa_groups < num_heads and num_heads is divisible by num_gqa_groups, it is group query attention.
            When num_gqa_groups == num_heads, it is multi-query attention.
        ffn_multipliers (`Union[Number, List[Number]]`, *optional*, defaults to 4.0):
            Feed-forward network (FFN) multipliers.
            If ffn_multipliers is a Number, then all FFN layers have the same latent dimensions,
            resulting in uniform allocation of parameters.
            If ffn_multipliers is a List of Numbers, then each FFN layer has different latent dimensions,
            assuming ffn_multipliers[0] != ffn_multipliers[1]. This results in variable allocation of parameters
            in the FFN layers.
            This scaling is known as layer-wise or block-wise scaling: https://arxiv.org/abs/2008.00623
        ffn_with_glu (`bool`, *optional*, defaults to True):
            Whether to use an FFN with a Gated Linear Unit (GLU).
        ffn_dim_divisor (`int`, *optional*, defaults to 256):
            The FFN layer dimension divisor.
        activation_fn_name (`str` or `function`, *optional*, defaults to `"swish"`):
            The non-linear activation function (function or string) in the decoder.
        normalization_layer_name (`str` or `function`, *optional*, defaults to `"rms_norm"`):
            Type of normalization layer.
        normalize_qk_projections (`bool`, *optional*, defaults to False):
            Whether to normalize queries and keys after the projections.
        share_input_output_layers (`bool`, *optional*, defaults to False):
            Whether to share the embedding between the input and output linear layers.
        rope_freq_constant (`int`, *optional*, defaults to 10000):
            The base period of the RoPE embeddings.
        rope_max_length (`int`, *optional*, defaults to 4096):
            rope_max_length is set to twice max_context_length.
            This allows flexibility in token lengths during training or fine-tuning.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
    """

    model_type = "openelm"

    def __init__(
        self,
        vocab_size: int = 32000,
        max_context_length: int = 2048,
        num_transformer_layers: int = 12,
        model_dim: int = 2048,
        head_dim: int = 128,
        qkv_multipliers: Union[Number, List[Number]] = 1.0,
        num_query_heads: Union[int, None] = None,
        num_gqa_groups: int = 1,
        ffn_multipliers: Union[Number, List[Number]] = 4.0,
        ffn_with_glu: bool = True,
        ffn_dim_divisor: int = 256,
        activation_fn_name: str = "swish",
        normalization_layer_name: str = "rms_norm",
        normalize_qk_projections: bool = False,
        share_input_output_layers: bool = False,
        rope_freq_constant: int = 10000,
        rope_max_length: int = 4096,
        initializer_range: float = 0.02,
        use_cache: bool = True,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        **kwargs,
    ) -> None:
        self.vocab_size = vocab_size
        self.max_context_length = max_context_length
        self.num_transformer_layers = num_transformer_layers
        self.model_dim = model_dim
        self.head_dim = head_dim
        self.qkv_multipliers = qkv_multipliers
        self.num_gqa_groups = num_gqa_groups
        self.ffn_multipliers = ffn_multipliers
        self.ffn_with_glu = ffn_with_glu
        self.ffn_dim_divisor = ffn_dim_divisor
        self.activation_fn_name = activation_fn_name
        self.normalization_layer_name = normalization_layer_name
        self.normalize_qk_projections = normalize_qk_projections
        self.share_input_output_layers = share_input_output_layers
        self.rope_freq_constant = rope_freq_constant
        self.rope_max_length = rope_max_length
        # Fall back to compute_heads when the number of query heads is not given.
        self.num_query_heads = (
            compute_heads(model_dim=model_dim, head_dim=head_dim)
            if num_query_heads is None
            else num_query_heads
        )
        self.initializer_range = initializer_range

        self.__post_init__()
        super().__init__(
            use_cache=use_cache,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

    def __post_init__(self) -> None:
        if self.num_gqa_groups is not None:
            head_multiple_of = self.num_gqa_groups
        else:
            head_multiple_of = 2

        if isinstance(self.qkv_multipliers, Number):
            # All attention layers have the same latent dimensions, resulting in uniform allocation of parameters.
            qkv_dim = make_divisible(
                self.model_dim * self.qkv_multipliers,
                divisor=self.head_dim * head_multiple_of,
            )
            query_dims = [int(qkv_dim)] * self.num_transformer_layers

        elif (
            isinstance(self.qkv_multipliers, (tuple, list))
            and len(self.qkv_multipliers) == 2
        ):
            # Each attention layer has different latent dimensions, assuming qkv_multipliers[0] != qkv_multipliers[1].
            # This results in variable allocation of parameters in the attention layers.
            # This scaling is known as layer-wise or block-wise scaling: https://arxiv.org/abs/2008.00623
            qkv_multipliers = [
                round(v, 2)
                for v in np.linspace(
                    self.qkv_multipliers[0],
                    self.qkv_multipliers[1],
                    num=self.num_transformer_layers,
                    dtype=float,
                )
            ]
            # Make sure that the scaled model dimension is divisible by the scaled head dimension.
            query_dims = [
                int(
                    make_divisible(
                        self.model_dim * m, divisor=self.head_dim * head_multiple_of
                    )
                )
                for m in qkv_multipliers
            ]
        else:
            raise NotImplementedError(
                f"QKV multipliers should be a single number or a list containing exactly two numbers. Got: {self.qkv_multipliers}."
            )

        # Compute the number of query, key, and value heads.
        # For multi-head and multi-query attention, the number of heads for query, key, and value are the same.
        # For group query attention, the number of key and value heads are the same.
        self.num_query_heads = [
            int(compute_heads(q_dim, self.head_dim)) for q_dim in query_dims
        ]
        self.num_kv_heads = [
            q_heads // self.num_gqa_groups for q_heads in self.num_query_heads
        ]
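        # Worked example (illustrative; values assume the "OpenELM-270M" preset
        # above, i.e. model_dim=1280, head_dim=64, num_gqa_groups=4,
        # qkv_multipliers=(0.5, 1.0), num_transformer_layers=16):
        #   layer 0:  make_divisible(1280 * 0.5, divisor=64 * 4) = 768
        #             -> 768 // 64 = 12 query heads, 12 // 4 = 3 kv heads
        #   layer 15: make_divisible(1280 * 1.0, divisor=64 * 4) = 1280
        #             -> 1280 // 64 = 20 query heads, 20 // 4 = 5 kv heads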

        # Feed-forward network (FFN) multipliers.
        if isinstance(self.ffn_multipliers, Number):
            # All FFN layers have the same latent dimensions, resulting in uniform allocation of parameters.
            self.ffn_multipliers = [self.ffn_multipliers] * self.num_transformer_layers
        elif isinstance(self.ffn_multipliers, (tuple, list)):
            # Each FFN layer has different latent dimensions, assuming ffn_multipliers[0] != ffn_multipliers[1].
            # This results in variable allocation of parameters in the FFN layers.
            # This scaling is known as layer-wise or block-wise scaling: https://arxiv.org/abs/2008.00623
            if len(self.ffn_multipliers) == 2:
                self.ffn_multipliers = [
                    round(v, 2)
                    for v in np.linspace(
                        self.ffn_multipliers[0],
                        self.ffn_multipliers[1],
                        num=self.num_transformer_layers,
                        dtype=float,
                    )
                ]
            else:
                assert (
                    len(self.ffn_multipliers) == self.num_transformer_layers
                ), f"{len(self.ffn_multipliers)=}!={self.num_transformer_layers=}"
        else:
            raise NotImplementedError(
                f"FFN multipliers should be a single number or a list containing exactly two numbers. Got: {self.ffn_multipliers}."
            )
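        # With the "OpenELM-270M" preset, ffn_multipliers=(0.5, 4.0) expands to
        # a 16-entry ramp (illustrative): [0.5, 0.73, 0.97, 1.2, ..., 3.77, 4.0].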

        # Check that num_query_heads is divisible by num_kv_heads for every layer.
        for layer_idx in range(len(query_dims)):
            assert self.num_query_heads[layer_idx] % self.num_kv_heads[layer_idx] == 0
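

# A quick end-to-end sketch (hypothetical usage, not part of the original file),
# tying the presets to the derived per-layer attributes:
#
#   cfg = OpenELMConfig(**OpenELM_CONFIGS["OpenELM-270M"])
#   assert len(cfg.num_query_heads) == 16  # one entry per transformer layer
#   assert cfg.num_query_heads[0] == 12 and cfg.num_kv_heads[0] == 3
#   assert cfg.num_query_heads[-1] == 20 and cfg.num_kv_heads[-1] == 5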
|