Skip to content

vllm.multimodal.processing.dummy_inputs

_I module-attribute

_I = TypeVar('_I', bound=BaseProcessingInfo)

logger module-attribute

logger = init_logger(__name__)

BaseDummyInputsBuilder

Bases: ABC, Generic[_I]

Abstract base class that constructs the dummy data to profile multi-modal models.

Source code in vllm/multimodal/processing/dummy_inputs.py
class BaseDummyInputsBuilder(ABC, Generic[_I]):
    """
    Abstract base class that constructs the dummy data used to profile
    multi-modal models.
    """

    def __init__(self, info: _I) -> None:
        super().__init__()

        # Processing info shared by every dummy-data helper below.
        self.info = info

    @abstractmethod
    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        """Build the text input corresponding to `mm_counts`."""
        raise NotImplementedError

    @abstractmethod
    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
        mm_options: Mapping[str, BaseDummyOptions] | None = None,
    ) -> MultiModalDataDict:
        """
        Build the multimodal input which, after processing, results in
        the maximum possible number of placeholder tokens.

        Args:
            seq_len: Sequence length.
            mm_counts: Count of items per modality.
            mm_options: Configurable options per modality (optional).
                If None, use model defaults for backward compatibility.
                If provided, models can use these to customize dummy data
                generation.
        """
        raise NotImplementedError

    def get_dummy_processor_inputs(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
        mm_options: Mapping[str, BaseDummyOptions] | None = None,
    ) -> ProcessorInputs:
        """
        Build the full processor input (prompt text plus parsed multimodal
        items) which, after processing, results in the maximum possible
        number of placeholder tokens.

        Args:
            seq_len: Sequence length.
            mm_counts: Count of items per modality.
            mm_options: Configurable options per modality (optional).
        """
        text = self.get_dummy_text(mm_counts)
        mm_data = self.get_dummy_mm_data(seq_len, mm_counts, mm_options)
        mm_items = self.info.parse_mm_data(mm_data)

        # Never truncate the dummy prompt during tokenization.
        return ProcessorInputs(
            prompt=text,
            mm_items=mm_items,
            tokenization_kwargs={"truncation": False},
        )

    def _get_dummy_audios(
        self,
        *,
        length: int,
        num_audios: int,
        overrides: AudioDummyOptions | None = None,
    ) -> list[npt.NDArray]:
        """
        Return `num_audios` references to one silent (all-zero) waveform.

        An override may only shorten the waveform; a longer value is
        logged and ignored (the model maximum wins).
        """
        if num_audios == 0:
            return []
        if overrides and overrides.length:
            wanted = overrides.length
            if wanted > length:
                logger.warning(
                    "audio.length override (%d) exceeds model's "
                    "maximum length (%d), will be ignored",
                    wanted,
                    length,
                )
            length = min(length, wanted)
        waveform = np.zeros((length,))
        return [waveform] * num_audios

    def _get_dummy_images(
        self,
        *,
        width: int,
        height: int,
        num_images: int,
        overrides: ImageDummyOptions | None = None,
    ) -> list[Image.Image]:
        """
        Return `num_images` references to one solid-white RGB image.

        Width/height overrides may only reduce the respective dimension;
        larger values are logged and ignored.
        """
        if num_images == 0:
            return []
        if overrides:
            w_req = overrides.width
            if w_req:
                if w_req > width:
                    logger.warning(
                        "image.width override (%d) exceeds model's "
                        "maximum width (%d), will be ignored",
                        w_req,
                        width,
                    )
                width = min(width, w_req)
            h_req = overrides.height
            if h_req:
                if h_req > height:
                    logger.warning(
                        "image.height override (%d) exceeds model's "
                        "maximum height (%d), will be ignored",
                        h_req,
                        height,
                    )
                height = min(height, h_req)
        canvas = Image.new("RGB", (width, height), color=255)
        return [canvas] * num_images

    def _get_dummy_videos(
        self,
        *,
        width: int,
        height: int,
        num_frames: int,
        num_videos: int,
        overrides: VideoDummyOptions | None = None,
    ) -> list[npt.NDArray]:
        """
        Return `num_videos` references to one solid-white uint8 frame
        stack.

        Frame-count/width/height overrides may only reduce the respective
        limit; larger values are logged and ignored.
        """
        if num_videos == 0:
            return []
        if overrides:
            f_req = overrides.num_frames
            if f_req:
                if f_req > num_frames:
                    logger.warning(
                        "video.num_frames override (%d) exceeds model's "
                        "maximum number of frames (%d), will be ignored",
                        f_req,
                        num_frames,
                    )
                num_frames = min(num_frames, f_req)
            w_req = overrides.width
            if w_req:
                if w_req > width:
                    logger.warning(
                        "video.width override (%d) exceeds model's "
                        "maximum width (%d), will be ignored",
                        w_req,
                        width,
                    )
                width = min(width, w_req)
            h_req = overrides.height
            if h_req:
                if h_req > height:
                    logger.warning(
                        "video.height override (%d) exceeds model's "
                        "maximum height (%d), will be ignored",
                        h_req,
                        height,
                    )
                height = min(height, h_req)
        # NOTE(review): array axes are (frames, width, height, 3); confirm
        # downstream consumers expect this rather than the more common
        # (frames, height, width, 3).
        clip = np.full((num_frames, width, height, 3), 255, dtype=np.uint8)
        return [clip] * num_videos

    @abstractmethod
    def _get_img_feature_dim(self) -> int:
        """
        Get the image feature dimension for MM encoder CUDA graph capture.

        Returns:
            The image feature dimension.
        """
        raise NotImplementedError

    @abstractmethod
    def _calculate_patch_size(self, patches: int) -> tuple[int, int]:
        """
        Calculate the patch grid size (height, width) from the total
        number of patches.
        """
        raise NotImplementedError

    def get_dummy_mm_encoder_input(
        self,
        num_patches: int,
    ) -> "dict[str, torch.Tensor]":
        """
        Get dummy MM encoder input for CUDA graph capture or padding.

        Args:
            num_patches: Number of patches (tokens) for the dummy input.

        Returns:
            dict with `pixel_values` and `image_grid_thw`.
        """
        feature_dim = self._get_img_feature_dim()
        model_dtype = self.info.ctx.model_config.dtype
        grid_h, grid_w = self._calculate_patch_size(num_patches)

        # Pixel values are allocated on the GPU; the grid metadata tensor
        # stays on the CPU.
        return {
            "pixel_values": torch.zeros(
                (num_patches, feature_dim), dtype=model_dtype, device="cuda"
            ),
            "image_grid_thw": torch.tensor(
                [[1, grid_h, grid_w]], dtype=torch.long, device="cpu"
            ),
        }

info instance-attribute

info = info

__init__

__init__(info: _I) -> None
Source code in vllm/multimodal/processing/dummy_inputs.py
def __init__(self, info: _I) -> None:
    """Initialize the builder with the model's processing info object."""
    super().__init__()

    # Stored for use by the dummy-data building methods.
    self.info = info

_calculate_patch_size abstractmethod

_calculate_patch_size(patches: int) -> tuple[int, int]

Calculate the patch grid size (height, width) from the total number of patches.

Source code in vllm/multimodal/processing/dummy_inputs.py
@abstractmethod
def _calculate_patch_size(self, patches: int) -> tuple[int, int]:
    """
    Calculate the patch grid size (height, width) from the total number of
    patches.

    Args:
        patches: Total number of patches to factor into a 2-D grid.

    Returns:
        `(height, width)` of the patch grid, in patches.
    """
    raise NotImplementedError

_get_dummy_audios

_get_dummy_audios(
    *,
    length: int,
    num_audios: int,
    overrides: AudioDummyOptions | None = None,
) -> list[NDArray]
Source code in vllm/multimodal/processing/dummy_inputs.py
def _get_dummy_audios(
    self,
    *,
    length: int,
    num_audios: int,
    overrides: AudioDummyOptions | None = None,
) -> list[npt.NDArray]:
    if num_audios == 0:
        return []
    if overrides and overrides.length:
        if overrides.length > length:
            logger.warning(
                "audio.length override (%d) exceeds model's "
                "maximum length (%d), will be ignored",
                overrides.length,
                length,
            )
        length = min(length, overrides.length)
    audio = np.zeros((length,))
    return [audio] * num_audios

_get_dummy_images

_get_dummy_images(
    *,
    width: int,
    height: int,
    num_images: int,
    overrides: ImageDummyOptions | None = None,
) -> list[Image]
Source code in vllm/multimodal/processing/dummy_inputs.py
def _get_dummy_images(
    self,
    *,
    width: int,
    height: int,
    num_images: int,
    overrides: ImageDummyOptions | None = None,
) -> list[Image.Image]:
    """
    Build `num_images` solid-white RGB dummy images of `width` x `height`.

    Overrides may only shrink a dimension; an override larger than the
    model maximum is logged and ignored.
    """
    if num_images == 0:
        return []
    if overrides:
        if overrides.width:
            if overrides.width > width:
                logger.warning(
                    "image.width override (%d) exceeds model's "
                    "maximum width (%d), will be ignored",
                    overrides.width,
                    width,
                )
            width = min(width, overrides.width)
        if overrides.height:
            if overrides.height > height:
                logger.warning(
                    "image.height override (%d) exceeds model's "
                    "maximum height (%d), will be ignored",
                    overrides.height,
                    height,
                )
            height = min(height, overrides.height)
    image = Image.new("RGB", (width, height), color=255)
    # All list entries reference the same image object.
    return [image] * num_images

_get_dummy_videos

_get_dummy_videos(
    *,
    width: int,
    height: int,
    num_frames: int,
    num_videos: int,
    overrides: VideoDummyOptions | None = None,
) -> list[NDArray]
Source code in vllm/multimodal/processing/dummy_inputs.py
def _get_dummy_videos(
    self,
    *,
    width: int,
    height: int,
    num_frames: int,
    num_videos: int,
    overrides: VideoDummyOptions | None = None,
) -> list[npt.NDArray]:
    if num_videos == 0:
        return []
    if overrides:
        if overrides.num_frames:
            if overrides.num_frames > num_frames:
                logger.warning(
                    "video.num_frames override (%d) exceeds model's "
                    "maximum number of frames (%d), will be ignored",
                    overrides.num_frames,
                    num_frames,
                )
            num_frames = min(num_frames, overrides.num_frames)
        if overrides.width:
            if overrides.width > width:
                logger.warning(
                    "video.width override (%d) exceeds model's "
                    "maximum width (%d), will be ignored",
                    overrides.width,
                    width,
                )
            width = min(width, overrides.width)
        if overrides.height:
            if overrides.height > height:
                logger.warning(
                    "video.height override (%d) exceeds model's "
                    "maximum height (%d), will be ignored",
                    overrides.height,
                    height,
                )
            height = min(height, overrides.height)
    video = np.full((num_frames, width, height, 3), 255, dtype=np.uint8)
    return [video] * num_videos

_get_img_feature_dim abstractmethod

_get_img_feature_dim() -> int

Get the image feature dimension for MM encoder CUDA graph capture.

Returns:

Type Description
int

The image feature dimension.

Source code in vllm/multimodal/processing/dummy_inputs.py
@abstractmethod
def _get_img_feature_dim(self) -> int:
    """
    Get the image feature dimension for MM encoder CUDA graph capture.

    Used to size the second axis of the dummy `pixel_values` tensor in
    `get_dummy_mm_encoder_input`.

    Returns:
        The image feature dimension.
    """
    raise NotImplementedError

get_dummy_mm_data abstractmethod

get_dummy_mm_data(
    seq_len: int,
    mm_counts: Mapping[str, int],
    mm_options: Mapping[str, BaseDummyOptions]
    | None = None,
) -> MultiModalDataDict

Build the multimodal input which, after processing, results in the maximum possible number of placeholder tokens.

Parameters:

Name Type Description Default
seq_len int

Sequence length

required
mm_counts Mapping[str, int]

Count of items per modality

required
mm_options Mapping[str, BaseDummyOptions] | None

Configurable options per modality (optional). If None, use model defaults for backward compatibility. If provided, models can use these to customize dummy data generation.

None
Source code in vllm/multimodal/processing/dummy_inputs.py
@abstractmethod
def get_dummy_mm_data(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
    mm_options: Mapping[str, BaseDummyOptions] | None = None,
) -> MultiModalDataDict:
    """
    Build the multimodal input which, after processing, results in
    the maximum possible number of placeholder tokens.

    The result is consumed by `get_dummy_processor_inputs`, which parses
    it via `self.info.parse_mm_data`.

    Args:
        seq_len: Sequence length
        mm_counts: Count of items per modality
        mm_options: Configurable options per modality (optional).
                   If None, use model defaults for backward compatibility.
                   If provided, models can use these to customize dummy
                   data generation.
    """
    raise NotImplementedError

get_dummy_mm_encoder_input

get_dummy_mm_encoder_input(
    num_patches: int,
) -> dict[str, Tensor]

Get dummy MM encoder input for CUDA graph capture or padding.

Parameters:

Name Type Description Default
num_patches int

Number of patches (tokens) for the dummy input

required

Returns:

Type Description
dict[str, Tensor]

dict with pixel_values and image_grid_thw

Source code in vllm/multimodal/processing/dummy_inputs.py
def get_dummy_mm_encoder_input(
    self,
    num_patches: int,
) -> "dict[str, torch.Tensor]":
    """
    Get dummy MM encoder input for CUDA graph capture or padding.

    Args:
        num_patches: Number of patches (tokens) for the dummy input

    Returns:
        dict with pixel_values and image_grid_thw
    """
    # Per-patch feature width, supplied by the subclass.
    img_feature_dim = self._get_img_feature_dim()

    dtype = self.info.ctx.model_config.dtype

    # Factor the patch count into a (height, width) grid.
    h_patches, w_patches = self._calculate_patch_size(num_patches)

    # Zero pixel values are allocated on the GPU; the grid metadata
    # tensor stays on the CPU.
    pixel_values = torch.zeros(
        (num_patches, img_feature_dim), dtype=dtype, device="cuda"
    )
    grid_thw_list = torch.tensor(
        [[1, h_patches, w_patches]], dtype=torch.long, device="cpu"
    )

    return {
        "pixel_values": pixel_values,
        "image_grid_thw": grid_thw_list,
    }

get_dummy_processor_inputs

get_dummy_processor_inputs(
    seq_len: int,
    mm_counts: Mapping[str, int],
    mm_options: Mapping[str, BaseDummyOptions]
    | None = None,
) -> ProcessorInputs

Build the input which, after processing, results in the maximum possible number of placeholder tokens.

Parameters:

Name Type Description Default
seq_len int

Sequence length

required
mm_counts Mapping[str, int]

Count of items per modality

required
mm_options Mapping[str, BaseDummyOptions] | None

Configurable options per modality (optional)

None
Source code in vllm/multimodal/processing/dummy_inputs.py
def get_dummy_processor_inputs(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
    mm_options: Mapping[str, BaseDummyOptions] | None = None,
) -> ProcessorInputs:
    """
    Build the input which, after processing, results in
    the maximum possible number of placeholder tokens.

    Combines the dummy text and the parsed dummy multimodal items into
    a single `ProcessorInputs` instance.

    Args:
        seq_len: Sequence length
        mm_counts: Count of items per modality
        mm_options: Configurable options per modality (optional)
    """
    dummy_text = self.get_dummy_text(mm_counts)
    dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts, mm_options)
    dummy_mm_items = self.info.parse_mm_data(dummy_mm_data)

    # The dummy prompt must never be truncated during tokenization.
    tokenization_kwargs = {"truncation": False}

    return ProcessorInputs(
        prompt=dummy_text,
        mm_items=dummy_mm_items,
        tokenization_kwargs=tokenization_kwargs,
    )

get_dummy_text abstractmethod

get_dummy_text(mm_counts: Mapping[str, int]) -> str

Build the text input corresponding to mm_counts.

Source code in vllm/multimodal/processing/dummy_inputs.py
@abstractmethod
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
    """
    Build the text input corresponding to `mm_counts`.

    Args:
        mm_counts: Count of items per modality.

    Returns:
        Prompt text matching the given per-modality item counts.
    """
    raise NotImplementedError

ProcessorInputs dataclass

Represents the keyword arguments to vllm.multimodal.processing.BaseMultiModalProcessor.apply.

Source code in vllm/multimodal/processing/dummy_inputs.py
@dataclass
class ProcessorInputs:
    """
    Represents the keyword arguments to
    [`vllm.multimodal.processing.BaseMultiModalProcessor.apply`][].
    """

    # Prompt text, or a pre-tokenized list of token IDs.
    prompt: str | list[int]
    # Parsed multimodal data items.
    mm_items: MultiModalDataItems
    # NOTE(review): presumably extra kwargs forwarded to the HF processor
    # — confirm against `apply`'s signature.
    hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
    # Tokenization kwargs (e.g. {"truncation": False} for dummy inputs).
    tokenization_kwargs: Mapping[str, object] = field(default_factory=dict)

hf_processor_mm_kwargs class-attribute instance-attribute

hf_processor_mm_kwargs: Mapping[str, object] = field(
    default_factory=dict
)

mm_items instance-attribute

prompt instance-attribute

prompt: str | list[int]

tokenization_kwargs class-attribute instance-attribute

tokenization_kwargs: Mapping[str, object] = field(
    default_factory=dict
)

__init__

__init__(
    prompt: str | list[int],
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object] = dict(),
    tokenization_kwargs: Mapping[str, object] = dict(),
) -> None