vllm.v1.worker.block_table

logger `module-attribute` ¶

logger = init_logger(__name__)

BlockTable ¶

Source code in vllm/v1/worker/block_table.py

class BlockTable:
    """Block-ID table for a batch of requests plus a per-token slot mapping.

    Row ``r`` of the table holds the block IDs used by the request at batch
    index ``r``; ``num_blocks_per_row[r]`` counts how many leading entries of
    that row were written through ``add_row``/``append_row``. Edits are made
    on the CPU copy (``block_table_cpu`` and ``block_table_np``, a numpy view
    of the same memory) and pushed to ``block_table`` on ``device`` by
    ``commit_block_table``. The slot mapping uses the same CPU/device pairing.
    """

    def __init__(
        self,
        block_size: int,
        max_num_reqs: int,
        max_num_blocks_per_req: int,
        max_num_batched_tokens: int,
        pin_memory: bool,
        device: torch.device,
    ):
        """Allocate zero-filled block-table and slot-mapping buffers.

        Args:
            block_size: Number of token slots per block.
            max_num_reqs: Number of rows in the block table.
            max_num_blocks_per_req: Number of columns in the block table.
            max_num_batched_tokens: Length of the slot-mapping buffers.
            pin_memory: Forwarded to ``torch.zeros`` for the CPU buffers.
            device: Device holding the committed copies.
        """
        self.block_size = block_size
        self.max_num_reqs = max_num_reqs
        self.max_num_blocks_per_req = max_num_blocks_per_req
        self.max_num_batched_tokens = max_num_batched_tokens
        self.pin_memory = pin_memory
        self.device = device

        self.block_table = torch.zeros(
            (max_num_reqs, max_num_blocks_per_req),
            device=self.device,
            dtype=torch.int32,
        )
        self.block_table_cpu = torch.zeros(
            (max_num_reqs, max_num_blocks_per_req),
            device="cpu",
            dtype=torch.int32,
            pin_memory=pin_memory,
        )
        # Numpy view sharing memory with block_table_cpu: writes through
        # either one are visible through the other.
        self.block_table_np = self.block_table_cpu.numpy()
        self.num_blocks_per_row = np.zeros(max_num_reqs, dtype=np.int32)

        self.slot_mapping_cpu = torch.zeros(self.max_num_batched_tokens,
                                            dtype=torch.int64,
                                            device="cpu",
                                            pin_memory=self.pin_memory)
        self.slot_mapping_np = self.slot_mapping_cpu.numpy()
        self.slot_mapping = torch.zeros(self.max_num_batched_tokens,
                                        dtype=torch.int64,
                                        device=self.device)

    def append_row(
        self,
        block_ids: list[int],
        row_idx: int,
    ) -> None:
        """Append ``block_ids`` after the entries already stored in a row.

        An empty ``block_ids`` is a no-op.

        Raises:
            ValueError: From numpy, when the IDs cannot be broadcast into
                the columns remaining in the row.
        """
        if not block_ids:
            return
        num_blocks = len(block_ids)
        start = self.num_blocks_per_row[row_idx]
        # Write first, bump the count second: if the write raises, the
        # count still matches what is actually stored in the row.
        self.block_table_np[row_idx, start:start + num_blocks] = block_ids
        self.num_blocks_per_row[row_idx] += num_blocks

    def add_row(self, block_ids: list[int], row_idx: int) -> None:
        """Restart row ``row_idx`` from column 0 with ``block_ids``.

        Entries beyond the new IDs are not cleared.
        """
        self.num_blocks_per_row[row_idx] = 0
        self.append_row(block_ids, row_idx)

    def move_row(self, src: int, tgt: int) -> None:
        """Copy the used part of row ``src`` over row ``tgt``.

        Row ``src`` is left unchanged; ``tgt`` takes over its block count.
        """
        num_blocks = self.num_blocks_per_row[src]
        self.block_table_np[tgt, :num_blocks] = self.block_table_np[
            src, :num_blocks]
        self.num_blocks_per_row[tgt] = num_blocks

    def swap_row(self, src: int, tgt: int) -> None:
        """Exchange rows ``src`` and ``tgt`` together with their counts."""
        num_blocks_src = self.num_blocks_per_row[src]
        num_blocks_tgt = self.num_blocks_per_row[tgt]
        self.num_blocks_per_row[src] = num_blocks_tgt
        self.num_blocks_per_row[tgt] = num_blocks_src

        # Fancy indexing on the right-hand side makes a copy, so the two
        # full rows are exchanged safely in one assignment.
        self.block_table_np[[src, tgt]] = self.block_table_np[[tgt, src]]

    def compute_slot_mapping(self, req_indices: np.ndarray,
                             positions: np.ndarray) -> None:
        """Fill ``slot_mapping_np`` for the given tokens.

        For token ``i`` the block ID is read from row ``req_indices[i]``,
        column ``positions[i] // block_size`` of the CPU block table, and
        ``block_id * block_size + positions[i] % block_size`` is written to
        ``slot_mapping_np[i]``.

        Args:
            req_indices: Row index in the block table for each token.
            positions: Position of each token within its request.
        """
        # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
        # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1]
        # where K is the max_num_blocks_per_req and the block size is 2.
        # NOTE(woosuk): We can't simply use `token_indices // block_size`
        # here because M (max_model_len) is not necessarily divisible by
        # block_size.
        block_table_indices = (req_indices * self.max_num_blocks_per_req +
                               positions // self.block_size)
        # The table stores int32 block IDs. Widen to int64 before scaling,
        # otherwise block_id * block_size is computed in int32 and wraps
        # silently before being stored in the int64 slot mapping.
        block_numbers = self.block_table_np.ravel()[
            block_table_indices].astype(np.int64)
        block_offsets = positions % self.block_size
        np.add(block_numbers * self.block_size,
               block_offsets,
               out=self.slot_mapping_np[:req_indices.shape[0]])

    def commit_block_table(self, num_reqs: int) -> None:
        """Copy the first ``num_reqs`` CPU rows to the device block table.

        The copy is requested with ``non_blocking=True``.
        """
        self.block_table[:num_reqs].copy_(self.block_table_cpu[:num_reqs],
                                          non_blocking=True)

    def commit_slot_mapping(self, num_tokens: int) -> None:
        """Copy the first ``num_tokens`` CPU slot entries to the device.

        The copy is requested with ``non_blocking=True``.
        """
        self.slot_mapping[:num_tokens].copy_(
            self.slot_mapping_cpu[:num_tokens], non_blocking=True)

    def clear(self) -> None:
        """Zero the device and CPU block tables in place.

        ``num_blocks_per_row`` is not modified here.
        """
        self.block_table.fill_(0)
        self.block_table_cpu.fill_(0)

    def get_device_tensor(self) -> torch.Tensor:
        """Returns the device tensor of the block table."""
        return self.block_table

    def get_cpu_tensor(self) -> torch.Tensor:
        """Returns the CPU tensor of the block table."""
        return self.block_table_cpu

    def get_numpy_array(self) -> np.ndarray:
        """Returns the numpy array of the block table."""
        return self.block_table_np

block_size `instance-attribute` ¶

block_size = block_size

block_table `instance-attribute` ¶

block_table = zeros(
    (max_num_reqs, max_num_blocks_per_req),
    device=device,
    dtype=int32,
)

block_table_cpu `instance-attribute` ¶

block_table_cpu = zeros(
    (max_num_reqs, max_num_blocks_per_req),
    device="cpu",
    dtype=int32,
    pin_memory=pin_memory,
)

block_table_np `instance-attribute` ¶

block_table_np = numpy()

device `instance-attribute` ¶

device = device

max_num_batched_tokens `instance-attribute` ¶

max_num_batched_tokens = max_num_batched_tokens

max_num_blocks_per_req `instance-attribute` ¶

max_num_blocks_per_req = max_num_blocks_per_req

max_num_reqs `instance-attribute` ¶

max_num_reqs = max_num_reqs

num_blocks_per_row `instance-attribute` ¶

num_blocks_per_row = zeros(max_num_reqs, dtype=int32)

pin_memory `instance-attribute` ¶

pin_memory = pin_memory

slot_mapping `instance-attribute` ¶

slot_mapping = zeros(
    max_num_batched_tokens, dtype=int64, device=device
)

slot_mapping_cpu `instance-attribute` ¶

slot_mapping_cpu = zeros(
    max_num_batched_tokens,
    dtype=int64,
    device="cpu",
    pin_memory=pin_memory,
)

slot_mapping_np `instance-attribute` ¶

slot_mapping_np = numpy()

__init__ ¶

__init__(
    block_size: int,
    max_num_reqs: int,
    max_num_blocks_per_req: int,
    max_num_batched_tokens: int,
    pin_memory: bool,
    device: device,
)

Source code in vllm/v1/worker/block_table.py

def __init__(
    self,
    block_size: int,
    max_num_reqs: int,
    max_num_blocks_per_req: int,
    max_num_batched_tokens: int,
    pin_memory: bool,
    device: torch.device,
):
    """Store the sizing parameters and allocate zero-filled buffers.

    The block table is a ``(max_num_reqs, max_num_blocks_per_req)`` int32
    tensor and the slot mapping a ``max_num_batched_tokens`` int64 vector.
    Each exists once on ``device`` and once on the CPU; the CPU copies are
    also exposed as numpy views sharing their memory.
    """
    self.block_size = block_size
    self.max_num_reqs = max_num_reqs
    self.max_num_blocks_per_req = max_num_blocks_per_req
    self.max_num_batched_tokens = max_num_batched_tokens
    self.pin_memory = pin_memory
    self.device = device

    # Block table: device copy, CPU copy, numpy view of the CPU copy.
    table_shape = (max_num_reqs, max_num_blocks_per_req)
    self.block_table = torch.zeros(table_shape,
                                   dtype=torch.int32,
                                   device=device)
    self.block_table_cpu = torch.zeros(table_shape,
                                       dtype=torch.int32,
                                       device="cpu",
                                       pin_memory=pin_memory)
    self.block_table_np = self.block_table_cpu.numpy()
    # Number of entries written so far in each row.
    self.num_blocks_per_row = np.zeros(max_num_reqs, dtype=np.int32)

    # Slot mapping: CPU copy, numpy view of it, then the device copy.
    self.slot_mapping_cpu = torch.zeros(max_num_batched_tokens,
                                        dtype=torch.int64,
                                        device="cpu",
                                        pin_memory=pin_memory)
    self.slot_mapping_np = self.slot_mapping_cpu.numpy()
    self.slot_mapping = torch.zeros(max_num_batched_tokens,
                                    dtype=torch.int64,
                                    device=device)

add_row ¶

add_row(block_ids: list[int], row_idx: int) -> None

Source code in vllm/v1/worker/block_table.py

def add_row(self, block_ids: list[int], row_idx: int) -> None:
    """Restart row ``row_idx`` from column 0 with ``block_ids``.

    The row's block count is reset to zero and the IDs are then written
    through ``append_row``. Table entries beyond the new IDs are not
    cleared here.
    """
    row_counts = self.num_blocks_per_row
    row_counts[row_idx] = 0
    self.append_row(block_ids, row_idx)

append_row ¶

append_row(block_ids: list[int], row_idx: int) -> None

Source code in vllm/v1/worker/block_table.py

def append_row(
    self,
    block_ids: list[int],
    row_idx: int,
) -> None:
    """Append ``block_ids`` after the entries already stored in a row.

    An empty ``block_ids`` is a no-op.

    Args:
        block_ids: Block IDs to store, in order.
        row_idx: Row of the block table to extend.

    Raises:
        ValueError: From numpy, when the IDs cannot be broadcast into the
            columns remaining in the row.
    """
    if not block_ids:
        return
    num_blocks = len(block_ids)
    start = self.num_blocks_per_row[row_idx]
    # Write first, bump the count second: if the write raises, the count
    # still matches what is actually stored in the row.
    self.block_table_np[row_idx, start:start + num_blocks] = block_ids
    self.num_blocks_per_row[row_idx] += num_blocks

clear ¶

clear() -> None

Source code in vllm/v1/worker/block_table.py

def clear(self) -> None:
    """Zero the device and CPU block tables in place.

    Only the two tensors are touched; ``num_blocks_per_row`` keeps its
    current values.
    """
    for table in (self.block_table, self.block_table_cpu):
        table.fill_(0)

commit_block_table ¶

commit_block_table(num_reqs: int) -> None

Source code in vllm/v1/worker/block_table.py

def commit_block_table(self, num_reqs: int) -> None:
    """Copy the first ``num_reqs`` CPU rows into the device block table.

    The copy is requested with ``non_blocking=True``; rows at or beyond
    ``num_reqs`` are not touched.
    """
    host_rows = self.block_table_cpu[:num_reqs]
    device_rows = self.block_table[:num_reqs]
    device_rows.copy_(host_rows, non_blocking=True)

commit_slot_mapping ¶

commit_slot_mapping(num_tokens: int) -> None

Source code in vllm/v1/worker/block_table.py

def commit_slot_mapping(self, num_tokens: int) -> None:
    """Copy the first ``num_tokens`` CPU slot entries to the device copy.

    The copy is requested with ``non_blocking=True``; entries at or beyond
    ``num_tokens`` are not touched.
    """
    host_slots = self.slot_mapping_cpu[:num_tokens]
    device_slots = self.slot_mapping[:num_tokens]
    device_slots.copy_(host_slots, non_blocking=True)

compute_slot_mapping ¶

compute_slot_mapping(
    req_indices: ndarray, positions: ndarray
) -> None

Source code in vllm/v1/worker/block_table.py

def compute_slot_mapping(self, req_indices: np.ndarray,
                         positions: np.ndarray) -> None:
    """Fill ``slot_mapping_np`` for the given tokens.

    For token ``i`` the block ID is read from row ``req_indices[i]``,
    column ``positions[i] // block_size`` of the CPU block table, and
    ``block_id * block_size + positions[i] % block_size`` is written to
    ``slot_mapping_np[i]``.

    Args:
        req_indices: Row index in the block table for each token.
        positions: Position of each token within its request.
    """
    # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
    # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1]
    # where K is the max_num_blocks_per_req and the block size is 2.
    # NOTE(woosuk): We can't simply use `token_indices // block_size`
    # here because M (max_model_len) is not necessarily divisible by
    # block_size.
    block_table_indices = (req_indices * self.max_num_blocks_per_req +
                           positions // self.block_size)
    # The table stores int32 block IDs. Widen to int64 before scaling,
    # otherwise block_id * block_size is computed in int32 and wraps
    # silently before being stored in the int64 slot mapping.
    block_numbers = self.block_table_np.ravel()[
        block_table_indices].astype(np.int64)
    block_offsets = positions % self.block_size
    np.add(block_numbers * self.block_size,
           block_offsets,
           out=self.slot_mapping_np[:req_indices.shape[0]])

get_cpu_tensor ¶

get_cpu_tensor() -> Tensor

Returns the CPU tensor of the block table.

Source code in vllm/v1/worker/block_table.py

def get_cpu_tensor(self) -> torch.Tensor:
    """Return ``self.block_table_cpu``, the CPU copy of the block table.

    The tensor itself is returned, not a clone.
    """
    return self.block_table_cpu

get_device_tensor ¶

get_device_tensor() -> Tensor

Returns the device tensor of the block table.

Source code in vllm/v1/worker/block_table.py

def get_device_tensor(self) -> torch.Tensor:
    """Return ``self.block_table``, the device copy of the block table.

    The tensor itself is returned, not a clone.
    """
    return self.block_table

get_numpy_array ¶

get_numpy_array() -> ndarray

Returns the numpy array of the block table.

Source code in vllm/v1/worker/block_table.py

def get_numpy_array(self) -> np.ndarray:
    """Return ``self.block_table_np``, the numpy form of the block table.

    The array itself is returned, not a copy.
    """
    return self.block_table_np

move_row ¶

move_row(src: int, tgt: int) -> None

Source code in vllm/v1/worker/block_table.py

def move_row(self, src: int, tgt: int) -> None:
    """Copy the used part of row ``src`` over row ``tgt``.

    Only the first ``num_blocks_per_row[src]`` entries are copied; row
    ``src`` and any later entries of row ``tgt`` are left as they were.
    Row ``tgt`` then takes over the block count of ``src``.
    """
    table = self.block_table_np
    used = self.num_blocks_per_row[src]
    table[tgt, :used] = table[src, :used]
    self.num_blocks_per_row[tgt] = used

swap_row ¶

swap_row(src: int, tgt: int) -> None

Source code in vllm/v1/worker/block_table.py

def swap_row(self, src: int, tgt: int) -> None:
    """Exchange rows ``src`` and ``tgt`` together with their block counts.

    The full rows are exchanged, not just their used prefixes.
    """
    counts = self.num_blocks_per_row
    counts[src], counts[tgt] = counts[tgt], counts[src]

    # Fancy indexing on the right-hand side makes a copy, so both rows can
    # be exchanged in a single assignment.
    table = self.block_table_np
    table[[src, tgt]] = table[[tgt, src]]

MultiGroupBlockTable ¶

The BlockTables for each KV cache group.

Source code in vllm/v1/worker/block_table.py

class MultiGroupBlockTable:
    """The BlockTables for each KV cache group.

    Holds one ``BlockTable`` per entry of ``block_sizes`` and forwards
    every row and commit operation to all of them in list order.
    """

    def __init__(self, max_num_reqs: int, max_model_len: int,
                 max_num_batched_tokens: int, pin_memory: bool,
                 device: torch.device, block_sizes: list[int]) -> None:
        """Create one ``BlockTable`` per block size.

        Each table gets ``cdiv(max_model_len, block_size)`` columns
        (presumably a ceiling division; confirm against ``cdiv``).
        """
        tables = []
        for block_size in block_sizes:
            blocks_per_req = cdiv(max_model_len, block_size)
            tables.append(
                BlockTable(block_size, max_num_reqs, blocks_per_req,
                           max_num_batched_tokens, pin_memory, device))
        self.block_tables = tables

    def append_row(self, block_ids: tuple[list[int], ...],
                   row_idx: int) -> None:
        """Append ``block_ids[i]`` to row ``row_idx`` of the i-th table."""
        for group_idx, table in enumerate(self.block_tables):
            table.append_row(block_ids[group_idx], row_idx)

    def add_row(self, block_ids: tuple[list[int], ...], row_idx: int) -> None:
        """Call ``add_row(block_ids[i], row_idx)`` on the i-th table."""
        for group_idx, table in enumerate(self.block_tables):
            table.add_row(block_ids[group_idx], row_idx)

    def move_row(self, src: int, tgt: int) -> None:
        """Forward ``move_row(src, tgt)`` to every table."""
        for table in self.block_tables:
            table.move_row(src, tgt)

    def swap_row(self, src: int, tgt: int) -> None:
        """Forward ``swap_row(src, tgt)`` to every table."""
        for table in self.block_tables:
            table.swap_row(src, tgt)

    def compute_slot_mapping(self, req_indices: np.ndarray,
                             positions: np.ndarray) -> None:
        """Forward the same ``req_indices``/``positions`` to every table."""
        for table in self.block_tables:
            table.compute_slot_mapping(req_indices, positions)

    def commit_block_table(self, num_reqs: int) -> None:
        """Forward ``commit_block_table(num_reqs)`` to every table."""
        for table in self.block_tables:
            table.commit_block_table(num_reqs)

    def commit_slot_mapping(self, num_tokens: int) -> None:
        """Forward ``commit_slot_mapping(num_tokens)`` to every table."""
        for table in self.block_tables:
            table.commit_slot_mapping(num_tokens)

    def clear(self) -> None:
        """Forward ``clear()`` to every table."""
        for table in self.block_tables:
            table.clear()

    def __getitem__(self, idx: int) -> "BlockTable":
        """Returns the BlockTable for the i-th KV cache group."""
        return self.block_tables[idx]

block_tables `instance-attribute` ¶

block_tables = [
    (
        BlockTable(
            block_size,
            max_num_reqs,
            cdiv(max_model_len, block_size),
            max_num_batched_tokens,
            pin_memory,
            device,
        )
    )
    for block_size in block_sizes
]

__getitem__ ¶

__getitem__(idx: int) -> BlockTable

Returns the BlockTable for the i-th KV cache group.

Source code in vllm/v1/worker/block_table.py

def __getitem__(self, idx: int) -> "BlockTable":
    """Return the BlockTable of the KV cache group at position ``idx``.

    ``idx`` is used to index ``self.block_tables`` directly.
    """
    tables = self.block_tables
    return tables[idx]

__init__ ¶

__init__(
    max_num_reqs: int,
    max_model_len: int,
    max_num_batched_tokens: int,
    pin_memory: bool,
    device: device,
    block_sizes: list[int],
) -> None

Source code in vllm/v1/worker/block_table.py

def __init__(self, max_num_reqs: int, max_model_len: int,
             max_num_batched_tokens: int, pin_memory: bool,
             device: torch.device, block_sizes: list[int]) -> None:
    """Create one ``BlockTable`` per entry of ``block_sizes``.

    Each table gets ``cdiv(max_model_len, block_size)`` columns
    (presumably a ceiling division; confirm against ``cdiv``) and shares
    the remaining arguments with the other tables.
    """
    tables = []
    for block_size in block_sizes:
        blocks_per_req = cdiv(max_model_len, block_size)
        tables.append(
            BlockTable(block_size, max_num_reqs, blocks_per_req,
                       max_num_batched_tokens, pin_memory, device))
    self.block_tables = tables

add_row ¶

add_row(
    block_ids: tuple[list[int], ...], row_idx: int
) -> None

Source code in vllm/v1/worker/block_table.py

def add_row(self, block_ids: tuple[list[int], ...], row_idx: int) -> None:
    """Call ``add_row(block_ids[i], row_idx)`` on the i-th block table."""
    for group_idx, table in enumerate(self.block_tables):
        table.add_row(block_ids[group_idx], row_idx)

append_row ¶

append_row(
    block_ids: tuple[list[int], ...], row_idx: int
) -> None

Source code in vllm/v1/worker/block_table.py

def append_row(self, block_ids: tuple[list[int], ...],
               row_idx: int) -> None:
    """Call ``append_row(block_ids[i], row_idx)`` on the i-th block table."""
    for group_idx, table in enumerate(self.block_tables):
        table.append_row(block_ids[group_idx], row_idx)

clear ¶

clear() -> None

Source code in vllm/v1/worker/block_table.py

def clear(self) -> None:
    """Call ``clear()`` on every block table, in list order."""
    for table in self.block_tables:
        table.clear()

commit_block_table ¶

commit_block_table(num_reqs: int) -> None

Source code in vllm/v1/worker/block_table.py

def commit_block_table(self, num_reqs: int) -> None:
    """Call ``commit_block_table(num_reqs)`` on every block table."""
    for table in self.block_tables:
        table.commit_block_table(num_reqs)

commit_slot_mapping ¶

commit_slot_mapping(num_tokens: int) -> None

Source code in vllm/v1/worker/block_table.py

def commit_slot_mapping(self, num_tokens: int) -> None:
    """Call ``commit_slot_mapping(num_tokens)`` on every block table."""
    for table in self.block_tables:
        table.commit_slot_mapping(num_tokens)

compute_slot_mapping ¶

compute_slot_mapping(
    req_indices: ndarray, positions: ndarray
) -> None

Source code in vllm/v1/worker/block_table.py

def compute_slot_mapping(self, req_indices: np.ndarray,
                         positions: np.ndarray) -> None:
    """Forward the same ``req_indices`` and ``positions`` to every table.

    Each block table's own ``compute_slot_mapping`` is called in list
    order with the unchanged arguments.
    """
    for table in self.block_tables:
        table.compute_slot_mapping(req_indices, positions)

move_row ¶

move_row(src: int, tgt: int) -> None

Source code in vllm/v1/worker/block_table.py

def move_row(self, src: int, tgt: int) -> None:
    """Call ``move_row(src, tgt)`` on every block table."""
    for table in self.block_tables:
        table.move_row(src, tgt)

swap_row ¶

swap_row(src: int, tgt: int) -> None

Source code in vllm/v1/worker/block_table.py

def swap_row(self, src: int, tgt: int) -> None:
    """Call ``swap_row(src, tgt)`` on every block table."""
    for table in self.block_tables:
        table.swap_row(src, tgt)

vllm.v1.worker.block_table

logger module-attribute ¶

BlockTable ¶

block_size instance-attribute ¶

block_table instance-attribute ¶

block_table_cpu instance-attribute ¶

block_table_np instance-attribute ¶

device instance-attribute ¶

max_num_batched_tokens instance-attribute ¶

max_num_blocks_per_req instance-attribute ¶

max_num_reqs instance-attribute ¶

num_blocks_per_row instance-attribute ¶

pin_memory instance-attribute ¶

slot_mapping instance-attribute ¶

slot_mapping_cpu instance-attribute ¶

slot_mapping_np instance-attribute ¶

__init__ ¶

add_row ¶

append_row ¶

clear ¶

commit_block_table ¶

commit_slot_mapping ¶

compute_slot_mapping ¶

get_cpu_tensor ¶

get_device_tensor ¶

get_numpy_array ¶

move_row ¶

swap_row ¶

MultiGroupBlockTable ¶

block_tables instance-attribute ¶

__getitem__ ¶

__init__ ¶

add_row ¶

append_row ¶

clear ¶

commit_block_table ¶

commit_slot_mapping ¶

compute_slot_mapping ¶

move_row ¶

swap_row ¶

logger `module-attribute` ¶

block_size `instance-attribute` ¶

block_table `instance-attribute` ¶

block_table_cpu `instance-attribute` ¶

block_table_np `instance-attribute` ¶

device `instance-attribute` ¶

max_num_batched_tokens `instance-attribute` ¶

max_num_blocks_per_req `instance-attribute` ¶

max_num_reqs `instance-attribute` ¶

num_blocks_per_row `instance-attribute` ¶

pin_memory `instance-attribute` ¶

slot_mapping `instance-attribute` ¶

slot_mapping_cpu `instance-attribute` ¶

slot_mapping_np `instance-attribute` ¶

__init__ ¶

block_tables `instance-attribute` ¶

__getitem__ ¶

__init__ ¶