vllm.model_executor.layers.fused_moe.utils
 
 _count_expert_num_tokens(
    topk_ids_ptr,
    expert_num_tokens_ptr,
    num_experts,
    topk_numel,
    expert_map,
    HAS_EXPERT_MAP: constexpr,
    BLOCK_SIZE: constexpr,
)
Source code in vllm/model_executor/layers/fused_moe/utils.py
  
 _fp4_quantize(
    A: Tensor,
    A_scale: Optional[Tensor],
    is_sf_swizzled_layout: bool,
) -> tuple[Tensor, Tensor]
Source code in vllm/model_executor/layers/fused_moe/utils.py
   
  A permutation routine that works on fp8 types.
Source code in vllm/model_executor/layers/fused_moe/utils.py
   
 _fp8_quantize(
    A: Tensor,
    A_scale: Optional[Tensor],
    per_act_token: bool,
    block_shape: Optional[list[int]] = None,
) -> tuple[Tensor, Tensor]
Perform fp8 quantization on the inputs. If a block_shape is provided, the output will be blocked.
Source code in vllm/model_executor/layers/fused_moe/utils.py
  
 _int8_quantize(
    A: Tensor,
    A_scale: Optional[Tensor],
    per_act_token: bool,
    block_shape: Optional[list[int]] = None,
) -> tuple[Tensor, Tensor]
Perform int8 quantization on the inputs. If a block_shape is provided, the output will be blocked.
Source code in vllm/model_executor/layers/fused_moe/utils.py
  
 _mxfp4_quantize(
    A: Tensor,
    A_scale: Optional[Tensor],
    per_act_token_quant: bool,
    block_shape: Optional[list[int]] = None,
) -> tuple[Tensor, None]
Source code in vllm/model_executor/layers/fused_moe/utils.py
  
  Shrink the given tensor and apply the given view to it. This is used to resize the intermediate fused_moe caches.
Source code in vllm/model_executor/layers/fused_moe/utils.py
  
 _validate_scale_shape(
    a: Tensor,
    a_scale: Optional[Tensor],
    per_act_token_quant: bool,
    block_shape: Optional[list[int]],
) -> None
Source code in vllm/model_executor/layers/fused_moe/utils.py
  
 count_expert_num_tokens(
    topk_ids: Tensor,
    num_local_experts: int,
    expert_map: Optional[Tensor],
) -> Tensor
Count the number of tokens assigned to each expert.
Parameters:
- topk_ids (torch.Tensor): Tensor mapping each token to its list of experts.
- num_local_experts (int): Number of experts in this rank.
- expert_map (Optional[torch.Tensor]): A tensor mapping expert indices from the global expert space to the local expert space of the expert parallel shard.
Returns: A tensor of size num_local_experts, where tensor[i] holds the number of tokens assigned to the ith expert.
Source code in vllm/model_executor/layers/fused_moe/utils.py
  
 extract_required_args(
    extra_args: Optional[dict[str, Any]],
    required_keys: list[str],
) -> tuple[Any, ...]
Source code in vllm/model_executor/layers/fused_moe/utils.py
  
 moe_kernel_quantize_input(
    A: Tensor,
    A_scale: Optional[Tensor],
    quant_dtype: Union[None, dtype, str],
    per_act_token_quant: bool,
    block_shape: Optional[list[int]] = None,
    is_fp4_scale_swizzled: bool = True,
) -> tuple[Tensor, Optional[Tensor]]