# Source code for robometric_frame.task_performance.task_completion_rate

"""Task Completion Rate metric for robotics policy evaluation.

Task Completion Rate (TCR) evaluates the ability to execute multi-step task sequences,
revealing critical limitations in current robotics policies when handling complex natural
language instructions that require multiple sequential actions.

Reference:
    A. Brohan et al., "RT-1: Robotics transformer for real-world control at scale,"
    arXiv preprint arXiv:2212.06817, 2022.
"""

from typing import Any, Optional

import torch
from torch import Tensor
from torchmetrics import Metric



class TaskCompletionRate(Metric):
    r"""Compute Task Completion Rate for robotics policy task chain evaluation.

    Task Completion Rate is calculated as:

    .. math::

        TCR = \frac{N_{\text{completed tasks}}}{N_{\text{task chains}}}

    where :math:`N_{\text{completed tasks}}` is the number of successfully
    completed task chains and :math:`N_{\text{task chains}}` is the total
    number of task chains attempted.

    This metric evaluates multi-step task sequences, measuring success rates across
    sequential steps. Research shows that success rates drop significantly between
    sequential steps, indicating challenges in complex instruction following.

    Two running counters are kept as metric states (``total_completed`` and
    ``total_chains``); both are summed across processes when the metric is
    synchronised, and the rate is their ratio.

    Args:
        threshold: Threshold for binary classification when using continuous scores.
            A chain counts as completed when its score is ``>= threshold``.
            If None, assumes binary inputs (0 or 1). Default: None.
        ignore_index: Value to ignore in the completion tensor. Entries equal to
            this value are dropped before thresholding/validation. Default: None.
        **kwargs: Additional keyword arguments passed to the base Metric class.

    Example:
        >>> import torch
        >>> from robometric_frame import TaskCompletionRate
        >>> metric = TaskCompletionRate()
        >>> # Binary completion indicators for task chains
        >>> completion = torch.tensor([1, 0, 1, 1, 0])
        >>> metric(completion)
        tensor(0.6000)

        >>> # With continuous scores and threshold
        >>> metric = TaskCompletionRate(threshold=0.8)
        >>> scores = torch.tensor([0.9, 0.7, 0.85, 0.95])
        >>> metric(scores)
        tensor(0.7500)

    Example (multi-step evaluation):
        >>> import torch
        >>> from robometric_frame import TaskCompletionRate
        >>> # Evaluate task chains over multiple batches
        >>> metric = TaskCompletionRate()
        >>> # First batch: 3 task chains, 2 completed
        >>> batch1 = torch.tensor([1, 0, 1])
        >>> metric.update(batch1)
        >>> # Second batch: 2 task chains, 1 completed
        >>> batch2 = torch.tensor([0, 1])
        >>> metric.update(batch2)
        >>> # Overall completion rate
        >>> metric.compute()
        tensor(0.6000)
    """

    # Class-level metric metadata.
    # A larger completion rate is better; declaring it lets utilities that need
    # the optimisation direction (e.g. metric trackers) use this metric without
    # the caller having to spell the direction out.
    higher_is_better: bool = True
    # update() only adds to the running counters, so it does not need access to
    # the full accumulated state.
    full_state_update: bool = False

    # Dynamically added by add_state() in __init__
    total_completed: Tensor
    total_chains: Tensor

    def __init__(
        self,
        threshold: Optional[float] = None,
        ignore_index: Optional[int] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize the TaskCompletionRate metric.

        Args:
            threshold: Score at or above which a chain counts as completed.
                ``None`` means inputs must already be binary (0 or 1).
            ignore_index: Value whose occurrences are excluded from the counts.
                ``None`` disables filtering.
            **kwargs: Forwarded unchanged to the base ``Metric`` constructor.
        """
        super().__init__(**kwargs)

        self.threshold = threshold
        self.ignore_index = ignore_index

        # Scalar counters, reduced with "sum" so that per-process counts add up
        # to the global counts in distributed runs.
        self.add_state("total_completed", default=torch.tensor(0.0), dist_reduce_fx="sum")
        self.add_state("total_chains", default=torch.tensor(0.0), dist_reduce_fx="sum")

    def update(self, completion: Tensor) -> None:  # pylint: disable=arguments-differ
        """Update metric state with new task chain completion indicators.

        Every element of ``completion`` is treated as one task chain; the tensor
        may have any shape (typically ``(N,)``).

        Args:
            completion: Tensor containing binary completion indicators (0 or 1),
                or continuous completion scores if ``threshold`` is set. Values
                can be int, float, or bool.

        Raises:
            ValueError: If ``completion`` has no elements, or if ``threshold`` is
                not set and some (non-ignored) value is neither 0 nor 1.

        Note:
            If ``ignore_index`` removes every element, the call leaves the
            counters untouched. A ``compute()`` made while the counters are still
            zero raises ``RuntimeError``.
        """
        if completion.numel() == 0:
            raise ValueError("Input tensor is empty")

        # Drop ignored entries first so they are neither validated nor counted.
        if self.ignore_index is not None:
            mask = completion != self.ignore_index
            completion = completion[mask]

            if completion.numel() == 0:
                return  # All values were ignored

        if self.threshold is not None:
            # Continuous scores: binarise with an inclusive threshold.
            completion = (completion >= self.threshold).float()
        else:
            # No threshold: the input itself must already be a 0/1 indicator.
            completion = completion.float()
            if not torch.all((completion == 0) | (completion == 1)):
                raise ValueError(
                    "Completion indicators must be binary (0 or 1) when threshold is not set. "
                    "Set threshold parameter for continuous scores."
                )

        # Each remaining element is one chain; the sum of the 0/1 values is the
        # number of completed chains.
        self.total_completed += completion.sum()  # pylint: disable=no-member
        self.total_chains += completion.numel()  # pylint: disable=no-member

    def compute(self) -> Tensor:
        """Compute the final Task Completion Rate.

        Returns:
            Task completion rate as a scalar tensor in range [0, 1]: the number of
            completed chains divided by the number of chains counted so far.

        Raises:
            RuntimeError: If no task chains have been recorded (total_chains == 0).
        """
        # Guard explicitly instead of returning NaN from a 0 / 0 division.
        if self.total_chains == 0:  # pylint: disable=no-member
            raise RuntimeError(
                "Cannot compute task completion rate: no task chains have been recorded. "
                "Call update() with completion indicators before compute()."
            )

        return self.total_completed.float() / self.total_chains  # pylint: disable=no-member