Source code for tianshou.algorithm.random

from typing import cast

import gymnasium as gym
import numpy as np

from tianshou.algorithm.algorithm_base import OffPolicyAlgorithm, TrainingStats
from tianshou.algorithm.algorithm_base import Policy as BasePolicy
from tianshou.data import Batch
from tianshou.data.batch import BatchProtocol
from tianshou.data.types import ActBatchProtocol, ObsBatchProtocol, RolloutBatchProtocol



[docs]
class MARLRandomTrainingStats(TrainingStats):
    pass




[docs]
class MARLRandomDiscreteMaskedOffPolicyAlgorithm(OffPolicyAlgorithm):
    """A random agent used in multi-agent learning.

    It randomly chooses an action from the legal actions (according to the given mask).
    """


[docs]
    class Policy(BasePolicy):
        """A random agent used in multi-agent learning.

        It randomly chooses an action from the legal actions.
        """

        def __init__(self, action_space: gym.spaces.Space) -> None:
            super().__init__(action_space=action_space)


[docs]
        def forward(
            self,
            batch: ObsBatchProtocol,
            state: dict | BatchProtocol | np.ndarray | None = None,
            **kwargs: dict,
        ) -> ActBatchProtocol:
            """Compute the random action over the given batch data.

            The input should contain a mask in batch.obs, with "True" to be
            available and "False" to be unavailable. For example,
            ``batch.obs.mask == np.array([[False, True, False]])`` means with batch
            size 1, action "1" is available but action "0" and "2" are unavailable.

            :return: A :class:`~tianshou.data.Batch` with "act" key, containing
                the random action.
            """
            mask = batch.obs.mask  # type: ignore
            logits = np.random.rand(*mask.shape)
            logits[~mask] = -np.inf
            result = Batch(act=logits.argmax(axis=-1))
            return cast(ActBatchProtocol, result)



    def __init__(self, action_space: gym.spaces.Space) -> None:
        """:param action_space: the environment's action space."""
        super().__init__(policy=self.Policy(action_space))

    def _update_with_batch(self, batch: RolloutBatchProtocol) -> MARLRandomTrainingStats:  # type: ignore
        """Since a random agent learns nothing, it returns an empty dict."""
        return MARLRandomTrainingStats()