Source code for tianshou.algorithm.random

from typing import cast

import gymnasium as gym
import numpy as np

from tianshou.algorithm.algorithm_base import OffPolicyAlgorithm, TrainingStats
from tianshou.algorithm.algorithm_base import Policy as BasePolicy
from tianshou.data import Batch
from tianshou.data.batch import BatchProtocol
from tianshou.data.types import ActBatchProtocol, ObsBatchProtocol, RolloutBatchProtocol


class MARLRandomTrainingStats(TrainingStats):
    """Training statistics of the random multi-agent algorithm.

    Declares nothing beyond what :class:`TrainingStats` already provides.
    """
class MARLRandomDiscreteMaskedOffPolicyAlgorithm(OffPolicyAlgorithm):
    """A random agent used in multi-agent learning.

    It randomly chooses an action from the legal actions (according to the given mask).
    """

    class Policy(BasePolicy):
        """A random agent used in multi-agent learning.

        It randomly chooses an action from the legal actions.
        """

        def __init__(self, action_space: gym.spaces.Space) -> None:
            """:param action_space: the environment's action space."""
            super().__init__(action_space=action_space)

        def forward(
            self,
            batch: ObsBatchProtocol,
            state: dict | BatchProtocol | np.ndarray | None = None,
            **kwargs: dict,
        ) -> ActBatchProtocol:
            """Compute the random action over the given batch data.

            The input should contain a mask in batch.obs, with "True" to be
            available and "False" to be unavailable. For example,
            ``batch.obs.mask == np.array([[False, True, False]])`` means with batch
            size 1, action "1" is available but action "0" and "2" are unavailable.

            Masks that are not of boolean dtype (e.g. 0/1 integers) are interpreted
            by truthiness. If a row contains no available action at all, ``argmax``
            over its all ``-inf`` scores yields action "0".

            :param batch: the observation batch; ``batch.obs.mask`` is the only
                entry that is read.
            :param state: unused.
            :param kwargs: unused.
            :return: A :class:`~tianshou.data.Batch` with "act" key, containing the
                random action.
            """
            # Coerce to a boolean array first: on an integer 0/1 mask, ``~`` is a
            # bitwise NOT that yields -1/-2, which NumPy would then treat as
            # integer indices instead of a boolean selection.
            mask = np.asarray(batch.obs.mask, dtype=bool)  # type: ignore
            # Uniform random scores; illegal actions are pushed to -inf so that
            # the argmax over the last axis can only land on a legal action.
            logits = np.random.rand(*mask.shape)
            logits[~mask] = -np.inf
            result = Batch(act=logits.argmax(axis=-1))
            return cast(ActBatchProtocol, result)

    def __init__(self, action_space: gym.spaces.Space) -> None:
        """:param action_space: the environment's action space."""
        super().__init__(policy=self.Policy(action_space))

    def _update_with_batch(self, batch: RolloutBatchProtocol) -> MARLRandomTrainingStats:  # type: ignore
        """Since a random agent learns nothing, it returns an empty stats object."""
        return MARLRandomTrainingStats()