# Source code for tianshou.highlevel.params.alpha

from abc import ABC, abstractmethod

import numpy as np
from sensai.util.string import ToStringMixin

from tianshou.algorithm.modelfree.sac import Alpha, AutoAlpha
from tianshou.highlevel.env import Environments
from tianshou.highlevel.module.core import TDevice
from tianshou.highlevel.params.optim import OptimizerFactoryFactory



[docs]
class AutoAlphaFactory(ToStringMixin, ABC):
    """Abstract factory which produces the entropy regularization coefficient object (`Alpha`)."""

    @abstractmethod
    def create_auto_alpha(
        self,
        envs: Environments,
        device: TDevice,
    ) -> Alpha:
        """Create the alpha object for the given environments.

        :param envs: the environments for which the alpha object is to be created
        :param device: the device specification passed on by the caller
        :return: the created `Alpha` instance
        """





[docs]
class AutoAlphaFactoryDefault(AutoAlphaFactory):
    """Factory creating an `AutoAlpha` whose target entropy is derived from the action space."""

    def __init__(
        self,
        lr: float = 3e-4,
        target_entropy_coefficient: float = -1.0,
        log_alpha: float = 0.0,
        optim: OptimizerFactoryFactory | None = None,
    ) -> None:
        """
        :param lr: the learning rate used by the optimizer of the alpha parameter
        :param target_entropy_coefficient: the factor by which the base target entropy is multiplied.
            The base value is `dim(A)` for continuous action spaces and `log(|A|)` for discrete
            action spaces, so the default coefficient of -1 yields `-dim(A)` and `-log(|A|)`
            respectively, which gives a reasonable trade-off between exploration and exploitation.
            For decidedly stochastic exploration, a positive value close to 1 (e.g. 0.98) can be used;
            1.0 would give full entropy exploration.
        :param log_alpha: the (initial) value of the log of the entropy regularization coefficient alpha
        :param optim: the optimizer factory factory to use; if None (or otherwise falsy),
            `OptimizerFactoryFactory.default()` is used
        """
        # NOTE: keep the assignment order stable; string representations may depend on it.
        self.lr = lr
        self.target_entropy_coefficient = target_entropy_coefficient
        self.log_alpha = log_alpha
        self.optimizer_factory_factory = optim if optim else OptimizerFactoryFactory.default()

    def create_auto_alpha(
        self,
        envs: Environments,
        device: TDevice,
    ) -> AutoAlpha:
        """Create an `AutoAlpha` with a target entropy scaled from the action space of `envs`.

        :param envs: the environments, providing the action shape and the environment type
        :param device: not used by this implementation
        :return: an `AutoAlpha` built from the scaled target entropy, the configured
            `log_alpha` and an optimizer factory using the configured learning rate
        """
        # Product over all entries of the action shape.
        action_size = np.prod(envs.get_action_shape())
        continuous = envs.get_type().is_continuous()
        # Continuous: the size itself is the base value; otherwise: its natural logarithm.
        base_entropy = float(action_size) if continuous else np.log(action_size)
        scaled_target_entropy = self.target_entropy_coefficient * base_entropy
        alpha_optim_factory = self.optimizer_factory_factory.create_optimizer_factory(lr=self.lr)
        return AutoAlpha(scaled_target_entropy, self.log_alpha, alpha_optim_factory)