Refactor EpsilonGreedyAgentBase and concretions to improve encapsulation
In this commit, encapsulation in EpsilonGreedyAgentBase and its concretions (EpsilonGreedyAgent and AlphaEpsilonGreedyAgent) has been improved. The 'action_size' parameter has been removed from the base class's constructor, and each child class is now responsible for initializing its own 'action_values' array. A new abstract getter, 'action_values', has been added to the base class and must be implemented by every child class. This improves encapsulation by making 'action_values' private and accessible only through its getter, and it also removes the need for the TYPE_CHECKING-guarded import in 'base.py'. Although 'action_size' now has to be passed to each concretion's constructor, the change gives each implementation full control over its own 'action_values' and brings the code more in line with OOP principles.
nakashima-hikaru committed Nov 26, 2023
1 parent 0a708f6 commit f77b9e4
Showing 3 changed files with 38 additions and 20 deletions.
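The pattern this commit enforces is that every concretion constructs and owns its own action-value array and exposes it through the abstract 'action_values' getter. As a minimal sketch (not part of the commit), a hypothetical new concretion would now look roughly like this, assuming only the base-class import path shown in the diff:

import numpy as np
import numpy.typing as npt
from typing import Self, final

from reinforcement_learning.markov_decision_process.bandit_problem.agents.base import EpsilonGreedyAgentBase


@final
class ConstantValueAgent(EpsilonGreedyAgentBase):
    """Toy concretion: keeps every action value at zero and ignores rewards."""

    def __init__(self: Self, *, epsilon: float, action_size: int, seed: int | None = None) -> None:
        # action_size is no longer forwarded to the base class
        super().__init__(epsilon=epsilon, seed=seed)
        self.__action_values: npt.NDArray[np.float64] = np.zeros(action_size, dtype=np.float64)

    @property
    def action_values(self: Self) -> npt.NDArray[np.float64]:
        # satisfies the new abstract getter; the array itself stays private
        return self.__action_values

    def update(self: Self, *, i_action: int, reward: float) -> None:
        """A real agent would update self.__action_values here."""

The base class's get_action() keeps working unchanged because it now reads the estimates through this property rather than a protected attribute.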
@@ -4,13 +4,21 @@
"""
from typing import Self, final

import numpy as np
import numpy.typing as npt

from reinforcement_learning.markov_decision_process.bandit_problem.agents.base import EpsilonGreedyAgentBase


@final
class AlphaEpsilonGreedyAgent(EpsilonGreedyAgentBase):
"""Implement an Alpha Epsilon Greedy Agent for multi-armed bandit problems."""

@property
def action_values(self: Self) -> npt.NDArray[np.float64]:
"""Return the array of action values for the current agent."""
return self.__action_values

def __init__(self: Self, *, epsilon: float, action_size: int, alpha: float, seed: int | None = None) -> None:
"""Initialize AlphaEpsilonGreedyAgent.
@@ -21,8 +29,9 @@ def __init__(self: Self, *, epsilon: float, action_size: int, alpha: float, seed
            alpha: The learning rate for updating action values.
            seed: The seed value for random number generation. Must be an integer or None.
        """
        super().__init__(epsilon=epsilon, action_size=action_size, seed=seed)
        self.alpha: float = alpha
        super().__init__(epsilon=epsilon, seed=seed)
        self.__alpha: float = alpha
        self.__action_values: npt.NDArray[np.float64] = np.zeros(action_size, dtype=np.float64)

    def update(self: Self, i_action: int, reward: float) -> None:
        """Update the action-value estimation for the specified action using the given reward.
@@ -36,4 +45,4 @@ def update(self: Self, i_action: int, reward: float) -> None:
        -------
        None
        """
        self._action_values[i_action] += (reward - self._action_values[i_action]) * self.alpha
        self.__action_values[i_action] += (reward - self.__action_values[i_action]) * self.__alpha
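For reference, the update kept on the line above (only the attribute names changed in this commit) is the standard constant step-size rule from the bandit literature: with Q the current estimate for the chosen action, R the observed reward, and alpha the fixed learning rate,

Q \leftarrow Q + \alpha \, (R - Q)

i.e. an exponential recency-weighted average that favours recent rewards, which is why this agent stores a fixed alpha rather than per-action pull counts.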
@@ -6,12 +6,10 @@
estimation and 'get_action' to get the next action following the epsilon-greedy policy.
"""
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Self
from typing import Self

import numpy as np

if TYPE_CHECKING:
    import numpy.typing as npt
import numpy.typing as npt


class EpsilonGreedyAgentBase(ABC):
@@ -21,7 +19,7 @@ class EpsilonGreedyAgentBase(ABC):
    which is an exploration-exploitation algorithm commonly used in Reinforcement Learning.
    """

    def __init__(self: Self, *, epsilon: float, action_size: int, seed: int | None) -> None:
    def __init__(self: Self, *, epsilon: float, seed: int | None) -> None:
        """Initialize EpsilonGreedyAgentBase.
        Args:
@@ -31,11 +29,14 @@ def __init__(self: Self, *, epsilon: float, action_size: int, seed: int | None)
            seed: An optional seed value for random number generation.
        """
        self._epsilon: float = epsilon
        self._action_values: npt.NDArray[np.float64] = np.zeros(action_size, dtype=np.float64)
        self._ns: npt.NDArray[np.int64] = np.zeros(action_size, dtype=np.int64)
        self.__epsilon: float = epsilon
        self.__rng: np.random.Generator = np.random.default_rng(seed=seed)

    @property
    @abstractmethod
    def action_values(self: Self) -> npt.NDArray[np.float64]:
        """Return the action value."""

    @abstractmethod
    def update(self: Self, *, i_action: int, reward: float) -> None:
        """Update the agent's internal state based on the given action and reward.
@@ -49,6 +50,6 @@ def update(self: Self, *, i_action: int, reward: float) -> None:

    def get_action(self: Self) -> int:
        """Determine an action according to its policy."""
        if self.__rng.random() < self._epsilon:
            return int(self.__rng.integers(0, len(self._action_values)))
        return int(np.argmax(self._action_values))
        if self.__rng.random() < self.__epsilon:
            return int(self.__rng.integers(0, len(self.action_values)))
        return int(np.argmax(self.action_values))
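get_action() is unchanged apart from reading the estimates through the new property: with probability epsilon it samples an arm uniformly at random, otherwise it takes the greedy arm. For K arms this corresponds to the usual epsilon-greedy selection probabilities,

\pi(a) = \begin{cases} 1 - \varepsilon + \varepsilon / K & \text{if } a = \arg\max_b Q(b), \\ \varepsilon / K & \text{otherwise,} \end{cases}

since the uniform exploration draw can also land on the greedy arm.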
@@ -6,17 +6,23 @@
otherwise exploits its current knowledge.
The agent's state and action-value estimations are updated based on the rewards received after choosing actions.
"""
from typing import Final, Self, final
from typing import Self, final

from reinforcement_learning.markov_decision_process.bandit_problem.agents.base import EpsilonGreedyAgentBase
import numpy as np
import numpy.typing as npt

SEED: Final[int] = 0
from reinforcement_learning.markov_decision_process.bandit_problem.agents.base import EpsilonGreedyAgentBase


@final
class EpsilonGreedyAgent(EpsilonGreedyAgentBase):
"""An agent for Epsilon-greedy exploration strategy for the multi-armed bandit problem."""

@property
def action_values(self: Self) -> npt.NDArray[np.float64]:
"""Return the array of action values for the current agent."""
return self.__action_values

def __init__(self: Self, epsilon: float, action_size: int, seed: int | None = None) -> None:
"""Initialize an EpsilonGreedyAgent instance.
@@ -30,7 +36,9 @@ def __init__(self: Self, epsilon: float, action_size: int, seed: int | None = No
        -------
        None
        """
        super().__init__(epsilon=epsilon, action_size=action_size, seed=seed)
        super().__init__(epsilon=epsilon, seed=seed)
        self.__ns: npt.NDArray[np.int64] = np.zeros(action_size, dtype=np.int64)
        self.__action_values: npt.NDArray[np.float64] = np.zeros(action_size, dtype=np.float64)

    def update(self: Self, *, i_action: int, reward: float) -> None:
        """Update the agent's estimate of the action value based on the received reward.
@@ -44,5 +52,5 @@ def update(self: Self, *, i_action: int, reward: float) -> None:
        -------
        None
        """
        self._ns[i_action] += 1
        self._action_values[i_action] += (reward - self._action_values[i_action]) / self._ns[i_action]
        self.__ns[i_action] += 1
        self.__action_values[i_action] += (reward - self.__action_values[i_action]) / self.__ns[i_action]

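The rewritten update above implements the incremental sample-average estimate: with n the number of times the chosen action has been taken so far (tracked in the now-private __ns array) and R_n the latest reward for it,

Q_n = Q_{n-1} + \frac{1}{n}\,(R_n - Q_{n-1})

where Q_n is the estimate after the n-th pull of that arm; this is algebraically the mean of all rewards observed for the action, so no separate running sum needs to be stored.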