rlberry-py
diff --git a/‎.gitignore
+1 b/‎.gitignore
+1
diff --git a/‎README.md
+62 b/‎README.md
+62
diff --git a/‎empirical_rl/avec_ppo/__init__.py
+1 b/‎empirical_rl/avec_ppo/__init__.py
+1
diff --git a/‎empirical_rl/avec_ppo/avec_ppo.py
+143 b/‎empirical_rl/avec_ppo/avec_ppo.py
+143
diff --git a/‎empirical_rl/evaluating.py
+12 b/‎empirical_rl/evaluating.py
+12
diff --git a/‎empirical_rl/plotting.py
+29 b/‎empirical_rl/plotting.py
+29
diff --git a/‎empirical_rl/statistical_comparing.py
+40 b/‎empirical_rl/statistical_comparing.py
+40
diff --git a/‎empirical_rl/training.py
+85 b/‎empirical_rl/training.py
+85
diff --git a/‎imgs/ExpFlowChart.png
77.5 KB b/‎imgs/ExpFlowChart.png
77.5 KB
diff --git a/‎imgs/evaluations.png
17.4 KB b/‎imgs/evaluations.png
17.4 KB
diff --git a/‎imgs/explained_variance.png
51.4 KB b/‎imgs/explained_variance.png
51.4 KB
diff --git a/‎imgs/rewards.png
50.4 KB b/‎imgs/rewards.png
50.4 KB
diff --git a/‎imgs/value_loss.png
39.9 KB b/‎imgs/value_loss.png
39.9 KB
diff --git a/‎requirements.txt
+6 b/‎requirements.txt
+6
@@ -0,0 +1 @@
+__pycache__
@@ -0,0 +1,62 @@
+# Overview
+
+In this tutorial we are interested in reproducible reinforcement learning research. The experiments in this repository aim to reproduce some deep reinforcement learning results from the paper [Learning Value Functions in Deep Policy Gradients using Residual Variance](https://arxiv.org/pdf/2010.04440). To do so, we follow a specific [empirical protocol](https://arxiv.org/abs/2304.01315) and use open-source libraries that we introduce next.
+
+### Empirical reinforcement learning research.
+![protocol](imgs/ExpFlowChart.png "Empirical protocol from Andrew Patterson, Samuel Neumann, Martha White and Adam White")
+### Stable deep reinforcement learning agents and study tools (seeding, plotting, agents comparison).
+#### Stable-baselines3
+#### rlberry
+- seeding
+- agent manager
+- hyperparams optimization
+#### Adastop
+- statistically significant comparisons
+
+
+### Usage
+Tested on Python 3.10
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+cd empirical_rl
+```
+
+#### Training
+```bash
+python3 training.py
+```
+#### Plotting
+```bash
+python3 plotting.py
+```
+#### Evaluating
+```bash
+python3 evaluating.py
+```
+#### Adastop (long)
+```bash
+python3 statistical_comparing.py
+```
+
+
+## Expected Results
+![rewards](imgs/rewards.png)
+![value_loss](imgs/value_loss.png)
+![var](imgs/explained_variance.png)
+![eval](imgs/evaluations.png)
+#### Adastop expected results
+```bash
+[INFO] 13:10: Test finished 
+[INFO] 13:10: Results are  
+          Agent1 vs Agent2  mean Agent1  mean Agent2  mean diff decisions
+0  default_ppo vs avec_ppo      -86.636    -118.6952    32.0592     equal
+```
+
+
+# TODOs
+- Ant-v4
+- Loop over hyperparams and expand boundaries (hyperparam optim as per Patterson 2023)
+- Docstrings ?
+- Fix bug data loading for plotting data.
@@ -0,0 +1 @@
+from .avec_ppo import AVECPPO
@@ -0,0 +1,143 @@
+from typing import TypeVar
+
+import numpy as np
+import torch as th
+from gymnasium import spaces
+
+from stable_baselines3 import PPO
+from stable_baselines3.common.utils import explained_variance
+
+SelfAVECPPO = TypeVar("SelfAVECPPO", bound="AVECPPO")
+
+class AVECPPO(PPO):
+    """
+    PPO version of LEARNING VALUE FUNCTIONS IN DEEP POLICY GRADIENTS USING RESIDUAL VARIANCE.
+    Paper: https://arxiv.org/abs/2010.04440
+
+    Introduction to PPO: https://spinningup.openai.com/en/latest/algorithms/ppo.html
+    Full PPO documentation: https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html
+    """
+
+    def train(self) -> None:
+        """
+        Update policy using the currently gathered rollout buffer.
+        """
+        # Switch to train mode (this affects batch norm / dropout)
+        self.policy.set_training_mode(True)
+        # Update optimizer learning rate
+        self._update_learning_rate(self.policy.optimizer)
+        # Compute current clip range
+        clip_range = self.clip_range(self._current_progress_remaining)  # type: ignore[operator]
+        # Optional: clip range for the value function
+        if self.clip_range_vf is not None:
+            clip_range_vf = self.clip_range_vf(self._current_progress_remaining)  # type: ignore[operator]
+
+        entropy_losses = []
+        pg_losses, value_losses = [], []
+        clip_fractions = []
+
+        continue_training = True
+        # train for n_epochs epochs
+        for epoch in range(self.n_epochs):
+            approx_kl_divs = []
+            # Do a complete pass on the rollout buffer
+            for rollout_data in self.rollout_buffer.get(self.batch_size):
+                actions = rollout_data.actions
+                if isinstance(self.action_space, spaces.Discrete):
+                    # Convert discrete action from float to long
+                    actions = rollout_data.actions.long().flatten()
+
+                values, log_prob, entropy = self.policy.evaluate_actions(rollout_data.observations, actions)
+                values = values.flatten()
+                # Normalize advantage
+                advantages = rollout_data.advantages
+                # Normalization does not make sense if mini batchsize == 1, see GH issue #325
+                if self.normalize_advantage and len(advantages) > 1:
+                    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
+
+                # ratio between old and new policy, should be one at the first iteration
+                ratio = th.exp(log_prob - rollout_data.old_log_prob)
+
+                # clipped surrogate loss
+                policy_loss_1 = advantages * ratio
+                policy_loss_2 = advantages * th.clamp(ratio, 1 - clip_range, 1 + clip_range)
+                policy_loss = -th.min(policy_loss_1, policy_loss_2).mean()
+
+                # Logging
+                pg_losses.append(policy_loss.item())
+                clip_fraction = th.mean((th.abs(ratio - 1) > clip_range).float()).item()
+                clip_fractions.append(clip_fraction)
+
+                if self.clip_range_vf is None:
+                    # No clipping
+                    values_pred = values
+                else:
+                    # Clip the difference between old and new value
+                    # NOTE: this depends on the reward scaling
+                    values_pred = rollout_data.old_values + th.clamp(
+                        values - rollout_data.old_values, -clip_range_vf, clip_range_vf
+                    )
+                # Value loss using the TD(gae_lambda) target
+                # value_loss = F.mse_loss(rollout_data.returns, values_pred)
+
+                # NOTE here is the variance loss:
+                value_loss = th.var(rollout_data.returns - values_pred)
+                value_losses.append(value_loss.item())
+
+                # Entropy loss favor exploration
+                if entropy is None:
+                    # Approximate entropy when no analytical form
+                    entropy_loss = -th.mean(-log_prob)
+                else:
+                    entropy_loss = -th.mean(entropy)
+
+                entropy_losses.append(entropy_loss.item())
+
+                loss = policy_loss + self.ent_coef * entropy_loss + self.vf_coef * value_loss
+
+                # Calculate approximate form of reverse KL Divergence for early stopping
+                # see issue #417: https://github.com/DLR-RM/stable-baselines3/issues/417
+                # and discussion in PR #419: https://github.com/DLR-RM/stable-baselines3/pull/419
+                # and Schulman blog: http://joschu.net/blog/kl-approx.html
+                with th.no_grad():
+                    log_ratio = log_prob - rollout_data.old_log_prob
+                    approx_kl_div = th.mean((th.exp(log_ratio) - 1) - log_ratio).cpu().numpy()
+                    approx_kl_divs.append(approx_kl_div)
+
+                if self.target_kl is not None and approx_kl_div > 1.5 * self.target_kl:
+                    continue_training = False
+                    if self.verbose >= 1:
+                        print(f"Early stopping at step {epoch} due to reaching max kl: {approx_kl_div:.2f}")
+                    break
+
+                # Optimization step
+                self.policy.optimizer.zero_grad()
+                loss.backward()
+                # Clip grad norm
+                th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
+                self.policy.optimizer.step()
+
+            self._n_updates += 1
+            if not continue_training:
+                break
+
+        explained_var = explained_variance(self.rollout_buffer.values.flatten(), self.rollout_buffer.returns.flatten())
+
+        # Logs
+        self.logger.record("train/entropy_loss", np.mean(entropy_losses))
+        self.logger.record("train/policy_gradient_loss", np.mean(pg_losses))
+        self.logger.record("train/value_loss", np.mean(value_losses))
+        self.logger.record("train/approx_kl", np.mean(approx_kl_divs))
+        self.logger.record("train/clip_fraction", np.mean(clip_fractions))
+        self.logger.record("train/loss", loss.item())
+        self.logger.record("train/explained_variance", explained_var)
+        if hasattr(self.policy, "log_std"):
+            self.logger.record("train/std", th.exp(self.policy.log_std).mean().item())
+
+        self.logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
+        self.logger.record("train/clip_range", clip_range)
+        if self.clip_range_vf is not None:
+            self.logger.record("train/clip_range_vf", clip_range_vf)
+
+    
+
@@ -0,0 +1,12 @@
+from rlberry.manager import evaluate_agents
+import matplotlib.pyplot as plt
+
+
+# Aliases for the experiment result directories to evaluate.
+# NOTE(review): both aliases below are the same "avec-ppo_..." directory, so this
+# script evaluates one data set against itself. `default_ppo` presumably should
+# point to the default-PPO manager directory of the same run -- confirm the path.
+avec_ppo = "rlberry_data/temp/manager_data/avec-ppo_2024-07-03_11-09-17_06970bd6"
+default_ppo = "rlberry_data/temp/manager_data/avec-ppo_2024-07-03_11-09-17_06970bd6"
+_ = evaluate_agents(
+    [default_ppo, avec_ppo], n_simulations=50, show=False,
+)  # Evaluate the listed agents (n_simulations=50) without displaying the plot
+# Save the current matplotlib figure under the name "evaluations"
+plt.savefig("evaluations")
+
@@ -0,0 +1,29 @@
+from rlberry.manager import plot_writer_data
+
+
+# aliases for exp results 
+default_ppo = "/data_training_default_ppo/manager_data/ppo-default_2024-07-03_11-26-14_b0045e63/agent_handlers/"
+avec_ppo = "data_training_avec_ppo/manager_data/avec-ppo_2024-07-03_11-26-14_9e4b15e4/"
+
+_ = plot_writer_data([default_ppo, avec_ppo],
+    tag="rollout/ep_rew_mean",
+    title="Training Episode Cumulative Rewards",
+    show=False,
+    savefig_fname="rewards"
+)
+
+_ = plot_writer_data([default_ppo, avec_ppo],
+    tag="train/explained_variance",
+    title="Training Explained Variance",
+    show=False,
+    savefig_fname="explained_variance"
+)
+
+
+_ = plot_writer_data([default_ppo, avec_ppo],
+    tag="train/value_loss",
+    title="Training Value Loss",
+    show=False,
+    savefig_fname="value_loss"
+)
+
@@ -0,0 +1,40 @@
+from rlberry.manager import AdastopComparator
+from rlberry.agents.stable_baselines import StableBaselinesAgent
+from rlberry.envs import gym_make
+from rlberry.seeding import Seeder
+
+from stable_baselines3 import PPO
+from avec_ppo import AVECPPO
+
+
+seed = Seeder(42)
+
+managers = [
+dict(
+    agent_class=StableBaselinesAgent,  # The Agent class.
+    train_env=(gym_make, dict(id="Acrobot-v1")),  # The Environment to solve.
+    fit_budget=5e4,  # The number of interactions
+    # between the agent and the
+    # environment during training.
+    init_kwargs=dict(algo_cls=PPO),  # Init value for StableBaselinesAgent
+    eval_kwargs=dict(eval_horizon=500),  # The number of interactions
+    # between the agent and the
+    # environment during evaluations.
+    agent_name="default_ppo",  # The agent's name.
+),
+dict(
+    agent_class = StableBaselinesAgent,  # The Agent class.
+    train_env=(gym_make, dict(id="Acrobot-v1")),  # The Environment to solve.
+    fit_budget=5e4,  # The number of interactions
+    # between the agent and the
+    # environment during training.
+    init_kwargs=dict(algo_cls=AVECPPO),  # Init value for StableBaselinesAgent
+    eval_kwargs=dict(eval_horizon=500),  # The number of interactions
+    # between the agent and the
+    # environment during evaluations.
+    agent_name="avec_ppo",  # The agent's name.
+) ]
+# # Comparing distributions
+comparator = AdastopComparator(seed=42)
+comparator.compare(managers)
+print(comparator.managers_paths)
@@ -0,0 +1,85 @@
+from rlberry.manager import ExperimentManager
+from rlberry.envs import gym_make
+from rlberry.agents.stable_baselines import StableBaselinesAgent
+from rlberry.seeding import Seeder
+
+from stable_baselines3 import PPO
+from avec_ppo import AVECPPO
+
+seeder = Seeder(42)
+
+# The ExperimentManager class is a compact way of experimenting with a deepRL agent.
+default_xp = ExperimentManager(
+    StableBaselinesAgent,  # The Agent class.
+    (gym_make, dict(id="Acrobot-v1")),  # The Environment to solve.
+    fit_budget=5e4,  # The number of interactions
+    # between the agent and the
+    # environment during training.
+    init_kwargs=dict(algo_cls=PPO),  # Init value for StableBaselinesAgent
+    eval_kwargs=dict(eval_horizon=500),  # The number of interactions
+    # between the agent and the
+    # environment during evaluations.
+    n_fit=5,  # The number of agents to train.
+    # Usually, it is good to do more
+    # than 1 because the training is
+    # stochastic.
+    seed=seeder,
+    agent_name="default_ppo",  # The agent's name.
+    output_dir="data_training_default_ppo"
+)
+
+avec_xp = ExperimentManager(
+    StableBaselinesAgent,  # The Agent class.
+    (gym_make, dict(id="Acrobot-v1")),  # The Environment to solve.
+    fit_budget=5e4,  # The number of interactions
+    # between the agent and the
+    # environment during training.
+    init_kwargs=dict(algo_cls=AVECPPO),  # Init value for StableBaselinesAgent
+    eval_kwargs=dict(eval_horizon=500),  # The number of interactions
+    # between the agent and the
+    # environment during evaluations.
+    n_fit=5,  # The number of agents to train.
+    # Usually, it is good to do more
+    # than 1 because the training is
+    # stochastic.
+    seed=seeder,
+    agent_name="avec_ppo",  # The agent's name.
+    output_dir="data_training_avec_ppo"
+)
+
+default_xp.fit(), avec_xp.fit()
+
+
+
+# FOR TESTING PURPOSES
+from rlberry.manager import plot_writer_data
+
+_ = plot_writer_data([default_xp, avec_xp],
+    tag="rollout/ep_rew_mean",
+    title="Training Episode Cumulative Rewards",
+    show=False,
+    savefig_fname="rewards"
+)
+
+_ = plot_writer_data([default_xp, avec_xp],
+    tag="train/explained_variance",
+    title="Training Explained Variance",
+    show=False,
+    savefig_fname="explained_variance"
+)
+
+_ = plot_writer_data([default_xp, avec_xp],
+    tag="train/value_loss",
+    title="Training Value Loss",
+    show=False,
+    savefig_fname="value_loss"
+)
+
+from rlberry.manager import evaluate_agents
+import matplotlib.pyplot as plt
+
+# Comparing means
+_ = evaluate_agents(
+    [default_xp, avec_xp], n_simulations=50,show=False,
+)  # Evaluate the trained agent on
+plt.savefig("evaluations")
@@ -0,0 +1,6 @@
+rlberry==0.7
+numpy==1.25.2
+torch==2.3.1
+tensorboard==2.17.0
+stable-baselines3==2.2.1
+gymnasium[mujoco]==0.29.1