Howto 17 - (RL) Advanced training with stagnation detection

Ver. 1.2.1 (2022-02-27)

This module demonstrates advanced training with evaluation and stagnation detection.

Prerequisites

Please install the following packages to run this example properly:

  • NumPy

  • OpenAI Gym

Results

[Figure: Cartpole1.png]

After the multiple cartpole environments are initialised, the training runs until one of the specified limits is reached. If stagnation is detected before that, the training is stopped early. Along with the training result summary below, the logged data are stored in the location stated in the summary:

YYYY-MM-DD  HH:MM:SS.SSSSSS  W  Results  RL: ------------------------------------------------------------------------------
YYYY-MM-DD  HH:MM:SS.SSSSSS  W  Results  RL: -- Training Results of run 0
YYYY-MM-DD  HH:MM:SS.SSSSSS  W  Results  RL: ------------------------------------------------------------------------------
YYYY-MM-DD  HH:MM:SS.SSSSSS  W  Results  RL: ------------------------------------------------------------------------------
YYYY-MM-DD  HH:MM:SS.SSSSSS  W  Results  RL: -- Scenario          : RL-Scenario Matrix
YYYY-MM-DD  HH:MM:SS.SSSSSS  W  Results  RL: -- Model             : Agent Smith
YYYY-MM-DD  HH:MM:SS.SSSSSS  W  Results  RL: -- Start time stamp  : YYYY-MM-DD HH:MM:SS.SSSSSS
YYYY-MM-DD  HH:MM:SS.SSSSSS  W  Results  RL: -- End time stamp    : YYYY-MM-DD HH:MM:SS.SSSSSS
YYYY-MM-DD  HH:MM:SS.SSSSSS  W  Results  RL: -- Duration          : HH:MM:SS.SSSSSS
YYYY-MM-DD  HH:MM:SS.SSSSSS  W  Results  RL: -- Start cycle id    : 0
YYYY-MM-DD  HH:MM:SS.SSSSSS  W  Results  RL: -- End cycle id      :
YYYY-MM-DD  HH:MM:SS.SSSSSS  W  Results  RL: -- Training cycles   :
YYYY-MM-DD  HH:MM:SS.SSSSSS  W  Results  RL: -- Evaluation cycles :
YYYY-MM-DD  HH:MM:SS.SSSSSS  W  Results  RL: -- Adaptations       :
YYYY-MM-DD  HH:MM:SS.SSSSSS  W  Results  RL: -- High score        :
YYYY-MM-DD  HH:MM:SS.SSSSSS  W  Results  RL: -- Results stored in : "C:\Users\%username%\YYYY-MM-DD  HH:MM:SS Training RL"
YYYY-MM-DD  HH:MM:SS.SSSSSS  W  Results  RL: -- Training Episodes : 120
YYYY-MM-DD  HH:MM:SS.SSSSSS  W  Results  RL: -- Evaluations       : 25
YYYY-MM-DD  HH:MM:SS.SSSSSS  W  Results  RL: ------------------------------------------------------------------------------
YYYY-MM-DD  HH:MM:SS.SSSSSS  W  Results  RL: ------------------------------------------------------------------------------
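
The stopping behaviour summarised above is controlled by a handful of RLTraining parameters. The following excerpt repeats the relevant part of the training setup from the full example below; the annotations are paraphrases of the parameter names, so consult the MLPro API documentation for their exact semantics.

training = RLTraining(
        p_scenario_cls=MyScenario,      # scenario class defined in the example code below
        p_cycle_limit=1500,             # maximum number of training cycles
        p_adaptation_limit=50,          # maximum number of policy adaptations
        p_eval_frequency=10,            # frequency of the evaluation phases
        p_eval_grp_size=5,              # number of evaluation episodes per evaluation phase
        p_stagnation_limit=5,           # evaluations without progress that count as stagnation
        p_stagnation_entry=3,           # evaluations to complete before stagnation detection starts
        p_end_at_stagnation=True,       # stop the training run once stagnation is detected
        p_score_ma_horizon=3,           # moving average horizon used for the evaluation score
        p_path=str(Path.home()),        # results go into a time-stamped folder below this path
        p_logging=Log.C_LOG_WE )
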
In the results folder stated in the summary, you should find files including:
  • agent_actions.csv

  • env_rewards.csv

  • env_states.csv

  • evaluation.csv

  • summary.csv

  • Agent 0 Smith-1(0).pkl

  • Agent 1 Smith-2(1).pkl

No figure plot is initialised in this example, but the logged metrics can be accessed via the CSV files, e.g. as sketched below.
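
The following lines are a minimal sketch of such an inspection, assuming the results folder created below the home directory as shown in the summary above. The file names are taken from the list above, while the exact column layout and delimiter depend on the MLPro version and on your run.

import csv
from pathlib import Path

# Pick the most recent results folder below the home directory (folder name pattern
# taken from the training summary above; adjust the pattern if your path differs)
results_path = sorted(Path.home().glob('* Training RL'))[-1]

# Print the header and the first rows of the evaluation log; the delimiter is
# auto-detected because it may differ between MLPro versions
with open(results_path / 'evaluation.csv', newline='') as csvfile:
    dialect = csv.Sniffer().sniff(csvfile.read(2048))
    csvfile.seek(0)
    reader  = csv.reader(csvfile, dialect)
    print(next(reader))                 # column headers
    for i, row in enumerate(reader):
        print(row)
        if i >= 4: break                # show the first five data rows only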

Example Code

## -------------------------------------------------------------------------------------------------
## -- Project : MLPro - A Synoptic Framework for Standardized Machine Learning Tasks
## -- Package : mlpro
## -- Module  : Howto 17 - Advanced training with stagnation detection
## -------------------------------------------------------------------------------------------------
## -- History :
## -- yyyy-mm-dd  Ver.      Auth.    Description
## -- 2021-12-11  0.0.0     DA       Creation
## -- 2021-12-12  1.0.0     DA       Released first version
## -- 2022-02-04  1.1.0     DA       Introduction of parameter p_stagnation_entry
## -- 2022-02-10  1.2.0     DA       Introduction of parameter p_end_at_stagnation
## -- 2022-02-27  1.2.1     SY       Refactoring due to auto generated ID in class Dimension
## -------------------------------------------------------------------------------------------------

"""
Ver. 1.2.1 (2022-02-27)

This module demonstrates advanced training with evaluation and stagnation detection.
"""


from mlpro.rl.models import *
from mlpro.rl.pool.envs.multicartpole import MultiCartPole
import random
from pathlib import Path




# 1 Implement your own agent policy
class MyPolicy (Policy):

    C_NAME      = 'MyPolicy'

    def set_random_seed(self, p_seed=None):
        random.seed(p_seed)


    def compute_action(self, p_state: State) -> Action:
        # 1 Create a numpy array for your action values 
        my_action_values = np.zeros(self._action_space.get_num_dim())

        # 2 Computing action values is up to you...
        for d in range(self._action_space.get_num_dim()):
            my_action_values[d] = random.random() 

        # 3 Return an action object with your values
        return Action(self._id, self._action_space, my_action_values)


    def _adapt(self, *p_args) -> bool:
        # 1 Adapting the internal policy is up to you...
        self.log(self.C_LOG_TYPE_I, 'Sorry, I am a stupid agent...')

        # 2 Only return True if something has been adapted...
        return False




# 2 Implement your own RL scenario
class MyScenario (RLScenario):

    C_NAME      = 'Matrix'

    def _setup(self, p_mode, p_ada, p_logging):

        # 1 Setup Multi-Agent Environment (consisting of 3 OpenAI Gym Cartpole envs)
        self._env   = MultiCartPole(p_num_envs=3, p_reward_type=Reward.C_TYPE_EVERY_AGENT, p_logging=p_logging)


        # 2 Setup Multi-Agent 

        # 2.1 Create empty Multi-Agent
        multi_agent = MultiAgent(
            p_name='Smith',
            p_ada=True,
            p_logging=p_logging
        )

        # 2.2 Add Single-Agent #1 with own policy (controlling sub-environment #1)
        ss_ids = self._env.get_state_space().get_dim_ids()
        as_ids = self._env.get_action_space().get_dim_ids()
        multi_agent.add_agent(
            p_agent=Agent(
                p_policy=MyPolicy(
                    p_observation_space=self._env.get_state_space().spawn([ss_ids[0],ss_ids[1],ss_ids[2],ss_ids[3]]),
                    p_action_space=self._env.get_action_space().spawn([as_ids[0]]),
                    p_buffer_size=1,
                    p_ada=True,
                    p_logging=p_logging
                ),
                p_envmodel=None,
                p_name='Smith-1',
                p_id=0,
                p_ada=True,
                p_logging=p_logging
            ),
            p_weight=0.3
        )

        # 2.3 Add Single-Agent #2 with own policy (controlling sub-environments #2,#3)
        multi_agent.add_agent(
            p_agent=Agent(
                p_policy=MyPolicy(
                    p_observation_space=self._env.get_state_space().spawn([ss_ids[4],ss_ids[5],ss_ids[6],ss_ids[7],ss_ids[8],ss_ids[9],ss_ids[10],ss_ids[11]]),
                    p_action_space=self._env.get_action_space().spawn([as_ids[1],as_ids[2]]),
                    p_buffer_size=1,
                    p_ada=True,
                    p_logging=p_logging
                ),
                p_envmodel=None,
                p_name='Smith-2',
                p_id=1,
                p_ada=True,
                p_logging=p_logging
            ),
            p_weight=0.7
        )

        # 2.4 Adaptive ML model (here: our multi-agent) is returned
        return multi_agent



# 3 Create scenario and start training

if __name__ == "__main__":
    # 3.1 Parameters for demo mode
    cycle_limit         = 1500
    adaptation_limit    = 50
    stagnation_limit    = 5
    stagnation_entry    = 3
    end_at_stagnation   = True
    eval_frequency      = 10
    eval_grp_size       = 5
    logging             = Log.C_LOG_WE
    visualize           = True
    path                = str(Path.home())
 
else:
    # 3.2 Parameters for internal unit test
    cycle_limit         = 50
    adaptation_limit    = 5
    stagnation_limit    = 5
    stagnation_entry    = 1
    end_at_stagnation   = True
    eval_frequency      = 2
    eval_grp_size       = 1
    logging             = Log.C_LOG_NOTHING
    visualize           = False
    path                = None


# 3.3 Create and run training object
training = RLTraining(
        p_scenario_cls=MyScenario,
        p_cycle_limit=cycle_limit,
        p_adaptation_limit=adaptation_limit,
        p_eval_frequency=eval_frequency,
        p_eval_grp_size=eval_grp_size,
        p_stagnation_limit=stagnation_limit,
        p_stagnation_entry=stagnation_entry,
        p_end_at_stagnation=end_at_stagnation,
        p_score_ma_horizon=3,
        p_success_ends_epi=True,
        p_path=path,
        p_visualize=visualize,
        p_logging=logging )

training.run()
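

# After the run, the collected results can also be inspected programmatically.
# Assumption (not part of the original example): the installed MLPro version provides
# Training.get_results() and the attributes accessed below - verify against the API
# reference of your version before relying on them.
results = training.get_results()
print('High score        :', results.highscore)
print('Results stored in :', results.path)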