In this tutorial notebook we will implement all the basic components of a reinforcement learning algorithm to solve a problem in particle accelerators, with a particular focus on how the reward is defined.
ARES is an S-band radio frequency linac at the DESY Hamburg site, equipped with a photoinjector and two independently driven traveling wave accelerating structures. The main research focus is the generation and characterization of sub-femtosecond electron bunches at relativistic particle energies. The generation of short electron bunches is of high interest for radiation generation, e.g. in free-electron lasers.
We would like to focus and center the electron beam on a diagnostic screen using corrector and quadrupole magnets
We need to define:
$\implies$ Is the action space continuous or discrete?
$\implies$ Is the problem deterministic or stochastic?
In the ARES transverse tuning task we have 3 quadrupoles and 2 corrector magnets.
The actions are the strengths of the three quadrupoles and the deflection angles of the two correctors. In our control system we can set these derived values directly, taking the beam energy into account.
$\implies$ actions $= [k_{\mathrm{Q1}}, k_{\mathrm{Q2}}, \theta_\mathrm{CV}, k_{\mathrm{Q3}}, \theta_\mathrm{CH}]$ is a 5-dimensional array
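As a minimal sketch (not the actual ARESEA definition; the normalized limits here are an assumption), such an action space could be declared as a 5-dimensional Box:

import numpy as np
from gym import spaces

# Hypothetical action space: 5 normalized magnet settings in the order
# [k_Q1, k_Q2, theta_CV, k_Q3, theta_CH]
action_space = spaces.Box(low=-1.0, high=1.0, shape=(5,), dtype=np.float32)
sample_action = action_space.sample()  # a random, valid set of magnet settings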
The observation is the information an agent receives about the current state of the environment.
It should provide enough information so that the agent can solve the problem.
The observation does not necessarily cover the entire (internal) state of the environment.
$\implies$ What should be included in the observation?
$\implies$ What can be observed in simulation?
$\implies$ What cannot be observed in the real world?
$\implies$ How does this relate to the environment?
The screen is made from scintillating material and glows when hit by electrons
The camera films the screen
The state can be fully described with four components:
$\implies$ Do we know or can we observe the state of the environment?
The observation for this task contains three parts:
$\implies$ Does this state definition fulfil the Markov property? (Does the probability distribution of the next beam depend only on the present state, or is it affected by information about the past?)
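As an illustration only (the actual contents are defined in ARESEA; the three parts below are an assumption for the sake of the example), an observation made of the current beam on the screen, the current magnet settings, and the target beam could be declared like this:

import numpy as np
from gym import spaces

# Hypothetical observation space; limits are placeholders, not the ARESEA values
observation_space = spaces.Dict({
    "beam": spaces.Box(low=-np.inf, high=np.inf, shape=(4,)),     # [mu_x, sigma_x, mu_y, sigma_y] on the screen
    "magnets": spaces.Box(low=-np.inf, high=np.inf, shape=(5,)),  # current [k_Q1, k_Q2, theta_CV, k_Q3, theta_CH]
    "target": spaces.Box(low=-np.inf, high=np.inf, shape=(4,)),   # desired [mu_x, sigma_x, mu_y, sigma_y]
})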
Our goal is divided into two tasks:
$\implies$ How should we define our reward function? Give it a go!
$\implies$ We have a whole section dedicated to reward formulation later on
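If you want a starting point, here is one possible (and deliberately simple) choice, assuming the current and target beam parameters are available as 4-element arrays [mu_x, sigma_x, mu_y, sigma_y]:

import numpy as np

def my_reward(current_beam, target_beam):
    # Toy reward: negative mean absolute error between current and target beam
    # parameters, so that getting closer to the target yields a higher reward
    return -float(np.mean(np.abs(current_beam - target_beam)))

# A beam 0.5 mm off target in mu_x scores worse than a perfect match (reward 0.0)
print(my_reward(np.array([5e-4, 2e-4, 0.0, 2e-4]), np.array([0.0, 2e-4, 0.0, 2e-4])))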
Image from "RL Tips and Tricks" by A. Raffin
$\implies$ What would you choose and why?
There are many libraries that provide ready-to-use implementations of RL algorithms, as well as frameworks for implementing environments to interact with. In this notebook we use:
More info here
Note: OpenAI Gym is slowly being succeeded by Gymnasium, a fork of Gym maintained by the Farama Foundation.
We take all the elements of the RL problem we defined previously and represent the tuning task as an OpenAI Gym environment; Gym is a standard library for RL tasks.
A custom gym.Env would contain the following parts:

- observation_space and action_space
- reset method: resets the environment for a new episode and returns the 2-tuple (observation, info)
- step method: main logic of the environment. It takes an action, changes the environment to a new state, gets the new observation, computes the reward, and finally returns the 4-tuple (observation, reward, done, info). Here done flags whether the current episode should be terminated (goal reached, or some threshold exceeded).
- render method: visualizes the environment (a video, or just some plots)
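As a rough, self-contained sketch (not the actual ARESEA implementation; the dynamics below are made up, and exact reset/step signatures differ slightly between Gym versions), such an environment could look like this:

import gym
import numpy as np
from gym import spaces

class ToyTuningEnv(gym.Env):
    """Illustrative skeleton only -- not the real ARESEA environment."""

    def __init__(self):
        # 5 normalized magnet settings in, 4 beam parameters out
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(5,), dtype=np.float32)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(4,), dtype=np.float32)
        self._beam = np.zeros(4, dtype=np.float32)

    def reset(self):
        # Start a new episode with a random "beam" and return (observation, info)
        self._beam = np.random.uniform(-1.0, 1.0, size=4).astype(np.float32)
        return self._beam.copy(), {}

    def step(self, action):
        # Toy dynamics: the action nudges the beam parameters;
        # the reward is the negative distance of the beam from the "target" at zero
        self._beam += 0.1 * np.asarray(action, dtype=np.float32)[:4]
        observation = self._beam.copy()
        reward = -float(np.abs(self._beam).sum())
        done = bool(np.abs(self._beam).max() < 0.05)  # terminate once "on target"
        return observation, reward, done, {}

    def render(self, mode="human"):
        print(f"beam = {self._beam}")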
We list the most relevant parts of the project structure below:

utils/train.py contains the gym environments and the training script:

- ARESEA implements the ARES Experimental Area transverse tuning task as a gym.Env. It contains the basic logic, such as the definition of the observation space, action space, and reward. How an action is actually applied is implemented in child classes with specific backends.
- ARESEACheetah is derived from the base class ARESEA and uses the cheetah simulation as its backend.
- make_env initializes an ARESEA environment and wraps it with the required gym.wrappers for convenient features (e.g. monitoring the progress, ending the episode when the time limit is reached, rescaling the action, normalizing the observation, ...).
- train is a convenience function for training the RL agent. It calls make_env, sets up the RL algorithm, starts training, and saves the results in utils/recordings, utils/monitors, and utils/models.
utils/helpers.py contains some utility functions:

- evaluate_ares_ea_agent takes a trained agent and evaluates its performance using different metrics.
- plot_ares_ea_training_history shows the progress during training.

from time import sleep
import matplotlib.pyplot as plt
import names
import numpy as np
from gym.wrappers import RescaleAction
from IPython.display import clear_output, display
from stable_baselines3 import PPO
from utils.helpers import (
evaluate_ares_ea_agent,
make_ares_ea_training_videos,
plot_ares_ea_training_history,
show_video,
)
from utils.train import ARESEACheetah, make_env, read_from_yaml
from utils.train import train as train_ares_ea
from utils.utils import NotVecNormalize
Some methods:

- reset: in both the real and the simulation case, resets the magnets to their initial values. In simulation, it also regenerates the incoming beam and (optionally) resets the magnet misalignments.
- step: sets the magnets to the new settings and observes the beam (runs a simulation, or observes the screen image in the real world).

Now let's create the environment:
# Create the environment
env = ARESEACheetah()
env.target_beam_mode = "constant"
$\implies$ Let's define the position $(\mu_x, \mu_y)$ and size $(\sigma_x, \sigma_y)$ of the beam on the screen
$\implies$ Modify the target_beam list below, where the order of the arguments is $[\mu_x,\sigma_x,\mu_y,\sigma_y]$
$\implies$ Take into account the dimensions of the screen ($\pm$ 2e-3 m)
$\implies$ The target beam will be represented by a blue circle on the screen
target_beam = np.array([1e-3, 2e-4, 1e-3, 2e-4]) # Change it
env.target_beam_values = target_beam
env.reset()
plt.figure(figsize = (7, 4))
plt.imshow(env.render(mode="rgb_array")) # Plot the screen image
$\implies$ Change the magnet values, i.e. the actions
$\implies$ The actions are normalized, so valid values lie in the [-1, 1] interval
$\implies$ The values of the action list in the cell below follow this magnet order: [Q1, Q2, CV, Q3, CH]
action = np.array([1, 0.5, 0.5, 1, 0.6]) # put your action here
Perform one step: update the env, observe new beam!
env = RescaleAction(env, -1, 1) # rescales the action to the interval [-1, 1]
env.reset()
env.step(action)
plt.figure(figsize = (7, 4))
plt.imshow(env.render(mode="rgb_array"))
$\implies$ Observe the plot above: what beam does that magnet configuration yield? Can you center and focus the beam by hand?
env.reset()
steps = 10
def change_vertical_corrector(q1, q2, cv, q3, ch, steps, i):
action = np.array([q1, q2, cv + 1 / steps * i, q3, ch])
return action
fig, ax = plt.subplots(1, figsize = (7, 4))
for i in range(steps):
action = change_vertical_corrector(0.2, -0.2, -0.5, 0.3, 0, steps, i)
env.step(action)
img = env.render(mode="rgb_array")
ax.imshow(img)
display(fig)
clear_output(wait=True)
sleep(0.5)
Training a good agent revolves primarily around finding the right setup for the environment and the correct reward function. In order to iterate over and compare many different options, our training function takes a dictionary called config. The dictionary keys, or "configurations", are explained below.

In the following, we use a config dictionary to set up the training. This allows us to easily switch between different training conditions. Below we show some selected configurations that have the most influence on the training results; the parameters can mostly be divided into two parts.
- action_mode: whether to set the magnet strengths directly or to apply a delta action. You may set this to "direct" or "delta". You should find that "delta" trains faster. Setting "delta" is also crucial for running the agent on the real accelerator.
- reward_mode: how the reward is calculated. Can be set to negative_objective, objective_improvement, or sum_of_pixels.
- time_reward: whether the agent is penalized for taking another step; this is intended to make the tuning faster.
- rescale_action: takes the limits of the magnet settings and scales them into the specified range.

Termination conditions:

- abort_if_off_screen: if set to True, episodes are aborted when the beam is no longer on the screen.
- time_limit: number of interactions the agent gets to tune the magnets within one episode.
- target_sigma_x_threshold, target_sigma_y_threshold: thresholds for the beam parameters. If all beam parameters are within the threshold of their target, the episode ends and the agent stops optimising.

$\implies$ What does the existence of termination conditions say about the nature of the problem? Is it episodic or continuous?
Let's load some pre-trained models that use different combinations of the config dictionary and different reward definitions.
config parameters:

- "abort_if_off_screen": True
- "reward_mode": "objective_improvement"
- "target_sigma_x_threshold": None
- "target_sigma_y_threshold": None
- "time_reward": -1.0
- "action_mode": "delta"
Reward mode: objective_improvement, where $j$ is the index of the current time step.
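The exact formula is implemented in ARESEA; as a rough sketch of the idea (the mean-absolute-error objective and the normalization by $\mathrm{obj}_0$ are assumptions), an objective_improvement reward could look like this:

import numpy as np

def objective(beam, target):
    # Toy objective: mean absolute error between the current and the target beam
    # parameters [mu_x, sigma_x, mu_y, sigma_y] (the real definition may differ)
    return float(np.mean(np.abs(np.asarray(beam) - np.asarray(target))))

def objective_improvement_reward(obj_prev, obj_curr, obj_0):
    # Reward the improvement of the objective from step j-1 to step j,
    # normalized by the initial objective obj_0
    return (obj_prev - obj_curr) / obj_0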
$\implies$ What do you expect to happen, why?
agent_name = "Gary Buchwald" # names are randomly generated in training
loaded_model = PPO.load(f"utils/models/{agent_name}/model")
loaded_config = read_from_yaml(f"utils/models/{agent_name}/config")
env = make_env(loaded_config, record_video=False)
env = NotVecNormalize(env, f"utils/models/{agent_name}/normalizer")
done = False
observation = env.reset()
while not done:
action, _ = loaded_model.predict(observation)
observation, reward, done, info = env.step(action)
img = env.render(mode="rgb_array")
ax.imshow(img)
display(fig)
clear_output(wait=True)
sleep(0.5)
config parameters:

- "abort_if_off_screen": False
- "reward_mode": "sum_of_pixels"
- "target_sigma_x_threshold": None
- "target_sigma_y_threshold": None
- "time_reward": 0.0
- "action_mode": "delta"

Reward mode: sum_of_pixels (focusing only)

$\implies$ What do you expect to happen, why?
agent_name = "David Archibald" # names are randomly generated in training
loaded_model = PPO.load(f"utils/models/{agent_name}/model")
loaded_config = read_from_yaml(f"utils/models/{agent_name}/config")
env = make_env(loaded_config, record_video=False)
env = NotVecNormalize(env, f"utils/models/{agent_name}/normalizer")
done = False
observation = env.reset()
while not done:
action, _ = loaded_model.predict(observation)
observation, reward, done, info = env.step(action)
img = env.render(mode="rgb_array")
ax.imshow(img)
display(fig)
clear_output(wait=True)
sleep(0.5)
config parameters:

- "abort_if_off_screen": False
- "reward_mode": "objective_improvement"
- "target_sigma_x_threshold": None
- "target_sigma_y_threshold": None
- "time_reward": 0.0
- "action_mode": "direct"

Reward mode: objective_improvement, where $j$ is the index of the current time step.
$\implies$ What do you expect to happen?
$\implies$ What is the difference between Agent 1: "Gary Buchwald" and this agent?
agent_name = "Bertha Sparkman" # names are randomly generated in training
loaded_model = PPO.load(f"utils/models/{agent_name}/model")
loaded_config = read_from_yaml(f"utils/models/{agent_name}/config")
env = make_env(loaded_config, record_video=False)
env = NotVecNormalize(env, f"utils/models/{agent_name}/normalizer")
done = False
observation = env.reset()
while not done:
action, _ = loaded_model.predict(observation)
observation, reward, done, info = env.step(action)
img = env.render(mode="rgb_array")
ax.imshow(img)
display(fig)
clear_output(wait=True)
sleep(0.5)
config parameters:

- "abort_if_off_screen": False
- "reward_mode": "objective_improvement"
- "target_sigma_x_threshold": None
- "target_sigma_y_threshold": None
- "time_reward": 0.0
- "action_mode": "delta"

Reward mode: objective_improvement, where $j$ is the index of the current time step.
$\implies$ What do you expect to happen?
$\implies$ What is the difference between Agent 1: "Gary Buchwald", Agent 3: "Bertha Sparkman", and this agent?
agent_name = "Betty Gordon" # names are randomly generated in training
loaded_model = PPO.load(f"utils/models/{agent_name}/model")
loaded_config = read_from_yaml(f"utils/models/{agent_name}/config")
env = make_env(loaded_config, record_video=False)
env = NotVecNormalize(env, f"utils/models/{agent_name}/normalizer")
done = False
observation = env.reset()
while not done:
action, _ = loaded_model.predict(observation)
observation, reward, done, info = env.step(action)
img = env.render(mode="rgb_array")
ax.imshow(img)
display(fig)
clear_output(wait=True)
sleep(0.5)
config parameters:

- "abort_if_off_screen": False
- "reward_mode": "negative_objective"
- "target_sigma_x_threshold": None
- "target_sigma_y_threshold": None
- "time_reward": 0.0
- "action_mode": "delta"

Reward mode: negative_objective, where $b = [\mu_x,\sigma_x,\mu_y,\sigma_y]$, $b^\mathrm{(c)}$ is the current beam, $b^\mathrm{(t)}$ is the target beam, and $\mathrm{obj}_0$ is the initial objective after reset.
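Again as a rough sketch only (reusing the toy objective from the earlier sketch; the normalization by $\mathrm{obj}_0$ is an assumption), a negative_objective reward could look like this:

def negative_objective_reward(obj_curr, obj_0):
    # Penalize the current objective itself at every step, normalized by the
    # initial objective obj_0 (illustrative; the real definition may differ)
    return -obj_curr / obj_0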
$\implies$ What do you expect to happen, why?
agent_name = "Sean Kelley" # names are randomly generated in training
loaded_model = PPO.load(f"utils/models/{agent_name}/model")
loaded_config = read_from_yaml(f"utils/models/{agent_name}/config")
env = make_env(loaded_config, record_video=False)
env = NotVecNormalize(env, f"utils/models/{agent_name}/normalizer")
done = False
observation = env.reset()
while not done:
action, _ = loaded_model.predict(observation)
observation, reward, done, info = env.step(action)
img = env.render(mode="rgb_array")
ax.imshow(img)
display(fig)
clear_output(wait=True)
sleep(0.5)
The actor model, often a neural network, takes the observation of the current state and predicts an action to be taken (forward pass).
The critic model, also a neural network, takes the observation of the current state and predicts the value function of that state (it evaluates how good the action taken by the actor model is).

n_samples = n_steps * n_envs is the total number of samples, or interactions with the environment, in one epoch (more on what that means later).

The agent interacts with n_envs parallel environments, in which it takes n_steps each. At each step:

- the actor model predicts an action (forward pass of the model NN)
- the critic model predicts the value functions of the states visited during the episode (forward pass of the model NN)

The samples (actions, rewards, ...) from all environments are stored in a buffer, where buffer_size = n_samples.

After performing n_steps in a particular environment (and therefore gathering n_steps samples per environment), it is time to update the actor and critic models (backpropagation of the NNs). Let's consider only 1 environment now for simplicity:

- The n_samples are processed in mini-batches of a certain batch_size, which gives n_samples_tot / batch_size backpropagations.
- This pass over the collected samples is repeated n_epochs times (number of iterations on the training set).
- The whole collect-and-update cycle is repeated for a number of epochs (yes, a different parameter from n_epochs...), and the training runs for total_timesteps, where total_timesteps = n_steps * n_envs * epochs = n_samples * epochs.

$\implies$ What is the advantage of having a buffer?
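To make these relationships concrete, here is a small bookkeeping sketch with made-up numbers (they are not the quiz values below, and the formulas assume total_timesteps is a whole multiple of n_steps * n_envs):

n_steps = 10      # steps collected per environment before each update
n_envs = 4        # parallel environments
batch_size = 20   # mini-batch size used during the update
n_epochs = 2      # passes over the collected buffer per update
epochs = 5        # number of collect-and-update cycles

n_samples = n_steps * n_envs                   # samples collected per cycle (buffer size)
total_timesteps = n_samples * epochs           # total environment interactions
n_batches_per_pass = n_samples // batch_size   # mini-batches per pass over the buffer
model_updates = epochs * n_epochs * n_batches_per_pass  # total backpropagations

print(total_timesteps, n_batches_per_pass, model_updates)  # 200 2 20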
Let's consider the following training parameters:
- n_steps = 100
- n_envs = 2
- batch_size = 50
- n_epochs = 3
- epochs = 2

$\implies$ What is total_timesteps?
$\implies$ What is the total number of batches n_batch in 1 epoch?
$\implies$ What is the total number of model updates?
Now, set the config below and train your first reinforcement learning agent!
Apart from the reward definition, time_reward, etc. that we discussed before, here are some other configurations that you can change:

- net_arch: architecture of the policy network (number of neurons in each layer)
- gamma: discount factor of the RL problem. Set it lower to make immediate rewards more important than later rewards (usually above 0.9).
- normalize_observation: normalize the observations throughout training by fitting a running mean and standard deviation to them
- normalize_reward: normalize the rewards throughout training by fitting a running mean and standard deviation to them

# Feel free to change some of the configurations here.
config = {
"n_envs": 40,
"n_steps": 50,
"batch_size": 100,
"n_epochs": 10,
"total_timesteps": 200_000,
"abort_if_off_screen": False,
"action_mode": "delta",
"gamma": 0.99,
"frame_stack": None,
"net_arch": [64, 64],
"normalize_observation": True,
"normalize_reward": True,
"rescale_action": (-3, 3),
"reward_mode": "negative_objective",
"run_name": names.get_full_name(),
"target_sigma_x_threshold": None,
"target_sigma_y_threshold": None,
"threshold_hold": 5,
"time_limit": 25,
"time_reward": -0.0,
}
Looking at the config dictionary in the cell above:
$\implies$ How many epochs does it correspond to?
$\implies$ How many model updates (backpropagation) would you be doing in total?
You will train the agent by executing the cell below:
# Toggle comment to re-run the training (can take very long)
%time train_ares_ea(config)
==> Training agent "Marie Willis"
Eval num_timesteps=20000, episode_reward=-14.00 +/- 1.08
Episode length: 25.00 +/- 0.00
New best mean reward!
Eval num_timesteps=40000, episode_reward=-6.88 +/- 0.79
Episode length: 25.00 +/- 0.00
New best mean reward!
Eval num_timesteps=60000, episode_reward=-6.28 +/- 1.96
Episode length: 25.00 +/- 0.00
New best mean reward!
Eval num_timesteps=80000, episode_reward=-4.76 +/- 0.96
Episode length: 25.00 +/- 0.00
New best mean reward!
Eval num_timesteps=100000, episode_reward=-4.57 +/- 0.50
Episode length: 25.00 +/- 0.00
New best mean reward!
Eval num_timesteps=120000, episode_reward=-5.31 +/- 1.91
Episode length: 25.00 +/- 0.00
Eval num_timesteps=140000, episode_reward=-5.31 +/- 1.02
Episode length: 25.00 +/- 0.00
Eval num_timesteps=160000, episode_reward=-4.97 +/- 0.40
Episode length: 25.00 +/- 0.00
Eval num_timesteps=180000, episode_reward=-4.77 +/- 0.52
Episode length: 25.00 +/- 0.00
Eval num_timesteps=200000, episode_reward=-5.42 +/- 1.10
Episode length: 25.00 +/- 0.00
CPU times: user 8min 22s, sys: 1.35 s, total: 8min 23s
Wall time: 8min 24s
Let's look at the training metrics to see how the agent did.
To check the training history of a different agent instead, comment out the first line in the cell below and set agent_under_investigation to that agent's name.
agent_under_investigation = config["run_name"]
# agent_under_investigation = "Donna Brown"
# Training curves from this training
# Change `config["run_name"]` to `"ml_workshop"` to see curves from the example training.
plot_ares_ea_training_history(agent_under_investigation)
To look at videos of the agent during training, have a look in utils/recordings/. The ml_workshop directory contains videos from an example training.

If you are training agents that include the dipoles, set the function's argument include_position=True.
plt.figure(figsize = (7,4))
evaluate_ares_ea_agent(agent_under_investigation, include_position=False, n=2000)
Evaluation results (2000 evaluations)
----------------------------------------
==> Mean MAE = 5.207024056733189e-05
==> RMSE = 7.323487962575908e-05
==> Mean no. of steps = 25.0
We can also test the trained agent on a simulation.
If you want to see an example agent instead of the one you just trained, set agent_name="ml_workshop".
# Run final agent
agent_name = agent_under_investigation
loaded_model = PPO.load(f"utils/models/{agent_name}/model")
loaded_config = read_from_yaml(f"utils/models/{agent_name}/config")
env = make_env(loaded_config, record_video=True)
env = NotVecNormalize(env, f"utils/models/{agent_name}/normalizer")
done = False
observation = env.reset()
while not done:
action, _ = loaded_model.predict(observation)
observation, reward, done, info = env.step(action)
img = env.render(mode="rgb_array")
ax.imshow(img)
display(fig)
clear_output(wait=True)
sleep(0.5)
Below you can see one of our final trained agents optimising position and focus of the beam on the real ARES accelerator.
Keep in mind that this agent has never seen the real accelerator before. All it has ever seen is a very simple linear beam dynamics simulation. Despite that, it performs well on the real accelerator, where all kinds of other effects come into the mix.
Note that this does not happen by itself; it is the result of various careful decisions made when designing the training setup.
Once trained, however, the agent is trivial to use and requires no further tuning or knowledge of RL.
# Show polished donkey running (on real accelerator)
show_video("utils/real_world_episode_recording.mp4")