This assumes you have worked through the series up to Reinforcement Learning 20.
For example, when I cloned ChainerRL from GitHub, it came with reinforcement-learning examples for the Pendulum using various algorithms. For now, I will try them in alphabetical order.
First up is A2C, short for Advantage Actor-Critic. The actor decides which action to take and the critic evaluates the state; the algorithm optimizes both at the same time. Agents can be run in parallel on multiple threads to learn. Is that what Unity's ML-Agents uses?
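As a rough, self-contained sketch of the idea (toy numbers and simplified losses, not ChainerRL's actual implementation): the critic's value estimate turns the observed return into an advantage, which weights the actor's policy-gradient loss, while the critic itself is regressed toward the return.
import numpy as np
# Toy quantities for a 5-step rollout (illustrative values only)
rewards = np.ones(5, dtype=np.float32)
values = np.array([4.0, 3.5, 3.0, 2.0, 1.0], dtype=np.float32)  # critic's V(s_t)
log_probs = np.log(np.array([0.6, 0.5, 0.7, 0.4, 0.9], dtype=np.float32))
gamma = 0.99
# Discounted returns (bootstrapping from the final state omitted for brevity)
returns = np.zeros_like(rewards)
running = 0.0
for t in reversed(range(len(rewards))):
    running = rewards[t] + gamma * running
    returns[t] = running
advantage = returns - values                    # how much better the outcome was than predicted
actor_loss = -(log_probs * advantage).mean()    # policy gradient weighted by the advantage
critic_loss = ((returns - values) ** 2).mean()  # regress V(s_t) toward the return
loss = actor_loss + 0.5 * critic_loss           # combined objective (entropy bonus omitted)
print(actor_loss, critic_loss, loss)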
This is a Colaboratory notebook. Training took about an hour; CPU mode was faster.
Google Drive mount
import google.colab.drive
google.colab.drive.mount('gdrive')
!ln -s gdrive/My\ Drive mydrive
Program install
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!pip install pyvirtualdisplay > /dev/null 2>&1
!pip -q install JSAnimation
!pip -q install chainerrl
Main program: an example of training an A2C agent against OpenAI Gym envs. Both discrete and continuous action spaces are supported.
Module imports
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from builtins import * # NOQA
from future import standard_library
standard_library.install_aliases() # NOQA
import argparse
import functools
import chainer
from chainer import functions as F
import gym
import numpy as np
import os
import sys
import chainerrl
from chainerrl.agents import a2c
from chainerrl import experiments
from chainerrl import links
from chainerrl import misc
from chainerrl.optimizers.nonbias_weight_decay import NonbiasWeightDecay
from chainerrl import policies
from chainerrl import v_function
Class A2CFFSoftmax: an example of an A2C feedforward softmax policy.
class A2CFFSoftmax(chainer.ChainList, a2c.A2CModel):
def __init__(self, ndim_obs, n_actions, hidden_sizes=(64, 64)):
self.pi = policies.SoftmaxPolicy(
model=links.MLP(ndim_obs, n_actions, hidden_sizes))
self.v = links.MLP(ndim_obs, 1, hidden_sizes=hidden_sizes)
super().__init__(self.pi, self.v)
def pi_and_v(self, state):
return self.pi(state), self.v(state)
Class A2CFFMellowmax: an example of an A2C feedforward mellowmax policy.
class A2CFFMellowmax(chainer.ChainList, a2c.A2CModel):
def __init__(self, ndim_obs, n_actions, hidden_sizes=(64, 64)):
self.pi = policies.MellowmaxPolicy(
model=links.MLP(ndim_obs, n_actions, hidden_sizes))
self.v = links.MLP(ndim_obs, 1, hidden_sizes=hidden_sizes)
super().__init__(self.pi, self.v)
def pi_and_v(self, state):
return self.pi(state), self.v(state)
Class A2CGaussian: an example of an A2C feedforward Gaussian policy with fixed covariance.
class A2CGaussian(chainer.ChainList, a2c.A2CModel):
def __init__(self, obs_size, action_size):
self.pi = policies.FCGaussianPolicyWithFixedCovariance(
obs_size,
action_size,
np.log(np.e - 1),
n_hidden_layers=2,
n_hidden_channels=64,
nonlinearity=F.tanh)
self.v = v_function.FCVFunction(obs_size, n_hidden_layers=2,
n_hidden_channels=64,
nonlinearity=F.tanh)
super().__init__(self.pi, self.v)
def pi_and_v(self, state):
return self.pi(state), self.v(state)
Main
args
import logging
parser = argparse.ArgumentParser()
parser.add_argument('--env', type=str, default='Pendulum-v0')
parser.add_argument('--arch', type=str, default='Gaussian', choices=('FFSoftmax', 'FFMellowmax', 'Gaussian'))
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--outdir', type=str, default='mydrive/OpenAI/Pendulum/result-a2c')
parser.add_argument('--profile', action='store_true')
parser.add_argument('--steps', type=int, default=8 * 10 ** 7)
parser.add_argument('--update-steps', type=int, default=5)
parser.add_argument('--log-interval', type=int, default=1000)
parser.add_argument('--eval-interval', type=int, default=10 ** 5)
parser.add_argument('--eval-n-runs', type=int, default=10)
parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
parser.add_argument('--rmsprop-epsilon', type=float, default=1e-5)
parser.add_argument('--render', action='store_true', default=False)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--use-gae', action='store_true', default=False)
parser.add_argument('--tau', type=float, default=0.95)
parser.add_argument('--lr', type=float, default=7e-4)
parser.add_argument('--weight-decay', type=float, default=0.0)
parser.add_argument('--demo', action='store_true', default=False)
parser.add_argument('--load', type=str, default='')
parser.add_argument('--logger-level', type=int, default=logging.INFO)
parser.add_argument('--monitor', action='store_true')
parser.add_argument('--max-grad-norm', type=float, default=0.5)
parser.add_argument('--alpha', type=float, default=0.99)
parser.add_argument('--gpu', '-g', type=int, default=-1)
parser.add_argument('--num-envs', type=int, default=1)
If you want to change the environment, do it like this:
args = parser.parse_args(['--env', 'CartPole-v0'])
Here we keep the defaults (Pendulum-v0):
args = parser.parse_args([])
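If you wanted a shorter experiment, you could combine several of the flags defined above in the same way, e.g. (illustration only, not used below):
args = parser.parse_args(['--env', 'CartPole-v0',
                          '--steps', '1000000',
                          '--eval-interval', '10000'])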
logging.basicConfig(level=args.logger_level, stream=sys.stdout, format='')
Set a random seed used in ChainerRL.
If you use more than one process, the results will no longer be deterministic even with the same random seed.
misc.set_random_seed(args.seed)
# Set different random seeds for different subprocesses.
# If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
# If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
assert process_seeds.max() < 2 ** 32
if not os.path.exists(args.outdir):
os.makedirs(args.outdir)
Function definitions
def make_env(process_idx, test):
env = gym.make(args.env)
# Use different random seeds for train and test envs
process_seed = int(process_seeds[process_idx])
env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
env.seed(env_seed)
# Cast observations to float32 because our model uses float32
env = chainerrl.wrappers.CastObservationToFloat32(env)
if args.monitor and process_idx == 0:
env = chainerrl.wrappers.Monitor(env, args.outdir)
# Scale rewards observed by agents
if not test:
misc.env_modifiers.make_reward_filtered(
env, lambda x: x * args.reward_scale_factor)
if args.render and process_idx == 0 and not test:
env = chainerrl.wrappers.Render(env)
return env
def make_batch_env(test):
return chainerrl.envs.MultiprocessVectorEnv(
[functools.partial(make_env, idx, test)
for idx, env in enumerate(range(args.num_envs))])
Select a model according to the action-space type. Pendulum-v0 has a continuous action space, so the Gaussian policy is used; for a discrete environment such as CartPole-v0, choose FFSoftmax or FFMellowmax.
sample_env = make_env(process_idx=0, test=False)
timestep_limit = sample_env.spec.tags.get(
'wrapper_config.TimeLimit.max_episode_steps')
obs_space = sample_env.observation_space
action_space = sample_env.action_space
# Switch policy types accordingly to action space types
if args.arch == 'Gaussian':
model = A2CGaussian(obs_space.low.size, action_space.low.size)
elif args.arch == 'FFSoftmax':
model = A2CFFSoftmax(obs_space.low.size, action_space.n)
elif args.arch == 'FFMellowmax':
model = A2CFFMellowmax(obs_space.low.size, action_space.n)
optimizer = chainer.optimizers.RMSprop(args.lr,
eps=args.rmsprop_epsilon,
alpha=args.alpha)
optimizer.setup(model)
optimizer.add_hook(chainer.optimizer.GradientClipping(args.max_grad_norm))
if args.weight_decay > 0:
optimizer.add_hook(NonbiasWeightDecay(args.weight_decay))
agent = a2c.A2C(model, optimizer, gamma=args.gamma,
gpu=args.gpu,
num_processes=args.num_envs,
update_steps=args.update_steps,
use_gae=args.use_gae,
tau=args.tau)
if args.load:
agent.load(args.load)
experiments.train_agent_batch_with_evaluation(
agent=agent,
env=make_batch_env(test=False),
eval_env=make_batch_env(test=True),
steps=args.steps,
log_interval=args.log_interval,
eval_n_steps=None,
eval_n_episodes=args.eval_n_runs,
eval_interval=args.eval_interval,
outdir=args.outdir,
)
agent.save(args.outdir+'/agent')
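If the runtime is restarted later, the trained agent saved above can be restored before running the test episodes below (this reuses agent.load, the same call the script already makes for the --load option):
# Optional: reload the trained agent after a runtime restart
agent.load(args.outdir + '/agent')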
import pandas as pd
import glob
import os
score_files = glob.glob(args.outdir+'/scores.txt')
score_files.sort(key=os.path.getmtime)
score_file = score_files[-1]
df = pd.read_csv(score_file, delimiter='\t' )
df
df.plot(x='steps', y='average_actor')
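The scores file also contains the evaluation statistics. In the ChainerRL version I used, the mean evaluation return appears under a column named mean, but the column names can vary, so it is worth checking first:
print(df.columns)             # list the available columns
df.plot(x='steps', y='mean')  # mean evaluation return (adjust the column name if it differs)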
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1024, 768))
display.start()
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
import matplotlib.pyplot as plt
%matplotlib inline
frames = []
env = gym.make(args.env)
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
assert process_seeds.max() < 2 ** 32
env_seed = int(process_seeds[0])
env.seed(env_seed)
env = chainerrl.wrappers.CastObservationToFloat32(env)
env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
envw = gym.wrappers.Monitor(env, args.outdir, force=True)
for i in range(3):
obs = envw.reset()
done = False
R = 0
t = 0
while not done and t < 200:
frames.append(envw.render(mode = 'rgb_array'))
action = agent.act(obs)
obs, r, done, _ = envw.step(action)
R += r
t += 1
print('test episode:', i, 'R:', R)
agent.stop_episode()
#envw.render()
envw.close()
from IPython.display import HTML
plt.figure(figsize=(frames[0].shape[1]/72.0, frames[0].shape[0]/72.0),dpi=72)
patch = plt.imshow(frames[0])
plt.axis('off')
def animate(i):
patch.set_data(frames[i])
anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames),interval=50)
anim.save(args.outdir+'/test.mp4')
HTML(anim.to_jshtml())