On suppose que vous avez terminé le renforcement de l'apprentissage 6.
Je souhaite me connecter à ChainerUI, mais avant cela, je vais organiser les données de formation. Nous continuerons après le train.py de ChainerRL créé dans Strengthening Learning 6.
Il est important de noter qu'à partir du 13 novembre 2019, la méthode suivante décrite vers la fin de la référence rapide chainerRL entraînera une erreur.
import logging
import sys
gym.undo_logger_setup() # Turn off gym's default logger settings
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')
chainerrl.experiments.train_agent_with_evaluation(
agent, env,
steps=2000, # Train the agent for 2000 steps
eval_n_runs=10, # 10 episodes are sampled for each evaluation
max_episode_len=200, # Maximum length of each episodes
eval_interval=1000, # Evaluate the agent after every 1000 steps
outdir='result') # Save everything to 'result' directory
Réécrivez le texte intégral comme suit, y compris les modifications.
train_plus.py
import chainer
import chainer.functions as F
import chainer.links as L
import chainerrl
import gym
import numpy as np
env = gym.make('CartPole-v0')
print('observation space:', env.observation_space)
print('action space:', env.action_space)
obs = env.reset()
#env.render()
print('initial observation:', obs)
action = env.action_space.sample()
obs, r, done, info = env.step(action)
print('next observation:', obs)
print('reward:', r)
print('done:', done)
print('info:', info)
class QFunction(chainer.Chain):
def __init__(self, obs_size, n_actions, n_hidden_channels=50):
super().__init__()
with self.init_scope():
self.l0 = L.Linear(obs_size, n_hidden_channels)
self.l1 = L.Linear(n_hidden_channels, n_hidden_channels)
self.l2 = L.Linear(n_hidden_channels, n_actions)
def __call__(self, x, test=False):
"""
Args:
x (ndarray or chainer.Variable): An observation
test (bool): a flag indicating whether it is in test mode
"""
h = F.tanh(self.l0(x))
h = F.tanh(self.l1(h))
return chainerrl.action_value.DiscreteActionValue(self.l2(h))
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n
q_func = QFunction(obs_size, n_actions)
# Use Adam to optimize q_func. eps=1e-2 is for stability.
optimizer = chainer.optimizers.Adam(eps=1e-2)
optimizer.setup(q_func)
# Set the discount factor that discounts future rewards.
gamma = 0.95
# Use epsilon-greedy for exploration
explorer = chainerrl.explorers.ConstantEpsilonGreedy(
epsilon=0.3, random_action_func=env.action_space.sample)
# DQN uses Experience Replay.
# Specify a replay buffer and its capacity.
replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)
# Since observations from CartPole-v0 is numpy.float64 while
# Chainer only accepts numpy.float32 by default, specify
# a converter as a feature extractor function phi.
phi = lambda x: x.astype(np.float32, copy=False)
# Now create an agent that will interact with the environment.
agent = chainerrl.agents.DoubleDQN(
q_func, optimizer, replay_buffer, gamma, explorer,
replay_start_size=500, update_interval=1,
target_update_interval=100, phi=phi)
# Set up the logger to print info messages for understandability.
import logging
import sys
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')
chainerrl.experiments.train_agent_with_evaluation(
agent, env,
steps=20000, # Train the agent for 2000 steps
eval_n_steps=None, # 10 episodes are sampled for each evaluation
eval_n_episodes=10, # 10 episodes are sampled for each evaluation
eval_max_episode_len=200, # Maximum length of each episodes
eval_interval=1000, # Evaluate the agent after every 1000 steps
outdir='result') # Save everything to 'result' directory
print('Finished.')
agent.save('agent')
env.close()
Cela créera scores.txt dans le dossier des résultats. Le chainerrl est publié sur github et le fichier source contient une description.
Recommended Posts