Einführung

Ich habe versucht, das beste Problem bei der Routensuche mit einem Python-Programm zu lösen. Als Lehrbuch ["Stärkung des Lernens"](http://www.amazon.co.jp/ Stärkung des Lernens - Richard-S-Sutton / dp / 4627826613 / ref = sr_1_1? Dh = UTF8 & qid = 1463965873 & sr = 8-1 & keywords = Stärkung des Lernens % E3% 80% 80 Mikami) wurde verwendet.

Struktur dieses Artikels

Einführung
Suche nach der besten Route
Regeln
Q-Wert aktualisieren
Implementierung
Ergebnis
abschließend

Beste Routensuche

Regel

Betrachten Sie das Suchproblem im Labyrinth wie in der Abbildung gezeigt. Der weiße Kreis ist der Agent, der rote ist der Minusbereich und der blaue ist das Ziel. Das Suchproblem bezieht sich hier auf die Suche nach einer Route, die die Belohnung für das Erreichen des Ziels maximiert. Die Regeln sind unten gezeigt.

Labyrinthgröße: $ 6 \ times6 $
Belohnungen in jedem Bereich: Schwarzer Bereich $ 0 $, Roter Bereich $ -1 $, Blauer Bereich $ + 3 $
Wenn Sie das blaue Ziel erreicht haben, beginnen Sie am Startpunkt oben links von vorne

Aktualisieren Sie den $ Q $ -Wert mit der folgenden Formel.

Q_{new}(s_t, a) = Q_{old}(s_t, a) + \alpha\bigl[r_{t+1} + \gamma max_{a'}Q(s_{t+1}, a') - Q_{old}(s_t, a)\bigr]

$ Q (s_t, a) $ repräsentiert den Wert der Aktion $ a $ in einem bestimmten Zustand $ s_t $. Aktion im Zustand $ s_t $ $ a $ gibt Belohnung $ r_ {t + 1} $. Beim Aktualisieren des $ Q $ -Werts wird die letzte Belohnung $ r_ {t + 1} $ zum Maximalwert addiert, der durch die Aktion $ a '$ im nächsten Zustand $ s_ {t + 1} $ erhalten wird. Fügen Sie das Produkt multipliziert mit gamma $ hinzu. Dies bedeutet, dass die Aktion in einem bestimmten Zustand ausgewählt wird, indem nicht nur die letzte Belohnung, sondern auch der Wert im nachfolgenden Zustand berücksichtigt wird. $ \ Gamma $ wird als Abzinsungssatz und $ \ alpha $ als Lernrate bezeichnet.

Implementierung

Ich habe es in Python implementiert. Die Wahrscheinlichkeit, zufällige Aktionen auszuführen, beträgt $ \ varepsilon = 0,1 $, die Lernrate beträgt $ \ alpha = 0,2 $ und der Abzinsungssatz beträgt $ \ gamma = 0,9 $. Das Lernen endet, wenn Sie das Ziel von 300 $ erreichen.

.
|- agent.py Agentenklasse
|- mazeimage.py Klasse anzeigen / speichern
|- main.py
|- resources
      |- maze.csv

`agent.py`


import numpy
import cv2
import random
import copy

class Agent:

	# constructor
	def __init__(self, maze_shape):
		self.state = numpy.array([0, 0])
		self.actions = self.__create_actions(maze_shape)
		self.moves = {0: numpy.array([0, -1]), 1: numpy.array([1, 0]), 2: numpy.array([0, 1]), 3: numpy.array([-1, 0])}
		self.q = numpy.zeros((4, ) + maze_shape)


# public method
	def act(self, maze, epsilon, alpha, gamma):
		act_index = self.__select_action(epsilon)
		move = self.moves.get(act_index)
		reward = maze[tuple(self.state + move)]
		self.update_q(act_index, move, reward, alpha, gamma)
		self.state += move


	def update_q(self, act_index, move, reward, alpha, gamma):
		y, x = self.state
		_q = self.q[act_index, y, x]
		self.q[act_index, y, x] = _q + alpha * (reward + gamma * self.__get_max_q(move) - _q)


	def goal(self, maze_shape):
		return numpy.array_equal(self.state, numpy.array(maze_shape) - 1)


	def reset(self):
		self.state = numpy.array([0, 0])


# private method
	def __create_actions(self, maze_shape):
		actions = []
		maze_h, maze_w = maze_shape
		for j in range(maze_h):
			actions.append([])
			for i in range(maze_w):
				action = [0, 1, 2, 3]
				self.__remove_actions(action, maze_h, maze_w, j, i)
				actions[j].append(action)

		return numpy.array(actions)


	def __remove_actions(self, action, maze_h, maze_w, j, i):
		if i == 0:
			action.remove(0)
		if i == maze_w - 1:
			action.remove(2)
		if j == 0:
			action.remove(3)
		if j == maze_h - 1:
			action.remove(1)

	
	def __select_action(self, epsilon):
		y, x = self.state
		action = copy.deepcopy(self.actions[y, x])
		if numpy.random.rand() > epsilon:
			mode = '!!!greedy!!!'
			act_index = self.__select_greedy_action(action)
		else:
			mode = '!!!random!!!'
			act_index = self.__select_random_action(action)

		print u'%s  state: (%d, %d), action: %d' % (mode, y, x, act_index)

		return act_index


	def __select_greedy_action(self, action):
		y, x = self.state
		_max = self.q[action, y, x].max()
		_indexes = list(numpy.argwhere(self.q[action, y, x] == _max))
		random.shuffle(_indexes)
		return action[_indexes[0]]


	def __select_random_action(self, action):
		random.shuffle(action)
		return action[0]


	def __get_max_q(self, move):
		y, x = self.state + move
		move = self.actions[y, x]
		return self.q[move, y, x].max()

`mazeimage.py`


import numpy
import cv2

class MazeImage:

	# constructor
	def __init__(self, maze, height, width):
		self.height, self.width = (height, width)
		self.maze_h, self.maze_w = maze.shape
		self.ystride = height / self.maze_h
		self.xstride = width / self.maze_w
		self.map_org = self.__create_image(maze)
		self.map_now = self.map_org
		self.writer = cv2.VideoWriter('q_learning.mv4', cv2.cv.CV_FOURCC('m', 'p', '4', 'v'), 30, (self.map_org.shape[1], self.map_org.shape[0]))


# public method
	def show(self, agent):
		self.map_now = self.map_org.copy()
		_y, _x = agent.state
		center = (int((_x + 0.5) * self.xstride), int((_y + 0.5) * self.ystride))
		cv2.circle(self.map_now, center, 11, (255, 255, 255), -1, cv2.cv.CV_AA)
		cv2.imshow('', self.map_now)
		return cv2.waitKey(10)


	def save_movie(self):
		self.writer.write(self.map_now)


	def shortest_path(self, q):
		shortest_map = self.map_org.copy()
		q_arrow = numpy.vectorize(lambda x: {0: '<', 1: 'V', 2: '>', 3: '^'}.get(x))(q.argmax(axis = 0))

		for j in range(self.maze_h):
			for i in range(self.maze_w):
				pt = (int(self.xstride * (i + 0.35)), int(self.ystride * (j + 0.6)))
				cv2.putText(shortest_map, q_arrow[j, i], pt, cv2.FONT_HERSHEY_PLAIN, 1, [255, 255, 255])

		return shortest_map


# private method
	def __create_image(self, maze):
		image = numpy.zeros((self.height, self.width, 3)).astype('uint8')

		for j in range(self.maze_h):
			for i in range(self.maze_w):
				tl = (self.xstride * i, self.ystride * j)
				br = (self.xstride * (i + 1) - 1, self.ystride * (j + 1) - 1)
				cv2.rectangle(image, tl, br, self.__color(maze[j, i]), -1)

		return image


	def __color(self, score):
		if score == 0.0:
			return [0, 0, 0]
		elif score == -1.0:
			return [0, 0, 127]
		else:
			return [127, 0, 0]

`main.py`


from agent import *
from mazeimage import *

if __name__ == '__main__':

	# init
	epsilon = 0.1
	alpha = 0.2
	gamma = 0.9
	maze = numpy.loadtxt('./resources/maze.csv', delimiter = ',')
	agent = Agent(maze.shape)
	maze_image = MazeImage(maze, 300, 300)

	trial = 0
	while True:
		if maze_image.show(agent) == 27:
			print '!!!escape!!!'
			break

		agent.act(maze, epsilon, alpha, gamma)
		maze_image.save_movie()
		
		if agent.goal(maze.shape):
			print '\033[32m' + '!!!goal!!!' + '\033[0m'
			trial += 1
			print 'next trial: %d' % trial
			agent.reset()

		if trial == 300:
			break

	maze_image.save_movie()
	cv2.imwrite('shortest.png', maze_image.shortest_path(agent.q))
	cv2.destroyAllWindows()

maze.csv


-1	0	0	0
-1	-1	-1	0
-1	-1	0	0
-1	-1	0	-1
-1	-1	0	0
0	-1	-1	3

Ergebnis

Die Abbildung der besten Route, die durch Lernen erhalten wurde, wird angezeigt. Sie können den Weg zum Erreichen des Ziels lernen, ohne den negativen Bereich zu passieren. Ich habe es so implementiert, dass der Lernstatus als Video gespeichert wird. Wenn Sie also interessiert sind, verschieben Sie es bitte.

abschließend

Ich konnte das beste Problem bei der Routensuche lösen. Wie die Aktualisierungsformel von $ Q (s_t, a) $ zeigt, wurde verstanden, dass der Wert von der Zielseite ausgeht, indem der Wert im Zustand von $ 1 $ voraus betrachtet wird.

[PYTHON] [Lernen stärken] Suche nach der besten Route