Previously, in "Can you learn addition games with reinforcement learning?", learning worked without any problems.
This time, let's consider a somewhat more realistic problem.
The problem is this: when should you send something like an email, a push notification, or a coupon to a user? With email you could argue that you should just send it to everyone, but sending too much drives users to leave, and coupons cost money, so I don't want to blast them out carelessly.
The subject of this post is what happens when that problem is cast in a Q-Learning framework. One nice thing about Q-Learning is that it can handle multiple actions. That said, treat this as a simple, completely artificial setting.
To put it simply: each turn the user shows some activity U, and the agent chooses whether to push (A = 1) or do nothing (A = 0). If the agent pushes at exactly the right moment (U = 2), then 4 turns later the user reacts and the agent receives a reward of +1. That's the kind of game it is.
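To make the timing concrete, here is a minimal sketch of just the user model described above. The hand-written "always push on a chance" policy and the variable names are mine for illustration; the real environment is the UserAndPushEventGame class in the code further down.

# Minimal sketch of the dynamics: U = 2 is the chance, pushing (A = 1) at that
# moment makes U = 3 appear 4 turns later, which is when the +1 reward arrives.
import numpy as np

history = []                      # (U, A) pairs, newest last
total_reward = 0.0
for turn in range(100):
    if len(history) >= 4 and history[-4] == (2, 1):
        u = 3                     # delayed reaction to the push 4 turns ago
    else:
        rnd = np.random.random()
        u = 0 if rnd < 0.8 else (1 if rnd < 0.9 else 2)
    a = 1 if u == 2 else 0        # hand-written policy: push only on a chance
    if u == 3:
        total_reward += 1.0       # the reward shows up 4 turns after the good action
    history.append((u, a))
print(total_reward)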
What worried me about this game is that there is a gap between taking the optimal action and receiving the reward. Learning tends to go quickly when the reward arrives immediately after the optimal action, but here it does not, and that is the biggest point of interest. Defining the "penalty" is also tricky; for now I simply attach one to actions that look wrong. (Maybe that makes the problem too easy...?)
However, as "Searching for a maze with reinforcement learning" shows, rewards can propagate backwards during learning, so I suspected it would work here too; I just wanted to verify it.
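For reference, the update this relies on is the standard tabular Q-Learning update; the max over next-state Q-values in the target is what lets a reward received several turns later leak back to the push action. A minimal sketch (the function and argument names are mine, not from the code below):

# Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
def q_update(q_table, s, a, r, s_next, actions, alpha=0.1, gamma=0.99):
    best_next = max(q_table.get((s_next, b), 0.0) for b in actions)
    old = q_table.get((s, a), 0.0)
    q_table[(s, a)] = old + alpha * (r + gamma * best_next - old)

After enough repetitions of the same sequence, the +1 that arrives four turns after the push raises the Q-value of pushing in the U = 2 state, even though no reward is given at that moment.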
A reward chance only arises when U = 2 occurs, so `chance_count` is incremented every time U = 2 appears. The maximum reward obtainable over a period is therefore `chance_count`, and I define `hit_rate` as the reward actually obtained divided by `chance_count`.
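As a rough sanity check of the scale (the numbers here are assumed, not measured): U = 2 occurs with probability 10%, so a 10,000-turn evaluation window contains about 1,000 chances.

# Hypothetical evaluation window, just to illustrate the metric
chance_count = 1000        # roughly 10% of 10,000 turns are chances (U = 2)
reward = 900.0             # assumed total reward collected in the same window
hit_rate = reward / chance_count
print(hit_rate)            # 0.9, i.e. 90% of the chances were converted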
The figure below shows how `hit_rate` changed while alternating learning and evaluation every 10,000 turns.
I ran it for about 50 million turns; learning seemed to plateau after roughly 30 million turns, at around `hit_rate` = 0.9.
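The result.txt written by the script below is tab-separated (turn, score for the term, number of chances, hit rate in %), so a plot like the one above can be reproduced with something along these lines (a sketch; I am assuming matplotlib, which is not necessarily what the original figure used):

import numpy as np
import matplotlib.pyplot as plt

# columns: turn, this_term_score, this_term_chance, hit_rate (in %)
data = np.loadtxt("result.txt")
plt.plot(data[:, 0], data[:, 3])
plt.xlabel("turn")
plt.ylabel("hit_rate [%]")
plt.show()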
Q-Learning is fun because, in a sense, it behaves like a simple AI. I hope it can be put to use for something.
The full code (Python 2) is posted below for reference.
#!/usr/bin/env python
# coding: utf-8
import numpy as np
from random import random, choice


class Game(object):
    state = None
    actions = None
    game_over = False

    def __init__(self, player):
        self.player = player
        self.turn = 0
        self.last_reward = 0
        self.total_reward = 0
        self.init_state()

    def player_action(self):
        action = self.player.action(self.state, self.last_reward)
        if action not in self.actions:
            raise Exception("Invalid Action: '%s'" % action)
        self.state, self.last_reward = self.get_next_state_and_reward(self.state, action)

    def play(self):
        # generator: yields the game object once per turn
        yield(self)
        while not self.game_over:
            self.player_action()
            self.turn += 1
            self.total_reward += self.last_reward
            yield(self)

    def init_state(self):
        raise NotImplementedError()

    def get_next_state_and_reward(self, state, action):
        raise NotImplementedError()
class UserAndPushEventGame(Game):
    """
    State        :S : list of (U, A)
    UserActivity :U : int of 0~3
    Action       :A : int of 0 or 1
    Next-State(S, A) :S' :
        S[-1][1] = A
        S.append((Next-U, None))
        S = S[-5:]
    Next-U :
        if S[-4] == (2, 1) then 3
        else 10% -> 2, 10% -> 1, 80% -> 0
    Reward(S, A) :R :
        if S[-1] == (3, *) then R += 1
        wrong_action_count := Number of ({0,1,3}, 1) in S
        R -= wrong_action_count * 0.3
    """
    STATE_HISTORY_SIZE = 5

    def init_state(self):
        self.actions = [0, 1]
        self.state = [(0, None)]
        self.chance_count = 0

    def get_next_state_and_reward(self, state, action):
        next_state = (state + [(self.next_user_action(state), None)])[-self.STATE_HISTORY_SIZE:]
        next_state[-2] = (next_state[-2][0], action)
        reward = 0
        if len(state) > 0 and state[-1][0] == 3:
            reward += 1
        action_count = reduce(lambda t, x: t + (x[1] or 0), state, 0)
        correct_action_count = len([0 for x in state if x == (2, 1)])
        wrong_action_count = action_count - correct_action_count
        reward -= wrong_action_count * 0.3
        return next_state, reward

    def next_user_action(self, state):
        if len(state) > 4 and state[-4] == (2, 1):
            return 3
        else:
            rnd = np.random.random()
            if rnd < 0.8:
                return 0
            elif rnd < 0.9:
                return 1
            else:
                self.chance_count += 1
                return 2
class HumanPlayer(object):
    training = False

    def action(self, state, last_reward):
        print "LastReward=%s, CurrentState: %s" % (last_reward, state)
        while True:
            action_input = raw_input("Enter 0~1: ")
            if int(action_input) in [0, 1]:
                return int(action_input)
class QLearnPlayer(object):
    ALPHA = 0.1      # learning rate
    GAMMA = 0.99     # discount factor
    E_GREEDY = 0.05  # exploration rate while training

    def __init__(self):
        self.actions = [0, 1]
        self.q_table = {}
        self.last_state = self.last_action = None
        self.training = True

    def get_q_value(self, state, action):
        # undefined entries return a small random number
        return self.q_table.get(state, {}).get(action, (np.random.random() - 0.5) / 1000)

    def get_all_q_values(self, state):
        return [self.get_q_value(state, act) for act in self.actions]

    def set_q_value(self, state, action, val):
        if state in self.q_table:
            self.q_table[state][action] = val
        else:
            self.q_table[state] = {action: val}

    def action(self, state, last_reward):
        state = tuple(state)
        next_action = self.select_action(state)
        if self.last_state is not None:
            self.update_q_table(self.last_state, self.last_action, state, last_reward)
        self.last_state = state
        self.last_action = next_action
        return next_action

    def select_action(self, state):
        # epsilon-greedy: pick a random action with probability E_GREEDY while training
        if self.training and random() < self.E_GREEDY:
            return choice(self.actions)
        else:
            return np.argmax(self.get_all_q_values(state))

    def update_q_table(self, last_state, last_action, cur_state, last_reward):
        if self.training:
            # Q(s, a) += ALPHA * (r + GAMMA * max_a' Q(s', a') - Q(s, a))
            d = last_reward + np.max(self.get_all_q_values(cur_state)) * self.GAMMA - self.get_q_value(last_state, last_action)
            self.set_q_value(last_state, last_action, self.get_q_value(last_state, last_action) + self.ALPHA * d)
if __name__ == '__main__':
    SWITCH_MODE_TURN_NUM = 10000
    fp = file("result.txt", "w")
    dt = file("detail.txt", "w")
    player = QLearnPlayer()
    # player = HumanPlayer()
    game = UserAndPushEventGame(player)
    last_chance_count = last_score = 0
    # game_over is never set, so this loop runs until you interrupt it
    for g in game.play():
        # dt.write("%s: isT?=%s LastReward=%s TotalReward=%s S=%s\n" %
        #          (g.turn, player.training, g.last_reward, g.total_reward, g.state))
        if g.turn % SWITCH_MODE_TURN_NUM == 0:
            if not player.training:
                # score and chances accumulated during the last evaluation term
                this_term_score = game.total_reward - last_score
                this_term_chance = game.chance_count - last_chance_count
                if this_term_chance > 0:
                    hit_rate = 100.0 * this_term_score / this_term_chance
                else:
                    hit_rate = 0
                # print "Turn=%d: This term score=%2.2f chance=%02d: HitRate=%.1f%% %s" % \
                #     (g.turn, this_term_score, this_term_chance, hit_rate, '*' * int(hit_rate/2))
                fp.write("%d\t%.2f\t%d\t%f\n" % (g.turn, this_term_score, this_term_chance, hit_rate))
            last_score = game.total_reward
            last_chance_count = game.chance_count
            player.training = not player.training  # alternate between training and evaluation
        if g.turn % 10000 == 0:
            fp.flush()
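Incidentally, q_table is just a dict keyed by the state tuple, so if you run the loop in an interactive session and interrupt it after enough turns, you can poke at what was learned. A hypothetical example; the state below is made up for illustration, and states that were never visited only return the small random default:

player.training = False   # act greedily instead of exploring
sample_state = ((0, 0), (0, 0), (0, 0), (0, 0), (2, None))   # last user activity is U = 2
print(player.get_all_q_values(sample_state))                 # Q-values for A = 0 and A = 1
print(np.argmax(player.get_all_q_values(sample_state)))      # hopefully 1 (push)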