In this post, we explore policy evaluation — a key step in reinforcement learning that allows us to determine how good a given policy is. Using a simple maze environment, we demonstrate how iterative policy evaluation helps compute the expected return (value) of each state under different policies, and how this lays the groundwork for policy improvement.
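Formally (using standard notation rather than anything specific to the code below), iterative policy evaluation repeatedly applies the Bellman expectation backup to every state until the values stop changing:

$$
V_{k+1}(s) \;=\; \sum_{a} \pi(a \mid s) \sum_{s',\,r} p(s', r \mid s, a)\,\bigl[\, r + \gamma V_k(s') \,\bigr]
$$

In the deterministic maze used here, the inner sum collapses to a single successor state and reward per action, which is exactly what the environment's step method below returns.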
We define a 5x5 grid environment with four actions (UP, DOWN, LEFT, RIGHT):
|   | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
| 0 | S | . | . | X | G |
| 1 | . | X | . | X | . |
| 2 | . | X | . | . | . |
| 3 | . | . | . | X | . |
| 4 | X | X | . | . | . |
# Constants for actions
UP, DOWN, LEFT, RIGHT = 0, 1, 2, 3
ACTIONS = [UP, DOWN, LEFT, RIGHT]
ACTION_DELTAS = {
    UP: (-1, 0),
    DOWN: (1, 0),
    LEFT: (0, -1),
    RIGHT: (0, 1),
}

# Maze config
MAZE_ROWS = 5
MAZE_COLS = 5
WALLS = {(0, 3), (1, 1), (1, 3), (2, 1), (3, 3), (4, 0), (4, 1)}
GOAL = (0, 4)
The grid cells are labeled as follows:
- S = Start
- G = Goal (terminal)
- X = Wall (unreachable)
- . = Empty state
from typing import List, Tuple

class State:
    def __init__(self, name: Tuple[int, int], is_terminal: bool = False):
        self.name = name  # (row, col) position in the grid
        self.is_terminal = is_terminal

class Env:
    def __init__(self):
        # Build one State per non-wall cell and index it by position
        self.states = []
        self.state_map = {}
        for r in range(MAZE_ROWS):
            for c in range(MAZE_COLS):
                if (r, c) not in WALLS:
                    is_terminal = (r, c) == GOAL
                    state = State((r, c), is_terminal)
                    self.states.append(state)
                    self.state_map[(r, c)] = state
        self.actions = ACTIONS

    def step(self, state: State, action: int) -> List[Tuple[State, float, float]]:
        r, c = state.name
        dr, dc = ACTION_DELTAS[action]
        nr, nc = r + dr, c + dc
        # Stay in place if moving out of bounds or into a wall
        if (nr < 0 or nr >= MAZE_ROWS or
                nc < 0 or nc >= MAZE_COLS or
                (nr, nc) in WALLS):
            next_pos = (r, c)
        else:
            next_pos = (nr, nc)
        next_state = self.state_map[next_pos]
        # Reward is 0 for entering the goal, -1 for every other transition
        reward = 0.0 if next_pos == GOAL else -1.0
        return [(next_state, reward, 1.0)]  # deterministic: one (next_state, reward, prob) triple
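As a quick sanity check (a minimal sketch, not part of the original post, assuming the constants and Env class above are in scope): blocked moves keep the agent in place, and every transition that does not land on the goal costs -1.

env = Env()
start = env.state_map[(0, 0)]

# UP from (0, 0) would leave the grid, so the agent stays put and pays -1
next_state, reward, prob = env.step(start, UP)[0]
print(next_state.name, reward, prob)   # (0, 0) -1.0 1.0

# RIGHT from (0, 0) enters the open cell (0, 1), also at a cost of -1
next_state, reward, prob = env.step(start, RIGHT)[0]
print(next_state.name, reward, prob)   # (0, 1) -1.0 1.0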
from typing import List, Tuple
# "Agent" is quoted because the concrete Agent classes are defined further below
def iterative_policy_eval(agent: "Agent", env: Env, gamma: float = 0.9, threshold: float = 1e-6):
    # Start every state's value estimate at 0
    values = {state.name: 0.0 for state in env.states}
    while True:
        delta = 0.0
        for state in env.states:
            if state.is_terminal:
                continue  # terminal states keep V = 0
            old_value = values[state.name]
            new_value = 0.0
            # Bellman expectation backup: average over actions and transitions
            for action in env.actions:
                action_prob = agent.policy(state, action)
                for next_state, reward, prob in env.step(state, action):
                    new_value += action_prob * prob * (reward + gamma * values[next_state.name])
            values[state.name] = new_value
            delta = max(delta, abs(old_value - new_value))
        if delta < threshold:
            break
    return values
if __name__ == '__main__':
    env = Env()
    agent = Agent()  # whichever Agent class from the sections below is in scope
    values = iterative_policy_eval(agent, env)
    for state in sorted(env.states, key=lambda s: s.name):
        print(f"State {state.name}: V = {values[state.name]:.2f}")
1. Left-Action-Only Agent
class Agent:
    def policy(self, state: State, action: int) -> float:
        return 1.0 if action == LEFT else 0.0
State (0, 0): V = -10.00
State (0, 1): V = -10.00
State (0, 2): V = -10.00
State (0, 4): V = 0.00
State (1, 0): V = -10.00
State (1, 2): V = -10.00
State (1, 4): V = -10.00
State (2, 0): V = -10.00
State (2, 2): V = -10.00
State (2, 3): V = -10.00
State (2, 4): V = -10.00
State (3, 0): V = -10.00
State (3, 1): V = -10.00
State (3, 2): V = -10.00
State (3, 4): V = -10.00
State (4, 2): V = -10.00
State (4, 3): V = -10.00
State (4, 4): V = -10.00
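These numbers are exactly what we should expect: moving only LEFT can never reach the goal in the rightmost column, so every non-terminal state collects a reward of -1 on every step forever, and with $\gamma = 0.9$ the discounted return is the geometric series

$$
V(s) \;=\; \sum_{k=0}^{\infty} 0.9^{k}\,(-1) \;=\; -\frac{1}{1 - 0.9} \;=\; -10
$$

for every non-terminal state.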
2. Stochastic Agent (Uniform Random Policy)
class Agent:
    def policy(self, state: State, action: int) -> float:
        return 1.0 / 4
State (0, 0): V = -9.96
State (0, 1): V = -9.93
State (0, 2): V = -9.87
State (0, 4): V = 0.00
State (1, 0): V = -9.96
State (1, 2): V = -9.76
State (1, 4): V = -4.53
State (2, 0): V = -9.96
State (2, 2): V = -9.54
State (2, 3): V = -8.89
State (2, 4): V = -7.75
State (3, 0): V = -9.93
State (3, 1): V = -9.87
State (3, 2): V = -9.76
State (3, 4): V = -8.82
State (4, 2): V = -9.75
State (4, 3): V = -9.64
State (4, 4): V = -9.37
3. Better-than-Random Policy
class Agent:
    def policy(self, state: State, action: int) -> float:
        # Deterministically move UP at (1, 4); act uniformly at random everywhere else
        if action == UP and state.name == (1, 4): return 1.0
        if state.name == (1, 4): return 0.0
        return 1.0 / 4
State (0, 0): V = -9.92
State (0, 1): V = -9.87
State (0, 2): V = -9.77
State (0, 4): V = 0.00
State (1, 0): V = -9.94
State (1, 2): V = -9.56
State (1, 4): V = 0.00
State (2, 0): V = -9.92
State (2, 2): V = -9.15
State (2, 3): V = -7.97
State (2, 4): V = -5.88
State (3, 0): V = -9.87
State (3, 1): V = -9.77
State (3, 2): V = -9.56
State (3, 4): V = -7.85
State (4, 2): V = -9.55
State (4, 3): V = -9.35
State (4, 4): V = -8.85
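The single change at (1, 4) is easy to verify by hand: the policy now moves UP with probability 1, the transition into the goal yields a reward of 0, and the goal's value is fixed at 0, so

$$
V(1,4) \;=\; 1 \cdot \bigl(0 + 0.9 \cdot V(0,4)\bigr) \;=\; 0,
$$

and every state that can reach (1, 4) inherits part of that improvement, which is why all the other values are less negative than under the purely uniform policy.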
4. Hand-Crafted Optimal Policy
class Agent:
    def __init__(self):
        # Deterministic action for every state along the shortest path to the goal
        self.policy_map = {
            (0, 0): RIGHT,
            (0, 1): RIGHT,
            (0, 2): DOWN,
            (1, 2): DOWN,
            (2, 2): RIGHT,
            (2, 3): RIGHT,
            (2, 4): UP,
            (1, 4): UP,
        }

    def policy(self, state, action):
        # If state is a State object, extract the (row, col) tuple
        if hasattr(state, "name"):
            state = state.name
        optimal_action = self.policy_map.get(state)
        if optimal_action is None:
            return 0  # states off the hand-crafted path get zero probability for every action
        return 1 if action == optimal_action else 0
State (0, 0): V = -5.22
State (0, 1): V = -4.69
State (0, 2): V = -4.10
State (0, 4): V = 0.00
State (1, 0): V = 0.00
State (1, 2): V = -3.44
State (1, 4): V = 0.00
State (2, 0): V = 0.00
State (2, 2): V = -2.71
State (2, 3): V = -1.90
State (2, 4): V = -1.00
State (3, 0): V = 0.00
State (3, 1): V = 0.00
State (3, 2): V = 0.00
State (3, 4): V = 0.00
State (4, 2): V = 0.00
State (4, 3): V = 0.00
State (4, 4): V = 0.00
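As a sanity check on the start-state value: the hand-crafted path takes eight moves from (0, 0) to the goal, the first seven of which earn a reward of -1 (the final move into the goal earns 0), so

$$
V(0,0) \;=\; -\sum_{k=0}^{6} 0.9^{k} \;=\; -\frac{1 - 0.9^{7}}{1 - 0.9} \;\approx\; -5.22,
$$

matching the printed result. The states off the hand-crafted path show V = 0.00 only because this policy assigns them zero probability for every action, so their estimates never move away from the initial value of 0.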