#!/usr/bin/env python3
import collections
import itertools

import gym
import numpy

_OBSERVATION_STEPS_NUM = 24
_LEARNING_RATE = 0.1
_DISCOUNT = 0.95
# > CartPole-v0 defines "solving" as getting average reward of 195.0 over 100 consecutive trials.
# https://gym.openai.com/envs/CartPole-v0/
_SUCCESS_AVERAGE_REWARD = 195.0
_SUCCESS_AVERAGE_WINDOW_SIZE = 100


def _main():
    env = gym.make("CartPole-v0")
    # Hand-chosen bounds used to discretize each observation dimension into
    # _OBSERVATION_STEPS_NUM equally sized bins.
    observation_min = numpy.array([-4.8, -4.8, -0.9, -4.0])
    assert (
        env.observation_space.shape == observation_min.shape
    ), env.observation_space.shape
    observation_max = observation_min * -1
    observation_step = (observation_max - observation_min) / _OBSERVATION_STEPS_NUM
    assert len(env.observation_space.shape) == 1
    # Q-table indexed by one bin index per observation dimension plus the action.
    q_table = numpy.random.uniform(
        low=-4,
        high=0,
        size=(
            [_OBSERVATION_STEPS_NUM] * (env.observation_space.shape[0])
            + [env.action_space.n]
        ),
    )
    print("q_table:", q_table.itemsize * q_table.size / 2 ** 20, "MiB")
    last_step_counts = collections.deque()
    last_step_counts_sum = 0
    for episode_index in itertools.count():
        observation = env.reset()
        render = (episode_index % 1000) == 0
        if render:
            env.render()
        observation_index = ((observation - observation_min) / observation_step).astype(
            int
        )
        for step_index in itertools.count():
            # Greedy policy: pick the action with the highest Q-value for the
            # current discretized state.
            action = q_table[tuple(observation_index)].argmax()
            next_observation, reward, done, info = env.step(action)
            if render:
                env.render()
            assert (next_observation >= observation_min).all(), next_observation
            assert (next_observation <= observation_max).all(), next_observation
            next_observation_index = (
                (next_observation - observation_min) / observation_step
            ).astype(int)
            if done:
                # Penalize early termination; otherwise keep the final reward.
                q_table[tuple(observation_index)][action] = (
                    -300 if step_index < 190 else reward
                )
                last_step_counts.append(step_index + 1)
                last_step_counts_sum += step_index + 1
                if len(last_step_counts) > _SUCCESS_AVERAGE_WINDOW_SIZE:
                    last_step_counts_sum -= last_step_counts.popleft()
                average_reward = last_step_counts_sum / _SUCCESS_AVERAGE_WINDOW_SIZE
                print(
                    f"episode #{episode_index}"
                    f"\t{step_index+1} steps"
                    f"\taverage of {average_reward:.1f} steps"
                    f" over last {_SUCCESS_AVERAGE_WINDOW_SIZE} episodes"
                )
                if average_reward > _SUCCESS_AVERAGE_REWARD:
                    return
                break
            assert not info, info
            # Q-learning update:
            # Q(s, a) += lr * (reward + discount * max_a' Q(s', a') - Q(s, a))
            q_table[tuple(observation_index)][action] += _LEARNING_RATE * (
                reward
                + _DISCOUNT * q_table[tuple(next_observation_index)].max()
                - q_table[tuple(observation_index)][action]
            )
            observation_index = next_observation_index


if __name__ == "__main__":
    _main()