#!/usr/bin/env python3
"""Solve CartPole-v0 with tabular Q-learning over a discretized state space."""
import collections
import itertools

import gym
import numpy

# Number of bins per observation dimension when discretizing the
# continuous observation into Q-table indices.
_OBSERVATION_STEPS_NUM = 24
# Q-learning step size (alpha) and discount factor (gamma).
_LEARNING_RATE = 0.1
_DISCOUNT = 0.95
# > CartPole-v0 defines "solving" as getting average reward of 195.0 over 100 consecutive trials.
# https://gym.openai.com/envs/CartPole-v0/
_SUCCESS_AVERAGE_REWARD = 195.0
_SUCCESS_AVERAGE_WINDOW_SIZE = 100
def _discretize(observation, observation_min, observation_step):
    # Map a continuous observation to integer bin indices, clamping to the
    # valid range so an observation exactly at the upper bound does not
    # produce an out-of-range Q-table index (original could raise IndexError
    # on that boundary).
    index = ((observation - observation_min) / observation_step).astype(int)
    return numpy.clip(index, 0, _OBSERVATION_STEPS_NUM - 1)


def _main():
    """Train a tabular Q-learning agent on CartPole-v0 until it is solved.

    The continuous observation is discretized into _OBSERVATION_STEPS_NUM
    bins per dimension; a Q-table over (binned observation, action) is
    learned with the standard temporal-difference update.  Training stops
    once the average episode length over the last
    _SUCCESS_AVERAGE_WINDOW_SIZE episodes reaches _SUCCESS_AVERAGE_REWARD.
    """
    env = gym.make("CartPole-v0")
    # Heuristic discretization bounds, symmetric around zero.
    # NOTE(review): gym reports the cart/pole velocity components as
    # unbounded; the asserts below assume they stay within these limits in
    # practice — confirm for long runs.
    observation_min = numpy.array([-4.8, -4.8, -0.9, -4.0])
    assert (
        env.observation_space.shape == observation_min.shape
    ), env.observation_space.shape
    observation_max = observation_min * -1
    observation_step = (observation_max - observation_min) / _OBSERVATION_STEPS_NUM
    assert len(env.observation_space.shape) == 1
    # Q-table: one axis per observation dimension plus a final action axis.
    # Random negative initialization provides the only exploration pressure.
    q_table = numpy.random.uniform(
        low=-4,
        high=0,
        size=(
            [_OBSERVATION_STEPS_NUM] * env.observation_space.shape[0]
            + [env.action_space.n]
        ),
    )
    print("q_table:", q_table.itemsize * q_table.size / 2 ** 20, "MiB")
    # Sliding window of recent episode lengths (episode length == total
    # reward in CartPole, where every step yields reward 1).
    last_step_counts = collections.deque()
    last_step_counts_sum = 0
    for episode_index in itertools.count():
        observation = env.reset()
        render = (episode_index % 1000) == 0
        if render:
            env.render()
        observation_index = _discretize(
            observation, observation_min, observation_step
        )
        for step_index in itertools.count():
            # Greedy action selection (no epsilon-greedy exploration).
            action = q_table[tuple(observation_index)].argmax()
            next_observation, reward, done, info = env.step(action)
            if render:
                env.render()
            assert (next_observation >= observation_min).all(), next_observation
            assert (next_observation <= observation_max).all(), next_observation
            next_observation_index = _discretize(
                next_observation, observation_min, observation_step
            )
            if done:
                # Penalize early failure heavily; a near-maximum-length
                # episode keeps its ordinary terminal reward.
                q_table[tuple(observation_index)][action] = (
                    -300 if step_index < 190 else reward
                )
                episode_length = step_index + 1
                last_step_counts.append(episode_length)
                last_step_counts_sum += episode_length
                if len(last_step_counts) > _SUCCESS_AVERAGE_WINDOW_SIZE:
                    last_step_counts_sum -= last_step_counts.popleft()
                # Fix: average over the episodes actually in the window; the
                # original always divided by the window size, understating
                # the average until 100 episodes had completed.
                average_reward = last_step_counts_sum / len(last_step_counts)
                print(
                    f"episode #{episode_index}"
                    f"\t{episode_length} steps"
                    f"\taverage of {average_reward:.1f} steps"
                    f" over last {_SUCCESS_AVERAGE_WINDOW_SIZE} episodes"
                )
                # Fix: the cited criterion is an average of 195.0 over 100
                # consecutive trials, so require a full window and use >=
                # (original used a strict > comparison).
                if (
                    len(last_step_counts) >= _SUCCESS_AVERAGE_WINDOW_SIZE
                    and average_reward >= _SUCCESS_AVERAGE_REWARD
                ):
                    return
                break
            # `info` is expected to be empty for this environment.
            assert not info, info
            # Standard temporal-difference (Q-learning) update.
            q_table[tuple(observation_index)][action] += _LEARNING_RATE * (
                reward
                + _DISCOUNT * q_table[tuple(next_observation_index)].max()
                - q_table[tuple(observation_index)][action]
            )
            observation_index = next_observation_index
# Run training only when executed as a script, not on import.
if __name__ == "__main__":
    _main()