csnlp · Jun 23, 2019
diff --git a/‎DQN.py
+37 b/‎DQN.py
+37
diff --git a/‎DQN.pyc
1.62 KB b/‎DQN.pyc
1.62 KB
diff --git a/‎README.md
+2 b/‎README.md
+2
diff --git a/‎memory.py
+25 b/‎memory.py
+25
diff --git a/‎memory.pyc
1.36 KB b/‎memory.pyc
1.36 KB
diff --git a/‎train.py
+137 b/‎train.py
+137
diff --git a/‎utils.py
+84 b/‎utils.py
+84
diff --git a/‎utils.pyc
2.79 KB b/‎utils.pyc
2.79 KB
@@ -0,0 +1,37 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+
+class DQN(nn.Module):
+    """Convolutional Q-network: maps a batch of images to one Q-value per action.
+
+    Three unpadded 5x5, stride-2 convolutions (each followed by batch
+    normalisation and ReLU) feed a single linear head.
+
+    Args:
+        h: Height in pixels of the input images.
+        w: Width in pixels of the input images.
+        outputs: Number of Q-values produced per input. Defaults to 2, the
+            value that was previously hard-coded.
+    """
+
+    def __init__(self, h, w, outputs=2):
+        super(DQN, self).__init__()
+        self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2)
+        self.bn1 = nn.BatchNorm2d(16)
+        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
+        self.bn2 = nn.BatchNorm2d(32)
+        self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
+        self.bn3 = nn.BatchNorm2d(32)
+
+        def conv2d_size_out(size, kernel_size=5, stride=2):
+            # Output length of an unpadded convolution along one dimension.
+            return (size - kernel_size) // stride + 1
+
+        # The linear head needs the flattened size of the last feature map,
+        # so push the input size through the three convolutions.
+        convw, convh = w, h
+        for _ in range(3):
+            convw = conv2d_size_out(convw)
+            convh = conv2d_size_out(convh)
+
+        self.head = nn.Linear(convw * convh * 32, outputs)
+
+    def forward(self, x):
+        """Compute Q(s_t) for a batch of states.
+
+        Args:
+            x: Tensor of size (N, 3, H, W), where H and W are the ``h`` and
+                ``w`` given to the constructor.
+
+        Returns:
+            Tensor of size (N, outputs).
+        """
+        x = F.relu(self.bn1(self.conv1(x)))
+        x = F.relu(self.bn2(self.conv2(x)))
+        x = F.relu(self.bn3(self.conv3(x)))
+
+        # Flatten everything but the batch dimension for the linear head.
+        return self.head(x.view(x.size(0), -1))
+        
@@ -0,0 +1,2 @@
+# Deep Q-learning Network for Reinforcement Learning
+A re-implementation of the 《Reinforcement Learning (DQN) Tutorial》.
@@ -0,0 +1,25 @@
+import random
+from collections import namedtuple
+
+# One replay record; the fields are filled positionally by ReplayMemory.push.
+Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))
+
+class ReplayMemory(object):
+    """Fixed-capacity ring buffer of ``Transition`` records.
+
+    Once ``capacity`` records are stored, each new record overwrites the
+    oldest one, so only the most recent ``capacity`` transitions are kept.
+    """
+
+    def __init__(self, capacity):
+        self.capacity = capacity
+        self.memory = []
+        # Index of the slot the next push writes to.
+        self.position = 0
+
+    def push(self, *args):
+        """Store one transition.
+
+        Args:
+            *args: The four ``Transition`` fields, in order:
+                state, action, next_state, reward.
+
+        Raises:
+            TypeError: If the wrong number of fields is given. The buffer is
+                left untouched in that case.
+        """
+        # Build the record before growing the list: if the arguments are
+        # invalid, no ``None`` placeholder is left behind to poison sample().
+        transition = Transition(*args)
+        if len(self.memory) < self.capacity:
+            self.memory.append(None)
+        self.memory[self.position] = transition
+        # Wrap around so the oldest record is overwritten next.
+        self.position = (self.position + 1) % self.capacity
+
+    def sample(self, batch_size):
+        """Return ``batch_size`` distinct stored transitions chosen at random."""
+        return random.sample(self.memory, batch_size)
+
+    def __len__(self):
+        """Return the number of transitions currently stored."""
+        return len(self.memory)
+
@@ -0,0 +1,137 @@
+import random
+import math
+from itertools import count
+
+import gym
+import torch
+import torch.optim as optim
+import torch.nn.functional as F
+import matplotlib.pyplot as plt
+
+from DQN import DQN
+from utils import get_screen
+from utils import plot_durations
+from memory import ReplayMemory
+from memory import Transition
+
+def select_action(state):
+    """Choose an action for ``state`` with an epsilon-greedy policy.
+
+    The exploration probability decays exponentially from EPS_START towards
+    EPS_END as the module-level ``steps_done`` counter grows; every call
+    increments that counter.
+
+    Args:
+        state: Input passed to ``policy_net`` when the greedy branch is taken.
+
+    Returns:
+        A 1x1 tensor holding the chosen action index.
+    """
+    global steps_done
+    sample = random.random()
+    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
+            math.exp(-steps_done / float(EPS_DECAY))
+    steps_done += 1
+
+    # Explore: early on eps_threshold is close to EPS_START, so most actions
+    # are drawn at random.
+    if sample < eps_threshold:
+        return torch.tensor([[random.randrange(2)]], device=device, dtype=torch.long)
+
+    # Exploit: pick the action with the largest predicted Q-value. Nothing is
+    # backpropagated through this choice, so skip building the autograd graph.
+    with torch.no_grad():
+        return policy_net(state).max(1)[1].view(1, 1)
+
+
+def optimize_model(policy_net, optimizer):
+    """Run one optimisation step on ``policy_net`` from a sampled replay batch.
+
+    Does nothing until the module-level ``memory`` holds at least BATCH_SIZE
+    transitions. Otherwise it samples a batch, regresses Q(s_t, a_t) towards
+    reward + GAMMA * max_a Q_target(s_{t+1}, a) with the Huber loss, clamps
+    every gradient element to [-1, 1] and steps the optimizer.
+
+    Args:
+        policy_net: Network being trained.
+        optimizer: Optimizer that updates ``policy_net``'s parameters.
+    """
+    if len(memory) < BATCH_SIZE:
+        return
+    transitions = memory.sample(BATCH_SIZE)
+    # Turn a list of Transitions into one Transition of per-field tuples.
+    batch = Transition(*zip(*transitions))
+
+    # Boolean mask marking the transitions whose next_state is not None
+    # (i.e. the episode did not end on that step).
+    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
+                                  device=device, dtype=torch.bool)
+    non_final_next_states = [s for s in batch.next_state if s is not None]
+
+    state_batch = torch.cat(batch.state)
+    action_batch = torch.cat(batch.action)
+    reward_batch = torch.cat(batch.reward)
+
+    # policy_net(state_batch) gives the value of every action; gather picks
+    # the value of the action that was actually taken.
+    state_action_values = policy_net(state_batch).gather(1, action_batch)
+
+    # V(s_{t+1}) stays 0 for final states. The guard covers a batch made up
+    # entirely of final transitions, where torch.cat would get an empty list.
+    next_state_values = torch.zeros(BATCH_SIZE, device=device)
+    if non_final_next_states:
+        next_state_values[non_final_mask] = \
+            target_net(torch.cat(non_final_next_states)).max(1)[0].detach()
+    expected_state_action_values = (next_state_values * GAMMA) + reward_batch
+
+    # Huber loss
+    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))
+
+    optimizer.zero_grad()
+    loss.backward()
+    for param in policy_net.parameters():
+        param.grad.data.clamp_(-1, 1)
+    optimizer.step()
+
+# Module-level setup for training: environment, hyper-parameters, networks,
+# optimizer and replay memory. The functions above read these as globals.
+env = gym.make('CartPole-v0').unwrapped
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Reset before the first get_screen call below.
+env.reset()
+
+# Number of transitions sampled per optimisation step.
+BATCH_SIZE = 128
+# GAMMA is the discount factor
+GAMMA = 0.999
+# Exploration probability decays from EPS_START towards EPS_END; EPS_DECAY is
+# the scale (in action selections) of that exponential decay.
+EPS_START = 0.9
+EPS_END = 0.05
+EPS_DECAY = 200
+
+# The target network is re-synchronised every TARGET_UPDATE episodes.
+TARGET_UPDATE = 10
+
+# Window length of the moving average drawn by plot_durations.
+AVERAGE_SIZE = 10
+episode_durations = []
+
+# Grab one frame to learn the image size the networks must accept.
+init_screen = get_screen(env, device)
+_, _, screen_height, screen_width = init_screen.shape
+
+policy_net = DQN(screen_height, screen_width).to(device)
+target_net = DQN(screen_height, screen_width).to(device)
+
+# Start the target network as a copy of the policy network, in eval mode.
+target_net.load_state_dict(policy_net.state_dict())
+target_net.eval()
+
+optimizer = optim.RMSprop(policy_net.parameters())
+# Replay memory holding at most 10000 transitions.
+memory = ReplayMemory(10000)
+
+# Count of action selections so far; updated by select_action.
+steps_done = 0
+num_episodes = 300
+# Training loop: play num_episodes episodes, running one optimisation step
+# after every environment step.
+for i_episode in range(num_episodes):
+    # A state is the difference between two consecutive frames. Right after a
+    # reset both frames are rendered from the same environment state.
+    env.reset()
+    last_screen = get_screen(env, device)
+    current_screen = get_screen(env, device)
+    state = current_screen - last_screen
+    for t in count():
+        action = select_action(state)
+        _, reward, done, _ = env.step(action.item())
+        reward = torch.tensor([reward], device=device)
+
+        last_screen = current_screen
+        current_screen = get_screen(env, device)
+
+        # A terminal transition is stored with next_state = None, which
+        # optimize_model treats as a final state.
+        next_state = None if done else current_screen - last_screen
+
+        memory.push(state, action, next_state, reward)
+        state = next_state
+
+        optimize_model(policy_net, optimizer)
+        if done:
+            episode_durations.append(t+1)
+            plot_durations(episode_durations, AVERAGE_SIZE)
+            break
+
+    # Periodically copy the trained weights into the target network.
+    if i_episode % TARGET_UPDATE == 0:
+        target_net.load_state_dict(policy_net.state_dict())
+
+print("Complete")
+env.render()
+env.close()
+# Leave interactive mode so the final plot stays open until it is closed.
+plt.ioff()
+plt.show()
+
+
+
@@ -0,0 +1,84 @@
+from PIL import Image
+
+import gym
+import torch
+import numpy as np
+import matplotlib.pyplot as plt
+import gym
+import torchvision.transforms as T
+import matplotlib.pyplot as plt
+
+
+# resize chains three transforms: tensor -> PIL image, resize so the smaller
+# edge is 40 pixels (bicubic interpolation), then back to a tensor.
+# Image.BICUBIC is used because the Image.CUBIC alias was removed in Pillow 10.
+resize = T.Compose([
+    T.ToPILImage(),
+    T.Resize(40, interpolation=Image.BICUBIC),
+    T.ToTensor()
+    ])
+
+def plot_durations(episode_durations, AVERAGE_SIZE):
+    """Redraw figure 2 with the episode durations and their moving average.
+
+    Args:
+        episode_durations: Sequence of episode lengths, one per episode.
+        AVERAGE_SIZE: Window length of the moving average. The average is
+            drawn only once at least this many episodes are available, and
+            is padded with zeros for the first AVERAGE_SIZE - 1 episodes.
+    """
+    plt.figure(2)
+    plt.clf()
+    history = torch.tensor(episode_durations, dtype=torch.float)
+
+    plt.title('Training ...')
+    plt.xlabel('Episode')
+    plt.ylabel('Duration')
+    plt.plot(history.numpy())
+
+    if len(history) >= AVERAGE_SIZE:
+        # unfold(0, AVERAGE_SIZE, 1) yields every length-AVERAGE_SIZE sliding
+        # window; mean(1) reduces each window to a single point.
+        windows = history.unfold(0, AVERAGE_SIZE, 1)
+        leading_zeros = torch.zeros(AVERAGE_SIZE-1)
+        averaged = torch.cat((leading_zeros, windows.mean(1).view(-1)))
+        plt.plot(averaged.numpy())
+
+    # Short pause so the figure gets a chance to refresh.
+    plt.pause(0.001)
+
+def get_cart_location(env, screen_width):
+    """Return the cart's horizontal position as a pixel column.
+
+    ``env.state[0]`` is mapped linearly so that 0 lands on the middle of the
+    screen and the span [-env.x_threshold, env.x_threshold] covers
+    ``screen_width`` pixels.
+    """
+    pixels_per_unit = screen_width / (env.x_threshold * 2)
+    cart_x = env.state[0]
+    return int(cart_x * pixels_per_unit + screen_width / 2.0)
+
+def get_screen(env, device):
+    """Render the environment and return a cropped, resized frame tensor.
+
+    The frame is cut to the rows between 40% and 80% of its height and to a
+    window 60% of its width that follows the cart (pinned to the left or
+    right edge when the cart is near a border), scaled to [0, 1], passed
+    through ``resize`` and given a leading batch dimension (BCHW) on
+    ``device``.
+    """
+    # Rendered frame is height x width x channel; move channels first.
+    frame = env.render(mode='rgb_array').transpose((2, 0, 1))
+    _, frame_height, frame_width = frame.shape
+    top, bottom = int(frame_height * 0.4), int(frame_height * 0.8)
+    frame = frame[:, top:bottom]
+
+    view_width = int(frame_width * 0.6)
+    half_view = view_width // 2
+    cart_location = get_cart_location(env, frame_width)
+
+    if cart_location < half_view:
+        # Cart near the left border: take the leftmost view_width columns.
+        columns = slice(view_width)
+    elif cart_location > (frame_width - half_view):
+        # Cart near the right border: take the rightmost view_width columns.
+        columns = slice(-view_width, None)
+    else:
+        # Otherwise centre the window on the cart.
+        columns = slice(cart_location - half_view, cart_location + half_view)
+
+    frame = frame[:, :, columns]
+    frame = np.ascontiguousarray(frame, dtype=np.float32) / 255
+
+    # add a batch dimension: BCHW
+    return resize(torch.from_numpy(frame)).unsqueeze(0).to(device)
+
+# Manual check: render one CartPole frame through get_screen and display it.
+if __name__ == '__main__':
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    env = gym.make('CartPole-v0').unwrapped
+    env.reset()
+    plt.figure()
+    # get_screen requires the device argument; drop the batch dimension and
+    # move channels last (HWC) so imshow can draw the frame.
+    cart = get_screen(env, device).cpu().squeeze(0).permute(1, 2, 0).numpy()
+
+    plt.imshow(cart, interpolation='none')
+    plt.title('Cart')
+    plt.show()
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# Deep Q-learning Network for Reinforcement Learning`
	`2`	`+A re-implementation of the 《Reinforcement Learning (DQN) Tutorial》.`