import React from "react"
import { graphql } from "gatsby"
import { Tabs, Divider, Popover, Collapse } from 'antd';
import BlogPostChrome from "../../components/BlogPostChrome"
import tennisPlay from './images/tennis_play.gif'
import Img from 'gatsby-image'
import { Row, Col } from 'antd';
import katex from 'katex'
import VegaChart from "../../components/VegaChart"
import Layout from './../../components/Layouts';

// Short alias for antd's collapsible panel, used for the "Show Code" section.
const { Panel } = Collapse;

// Post metadata for this page.
// NOTE(review): the page query below selects `javascriptFrontmatter`, so this
// export is presumably read statically by a frontmatter transformer — keep it a
// plain object literal with identifier keys and literal values; confirm before
// introducing computed values.
export const frontmatter = {
  title: "Multi-Agent Deep Deterministic Policy Gradients",
  written: "2018-12-08",
  updated: "2018-12-18",
  layoutType: "post",
  contentType: "blog",
  path: "/maddpg/",
  category: "Reinforcement Learning",
  image: "./poster.jpg",
  description: "In this post, we train two agents to play tennis against each other",
}


class MultiAgentContinuousControl extends React.Component {
  constructor(props) {
    super(props);
    this.state = {
      mode: 'top',
    };
  }

  handleModeChange = (e) => {
    const mode = e.target.value;
    this.setState({ mode });
  }

  render() {
    // console.log(this.props.data)
    const flow_overview = this.props.data.flow_overview.childImageSharp.resolutions
    const maddpg_architecture = this.props.data.maddpg_architecture.childImageSharp.resolutions
    const visdom_plot_sm = this.props.data.visdom_plot_sm.childImageSharp.resolutions
    const network_arch = this.props.data.network_arch.childImageSharp.resolutions
    return (
      <Layout data={this.props.data} location={this.props.location}>
      <BlogPostChrome {...this.props.data.javascriptFrontmatter.data}>
        <h1>CONTINUOUS CONTROL WITH DEEP REINFORCEMENT LEARNING</h1>
        <article>

         <p>
             In this post, we are going to solve Tennis, a unity ml-agent environment, 
             using the approach presented in the paper "Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments".
         </p>

         <h2>Tennis</h2>
         <p>
         In this environment, two agents control rackets to bounce a ball over a net. 
         If an agent hits the ball over the net, it receives a reward of +0.1. 
         If an agent lets a ball hit the ground or hits the ball out of bounds, it receives a reward of -0.01. 
         Thus, the goal of each agent is to keep the ball in play.
         </p>
         <p>
         The observation space consists of 8 variables corresponding to the position and velocity 
         of the ball and racket. Each agent receives its own, local observation. 
         Two continuous actions are available, corresponding to movement toward (or away from) the net, 
         and jumping.
         </p>
         <p>
          The task is episodic, and in order to solve the environment, 
          the agents must get an average score of +0.5 (over 100 consecutive episodes, 
          after taking the maximum over both agents). Specifically,
        </p>
        <ul>
          <li>
            After each episode, we add up the rewards that each agent received (without discounting), 
            to get a score for each agent. This yields 2 (potentially different) scores. 
            We then take the maximum of these 2 scores.
          </li>
          <li>
            This yields a single score for each episode.
          </li>
        </ul>
        <p>
          The environment is considered solved, when the average (over 100 episodes) of those scores is at least +0.5.
        </p>
        <div style={{textAlign: "center", margin: "0 auto"}}>
          <img src={tennisPlay} />
          <p className="is-size-7 small">Agents playing the game using trained policy</p>
        </div>
        <Divider></Divider>
        <h2>Overview</h2>
        <p>
          The solution used in this post makes use of the approach presented in the paper 
          "Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments". 
          We use Actor-Critic networks, same as Deep deterministic policy gradients <a href="/ddpg-reacher">post</a>, 
          but instead of training each agent to learn from its own action, we incorporate actions taken by all agents. 
          Why use actions taken by all agents to train the single agent ? 
          Because, the environment state depends on the actions taken by all agents 
          (i.e non-stationary from a single agent's perspective) so if we 
          just train the agent using its own action, the policy network does not get enough information 
          to come up with a good policy and most likely will take longer to find any good policy.

          Having said that, It is possible to solve this environment by training each agent just using its own action (i.e DDPG), 
          see my implementation <a href="https://github.com/katnoria/unityml-tennis/blob/master/ddpg_multi.py" target="_blank" rel="noopener noreferrer">here</a>.         
        </p>
        <p>
          Going back to Multi-Agent Actor-Critic paper, the action from each agent is used only during the <strong>training</strong> phase to ease the training (centralised training).
          During execution, we just use the the policy network that returns the action for a given state.
          We do not use any information from other agents (
            <Popover content="Sorry, no blockchain here" title="">
              i.e.<span className="hover-text-anchor">&nbsp;decentralised&nbsp;</span>
                        </Popover>

             execution
            ).
        </p>
        <div style={{textAlign: "center", margin: "0 auto"}}>
          <Img resolutions={flow_overview} />
        </div>
        <p>
          This post builds upon my previous posts, <a href="/ddpg-reacher">deep deterministic policy gradients</a> (DDPG) and <a href="/nb_dqn_lunar">Deep Q-Network</a>, so it is best to review some of the core concepts such as experience replay, fixed target network and actior-critic there.
          We will go over them rather quickly here.
        </p>
        <ul>
          <li>The Actor network takes state as input and returns the action to take in the environment</li>
          <li>The Critic network, <a href="/nb_dqn_lunar">Deep Q-Network</a>, takes the state and action of all agents as input and returns the action-value (Q-value)</li>
          <li>The Critic and Actor networks are trained by sampling experiences from the replay buffer.</li>
          <li>The action-value from Critic is used to teach the Actor to choose better actions.</li> 
        </ul>
        <p>                  
          The key improvement over the DDPG approach is that we now share the actions taken by all agents to train each agent
          (see <a href="https://github.com/katnoria/unityml-tennis/blob/a056fb897200f97d63bef15fda0218ae7941b573/maddpg.py#L115">maddpg.py</a> to review the code).  
        </p>
        <Collapse bordered={false}>
            <Panel header="Show Code" key="1">
            <pre>
            <code className="language-python">
              {
                `
                def learn(self, agents, experience, gamma):
                """Use the experience to allow agents to learn. 
                The critic of each agent can see the actions taken by all agents 
                and incorporate that in the learning.
                Parameters:
                    agents (MADDPGAgent): instance of all the agents
                    experience (Tuple[torch.Tensor]):  tuple of (s, a, r, s', done) tuples 
                    gamma (float): discount factor
                """
                num_agents = len(agents)
                states, actions, rewards, next_states, dones = experience
                # ---------------central critic-------------------
                # use target actor to get action, here we get target actors from 
                # all agents to predict the next action
                next_actions = torch.zeros((len(states), num_agents, self.action_size)).to(device)
                for i, agent in enumerate(agents):            
                    next_actions[:, i] = agent.target_actor(states[:, i, :])
                
                # Flatten state and action
                # e.g from state (100,2,24) --> (100, 48)
                critic_states = flatten(next_states)
                next_actions = flatten(next_actions)
        
                # calculate target and expected
                Q_targets_next = self.target_critic(critic_states, next_actions)
                Q_targets = rewards[:, self.agent_index, :] + (gamma * Q_targets_next * (1 - dones[:, self.agent_index, :]))
                Q_expected = self.local_critic(flatten(states), flatten(actions))
        
                # use mse loss 
                critic_loss = F.mse_loss(Q_expected, Q_targets)
                critic_loss_value = critic_loss.item()
                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                if config['CLIP_GRADS']:
                    for param in self.local_critic.parameters():
                        param.grad.data.clamp_(-1 * config['CLAMP_VALUE'], config['CLAMP_VALUE'])
                self.critic_optimizer.step()
        
                # ---------------actor---------------------
                # Only update the predicted action of current agent
                predicted_actions = torch.zeros((len(states), num_agents, self.action_size)).to(device)
                predicted_actions.data.copy_(actions.data)
                predicted_actions[:, self.agent_index] = self.local_actor(states[:, self.agent_index])
                actor_loss = -self.local_critic(flatten(states), flatten(predicted_actions)).mean()        
                actor_loss_value = actor_loss.item()

                # Backward pass
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                if config['CLIP_GRADS']:
                    for param in self.local_actor.parameters():
                        # import pdb; pdb.set_trace()
                        param.grad.data.clamp_(-1 * config['CLAMP_VALUE'], config['CLAMP_VALUE'])
                self.actor_optimizer.step()
        
                # ----------------------- update target networks ----------------------- #
                if self.learn_step == 0:
                    # One time only, start local and target with same parameters
                    self._copy_weights(self.local_critic, self.target_critic)
                    self._copy_weights(self.local_actor, self.target_actor)
                else:
                    self.soft_update(self.local_critic, self.target_critic, config["TAU"])
                    self.soft_update(self.local_actor, self.target_actor, config["TAU"])
        
                self.learn_step += 1
                return actor_loss_value, critic_loss_value                
                `
              }
            </code>
            </pre>
            </Panel>
        </Collapse>
        <h2>Network Architecture</h2>
        <p>Similar to the previous post, we use fully connected layers for both Actor and Critic networks.</p>
        <div style={{textAlign: "center", margin: "0 auto"}}>
          <Img resolutions={maddpg_architecture} />
        </div>
        <h2>Hyper parameters</h2>
        <p>
          I tried several hyperparameters in order to solve the environment and the following worked the best for me.
        </p>
        <table>
          <tr>
            <th>Parameter</th>
            <th>Value</th>
            <th>Description</th>
          </tr>
          <tbody>
          <tr>
            <td>BUFFER_SIZE</td>
            <td>100000</td>
            <td>Replay memory buffer size</td>
          </tr>
          <tr>
            <td>BATCH_SIZE</td>
            <td>512</td>
            <td>Minibatch size</td>
          </tr>
          <tr>
            <td>GAMMA</td>
            <td>0.95</td>
            <td>Discount factor</td>
          </tr>
          <tr>
            <td>TAU</td>
            <td>1e-2</td>
            <td>Soft update of target parameters</td>
          </tr>
          <tr>
            <td>LR_ACTOR</td>
            <td>1e-3</td>
            <td>Actor learning rate</td>
          </tr>
          <tr>
            <td>LR_CRITIC</td>
            <td>1e-3</td>
            <td>Critic learning rate</td>
          </tr>
          <tr>
            <td>WEIGHT_DECAY</td>
            <td>0</td>
            <td>L2 weight decay</td>
          </tr>
          <tr>
            <td>SCALE_REWARD</td>
            <td>1.0</td>
            <td>Reward scaling (1.0 means no scaling)</td>
          </tr>
          <tr>
            <td>SIGMA</td>
            <td>0.01</td>
            <td>OU Noise standard deviation</td>
          </tr>
          <tr>
            <td>LEARN_STEP</td>
            <td>1</td>
            <td>How often to perform learning step (i.e after episode)</td>
          </tr>
          <tr>
            <td>CLIP_GRADS</td>
            <td>True</td>
            <td>Should we clip gradients</td>
          </tr>
          <tr>
            <td>CLAMP_VALUE</td>
            <td>1</td>
            <td>Gradient Clip value (e.g a value of 1 gets set as -1,+1)</td>
          </tr>
          <tr>
            <td>FC1</td>
            <td>64</td>
            <td>Input channels for 1st hidden layer</td>
          </tr>
          <tr>
            <td>FC2</td>
            <td>64</td>
            <td>Input channels for 2nd hidden layer</td>
          </tr>
          </tbody>
        </table>  
        <h2>Performance</h2>      
        <p>
          The environment is considered solved when the average reward over 100 episode is +0.5.
          In the plot below, we can see that MADDPG solved the environment in half the number of 
          episodes as compared to DDPG version and its training was a lot more stable. 
        </p>
        <div style={{textAlign: "center", margin: "0 auto"}}>
          <VegaChart id="random-policy"  chartName="random-policy" specUrl="https://raw.githubusercontent.com/katnoria/unityml-tennis/master/data/plot.json">
          </VegaChart>
        </div>
        <Divider />
        <h2>Future work</h2>
        <p>
        We solved the environment using general-purpose multi-agent learning algorithm. 
        The environment was solved in fewer episodes in comparison with deep deterministic policy 
        gradients solution (see <a href="https://github.com/katnoria/unityml-tennis/blob/master/Report_DDPG.md">DDPG Report</a> on github). I found that training was very sensitive to soft 
        update parameter (TAU), discount factor (GAMMA), gradient clipping and number of steps in 
        an episode. The following could be explored to improve training stability and performance:
        </p>
        <ul>
          <li>use prioritized experience replay to give more importance to experience tuples that can provide high expected learning progress</li>
          <li>experiment further with learning rates and introduce weight decay (currently set to 0) to speed up the learning</li>
          <li>implement ensemble policies to improve training performance (details below)</li>
        </ul>
        <p>
          The Multi-agent reinforcement learning suffers from environment non-stationarity 
          problem because the agent(s) policies change during the training. This could lead 
          to agents learning the behaviour of their competitors which is undesirable 
          because they may fail when competitors alter their strategies.
          The paper suggests that we can tackle this issue by making use of 
          ensemble of policies where each agent trains a fixed number of sub-policies and at 
          each episode a random policy per agent is chosen to execute the actions. 
          This way, the agent and its competitors use one of the sub-policies at each episode.          
        </p>
        <p>
          The code along with model checkpoint and DDPG version is available on my github <a href="https://github.com/katnoria/unityml-tennis">repository</a>.
        </p>
        </article>                   
        <Divider />
        <Row>
                        <Col span={4} className="references-header">
                            References & Links:
          </Col>
                        <Col span={20} className="references-text">
                            <ol>
                              <li>
                              Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments  
                                &nbsp;[<a href="https://arxiv.org/abs/1706.02275" target="_blank" rel="noopener noreferrer">arvix</a>]                                
                            </li>
                            <li>
                                Benchmarking Deep Reinforcement Learning for Continuous Control  
                                &nbsp;[<a href="https://arxiv.org/abs/1604.06778" target="_blank" rel="noopener noreferrer">arxiv</a>]                                
                            </li>
                            <li>
                               Prioritized experience replay
                               &nbsp;[<a href="https://arxiv.org/abs/1511.05952" target="_blank" rel="noopener noreferrer">arxiv</a>]
                            </li>
                            </ol>
                        </Col>
                    </Row>                
      </BlogPostChrome>
      </Layout>
    )
  }
}

export default MultiAgentContinuousControl

// Inline style presets; `row` is a wrapping flex container with negative side margins.
const styles = {
  row: {
    display: `flex`,
    flexWrap: `wrap`,
    margin: `8px -4px 1rem`,
  },
}

// Page query for this post; the component reads its result through `this.props.data`.
//  - `javascriptFrontmatter`: the node whose `fields.slug` equals `$slug`, expanded
//    through the `JSBlogPost_data` fragment (defined outside this file).
//  - Every aliased `file(...)` entry selects a file by a `relativePath` regex and asks
//    for a `childImageSharp.resolutions` image at the stated width/height.
// NOTE(review): the `.` in each regex is unescaped, so it matches any character
//  rather than only a literal dot.
// NOTE(review): not every alias below appears to be read by the component in this
//  file — confirm against other consumers of `data` before pruning any of them.
export const pageQuery = graphql`
    query maddpgquery($slug: String!) {
      javascriptFrontmatter(fields: { slug: { eq: $slug } }) {
        ...JSBlogPost_data
      }
      flow_overview: file(
        relativePath: {regex: "/maddpg_overview.png/"}) {
        childImageSharp {
          resolutions(width: 480, height: 378) {
            ...GatsbyImageSharpResolutions
          }
        }
      }    
      maddpg_architecture: file(
        relativePath: {regex: "/ma_ddpg_architecture.png/"}) {
        childImageSharp {
          resolutions(width: 640, height: 521) {
            ...GatsbyImageSharpResolutions
          }
        }
      }    
      visdom_plot_sm: file(
        relativePath: {regex: "/maddpg_solved_visdom.jpg/"}) {
        childImageSharp {
          resolutions(width: 1024, height: 893) {
            ...GatsbyImageSharpResolutions
          }
        }
      }    
      critic_target: file(
        relativePath: {regex: "/critic_target_av.png/"}) {
        childImageSharp {
          resolutions(width: 800, height: 251) {
            ...GatsbyImageSharpResolutions
          }
        }
      }    
      actor_loss: file(
        relativePath: {regex: "/actor_loss_anno.png/"}) {
        childImageSharp {
          resolutions(width: 800, height: 202) {
            ...GatsbyImageSharpResolutions
          }
        }
      }    
      single_agent_soln: file(
        relativePath: {regex: "/single_agent_avg_score_sm.png/"}) {
        childImageSharp {
          resolutions(width: 800, height: 200) {
            ...GatsbyImageSharpResolutions
          }
        }
      }    
      single_agent_visdom: file(
        relativePath: {regex: "/single_agent_ddpg_solved_sm.png/"}) {
        childImageSharp {
          resolutions(width: 600, height: 626) {
            ...GatsbyImageSharpResolutions
          }
        }
      }    
      network_arch: file(
        relativePath: {regex: "/ddpg_arch.png/"}) {
        childImageSharp {
          resolutions(width: 600, height: 435) {
            ...GatsbyImageSharpResolutions
          }
        }
      }    
      ddpgflow: file(
        relativePath: {regex: "/ddpgflow.png/"}) {
        childImageSharp {
          resolutions(width: 525, height: 524) {
            ...GatsbyImageSharpResolutions
          }
        }
      }    
    }
  `