import React from "react"
import { graphql } from "gatsby"
import { Tabs, Divider } from 'antd';
import BlogPostChrome from "../../components/BlogPostChrome"
import Img from 'gatsby-image'
import { Row, Col } from 'antd';
import reacher from './images/singleplayer.gif';
import katex from 'katex'
import AgentTrainingPlot from './AgentTrainingPlot';
import Layout from './../../components/Layouts';

// Metadata for this post: title, written/updated dates, layout and content type,
// route path, category, poster image and a short description.
// NOTE(review): presumably surfaced through the `javascriptFrontmatter` node that the
// page query in this file reads and spreads into <BlogPostChrome> — confirm against
// the site's gatsby-config before changing the shape or literal style of this object.
export const frontmatter = {
  title: `Deep Deterministic Policy Gradients`,
  written: `2018-11-01`,
  updated: `2018-11-04`,
  layoutType: `post`,
  contentType: "blog",
  path: `/ddpg-reacher/`,
  category: `Reinforcement Learning`,
  image: './poster.jpg',
  description: `We use deep deterministic policy gradients to solve unity ml-agent with continuous action space`,
}


const CriticLoss = () => {
var loss_equation = katex.renderToString(
  `
    L = \\textcolor{orange}{\\frac {1}{N} \\sum_i} (\\textcolor{red}{{y_i}} - \\textcolor{green}{Q}(\\textcolor{#0099cc}{s_i}, \\textcolor{#0099cc}{a_i}|\\textcolor{green}{\\theta^Q)})^2 \\\\
    \\textcolor{orange}{{\\text{The average of squared differences between the}}}\\\\ 
    \\textcolor{red}{{\\text{ target action-value}}}
    \\textcolor{black}{{\\text{ and }}}
    \\textcolor{green}{{\\text{the expected action-value}}}\\\\
    \\textcolor{black}{{\\text{ where}}}\\\\
    \\textcolor{green}{{\\text{ the expected action-value is given by the local critic network}}}\\\\
    \\textcolor{#0099cc}{{\\text{ that takes state and action as input}}}      
  `, {
    throwOnError: false
});
  
  return (
      <div style={{textAlign: "center"}}>
          <span dangerouslySetInnerHTML={{__html: loss_equation}} >             
          </span>        
      </div>
  )
} 

const TargetActionValue = () => {
var loss_equation = katex.renderToString(
  `
  y_i = \\textcolor{#0099cc}{r_i} + \\textcolor{#ff4444}{\\gamma} \\textcolor{#40e0d0}{Q^{\\prime}}(\\textcolor{blue}{s_{i+1}}, \\textcolor{#ffa500}{\\mu^{\\prime}}(\\textcolor{blue}{s_{i+1}}|\\textcolor{#ffa500}{\\theta^{\\mu^{\\prime}}})|\\textcolor{#40e0d0}{\\theta^{Q^{\\prime}}})
  \\\\
  \\textcolor{black}{{\\text{calculate the target estimate by adding }}}\\\\
  \\textcolor{#0099cc}{{\\text{ the reward }}}\\textcolor{black}{{\\text{and }}}
  \\textcolor{red}{{\\text{discounted }}}\\textcolor{#40e0d0}{{\\text{action-value }}}
  \\\\\\textcolor{black}{{\\text{ where }}}\\\\
  \\textcolor{#40e0d0}{{\\text{the target critic network takes }}}
  \\textcolor{blue}{{\\text{ state }}}\\\\
  \\textcolor{#40e0d0}{{\\text{ and }}} 
  \\textcolor{#ffa500}{{\\text{action }}} 
  \\textcolor{#40e0d0}{{\\text{ as input and returns the }}} 
  \\textcolor{#40e0d0}{{\\text{ action-value.}}} 
  \\\\
  \\textcolor{#ffa500}{{\\text{The target actor network maps the }}} 
  \\textcolor{blue}{{\\text{ state }}}
  \\textcolor{#ffa500}{{\\text{to action.}}} 
    `, {
    throwOnError: false
});
  
  return (
      <div style={{textAlign: "center"}}>
          <span dangerouslySetInnerHTML={{__html: loss_equation}} >             
          </span>        
      </div>
  )
} 

const ActorLoss = () => {
var loss_equation = katex.renderToString(
  `
  \\nabla_{\\theta^{\\mu}}J 
  \\approx 
  \\textcolor{black}{\\frac {1}{N} \\sum_i} 
  \\nabla_a 
  \\textcolor{green}{Q}(\\textcolor{#0099cc}{s},\\textcolor{orange}{a}\\mid \\textcolor{green}{\\theta^Q})
  \\scriptstyle{\\mid{\\textcolor{#0099cc}{s=s_i}, \\textcolor{orange}{a=\\mu(s_i)}}}
  \\displaystyle
  \\nabla_{\\textcolor{orange}{\\theta^{\\mu}}} \\textcolor{orange}{\\mu}
  (\\textcolor{#0099cc}{s}\\mid\\textcolor{orange}{\\theta^{\\mu}})
  \\\\
  
  \\textcolor{black}{{\\text{The average of }}}
  \\textcolor{green}{{\\text{ action-values given by the local critic network that takes}}}\\\\
  \\textcolor{#0099cc}{{\\text{state}}}
  \\textcolor{black}{{\\text{ and }}}
  \\textcolor{orange}{{\\text{action}}}
  \\textcolor{green}{{\\text{ as input }}}\\\\
  \\textcolor{black}{{\\text{ where}}}\\\\ 
  \\textcolor{orange}{{\\text{ the action is estimated by the local actor network that takes }}}
  \\textcolor{#0099cc}{{\\text{state}}}
  \\textcolor{orange}{{\\text{ as input }}}
    `, {
    throwOnError: false
});
  
  return (
      <div style={{textAlign: "center"}}>
          <span dangerouslySetInnerHTML={{__html: loss_equation}} >             
          </span>        
      </div>
  )
} 

class DDPGReacher extends React.Component {
  constructor(props) {
    super(props);
    this.state = {
      mode: 'top',
    };
  }

  handleModeChange = (e) => {
    const mode = e.target.value;
    this.setState({ mode });
  }

  render() {
    console.log(this.props.data)
    const algo = this.props.data.algo.childImageSharp.resolutions
    const ddpgflow = this.props.data.ddpgflow.childImageSharp.resolutions
    const single_agent_visdom = this.props.data.single_agent_visdom.childImageSharp.resolutions
    const network_arch = this.props.data.network_arch.childImageSharp.resolutions
    return (
      <Layout data={this.props.data} location={this.props.location}>
      <BlogPostChrome {...this.props.data.javascriptFrontmatter.data}>
        <h1>CONTINUOUS CONTROL WITH DEEP REINFORCEMENT LEARNING</h1>
        <article>

         <p>
             In this post, we are going to solve Reacher, a unity ml-agent environment, 
             using deep deterministic policy gradient (DDPG) method introduced by Lillicrap et al.
         </p>

         <h2>Reacher</h2>
         <p>
            In this environment, a double-jointed arm can move to target locations. 
            A reward of +0.1 is provided for each step that the agent's hand is in the goal location. 
            Thus, the goal of our agent is to maintain its position at the target location for as many time steps as possible.
        </p>
         <img src={reacher} />         
        <p>
            The observation space consists of 33 variables corresponding to position, rotation, 
            velocity, and angular velocities of the arm. Each action is a vector with four numbers, 
            corresponding to torque applicable to two joints. Every entry in the action vector should 
            be a number between -1 and 1.
            This means that environment requires the agent to learn from high dimensional state space 
            and perform actions in <strong>continuous action space</strong>.
        </p>
          <p>
            Previously we looked at value based methods such as <a href="/nb_dqn_lunar">DQN</a> and 
            simple policy based methods such as <a href="/hillclimb">hill climbing</a> algorithm 
            to solve environments with continuous state space but discrete action space. 
            It turns out that neither algorithm is well suited to solve Reacher; DQN will require finding an action that 
            maximizes the action-value which in turn requires iterative optimization process at every step; 
            hill climbing methods could take forever as they rely on randomly perturbing the policy weights.             
          </p>
          <p>
          REINFORCE, policy based method, can learn the policy to map state into actions but they are 
          sample inefficient, noisy because we are sampling a trajectory (or a few trajectories) which may 
          not truly represent the policy and could prematurely converge to local optima.
          </p>
        </article>
        <Divider />
        <h2>Deep deterministic policy gradients</h2>
        <p>
          The Deep deterministic policy gradients paper introduced a model free, off-policy actor-critic algorithm 
          that uses deep neural networks to learn policies in high-dimensional, continuous action spaces. 
        </p>          
        <p>
          In the diagram below, I have outlined the training process. 
          The agent is trained for the fixed number of episodes and with in each episode, fixed number of timesteps:
        </p>
        <Img resolutions={ddpgflow} />
        <div style={{backgroundColor: '#f1f4f7', marginTop: 10}} className="box">
          For fixed number of timesteps in an episode, do
          <ul>
            <li>Choose an action for the given state (step 1 and step 2)</li>
            <li>Take action and receive next state, reward, done (whether episode finished?) </li>
            <li>Store the current state, action, next state, reward and done as experience tuple in memory buffer (step 3)</li>
            <li>Sample random batch of experience  (i.e length of memory > batch size, step 4) </li>
            <li>Train Actor and Critic networks using sampled minibatch</li>
          </ul>           
        </div>
        <h4>Training Actor and Critic Network</h4>        
        <p>
          The actor network takes state as input and returns the action 
          whereas the critic network takes state and action as input and returns the value.
          The critic in this case is a DQN with local and fixed target networks and replay buffer (memory).
          Both, actor and critic use two neural networks: local and fixed. The local networks are trained 
          by sampling experiences from replay buffer and minimising the loss function.
        </p>
        <p>This is how I understood the loss functions for actor and critic</p>
        <div className="box">
          <p>The critic loss is given by </p>
          {/* <Img resolutions={critic_loss} /> */}
          <CriticLoss />
          <p style={{marginTop: 10}}>and target action-value is calculated as </p>
          {/* <Img resolutions={critic_target} /> */}
          <TargetActionValue />
        </div>  
        <div className="box">
        <p>The actor is updated using sampled policy gradient.</p>
        <ActorLoss />
        </div>
        <p>
          The target networks in DQN paper were update by directly copying all the weights from local network,
          but in this paper, they use soft updates to constraint the target values to update slowly as it 
          greatly improves the learning stability.
        </p>
        <p>The following image from the paper shows the full algorithm</p>
        <Img resolutions={algo} />
        <h2>Network architecture</h2>

        <p>
          We use a simple network architecture because we are using raw state values as input instead of pixels.
          Hence, it is entirely possible to train this network on CPU.
          If we were to learn from pixels then we would have choosen convolutional layers along with linear.</p>
        <Img resolutions={network_arch} />  
        <h2>Training</h2>
        <p>
          The single agent version was trained on 8-core CPU machine and the final version of the model took 36 minutes to solve the environment.
          However, before coming up with the final version, there was a long period of trial and error, to find out the right set of 
          hyperparameters and network capacity. 
          During this period, I found the following items helpful in organising my experiments.
        </p>
        <h4>1. Keep track of experiments</h4>
        <p>
          Create a new folder every time an experiment is run. 
          Record hyperparameters, network architecture and comments along with the verbose log file.
          Save desired metrics as csv for analysis later
           (<a href="https://github.com/katnoria/ddpg-reacher/tree/master/data/single/2018-10-25-123253" target="_blank" rel="noopener noreferrer">example</a>).
        </p>        
        
        <h4>2. Get real-time feedback</h4>
        <p>
          Publish real-time metrics, messages to visdom server for live visualisation.
          Sample screenshot of visdom during the training run.   
        </p>
        {/* <img src={visdomgif} height="600"/> */}
        <Img resolutions={single_agent_visdom} />
        <h4>3. Model checkpoint</h4>
        <p>
          Save model weights on every improvement over the current best score. 
          This way I could pause the training and continue later by loading the saved model,
          it also gave me the flexiblity of using the saved model as starting point 
          in some trials rather than training everything from the scratch everytime.  
        </p>

        <h2>Final Model</h2>
        <p>
          This section lists the hyperparameters used in the final model. The  model was able to solve 
          the environment in 203 episodes where each episode was trained for 1000 timesteps.
        </p>
        <h3>Hyperparameters</h3>
        <table>
          <tr>
            <th>Parameter</th>
            <th>Value</th>
            <th>Description</th>
          </tr>
          <tbody>
          <tr>
            <td>BATCH_SIZE</td>
            <td>256</td>
            <td>Minibatch size</td>
          </tr>
          <tr>
            <td>GAMMA</td>
            <td>0.9</td>
            <td>Discount factor</td>
          </tr>
          <tr>
            <td>TAU</td>
            <td>1e-3</td>
            <td>Soft update of target parameters</td>
          </tr>
          <tr>
            <td>LR_ACTOR</td>
            <td>1e-3</td>
            <td>Actor learning rate</td>
          </tr>
          <tr>
            <td>LR_CRITIC</td>
            <td>1e-3</td>
            <td>Critic learning rate</td>
          </tr>
          <tr>
            <td>WEIGHT_DECAY</td>
            <td>0</td>
            <td>L2 weight decay</td>
          </tr>
          <tr>
            <td>SCALE_REWARD</td>
            <td>1.0</td>
            <td>Reward scaling (1.0 means no scaling)</td>
          </tr>
          <tr>
            <td>SIGMA</td>
            <td>0.01</td>
            <td>OU Noise standard deviation</td>
          </tr>
          <tr>
            <td>FC1</td>
            <td>128</td>
            <td>Input channels for 1st hidden layer</td>
          </tr>
          <tr>
            <td>FC2</td>
            <td>128</td>
            <td>Input channels for 2nd hidden layer</td>
          </tr>
          </tbody>
        </table>
        <p>          
          Below, we see the average score received by the agent during its training process.
          </p>
        <AgentTrainingPlot />
        <Divider />
        <h2>Multi Agent Environment</h2>
        <p>
          The multi agent reacher environment includes twenty agents and presents a different challenge.
          The environment is considered solved when the average score of all twenty agents is +30 or above over 100 episodes.
          In order to solve this environment, we make use of twenty actor-critic networks but shared replay buffer 
          because the experience tuple experienced by each agent can be useful to others.
        </p>
        <p>
          The network architecture and hyperparameters largely remain the same but it took a lot more trials to 
          get the agent to solve the environment. This was mainly because the agent learned very slowly 
          e.g. improvement from 22.36 to 22.37 took 20 minutes and this lead me to "stop the training, tune hyperparameters 
          and try again" loop. This is where the model checkpointing came in handy because I could start training from where the 
          last training process was stopped (i.e killed) and try out different hyperparams (e.g learning rate, weight decay).           
        </p>
        <p>
          In the end, it took 28 hours of training (not including failed trials) to solve this environment.
          The training metrics and logs are available on my <a href="https://github.com/katnoria/ddpg-reacher/tree/master/data/multi" target="_blank" rel="noopener noreferrer">github repo</a>.
          The following video shows the trained agents in action.
        </p>
        <video className="video-container video-container-overlay" autoPlay={false} muted controls>
              <source type="video/mp4" data-reactid=".0.1.0.0.0" src="https://sprinkle.nyc3.digitaloceanspaces.com/results/reacher/reacher20.mp4" />
        </video>

        <Divider />
        <h2>Future work</h2>
        <p>
        Some people were able to solve the environment in relatively fewer number of training episodes. 
        I think a combination of following could help speed up the training process: 
        </p>       
        <ul>
          <li>reduce network size either by remove one hidden layer or decreasing the number of units as this would result in a lot less parameters</li>
          <li>increase the learning rate and introduce weight decay (currently set to 0) to speed up the learning</li>
          <li>experiment with scaled rewards (see below)</li>
        </ul>
        <p>
          This <a href="https://arxiv.org/abs/1604.06778" target="_blank" rel="noopener noreferrer">paper</a> found that DDPG is less stable than batch algorithms such as REINFORCE 
          and the performance of policy could degrade significantly during the training phase. 
          In my tests, I found that average score plateaued if I continued the training process 
          even after solving the environment. 
          The paper also suggests that scaling the rewards could improve the training stability.
        </p>

        <p>
          While working on DDPG solution, there were a lot of moving parts such as network architecture, 
          hyperparameters and it took a long time, along with incorporating suggestions from the forum, to discover 
          the combination that could solve the environment. 
          Proximal policy optimization (PPO) has shown to achieve state-of-the-art results with very 
          little hyperparamter tuning and greater sample efficiency while keeping the policy deviation 
          under check (by forcing the ratio of old and new policy with in a small interval).
        </p>
        <p>
          The code along with model checkpoints is available on my github 
           <a href="https://github.com/katnoria/ddpg-reacher"> repository</a>.
        </p>
        <Divider />
        <Row>
                        <Col span={4} className="references-header">
                            References:
          </Col>
                        <Col span={20} className="references-text">
                            <ol>
                              <li>
                                Deep deterministic policy gradient  
                                &nbsp;[<a href="https://arxiv.org/abs/1509.02971" target="_blank" rel="noopener noreferrer">arvix</a>]                                
                            </li>
                            <li>
                                Benchmarking Deep Reinforcement Learning for Continuous Control  
                                &nbsp;[<a href="https://arxiv.org/abs/1604.06778" target="_blank" rel="noopener noreferrer">arxiv</a>]                                
                            </li>
                            <li>
                               Proximal Policy Optimization
                               &nbsp;[<a href="https://arxiv.org/abs/1707.06347" target="_blank" rel="noopener noreferrer">arxiv</a>]                               
                            </li>
                            </ol>
                        </Col>
                    </Row>        
      </BlogPostChrome>
      </Layout>
    )
  }
}

export default DDPGReacher

// Inline-style presets. Only `row` (a wrapping flex row with negative side
// margins) is defined; nothing else in this file reads `styles`.
const styles = {
  row: {
    margin: '8px -4px 1rem',
    flexWrap: 'wrap',
    display: 'flex',
  },
};

// Page query for this post. `$slug` selects the `javascriptFrontmatter` node for the
// page; every other field looks up one image file by a `relativePath` regex and asks
// for a fixed-size `resolutions` result.
//
// Aliases read directly by render() in this file: algo, ddpgflow, single_agent_visdom,
// network_arch.
// NOTE(review): reacher, critic_loss, critic_target, actor_loss and single_agent_soln
//  are not read by render() in this file, but the whole `data` object is also handed
//  to <Layout>, so confirm nothing there depends on them before pruning.
// NOTE(review): the `reacher` alias matches actor_loss_anno.png, the same file as
//  `actor_loss` — looks like a copy/paste leftover.
// NOTE(review): the dots in the regexes are unescaped, so they match any character.
export const pageQuery = graphql`
    query ddpgquery($slug: String!) {
      javascriptFrontmatter(fields: { slug: { eq: $slug } }) {
        ...JSBlogPost_data
      }
      algo: file(
        relativePath: {regex: "/algo.png/"}) {
        childImageSharp {
          resolutions(width: 800, height: 586) {
            ...GatsbyImageSharpResolutions
          }
        }
      }    
      reacher: file(
        relativePath: {regex: "/actor_loss_anno.png/"}) {
        childImageSharp {
          resolutions(width: 800, height: 799) {
            ...GatsbyImageSharpResolutions
          }
        }
      }    
      critic_loss: file(
        relativePath: {regex: "/critic_loss_anno.png/"}) {
        childImageSharp {
          resolutions(width: 800, height: 231) {
            ...GatsbyImageSharpResolutions
          }
        }
      }    
      critic_target: file(
        relativePath: {regex: "/critic_target_av.png/"}) {
        childImageSharp {
          resolutions(width: 800, height: 251) {
            ...GatsbyImageSharpResolutions
          }
        }
      }    
      actor_loss: file(
        relativePath: {regex: "/actor_loss_anno.png/"}) {
        childImageSharp {
          resolutions(width: 800, height: 202) {
            ...GatsbyImageSharpResolutions
          }
        }
      }    
      single_agent_soln: file(
        relativePath: {regex: "/single_agent_avg_score_sm.png/"}) {
        childImageSharp {
          resolutions(width: 800, height: 200) {
            ...GatsbyImageSharpResolutions
          }
        }
      }    
      single_agent_visdom: file(
        relativePath: {regex: "/single_agent_ddpg_solved_sm.png/"}) {
        childImageSharp {
          resolutions(width: 600, height: 626) {
            ...GatsbyImageSharpResolutions
          }
        }
      }    
      network_arch: file(
        relativePath: {regex: "/ddpg_arch.png/"}) {
        childImageSharp {
          resolutions(width: 600, height: 435) {
            ...GatsbyImageSharpResolutions
          }
        }
      }    
      ddpgflow: file(
        relativePath: {regex: "/ddpgflow.png/"}) {
        childImageSharp {
          resolutions(width: 525, height: 524) {
            ...GatsbyImageSharpResolutions
          }
        }
      }    
    }
  `