import React from 'react'
import Layout from '../../components/Layouts'
import BeamSearchViz from './BeamSearchViz'
import GreedySearchViz from './GreedySearchViz'
import PureSamplingSearchViz from './PureSamplingSearchViz'
import Img from 'gatsby-image'
import katex from 'katex'
import './NlgDecoder.css'
import { Divider, Row, Col, Tag } from 'antd'
import { graphql } from 'gatsby'
import NICFlowWithDecoder from './NICFlowWithDecoder'

// Page metadata exported for the site's frontmatter tooling.
// Dates are kept as zero-padded ISO-8601 strings (YYYY-MM-DD) so that they
// parse and sort unambiguously.
// NOTE(review): the page header rendered below shows "21 July, 2019" while
// `updated` says 1 July — confirm which date is intended.
export const frontmatter = {
  title: `NLG Decoding Algorithms`,
  written: '2019-06-20',
  // Was '2019-07-1': not a valid ISO-8601 date and inconsistent with `written`.
  updated: '2019-07-01',
  layoutType: `post`,
  contentType: 'dataviz',
  path: '/nlg-decoders/',
  category: 'VISUALISATION',
  image: `./poster.png`,
  description:
    'Intuitive guide to various decoding algorithms used in natural language generation (NLG)'
}

const KatexEquation = (props) => {
  var eq = katex.renderToString(props.equation, {
    throwOnError: false
  })

  return <span dangerouslySetInnerHTML={{ __html: eq }} />
}

export default class NLGDecoderVisualiser extends React.Component {
  render() {
    const tpe_hotel = this.props.data.tpe_hotel.childImageSharp
    const ref_img = this.props.data.ref_img.childImageSharp
    const queenstown = this.props.data.queenstown.childImageSharp
    const cityview = this.props.data.cityview.childImageSharp

    return (
      <Layout data={this.props.data} location={this.props.location}>
        <div style={{ maxWidth: '800px', margin: '0px auto' }}>
          <h1 className="header-title">Visualising Decoding Algorithms</h1>
          <h4
            className="header-subtitle"
            style={{ marginTop: 20, marginBottom: 40 }}
          >
            Desktop Version | 21 July, 2019
          </h4>
          <div className="story-content" style={{ marginBottom: 20 }}>
            <div>
              <p className="para">
                In the neural image caption generator <a href="/nic-p1">post</a>
                , we saw how to build and train the neural network that can
                generate the caption for any given image. We also saw how the
                choice of decoder impacts the quality of captions generated. And
                whilst we described how each decoder works, in words, I find it
                easier to understand the concepts when they are visualized.
              </p>
              {/* <Img sizes={cap_gen_flow.fluid} /> */}
              <div className="nic-decoder-img">
                <NICFlowWithDecoder />
              </div>
              <p className="para">
                Recollect that after we have trained our image caption model, we
                initiate the caption generation process by feeding the image
                tensor along with the special start of sentence token (i.e.
                &lt;START&gt;). The model generates the probability distribution
                (actually the logits) over our vocabulary of 36,780 words. The
                orange box shows the choice of decoding algorithms that helps us
                choose which word to use. The chosen word and the image is then
                passed again to the model until we meet the stopping criteria
                which are either we get the special end of sentence token (i.e.
                &lt;STOP&gt;) as our next word or we exceed the predefined
                number of steps where one step is passing the image and word
                tensor to the caption generator model and choosing the word
                using decoding algorithm.
              </p>
              <p className="para">
                In this post, we focus on the orange box i.e the decoding
                algorithms that help us choose the word from a probability
                distribution over the entire vocabulary.
              </p>
            </div>
            <div>
              <h2 className="is-size-4 heading">Greedy Decoder</h2>
              <p className="para">
                This is the most straightforward approach where we select the
                word that has the highest probability (i.e act greedily). And
                while it could generate the sequence of words, the quality of
                output is often low when compared to the other decoding
                algorithms. You can check out the "Test Drive" section under the
                image caption <a href="/nic-p1">post</a> for comparison.
              </p>
              <Img sizes={ref_img.fluid} />
              <p className="para" style={{ fontSize: 10, textAlign: 'center' }}>
                Silo Park, Auckland: Input image to test Greedy Decoder
              </p>
              <GreedySearchViz
                chartname="greedydecoder"
                url="https://gist.githubusercontent.com/katnoria/26e90ed605ac06f118bb8d38d6518499/raw/ffa079e431fac4d67bb40a4ad75df4535710b10d/greedysearch.json"
              />
              <p className="para" style={{ fontSize: 12, textAlign: 'center' }}>
                Greedy decoder in action: argmax)
              </p>
              <p className="para">
                It is not possible to show all 36,780 words from the vocabulary,
                so we pick the top 60 words for the visualization purpose. Also,
                note that it causes the labels to switch at every timestep but
                hopefully still conveys the idea.
              </p>
            </div>
            <div>
              <h2 className="is-size-4 heading">Beam Search Decoder</h2>
              <p className="para">
                In the greedy decoder, we considered a single word at every
                step. What if we could track multiple words at every step and
                use those to generate multiple hypotheses.
              </p>
              <p className="para">
                This is exactly what the beam search algorithm does, we define
                how many words (k) we want to keep at every step. The algorithm
                keeps track of k words along with its score, each seeded from
                the previous top scoring k words. The score is calculated as a
                sum of the log probability of the hypothesis generated so far.
              </p>
              <div style={{ marginTop: 10, marginBottom: 10 }}>
                <KatexEquation equation=" score(y_1,...,y_t)= \sum_{i=1}^t logP(y_i|y_1,....,y_{i-1}, x)" />
              </div>
              <p className="para">
                where t is the step, x is the input image and y are the words
                generated. The stopping criteria remain the same as greedy
                search where the hypothesis stops as soon as we encounter
                &lt;STOP&gt; or run out of a predefined maximum number of
                steps.The end result is a tree of words (i.e multiple
                hypotheses), and we pick the one that has the highest score as
                our final solution.
              </p>
              <p className="para">
                When we use k=1, it works just like the greedy decoder algorithm
                and suffers from the same issue of producing low-quality output.
                As we increase k the algorithm starts to generate better quality
                output, although at larger k the output gets very short. Also,
                note that increasing k is compute-intensive because we need to
                keep track of k words at every step.
              </p>
              <Img sizes={queenstown.fluid} />
              <p className="para" style={{ fontSize: 10, textAlign: 'center' }}>
                Queenstown: Input image to test beam search decoder
              </p>
              <BeamSearchViz />
              <p className="para" style={{ fontSize: 12, textAlign: 'center' }}>
                Beam search decoder with k=3 and max steps as 51
              </p>
              <p className="para">
                The start and stop words are highlighted in green and red, and
                the grey text shows the score of sequence at that step or point
                in time.
              </p>
              <p className="para">
                You may find <a href="/nlg-beamsearch">this post</a> on effect
                of parameters on beam search useful.
              </p>
            </div>
            <div>
              <h2 className="is-size-4 heading">Pure Sampling Decoder</h2>
              <p className="para">
                Pure sampling decoder is very similar to the greedy search
                decoder, but instead of picking the word with the highest
                probability, we randomly sample the word from the probability
                distribution of entire vocabulary. The sampling methods such as
                pure sampling and Top-K sampling (below) provide better
                diversity and are generally considered better at generating
                natural language.
              </p>
              <Img sizes={cityview.fluid} />
              <p className="para" style={{ fontSize: 10, textAlign: 'center' }}>
                Marina Bay, Singapore: Input image to test pure sampling decoder
              </p>
              <PureSamplingSearchViz
                chartname="puresampling"
                url="https://gist.githubusercontent.com/katnoria/8a590b44c15db10a39d008878e5c3a2d/raw/90658a8df990e1d88b64e1e034727278cccd14c5/pure_sampling_hist_d3_200.json"
              />
              <p className="para" style={{ fontSize: 12, textAlign: 'center' }}>
                Pure sampling decoder:{' '}
                <span
                  style={{
                    color: 'orange',
                    background: 'rgba(255,165,0, 0.1)',
                    borderColor: 'rgb(255,165,0)'
                  }}
                >
                  orange - selected word
                </span>{' '}
                and{' '}
                <span
                  style={{
                    color: 'steelblue',
                    backgroundColor: 'rgba(70,130,180, 0.1)'
                  }}
                >
                  blue - highest probability word
                </span>
              </p>
            </div>
            <div>
              <h2 className="is-size-4 heading">Top-K Sampling Decoder</h2>
              <p className="para">
                This approach is similar to the Pure sampling decoder, but
                instead of using the entire probability distribution, we use
                top-k probable words. If we use k=1, it is same as greedy search
                and if we use the total length of vocabulary as k then it works
                as pure sampling decoder. The visualization below uses the same
                input image as the pure sampling example.
              </p>
              <PureSamplingSearchViz
                chartname="top_k_sampling"
                topk="8"
                topk_color="skyblue"
                url="https://gist.githubusercontent.com/katnoria/b755770660b761bed8625e65cdbbff23/raw/857cdcf465f9d272658f975fe08fbbbe00d492da/top-8_sampling_hist_d2_200.json"
              />
              <p className="para" style={{ fontSize: 12, textAlign: 'center' }}>
                Top-K sampling decoder with k=8,{' '}
                <span
                  style={{
                    color: 'orange',
                    background: 'rgba(255,165,0, 0.1)',
                    borderColor: 'rgb(255,165,0)'
                  }}
                >
                  orange - selected word
                </span>{' '}
                and{' '}
                <span
                  style={{
                    color: 'skyblue',
                    backgroundColor: 'rgba(135,206,235, 0.1)'
                  }}
                >
                  blue - top-k words with highest probability
                </span>
              </p>
            </div>
            <Divider />
            <div>
              <div style={{ marginTop: 24 }}>
                <div className="is-size-4 heading">Conclusion</div>
                <p className="para">
                  And that concludes the visualization of various decoding
                  algorithms I used in my <a href="/nic-p1">post</a>
                  &nbsp; on neural image caption generation. Here is one last
                  example showing the output of all four decoders for the same
                  input image.
                </p>
              </div>
              <div>
                <Row>
                  <Col span={12}>
                    <Img sizes={tpe_hotel.fluid} />
                  </Col>
                  <Col span={12}>
                    <div
                      style={{
                        marginLeft: 16,
                        marginTop: 10,
                        marginBottom: 10
                      }}
                    >
                      <Tag>Greedy</Tag>
                      {/* <div style={{marginLeft: 16, marginTop: 10, marginBottom: 10}}> */}
                      <label>Large building in the snow in the</label>
                      {/* </div> */}
                    </div>
                    <Divider />
                    <div
                      style={{
                        marginLeft: 16,
                        marginTop: 10,
                        marginBottom: 10
                      }}
                    >
                      <Tag>BEAM SEARCH</Tag>
                      <label>
                        <span className="caption-body">
                          Large building in a barn
                        </span>
                      </label>
                    </div>
                    <Divider />
                    <div
                      style={{
                        marginLeft: 16,
                        marginTop: 10,
                        marginBottom: 10
                      }}
                    >
                      <Tag>PURE</Tag>
                      <label>
                        <span className="caption-body">
                          Photo of green boxes in the snow
                        </span>
                      </label>
                    </div>
                    <Divider />
                    <div
                      style={{
                        marginLeft: 16,
                        marginTop: 10,
                        marginBottom: 10
                      }}
                    >
                      <Tag>TOP-K</Tag>
                      <label>
                        <span className="caption-body">
                          Large building in the snow away from below
                        </span>
                      </label>
                    </div>
                    <Divider />
                  </Col>
                </Row>
              </div>
            </div>
            <div>
              <Divider />
              <Row>
                <Col span={4} className="references-header">
                  {' '}
                  References:
                </Col>
                <Col span={20} className="references-text">
                  <ol>
                    <li>
                      My Post on Neural Image Caption Generator &nbsp;[
                      <a href="/nlg-p1">link</a>]
                    </li>
                    <li>
                      Show and Tell: A Neural Image Caption Generator &nbsp;[
                      <a
                        href="https://arxiv.org/abs/1411.4555"
                        target="_blank"
                        rel="noopener noreferrer"
                      >
                        arvix
                      </a>
                      ]
                    </li>
                    <li>
                      CS224N Machine Translation Lecture &nbsp;[
                      <a
                        href="http://web.stanford.edu/class/cs224n/slides/cs224n-2019-lecture08-nmt.pdf"
                        target="_blank"
                        rel="noopener noreferrer"
                      >
                        link
                      </a>
                      ]
                    </li>
                  </ol>
                </Col>
              </Row>
            </div>
          </div>
        </div>
      </Layout>
    )
  }
}

// Page query for the images used on this page. Each alias selects a file
// whose relative path matches the given regex and requests a `fluid` image
// (plus `presentationWidth`) at quality 100: max width 800 for
// `cap_gen_flow`, `ref_img`, `queenstown` and `cityview`, and 500 for
// `tpe_hotel`.
// The component's render() reads `ref_img`, `queenstown`, `cityview` and
// `tpe_hotel` from `props.data`; it does not read `cap_gen_flow` directly,
// but the whole `data` object is forwarded to `Layout`.
// Keep this a plain `graphql` tagged template with no interpolation —
// presumably required for Gatsby's static query extraction; confirm against
// the Gatsby docs before restructuring.
export const nlgDecoderQuery = graphql`
  query {
    cap_gen_flow: file(relativePath: { regex: "/caption-generation.png/" }) {
      childImageSharp {
        fluid(maxWidth: 800, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    ref_img: file(relativePath: { regex: "/reference-img.jpg/" }) {
      childImageSharp {
        fluid(maxWidth: 800, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    queenstown: file(relativePath: { regex: "/queenstown.jpg/" }) {
      childImageSharp {
        fluid(maxWidth: 800, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    cityview: file(relativePath: { regex: "/cityview.jpg/" }) {
      childImageSharp {
        fluid(maxWidth: 800, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    tpe_hotel: file(relativePath: { regex: "/taipei.jpeg/" }) {
      childImageSharp {
        fluid(maxWidth: 500, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
  }
`
