import React from "react"
import {Button, Row, Col, Divider, Modal } from 'antd';
import { graphql } from "gatsby"
import katex from 'katex'
import Img from 'gatsby-image'
import Layout from '../../components/Layouts';
import BlogPostChrome from "../../components/BlogPostChrome"
import VegaChart from "../../components/VegaChart"
import GeneratedCaptionsCarousel from './GeneratedCaptions'
import "./NIC.css"
import CocoDatasetSamplePage from "./CocoDatasetSample";
import CustomImageGridWithCaption from './CustomImageGridWithCaption';


// Post metadata for this page.
// NOTE(review): presumably extracted by a JavaScript-frontmatter source
// plugin (the page query in this file selects a `javascriptFrontmatter`
// node) — confirm against the site's gatsby-config before changing the
// shape or the literal style of these values.
export const frontmatter = {
    title: `Neural Image Caption Generator`,
    // Authoring / last-revision dates, written as YYYY-MM-DD strings.
    written: `2019-06-30`,
    updated: `2019-07-06`,
    layoutType: `post`,
    contentType: `blog`,
    // Site-relative URL of the post.
    path: `/nic-p1/`,
    category: `Deep Learning`,
    // Image assets referenced relative to this file's directory.
    image: `./poster.png`,
    cover: `./header2.jpg`,
    description: `Generating image caption using Convolutional and Recurrent Neural Networks`
}


const KatexEquation = (props) => {
  var eq = katex.renderToString(props.equation, {
      throwOnError: false
  });
  
  return (
      <span dangerouslySetInnerHTML={{__html: eq}} /> 
  )
} 

class NICPage extends React.Component {
    // constructor(props) {
    //     super(props)
    // }

    state = {
      cocoVisible: false,
    };

    showCocoSamples = () => {
      this.setState({
        cocoVisible: true
      });
    }

    handleOkCoco = () => {
      this.setState({cocoVisible: false});
    }

    handleCancelCoco = () => {
      this.setState({cocoVisible: false});
    }

    onCarouselChange = (e) => {
      console.log(e);      
    }
    
    render() {    
      const encoder_decoder = this.props.data.encoder_decoder.childImageSharp;
      const mt_example = this.props.data.mt_example.childImageSharp;
      const show_and_tell = this.props.data.show_and_tell.childImageSharp;
      const wordtree = this.props.data.wordtree.childImageSharp;
      const nic_vs_attn = this.props.data.nic_vs_attn.childImageSharp;

        return (
            <Layout data={this.props.data} location={this.props.location}>
            <BlogPostChrome {...this.props.data.javascriptFrontmatter}>
            <h1 style={{ textAlign: "center"}}>
              Caption This! Image Caption using Neural Networks
            </h1>
          <p
          className="header-subtitle"
          style={{ marginTop: 20, marginBottom: 10 }}
        >          
          30 Jun, 2019
        </p>            
                                      
            <p>
              The caption on the image above was generated by a neural network. 
              More precisely, using a combination of two neural networks known as encoder-decoder architecture.
            </p>
            <Img sizes={encoder_decoder.fluid} />
            <p style={{fontSize: 12, textAlign: "center"}}>Example of the caption we want the model to generate</p>
            <p>
              The natural question then is "how does the model generate captions ?". Let's zoom into the network architecture.
            </p>
            <p>
            The Encoder consists of a state of the art vision model (CNN) and the Decoder is responsible for learning the language model using RNN.
            We pass the image through the Encoder network, where a pre-trained Convolutional Neural Network extracts the image features and converts it into the encoded feature vector (Z).
            The Decoder network takes Z as input and uses it along along with the hidden state of RNN to generate the caption - one word at a time.
            </p>
            <p>
              Great, but how can it generate fully formed sentences instead of some arbitrarily placed alphabets?
            </p>
            <p>
            The Decoder is responsible for learning two tasks - learn the structure of the language (i.e the Language Model) 
            so that it can generate English sentences and also learn the contents of the image. The decoder must learn both 
            the tasks together, learning one is not enough.
            If it only learns the language model, it will output words or fully formed sentences with no relation to the image. 
            If it only learns to recognize the contents of the image, it will output the name of object but will not be able to describe how 
            those objects relate to each other.
            </p>
            <p>
              In the following section, I will describe how to build an end-to-end image captioning system using neural networks.              
            </p>

            <h2>Show and Tell: A Neural Image Caption Generator</h2>
            <p>
              This paper by Vinyals et. al was perhaps one of the first to achieve state of the art 
              results on Pascal, Flickr30K, and SBU using an end-to-end trainable neural network. As the authors highlight, the main 
              inspiration of this paper comes from the breakthrough work in Neural Machine Translation. 
              Machine translation, as the name suggests, is the task of translating text from one language 
              to another.
            </p>
            <p>&lt;Detour&gt;</p>
            <p>              
              Before 2014, the language translation was done largely using 
              statistical machine translation (SMT) models. 
              The core idea of statistical machine translation is to learn a probabilistic model from 
              the training data (e.g English -> Hindi Corpus):              
            </p>
            <Img sizes={mt_example.fluid} />
            <p style={{fontSize: 10, textAlign: "center"}}>Recreation of the final battle scene from Avengers Endgame by an eight-year old</p>
            <p>
              Given a sentence in English, for example the last words of Thanos in Endgame i.e (x): <strong>I am Inevitable</strong>
            </p>
            <p>
              We want the model to translate it to Hindi sentence, (y):
                <strong> मैं अपरिहार्य हूं</strong>
              &nbsp; <a href="https://translate.google.com/#view=home&op=translate&sl=en&tl=hi&text=i%20am%20inevitable" target="_blank" rel="noopener noreferrer">Google Translate</a>
            </p>
            <p>
              i.e on a very high level, we want <KatexEquation equation="f(x) = argmax_y P(y|x)" />   
            </p>
            <p>
              By the way, Is Avengers Endgame the best performing movie in the Marvel Cinematic Universe. I try to find out <a href="/mcuboxoffice">here</a>?
            </p>
            <p>
            SMT models required a very complex feature engineering pipeline along with several separately designed subcomponents. 
            Each of which required lots of human effort for each language pair.[<a href="http://web.stanford.edu/class/cs224n/slides/cs224n-2019-lecture08-nmt.pdf" target="_blank" rel="noopener noreferrer">reference</a>]
            </p>
            <p>
              In 2014, the researchers from Google introduced an end-to-end trainable approach 
              that used two recurrent neural networks to learn the mapping from one sequence 
              to another. They called it sequence to sequence learning and applied it on 
              machine translation task. The translations produced by their seq2seq model 
              performed as good as the SMT method on the same dataset. Since then, the architecture 
              has been applied to different domains such as speech recognition, text summarization 
              and of course machine translation. 
            </p>
            <p>&lt;/Detour&gt;</p>
            <p>
              In Neural Image Caption, the authors use similar architecture but replace 
              the RNN in encoder with CNN as Convnets tend to perform better on computer vision 
              tasks. The main contributions of the paper are:
            </p>
            <li>End-to-end trainable system</li>
            <li>Use of state of the art sub-networks for vision and language models</li>

            <h3>How does it work</h3>
            <p>The diagram shows the neural network architecture that I used to implement Show and Attend paper.</p>
          
            <Img sizes={show_and_tell.fluid} />
            <div style={{marginTop: 10}}>
            <p>
              <strong>OBJECTIVE:</strong> For a given input image, the model's objective is to maximise the probability of generating the correct sentence.
            </p>

            <p>
              <KatexEquation equation="\theta^* = argmax_{\theta} \sum_{(I,S)} logp(S|I;\theta)" />
            </p>
            <p>              
              Where S is the generated sentence, I is the input image and <KatexEquation equation="\theta" /> is the model parameters. 
              A sentence S consists to multiple words i.e. <KatexEquation equation="S = S_0, S_1....S_N"/> where N is total number 
              of words in a sentence. We can then model the join probability by applying chain rule:
            </p>
          <p>
            <KatexEquation equation="logp(S|I) = \sum_{t=0}^N log p (S_t|I, S_0,...,S_{t-1})" />
          </p>

          <p>
            We train the network using image-caption pair (S,I) with the objective of maximising the sum 
            of log probabilities as shown above. For each training example, we pass the input image through 
            the encoder network to extract the feature vector, we then use the decoder network to generate 
            the sentence, one word at each timestep. We concatenate the word embeddings and the image feature vector 
            before passing them as the input to RNN, this is done to map them to the same space. 
            And although the architecture above displays the word as output, in reality, the decoder generates 
            the probabilities over all words in the vocab. We then pick the word using a decoding algorithm (more on this later).
          </p>
          <p>
            At the initial timestep <KatexEquation equation="t_0" />, we pass a special start of sentence word token (i.e <KatexEquation equation="S_0" />=&lt;START&gt;) and 
              at subsequent timesteps, we pass the image feature vector along with the word generated at the previous 
              timestep. This means we call the decoder several times until we have the full sentence. Since the length 
              of the sentence is unbounded, we need a mechanism to know when to stop. This is done using a special end 
              of sentence marker i.e <KatexEquation equation="S_N" />=&lt;STOP&gt;.
          </p>
      <p>
      <strong>LOSS:</strong> We use the negative log-likelihood of the correct word as the loss function and minimize 
       it with respect to the top layer of the encoder and all layers of the decoder network. We only use 
       the top layer in the encoder network and not the CNN sub-network because we do not want the 
       pre-trained CNN to forget its learned weights.
      </p>
      
      <p>
        <KatexEquation equation="L(I,S) = - \sum_{t=1}^N log p_t(S_t)" />
      </p>
            </div>

      <div>
      <p>
        The following plot shows the training loss and hyper-parameters used.
      </p>
        <Row>
          <Col xs={{span: 24}} md={{span: 16}}>
            <div style={{textAlign: "center", margin: "0 auto"}}>
              {/* <VegaChart id="random-policy"  chartName="random-policy" specUrl="https://raw.githubusercontent.com/katnoria/ml_examples_for_web/master/data/nic/train_50_plot.json"> */}
              <VegaChart id="nic-loss"  chartName="nic-loss" specUrl="https://gist.githubusercontent.com/katnoria/a4b02aab3dc6b754b6d8f68420644847/raw/ac31ed6562ac23348769f0e4766740b0f8269279/nic_train_200_plot.json">
              </VegaChart>
            </div>          
          </Col>
          <Col xs={{span: 24}} md={{span: 8}}>
      <table className="hyper-params-table">
        <tbody>
          <tr>
            <td>Batch size</td>
            <td>128</td>
          </tr>
          <tr>
            <td>Optimizer</td>
            <td>ADAM</td>
          </tr>
          <tr>
            <td>Learning rate</td>
            <td>1e-4</td>
          </tr>
          <tr>
            <td>
              Training images              
            </td>
            <td>
              177K &nbsp;
              <Button shape="circle" icon="file-image" onClick={this.showCocoSamples}></Button>              
              <Modal              
                title="Dataset Sample"
                centered
                visible={this.state.cocoVisible}
                onOk={this.handleOkCoco}
                onCancel={this.handleCancelCoco}
                style = {{width: 500}}
              >
              <CocoDatasetSamplePage />
            </Modal>
            </td>
          </tr>
          <tr>
            <td>Vocab size</td>
            <td>36,780 words</td>
          </tr>
          <tr>
            <td>Embedding size</td>
            <td>256</td>
          </tr>
          <tr>
            <td>Training time</td>
            <td>~19h on gtx1080</td>
          </tr>
        </tbody>
      </table>            
          </Col>
        </Row>
      </div>
      <Divider />
      <div>
        <h3>Test drive</h3>
        <p>Let's see how well does the model perform in wild</p>
        <GeneratedCaptionsCarousel />
        <p style={{fontSize: 12, textAlign: "center"}}>Cherry Picked Captions, leave the cursor on the image to stop auto scroll</p>
        <div style={{margin: 10}}>
          <p>
          After the model is trained, there are different ways to generate the caption.
          Recall that at every step, the model generates the probabilities over the entire vocabulary 
          so the easiest and the most obvious approach would be to pick the word with the highest 
          probability. This is called greedy decoding where we greedily select the next word.
          And while it may be able to generate captions, there are better options available.
          Let's briefly look at each of them.
          </p>
          <table>
            <tbody>
              <tr>
                <td className="greedy-text">Greedy Decoding</td>
                <td>
                  At every step, pick the word with the highest probability to generate the sequence.
                  It is a simple method that generates low-quality output compared to other methods in the list.
                </td>
              </tr>
              <tr>
                <td className="beam-text">Beam Search Decoding</td>
                <td>
                  Track multiple sequences at once. Instead of using just one word, store top k words at every step t. At the next step t+1, generate top k words 
                  for each for previous top k words. This end result is a tree of words (i.e multiple hypotheses), 
                  pick the one with the highest probability. k=1 is greedy search and suffers from the same 
                  issue of producing low-quality output, 
                  increasing k is compute intensive but generally produces higher quality output. 
                  Although at larger k, the output gets very short.
                  </td>
              </tr>
              <tr>
                <td className="pure-text">Pure Sampling</td>
                <td>
                  Similar to greedy decoding, but instead of picking the word with the highest probability, 
                  randomly sample the word from the probability distribution. 
                  Sampling methods such as pure sampling and top-K provide better diversity and are 
                  generally better at natural language generation.
                </td>
              </tr>
              <tr>
                <td className="topk-text">Top-K Sampling</td>
                <td>
                  Similar to Pure sampling, but instead of sampling just a single word, sample top-k probable words.
                  k=1 is greedy search and k=length of vocabulary is pure sampling.
                </td>
              </tr>
            </tbody>
          </table>
          <Img sizes={wordtree.fluid} />
            <p style={{fontSize: 12, textAlign: "center"}}>Visualization of Beam Search Decoder from <a href="/nlg-decoders/">here</a></p>
          <p>For more details on decoding algorithms, see : <a href="/nlg-decoders/">Visualising Decoding Algorithms</a></p>
          <div>
            <p>
            The carousel above presented the cherry-picked image and their captions. 
            Below we compare how each decoding algorithm performs against the baseline, 
            which in this case is the captions generated by an 8-year old 
            who agreed after some negotiations 🎁.
            </p>

            <CustomImageGridWithCaption />
            <p style={{fontSize: 12, textAlign: "center"}}>Hover over the image to view the captions</p>
          </div>
        </div>
      </div>
      <Divider />
      <div style={{marginTop: 10}}>
        <h4>Conclusion</h4>
        <p>
          We were able to build and train a seq2seq network that was largely based on the architecture used in 
          the show and tell paper.          
          We tested the model on images in the wild, in this case, a sample from &nbsp;
          <a href="https://500px.com/katnoria" target="_blank" rel="noopener noreferrer">my collection of photos</a>
          - which is different from the dataset our model was trained on.
          We saw that pure sampling and top-k sampling decoders produced better captions 
          but the overall result was not as good as we would like it to be.
          We can try a few things - train the model long enough, 
          use full training set from the COCO dataset, network architecture changes and make use of attention.
        </p>
        <p>
          Note that we did not perform any model performance comparison between the datasets using BLEU scores, this 
          is intentional because my objective was to learn how to build an image caption system and try on a variety 
          of images. I do plan to use BLEU scores to compare the model described in this post with the 
          attention-based seq2seq model.
        </p>
      </div>
      <div style={{marginTop: 20}}>
        <div>      
          <p>
            Before ending the post, here is the caption generated by the model used in this post and the new attention-based model 
            that I just finished working on.
            Both the captions were generated using Greedy decoder.
          </p>    
          <div>
            <Row>
            <Col span={2} />
              <Col span={18}>
                <div>
                  <Img sizes={nic_vs_attn.fluid} />
                  <p style={{lineHeight: "24px", marginTop: "10px"}}>Vanilla Model: Sunset trees</p>                  
                  <p style={{lineHeight: "24px"}}>Attention Model 😎: View of mountains and a fence overlooking a mountain range</p>
                </div>
              </Col>
              <Col span={2} />
            </Row>
          </div>
          <div style={{marginTop: "10px", marginBottom: "10px"}}>
            <p>Code and model checkpoint is available on my GitHub <a href="https://github.com/katnoria/image-caption"  target="_blank" rel="noopener noreferrer">Repo</a>.</p>
            <p>Check out the <a href="/attn-nic">Part 2: Attention</a></p>
          </div>
        </div>
      </div>

      <Divider />
        <Row>
          <Col span={4} className="references-header">References & Links:</Col>
          <Col span={20} className="references-text">
              <ol>
                <li>
                Visualising Decoding Algorithms [<a href="/nlg-decoders/">link</a>]
                </li>
                <li>
                Show and Tell: A Neural Image Caption Generator  
                  &nbsp;[<a href="https://arxiv.org/abs/1411.4555" target="_blank" rel="noopener noreferrer">arvix</a>]                                
              </li>
              <li>
              Sequence to Sequence Learning with Neural Networks  
                  &nbsp;[<a href="https://arxiv.org/abs/1409.3215" target="_blank" rel="noopener noreferrer">arxiv</a>]                                
              </li>
              <li>
                  CS224N Machine Translation Lecture
                  &nbsp;[<a href="http://web.stanford.edu/class/cs224n/slides/cs224n-2019-lecture08-nmt.pdf" target="_blank" rel="noopener noreferrer">link</a>]
              </li>
              <li>
                Tensorflow NMT Notebook
                [<a href="https://github.com/tensorflow/tensorflow/blob/r1.13/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb" rel="noopener noreferrer">link</a>]
              </li>
              </ol>
          </Col>
        </Row> 
      </BlogPostChrome>
      </Layout>    
        )

    }
}    

export default NICPage


// Page query; its result is what the page component receives as `props.data`.
//
// - `$slug` selects this page's `javascriptFrontmatter` node (fields come from
//   the `JSBlogPost_data` fragment, which is defined outside this file).
// - Each aliased `file(...)` entry matches an image by a regex on its
//   relative path and requests `childImageSharp.fluid` data for gatsby-image.
//   The component's render() reads `encoder_decoder`, `mt_example`,
//   `show_and_tell`, `wordtree` and `nic_vs_attn`.
// - NOTE(review): `train_50` and the `markdownRemark` lookup (hard-coded slug
//   "/2019-05-29-nic/") are not read by render() in this file. The whole
//   `data` object is forwarded to `Layout`, so confirm nothing there depends
//   on them before removing either selection.
export const pageQuery = graphql`
query ($slug: String!) {
  markdownRemark(
    fields: { slug: { eq: "/2019-05-29-nic/" } }
  ) {
    html
  }
  javascriptFrontmatter(fields: { slug: { eq: $slug } }) {
    ...JSBlogPost_data
  }
  encoder_decoder: file(
    relativePath: {regex: "/enc-dec2.png/"}) {
      childImageSharp {
        fluid(maxWidth: 3000, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }              
}    
  mt_example: file(
    relativePath: {regex: "/IMG_7309.jpeg/"}) {
    childImageSharp {
      fluid(maxWidth: 3000, quality: 100) {
        ...GatsbyImageSharpFluid
        presentationWidth
      }
    }
  }    
  show_and_tell: file(
    relativePath: {regex: "/show-and-tell.png/"}) {
    childImageSharp {
      fluid(maxWidth: 3000, quality: 100) {
        ...GatsbyImageSharpFluid
        presentationWidth
      }
    }
  }    
  train_50: file(
    relativePath: {regex: "/train-50-epochs.png/"}) {
    childImageSharp {
      fluid(maxWidth: 3000, quality: 100) {
        ...GatsbyImageSharpFluid
        presentationWidth
      }
    }
  }      
  wordtree: file(
    relativePath: {regex: "/wordtree-k3.png/"}) {
    childImageSharp {
      fluid(maxWidth: 3000, quality: 100) {
        ...GatsbyImageSharpFluid
        presentationWidth
      }
    }
  }      
  nic_vs_attn: file(
    relativePath: {regex: "/nic_vs_attn.jpg/"}) {
    childImageSharp {
      fluid(maxWidth: 3000, quality: 100) {
        ...GatsbyImageSharpFluid
        presentationWidth
      }
    }
  }      
}
`
