import React from "react";
import {
  Button,
  Row,
  Col,
  Divider,
  Modal,
  Radio,
  Popover,
  Card,
  Tabs,
  Table,
} from "antd";
import { Collapse, Icon } from "antd";
import {
  GithubOutlined,
  ReadOutlined,
  LineChartOutlined,
} from "@ant-design/icons";
import { graphql } from "gatsby";
import SyntaxHighlighter, {
  Prism as PrismSyntaxHighlighter,
} from "react-syntax-highlighter";
import { dark, atomDark } from "react-syntax-highlighter/dist/esm/styles/prism";
import VegaChart from "../../components/VegaChart";

import katex from "katex";
import Img from "gatsby-image";
import Layout from "../../components/Layouts";
import BlogPostChrome from "../../components/BlogPostChrome";

const { Meta } = Card;
const { Panel } = Collapse;
const { TabPane } = Tabs;

/**
 * Post metadata for the crowd-density article.
 *
 * NOTE(review): the page query in this file reads `javascriptFrontmatter`, so
 * this export is presumably extracted statically by a Gatsby frontmatter
 * transformer — keep every value a plain literal (no computed expressions)
 * until that is confirmed.
 */
export const frontmatter = {
  title: `A Simple Guide to Crowd Density Estimation`,
  // Dates are ISO-style strings (YYYY-MM-DD).
  written: "2020-04-30",
  updated: "2020-05-24",
  layoutType: "post",
  contentType: "blog",
  path: "/crowd-density/",
  category: "Deep Learning",
  // Image paths are relative to this file's directory.
  image: "./poster.png",
  cover: "./cover.png",
  // The article text says hovering the cover image shows the density map.
  coverhover: "./coverhover.png",
  coverText:
    "View from the Amer Fort overlooking the Aravalli hills and the second largest wall in the world. Amer Fort, Jaipur (UNESCO World Heritage Site)",
  description:
    "Counting item of interest in an image or a video is an interesting problem. Too few and you can use YOLO or SSD based object detection models, too many and you can use...",
};

/**
 * Column definitions for the dataset summary table (passed as `columns` to
 * the antd `Table` that renders `shhaInfo`).
 *
 * Each entry is written as a `[header title, row field]` pair; the row field
 * doubles as the column `key`.
 */
const shhaColumns = [
  ["Set", "set"],
  ["Total Images", "images"],
  ["Min People", "minPeople"],
  ["Max People", "maxPeople"],
  ["Min Image Size", "minImageSize"],
  ["Max Image Size", "maxImageSize"],
].map(([title, dataIndex]) => ({ title, dataIndex, key: dataIndex }));

/**
 * Rows of the dataset summary table, one per split.
 *
 * Tuple layout: [set, images, minPeople, maxPeople, minImageSize, maxImageSize].
 * Row keys are the 1-based position rendered as a string.
 *
 * NOTE(review): the Test row lists the same image count as the Train row —
 * confirm against the dataset before relying on it.
 */
const shhaInfo = [
  ["Train", 300, 33, 3136, "(400,400)", "(768,1024)"],
  ["Test", 300, 12, 577, "(299,299)", "(768,768)"],
].map(
  (
    [set, images, minPeople, maxPeople, minImageSize, maxImageSize],
    position
  ) => ({
    key: String(position + 1),
    set,
    images,
    minPeople,
    maxPeople,
    minImageSize,
    maxImageSize,
  })
);

/**
 * Column definitions for the model metrics table (passed as `columns` to the
 * antd `Table` that renders `metricsData`).
 *
 * Written as `[header title, row field]` pairs; the row field is reused as
 * the column `key`.
 */
const metricsCols = [
  ["Model", "model"],
  ["TRAIN: MSE", "trainmse"],
  ["TEST: MSE", "testmse"],
  ["TRAIN: MAE", "trainmae"],
  ["TEST: MAE", "testmae"],
].map(([title, dataIndex]) => ({ title, dataIndex, key: dataIndex }));

/**
 * Rows of the model metrics table, one per model.
 *
 * Scores are kept as strings, grouped here by split and flattened into the
 * `trainmse` / `trainmae` / `testmse` / `testmae` fields the table columns
 * read. Row keys are the 1-based position rendered as a string.
 */
const metricsData = [
  {
    model: "VGG16 Baseline",
    train: { mse: "144", mae: "6" },
    test: { mse: "706", mae: "13" },
  },
  {
    model: "VGG16 Decoder",
    train: { mse: "129", mae: "8" },
    test: { mse: "650", mae: "14" },
  },
].map(({ model, train, test }, position) => ({
  key: String(position + 1),
  model,
  trainmse: train.mse,
  trainmae: train.mae,
  testmse: test.mse,
  testmae: test.mae,
}));

class CrowdDensityPage extends React.Component {
  // componentDidMount() {
  //   // Prism.highlightAll();
  //   setTimeout(() => Prism.highlightAll(), 0)
  // }

  render() {
    const high_level_cc = this.props.data.high_level_cc.childImageSharp;
    const density_example = this.props.data.density_example.childImageSharp;
    const shha_sample = this.props.data.shha_sample.childImageSharp;
    const vgg_baseline = this.props.data.vgg_baseline.childImageSharp;
    const vgg_decoder = this.props.data.vgg_decoder.childImageSharp;
    const vgg_baseline_trainplot = this.props.data.vgg_baseline_trainplot
      .childImageSharp;
    const shha_train_eval = this.props.data.shha_train_eval.childImageSharp;
    const shha_train_outl = this.props.data.shha_train_outl.childImageSharp;
    const shha_train_handpicked = this.props.data.shha_train_handpicked
      .childImageSharp;
    const shha_test_eval = this.props.data.shha_test_eval.childImageSharp;
    const shha_test_outl = this.props.data.shha_test_outl.childImageSharp;
    const vgg_decoder_trainplot = this.props.data.vgg_decoder_trainplot
      .childImageSharp;
    const vgg_decoder_testplot = this.props.data.vgg_decoder_testplot
      .childImageSharp;
    const vgg_baseline_448trainplot = this.props.data.vgg_baseline_448trainplot
      .childImageSharp;
    const vgg_baseline_448testplot = this.props.data.vgg_baseline_448testplot
      .childImageSharp;
    const vgg_decoder_448trainplot = this.props.data.vgg_decoder_448trainplot
      .childImageSharp;
    const vgg_decoder_448testplot = this.props.data.vgg_decoder_448testplot
      .childImageSharp;
    const obj_detection_sample = this.props.data.obj_detection_sample
      .childImageSharp;
    const large_crowd = this.props.data.large_crowd.childImageSharp;
    const vggbaseline_pred = this.props.data.vggbaseline_pred.childImageSharp;
    const vggdecoder_pred = this.props.data.vggdecoder_pred.childImageSharp;

    return (
      <Layout data={this.props.data} location={this.props.location}>
        <BlogPostChrome {...this.props.data.javascriptFrontmatter}>
        <h1 style={{ textAlign: "center"}}>
          A Simple Guide to Crowd Density Estimation
        </h1>
        <p
          className="header-subtitle"
          style={{ marginTop: 20, marginBottom: 10 }}
        >          
          30 Apr, 2020
        </p>
                  
          
          <div style={{ fontSize: 12, textAlign: "center" }}>
            <Button
              type="link"
              href="https://github.com/katnoria/crowd-density"
              icon={<GithubOutlined />}
              size="large"
            >
              <GithubOutlined />
              GitHub
            </Button>
            <Divider type="vertical" />
            <Button
              type="link"
              href="https://github.com/katnoria/crowd-density/blob/master/notebooks/evaluate-models.ipynb"
              icon={<ReadOutlined />}
              size="large"
            >
              <ReadOutlined />
              Notebook
            </Button>
            <Divider type="vertical" />
            <Button
              type="link"
              href="/crowd-density-eval"
              icon={<LineChartOutlined />}
              size="large"
            >
              <LineChartOutlined />
              Model Evaluation
            </Button>
          </div>
          {/* <h2>Introduction</h2> */}
          <p>
            In this post, we are going to build an object counting model based
            on simple network architecture. Although we use the crowd dataset
            here, a similar solution can be applied to the rather more useful
            applications such as counting cells, crops, fruits, trees, cattle,
            or even endangered species in the wild.
          </p>
          <p>
            There are different ways to count the object(s) in a given image.
            One could make use of R-CNN based models for object detection as
            shown in the example below:
          </p>

          <Tabs type="card">
            <TabPane tab="Image" key="11">
              <Row>
                <Col span={24}>
                  <Img sizes={obj_detection_sample.fluid} />
                  <p style={{ fontSize: 12, textAlign: "center" }}>
                    Object detection and semantic segmentation
                  </p>
                </Col>
              </Row>
            </TabPane>
            <TabPane tab="Video" key="21">
              <div
                style={{
                  // textAlign: "center",
                  // margin: "0 auto",
                  position: "relative",
                  paddingBottom: "56.25%",
                  height: 0,
                  overflow: "hidden",
                  maxWidth: "100%",
                }}
              >
                <iframe
                  style={{
                    position: "absolute",
                    top: 0,
                    left: 0,
                    width: "100%",
                    height: "100%",
                  }}
                  width="854"
                  height="480"
                  src="https://www.youtube.com/embed/K8WuRuddanA"
                  frameborder="0"
                  allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture"
                  allowfullscreen
                ></iframe>
              </div>
            </TabPane>
          </Tabs>
          <p style={{ marginTop: "10px" }}>
            <Row>
              <Col span={18}>
                <p>
                  and that would work just fine, but what do you do when you
                  have a lot more people such as Figure 1? Will the same
                  assumptions hold true? Do we have access to the labelled
                  datasets that are in the format used by R-CNN and its
                  variants?
                </p>
                <p>
                  In this post, we are going to build models that attempt solve
                  this using the pre-trained ConvNet as backbone and a
                  regression head for counting the crowd.
                </p>
              </Col>
              <Col span={6}>
                <Img sizes={large_crowd.fluid} />
                <p style={{ fontSize: 12, textAlign: "center" }}>
                  Figure 1, source: ShanghaiTech Dataset
                </p>
              </Col>
            </Row>
          </p>
          <p style={{ marginTop: "10px" }}>
            The network architecture is simple enough that I think this could be
            called the "Hello World" of the crowd density estimation task
            (pardon my ignorance if you know of simpler ways).
          </p>
          <Img sizes={high_level_cc.fluid} />
          <p style={{ fontSize: 12, textAlign: "center" }}>High Level Flow</p>
          <p>
            If we overlay the output, i.e., the density map over the image, we
            can see that the head of each person is highlighted. These
            highlighted points are what we want our model to learn to estimate.
            And, to get the total count, we sum the points together.
          </p>
          <Img sizes={density_example.fluid} />
          <p style={{ fontSize: 12, textAlign: "center" }}>
            Sample image and density map from ShanghaiTech Dataset
          </p>
          <h3>Dataset</h3>
          <p>
            We use the crowd counting dataset introduced in this{" "}
            <a
              href="Single-Image Crowd Counting via Multi-Column Convolutional Neural Network"
              target="_blank"
              rel="noopener noreferrer"
            >
              paper
            </a>
            . The dataset is known as "ShanghaiTech Crowd Counting Dataset", and
            it has images with arbitrary crowd density along with the target
            labels. We train our model on Part A of the dataset. However,
            instead of using density maps provided by the dataset, we use the
            processed maps generated by the{" "}
            <a
              href="https://github.com/gjy3035/C-3-Framework"
              target="_blank"
              rel="noopener noreferrer"
            >
              C3 Framework
            </a>{" "}
            for convenience purposes. The C3 Framework is an excellent resource
            that covers multiple network architectures and their performance on
            different datasets. I encourage you to have a look at the paper and
            their repo.
          </p>
          <Table
            dataSource={shhaInfo}
            columns={shhaColumns}
            size="small"
            pagination={false}
          />
          <div style={{ textAlign: "center", margin: "0 auto" }}>
            <VegaChart
              id="shha-train"
              chartName="shha-train"
              specUrl="https://gist.githubusercontent.com/katnoria/7b55002fcfd9c790ea235df96d06e4d7/raw/55a7f3f967da1f22efc7b789fa29c80cd8956e62/shha-train"
            ></VegaChart>
          </div>
          <p style={{ fontSize: 12, textAlign: "center" }}>
            Histogram of train set
          </p>
          <p>
            As you can see from the histogram, a large number of images have a
            crowd of less than 600.
          </p>
          <blockquote>
            Question 🤔: Does that mean the model trained on this dataset will
            not perform as well on the images with a large crowd?
          </blockquote>
          <blockquote>
            Answer: You can use{" "}
            <a href="https://github.com/katnoria/crowd-density/blob/master/notebooks/evaluate-models.ipynb">
              this notebook
            </a>{" "}
            to ask this and other questions.
          </blockquote>
          <p>
            Below we show a few sample images from the dataset. We also show the
            associated density map below each image. Annotating the dataset
            must've been a difficult task.
          </p>
          <Img sizes={shha_sample.fluid} />
          <p style={{ fontSize: 12, textAlign: "center" }}>Dataset sample</p>
          <h3>Pre-processing</h3>
          <p>
            Throughout the implementation, we follow the guideline and
            techniques used by C3 Framework. The C3 framework uses the following
            augmentation/transformation:
          </p>
          <p>
            CenterCrop (to 224) → RandomFlip → ScaleDown → LabelNormalize (100)
            → ToTensor → Normalize
          </p>
          <SyntaxHighlighter language="python" showLineNumbers style={atomDark}>
            {`
          # transforms
          transforms.Compose([
            CenterCrop(output_size=output_size),
            RandomFlip(),
            ScaleDown(factor), # Not used (currently set as 1)
            LabelNormalize(),
            ToTensor(),
            Normalize()
          ])

          `}
          </SyntaxHighlighter>
          <p style={{ fontSize: 12, textAlign: "center" }}>
            <a href="https://github.com/katnoria/crowd-density/blob/master/src/data.py#L115">
              Image transformations
            </a>{" "}
            in Pytorch (syntax highlighter doesn't work yet😅){" "}
          </p>
          <h2>Models</h2>
          <p>
            We will use VGG16 as the backbone for our models in this post. Once
            we have the full training and evaluation infra ready, we can easily
            add more powerful models and compare its performance against the
            baseline models.
          </p>
          <h3>Baseline Model</h3>
          <p>
            As our baseline, we will use a pre-trained VGG16 network followed by
            2 Conv layers and the upsampling layer to match the target density
            map (m x n). Recollect that we scale down the input image and target
            by a scaling factor, so the final layer needs to keep that in
            consideration.
          </p>
          <Img sizes={vgg_baseline.fluid} />
          <p style={{ fontSize: 12, textAlign: "center" }}>
            VGG16 Baseline Model
          </p>
          <SyntaxHighlighter language="python" showLineNumbers style={atomDark}>
            {`
  class VGG16Baseline(nn.Module):
  """Baseline model

  Baseline model that uses a pre-trained VGG16 network as backbone        
  """
  def __init__(self, channels=[512, 128, 1], scale_factor=4):
      """
      Parameters
      ----------
      channels: Input channel size for all three layers
      scale_factor: Factor to upsample the feature map
      """
      super(VGG16Baseline, self).__init__()
      self.scale_factor = scale_factor
      conv_layers = list(models.vgg16(pretrained=True).features.children())
      # Mark the backbone as not trainable
      for layer in conv_layers:
          layer.requires_grad = False

      self.model = nn.Sequential(
          *conv_layers,
          nn.Conv2d(channels[0], channels[1], kernel_size=3, padding=1),
          nn.ReLU(inplace=True),
          nn.Conv2d(channels[1], channels[2], kernel_size=3, padding=1),
          nn.ReLU(inplace=True)
      )

  def forward(self, inputs):
      """ Forward pass"""
      output = self.model(inputs)
      output = F.upsample(output, scale_factor=self.scale_factor)
      return output          
          `}
          </SyntaxHighlighter>
          <p style={{ fontSize: 12, textAlign: "center" }}>
            Pytorch Code:{" "}
            <a href="https://github.com/katnoria/crowd-density/blob/master/src/models.py#L9">
              models.py
            </a>
          </p>
          <p>
            We train the baseline model using the ADAM optimizer with the
            learning rate of 1e-5 for 400 epochs. The choice of loss function to
            optimizer is Mean Square Error(MSE). We also track Mean Absolute
            Error (MAE) on tensoboard along with the MSE.
          </p>
          <div style={{ textAlign: "center", margin: "0 auto" }}>
            <VegaChart
              id="vggbaseline-test-train"
              chartName="vggbaseline-test-train"
              specUrl="https://gist.githubusercontent.com/katnoria/f73672c45abb5c011f19838e9eea4ce4/raw/b9698585a39f82f274e3c918dd3f1147be99dcd3/vgg16-baseline-test-train"
            ></VegaChart>
          </div>
          <p style={{ fontSize: 12, textAlign: "center" }}>
            Displaying only 200 epochs because the loss remained constant after
            that
            <Popover
              content={
                <p>
                  Improving the test and train loss is left as an exercise to
                  the reader
                </p>
              }
              title=""
            >
              <Button type="link">💡</Button>
            </Popover>
          </p>
          <h4>Evaluation</h4>
          <p>
            We will first check how well the model is able to overfit the
            training data. We visualize its performance by comparing the actual
            vs. predicted crowd count. Each tab shows the predictions on the
            images of a given input size. The better the model, the more points
            lie closer to the diagonal line. As you can see in the plots below,
            the model does comparatively better for images with crowd ≤ 1000.
            However, its performance begins to suffer as the number of crowd
            increases.
          </p>
          <p>
            Question for you: Recheck the histogram of training data and see if
            you can now guess the reason why?
          </p>
          <Tabs type="card">
            <TabPane tab="224 x 224" key="1">
              <Row>
                <Col span={12}>
                  <Img sizes={shha_train_eval.fluid} />
                  <p style={{ fontSize: 12, textAlign: "center" }}>
                    Train set performance
                  </p>
                </Col>
                <Col span={12}>
                  <Img sizes={shha_test_eval.fluid} />
                  <p style={{ fontSize: 12, textAlign: "center" }}>
                    Test set performance
                  </p>
                </Col>
              </Row>
              {/* <div style={{width: 400, height: 400}}>
              <Img sizes={shha_train_eval.fluid} />
              <p style={{fontSize: 12, textAlign: "center"}}>VGG16 Baseline Model</p>
            </div> */}
            </TabPane>
            <TabPane tab="448 x 448" key="2">
              <Row>
                <Col span={12}>
                  <Img sizes={vgg_baseline_448trainplot.fluid} />
                  <p style={{ fontSize: 12, textAlign: "center" }}>
                    Train set performance
                  </p>
                </Col>
                <Col span={12}>
                  <Img sizes={vgg_baseline_448testplot.fluid} />
                  <p style={{ fontSize: 12, textAlign: "center" }}>
                    Test set performance
                  </p>
                </Col>
              </Row>
            </TabPane>
            <TabPane tab="640 x 640" key="3">
              <h3>Right, You get the idea 😀</h3>
            </TabPane>
          </Tabs>
          <p>
            I find it quite useful to review the images the model gets right and
            the ones where it struggles. Here, we display the images from both
            the train and test set.
          </p>
          <Img sizes={shha_train_handpicked.fluid} />
          <p style={{ fontSize: 12, textAlign: "center", color: "green" }}>
            Train Set: Handpicked images from train set where the model does a
            good job
          </p>
          <Img sizes={shha_train_outl.fluid} />
          <p style={{ fontSize: 12, textAlign: "center", color: "red" }}>
            Train Set: Images where the model does a poor job (marked in the red
            circle in the plot above)
          </p>
          <p>
            Two things stand out to me: 1) the images with better prediction
            only contain people 2) the orientation of the heads in the image.
            Another crucial insight is that as you increase the input image
            size, the model starts to perform better. Review the plots in both
            224x224 and 448x448 tabs to confirm this.
          </p>
          <p>
            The sample from the test set also seems to confirm that the images
            with many different objects such as trees along with people make
            things difficult for the model. The last image is difficult for the
            human eye as well.
          </p>
          <Img sizes={shha_test_outl.fluid} />
          <p style={{ fontSize: 12, textAlign: "center", color: "red" }}>
            Test Set: Images where the model does a poor job (marked in the red
            circle in test set performance plot)
          </p>
          <Divider />
          <h3>VGG16 with Decoder</h3>
          <p>
            We now move to another simple yet more powerful model that, too,
            uses pre-trained VGG16 as its backbone. We make use of CONV and
            CONVTRANSPOSE layers, the C3 paper refers to these layers as the
            decoder.
          </p>
          <Img sizes={vgg_decoder.fluid} />
          <p style={{ fontSize: 12, textAlign: "center" }}>
            VGG16 + Decoder Model
          </p>
          {/* <pre> */}
          <SyntaxHighlighter language="python" showLineNumbers style={atomDark}>
            {`
  class VGG16WithDecoderV2(nn.Module):
  """ VGG16 Decoder"""

  def __init__(self):
      super(VGG16WithDecoderV2, self).__init__()
      conv_layers = list(models.vgg16(pretrained=True).features.children())[:23]
      # Pre-trained layers are not trainable
      for layer in conv_layers:
          layer.requires_grad = False

      self.model = nn.Sequential(
          *conv_layers,
          nn.Conv2d(512, 128, kernel_size=3, padding=1),
          nn.ReLU(inplace=True),
          nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1, bias=True, output_padding=0),
          nn.ReLU(inplace=True),
          nn.ConvTranspose2d(64, 32, 4, stride=2, padding=1, bias=True, output_padding=0),
          nn.ReLU(inplace=True),
          nn.ConvTranspose2d(32, 16, 4, stride=2, padding=1, bias=True, output_padding=0),
          nn.ReLU(inplace=True),
          nn.Conv2d(16, 1, 1, padding=0),
          nn.ReLU(inplace=True)
      )

  def forward(self, inputs):
      """forward pass
      
      Parameters
      ----------
      inputs: Batch of input images
      """
      output = self.model(inputs)
      return output              
              `}
          </SyntaxHighlighter>
          <p style={{ fontSize: 12, textAlign: "center" }}>
            Pytorch Code:{" "}
            <a href="https://github.com/katnoria/crowd-density/blob/master/src/models.py#L49">
              models.py
            </a>
          </p>
          <p>
            {" "}
            We train this model with the same hyper parameters as the baseline
            model for 400 epochs.
          </p>
          <Tabs onChange={this.trainPlotCallback} type="card">
            <TabPane tab="224 x 224" key="1">
              <Row>
                <Col span={12}>
                  <Img sizes={vgg_decoder_trainplot.fluid} />
                  <p style={{ fontSize: 12, textAlign: "center" }}>
                    Train set performance
                  </p>
                </Col>
                <Col span={12}>
                  <Img sizes={vgg_decoder_testplot.fluid} />
                  <p style={{ fontSize: 12, textAlign: "center" }}>
                    Test set performance
                  </p>
                </Col>
              </Row>
              {/* <div style={{width: 400, height: 400}}>
              <Img sizes={shha_train_eval.fluid} />
              <p style={{fontSize: 12, textAlign: "center"}}>VGG16 Baseline Model</p>
            </div> */}
            </TabPane>
            <TabPane tab="448 x 448" key="2">
              <Row>
                <Col span={12}>
                  <Img sizes={vgg_decoder_448trainplot.fluid} />
                  <p style={{ fontSize: 12, textAlign: "center" }}>
                    Train set performance
                  </p>
                </Col>
                <Col span={12}>
                  <Img sizes={vgg_decoder_448testplot.fluid} />
                  <p style={{ fontSize: 12, textAlign: "center" }}>
                    Test set performance
                  </p>
                </Col>
              </Row>
            </TabPane>
          </Tabs>
          <p style={{ marginTop: 10 }}>
            According to the C3 Framework, both the models will have comparable
            performance but VGG16 with Decoder will generate more precise
            density maps. We can see this in the table and examples below. Our
            numbers are no way near the ones reported by C3 Framework, which I
            mainly think is because they use a higher input size to train their
            models.
          </p>
          <Table
            columns={metricsCols}
            dataSource={metricsData}
            size="small"
            pagination={false}
          />
          <p style={{ marginTop: 10 }}>
            Next, we show a more detailed plot to visualise the effect of input
            image size on both the models.
          </p>
          <Tabs onChange={this.trainPlotCallback} type="card">
            <TabPane tab="MSE" key="111">
              <div style={{ textAlign: "center", margin: "0 auto" }}>
                <VegaChart
                  id="mse-shha"
                  chartName="mse-train"
                  specUrl="https://gist.githubusercontent.com/katnoria/fd996cc488b62220e8423eb1e02adc35/raw/e3af708a6ee7fedd011454f42f8e5b6553140473/mse_shha_plot"
                ></VegaChart>
              </div>
              <p style={{ fontSize: 12, textAlign: "center" }}>
                MSE on different input size
              </p>
            </TabPane>
            <TabPane tab="MAE" key="222">
              <div style={{ textAlign: "center", margin: "0 auto" }}>
                <VegaChart
                  id="mae-shha"
                  chartName="mae-train"
                  specUrl="https://gist.githubusercontent.com/katnoria/a85a9fed23dc7791172c6a3ec8cc3047/raw/8e404a8ee7676d1f9a2e302f026dde37a9f86b78/mae_shha_plot"
                ></VegaChart>
              </div>
              <p style={{ fontSize: 12, textAlign: "center" }}>
                MAE on different input size
              </p>
            </TabPane>
          </Tabs>
          <p style={{ marginTop: 10 }}>
            And finally we overlay the density maps generated by both the models
            on a given image. We see that VGG16 Baseline is spot on ✅in terms
            of the actual count but VGG16 + Decoder generates tighter density
            maps.
          </p>
          <Row style={{ marginTop: 10 }}>
            <Col span={12}>
              <Img sizes={vggbaseline_pred.fluid} />
              <p style={{ fontSize: 12, textAlign: "center" }}>
                VGG16 Baseline
              </p>
            </Col>
            <Col span={12}>
              <Img sizes={vggdecoder_pred.fluid} />
              <p style={{ fontSize: 12, textAlign: "center" }}>
                VGG16 + Decoder
              </p>
            </Col>
          </Row>

          <p style={{ marginTop: 10 }}>
            You can try increasing the input size and play around with the
            models yourself. If you can get good enough model, you can perhaps
            help answer the question on whose rally had more people🤣. The code
            is available on my{" "}
            <a href="https://github.com/katnoria/crowd-density">GitHub Repo</a>.
            In case you haven't noticed, if you hover over the cover image you
            will see the density map generated by VGG16 Decoder.
          </p>
          {/* </pre> */}
          <Divider />
          <h2>What Next</h2>
          <p>
            You can try tuning the hyperparameters, finding the right learning
            rate and or model architecture to get better performance. Here is
            the list of things I would try next if I were to make it useful:
            <ol>
              <li>Add regularisation</li>
              <li>Use a powerful backbone such as Resnet variants</li>
              <li>Use other models from C3 Framework</li>
              <li>
                Given that we have only 300 samples (very low in terms of DL),
                you could try U-Net which is known to perform well on tasks such
                as cell segmentation
              </li>
              <li>
                Spatially divide the image into sub-regions called closed sets,
                train the model on closed sets as suggested by the paper "From
                Open Set to Closed Set: Counting Objects by Spatial
                Divide-and-Conquer". The authors claim that this approach
                generalised well and can achieve the state of the art
                performance on a few crowd counting datasets
              </li>
              <li>
                Use the encoder-decoder based approach highlighted in the paper
                "Encoder-Decoder Based Convolutional Neural Networks with
                Multi-Scale-Aware Modules for Crowd Counting". They claim that
                their model can perform well on dense as well as sparse crowds.
              </li>
            </ol>
          </p>
          <Divider />
          <Row>
            <Col span={4} className="references-header">
              References & Links:
            </Col>
            <Col span={20} className="references-text">
              <ol>
                <li>
                  C3 Framework: I learned a great deal from this paper and their
                  code &nbsp;[
                  <a
                    href="https://arxiv.org/abs/1907.02724"
                    target="_blank"
                    rel="noopener noreferrer"
                  >
                    PAPER
                  </a>
                  &nbsp;|&nbsp;
                  <a
                    href="https://github.com/gjy3035/C-3-Framework"
                    target="_blank"
                    rel="noopener noreferrer"
                  >
                    GITHUB
                  </a>
                  ]
                </li>
                <li>
                  A Survey of Recent Advances in CNN-based Single Image Crowd
                  Counting and Density Estimation &nbsp;[
                  <a
                    href="https://arxiv.org/abs/1707.01202"
                    target="_blank"
                    rel="noopener noreferrer"
                  >
                    LINK
                  </a>
                  ]
                </li>
                <li>
                  U-Net: Convolutional Networks for Biomedical Image
                  Segmentation &nbsp;[
                  <a
                    href="https://arxiv.org/abs/1505.04597"
                    target="_blank"
                    rel="noopener noreferrer"
                  >
                    LINK
                  </a>
                  ]
                </li>
                <li>
                  From Open Set to Closed Set: Counting Objects by Spatial
                  Divide-and-Conquer &nbsp;[
                  <a
                    href="http://arxiv.org/abs/1908.06473"
                    target="_blank"
                    rel="noopener noreferrer"
                  >
                    LINK
                  </a>
                  ]
                </li>
                <li>
                  Encoder-Decoder Based Convolutional Neural Networks with
                  Multi-Scale-Aware Modules for Crowd Counting &nbsp;[
                  <a
                    href="https://arxiv.org/abs/2003.05586"
                    target="_blank"
                    rel="noopener noreferrer"
                  >
                    LINK
                  </a>
                  ]
                </li>
              </ol>
            </Col>
          </Row>
        </BlogPostChrome>
      </Layout>
    );
  }
}

// Default export of this module: the CrowdDensityPage class declared above.
export default CrowdDensityPage;

/**
 * GraphQL query for this page, exported as `pageQuery` and built with the
 * `graphql` tag imported from gatsby.
 *
 * Variables:
 *   $slug (String!, required) - used to select the `javascriptFrontmatter`
 *   node whose `fields.slug` equals it.
 *
 * Selections:
 *   - `javascriptFrontmatter`: spreads the `JSBlogPost_data` fragment. That
 *     fragment is not defined in this file section (presumably declared
 *     elsewhere in the project - confirm).
 *   - One aliased `file(...)` field per image used by the post. Every alias
 *     filters on `relativePath` with a regex and requests
 *     `childImageSharp.fluid` at `quality: 100`, spreading
 *     `GatsbyImageSharpFluid` and additionally selecting `presentationWidth`.
 *     Only `maxWidth` differs between aliases (values from 400 to 1200).
 *
 * NOTE(review): the `relativePath` patterns are unanchored and leave `.`
 * unescaped (e.g. "/vgg-decoder.png/"), so they look like substring matches
 * rather than exact-path matches - confirm each pattern still resolves to a
 * single file when adding new images.
 *
 * NOTE(review): the aliases are presumably read by the page component from
 * its query result; that usage is outside this section - confirm before
 * renaming or removing an alias.
 */
export const pageQuery = graphql`
  query CrowdDensityQuery($slug: String!) {
    javascriptFrontmatter(fields: { slug: { eq: $slug } }) {
      ...JSBlogPost_data
    }
    high_level_cc: file(relativePath: { regex: "/high-level-cc-gray.png/" }) {
      childImageSharp {
        fluid(maxWidth: 1200, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    density_example: file(relativePath: { regex: "/density_example.png/" }) {
      childImageSharp {
        fluid(maxWidth: 1200, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    obj_detection_sample: file(
      relativePath: { regex: "/scene-detection.jpg/" }
    ) {
      childImageSharp {
        fluid(maxWidth: 1000, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    shha_sample: file(relativePath: { regex: "/shha-sample.png/" }) {
      childImageSharp {
        fluid(maxWidth: 600, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    vgg_baseline: file(relativePath: { regex: "/vgg-baseline-v2.png/" }) {
      childImageSharp {
        fluid(maxWidth: 600, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    vgg_decoder: file(relativePath: { regex: "/vgg-decoder.png/" }) {
      childImageSharp {
        fluid(maxWidth: 700, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    vgg_baseline_trainplot: file(
      relativePath: { regex: "/vgg-baseline-trainplot.png/" }
    ) {
      childImageSharp {
        fluid(maxWidth: 400, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    shha_train_eval: file(relativePath: { regex: "/shha-train-eval.png/" }) {
      childImageSharp {
        fluid(maxWidth: 400, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    shha_test_eval: file(relativePath: { regex: "/shha-test-eval.png/" }) {
      childImageSharp {
        fluid(maxWidth: 400, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    shha_train_outl: file(relativePath: { regex: "/shha-train-outl.png/" }) {
      childImageSharp {
        fluid(maxWidth: 600, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    shha_train_handpicked: file(
      relativePath: { regex: "/shha-train-vggbaseline-handpicked.png/" }
    ) {
      childImageSharp {
        fluid(maxWidth: 600, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    shha_test_outl: file(relativePath: { regex: "/shha-test-outl.png/" }) {
      childImageSharp {
        fluid(maxWidth: 600, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    vgg_decoder_trainplot: file(
      relativePath: { regex: "/vggdecoder-224-shha-train.png/" }
    ) {
      childImageSharp {
        fluid(maxWidth: 400, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    vgg_decoder_testplot: file(
      relativePath: { regex: "/vggdecoder-224-shha-test.png/" }
    ) {
      childImageSharp {
        fluid(maxWidth: 400, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    vgg_baseline_448trainplot: file(
      relativePath: { regex: "/vgg-baseline-448-shha-train.png/" }
    ) {
      childImageSharp {
        fluid(maxWidth: 400, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    vgg_baseline_448testplot: file(
      relativePath: { regex: "/vgg-baseline-448-shha-test.png/" }
    ) {
      childImageSharp {
        fluid(maxWidth: 400, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    vgg_decoder_448trainplot: file(
      relativePath: { regex: "/vggdecoder-448-shha-train.png/" }
    ) {
      childImageSharp {
        fluid(maxWidth: 400, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    vgg_decoder_448testplot: file(
      relativePath: { regex: "/vggdecoder-448-shha-test.png/" }
    ) {
      childImageSharp {
        fluid(maxWidth: 400, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    large_crowd: file(relativePath: { regex: "/has_172.jpg/" }) {
      childImageSharp {
        fluid(maxWidth: 400, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    vggbaseline_pred: file(
      relativePath: { regex: "/vggbaseline-denmap.jpg/" }
    ) {
      childImageSharp {
        fluid(maxWidth: 400, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
    vggdecoder_pred: file(relativePath: { regex: "/vggdecoder-denmap.jpg/" }) {
      childImageSharp {
        fluid(maxWidth: 400, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
  }
`;
