import { graphql } from "gatsby";
import React from "react";
import Layout from "../../components/Layouts";
import Img from "gatsby-image";

import {
  Divider,
  Popover,
  Row,
  Col,
  Tag,
  Checkbox,
  Button,
  Tabs,
  Collapse,
} from "antd";
import { CaretRightOutlined } from "@ant-design/icons";

import VegaChart from "../../components/VegaChart";
import MetricPlotWithOptions from "./MetricPlotWithOptions";
import BrushSelectionPlot from "./BrushSelectionPlot";
import "./index.css";

/**
 * Static page metadata exported alongside the component.
 * All values are plain string constants; nothing here is computed at runtime.
 */
export const frontmatter = {
  // Display title of the post.
  title: "Crowd Density Estimation Models",
  // Authoring dates, written as YYYY-MM-DD strings.
  written: "2020-05-16",
  updated: "2020-05-25",
  // Layout / content classification strings.
  layoutType: "post",
  contentType: "dataviz",
  // Route of the page.
  path: "/crowd-density-eval/",
  category: "VISUALISATION",
  // Poster image path, relative to this file.
  image: "./poster.png",
  description: "Evaluation of Crowd Density Models",
};

// Short aliases for the antd sub-components used in the JSX below.
const Panel = Collapse.Panel;
const TabPane = Tabs.TabPane;

export default class CrowdDensityModelEvaluation extends React.Component {
  state = {};

  render() {
    const img_outliers = this.props.data.img_outliers.childImageSharp;

    return (
      <Layout data={this.props.data} location={this.props.location}>
        <div style={{ maxWidth: "800px", margin: "0px auto" }}>
          <h1 className="header-title">
            Evaluating Crowd Density Estimation Models
          </h1>
          <h4
            className="header-subtitle"
            style={{ marginTop: 20, marginBottom: 10 }}
          >
            Desktop Version | 14 May, 2019
          </h4>
          {/* <p style={{ textAlign: "center", marginBottom: 40 }}>
            <Tag>Visualisation</Tag>
            <Tag>Vega</Tag>
          </p> */}
          <div className="story-content" style={{ marginBottom: 20 }}>
            <div>
              <p className="para">
                This post is an extension of the{" "}
                <a href="/crowd-density">crowd density estimation</a>&nbsp;
                post. The models used here were trained using almost no
                hyperparameter tuning so, please ignore their actual
                performance. It is possible to improve the performance by
                carefully selecting the image augmentation(s), learning rate,
                learning rate schedule, and the optimizer. This post's objective
                is to visualise the predictions across different variables to
                understand the model performance, and identify areas that can be
                improved.
              </p>
              <p className="para">
                The vega plots displayed on this page are pulled from the GitHub
                and might take some time to load depending on your network
                connection.
              </p>
              <h2 className="is-size-4 heading">
                SECTION 1: METRICS, PREDICTIONS & INPUT SIZES
              </h2>
              <p className="para">
                In this section, we are going to plot the predictions of models
                based on two different architectures: VGG16 Baseline and
                VGG16+Decoder model. These predictions are generated by 4
                different models:
              </p>
              <ul className="para">
                <li>➤ VGG16 Baseline: trained on 224px images</li>
                <li>➤ VGG16 Baseline: trained on 448px images</li>
                <li>➤ VGG16+Decoder: trained on 224px images</li>
                <li>➤ VGG16+Decoder: trained on 448px images</li>
              </ul>
              <p className="para">
                Each plot displays two overall metrics: Mean Squared Error and
                Mean Absolute Error. We are using two different plots in this
                section.
              </p>
              <p className="para">
                <strong>1: Scatter Plot</strong>
              </p>
              <VegaChart
                id={`preview-scatter-baseline-metrics`}
                chartName={`preview-scatter-baseline-metrics`}
                specUrl="https://gist.githubusercontent.com/katnoria/9dd3412f4c5d3658eb7f6cce369b3de0/raw/203e2cc188812f7aacf0487fa951160bb221dad9/scatter_baseline_shha_train_compare_224"
              />
              <p className="para">
                <strong>2: Scatter + Heatmap</strong>
              </p>
              <VegaChart
                id={`preview-heatmap-baseline-metrics`}
                chartName={`preview-heatmap-baseline-metrics`}
                specUrl="https://gist.githubusercontent.com/katnoria/97d1fb303002243b9f02461bc8a94131/raw/a42a27d42751b3898f9086bb58c84c60c756452a/heatmap_baseline_shha_train_compare_224"
              />
              <p class="para">
                We first start with the baseline architecture and then present
                the decoder based architecture. Each sub-section presents the
                question, and plots provide the potential answer, if any.{" "}
              </p>
              <h2 className="is-size-5 heading"> VGG16 BASELINE</h2>
              <Collapse
                bordered={false}
                defaultActiveKey={["1"]}
                expandIcon={({ isActive }) => (
                  <CaretRightOutlined rotate={isActive ? 90 : 0} />
                )}
                className="site-collapse-custom-collapse"
              >
                <Panel
                  header="TRAIN"
                  key="1"
                  className="site-collapse-custom-panel"
                >
                  <p className="para">
                    In this sub-section, we plot the predictions generated by
                    the baseline model on the training set. We are looking for a
                    few things here: How well does the model overfit the
                    training data? Are there any significant differences between
                    the performance of the baseline model trained on 224PX
                    images vs. 448PX images.
                  </p>
                  <Tabs defaultActiveKey="1">
                    <TabPane tab="Scatter" key="1">
                      <MetricPlotWithOptions
                        name="baseline"
                        text={`
                        The checkbox presents the option to display true vs. predicted scatter plots
                        for three different input types.
                        `}
                        url224="https://gist.githubusercontent.com/katnoria/9dd3412f4c5d3658eb7f6cce369b3de0/raw/203e2cc188812f7aacf0487fa951160bb221dad9/scatter_baseline_shha_train_compare_224"
                        url448="https://gist.githubusercontent.com/katnoria/d2d53f9841f999ac66098dc989932d78/raw/7d0f4ea5f1f9651435fe125dc7365b1cc154722e/scatter_baseline_shha_train_compare_448"
                        url600="https://gist.githubusercontent.com/katnoria/7235464f70dcd9080c968f089667b99b/raw/a457627f88f4b0081c53ba5d804a78ea8ab3635a/scatter_baseline_shha_train_compare_600"
                      />
                    </TabPane>
                    <TabPane tab="Heatmap" key="2">
                      <MetricPlotWithOptions
                        name="heatmap"
                        text={`
                        This section displays the same information as scatter but augments it with the heatmap.
                        It allows use to find out the bins that perform better or worse than others.
                        `}
                        url224="https://gist.githubusercontent.com/katnoria/97d1fb303002243b9f02461bc8a94131/raw/a42a27d42751b3898f9086bb58c84c60c756452a/heatmap_baseline_shha_train_compare_224"
                        url448="https://gist.githubusercontent.com/katnoria/7d0073802928339cc0d6ae53060e7f8f/raw/31ed8027dfcd2f5be9923adee22869ce24d10acb/heatmap_baseline_shha_train_compare_448"
                        url600="https://gist.githubusercontent.com/katnoria/d33e95d858fee43621f4952b17b9f12a/raw/3762f262c1f76b700e6661340949667c7233192a/heatmap_baseline_shha_train_compare_600"
                      />
                    </TabPane>
                  </Tabs>
                </Panel>
                <Panel
                  header="TEST"
                  key="2"
                  className="site-collapse-custom-panel"
                >
                  <p className="para">
                    In this sub-section, we plot the predictions generated by
                    baseline model on the test set. We are looking for a few
                    things here: How well does the model generalise? Between
                    224PX and 448PX trained versions of the baseline model,
                    which one generalises better?
                  </p>

                  <Tabs defaultActiveKey="1">
                    <TabPane tab="Scatter" key="1">
                      <MetricPlotWithOptions
                        name="baseline_scatter_test"
                        url224="https://gist.githubusercontent.com/katnoria/017dfb060901708173fd62db9c069599/raw/31008bd7688ecd6fd401e45bf091f58455f325d1/scatter_baseline_shha_test_compare_224"
                        url448="https://gist.githubusercontent.com/katnoria/78a3d2c4d13408191f4bc11f41a947db/raw/58179bac3cdf73c3eaf57f6b8fece373b06a11f1/scatter_baseline_shha_test_compare_448"
                        url600="https://gist.githubusercontent.com/katnoria/d26e473569cf0c02128890a813ed484e/raw/5e48113c01074b257d4eeff29fa3432d0c2dcd70/scatter_baseline_shha_test_compare_600"
                      />
                    </TabPane>
                    <TabPane tab="Heatmap" key="2">
                      <MetricPlotWithOptions
                        name="baseline_heatmap_test"
                        url224="https://gist.githubusercontent.com/katnoria/38fb823d4c881dd8d9b09808a94ef767/raw/58fe62f9fad1bebf5673b6862a5b8e83a6a4aba0/heatmap_baseline_shha_test_compare_224"
                        url448="https://gist.githubusercontent.com/katnoria/35a921c36eb6bf3c57fa95f198ec86fa/raw/7938f66c4576b1c7e807eb79e29b71ea0aeb4d75/heatmap_baseline_shha_test_compare_448"
                        url600="https://gist.githubusercontent.com/katnoria/e0e46bd5b6841b3fdc9db53909f3908d/raw/1e81d0bb4945b243fdbd831c67fd0665881f0ee9/heatmap_baseline_shha_test_compare_600"
                      />
                    </TabPane>
                  </Tabs>
                </Panel>
              </Collapse>

              <h2 className="is-size-5 heading">VGG16 + DECODER</h2>
              <p class="para">
                We repeat the same for VGG16 + Decoder based architecture. Note
                that we are still not making any direct comparison between two
                different architectures used here.
              </p>
              <Collapse
                bordered={false}
                defaultActiveKey={["1"]}
                expandIcon={({ isActive }) => (
                  <CaretRightOutlined rotate={isActive ? 90 : 0} />
                )}
                className="site-collapse-custom-collapse"
              >
                <Panel
                  header="TRAIN"
                  key="1"
                  className="site-collapse-custom-panel"
                >
                  <p className="para">
                    The questions remain the same: How well does the model
                    overfit the training data? Are there any significant
                    differences between the performance of the baseline model
                    trained on 224PX images vs 448PX images.
                  </p>
                  <Tabs defaultActiveKey="1">
                    <TabPane tab="Scatter" key="1">
                      <MetricPlotWithOptions
                        name="decoder_scatter_train"
                        url224="https://gist.githubusercontent.com/katnoria/b6363877eb275a5a8acae8e7721d2132/raw/fbb9c5cf94016a086844e23fdc5ce795ef43c772/decoder_baseline_shha_train_compare_224"
                        url448="https://gist.githubusercontent.com/katnoria/9d51f66a5621e525b950b8d23077109f/raw/a1e9c319447654ee78ee36cf1412c6ce23a19a84/decoder_baseline_shha_train_compare_448"
                        url600="https://gist.githubusercontent.com/katnoria/15229058d27c8b05553634f6e19ea0c2/raw/5501c6c878ee720c96b7196f6eb3f8ef7e76eb28/decoder_baseline_shha_train_compare_600"
                      />
                    </TabPane>
                    <TabPane tab="Heatmap" key="2">
                      <MetricPlotWithOptions
                        name="decoder_heatmap_train"
                        url224="https://gist.githubusercontent.com/katnoria/61e592aaf13e5de174e06af3e5cf482a/raw/2738cf183aa5a18df1a8d9087da96f5f8dc3da28/heatmap_decoder_shha_train_compare_224"
                        url448="https://gist.githubusercontent.com/katnoria/d8d940e2e30c0a5d0702711d998272fb/raw/d701d786d038807013546cd7f73d09df6264c74f/heatmap_decoder_shha_train_compare_448"
                        url600="https://gist.githubusercontent.com/katnoria/d508e2fa52ef1ec00cee91ed63f87021/raw/9d2bbee90e36d17176b488aa3229bb41ae7c2e21/heatmap_decoder_shha_train_compare_600"
                      />
                    </TabPane>
                  </Tabs>
                </Panel>
                <Panel
                  header="TEST"
                  key="2"
                  className="site-collapse-custom-panel"
                >
                  <p className="para">
                    How well does the model generalise? Between 224PX and 448PX
                    trained versions of the baseline model, which one
                    generalises better?
                  </p>
                  <Tabs defaultActiveKey="1">
                    <TabPane tab="Scatter" key="1">
                      <MetricPlotWithOptions
                        name="decoder_scatter_test"
                        url224="https://gist.githubusercontent.com/katnoria/e63351959fa6538e4abd393b0e7e92e9/raw/2dccad821c18499caf177e6cb24c3915db579b59/decoder_decoder_shha_test_compare_224"
                        url448="https://gist.githubusercontent.com/katnoria/99fbf29ddec8944786561f815744dfea/raw/7b124363a68bb6cf3b472ebc6cf57d2776349e95/decoder_decoder_shha_test_compare_448"
                        url600="https://gist.githubusercontent.com/katnoria/21345a9abb90811f8f18df999a9280a8/raw/8410a506eceec9f8eb1225b83ebafacd30995380/decoder_decoder_shha_test_compare_600"
                      />
                    </TabPane>
                    <TabPane tab="Heatmap" key="2">
                      <MetricPlotWithOptions
                        name="decoder_heatmap_test"
                        url224="https://gist.githubusercontent.com/katnoria/a0428058fd60ddf6b171cf5a6ea2f2f0/raw/833196e8c6f74672526111efba9bbfe33ce4c22c/heatmap_decoder_shha_test_compare_224"
                        url448="https://gist.githubusercontent.com/katnoria/a321be5e923a40e57b509de45c260889/raw/4ccfac942573640c60bfd0a26c28341f47b33607/heatmap_decoder_shha_test_compare_448"
                        url600="https://gist.githubusercontent.com/katnoria/7b5767f2b94491417ffcec654b7b7701/raw/7a0f5cc2866e05f9c1614d72f545151c95cd2bd0/heatmap_decoder_shha_test_compare_600"
                      />
                    </TabPane>
                  </Tabs>
                </Panel>
              </Collapse>
            </div>
            <Divider />
            <div>
              <h2 className="is-size-4 heading">
                SECTION 2: BASELINE vs DECODER
              </h2>
              <p className="para">
                We now move to a direct comparison between two different
                architectures. We found above that increasing the input size of
                training data improves the model performance. We use the
                predictions from the models trained on 448px images from now on.
              </p>
              <p className="para">
                We continue plotting predictions on images of three different
                input sizes, but this time we plot train and test set together.
                The plot on the left shows the baseline prediction, and the
                right one shows that of decoder. Here, you can{" "}
                <Tag color="#87d068">highlight</Tag> a section of the plot and
                make a direct comparison between the baseline and decoder model.
              </p>
              <BrushSelectionPlot
                name="shha"
                url224="https://gist.githubusercontent.com/katnoria/ec364e311c12d0fddae5d0f1b8f94e5d/raw/5d2988bcd0fbd96a132f428dec96742da8e13fc0/baseline_vs_decoder_224"
                url448="https://gist.githubusercontent.com/katnoria/0342f75b123a6132cda495b53b144a05/raw/b0fef6e93e75a49957c3fd413ace9fbad8e036ee/baseline_vs_decoder_448"
                url600="https://gist.githubusercontent.com/katnoria/2ad6eb26c2d6dc949a6c17c3603b3fd3/raw/732aa3bae4e1cceb6c94abd9c138c3fad7842be9/baseline_vs_decoder_600"
              />
            </div>
            <p class="para">
              Do you think decoder seems to generate the tighter prediction (i.e
              closer to the diagonal line) for larger images? Some images are
              really hard for both the models, isn't it? Are there any
              similarities between these images🤔? Let's cluster the images and
              try to find out.
            </p>
            <Divider />
            <div>
              <h2 className="is-size-4 heading">
                SECTION 3: CROSS-FILTERING & IMAGE CLUSTERS
              </h2>
              <p className="para">
                And in the final section, we use the power of cross-filtering to
                review specific predictions across different models as well as
                the image clusters. We use the dimensionality reduction
                algorithm, t-SNE in this case, to cluster the images.
              </p>
              <p className="para" style={{ marginBottom: 20 }}>
                The plots below are generated using the baseline and decoder
                based models trained on 448px images. We show model predictions
                on 224, 448, and 600 px images in the above sections. However,
                here I am just using the predictions on 448px images, mainly
                because we want to highlight the approach. We could very well do
                this for different input sizes as well as the datasets if
                required.
              </p>
              <p className="para">
                You can select a part of the plot, and see other two plots
                highlight the relevant points/images. Go ahead, give it a try.{" "}
              </p>
              <VegaChart
                id="shha_train_highlight_dots"
                chartName="shha_train_highlight_dots"
                specUrl="https://gist.githubusercontent.com/katnoria/9ebb4dc068ff197710ae563cd472680a/raw/7ce9e116cde6689d6759f4292447e56f690a198b/shha_train_highlight_dots"
              ></VegaChart>
              <p
                style={{ fontSize: 12, textAlign: "center", marginBottom: 20 }}
              >
                Train set: Use the mouse to select the area
              </p>
              <p className="para" style={{ marginBottom: 20 }}>
                The model was trained on 448px images, and we see that both the
                models can overfit the training data, which is something that
                you would want to do before you start regualising the model.
                Now, let's review the test data.
              </p>
              <VegaChart
                id="shha_test_highlight_dots"
                chartName="shha_test_highlight_dots"
                specUrl="https://gist.githubusercontent.com/katnoria/1f24b351fa08bffa9d8a07a8a1c801ff/raw/a20fc57ae64f949b72d413e16d0aab4b6a54b70e/shha_test_highlight_dots"
              ></VegaChart>
              <p style={{ fontSize: 12, textAlign: "center" }}>
                Test set: Use the mouse to select the area
              </p>
              <p className="para" style={{ marginTop: 20, marginBottom: 20 }}>
                Clearly, we have some work to do to get the model to generalise
                better. We also need to review the
                <Popover
                  content={
                    <div style={{ width: 400 }}>
                      <Img sizes={img_outliers.fluid} />
                    </div>
                  }
                  title=""
                >
                  <Button type="link">4 green dots</Button>
                </Popover>
                that have the crowd density between 600-1000 (see x-axis). These
                images do not come from the same cluster.
              </p>
              <p className="para">
                And, although there aren't nice and clear clusters in this
                particular dataset. The idea seems quite useful to me. It can
                help me answer questions such as do certain clusters perform
                worse than the others; if so, I can then review images in that
                cluster manually.
              </p>
            </div>
            <Divider />
            <div>
              <div style={{ marginTop: 24 }}>
                <div className="is-size-4 heading">Conclusion</div>
                <p className="para">
                  And that concludes the post. I intended to share how I use
                  different plots to visualise and compare model predictions and
                  use that to drive my investigations. And hopefully, I was able
                  to convey that.
                </p>
              </div>
            </div>
          </div>
        </div>
      </Layout>
    );
  }
}

// Page query: fetches the fluid image data (max width 800, quality 100) for
// the file whose relativePath matches the regex /outliers.png/, exposed to the
// component as `data.img_outliers.childImageSharp.fluid`.
// NOTE(review): the `.` in the regex is unescaped and the pattern is not
// anchored, so it matches any relativePath containing "outliers", any single
// character, then "png" — confirm only one file in the project matches.
export const crowdDensityEvalQuery = graphql`
  query {
    img_outliers: file(relativePath: { regex: "/outliers.png/" }) {
      childImageSharp {
        fluid(maxWidth: 800, quality: 100) {
          ...GatsbyImageSharpFluid
          presentationWidth
        }
      }
    }
  }
`;
