import React from "react"
import { graphql } from "gatsby"
import { Button, Divider, Icon } from 'antd';
import BlogPostChrome from "../../components/BlogPostChrome"
import Img from 'gatsby-image'
import { Row, Col, Tag, Progress, Popover } from 'antd';
import './Facerec.css'
import katex from 'katex'
import Layout from './../../components/Layouts';

// Static metadata for this post (title, dates, routing path, category, images, teaser).
// The page query at the bottom of this file selects a `javascriptFrontmatter` node for
// this page's slug and spreads it into <BlogPostChrome>.
// NOTE(review): presumably that node is generated from this export by a build-time
// transformer that parses the object statically — keep the values as plain literals
// and confirm against gatsby-config before restructuring.
export const frontmatter = {
    title: `Facial Recognition`,
    written: `2018-09-09`,
    updated: `2018-09-24`,
    layoutType: `post`,
    contentType: `blog`,
    path: `/facerec/`,
    category: `Deep Learning`,
    image: `./poster.png`,
    cover: `./cover.png`,
    description: `It can provide many useful applications but at what cost ? Privacy, mass surveillance.. `
}


const TripletLoss = () => {
    var loss_equation = katex.renderToString("L = \\sum_{i}^N [\\|\\textcolor{blue} {f(x_{i}^a)} - \\textcolor{green}{f(x_{i}^p)}\\|_{2}^2 - \\|\\textcolor{blue}{f(x_{i}^a)} - \\textcolor{red}{f(x_{i}^n)}\\|_{2}^2 + \\alpha]", {
        throwOnError: false
    });
    
    var anchor_image = katex.renderToString("f(x_{i}^a)", {
        throwOnError: false
    });
   
    var positive_images = katex.renderToString("f(x_{i}^p)", {
        throwOnError: false
    });

    var negative_images = katex.renderToString("f(x_{i}^n)", {
        throwOnError: false
    });

    return (
        <div>
            <span dangerouslySetInnerHTML={{__html: loss_equation}} >             
            </span>
            <p style={{marginTop: 10}}>
                We want <span className="anchor-image" dangerouslySetInnerHTML={{__html: anchor_image}} /> 
                to be closer to <span className="positive-image" dangerouslySetInnerHTML={{__html: positive_images}} />
                 and away from  <span  className="negative-image" dangerouslySetInnerHTML={{__html: negative_images}} />

            </p>
        </div>
    )
} 

class FacialRecognitionArticle extends React.Component {
    constructor(props) {
        super(props)
    }

    render() {
        const facenet = this.props.data.facenet.childImageSharp.sizes
        const embedding_gen = this.props.data.embedding_gen.childImageSharp.sizes        
        // const cover_img = this.props.data.cover_img.childImageSharp.sizes
        return (
            <Layout data={this.props.data} location={this.props.location}>
            <BlogPostChrome {...this.props.data.javascriptFrontmatter}>
                <article>
                <section>
                <h1 className="k-header-title">Facial Recognition</h1>
                    <p>
                        In this post, we explore facial recognition using deep learning.
                        We will start with a high-level overview of network architecture and 
                        quickly move into its applications and possibilities.
                    </p>
                    <p>
                        The network architecture was proposed by Schroff et al in the Facenet paper.
                        The key contribution of their paper was to introduce an end-to-end system that
                        utilizes deep learning architecture to learn the mapping from the facial image to 128d vector, i.e embedding.
                        This results in very high representational efficiency because a facial image can now be
                        represented by 128 bytes.
                        This is very efficient in terms of space and compute 
                        because we can store the images in compressed embedding form and use them
                        later for inference.
                        The original images are no longer required during inference and can be discarded.
                        The operations on lower dimensional embeddings require less compute than original images (width, height, channel) matrix.

                    </p>
                    <Img sizes={facenet} />
                    <p>
                        In order to train the network, they use
                        <Popover content={<TripletLoss />} title="Triplet Loss">
                            <span className="hover-text-anchor">&nbsp;TRIPLET LOSS&nbsp;</span>
                        </Popover>
                        where three images are used:
                    </p>
                    <ol>
                        <li><b>Anchor:</b> source image that will be converted into embeddings</li>
                        <li><b>Positive:</b> a different image of the same face and </li>
                        <li><b>Negative:</b> a different image of the different face</li>
                    </ol>
                    <p>
                        The triplet loss minimizes the distance between an Anchor and the Positive
                        and maximizes the distance between Anchor and the Negative image.
                        For more details, please refer to the paper in references.
                    </p>
                    <p>
                        On to the applications.
                        The most obvious use case is to perform facial recognition
                        where the app uses previously saved embeddings to compare
                        against a new photo, video or realtime camera feed.
          </p>
          <Divider />
                    <h1>Facial Similarity</h1>
                    <p>
                        We could apply the idea of comparing face embeddings to some fun projects
                        such as, given the face of an unknown person as input, show me the top three
                        nearest embeddings from our embeddings database. Sometime last year, I told a
                        friend about a toy classifier that I had built using neural networks (dog breed classifier).
                        His immediate response was "why not build an app that can tell you the name of
                        the actor you resemble ?". Well, here it is:
          </p>

                    <h2>Which Movie star do you resemble?</h2>
                    <p>
                        The idea is straightforward. We ask the user to upload an image,
                        the classifier extracts the face, converts into the embedding
                        vector and ranks it against all the actor embeddings.
            
                        The bigger the embedding universe the better the chances
                        of finding someone close enough. The similarity is performed by
                        taking the distance (euclidean norm) between the source
                        (i.e embedding of uploaded image) and the actor vector (i.e embedding of each actor)
            and choosing the <i>k</i> nearest ones. In order to improve the search quality,
                        we also apply a threshold on distance such that images with a distance greater 
                        than the threshold are dropped.
                        The threshold is another hyperparameters that can be played with
                        if it is too tight (i.e closer to 0) we may filter out almost all the embeddings and if it is
                        too wide (i.e close to 1) the quality of prediction may suffer.
                    </p>
                    <Img sizes={embedding_gen} />
                    <div className="is-size-7 hsl(0, 0%, 48%)">Embedding universe generation process</div>
                    <p style={{marginTop: 10}}>
                        Based on the above idea, I built a fun little webapp that you can try online.
                        The actor data was acquired by querying wikipedia for names and bing image search
                        for lookup and download. After that, the images were run through the face extraction process 
                        to extract faces and embedding generator to generate and save embedding for each image.
        </p>
                    {/* <Tag color="#2db7f5"><a href="https://faces.sprinkleai.com">CLICK HERE FOR ONLINE DEMO</a></Tag> */}
                    <Button type="primary" icon="team" size="large" href="https://faces.sprinkleai.com/celeb">CLICK HERE FOR ONLINE DEMO</Button>                              
                    <p style={{marginTop: 10}}>
                        The webapp is hosted on space and compute constrained virtual machine but still performs
                        alright in my limited tests. You should use the front facing images and ideally try with multiple
                        images to test the model. It seems the model pays more attention to nose and eyes
                        compared to other facial features. We can also check its accuracy by testing it
                        against the photo of a known actor.
        </p>
                    <div>
                        <p>
                            In this example, I use the picture of <a href="https://en.wikipedia.org/wiki/Javier_Bardem" target="_blank" rel="noopener noreferrer">
                            Javier Bardem</a> to find the faces similar to his and we see that model returns 
                            <a href="https://en.wikipedia.org/wiki/Jeffrey_Dean_Morgan" target="_blank" rel="noopener noreferrer"> Jeffrey Dean Morgan</a> as 
                            the nearest match. A very good match in my opinion, they do look similar. There is no Javier Bardem in 
                            the results because the wikipedia query, used to prepare the actor dataset, did not yield his name in the list of hollywood actors.
                        </p>
                        <div style={{textAlign: "center"}}>
                            <iframe width="560" height="315" src="https://www.youtube.com/embed/GFk_w_wKfoc" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>                        
                        </div>
                        </div>
                        <div>
                        <p>
                            Next, I use a popular selfie by Ellen Degeneres to identify the faces and their nearest match. 
                            The model is able to identify all the actors </p>
                        <div style={{textAlign: "center"}}>
                            <iframe width="560" height="315" src="https://www.youtube.com/embed/PBxiYB9S-2A" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
                        </div>
                    </div>
                    <h2>Similarity between family members</h2>
                    <p>
                        The narrative goes like this:
                    </p>
                    <Row>
                        <Col xs={{span: 24}} lg={{span: 24}} >
                        <p>
                    A distant relative during one of the family gathering: "<i>Kids...they grow fast. By the way, Yours looks more like your wife</i>"
                    </p>
                    <p>Yet another distant<sup>distant</sup> relative (nodding): "<i>Well actually, the kid looks more like you</i>"</p>

                        <p>
                        How many of us have heard that before?
                        Well, we can use the same idea of comparing image embeddings in the vector space
                        but this time instead of comparing with actors, we compare with in the family members.
                        </p>
                        <p>                        
                        In the video here, I try my demo app on Will Smith and his family.
                        </p>                        
                        </Col>
                        <Col xs={{span: 24}} lg={{span: 24}} >
                        <div style={{textAlign: "center"}}>
                            <iframe width="560" height="315" src="https://www.youtube.com/embed/EGA0UYA0Hio" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>                        
                        <div>
                            <Button type="primary" style={{marginTop: 10}}icon="user" size="large" href="https://faces.sprinkleai.com/family">DEMO: TRY IT YOURSELF</Button> 
                        </div>                    
                        </div>
                        </Col>
                    </Row>      
                    {/* <p>                        
                        In the example below, I try my demo app on Will Smith and his family.
                    </p>
                    <video className="video-container video-container-overlay box" autoPlay={false} muted controls>
                        <source type="video/mp4" data-reactid=".0.1.0.0.0" src="https://sprinkle.nyc3.digitaloceanspaces.com/facerec/facial_similarity.mp4" />
                    </video>                     */}
                    {/* <Button type="primary" icon="user" size="large" href="https://faces.sprinkleai.com/family">FACIAL SIMILARTY: TRY IT YOURSELF</Button>  */}
                    <h2>Monitoring employees</h2>
                    <p className="scratch-that">No, Scratch that, we're only talking about fun projects.</p>
                    <p>Or in fat Tony's words "fuhgeddaboudit"</p>

                    <Divider />
                    <h1>Robot Vision</h1>
                    <p>
                        Give vision to your DIY robots, train them to recognize you and the world around.
                        Bonus, amaze and inspire the little ones.
                    </p>
                    <div >
                    <Row>
                        <Col xs={{span: 24}} lg={{span: 24}} >
                        <div style={{textAlign: "center"}}>
                        <video className="video-container video-container-overlay" autoPlay={false} muted controls >
                            <source type="video/mp4" data-reactid=".0.1.0.0.0" src="https://sprinkle.nyc3.digitaloceanspaces.com/facerec/MartPrime_Vision.mp4" />
                        </video>
                        <Col>
                            <span className="small-span">Testing facial recognition</span>
                        </Col>
                        </div>
                        </Col>
                        <Col xs={{span: 24}} lg={{span: 24}} >
                            <p>
                                In this video, I trained the model to identify my face along with a few from my photo library.
                                This was done by first running the facial detection on all the images in my library and generating 
                                the embedding for every face. The embeddings were then labeled semi-automatically by first clustering 
                                the embeddings and labelling each cluster (or a few clusters) using Dash powered UI
                                (Perhaps a topic for a future article).

                            </p>
                        </Col>
                    </Row>                         
                    </div>
                    <h1>Custom slideshows</h1>
                    <p>
                        Similar to iPhone memories but on photographs that are not in the iPhotos library for better or worse.
                        You're planning to surprise your grandmother by throwing her a birthday party and you want to
                        create an awesome slideshow to show her journey through the photos. Now, you can either sift
                        through the giant photo library of yours (big data &#x1f61b;), carefully choosing the photos that
                        include your grandmother and sometimes taking a detour to a holiday album, or you can
                        harness the power of deep learning (compute time/power not included) and let it filter
                        the images for you. You can go one step further and combine textual search to filter the
                        images further. It is quite possible to see something like this implemented in image organization
                        software in near future.
                    </p>

                    <h1>Screen Time</h1>
                    <p>
                        Let's say you work at the movie production house and you are tasked to find the screen time of all the actors in a Movie (for some analysis).
                    </p>
                    <p>
                        We can make use of the same facial recognition model. Here, instead of loading
                        the entire embedding universe of actors, we can just load the embeddings of the main
                        cast or generate if it doesn't exist and label all the faces found in the movie.
                        The unrecognized faces can either be discarded or labeled using unknown_n where n is
                        the counter (e.g unknown_1, unknown_2).
    </p>

                    <p>Let's try it out on a recent trailer from Mission Impossible: Fallout:</p>
                    <Row>
                        <Col xs={{span: 24}} lg={{span: 24}} >
                        <div style={{textAlign: "center"}}>
                            <iframe width="560" height="315" src="https://www.youtube.com/embed/XKu3Z22gkk0" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>                        
                        </div>
                        </Col>
                        <Col xs={{span: 24}} lg={{span: 24}} >
                            <div style={{textAlign: "center"}}>
                            <Row style={{maxWidth: 400, margin: "0 auto"}}>
                                <Col span={10}>Tom Cruise</Col>
                                <Col span={14}>
                                    <Progress percent={39.17} size="small" />
                                </Col>
                                <Col span={10}>Sean Harris</Col>
                                <Col span={14}>
                                    <Progress percent={15.88} size="small" />
                                </Col>
                                <Col span={10}>Rebecca Ferguson</Col>
                                <Col span={14}>
                                    <Progress percent={9.35} size="small" />
                                </Col>
                                <Col span={10}>Alec Baldwin</Col>
                                <Col span={14}>
                                    <Progress percent={8.16} size="small" />
                                </Col>
                                <Col span={10}>Simon Pegg</Col>
                                <Col span={14}>
                                    <Progress percent={7.12} size="small" />
                                </Col>
                                <Col span={10}>Henry Cavill</Col>
                                <Col span={14}>
                                    <Progress percent={5.64} size="small" />
                                </Col>
                                <Col span={10}>Angela Bassett</Col>
                                <Col span={14}>
                                    <Progress percent={5.34} size="small" />
                                </Col>
                                <Col span={10}>Ving Rhames</Col>
                                <Col span={14}>
                                    <Progress percent={5.19} size="small" />
                                </Col>
                                <Col span={10}>Vanessa Kirby</Col>
                                <Col span={14}>
                                    <Progress percent={0.3} size="small" />
                                </Col>
                                <Col span={24}>
                                    <span className="small-span">Based on total number of frontal faces found in the video</span>
                                </Col>
                            </Row>
                            </div>
                        </Col>
                    </Row>


                    <h1>Video Search</h1>
                    <p>
                        Once could use facial recognition along with object and scene detection
                        to provide a rich video search on video platforms such as Netflix or Apple
                        Movies. When watching reruns, you may not want to watch the entire show or
                        movie but just parts of it.
    </p>
                    <p>
                        For example, Siri on Apple TV.
    </p>
                    <Row className="dialogue">
                        <Col span={24}>
                            User: Hey Siri, show me the scene where Neo is offered to choose between the Red and the Blue pill in Matrix?
        </Col>
                        <Col span={24}>
                            Siri: Hang on, let me check with Alexa.
        </Col>
                    </Row>
                    <p>
                        There is a lot to unpack for Siri to provide a meaningful search result.
                        It has to understand that Matrix is the name of a movie, Neo is a character in the movie,
                        the movie has frames with colored pills and the location of frames where Neo and
                        pills appear together or within a threshold window. The possible solution would probably 
                        require more than the geometric (Computer vision, NLP, NLU)  artificial intelligence.
                    </p>

                    <Divider />
                    <h1>Conclusion</h1>

                    <p>
                        At the moment, facial recognition performs best on the frontal face
                        and fails or performs poorly on side pose. I expect this limitation
                        to go away in the near future because this seems to be an area of active research
                        but perhaps largely proprietary because I didn't have any success on Arxiv.
    </p>

                    <p>This technology does open up a huge market.</p>

                    <p>
                        Facial recognition can allow product companies to build better UX
                        where such tools can be used to aid human collaborator or the user.
                        The examples include image organization, in-video text/voice based search,
                        home security, personalization (e.g Cozmo robot can recognize me), authentication (iPhone/Surface face unlock).
    </p>

                    <p>
                        But it will also be used by agencies and companies for surveillance,
                        tracking, marketing and revenue generation. One such example could be
                        shopping mall cameras making money by tracking your movements inside
                        the mall and selling it to the shops that can afford the subscription,
                        shops using it along with eye tracking software to monitor the products you glance
                        and provide personalized pricing to convert the glance into a buy (glance-thru conversion rate ? ).
    </p>

                    <p>Are we there yet?</p>
                    <div style={{textAlign: "center"}}>
                        <iframe width="560" height="315" src="https://www.youtube.com/embed/0_W3McptR3Y" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>                    
                    </div>
                    <Divider />
                    <Row>
                        {/* <Col span={4}>
          Additional Resources:
          </Col>
          <Col span={20}>
          </Col> */}
                        <Col span={4} className="references-header">
                            References:
          </Col>
                        <Col span={20} className="references-text">
                            <ol>
                                <li>
                                FaceNet: A Unified Embedding for Face Recognition and Clustering
                                &nbsp;[<a href="https://www.cv-foundation.org/openaccess/content_cvpr_2015/app/1A_089.pdf">PDF</a>]
                            </li>
                                <li>
                                    OpenFace
                                    &nbsp;[<a href="https://cmusatyalab.github.io/openface/">website</a>,
                                    &nbsp;<a href="https://arxiv.org/abs/1512.04150">Arxiv</a>]
                                </li>
                                <li>
                                    High Quality Face Recognition with Deep Metric Learning
                &nbsp;[<a href="http://blog.dlib.net/2017/02/high-quality-face-recognition-with-deep.html">website</a>]
              </li>
                                <li>
                                    DeViSE: A Deep Visual-Semantic Embedding Model
                &nbsp;[<a href="https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/41473.pdf">PDF</a>]
              </li>
                            </ol>
                        </Col>
                    </Row>
                    </section>
                </article>
            </BlogPostChrome>
            </Layout>
        )
    }
}

export default FacialRecognitionArticle

// Page query for this post. It selects:
//  - `javascriptFrontmatter` for the page's own `$slug`, using the `JSBlogPost_data`
//    fragment (defined outside this file); the component spreads it into BlogPostChrome.
//  - `facenet` and `embedding_gen`: the two article images, looked up by file-name regex
//    and requested as traced-SVG `sizes` at max widths of 600 and 1024 respectively.
//  - `markdownRemark` for the fixed slug "/2017-01-22-a-first-post/".
// NOTE(review): nothing in this file reads `data.markdownRemark`; it looks like a
// leftover from the starter template. The whole `data` object is forwarded to Layout,
// so confirm Layout does not depend on it before removing the selection.
export const pageQuery = graphql`
    query facerecquery($slug: String!) {
      markdownRemark(
        fields: { slug: { eq: "/2017-01-22-a-first-post/" } }
      ) {
        html
      }
      javascriptFrontmatter(fields: { slug: { eq: $slug } }) {
        ...JSBlogPost_data
      }
      facenet: file(
        relativePath: {regex: "/facenet_arch_paper.png/"}) {
        childImageSharp {
          sizes(maxWidth: 600) {
            ...GatsbyImageSharpSizes_tracedSVG
          }
        }
      }    
      embedding_gen: file(
        relativePath: {regex: "/embedding-gen.png/"}) {
        childImageSharp {
          sizes(maxWidth: 1024) {
            ...GatsbyImageSharpSizes_tracedSVG
          }
        }
      }
    }
  `
