The ESP32-CAM is a $5 module combining an ESP32-S chip with an OV2640 2MP camera. It has enough processing power to run real-time face detection at up to 800×600 resolution and stream the annotated video to a browser — all over WiFi, no cloud required.

// What you'll build: ESP32-CAM streams live MJPEG video with face detection bounding boxes drawn on each frame. Accessible from any browser on your network. Optionally logs detection events to serial.
Pipeline: OV2640 2MP camera (JPEG output) → DVP → ESP32-S (frame buffer → face detection → draw bounding box → MJPEG encode) → WiFi 802.11 (MJPEG stream) → Browser (live stream with face boxes)
// AD SLOT — IN-CONTENT RESPONSIVE

Hardware & Wiring

Component | Notes
ESP32-CAM (AI-Thinker) | Most common variant, OV2640 included
FTDI USB-Serial adapter (3.3V) | For programming only — ESP32-CAM has no USB
Jumper wire: IO0 → GND | Required during flashing only, remove after
External 5V 1A power supply | USB from PC often insufficient for camera
// Warning: Always use a 5V 1A dedicated supply when running the camera. Powering from a 3.3V FTDI line causes random crashes and brownout resets.

Arduino IDE Setup

  1. In Arduino IDE: File → Preferences → Additional board URLs → add the ESP32 board URL
  2. Tools → Board → AI Thinker ESP32-CAM
  3. Partition Scheme: Huge APP (3MB No OTA)
  4. Bridge IO0 to GND, press the ESP32-CAM reset button, then upload

Face Detection Sketch

// esp32cam_face_detect.ino
#include "esp_camera.h"
#include "img_converters.h"   // fmt2rgb888(): JPEG -> RGB888 for the detector
#include <WiFi.h>
#include "esp_http_server.h"
#include "fd_forward.h"   // face detection (included in ESP32 Arduino core)
// NOTE(review): fd_forward.h appears to ship only with ESP32 Arduino core 1.0.x — confirm your installed core version.

// AI-Thinker ESP32-CAM pin map
// Each macro is copied into the matching camera_config_t field by setupCamera().
#define PWDN_GPIO_NUM  32   // -> config.pin_pwdn
#define RESET_GPIO_NUM -1   // -> config.pin_reset; presumably -1 means "no reset line wired" — confirm against esp32-camera docs
#define XCLK_GPIO_NUM   0   // -> config.pin_xclk
#define SIOD_GPIO_NUM  26   // -> config.pin_sscb_sda
#define SIOC_GPIO_NUM  27   // -> config.pin_sscb_scl
#define Y9_GPIO_NUM    35   // -> config.pin_d7
#define Y8_GPIO_NUM    34   // -> config.pin_d6
#define Y7_GPIO_NUM    39   // -> config.pin_d5
#define Y6_GPIO_NUM    36   // -> config.pin_d4
#define Y5_GPIO_NUM    21   // -> config.pin_d3
#define Y4_GPIO_NUM    19   // -> config.pin_d2
#define Y3_GPIO_NUM    18   // -> config.pin_d1
#define Y2_GPIO_NUM     5   // -> config.pin_d0
#define VSYNC_GPIO_NUM 25   // -> config.pin_vsync
#define HREF_GPIO_NUM  23   // -> config.pin_href
#define PCLK_GPIO_NUM  22   // -> config.pin_pclk

// WiFi credentials passed to WiFi.begin() in setup().
// The values here are placeholders; replace them before flashing.
const char* SSID     = "YOUR_SSID";
const char* PASSWORD = "YOUR_PASSWORD";

// Settings handed to face_detect(). Zero-initialised here; streamHandler()
// fills in the fields it needs before running detection.
static mtmn_config_t mtmn_config = {0};

void setupCamera() {
  camera_config_t config;
  config.ledc_channel = LEDC_CHANNEL_0;
  config.ledc_timer   = LEDC_TIMER_0;
  config.pin_d0 = Y2_GPIO_NUM; config.pin_d1 = Y3_GPIO_NUM;
  config.pin_d2 = Y4_GPIO_NUM; config.pin_d3 = Y5_GPIO_NUM;
  config.pin_d4 = Y6_GPIO_NUM; config.pin_d5 = Y7_GPIO_NUM;
  config.pin_d6 = Y8_GPIO_NUM; config.pin_d7 = Y9_GPIO_NUM;
  config.pin_xclk = XCLK_GPIO_NUM;
  config.pin_pclk = PCLK_GPIO_NUM;
  config.pin_vsync = VSYNC_GPIO_NUM;
  config.pin_href  = HREF_GPIO_NUM;
  config.pin_sscb_sda = SIOD_GPIO_NUM;
  config.pin_sscb_scl = SIOC_GPIO_NUM;
  config.pin_pwdn  = PWDN_GPIO_NUM;
  config.pin_reset = RESET_GPIO_NUM;
  config.xclk_freq_hz = 20000000;
  config.pixel_format = PIXFORMAT_JPEG;
  config.frame_size   = FRAMESIZE_QVGA;  // 320x240 for detection
  config.jpeg_quality = 12;
  config.fb_count     = 1;
  esp_camera_init(&config);
}

// Stream handler — serves an MJPEG (multipart/x-mixed-replace) response.
//
// For every captured JPEG frame it:
//   1. decodes the frame to RGB888 and runs face detection on it, printing
//      the number of detected faces to Serial (boxes are NOT drawn into the
//      streamed image; the original JPEG is sent unchanged);
//   2. sends the frame as one multipart part.
//
// The loop ends when a capture fails or when any send fails (typically because
// the client disconnected).
//
// req: the HTTP request being answered.
// Returns the first non-OK esp_err_t that ended the stream (ESP_FAIL when a
// frame could not be captured).
esp_err_t streamHandler(httpd_req_t* req) {
  // Detector settings. The pyramid/threshold values below are the ones used by
  // the arduino-esp32 CameraWebServer example; leaving them at zero gives the
  // detector no usable scale step or score thresholds.
  mtmn_config.type     = FAST;
  mtmn_config.min_face = 80;
  mtmn_config.pyramid       = 0.707;
  mtmn_config.pyramid_times = 4;
  mtmn_config.p_threshold.score = 0.6;
  mtmn_config.p_threshold.nms   = 0.7;
  mtmn_config.p_threshold.candidate_number = 20;
  mtmn_config.r_threshold.score = 0.7;
  mtmn_config.r_threshold.nms   = 0.7;
  mtmn_config.r_threshold.candidate_number = 10;
  mtmn_config.o_threshold.score = 0.7;
  mtmn_config.o_threshold.nms   = 0.7;
  mtmn_config.o_threshold.candidate_number = 1;

  esp_err_t res = httpd_resp_set_type(req, "multipart/x-mixed-replace; boundary=frame");
  if (res != ESP_OK) {
    return res;
  }

  while (res == ESP_OK) {
    camera_fb_t* fb = esp_camera_fb_get();
    if (!fb) {
      // Bail out instead of spinning: a tight retry loop never yields and
      // would keep the server task busy forever if the camera is down.
      Serial.println("Camera capture failed");
      res = ESP_FAIL;
      break;
    }

    // Run face detection. face_detect() works on an RGB888 matrix, so the
    // JPEG frame is decoded into a temporary buffer first. If the buffer
    // cannot be allocated or decoded, detection is skipped for this frame
    // but the frame is still streamed.
    dl_matrix3du_t* rgb = dl_matrix3du_alloc(1, fb->width, fb->height, 3);
    if (rgb) {
      if (fmt2rgb888(fb->buf, fb->len, fb->format, rgb->item)) {
        box_array_t* net_boxes = face_detect(rgb, &mtmn_config);
        if (net_boxes) {
          Serial.printf("Faces detected: %d\n", net_boxes->len);
          // The result owns three arrays plus the struct itself.
          dl_lib_free(net_boxes->score);
          dl_lib_free(net_boxes->box);
          dl_lib_free(net_boxes->landmark);
          dl_lib_free(net_boxes);
        }
      }
      dl_matrix3du_free(rgb);
    }

    // One multipart part: boundary + header, JPEG payload, trailing CRLF.
    // Stop at the first failed send so a closed connection ends the handler.
    res = httpd_resp_send_chunk(req, "--frame\r\nContent-Type: image/jpeg\r\n\r\n", HTTPD_RESP_USE_STRLEN);
    if (res == ESP_OK) {
      res = httpd_resp_send_chunk(req, (const char*)fb->buf, fb->len);
    }
    if (res == ESP_OK) {
      res = httpd_resp_send_chunk(req, "\r\n", 2);
    }

    // Always hand the frame buffer back, including on the error paths above.
    esp_camera_fb_return(fb);
  }
  return res;
}

void setup() {
  Serial.begin(115200);
  setupCamera();
  WiFi.begin(SSID, PASSWORD);
  while (WiFi.status() != WL_CONNECTED) delay(500);
  Serial.println("Stream at: http://" + WiFi.localIP().toString() + "/stream");

  httpd_config_t config = HTTPD_DEFAULT_CONFIG();
  httpd_handle_t server = NULL;
  httpd_start(&server, &config);
  httpd_uri_t uri = { .uri="/stream", .method=HTTP_GET, .handler=streamHandler };
  httpd_register_uri_handler(server, &uri);
}

// Intentionally empty: setup() starts the HTTP server and registers
// streamHandler(), so this sketch has no per-iteration work to do here.
void loop() {}

Resolution vs Speed

Frame size | Resolution | Face detect FPS | Stream FPS
FRAMESIZE_QQVGA | 160×120 | ~25 fps | ~30 fps
FRAMESIZE_QVGA | 320×240 | ~8 fps | ~15 fps
FRAMESIZE_VGA | 640×480 | ~3 fps | ~8 fps
FRAMESIZE_SVGA | 800×600 | ~1 fps | ~4 fps
// Tip: Use FRAMESIZE_QVGA for detection (fast neural network pass) and switch to FRAMESIZE_VGA for the stream only when no detection is running.
A
Engr. Aamir Aziz Butt
PhD Researcher · IoT Engineer · Founder ESPSTACK

PhD candidate at Muslim Youth University, Islamabad. MS Computer Engineering from COMSATS. 10+ years of IoT development experience across ESP32, Jetson Nano, and cloud platforms.