The ESP32-CAM is a $5 module combining an ESP32-S chip with an OV2640 2MP camera. It has enough processing power to run real-time face detection at up to 800×600 resolution and stream the annotated video to a browser — all over WiFi, no cloud required.
// What you'll build: ESP32-CAM streams live MJPEG video with face detection bounding boxes drawn on each frame. Accessible from any browser on your network. Optionally logs detection events to serial.
// AD SLOT — IN-CONTENT RESPONSIVE
Hardware & Wiring
| Component | Notes |
|---|---|
| ESP32-CAM (AI-Thinker) | Most common variant, OV2640 included |
| FTDI USB-Serial adapter (3.3V) | For programming only — ESP32-CAM has no USB |
| Jumper wire: IO0 → GND | Required during flashing only, remove after |
| External 5V 1A power supply | USB from PC often insufficient for camera |
// Warning: Always use a 5V 1A dedicated supply when running the camera. Powering from a 3.3V FTDI line causes random crashes and brownout resets.
Arduino IDE Setup
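- In Arduino IDE: File → Preferences → Additional board URLs → add the ESP32 board URL (https://raw.githubusercontent.com/espressif/arduino-esp32/gh-pages/package_esp32_index.json), then install the "esp32" package from Tools → Board → Boards Manager. Use a 1.0.x release of the core: the fd_forward.h face detection library used below was removed in 2.x.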
- Tools → Board → AI Thinker ESP32-CAM
- Partition Scheme: Huge APP (3MB No OTA)
- Bridge IO0 to GND, press the ESP32-CAM reset button, then upload
Face Detection Sketch
// esp32cam_face_detect.ino
#include "esp_camera.h"
#include "img_converters.h" // fmt2rgb888(): decodes the JPEG frame to RGB888 for the detector
#include <WiFi.h>
#include "esp_http_server.h"
#include "fd_forward.h" // face detection (included in ESP32 Arduino core)
// AI-Thinker ESP32-CAM pin map
// GPIO numbers for the camera interface; setupCamera() copies each of these
// into the matching camera_config_t field.
// Power-down and reset lines (pin_pwdn / pin_reset).
#define PWDN_GPIO_NUM 32
#define RESET_GPIO_NUM -1
// Camera clock (pin_xclk).
#define XCLK_GPIO_NUM 0
// Camera control bus data/clock (pin_sscb_sda / pin_sscb_scl).
#define SIOD_GPIO_NUM 26
#define SIOC_GPIO_NUM 27
// Parallel data lines: Y2..Y9 are assigned to pin_d0..pin_d7 respectively.
#define Y9_GPIO_NUM 35
#define Y8_GPIO_NUM 34
#define Y7_GPIO_NUM 39
#define Y6_GPIO_NUM 36
#define Y5_GPIO_NUM 21
#define Y4_GPIO_NUM 19
#define Y3_GPIO_NUM 18
#define Y2_GPIO_NUM 5
// Frame/line sync and pixel clock (pin_vsync / pin_href / pin_pclk).
#define VSYNC_GPIO_NUM 25
#define HREF_GPIO_NUM 23
#define PCLK_GPIO_NUM 22
// WiFi credentials passed to WiFi.begin() in setup(); replace the
// placeholders with your network's values before flashing.
const char* SSID = "YOUR_SSID";
const char* PASSWORD = "YOUR_PASSWORD";
// Face detector configuration handed to face_detect(). Starts zeroed; its
// fields are filled in by streamHandler() before detection runs.
static mtmn_config_t mtmn_config = {0};
void setupCamera() {
camera_config_t config;
config.ledc_channel = LEDC_CHANNEL_0;
config.ledc_timer = LEDC_TIMER_0;
config.pin_d0 = Y2_GPIO_NUM; config.pin_d1 = Y3_GPIO_NUM;
config.pin_d2 = Y4_GPIO_NUM; config.pin_d3 = Y5_GPIO_NUM;
config.pin_d4 = Y6_GPIO_NUM; config.pin_d5 = Y7_GPIO_NUM;
config.pin_d6 = Y8_GPIO_NUM; config.pin_d7 = Y9_GPIO_NUM;
config.pin_xclk = XCLK_GPIO_NUM;
config.pin_pclk = PCLK_GPIO_NUM;
config.pin_vsync = VSYNC_GPIO_NUM;
config.pin_href = HREF_GPIO_NUM;
config.pin_sscb_sda = SIOD_GPIO_NUM;
config.pin_sscb_scl = SIOC_GPIO_NUM;
config.pin_pwdn = PWDN_GPIO_NUM;
config.pin_reset = RESET_GPIO_NUM;
config.xclk_freq_hz = 20000000;
config.pixel_format = PIXFORMAT_JPEG;
config.frame_size = FRAMESIZE_QVGA; // 320x240 for detection
config.jpeg_quality = 12;
config.fb_count = 1;
esp_camera_init(&config);
}
// Stream handler — serves an endless MJPEG (multipart/x-mixed-replace) stream
// and logs the number of faces detected in each frame to Serial. The frames
// themselves are sent unmodified; no boxes are drawn on them.
//
// req: the HTTP request being served.
// Returns the result of the last HTTP send (an error code once the client
// disconnects), or ESP_FAIL if the camera repeatedly fails to deliver a frame.
esp_err_t streamHandler(httpd_req_t* req) {
  static const char PART_HEADER[] = "--frame\r\nContent-Type: image/jpeg\r\n\r\n";
  static const char PART_TRAILER[] = "\r\n";
  const int MAX_CAPTURE_FAILURES = 10;

  // Detector settings. Beyond type/min_face, the pyramid and per-stage
  // thresholds must be non-zero for the detector to work; these values follow
  // the CameraWebServer example shipped with the ESP32 Arduino core 1.0.x.
  mtmn_config.type = FAST;
  mtmn_config.min_face = 80;
  mtmn_config.pyramid = 0.707;
  mtmn_config.pyramid_times = 4;
  mtmn_config.p_threshold.score = 0.6;
  mtmn_config.p_threshold.nms = 0.7;
  mtmn_config.p_threshold.candidate_number = 20;
  mtmn_config.r_threshold.score = 0.7;
  mtmn_config.r_threshold.nms = 0.7;
  mtmn_config.r_threshold.candidate_number = 10;
  mtmn_config.o_threshold.score = 0.7;
  mtmn_config.o_threshold.nms = 0.7;
  mtmn_config.o_threshold.candidate_number = 1;

  esp_err_t res = httpd_resp_set_type(req, "multipart/x-mixed-replace; boundary=frame");
  int captureFailures = 0;

  // Keep streaming until a send fails, which is how a closed connection
  // shows up; otherwise this loop would never release the server task.
  while (res == ESP_OK) {
    camera_fb_t* fb = esp_camera_fb_get();
    if (!fb) {
      // Back off briefly instead of spinning, and give up if the camera
      // stays unavailable.
      if (++captureFailures >= MAX_CAPTURE_FAILURES) {
        Serial.println("Camera capture failed repeatedly, closing stream");
        res = ESP_FAIL;
        break;
      }
      delay(10);
      continue;
    }
    captureFailures = 0;

    // Run face detection. The detector works on an RGB888 matrix, so the
    // JPEG frame is decoded into one first. A failure here only skips
    // detection for this frame; the frame is still streamed.
    dl_matrix3du_t* image_matrix = dl_matrix3du_alloc(1, fb->width, fb->height, 3);
    if (!image_matrix) {
      Serial.println("Face detection skipped: out of memory");
    } else {
      if (fmt2rgb888(fb->buf, fb->len, fb->format, image_matrix->item)) {
        box_array_t* net_boxes = face_detect(image_matrix, &mtmn_config);
        if (net_boxes) {
          Serial.printf("Faces detected: %d\n", net_boxes->len);
          // Release every array owned by the result, then the result itself,
          // using the allocator of the detection library.
          dl_lib_free(net_boxes->score);
          dl_lib_free(net_boxes->box);
          dl_lib_free(net_boxes->landmark);
          dl_lib_free(net_boxes);
        }
      } else {
        Serial.println("Face detection skipped: frame decode failed");
      }
      dl_matrix3du_free(image_matrix);
    }

    // One multipart section per frame: header, JPEG bytes, trailing CRLF.
    res = httpd_resp_send_chunk(req, PART_HEADER, HTTPD_RESP_USE_STRLEN);
    if (res == ESP_OK) {
      res = httpd_resp_send_chunk(req, (const char*)fb->buf, fb->len);
    }
    if (res == ESP_OK) {
      res = httpd_resp_send_chunk(req, PART_TRAILER, sizeof(PART_TRAILER) - 1);
    }

    // Always hand the buffer back to the driver, even when a send failed.
    esp_camera_fb_return(fb);
  }
  return res;
}
void setup() {
Serial.begin(115200);
setupCamera();
WiFi.begin(SSID, PASSWORD);
while (WiFi.status() != WL_CONNECTED) delay(500);
Serial.println("Stream at: http://" + WiFi.localIP().toString() + "/stream");
httpd_config_t config = HTTPD_DEFAULT_CONFIG();
httpd_handle_t server = NULL;
httpd_start(&server, &config);
httpd_uri_t uri = { .uri="/stream", .method=HTTP_GET, .handler=streamHandler };
httpd_register_uri_handler(server, &uri);
}
// Arduino main loop. Intentionally empty: setup() starts the HTTP server and
// the streaming work happens inside streamHandler().
void loop()
{
}
Resolution vs Speed
| Frame size | Resolution | Face detect FPS | Stream FPS |
|---|---|---|---|
| FRAMESIZE_QQVGA | 160×120 | ~25 fps | ~30 fps |
| FRAMESIZE_QVGA | 320×240 | ~8 fps | ~15 fps |
| FRAMESIZE_VGA | 640×480 | ~3 fps | ~8 fps |
| FRAMESIZE_SVGA | 800×600 | ~1 fps | ~4 fps |
// Tip: Use FRAMESIZE_QVGA for detection (fast neural network pass) and switch to FRAMESIZE_VGA for the stream only when no detection is running.