// C++ Demo - Facial Expression Recognition
// Detects faces with YuNet, then classifies each face's expression.
#include "opencv2/opencv.hpp"
#include <map>
#include <vector>
#include <string>
#include <iostream>
using namespace std;
using namespace cv;
using namespace dnn;
std::vector<std::pair<int, int>> backend_target_pairs = {
{DNN_BACKEND_OPENCV, DNN_TARGET_CPU},
{DNN_BACKEND_CUDA, DNN_TARGET_CUDA},
{DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16},
{DNN_BACKEND_TIMVX, DNN_TARGET_NPU},
{DNN_BACKEND_CANN, DNN_TARGET_NPU}
};
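// Facial expression recognizer: aligns a detected face to a 112x112 patch
// and classifies it into one of seven expressions.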
class FER
{
private:
Net model;
string modelPath;
// Reference landmark positions for a 112x112 aligned face.
// Renamed from `std` to avoid shadowing the std namespace.
float stdPointsData[5][2] = {
{38.2946, 51.6963},
{73.5318, 51.5014},
{56.0252, 71.7366},
{41.5493, 92.3655},
{70.7299, 92.2041}
};
vector<String> expressionEnum = {
"angry", "disgust", "fearful",
"happy", "neutral", "sad", "surprised"
};
Mat stdPoints = Mat(5, 2, CV_32F, this->stdPointsData);
Size patchSize = Size(112,112);
Scalar imageMean = Scalar(0.5,0.5,0.5);
Scalar imageStd = Scalar(0.5,0.5,0.5);
const String inputNames = "data";
const String outputNames = "label";
int backend_id;
int target_id;
public:
FER(const string& modelPath,
int backend_id = 0,
int target_id = 0)
: modelPath(modelPath), backend_id(backend_id), target_id(target_id)
{
this->model = readNet(modelPath);
this->model.setPreferableBackend(backend_id);
this->model.setPreferableTarget(target_id);
}
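// Align the face to the canonical landmarks, scale pixel values to
// [-1, 1], and pack the result into an NCHW blob.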
Mat preprocess(const Mat& image, const Mat& points)
{
// image alignment
Mat transformation = estimateAffine2D(points, this->stdPoints);
Mat aligned = Mat::zeros(this->patchSize.height, this->patchSize.width, image.type());
warpAffine(image, aligned, transformation, this->patchSize);
// image normalization
aligned.convertTo(aligned, CV_32F, 1.0 / 255.0);
aligned -= imageMean;
aligned /= imageStd;
return blobFromImage(aligned);
}
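// `facePoints` is one YuNet detection row:
// [x, y, w, h, 5 landmark (x, y) pairs, score] -- 15 floats in total.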
String infer(const Mat& image, const Mat& facePoints)
{
Mat points = facePoints(Rect(4, 0, facePoints.cols-5, facePoints.rows)).reshape(2, 5);
Mat inputBlob = preprocess(image, points);
this->model.setInput(inputBlob, this->inputNames);
Mat outputBlob = this->model.forward(this->outputNames);
Point maxLoc;
minMaxLoc(outputBlob, nullptr, nullptr, nullptr, &maxLoc);
return getDesc(maxLoc.x);
}
String getDesc(int ind)
{
if (ind >= 0 && ind < static_cast<int>(this->expressionEnum.size()))
{
return this->expressionEnum[ind];
}
else
{
cerr << "Error: Index out of bounds." << endl;
return "";
}
}
};
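// Thin wrapper around cv::FaceDetectorYN (the YuNet face detector).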
class YuNet
{
public:
YuNet(const string& model_path,
const Size& input_size = Size(320, 320),
float conf_threshold = 0.6f,
float nms_threshold = 0.3f,
int top_k = 5000,
int backend_id = 0,
int target_id = 0)
: model_path_(model_path), input_size_(input_size),
conf_threshold_(conf_threshold), nms_threshold_(nms_threshold),
top_k_(top_k), backend_id_(backend_id), target_id_(target_id)
{
model = FaceDetectorYN::create(model_path_, "", input_size_, conf_threshold_, nms_threshold_, top_k_, backend_id_, target_id_);
}
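// Recreate the detector with a new backend/target pair.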
void setBackendAndTarget(int backend_id, int target_id)
{
backend_id_ = backend_id;
target_id_ = target_id;
model = FaceDetectorYN::create(model_path_, "", input_size_, conf_threshold_, nms_threshold_, top_k_, backend_id_, target_id_);
}
/* Set the detector's input size. Size format: [Width, Height]. */
void setInputSize(const Size& input_size)
{
input_size_ = input_size;
model->setInputSize(input_size_);
}
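// Run detection. Returns an N x 15 CV_32F matrix, one row per detected face.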
Mat infer(const Mat& image)
{
Mat res;
model->detect(image, res);
return res;
}
private:
Ptr<FaceDetectorYN> model;
string model_path_;
Size input_size_;
float conf_threshold_;
float nms_threshold_;
int top_k_;
int backend_id_;
int target_id_;
};
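// Draw each face's bounding box, landmarks, and predicted expression;
// optionally overlay the FPS.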
cv::Mat visualize(const cv::Mat& image, const cv::Mat& faces, const vector<String>& expressions, float fps = -1.f)
{
static cv::Scalar box_color{0, 255, 0};
static std::vector<cv::Scalar> landmark_color{
cv::Scalar(255, 0, 0), // right eye
cv::Scalar( 0, 0, 255), // left eye
cv::Scalar( 0, 255, 0), // nose tip
cv::Scalar(255, 0, 255), // right mouth corner
cv::Scalar( 0, 255, 255) // left mouth corner
};
static cv::Scalar text_color{0, 255, 0};
auto output_image = image.clone();
if (fps >= 0)
{
cv::putText(output_image, cv::format("FPS: %.2f", fps), cv::Point(0, 15), cv::FONT_HERSHEY_SIMPLEX, 0.5, text_color, 2);
}
for (int i = 0; i < faces.rows; ++i)
{
// Draw bounding boxes
int x1 = static_cast<int>(faces.at<float>(i, 0));
int y1 = static_cast<int>(faces.at<float>(i, 1));
int w = static_cast<int>(faces.at<float>(i, 2));
int h = static_cast<int>(faces.at<float>(i, 3));
cv::rectangle(output_image, cv::Rect(x1, y1, w, h), box_color, 2);
// Expression as text
String exp = expressions[i];
cv::putText(output_image, exp, cv::Point(x1, y1+12), cv::FONT_HERSHEY_DUPLEX, 0.5, text_color);
// Draw landmarks
for (int j = 0; j < static_cast<int>(landmark_color.size()); ++j)
{
int x = static_cast<int>(faces.at<float>(i, 2*j+4)), y = static_cast<int>(faces.at<float>(i, 2*j+5));
cv::circle(output_image, cv::Point(x, y), 2, landmark_color[j], 2);
}
}
return output_image;
}
string keys =
"{ help h | | Print help message. }"
"{ model m | facial_expression_recognition_mobilefacenet_2022july.onnx | Usage: Path to the model, defaults to facial_expression_recognition_mobilefacenet_2022july.onnx }"
"{ yunet_model ym | ../face_detection_yunet/face_detection_yunet_2023mar.onnx | Usage: Path to the face detection yunet model, defaults to face_detection_yunet_2023mar.onnx }"
"{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
"{ backend_target t | 0 | Choose one of the backend-target pair to run this demo:\n"
"0: (default) OpenCV implementation + CPU,\n"
"1: CUDA + GPU (CUDA),\n"
"2: CUDA + GPU (CUDA FP16),\n"
"3: TIM-VX + NPU,\n"
"4: CANN + NPU}"
"{ save s | false | Specify to save results.}"
"{ vis v | true | Specify to open a window for result visualization.}"
;
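// Example invocation (the binary name is illustrative):
//   ./demo -m=facial_expression_recognition_mobilefacenet_2022july.onnx -i=image.jpg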
int main(int argc, char** argv)
{
CommandLineParser parser(argc, argv, keys);
parser.about("Facial Expression Recognition");
if (parser.has("help"))
{
parser.printMessage();
return 0;
}
string modelPath = parser.get<string>("model");
string yunetModelPath = parser.get<string>("yunet_model");
string inputPath = parser.get<string>("input");
int backendTarget = parser.get<int>("backend_target");
bool saveFlag = parser.get<bool>("save");
bool visFlag = parser.get<bool>("vis");
if (modelPath.empty())
CV_Error(Error::StsError, "Path to the facial expression recognition model is empty");
if (yunetModelPath.empty())
CV_Error(Error::StsError, "Path to the face detection model is empty");
if (backendTarget < 0 || backendTarget >= static_cast<int>(backend_target_pairs.size()))
CV_Error(Error::StsError, "Invalid backend-target pair id: " + std::to_string(backendTarget));
YuNet faceDetectionModel(yunetModelPath);
FER expressionRecognitionModel(modelPath, backend_target_pairs[backendTarget].first, backend_target_pairs[backendTarget].second);
VideoCapture cap;
if (!inputPath.empty())
cap.open(samples::findFile(inputPath));
else
cap.open(0);
if (!cap.isOpened())
CV_Error(Error::StsError, "Cannot opend video or file");
Mat frame;
static const std::string kWinName = "Facial Expression Demo";
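// Process frames until any key is pressed.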
while (waitKey(1) < 0)
{
cap >> frame;
if (frame.empty())
{
if(inputPath.empty())
cout << "Frame is empty" << endl;
break;
}
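// The detector's input size must match the frame size.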
faceDetectionModel.setInputSize(frame.size());
Mat faces = faceDetectionModel.infer(frame);
vector<String> expressions;
for (int i = 0; i < faces.rows; ++i)
{
Mat face = faces.row(i);
String exp = expressionRecognitionModel.infer(frame, face);
expressions.push_back(exp);
int x1 = static_cast<int>(faces.at<float>(i, 0));
int y1 = static_cast<int>(faces.at<float>(i, 1));
int w = static_cast<int>(faces.at<float>(i, 2));
int h = static_cast<int>(faces.at<float>(i, 3));
float conf = faces.at<float>(i, 14);
std::cout << cv::format("%d: x1=%d, y1=%d, w=%d, h=%d, conf=%.4f expression=%s\n", i, x1, y1, w, h, conf, exp.c_str());
}
Mat res_frame = visualize(frame, faces, expressions);
if(visFlag || inputPath.empty())
{
imshow(kWinName, res_frame);
if(!inputPath.empty())
waitKey(0);
}
if(saveFlag)
{
cout << "Results are saved to result.jpg" << endl;
cv::imwrite("result.jpg", res_frame);
}
}
return 0;
}