|
#include <iostream>
#include <string>
#include <utility>
#include <vector>

#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
|
|
|
using namespace std; |
|
using namespace cv; |
|
using namespace dnn; |
|
|
|
// Backend/target combinations selectable through the "backend" command-line
// option. The option's integer value is used as an index into this table, so
// the order here must stay in sync with the numbering shown in the help text.
vector< pair<cv::dnn::Backend, cv::dnn::Target> > backendTargetPairs = {
    { dnn::DNN_BACKEND_OPENCV, dnn::DNN_TARGET_CPU },        // 0
    { dnn::DNN_BACKEND_CUDA,   dnn::DNN_TARGET_CUDA },       // 1
    { dnn::DNN_BACKEND_CUDA,   dnn::DNN_TARGET_CUDA_FP16 },  // 2
    { dnn::DNN_BACKEND_TIMVX,  dnn::DNN_TARGET_NPU },        // 3
    { dnn::DNN_BACKEND_CANN,   dnn::DNN_TARGET_NPU }         // 4
};
|
|
|
|
|
// Command-line option table in cv::CommandLineParser syntax:
//   "{ name [alias] | default value | help text }"
// The adjacent string literals are concatenated into a single specification.
// Help text must not contain '{', '}' or '|' because those delimit entries.
std::string keys =
    "{ help h | | Print help message. }"
    "{ model m | text_detection_cn_ppocrv3_2023may.onnx | Usage: Path to the model file, defaults to text_detection_cn_ppocrv3_2023may.onnx }"
    "{ input i | | Usage: Path to input image or video file. Skip this argument to capture frames from a camera.}"
    "{ width | 736 | Usage: Resize input image to certain width, default = 736. It should be a multiple of 32.}"
    "{ height | 736 | Usage: Resize input image to certain height, default = 736. It should be a multiple of 32.}"
    "{ binary_threshold | 0.3 | Usage: Threshold of the binary map, default = 0.3.}"
    "{ polygon_threshold | 0.5 | Usage: Threshold of polygons, default = 0.5.}"
    "{ max_candidates | 200 | Usage: Set maximum number of polygon candidates, default = 200.}"
    "{ unclip_ratio | 2.0 | Usage: The unclip ratio of the detected text region, which determines the output size, default = 2.0.}"
    "{ save s | true | Usage: Specify to save file with results (i.e. bounding box, confidence level). Invalid in case of camera input.}"
    "{ viz v | true | Usage: Specify to open a new window to show results. Invalid in case of camera input.}"
    "{ backend bt | 0 | Choose one of computation backends: "
    "0: (default) OpenCV implementation + CPU, "
    "1: CUDA + GPU (CUDA), "
    "2: CUDA + GPU (CUDA FP16), "
    "3: TIM-VX + NPU, "
    "4: CANN + NPU}";
|
|
|
|
|
class PPOCRDet { |
|
public: |
|
|
|
PPOCRDet(string modPath, Size inSize = Size(736, 736), float binThresh = 0.3, |
|
float polyThresh = 0.5, int maxCand = 200, double unRatio = 2.0, |
|
dnn::Backend bId = DNN_BACKEND_DEFAULT, dnn::Target tId = DNN_TARGET_CPU) : modelPath(modPath), inputSize(inSize), binaryThreshold(binThresh), |
|
polygonThreshold(polyThresh), maxCandidates(maxCand), unclipRatio(unRatio), |
|
backendId(bId), targetId(tId) |
|
{ |
|
this->model = TextDetectionModel_DB(readNet(modelPath)); |
|
this->model.setPreferableBackend(backendId); |
|
this->model.setPreferableTarget(targetId); |
|
|
|
this->model.setBinaryThreshold(binaryThreshold); |
|
this->model.setPolygonThreshold(polygonThreshold); |
|
this->model.setUnclipRatio(unclipRatio); |
|
this->model.setMaxCandidates(maxCandidates); |
|
|
|
this->model.setInputParams(1.0 / 255.0, inputSize, Scalar(122.67891434, 116.66876762, 104.00698793)); |
|
} |
|
pair< vector<vector<Point>>, vector<float> > infer(Mat image) { |
|
CV_Assert(image.rows == this->inputSize.height && "height of input image != net input size "); |
|
CV_Assert(image.cols == this->inputSize.width && "width of input image != net input size "); |
|
vector<vector<Point>> pt; |
|
vector<float> confidence; |
|
this->model.detect(image, pt, confidence); |
|
return make_pair< vector<vector<Point>> &, vector< float > &>(pt, confidence); |
|
} |
|
|
|
private: |
|
string modelPath; |
|
TextDetectionModel_DB model; |
|
Size inputSize; |
|
float binaryThreshold; |
|
float polygonThreshold; |
|
int maxCandidates; |
|
double unclipRatio; |
|
dnn::Backend backendId; |
|
dnn::Target targetId; |
|
|
|
}; |
|
|
|
/**
 * @brief Draws detection results on a copy of an image.
 *
 * @param image      Source image; it is copied, not drawn on.
 * @param results    Detection output; only results.first (the polygons) is drawn.
 * @param fps        When greater than 0, an "FPS: x.xx" label is rendered at (0, 15).
 * @param boxColor   Colour of the polygon outlines.
 * @param textColor  Colour of the FPS label.
 * @param isClosed   Passed to polylines(): whether each polygon is closed.
 * @param thickness  Line thickness of the polygon outlines.
 * @return The annotated copy.
 */
Mat visualize(Mat image, pair< vector<vector<Point>>, vector<float> >&results, double fps=-1, Scalar boxColor=Scalar(0, 255, 0), Scalar textColor=Scalar(0, 0, 255), bool isClosed=true, int thickness=2)
{
    Mat canvas = image.clone();

    const bool showFps = fps > 0;
    if (showFps)
    {
        const String label = cv::format("FPS: %.2f", fps);
        const Point origin(0, 15);
        putText(canvas, label, origin, FONT_HERSHEY_SIMPLEX, 0.5, textColor);
    }

    // Polygons are drawn after the label, so they overwrite it where they overlap.
    const vector<vector<Point>> &polygons = results.first;
    polylines(canvas, polygons, isClosed, boxColor, thickness);

    return canvas;
}
|
|
|
int main(int argc, char** argv) |
|
{ |
|
CommandLineParser parser(argc, argv, keys); |
|
|
|
parser.about("Use this program to run Real-time Scene Text Detection with Differentiable Binarization in opencv Zoo using OpenCV."); |
|
if (parser.has("help")) |
|
{ |
|
parser.printMessage(); |
|
return 0; |
|
} |
|
|
|
int backendTargetid = parser.get<int>("backend"); |
|
String modelName = parser.get<String>("model"); |
|
|
|
if (modelName.empty()) |
|
{ |
|
CV_Error(Error::StsError, "Model file " + modelName + " not found"); |
|
} |
|
|
|
Size inpSize(parser.get<int>("width"), parser.get<int>("height")); |
|
float binThresh = parser.get<float>("binary_threshold"); |
|
float polyThresh = parser.get<float>("polygon_threshold"); |
|
int maxCand = parser.get<int>("max_candidates"); |
|
double unRatio = parser.get<float>("unclip_ratio"); |
|
bool save = parser.get<bool>("save"); |
|
bool viz = parser.get<bool>("viz"); |
|
|
|
PPOCRDet model(modelName, inpSize, binThresh, polyThresh, maxCand, unRatio, backendTargetPairs[backendTargetid].first, backendTargetPairs[backendTargetid].second); |
|
|
|
|
|
VideoCapture cap; |
|
if (parser.has("input")) |
|
cap.open(parser.get<String>("input")); |
|
else |
|
cap.open(0); |
|
if (!cap.isOpened()) |
|
CV_Error(Error::StsError, "Cannot open video or file"); |
|
Mat originalImage; |
|
static const std::string kWinName = modelName; |
|
while (waitKey(1) < 0) |
|
{ |
|
cap >> originalImage; |
|
if (originalImage.empty()) |
|
{ |
|
if (parser.has("input")) |
|
{ |
|
cout << "Frame is empty" << endl; |
|
break; |
|
} |
|
else |
|
continue; |
|
} |
|
int originalW = originalImage.cols; |
|
int originalH = originalImage.rows; |
|
double scaleHeight = originalH / double(inpSize.height); |
|
double scaleWidth = originalW / double(inpSize.width); |
|
Mat image; |
|
resize(originalImage, image, inpSize); |
|
|
|
|
|
TickMeter tm; |
|
tm.start(); |
|
pair< vector<vector<Point>>, vector<float> > results = model.infer(image); |
|
tm.stop(); |
|
auto x = results.first; |
|
|
|
for (auto &pts : results.first) |
|
{ |
|
for (int i = 0; i < 4; i++) |
|
{ |
|
pts[i].x = int(pts[i].x * scaleWidth); |
|
pts[i].y = int(pts[i].y * scaleHeight); |
|
} |
|
} |
|
originalImage = visualize(originalImage, results, tm.getFPS()); |
|
tm.reset(); |
|
if (parser.has("input")) |
|
{ |
|
if (save) |
|
{ |
|
cout << "Result image saved to result.jpg\n"; |
|
imwrite("result.jpg", originalImage); |
|
} |
|
if (viz) |
|
{ |
|
imshow(kWinName, originalImage); |
|
waitKey(0); |
|
} |
|
} |
|
else |
|
imshow(kWinName, originalImage); |
|
} |
|
return 0; |
|
} |
|
|
|
|
|
|