# Samples and tutorials for the Dnn High Level API #15240
@@ -0,0 +1,49 @@
# High Level API: Usage and tips {#tutorial_dnn_high_level_api}

### Introduction

In this tutorial we will go through how to use the new OpenCV DNN high level API. The DNN module allows us to run inference on deep neural networks exported from other frameworks, such as Caffe, TensorFlow, ONNX, and Torch.

### Usage

Here is a small example of how to run inference with each API module. For more detailed examples, check the [samples](https://github.com/opencv/opencv/tree/master/samples/dnn).

#### Classification Module

@snippet dnn/classification.cpp Read and initialize network
@snippet dnn/classification.cpp Set Input Parameters
@snippet dnn/classification.cpp Network Forward pass

This will return the predicted class and the confidence of the model.
#### Detection Module

@snippet dnn/detection.cpp Read and initialize network
@snippet dnn/detection.cpp Set Input Parameters
@snippet dnn/detection.cpp Network Forward pass

This will store the predicted classes in ``classIds``, the bounding boxes in ``boxes``, and the confidences of the model in ``confidences``.
#### Segmentation Module

@snippet dnn/segmentation.cpp Read and initialize network
@snippet dnn/segmentation.cpp Set Input Parameters
@snippet dnn/segmentation.cpp Network Forward pass

This will store the segmentation map in ``mask``.
#### Keypoints Module

@snippet dnn/keypoints.cpp Read and initialize network
@snippet dnn/keypoints.cpp Set Input Parameters
@snippet dnn/keypoints.cpp Network Forward pass

This will return the predicted keypoints.
### Some explanations

If you are new to machine learning you might be wondering why we need those parameters: scale, size, mean, etc. Statistical models (e.g. neural networks) are usually trained on normalized data to help convergence. When working with images, it is common to do this normalization per channel: you take every image, subtract the per-channel mean, and divide by the per-channel standard deviation (std). Note that the second operation (dividing by the std) is done to limit the range of our data, and since the values in an image are already limited (0-255), you will sometimes see this step skipped, which amounts to dividing by 1. Since the OpenCV DNN module is based on Caffe, and Caffe skips this step (i.e. divides the image by 1), this operation is not integrated into the API directly. So if we really want to normalize the image we have to set the **scale** parameter to the inverse of the standard deviation. Thus, the **scale** parameter represents the value we will scale our image by, and the **mean** represents the per-channel mean we want to subtract from it. It is also common to resize the images to a fixed size for convenience before feeding them to the network; this size is specified by the **size** parameter in our code. Last but not least, the **swapRB** flag indicates whether to swap the channel order of a BGR image to RGB, or vice versa. OpenCV reads images in BGR format, while other frameworks, such as PIL, read them in RGB. Setting this flag to **true** will swap the image channels and the **mean** vector accordingly.
@@ -56,3 +56,11 @@ Deep Neural Networks (dnn module) {#tutorial_table_of_content_dnn}
  *Author:* Dmitry Kurtaev

  How to define custom layers to import networks.

- @subpage tutorial_dnn_high_level_api

  *Compatibility:* \> OpenCV 4.1.1

  *Author:* Diego Velazquez

  How you can use the new High Level API for the DNN module, plus some general advice.
@@ -12,7 +12,6 @@ std::string keys =

```cpp
    "{ @alias | | An alias name of model to extract preprocessing parameters from models.yml file. }"
    "{ zoo | models.yml | An optional path to file with preprocessing parameters }"
    "{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
    "{ classes | | Optional path to a text file with names of classes. }"
    "{ backend | 0 | Choose one of computation backends: "
        "0: automatically (by default), "
```

@@ -54,7 +53,6 @@ int main(int argc, char** argv)

```cpp
    int inpHeight = parser.get<int>("height");
    String model = findFile(parser.get<String>("model"));
    String config = findFile(parser.get<String>("config"));
    int backendId = parser.get<int>("backend");
    int targetId = parser.get<int>("target");
```

@@ -80,7 +78,9 @@ int main(int argc, char** argv)

```cpp
    CV_Assert(!model.empty());

    //! [Read and initialize network]
    // AlexNet
    // https://s3.amazonaws.com/download.onnx/models/opset_8/bvlc_alexnet.tar.gz
    ClassificationModel net(model, config); // create our model
    net.setPreferableBackend(backendId);
    net.setPreferableTarget(targetId);
    //! [Read and initialize network]
```

@@ -97,8 +97,12 @@ int main(int argc, char** argv)

```cpp
        cap.open(0);
    //! [Open a video file or an image file or a camera stream]

    //! [Set Input Parameters]
    net.setInputParams(scale, Size(inpWidth, inpHeight), mean, swapRB); // cv::Size is (width, height)
    //! [Set Input Parameters]

    // Process frames.
    Mat frame;
    while (waitKey(1) < 0)
    {
        cap >> frame;
```

@@ -108,23 +112,14 @@ int main(int argc, char** argv)

```cpp
            break;
        }

        //! [Network Forward pass]
        std::pair<int, float> prediction = net.classify(frame);
        //! [Network Forward pass]

        int classId;
        float confidence;
        std::tie(classId, confidence) = prediction;

        // Put efficiency information.
        std::vector<double> layersTimes;
```

> **Member:** Please replace with `int classId; float confidence; std::tie(classId, confidence) = net.classify(frame);`
@@ -0,0 +1,152 @@
```cpp
#include <fstream>
#include <sstream>

#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

#include "common.hpp"

std::string keys =
    "{ help h | | Print help message. }"
    "{ @alias | | An alias name of model to extract preprocessing parameters from models.yml file. }"
    "{ zoo | models.yml | An optional path to file with preprocessing parameters }"
    "{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
    "{ classes | | Optional path to a text file with names of classes. }"
    "{ confidence | 0.5 | Optional threshold to discard low confidence boxes.}"
    "{ nms_thr | 0.0 | Optional IOU threshold to discard highly overlapping boxes.}"
    "{ backend | 0 | Choose one of computation backends: "
        "0: automatically (by default), "
        "1: Halide language (http://halide-lang.org/), "
        "2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
        "3: OpenCV implementation }"
    "{ target | 0 | Choose one of target computation devices: "
        "0: CPU target (by default), "
        "1: OpenCL, "
        "2: OpenCL fp16 (half-float precision), "
        "3: VPU }";

using namespace cv;
using namespace dnn;

std::vector<std::string> classes;

int main(int argc, char** argv)
{
    CommandLineParser parser(argc, argv, keys);

    const std::string modelName = parser.get<String>("@alias");
    const std::string zooFile = parser.get<String>("zoo");

    keys += genPreprocArguments(modelName, zooFile);

    parser = CommandLineParser(argc, argv, keys);
    parser.about("Use this script to run object detection deep learning networks using OpenCV.");
    if (argc == 1 || parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }

    float scale = parser.get<float>("scale");
    Scalar mean = parser.get<Scalar>("mean");
    bool swapRB = parser.get<bool>("rgb");
    int inpWidth = parser.get<int>("width");
    int inpHeight = parser.get<int>("height");
    String model = findFile(parser.get<String>("model"));
    String config = findFile(parser.get<String>("config"));
    int backendId = parser.get<int>("backend");
    int targetId = parser.get<int>("target");
    float confThreshold = parser.get<float>("confidence");
    float nmsThreshold = parser.get<float>("nms_thr");

    // Open file with classes names.
    if (parser.has("classes"))
    {
        std::string file = parser.get<String>("classes");
        std::ifstream ifs(file.c_str());
        if (!ifs.is_open())
            CV_Error(Error::StsError, "File " + file + " not found");
        std::string line;
        while (std::getline(ifs, line))
        {
            classes.push_back(line);
        }
    }

    if (!parser.check())
    {
        parser.printErrors();
        return 1;
    }

    CV_Assert(!model.empty());

    //! [Read and initialize network]
    // YOLOv3
    // https://pjreddie.com/media/files/yolov3.weights
    DetectionModel net(model, config);
    net.setPreferableBackend(backendId);
    net.setPreferableTarget(targetId);
    //! [Read and initialize network]

    // Create a window.
    static const std::string kWinName = "Deep learning object detection in OpenCV";
    namedWindow(kWinName, WINDOW_NORMAL);

    //! [Open a video file or an image file or a camera stream]
    VideoCapture cap;
    if (parser.has("input"))
        cap.open(parser.get<String>("input"));
    else
        cap.open(0);
    //! [Open a video file or an image file or a camera stream]

    //! [Set Input Parameters]
    net.setInputParams(scale, Size(inpWidth, inpHeight), mean, swapRB); // cv::Size is (width, height)
    //! [Set Input Parameters]

    // Process frames.
    Mat frame;
    while (waitKey(1) < 0)
    {
        cap >> frame;
        if (frame.empty())
        {
            waitKey();
            break;
        }

        //! [Network Forward pass]
        std::vector<Rect> boxes;
        std::vector<int> classIds;
        std::vector<float> confidences;
        net.detect(frame, classIds, confidences, boxes, confThreshold, nmsThreshold);
        //! [Network Forward pass]

        //! [Iterate over every predicted box and draw them on the image with the predicted class and confidence on top]
        std::vector<Rect2d> boxesDouble(boxes.size());
        std::stringstream ss;
        for (size_t i = 0; i < boxes.size(); i++) {
            ss << classIds[i] << ": " << confidences[i];
            boxesDouble[i] = boxes[i];
            rectangle(frame, boxesDouble[i], Scalar(0, 0, 255), 1, 8, 0);
            putText(frame, ss.str(), Point(boxes[i].x, boxes[i].y), FONT_HERSHEY_DUPLEX, 0.5, Scalar(0, 0, 0), 2);
            ss.str("");
        }
        //! [Iterate over every predicted box and draw them on the image with the predicted class and confidence on top]

        // Put efficiency information.
        std::vector<double> layersTimes;
        double freq = getTickFrequency() / 1000;
        double t = net.getPerfProfile(layersTimes) / freq;
        std::string label = format("Inference time: %.2f ms", t);
        putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));

        imshow(kWinName, frame);
    }
    return 0;
}
```