OpenCV DNN Benchmark Code
@YashasSamaga · Last active January 19, 2023 09:25
benchmark.cpp:

#include <iostream>
#include <algorithm>
#include <vector>
#include <chrono>
#include <cmath>    // std::sqrt
#include <cassert>  // assert (used when USE_RANDOM_IMAGES is not defined)
#include <numeric>

#include <opencv2/dnn.hpp>
#include <opencv2/dnn/all_layers.hpp>
#include <opencv2/highgui.hpp>

#include "benchmark.hpp"

/* OPTION I:
 * Use random images for testing.
 *
 * OPTION II:
 * Use images in "data/images/img_n.jpg" where `n` varies from 0, 1, 2, 3, ....
 */
#define USE_RANDOM_IMAGES

constexpr auto default_batch_size = 1;

struct mask_type {
    int backend;
    int target;
};

struct config_type {
    std::string name;
    int backend;
    int target;
};

// select the backend/target combinations that you want to test
std::vector<config_type> backends = {
    //{"OCV CPU", cv::dnn::DNN_BACKEND_OPENCV, cv::dnn::DNN_TARGET_CPU},
    //{"OCV OpenCL", cv::dnn::DNN_BACKEND_OPENCV, cv::dnn::DNN_TARGET_OPENCL},
    //{"OCV OpenCL FP16", cv::dnn::DNN_BACKEND_OPENCV, cv::dnn::DNN_TARGET_OPENCL_FP16},
    //{"IE CPU", cv::dnn::DNN_BACKEND_INFERENCE_ENGINE, cv::dnn::DNN_TARGET_CPU},
    {"CUDA FP32", cv::dnn::DNN_BACKEND_CUDA, cv::dnn::DNN_TARGET_CUDA},
    {"CUDA FP16", cv::dnn::DNN_BACKEND_CUDA, cv::dnn::DNN_TARGET_CUDA_FP16}
};

std::vector<cv::Mat> image_samples;

template <class T>
auto to_milliseconds(const T& duration) {
    return std::chrono::duration_cast<std::chrono::milliseconds>(duration);
}

template <class T>
auto to_microseconds(const T& duration) {
    return std::chrono::duration_cast<std::chrono::microseconds>(duration);
}

struct perf_result_t
{
    using duration = std::chrono::microseconds;

    duration init_time;
    std::vector<duration> runtimes;
};

template <std::size_t BENCHMARK_RUNS, std::size_t WARMUP_RUNS>
auto run_network(
    const std::string& model, const std::string& config,
    const cv::Mat& blob,
    const std::vector<std::string>& output_names_,
    int backend, int target)
{
    auto net = cv::dnn::readNet(model, config);
    net.setPreferableBackend(backend);
    net.setPreferableTarget(target);

    auto output_names = output_names_;
    if (output_names.empty())
        output_names = net.getUnconnectedOutLayersNames();

    std::vector<cv::Mat> output_mats;

    // the first forward pass includes backend/target initialization; it is reported separately
    auto init_time = benchmark([&] {
        net.setInput(blob);
        net.forward(output_mats, output_names);
    });

    for (int i = 0; i < WARMUP_RUNS; i++)
    {
        net.setInput(blob);
        net.forward(output_mats, output_names);
    }

    perf_result_t result;
    result.init_time = init_time;
    result.runtimes.reserve(BENCHMARK_RUNS);
    for (int i = 0; i < BENCHMARK_RUNS; i++)
    {
        net.setInput(blob);
        auto inference_time = benchmark([&] {
            net.forward(output_mats, output_names);
        });
        result.runtimes.push_back(inference_time);
    }

    return result;
}

void bench_network(
    const std::string& model, const std::string& config,
    cv::Size input_size,
    const std::vector<std::string>& output_names = {},
    int count = default_batch_size,
    std::vector<mask_type> mask = {})
{
#ifndef USE_RANDOM_IMAGES
    assert(count <= image_samples.size());
#endif

    std::vector<cv::Mat> images;
    for (int i = 0; i < count; i++)
    {
#ifdef USE_RANDOM_IMAGES
        cv::Mat image(input_size, CV_32FC3);
        cv::randu(image, cv::Scalar(0, 0, 0), cv::Scalar(255, 255, 255));
        images.push_back(image);
#else
        images.push_back(image_samples[i]);
#endif
    }

    cv::Mat blob = cv::dnn::blobFromImages(images, 1.0f, input_size, 0.0f);

    for (auto c : backends) {
        auto backend = c.backend;
        auto target = c.target;

        // skip this backend/target combination if it matches any entry in the mask
        bool skip = [backend, target, mask] {
            for (auto m : mask) {
                if (m.backend == backend && m.target == target)
                    return true;
                if (m.backend == backend && m.target == -1)
                    return true;
                if (m.backend == -1 && m.target == target)
                    return true;
            }
            return false;
        } ();

        if (skip)
            continue;

        try {
            constexpr int WARMUP_RUNS = 10;
            constexpr int BENCHMARK_RUNS = 100;
            auto result = run_network<BENCHMARK_RUNS, WARMUP_RUNS>(model, config, blob, output_names, backend, target);

            float init_time = to_microseconds(result.init_time).count() / 1000.0;

            std::vector<float> runtimes;
            for (auto r : result.runtimes)
                runtimes.push_back(to_microseconds(r).count() / 1000.0);

            auto sum = std::accumulate(std::begin(runtimes), std::end(runtimes), 0.0f);
            auto squared_sum = std::inner_product(std::begin(runtimes), std::end(runtimes), std::begin(runtimes), 0.0f);
            auto min = *std::min_element(std::begin(runtimes), std::end(runtimes));
            auto max = *std::max_element(std::begin(runtimes), std::end(runtimes));
            auto mean = sum / runtimes.size();
            auto stddev = std::sqrt(squared_sum / runtimes.size() - mean * mean);

            std::cout << '[' << c.name << "]" << '\n'
                      << "\tinit >> " << init_time << "ms" << '\n'
                      << "\tinference >> " << "min = " << min << "ms, max = " << max << "ms, mean = " << mean << "ms, stddev = " << stddev << "ms" << std::endl;
        } catch (const std::exception& ex) {
            std::cout << ex.what() << std::endl;
            return;
        }
    }

    std::cout << std::endl;
}
void bench_alexnet()
{
    std::cout << "BVLC AlexNet\n";
    bench_network("data/alexnet/deploy.prototxt", "data/alexnet/bvlc_alexnet.caffemodel", cv::Size(227, 227));
    std::cout << std::endl;
}

void bench_densenet121()
{
    std::cout << "DenseNet 121\n";
    bench_network("data/densenet121/DenseNet_121.prototxt", "data/densenet121/DenseNet_121.caffemodel", cv::Size(224, 224));
    std::cout << std::endl;
}

void bench_east_text_detection()
{
    std::cout << "East Text Detection\n";
    bench_network("data/east_text_detection/frozen_east_text_detection.pb", "", cv::Size(320, 320));
    std::cout << std::endl;
}

void bench_enet()
{
    std::cout << "ENet Cityscapes\n";
    bench_network("data/enet/model-cityscapes.net", "", cv::Size(512, 256), {}, 1);
    std::cout << std::endl;
}

void bench_fns_stary_night()
{
    std::cout << "FastNeuralStyle Stary Night\n";
    bench_network("data/fns_stary_night/fast_neural_style_eccv16_starry_night.t7", "", cv::Size(320, 240));
    std::cout << std::endl;
}

void bench_googlenet()
{
    std::cout << "BVLC GoogleNet\n";
    bench_network("data/googlenet/deploy.prototxt", "data/googlenet/bvlc_googlenet.caffemodel", cv::Size(224, 224));
    std::cout << std::endl;
}

void bench_inception_v2_faster_rcnn()
{
    std::cout << "Inception v2 Faster RCNN\n";
    bench_network("data/inception_v2_faster_rcnn/faster_rcnn_inception_v2_coco_2018_01_28.pb", "data/inception_v2_faster_rcnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt", cv::Size(800, 600), {}, default_batch_size,
        {
            {cv::dnn::DNN_BACKEND_INFERENCE_ENGINE, -1}
        });
    std::cout << std::endl;
}

void bench_inception_v2_mask_rcnn()
{
    std::cout << "Inception v2 Mask RCNN\n";
    bench_network("data/inception_v2_mask_rcnn/mask_rcnn_inception_v2_coco_2018_01_28.pbtxt", "data/inception_v2_mask_rcnn/mask_rcnn_inception_v2_coco_2018_01_28.pb", cv::Size(1024, 1024), {"detection_out_final", "detection_masks"});
    std::cout << std::endl;
}

void bench_mobilenet_ssd()
{
    std::cout << "MobileNet SSD\n";
    bench_network("data/mobilenet_ssd/MobileNetSSD_deploy.prototxt", "data/mobilenet_ssd/MobileNetSSD_deploy.caffemodel", cv::Size(300, 300));
    std::cout << std::endl;
}

void bench_mobilenet_ssd_v1_coco()
{
    std::cout << "MobileNet SSD v1 Coco\n";
    bench_network("data/mobilenet_ssd_v1_coco_2017_11_17/ssd_mobilenet_v1_coco_2017_11_17.pb", "data/mobilenet_ssd_v1_coco_2017_11_17/ssd_mobilenet_v1_coco_2017_11_17.pbtxt", cv::Size(300, 300));
    std::cout << std::endl;
}

void bench_mobilenet_ssd_v2_coco()
{
    std::cout << "MobileNet SSD v2 Coco\n";
    bench_network("data/mobilenet_ssd_v2_coco_2018_03_29/ssd_mobilenet_v2_coco_2018_03_29.pb", "data/mobilenet_ssd_v2_coco_2018_03_29/ssd_mobilenet_v2_coco_2018_03_29.pbtxt", cv::Size(300, 300));
    std::cout << std::endl;
}

void bench_opencv_face_detector()
{
    std::cout << "OpenCV Face Detector\n";
    bench_network("data/opencv_face_detector/deploy.prototxt", "data/opencv_face_detector/res10_300x300_ssd_iter_140000_fp16.caffemodel", cv::Size(300, 300));
    std::cout << std::endl;
}

void bench_openface_nn4_small2_v1()
{
    std::cout << "OpenFace nn4 small2 v1\n";
    bench_network("data/openface_nn4_small2_v1/nn4.small2.v1.t7", "", cv::Size(96, 96));
    std::cout << std::endl;
}

void bench_openpose_pose_mpi()
{
    std::cout << "OpenPose pose MPI\n";
    bench_network("data/openpose_pose_mpi/openpose_pose_mpi_faster_4_stages.prototxt", "data/openpose_pose_mpi/pose_iter_160000.caffemodel", cv::Size(368, 368));
    std::cout << std::endl;
}

void bench_resnet50()
{
    std::cout << "ResNet 50\n";
    bench_network("data/resnet50/ResNet-50-deploy.prototxt", "data/resnet50/ResNet-50-model.caffemodel", cv::Size(224, 224));
    std::cout << std::endl;
}

void bench_resnet50_faster_rcnn()
{
    std::cout << "ResNet50 Faster RCNN\n";
    bench_network("data/resnet50_faster_rcnn/faster_rcnn_resnet50_coco_2018_01_28.pbtxt", "data/resnet50_faster_rcnn/faster_rcnn_resnet50_coco_2018_01_28.pb", cv::Size(224, 224));
    std::cout << std::endl;
}

void bench_resnet101()
{
    std::cout << "ResNet 101\n";
    bench_network("data/resnet101/ResNet-101-deploy.prototxt", "data/resnet101/ResNet-101-model.caffemodel", cv::Size(224, 224));
    std::cout << std::endl;
}

void bench_squeezenet()
{
    std::cout << "SqueezeNet v1.1\n";
    bench_network("data/squeezenet/squeezenet_v1.1.prototxt", "data/squeezenet/squeezenet_v1.1.caffemodel", cv::Size(227, 227));
    std::cout << std::endl;
}

void bench_inception_v2_coco()
{
    std::cout << "Inception v2 Coco\n";
    bench_network("data/ssd_inception_v2_coco_2017_11_17/ssd_inception_v2_coco_2017_11_17.pb", "data/ssd_inception_v2_coco_2017_11_17/ssd_inception_v2_coco_2017_11_17.pbtxt", cv::Size(300, 300));
    std::cout << std::endl;
}

void bench_tensorflow_inception_5h()
{
    std::cout << "TensorFlow Inception 5h\n";
    bench_network("data/tensorflow_inception_5h/tensorflow_inception_graph.pb", "", cv::Size(224, 224));
    std::cout << std::endl;
}

void bench_vgg16()
{
    std::cout << "VGG16 SSD\n";
    bench_network("data/vgg16/ssd_vgg16.prototxt", "data/vgg16/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel", cv::Size(300, 300));
    std::cout << std::endl;
}

void bench_vgg16_faster_rcnn()
{
    std::cout << "VGG16 Faster RCNN\n";
    bench_network("data/vgg16_faster_rcnn/faster_rcnn_vgg16.prototxt", "data/vgg16_faster_rcnn/VGG16_faster_rcnn_final.caffemodel", cv::Size(224, 224));
    std::cout << std::endl;
}

void bench_vgg_ssd512()
{
    std::cout << "VGG SSD512\n";
    bench_network("data/vgg512/deploy.prototxt", "data/vgg512/VGG_coco_SSD_512x512_iter_360000.caffemodel", cv::Size(512, 512));
    std::cout << std::endl;
}

void bench_yolo_v2()
{
    std::cout << "YOLO v2\n";
    bench_network("data/yolov2/yolov2.cfg", "data/yolov2/yolov2.weights", cv::Size(608, 608));
    std::cout << std::endl;
}

void bench_yolo_v3()
{
    std::cout << "YOLO v3\n";
    bench_network("data/yolov3/yolov3.cfg", "data/yolov3/yolov3.weights", cv::Size(608, 608));
    std::cout << std::endl;
}

void bench_yolo_v3_spp()
{
    std::cout << "YOLO v3 SPP\n";
    bench_network("data/yolov3_spp/yolov3-spp.cfg", "data/yolov3_spp/yolov3-spp.weights", cv::Size(608, 608));
    std::cout << std::endl;
}

void bench_yolov3_enet_b0()
{
    std::cout << "EfficientNet B0 YOLOv3\n";
    bench_network("data/yolov3-enet-b0/enet-coco.cfg", "data/yolov3-enet-b0/enetb0-coco_final.weights", cv::Size(416, 416));
    std::cout << std::endl;
}

void bench_yolo_v3_tiny()
{
    std::cout << "YOLO v3 Tiny\n";
    bench_network("data/yolov3-tiny/yolov3-tiny.cfg", "data/yolov3-tiny/yolov3-tiny.weights", cv::Size(416, 416));
    std::cout << std::endl;
}

void bench_yolo_v3_tiny_prn()
{
    std::cout << "YOLO v3 Tiny PRN\n";
    bench_network("data/yolov3-tiny-prn/yolov3-tiny-prn.cfg", "data/yolov3-tiny-prn/yolov3-tiny-prn.weights", cv::Size(416, 416));
    std::cout << std::endl;
}

void bench_yolo_v4()
{
    std::cout << "YOLO v4\n";
    bench_network("data/yolov4/yolov4.cfg", "data/yolov4/yolov4.weights", cv::Size(608, 608));
    std::cout << std::endl;
}

void bench_yolo_v4_tiny()
{
    std::cout << "YOLO v4 Tiny\n";
    bench_network("data/yolov4-tiny/yolov4-tiny.cfg", "data/yolov4-tiny/yolov4-tiny.weights", cv::Size(416, 416));
    std::cout << std::endl;
}

void bench_zf_faster_rcnn()
{
    std::cout << "ZF Faster RCNN\n";
    bench_network("data/zf_faster_rcnn/faster_rcnn_zf.prototxt", "data/zf_faster_rcnn/ZF_faster_rcnn_final.caffemodel", cv::Size(224, 224));
    std::cout << std::endl;
}
int main(int argc, char *argv[])
{
    constexpr auto total_images = 10;
    auto prefix = std::string("data/images/img_"),
         suffix = std::string(".jpg");

    /* populate sample images (only used when USE_RANDOM_IMAGES is not defined) */
    for (int i = 0; i < total_images; i++) {
        auto file = prefix + std::to_string(i) + suffix;
        auto image = cv::imread(file);
        image_samples.push_back(image);
    }

    bench_yolo_v4();
    bench_yolo_v4_tiny();
    return 0; // NOTE: only the two YOLOv4 benchmarks above are run; remove this early return to run the full suite below

    bench_alexnet();
    bench_densenet121();
    bench_east_text_detection();
    bench_enet();
    bench_fns_stary_night();
    bench_googlenet();
    bench_inception_v2_faster_rcnn();
    bench_inception_v2_mask_rcnn();
    bench_mobilenet_ssd();
    bench_mobilenet_ssd_v1_coco();
    bench_mobilenet_ssd_v2_coco();
    bench_opencv_face_detector();
    bench_openface_nn4_small2_v1();
    bench_openpose_pose_mpi();
    bench_resnet50();
    bench_resnet50_faster_rcnn();
    bench_resnet101();
    bench_squeezenet();
    bench_inception_v2_coco();
    bench_tensorflow_inception_5h();
    bench_vgg16();
    bench_vgg_ssd512();
    bench_vgg16_faster_rcnn();
    bench_yolo_v2();
    bench_yolo_v3_tiny();
    bench_yolo_v3_tiny_prn();
    bench_yolo_v3();
    bench_yolo_v3_spp();
    bench_yolov3_enet_b0();
    bench_yolo_v4();
    bench_yolo_v4_tiny();
    bench_zf_faster_rcnn();
    return 0;
}
benchmark.hpp:

#ifndef BENCHMARK_HPP
#define BENCHMARK_HPP

#include <chrono>
#include <utility> // std::forward

// times a single invocation of `function` and returns the elapsed time in microseconds
template <class Function, typename ...Args>
auto benchmark(Function function, Args&& ...args) {
    using std::chrono::steady_clock;
    auto start = steady_clock::now();
    function(std::forward<Args>(args)...);
    auto end = steady_clock::now();
    return std::chrono::duration_cast<std::chrono::microseconds>(end - start);
}

/* doNotOptimizeAway from https://stackoverflow.com/a/36781982/1935009 */
#ifdef _MSC_VER
#pragma optimize("", off)
template <class T>
void doNotOptimizeAway(T&& datum) {
    datum = datum;
}
#pragma optimize("", on)
#elif defined(__clang__)
template <class T>
__attribute__((__optnone__)) void doNotOptimizeAway(T&& /* datum */) {}
#else
template <class T>
void doNotOptimizeAway(T&& datum) {
    asm volatile("" : "+r" (datum));
}
#endif

#endif /* BENCHMARK_HPP */
Compile with:

g++ -I/usr/local/include/opencv4/ benchmark.cpp -lopencv_core -lopencv_imgproc -lopencv_dnn -lopencv_imgcodecs -O3 -std=c++17
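
For reference, a minimal sketch of how the two helpers in benchmark.hpp are meant to be used together (hypothetical example, not part of the gist):

    #include <iostream>
    #include <numeric>
    #include <vector>
    #include "benchmark.hpp"

    int main() {
        std::vector<int> v(1 << 20, 1);
        auto elapsed = benchmark([&] {
            auto sum = std::accumulate(v.begin(), v.end(), 0LL);
            doNotOptimizeAway(sum);  // keep the result observable so the work is not optimized away
        });
        std::cout << "accumulate took " << elapsed.count() << " us\n";
    }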
    @xjsxujingsong

Hi, thanks for your code.
I am testing the speed on YOLOv3. It looks like increasing batch_size won't speed up the process. See the logs for batch = 1, 2, and 4: the running time increases as the batch increases. Is this normal?

    YOLO v3
    BATCH=1
    [CUDA FP32]
    init >> 724.753ms
    inference >> min = 36.77ms, max = 46.585ms, mean = 37.8626ms, stddev = 1.09954ms
    [CUDA FP16]
    init >> 469.271ms
    inference >> min = 19.636ms, max = 22.132ms, mean = 20.1432ms, stddev = 0.410955ms

BATCH=2
YOLO v3

    [CUDA FP32]
    init >> 987.252ms
    inference >> min = 65.468ms, max = 71.962ms, mean = 67.4592ms, stddev = 0.933586ms
    [CUDA FP16]
    init >> 510.797ms
    inference >> min = 35.917ms, max = 38.802ms, mean = 36.991ms, stddev = 0.525474ms

    BATCH=4
    [CUDA FP32]
    init >> 947.806ms
    inference >> min = 118.306ms, max = 139.88ms, mean = 133.521ms, stddev = 2.06297ms
    [CUDA FP16]
    init >> 632.171ms
    inference >> min = 69.454ms, max = 75.5ms, mean = 70.737ms, stddev = 1.05118ms

@YashasSamaga (Author) commented Jun 4, 2020 (edited):

The statistics reported are for the net.forward() call. You got 70.73 ms for a batch size of four; that's the total time for the inference. To calculate FPS, you divide 70.73 by 4, which comes to around ~17.68 ms per image, which is slightly faster than batch size one. The decrease you see in FP32 is presumably because of NMS.
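
For example, with the FP16 numbers from your log: batch=4 gives 70.737 ms / 4 ≈ 17.7 ms per image (≈ 56 FPS), while batch=1 gives 20.14 ms per image (≈ 50 FPS), so per-image throughput does improve with batching.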

You can get a significant speedup by disabling NMS. You have to set nms_threshold=0 in all [yolo] blocks in yolov3.cfg. Check this for information on doing it from the code.
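
Roughly, disabling NMS from code looks like this (a minimal sketch; it assumes an OpenCV build recent enough that cv::dnn::RegionLayer exposes the nmsThreshold field, and the helper name disable_region_nms is only illustrative):

    #include <opencv2/dnn.hpp>
    #include <opencv2/dnn/all_layers.hpp>

    // set the NMS threshold to 0 for every [yolo]/region layer of a loaded Darknet model
    void disable_region_nms(cv::dnn::Net& net)
    {
        for (const auto& name : net.getLayerNames())
        {
            int id = net.getLayerId(name);
            auto region = net.getLayer(id).dynamicCast<cv::dnn::RegionLayer>();
            if (!region.empty())
                region->nmsThreshold = 0;  // 0 disables the built-in NMS in the region layer
        }
    }

You would call this once right after cv::dnn::readNet and then run NMS yourself on the outputs.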

    @xjsxujingsong

Thanks for your quick reply.
When I copy the code to C++, it cannot find the region layer:
auto layer = net.getLayer(layerId).dynamicCast<cv::dnn::RegionLayer>();
My OpenCV is 4.3.0 with CUDA build. Which version is your code using?

@YashasSamaga (Author):

@xjsxujingsong You need the master branch (the nmsThreshold field in the region layer was exposed only recently). You can use YOLOv4 if you use the master branch. YOLOv4 is nearly as fast as YOLOv3 and delivers higher detection accuracy.

    @xjsxujingsong

    Thanks. I am rebuilding the latest OpenCV.
I found your page through https://github.com/AlexeyAB/darknet
From the table there, take the last line for example:

    OpenCV FP16, FPS : 100
    OpenCV FP16 batch=4, FPS: 133

It looks like batching doesn't speed things up too much. Is this using this benchmark.cpp, or did you disable the NMS?

@YashasSamaga (Author) commented Jun 5, 2020 (edited):

@xjsxujingsong I updated benchmark.cpp to disable NMS yesterday. The table there used the old benchmark.cpp code, with NMS disabled manually by setting nms_threshold=0 in all [yolo] blocks in yolov4.cfg. The new benchmark.cpp does the same in the code.

@albertchristianto commented Oct 8, 2020 (edited):

Hi @YashasSamaga, I want to implement YOLO detection on multiple cameras.

So my strategy right now is spawning a YOLO model for each camera, which is not a good idea since the GPU memory usage becomes very large (if one YOLO model needs ~2 GB, then an RTX 2080 Ti can only handle 4-5 cameras at once).

    I have an idea of utilizing batch size to solve this problem.

The statistics reported are for the net.forward() call. You got 70.73 ms for a batch size of four; that's the total time for the inference. To calculate FPS, you divide 70.73 by 4, which comes to around ~17.68 ms per image, which is slightly faster than batch size one. The decrease you see in FP32 is presumably because of NMS.

You can get a significant speedup by disabling NMS. You have to set nms_threshold=0 in all [yolo] blocks in yolov3.cfg. Check this for information on doing it from the code.

Based on your answer above, if I want to utilize batch size to handle multiple cameras, I have to consider how many cameras will be handled by one YOLO model so that the FPS won't drop.
Have I come to the right conclusion? Or do you have any suggestions regarding this solution?

    best regards,
    Albert Christianto

@YashasSamaga (Author) commented Oct 8, 2020 (edited):

    @albertchristianto

I would recommend having a fixed number of networks (one network on each thread); you can have a constant parameter, say NUM_NETWORK_THREADS, that you tune by trial and error. You can have a task queue which your cameras populate with work items. One of the idle networks will pick up a pending work item and dump its outputs to an output queue. If the queue has multiple items pending, you can do batch inference to improve throughput.

The most important point you have to remember is that both latency and throughput increase with batch size. Small batch sizes will report results faster but overall throughput will be lower (time per image is higher); large batch sizes will take longer to report results but overall throughput will be higher (time per image is lower). You need to balance the two factors for optimal performance (and maybe power consumption).
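
To make that concrete, here is a rough sketch of the structure I mean (illustrative only; frame_queue_t, MAX_BATCH and network_thread are made-up names, and the output-queue side is omitted):

    #include <condition_variable>
    #include <mutex>
    #include <queue>
    #include <string>
    #include <vector>
    #include <opencv2/dnn.hpp>

    constexpr int MAX_BATCH = 4;  // tune: larger batches raise throughput but also latency

    // cameras push frames here; network threads drain it
    struct frame_queue_t {
        std::mutex mtx;
        std::condition_variable cond;
        std::queue<cv::Mat> frames;

        void push(cv::Mat frame) {
            { std::lock_guard<std::mutex> lock(mtx); frames.push(std::move(frame)); }
            cond.notify_one();
        }

        // wait for at least one frame, then take whatever is pending (up to max_batch)
        std::vector<cv::Mat> pop_batch(int max_batch) {
            std::unique_lock<std::mutex> lock(mtx);
            cond.wait(lock, [this] { return !frames.empty(); });
            std::vector<cv::Mat> batch;
            while (!frames.empty() && (int)batch.size() < max_batch) {
                batch.push_back(std::move(frames.front()));
                frames.pop();
            }
            return batch;
        }
    };

    // one of NUM_NETWORK_THREADS such workers, each owning its own Net instance
    void network_thread(frame_queue_t& queue, const std::string& cfg, const std::string& weights)
    {
        auto net = cv::dnn::readNet(cfg, weights);
        net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
        net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA_FP16);

        while (true) {
            auto batch = queue.pop_batch(MAX_BATCH);  // batch size adapts to the backlog
            auto blob = cv::dnn::blobFromImages(batch, 1 / 255.0, cv::Size(416, 416), cv::Scalar(), true);
            net.setInput(blob);

            std::vector<cv::Mat> outputs;
            net.forward(outputs, net.getUnconnectedOutLayersNames());
            // ...split `outputs` per image, postprocess, and hand results to an output queue...
        }
    }

Each camera thread then just calls queue.push(frame); with a fixed pool of workers sharing the queue, GPU memory is bounded by the number of workers rather than the number of cameras.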

    @albertchristianto

@YashasSamaga,
OK, I get the point.
Thank you for your reply and your insight. It is very helpful for me.
    best regards,
    Albert Christianto

