Instantly share code, notes, and snippets.
Save YashasSamaga/48bdb167303e10f4d07b754888ddbdcf to your computer and use it in GitHub Desktop.
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>

#include <opencv2/dnn.hpp>
#include <opencv2/dnn/all_layers.hpp>
#include <opencv2/highgui.hpp>

#include "benchmark.hpp"
/* OPTION I:
 * Use random images for testing.
 *
 * OPTION II:
 * Use images in "data/images/img_n.jpg" where `n` varies from 0, 1, 2, 3, ....
 *
 * Define USE_RANDOM_IMAGES to select OPTION I; leave it undefined to select OPTION II.
 */
#define USE_RANDOM_IMAGES
/* Number of images placed in the input blob when a benchmark does not ask for a specific count. */
constexpr auto default_batch_size = 1;
/*
 * Identifies backend/target combinations that a benchmark must skip.
 * A field set to -1 acts as a wildcard: bench_network skips a configuration
 * when both fields match, or when one field matches and the other is -1.
 */
struct mask_type {
    int backend;
    int target;
};
/*
 * One backend/target combination to benchmark.
 * `name` is the label printed in the report; `backend` and `target` are
 * forwarded to setPreferableBackend / setPreferableTarget.
 */
struct config_type {
    std::string name;
    int backend;
    int target;
};
// select backend target combinations that you want to test | |
std::vector<config_type> backends = { | |
//{"OCV CPU", cv::dnn::DNN_BACKEND_OPENCV, cv::dnn::DNN_TARGET_CPU}, | |
//{"OCV OpenCL", cv::dnn::DNN_BACKEND_OPENCV, cv::dnn::DNN_TARGET_OPENCL}, | |
//{"OCV OpenCL FP16", cv::dnn::DNN_BACKEND_OPENCV, cv::dnn::DNN_TARGET_OPENCL_FP16}, | |
//{"IE CPU", cv::dnn::DNN_BACKEND_INFERENCE_ENGINE, cv::dnn::DNN_TARGET_CPU}, | |
{"CUDA FP32", cv::dnn::DNN_BACKEND_CUDA, cv::dnn::DNN_TARGET_CUDA}, | |
{"CUDA FP16", cv::dnn::DNN_BACKEND_CUDA, cv::dnn::DNN_TARGET_CUDA_FP16} | |
}; | |
/* Sample images read by main(); bench_network uses them only when USE_RANDOM_IMAGES is not defined. */
std::vector<cv::Mat> image_samples;
/* Converts any std::chrono duration to std::chrono::milliseconds (duration_cast truncates toward zero). */
template <class T>
auto to_milliseconds(const T& duration) {
    return std::chrono::duration_cast<std::chrono::milliseconds>(duration);
}
/* Converts any std::chrono duration to std::chrono::microseconds (duration_cast truncates toward zero). */
template <class T>
auto to_microseconds(const T& duration) {
    return std::chrono::duration_cast<std::chrono::microseconds>(duration);
}
/*
 * Timing results produced by run_network.
 *   init_time - duration of the first setInput + forward call
 *   runtimes  - one duration per timed forward pass
 */
struct perf_result_t
{
    using duration = std::chrono::microseconds;

    duration init_time{};   /* value-initialised so a default-constructed result reads as zero */
    std::vector<duration> runtimes;
};
template<std::size_t BENCHMARK_RUNS, std::size_t WARMUP_RUNS> | |
autorun_network( | |
const std::string& model,const std::string& config, | |
const cv::Mat& blob, | |
const std::vector<std::string>& output_names_, | |
int backend,int target) | |
{ | |
auto net =cv::dnn::readNet(model, config); | |
net.setPreferableBackend(backend); | |
net.setPreferableTarget(target); | |
auto output_names = output_names_; | |
if (output_names.empty()) | |
output_names = net.getUnconnectedOutLayersNames(); | |
std::vector<cv::Mat> output_mats; | |
auto init_time =benchmark([&] { | |
net.setInput(blob); | |
net.forward(output_mats, output_names); | |
}); | |
for(int i =0; i < WARMUP_RUNS; i++) | |
{ | |
net.setInput(blob); | |
net.forward(output_mats, output_names); | |
} | |
perf_result_t result; | |
result.init_time = init_time; | |
result.runtimes.reserve(BENCHMARK_RUNS); | |
for(int i =0; i < BENCHMARK_RUNS; i++) | |
{ | |
net.setInput(blob); | |
auto inference_time =benchmark([&] { | |
net.forward(output_mats, output_names); | |
}); | |
result.runtimes.push_back(inference_time); | |
} | |
return result; | |
} | |
voidbench_network( | |
const std::string& model,const std::string& config, | |
cv::Size input_size, | |
const std::vector<std::string>& output_names = {}, | |
int count = default_batch_size, | |
std::vector<mask_type> mask = {}) | |
{ | |
#ifndef USE_RANDOM_IMAGES | |
assert(count <= image_samples.size()); | |
#endif | |
std::vector<cv::Mat> images; | |
for (int i =0; i < count; i++) | |
{ | |
#ifdef USE_RANDOM_IMAGES | |
cv::Matimage(input_size, CV_32FC3); | |
cv::randu(image,cv::Scalar(0,0,0),cv::Scalar(255,255,255)); | |
images.push_back(image); | |
#else | |
images.push_back(image_samples[i]); | |
#endif | |
} | |
cv::Mat blob =cv::dnn::blobFromImages(images,1.0f, input_size,0.0f); | |
for (auto c : backends) { | |
auto backend = c.backend; | |
auto target = c.target; | |
bool skip = [backend, target, mask] { | |
for (auto m : mask) { | |
if (m.backend == backend && m.target == target) | |
returntrue; | |
if (m.backend == backend && m.target == -1) | |
returntrue; | |
if (m.backend == -1 && m.target == target) | |
returntrue; | |
} | |
returnfalse; | |
} (); | |
if(skip) | |
continue; | |
try { | |
constexprint WARMUP_RUNS =10; | |
constexprint BENCHMARK_RUNS =100; | |
auto result = run_network<BENCHMARK_RUNS, WARMUP_RUNS>(model, config, blob, output_names, backend, target); | |
float init_time =to_microseconds(result.init_time).count() /1000.0; | |
std::vector<float> runtimes; | |
for (auto r : result.runtimes) | |
runtimes.push_back(to_microseconds(r).count() /1000.0); | |
auto sum =std::accumulate(std::begin(runtimes),std::end(runtimes),0.0f); | |
auto squared_sum =std::inner_product(std::begin(runtimes),std::end(runtimes),std::begin(runtimes),0.0f); | |
auto min = *std::min_element(std::begin(runtimes),std::end(runtimes)); | |
auto max = *std::max_element(std::begin(runtimes),std::end(runtimes)); | |
auto mean = sum / runtimes.size(); | |
auto stddev =std::sqrt(squared_sum / runtimes.size() - mean * mean); | |
std::cout <<'[' << c.name <<"]" <<'\n' | |
<<"\tinit >>" << init_time <<"ms" <<'\n' | |
<<"\tinference >>" <<"min =" << min <<"ms, max =" << max <<"ms, mean =" << mean <<"ms, stddev =" << stddev <<"ms" << std::endl; | |
}catch(const std::exception& ex) { | |
std::cout << ex.what() << std::endl; | |
return; | |
} | |
} | |
std::cout << std::endl; | |
} | |
voidbench_alexnet() | |
{ | |
std::cout <<"BVLC AlexNet\n"; | |
bench_network("data/alexnet/deploy.prototxt","data/alexnet/bvlc_alexnet.caffemodel",cv::Size(227,227)); | |
std::cout << std::endl; | |
} | |
voidbench_densenet121() | |
{ | |
std::cout <<"DenseNet 121\n"; | |
bench_network("data/densenet121/DenseNet_121.prototxt","data/densenet121/DenseNet_121.caffemodel",cv::Size(224,224)); | |
std::cout << std::endl; | |
} | |
voidbench_east_text_detection() | |
{ | |
std::cout <<"East Text Detection\n"; | |
bench_network("data/east_text_detection/frozen_east_text_detection.pb","",cv::Size(320,320)); | |
std::cout << std::endl; | |
} | |
voidbench_enet() | |
{ | |
std::cout <<"ENet Cityscapes\n"; | |
bench_network("data/enet/model-cityscapes.net","",cv::Size(512,256), {},1); | |
std::cout << std::endl; | |
} | |
voidbench_fns_stary_night() | |
{ | |
std::cout <<"FastNeuralStyle Stary Night\n"; | |
bench_network("data/fns_stary_night/fast_neural_style_eccv16_starry_night.t7","",cv::Size(320,240)); | |
std::cout << std::endl; | |
} | |
voidbench_googlenet() | |
{ | |
std::cout <<"BVLC GoogleNet\n"; | |
bench_network("data/googlenet/deploy.prototxt","data/googlenet/bvlc_googlenet.caffemodel",cv::Size(224,224)); | |
std::cout << std::endl; | |
} | |
voidbench_inception_v2_faster_rcnn() | |
{ | |
std::cout <<"Inception v2 Faster RCNN\n"; | |
bench_network("data/inception_v2_faster_rcnn/faster_rcnn_inception_v2_coco_2018_01_28.pb","data/inception_v2_faster_rcnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt",cv::Size(800,600), {}, default_batch_size, | |
{ | |
{cv::dnn::DNN_BACKEND_INFERENCE_ENGINE, -1} | |
}); | |
std::cout << std::endl; | |
} | |
voidbench_inception_v2_mask_rcnn() | |
{ | |
std::cout <<"Inception v2 Mask RCNN\n"; | |
bench_network("data/inception_v2_mask_rcnn/mask_rcnn_inception_v2_coco_2018_01_28.pbtxt","data/inception_v2_mask_rcnn/mask_rcnn_inception_v2_coco_2018_01_28.pb",cv::Size(1024,1024), {"detection_out_final","detection_masks"}); | |
std::cout << std::endl; | |
} | |
voidbench_mobilenet_ssd() | |
{ | |
std::cout <<"MobileNet SSD\n"; | |
bench_network("data/mobilenet_ssd/MobileNetSSD_deploy.prototxt","data/mobilenet_ssd/MobileNetSSD_deploy.caffemodel",cv::Size(300,300)); | |
std::cout << std::endl; | |
} | |
voidbench_mobilenet_ssd_v1_coco() | |
{ | |
std::cout <<"MobileNet SSD v1 Coco\n"; | |
bench_network("data/mobilenet_ssd_v1_coco_2017_11_17/ssd_mobilenet_v1_coco_2017_11_17.pb","data/mobilenet_ssd_v1_coco_2017_11_17/ssd_mobilenet_v1_coco_2017_11_17.pbtxt",cv::Size(300,300)); | |
std::cout << std::endl; | |
} | |
voidbench_mobilenet_ssd_v2_coco() | |
{ | |
std::cout <<"MobileNet SSD v2 Coco\n"; | |
bench_network("data/mobilenet_ssd_v2_coco_2018_03_29/ssd_mobilenet_v2_coco_2018_03_29.pb","data/mobilenet_ssd_v2_coco_2018_03_29/ssd_mobilenet_v2_coco_2018_03_29.pbtxt",cv::Size(300,300)); | |
std::cout << std::endl; | |
} | |
voidbench_opencv_face_detector() | |
{ | |
std::cout <<"OpenCV Face Detector\n"; | |
bench_network("data/opencv_face_detector/deploy.prototxt","data/opencv_face_detector/res10_300x300_ssd_iter_140000_fp16.caffemodel",cv::Size(300,300)); | |
std::cout << std::endl; | |
} | |
voidbench_openface_nn4_small2_v1() | |
{ | |
std::cout <<"OpenFace nn4 small2 v1\n"; | |
bench_network("data/openface_nn4_small2_v1/nn4.small2.v1.t7","",cv::Size(96,96)); | |
std::cout << std::endl; | |
} | |
voidbench_openpose_pose_mpi() | |
{ | |
std::cout <<"OpenPose pose MPI\n"; | |
bench_network("data/openpose_pose_mpi/openpose_pose_mpi_faster_4_stages.prototxt","data/openpose_pose_mpi/pose_iter_160000.caffemodel",cv::Size(368,368)); | |
std::cout << std::endl; | |
} | |
voidbench_resnet50() | |
{ | |
std::cout <<"ResNet 50\n"; | |
bench_network("data/resnet50/ResNet-50-deploy.prototxt","data/resnet50/ResNet-50-model.caffemodel",cv::Size(224,224)); | |
std::cout << std::endl; | |
} | |
voidbench_resnet50_faster_rcnn() | |
{ | |
std::cout <<"ResNet50 Faster RCNN\n"; | |
bench_network("data/resnet50_faster_rcnn/faster_rcnn_resnet50_coco_2018_01_28.pbtxt","data/resnet50_faster_rcnn/faster_rcnn_resnet50_coco_2018_01_28.pb",cv::Size(224,224)); | |
std::cout << std::endl; | |
} | |
voidbench_resnet101() | |
{ | |
std::cout <<"ResNet 101\n"; | |
bench_network("data/resnet101/ResNet-101-deploy.prototxt","data/resnet101/ResNet-101-model.caffemodel",cv::Size(224,224)); | |
std::cout << std::endl; | |
} | |
voidbench_squeezenet() | |
{ | |
std::cout <<"SqueezeNet v1.1\n"; | |
bench_network("data/squeezenet/squeezenet_v1.1.prototxt","data/squeezenet/squeezenet_v1.1.caffemodel",cv::Size(227,227)); | |
std::cout << std::endl; | |
} | |
voidbench_inception_v2_coco() | |
{ | |
std::cout <<"Inception v2 Coco\n"; | |
bench_network("data/ssd_inception_v2_coco_2017_11_17/ssd_inception_v2_coco_2017_11_17.pb","data/ssd_inception_v2_coco_2017_11_17/ssd_inception_v2_coco_2017_11_17.pbtxt",cv::Size(300,300)); | |
std::cout << std::endl; | |
} | |
voidbench_tensorflow_inception_5h() | |
{ | |
std::cout <<"TensorFlow Inception 5h\n"; | |
bench_network("data/tensorflow_inception_5h/tensorflow_inception_graph.pb","",cv::Size(224,224)); | |
std::cout << std::endl; | |
} | |
voidbench_vgg16() | |
{ | |
std::cout <<"VGG16 SSD\n"; | |
bench_network("data/vgg16/ssd_vgg16.prototxt","data/vgg16/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel",cv::Size(300,300)); | |
std::cout << std::endl; | |
} | |
voidbench_vgg16_faster_rcnn() | |
{ | |
std::cout <<"VGG16 Faster RCNN\n"; | |
bench_network("data/vgg16_faster_rcnn/faster_rcnn_vgg16.prototxt","data/vgg16_faster_rcnn/VGG16_faster_rcnn_final.caffemodel",cv::Size(224,224)); | |
std::cout << std::endl; | |
} | |
voidbench_vgg_ssd512() | |
{ | |
std::cout <<"VGG SSD512\n"; | |
bench_network("data/vgg512/deploy.prototxt","data/vgg512/VGG_coco_SSD_512x512_iter_360000.caffemodel",cv::Size(512,512)); | |
std::cout << std::endl; | |
} | |
voidbench_yolo_v2() | |
{ | |
std::cout <<"YOLO v2\n"; | |
bench_network("data/yolov2/yolov2.cfg","data/yolov2/yolov2.weights",cv::Size(608,608)); | |
std::cout << std::endl; | |
} | |
voidbench_yolo_v3() | |
{ | |
std::cout <<"YOLO v3\n"; | |
bench_network("data/yolov3/yolov3.cfg","data/yolov3/yolov3.weights",cv::Size(608,608)); | |
std::cout << std::endl; | |
} | |
voidbench_yolo_v3_spp() | |
{ | |
std::cout <<"YOLO v3 SPP\n"; | |
bench_network("data/yolov3_spp/yolov3-spp.cfg","data/yolov3_spp/yolov3-spp.weights",cv::Size(608,608)); | |
std::cout << std::endl; | |
} | |
voidbench_yolov3_enet_b0() | |
{ | |
std::cout <<"EfficientNet B0 YOLOv3\n"; | |
bench_network("data/yolov3-enet-b0/enet-coco.cfg","data/yolov3-enet-b0/enetb0-coco_final.weights",cv::Size(416,416)); | |
std::cout << std::endl; | |
} | |
voidbench_yolo_v3_tiny() | |
{ | |
std::cout <<"YOLO v3 Tiny\n"; | |
bench_network("data/yolov3-tiny/yolov3-tiny.cfg","data/yolov3-tiny/yolov3-tiny.weights",cv::Size(416,416)); | |
std::cout << std::endl; | |
} | |
voidbench_yolo_v3_tiny_prn() | |
{ | |
std::cout <<"YOLO v3 Tiny PRN\n"; | |
bench_network("data/yolov3-tiny-prn/yolov3-tiny-prn.cfg","data/yolov3-tiny-prn/yolov3-tiny-prn.weights",cv::Size(416,416)); | |
std::cout << std::endl; | |
} | |
voidbench_yolo_v4() | |
{ | |
std::cout <<"YOLO v4\n"; | |
bench_network("data/yolov4/yolov4.cfg","data/yolov4/yolov4.weights",cv::Size(608,608)); | |
std::cout << std::endl; | |
} | |
voidbench_yolo_v4_tiny() | |
{ | |
std::cout <<"YOLO v4 Tiny\n"; | |
bench_network("data/yolov4-tiny/yolov4-tiny.cfg","data/yolov4-tiny/yolov4-tiny.weights",cv::Size(416,416)); | |
std::cout << std::endl; | |
} | |
voidbench_zf_faster_rcnn() | |
{ | |
std::cout <<"ZF Faster RCNN\n"; | |
bench_network("data/zf_faster_rcnn/faster_rcnn_zf.prototxt","data/zf_faster_rcnn/ZF_faster_rcnn_final.caffemodel",cv::Size(224,224)); | |
std::cout << std::endl; | |
} | |
intmain(int argc,char *argv[]) | |
{ | |
constexprauto total_images =10; | |
auto prefix =std::string("data/images/img_"), | |
suffix =std::string(".jpg"); | |
/* populate sample images*/ | |
for (int i =0; i < total_images; i++) { | |
auto file = prefix +std::to_string(i) + suffix; | |
auto image =cv::imread(file); | |
image_samples.push_back(image); | |
} | |
bench_yolo_v4(); | |
bench_yolo_v4_tiny(); | |
return0; | |
bench_alexnet(); | |
bench_densenet121(); | |
bench_east_text_detection(); | |
bench_enet(); | |
bench_fns_stary_night(); | |
bench_googlenet(); | |
bench_inception_v2_faster_rcnn(); | |
bench_inception_v2_mask_rcnn(); | |
bench_mobilenet_ssd(); | |
bench_mobilenet_ssd_v1_coco(); | |
bench_mobilenet_ssd_v2_coco(); | |
bench_opencv_face_detector(); | |
bench_openface_nn4_small2_v1(); | |
bench_openpose_pose_mpi(); | |
bench_resnet50(); | |
bench_resnet50_faster_rcnn(); | |
bench_resnet101(); | |
bench_squeezenet(); | |
bench_inception_v2_coco(); | |
bench_tensorflow_inception_5h(); | |
bench_vgg16(); | |
bench_vgg_ssd512(); | |
bench_vgg16_faster_rcnn(); | |
bench_yolo_v2(); | |
bench_yolo_v3_tiny(); | |
bench_yolo_v3_tiny_prn(); | |
bench_yolo_v3(); | |
bench_yolo_v3_spp(); | |
bench_yolov3_enet_b0(); | |
bench_yolo_v4(); | |
bench_yolo_v4_tiny(); | |
bench_zf_faster_rcnn(); | |
return0; | |
} |
#ifndef BENCHMARK_HPP
#define BENCHMARK_HPP

#include <chrono>
#include <utility>

/*
 * Calls function(args...) exactly once and returns how long the call took.
 *
 * The interval is measured with std::chrono::steady_clock and returned as
 * std::chrono::microseconds. The callable is taken by value; the arguments
 * are perfectly forwarded. Whatever the callable returns is discarded.
 */
template <class Function, typename ...Args>
auto benchmark(Function function, Args&& ...args) {
    using std::chrono::steady_clock;

    const auto start = steady_clock::now();
    function(std::forward<Args>(args)...);
    const auto end = steady_clock::now();

    return std::chrono::duration_cast<std::chrono::microseconds>(end - start);
}

/* doNotOptimizeAway from https://stackoverflow.com/a/36781982/1935009
 *
 * One implementation is selected per compiler: MSVC (optimisation switched off
 * around a self-assignment), clang (an empty function marked optnone), and
 * everything else (an empty asm volatile statement that takes `datum` as a
 * read-write register operand).
 */
#ifdef _MSC_VER
#pragma optimize("", off)
template <class T>
void doNotOptimizeAway(T&& datum) {
    datum = datum;
}
#pragma optimize("", on)
#elif defined(__clang__)
template <class T>
__attribute__((__optnone__)) void doNotOptimizeAway(T&& /* datum */) {}
#else
template <class T>
void doNotOptimizeAway(T&& datum) {
    asm volatile("" : "+r" (datum));
}
#endif

#endif /* BENCHMARK_HPP */
g++ -I/usr/local/include/opencv4/ benchmark.cpp -lopencv_core -lopencv_imgproc -lopencv_dnn -lopencv_imgcodecs -O3 -std=c++17
xjsxujingsong commentedJun 4, 2020
Hi, thanks for your code.
I am testing the speed on yolov3. It looks like increasing batch_size won't speed up the process. See the log for batch=1, 2, and 4. The running time increases when the batch size increases. Is this normal?
YOLO v3
BATCH=1
[CUDA FP32]
init >> 724.753ms
inference >> min = 36.77ms, max = 46.585ms, mean = 37.8626ms, stddev = 1.09954ms
[CUDA FP16]
init >> 469.271ms
inference >> min = 19.636ms, max = 22.132ms, mean = 20.1432ms, stddev = 0.410955ms
BATCH=2YOLO v3
[CUDA FP32]
init >> 987.252ms
inference >> min = 65.468ms, max = 71.962ms, mean = 67.4592ms, stddev = 0.933586ms
[CUDA FP16]
init >> 510.797ms
inference >> min = 35.917ms, max = 38.802ms, mean = 36.991ms, stddev = 0.525474ms
BATCH=4
[CUDA FP32]
init >> 947.806ms
inference >> min = 118.306ms, max = 139.88ms, mean = 133.521ms, stddev = 2.06297ms
[CUDA FP16]
init >> 632.171ms
inference >> min = 69.454ms, max = 75.5ms, mean = 70.737ms, stddev = 1.05118ms
YashasSamaga commentedJun 4, 2020 • edited
Loading Uh oh!
There was an error while loading.Please reload this page.
edited
Uh oh!
There was an error while loading.Please reload this page.
The statistics reported are for the `net.forward()` call. You have got 70.73ms for a batch size of four. That's the total time for the inference. To calculate FPS, first divide 70.73 by 4, which comes to around ~17.68ms per image; that is slightly faster than a batch size of one. The decrease you see in FP32 is presumably because of NMS.
You can get a significant speedup by disabling NMS. You have to set `nms_threshold=0` in all `[yolo]` blocks in `yolov3.cfg`. Check this for information on doing it from the code.
xjsxujingsong commentedJun 4, 2020
Thanks for your quick reply.
When I copy the code to C++, it cannot find region layer
auto layer = net.getLayer(layerId).dynamicCast<cv::dnn::RegionLayer>();
My OpenCV is 4.3.0 with cuda build. which version is your code using?
@xjsxujingsong You need the master branch (the `nmsThreshold` field in the region layer was exposed recently). You can use YOLOv4 if you use the master branch. YOLOv4 is nearly as fast as YOLOv3 and gives higher performance.
xjsxujingsong commentedJun 5, 2020
Thanks. I am rebuilding the latest OpenCV.
I found your page through https://github.com/AlexeyAB/darknet
From the table there, take the last line for example
OpenCV FP16, FPS : 100
OpenCV FP16 batch=4, FPS: 133
It looks like batching won't speed things up much. Was this measured using this benchmark.cpp, or did you disable the NMS?
YashasSamaga commentedJun 5, 2020 • edited
Loading Uh oh!
There was an error while loading.Please reload this page.
edited
Uh oh!
There was an error while loading.Please reload this page.
@xjsxujingsong I updated benchmark.cpp to disable NMS yesterday. The table there used the old `benchmark.cpp` code, with NMS disabled manually by setting `nms_threshold=0` in all `[yolo]` blocks in `yolov4.cfg`. The new `benchmark.cpp` does the same in the code.
albertchristianto commentedOct 8, 2020 • edited
Loading Uh oh!
There was an error while loading.Please reload this page.
edited
Uh oh!
There was an error while loading.Please reload this page.
Hi @YashasSamaga, I want to implement YOLO detection on multiple cameras.
So, my strategy right now is spawning YOLO model for each camera which is not a good idea since the used GPU memory will become very big. (if 1 YOLO model need ~2GB, then a RTX 2080 Ti can only handle 4-5 cameras at once).
I have an idea of utilizing batch size to solve this problem.
The statistics reported are for the `net.forward()` call. You have got 70.73ms for a batch size of four. That's the total time for the inference. To calculate FPS, first divide 70.73 by 4, which comes to around ~17.68ms per image; that is slightly faster than a batch size of one. The decrease you see in FP32 is presumably because of NMS.
You can get a significant speedup by disabling NMS. You have to set `nms_threshold=0` in all `[yolo]` blocks in `yolov3.cfg`. Check this for information on doing it from the code.
Based on your answer above, if I want to use batching to handle multiple cameras, I have to consider how many cameras will be handled by one YOLO model, so that the FPS won't drop.
Have I come to the right conclusion? Or do you have any suggestions regarding this solution?
best regards,
Albert Christianto
YashasSamaga commentedOct 8, 2020 • edited
Loading Uh oh!
There was an error while loading.Please reload this page.
edited
Uh oh!
There was an error while loading.Please reload this page.
I would recommend having a fixed number of networks (one network on each thread) — you can have a constant parameter, say `NUM_NETWORK_THREADS`, that you can tune by trial and error. You can have a task queue which your cameras will populate with work items. One of the unused networks will pick up a pending work item and dump its outputs to an output queue. If the queue has multiple items pending, you can do batch inference to improve throughput.
The most important point you have to remember is that both latency and throughput increase with batch size. Small batch sizes will report results faster but overall throughput will be lower (time per image is higher); large batch sizes will take longer to report results but overall throughput will be higher (time per image is lower). You need to balance the two factors for optimal performance (and maybe power consumption).
albertchristianto commentedOct 10, 2020
@YashasSamaga.
Ok, I get the point.
Thank you for your reply and your insight. It is very helpful for me.
best regards,
Albert Christianto