Commit 211abfa

JanuszL

authored and

stiepan

committed

Fix data paths in TL3 short tests (#5845)

- unifies data paths in TL3 short tests with other tests
- improves the way the remote file system is detected in the tests

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>

1 parent 6f1d20c · commit 211abfa · Copy full SHA for 211abfa

File tree

5 files changed

+25

-34

lines changed

qa
- TL1_separate_executor
  - test_nofw.sh
- TL2_FW_iterators_perf
  - test_pytorch.sh
- TL2_RN50_data_perf
  - test.sh
- TL3_RN50_short
  - test_pytorch.sh
  - test_tensorflow.sh

5 files changed

+25

-34

lines changed

`‎qa/TL1_separate_executor/test_nofw.sh`

Lines changed: 3 additions & 9 deletions

Original file line number	Diff line number	Diff line change
`@@ -9,17 +9,11 @@ do_once() {`
`9`	`9`	`}`
`10`	`10`
`11`	`11`	`test_body() {`
`12`		- start=`date +%s`
`13`		`- (sleep 10&& pkill -HUP ls&& true)&`
`14`		`- (ls /data/imagenet/train-jpeg> /dev/null&& pkill -HUP sleep)&`
`15`		`-wait`
`16`		- end=`date +%s`
`17`		`- runtime=$((end-start))`
`18`		`-echo"Data access time:$runtime seconds"`
`19`		`-if [$runtime-gt 3 ];then`
`20`		`-echo"Data access time is greater than 3 seconds, skipping the test"`
	`12`	`+if [$(stat /data/imagenet/train-jpeg --format="%T" -f)!="ext2/ext3" ];then`
	`13`	`+echo"Not available locally, skipping the test"`
`21`	`14`	`return 0`
`22`	`15`	`fi`
	`16`	`+`
`23`	`17`	`python test_RN50_data_pipeline.py --gpus${NUM_GPUS} -b 256 --workers 3 --separate_queue \`
`24`	`18`	`--cpu_size 2 --gpu_size 2 --fp16 --nhwc`
`25`	`19`	`python test_RN50_data_pipeline.py --gpus${NUM_GPUS} -b 256 --workers 3 --separate_queue \`

`‎qa/TL2_FW_iterators_perf/test_pytorch.sh`

Lines changed: 3 additions & 9 deletions

Original file line number	Diff line number	Diff line change
`@@ -13,17 +13,11 @@ test_body() {`
`13`	`13`	`python test_RN50_data_fw_iterators.py --framework${fw} --gpus${NUM_GPUS} -b 13 \`
`14`	`14`	`--workers 3 --prefetch 2 --epochs 3`
`15`	`15`	`done`
`16`		- start=`date +%s`
`17`		`- (sleep 10&& pkill -HUP ls&& true)&`
`18`		`- (ls /data/imagenet/train-jpeg> /dev/null&& pkill -HUP sleep)&`
`19`		`-wait`
`20`		- end=`date +%s`
`21`		`- runtime=$((end-start))`
`22`		`-echo"Data access time:$runtime seconds"`
`23`		`-if [$runtime-gt 3 ];then`
`24`		`-echo"Data access time is greater than 3 seconds, skipping the test"`
	`16`	`+if [$(stat /data/imagenet/train-jpeg --format="%T" -f)!="ext2/ext3" ];then`
	`17`	`+echo"Not available locally, skipping the test"`
`25`	`18`	`return 0`
`26`	`19`	`fi`
	`20`	`+`
`27`	`21`	`torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init fork --benchmark_iters 500 --test_pipes parallel`
`28`	`22`	`torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init spawn --benchmark_iters 500 --test_pipes parallel`
`29`	`23`	`torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --benchmark_iters 500 --test_pipes file_reader`

`‎qa/TL2_RN50_data_perf/test.sh`

Lines changed: 3 additions & 10 deletions

Original file line number	Diff line number	Diff line change
`@@ -8,18 +8,11 @@ do_once() {`
`8`	`8`	`}`
`9`	`9`
`10`	`10`	`test_body() {`
`11`		- start=`date +%s`
`12`		`- (sleep 10&& pkill -HUP ls&& true)&`
`13`		`- (ls /data/imagenet/train-jpeg> /dev/null&& pkill -HUP sleep)&`
`14`		`-wait`
`15`		- end=`date +%s`
`16`		`- runtime=$((end-start))`
`17`		`-echo"Data access time:$runtime seconds"`
`18`		`-if [$runtime-gt 3 ];then`
`19`		`-echo"Data access time is greater than 3 seconds, skipping the test"`
	`11`	`+if [$(stat /data/imagenet/train-jpeg --format="%T" -f)!="ext2/ext3" ];then`
	`12`	`+echo"Not available locally, skipping the test"`
`20`	`13`	`return 0`
`21`	`14`	`fi`
`22`		`-# test code`
	`15`	`+`
`23`	`16`	`python test_RN50_data_pipeline.py --gpus${NUM_GPUS} -b 256 --workers 3 --prefetch 2 --decoder_type"legacy"`
`24`	`17`	`python test_RN50_data_pipeline.py --gpus${NUM_GPUS} -b 256 --workers 3 --prefetch 2 --decoder_type"experimental"`
`25`	`18`	`python test_RN50_data_pipeline.py --gpus${NUM_GPUS} -b 16 --workers 3 --prefetch 11 --decoder_type"legacy"`

`‎qa/TL3_RN50_short/test_pytorch.sh`

Lines changed: 9 additions & 4 deletions

Original file line number	Diff line number	Diff line change
`@@ -13,10 +13,10 @@ cd /opt/dali/docs/examples/use_cases/pytorch/resnet50`
`13`	`13`	`NUM_GPUS=$(nvidia-smi -L\| wc -l)`
`14`	`14`
`15`	`15`	`if [!-d"val" ];then`
`16`		`- ln -sf /data_raid/imagenet/val-jpeg/ val`
	`16`	`+ ln -sf /data/imagenet/val-jpeg/ val`
`17`	`17`	`fi`
`18`	`18`	`if [!-d"train" ];then`
`19`		`- ln -sf /data_raid/imagenet/train-jpeg/ train`
	`19`	`+ ln -sf /data/imagenet/train-jpeg/ train`
`20`	`20`	`fi`
`21`	`21`
`22`	`22`	`LOG=dali.log`
`@@ -26,7 +26,7 @@ SECONDS=0`
`26`	`26`	`# turn off SHARP to avoid NCCL errors`
`27`	`27`	`export NCCL_NVLS_ENABLE=0`
`28`	`28`
`29`		`-torchrun --nproc_per_node=${NUM_GPUS} main.py -a resnet50 --b 256 --loss-scale 128.0 --workers 8 --lr=0.4 --fp16-mode --epochs5 ./2>&1\| tee$LOG`
	`29`	`+torchrun --nproc_per_node=${NUM_GPUS} main.py -a resnet50 --b 256 --loss-scale 128.0 --workers 8 --lr=0.4 --fp16-mode --epochs2 ./2>&1\| tee$LOG`
`30`	`30`
`31`	`31`	`RET=${PIPESTATUS[0]}`
`32`	`32`	`echo"Training ran in$SECONDS seconds"`
`@@ -57,7 +57,12 @@ printf "TOP-1 Accuracy: %.2f%% (expect at least %f%%) %s\n" $TOP1 $MIN_TOP1 $TOP`
`57`	`57`	`printf"TOP-5 Accuracy: %.2f%% (expect at least %f%%) %s\n"$TOP5$MIN_TOP5$TOP5_RESULT`
`58`	`58`	`printf"Average perf: %.2f (expect at least %f) samples/sec %s\n"$PERF$MIN_PERF$PERF_RESULT`
`59`	`59`
`60`		`-if [["$TOP1_RESULT"=="OK"&&"$TOP5_RESULT"=="OK"&&"$PERF_RESULT"=="OK" ]];then`
	`60`	`+# check perf only if data is locally available`
	`61`	`+if [$(stat /data/imagenet/val-jpeg --format="%T" -f)=="ext2/ext3" ]&& ["$PERF_RESULT"!="OK" ];then`
	`62`	`+ CAN_AND_EXIT 4`
	`63`	`+fi`
	`64`	`+`
	`65`	`+if [["$TOP1_RESULT"=="OK"&&"$TOP5_RESULT"=="OK" ]];then`
`61`	`66`	`CLEAN_AND_EXIT 0`
`62`	`67`	`fi`
`63`	`68`

`‎qa/TL3_RN50_short/test_tensorflow.sh`

Lines changed: 7 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -6,7 +6,7 @@ mkdir -p idx-files/`
`6`	`6`
`7`	`7`	`NUM_GPUS=$(nvidia-smi -L\| wc -l)`
`8`	`8`
`9`		`-DATA_SET_DIR=/data_raid/imagenet/train-val-tfrecord`
	`9`	`+DATA_SET_DIR=/data/imagenet/train-val-tfrecord`
`10`	`10`	`forfilein$(ls$DATA_SET_DIR/-of-);`
`11`	`11`	`do`
`12`	`12`	`file=$(basename${file})`
`@@ -69,7 +69,12 @@ printf "TOP-1 Accuracy: %.2f%% (expect at least %f%%) %s\n" $TOP1 $MIN_TOP1 $TOP`
`69`	`69`	`printf"TOP-5 Accuracy: %.2f%% (expect at least %f%%) %s\n"$TOP5$MIN_TOP5$TOP5_RESULT`
`70`	`70`	`printf"mean speed %.2f (expect at least %f) samples/sec %s\n"$PERF$MIN_PERF$PERF_RESULT`
`71`	`71`
`72`		`-if [["$TOP1_RESULT"=="OK"&&"$TOP5_RESULT"=="OK"&&"$PERF_RESULT"=="OK" ]];then`
	`72`	`+# check perf only if data is locally available`
	`73`	`+if [$(stat /data/imagenet/train-val-tfrecord --format="%T" -f)=="ext2/ext3" ]&& ["$PERF_RESULT"!="OK" ];then`
	`74`	`+ CAN_AND_EXIT 4`
	`75`	`+fi`
	`76`	`+`
	`77`	`+if [["$TOP1_RESULT"=="OK"&&"$TOP5_RESULT"=="OK" ]];then`
`73`	`78`	`CLEAN_AND_EXIT 0`
`74`	`79`	`fi`
`75`	`80`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 211abfa

File tree

5 files changed

5 files changed

`‎qa/TL1_separate_executor/test_nofw.sh`

`‎qa/TL2_FW_iterators_perf/test_pytorch.sh`

`‎qa/TL2_RN50_data_perf/test.sh`

`‎qa/TL3_RN50_short/test_pytorch.sh`

`‎qa/TL3_RN50_short/test_tensorflow.sh`

0 commit comments