Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 211abfa

Browse files
JanuszLstiepan
authored and committed
Fix data paths in TL3 short tests (#5845)
- unifies data paths in TL3 short tests with other tests
- improves the way the remote file system is detected in the tests

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
1 parent 6f1d20c, commit 211abfa

File tree

5 files changed

+25
-34
lines changed

5 files changed

+25
-34
lines changed

‎qa/TL1_separate_executor/test_nofw.sh

Lines changed: 3 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -9,17 +9,11 @@ do_once() {
99
}
1010

1111
test_body() {
12-
start=`date +%s`
13-
(sleep 10&& pkill -HUP ls&& true)&
14-
(ls /data/imagenet/train-jpeg> /dev/null&& pkill -HUP sleep)&
15-
wait
16-
end=`date +%s`
17-
runtime=$((end-start))
18-
echo"Data access time:$runtime seconds"
19-
if [$runtime-gt 3 ];then
20-
echo"Data access time is greater than 3 seconds, skipping the test"
12+
if [$(stat /data/imagenet/train-jpeg --format="%T" -f)!="ext2/ext3" ];then
13+
echo"Not available locally, skipping the test"
2114
return 0
2215
fi
16+
2317
python test_RN50_data_pipeline.py --gpus${NUM_GPUS} -b 256 --workers 3 --separate_queue \
2418
--cpu_size 2 --gpu_size 2 --fp16 --nhwc
2519
python test_RN50_data_pipeline.py --gpus${NUM_GPUS} -b 256 --workers 3 --separate_queue \

‎qa/TL2_FW_iterators_perf/test_pytorch.sh

Lines changed: 3 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -13,17 +13,11 @@ test_body() {
1313
python test_RN50_data_fw_iterators.py --framework${fw} --gpus${NUM_GPUS} -b 13 \
1414
--workers 3 --prefetch 2 --epochs 3
1515
done
16-
start=`date +%s`
17-
(sleep 10&& pkill -HUP ls&& true)&
18-
(ls /data/imagenet/train-jpeg> /dev/null&& pkill -HUP sleep)&
19-
wait
20-
end=`date +%s`
21-
runtime=$((end-start))
22-
echo"Data access time:$runtime seconds"
23-
if [$runtime-gt 3 ];then
24-
echo"Data access time is greater than 3 seconds, skipping the test"
16+
if [$(stat /data/imagenet/train-jpeg --format="%T" -f)!="ext2/ext3" ];then
17+
echo"Not available locally, skipping the test"
2518
return 0
2619
fi
20+
2721
torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init fork --benchmark_iters 500 --test_pipes parallel
2822
torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init spawn --benchmark_iters 500 --test_pipes parallel
2923
torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --benchmark_iters 500 --test_pipes file_reader

‎qa/TL2_RN50_data_perf/test.sh

Lines changed: 3 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -8,18 +8,11 @@ do_once() {
88
}
99

1010
test_body() {
11-
start=`date +%s`
12-
(sleep 10&& pkill -HUP ls&& true)&
13-
(ls /data/imagenet/train-jpeg> /dev/null&& pkill -HUP sleep)&
14-
wait
15-
end=`date +%s`
16-
runtime=$((end-start))
17-
echo"Data access time:$runtime seconds"
18-
if [$runtime-gt 3 ];then
19-
echo"Data access time is greater than 3 seconds, skipping the test"
11+
if [$(stat /data/imagenet/train-jpeg --format="%T" -f)!="ext2/ext3" ];then
12+
echo"Not available locally, skipping the test"
2013
return 0
2114
fi
22-
# test code
15+
2316
python test_RN50_data_pipeline.py --gpus${NUM_GPUS} -b 256 --workers 3 --prefetch 2 --decoder_type"legacy"
2417
python test_RN50_data_pipeline.py --gpus${NUM_GPUS} -b 256 --workers 3 --prefetch 2 --decoder_type"experimental"
2518
python test_RN50_data_pipeline.py --gpus${NUM_GPUS} -b 16 --workers 3 --prefetch 11 --decoder_type"legacy"

‎qa/TL3_RN50_short/test_pytorch.sh

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -13,10 +13,10 @@ cd /opt/dali/docs/examples/use_cases/pytorch/resnet50
1313
NUM_GPUS=$(nvidia-smi -L| wc -l)
1414

1515
if [!-d"val" ];then
16-
ln -sf /data_raid/imagenet/val-jpeg/ val
16+
ln -sf /data/imagenet/val-jpeg/ val
1717
fi
1818
if [!-d"train" ];then
19-
ln -sf /data_raid/imagenet/train-jpeg/ train
19+
ln -sf /data/imagenet/train-jpeg/ train
2020
fi
2121

2222
LOG=dali.log
@@ -26,7 +26,7 @@ SECONDS=0
2626
# turn off SHARP to avoid NCCL errors
2727
export NCCL_NVLS_ENABLE=0
2828

29-
torchrun --nproc_per_node=${NUM_GPUS} main.py -a resnet50 --b 256 --loss-scale 128.0 --workers 8 --lr=0.4 --fp16-mode --epochs5 ./2>&1| tee$LOG
29+
torchrun --nproc_per_node=${NUM_GPUS} main.py -a resnet50 --b 256 --loss-scale 128.0 --workers 8 --lr=0.4 --fp16-mode --epochs2 ./2>&1| tee$LOG
3030

3131
RET=${PIPESTATUS[0]}
3232
echo"Training ran in$SECONDS seconds"
@@ -57,7 +57,12 @@ printf "TOP-1 Accuracy: %.2f%% (expect at least %f%%) %s\n" $TOP1 $MIN_TOP1 $TOP
5757
printf"TOP-5 Accuracy: %.2f%% (expect at least %f%%) %s\n"$TOP5$MIN_TOP5$TOP5_RESULT
5858
printf"Average perf: %.2f (expect at least %f) samples/sec %s\n"$PERF$MIN_PERF$PERF_RESULT
5959

60-
if [["$TOP1_RESULT"=="OK"&&"$TOP5_RESULT"=="OK"&&"$PERF_RESULT"=="OK" ]];then
60+
# check perf only if data is locally available
61+
if [$(stat /data/imagenet/val-jpeg --format="%T" -f)=="ext2/ext3" ]&& ["$PERF_RESULT"!="OK" ];then
62+
CAN_AND_EXIT 4
63+
fi
64+
65+
if [["$TOP1_RESULT"=="OK"&&"$TOP5_RESULT"=="OK" ]];then
6166
CLEAN_AND_EXIT 0
6267
fi
6368

‎qa/TL3_RN50_short/test_tensorflow.sh

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ mkdir -p idx-files/
66

77
NUM_GPUS=$(nvidia-smi -L| wc -l)
88

9-
DATA_SET_DIR=/data_raid/imagenet/train-val-tfrecord
9+
DATA_SET_DIR=/data/imagenet/train-val-tfrecord
1010
forfilein$(ls$DATA_SET_DIR/*-of-*);
1111
do
1212
file=$(basename${file})
@@ -69,7 +69,12 @@ printf "TOP-1 Accuracy: %.2f%% (expect at least %f%%) %s\n" $TOP1 $MIN_TOP1 $TOP
6969
printf"TOP-5 Accuracy: %.2f%% (expect at least %f%%) %s\n"$TOP5$MIN_TOP5$TOP5_RESULT
7070
printf"mean speed %.2f (expect at least %f) samples/sec %s\n"$PERF$MIN_PERF$PERF_RESULT
7171

72-
if [["$TOP1_RESULT"=="OK"&&"$TOP5_RESULT"=="OK"&&"$PERF_RESULT"=="OK" ]];then
72+
# check perf only if data is locally available
73+
if [$(stat /data/imagenet/train-val-tfrecord --format="%T" -f)=="ext2/ext3" ]&& ["$PERF_RESULT"!="OK" ];then
74+
CAN_AND_EXIT 4
75+
fi
76+
77+
if [["$TOP1_RESULT"=="OK"&&"$TOP5_RESULT"=="OK" ]];then
7378
CLEAN_AND_EXIT 0
7479
fi
7580

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp