Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit d87beea

Browse files
committed
Enable querying the build and runtime NCCL versions
ghstack-source-id: 855f55d Pull Request resolved: #156305
1 parent f45f483 commit d87beea

File tree

4 files changed

+35
-13
lines changed

4 files changed

+35
-13
lines changed

‎torch/_C/_distributed_c10d.pyi‎

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -643,6 +643,10 @@ class ProcessGroupNCCL(Backend):
643643
def uid(self) -> int: ...
644644
@property
645645
def options(self) -> Options: ...  # type: ignore[override]
646+
@staticmethod
647+
def get_build_nccl_version(self) -> tuple[int, int, int]: ...
648+
@staticmethod
649+
def get_runtime_nccl_version(self) -> tuple[int, int, int]: ...
646650

647651
class ProcessGroupUCC(Backend):
648652
def __init__(

‎torch/csrc/distributed/c10d/NCCLUtils.cpp‎

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -434,21 +434,11 @@ std::unordered_map<std::string, std::string> NCCLComm::ncclCommDump() {
434434

435435
std::string getNcclVersion() {
436436
static std::string versionString = []() {
437-
int version =0;
437+
auto [ncclMajor, ncclMinor, ncclPatch] =getNcclVersionTuple();
438438
std::string versionString;
439-
ncclResult_t status =ncclGetVersion(&version);
440-
// can't compute the version if call did not return successfully or version
441-
// code < 100 (corresponding to 0.1.0)
442-
if (status != ncclSuccess || version <100) {
439+
if (ncclMajor ==0 && ncclMinor ==0 && ncclPatch ==0) {
443440
versionString ="Unknown NCCL version";
444441
}else {
445-
// NCCL changed version coding starting 2.9
446-
const int majorBase = version < 2900 ? 1000 : 10000;
447-
const int minorBase = 100;
448-
auto ncclMajor = version / majorBase;
449-
auto ncclMinor = (version % majorBase) / minorBase;
450-
auto ncclPatch =
451-
version % (ncclMajor * majorBase + ncclMinor * minorBase);
452442
versionString =std::to_string(ncclMajor) +"." +
453443
std::to_string(ncclMinor) +"." +std::to_string(ncclPatch);
454444
#ifdef NCCL_SUFFIX
@@ -464,6 +454,25 @@ std::string getNcclVersion() {
464454
return versionString;
465455
}
466456

457+
std::tuple<int, int, int> getNcclVersionTuple() {
458+
static std::tuple<int,int,int> versionTuple = []() {
459+
int version =getNcclVersionNumber();
460+
// can't compute the version if call did not return successfully or version
461+
// code < 100 (corresponding to 0.1.0)
462+
if (version <100) {
463+
return std::make_tuple(0, 0, 0);
464+
}
465+
// NCCL changed version coding starting 2.9
466+
const int majorBase = version < 2900 ? 1000 : 10000;
467+
const int minorBase = 100;
468+
auto ncclMajor = version / majorBase;
469+
auto ncclMinor = (version % majorBase) / minorBase;
470+
auto ncclPatch = version % minorBase;
471+
return std::make_tuple(ncclMajor, ncclMinor, ncclPatch);
472+
}();
473+
return versionTuple;
474+
}
475+
467476
int getNcclVersionNumber() {
468477
static int version = []() {
469478
int version =0;

‎torch/csrc/distributed/c10d/NCCLUtils.hpp‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,7 @@ static std::map<at::ScalarType, ncclDataType_t> ncclDataType = {
229229

230230
TORCH_API size_t hashTensors(const std::vector<at::Tensor>& tensors);
231231
TORCH_API std::string getNcclVersion();
232+
TORCH_API std::tuple<int, int, int> getNcclVersionTuple();
232233
TORCH_API int getNcclVersionNumber();
233234
TORCH_API std::string ncclGetErrorWithVersion(ncclResult_t error);
234235
int nccl_nonblocking_timeout();

‎torch/csrc/distributed/c10d/init.cpp‎

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3213,7 +3213,15 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`).
32133213
self->setEnableNanCheck(enable_nan_check);
32143214
},
32153215
py::arg("enable_nan_check"),
3216-
py::call_guard<py::gil_scoped_release>());
3216+
py::call_guard<py::gil_scoped_release>())
3217+
.def_static(
3218+
"get_build_nccl_version",
3219+
[] {
3220+
return std::make_tuple(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH);
3221+
})
3222+
.def_static("get_runtime_nccl_version", [] {
3223+
return ::c10d::getNcclVersionTuple();
3224+
});
32173225

32183226
module.def(
32193227
"_get_intra_node_comm_usage_counter",

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp