Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit e0cfd87

Browse files
committed
fix: Disaggregate serving malfunction when using attention dp
Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>
1 parent 7137cc8 · commit e0cfd87

File tree

4 files changed

+15
-4
lines changed

4 files changed

+15
-4
lines changed

‎cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp‎

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -522,7 +522,9 @@ void CacheTransceiver::checkGenTransferStatus(std::optional<int> const& atLeastR
522522
// Gather the kv cache transfer time from all workers and update to leader rank
523523
if (!common::getEnvKVCacheTransferOutputPath().empty())
524524
{
525-
updateKVCacheTransferBW(*mMpiGroupComm, it->first);
525+
auto syncComm
526+
= mCacheState->getParallelConfig().mEnableAttentionDP ? mMpiGroupDataComm.get() : mMpiGroupComm;
527+
updateKVCacheTransferBW(*syncComm, it->first);
526528
}
527529
TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(),
528530
"**** it->first->mRequestId: %ld, context request ID: %ld ******** get feature ***",

‎cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp‎

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,25 +185,28 @@ void MLACacheFormatter::formatOutput(LlmRequest const& llmRequest,
185185
NVTX3_SCOPED_RANGE(sendBufferFun);
186186

187187
TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
188+
auto startTime = std::chrono::steady_clock::now();
188189
auto cacheIdx = processIdx % pPDomainSize;
190+
size_t size;
189191
if (cacheIdx < bufferCoverTargetNum)
190192
{
191-
193+
size = outputSplitCaches.at(cacheIdx)->getSizeInBytes();
192194
TransferHelper::sendBuffer(*connections.at(processIdx), *outputSplitCaches.at(cacheIdx), reqId);
193195
}
194196
else if (bufferCoverTargetNum > 0)
195197
{
196198
// copy buffer allocated by cudaMallocAsync to buffer allocated by cudaMalloc before sending
197199
auto sendBufferIdx = cacheIdx % bufferCoverTargetNum;
200+
size = outputSplitCaches.at(sendBufferIdx)->getSizeInBytes();
198201
bufferManager.copy(*outputSplitCaches.at(cacheIdx), *outputSplitCaches.at(sendBufferIdx));
199202
bufferManager.getStream().synchronize();
200203
TransferHelper::sendBuffer(*connections.at(processIdx), *outputSplitCaches.at(sendBufferIdx), reqId);
201204
}
202205
else
203206
{
204-
205207
// bufferCoverTargetNum=0, mSendBuffer size < one outputSlice
206208
// send multiple times
209+
size = targetBufferSize;
207210
size_t remainSendSize = targetBufferSize;
208211
while (remainSendSize > 0)
209212
{
@@ -220,6 +223,10 @@ void MLACacheFormatter::formatOutput(LlmRequest const& llmRequest,
220223
remainSendSize -= sendSize;
221224
}
222225
}
226+
auto endTime = std::chrono::steady_clock::now();
227+
double cacheTransferTime
228+
= std::max(0.0, std::chrono::duration<double, std::milli>(endTime - startTime).count());
229+
kvCacheMeasureHelper.appendKVCacheTransfer(llmRequest.mRequestId, cacheTransferTime, size);
223230
};
224231

225232
if (connections.size() >1)

‎cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.h‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ class MLACacheFormatter final : public IOFormatter
7979
private:
8080
BaseKVCacheManager* mCacheManager{};
8181
CacheTransBufferManager* mCacheTransBufferManager;
82+
KvCacheMeasureHelper kvCacheMeasureHelper{common::getEnvKVCacheTransferOutputPath()};
8283
};
8384

8485
}// namespace tensorrt_llm::batch_manager::kv_cache_manager

‎tensorrt_llm/_torch/pyexecutor/py_executor.py‎

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -895,7 +895,8 @@ def _executor_loop(self):
895895

896896
finished_requests = []
897897

898-
if scheduled_batch.batch_size > 0:
898+
if scheduled_batch.batch_size > 0 or (
899+
self.enable_attention_dp and self.dist.tp_size > 1):
899900
if self.kv_cache_transceiver:
900901
# For generation requests which have completed KV cache transfer
901902
self._prepare_disagg_gen_transmission_complete(

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp