Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings
This repository was archived by the owner on Mar 21, 2024. It is now read-only.
/cubPublic archive

Commit722e3ca

Browse files
authored
Merge pull request#331 from senior-zero/main-feature/github/cub_adjacent_difference
Port adjacent difference into CUB
2 parentsec07c16 +6d052db commit722e3ca

12 files changed

+3895
-212
lines changed
Lines changed: 254 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,254 @@
1+
/******************************************************************************
2+
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* Redistribution and use in source and binary forms, with or without
5+
* modification, are permitted provided that the following conditions are met:
6+
* * Redistributions of source code must retain the above copyright
7+
* notice, this list of conditions and the following disclaimer.
8+
* * Redistributions in binary form must reproduce the above copyright
9+
* notice, this list of conditions and the following disclaimer in the
10+
* documentation and/or other materials provided with the distribution.
11+
* * Neither the name of the NVIDIA CORPORATION nor the
12+
* names of its contributors may be used to endorse or promote products
13+
* derived from this software without specific prior written permission.
14+
*
15+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16+
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17+
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18+
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19+
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20+
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21+
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22+
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24+
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
*
26+
******************************************************************************/
27+
28+
#pragma once
29+
30+
#include"../config.cuh"
31+
#include"../util_type.cuh"
32+
#include"../util_namespace.cuh"
33+
#include"../block/block_load.cuh"
34+
#include"../block/block_store.cuh"
35+
#include"../block/block_adjacent_difference.cuh"
36+
37+
#include<thrust/system/cuda/detail/core/util.h>
38+
39+
40+
CUB_NAMESPACE_BEGIN
41+
42+
43+
template<
44+
int _BLOCK_THREADS,
45+
int _ITEMS_PER_THREAD =1,
46+
cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT,
47+
cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_LDG,
48+
cub::BlockStoreAlgorithm _STORE_ALGORITHM = cub::BLOCK_STORE_DIRECT>
49+
structAgentAdjacentDifferencePolicy
50+
{
51+
staticconstexprint BLOCK_THREADS = _BLOCK_THREADS;
52+
staticconstexprint ITEMS_PER_THREAD = _ITEMS_PER_THREAD;
53+
staticconstexprint ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD;
54+
55+
staticconstexpr cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
56+
staticconstexpr cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
57+
staticconstexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
58+
};
59+
60+
template<typename Policy,
61+
typename InputIteratorT,
62+
typename OutputIteratorT,
63+
typename DifferenceOpT,
64+
typename OffsetT,
65+
typename InputT,
66+
typename OutputT,
67+
bool InPlace,
68+
bool ReadLeft>
69+
structAgentDifference
70+
{
71+
using LoadIt =typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, InputIteratorT>::type;
72+
73+
using BlockLoad =typename cub::BlockLoadType<Policy, LoadIt>::type;
74+
using BlockStore =typename cub::BlockStoreType<Policy, OutputIteratorT, OutputT>::type;
75+
76+
using BlockAdjacentDifferenceT =
77+
cub::BlockAdjacentDifference<InputT, Policy::BLOCK_THREADS>;
78+
79+
union _TempStorage
80+
{
81+
typename BlockLoad::TempStorage load;
82+
typename BlockStore::TempStorage store;
83+
typename BlockAdjacentDifferenceT::TempStorage adjacent_difference;
84+
};
85+
86+
/// Alias wrapper allowing storage to be unioned
87+
structTempStorage : Uninitialized<_TempStorage> {};
88+
89+
staticconstexprint BLOCK_THREADS = Policy::BLOCK_THREADS;
90+
staticconstexprint ITEMS_PER_THREAD = Policy::ITEMS_PER_THREAD;
91+
staticconstexprint ITEMS_PER_TILE = Policy::ITEMS_PER_TILE;
92+
staticconstexprint SHARED_MEMORY_SIZE =static_cast<int>(sizeof(TempStorage));
93+
94+
_TempStorage &temp_storage;
95+
InputIteratorT input_it;
96+
LoadIt load_it;
97+
InputT *first_tile_previous;
98+
OutputIteratorT result;
99+
DifferenceOpT difference_op;
100+
OffsetT num_items;
101+
102+
__device____forceinline__AgentDifference(TempStorage &temp_storage,
103+
InputIteratorT input_it,
104+
InputT *first_tile_previous,
105+
OutputIteratorT result,
106+
DifferenceOpT difference_op,
107+
OffsetT num_items)
108+
: temp_storage(temp_storage.Alias())
109+
, input_it(input_it)
110+
, load_it(
111+
THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(Policy(),
112+
input_it))
113+
, first_tile_previous(first_tile_previous)
114+
, result(result)
115+
, difference_op(difference_op)
116+
, num_items(num_items)
117+
{}
118+
119+
template<bool IS_LAST_TILE,
120+
bool IS_FIRST_TILE>
121+
__device____forceinline__voidconsume_tile_impl(int num_remaining,
122+
int tile_idx,
123+
OffsetT tile_base)
124+
{
125+
InputT input[ITEMS_PER_THREAD];
126+
OutputT output[ITEMS_PER_THREAD];
127+
128+
if (IS_LAST_TILE)
129+
{
130+
// Fill last elements with the first element
131+
// because collectives are not suffix guarded
132+
BlockLoad(temp_storage.load)
133+
.Load(load_it + tile_base, input, num_remaining, *(load_it + tile_base));
134+
}
135+
else
136+
{
137+
BlockLoad(temp_storage.load).Load(load_it + tile_base, input);
138+
}
139+
140+
CTA_SYNC();
141+
142+
if (ReadLeft)
143+
{
144+
if (IS_FIRST_TILE)
145+
{
146+
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
147+
.SubtractLeft(input, output, difference_op);
148+
}
149+
else
150+
{
151+
InputT tile_prev_input = InPlace ? first_tile_previous[tile_idx]
152+
: *(input_it + tile_base -1);
153+
154+
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
155+
.SubtractLeft(input, output, difference_op, tile_prev_input);
156+
}
157+
}
158+
else
159+
{
160+
if (IS_LAST_TILE)
161+
{
162+
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
163+
.SubtractRightPartialTile(input, output, difference_op, num_remaining);
164+
}
165+
else
166+
{
167+
InputT tile_next_input = InPlace ? first_tile_previous[tile_idx]
168+
: *(input_it + tile_base + ITEMS_PER_TILE);
169+
170+
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
171+
.SubtractRight(input, output, difference_op, tile_next_input);
172+
}
173+
}
174+
175+
CTA_SYNC();
176+
177+
if (IS_LAST_TILE)
178+
{
179+
BlockStore(temp_storage.store)
180+
.Store(result + tile_base, output, num_remaining);
181+
}
182+
else
183+
{
184+
BlockStore(temp_storage.store).Store(result + tile_base, output);
185+
}
186+
}
187+
188+
template<bool IS_LAST_TILE>
189+
__device____forceinline__voidconsume_tile(int num_remaining,
190+
int tile_idx,
191+
OffsetT tile_base)
192+
{
193+
if (tile_idx ==0)
194+
{
195+
consume_tile_impl<IS_LAST_TILE,true>(num_remaining,
196+
tile_idx,
197+
tile_base);
198+
}
199+
else
200+
{
201+
consume_tile_impl<IS_LAST_TILE,false>(num_remaining,
202+
tile_idx,
203+
tile_base);
204+
}
205+
}
206+
207+
__device____forceinline__voidProcess(int tile_idx,
208+
OffsetT tile_base)
209+
{
210+
OffsetT num_remaining = num_items - tile_base;
211+
212+
if (num_remaining > ITEMS_PER_TILE)// not a last tile
213+
{
214+
consume_tile<false>(num_remaining, tile_idx, tile_base);
215+
}
216+
else
217+
{
218+
consume_tile<true>(num_remaining, tile_idx, tile_base);
219+
}
220+
}
221+
};
222+
223+
template<typename InputIteratorT,
224+
typename InputT,
225+
typename OffsetT,
226+
bool ReadLeft>
227+
structAgentDifferenceInit
228+
{
229+
staticconstexprint BLOCK_THREADS =128;
230+
231+
static__device____forceinline__voidProcess(int tile_idx,
232+
InputIteratorT first,
233+
InputT *result,
234+
OffsetT num_tiles,
235+
int items_per_tile)
236+
{
237+
OffsetT tile_base =static_cast<OffsetT>(tile_idx) * items_per_tile;
238+
239+
if (tile_base >0 && tile_idx < num_tiles)
240+
{
241+
if (ReadLeft)
242+
{
243+
result[tile_idx] = first[tile_base -1];
244+
}
245+
else
246+
{
247+
result[tile_idx -1] = first[tile_base];
248+
}
249+
}
250+
}
251+
};
252+
253+
254+
CUB_NAMESPACE_END

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp