@@ -99,10 +99,10 @@ int main() {
9999 };
100100
101101for (auto & s : scopes) {
102- out <<" static inline__device__ void __cuda_membar_" << s.first <<" () { asm volatile(\" membar" << membar_scopes[s.first ] <<" ;\" :::\" memory\" ); }\n " ;
102+ out <<" static inline_LIBCUDACXX_DEVICE void __cuda_membar_" << s.first <<" () { asm volatile(\" membar" << membar_scopes[s.first ] <<" ;\" :::\" memory\" ); }\n " ;
103103for (auto & sem : fence_semantics)
104- out <<" static inline__device__ void" <<fencename (sem.first , s.first ) <<" () { asm volatile(\" fence" << sem.second << s.second <<" ;\" :::\" memory\" ); }\n " ;
105- out <<" static inline__device__ void __atomic_thread_fence_cuda(int __memorder," <<scopenametag (s.first ) <<" ) {\n " ;
104+ out <<" static inline_LIBCUDACXX_DEVICE void" <<fencename (sem.first , s.first ) <<" () { asm volatile(\" fence" << sem.second << s.second <<" ;\" :::\" memory\" ); }\n " ;
105+ out <<" static inline_LIBCUDACXX_DEVICE void __atomic_thread_fence_cuda(int __memorder," <<scopenametag (s.first ) <<" ) {\n " ;
106106 out <<" NV_DISPATCH_TARGET(\n " ;
107107 out <<" NV_PROVIDES_SM_70, (\n " ;
108108 out <<" switch (__memorder) {\n " ;
@@ -131,7 +131,7 @@ int main() {
131131for (auto & sz : ld_sizes) {
132132for (auto & sem : ld_semantics) {
133133 out <<" template<class _CUDA_A, class _CUDA_B>" ;
134- out <<" static inline__device__ void __cuda_load_" << sem.first <<" _" << sz <<" _" << s.first <<" (_CUDA_A __ptr, _CUDA_B& __dst) {" ;
134+ out <<" static inline_LIBCUDACXX_DEVICE void __cuda_load_" << sem.first <<" _" << sz <<" _" << s.first <<" (_CUDA_A __ptr, _CUDA_B& __dst) {" ;
135135if (ld_as_atom)
136136 out <<" asm volatile(\" atom.add" << (sem.first ==" volatile" ?" " : sem.second .c_str ()) << s.second <<" .u" << sz <<" %0, [%1], 0;\" :" ;
137137else
@@ -141,7 +141,7 @@ int main() {
141141 }
142142for (auto & cv: cv_qualifier) {
143143 out <<" template<class _Type, typename _CUDA_VSTD::enable_if<sizeof(_Type)==" << sz/8 <<" , int>::type = 0>\n " ;
144- out <<" __device__ void __atomic_load_cuda(const" << cv <<" _Type *__ptr, _Type *__ret, int __memorder," <<scopenametag (s.first ) <<" ) {\n " ;
144+ out <<" _LIBCUDACXX_DEVICE void __atomic_load_cuda(const" << cv <<" _Type *__ptr, _Type *__ret, int __memorder," <<scopenametag (s.first ) <<" ) {\n " ;
145145 out <<" uint" << sz <<" _t __tmp = 0;\n " ;
146146 out <<" NV_DISPATCH_TARGET(\n " ;
147147 out <<" NV_PROVIDES_SM_70, (\n " ;
@@ -170,14 +170,14 @@ int main() {
170170for (auto & sz : st_sizes) {
171171for (auto & sem : st_semantics) {
172172 out <<" template<class _CUDA_A, class _CUDA_B>" ;
173- out <<" static inline__device__ void __cuda_store_" << sem.first <<" _" << sz <<" _" << s.first <<" (_CUDA_A __ptr, _CUDA_B __src) {" ;
173+ out <<" static inline_LIBCUDACXX_DEVICE void __cuda_store_" << sem.first <<" _" << sz <<" _" << s.first <<" (_CUDA_A __ptr, _CUDA_B __src) {" ;
174174 out <<" asm volatile(\" st" << sem.second << (sem.first ==" volatile" ?" " : s.second .c_str ()) <<" .b" << sz <<" [%0], %1;\" ::" ;
175175 out <<" \" l\" (__ptr),\" " <<registers (" b" , sz) <<" \" (__src)" ;
176176 out <<" :\" memory\" ); }\n " ;
177177 }
178178for (auto & cv: cv_qualifier) {
179179 out <<" template<class _Type, typename cuda::std::enable_if<sizeof(_Type)==" << sz/8 <<" , int>::type = 0>\n " ;
180- out <<" __device__ void __atomic_store_cuda(" << cv <<" _Type *__ptr, _Type *__val, int __memorder," <<scopenametag (s.first ) <<" ) {\n " ;
180+ out <<" _LIBCUDACXX_DEVICE void __atomic_store_cuda(" << cv <<" _Type *__ptr, _Type *__val, int __memorder," <<scopenametag (s.first ) <<" ) {\n " ;
181181 out <<" uint" << sz <<" _t __tmp = 0;\n " ;
182182 out <<" memcpy(&__tmp, __val," << sz/8 <<" );\n " ;
183183 out <<" NV_DISPATCH_TARGET(\n " ;
@@ -215,7 +215,7 @@ int main() {
215215 out <<" template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D>" ;
216216else
217217 out <<" template<class _CUDA_A, class _CUDA_B, class _CUDA_C>" ;
218- out <<" static inline__device__ void __cuda_" << rmw.first <<" _" << sem.first <<" _" << type.first << sz <<" _" << s.first <<" (" ;
218+ out <<" static inline_LIBCUDACXX_DEVICE void __cuda_" << rmw.first <<" _" << sem.first <<" _" << type.first << sz <<" _" << s.first <<" (" ;
219219if (rmw.first ==" compare_exchange" )
220220 out <<" _CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op" ;
221221else
@@ -238,7 +238,7 @@ int main() {
238238for (auto & cv: cv_qualifier) {
239239if (rmw.first ==" compare_exchange" ) {
240240 out <<" template<class _Type, typename cuda::std::enable_if<sizeof(_Type)==" << sz/8 <<" , int>::type = 0>\n " ;
241- out <<" __device__ bool __atomic_compare_exchange_cuda(" << cv <<" _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder," <<scopenametag (s.first ) <<" ) {\n " ;
241+ out <<" _LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(" << cv <<" _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder," <<scopenametag (s.first ) <<" ) {\n " ;
242242 out <<" uint" << sz <<" _t __tmp = 0, __old = 0, __old_tmp;\n " ;
243243 out <<" memcpy(&__tmp, __desired," << sz/8 <<" );\n " ;
244244 out <<" memcpy(&__old, __expected," << sz/8 <<" );\n " ;
@@ -276,7 +276,7 @@ int main() {
276276 out <<" template<class _Type, typename cuda::std::enable_if<sizeof(_Type)==" << sz/8 ;
277277if (rmw.first ==" exchange" ) {
278278 out <<" , int>::type = 0>\n " ;
279- out <<" __device__ void __atomic_exchange_cuda(" << cv <<" _Type *__ptr, _Type *__val, _Type *__ret, int __memorder," <<scopenametag (s.first ) <<" ) {\n " ;
279+ out <<" _LIBCUDACXX_DEVICE void __atomic_exchange_cuda(" << cv <<" _Type *__ptr, _Type *__val, _Type *__ret, int __memorder," <<scopenametag (s.first ) <<" ) {\n " ;
280280 out <<" uint" << sz <<" _t __tmp = 0;\n " ;
281281 out <<" memcpy(&__tmp, __val," << sz/8 <<" );\n " ;
282282 }
@@ -293,7 +293,7 @@ int main() {
293293 out <<" && cuda::std::is_integral<_Type>::value, int>::type = 0>\n " ;
294294else
295295 out <<" , int>::type = 0>\n " ;
296- out <<" __device__ _Type __atomic_" << rmw.first <<" _cuda(" << cv <<" _Type *__ptr, _Type __val, int __memorder," <<scopenametag (s.first ) <<" ) {\n " ;
296+ out <<" _LIBCUDACXX_DEVICE _Type __atomic_" << rmw.first <<" _cuda(" << cv <<" _Type *__ptr, _Type __val, int __memorder," <<scopenametag (s.first ) <<" ) {\n " ;
297297 out <<" _Type __ret;\n " ;
298298if (type.first ==" f" && sz ==32 )
299299 out <<" float" ;
@@ -345,7 +345,7 @@ int main() {
345345 std::vector<std::string> addsub{" add" ," sub" };
346346for (auto & op : addsub) {
347347 out <<" template<class _Type>\n " ;
348- out <<" __device__ _Type* __atomic_fetch_" << op <<" _cuda(_Type *" << cv <<" *__ptr, ptrdiff_t __val, int __memorder," <<scopenametag (s.first ) <<" ) {\n " ;
348+ out <<" _LIBCUDACXX_DEVICE _Type* __atomic_fetch_" << op <<" _cuda(_Type *" << cv <<" *__ptr, ptrdiff_t __val, int __memorder," <<scopenametag (s.first ) <<" ) {\n " ;
349349 out <<" _Type* __ret;\n " ;
350350 out <<" uint64_t __tmp = 0;\n " ;
351351 out <<" memcpy(&__tmp, &__val, 8);\n " ;