Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 9a5334c

Browse files
committed
aio: Combine io_uring memory mappings, if supported
By default io_uring creates a shared memory mapping for each io_uring instance, leading to a large number of memory mappings. Unfortunately a large number of memory mappings slows things down, backend exit is particularly affected. To address that, newer kernels (6.5) support using user-provided memory for the memory. By putting the relevant memory into shared memory we don't need any additional mappings.

On a system with a new enough kernel and liburing, there is no discernible overhead when doing a pgbench -S -C anymore.

Reported-by: MARK CALLAGHAN <mdcallag@gmail.com>
Reviewed-by: "Burd, Greg" <greg@burd.me>
Reviewed-by: Jim Nasby <jnasby@upgrade.com>
Discussion: https://postgr.es/m/CAFbpF8OA44_UG+RYJcWH9WjF7E3GA6gka3gvH6nsrSnEe9H0NA@mail.gmail.com
Backpatch-through: 18
1 parent 3a797c2 commit 9a5334c

File tree

6 files changed

+238
-6
lines changed

6 files changed

+238
-6
lines changed

‎configure‎

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13309,6 +13309,23 @@ fi
1330913309

1331013310
fi
1331113311

13312+
if test "$with_liburing" = yes; then
13313+
_LIBS="$LIBS"
13314+
LIBS="$LIBURING_LIBS $LIBS"
13315+
for ac_func in io_uring_queue_init_mem
13316+
do :
13317+
ac_fn_c_check_func "$LINENO" "io_uring_queue_init_mem" "ac_cv_func_io_uring_queue_init_mem"
13318+
if test "x$ac_cv_func_io_uring_queue_init_mem" = xyes; then :
13319+
cat >>confdefs.h <<_ACEOF
13320+
#define HAVE_IO_URING_QUEUE_INIT_MEM 1
13321+
_ACEOF
13322+
13323+
fi
13324+
done
13325+
13326+
LIBS="$_LIBS"
13327+
fi
13328+
1331213329
if test "$with_lz4" = yes ; then
1331313330
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for LZ4_compress_default in -llz4" >&5
1331413331
$as_echo_n "checking for LZ4_compress_default in -llz4... " >&6; }

‎configure.ac‎

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1420,6 +1420,13 @@ if test "$with_libxslt" = yes ; then
14201420
AC_CHECK_LIB(xslt,xsltCleanupGlobals,[],[AC_MSG_ERROR([library 'xslt' is required for XSLT support])])
14211421
fi
14221422

1423+
if test "$with_liburing" = yes; then
1424+
_LIBS="$LIBS"
1425+
LIBS="$LIBURING_LIBS $LIBS"
1426+
AC_CHECK_FUNCS([io_uring_queue_init_mem])
1427+
LIBS="$_LIBS"
1428+
fi
1429+
14231430
if test "$with_lz4" = yes ; then
14241431
AC_CHECK_LIB(lz4,LZ4_compress_default,[],[AC_MSG_ERROR([library 'lz4' is required for LZ4 support])])
14251432
fi

‎meson.build‎

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -990,6 +990,12 @@ liburingopt = get_option('liburing')
990990
liburing=dependency('liburing',required: liburingopt)
991991
if liburing.found()
992992
cdata.set('USE_LIBURING',1)
993+
994+
if cc.has_function('io_uring_queue_init_mem',
995+
dependencies: liburing,args: test_c_args)
996+
cdata.set('HAVE_LIBURING_QUEUE_INIT_MEM',1)
997+
endif
998+
993999
endif
9941000

9951001

‎src/backend/storage/aio/method_io_uring.c‎

Lines changed: 204 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@
2929

3030
#ifdefIOMETHOD_IO_URING_ENABLED
3131

32+
#include<sys/mman.h>
33+
#include<unistd.h>
34+
3235
#include<liburing.h>
3336

3437
#include"miscadmin.h"
@@ -94,12 +97,32 @@ PgAioUringContext
9497
structio_uringio_uring_ring;
9598
}PgAioUringContext;
9699

100+
/*
101+
* Information about the capabilities that io_uring has.
102+
*
103+
* Depending on liburing and kernel version different features are
104+
* supported. At least for the kernel a kernel version check does not suffice
105+
* as various vendors do backport features to older kernels :(.
106+
*/
107+
typedefstructPgAioUringCaps
108+
{
109+
boolchecked;
110+
/* -1 if io_uring_queue_init_mem() is unsupported */
111+
intmem_init_size;
112+
}PgAioUringCaps;
113+
114+
97115
/* PgAioUringContexts for all backends */
98116
staticPgAioUringContext*pgaio_uring_contexts;
99117

100118
/* the current backend's context */
101119
staticPgAioUringContext*pgaio_my_uring_context;
102120

121+
staticPgAioUringCapspgaio_uring_caps=
122+
{
123+
.checked= false,
124+
.mem_init_size=-1,
125+
};
103126

104127
staticuint32
105128
pgaio_uring_procs(void)
@@ -111,30 +134,184 @@ pgaio_uring_procs(void)
111134
returnMaxBackends+NUM_AUXILIARY_PROCS-MAX_IO_WORKERS;
112135
}
113136

114-
staticSize
137+
/*
138+
* Initializes pgaio_uring_caps, unless that's already done.
139+
*/
140+
staticvoid
141+
pgaio_uring_check_capabilities(void)
142+
{
143+
if (pgaio_uring_caps.checked)
144+
return;
145+
146+
/*
147+
* By default io_uring creates a shared memory mapping for each io_uring
148+
* instance, leading to a large number of memory mappings. Unfortunately a
149+
* large number of memory mappings slows things down, backend exit is
150+
* particularly affected. To address that, newer kernels (6.5) support
151+
* using user-provided memory for the memory, by putting the relevant
152+
* memory into shared memory we don't need any additional mappings.
153+
*
154+
* To know whether this is supported, we unfortunately need to probe the
155+
* kernel by trying to create a ring with userspace-provided memory. This
156+
* also has a secondary benefit: We can determine precisely how much
157+
* memory we need for each io_uring instance.
158+
*/
159+
#if defined(HAVE_LIBURING_QUEUE_INIT_MEM)&& defined(IORING_SETUP_NO_MMAP)
160+
{
161+
structio_uringtest_ring;
162+
size_tring_size;
163+
void*ring_ptr;
164+
structio_uring_paramsp= {0};
165+
intret;
166+
167+
/*
168+
* Liburing does not yet provide an API to query how much memory a
169+
* ring will need. So we over-estimate it here. As the memory is freed
170+
* just below that's small temporary waste of memory.
171+
*
172+
* 1MB is more than enough for rings within io_max_concurrency's
173+
* range.
174+
*/
175+
ring_size=1024*1024;
176+
177+
/*
178+
* Hard to believe a system exists where 1MB would not be a multiple
179+
* of the page size. But it's cheap to ensure...
180+
*/
181+
ring_size-=ring_size %sysconf(_SC_PAGESIZE);
182+
183+
ring_ptr=mmap(NULL,ring_size,PROT_READ |PROT_WRITE,MAP_SHARED |MAP_ANONYMOUS,-1,0);
184+
if (ring_ptr==MAP_FAILED)
185+
elog(ERROR,
186+
"mmap(%zu) to determine io_uring_queue_init_mem() support failed: %m",
187+
ring_size);
188+
189+
ret=io_uring_queue_init_mem(io_max_concurrency,&test_ring,&p,ring_ptr,ring_size);
190+
if (ret>0)
191+
{
192+
pgaio_uring_caps.mem_init_size=ret;
193+
194+
elog(DEBUG1,
195+
"can use combined memory mapping for io_uring, each ring needs %d bytes",
196+
ret);
197+
198+
/* clean up the created ring, it was just for a test */
199+
io_uring_queue_exit(&test_ring);
200+
}
201+
else
202+
{
203+
/*
204+
* There are different reasons for ring creation to fail, but it's
205+
* ok to treat that just as io_uring_queue_init_mem() not being
206+
* supported. We'll report a more detailed error in
207+
* pgaio_uring_shmem_init().
208+
*/
209+
errno=-ret;
210+
elog(DEBUG1,
211+
"cannot use combined memory mapping for io_uring, ring creation failed: %m");
212+
213+
}
214+
215+
if (munmap(ring_ptr,ring_size)!=0)
216+
elog(ERROR,"munmap() failed: %m");
217+
}
218+
#else
219+
{
220+
elog(DEBUG1,
221+
"can't use combined memory mapping for io_uring, kernel or liburing too old");
222+
}
223+
#endif
224+
225+
pgaio_uring_caps.checked= true;
226+
}
227+
228+
/*
229+
* Memory for all PgAioUringContext instances
230+
*/
231+
staticsize_t
115232
pgaio_uring_context_shmem_size(void)
116233
{
117234
returnmul_size(pgaio_uring_procs(),sizeof(PgAioUringContext));
118235
}
119236

237+
/*
238+
* Memory for the combined memory used by io_uring instances. Returns 0 if
239+
* that is not supported by kernel/liburing.
240+
*/
241+
staticsize_t
242+
pgaio_uring_ring_shmem_size(void)
243+
{
244+
size_tsz=0;
245+
246+
if (pgaio_uring_caps.mem_init_size>0)
247+
{
248+
/*
249+
* Memory for rings needs to be allocated to the page boundary,
250+
* reserve space. Luckily it does not need to be aligned to hugepage
251+
* boundaries, even if huge pages are used.
252+
*/
253+
sz=add_size(sz,sysconf(_SC_PAGESIZE));
254+
sz=add_size(sz,mul_size(pgaio_uring_procs(),
255+
pgaio_uring_caps.mem_init_size));
256+
}
257+
258+
returnsz;
259+
}
260+
120261
staticsize_t
121262
pgaio_uring_shmem_size(void)
122263
{
123-
returnpgaio_uring_context_shmem_size();
264+
size_tsz;
265+
266+
/*
267+
* Kernel and liburing support for various features influences how much
268+
* shmem we need, perform the necessary checks.
269+
*/
270+
pgaio_uring_check_capabilities();
271+
272+
sz=pgaio_uring_context_shmem_size();
273+
sz=add_size(sz,pgaio_uring_ring_shmem_size());
274+
275+
returnsz;
124276
}
125277

126278
staticvoid
127279
pgaio_uring_shmem_init(boolfirst_time)
128280
{
129281
intTotalProcs=pgaio_uring_procs();
130282
boolfound;
283+
char*shmem;
284+
size_tring_mem_remain=0;
285+
char*ring_mem_next=0;
131286

132-
pgaio_uring_contexts= (PgAioUringContext*)
133-
ShmemInitStruct("AioUring",pgaio_uring_shmem_size(),&found);
134-
287+
/*
288+
* We allocate memory for all PgAioUringContext instances and, if
289+
* supported, the memory required for each of the io_uring instances, in
290+
* one ShmemInitStruct().
291+
*/
292+
shmem=ShmemInitStruct("AioUringContext",pgaio_uring_shmem_size(),&found);
135293
if (found)
136294
return;
137295

296+
pgaio_uring_contexts= (PgAioUringContext*)shmem;
297+
shmem+=pgaio_uring_context_shmem_size();
298+
299+
/* if supported, handle memory alignment / sizing for io_uring memory */
300+
if (pgaio_uring_caps.mem_init_size>0)
301+
{
302+
ring_mem_remain=pgaio_uring_ring_shmem_size();
303+
ring_mem_next= (char*)shmem;
304+
305+
/* align to page boundary, see also pgaio_uring_ring_shmem_size() */
306+
ring_mem_next= (char*)TYPEALIGN(sysconf(_SC_PAGESIZE),ring_mem_next);
307+
308+
/* account for alignment */
309+
ring_mem_remain-=ring_mem_next-shmem;
310+
shmem+=ring_mem_next-shmem;
311+
312+
shmem+=ring_mem_remain;
313+
}
314+
138315
for (intcontextno=0;contextno<TotalProcs;contextno++)
139316
{
140317
PgAioUringContext*context=&pgaio_uring_contexts[contextno];
@@ -158,7 +335,28 @@ pgaio_uring_shmem_init(bool first_time)
158335
* be worth using that - also need to evaluate if that causes
159336
* noticeable additional contention?
160337
*/
161-
ret=io_uring_queue_init(io_max_concurrency,&context->io_uring_ring,0);
338+
339+
/*
340+
* If supported (c.f. pgaio_uring_check_capabilities()), create ring
341+
* with its data in shared memory. Otherwise fall back io_uring
342+
* creating a memory mapping for each ring.
343+
*/
344+
#if defined(HAVE_LIBURING_QUEUE_INIT_MEM)&& defined(IORING_SETUP_NO_MMAP)
345+
if (pgaio_uring_caps.mem_init_size>0)
346+
{
347+
structio_uring_paramsp= {0};
348+
349+
ret=io_uring_queue_init_mem(io_max_concurrency,&context->io_uring_ring,&p,ring_mem_next,ring_mem_remain);
350+
351+
ring_mem_remain-=ret;
352+
ring_mem_next+=ret;
353+
}
354+
else
355+
#endif
356+
{
357+
ret=io_uring_queue_init(io_max_concurrency,&context->io_uring_ring,0);
358+
}
359+
162360
if (ret<0)
163361
{
164362
char*hint=NULL;

‎src/include/pg_config.h.in‎

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,9 @@
229229
/* Define to 1 if you have the global variable 'int timezone'. */
230230
#undef HAVE_INT_TIMEZONE
231231

232+
/* Define to 1 if you have the `io_uring_queue_init_mem' function. */
233+
#undef HAVE_IO_URING_QUEUE_INIT_MEM
234+
232235
/* Define to 1 if __builtin_constant_p(x) implies "i"(x) acceptance. */
233236
#undef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P
234237

‎src/tools/pgindent/typedefs.list‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2176,6 +2176,7 @@ PgAioReturn
21762176
PgAioTargetData
21772177
PgAioTargetID
21782178
PgAioTargetInfo
2179+
PgAioUringCaps
21792180
PgAioUringContext
21802181
PgAioWaitRef
21812182
PgArchData

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp