2929
3030#ifdef IOMETHOD_IO_URING_ENABLED
3131
32+ #include <sys/mman.h>
33+ #include <unistd.h>
34+
3235#include <liburing.h>
3336
3437#include "miscadmin.h"
@@ -94,12 +97,32 @@ PgAioUringContext
9497struct io_uring io_uring_ring ;
9598}PgAioUringContext ;
9699
/*
 * Capabilities of the io_uring implementation we are running against.
 *
 * Which features are available depends on both the liburing and the kernel
 * version. For the kernel, a plain version check does not suffice, because
 * various vendors backport features to older kernels.
 */
typedef struct PgAioUringCaps
{
	/* true once the capabilities below have been determined */
	bool		checked;

	/*
	 * Number of bytes each ring needs when created with
	 * io_uring_queue_init_mem(); -1 if io_uring_queue_init_mem() is
	 * unsupported.
	 */
	int			mem_init_size;
} PgAioUringCaps;
113+
114+
97115/* PgAioUringContexts for all backends */
98116static PgAioUringContext * pgaio_uring_contexts ;
99117
100118/* the current backend's context */
101119static PgAioUringContext * pgaio_my_uring_context ;
102120
121+ static PgAioUringCaps pgaio_uring_caps =
122+ {
123+ .checked = false,
124+ .mem_init_size = -1 ,
125+ };
103126
104127static uint32
105128pgaio_uring_procs (void )
@@ -111,30 +134,184 @@ pgaio_uring_procs(void)
111134return MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS ;
112135}
113136
114- static Size
137+ /*
138+ * Initializes pgaio_uring_caps, unless that's already done.
139+ */
140+ static void
141+ pgaio_uring_check_capabilities (void )
142+ {
143+ if (pgaio_uring_caps .checked )
144+ return ;
145+
146+ /*
147+ * By default io_uring creates a shared memory mapping for each io_uring
148+ * instance, leading to a large number of memory mappings. Unfortunately a
149+ * large number of memory mappings slows things down, backend exit is
150+ * particularly affected. To address that, newer kernels (6.5) support
151+ * using user-provided memory for the memory, by putting the relevant
152+ * memory into shared memory we don't need any additional mappings.
153+ *
154+ * To know whether this is supported, we unfortunately need to probe the
155+ * kernel by trying to create a ring with userspace-provided memory. This
156+ * also has a secondary benefit: We can determine precisely how much
157+ * memory we need for each io_uring instance.
158+ */
159+ #if defined(HAVE_LIBURING_QUEUE_INIT_MEM )&& defined(IORING_SETUP_NO_MMAP )
160+ {
161+ struct io_uring test_ring ;
162+ size_t ring_size ;
163+ void * ring_ptr ;
164+ struct io_uring_params p = {0 };
165+ int ret ;
166+
167+ /*
168+ * Liburing does not yet provide an API to query how much memory a
169+ * ring will need. So we over-estimate it here. As the memory is freed
170+ * just below that's small temporary waste of memory.
171+ *
172+ * 1MB is more than enough for rings within io_max_concurrency's
173+ * range.
174+ */
175+ ring_size = 1024 * 1024 ;
176+
177+ /*
178+ * Hard to believe a system exists where 1MB would not be a multiple
179+ * of the page size. But it's cheap to ensure...
180+ */
181+ ring_size -= ring_size %sysconf (_SC_PAGESIZE );
182+
183+ ring_ptr = mmap (NULL ,ring_size ,PROT_READ |PROT_WRITE ,MAP_SHARED |MAP_ANONYMOUS ,-1 ,0 );
184+ if (ring_ptr == MAP_FAILED )
185+ elog (ERROR ,
186+ "mmap(%zu) to determine io_uring_queue_init_mem() support failed: %m" ,
187+ ring_size );
188+
189+ ret = io_uring_queue_init_mem (io_max_concurrency ,& test_ring ,& p ,ring_ptr ,ring_size );
190+ if (ret > 0 )
191+ {
192+ pgaio_uring_caps .mem_init_size = ret ;
193+
194+ elog (DEBUG1 ,
195+ "can use combined memory mapping for io_uring, each ring needs %d bytes" ,
196+ ret );
197+
198+ /* clean up the created ring, it was just for a test */
199+ io_uring_queue_exit (& test_ring );
200+ }
201+ else
202+ {
203+ /*
204+ * There are different reasons for ring creation to fail, but it's
205+ * ok to treat that just as io_uring_queue_init_mem() not being
206+ * supported. We'll report a more detailed error in
207+ * pgaio_uring_shmem_init().
208+ */
209+ errno = - ret ;
210+ elog (DEBUG1 ,
211+ "cannot use combined memory mapping for io_uring, ring creation failed: %m" );
212+
213+ }
214+
215+ if (munmap (ring_ptr ,ring_size )!= 0 )
216+ elog (ERROR ,"munmap() failed: %m" );
217+ }
218+ #else
219+ {
220+ elog (DEBUG1 ,
221+ "can't use combined memory mapping for io_uring, kernel or liburing too old" );
222+ }
223+ #endif
224+
225+ pgaio_uring_caps .checked = true;
226+ }
227+
228+ /*
229+ * Memory for all PgAioUringContext instances
230+ */
231+ static size_t
115232pgaio_uring_context_shmem_size (void )
116233{
117234return mul_size (pgaio_uring_procs (),sizeof (PgAioUringContext ));
118235}
119236
237+ /*
238+ * Memory for the combined memory used by io_uring instances. Returns 0 if
239+ * that is not supported by kernel/liburing.
240+ */
241+ static size_t
242+ pgaio_uring_ring_shmem_size (void )
243+ {
244+ size_t sz = 0 ;
245+
246+ if (pgaio_uring_caps .mem_init_size > 0 )
247+ {
248+ /*
249+ * Memory for rings needs to be allocated to the page boundary,
250+ * reserve space. Luckily it does not need to be aligned to hugepage
251+ * boundaries, even if huge pages are used.
252+ */
253+ sz = add_size (sz ,sysconf (_SC_PAGESIZE ));
254+ sz = add_size (sz ,mul_size (pgaio_uring_procs (),
255+ pgaio_uring_caps .mem_init_size ));
256+ }
257+
258+ return sz ;
259+ }
260+
/*
 * Total shared memory needed by the io_uring method: the PgAioUringContext
 * array plus, if supported, the memory backing the rings themselves.
 */
static size_t
pgaio_uring_shmem_size(void)
{
	size_t		sz;

	/*
	 * Kernel and liburing support for various features influences how much
	 * shmem we need, perform the necessary checks.
	 */
	pgaio_uring_check_capabilities();

	sz = pgaio_uring_context_shmem_size();
	sz = add_size(sz, pgaio_uring_ring_shmem_size());

	return sz;
}
125277
126278static void
127279pgaio_uring_shmem_init (bool first_time )
128280{
129281int TotalProcs = pgaio_uring_procs ();
130282bool found ;
283+ char * shmem ;
284+ size_t ring_mem_remain = 0 ;
285+ char * ring_mem_next = 0 ;
131286
132- pgaio_uring_contexts = (PgAioUringContext * )
133- ShmemInitStruct ("AioUring" ,pgaio_uring_shmem_size (),& found );
134-
287+ /*
288+ * We allocate memory for all PgAioUringContext instances and, if
289+ * supported, the memory required for each of the io_uring instances, in
290+ * one ShmemInitStruct().
291+ */
292+ shmem = ShmemInitStruct ("AioUringContext" ,pgaio_uring_shmem_size (),& found );
135293if (found )
136294return ;
137295
296+ pgaio_uring_contexts = (PgAioUringContext * )shmem ;
297+ shmem += pgaio_uring_context_shmem_size ();
298+
299+ /* if supported, handle memory alignment / sizing for io_uring memory */
300+ if (pgaio_uring_caps .mem_init_size > 0 )
301+ {
302+ ring_mem_remain = pgaio_uring_ring_shmem_size ();
303+ ring_mem_next = (char * )shmem ;
304+
305+ /* align to page boundary, see also pgaio_uring_ring_shmem_size() */
306+ ring_mem_next = (char * )TYPEALIGN (sysconf (_SC_PAGESIZE ),ring_mem_next );
307+
308+ /* account for alignment */
309+ ring_mem_remain -= ring_mem_next - shmem ;
310+ shmem += ring_mem_next - shmem ;
311+
312+ shmem += ring_mem_remain ;
313+ }
314+
138315for (int contextno = 0 ;contextno < TotalProcs ;contextno ++ )
139316{
140317PgAioUringContext * context = & pgaio_uring_contexts [contextno ];
@@ -158,7 +335,28 @@ pgaio_uring_shmem_init(bool first_time)
158335 * be worth using that - also need to evaluate if that causes
159336 * noticeable additional contention?
160337 */
161- ret = io_uring_queue_init (io_max_concurrency ,& context -> io_uring_ring ,0 );
338+
339+ /*
340+ * If supported (c.f. pgaio_uring_check_capabilities()), create ring
341+ * with its data in shared memory. Otherwise fall back io_uring
342+ * creating a memory mapping for each ring.
343+ */
344+ #if defined(HAVE_LIBURING_QUEUE_INIT_MEM )&& defined(IORING_SETUP_NO_MMAP )
345+ if (pgaio_uring_caps .mem_init_size > 0 )
346+ {
347+ struct io_uring_params p = {0 };
348+
349+ ret = io_uring_queue_init_mem (io_max_concurrency ,& context -> io_uring_ring ,& p ,ring_mem_next ,ring_mem_remain );
350+
351+ ring_mem_remain -= ret ;
352+ ring_mem_next += ret ;
353+ }
354+ else
355+ #endif
356+ {
357+ ret = io_uring_queue_init (io_max_concurrency ,& context -> io_uring_ring ,0 );
358+ }
359+
162360if (ret < 0 )
163361{
164362char * hint = NULL ;