Expand Up @@ -194,101 +194,143 @@ test_lock_counter_slow(PyObject *self, PyObject *obj) Py_RETURN_NONE; } struct bench_data_locks { int stop; int use_pymutex; int critical_section_length; struct bench_lock { char padding[200]; PyThread_type_lock lock; PyMutex m; double value; Py_ssize_t total_iters; }; struct bench_config { int stop; int work_inside; int work_outside; int num_acquisitions; int random_locks; Py_ssize_t target_iters; Py_ssize_t num_locks; struct bench_lock *locks; }; struct bench_thread_data { struct bench_data_locks *bench_data; struct bench_config *config; struct bench_lock *lock; uint64_t rng_state; Py_ssize_t iters; PyEvent done; }; static uint64_t splitmix64(uint64_t *state) { uint64_t z = (*state += 0x9e3779b97f4a7c15); z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9; z = (z ^ (z >> 27)) * 0x94d049bb133111eb; return z ^ (z >> 31); } static void thread_benchmark_locks(void *arg) { struct bench_thread_data *thread_data = arg; struct bench_data_locks *bench_data = thread_data->bench_data; int use_pymutex = bench_data->use_pymutex; int critical_section_length = bench_data->critical_section_length; struct bench_thread_data *td = arg; struct bench_config *config = td->config; int work_inside = config->work_inside; int work_outside = config->work_outside; int num_acquisitions = config->num_acquisitions; Py_ssize_t target_iters = config->target_iters; uint64_t rng_state = td->rng_state; double local_value = 0.0; double my_value = 1.0; Py_ssize_t iters = 0; while (!_Py_atomic_load_int_relaxed(&bench_data->stop)) { if (use_pymutex) { PyMutex_Lock(&bench_data->m); for (int i = 0; i < critical_section_length; i++) { bench_data->value += my_value; my_value = bench_data->value; for (;;) { if (target_iters > 0) { if (iters >= target_iters) { break; } PyMutex_Unlock(&bench_data->m); } else { PyThread_acquire_lock(bench_data->lock, 1); for (int i = 0; i < critical_section_length; i++) { bench_data->value += my_value; my_value = bench_data->value; else if (_Py_atomic_load_int_relaxed(&config->stop)) { break; } struct bench_lock *lock = td->lock; if (config->random_locks) { uint32_t r = (uint32_t)splitmix64(&rng_state); // Fast modulo reduction to pick a random lock, adapted from: // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ Py_ssize_t idx = ((uint64_t)r * (uint32_t)config->num_locks) >> 32; lock = &config->locks[idx]; } for (int acq = 0; acq < num_acquisitions; acq++) { PyMutex_Lock(&lock->m); for (int i = 0; i < work_inside; i++) { lock->value += my_value; my_value = lock->value; } PyThread_release_lock(bench_data->lock );PyMutex_Unlock(&lock->m ); } iters++; for (int i = 0; i < work_outside; i++) { local_value += my_value; my_value = local_value; } iters += num_acquisitions; } thread_data->iters = iters; _Py_atomic_add_ssize(&bench_data->total_iters, iters); _PyEvent_Notify(&thread_data->done); td->iters = iters; _PyEvent_Notify(&td->done); } /*[clinic input] _testinternalcapi.benchmark_locks num_threads: Py_ssize_t use_pymutex: bool =True critical_section_length : int =1 work_inside: int =1 work_outside : int =0 time_ms: int = 1000 num_acquisitions: int = 1 total_iters: Py_ssize_t = 0 num_locks: Py_ssize_t = 1 random_locks: bool = False / [clinic start generated code]*/ static PyObject * _testinternalcapi_benchmark_locks_impl(PyObject *module, Py_ssize_t num_threads, int use_pymutex, int critical_section_length, int time_ms) /*[clinic end generated code: output=381df8d7e9a74f18 input=f3aeaf688738c121]*/ int work_inside, int work_outside, int time_ms, int num_acquisitions, Py_ssize_t total_iters, Py_ssize_t num_locks, int random_locks) /*[clinic end generated code: output=6258dc9de8cb9af1 input=d622cf4e1c4d008b]*/ { // Run from Tools/lockbench/lockbench.py // Based on the WebKit lock benchmarks: // https://github.com/WebKit/WebKit/blob/main/Source/WTF/benchmarks/LockSpeedTest.cpp // See also https://webkit.org/blog/6161/locking-in-webkit/ PyObject *thread_iters = NULL; PyObject *res = NULL; struct bench_thread_data *thread_data = NULL; struct bench_data_locks bench_data; memset(&bench_data, 0, sizeof(bench_data)); bench_data.use_pymutex = use_pymutex; bench_data.critical_section_length = critical_section_length; bench_data.lock = PyThread_allocate_lock(); if (bench_data.lock == NULL) { return PyErr_NoMemory(); struct bench_config config = { .work_inside = work_inside, .work_outside = work_outside, .num_acquisitions = num_acquisitions, .target_iters = total_iters, .num_locks = num_locks, .random_locks = random_locks, }; config.locks = PyMem_Calloc(num_locks, sizeof(*config.locks)); if (config.locks == NULL) { PyErr_NoMemory(); goto exit; } struct bench_thread_data *thread_data = NULL; thread_data = PyMem_Calloc(num_threads, sizeof(*thread_data)); if (thread_data == NULL) { PyErr_NoMemory(); goto exit; } thread_iters = PyList_New(num_threads); if (thread_iters == NULL) { goto exit; Expand All @@ -300,40 +342,43 @@ _testinternalcapi_benchmark_locks_impl(PyObject *module, } for (Py_ssize_t i = 0; i < num_threads; i++) { thread_data[i].bench_data = &bench_data; thread_data[i].config = &config; thread_data[i].lock = &config.locks[i % num_locks]; thread_data[i].rng_state = (uint64_t)i + 1; PyThread_start_new_thread(thread_benchmark_locks, &thread_data[i]); } // Let the threads run for `time_ms` milliseconds pysleep(time_ms); _Py_atomic_store_int(&bench_data.stop, 1); if (total_iters == 0) { pysleep(time_ms); _Py_atomic_store_int(&config.stop, 1); } // Wait for the threads to finish for (Py_ssize_t i = 0; i < num_threads; i++) { PyEvent_Wait(&thread_data[i].done); } Py_ssize_t total_iters = bench_data.total_iters; if (PyTime_PerfCounter(&end) < 0) { goto exit; } // Return the total number of acquisitions and the number of acquisitions // for each thread. Py_ssize_t sum_iters = 0; for (Py_ssize_t i = 0; i < num_threads; i++) { PyObject *iter = PyLong_FromSsize_t(thread_data[i].iters); if (iter == NULL) { goto exit; } PyList_SET_ITEM(thread_iters, i, iter); sum_iters += thread_data[i].iters; } assert(end != start); double rate = total_iters * 1e9 / (end - start); res = Py_BuildValue("(dO)", rate, thread_iters); PyTime_t elapsed_ns = end - start; double rate = sum_iters * 1e9 / elapsed_ns; res = Py_BuildValue("(dOL)", rate, thread_iters, (long long)elapsed_ns); exit: PyThread_free_lock(bench_data.lock );PyMem_Free(config.locks ); PyMem_Free(thread_data); Py_XDECREF(thread_iters); return res; Expand All @@ -344,7 +389,7 @@ test_lock_benchmark(PyObject *module, PyObject *obj) { // Just make sure the benchmark runs without crashing PyObject *res = _testinternalcapi_benchmark_locks_impl( module, 1, 1,1 , 100); module, 1, 1,0 , 100, 1, 0, 1, 0 ); if (res == NULL) { return NULL; } Expand Down