Feb 23, 2026 · Feb 25, 2026 · Feb 25, 2026
diff --git a/Modules/_testinternalcapi/clinic/test_lock.c.h b/Modules/_testinternalcapi/clinic/test_lock.c.h
diff --git a/Modules/_testinternalcapi/test_lock.c b/Modules/_testinternalcapi/test_lock.c
    Py_RETURN_NONE;
 }

 // NOTE(review): two struct definitions appear interleaved in this span:
 // `bench_data_locks` is opened but never closed before `bench_lock` opens.
 // This looks like removed and added diff lines merged together — confirm
 // against the real file which fields belong to which struct.
 struct bench_data_locks {
    int stop;                     // polled by the worker with a relaxed atomic load; the driver stores 1 to end the run
    int use_pymutex;              // worker branches on this to choose PyMutex_Lock vs PyThread_acquire_lock
    int critical_section_length;  // loop count of dependent adds performed while the lock is held
 struct bench_lock {
    char padding[200];            // presumably spacing to keep adjacent locks apart in memory — TODO confirm intent
    PyThread_type_lock lock;      // allocated with PyThread_allocate_lock() and released with PyThread_free_lock() by the driver
    PyMutex m;                    // taken with PyMutex_Lock()/PyMutex_Unlock() in the worker
    double value;                 // accumulator updated only while the lock is held
    Py_ssize_t total_iters;       // workers add their iteration counts here with an atomic add
 };

 // Settings shared by every benchmark worker thread. The driver fills one
 // instance and each worker reaches it through `bench_thread_data.config`.
 struct bench_config {
    int stop;                  // driver stores 1 here; workers poll it with a relaxed atomic load
    int work_inside;           // loop count of dependent adds done while a lock is held
    int work_outside;          // loop count of dependent adds done after the lock is released
    int num_acquisitions;      // lock/unlock cycles performed per pass of the worker's outer loop
    int random_locks;          // nonzero: worker picks an entry of `locks` at random on each pass
    Py_ssize_t target_iters;   // > 0: worker exits once its own count reaches this; otherwise it exits when `stop` is set
    Py_ssize_t num_locks;      // element count of `locks`
    struct bench_lock *locks;  // array obtained from PyMem_Calloc() and released with PyMem_Free() by the driver
 };

 // Per-thread state; a pointer to one element is passed to
 // thread_benchmark_locks() as its `arg`.
 // NOTE(review): both `bench_data` and `config` back-pointers are present;
 // this may be old and new diff lines merged — confirm which one the
 // current file keeps.
 struct bench_thread_data {
    struct bench_data_locks *bench_data;  // shared state read through the `bench_data_locks` type
    struct bench_config *config;          // shared benchmark settings
    struct bench_lock *lock;              // lock used when random selection is off; driver assigns &locks[i % num_locks]
    uint64_t rng_state;                   // initial splitmix64 state; driver sets it to thread index + 1
    Py_ssize_t iters;                     // written by the worker just before it signals `done`
    PyEvent done;                         // notified by the worker on exit; the driver waits on it
 };

 /* Advance the SplitMix64 generator held in `*state` by one step and return
  * the next 64-bit output. `*state` is updated in place, so successive calls
  * on the same state yield the successive values of the sequence. */
 static uint64_t
 splitmix64(uint64_t *state)
 {
    /* Step the counter first; the output is a mix of the *new* state. */
    *state += 0x9e3779b97f4a7c15;
    uint64_t mixed = *state;

    /* Three xor-shift rounds, the first two followed by a multiply. */
    mixed ^= mixed >> 30;
    mixed *= 0xbf58476d1ce4e5b9;
    mixed ^= mixed >> 27;
    mixed *= 0x94d049bb133111eb;
    mixed ^= mixed >> 31;
    return mixed;
 }

 // Worker entry point for the lock benchmark. `arg` points at this thread's
 // `struct bench_thread_data`. The thread repeatedly takes a lock, performs a
 // chain of dependent floating-point adds while holding it, releases it, and
 // counts iterations; on exit it stores the count in its thread data and
 // notifies the `done` event.
 //
 // NOTE(review): this span contains two loop shapes at once (a
 // `while (!...stop)` loop over `bench_data` and a `for (;;)` loop over
 // `config`) with unbalanced braces. It looks like removed and added diff
 // lines merged together — confirm against the real file before relying on
 // the statement order shown here.
 static void
 thread_benchmark_locks(void *arg)
 {
    struct bench_thread_data *thread_data = arg;
    struct bench_data_locks *bench_data = thread_data->bench_data;
    int use_pymutex = bench_data->use_pymutex;
    int critical_section_length = bench_data->critical_section_length;

    // Copy the shared settings into locals before entering the loop.
    struct bench_thread_data *td = arg;
    struct bench_config *config = td->config;
    int work_inside = config->work_inside;
    int work_outside = config->work_outside;
    int num_acquisitions = config->num_acquisitions;
    Py_ssize_t target_iters = config->target_iters;
    // Local copy of the RNG state; it is advanced here and not written back.
    uint64_t rng_state = td->rng_state;

    // `my_value` is fed back from every add, so each iteration depends on the
    // previous one (presumably so the work cannot be optimized away — TODO
    // confirm intent).
    double local_value = 0.0;
    double my_value = 1.0;
    Py_ssize_t iters = 0;
    while (!_Py_atomic_load_int_relaxed(&bench_data->stop)) {
        if (use_pymutex) {
            PyMutex_Lock(&bench_data->m);
            for (int i = 0; i < critical_section_length; i++) {
                bench_data->value += my_value;
                my_value = bench_data->value;
    for (;;) {
        // With a positive iteration target the stop flag is not consulted;
        // the thread runs until its own count reaches the target.
        if (target_iters > 0) {
            if (iters >= target_iters) {
                break;
            }
            PyMutex_Unlock(&bench_data->m);
        }
        else {
            PyThread_acquire_lock(bench_data->lock, 1);
            for (int i = 0; i < critical_section_length; i++) {
                bench_data->value += my_value;
                my_value = bench_data->value;
        else if (_Py_atomic_load_int_relaxed(&config->stop)) {
            break;
        }
        // Default to the lock assigned to this thread by the driver.
        struct bench_lock *lock = td->lock;
        if (config->random_locks) {
            uint32_t r = (uint32_t)splitmix64(&rng_state);
            // Fast modulo reduction to pick a random lock, adapted from:
            // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
            // (r * n) >> 32 with a 32-bit r lands in [0, n).
            // NOTE(review): num_locks is a Py_ssize_t but is cast to
            // uint32_t here, so a count that does not fit in 32 bits would
            // be truncated — confirm callers keep num_locks small.
            Py_ssize_t idx = ((uint64_t)r * (uint32_t)config->num_locks) >> 32;
            lock = &config->locks[idx];
        }
        for (int acq = 0; acq < num_acquisitions; acq++) {
            PyMutex_Lock(&lock->m);
            // Work performed while holding the lock.
            for (int i = 0; i < work_inside; i++) {
                lock->value += my_value;
                my_value = lock->value;
            }
 PyThread_release_lock(bench_data->lock);
 PyMutex_Unlock(&lock->m);
        }
        // NOTE(review): both `iters++` here and `iters += num_acquisitions`
        // below are present; likely one is the removed line and the other its
        // replacement — confirm only one survives in the real file.
        iters++;
        // Work performed after the lock has been released; touches only locals.
        for (int i = 0; i < work_outside; i++) {
            local_value += my_value;
            my_value = local_value;
        }
        iters += num_acquisitions;
    }

    // Publish this thread's count, then signal the waiting driver.
    thread_data->iters = iters;
    _Py_atomic_add_ssize(&bench_data->total_iters, iters);
    _PyEvent_Notify(&thread_data->done);
    td->iters = iters;
    _PyEvent_Notify(&td->done);
 }

 /*[clinic input]
 _testinternalcapi.benchmark_locks

    num_threads: Py_ssize_t
 use_pymutex: bool =True
 critical_section_length: int =1
 work_inside: int =1
 work_outside: int =0
    time_ms: int = 1000
    num_acquisitions: int = 1
    total_iters: Py_ssize_t = 0
    num_locks: Py_ssize_t = 1
    random_locks: bool = False
    /

 [clinic start generated code]*/

 static PyObject *
 _testinternalcapi_benchmark_locks_impl(PyObject *module,
                                        Py_ssize_t num_threads,
                                        int use_pymutex,
                                        int critical_section_length,
                                        int time_ms)
 /*[clinic end generated code: output=381df8d7e9a74f18 input=f3aeaf688738c121]*/
                                        int work_inside, int work_outside,
                                        int time_ms, int num_acquisitions,
                                        Py_ssize_t total_iters,
                                        Py_ssize_t num_locks,
                                        int random_locks)
 /*[clinic end generated code: output=6258dc9de8cb9af1 input=d622cf4e1c4d008b]*/
 {
    // Driver for the lock benchmark: allocates the shared lock array and the
    // per-thread records, starts `num_threads` workers running
    // thread_benchmark_locks(), lets them run, waits on each worker's `done`
    // event, and builds the Python result from the per-thread counts.
    //
    // NOTE(review): two parameter lists, two clinic checksums and two result
    // formats ("(dO)" and "(dOL)") are visible in this span, and `start`/`end`
    // are used without a visible declaration. This looks like removed and
    // added diff lines merged, with some context elided — confirm against the
    // real file.
    //
    // Run from Tools/lockbench/lockbench.py
    // Based on the WebKit lock benchmarks:
    // https://github.com/WebKit/WebKit/blob/main/Source/WTF/benchmarks/LockSpeedTest.cpp
    // See also https://webkit.org/blog/6161/locking-in-webkit/
    PyObject *thread_iters = NULL;
    PyObject *res = NULL;
    struct bench_thread_data *thread_data = NULL;

    struct bench_data_locks bench_data;
    memset(&bench_data, 0, sizeof(bench_data));
    bench_data.use_pymutex = use_pymutex;
    bench_data.critical_section_length = critical_section_length;

    bench_data.lock = PyThread_allocate_lock();
    if (bench_data.lock == NULL) {
        return PyErr_NoMemory();
    // Fields not named here (`stop`, `locks`) start out zero/NULL.
    struct bench_config config = {
        .work_inside = work_inside,
        .work_outside = work_outside,
        .num_acquisitions = num_acquisitions,
        .target_iters = total_iters,
        .num_locks = num_locks,
        .random_locks = random_locks,
    };

    // NOTE(review): no range check on `num_locks` or `num_threads` is visible
    // here; `i % num_locks` below would divide by zero for num_locks == 0 —
    // confirm whether validation happens in elided lines or in the caller.
    config.locks = PyMem_Calloc(num_locks, sizeof(*config.locks));
    if (config.locks == NULL) {
        PyErr_NoMemory();
        goto exit;
    }

    struct bench_thread_data *thread_data = NULL;
    thread_data = PyMem_Calloc(num_threads, sizeof(*thread_data));
    if (thread_data == NULL) {
        PyErr_NoMemory();
        goto exit;
    }

    thread_iters = PyList_New(num_threads);
    if (thread_iters == NULL) {
        goto exit;
    }

    for (Py_ssize_t i = 0; i < num_threads; i++) {
        thread_data[i].bench_data = &bench_data;
        thread_data[i].config = &config;
        // Threads are spread over the locks round-robin.
        thread_data[i].lock = &config.locks[i % num_locks];
        // Seeds are thread index + 1, so every thread starts from a distinct state.
        thread_data[i].rng_state = (uint64_t)i + 1;
        // NOTE(review): the return value is ignored; if a thread failed to
        // start, nothing would notify its `done` event and the wait loop
        // below would presumably block — confirm.
        PyThread_start_new_thread(thread_benchmark_locks, &thread_data[i]);
    }

    // Let the threads run for `time_ms` milliseconds
    pysleep(time_ms);
    _Py_atomic_store_int(&bench_data.stop, 1);
    // Timed mode only: with a nonzero `total_iters` the workers stop on their
    // own once they reach the target, so no sleep or stop flag is needed.
    if (total_iters == 0) {
        pysleep(time_ms);
        _Py_atomic_store_int(&config.stop, 1);
    }

    // Wait for the threads to finish
    for (Py_ssize_t i = 0; i < num_threads; i++) {
        PyEvent_Wait(&thread_data[i].done);
    }

    Py_ssize_t total_iters = bench_data.total_iters;
    if (PyTime_PerfCounter(&end) < 0) {
        goto exit;
    }

    // Return the total number of acquisitions and the number of acquisitions
    // for each thread.
    Py_ssize_t sum_iters = 0;
    for (Py_ssize_t i = 0; i < num_threads; i++) {
        PyObject *iter = PyLong_FromSsize_t(thread_data[i].iters);
        if (iter == NULL) {
            goto exit;
        }
        // The list takes over the new reference, so `iter` is not
        // DECREF'ed here.
        PyList_SET_ITEM(thread_iters, i, iter);
        sum_iters += thread_data[i].iters;
    }

    assert(end != start);
    double rate = total_iters * 1e9 / (end - start);
    res = Py_BuildValue("(dO)", rate, thread_iters);
    // Iterations per second, assuming the PyTime_t difference is in
    // nanoseconds as the variable name suggests.
    PyTime_t elapsed_ns = end - start;
    double rate = sum_iters * 1e9 / elapsed_ns;
    res = Py_BuildValue("(dOL)", rate, thread_iters,
                        (long long)elapsed_ns);

    // Shared cleanup for the success path and the `goto exit` error paths;
    // `res` is still NULL when an error path is taken.
 exit:
 PyThread_free_lock(bench_data.lock);
 PyMem_Free(config.locks);
    PyMem_Free(thread_data);
    Py_XDECREF(thread_iters);
    return res;
 {
    // Just make sure the benchmark runs without crashing
    PyObject *res = _testinternalcapi_benchmark_locks_impl(
        module, 1, 1,1, 100);
        module, 1, 1,0, 100, 1, 0, 1, 0);
    if (res == NULL) {
        return NULL;
    }
Original file line number	Diff line number	Diff line change
Expand Up		@@ -194,101 +194,143 @@ test_lock_counter_slow(PyObject *self, PyObject *obj)
		Py_RETURN_NONE;
		}

		struct bench_data_locks {
		int stop;
		int use_pymutex;
		int critical_section_length;
		struct bench_lock {
		char padding[200];
		PyThread_type_lock lock;
		PyMutex m;
		double value;
		Py_ssize_t total_iters;
		};

		struct bench_config {
		int stop;
		int work_inside;
		int work_outside;
		int num_acquisitions;
		int random_locks;
		Py_ssize_t target_iters;
		Py_ssize_t num_locks;
		struct bench_lock *locks;
		};

		struct bench_thread_data {
		struct bench_data_locks *bench_data;
		struct bench_config *config;
		struct bench_lock *lock;
		uint64_t rng_state;
		Py_ssize_t iters;
		PyEvent done;
		};

		static uint64_t
		splitmix64(uint64_t *state)
		{
		uint64_t z = (*state += 0x9e3779b97f4a7c15);
		z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
		z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
		return z ^ (z >> 31);
		}

		static void
		thread_benchmark_locks(void *arg)
		{
		struct bench_thread_data *thread_data = arg;
		struct bench_data_locks *bench_data = thread_data->bench_data;
		int use_pymutex = bench_data->use_pymutex;
		int critical_section_length = bench_data->critical_section_length;

		struct bench_thread_data *td = arg;
		struct bench_config *config = td->config;
		int work_inside = config->work_inside;
		int work_outside = config->work_outside;
		int num_acquisitions = config->num_acquisitions;
		Py_ssize_t target_iters = config->target_iters;
		uint64_t rng_state = td->rng_state;

		double local_value = 0.0;
		double my_value = 1.0;
		Py_ssize_t iters = 0;
		while (!_Py_atomic_load_int_relaxed(&bench_data->stop)) {
		if (use_pymutex) {
		PyMutex_Lock(&bench_data->m);
		for (int i = 0; i < critical_section_length; i++) {
		bench_data->value += my_value;
		my_value = bench_data->value;
		for (;;) {
		if (target_iters > 0) {
		if (iters >= target_iters) {
		break;
		}
		PyMutex_Unlock(&bench_data->m);
		}
		else {
		PyThread_acquire_lock(bench_data->lock, 1);
		for (int i = 0; i < critical_section_length; i++) {
		bench_data->value += my_value;
		my_value = bench_data->value;
		else if (_Py_atomic_load_int_relaxed(&config->stop)) {
		break;
		}
		struct bench_lock *lock = td->lock;
		if (config->random_locks) {
		uint32_t r = (uint32_t)splitmix64(&rng_state);
		// Fast modulo reduction to pick a random lock, adapted from:
		// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
		Py_ssize_t idx = ((uint64_t)r * (uint32_t)config->num_locks) >> 32;
		lock = &config->locks[idx];
		}
		for (int acq = 0; acq < num_acquisitions; acq++) {
		PyMutex_Lock(&lock->m);
		for (int i = 0; i < work_inside; i++) {
		lock->value += my_value;
		my_value = lock->value;
		}
		PyThread_release_lock(bench_data->lock);
		PyMutex_Unlock(&lock->m);
		}
		iters++;
		for (int i = 0; i < work_outside; i++) {
		local_value += my_value;
		my_value = local_value;
		}
		iters += num_acquisitions;
		}

		thread_data->iters = iters;
		_Py_atomic_add_ssize(&bench_data->total_iters, iters);
		_PyEvent_Notify(&thread_data->done);
		td->iters = iters;
		_PyEvent_Notify(&td->done);
		}

		/*[clinic input]
		_testinternalcapi.benchmark_locks

		num_threads: Py_ssize_t
		use_pymutex: bool =True
		critical_section_length: int =1
		work_inside: int =1
		work_outside: int =0
		time_ms: int = 1000
		num_acquisitions: int = 1
		total_iters: Py_ssize_t = 0
		num_locks: Py_ssize_t = 1
		random_locks: bool = False
		/

		[clinic start generated code]*/

		static PyObject *
		_testinternalcapi_benchmark_locks_impl(PyObject *module,
		Py_ssize_t num_threads,
		int use_pymutex,
		int critical_section_length,
		int time_ms)
		/*[clinic end generated code: output=381df8d7e9a74f18 input=f3aeaf688738c121]*/
		int work_inside, int work_outside,
		int time_ms, int num_acquisitions,
		Py_ssize_t total_iters,
		Py_ssize_t num_locks,
		int random_locks)
		/*[clinic end generated code: output=6258dc9de8cb9af1 input=d622cf4e1c4d008b]*/
		{
		// Run from Tools/lockbench/lockbench.py
		// Based on the WebKit lock benchmarks:
		// https://github.com/WebKit/WebKit/blob/main/Source/WTF/benchmarks/LockSpeedTest.cpp
		// See also https://webkit.org/blog/6161/locking-in-webkit/
		PyObject *thread_iters = NULL;
		PyObject *res = NULL;
		struct bench_thread_data *thread_data = NULL;

		struct bench_data_locks bench_data;
		memset(&bench_data, 0, sizeof(bench_data));
		bench_data.use_pymutex = use_pymutex;
		bench_data.critical_section_length = critical_section_length;

		bench_data.lock = PyThread_allocate_lock();
		if (bench_data.lock == NULL) {
		return PyErr_NoMemory();
		struct bench_config config = {
		.work_inside = work_inside,
		.work_outside = work_outside,
		.num_acquisitions = num_acquisitions,
		.target_iters = total_iters,
		.num_locks = num_locks,
		.random_locks = random_locks,
		};

		config.locks = PyMem_Calloc(num_locks, sizeof(*config.locks));
		if (config.locks == NULL) {
		PyErr_NoMemory();
		goto exit;
		}

		struct bench_thread_data *thread_data = NULL;
		thread_data = PyMem_Calloc(num_threads, sizeof(*thread_data));
		if (thread_data == NULL) {
		PyErr_NoMemory();
		goto exit;
		}

		thread_iters = PyList_New(num_threads);
		if (thread_iters == NULL) {
		goto exit;
Expand All		@@ -300,40 +342,43 @@ _testinternalcapi_benchmark_locks_impl(PyObject *module,
		}

		for (Py_ssize_t i = 0; i < num_threads; i++) {
		thread_data[i].bench_data = &bench_data;
		thread_data[i].config = &config;
		thread_data[i].lock = &config.locks[i % num_locks];
		thread_data[i].rng_state = (uint64_t)i + 1;
		PyThread_start_new_thread(thread_benchmark_locks, &thread_data[i]);
		}

		// Let the threads run for `time_ms` milliseconds
		pysleep(time_ms);
		_Py_atomic_store_int(&bench_data.stop, 1);
		if (total_iters == 0) {
		pysleep(time_ms);
		_Py_atomic_store_int(&config.stop, 1);
		}

		// Wait for the threads to finish
		for (Py_ssize_t i = 0; i < num_threads; i++) {
		PyEvent_Wait(&thread_data[i].done);
		}

		Py_ssize_t total_iters = bench_data.total_iters;
		if (PyTime_PerfCounter(&end) < 0) {
		goto exit;
		}

		// Return the total number of acquisitions and the number of acquisitions
		// for each thread.
		Py_ssize_t sum_iters = 0;
		for (Py_ssize_t i = 0; i < num_threads; i++) {
		PyObject *iter = PyLong_FromSsize_t(thread_data[i].iters);
		if (iter == NULL) {
		goto exit;
		}
		PyList_SET_ITEM(thread_iters, i, iter);
		sum_iters += thread_data[i].iters;
		}

		assert(end != start);
		double rate = total_iters * 1e9 / (end - start);
		res = Py_BuildValue("(dO)", rate, thread_iters);
		PyTime_t elapsed_ns = end - start;
		double rate = sum_iters * 1e9 / elapsed_ns;
		res = Py_BuildValue("(dOL)", rate, thread_iters,
		(long long)elapsed_ns);

		exit:
		PyThread_free_lock(bench_data.lock);
		PyMem_Free(config.locks);
		PyMem_Free(thread_data);
		Py_XDECREF(thread_iters);
		return res;
Expand All		@@ -344,7 +389,7 @@ test_lock_benchmark(PyObject *module, PyObject *obj)
		{
		// Just make sure the benchmark runs without crashing
		PyObject *res = _testinternalcapi_benchmark_locks_impl(
		module, 1, 1,1, 100);
		module, 1, 1,0, 100, 1, 0, 1, 0);
		if (res == NULL) {
		return NULL;
		}
Expand Down