python
diff --git a/‎Modules/_testinternalcapi/clinic/test_lock.c.h‎
Lines changed: 56 additions & 14 deletions b/‎Modules/_testinternalcapi/clinic/test_lock.c.h‎
Lines changed: 56 additions & 14 deletions
diff --git a/‎Modules/_testinternalcapi/test_lock.c‎
Lines changed: 69 additions & 46 deletions b/‎Modules/_testinternalcapi/test_lock.c‎
Lines changed: 69 additions & 46 deletions
@@ -196,13 +196,13 @@ test_lock_counter_slow(PyObject *self, PyObject *obj)
 
 struct bench_data_locks {
     int stop;
-    int use_pymutex;
-    int critical_section_length;
+    int work_inside;
+    int work_outside;
+    int num_acquisitions;
+    Py_ssize_t target_iters;
     char padding[200];
-    PyThread_type_lock lock;
     PyMutex m;
     double value;
-    Py_ssize_t total_iters;
 };
 
 struct bench_thread_data {
@@ -216,79 +216,95 @@ thread_benchmark_locks(void *arg)
 {
     struct bench_thread_data *thread_data = arg;
     struct bench_data_locks *bench_data = thread_data->bench_data;
-    int use_pymutex = bench_data->use_pymutex;
-    int critical_section_length = bench_data->critical_section_length;
+    int work_inside = bench_data->work_inside;
+    int work_outside = bench_data->work_outside;
+    int num_acquisitions = bench_data->num_acquisitions;
+    Py_ssize_t target_iters = bench_data->target_iters;
 
+    double local_value = 0.0;
     double my_value = 1.0;
     Py_ssize_t iters = 0;
-    while (!_Py_atomic_load_int_relaxed(&bench_data->stop)) {
-        if (use_pymutex) {
-            PyMutex_Lock(&bench_data->m);
-            for (int i = 0; i < critical_section_length; i++) {
-                bench_data->value += my_value;
-                my_value = bench_data->value;
+    for (;;) {
+        if (target_iters > 0) {
+            // Fixed iteration mode: each thread runs for target_iters
+            if (iters >= target_iters) {
+                break;
             }
-            PyMutex_Unlock(&bench_data->m);
         }
         else {
-            PyThread_acquire_lock(bench_data->lock, 1);
-            for (int i = 0; i < critical_section_length; i++) {
+            // Time-based mode: stop when signaled
+            if (_Py_atomic_load_int_relaxed(&bench_data->stop)) {
+                break;
+            }
+        }
+        for (int acq = 0; acq < num_acquisitions; acq++) {
+            PyMutex_Lock(&bench_data->m);
+            for (int i = 0; i < work_inside; i++) {
                 bench_data->value += my_value;
                 my_value = bench_data->value;
             }
-            PyThread_release_lock(bench_data->lock);
+            PyMutex_Unlock(&bench_data->m);
         }
-        iters++;
+        for (int i = 0; i < work_outside; i++) {
+            local_value += my_value;
+            my_value = local_value;
+        }
+        iters += num_acquisitions;
     }
 
     thread_data->iters = iters;
-    _Py_atomic_add_ssize(&bench_data->total_iters, iters);
     _PyEvent_Notify(&thread_data->done);
 }
 
 /*[clinic input]
 _testinternalcapi.benchmark_locks
 
     num_threads: Py_ssize_t
-    use_pymutex: bool = True
-    critical_section_length: int = 1
+    work_inside: int = 1
+    work_outside: int = 0
     time_ms: int = 1000
+    num_acquisitions: int = 1
+    total_iters: Py_ssize_t = 0
+    num_locks: Py_ssize_t = 1
     /
 
 [clinic start generated code]*/
 
 static PyObject *
 _testinternalcapi_benchmark_locks_impl(PyObject *module,
                                        Py_ssize_t num_threads,
-                                       int use_pymutex,
-                                       int critical_section_length,
-                                       int time_ms)
-/*[clinic end generated code: output=381df8d7e9a74f18 input=f3aeaf688738c121]*/
+                                       int work_inside, int work_outside,
+                                       int time_ms, int num_acquisitions,
+                                       Py_ssize_t total_iters,
+                                       Py_ssize_t num_locks)
+/*[clinic end generated code: output=942723d0d7194f36 input=d21190b0d7cf00b9]*/
 {
     // Run from Tools/lockbench/lockbench.py
     // Based on the WebKit lock benchmarks:
     // https://github.com/WebKit/WebKit/blob/main/Source/WTF/benchmarks/LockSpeedTest.cpp
     // See also https://webkit.org/blog/6161/locking-in-webkit/
     PyObject *thread_iters = NULL;
     PyObject *res = NULL;
+    struct bench_data_locks *bench_data = NULL;
+    struct bench_thread_data *thread_data = NULL;
 
-    struct bench_data_locks bench_data;
-    memset(&bench_data, 0, sizeof(bench_data));
-    bench_data.use_pymutex = use_pymutex;
-    bench_data.critical_section_length = critical_section_length;
-
-    bench_data.lock = PyThread_allocate_lock();
-    if (bench_data.lock == NULL) {
-        return PyErr_NoMemory();
+    bench_data = PyMem_Calloc(num_locks, sizeof(*bench_data));
+    if (bench_data == NULL) {
+        PyErr_NoMemory();
+        goto exit;
+    }
+    for (Py_ssize_t i = 0; i < num_locks; i++) {
+        bench_data[i].work_inside = work_inside;
+        bench_data[i].work_outside = work_outside;
+        bench_data[i].num_acquisitions = num_acquisitions;
+        bench_data[i].target_iters = total_iters;
     }
 
-    struct bench_thread_data *thread_data = NULL;
     thread_data = PyMem_Calloc(num_threads, sizeof(*thread_data));
     if (thread_data == NULL) {
         PyErr_NoMemory();
         goto exit;
     }
-
     thread_iters = PyList_New(num_threads);
     if (thread_iters == NULL) {
         goto exit;
@@ -300,40 +316,47 @@ _testinternalcapi_benchmark_locks_impl(PyObject *module,
     }
 
     for (Py_ssize_t i = 0; i < num_threads; i++) {
-        thread_data[i].bench_data = &bench_data;
+        thread_data[i].bench_data = &bench_data[i % num_locks];
         PyThread_start_new_thread(thread_benchmark_locks, &thread_data[i]);
     }
 
-    // Let the threads run for `time_ms` milliseconds
-    pysleep(time_ms);
-    _Py_atomic_store_int(&bench_data.stop, 1);
+    if (total_iters == 0) {
+        // Time-based mode: let the threads run for `time_ms` milliseconds
+        pysleep(time_ms);
+        for (Py_ssize_t i = 0; i < num_locks; i++) {
+            _Py_atomic_store_int(&bench_data[i].stop, 1);
+        }
+    }
 
-    // Wait for the threads to finish
+    // Wait for lock threads to finish
     for (Py_ssize_t i = 0; i < num_threads; i++) {
         PyEvent_Wait(&thread_data[i].done);
     }
 
-    Py_ssize_t total_iters = bench_data.total_iters;
     if (PyTime_PerfCounter(&end) < 0) {
         goto exit;
     }
 
-    // Return the total number of acquisitions and the number of acquisitions
-    // for each thread.
+    // Return the total number of acquisitions, the number of acquisitions
+    // for each thread, and elapsed time.
+    Py_ssize_t sum_iters = 0;
     for (Py_ssize_t i = 0; i < num_threads; i++) {
         PyObject *iter = PyLong_FromSsize_t(thread_data[i].iters);
         if (iter == NULL) {
             goto exit;
         }
         PyList_SET_ITEM(thread_iters, i, iter);
+        sum_iters += thread_data[i].iters;
     }
 
     assert(end != start);
-    double rate = total_iters * 1e9 / (end - start);
-    res = Py_BuildValue("(dO)", rate, thread_iters);
+    PyTime_t elapsed_ns = end - start;
+    double rate = sum_iters * 1e9 / elapsed_ns;
+    res = Py_BuildValue("(dOL)", rate, thread_iters,
+                        (long long)elapsed_ns);
 
 exit:
-    PyThread_free_lock(bench_data.lock);
+    PyMem_Free(bench_data);
     PyMem_Free(thread_data);
     Py_XDECREF(thread_iters);
     return res;
@@ -344,7 +367,7 @@ test_lock_benchmark(PyObject *module, PyObject *obj)
 {
     // Just make sure the benchmark runs without crashing
     PyObject *res = _testinternalcapi_benchmark_locks_impl(
-        module, 1, 1, 1, 100);
+        module, 1, 1, 0, 100, 1, 0, 1);
     if (res == NULL) {
         return NULL;
     }