@@ -196,13 +196,13 @@ test_lock_counter_slow(PyObject *self, PyObject *obj)
196196
// Benchmark state for one lock; each worker thread is pointed at one of these
// and reads its configuration from it before entering the benchmark loop.
197197struct bench_data_locks {
198198 int stop ;  // workers poll this with a relaxed atomic load and exit when it is non-zero (time-based mode)
199- int use_pymutex ;
200- int critical_section_length ;
199+ int work_inside ;  // loop iterations performed while holding the mutex
200+ int work_outside ;  // loop iterations performed on thread-local data after releasing the mutex
201+ int num_acquisitions ;  // lock/unlock cycles per outer loop pass; also the per-pass increment of the iteration count
202+ Py_ssize_t target_iters ;  // if > 0, a worker stops once its iteration count reaches this; otherwise `stop` is used
201203 char padding [200 ];  // NOTE(review): presumably keeps the fields above off the cache line of the contended mutex/value — confirm
202- PyThread_type_lock lock ;
203204 PyMutex m ;  // the lock being benchmarked
204205 double value ;  // shared accumulator, only modified while `m` is held
205- Py_ssize_t total_iters ;
206206};
207207
208208struct bench_thread_data {
@@ -216,79 +216,95 @@ thread_benchmark_locks(void *arg)
216216{
217217 struct bench_thread_data * thread_data = arg ;
218218 struct bench_data_locks * bench_data = thread_data -> bench_data ;
219- int use_pymutex = bench_data -> use_pymutex ;
220- int critical_section_length = bench_data -> critical_section_length ;
219+ int work_inside = bench_data -> work_inside ;
220+ int work_outside = bench_data -> work_outside ;
221+ int num_acquisitions = bench_data -> num_acquisitions ;
222+ Py_ssize_t target_iters = bench_data -> target_iters ;
221223
224+ double local_value = 0.0 ;
222225 double my_value = 1.0 ;
223226 Py_ssize_t iters = 0 ;
224- while (!_Py_atomic_load_int_relaxed (& bench_data -> stop )) {
225- if (use_pymutex ) {
226- PyMutex_Lock (& bench_data -> m );
227- for (int i = 0 ; i < critical_section_length ; i ++ ) {
228- bench_data -> value += my_value ;
229- my_value = bench_data -> value ;
227+ for (;;) {
228+ if (target_iters > 0 ) {
229+ // Fixed iteration mode: stop after at least target_iters acquisitions
230+ if (iters >= target_iters ) {
231+ break ;
230232 }
231- PyMutex_Unlock (& bench_data -> m );
232233 }
233234 else {
234- PyThread_acquire_lock (bench_data -> lock , 1 );
235- for (int i = 0 ; i < critical_section_length ; i ++ ) {
235+ // Time-based mode: stop when signaled
236+ if (_Py_atomic_load_int_relaxed (& bench_data -> stop )) {
237+ break ;
238+ }
239+ }
240+ for (int acq = 0 ; acq < num_acquisitions ; acq ++ ) {
241+ PyMutex_Lock (& bench_data -> m );
242+ for (int i = 0 ; i < work_inside ; i ++ ) {
236243 bench_data -> value += my_value ;
237244 my_value = bench_data -> value ;
238245 }
239- PyThread_release_lock ( bench_data -> lock );
246+ PyMutex_Unlock ( & bench_data -> m );
240247 }
241- iters ++ ;
248+ for (int i = 0 ; i < work_outside ; i ++ ) {
249+ local_value += my_value ;
250+ my_value = local_value ;
251+ }
252+ iters += num_acquisitions ;
242253 }
243254
244255 thread_data -> iters = iters ;
245- _Py_atomic_add_ssize (& bench_data -> total_iters , iters );
246256 _PyEvent_Notify (& thread_data -> done );
247257}
248258
249259/*[clinic input]
250260_testinternalcapi.benchmark_locks
251261
252262 num_threads: Py_ssize_t
253- use_pymutex: bool = True
254- critical_section_length : int = 1
263+ work_inside: int = 1
264+ work_outside : int = 0
255265 time_ms: int = 1000
266+ num_acquisitions: int = 1
267+ total_iters: Py_ssize_t = 0
268+ num_locks: Py_ssize_t = 1
256269 /
257270
258271[clinic start generated code]*/
259272
260273static PyObject *
261274_testinternalcapi_benchmark_locks_impl (PyObject * module ,
262275 Py_ssize_t num_threads ,
263- int use_pymutex ,
264- int critical_section_length ,
265- int time_ms )
266- /*[clinic end generated code: output=381df8d7e9a74f18 input=f3aeaf688738c121]*/
276+ int work_inside , int work_outside ,
277+ int time_ms , int num_acquisitions ,
278+ Py_ssize_t total_iters ,
279+ Py_ssize_t num_locks )
280+ /*[clinic end generated code: output=942723d0d7194f36 input=d21190b0d7cf00b9]*/
267281{
268282 // Run from Tools/lockbench/lockbench.py
269283 // Based on the WebKit lock benchmarks:
270284 // https://github.com/WebKit/WebKit/blob/main/Source/WTF/benchmarks/LockSpeedTest.cpp
271285 // See also https://webkit.org/blog/6161/locking-in-webkit/
272286 PyObject * thread_iters = NULL ;
273287 PyObject * res = NULL ;
288+ struct bench_data_locks * bench_data = NULL ;
289+ struct bench_thread_data * thread_data = NULL ;
274290
275- struct bench_data_locks bench_data ;
276- memset (& bench_data , 0 , sizeof (bench_data ));
277- bench_data .use_pymutex = use_pymutex ;
278- bench_data .critical_section_length = critical_section_length ;
279-
280- bench_data .lock = PyThread_allocate_lock ();
281- if (bench_data .lock == NULL ) {
282- return PyErr_NoMemory ();
291+ bench_data = PyMem_Calloc (num_locks , sizeof (* bench_data ));
292+ if (bench_data == NULL ) {
293+ PyErr_NoMemory ();
294+ goto exit ;
295+ }
296+ for (Py_ssize_t i = 0 ; i < num_locks ; i ++ ) {
297+ bench_data [i ].work_inside = work_inside ;
298+ bench_data [i ].work_outside = work_outside ;
299+ bench_data [i ].num_acquisitions = num_acquisitions ;
300+ bench_data [i ].target_iters = total_iters ;
283301 }
284302
285- struct bench_thread_data * thread_data = NULL ;
286303 thread_data = PyMem_Calloc (num_threads , sizeof (* thread_data ));
287304 if (thread_data == NULL ) {
288305 PyErr_NoMemory ();
289306 goto exit ;
290307 }
291-
292308 thread_iters = PyList_New (num_threads );
293309 if (thread_iters == NULL ) {
294310 goto exit ;
@@ -300,40 +316,47 @@ _testinternalcapi_benchmark_locks_impl(PyObject *module,
300316 }
301317
302318 for (Py_ssize_t i = 0 ; i < num_threads ; i ++ ) {
303- thread_data [i ].bench_data = & bench_data ;
319+ thread_data [i ].bench_data = & bench_data [ i % num_locks ] ;
304320 PyThread_start_new_thread (thread_benchmark_locks , & thread_data [i ]);
305321 }
306322
307- // Let the threads run for `time_ms` milliseconds
308- pysleep (time_ms );
309- _Py_atomic_store_int (& bench_data .stop , 1 );
323+ if (total_iters == 0 ) {
324+ // Time-based mode: let the threads run for `time_ms` milliseconds
325+ pysleep (time_ms );
326+ for (Py_ssize_t i = 0 ; i < num_locks ; i ++ ) {
327+ _Py_atomic_store_int (& bench_data [i ].stop , 1 );
328+ }
329+ }
310330
311- // Wait for the threads to finish
331+ // Wait for lock threads to finish
312332 for (Py_ssize_t i = 0 ; i < num_threads ; i ++ ) {
313333 PyEvent_Wait (& thread_data [i ].done );
314334 }
315335
316- Py_ssize_t total_iters = bench_data .total_iters ;
317336 if (PyTime_PerfCounter (& end ) < 0 ) {
318337 goto exit ;
319338 }
320339
321- // Return the total number of acquisitions and the number of acquisitions
322- // for each thread.
340+ // Return the acquisition rate (acquisitions per second), the number of
341+ // acquisitions for each thread, and the elapsed time in nanoseconds.
342+ Py_ssize_t sum_iters = 0 ;
323343 for (Py_ssize_t i = 0 ; i < num_threads ; i ++ ) {
324344 PyObject * iter = PyLong_FromSsize_t (thread_data [i ].iters );
325345 if (iter == NULL ) {
326346 goto exit ;
327347 }
328348 PyList_SET_ITEM (thread_iters , i , iter );
349+ sum_iters += thread_data [i ].iters ;
329350 }
330351
331352 assert (end != start );
332- double rate = total_iters * 1e9 / (end - start );
333- res = Py_BuildValue ("(dO)" , rate , thread_iters );
353+ PyTime_t elapsed_ns = end - start ;
354+ double rate = sum_iters * 1e9 / elapsed_ns ;
355+ res = Py_BuildValue ("(dOL)" , rate , thread_iters ,
356+ (long long )elapsed_ns );
334357
335358exit :
336- PyThread_free_lock (bench_data . lock );
359+ PyMem_Free (bench_data );
337360 PyMem_Free (thread_data );
338361 Py_XDECREF (thread_iters );
339362 return res ;
@@ -344,7 +367,7 @@ test_lock_benchmark(PyObject *module, PyObject *obj)
344367{
345368 // Just make sure the benchmark runs without crashing
346369 PyObject * res = _testinternalcapi_benchmark_locks_impl (
347- module , 1 , 1 , 1 , 100 );
370+ module , 1 , 1 , 0 , 100 , 1 , 0 , 1 );
348371 if (res == NULL ) {
349372 return NULL ;
350373 }
0 commit comments