Skip to content

Commit 8110537

Browse files
gpshead and claude committed
pystrhex: Use configure.ac detection for SIMD support
Replace the hardcoded arch/compiler preprocessor checks with a configure.ac probe that tests __builtin_shufflevector with 128-bit vectors directly. This follows the existing HAVE_BUILTIN_ATOMIC pattern and lets configure determine what the compiler actually supports rather than maintaining a list of arch/compiler combos.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 2a378d4 commit 8110537

File tree

4 files changed

+107
-11
lines changed

4 files changed

+107
-11
lines changed

Python/pystrhex.c

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -41,15 +41,7 @@ _Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
4141
4242
Speeding up the 16-64 byte cases fits nicely with md5 through sha512.
4343
*/
44-
#if (defined(__x86_64__) || defined(__aarch64__) || \
45-
(defined(__arm__) && defined(__ARM_NEON))) && \
46-
(defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 12))
47-
# define PY_HEXLIFY_CAN_COMPILE_SIMD 1
48-
#else
49-
# define PY_HEXLIFY_CAN_COMPILE_SIMD 0
50-
#endif
51-
52-
#if PY_HEXLIFY_CAN_COMPILE_SIMD
44+
#ifdef HAVE_BUILTIN_SHUFFLEVECTOR_AND_IT_IS_WORTH_USING
5345

5446
/* 128-bit vector of 16 unsigned bytes */
5547
typedef unsigned char v16u8 __attribute__((vector_size(16)));
@@ -122,7 +114,7 @@ _Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
122114
_Py_hexlify_scalar(src + i, dst, len - i);
123115
}
124116

125-
#endif /* PY_HEXLIFY_CAN_COMPILE_SIMD */
117+
#endif /* HAVE_BUILTIN_SHUFFLEVECTOR_AND_IT_IS_WORTH_USING */
126118

127119
static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
128120
PyObject* sep, int bytes_per_sep_group,
@@ -202,7 +194,7 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
202194
unsigned char c;
203195

204196
if (bytes_per_sep_group == 0) {
205-
#if PY_HEXLIFY_CAN_COMPILE_SIMD
197+
#ifdef HAVE_BUILTIN_SHUFFLEVECTOR_AND_IT_IS_WORTH_USING
206198
if (arglen >= 16) {
207199
// little vector units go brrrr...
208200
_Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);

configure

Lines changed: 66 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

configure.ac

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5014,6 +5014,40 @@ AS_VAR_IF([ac_cv_builtin_atomic], [yes], [
50145014
AC_DEFINE(HAVE_BUILTIN_ATOMIC, 1, [Has builtin __atomic_load_n() and __atomic_store_n() functions])
50155015
])
50165016

5017+
# Check for __builtin_shufflevector with 128-bit vector support on an
5018+
# architecture where it compiles to worthwhile native SIMD instructions.
5019+
# Used for SIMD-accelerated bytes.hex() in Python/pystrhex.c.
5020+
AC_CACHE_CHECK([for __builtin_shufflevector], [ac_cv_builtin_shufflevector], [
5021+
AC_LINK_IFELSE([
5022+
AC_LANG_PROGRAM([[
5023+
/* __builtin_shufflevector is available on many platforms, but 128-bit
5024+
vector code is only worthwhile on architectures with native SIMD:
5025+
x86-64 (SSE2, always available), ARM64 (NEON, always available),
5026+
or ARM32 when NEON is enabled via compiler flags (e.g. -march=native
5027+
on RPi3+). On ARM32 without NEON (e.g. armv6 builds), the compiler
5028+
has the builtin but generates slow scalar code instead. */
5029+
#if !defined(__x86_64__) && !defined(__aarch64__) && \
5030+
!(defined(__arm__) && defined(__ARM_NEON))
5031+
# error "128-bit vector SIMD not worthwhile on this architecture"
5032+
#endif
5033+
typedef unsigned char v16u8 __attribute__((vector_size(16)));
5034+
]], [[
5035+
v16u8 a = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
5036+
v16u8 b = {16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
5037+
v16u8 c = __builtin_shufflevector(a, b,
5038+
0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
5039+
(void)c;
5040+
return 0;
5041+
]])
5042+
],[ac_cv_builtin_shufflevector=yes],[ac_cv_builtin_shufflevector=no])
5043+
])
5044+
5045+
AS_VAR_IF([ac_cv_builtin_shufflevector], [yes], [
5046+
AC_DEFINE([HAVE_BUILTIN_SHUFFLEVECTOR_AND_IT_IS_WORTH_USING], [1],
5047+
[Define if compiler supports __builtin_shufflevector with
5048+
128-bit vectors on an architecture with native SIMD])
5049+
])
5050+
50175051
# --with-mimalloc
50185052
AC_MSG_CHECKING([for --with-mimalloc])
50195053
AC_ARG_WITH([mimalloc],

pyconfig.h.in

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,10 @@
126126
/* Has builtin __atomic_load_n() and __atomic_store_n() functions */
127127
#undef HAVE_BUILTIN_ATOMIC
128128

129+
/* Define if compiler supports __builtin_shufflevector with 128-bit vectors on
130+
an architecture with native SIMD */
131+
#undef HAVE_BUILTIN_SHUFFLEVECTOR_AND_IT_IS_WORTH_USING
132+
129133
/* Define to 1 if you have the <bzlib.h> header file. */
130134
#undef HAVE_BZLIB_H
131135

0 commit comments

Comments
 (0)