diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 865c0df..eb25cc7 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -9,6 +9,54 @@ on: permissions: read-all jobs: + build-windows-msvc: + runs-on: windows-latest + + steps: + - uses: actions/checkout@v4 + + # Set up MSVC environment + - name: Set up MSVC Developer Command Prompt + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: x64 + + # Install Python (Meson requires it) + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + + # Install Meson and Ninja + - name: Install Meson + Ninja + run: | + python -m pip install --upgrade pip + pip install meson ninja + + - name: Setup environment + run: | + echo "PKG_CONFIG=${{ github.workspace }}/vcpkg/installed/x64-windows/tools/pkgconf/pkgconf.exe" >> $env:GITHUB_ENV + echo "PKG_CONFIG_PATH=${{ github.workspace }}/vcpkg/installed/x64-windows/lib/pkgconfig" >> $env:GITHUB_ENV + echo "LIB=${{ github.workspace }}/vcpkg/installed/x64-windows/lib;$env:LIB" >> $env:GITHUB_ENV + echo "INCLUDE=${{ github.workspace }}/vcpkg/installed/x64-windows/include;$env:INCLUDE" >> $env:GITHUB_ENV + shell: pwsh + + - name: Setup vcpkg and install pkg-config and gtest + run: | + git clone https://github.com/Microsoft/vcpkg.git + .\vcpkg\bootstrap-vcpkg.bat + .\vcpkg\vcpkg install gtest:x64-windows pkgconf:x64-windows + + # Configure and build with Meson (MSVC will be used automatically) + - name: Configure (Meson) + run: meson setup -Dbuild_tests=true --warnlevel 2 --buildtype release builddir --backend=ninja + + - name: Build (Ninja) + run: ninja -C builddir + + - name: Run tests + run: meson test -C builddir --test-args "\-\-gtest_filter=*qsort*" -v + SKL-gcc9: runs-on: ubuntu-24.04 @@ -135,7 +183,7 @@ jobs: - name: Run test suite on SPR run: sde -spr -- ./builddir/testexe --gtest_filter="*simdsort*" - ADL-ASAN-clang18: + ASAN-clang18: runs-on: ubuntu-24.04 @@ -169,48 +217,9 @@ jobs: cd builddir ninja - - name: Run test suite on ADL - run: sde -adl -- ./builddir/testexe --gtest_filter="*simdsort*" - - SPR-ASAN-clang18: - - runs-on: intel-ubuntu-24.04 - - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - name: Install dependencies - run: | - sudo apt update - sudo apt -y install clang-18 libomp-18-dev libgtest-dev meson curl git - - - name: Install Intel SDE - run: | - curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz - mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ - sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde - - - name: Build examples - env: - CXX: clang++-18 - run: | - cd examples - make all - - - name: Build - env: - CXX: clang++-18 - run: | - make clean - meson setup -Dbuild_tests=true -Duse_openmp=true -Db_sanitize=address,undefined -Dfatal_sanitizers=true -Dasan_ci_dont_validate=true -Db_lundef=false --warnlevel 0 --buildtype release builddir - cd builddir - ninja + - name: Run test suite + run: ./builddir/testexe --gtest_filter="*simdsort*" - - name: Run test suite on SPR - run: sde -spr -- ./builddir/testexe - - name: Run ICL fp16 tests - # Note: This filters for the _Float16 tests based on the number assigned to it, which could change in the future - run: sde -icx -- ./builddir/testexe --gtest_filter="*/simdsort/2*" SKX-SKL-openmp: diff --git a/lib/meson.build b/lib/meson.build index 44ced53..29ee139 100644 --- a/lib/meson.build +++ b/lib/meson.build @@ -1,40 +1,33 @@ libtargets = [] +libtargets += static_library('libavx', + files( + 'x86simdsort-avx2.cpp', + ), + include_directories : [src], + cpp_args : cpp.get_id() == 'msvc' ? ['/arch:AVX2'] : ['-march=haswell'], + gnu_symbol_visibility : 'inlineshidden', + dependencies: [omp_dep], + ) -if cpp.has_argument('-march=haswell') - libtargets += static_library('libavx', - files( - 'x86simdsort-avx2.cpp', - ), - include_directories : [src], - cpp_args : ['-march=haswell'], - gnu_symbol_visibility : 'inlineshidden', - dependencies: [omp_dep], - ) -endif - -if cpp.has_argument('-march=skylake-avx512') - libtargets += static_library('libskx', - files( - 'x86simdsort-skx.cpp', - ), - include_directories : [src], - cpp_args : ['-march=skylake-avx512'], - gnu_symbol_visibility : 'inlineshidden', - dependencies: [omp_dep], - ) -endif +libtargets += static_library('libskx', + files( + 'x86simdsort-skx.cpp', + ), + include_directories : [src], + cpp_args : cpp.get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=skylake-avx512'], + gnu_symbol_visibility : 'inlineshidden', + dependencies: [omp_dep], + ) -if cpp.has_argument('-march=icelake-client') - libtargets += static_library('libicl', - files( - 'x86simdsort-icl.cpp', - ), - include_directories : [src], - cpp_args : ['-march=icelake-client'], - gnu_symbol_visibility : 'inlineshidden', - dependencies: [omp_dep], - ) -endif +libtargets += static_library('libicl', + files( + 'x86simdsort-icl.cpp', + ), + include_directories : [src], + cpp_args : cpp.get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=icelake-client'], + gnu_symbol_visibility : 'inlineshidden', + dependencies: [omp_dep], + ) if cancompilefp16 libtargets += static_library('libspr', @@ -42,7 +35,7 @@ if cancompilefp16 'x86simdsort-spr.cpp', ), include_directories : [src], - cpp_args : ['-march=sapphirerapids'], + cpp_args : cpp.get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=sapphirerapids'], gnu_symbol_visibility : 'inlineshidden', dependencies: [omp_dep], ) diff --git a/lib/x86simdsort-icl.cpp b/lib/x86simdsort-icl.cpp index 6bbad2c..3e5c4b5 100644 --- a/lib/x86simdsort-icl.cpp +++ b/lib/x86simdsort-icl.cpp @@ -1,6 +1,9 @@ // ICL specific routines: #include "x86simdsort-static-incl.h" #include "x86simdsort-internal.h" +#ifdef _MSC_VER +#include "avx512-16bit-qsort.hpp" +#endif namespace xss { namespace avx512 { diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index 35d6ce4..776ec56 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -1,6 +1,12 @@ +#if defined(_MSC_VER) +#define XSS_ATTRIBUTE_CONSTRUCTOR +#else +#define XSS_ATTRIBUTE_CONSTRUCTOR __attribute__((constructor)) +#endif #include "x86simdsort.h" #include "x86simdsort-internal.h" #include "x86simdsort-scalar.h" +#include "x86simdsortcpuid.h" #include #include #include @@ -12,23 +18,19 @@ static int check_cpu_feature_support(std::string_view cpufeature) if ((cpufeature == "avx512_spr") && (!disable_avx512)) #if defined(__FLT16_MAX__) && !defined(__INTEL_LLVM_COMPILER) \ && (!defined(__clang_major__) || __clang_major__ >= 18) - return __builtin_cpu_supports("avx512f") - && __builtin_cpu_supports("avx512fp16") - && __builtin_cpu_supports("avx512vbmi2"); + return xss_cpu_supports("avx512f") && xss_cpu_supports("avx512fp16") + && xss_cpu_supports("avx512vbmi2"); #else return 0; #endif else if ((cpufeature == "avx512_icl") && (!disable_avx512)) - return __builtin_cpu_supports("avx512f") - && __builtin_cpu_supports("avx512vbmi2") - && __builtin_cpu_supports("avx512bw") - && __builtin_cpu_supports("avx512vl"); + return xss_cpu_supports("avx512f") && xss_cpu_supports("avx512vbmi2") + && xss_cpu_supports("avx512bw") && xss_cpu_supports("avx512vl"); else if ((cpufeature == "avx512_skx") && (!disable_avx512)) - return __builtin_cpu_supports("avx512f") - && __builtin_cpu_supports("avx512dq") - && __builtin_cpu_supports("avx512vl"); + return xss_cpu_supports("avx512f") && xss_cpu_supports("avx512dq") + && xss_cpu_supports("avx512vl"); else if (cpufeature == "avx2") - return __builtin_cpu_supports("avx2"); + return xss_cpu_supports("avx2"); return 0; } @@ -57,10 +59,119 @@ namespace x86simdsort { #define CAT_(a, b) a##b #define CAT(a, b) CAT_(a, b) +/* runtime dispatch mechanism */ +#define DISPATCH(func, TYPE, ISA) \ + DECLARE_INTERNAL_##func(TYPE) static XSS_ATTRIBUTE_CONSTRUCTOR void CAT( \ + CAT(resolve_, func), TYPE)(void) \ + { \ + CAT(CAT(internal_, func), TYPE) = &xss::scalar::func; \ + xss_cpu_init(); \ + std::string_view preferred_cpu = find_preferred_cpu(ISA); \ + if constexpr (dispatch_requested("avx512", ISA)) { \ + if (preferred_cpu.find("avx512") != std::string_view::npos) { \ + if constexpr (IS_TYPE_FLOAT16()) { \ + if (preferred_cpu.find("avx512_spr") \ + != std::string_view::npos) { \ + CAT(CAT(internal_, func), TYPE) \ + = &xss::fp16_spr::func; \ + return; \ + } \ + if (preferred_cpu.find("avx512_icl") \ + != std::string_view::npos) { \ + CAT(CAT(internal_, func), TYPE) \ + = &xss::fp16_icl::func; \ + return; \ + } \ + } \ + else { \ + CAT(CAT(internal_, func), TYPE) \ + = &xss::avx512::func; \ + } \ + return; \ + } \ + } \ + if constexpr (dispatch_requested("avx2", ISA)) { \ + if (preferred_cpu.find("avx2") != std::string_view::npos) { \ + CAT(CAT(internal_, func), TYPE) = &xss::avx2::func; \ + return; \ + } \ + } \ + } + +#ifdef _MSC_VER +#define DECLARE_INTERNAL_qsort(TYPE) \ + static void CAT(resolve_qsort, TYPE)(void); \ + static void (*internal_qsort##TYPE)(TYPE *, size_t, bool, bool) = NULL; \ + template <> \ + void XSS_EXPORT_SYMBOL qsort( \ + TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ + { \ + if (internal_qsort##TYPE == NULL) { CAT(resolve_qsort, TYPE)(); } \ + (*internal_qsort##TYPE)(arr, arrsize, hasnan, descending); \ + } + +#define DECLARE_INTERNAL_qselect(TYPE) \ + static void CAT(resolve_qselect, TYPE)(void); \ + static void (*internal_qselect##TYPE)(TYPE *, size_t, size_t, bool, bool) \ + = NULL; \ + template <> \ + void XSS_EXPORT_SYMBOL qselect( \ + TYPE *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ + { \ + if (internal_qselect##TYPE == NULL) { CAT(resolve_qselect, TYPE)(); } \ + (*internal_qselect##TYPE)(arr, k, arrsize, hasnan, descending); \ + } + +#define DECLARE_INTERNAL_partial_qsort(TYPE) \ + static void CAT(resolve_partial_qsort, TYPE)(void); \ + static void (*internal_partial_qsort##TYPE)( \ + TYPE *, size_t, size_t, bool, bool) \ + = NULL; \ + template <> \ + void XSS_EXPORT_SYMBOL partial_qsort( \ + TYPE *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ + { \ + if (internal_partial_qsort##TYPE == NULL) { \ + CAT(resolve_partial_qsort, TYPE)(); \ + } \ + (*internal_partial_qsort##TYPE)(arr, k, arrsize, hasnan, descending); \ + } + +#define DECLARE_INTERNAL_argsort(TYPE) \ + static void CAT(resolve_argsort, TYPE)(void); \ + static std::vector (*internal_argsort##TYPE)( \ + const TYPE *, size_t, bool, bool) \ + = NULL; \ + template <> \ + std::vector XSS_EXPORT_SYMBOL argsort( \ + const TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ + { \ + if (internal_argsort##TYPE == NULL) { CAT(resolve_argsort, TYPE)(); } \ + return (*internal_argsort##TYPE)(arr, arrsize, hasnan, descending); \ + } + +#define DECLARE_INTERNAL_argselect(TYPE) \ + static void CAT(resolve_argselect, TYPE)(void); \ + static std::vector (*internal_argselect##TYPE)( \ + const TYPE *, size_t, size_t, bool) \ + = NULL; \ + template <> \ + std::vector XSS_EXPORT_SYMBOL argselect( \ + const TYPE *arr, size_t k, size_t arrsize, bool hasnan) \ + { \ + if (internal_argselect##TYPE == NULL) { \ + CAT(resolve_argselect, TYPE)(); \ + } \ + return (*internal_argselect##TYPE)(arr, k, arrsize, hasnan); \ + } + +#else + #define DECLARE_INTERNAL_qsort(TYPE) \ static void (*internal_qsort##TYPE)(TYPE *, size_t, bool, bool) = NULL; \ template <> \ - void qsort(TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ + void XSS_EXPORT_SYMBOL qsort( \ + TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ { \ (*internal_qsort##TYPE)(arr, arrsize, hasnan, descending); \ } @@ -69,7 +180,7 @@ namespace x86simdsort { static void (*internal_qselect##TYPE)(TYPE *, size_t, size_t, bool, bool) \ = NULL; \ template <> \ - void qselect( \ + void XSS_EXPORT_SYMBOL qselect( \ TYPE *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ { \ (*internal_qselect##TYPE)(arr, k, arrsize, hasnan, descending); \ @@ -80,7 +191,7 @@ namespace x86simdsort { TYPE *, size_t, size_t, bool, bool) \ = NULL; \ template <> \ - void partial_qsort( \ + void XSS_EXPORT_SYMBOL partial_qsort( \ TYPE *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ { \ (*internal_partial_qsort##TYPE)(arr, k, arrsize, hasnan, descending); \ @@ -91,7 +202,7 @@ namespace x86simdsort { const TYPE *, size_t, bool, bool) \ = NULL; \ template <> \ - std::vector argsort( \ + std::vector XSS_EXPORT_SYMBOL argsort( \ const TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ { \ return (*internal_argsort##TYPE)(arr, arrsize, hasnan, descending); \ @@ -102,12 +213,14 @@ namespace x86simdsort { const TYPE *, size_t, size_t, bool) \ = NULL; \ template <> \ - std::vector argselect( \ + std::vector XSS_EXPORT_SYMBOL argselect( \ const TYPE *arr, size_t k, size_t arrsize, bool hasnan) \ { \ return (*internal_argselect##TYPE)(arr, k, arrsize, hasnan); \ } +#endif // _MSC_VER + /* simple constexpr function as a way around having #ifdef __FLT16_MAX__ block * within the DISPATCH macro */ template @@ -119,45 +232,6 @@ constexpr bool IS_TYPE_FLOAT16() return false; } -/* runtime dispatch mechanism */ -#define DISPATCH(func, TYPE, ISA) \ - DECLARE_INTERNAL_##func(TYPE) static __attribute__((constructor)) void \ - CAT(CAT(resolve_, func), TYPE)(void) \ - { \ - CAT(CAT(internal_, func), TYPE) = &xss::scalar::func; \ - __builtin_cpu_init(); \ - std::string_view preferred_cpu = find_preferred_cpu(ISA); \ - if constexpr (dispatch_requested("avx512", ISA)) { \ - if (preferred_cpu.find("avx512") != std::string_view::npos) { \ - if constexpr (IS_TYPE_FLOAT16()) { \ - if (preferred_cpu.find("avx512_spr") \ - != std::string_view::npos) { \ - CAT(CAT(internal_, func), TYPE) \ - = &xss::fp16_spr::func; \ - return; \ - } \ - if (preferred_cpu.find("avx512_icl") \ - != std::string_view::npos) { \ - CAT(CAT(internal_, func), TYPE) \ - = &xss::fp16_icl::func; \ - return; \ - } \ - } \ - else { \ - CAT(CAT(internal_, func), TYPE) \ - = &xss::avx512::func; \ - } \ - return; \ - } \ - } \ - if constexpr (dispatch_requested("avx2", ISA)) { \ - if (preferred_cpu.find("avx2") != std::string_view::npos) { \ - CAT(CAT(internal_, func), TYPE) = &xss::avx2::func; \ - return; \ - } \ - } \ - } - #define ISA_LIST(...) \ std::initializer_list \ { \ @@ -204,7 +278,36 @@ DISPATCH_ALL(argselect, (ISA_LIST("avx512_skx", "avx2"))) /* Key-Value methods */ + +#define DISPATCH_KV_FUNC(func, TYPE1, TYPE2, ISA) \ + static XSS_ATTRIBUTE_CONSTRUCTOR void CAT( \ + CAT(CAT(CAT(resolve_, func), _), TYPE1), TYPE2)(void) \ + { \ + CAT(CAT(CAT(CAT(internal_, func), _), TYPE1), TYPE2) \ + = &xss::scalar::func; \ + xss_cpu_init(); \ + std::string_view preferred_cpu = find_preferred_cpu(ISA); \ + if constexpr (dispatch_requested("avx512", ISA)) { \ + if (preferred_cpu.find("avx512") != std::string_view::npos) { \ + CAT(CAT(CAT(CAT(internal_, func), _), TYPE1), TYPE2) \ + = &xss::avx512::func; \ + return; \ + } \ + } \ + if constexpr (dispatch_requested("avx2", ISA)) { \ + if (preferred_cpu.find("avx2") != std::string_view::npos) { \ + CAT(CAT(CAT(CAT(internal_, func), _), TYPE1), TYPE2) \ + = &xss::avx2::func; \ + return; \ + } \ + } \ + } + +#ifdef _MSC_VER #define DECLARE_ALL_KEYVALUE_METHODS(TYPE1, TYPE2) \ + static void CAT(CAT(resolve_keyvalue_select_, TYPE1), TYPE2)(void); \ + static void CAT(CAT(resolve_keyvalue_partial_sort_, TYPE1), TYPE2)(void); \ + static void CAT(CAT(resolve_keyvalue_qsort_, TYPE1), TYPE2)(void); \ static void(CAT(CAT(*internal_keyvalue_qsort_, TYPE1), TYPE2))( \ TYPE1 *, TYPE2 *, size_t, bool, bool) \ = NULL; \ @@ -215,61 +318,91 @@ DISPATCH_ALL(argselect, TYPE1 *, TYPE2 *, size_t, size_t, bool, bool) \ = NULL; \ template <> \ - void keyvalue_qsort(TYPE1 *key, \ - TYPE2 *val, \ - size_t arrsize, \ - bool hasnan, \ - bool descending) \ + void XSS_EXPORT_SYMBOL keyvalue_qsort(TYPE1 *key, \ + TYPE2 *val, \ + size_t arrsize, \ + bool hasnan, \ + bool descending) \ { \ + if ((CAT(CAT(*internal_keyvalue_qsort_, TYPE1), TYPE2)) == NULL) { \ + CAT(CAT(resolve_keyvalue_qsort_, TYPE1), TYPE2)(); \ + } \ (CAT(CAT(*internal_keyvalue_qsort_, TYPE1), TYPE2))( \ key, val, arrsize, hasnan, descending); \ } \ template <> \ - void keyvalue_select(TYPE1 *key, \ - TYPE2 *val, \ - size_t k, \ - size_t arrsize, \ - bool hasnan, \ - bool descending) \ + void XSS_EXPORT_SYMBOL keyvalue_select(TYPE1 *key, \ + TYPE2 *val, \ + size_t k, \ + size_t arrsize, \ + bool hasnan, \ + bool descending) \ { \ + if ((CAT(CAT(*internal_keyvalue_select_, TYPE1), TYPE2)) == NULL) { \ + CAT(CAT(resolve_keyvalue_select_, TYPE1), TYPE2)(); \ + } \ (CAT(CAT(*internal_keyvalue_select_, TYPE1), TYPE2))( \ key, val, k, arrsize, hasnan, descending); \ } \ template <> \ - void keyvalue_partial_sort(TYPE1 *key, \ - TYPE2 *val, \ - size_t k, \ - size_t arrsize, \ - bool hasnan, \ - bool descending) \ + void XSS_EXPORT_SYMBOL keyvalue_partial_sort(TYPE1 *key, \ + TYPE2 *val, \ + size_t k, \ + size_t arrsize, \ + bool hasnan, \ + bool descending) \ { \ + if ((CAT(CAT(*internal_keyvalue_partial_sort_, TYPE1), TYPE2)) \ + == NULL) { \ + CAT(CAT(resolve_keyvalue_partial_sort_, TYPE1), TYPE2)(); \ + } \ (CAT(CAT(*internal_keyvalue_partial_sort_, TYPE1), TYPE2))( \ key, val, k, arrsize, hasnan, descending); \ } - -#define DISPATCH_KV_FUNC(func, TYPE1, TYPE2, ISA) \ - static __attribute__((constructor)) void CAT( \ - CAT(CAT(CAT(resolve_, func), _), TYPE1), TYPE2)(void) \ +#else +#define DECLARE_ALL_KEYVALUE_METHODS(TYPE1, TYPE2) \ + static void(CAT(CAT(*internal_keyvalue_qsort_, TYPE1), TYPE2))( \ + TYPE1 *, TYPE2 *, size_t, bool, bool) \ + = NULL; \ + static void(CAT(CAT(*internal_keyvalue_select_, TYPE1), TYPE2))( \ + TYPE1 *, TYPE2 *, size_t, size_t, bool, bool) \ + = NULL; \ + static void(CAT(CAT(*internal_keyvalue_partial_sort_, TYPE1), TYPE2))( \ + TYPE1 *, TYPE2 *, size_t, size_t, bool, bool) \ + = NULL; \ + template <> \ + void XSS_EXPORT_SYMBOL keyvalue_qsort(TYPE1 *key, \ + TYPE2 *val, \ + size_t arrsize, \ + bool hasnan, \ + bool descending) \ { \ - CAT(CAT(CAT(CAT(internal_, func), _), TYPE1), TYPE2) \ - = &xss::scalar::func; \ - __builtin_cpu_init(); \ - std::string_view preferred_cpu = find_preferred_cpu(ISA); \ - if constexpr (dispatch_requested("avx512", ISA)) { \ - if (preferred_cpu.find("avx512") != std::string_view::npos) { \ - CAT(CAT(CAT(CAT(internal_, func), _), TYPE1), TYPE2) \ - = &xss::avx512::func; \ - return; \ - } \ - } \ - if constexpr (dispatch_requested("avx2", ISA)) { \ - if (preferred_cpu.find("avx2") != std::string_view::npos) { \ - CAT(CAT(CAT(CAT(internal_, func), _), TYPE1), TYPE2) \ - = &xss::avx2::func; \ - return; \ - } \ - } \ + (CAT(CAT(*internal_keyvalue_qsort_, TYPE1), TYPE2))( \ + key, val, arrsize, hasnan, descending); \ + } \ + template <> \ + void XSS_EXPORT_SYMBOL keyvalue_select(TYPE1 *key, \ + TYPE2 *val, \ + size_t k, \ + size_t arrsize, \ + bool hasnan, \ + bool descending) \ + { \ + (CAT(CAT(*internal_keyvalue_select_, TYPE1), TYPE2))( \ + key, val, k, arrsize, hasnan, descending); \ + } \ + template <> \ + void XSS_EXPORT_SYMBOL keyvalue_partial_sort(TYPE1 *key, \ + TYPE2 *val, \ + size_t k, \ + size_t arrsize, \ + bool hasnan, \ + bool descending) \ + { \ + (CAT(CAT(*internal_keyvalue_partial_sort_, TYPE1), TYPE2))( \ + key, val, k, arrsize, hasnan, descending); \ } +#endif // _MSC_VER #define DISPATCH_KEYVALUE_SORT(TYPE1, TYPE2, ISA) \ DECLARE_ALL_KEYVALUE_METHODS(TYPE1, TYPE2) \ diff --git a/lib/x86simdsort.h b/lib/x86simdsort.h index 34ed101..e30120e 100644 --- a/lib/x86simdsort.h +++ b/lib/x86simdsort.h @@ -6,8 +6,13 @@ #include #include +#if defined(_MSC_VER) +#define XSS_EXPORT_SYMBOL __declspec(dllexport) +#define XSS_HIDE_SYMBOL +#else #define XSS_EXPORT_SYMBOL __attribute__((visibility("default"))) #define XSS_HIDE_SYMBOL __attribute__((visibility("hidden"))) +#endif #define UNUSED(x) (void)(x) namespace x86simdsort { diff --git a/lib/x86simdsortcpuid.h b/lib/x86simdsortcpuid.h new file mode 100644 index 0000000..c2911db --- /dev/null +++ b/lib/x86simdsortcpuid.h @@ -0,0 +1,81 @@ +#ifndef X86SIMDSORT_CPUID_H +#define X86SIMDSORT_CPUID_H + +#ifdef _MSC_VER +#include +#include +#include + +static std::unordered_map xss_cpu_features; + +static bool os_supports_avx() +{ + int cpuInfo[4]; + __cpuid(cpuInfo, 1); + + bool osxsaveSupported = (cpuInfo[2] & (1 << 27)) != 0; // OSXSAVE bit + bool avxSupported = (cpuInfo[2] & (1 << 28)) != 0; // AVX bit + if (!(avxSupported && osxsaveSupported)) return false; + + // Check XCR0[2:1] (XMM and YMM state) + unsigned long long xcr0 = _xgetbv(0); + return (xcr0 & 0x6) == 0x6; +} + +static bool os_supports_avx512() +{ + if (!os_supports_avx()) return false; + + // Need XCR0[7:5] = opmask/ZMM/YMM state enabled + unsigned long long xcr0 = _xgetbv(0); + return (xcr0 & 0xE0) == 0xE0; +} + +static void xss_cpu_init() +{ + int cpuInfo[4]; + __cpuid(cpuInfo, 0); + int maxLeaf = cpuInfo[0]; + + bool hasAVX2 = false; + bool hasAVX512F = false, hasAVX512DQ = false, hasAVX512BW = false, + hasAVX512VL = false; + bool hasAVX512VBMI2 = false, hasAVX512FP16 = false; + + if (maxLeaf >= 7) { + __cpuidex(cpuInfo, 7, 0); + + // EBX bits + hasAVX2 = os_supports_avx() && (cpuInfo[1] & (1 << 5)); + hasAVX512F = os_supports_avx512() && (cpuInfo[1] & (1 << 16)); + hasAVX512DQ = os_supports_avx512() && (cpuInfo[1] & (1 << 17)); + hasAVX512BW = os_supports_avx512() && (cpuInfo[1] & (1 << 30)); + hasAVX512VL = os_supports_avx512() && (cpuInfo[1] & (1 << 31)); + + // ECX bits + hasAVX512VBMI2 = os_supports_avx512() && (cpuInfo[2] & (1 << 6)); + + // EDX bits + hasAVX512FP16 = os_supports_avx512() && (cpuInfo[3] & (1 << 23)); + } + + xss_cpu_features["avx2"] = hasAVX2; + xss_cpu_features["avx512f"] = hasAVX512F; + xss_cpu_features["avx512dq"] = hasAVX512DQ; + xss_cpu_features["avx512bw"] = hasAVX512BW; + xss_cpu_features["avx512vl"] = hasAVX512VL; + xss_cpu_features["avx512vbmi2"] = hasAVX512VBMI2; + xss_cpu_features["avx512fp16"] = hasAVX512FP16; +} + +inline bool xss_cpu_supports(const char *feature) +{ + auto it = xss_cpu_features.find(feature); + return it != xss_cpu_features.end() && it->second; +} + +#else +#define xss_cpu_init() __builtin_cpu_init() +#define xss_cpu_supports(feature) __builtin_cpu_supports(feature) +#endif // _MSC_VER +#endif // X86SIMDSORT_CPUID_H diff --git a/meson.build b/meson.build index 0b826f0..093b602 100644 --- a/meson.build +++ b/meson.build @@ -1,3 +1,4 @@ + project('x86-simd-sort', 'cpp', version : '7.0.x', license : 'BSD 3-clause', @@ -10,6 +11,13 @@ bench = include_directories('benchmarks') utils = include_directories('utils') tests = include_directories('tests') +# check if compiler supports -march=haswell, -march=skylake-avx512 and -march=icelake-client and error out if not +if cpp.get_id() != 'msvc' + if not cpp.has_argument('-march=haswell') or not cpp.has_argument('-march=skylake-avx512') or not cpp.has_argument('-march=icelake-client') + error('Compiler does not support -march=haswell, -march=skylake-avx512 or -march=icelake-client. Please use a newer compiler version.') + endif +endif + # Add IPP sort to benchmarks: benchipp = false ipplink = [] @@ -37,6 +45,7 @@ if get_option('use_openmp') omp_dep = declare_dependency(dependencies: omp, compile_args: ['-DXSS_USE_OPENMP']) endif + fp16code = '''#include int main() { __m512h temp = _mm512_set1_ph(1.0f); @@ -84,14 +93,14 @@ x86simdsortcpp_dep = declare_dependency( # Build test suite if option build_tests set to true if get_option('build_tests') - gtest_dep = dependency('gtest_main', required : true, static: false) + gtest_dep = dependency('gtest', main : true, required : true, static: false) subdir('tests') testexe = executable('testexe', include_directories : [lib, utils], dependencies : [gtest_dep, x86simdsortcpp_dep], link_whole : [libtests], ) - test('x86 simd sort tests', testexe) + test('x86 simd sort tests', testexe, timeout : 0) endif # Build benchmarking suite if option build_benchmarks is set to true