@@ -939,36 +939,21 @@ static inline int _avx512_run_container_cardinality(
939939 const int32_t n_runs = run -> n_runs ;
940940 const rle16_t * runs = run -> runs ;
941941
942- /* by initializing with n_runs, we omit counting the +1 for each pair. */
943- int sum = n_runs ;
944942 int32_t k = 0 ;
945- const int32_t step = sizeof (__m512i ) / sizeof (rle16_t );
946- if (n_runs > step ) {
947- __m512i total = _mm512_setzero_si512 ();
948- for (; k + step <= n_runs ; k += step ) {
949- __m512i ymm1 = _mm512_loadu_si512 ((const __m512i * )(runs + k ));
950- __m512i justlengths = _mm512_srli_epi32 (ymm1 , 16 );
951- total = _mm512_add_epi32 (total , justlengths );
952- }
953-
954- __m256i lo = _mm512_extracti32x8_epi32 (total , 0 );
955- __m256i hi = _mm512_extracti32x8_epi32 (total , 1 );
956-
957- // a store might be faster than extract?
958- uint32_t buffer [sizeof (__m256i ) / sizeof (rle16_t )];
959- _mm256_storeu_si256 ((__m256i * )buffer , lo );
960- sum += (buffer [0 ] + buffer [1 ]) + (buffer [2 ] + buffer [3 ]) +
961- (buffer [4 ] + buffer [5 ]) + (buffer [6 ] + buffer [7 ]);
962-
963- _mm256_storeu_si256 ((__m256i * )buffer , hi );
964- sum += (buffer [0 ] + buffer [1 ]) + (buffer [2 ] + buffer [3 ]) +
965- (buffer [4 ] + buffer [5 ]) + (buffer [6 ] + buffer [7 ]);
966- }
967- for (; k < n_runs ; ++ k ) {
968- sum += runs [k ].length ;
969- }
970-
971- return sum ;
943+ const int32_t step512 = sizeof (__m512i ) / sizeof (rle16_t );
944+ __m512i total = _mm512_setzero_si512 ();
945+ for (; k + step512 <= n_runs ; k += step512 ) {
946+ __m512i ymm1 = _mm512_loadu_si512 ((const __m512i * )(runs + k ));
947+ __m512i justlengths = _mm512_srli_epi32 (ymm1 , 16 );
948+ total = _mm512_add_epi32 (total , justlengths );
949+ }
950+ if (k < n_runs ) {
951+ __m512i ymm1 = _mm512_maskz_loadu_epi32 ((1 << (n_runs - k )) - 1 ,
952+ (const __m512i * )(runs + k ));
953+ __m512i justlengths = _mm512_srli_epi32 (ymm1 , 16 );
954+ total = _mm512_add_epi32 (total , justlengths );
955+ }
956+ return _mm512_reduce_add_epi32 (total ) + n_runs ;
972957}
973958
974959CROARING_UNTARGET_AVX512
@@ -1063,7 +1048,10 @@ static inline int _scalar_run_container_cardinality(
10631048}
10641049
10651050int run_container_cardinality (const run_container_t * run ) {
1066- #if CROARING_COMPILER_SUPPORTS_AVX512
1051+ // Empirically AVX-512 is not always faster than AVX2
1052+ #define CROARING_ENABLE_AVX512_RUN_CONTAINER_CARDINALITY 0
1053+ #if CROARING_COMPILER_SUPPORTS_AVX512 && \
1054+ CROARING_ENABLE_AVX512_RUN_CONTAINER_CARDINALITY
10671055 if (croaring_hardware_support () & ROARING_SUPPORTS_AVX512 ) {
10681056 return _avx512_run_container_cardinality (run );
10691057 } else
0 commit comments