Skip to content

Commit 8ef0eb3

Browse files
lemireDaniel Lemire
andauthored
run_container_cardinality + AVX-512 does not seem useful. (#746)
Co-authored-by: Daniel Lemire <[email protected]>
1 parent efcdd87 commit 8ef0eb3

File tree

1 file changed

+18
-30
lines changed

1 file changed

+18
-30
lines changed

src/containers/run.c

Lines changed: 18 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -939,36 +939,21 @@ static inline int _avx512_run_container_cardinality(
939939
const int32_t n_runs = run->n_runs;
940940
const rle16_t *runs = run->runs;
941941

942-
/* by initializing with n_runs, we omit counting the +1 for each pair. */
943-
int sum = n_runs;
944942
int32_t k = 0;
945-
const int32_t step = sizeof(__m512i) / sizeof(rle16_t);
946-
if (n_runs > step) {
947-
__m512i total = _mm512_setzero_si512();
948-
for (; k + step <= n_runs; k += step) {
949-
__m512i ymm1 = _mm512_loadu_si512((const __m512i *)(runs + k));
950-
__m512i justlengths = _mm512_srli_epi32(ymm1, 16);
951-
total = _mm512_add_epi32(total, justlengths);
952-
}
953-
954-
__m256i lo = _mm512_extracti32x8_epi32(total, 0);
955-
__m256i hi = _mm512_extracti32x8_epi32(total, 1);
956-
957-
// a store might be faster than extract?
958-
uint32_t buffer[sizeof(__m256i) / sizeof(rle16_t)];
959-
_mm256_storeu_si256((__m256i *)buffer, lo);
960-
sum += (buffer[0] + buffer[1]) + (buffer[2] + buffer[3]) +
961-
(buffer[4] + buffer[5]) + (buffer[6] + buffer[7]);
962-
963-
_mm256_storeu_si256((__m256i *)buffer, hi);
964-
sum += (buffer[0] + buffer[1]) + (buffer[2] + buffer[3]) +
965-
(buffer[4] + buffer[5]) + (buffer[6] + buffer[7]);
966-
}
967-
for (; k < n_runs; ++k) {
968-
sum += runs[k].length;
969-
}
970-
971-
return sum;
943+
const int32_t step512 = sizeof(__m512i) / sizeof(rle16_t);
944+
__m512i total = _mm512_setzero_si512();
945+
for (; k + step512 <= n_runs; k += step512) {
946+
__m512i ymm1 = _mm512_loadu_si512((const __m512i *)(runs + k));
947+
__m512i justlengths = _mm512_srli_epi32(ymm1, 16);
948+
total = _mm512_add_epi32(total, justlengths);
949+
}
950+
if (k < n_runs) {
951+
__m512i ymm1 = _mm512_maskz_loadu_epi32((1 << (n_runs - k)) - 1,
952+
(const __m512i *)(runs + k));
953+
__m512i justlengths = _mm512_srli_epi32(ymm1, 16);
954+
total = _mm512_add_epi32(total, justlengths);
955+
}
956+
return _mm512_reduce_add_epi32(total) + n_runs;
972957
}
973958

974959
CROARING_UNTARGET_AVX512
@@ -1063,7 +1048,10 @@ static inline int _scalar_run_container_cardinality(
10631048
}
10641049

10651050
int run_container_cardinality(const run_container_t *run) {
1066-
#if CROARING_COMPILER_SUPPORTS_AVX512
1051+
// Empirically AVX-512 is not always faster than AVX2
1052+
#define CROARING_ENABLE_AVX512_RUN_CONTAINER_CARDINALITY 0
1053+
#if CROARING_COMPILER_SUPPORTS_AVX512 && \
1054+
CROARING_ENABLE_AVX512_RUN_CONTAINER_CARDINALITY
10671055
if (croaring_hardware_support() & ROARING_SUPPORTS_AVX512) {
10681056
return _avx512_run_container_cardinality(run);
10691057
} else

0 commit comments

Comments
 (0)