Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 13 additions & 10 deletions include/bmat8.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,14 @@ class BMat8 {
//! Returns a \c std::vector for rows of \c this
std::vector<uint8_t> rows() const;

//! Returns a \c std::vector for rows of \c this
//! This is the same as BMat8::rows, which is retained for backwards
//! compatibility.
std::vector<uint8_t> row_vector() const;

//! Returns a \c std::array for rows of \c this
std::array<uint8_t, 8> row_array() const;

//! Returns the cardinality of the row space of \c this
//!
//! Reference implementation computing all products
Expand Down Expand Up @@ -293,15 +301,10 @@ class BMat8 {
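//! This method returns the 8 x 8 BMat8 whose first \c dim entries on the
//! main diagonal are 1 and all other entries are 0 (the identity matrix
//! for the default \c dim of 8).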
static BMat8 one(size_t dim = 8) {
HPCOMBI_ASSERT(dim <= 8);
static std::array<uint64_t, 9> const ones = {0x0000000000000000,
0x8000000000000000,
0x8040000000000000,
0x8040200000000000,
0x8040201000000000,
0x8040201008000000,
0x8040201008040000,
0x8040201008040200,
0x8040201008040201};
static std::array<uint64_t, 9> const ones = {
0x0000000000000000, 0x8000000000000000, 0x8040000000000000,
0x8040200000000000, 0x8040201000000000, 0x8040201008000000,
0x8040201008040000, 0x8040201008040200, 0x8040201008040201};
return BMat8(ones[dim]);
}

Expand All @@ -320,7 +323,7 @@ class BMat8 {
void swap(BMat8 &that) {
    // Exchange the _data members of the two matrices.
    std::swap(_data, that._data);
}

//! Write \c this on \c os
std::ostream & write(std::ostream &os) const;
std::ostream &write(std::ostream &os) const;

#ifdef LIBSEMIGROUPS_DENSEHASHMAP
// FIXME do this another way
Expand Down
49 changes: 30 additions & 19 deletions include/bmat8_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,6 @@ inline BMat8 BMat8::transpose() const {
return BMat8(x);
}


inline BMat8 BMat8::transpose_mask() const {
epu8 x = _mm_set_epi64x(_data, _data << 1);
uint64_t res = _mm_movemask_epi8(x);
Expand All @@ -177,35 +176,34 @@ inline BMat8 BMat8::transpose_maskd() const {
return BMat8(res);
}


using epu64 = uint64_t __attribute__ ((__vector_size__ (16), __may_alias__));
using epu64 = uint64_t __attribute__((__vector_size__(16), __may_alias__));

inline void BMat8::transpose2(BMat8 &a, BMat8 &b) {
epu64 x = _mm_set_epi64x(a._data, b._data);
epu64 y = (x ^ (x >> 7)) & (epu64 {0xAA00AA00AA00AA, 0xAA00AA00AA00AA});
epu64 y = (x ^ (x >> 7)) & (epu64{0xAA00AA00AA00AA, 0xAA00AA00AA00AA});
x = x ^ y ^ (y << 7);
y = (x ^ (x >> 14)) & (epu64 {0xCCCC0000CCCC, 0xCCCC0000CCCC});
y = (x ^ (x >> 14)) & (epu64{0xCCCC0000CCCC, 0xCCCC0000CCCC});
x = x ^ y ^ (y << 14);
y = (x ^ (x >> 28)) & (epu64 {0xF0F0F0F0, 0xF0F0F0F0});
y = (x ^ (x >> 28)) & (epu64{0xF0F0F0F0, 0xF0F0F0F0});
x = x ^ y ^ (y << 28);
a._data = _mm_extract_epi64(x, 1);
b._data = _mm_extract_epi64(x, 0);
}

static constexpr epu8 rotlow { 7, 0, 1, 2, 3, 4, 5, 6};
static constexpr epu8 rothigh
{ 0, 1, 2, 3, 4, 5, 6, 7,15, 8, 9,10,11,12,13,14};
static constexpr epu8 rotboth
{ 7, 0, 1, 2, 3, 4, 5, 6,15, 8, 9,10,11,12,13,14};
static constexpr epu8 rot2
{ 6, 7, 0, 1, 2, 3, 4, 5,14,15, 8, 9,10,11,12,13};
static constexpr epu8 rotlow{7, 0, 1, 2, 3, 4, 5, 6};
static constexpr epu8 rothigh{0, 1, 2, 3, 4, 5, 6, 7,
15, 8, 9, 10, 11, 12, 13, 14};
static constexpr epu8 rotboth{7, 0, 1, 2, 3, 4, 5, 6,
15, 8, 9, 10, 11, 12, 13, 14};
static constexpr epu8 rot2{6, 7, 0, 1, 2, 3, 4, 5,
14, 15, 8, 9, 10, 11, 12, 13};

inline BMat8 BMat8::mult_transpose(BMat8 const &that) const {
epu8 x = _mm_set_epi64x(_data, _data);
epu8 y = _mm_shuffle_epi8(_mm_set_epi64x(that._data, that._data), rothigh);
epu8 data {};
epu8 diag {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40};
epu8 data{};
epu8 diag{0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40};
for (int i = 0; i < 4; ++i) {
data |= ((x & y) != epu8{}) & diag;
y = _mm_shuffle_epi8(y, rot2);
Expand All @@ -218,7 +216,7 @@ inline epu8 BMat8::row_space_basis_internal() const {
epu8 res = remove_dups(revsorted8(_mm_set_epi64x(0, _data)));
epu8 rescy = res;
// We now compute the union of all the included different rows
epu8 orincl {};
epu8 orincl{};
for (int i = 0; i < 7; i++) {
rescy = permuted(rescy, rotlow);
orincl |= ((rescy | res) == res) & rescy;
Expand All @@ -236,8 +234,8 @@ inline BMat8 BMat8::row_space_basis() const {
#endif /* FF */
#define FF 0xff

constexpr std::array<epu8, 4> masks {{
// clang-format off
constexpr std::array<epu8, 4> masks{
{// clang-format off
{FF, 0,FF, 0,FF, 0,FF, 0,FF, 0,FF, 0,FF, 0,FF, 0},
{FF,FF, 1, 1,FF,FF, 1, 1,FF,FF, 1, 1,FF,FF, 1, 1},
{FF,FF,FF,FF, 2, 2, 2, 2,FF,FF,FF,FF, 2, 2, 2, 2},
Expand Down Expand Up @@ -389,6 +387,10 @@ inline uint64_t BMat8::row_space_size_ref() const {
}

inline std::vector<uint8_t> BMat8::rows() const {
return row_vector();
}

inline std::vector<uint8_t> BMat8::row_vector() const {
std::vector<uint8_t> rows;
for (size_t i = 0; i < 8; ++i) {
uint8_t row = static_cast<uint8_t>(_data << (8 * i) >> 56);
Expand All @@ -397,6 +399,15 @@ inline std::vector<uint8_t> BMat8::rows() const {
return rows;
}

inline std::array<uint8_t, 8> BMat8::row_array() const {
std::array<uint8_t, 8> rows;
rows.fill(0);
for (size_t i = 0; i < 8; ++i) {
rows[i] = static_cast<uint8_t>(_data << (8 * i) >> 56);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There might be an even (slightly) faster way to do this, using `_mm_set_epi64x(_data, 0)` and the `_mm_extract_epi8` intrinsic. A plain cast could also do it, but we would have to be careful to avoid undefined behavior.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would be most excellent.

}
return rows;
}

inline size_t BMat8::nr_rows() const {
epu8 x = _mm_set_epi64x(_data, 0);
return _mm_popcnt_u64(_mm_movemask_epi8(x != epu8 {}));
Expand Down
19 changes: 5 additions & 14 deletions include/epu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@
#define HPCOMBI_CONSTEXPR_CONSTRUCTOR
#endif


namespace HPCombi {

/// Unsigned 8 bits int constant.
Expand All @@ -64,7 +63,6 @@ using xpu8 = uint8_t __attribute__((vector_size(32)));
static_assert(alignof(xpu8) == 32,
"xpu8 type is not properly aligned by the compiler !");


namespace { // Implementation detail code

/// A handmade C++11 constexpr lambda
Expand Down Expand Up @@ -136,15 +134,12 @@ uint8_t right_dup_fun(uint8_t i) { return i == 0 ? 0 : i - 1; }
/// Maps \c i to 15 - \c i, converted back to uint8_t.
HPCOMBI_CONSTEXPR uint8_t complement_fun(uint8_t i) {
    return static_cast<uint8_t>(15 - i);
}
HPCOMBI_CONSTEXPR uint8_t popcount4_fun(uint8_t i) {
return ((i & 1) != 0 ? 1 : 0)
+ ((i & 2) != 0 ? 1 : 0)
+ ((i & 4) != 0 ? 1 : 0)
+ ((i & 8) != 0 ? 1 : 0);
return ((i & 1) != 0 ? 1 : 0) + ((i & 2) != 0 ? 1 : 0) +
((i & 4) != 0 ? 1 : 0) + ((i & 8) != 0 ? 1 : 0);
}

} // Anonymous namespace


/// Factory object for various SIMD constants in particular constexpr
TPUBuild<epu8> Epu8;

Expand Down Expand Up @@ -258,13 +253,12 @@ inline epu8 revsorted8(epu8 a);
* @details
* @par Algorithm: Uses a 9 stages sorting network #sorting_rounds8
*/
inline epu8 sort_perm(epu8 & a);
inline epu8 sort_perm(epu8 &a);
/** Sort \c a and return the sorting permutation
* @details
* @par Algorithm: Uses a 9 stages sorting network #sorting_rounds8
*/
inline epu8 sort8_perm(epu8 & a);

inline epu8 sort8_perm(epu8 &a);

/** Find if a vector is a permutation of one other
* @details
Expand Down Expand Up @@ -358,7 +352,6 @@ inline epu8 partial_sums_round(epu8);
/** @copydoc common_partial_sums */
inline epu8 partial_sums(epu8 v) {
    // Forwards to partial_sums_round.
    return partial_sums_round(v);
}


/** @class common_horiz_max
* @brief Horizontal max of a #HPCombi::epu8
* @details
Expand Down Expand Up @@ -422,7 +415,6 @@ inline epu8 partial_max_round(epu8);
/** @copydoc common_partial_max */
inline epu8 partial_max(epu8 v) {
    // Forwards to partial_max_round.
    return partial_max_round(v);
}


/** @class common_horiz_min
* @brief Horizontal min of a #HPCombi::epu8
* @details
Expand Down Expand Up @@ -486,7 +478,6 @@ inline epu8 partial_min_round(epu8);
/** @copydoc common_partial_min */
inline epu8 partial_min(epu8 v) {
    // Forwards to partial_min_round.
    return partial_min_round(v);
}


/** @class common_eval16
* @brief Evaluation of a #HPCombi::epu8
* @details
Expand Down Expand Up @@ -710,7 +701,7 @@ inline std::ostream &operator<<(std::ostream &stream, HPCombi::epu8 const &a);
* - std::hash<epu8>
* - std::less<epu8>
*/
}
} // namespace std

#include "epu_impl.hpp"

Expand Down
Loading