diff --git a/cli/xsum_arch.h b/cli/xsum_arch.h
index 83a0bfb..a8e2bec 100644
--- a/cli/xsum_arch.h
+++ b/cli/xsum_arch.h
@@ -162,6 +162,8 @@
 # else
 # define XSUM_ARCH "wasm/asmjs"
 # endif
+#elif defined(__riscv)
+# define XSUM_ARCH "riscv"
 #elif defined(__loongarch_lp64)
 # if defined(__loongarch_asx)
 # define XSUM_ARCH "loongarch64 + lasx"
diff --git a/tests/bench/Makefile b/tests/bench/Makefile
index a7edd48..ec5d56f 100644
--- a/tests/bench/Makefile
+++ b/tests/bench/Makefile
@@ -52,11 +52,14 @@
 benchHash_avx2: CXXFLAGS += -mavx2
 benchHash_avx512: CFLAGS += -mavx512f
 benchHash_avx512: CXXFLAGS += -mavx512f
 
+benchHash_rvv: CFLAGS += -march=rv64gcv -O2
+benchHash_rvv: CXXFLAGS += -march=rv64gcv -O2
+
 benchHash_hw: CPPFLAGS += -DHARDWARE_SUPPORT
 benchHash_hw: CFLAGS += -mavx2 -maes
 benchHash_hw: CXXFLAGS += -mavx2 -mpclmul -std=c++14
 
-benchHash benchHash32 benchHash_avx2 benchHash_avx512 benchHash_nosimd benchHash_hw: $(OBJ_LIST)
+benchHash benchHash32 benchHash_avx2 benchHash_avx512 benchHash_nosimd benchHash_hw benchHash_rvv: $(OBJ_LIST)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
 
@@ -68,4 +71,4 @@
 benchHash.o: benchHash.h
 
 clean:
-	$(RM) *.o benchHash benchHash32 benchHash_avx2 benchHash_avx512 benchHash_hw
+	$(RM) *.o benchHash benchHash32 benchHash_avx2 benchHash_avx512 benchHash_hw benchHash_rvv
diff --git a/xxhash.h b/xxhash.h
index f886a78..a1710d5 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -1126,7 +1126,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const
 # define XXH_SVE 6 /*!< SVE for some ARMv8-A and ARMv9-A */
 # define XXH_LSX 7 /*!< LSX (128-bit SIMD) for LoongArch64 */
 # define XXH_LASX 8 /*!< LASX (256-bit SIMD) for LoongArch64 */
-
+# define XXH_RVV 9 /*!< RVV (RISC-V Vector) for RISC-V */
 
 /*-**********************************************************************
 * XXH3 64-bit variant
@@ -3882,6 +3882,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can
 #  include <lasxintrin.h>
 # elif defined(__loongarch_sx)
 #  include <lsxintrin.h>
+# elif defined(__riscv_vector)
+#  include <riscv_vector.h>
 # endif
 #endif
 
@@ -4020,6 +4022,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can
 # define XXH_VECTOR XXH_LASX
 # elif defined(__loongarch_sx)
 # define XXH_VECTOR XXH_LSX
+# elif defined(__riscv_vector)
+# define XXH_VECTOR XXH_RVV
 # else
 # define XXH_VECTOR XXH_SCALAR
 # endif
@@ -4061,6 +4065,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can
 # define XXH_ACC_ALIGN 64
 # elif XXH_VECTOR == XXH_LSX /* lsx */
 # define XXH_ACC_ALIGN 64
+# elif XXH_VECTOR == XXH_RVV /* rvv */
+# define XXH_ACC_ALIGN 64
 # endif
 #endif
 
@@ -4069,6 +4075,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can
 # define XXH_SEC_ALIGN XXH_ACC_ALIGN
 #elif XXH_VECTOR == XXH_SVE
 # define XXH_SEC_ALIGN XXH_ACC_ALIGN
+#elif XXH_VECTOR == XXH_RVV
+# define XXH_SEC_ALIGN XXH_ACC_ALIGN
 #else
 # define XXH_SEC_ALIGN 8
 #endif
@@ -5833,6 +5841,136 @@ XXH3_scrambleAcc_lasx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
 
 #endif
 
+#if (XXH_VECTOR == XXH_RVV)
+#if ((defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 13) || \
+     (defined(__clang__) && __clang_major__ < 16))
+  /* Older compilers expose the RVV intrinsics without the __riscv_ prefix. */
+  #define RVV_OP(op) op
+#else
+  #define concat2(X, Y) X ## Y
+  #define concat(X, Y) concat2(X, Y)
+  #define RVV_OP(op) concat(__riscv_, op)
+#endif
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_rvv( void* XXH_RESTRICT acc,
+                   const void* XXH_RESTRICT input,
+                   const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 63) == 0);
+    {
+        // Try to set the vector length to 512 bits.
+        // If that length is unavailable, the maximum available length is used.
+        size_t vl = RVV_OP(vsetvl_e64m2)(8);
+
+        uint64_t* const xacc = (uint64_t*) acc;
+        const uint64_t* const xinput = (const uint64_t*) input;
+        const uint64_t* const xsecret = (const uint64_t*) secret;
+        uint64_t swap_mask[16] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+        vuint64m2_t xswap_mask = RVV_OP(vle64_v_u64m2)(swap_mask, vl);
+
+        // vuint64m2_t is sizeless,
+        // but we can assume that vl is either 4 (vlen=128) or 8 (vlen=256 or 512).
+        for (size_t i = 0; i < XXH_STRIPE_LEN/(8 * vl); i++) {
+            /* data_vec = input[i]; */
+            vuint64m2_t data_vec = RVV_OP(vreinterpret_v_u8m2_u64m2)(RVV_OP(vle8_v_u8m2)((const uint8_t*)(xinput + vl * i), vl * 8));
+            /* key_vec = secret[i]; */
+            vuint64m2_t key_vec = RVV_OP(vreinterpret_v_u8m2_u64m2)(RVV_OP(vle8_v_u8m2)((const uint8_t*)(xsecret + vl * i), vl * 8));
+            /* data_key = data_vec ^ key_vec; */
+            vuint64m2_t data_key = RVV_OP(vxor_vv_u64m2)(data_vec, key_vec, vl);
+            /* data_key_lo = data_key >> 32; */
+            vuint64m2_t data_key_lo = RVV_OP(vsrl_vx_u64m2)(data_key, 32, vl);
+            /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+            vuint64m2_t product = RVV_OP(vmul_vv_u64m2)(RVV_OP(vand_vx_u64m2)(data_key, 0xffffffff, vl), RVV_OP(vand_vx_u64m2)(data_key_lo, 0xffffffff, vl), vl);
+            /* acc_vec = xacc[i]; */
+            vuint64m2_t acc_vec = RVV_OP(vle64_v_u64m2)(xacc + vl * i, vl);
+            acc_vec = RVV_OP(vadd_vv_u64m2)(acc_vec, product, vl);
+            {
+                /* swap high and low halves */
+                vuint64m2_t data_swap = RVV_OP(vrgather_vv_u64m2)(data_vec, xswap_mask, vl);
+                acc_vec = RVV_OP(vadd_vv_u64m2)(acc_vec, data_swap, vl);
+            }
+            RVV_OP(vse64_v_u64m2)(xacc + vl * i, acc_vec, vl);
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(rvv)
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_rvv(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 63) == 0);
+    {
+        // Try to set the vector length to 512 bits.
+        // If that length is unavailable, the maximum available length is used.
+        size_t vl = RVV_OP(vsetvl_e64m2)(8);
+        uint64_t* const xacc = (uint64_t*) acc;
+        const uint64_t* const xsecret = (const uint64_t*) secret;
+
+        uint64_t prime[16] = {XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1,
+                              XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1,
+                              XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1,
+                              XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1};
+        vuint64m2_t vprime = RVV_OP(vle64_v_u64m2)(prime, vl);
+
+        // vuint64m2_t is sizeless,
+        // but we can assume that vl is either 4 (vlen=128) or 8 (vlen=256 or 512).
+        for (size_t i = 0; i < XXH_STRIPE_LEN/(8 * vl); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            vuint64m2_t acc_vec = RVV_OP(vle64_v_u64m2)(xacc + vl * i, vl);
+            vuint64m2_t shifted = RVV_OP(vsrl_vx_u64m2)(acc_vec, 47, vl);
+            vuint64m2_t data_vec = RVV_OP(vxor_vv_u64m2)(acc_vec, shifted, vl);
+            /* xacc[i] ^= xsecret[i]; */
+            vuint64m2_t key_vec = RVV_OP(vreinterpret_v_u8m2_u64m2)(RVV_OP(vle8_v_u8m2)((const uint8_t*)(xsecret + vl * i), vl * 8));
+            vuint64m2_t data_key = RVV_OP(vxor_vv_u64m2)(data_vec, key_vec, vl);
+
+            /* xacc[i] *= XXH_PRIME32_1; */
+            vuint64m2_t prod_even = RVV_OP(vmul_vv_u64m2)(RVV_OP(vand_vx_u64m2)(data_key, 0xffffffff, vl), vprime, vl);
+            vuint64m2_t prod_odd = RVV_OP(vmul_vv_u64m2)(RVV_OP(vsrl_vx_u64m2)(data_key, 32, vl), vprime, vl);
+            vuint64m2_t prod = RVV_OP(vadd_vv_u64m2)(prod_even, RVV_OP(vsll_vx_u64m2)(prod_odd, 32, vl), vl);
+            RVV_OP(vse64_v_u64m2)(xacc + vl * i, prod, vl);
+        }
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_initCustomSecret_rvv(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
+    XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
+    XXH_ASSERT(((size_t)customSecret & 63) == 0);
+    {
+        uint64_t* const xcustomSecret = (uint64_t*)customSecret;
+
+        // Keep a reference to XXH_writeLE64 so it is not reported as unused.
+        (void)(&XXH_writeLE64);
+        {
+            // Number of 64-bit elements in the default secret `XXH3_kSecret`.
+            size_t XXH3_kSecret_64b_len = XXH_SECRET_DEFAULT_SIZE / 8;
+            // Array of repeated seed values, alternating between seed64 and -seed64.
+            uint64_t seed_pos[16] = {seed64, (uint64_t)(-(int64_t)seed64),
+                                     seed64, (uint64_t)(-(int64_t)seed64),
+                                     seed64, (uint64_t)(-(int64_t)seed64),
+                                     seed64, (uint64_t)(-(int64_t)seed64),
+                                     seed64, (uint64_t)(-(int64_t)seed64),
+                                     seed64, (uint64_t)(-(int64_t)seed64),
+                                     seed64, (uint64_t)(-(int64_t)seed64),
+                                     seed64, (uint64_t)(-(int64_t)seed64)};
+            // Cast the default secret to a signed 64-bit pointer for vectorized access.
+            const int64_t* const xXXH3_kSecret = (const int64_t*)XXH3_kSecret;
+            size_t vl = 0;
+            for (size_t i = 0; i < XXH3_kSecret_64b_len; i += vl) {
+                // Request at most 16 lanes per iteration so the seed load
+                // never reads past seed_pos[16], even when VLEN > 512.
+                size_t avl = XXH3_kSecret_64b_len - i;
+                vl = RVV_OP(vsetvl_e64m2)(avl < 16 ? avl : 16);
+                {
+                    vint64m2_t seed = RVV_OP(vle64_v_i64m2)((int64_t*)seed_pos, vl);
+                    vint64m2_t src = RVV_OP(vle64_v_i64m2)((const int64_t*)&xXXH3_kSecret[i], vl);
+                    vint64m2_t res = RVV_OP(vadd_vv_i64m2)(src, seed, vl);
+                    RVV_OP(vse64_v_i64m2)((int64_t*)&xcustomSecret[i], res, vl);
+                }
+            }
+        }
+    }
+}
+#endif
+
+
 /* scalar variants - universal */
 #if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
@@ -6075,6 +6213,12 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
 #define XXH3_scrambleAcc XXH3_scrambleAcc_lsx
 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
 
+#elif (XXH_VECTOR == XXH_RVV)
+#define XXH3_accumulate_512 XXH3_accumulate_512_rvv
+#define XXH3_accumulate XXH3_accumulate_rvv
+#define XXH3_scrambleAcc XXH3_scrambleAcc_rvv
+#define XXH3_initCustomSecret XXH3_initCustomSecret_rvv
+
 #else /* scalar */
 
 #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
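Note (not part of the patch): in XXH3_scrambleAcc_rvv, the multiply by the 32-bit constant XXH_PRIME32_1 is done per 64-bit lane by splitting the lane into its low and high 32-bit halves, multiplying each half by the prime, and recombining as prod_even + (prod_odd << 32). Because only the low 64 bits of the product are kept, this equals a direct 64-bit multiply modulo 2^64. The standalone C sketch below merely checks that identity; the prime value is copied from xxhash.h, and every other name is local to the example.

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PRIME32_1 0x9E3779B1U   /* value of XXH_PRIME32_1 in xxhash.h */

    /* Scalar model of one lane of the RVV scramble multiply:
     * prod_even = lo32(x) * prime, prod_odd = hi32(x) * prime,
     * result = prod_even + (prod_odd << 32), all modulo 2^64. */
    static uint64_t split_mul(uint64_t x)
    {
        uint64_t prod_even = (x & 0xffffffffULL) * PRIME32_1;
        uint64_t prod_odd  = (x >> 32)           * PRIME32_1;
        return prod_even + (prod_odd << 32);
    }

    int main(void)
    {
        const uint64_t samples[] = { 0, 1, 0x0123456789abcdefULL, 0xffffffffffffffffULL };
        size_t i;
        for (i = 0; i < sizeof(samples)/sizeof(samples[0]); i++) {
            /* The split form must match a direct 64-bit multiply (mod 2^64). */
            assert(split_mul(samples[i]) == samples[i] * (uint64_t)PRIME32_1);
        }
        printf("split multiply == direct multiply for all samples\n");
        return 0;
    }

The same reasoning applies lane-wise to every element the vectorized loop processes, so the split form and a plain 64-bit multiply by the prime yield identical accumulator values.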