Merge pull request #1043 from zijianli1234/dev

Add RVV Optimizations for RISC-V Architecture Platforms
Yann Collet committed 2025-06-15 17:35:30 -07:00 (committed by GitHub)
3 changed files with 152 additions and 3 deletions


@@ -162,6 +162,8 @@
# else
# define XSUM_ARCH "wasm/asmjs"
# endif
#elif defined(__riscv)
# define XSUM_ARCH "riscv"
#elif defined(__loongarch_lp64)
# if defined(__loongarch_asx)
# define XSUM_ARCH "loongarch64 + lasx"


@@ -52,11 +52,14 @@ benchHash_avx2: CXXFLAGS += -mavx2
benchHash_avx512: CFLAGS += -mavx512f
benchHash_avx512: CXXFLAGS += -mavx512f
benchHash_rvv: CFLAGS += -march=rv64gcv -O2
benchHash_rvv: CXXFLAGS += -march=rv64gcv -O2
benchHash_hw: CPPFLAGS += -DHARDWARE_SUPPORT
benchHash_hw: CFLAGS += -mavx2 -maes
benchHash_hw: CXXFLAGS += -mavx2 -mpclmul -std=c++14
benchHash benchHash32 benchHash_avx2 benchHash_avx512 benchHash_nosimd benchHash_hw: $(OBJ_LIST)
benchHash benchHash32 benchHash_avx2 benchHash_avx512 benchHash_nosimd benchHash_hw benchHash_rvv: $(OBJ_LIST)
$(CXX) $(CPPFLAGS) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
@@ -68,4 +71,4 @@ benchHash.o: benchHash.h
clean:
$(RM) *.o benchHash benchHash32 benchHash_avx2 benchHash_avx512 benchHash_hw
$(RM) *.o benchHash benchHash32 benchHash_avx2 benchHash_avx512 benchHash_hw benchHash_rvv
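
The new benchHash_rvv target compiles with -march=rv64gcv so the RVV code path is enabled. As a sketch, a cross-build and emulated run could look like the following; the toolchain triple and the QEMU vector settings are illustrative assumptions, not part of this change:

    make benchHash_rvv CC=riscv64-linux-gnu-gcc CXX=riscv64-linux-gnu-g++
    qemu-riscv64 -cpu rv64,v=true,vlen=256 ./benchHash_rvv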

xxhash.h

@@ -1126,7 +1126,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const
# define XXH_SVE 6 /*!< SVE for some ARMv8-A and ARMv9-A */
# define XXH_LSX 7 /*!< LSX (128-bit SIMD) for LoongArch64 */
# define XXH_LASX 8 /*!< LASX (256-bit SIMD) for LoongArch64 */
# define XXH_RVV 9 /*!< RVV (RISC-V Vector) for RISC-V */
/*-**********************************************************************
* XXH3 64-bit variant
@@ -3882,6 +3882,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can
# include <lsxintrin.h>
# elif defined(__loongarch_sx)
# include <lsxintrin.h>
# elif defined(__riscv_vector)
# include <riscv_vector.h>
# endif
#endif
@@ -4020,6 +4022,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can
# define XXH_VECTOR XXH_LASX
# elif defined(__loongarch_sx)
# define XXH_VECTOR XXH_LSX
# elif defined(__riscv_vector)
# define XXH_VECTOR XXH_RVV
# else
# define XXH_VECTOR XXH_SCALAR
# endif
@@ -4061,6 +4065,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can
# define XXH_ACC_ALIGN 64
# elif XXH_VECTOR == XXH_LSX /* lsx */
# define XXH_ACC_ALIGN 64
# elif XXH_VECTOR == XXH_RVV /* rvv */
# define XXH_ACC_ALIGN 64
# endif
#endif
@@ -4069,6 +4075,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can
# define XXH_SEC_ALIGN XXH_ACC_ALIGN
#elif XXH_VECTOR == XXH_SVE
# define XXH_SEC_ALIGN XXH_ACC_ALIGN
#elif XXH_VECTOR == XXH_RVV
# define XXH_SEC_ALIGN XXH_ACC_ALIGN
#else
# define XXH_SEC_ALIGN 8
#endif
@@ -5833,6 +5841,136 @@ XXH3_scrambleAcc_lasx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
#endif
#if (XXH_VECTOR == XXH_RVV)
#if ((defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 13) || \
(defined(__clang__) && __clang_major__ < 16))
#define RVV_OP(op) op
#else
#define concat2(X, Y) X ## Y
#define concat(X, Y) concat2(X, Y)
#define RVV_OP(op) concat(__riscv_, op)
#endif
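/* For illustration: RVV_OP(vsetvl_e64m2)(8) expands to vsetvl_e64m2(8) on
 * the older toolchains matched above, and to __riscv_vsetvl_e64m2(8) on
 * GCC 13+ / Clang 16+, which use the __riscv_ intrinsic prefix. */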
XXH_FORCE_INLINE void
XXH3_accumulate_512_rvv( void* XXH_RESTRICT acc,
const void* XXH_RESTRICT input,
const void* XXH_RESTRICT secret)
{
XXH_ASSERT((((size_t)acc) & 63) == 0);
{
// Try to set the vector length to 512 bits (8 x 64-bit elements).
// If this length is unavailable, the maximum available length is used.
size_t vl = RVV_OP(vsetvl_e64m2)(8);
uint64_t* const xacc = (uint64_t*) acc;
const uint64_t* const xinput = (const uint64_t*) input;
const uint64_t* const xsecret = (const uint64_t*) secret;
uint64_t swap_mask[16] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
vuint64m2_t xswap_mask = RVV_OP(vle64_v_u64m2)(swap_mask, vl);
// vuint64m2_t is sizeless, but we can assume that vl can only be
// 4 (VLEN=128) or 8 (VLEN=256 or 512).
for(size_t i = 0; i < XXH_STRIPE_LEN/(8 * vl); i++){
/* data_vec = input[i]; */
vuint64m2_t data_vec = RVV_OP(vreinterpret_v_u8m2_u64m2)(RVV_OP(vle8_v_u8m2)((const uint8_t*)(xinput + vl * i), vl * 8));
/* key_vec = secret[i]; */
vuint64m2_t key_vec = RVV_OP(vreinterpret_v_u8m2_u64m2)(RVV_OP(vle8_v_u8m2)((const uint8_t*)(xsecret + vl * i), vl * 8));
/* data_key = data_vec ^ key_vec; */
vuint64m2_t data_key = RVV_OP(vxor_vv_u64m2)(data_vec, key_vec, vl);
/* data_key_lo = data_key >> 32; */
vuint64m2_t data_key_lo = RVV_OP(vsrl_vx_u64m2)(data_key, 32, vl);
/* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
vuint64m2_t product = RVV_OP(vmul_vv_u64m2)(RVV_OP(vand_vx_u64m2)(data_key, 0xffffffff, vl), RVV_OP(vand_vx_u64m2)(data_key_lo, 0xffffffff, vl), vl);
/* acc_vec = xacc[i]; */
vuint64m2_t acc_vec = RVV_OP(vle64_v_u64m2)(xacc + vl * i, vl);
acc_vec = RVV_OP(vadd_vv_u64m2)(acc_vec, product, vl);
{
/* swap high and low halves */
vuint64m2_t data_swap = RVV_OP(vrgather_vv_u64m2)(data_vec, xswap_mask, vl);
acc_vec = RVV_OP(vadd_vv_u64m2)(acc_vec, data_swap, vl);
}
RVV_OP(vse64_v_u64m2)(xacc + vl * i, acc_vec, vl);
}
}
}
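/* Per 64-bit lane, the loop above performs the same round as the scalar
 * backend; a plain-C sketch of lane i (array notation for illustration):
 *
 *   uint64_t data_key = xinput[i] ^ xsecret[i];
 *   xacc[i] += (uint64_t)(uint32_t)data_key * (uint32_t)(data_key >> 32);
 *   xacc[i] += xinput[i ^ 1];  // vrgather with swap_mask pairs adjacent lanes
 */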
XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(rvv)
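/* XXH3_ACCUMULATE_TEMPLATE(rvv) instantiates XXH3_accumulate_rvv(), which
 * invokes XXH3_accumulate_512_rvv() once per 64-byte stripe, advancing the
 * secret pointer by 8 bytes per stripe, as in the other SIMD backends. */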
XXH_FORCE_INLINE void
XXH3_scrambleAcc_rvv(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
XXH_ASSERT((((size_t)acc) & 63) == 0);
{
// Try to set the vector length to 512 bits (8 x 64-bit elements).
// If this length is unavailable, the maximum available length is used.
size_t vl = RVV_OP(vsetvl_e64m2)(8);
uint64_t* const xacc = (uint64_t*) acc;
const uint64_t* const xsecret = (const uint64_t*) secret;
uint64_t prime[16] = {XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1,\
XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1};
vuint64m2_t vprime = RVV_OP(vle64_v_u64m2)(prime, vl);
// vuint64m2_t is sizeless, but we can assume that vl can only be
// 4 (VLEN=128) or 8 (VLEN=256 or 512).
for(size_t i = 0; i < XXH_STRIPE_LEN/(8 * vl); i++){
/* xacc[i] ^= (xacc[i] >> 47) */
vuint64m2_t acc_vec = RVV_OP(vle64_v_u64m2)(xacc + vl * i, vl);
vuint64m2_t shifted = RVV_OP(vsrl_vx_u64m2)(acc_vec, 47, vl);
vuint64m2_t data_vec = RVV_OP(vxor_vv_u64m2)(acc_vec, shifted, vl);
/* xacc[i] ^= xsecret[i]; */
vuint64m2_t key_vec = RVV_OP(vreinterpret_v_u8m2_u64m2)(RVV_OP(vle8_v_u8m2)((const uint8_t*)(xsecret + vl * i), vl * 8));
vuint64m2_t data_key = RVV_OP(vxor_vv_u64m2)(data_vec, key_vec, vl);
/* xacc[i] *= XXH_PRIME32_1; */
vuint64m2_t prod_even = RVV_OP(vmul_vv_u64m2)(RVV_OP(vand_vx_u64m2)(data_key, 0xffffffff, vl), vprime, vl);
vuint64m2_t prod_odd = RVV_OP(vmul_vv_u64m2)(RVV_OP(vsrl_vx_u64m2)(data_key, 32, vl), vprime, vl);
vuint64m2_t prod = RVV_OP(vadd_vv_u64m2)(prod_even, RVV_OP(vsll_vx_u64m2)(prod_odd, 32, vl), vl);
RVV_OP(vse64_v_u64m2)(xacc + vl * i, prod, vl);
}
}
}
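/* The even/odd split above implements a full 64x64->64 multiply by the
 * 32-bit constant XXH_PRIME32_1; per lane it is equivalent to this
 * plain-C sketch:
 *
 *   uint64_t x = (xacc[i] ^ (xacc[i] >> 47)) ^ xsecret[i];
 *   xacc[i] = x * XXH_PRIME32_1;  // == lo(x)*p + ((hi(x)*p) << 32) mod 2^64
 */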
XXH_FORCE_INLINE void
XXH3_initCustomSecret_rvv(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
{
XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
XXH_ASSERT(((size_t)customSecret & 63) == 0);
{
uint64_t* const xcustomSecret = (uint64_t*)customSecret;
(void)(&XXH_writeLE64); /* silence an 'unused function' warning */
{
// Calculate the number of 64-bit elements in the `XXH3_kSecret` secret
size_t XXH3_kSecret_64b_len = XXH_SECRET_DEFAULT_SIZE / 8;
// Create an array of repeated seed values, alternating between seed64 and -seed64.
uint64_t seed_pos[16] = {seed64, (uint64_t)(-(int64_t)seed64), \
seed64, (uint64_t)(-(int64_t)seed64), \
seed64, (uint64_t)(-(int64_t)seed64), \
seed64, (uint64_t)(-(int64_t)seed64), \
seed64, (uint64_t)(-(int64_t)seed64), \
seed64, (uint64_t)(-(int64_t)seed64), \
seed64, (uint64_t)(-(int64_t)seed64), \
seed64, (uint64_t)(-(int64_t)seed64)};
// Cast the default secret to a signed 64-bit pointer for vectorized access
const int64_t* const xXXH3_kSecret = (const int64_t*)XXH3_kSecret;
size_t vl = 0;
for (size_t i=0; i < XXH3_kSecret_64b_len; i += vl) {
vl = RVV_OP(vsetvl_e64m2)(XXH3_kSecret_64b_len - i);
{
vint64m2_t seed = RVV_OP(vle64_v_i64m2)((int64_t*)seed_pos, vl);
vint64m2_t src = RVV_OP(vle64_v_i64m2)((const int64_t*)&xXXH3_kSecret[i], vl);
vint64m2_t res = RVV_OP(vadd_vv_i64m2)(src, seed, vl);
RVV_OP(vse64_v_i64m2)((int64_t*)&xcustomSecret[i], res, vl);
}
}
}
}
}
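/* Element-wise, this strip-mined loop matches the scalar secret
 * customization: even 64-bit lanes receive kSecret[j] + seed64 and odd
 * lanes kSecret[j] - seed64, as in this plain-C sketch:
 *
 *   for (size_t j = 0; j < XXH_SECRET_DEFAULT_SIZE/8; j++)
 *       xcustomSecret[j] = (uint64_t)xXXH3_kSecret[j]
 *                        + ((j & 1) ? (uint64_t)(-(int64_t)seed64) : seed64);
 */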
#endif
/* scalar variants - universal */
#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
@@ -6075,6 +6213,12 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
#define XXH3_scrambleAcc XXH3_scrambleAcc_lsx
#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
#elif (XXH_VECTOR == XXH_RVV)
#define XXH3_accumulate_512 XXH3_accumulate_512_rvv
#define XXH3_accumulate XXH3_accumulate_rvv
#define XXH3_scrambleAcc XXH3_scrambleAcc_rvv
#define XXH3_initCustomSecret XXH3_initCustomSecret_rvv
#else /* scalar */
#define XXH3_accumulate_512 XXH3_accumulate_512_scalar