From 930b33de62ca6d2bef922e6c0243b893277a0e1c Mon Sep 17 00:00:00 2001
From: Adrien Wu <adrien1018@users.noreply.github.com>
Date: Mon, 24 Oct 2022 12:17:11 +0800
Subject: [PATCH 1/5] Add description for XXH3 for large inputs

---
 doc/xxhash_spec.md | 220 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 218 insertions(+), 2 deletions(-)

diff --git a/doc/xxhash_spec.md b/doc/xxhash_spec.md
index 80576d6..d171d3a 100644
--- a/doc/xxhash_spec.md
+++ b/doc/xxhash_spec.md
@@ -24,6 +24,10 @@ Table of Contents
 - [Introduction](#introduction)
 - [XXH32 algorithm description](#xxh32-algorithm-description)
 - [XXH64 algorithm description](#xxh64-algorithm-description)
+- [XXH3 algorithm overview](#xxh3-algorithm-overview)
+   - [Small inputs](#xxh3-algorithm-description-for-small-inputs)
+   - [Medium inputs](#xxh3-algorithm-description-for-medium-inputs)
+   - [Large inputs](#xxh3-algorithm-description-for-large-inputs)
 - [Performance considerations](#performance-considerations)
 - [Reference Implementation](#reference-implementation)
 
@@ -232,7 +236,7 @@ Each lane read its associated 64-bit value using __little-endian__ convention.
 For each {lane, accumulator}, the update process is called a _round_, and applies the following formula:
 
 ```c
-  round(accN,laneN):
+round(accN,laneN):
   accN = accN + (laneN * PRIME64_2);
   accN = accN <<< 31;
   return accN * PRIME64_1;
@@ -250,7 +254,7 @@ All 4 lane accumulators from previous steps are merged to produce a single remai
 Note that accumulator convergence is more complex than 32-bit variant, and requires to define another function called _mergeAccumulator()_:
 
 ```c
-  mergeAccumulator(acc,accN):
+mergeAccumulator(acc,accN):
   acc  = acc xor round(0, accN);
   acc  = acc * PRIME64_1;
   return acc + PRIME64_4;
@@ -324,6 +328,218 @@ The `XXH64()` function produces an unsigned 64-bit value as output.
 
 For systems which require to store and/or display the result in binary or hexadecimal format, the canonical format is defined to reproduce the same value as the natural decimal format, hence follows __big-endian__ convention (most significant byte first).
 
+XXH3 Algorithm Overview
+-------------------------------------
+
+XXH3 comes in two different versions: XXH3-64 and XXH3-128 (or XXH128), producing 64 and 128 bytes of output, respectively.
+
+XXH3 uses different algorithms for small (0-16 bytes), medium (17-240 bytes), and large (241+ bytes) inputs. The algorithms for small and medium inputs are optimized for performance. The three algorithms are described in the following sections.
+
+Many operations require some 64-bit prime number constants, which are the same constants used in XXH32 and XXH64, all defined below:
+
+```c
+  static const u64 PRIME32_1 = 0x9E3779B1U;  // 0b10011110001101110111100110110001
+  static const u64 PRIME32_2 = 0x85EBCA77U;  // 0b10000101111010111100101001110111
+  static const u64 PRIME32_3 = 0xC2B2AE3DU;  // 0b11000010101100101010111000111101
+  static const u64 PRIME64_1 = 0x9E3779B185EBCA87ULL;  // 0b1001111000110111011110011011000110000101111010111100101010000111
+  static const u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL;  // 0b1100001010110010101011100011110100100111110101001110101101001111
+  static const u64 PRIME64_3 = 0x165667B19E3779F9ULL;  // 0b0001011001010110011001111011000110011110001101110111100111111001
+  static const u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL;  // 0b1000010111101011110010100111011111000010101100101010111001100011
+  static const u64 PRIME64_5 = 0x27D4EB2F165667C5ULL;  // 0b0010011111010100111010110010111100010110010101100110011111000101
+```
+
+The `XXH3_64bits()` function produces an unsigned 64-bit value.  
+The `XXH3_128bits()` function produces a `XXH128_hash_t` struct containing `low64` and `high64` - the lower and higher 64-bit half values of the result, respectively.
+
+For systems requiring storing and/or displaying the result in binary or hexadecimal format, the canonical format is defined to reproduce the same value as the natural decimal format, hence following **big-endian** convention (most significant byte first).
+
+### Seed and Secret
+
+XXH3 provides seeded hashing by introducing two configurable constants used in the hashing process: the seed and the secret. The seed is an unsigned 64-bit value, and the secret is an array of bytes that is at least 136 bytes in size. The default seed is 0, and the default secret is the following value:
+
+```c
+static const u8 defaultSecret[192] = {
+  0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
+  0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
+  0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+  0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
+  0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
+  0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+  0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+  0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+  0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+  0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+  0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+  0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+};
+```
+
+The seed and the secret can be specified using the `*_withSeed` and `*_withSecret` versions of the hash function, respectively.
+
+The seed and the secret cannot be specified simultaneously (`*_withSecretAndSeed` is just `*_withSeed` for inputs less than or equal to 240 bytes and `*_withSecret` otherwise). When one is specified, the other one uses the default value. There is one exception, though: if the input is larger than 240 bytes and the seed is given, the secret is derived from the seed value and the default secret using the following procedure:
+
+```c
+deriveSecret(u64 seed):
+  u64 derivedSecret[24] = defaultSecret[0:192];
+  for (i = 0; i < 12; i++) {
+    derivedSecret[i*2] += seed;
+    derivedSecret[i*2+1] -= seed;
+  }
+  return derivedSecret; // convert to u8[192] (little-endian)
+```
+
+The derivation treats the secrets as 24 64-bit values. In XXH3 algorithms, the secret is always read similarly by treating a contiguous segment of the array (whose size is a multiple of 8 bytes) as one or more 64-bit values. **The secret values are always read using little-endian convention**.
+
+
+XXH3 Algorithm Description (for small inputs)
+-------------------------------------
+
+*TODO*
+
+
+XXH3 Algorithm Description (for medium inputs)
+-------------------------------------
+
+*TODO*
+
+
+XXH3 Algorithm Description (for large inputs)
+-------------------------------------
+
+For inputs larger than 240 bytes, XXH3-64 and XXH3-128 use the same algorithm except for the finalizing step.
+
+The internal hash state is stored inside 8 "accumulators", each one storing an unsigned 64-bit value.
+
+### Step 1. Initialize internal accumulators
+
+The accumulators are initialized to fixed constants:
+
+```c
+u64 acc[8] = {
+  PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3,
+  PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1};
+```
+
+### Step 2. Process blocks
+
+The input is consumed and processed one full block at a time. The size of the block depends on the length of the secret. Specifically, a block consists of several 64-byte stripes. The number of stripes per block is `floor((secretLen-64)/8)`. For the default 192-byte secret, there are 16 stripes in a block, and thus the block size is 1024 bytes.
+
+```c
+secretLen = lengthInBytes(secret);    // default 192; at least 136
+stripesPerBlock = (secretLen-64) / 8; // default 16; at least 9
+blockSize = 64 * stripesPerBlock;     // default 1024; at least 576
+```
+
+Processing a full block is called a *round*. It consists of the following two sub-steps:
+
+#### Step 2-1. Process stripes in the block
+
+A stripe is evenly divided into 8 lanes, of 8 bytes each. In an accumulation step, one stripe and a 64-byte contiguous segment of the secret are used to update the accumulators. Each lane reads its associated 64-bit value using little-endian convention.
+
+The accumulation step applies the following procedure:
+
+```c
+accumulate(u64 stripe[8], size secretOffset):
+  u64 secretWords[8] = secret[secretOffset:secretOffset+64];
+  for (i = 0; i < 8; i++) {
+    u64 dataKey = stripe[i] xor secretWords[i];
+    acc[i xor 1] += stripe[i];
+    acc[i] += (u64)lowerHalf(dataKey) * (u64)higherHalf(dataKey);
+              // (data_key and 0xFFFFFFFF) * (data_key >> 32)
+  }
+```
+
+The accumulation step is repeated for all stripes in a block, using different segments of the secret, starting from the first 64 bytes for the first stripe, and offset by 8 bytes for each following stripe:
+
+```c
+round_accumulate(u8 block[blockSize]):
+  for (n = 0; n < stripesPerBlock; n++) {
+    u64 stripe[8] = block[n*64:n*64+64]; // 64 bytes = 8 u64s
+    accumulate(stripe, n*8);
+  }
+```
+
+#### Step 2-2. Scramble accumulators
+
+After the accumulation steps are finished for all stripes in the block, the accumulators are scrambled using the last 64 bytes of the secret.
+
+```c
+round_scramble():
+  u64 secretWords[8] = secret[secretSize-64:secretSize];
+  for (i = 0; i < 8; i++) {
+    acc[i] ^= acc[i] >> 47;
+    acc[i] ^= secretWords[i];
+    acc[i] *= PRIME32_1;
+  }
+```
+
+A round is thus a `round_accumulate` followed by a `round_scramble`:
+
+```c
+round(u8 block[blockSize]):
+  round_accumulate(block);
+  round_scramble();
+```
+
+Step 2 is looped to consume the input until `blockSize` or fewer bytes of input remain. Note that the last block is left to the next step even if it is a full block.
+
+### Step 3. Process the last block and the last 64 bytes
+
+Accumulation steps are run for the stripes in the last block, except for the last stripe (whether it is full or not). After that, run a final accumulation step by treating the last 64 bytes as a stripe. Note that the last 64 bytes might overlap with the second-to-last block.
+
+```c
+// len is the size of the last block (1 <= len <= blockSize)
+lastRound(u8 block[], size len, u64 lastStripe[8]):
+  size nFullStripes = (len-1)/64;
+  for (n = 0; n < nFullStripes; n++) {
+    u64 stripe[8] = block[n*64:n*64+64];
+    accumulate(stripe, n * 8);
+  }
+  accumulate(lastStripe, secretSize - 71);
+```
+
+### Step 4. Finalization
+
+In the finalization step, a merging procedure is used to extract a single 64-bit value from the accumulators, using an initial value (`initValue`) and a 64-byte segment of the secret.
+
+```c
+finalMerge(u64 initValue, size secretOffset):
+  u64 secretWords[8] = secret[secretOffset:secretOffset+64];
+  u64 result = initValue;
+  for (i = 0; i < 4; i++) {
+    // 64-bit by 64-bit multiplication to 128-bit full result
+    u128 mulResult = (u128)(acc[i*2] xor secretWords[i*2]) *
+                     (u128)(acc[i*2+1] xor secretWords[i*2+1]);
+    result += lowerHalf(mulResult) xor higherHalf(mulResult);
+              // (mulResult and 0xFFFFFFFFFFFFFFFF) xor (mulResult >> 64)
+  }
+  // final mix (avalanche)
+  result ^= result >> 37;
+  result *= PRIME64_3;
+  result ^= result >> 32;
+  return result;
+```
+
+#### XXH3-128
+
+XXH3-128 runs the merging procedure twice for the two halves of the result, using different secret segments and different initial values derived from the total input length:
+
+```c
+finalize128():
+  return {finalMerge((u64)inputLength * PRIME64_1, 11), // lower half
+          finalMerge(~((u64)inputLength * PRIME64_2), secretSize - 75)}; // higher half
+```
+
+#### XXH3-64
+
+The XXH3-64 result is just the lower half of the XXH3-128 result:
+
+```c
+finalize64():
+  return finalMerge((u64)inputLength * PRIME64_1, 11);
+```
+
+
 Performance considerations
 ----------------------------------
 

From 1e02e9ae9ebee1803f17a33f7a9845643319a400 Mon Sep 17 00:00:00 2001
From: Adrien Wu <adrien1018@users.noreply.github.com>
Date: Mon, 24 Oct 2022 16:57:39 +0800
Subject: [PATCH 2/5] Add spec for small inputs

---
 doc/xxhash_spec.md | 188 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 158 insertions(+), 30 deletions(-)

diff --git a/doc/xxhash_spec.md b/doc/xxhash_spec.md
index d171d3a..6d19539 100644
--- a/doc/xxhash_spec.md
+++ b/doc/xxhash_spec.md
@@ -47,13 +47,17 @@ However, a given variant shall produce exactly the same output, irrespective of
 ### Operation notations
 
 All operations are performed modulo {32,64} bits. Arithmetic overflows are expected.
-`XXH32` uses 32-bit modular operations. `XXH64` uses 64-bit modular operations.
+`XXH32` uses 32-bit modular operations. `XXH64` and `XXH3` uses 64-bit modular operations.
 
 - `+`: denotes modular addition
 - `*`: denotes modular multiplication
+    - **Exception:** In `XXH3`, if it is in the form `(u128)x * (u128)y`, it denotes 64-bit by 64-bit normal multiplication into a full 128-bit result.
 - `X <<< s`: denotes the value obtained by circularly shifting (rotating) `X` left by `s` bit positions.
 - `X >> s`: denotes the value obtained by shifting `X` right by s bit positions. Upper `s` bits become `0`.
+- `X << s`: denotes the value obtained by shifting `X` left by s bit positions. Lower `s` bits become `0`.
 - `X xor Y`: denotes the bit-wise XOR of `X` and `Y` (same width).
+- `X | Y`: denotes the bit-wise OR of `X` and `Y` (same width).
+- `~X`: denotes the bit-wise negation of `X`.
 
 
 XXH32 Algorithm Description
@@ -335,7 +339,7 @@ XXH3 comes in two different versions: XXH3-64 and XXH3-128 (or XXH128), producin
 
 XXH3 uses different algorithms for small (0-16 bytes), medium (17-240 bytes), and large (241+ bytes) inputs. The algorithms for small and medium inputs are optimized for performance. The three algorithms are described in the following sections.
 
-Many operations require some 64-bit prime number constants, which are the same constants used in XXH32 and XXH64, all defined below:
+Many operations require some 64-bit prime number constants, which are mostly the same constants used in XXH32 and XXH64, all defined below:
 
 ```c
   static const u64 PRIME32_1 = 0x9E3779B1U;  // 0b10011110001101110111100110110001
@@ -346,6 +350,7 @@ Many operations require some 64-bit prime number constants, which are the same c
   static const u64 PRIME64_3 = 0x165667B19E3779F9ULL;  // 0b0001011001010110011001111011000110011110001101110111100111111001
   static const u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL;  // 0b1000010111101011110010100111011111000010101100101010111001100011
   static const u64 PRIME64_5 = 0x27D4EB2F165667C5ULL;  // 0b0010011111010100111010110010111100010110010101100110011111000101
+  static const u64 PRIME_MIX = 0x9FB21C651E98DF25ULL;  // 0b1001111110110010000111000110010100011110100110001101111100100101
 ```
 
 The `XXH3_64bits()` function produces an unsigned 64-bit value.  
@@ -355,7 +360,7 @@ For systems requiring storing and/or displaying the result in binary or hexadeci
 
 ### Seed and Secret
 
-XXH3 provides seeded hashing by introducing two configurable constants used in the hashing process: the seed and the secret. The seed is an unsigned 64-bit value, and the secret is an array of bytes that is at least 136 bytes in size. The default seed is 0, and the default secret is the following value:
+XXH3 provides seeded hashing by introducing two configurable constants used in the hashing process: the seed and the secret. The seed is an unsigned 64-bit value, and the secret is an array of bytes that is at least 136 bytes in size. The default seed is 0, and the default secret is the following 192-byte value:
 
 ```c
 static const u8 defaultSecret[192] = {
@@ -388,13 +393,147 @@ deriveSecret(u64 seed):
   return derivedSecret; // convert to u8[192] (little-endian)
 ```
 
-The derivation treats the secrets as 24 64-bit values. In XXH3 algorithms, the secret is always read similarly by treating a contiguous segment of the array (whose size is a multiple of 8 bytes) as one or more 64-bit values. **The secret values are always read using little-endian convention**.
+The derivation treats the secrets as 24 64-bit values. In XXH3 algorithms, the secret is always read similarly by treating a contiguous segment of the array as one or more 32-bit or 64-bit values. **The secret values are always read using little-endian convention**.
 
+### Final Mixing Step (avalanche)
+
+To make sure that all input bits have a chance to impact any bit in the output digest (avalanche effect), the final step of the XXH3 algorithm is usually a fixed operation that mixes the bits in a 64-bit value. This operation is denoted `avalanche()` in the following XXH3 description.
+
+```c
+avalanche(u64 x):
+  x = x xor (x >> 37);
+  x = x * PRIME64_3;
+  x = x xor (x >> 32);
+  return x;
+```
 
 XXH3 Algorithm Description (for small inputs)
 -------------------------------------
 
-*TODO*
+The algorithm for small inputs is further divided into 4 cases: empty, 1-3 bytes, 4-8 bytes, and 9-16 bytes of input.
+
+The algorithm uses byte-swap operations. The byte-swap operation reverses the byte order in a 32-bit or 64-bit value. It is denoted `bswap32` and `bswap64` for its 32-bit and 64-bit versions, respectively.
+
+### Empty input
+
+The hash of empty input is calculated from the seed and a segment of the secret:
+
+```c
+XXH3_128_empty():
+  u64 secretWords[4] = secret[64:96];
+  return {avalanche(seed xor secretWords[0] xor secretWords[1]), // lower half
+          avalanche(seed xor secretWords[2] xor secretWords[3])}; // higher half
+
+XXH3_64_empty():
+  u64 secretWords[2] = secret[56:72];
+  return avalanche(seed xor secretWords[0] xor secretWords[1]);
+```
+
+### 1-3 bytes of input
+
+The algorithm starts from a single 32-bit value combining the input bytes and its length:
+
+```c
+u32 combined = (u32)input[inputLength-1] | ((u32)inputLength << 8) |
+               ((u32)input[0] << 16) | ((u32)input[inputLength>>1] << 24);
+// LSB          8       16           24                    MSB
+//  | last byte | length | first byte | middle-or-last byte |
+```
+
+Then the final result is calculated from this value and the first 8 bytes (XXH3-64) or 16 bytes (XXH3-128) of the secret. The secret here is read as 32-bit values instead of the usual 64-bit values.
+
+```c
+XXH3_64_1to3():
+  u32 secretWords[2] = secret[0:8];
+  u32 value = ((secretWords[0] xor secretWords[1]) + seed) xor combined;
+  return avalanche(value);
+
+XXH3_128_1to3():
+  u32 secretWords[4] = secret[0:16];
+  u32 low = ((secretWords[0] xor secretWords[1]) + seed) xor combined;
+  u32 high = ((secretWords[2] xor secretWords[3]) - seed) xor (bswap32(combined) <<< 13);
+  return {avalanche(low), // lower half
+          avalanche(high)}; // higher half
+```
+
+Note that the XXH3-64 result is the lower half of the XXH3-128 result.
+
+### 4-8 bytes of input
+
+The algorithm starts by reading the first and last 4 bytes of the input as little-endian 32-bit values, and by computing a modified seed:
+
+```c
+u32 inputFirst = input[0:4];
+u32 inputLast = input[inputLength-4:inputLength];
+u64 modifiedSeed = seed xor ((u64)bswap32((u32)lowerHalf(seed)) << 32);
+```
+
+Again, these values are combined with a segment of the secret to produce the final value.
+
+```c
+XXH3_64_4to8():
+  u64 secretWords[2] = secret[8:24];
+  u64 combined = (u64)inputLast | ((u64)inputFirst << 32);
+  u64 value = ((secretWords[0] xor secretWords[1]) + modifiedSeed) xor combined;
+  value = value xor (value <<< 49) xor (value <<< 24);
+  value = value * PRIME_MIX;
+  value = value xor ((value >> 35) + inputLength);
+  value = value * PRIME_MIX;
+  value = value xor (value >> 28);
+  return value;
+
+XXH3_128_4to8():
+  u64 secretWords[2] = secret[16:32];
+  u64 combined = (u64)inputFirst | ((u64)inputLast << 32);
+  u64 value = ((secretWords[0] xor secretWords[1]) + modifiedSeed) xor combined;
+  u128 mulResult = (u128)value * (u128)(PRIME64_1 + (inputLength << 2));
+  u64 high = higherHalf(mulResult); // mulResult >> 64
+  u64 low = lowerHalf(mulResult); // mulResult & 0xFFFFFFFFFFFFFFFF
+  high = high + (low << 1);
+  low = low xor (high >> 3);
+  low = low xor (low >> 35);
+  low = low * PRIME_MIX;
+  low = low xor (low >> 28);
+  high = avalanche(high);
+  return {low, high};
+```
+
+### 9-16 bytes of input
+
+The algorithm starts by reading the first and last 8 bytes of the input as little-endian 64-bit values:
+
+```c
+u64 inputFirst = input[0:8];
+u64 inputLast = input[inputLength-8:inputLength];
+```
+
+Once again, these values are combined with a segment of the secret to produce the final value.
+
+```c
+XXH3_64_9to16():
+  u64 secretWords[4] = secret[24:56];
+  u64 low = ((secretWords[0] xor secretWords[1]) + seed) xor inputFirst;
+  u64 high = ((secretWords[2] xor secretWords[3]) - seed) xor inputLast;
+  u128 mulResult = (u128)low * (u128)high;
+  u64 value = len + bswap64(low) + high + (u64)(lowerHalf(mulResult) xor higherHalf(mulResult));
+  return avalanche(value);
+  
+XXH3_128_9to16():
+  u64 secretWords[4] = secret[32:64];
+  u64 val1 = ((secretWords[0] xor secretWords[1]) - seed) xor inputFirst xor inputLast;
+  u64 val2 = ((secretWords[2] xor secretWords[3]) + seed) xor inputLast;
+  u128 mulResult = (u128)val1 * (u128)PRIME64_1;
+  u64 low = lowerHalf(mulResult) + ((u64)(inputLength - 1) << 54);
+  u64 high = higherHalf(mulResult) + ((u64)higherHalf(inputLast) << 32) + (u64)lowerHalf(inputLast) * PRIME32_2;
+  // the above line can also be simplified to higherHalf(mulResult) + inputLast + (u64)lowerHalf(inputLast) * (PRIME32_2 - 1);
+  low = low xor bswap64(high);
+  // the following three lines are in fact a 128x64 -> 128 multiplication ({low,high} = (u128){low,high} * PRIME64_2)
+  u128 mulResult2 = (u128)low * (u128)PRIME64_2;
+  low = lowerHalf(mulResult2);
+  high = higherHalf(mulResult2) + high * PRIME64_2;  
+  return {avalanche(low), // lower half
+          avalanche(high)}; // higher half
+```
 
 
 XXH3 Algorithm Description (for medium inputs)
@@ -442,10 +581,10 @@ The accumulation step applies the following procedure:
 accumulate(u64 stripe[8], size secretOffset):
   u64 secretWords[8] = secret[secretOffset:secretOffset+64];
   for (i = 0; i < 8; i++) {
-    u64 dataKey = stripe[i] xor secretWords[i];
-    acc[i xor 1] += stripe[i];
-    acc[i] += (u64)lowerHalf(dataKey) * (u64)higherHalf(dataKey);
-              // (data_key and 0xFFFFFFFF) * (data_key >> 32)
+    u64 value = stripe[i] xor secretWords[i];
+    acc[i xor 1] = acc[i xor 1] + stripe[i];
+    acc[i] = acc[i] + (u64)lowerHalf(value) * (u64)higherHalf(value);
+                      // (value and 0xFFFFFFFF) * (value >> 32)
   }
 ```
 
@@ -467,9 +606,9 @@ After the accumulation steps are finished for all stripes in the block, the accu
 round_scramble():
   u64 secretWords[8] = secret[secretSize-64:secretSize];
   for (i = 0; i < 8; i++) {
-    acc[i] ^= acc[i] >> 47;
-    acc[i] ^= secretWords[i];
-    acc[i] *= PRIME32_1;
+    acc[i] = acc[i] xor (acc[i] >> 47);
+    acc[i] = acc[i] xor secretWords[i];
+    acc[i] = acc[i] * PRIME32_1;
   }
 ```
 
@@ -510,32 +649,21 @@ finalMerge(u64 initValue, size secretOffset):
     // 64-bit by 64-bit multiplication to 128-bit full result
     u128 mulResult = (u128)(acc[i*2] xor secretWords[i*2]) *
                      (u128)(acc[i*2+1] xor secretWords[i*2+1]);
-    result += lowerHalf(mulResult) xor higherHalf(mulResult);
-              // (mulResult and 0xFFFFFFFFFFFFFFFF) xor (mulResult >> 64)
+    result = result + (lowerHalf(mulResult) xor higherHalf(mulResult));
+                      // (mulResult and 0xFFFFFFFFFFFFFFFF) xor (mulResult >> 64)
   }
-  // final mix (avalanche)
-  result ^= result >> 37;
-  result *= PRIME64_3;
-  result ^= result >> 32;
-  return result;
+  return avalanche(result);
 ```
 
-#### XXH3-128
-
-XXH3-128 runs the merging procedure twice for the two halves of the result, using different secret segments and different initial values derived from the total input length:
+XXH3-128 runs the merging procedure twice for the two halves of the result, using different secret segments and different initial values derived from the total input length.  
+The XXH3-64 result is just the lower half of the XXH3-128 result.
 
 ```c
-finalize128():
+XXH3_128_large():
   return {finalMerge((u64)inputLength * PRIME64_1, 11), // lower half
           finalMerge(~((u64)inputLength * PRIME64_2), secretSize - 75)}; // higher half
-```
 
-#### XXH3-64
-
-The XXH3-64 result is just the lower half of the XXH3-128 result:
-
-```c
-finalize64():
+XXH3_64_large():
   return finalMerge((u64)inputLength * PRIME64_1, 11);
 ```
 

From 664c4b3d129efba3f7d7e55b6d27b79eea49bd2c Mon Sep 17 00:00:00 2001
From: adrien1018 <adrien1018@users.noreply.github.com>
Date: Mon, 24 Oct 2022 20:02:19 +0800
Subject: [PATCH 3/5] Add spec for medium inputs

---
 doc/xxhash_spec.md | 121 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 115 insertions(+), 6 deletions(-)

diff --git a/doc/xxhash_spec.md b/doc/xxhash_spec.md
index 6d19539..de95661 100644
--- a/doc/xxhash_spec.md
+++ b/doc/xxhash_spec.md
@@ -50,6 +50,7 @@ All operations are performed modulo {32,64} bits. Arithmetic overflows are expec
 `XXH32` uses 32-bit modular operations. `XXH64` and `XXH3` uses 64-bit modular operations.
 
 - `+`: denotes modular addition
+- `-`: denotes modular subtraction
 - `*`: denotes modular multiplication
     - **Exception:** In `XXH3`, if it is in the form `(u128)x * (u128)y`, it denotes 64-bit by 64-bit normal multiplication into a full 128-bit result.
 - `X <<< s`: denotes the value obtained by circularly shifting (rotating) `X` left by `s` bit positions.
@@ -410,7 +411,7 @@ avalanche(u64 x):
 XXH3 Algorithm Description (for small inputs)
 -------------------------------------
 
-The algorithm for small inputs is further divided into 4 cases: empty, 1-3 bytes, 4-8 bytes, and 9-16 bytes of input.
+The algorithm for small inputs (0-16 bytes of input) is further divided into 4 cases: empty, 1-3 bytes, 4-8 bytes, and 9-16 bytes of input.
 
 The algorithm uses byte-swap operations. The byte-swap operation reverses the byte order in a 32-bit or 64-bit value. It is denoted `bswap32` and `bswap64` for its 32-bit and 64-bit versions, respectively.
 
@@ -517,7 +518,7 @@ XXH3_64_9to16():
   u128 mulResult = (u128)low * (u128)high;
   u64 value = len + bswap64(low) + high + (u64)(lowerHalf(mulResult) xor higherHalf(mulResult));
   return avalanche(value);
-  
+
 XXH3_128_9to16():
   u64 secretWords[4] = secret[32:64];
   u64 val1 = ((secretWords[0] xor secretWords[1]) - seed) xor inputFirst xor inputLast;
@@ -539,15 +540,123 @@ XXH3_128_9to16():
 XXH3 Algorithm Description (for medium inputs)
 -------------------------------------
 
-*TODO*
+This algorithm is used for medium inputs (17-240 bytes of input). Its internal hash state is stored inside 1 (XXH3-64) or 2 (XXH3-128) "accumulators", each storing an unsigned 64-bit value.
 
+### Step 1. Initialize internal accumulators
+
+The accumulator(s) are initialized based on the input length.
+
+```c
+// For XXH3-64
+u64 acc = inputLength * PRIME64_1;
+
+// For XXH3-128
+u64 acc[2] = {inputLength * PRIME64_1, 0};
+```
+
+### Step 2. Process the input
+
+This step is further divided into two cases: one for 17-128 bytes of input, and one for 129-240 bytes of input.
+
+#### Mixing operation
+
+This step uses a mixing operation that mixes a 16-byte segment of data, a 16-byte segment of secret and the seed into a 64-bit value as a building block. This operation treats the segments of data and secret as little-endian 64-bit values.
+
+```c
+mixStep(u8 data[16], size secretOffset, u64 seed):
+  u64 dataWords[2] = data[0:16];
+  u64 secretWords[2] = secret[secretOffset:secretOffset+16];
+  u128 mulResult = (u128)(dataWords[0] xor (secretWords[0] + seed)) *
+                   (u128)(dataWords[1] xor (secretWords[1] - seed));
+  return lowerHalf(mulResult) xor higherHalf(mulResult);
+```
+
+The mixing operation in XXH3-128 is always invoked in groups of two, where two 16-byte segments of data are mixed with a 32-byte segment of secret, and the accumulators are updated accordingly.
+
+```c
+mixTwoChunks(u8 data1[16], u8 data2[16], size secretOffset, u64 seed):
+  u64 dataWords1[2] = data1[0:16]; // again, little-endian conversion
+  u64 dataWords2[2] = data2[0:16];
+  acc[0] = acc[0] + mixStep(data1, secretOffset, seed);
+  acc[1] = acc[1] + mixStep(data2, secretOffset, seed);
+  acc[0] = acc[0] xor (dataWords2[0] + dataWords2[1]);
+  acc[1] = acc[1] xor (dataWords1[0] + dataWords1[1]);
+```
+
+The input is split into several 16-byte chunks and mixed, and the result is added to the accumulator(s).
+
+#### 17-128 bytes of input
+
+The input is read as *N* 16-byte chunks starting from the beginning and *N* chunks starting from the end, where *N* is the smallest number such that these 2*N* chunks cover the whole input. These chunks are paired up and mixed, and the results are accumulated to the accumulator(s).
+
+```c
+processInput_XXH3_64_17to128(u8 data[]):
+  u64 numRounds = (inputLength - 1) >> 5;
+  for (i = 0; i < numRounds; i++) {
+    size offsetStart = i*16;
+    size offsetEnd = inputLength - i*16 - 16;
+    acc += mixStep(data[offsetStart:offsetStart+16], i*32, seed);
+    acc += mixStep(data[offsetEnd:offsetEnd+16], i*32+16, seed);
+  }
+
+processInput_XXH3_128_17to128(u8 data[]):
+  u64 numRounds = (inputLength - 1) >> 5;
+  for (i = 0; i < numRounds; i++) {
+    size offsetStart = i*16;
+    size offsetEnd = inputLength - i*16 - 16;
+    mixTwoChunks(data[offsetStart:offsetStart+16], data[offsetEnd:offsetEnd+16], i*32, seed);
+  }
+```
+
+#### 129-240 bytes of input
+
+The input is split into 16-byte (XXH3-64) or 32-byte (XXH3-128) chunks. The first 128 bytes are first mixed chunk by chunk, followed by an intermediate avalanche operation. Then the remaining full chunks are processed, and finally the last 16/32 bytes are treated as a chunk to process.
+
+```c
+processInput_XXH3_64_129to240(u8 data[]):
+  u64 numChunks = inputLength >> 4;
+  for (i = 0; i < 8; i++) {
+    acc += mixStep(data[i*16:i*16+16], i*16, seed);
+  }
+  acc = avalanche(acc);
+  for (i = 8; i < numChunks; i++) {
+    acc += mixStep(data[i*16:i*16+16], (i-8)*16 + 3, seed);
+  }
+  acc += mixStep(data[inputLength-16:inputLength], 119, seed);
+
+processInput_XXH3_128_129to240(u8 data[]):
+  u64 numChunks = inputLength >> 5;
+  for (i = 0; i < 4; i++) {
+    mixTwoChunks(data[i*32:i*32+16], data[i*32+16:i*32+32], i*32, seed);
+  }
+  acc[0] = avalanche(acc[0]);
+  acc[1] = avalanche(acc[1]);
+  for (i = 8; i < numChunks; i++) {
+    mixTwoChunks(data[i*32:i*32+16], data[i*32+16:i*32+32], (i-4)*32 + 3, seed);
+  }
+  // note that the half-chunk order is different here
+  mixTwoChunks(data[inputLength-16:inputLength], data[inputLength-32:inputLength-16], 103, seed);
+```
+
+### Step 3. Finalization
+
+The final result is extracted from the accumulator(s).
+
+```c
+XXH3_64_17to240():
+  return avalanche(acc);
+
+XXH3_128_17to240():
+  u64 low = acc[0] + acc[1];
+  u64 high = (acc[0] * PRIME64_1) + (acc[1] * PRIME64_4) + (((u64)inputLength - seed) * PRIME64_2);
+  return {avalanche(low), // lower half
+          (u64)0 - avalanche(high)}; // higher half
+```
 
 XXH3 Algorithm Description (for large inputs)
 -------------------------------------
 
-For inputs larger than 240 bytes, XXH3-64 and XXH3-128 use the same algorithm except for the finalizing step.
-
-The internal hash state is stored inside 8 "accumulators", each one storing an unsigned 64-bit value.
+This algorithm is used for inputs larger than 240 bytes. The internal hash state is stored inside 8 "accumulators", each one storing an unsigned 64-bit value.
 
 ### Step 1. Initialize internal accumulators
 

From 9759aa188f7a19e52f5c8c35b91abac588f1200f Mon Sep 17 00:00:00 2001
From: adrien1018 <adrien1018@users.noreply.github.com>
Date: Mon, 24 Oct 2022 20:17:19 +0800
Subject: [PATCH 4/5] Fix a typo

---
 doc/xxhash_spec.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/xxhash_spec.md b/doc/xxhash_spec.md
index de95661..a2ebaa4 100644
--- a/doc/xxhash_spec.md
+++ b/doc/xxhash_spec.md
@@ -631,7 +631,7 @@ processInput_XXH3_128_129to240(u8 data[]):
   }
   acc[0] = avalanche(acc[0]);
   acc[1] = avalanche(acc[1]);
-  for (i = 8; i < numChunks; i++) {
+  for (i = 4; i < numChunks; i++) {
     mixTwoChunks(data[i*32:i*32+16], data[i*32+16:i*32+32], (i-4)*32 + 3, seed);
   }
   // note that the half-chunk order is different here

From 0e1d320d5c90b334d96d4a0e1030a534521fb144 Mon Sep 17 00:00:00 2001
From: adrien1018 <adrien1018@users.noreply.github.com>
Date: Tue, 25 Oct 2022 00:08:04 +0800
Subject: [PATCH 5/5] Fix errors in specification

---
 doc/xxhash_spec.md | 117 +++++++++++++++++++++++++--------------------
 1 file changed, 64 insertions(+), 53 deletions(-)

diff --git a/doc/xxhash_spec.md b/doc/xxhash_spec.md
index a2ebaa4..19f8233 100644
--- a/doc/xxhash_spec.md
+++ b/doc/xxhash_spec.md
@@ -336,7 +336,7 @@ For systems which require to store and/or display the result in binary or hexade
 XXH3 Algorithm Overview
 -------------------------------------
 
-XXH3 comes in two different versions: XXH3-64 and XXH3-128 (or XXH128), producing 64 and 128 bytes of output, respectively.
+XXH3 comes in two different versions: XXH3-64 and XXH3-128 (or XXH128), producing 64 and 128 bits of output, respectively.
 
 XXH3 uses different algorithms for small (0-16 bytes), medium (17-240 bytes), and large (241+ bytes) inputs. The algorithms for small and medium inputs are optimized for performance. The three algorithms are described in the following sections.
 
@@ -351,7 +351,8 @@ Many operations require some 64-bit prime number constants, which are mostly the
   static const u64 PRIME64_3 = 0x165667B19E3779F9ULL;  // 0b0001011001010110011001111011000110011110001101110111100111111001
   static const u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL;  // 0b1000010111101011110010100111011111000010101100101010111001100011
   static const u64 PRIME64_5 = 0x27D4EB2F165667C5ULL;  // 0b0010011111010100111010110010111100010110010101100110011111000101
-  static const u64 PRIME_MIX = 0x9FB21C651E98DF25ULL;  // 0b1001111110110010000111000110010100011110100110001101111100100101
+  static const u64 PRIME_MX1 = 0x165667919E3779F9ULL;  // 0b0001011001010110011001111001000110011110001101110111100111111001
+  static const u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL;  // 0b1001111110110010000111000110010100011110100110001101111100100101
 ```
 
 The `XXH3_64bits()` function produces an unsigned 64-bit value.  
@@ -398,11 +399,19 @@ The derivation treats the secrets as 24 64-bit values. In XXH3 algorithms, the s
 
 ### Final Mixing Step (avalanche)
 
-To make sure that all input bits have a chance to impact any bit in the output digest (avalanche effect), the final step of the XXH3 algorithm is usually an fixed operation that mixes the bits in a 64-bit value. This operation is denoted `avalanche()` in the following XXH3 description.
+To make sure that all input bits have a chance to impact any bit in the output digest (avalanche effect), the final step of the XXH3 algorithm is usually one of the two fixed operations that mix the bits in a 64-bit value. These operations are denoted `avalanche()` and `avalanche_XXH64()` in the following XXH3 description.
 
 ```c
 avalanche(u64 x):
   x = x xor (x >> 37);
+  x = x * PRIME_MX1;
+  x = x xor (x >> 32);
+  return x;
+
+avalanche_XXH64(u64 x):
+  x = x xor (x >> 33);
+  x = x * PRIME64_2;
+  x = x xor (x >> 29);
   x = x * PRIME64_3;
   x = x xor (x >> 32);
   return x;
@@ -420,14 +429,14 @@ The algorithm uses byte-swap operations. The byte-swap operation reverses the by
 The hash of empty input is calculated from the seed and a segment of the secret:
 
 ```c
-XXH3_128_empty():
-  u64 secretWords[4] = secret[64:96];
-  return {avalanche(seed xor secretWords[0] xor secretWords[1]), // lower half
-          avalanche(seed xor secretWords[2] xor secretWords[3])}; // higher half
-
 XXH3_64_empty():
   u64 secretWords[2] = secret[56:72];
-  return avalanche(seed xor secretWords[0] xor secretWords[1]);
+  return avalanche_XXH64(seed xor secretWords[0] xor secretWords[1]);
+
+XXH3_128_empty():
+  u64 secretWords[4] = secret[64:96];
+  return {avalanche_XXH64(seed xor secretWords[0] xor secretWords[1]), // lower half
+          avalanche_XXH64(seed xor secretWords[2] xor secretWords[3])}; // higher half
 ```
 
 ### 1-3 bytes of input
@@ -446,15 +455,16 @@ Then the final output is calculated from the value and the first 8 bytes (XXH3-6
 ```c
 XXH3_64_1to3():
   u32 secretWords[2] = secret[0:8];
-  u32 value = ((secretWords[0] xor secretWords[1]) + seed) xor combined;
-  return avalanche(value);
+  u64 value = ((u64)(secretWords[0] xor secretWords[1]) + seed) xor (u64)combined;
+  return avalanche_XXH64(value);
 
 XXH3_128_1to3():
   u32 secretWords[4] = secret[0:16];
-  u32 low = ((secretWords[0] xor secretWords[1]) + seed) xor combined;
-  u32 high = ((secretWords[2] xor secretWords[3]) - seed) xor (bswap32(combined) <<< 13);
-  return {avalanche(low), // lower half
-          avalanche(high)}; // higher half
+  u64 low = ((u64)(secretWords[0] xor secretWords[1]) + seed) xor (u64)combined;
+  u64 high = ((u64)(secretWords[2] xor secretWords[3]) - seed) xor (u64)(bswap32(combined) <<< 13);
+  // note that the bswap32(combined) <<< 13 above is a 32-bit rotate
+  return {avalanche_XXH64(low), // lower half
+          avalanche_XXH64(high)}; // higher half
 ```
 
 Note that the XXH3-64 result is the lower half of XXH3-128 result.
@@ -475,11 +485,11 @@ Again, these values are combined with a segment of the secret to produce the fin
 XXH3_64_4to8():
   u64 secretWords[2] = secret[8:24];
   u64 combined = (u64)inputLast | ((u64)inputFirst << 32);
-  u64 value = ((secretWords[0] xor secretWords[1]) + modifiedSeed) xor combined;
+  u64 value = ((secretWords[0] xor secretWords[1]) - modifiedSeed) xor combined;
   value = value xor (value <<< 49) xor (value <<< 24);
-  value = value * PRIME_MIX;
+  value = value * PRIME_MX2;
   value = value xor ((value >> 35) + inputLength);
-  value = value * PRIME_MIX;
+  value = value * PRIME_MX2;
   value = value xor (value >> 28);
   return value;
 
@@ -493,7 +503,7 @@ XXH3_128_4to8():
   high = high + (low << 1);
   low = low xor (high >> 3);
   low = low xor (low >> 35);
-  low = low * PRIME_MIX;
+  low = low * PRIME_MX2;
   low = low xor (low >> 28);
   high = avalanche(high);
   return {low, high};
@@ -516,7 +526,7 @@ XXH3_64_9to16():
   u64 low = ((secretWords[0] xor secretWords[1]) + seed) xor inputFirst;
   u64 high = ((secretWords[2] xor secretWords[3]) - seed) xor inputLast;
   u128 mulResult = (u128)low * (u128)high;
-  u64 value = len + bswap64(low) + high + (u64)(lowerHalf(mulResult) xor higherHalf(mulResult));
+  u64 value = inputLength + bswap64(low) + high + (u64)(lowerHalf(mulResult) xor higherHalf(mulResult));
   return avalanche(value);
 
 XXH3_128_9to16():
@@ -525,13 +535,13 @@ XXH3_128_9to16():
   u64 val2 = ((secretWords[2] xor secretWords[3]) + seed) xor inputLast;
   u128 mulResult = (u128)val1 * (u128)PRIME64_1;
   u64 low = lowerHalf(mulResult) + ((u64)(inputLength - 1) << 54);
-  u64 high = higherHalf(mulResult) + ((u64)higherHalf(inputLast) << 32) + (u64)lowerHalf(inputLast) * PRIME32_2;
-  // the above line can also be simplified to higherHalf(mulResult) + inputLast + (u64)lowerHalf(inputLast) * (PRIME32_2 - 1);
+  u64 high = higherHalf(mulResult) + ((u64)higherHalf(val2) << 32) + (u64)lowerHalf(val2) * PRIME32_2;
+  // the above line can also be simplified to higherHalf(mulResult) + val2 + (u64)lowerHalf(val2) * (PRIME32_2 - 1);
   low = low xor bswap64(high);
   // the following three lines are in fact a 128x64 -> 128 multiplication ({low,high} = (u128){low,high} * PRIME64_2)
   u128 mulResult2 = (u128)low * (u128)PRIME64_2;
   low = lowerHalf(mulResult2);
-  high = higherHalf(mulResult2) + high * PRIME64_2;  
+  high = higherHalf(mulResult2) + high * PRIME64_2;
   return {avalanche(low), // lower half
           avalanche(high)}; // higher half
 ```
@@ -571,14 +581,14 @@ mixStep(u8 data[16], size secretOffset, u64 seed):
   return lowerHalf(mulResult) xor higherHalf(mulResult);
 ```
 
-The mixing operation in XXH3-128 is always invoke in groups of two, where two 16-byte segments of data are mixed with a 32-byte segment of secret, and the accumulators are updated accordingly.
+The mixing operation is always invoked in groups of two in XXH3-128, where two 16-byte segments of data are mixed with a 32-byte segment of secret, and the accumulators are updated accordingly.
 
 ```c
 mixTwoChunks(u8 data1[16], u8 data2[16], size secretOffset, u64 seed):
   u64 dataWords1[2] = data1[0:16]; // again, little-endian conversion
   u64 dataWords2[2] = data2[0:16];
   acc[0] = acc[0] + mixStep(data1, secretOffset, seed);
-  acc[1] = acc[1] + mixStep(data2, secretOffset, seed);
+  acc[1] = acc[1] + mixStep(data2, secretOffset + 16, seed);
   acc[0] = acc[0] xor (dataWords2[0] + dataWords2[1]);
   acc[1] = acc[1] xor (dataWords1[0] + dataWords1[1]);
 ```
@@ -590,21 +600,22 @@ The input is split into several 16-byte chunks and mixed, and the result is adde
 The input is read as *N* 16-byte chunks starting from the beginning and *N* chunks starting from the end, where *N* is the smallest number that these 2*N* chunks cover the whole input. These chunks are paired up and mixed, and the results are accumulated to the accumulator(s).
 
 ```c
-processInput_XXH3_64_17to128(u8 data[]):
-  u64 numRounds = (inputLength - 1) >> 5;
-  for (i = 0; i < numRounds; i++) {
+// the loop variable `i` must be signed in an implementation: with an unsigned type, `i >= 0` is always true and `i--` wraps around at zero
+processInput_XXH3_64_17to128():
+  u64 numRounds = ((inputLength - 1) >> 5) + 1;
+  for (i = numRounds - 1; i >= 0; i--) {
     size offsetStart = i*16;
     size offsetEnd = inputLength - i*16 - 16;
-    acc += mixStep(data[offsetStart:offsetStart+16], i*32, seed);
-    acc += mixStep(data[offsetEnd:offsetEnd+16], i*32+16, seed);
+    acc += mixStep(input[offsetStart:offsetStart+16], i*32, seed);
+    acc += mixStep(input[offsetEnd:offsetEnd+16], i*32+16, seed);
   }
 
-processInput_XXH3_128_17to128(u8 data[]):
-  u64 numRounds = (inputLength - 1) >> 5;
-  for (i = 0; i < numRounds; i++) {
+processInput_XXH3_128_17to128():
+  u64 numRounds = ((inputLength - 1) >> 5) + 1;
+  for (i = numRounds - 1; i >= 0; i--) {
     size offsetStart = i*16;
     size offsetEnd = inputLength - i*16 - 16;
-    mixTwoChunks(data[offsetStart:offsetStart+16], data[offsetEnd:offsetEnd+16], i*32, seed);
+    mixTwoChunks(input[offsetStart:offsetStart+16], input[offsetEnd:offsetEnd+16], i*32, seed);
   }
 ```
 
@@ -613,29 +624,29 @@ processInput_XXH3_128_17to128(u8 data[]):
 The input is split into 16-byte (XXH3-64) or 32-byte (XXH3-128) chunks. The first 128 bytes are first mixed chunk by chunk, followed by an intermediate avalanche operation. Then the remaining full chunks are processed, and finally the last 16/32 bytes are treated as a chunk to process.
 
 ```c
-processInput_XXH3_64_129to240(u8 data[]):
+processInput_XXH3_64_129to240():
   u64 numChunks = inputLength >> 4;
   for (i = 0; i < 8; i++) {
-    acc += mixStep(data[i*16:i*16+16], i*16, seed);
+    acc += mixStep(input[i*16:i*16+16], i*16, seed);
   }
   acc = avalanche(acc);
   for (i = 8; i < numChunks; i++) {
-    acc += mixStep(data[i*16:i*16+16], (i-8)*16 + 3, seed);
+    acc += mixStep(input[i*16:i*16+16], (i-8)*16 + 3, seed);
   }
-  acc += mixStep(data[inputLength-16:inputLength], 119, seed);
+  acc += mixStep(input[inputLength-16:inputLength], 119, seed);
 
-processInput_XXH3_128_129to240(u8 data[]):
+processInput_XXH3_128_129to240():
   u64 numChunks = inputLength >> 5;
   for (i = 0; i < 4; i++) {
-    mixTwoChunks(data[i*32:i*32+16], data[i*32+16:i*32+32], i*32, seed);
+    mixTwoChunks(input[i*32:i*32+16], input[i*32+16:i*32+32], i*32, seed);
   }
   acc[0] = avalanche(acc[0]);
   acc[1] = avalanche(acc[1]);
   for (i = 4; i < numChunks; i++) {
-    mixTwoChunks(data[i*32:i*32+16], data[i*32+16:i*32+32], (i-4)*32 + 3, seed);
+    mixTwoChunks(input[i*32:i*32+16], input[i*32+16:i*32+32], (i-4)*32 + 3, seed);
   }
-  // note that the half-chunk order is different here
-  mixTwoChunks(data[inputLength-16:inputLength], data[inputLength-32:inputLength-16], 103, seed);
+  // note that both the half-chunk order and the seed are different here
+  mixTwoChunks(input[inputLength-16:inputLength], input[inputLength-32:inputLength-16], 103, (u64)0 - seed);
 ```
 
 ### Step 3. Finalization
@@ -670,12 +681,12 @@ u64 acc[8] = {
 
 ### Step 2. Process blocks
 
-The input is consumed and processed one full block at a time. The size of the block depends on the length of the secret. Specifically, a block consists of several 64-byte stripes. The number of stripes per block is `floor((secretLen-64)/8)` . For the default 192-byte secret, there are 16 stripes in a block, and thus the block size is 1024 bytes.
+The input is consumed and processed one full block at a time. The size of the block depends on the length of the secret. Specifically, a block consists of several 64-byte stripes. The number of stripes per block is `floor((secretLength-64)/8)` . For the default 192-byte secret, there are 16 stripes in a block, and thus the block size is 1024 bytes.
 
 ```c
-secretLen = lengthInBytes(secret);    // default 192; at least 136
-stripesPerBlock = (secretLen-64) / 8; // default 16; at least 9
-blockSize = 64 * stripesPerBlock;     // default 1024; at least 576
+secretLength = lengthInBytes(secret);    // default 192; at least 136
+stripesPerBlock = (secretLength-64) / 8; // default 16; at least 9
+blockSize = 64 * stripesPerBlock;        // default 1024; at least 576
 ```
 
 The process of processing a full block is called a *round*. It consists of the following two sub-steps:
@@ -713,7 +724,7 @@ After the accumulation steps are finished for all stripes in the block, the accu
 
 ```c
 round_scramble():
-  u64 secretWords[8] = secret[secretSize-64:secretSize];
+  u64 secretWords[8] = secret[secretLength-64:secretLength];
   for (i = 0; i < 8; i++) {
     acc[i] = acc[i] xor (acc[i] >> 47);
     acc[i] = acc[i] xor secretWords[i];
@@ -743,7 +754,7 @@ lastRound(u8 block[], size len, u64 lastStripe[8]):
     u64 stripe[8] = block[n*64:n*64+64];
     accumulate(stripe, n * 8);
   }
-  accumulate(lastStripe, secretSize - 71);
+  accumulate(lastStripe, secretLength - 71);
 ```
 
 ### Step 4. Finalization
@@ -768,12 +779,12 @@ XXH3-128 runs the merging procedure twice for the two halves of the result, usin
 The XXH3-64 result is just the lower half of the XXH3-128 result.
 
 ```c
-XXH3_128_large():
-  return {finalMerge((u64)inputLength * PRIME64_1, 11), // lower half
-          finalMerge(~((u64)inputLength * PRIME64_2), secretSize - 75)}; // higher half
-
 XXH3_64_large():
   return finalMerge((u64)inputLength * PRIME64_1, 11);
+
+XXH3_128_large():
+  return {finalMerge((u64)inputLength * PRIME64_1, 11), // lower half
+          finalMerge(~((u64)inputLength * PRIME64_2), secretLength - 75)}; // higher half
 ```