Faster CRC32-C on x86

Computing the CRC-32C of a 4KB buffer on a modern x86 chip can be done as a simple loop around the crc32 instruction:

uint32_t crc32_4k(uint32_t acc, char* buf) {
    for (char* end = buf + 4096; buf < end; buf += 8) {
        acc = _mm_crc32_u64(acc, *(uint64_t*)buf);
    }
    return acc;
}

Note that this is bit-reflected CRC32-C with polynomial 0x11EDC6F41, which is different to the zlib CRC (polynomial 0x104C11DB7). Furthermore, there is no bitwise-inverse here; some texts apply ~ to acc on input and again on output, as in:

uint32_t crc32_4k_inverse(uint32_t acc, char* buf) {
    return ~crc32_4k(~acc, buf);
}

The loop within crc32_4k is fine, but leaves a lot of performance on the table: the underlying crc32 instruction has latency 3 and throughput 1 on every Intel chip since its introduction in Nehalem. To fully hide the latency, three independent CRC streams need to be calculated in parallel. Some mathematical tricks allow us to chop the 4KB buffer into multiple chunks, compute the CRC of each chunks independently, and then merge the results together. For a good exposition of the tricks, see Option 12 and Option 13 of komrad36/CRC. As covered in that exposition, it can be convenient to chop into four chunks rather than three, with the fourth being exactly 8 bytes long. This leads to the following function:

uint32_t crc32_4k_three_way(uint32_t acc_a, char* buf) {
    // Four chunks:
    //  Chunk A: 1360 bytes from 0 through 1360
    //  Chunk B: 1360 bytes from 1360 through 2720
    //  Chunk C: 1368 bytes from 2720 through 4088
    //  Chunk D: 8 bytes from 4088 through 4096
    uint32_t acc_b = 0;
    uint32_t acc_c = 0;
    for (char* end = buf + 1360; buf < end; buf += 8) {
        acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)buf);
        acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + 1360));
        acc_c = _mm_crc32_u64(acc_c, *(uint64_t*)(buf + 1360*2));
    }
    // Merge together A and B, leaving space for C+D
    // kA == magic((1360+1368+8)*8-33)
    // kB == magic((     1368+8)*8-33)
    __m128i kAkB = _mm_setr_epi32(/*kA*/ 0x8A074012, 0, /*kB*/ 0x93E106A4, 0);
    __m128i vec_a = _mm_clmulepi64_si128(_mm_cvtsi32_si128(acc_a), kAkB, 0x00);
    __m128i vec_b = _mm_clmulepi64_si128(_mm_cvtsi32_si128(acc_b), kAkB, 0x10);
    uint64_t ab = _mm_cvtsi128_si64(_mm_xor_si128(vec_a, vec_b));
    // Final 8 bytes of C
    acc_c = _mm_crc32_u64(acc_c, *(uint64_t*)(buf + 1360*2));
    // Merge together C, AB, and D
    return _mm_crc32_u64(acc_c, ab ^ *(uint64_t*)(buf + 1360*2 + 8));
}

The main loop of crc32_4k_three_way contains three times as much code as crc32_4k, and runs approximately three times faster. The magic happens in the merging step, with literal magic numbers. The magic numbers can be generated with the following function, which computes the CRC of a 1 bit followed by n zero bits:

uint32_t magic(uint32_t n) {
    uint32_t crc = ((uint32_t)1) << (31 - (n & 31));
    if (n & 32) crc = _mm_crc32_u32(crc, 0);
    for (n >>= 6; n; --n) crc = _mm_crc32_u64(crc, 0);
    return crc;
}

Read the aforementioned exposition for details, but the quick summary is that merging step for a given chunk requires a magic number corresponding to how many bits appear in subsequent chunks. Chunks C/D doesn't require any magic, as there are no subsequent chunks. Chunk B has 1368+8 bytes in subsequent chunks, hence magic((1368+8)*8-33), and chunk A has 1360+1368+8 bytes in subsequent chunks, hence magic((1360+1368+8)*8-33). The -33 is actually -32-1: -32 to counter the shift performed by the final _mm_crc32_u64, and -1 to counter the bit-reversed inputs/outputs to _mm_clmulepi64_si128.

There is also a completely different way of computing CRC-32C (or any other 32-bit CRC), based around the pclmulqdq instruction (which is what _mm_clmulepi64_si128 compiles to). The canonical exposition for this is Intel's whitepaper, and the following code is inspired by Chromium's implementation of said whitepaper:

uint32_t crc32_4k_pclmulqdq(uint32_t acc, char* buf) {
    // First block of 64 is easy.
    __m128i x1 = _mm_loadu_si128((__m128i*)buf);
    __m128i x2 = _mm_loadu_si128((__m128i*)(buf + 16));
    __m128i x3 = _mm_loadu_si128((__m128i*)(buf + 32));
    __m128i x4 = _mm_loadu_si128((__m128i*)(buf + 48));
    x1 = _mm_xor_si128(_mm_cvtsi32_si128(acc), x1);
    // Parallel fold remaining blocks of 64.
    // k1 == magic(4*128+32-1)
    // k2 == magic(4*128-32-1)
    __m128i k1k2 = _mm_setr_epi32(/*k1*/ 0x740EEF02, 0, /*k2*/ 0x9E4ADDF8, 0);
    char* end = buf + 4096 - 64;
    do {
        __m128i x5 = _mm_clmulepi64_si128(x1, k1k2, 0x00);
        x1 = _mm_clmulepi64_si128(x1, k1k2, 0x11);
        __m128i x6 = _mm_clmulepi64_si128(x2, k1k2, 0x00);
        x2 = _mm_clmulepi64_si128(x2, k1k2, 0x11);
        __m128i x7 = _mm_clmulepi64_si128(x3, k1k2, 0x00);
        x3 = _mm_clmulepi64_si128(x3, k1k2, 0x11);
        __m128i x8 = _mm_clmulepi64_si128(x4, k1k2, 0x00);
        x4 = _mm_clmulepi64_si128(x4, k1k2, 0x11);
        x5 = _mm_xor_si128(x5, _mm_loadu_si128((__m128i*)(buf + 64)));
        x1 = _mm_xor_si128(x1, x5);
        x6 = _mm_xor_si128(x6, _mm_loadu_si128((__m128i*)(buf + 80)));
        x2 = _mm_xor_si128(x2, x6);
        x7 = _mm_xor_si128(x7, _mm_loadu_si128((__m128i*)(buf + 96)));
        x3 = _mm_xor_si128(x3, x7);
        x8 = _mm_xor_si128(x8, _mm_loadu_si128((__m128i*)(buf + 112)));
        x4 = _mm_xor_si128(x4, x8);
        buf += 64;
    } while (buf < end);
    // Fold together the four parallel streams into one.
    // k3 == magic(128+32-1)
    // k4 == magic(128-32-1)
    __m128i k3k4 = _mm_setr_epi32(/*k3*/ 0xF20C0DFE, 0, /*k4*/ 0x493C7D27, 0);
    __m128i x5 = _mm_clmulepi64_si128(x1, k3k4, 0x00);
    x1 = _mm_clmulepi64_si128(x1, k3k4, 0x11);
    x5 = _mm_xor_si128(x5, x2);
    x1 = _mm_xor_si128(x1, x5);
    x5 = _mm_clmulepi64_si128(x3, k3k4, 0x00);
    x3 = _mm_clmulepi64_si128(x3, k3k4, 0x11);
    x5 = _mm_xor_si128(x5, x4);
    x3 = _mm_xor_si128(x3, x5);
    // k5 == magic(2*128+32-1)
    // k6 == magic(2*128-32-1)
    __m128i k5k6 = _mm_setr_epi32(/*k5*/ 0x3DA6D0CB, 0, /*k6*/ 0xBA4FC28E, 0);
    x5 = _mm_clmulepi64_si128(x1, k5k6, 0x00);
    x1 = _mm_clmulepi64_si128(x1, k5k6, 0x11);
    x5 = _mm_xor_si128(x5, x3);
    x1 = _mm_xor_si128(x1, x5);
    // Apply missing <<32 and fold down to 32-bits.
    acc = _mm_crc32_u64(0, _mm_extract_epi64(x1, 0));
    return _mm_crc32_u64(acc, _mm_extract_epi64(x1, 1));
}

Note that crc32_4k_three_way consists of lots of _mm_crc32_u64 calls followed by a few _mm_clmulepi64_si128 calls, whereas crc32_4k_pclmulqdq is the opposite: lots of _mm_clmulepi64_si128 calls followed by a few _mm_crc32_u64 calls. This is very convenient, as on all Intel chips since Broadwell, _mm_clmulepi64_si128 has throughput 1, and executes on a separate port to _mm_crc32_u64. This means that we should be able to issue one _mm_clmulepi64_si128 and one _mm_crc32_u64 per cycle, thus doubling our speed. The latency of _mm_clmulepi64_si128 has varied slightly over time; 5 cycles on Broadwell, 7 cycles on Skylake and Coffee Lake and Cannon Lake, then 6 cycles on Ice Lake. In the best case (5 cycles), the main loop of crc32_4k_pclmulqdq consumes 64 bytes every 7 cycles, and in the worst case (7 cycles), the main loop of crc32_4k_pclmulqdq consumes 64 bytes every 9 cycles. In contrast, the main loop of crc32_4k_three_way consumes 24 bytes every 3 cycles, which is 72 bytes every 9 cycles. By combining the two main loops, we should be able to consume 728 bytes every 9 cycles. Splitting 4096 into two pieces in a 72:64 ratio gives 2168.47:1927.53, which we'll round to 2176:1920. The _mm_clmulepi64_si128 main loop can work through the 1920, while the crc32_4k_three_way main loop works through the 2176 (as 728+728+720). The resultant code is an unholy fusion of crc32_4k_three_way and crc32_4k_pclmulqdq, which interleaves the code from both:

uint32_t crc32_4k_fusion(uint32_t acc_a, char* buf) {
    // Four chunks:
    //  Chunk A: 728 bytes from 0 through 728
    //  Chunk B: 728 bytes from 728 through 1456
    //  Chunk C: 720 bytes from 1456 through 2176
    //  Chunk D: 1920 bytes from 2176 through 4096
    // First block of 64 from D is easy.
    char* buf2 = buf + 2176;
    __m128i x1 = _mm_loadu_si128((__m128i*)buf2);
    __m128i x2 = _mm_loadu_si128((__m128i*)(buf2 + 16));
    __m128i x3 = _mm_loadu_si128((__m128i*)(buf2 + 32));
    __m128i x4 = _mm_loadu_si128((__m128i*)(buf2 + 48));
    uint32_t acc_b = 0;
    uint32_t acc_c = 0;
    // Parallel fold remaining blocks of 64 from D, and 24 from each of A/B/C.
    // k1 == magic(4*128+32-1)
    // k2 == magic(4*128-32-1)
    __m128i k1k2 = _mm_setr_epi32(/*k1*/ 0x740EEF02, 0, /*k2*/ 0x9E4ADDF8, 0);
    char* end = buf + 4096 - 64;
    do {
        acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)buf);
        __m128i x5 = _mm_clmulepi64_si128(x1, k1k2, 0x00);
        acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + 728));
        x1 = _mm_clmulepi64_si128(x1, k1k2, 0x11);
        acc_c = _mm_crc32_u64(acc_c, *(uint64_t*)(buf + 728*2));
        __m128i x6 = _mm_clmulepi64_si128(x2, k1k2, 0x00);
        acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)(buf + 8));
        x2 = _mm_clmulepi64_si128(x2, k1k2, 0x11);
        acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + 728 + 8));
        __m128i x7 = _mm_clmulepi64_si128(x3, k1k2, 0x00);
        acc_c = _mm_crc32_u64(acc_c, *(uint64_t*)(buf + 728*2 + 8));
        x3 = _mm_clmulepi64_si128(x3, k1k2, 0x11);
        acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)(buf + 16));
        __m128i x8 = _mm_clmulepi64_si128(x4, k1k2, 0x00);
        acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + 728 + 16));
        x4 = _mm_clmulepi64_si128(x4, k1k2, 0x11);
        acc_c = _mm_crc32_u64(acc_c, *(uint64_t*)(buf + 728*2 + 16));
        x5 = _mm_xor_si128(x5, _mm_loadu_si128((__m128i*)(buf2 + 64)));
        x1 = _mm_xor_si128(x1, x5);
        x6 = _mm_xor_si128(x6, _mm_loadu_si128((__m128i*)(buf2 + 80)));
        x2 = _mm_xor_si128(x2, x6);
        x7 = _mm_xor_si128(x7, _mm_loadu_si128((__m128i*)(buf2 + 96)));
        x3 = _mm_xor_si128(x3, x7);
        x8 = _mm_xor_si128(x8, _mm_loadu_si128((__m128i*)(buf2 + 112)));
        x4 = _mm_xor_si128(x4, x8);
        buf2 += 64;
        buf += 24;
    } while (buf2 < end);
    // Next 24 bytes from A/B/C, and 8 more from A/B, then merge A/B/C.
    // Meanwhile, fold together D's four parallel streams.
    // k3 == magic(128+32-1)
    // k4 == magic(128-32-1)
    __m128i k3k4 = _mm_setr_epi32(/*k3*/ 0xF20C0DFE, 0, /*k4*/ 0x493C7D27, 0);
    acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)buf);
    __m128i x5 = _mm_clmulepi64_si128(x1, k3k4, 0x00);
    acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + 728));
    x1 = _mm_clmulepi64_si128(x1, k3k4, 0x11);
    acc_c = _mm_crc32_u64(acc_c, *(uint64_t*)(buf + 728*2));
    __m128i x6 = _mm_clmulepi64_si128(x3, k3k4, 0x00);
    acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)(buf + 8));
    x3 = _mm_clmulepi64_si128(x3, k3k4, 0x11);
    acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + 728 + 8));
    acc_c = _mm_crc32_u64(acc_c, *(uint64_t*)(buf + 728*2 + 8));
    acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)(buf + 16));
    acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + 728 + 16));
    x5 = _mm_xor_si128(x5, x2);
    acc_c = _mm_crc32_u64(acc_c, *(uint64_t*)(buf + 728*2 + 16));
    x1 = _mm_xor_si128(x1, x5);
    acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)(buf + 24));
    // k5 == magic(2*128+32-1)
    // k6 == magic(2*128-32-1)
    __m128i k5k6 = _mm_setr_epi32(/*k5*/ 0x3DA6D0CB, 0, /*k6*/ 0xBA4FC28E, 0);
    x6 = _mm_xor_si128(x6, x4);
    x3 = _mm_xor_si128(x3, x6);
    x5 = _mm_clmulepi64_si128(x1, k5k6, 0x00);
    acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + 728 + 24));
    x1 = _mm_clmulepi64_si128(x1, k5k6, 0x11);
    // kC == magic((        1920)*8-33)
    __m128i kCk0 = _mm_setr_epi32(/*kC*/ 0xF48642E9, 0, 0, 0);
    __m128i vec_c = _mm_clmulepi64_si128(_mm_cvtsi32_si128(acc_c), kCk0, 0x00);
    // kB == magic((    720+1920)*8-33)
    // kA == magic((728+720+1920)*8-33)
    __m128i kAkB = _mm_setr_epi32(/*kA*/ 0x155AD968, 0, /*kB*/ 0x2E7D11A7, 0);
    __m128i vec_a = _mm_clmulepi64_si128(_mm_cvtsi32_si128(acc_a), kAkB, 0x00);
    __m128i vec_b = _mm_clmulepi64_si128(_mm_cvtsi32_si128(acc_b), kAkB, 0x10);
    x5 = _mm_xor_si128(x5, x3);
    x1 = _mm_xor_si128(x1, x5);
    uint64_t abc = _mm_cvtsi128_si64(_mm_xor_si128(_mm_xor_si128(vec_c, vec_a), vec_b));
    // Apply missing <<32 and fold down to 32-bits.
    uint32_t crc = _mm_crc32_u64(0, _mm_extract_epi64(x1, 0));
    crc = _mm_crc32_u64(crc, abc ^ _mm_extract_epi64(x1, 1));
    return crc;
}

To give an idea of relative performance, I ran some very crude benchmarks of every function on a range of servers. The resultant gigabytes per second (GB/s) measurements are given below. As the servers have different clock speeds (both advertised and burst), I've assumed crc32_4k runs at 21.33 bits per cycle (b/c) on each, and used this to translate the other GB/s measurements to b/c.

                    Broadwell @ 3.00GHz  Skylake @ 3.20GHz    Ice Lake @ 2.40GHz
          crc32_4k  10.19 GB/s 21.3 b/c  12.64 GB/s 21.3 b/c  11.59 GB/s 21.3 b/c
crc32_4k_pclmulqdq  23.32 GB/s 48.8 b/c  28.12 GB/s 47.5 b/c  27.45 GB/s 50.5 b/c
crc32_4k_three_way  26.99 GB/s 56.5 b/c  33.26 GB/s 56.1 b/c  29.18 GB/s 53.7 b/c
   crc32_4k_fusion  44.44 GB/s 93.0 b/c  55.65 GB/s 93.9 b/c  50.36 GB/s 92.7 b/c

PS. If running on ARM rather than Intel, see dougallj's Faster CRC32 on the Apple M1.

Update: See https://github.com/corsix/fast-crc32/.