GSoC status update #3 (strcmp)

01-07-2024

What is strcmp even?

The man page on FreeBSD has the following to say:

STRCMP(3)              FreeBSD Library Functions Manual              STRCMP(3)

NAME
     strcmp, strncmp – compare strings

LIBRARY
     Standard C Library (libc, -lc)

SYNOPSIS
     #include <string.h>

     int
     strcmp(const char *s1, const char *s2);

     int
     strncmp(const char *s1, const char *s2, size_t len);

DESCRIPTION
     The strcmp() and strncmp() functions lexicographically compare the null-
     terminated strings s1 and s2.

     The strncmp() function compares not more than len characters.  Because
     strncmp() is designed for comparing strings rather than binary data,
     characters that appear after a ‘\0’ character are not compared.

RETURN VALUES
     The strcmp() and strncmp() functions return an integer greater than,
     equal to, or less than 0, according as the string s1 is greater than,
     equal to, or less than the string s2.  The comparison is done using
     unsigned characters, so that ‘\200’ is greater than ‘\0’.

SEE ALSO
     bcmp(3), memcmp(3), strcasecmp(3), strcoll(3), strxfrm(3), wcscmp(3)

STANDARDS
     The strcmp() and strncmp() functions conform to ISO/IEC 9899:1990
     (“ISO C90”).

HISTORY
     The strcmp() function first appeared in the Programmer's Workbench
     (PWB/UNIX) and was ported to Version 7 AT&T UNIX; strncmp() first
     appeared in Version 7 AT&T UNIX.

FreeBSD 15.0-CURRENT             April 3, 2022            FreeBSD 15.0-CURRENT
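
For reference, the semantics boil down to a few lines of scalar C (a textbook version with a name of my choosing, not the actual libc source):

int
strcmp_scalar(const char *s1, const char *s2)
{
    /* advance while the strings match and neither has ended */
    while (*s1 == *s2 && *s1 != '\0') {
        s1++;
        s2++;
    }

    /* compare as unsigned char, so that '\200' > '\0' */
    return ((unsigned char)*s1 - (unsigned char)*s2);
}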

Scalar compiler output

strcmp assembly listing

amd64 SIMD implementation

This is the longest one of the bunch, but it’s split into two separate code paths, which makes the code long even though the algorithm itself makes up only about half of the total LoC.

ARCHENTRY(strcmp, baseline)
    /* check if either string crosses a page in the head */
    lea     15(%rdi), %r8d  # end of head
    lea     15(%rsi), %r9d
    mov     %edi, %eax
    mov     %esi, %edx
    xor     %edi, %r8d  # bits that changed between first and last byte
    xor     %esi, %r9d
    and     $~0xf, %rdi # align heads to 16 bytes
    and     $~0xf, %rsi
    or      %r8d, %r9d  # in either RSI or RDI
    and     $0xf, %eax  # offset from alignment
    and     $0xf, %edx
    pxor        %xmm1, %xmm1
    test        $PAGE_SIZE, %r9d # did the page change?
    jz      0f      # if not, take fast path

    /* heads may cross page boundary, avoid unmapped loads */
    movdqa      (%rdi), %xmm0   # load aligned heads
    movdqa      (%rsi), %xmm2
    mov     $-1, %r8d
    mov     $-1, %r9d
    mov     %eax, %ecx
    shl     %cl, %r8d   # string head in XMM0
    mov     %edx, %ecx
    shl     %cl, %r9d   # string head in XMM2
    movdqa      %xmm0, -40(%rsp) # stash copies of the heads on the stack
    movdqa      %xmm2, -24(%rsp)
    pcmpeqb     %xmm1, %xmm0
    pcmpeqb     %xmm1, %xmm2
    pmovmskb    %xmm0, %r10d
    pmovmskb    %xmm2, %r11d
    test        %r8d, %r10d # NUL byte present in first string?
    lea     -40(%rsp), %r8
    cmovz       %rdi, %r8
    test        %r9d, %r11d # NUL byte present in second string?
    lea     -24(%rsp), %r9
    cmovz       %rsi, %r9
    movdqu      (%r8, %rax, 1), %xmm0 # load true (or fake) heads
    movdqu      (%r9, %rdx, 1), %xmm4
    jmp     1f

0:  movdqu      (%rdi, %rax, 1), %xmm0 # load true heads
    movdqu      (%rsi, %rdx, 1), %xmm4
1:  pxor        %xmm2, %xmm2
    pcmpeqb     %xmm0, %xmm2    # NUL byte present?
    pcmpeqb     %xmm0, %xmm4    # which bytes match?
    pandn       %xmm4, %xmm2    # match and not NUL byte?
    pmovmskb    %xmm2, %r9d
    xor     $0xffff, %r9d   # mismatch or NUL byte?
    jnz     .Lhead_mismatch

    /* load head and second chunk */
    movdqa      16(%rdi), %xmm2 # load second chunks
    movdqa      16(%rsi), %xmm3
    sub     %rdx, %rax  # is a&0xf >= b&0xf?
    jb      .Lswapped   # if not, proceed with swapped operands

    neg     %rax
    movdqu      16(%rsi, %rax, 1), %xmm0
    sub     %rdi, %rsi  # express RSI as distance from RDI
    lea     (%rsi, %rax, 1), %rdx # point RDX to offset in second string
    neg     %rax
    pcmpeqb     %xmm3, %xmm1    # ... corresponding to RDI
    pcmpeqb     %xmm2, %xmm0
    pmovmskb    %xmm1, %r8d
    pmovmskb    %xmm0, %r9d
    add     $16, %rdi
    test        %r8d, %r8d
    jnz     .Lnul_found
    xor     $0xffff, %r9d
    jnz     .Lmismatch
    add     $16, %rdi   # advance aligned pointers

    /*
     * During the main loop, the layout of the two strings is something like:
     *
     *          v ------1------ v ------2------ v
     *     RDI:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
     *     RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
     *
     * where v indicates the alignment boundaries and corresponding chunks
     * of the strings have the same letters.  Chunk A has been checked in
     * the previous iteration.  This iteration, we first check that string
     * RSI doesn't end within region 2, then we compare chunk B between the
     * two strings.  As RSI is known not to hold a NUL byte in regions 1
     * and 2 at this point, this also ensures that RDI has not ended yet.
     */
    ALIGN_TEXT
0:  movdqu      (%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
    pxor        %xmm1, %xmm1
    pcmpeqb     (%rdi, %rsi, 1), %xmm1 # end of string in RSI?
    pcmpeqb     (%rdi), %xmm0   # where do the chunks match?
    pmovmskb    %xmm1, %r8d
    pmovmskb    %xmm0, %r9d
    test        %r8d, %r8d
    jnz     .Lnul_found
    xor     $0xffff, %r9d   # any mismatches?
    jnz     .Lmismatch

    /* main loop unrolled twice */
    movdqu      16(%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
    pxor        %xmm1, %xmm1
    pcmpeqb     16(%rdi, %rsi, 1), %xmm1 # end of string in RSI?
    pcmpeqb     16(%rdi), %xmm0 # where do the chunks match?
    pmovmskb    %xmm1, %r8d
    pmovmskb    %xmm0, %r9d
    add     $32, %rdi
    test        %r8d, %r8d
    jnz     .Lnul_found2
    xor     $0xffff, %r9d   # any mismatches?
    jz      0b

    sub     $16, %rdi   # roll back second increment

    /* a mismatch has been found between RDI and RDX */
.Lmismatch:
    tzcnt       %r9d, %r9d  # where is the mismatch?
    add     %rdi, %rdx  # turn RDX from offset to pointer
    movzbl      (%rdx, %r9, 1), %ecx
    movzbl      (%rdi, %r9, 1), %eax
    sub     %ecx, %eax  # difference of the mismatching chars
    ret

    /* mismatch in true heads */
.Lhead_mismatch:
    tzcnt       %r9d, %r9d  # where is the mismatch?
    add     %rax, %rdi  # return to true heads
    add     %rdx, %rsi
    movzbl      (%rdi, %r9, 1), %eax # mismatching characters
    movzbl      (%rsi, %r9, 1), %ecx
    sub     %ecx, %eax
    ret

.Lnul_found2:
    sub     $16, %rdi   # roll back second increment

    /* a NUL has been found in RSI */
.Lnul_found:
    mov     %eax, %ecx
    mov     %r8d, %r10d
    shl     %cl, %r8w   # adjust NUL mask to positions in RDI/RDX
    xor     $0xffff, %r9d   # mask of mismatches
    or      %r8d, %r9d  # NUL bytes also count as mismatches
    jnz     .Lmismatch

    /*
     * (RDI) == (RSI) and NUL is past the string.
     * Compare (RSI) with the corresponding part
     * of the other string until the NUL byte.
     */
    movdqu      (%rdi, %rax, 1), %xmm0
    pcmpeqb     (%rdi, %rsi, 1), %xmm0
    add     %rdi, %rsi  # restore RSI pointer
    add     %rax, %rdi  # point RDI to chunk corresponding to (RSI)
    pmovmskb    %xmm0, %ecx # mask of matches
    not     %ecx        # mask of mismatches
    or      %r10d, %ecx # mask of mismatches or NUL bytes
    tzcnt       %ecx, %ecx  # location of first mismatch
    movzbl      (%rdi, %rcx, 1), %eax
    movzbl      (%rsi, %rcx, 1), %ecx
    sub     %ecx, %eax
    ret

    /*
     * If (a&0xf) < (b&0xf), we do the same thing but with swapped
     * operands.  I found that this performs slightly better than
     * using conditional moves to do the swap branchless.
     */
.Lswapped:
    movdqu      16(%rdi, %rax, 1), %xmm0
    sub     %rsi, %rdi  # express RDI as distance from RSI
    lea     (%rdi, %rax, 1), %rdx # point RDX to offset in RDI corresponding to RSI
    neg     %rax        # make difference positive
    pcmpeqb     %xmm2, %xmm1
    pcmpeqb     %xmm3, %xmm0
    pmovmskb    %xmm1, %r8d
    pmovmskb    %xmm0, %r9d
    add     $16, %rsi   # advance aligned pointers
    test        %r8d, %r8d
    jnz     .Lnul_founds
    xor     $0xffff, %r9d
    jnz     .Lmismatchs
    add     $16, %rsi

    /*
     * During the main loop, the layout of the two strings is something like:
     *
     *          v ------1------ v ------2------ v
     *     RDI:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
     *     RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
     *
     * where v indicates the alignment boundaries and corresponding chunks
     * of the strings have the same letters.  Chunk A has been checked in
     * the previous iteration.  This iteration, we first check that string
     * RSI doesn't end within region 2, then we compare chunk B between the
     * two strings.  As RSI is known not to hold a NUL byte in regions 1
     * and 2 at this point, this also ensures that RDI has not ended yet.
     */
    ALIGN_TEXT
0:  movdqu      (%rsi, %rdx, 1), %xmm0 # chunk of 1st string corresponding to RSI?
    pxor        %xmm1, %xmm1
    pcmpeqb     (%rsi, %rdi, 1), %xmm1 # end of string in RDI?
    pcmpeqb     (%rsi), %xmm0   # where do the chunks match?
    pmovmskb    %xmm1, %r8d
    pmovmskb    %xmm0, %r9d
    test        %r8d, %r8d
    jnz     .Lnul_founds
    xor     $0xffff, %r9d   # any mismatches?
    jnz     .Lmismatchs

    /* main loop unrolled twice */
    movdqu      16(%rsi, %rdx, 1), %xmm0 # chunk of 1st string corresponding to RSI?
    pxor        %xmm1, %xmm1
    pcmpeqb     16(%rsi, %rdi, 1), %xmm1 # end of string in RDI?
    pcmpeqb     16(%rsi), %xmm0 # where do the chunks match?
    pmovmskb    %xmm1, %r8d
    pmovmskb    %xmm0, %r9d
    add     $32, %rsi
    test        %r8d, %r8d
    jnz     .Lnul_found2s
    xor     $0xffff, %r9d   # any mismatches?
    jz      0b

    sub     $16, %rsi   # roll back second increment

    /* a mismatch has been found between RSI and RDX */
.Lmismatchs:
    tzcnt       %r9d, %r9d  # where is the mismatch?
    add     %rsi, %rdx  # turn RDX from offset to pointer
    movzbl      (%rdx, %r9, 1), %eax
    movzbl      (%rsi, %r9, 1), %ecx
    sub     %ecx, %eax  # difference of the mismatching chars
    ret

.Lnul_found2s:
    sub     $16, %rsi   # roll back second increment

    /* a NUL has been found in RDI */
.Lnul_founds:
    mov     %eax, %ecx
    mov     %r8d, %r10d
    shl     %cl, %r8w   # adjust NUL mask to positions in RDI/RDX
    xor     $0xffff, %r9d   # mask of mismatches
    or      %r8d, %r9d  # NUL bytes also count as mismatches
    jnz     .Lmismatchs

    /*
     * (RDI) == (RSI) and NUL is past the string.
     * Compare (RDI) with the corresponding part
     * of the other string until the NUL byte.
     */
    movdqu      (%rsi, %rax, 1), %xmm0
    pcmpeqb     (%rsi, %rdi, 1), %xmm0
    add     %rsi, %rdi  # restore RDI pointer
    add     %rax, %rsi  # point RSI to chunk corresponding to (RDI)
    pmovmskb    %xmm0, %ecx # mask of matches
    not     %ecx        # mask of mismatches
    or      %r10d, %ecx # mask of mismatches or NUL bytes
    tzcnt       %ecx, %ecx  # location of first mismatch
    movzbl      (%rdi, %rcx, 1), %eax
    movzbl      (%rsi, %rcx, 1), %ecx
    sub     %ecx, %eax
    ret
ARCHEND(strcmp, baseline)

fuz described this the following way:

The basic idea is to process the bulk of the string in aligned blocks of 16 bytes such that one string runs ahead and the other runs behind. The string that runs ahead is checked for NUL bytes, the one that runs behind is compared with the corresponding chunk of the string that runs ahead. This trades an extra load per iteration for the very complicated block-reassembly needed in the other implementations (bionic, glibc). On the flip side, we need two code paths depending on the relative alignment of the two buffers.

The initial part of the string is compared directly if it is known not to cross a page boundary. Otherwise, a complex slow path to avoid crossing into unmapped memory commences. [1]
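
To make this concrete, here is a rough sketch of one main-loop iteration in C with NEON intrinsics (hypothetical names; head, page, and tail handling omitted):

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/*
 * One iteration of the scheme described above (a sketch, not the
 * actual libc code).  a is 16-byte aligned; a + dist tracks the
 * second string byte-for-byte; a + ahead is the aligned chunk of the
 * second string covering those bytes and running ahead of them.
 */
static int
chunk_ok(const char *a, ptrdiff_t dist, ptrdiff_t ahead)
{
    uint8x16_t one = vld1q_u8((const uint8_t *)a);           // aligned
    uint8x16_t two = vld1q_u8((const uint8_t *)(a + dist));  // the extra, unaligned load
    uint8x16_t run = vld1q_u8((const uint8_t *)(a + ahead)); // aligned, runs ahead

    uint8x16_t nul = vceqq_u8(run, vdupq_n_u8(0));  // NUL in the string running ahead?
    uint8x16_t neq = vmvnq_u8(vceqq_u8(one, two));  // mismatching bytes?

    // any set lane means we must stop and sort out NUL vs. mismatch
    return (vmaxvq_u8(vorrq_u8(nul, neq)) == 0);
}

As the string running ahead has already been verified NUL-free up to the current chunk, the unaligned load in the middle can never stray beyond it into an unmapped page.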

Translating to AArch64

As we’ve gotten somewhat comfortable with porting the common instructions like PMOVMSKB, we can do a rather simple 1:1 mapping of the amd64 code and optimize it afterwards. The only gripe is the swapping based on the two input strings’ relative offsets to the nearest 16-byte boundary.
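
Porting PMOVMSKB deserves a closer look, since it shapes all the mask handling below: the usual AArch64 substitute is a shrn by 4, which narrows each 16-bit lane of a cmeq result to 4 bits and produces a 64-bit mask with 4 mask bits per byte instead of PMOVMSKB’s one. As a C sketch (the helper name is mine):

#include <arm_neon.h>
#include <stdint.h>

/* narrow a cmeq byte mask to a 64-bit mask with 4 bits per byte */
static inline uint64_t
neon_movemask(uint8x16_t mask)
{
    uint8x8_t nibbles = vshrn_n_u16(vreinterpretq_u16_u8(mask), 4);

    return (vget_lane_u64(vreinterpret_u64_u8(nibbles), 0));
}

The fourfold redundancy is why the listing multiplies byte offsets by 4 before shifting masks (the lsl #2) and divides bit indices by 4 after the rbit/clz (the lsr #2).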

ENTRY(_strcmp)

    bic x8, x0, #0xf            // x8 is x0 but aligned to the boundary
    and x9, x0, #0xf            // x9 is the offset
    bic x10, x1, #0xf           // x10 is x1 but aligned to the boundary
    and x11, x1, #0xf           // x11 is the offset

    /*
     * Check if either string is located at end of page to avoid crossing
     * into unmapped page. If so, we load 16 bytes from the nearest
     * alignment boundary and shift based on the offset.
     */

    add x3, x0, #16         // end of head
    add x4, x1, #16
    eor x3, x3, x0
    eor x4, x4, x1              // bits that changed
    orr x3, x3, x4              // in either str1 or str2
    tst w3, #PAGE_SIZE
    b.eq    .Lbegin

    ldr q0, [x8]                // load aligned head
    ldr q2, [x10]

    mov x3, #0xffffffffffffffff
    mov x4, #0xffffffffffffffff
    lsl x14, x9, #2
    lsl x3, x3, x14         // string head
    lsl x15, x11, #2
    lsl x4, x4, x15

    cmeq    v0.16b, v0.16b, #0
    cmeq    v2.16b, v2.16b, #0

    shrn    v0.8b, v0.8h, #4
    shrn    v2.8b, v2.8h, #4
    fmov    x5, d0
    fmov    x6, d2

    /* heads may cross page boundary, avoid unmapped loads */
    tst x5, x3
    b.eq    0f
    adrp    x2, shift_data
    add x2, x2, :lo12:shift_data
    add x2, x2, x9
    ldr q0, [x8]
    ldr q4, [x2]                // load permutation table
    tbl v0.16b, {v0.16b}, v4.16b
    b       1f
    .p2align 4
0:
    ldr q0, [x0]                // load true head
1:
    tst x6, x4
    b.eq    0f

    adrp    x2, shift_data
    add x2, x2, :lo12:shift_data
    add x2, x2, x11
    ldr q1, [x10]
    ldr q4, [x2]
    tbl v4.16b, {v1.16b}, v4.16b

    b 1f

    .p2align 4
.Lbegin:
    ldr q0, [x0]                // load true heads
0:
    ldr q4, [x1]
1:

    cmeq    v2.16b, v0.16b, #0      // NUL byte present?
    cmeq    v4.16b, v0.16b, v4.16b  // which bytes match?

    not v2.16b, v2.16b
    and v2.16b, v2.16b, v4.16b  // match and not NUL byte

    shrn    v2.8b, v2.8h, #4
    fmov    x5, d2
    mvn x5, x5                  // mismatch or NUL byte?

    cbnz    x5, .Lhead_mismatch

    ldr q2, [x8, #16]           // load second chunk
    ldr q3, [x10, #16]
    subs    x9, x9, x11         // is a&0xf >= b&0xf
    b.lt    .Lswapped               // if not, swap operands
    neg x9, x9
    add x12, x10, x9
    ldr q0, [x12, #16]!
    sub x10, x10, x8
    add x11, x10, x9            // point x11 to offset in second string
    neg x9, x9

    cmeq    v1.16b, v3.16b, #0
    cmeq    v0.16b, v0.16b, v2.16b
    add x8, x8, #16
    shrn    v1.8b, v1.8h, #4
    fmov    x6, d1
    shrn    v0.8b, v0.8h, #4
    fmov    x5, d0
    cbnz    x6, .Lnulfound
    mvn x5, x5
    cbnz    x5, .Lmismatch
    add x8, x8, #16         // advance aligned pointers

    /*
     * During the main loop, the layout of the two strings is something like:
     *
     *          v ------1------ v ------2------ v
     *      X0:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
     *      X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
     *
     * where v indicates the alignment boundaries and corresponding chunks
     * of the strings have the same letters.  Chunk A has been checked in
     * the previous iteration.  This iteration, we first check that string
     * X1 doesn't end within region 2, then we compare chunk B between the
     * two strings.  As X1 is known not to hold a NUL byte in regions 1
     * and 2 at this point, this also ensures that x0 has not ended yet.
     */
    .p2align 4
0:
    ldr q0, [x8, x11]
    ldr q1, [x8, x9]
    ldr q2, [x8]

    cmeq    v1.16b, v1.16b, #0      // end of string?
    cmeq    v0.16b, v0.16b, v2.16b  // do the chunks match?

    shrn    v1.8b, v1.8h, #4
    fmov    x6, d1
    shrn    v0.8b, v0.8h, #4
    fmov    x5, d0
    cbnz    x6, .Lnulfound
    mvn x5, x5                  // any mismatches?
    cbnz    x5, .Lmismatch

    add x8, x8, #16

    ldr q0, [x8, x11]
    ldr q1, [x8, x9]
    ldr q2, [x8]

    add x8, x8, #16
    cmeq    v1.16b, v1.16b, #0
    cmeq    v0.16b, v0.16b, v2.16b

    shrn    v1.8b, v1.8h, #4
    fmov    x6, d1
    shrn    v0.8b, v0.8h, #4
    fmov    x5, d0
    cbnz    x6, .Lnulfound2
    mvn x5, x5
    cbz x5, 0b

    sub x8, x8, #16         // roll back second increment
.Lmismatch:
    rbit    x2, x5
    clz x2, x2                  // index of mismatch
    lsr x2, x2, #2
    add x11, x8, x11

    ldrb    w4, [x8, x2]
    ldrb    w5, [x11, x2]
    sub w0, w4, w5              // difference of the mismatching chars
    ret

.Lnulfound2:
    sub x8, x8, #16

.Lnulfound:
    mov x7, x9
    mov x4, x6

    and x7, x7, #0xf
    lsl x7, x7, #2
    lsl x6, x6, x7              // adjust NUL mask to indices
    mvn x5, x5
    orr x5, x5, x6              // NUL bytes also count as mismatches
    cbnz    x5, .Lmismatch

    /*
     * (x0) == (x1) and NUL is past the string.
     * Compare (x1) with the corresponding part
     * of the other string until the NUL byte.
     */
    ldr q0, [x8, x9]
    ldr q1, [x8, x10]

    cmeq    v1.16b, v0.16b, v1.16b
    shrn    v1.8b, v1.8h, #4
    fmov    x5, d1

    mvn x5, x5
    orr x5, x5, x4

    rbit    x2, x5
    clz x2, x2
    lsr x5, x2, #2

    add x10, x10, x8            // restore x10 pointer
    add x8, x8, x9              // point to corresponding chunk in x0

    ldrb    w4, [x8, x5]
    ldrb    w5, [x10, x5]
    sub w0, w4, w5
    ret

.Lhead_mismatch:
    rbit    x2, x5
    clz x2, x2                  // index of mismatch
    lsr x2, x2, #2
    ldrb    w4, [x0, x2]
    ldrb    w5, [x1, x2]
    sub w0, w4, w5
    ret

    /*
     * If (a&0xf) < (b&0xf), we do the same thing but with swapped
     * operands.  I found that this performs slightly better than
     * using conditional moves to do the swap branchless.
     */
.Lswapped:
    add x12, x8, x9
    ldr q0, [x12, #16]!
    sub x8, x8, x10
    add x11, x8, x9
    neg x9, x9

    cmeq    v1.16b, v2.16b, #0
    cmeq    v0.16b, v0.16b, v3.16b
    add x10, x10, #16
    shrn    v1.8b, v1.8h, #4
    fmov    x6, d1
    shrn    v0.8b, v0.8h, #4
    fmov    x5, d0
    cbnz    x6, .Lnulfounds
    mvn x5, x5
    cbnz    x5, .Lmismatchs
    add x10, x10, #16

    /*
     * During the main loop, the layout of the two strings is something like:
     *
     *          v ------1------ v ------2------ v
     *      X1:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
     *      X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
     *
     * where v indicates the alignment boundaries and corresponding chunks
     * of the strings have the same letters.  Chunk A has been checked in
     * the previous iteration.  This iteration, we first check that string
     * X0 doesn't end within region 2, then we compare chunk B between the
     * two strings.  As X0 is known not to hold a NUL byte in regions 1
     * and 2 at this point, this also ensures that X1 has not ended yet.
     */
    .p2align 4
0:
    ldr q0, [x10, x11]
    ldr q1, [x10, x8]
    ldr q2, [x10]

    cmeq    v1.16b, v1.16b, #0
    cmeq    v0.16b, v0.16b, v2.16b

    shrn    v1.8b, v1.8h, #4
    fmov    x6, d1
    shrn    v0.8b, v0.8h, #4
    fmov    x5, d0
    cbnz    x6, .Lnulfounds
    mvn x5, x5
    cbnz    x5, .Lmismatchs

    add x10, x10, #16

    ldr q0, [x10, x11]
    ldr q1, [x10, x8]
    ldr q2, [x10]

    add x10, x10, #16
    cmeq    v1.16b, v1.16b, #0
    cmeq    v0.16b, v0.16b, v2.16b

    shrn    v1.8b, v1.8h, #4
    fmov    x6, d1
    shrn    v0.8b, v0.8h, #4
    fmov    x5, d0
    cbnz    x6, .Lnulfound2s
    mvn x5, x5
    cbz x5, 0b

    sub x10, x10, #16

.Lmismatchs:
    rbit    x2, x5
    clz x2, x2
    lsr x2, x2, #2
    add x11, x10, x11

    ldrb    w4, [x10, x2]
    ldrb    w5, [x11, x2]
    sub w0, w5, w4
    ret

.Lnulfound2s:
    sub x10, x10, #16
.Lnulfounds:
    mov x7, x9
    mov x4, x6

    and x7, x7, #0xf
    lsl x7, x7, #2
    lsl x6, x6, x7
    mvn x5, x5
    orr x5, x5, x6
    cbnz    x5, .Lmismatchs

    ldr q0, [x10, x9]
    ldr q1, [x10, x8]

    cmeq    v1.16b, v0.16b, v1.16b
    shrn    v1.8b, v1.8h, #4
    fmov    x5, d1

    mvn x5, x5
    orr x5, x5, x4

    rbit    x2, x5
    clz x2, x2
    lsr x5, x2, #2

    add x11, x10, x8
    add x10, x10, x9

    ldrb    w4, [x10, x5]
    ldrb    w5, [x11, x5]
    sub w0, w5, w4
    ret

END(_strcmp)

    .section .rodata
    .p2align 4
shift_data:
    .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    // pad: the loads from shift_data + offset read up to 15 bytes past
    // the indices above; out-of-range table bytes make tbl produce 0
    .fill 16, 1, -1

The main loop of strcmp is implemented the following way:

/*
 * During the main loop, the layout of the two strings is something like:
 *
 *          v ------1------ v ------2------ v
 *     RDI:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
 *     RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
 *
 * where v indicates the alignment boundaries and corresponding chunks
 * of the strings have the same letters.  Chunk A has been checked in
 * the previous iteration.  This iteration, we first check that string
 * RSI doesn't end within region 2, then we compare chunk B between the
 * two strings.  As RSI is known not to hold a NUL byte in regions 1
 * and 2 at this point, this also ensures that RDI has not ended yet.
 */

Stay within your page

Similarly to memcmp, we also need to make sure we don’t read into unmapped pages. This is handled in much the same way: if there is no page boundary within the first 16 bytes, we can go on as usual. Otherwise, we check whether a NUL byte appears in the bytes leading up to the boundary; if it does, we load from the nearest alignment boundary and shift, and if it doesn’t, the string continues into the next page, so the direct load is safe after all.
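
The boundary test itself is cheap. In C, the idea looks like this (PAGE_SIZE hardcoded here for illustration; the amd64 code checks p + 15, the AArch64 code p + 16):

#include <stdint.h>

#define PAGE_SIZE 4096 /* assumed; the real code takes it from the system headers */

/*
 * Does a 16-byte load at p stay within one page?  The XOR exposes the
 * address bits that differ between the first and last byte of the
 * load; if the page-number bit changed, the load crosses a boundary.
 */
static inline int
head_crosses_page(const char *p)
{
    return ((((uintptr_t)p ^ ((uintptr_t)p + 15)) & PAGE_SIZE) != 0);
}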

Improvements

The main loop for checking mismatches and occurrences of NUL bytes is currently implemented using a cmeq followed by shrn, similarly to how I’ve done it in strlen and memcmp. A downside of shrn is that it commonly only utilizes vector pipelines V13 (1 and 3). I’ve experimented with using umaxp to overcome this, as it utilizes vector pipeline V (all the pipelines). I then realized that preloading a vector register with 0, 1, 2, ..., 15 (exactly the values held in the shift_data table above) and or’ing it with the result from cmeq, followed by uminv, immediately produces the index of the first mismatch. Although uminv is also bound to V13, this reduces the scalar instruction count compared to the rbit, clz, lsr dance we have performed previously.

Note: The pipelines utilized by specific instructions vary between processors, but comparing the available Software Optimization Guides [2] from ARM has shown that this is a common implementation.

This could be implemented the following way:

cmeq       v0.16b, v0.16b, #0      // v0 <- 0xFF..FF00FF..FF (0x00 in lane 7)
ldr        q1, shift_data          // q1 <- {0, 1, 2, ..., 15}
orr        v0.16b, v0.16b, v1.16b  // v0 <- 0xFF..FF07FF..FF
uminv      b0, v0.16b              // b0 <- 0x07, index of the first mismatch
fmov       w2, s0
ldrb       w4, [x0, x2]
ldrb       w5, [x1, x2]
sub        w0, w4, w5
ret

Benchmarks

os: FreeBSD
arch: arm64
cpu: ARM Cortex-A76 r4p1
                     │  strcmpARM  │             strcmpSIMD              │
                     │   sec/op    │   sec/op     vs base                │
StrcmpShortAligned     137.6µ ± 1%   113.8µ ± 0%  -17.35% (p=0.000 n=20)
StrcmpMidAligned       37.54µ ± 2%   38.93µ ± 0%   +3.69% (p=0.000 n=20)
StrcmpLongAligned      17.65µ ± 0%   14.67µ ± 0%  -16.89% (p=0.000 n=20)
StrcmpShortUnaligned   183.7µ ± 1%   125.2µ ± 0%  -31.83% (p=0.000 n=20)
StrcmpMidUnaligned     51.74µ ± 0%   38.69µ ± 2%  -25.23% (p=0.000 n=20)
StrcmpLongUnaligned    16.20µ ± 0%   16.12µ ± 0%   -0.50% (p=0.000 n=20)
StrcmpShortQsort       1.511m ± 0%   1.450m ± 0%   -4.05% (p=0.000 n=20)
StrcmpMidQsort         354.1µ ± 0%   345.1µ ± 0%   -2.56% (p=0.000 n=20)
geomean                96.49µ        84.24µ       -12.69%

                     │  strcmpARM   │              strcmpSIMD               │
                     │     B/s      │      B/s       vs base                │
StrcmpShortAligned     866.2Mi ± 1%   1048.0Mi ± 0%  +20.99% (p=0.000 n=20)
StrcmpMidAligned       3.101Gi ± 1%    2.991Gi ± 0%   -3.56% (p=0.000 n=20)
StrcmpLongAligned      6.597Gi ± 0%    7.938Gi ± 0%  +20.33% (p=0.000 n=20)
StrcmpShortUnaligned   649.0Mi ± 1%    952.1Mi ± 0%  +46.70% (p=0.000 n=20)
StrcmpMidUnaligned     2.250Gi ± 0%    3.009Gi ± 2%  +33.74% (p=0.000 n=20)
StrcmpLongUnaligned    7.186Gi ± 0%    7.222Gi ± 0%   +0.50% (p=0.000 n=20)
StrcmpShortQsort       78.89Mi ± 0%    82.22Mi ± 0%   +4.22% (p=0.000 n=20)
StrcmpMidQsort         336.6Mi ± 0%    345.5Mi ± 0%   +2.62% (p=0.000 n=20)
geomean                1.207Gi         1.382Gi       +14.53%

os: FreeBSD
arch: arm64
cpu: ARM Neoverse-V1 r1p1
                     │  strcmpARM   │             strcmpSIMD              │
                     │    sec/op    │   sec/op     vs base                │
StrcmpShortAligned      85.63µ ± 1%   72.88µ ± 1%  -14.88% (p=0.000 n=20)
StrcmpMidAligned        21.59µ ± 5%   17.51µ ± 2%  -18.88% (p=0.000 n=20)
StrcmpLongAligned      13.457µ ± 8%   8.315µ ± 3%  -38.21% (p=0.000 n=20)
StrcmpShortUnaligned   135.96µ ± 0%   85.68µ ± 0%  -36.98% (p=0.000 n=20)
StrcmpMidUnaligned      30.31µ ± 1%   18.64µ ± 1%  -38.49% (p=0.000 n=20)
StrcmpLongUnaligned    13.649µ ± 1%   8.442µ ± 2%  -38.15% (p=0.000 n=20)
StrcmpShortQsort       1173.0µ ± 0%   984.4µ ± 0%  -16.08% (p=0.000 n=20)
StrcmpMidQsort          263.1µ ± 0%   227.8µ ± 0%  -13.38% (p=0.000 n=20)
geomean                 67.51µ        48.79µ       -27.74%

                     │  strcmpARM   │              strcmpSIMD               │
                     │     B/s      │      B/s       vs base                │
StrcmpShortAligned     1.360Gi ± 1%    1.597Gi ± 1%  +17.49% (p=0.000 n=20)
StrcmpMidAligned       5.393Gi ± 5%    6.648Gi ± 2%  +23.28% (p=0.000 n=20)
StrcmpLongAligned      8.651Gi ± 9%   14.001Gi ± 3%  +61.83% (p=0.000 n=20)
StrcmpShortUnaligned   876.8Mi ± 0%   1391.3Mi ± 0%  +58.69% (p=0.000 n=20)
StrcmpMidUnaligned     3.841Gi ± 1%    6.245Gi ± 1%  +62.58% (p=0.000 n=20)
StrcmpLongUnaligned    8.529Gi ± 1%   13.791Gi ± 2%  +61.69% (p=0.000 n=20)
StrcmpShortQsort       101.6Mi ± 0%    121.1Mi ± 0%  +19.16% (p=0.000 n=20)
StrcmpMidQsort         453.2Mi ± 0%    523.2Mi ± 0%  +15.45% (p=0.000 n=20)
geomean                1.724Gi         2.386Gi       +38.39%

References

[1]: https://reviews.freebsd.org/D41971
[2]: https://developer.arm.com/documentation/109898/latest/