/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
*/

#include <machine/asm.h>
#include <machine/param.h>

        /*
         * Publish the routine under its standard name: strncmp is declared
         * as a weak symbol and set to the value of __strncmp, the entry
         * point defined below. The code that follows is assembled into the
         * text section.
         */
        .weak   strncmp
        .set    strncmp, __strncmp
        .text

ENTRY(__strncmp)

        bic     x8, x0, #0xf                    // x0 aligned to the boundary
        and     x9, x0, #0xf                    // x9 is the offset
        bic     x10, x1, #0xf                   // x1 aligned to the boundary
        and     x11, x1, #0xf                   // x11 is the offset

        subs    x2, x2, #1
        b.lo    .Lempty

        mov     x13, #-1                        // save constants for later
        mov     x16, #0xf

        /*
         * Check if either string is located at end of page to avoid crossing
         * into unmapped page. If so, we load 16 bytes from the nearest
         * alignment boundary and shift based on the offset.
         */

        add     x3, x0, #16                     // end of head
        add     x4, x1, #16
        eor     x3, x3, x0
        eor     x4, x4, x1                      // bits that changed
        orr     x3, x3, x4                      // in either str1 or str2
        cmp     x2,#16
        b.lo    .Llt16
        tbz     w3, #PAGE_SHIFT, .Lbegin

        ldr     q0, [x8]                        // load aligned head
        ldr     q1, [x10]

        lsl     x14, x9, #2
        lsl     x15, x11, #2
        lsl     x3, x13, x14                    // string head
        lsl     x4, x13, x15

        cmeq    v5.16b, v0.16b, #0
        cmeq    v6.16b, v1.16b, #0

        shrn    v5.8b, v5.8h, #4
        shrn    v6.8b, v6.8h, #4
        fmov    x5, d5
        fmov    x6, d6

        adrp    x14, shift_data
        add     x14, x14, :lo12:shift_data

        /* heads may cross page boundary, avoid unmapped loads */
        tst     x5, x3
        b.eq    0f

        ldr     q4, [x14, x9]                   // load permutation table
        tbl     v0.16b, {v0.16b}, v4.16b

        b       1f
        .p2align 4
0:
        ldr     q0, [x0]                        // load true head
1:
        tst     x6, x4
        b.eq    0f

        ldr     q4, [x14, x11]
        tbl     v4.16b, {v1.16b}, v4.16b

        b 1f

        .p2align 4
.Lbegin:
        ldr     q0, [x0]                        // load true heads
0:
        ldr     q4, [x1]
1:
        cmeq    v2.16b, v0.16b, #0              // NUL byte present?
        cmeq    v4.16b, v0.16b, v4.16b          // which bytes match?

        orn     v2.16b, v2.16b, v4.16b          // mismatch or NUL byte?

        shrn    v2.8b, v2.8h, #4
        fmov    x5, d2

        cbnz    x5, .Lhead_mismatch
        /* load head and second chunk */
        ldr     q2, [x8, #16]                   // load second chunk
        ldr     q3, [x10, #16]

/* Generated by Getz using scpaste at Sat Jan 25 09:24:26 2025 CET. (original) */