On Sun, 4 Jan 2026, 22:32 Frederick Virchanza Gotham via Std-Proposals, <std-proposals@lists.isocpp.org> wrote:
On Mon, Dec 29, 2025 at 1:35 AM Jonathan Wakely wrote:
>
>
> You need -mcx16 to let GCC use that instruction, and even
> with that option atomic ops will call into libatomic which decides
> at runtime whether to use cmpxchg16b or not.


I've been looking into this a bit. If I build the following program
with GNU g++ trunk x86_64:

    #include <atomic>
    using namespace std;

    // Minimal codegen probe: performs a sequentially-consistent atomic
    // fetch-add of 123 on the 16 bytes that argv[0] points to (viewed as a
    // __uint128_t) and returns the fetched value converted to int.
    // The operation is arbitrary; the snippet exists only so that the
    // compiler's output for a 16-byte atomic read-modify-write can be inspected.
    // NOTE(review): argv[0] points at a character string, so nothing here
    // guarantees 16-byte alignment or 16 bytes of storage -- fine for reading
    // the generated assembly, presumably not meant to be executed.
    int main(int const argc, char **const argv)
    {
        return __atomic_fetch_add( (__uint128_t*)argv[0], 123,
__ATOMIC_SEQ_CST );
    }

The assembly I get back is:

# Compiler output for the program above. The 16-byte atomic add is not
# inlined: main sets up the arguments and calls the out-of-line helper
# __atomic_fetch_add_16, then returns whatever the helper left in eax.
main:
        sub     rsp, 8
# rdi = argv[0], the address the atomic add operates on
        mov     rdi, QWORD PTR [rsi]
# ecx = 5, the memory-order argument (__ATOMIC_SEQ_CST in the source)
        mov     ecx, 5
# rdx:rsi = the 128-bit addend 123 (high half zeroed, low half 123)
        xor     edx, edx
        mov     esi, 123
        call    __atomic_fetch_add_16
        add     rsp, 8
        ret

So now let's write code to see what's inside __atomic_fetch_add_16:

    #include <cstdint>
    #include <cstring>
    #include <cstdlib>
    #include <iomanip>
    #include <iostream>

    #include <dlfcn.h>

    using namespace std;

    // Print N bytes of machine code in hexadecimal
    static void PrintN(char unsigned const *const p, unsigned const n)
    {
    for ( unsigned i = 0; i < n; ++i )
    {
        cout << hex << setfill('0') << setw(2) << (unsigned)p[i] << flush;
    }
    cout << endl;
    }

    // Get the address of __atomic_fetch_add_16
    static auto GetFuncAddress(void)
    {
      void (*p)(void) = nullptr;
      __asm__("leaq __atomic_fetch_add_16@PLT(%%rip), %0" : "=r"(p));
      return p;
    }

    int main()
    {
    auto p = (char unsigned*)GetFuncAddress();

    PrintN(p, 6);

    if ( p[0] != 0xff || p[1] != 0x25 ) return EXIT_FAILURE;

    int32_t offset = 0;
    memcpy( &offset, p + 2, sizeof offset );

    uintptr_t slot = (uintptr_t)(p + 6) + (intptr_t)offset;

    char unsigned *target = nullptr;
    memcpy( &target, (void const*)slot, sizeof target );

    PrintN( target, 120u );

    Dl_info info;
    if ( dladdr((void*)target, &info) )
    {
        cout << "dli_fname: " << (info.dli_fname ? info.dli_fname :
"?") << "\n";
    }
    return 0;
    }

The above program prints out:

   ff25622f0000
   f30f1efa41544989d14989fa4989f053f048830c2400488b07488b570866662e0f1f84000000000066662e0f1f84000000000066662e0f1f84000000000066904c89c14c89cb4889c64889d74801c14811d34989dc4889cb4c89e1f0490fc70a4831c64831d74889f14809f975d2f048830c24005b415cc3
   dli_fname: /opt/compiler-explorer/gcc-15.2.0/lib64/libatomic.so.1

The first line, i.e. "ff25622f0000", is the PLT stub: an indirect jump
(jmp QWORD PTR [rip+0x2f62]) through the 8-byte pointer stored 0x2f62
bytes past the end of this 6-byte instruction.
The second line is the implementation of 16-byte fetch_add, which is
disassembled as follows:

; Disassembly of the 120 bytes printed above (Intel syntax).
; On entry, matching the caller shown earlier: rdi = address of the 16-byte
; object, rdx:rsi = 128-bit addend (high:low). The previous value is
; returned in rdx:rax.
0:  f3 0f 1e fa             endbr64
4:  41 54                   push   r12
; Copy the arguments out of the way: r10 = address, r9:r8 = addend (high:low).
6:  49 89 d1                mov    r9,rdx
9:  49 89 fa                mov    r10,rdi
c:  49 89 f0                mov    r8,rsi
f:  53                      push   rbx
; Locked read-modify-write that ORs 0 into the top of the stack: changes no
; data, used here as a memory barrier before the operation.
10: f0 48 83 0c 24 00       lock or QWORD PTR [rsp],0x0
; Initial guess of the current value, read as two 8-byte halves into rdx:rax.
; The pair of loads is not atomic; cmpxchg16b below validates the guess.
16: 48 8b 07                mov    rax,QWORD PTR [rdi]
19: 48 8b 57 08             mov    rdx,QWORD PTR [rdi+0x8]
; Padding nops so that the retry loop starts at offset 0x40.
1d: 66 66 2e 0f 1f 84 00    data16 nop WORD PTR cs:[rax+rax*1+0x0]
24: 00 00 00 00
28: 66 66 2e 0f 1f 84 00    data16 nop WORD PTR cs:[rax+rax*1+0x0]
2f: 00 00 00 00
33: 66 66 2e 0f 1f 84 00    data16 nop WORD PTR cs:[rax+rax*1+0x0]
3a: 00 00 00 00
3e: 66 90                   xchg   ax,ax
; --- retry loop (target of the jne at 0x6c) ---
; rdi:rsi keeps a copy of the expected value; rbx:rcx = expected + addend,
; computed as a 128-bit add (add low halves, adc high halves).
40: 4c 89 c1                mov    rcx,r8
43: 4c 89 cb                mov    rbx,r9
46: 48 89 c6                mov    rsi,rax
49: 48 89 d7                mov    rdi,rdx
4c: 48 01 c1                add    rcx,rax
4f: 48 11 d3                adc    rbx,rdx
; Swap the halves via r12 so the new value sits in rcx:rbx (high:low), the
; register pair cmpxchg16b stores from.
52: 49 89 dc                mov    r12,rbx
55: 48 89 cb                mov    rbx,rcx
58: 4c 89 e1                mov    rcx,r12
; If [r10] still equals rdx:rax, store rcx:rbx there; otherwise load the
; current contents of [r10] into rdx:rax.
5b: f0 49 0f c7 0a          lock cmpxchg16b OWORD PTR [r10]
; rdx:rax differs from the saved expected value only when the exchange
; failed; in that case go round again with the freshly loaded value.
60: 48 31 c6                xor    rsi,rax
63: 48 31 d7                xor    rdi,rdx
66: 48 89 f1                mov    rcx,rsi
69: 48 09 f9                or     rcx,rdi
6c: 75 d2                   jne    0x40
; Same locked no-op as at 0x10, this time after the operation.
6e: f0 48 83 0c 24 00       lock or QWORD PTR [rsp],0x0
74: 5b                      pop    rbx
75: 41 5c                   pop    r12
77: c3                      ret

You can see the instruction "lock cmpxchg16b" at offset 0x5b in the
listing above.

I've tried a few different compiler flags like '-mcx16',
'-march=x86-64', '-march=x86-64-v2', but they have no effect on the
output of the above program. This is because our program itself
doesn't have any translation units that use cmpxchg16b. The only use
of cmpxchg16b is found inside libatomic which we don't compile (i.e.
it's already available as a binary).

So you don't need to use "-mcx16" in your own programs with the GNU
g++ compiler. It will still use the cmpxchg16b instruction even if you
don't specify '-mcx16'. Just wanted to clear that up.

As I said, using -mcx16 doesn't affect whether __atomic_compare_exchange calls into libatomic or not (it always does). Libatomic then decides at runtime whether to use cmpxchg16b, based on the actual CPU running the program. 

If you run the program on an older CPU, libatomic will not use that instruction.