C++ Logo

std-proposals

Advanced search

Re: [std-proposals] std::atomic_pointer_pair

From: Frederick Virchanza Gotham <cauldwell.thomas_at_[hidden]>
Date: Sun, 4 Jan 2026 22:33:12 +0000
On Mon, Dec 29, 2025 at 1:35 AM Jonathan Wakely wrote:
>
>
> You need -mcx16 to let GCC use that instruction, and even
> with that option atomic ops will call into libatomic which decides
> at runtime whether to use cmpxchg16b or not.


I've been looking into this a bit. If I build the following program
with GNU g++ trunk x86_64:

    #include <atomic>
    using namespace std;

    // Minimal test case: performs a sequentially consistent 16-byte atomic
    // fetch-and-add via the GCC __atomic builtin so that the code generated
    // for it can be inspected.
    // NOTE(review): looks like this is meant to be compiled, not run -- it
    // treats the program-name string as a 128-bit integer; confirm.
    int main(int const argc, char **const argv)
    {
        // argv[0] is reinterpreted as a pointer to a 128-bit unsigned integer;
        // the value returned by the builtin is converted to int by the return.
        return __atomic_fetch_add( (__uint128_t*)argv[0], 123,
__ATOMIC_SEQ_CST );
    }

The assembly I get back is:

main:
        sub rsp, 8
        mov rdi, QWORD PTR [rsi]
        mov ecx, 5
        xor edx, edx
        mov esi, 123
        call __atomic_fetch_add_16
        add rsp, 8
        ret

So now let's write code to see what's inside __atomic_fetch_add_16:

    #include <cstdint>
    #include <cstring>
    #include <cstdlib>
    #include <iomanip>
    #include <iostream>

    #include <dlfcn.h>

    using namespace std;

    // Print N bytes of machine code in hexadecimal
    static void PrintN(char unsigned const *const p, unsigned const n)
    {
    for ( unsigned i = 0; i < n; ++i )
    {
        cout << hex << setfill('0') << setw(2) << (unsigned)p[i] << flush;
    }
    cout << endl;
    }

    // Get the address of __atomic_fetch_add_16
    static auto GetFuncAddress(void)
    {
      void (*p)(void) = nullptr;
      __asm__("leaq __atomic_fetch_add_16_at_PLT(%%rip), %0" : "=r"(p));
      return p;
    }

    int main()
    {
    auto p = (char unsigned*)GetFuncAddress();

    PrintN(p, 6);

    if ( p[0] != 0xff || p[1] != 0x25 ) return EXIT_FAILURE;

    int32_t offset = 0;
    memcpy( &offset, p + 2, sizeof offset );

    uintptr_t slot = (uintptr_t)(p + 6) + (intptr_t)offset;

    char unsigned *target = nullptr;
    memcpy( &target, (void const*)slot, sizeof target );

    PrintN( target, 120u );

    Dl_info info;
    if ( dladdr((void*)target, &info) )
    {
        cout << "dli_fname: " << (info.dli_fname ? info.dli_fname :
"?") << "\n";
    }
    return 0;
    }

The above program prints out:

   ff25622f0000
   f30f1efa41544989d14989fa4989f053f048830c2400488b07488b570866662e0f1f84000000000066662e0f1f84000000000066662e0f1f84000000000066904c89c14c89cb4889c64889d74801c14811d34989dc4889cb4c89e1f0490fc70a4831c64831d74889f14809f975d2f048830c24005b415cc3
   dli_fname: /opt/compiler-explorer/gcc-15.2.0/lib64/libatomic.so.1

The first line, i.e. "ff25622f0000", is an indirect jump through the
pointer stored at program_counter[ 6u + 0x2f62 ].
The second line is the implementation of 16-byte fetch_add which is
disassembled as follows:

0: f3 0f 1e fa endbr64
4: 41 54 push r12
6: 49 89 d1 mov r9,rdx
9: 49 89 fa mov r10,rdi
c: 49 89 f0 mov r8,rsi
f: 53 push rbx
10: f0 48 83 0c 24 00 lock or QWORD PTR [rsp],0x0
16: 48 8b 07 mov rax,QWORD PTR [rdi]
19: 48 8b 57 08 mov rdx,QWORD PTR [rdi+0x8]
1d: 66 66 2e 0f 1f 84 00 data16 nop WORD PTR cs:[rax+rax*1+0x0]
24: 00 00 00 00
28: 66 66 2e 0f 1f 84 00 data16 nop WORD PTR cs:[rax+rax*1+0x0]
2f: 00 00 00 00
33: 66 66 2e 0f 1f 84 00 data16 nop WORD PTR cs:[rax+rax*1+0x0]
3a: 00 00 00 00
3e: 66 90 xchg ax,ax
40: 4c 89 c1 mov rcx,r8
43: 4c 89 cb mov rbx,r9
46: 48 89 c6 mov rsi,rax
49: 48 89 d7 mov rdi,rdx
4c: 48 01 c1 add rcx,rax
4f: 48 11 d3 adc rbx,rdx
52: 49 89 dc mov r12,rbx
55: 48 89 cb mov rbx,rcx
58: 4c 89 e1 mov rcx,r12
5b: f0 49 0f c7 0a lock cmpxchg16b OWORD PTR [r10]
60: 48 31 c6 xor rsi,rax
63: 48 31 d7 xor rdi,rdx
66: 48 89 f1 mov rcx,rsi
69: 48 09 f9 or rcx,rdi
6c: 75 d2 jne 0x40
6e: f0 48 83 0c 24 00 lock or QWORD PTR [rsp],0x0
74: 5b pop rbx
75: 41 5c pop r12
77: c3 ret

You can see the instruction "lock cmpxchg16b" at offset 5b in the
disassembly above.

I've tried a few different compiler flags like '-mcx16',
'-march=x86-64', '-march=x86-64-v2', but they have no effect on the
output of the above program. This is because our program itself
doesn't have any translation units that use cmpxchg16b. The only use
of cmpxchg16b is found inside libatomic which we don't compile (i.e.
it's already available as a binary).

So you don't need to use "-mcx16" in your own programs with the GNU
g++ compiler. It will still use the cmpxchg16b instruction even if you
don't specify '-mcx16'. Just wanted to clear that up.

Still, I want to patch GNU g++ to add a compiler flag that
places cmpxchg16b inline in the generated code.

Received on 2026-01-04 22:32:32