Date: Tue, 14 Apr 2026 10:42:20 +0100
Forget about all the [[musttail]] stuff. I think I've settled on a better
way of doing this.
Firstly, here's how you'd write an interceptor:
[[interceptor]] void Logger(void)
{
puts("Function called");
goto -> GetProcAddress( LoadLibraryA("graphics.dll"), "RotateMatrix" );
}
When the compiler encounters the above, it will pretend a few things:
(1) Function name will have "__core_" prepended to it (i.e.
__core_Logger). The name will be mangled, so it can be a template function
inside a namespace.
(2) The signature will be changed to "auto (void) -> void(*)(void)"
(3) The "goto ->" will be changed to "return"
So here's what you'll have:
auto __core__Z6Loggerv(void) -> void(*)(void)
{
puts("Function called");
return dlsym( dlopen("graphics", RTLD_NOW), "RotateMatrix" );
}
Next the compiler will emit a second function in assembler named
'_Z6Loggerv' as follows:
_Z6Loggerv:
save_call_state
call __core__Z6Loggerv
stash_address_in_scratch_register
restore_call_state
jmp *scratch_register
Of course, 'save_call_state' will differ by architecture (e.g. x86_64 Vs
aarch64), and also differ by convention (e.g. SystemV Vs msabi). While I
can't write 'save_call_state' once for every architecture, I *can* write it
so that it works for all known calling conventions for the selected
architecture (so for example on x86_32, it will work for cdecl, stdcall,
thiscall).
I am implementing this first of all for x86_64. The registers needed for
SystemV are a superset of the registers needed for msabi, so I just save
all the SystemV registers. Msabi requires 32 bytes of shadow space on the
stack before a function call, so I've added that in (it's harmless on
SystemV).
Some x86_64 processors only have SSE, some have AVX-256, and some have
AVX-512, so at runtime I need to execute the CPUID instruction to find out.
Here's the assembler I've got for all this at the moment:
# ============================================================
# PUSH_ARGS_AND_MODE
#
# Saves:
# GP: rax, rdi, rsi, rdx, rcx, r8, r9
# mode tag:
# 1 = SSE (xmm0..xmm7)
# 2 = AVX (ymm0..ymm7)
# 3 = AVX512 (zmm0..zmm7 + k0..k7)
#
# Notes:
# - CPUID is executed every entry.
# - RBX is preserved around CPUID because CPUID clobbers it.
# - RSP starts 8 mod 16 on both SysV and Microsoft x64 entry.
# - The chosen frame sizes keep RSP 0 mod 16 before calling
#   __core_Logger
# ============================================================
.macro PUSH_ARGS_AND_MODE
        # Spill the inbound GP argument state of the intercepted call.
        # Block layout (56 bytes):
        #    0 rax   (SysV: al = vector-register count for variadics)
        #    8 rdi  16 rsi  24 rdx  32 rcx  40 r8  48 r9
        # This is a superset of the Microsoft x64 GP argument registers
        # (rcx, rdx, r8, r9), so one layout serves both conventions.
        # RSP is 8 mod 16 on entry; 56 + the 8-byte rbx push below
        # restores 16-byte alignment before the second subtraction.
        sub     $56, %rsp
        mov     %rax, 0(%rsp)
        mov     %rdi, 8(%rsp)
        mov     %rsi, 16(%rsp)
        mov     %rdx, 24(%rsp)
        mov     %rcx, 32(%rsp)
        mov     %r8, 40(%rsp)
        mov     %r9, 48(%rsp)

        # CPUID clobbers eax/ebx/ecx/edx; rbx is callee-saved in both
        # ABIs, so preserve it across the detection sequence.
        push    %rbx

        # Runtime vector-extension detection, executed on every entry.
        # Clobbers r10 as scratch (volatile in both ABIs).
        # Default fallback is SSE.
        mov     $1, %eax
        cpuid
        bt      $27, %ecx               # CPUID.1:ECX.OSXSAVE
        jnc     .Lpush_sse\@
        bt      $28, %ecx               # CPUID.1:ECX.AVX
        jnc     .Lpush_sse\@
        xor     %ecx, %ecx
        xgetbv                          # XCR0 -> edx:eax
        mov     %eax, %r10d
        and     $0x6, %r10d             # XCR0 bits 1,2 => XMM+YMM enabled
        cmp     $0x6, %r10d
        jne     .Lpush_sse\@
        mov     $7, %eax
        xor     %ecx, %ecx
        cpuid
        bt      $16, %ebx               # CPUID.7.0:EBX bit 16 = AVX512F
        jnc     .Lpush_avx\@
        # BUGFIX: kmovq (64-bit mask spill/restore) is an AVX512BW
        # instruction, not AVX512F. Without this extra gate the AVX512
        # path raises #UD on AVX512F-only parts (e.g. Knights Landing).
        bt      $30, %ebx               # CPUID.7.0:EBX bit 30 = AVX512BW
        jnc     .Lpush_avx\@
        xor     %ecx, %ecx
        xgetbv
        and     $0xE6, %eax             # XCR0 bits 1,2,5,6,7
        cmp     $0xE6, %eax
        je      .Lpush_avx512\@
        # Fall through: CPU advertises AVX512 but the OS has not
        # enabled its XSAVE state, so only ymm state is live.
.Lpush_avx\@:
        pop     %rbx
        # AVX frame, additional 272 bytes (total 328; keeps RSP 0 mod 16):
        #   0 mode=2
        #   8/40/72/104/136/168/200/232 ymm0..ymm7 (32 bytes each)
        #   264 pad, 272 saved GP block (56 bytes)
        sub     $272, %rsp
        movq    $2, 0(%rsp)
        vmovdqu %ymm0, 8(%rsp)
        vmovdqu %ymm1, 40(%rsp)
        vmovdqu %ymm2, 72(%rsp)
        vmovdqu %ymm3, 104(%rsp)
        vmovdqu %ymm4, 136(%rsp)
        vmovdqu %ymm5, 168(%rsp)
        vmovdqu %ymm6, 200(%rsp)
        vmovdqu %ymm7, 232(%rsp)
        jmp     .Lpush_done\@
.Lpush_avx512\@:
        pop     %rbx
        # AVX512 frame, additional 592 bytes (total 648):
        #   0 mode=3
        #   8/72/136/200/264/328/392/456 zmm0..zmm7 (64 bytes each)
        #   520..576 k0..k7 (8 bytes each), 584 pad
        #   592 saved GP block (56 bytes)
        sub     $592, %rsp
        movq    $3, 0(%rsp)
        vmovdqu64 %zmm0, 8(%rsp)
        vmovdqu64 %zmm1, 72(%rsp)
        vmovdqu64 %zmm2, 136(%rsp)
        vmovdqu64 %zmm3, 200(%rsp)
        vmovdqu64 %zmm4, 264(%rsp)
        vmovdqu64 %zmm5, 328(%rsp)
        vmovdqu64 %zmm6, 392(%rsp)
        vmovdqu64 %zmm7, 456(%rsp)
        kmovq   %k0, 520(%rsp)          # kmovq requires AVX512BW (checked)
        kmovq   %k1, 528(%rsp)
        kmovq   %k2, 536(%rsp)
        kmovq   %k3, 544(%rsp)
        kmovq   %k4, 552(%rsp)
        kmovq   %k5, 560(%rsp)
        kmovq   %k6, 568(%rsp)
        kmovq   %k7, 576(%rsp)
        jmp     .Lpush_done\@
.Lpush_sse\@:
        pop     %rbx
        # SSE frame, additional 144 bytes (total 200):
        #   0 mode=1
        #   8/24/40/56/72/88/104/120 xmm0..xmm7 (16 bytes each)
        #   136 pad, 144 saved GP block (56 bytes)
        sub     $144, %rsp
        movq    $1, 0(%rsp)
        movdqu  %xmm0, 8(%rsp)          # movdqu: slots are 8 mod 16
        movdqu  %xmm1, 24(%rsp)
        movdqu  %xmm2, 40(%rsp)
        movdqu  %xmm3, 56(%rsp)
        movdqu  %xmm4, 72(%rsp)
        movdqu  %xmm5, 88(%rsp)
        movdqu  %xmm6, 104(%rsp)
        movdqu  %xmm7, 120(%rsp)
.Lpush_done\@:
.endm
# ============================================================
# POP_ARGS_AND_JUMP
#
# Expects:
# r11 = final jump target returned by __core_Logger
#
# Restores according to mode tag, then jumps through r11.
# ============================================================
.macro POP_ARGS_AND_JUMP
# Counterpart of PUSH_ARGS_AND_MODE: dispatch on the mode tag that
# macro stored at 0(%rsp), restore the saved vector + GP argument
# state, release the frame, then tail-jump to the target in r11.
# r11 is volatile (caller-saved) in both SysV and Microsoft x64, so
# clobbering it is invisible to both the caller and the target.
# The offsets below must stay in byte-exact lockstep with the three
# PUSH layouts (SSE: 200-byte frame, AVX: 328, AVX512: 648).
cmpq $1, 0(%rsp)
je .Lpop_sse\@
cmpq $2, 0(%rsp)
je .Lpop_avx\@
cmpq $3, 0(%rsp)
je .Lpop_avx512\@
ud2 # unreachable unless the mode tag was corrupted
.Lpop_sse\@:
# Mode 1: restore xmm0..xmm7 from 8..120, GP block at 144..192.
movdqu 8(%rsp), %xmm0
movdqu 24(%rsp), %xmm1
movdqu 40(%rsp), %xmm2
movdqu 56(%rsp), %xmm3
movdqu 72(%rsp), %xmm4
movdqu 88(%rsp), %xmm5
movdqu 104(%rsp), %xmm6
movdqu 120(%rsp), %xmm7
mov 144(%rsp), %rax
mov 152(%rsp), %rdi
mov 160(%rsp), %rsi
mov 168(%rsp), %rdx
mov 176(%rsp), %rcx
mov 184(%rsp), %r8
mov 192(%rsp), %r9
add $200, %rsp # drop entire SSE frame
# Stack is now exactly as at interception entry: the original
# caller's return address is on top, so the target returns to it.
jmp *%r11
.Lpop_avx\@:
# Mode 2: restore ymm0..ymm7 from 8..232, GP block at 272..320.
vmovdqu 8(%rsp), %ymm0
vmovdqu 40(%rsp), %ymm1
vmovdqu 72(%rsp), %ymm2
vmovdqu 104(%rsp), %ymm3
vmovdqu 136(%rsp), %ymm4
vmovdqu 168(%rsp), %ymm5
vmovdqu 200(%rsp), %ymm6
vmovdqu 232(%rsp), %ymm7
mov 272(%rsp), %rax
mov 280(%rsp), %rdi
mov 288(%rsp), %rsi
mov 296(%rsp), %rdx
mov 304(%rsp), %rcx
mov 312(%rsp), %r8
mov 320(%rsp), %r9
add $328, %rsp # drop entire AVX frame
jmp *%r11
.Lpop_avx512\@:
# Mode 3: restore zmm0..zmm7 from 8..456, k0..k7 from 520..576,
# GP block at 592..640.
# NOTE(review): kmovq needs AVX512BW, not just AVX512F — confirm the
# detection in PUSH_ARGS_AND_MODE gates mode 3 on CPUID.7.0:EBX bit 30,
# otherwise this path #UDs on AVX512F-only parts.
vmovdqu64 8(%rsp), %zmm0
vmovdqu64 72(%rsp), %zmm1
vmovdqu64 136(%rsp), %zmm2
vmovdqu64 200(%rsp), %zmm3
vmovdqu64 264(%rsp), %zmm4
vmovdqu64 328(%rsp), %zmm5
vmovdqu64 392(%rsp), %zmm6
vmovdqu64 456(%rsp), %zmm7
kmovq 520(%rsp), %k0
kmovq 528(%rsp), %k1
kmovq 536(%rsp), %k2
kmovq 544(%rsp), %k3
kmovq 552(%rsp), %k4
kmovq 560(%rsp), %k5
kmovq 568(%rsp), %k6
kmovq 576(%rsp), %k7
mov 592(%rsp), %rax
mov 600(%rsp), %rdi
mov 608(%rsp), %rsi
mov 616(%rsp), %rdx
mov 624(%rsp), %rcx
mov 632(%rsp), %r8
mov 640(%rsp), %r9
add $648, %rsp # drop entire AVX512 frame
jmp *%r11
.endm
# ============================================================
# Public stub
#
# Microsoft x64 requires 32 bytes of shadow space for the call.
# Reserving it here is harmless on SysV if alignment stays correct.
# ============================================================
Logger:
        PUSH_ARGS_AND_MODE              # spill GP + vector arg state, tag mode
        lea     -32(%rsp), %rsp         # Win64 shadow space; 32 is a multiple
                                        # of 16 so SysV alignment is unaffected
        call    __core_Logger           # returns the real target address in rax
        mov     %rax, %r11              # POP_ARGS_AND_JUMP expects target in r11
        lea     32(%rsp), %rsp          # release shadow space
        POP_ARGS_AND_JUMP               # restore arg state, then jmp *%r11
So I think I've got a decent plan here for the implementation. This will
successfully intercept any function call on x86_64 (even if binaries are
mixing calling conventions).
way of doing this.
Firstly, here's how you'd write an interceptor:
[[interceptor]] void Logger(void)
{
puts("Function called");
goto -> GetProcAddress( LoadLibraryA("graphics.dll"), "RotateMatrix" );
}
When the compiler encounters the above, it will pretend a few things:
(1) Function name will have "__core_" prepended to it (i.e.
__core_Logger). The name will be mangled, so it can be a template function
inside a namespace.
(2) The signature will be changed to "auto (void) -> void(*)(void)"
(3) The "goto ->" will be changed to "return"
So here's what you'll have:
auto __core__Z6Loggerv(void) -> void(*)(void)
{
puts("Function called");
return dlsym( dlopen("graphics", RTLD_NOW), "RotateMatrix" );
}
Next the compiler will emit a second function in assembler named
'_Z6Loggerv' as follows:
_Z6Loggerv:
save_call_state
call __core__Z6Loggerv
stash_address_in_scratch_register
restore_call_state
jmp *scratch_register
Of course, 'save_call_state' will differ by architecture (e.g. x86_64 Vs
aarch64), and also differ by convention (e.g. SystemV Vs msabi). While I
can't write 'save_call_state' once for every architecture, I *can* write it
so that it works for all known calling conventions for the selected
architecture (so for example on x86_32, it will work for cdecl, stdcall,
thiscall).
I am implementing this first of all for x86_64. The registers needed for
SystemV are a superset of the registers needed for msabi, so I just save
all the SystemV registers. Msabi requires 32 bytes of shadow space on the
stack before a function call, so I've added that in (it's harmless on
SystemV).
Some x86_64 processors only have SSE, some have AVX-256, and some have
AVX-512, so at runtime I need to execute the CPUID instruction to find out.
Here's the assembler I've got for all this at the moment:
# ============================================================
# PUSH_ARGS_AND_MODE
#
# Saves:
# GP: rax, rdi, rsi, rdx, rcx, r8, r9
# mode tag:
# 1 = SSE (xmm0..xmm7)
# 2 = AVX (ymm0..ymm7)
# 3 = AVX512 (zmm0..zmm7 + k0..k7)
#
# Notes:
# - CPUID is executed every entry.
# - RBX is preserved around CPUID because CPUID clobbers it.
# - RSP starts 8 mod 16 on both SysV and Microsoft x64 entry.
# - The chosen frame sizes keep RSP 0 mod 16 before calling
#   __core_Logger
# ============================================================
.macro PUSH_ARGS_AND_MODE
        # Spill the inbound GP argument state of the intercepted call.
        # Block layout (56 bytes):
        #    0 rax   (SysV: al = vector-register count for variadics)
        #    8 rdi  16 rsi  24 rdx  32 rcx  40 r8  48 r9
        # This is a superset of the Microsoft x64 GP argument registers
        # (rcx, rdx, r8, r9), so one layout serves both conventions.
        # RSP is 8 mod 16 on entry; 56 + the 8-byte rbx push below
        # restores 16-byte alignment before the second subtraction.
        sub     $56, %rsp
        mov     %rax, 0(%rsp)
        mov     %rdi, 8(%rsp)
        mov     %rsi, 16(%rsp)
        mov     %rdx, 24(%rsp)
        mov     %rcx, 32(%rsp)
        mov     %r8, 40(%rsp)
        mov     %r9, 48(%rsp)

        # CPUID clobbers eax/ebx/ecx/edx; rbx is callee-saved in both
        # ABIs, so preserve it across the detection sequence.
        push    %rbx

        # Runtime vector-extension detection, executed on every entry.
        # Clobbers r10 as scratch (volatile in both ABIs).
        # Default fallback is SSE.
        mov     $1, %eax
        cpuid
        bt      $27, %ecx               # CPUID.1:ECX.OSXSAVE
        jnc     .Lpush_sse\@
        bt      $28, %ecx               # CPUID.1:ECX.AVX
        jnc     .Lpush_sse\@
        xor     %ecx, %ecx
        xgetbv                          # XCR0 -> edx:eax
        mov     %eax, %r10d
        and     $0x6, %r10d             # XCR0 bits 1,2 => XMM+YMM enabled
        cmp     $0x6, %r10d
        jne     .Lpush_sse\@
        mov     $7, %eax
        xor     %ecx, %ecx
        cpuid
        bt      $16, %ebx               # CPUID.7.0:EBX bit 16 = AVX512F
        jnc     .Lpush_avx\@
        # BUGFIX: kmovq (64-bit mask spill/restore) is an AVX512BW
        # instruction, not AVX512F. Without this extra gate the AVX512
        # path raises #UD on AVX512F-only parts (e.g. Knights Landing).
        bt      $30, %ebx               # CPUID.7.0:EBX bit 30 = AVX512BW
        jnc     .Lpush_avx\@
        xor     %ecx, %ecx
        xgetbv
        and     $0xE6, %eax             # XCR0 bits 1,2,5,6,7
        cmp     $0xE6, %eax
        je      .Lpush_avx512\@
        # Fall through: CPU advertises AVX512 but the OS has not
        # enabled its XSAVE state, so only ymm state is live.
.Lpush_avx\@:
        pop     %rbx
        # AVX frame, additional 272 bytes (total 328; keeps RSP 0 mod 16):
        #   0 mode=2
        #   8/40/72/104/136/168/200/232 ymm0..ymm7 (32 bytes each)
        #   264 pad, 272 saved GP block (56 bytes)
        sub     $272, %rsp
        movq    $2, 0(%rsp)
        vmovdqu %ymm0, 8(%rsp)
        vmovdqu %ymm1, 40(%rsp)
        vmovdqu %ymm2, 72(%rsp)
        vmovdqu %ymm3, 104(%rsp)
        vmovdqu %ymm4, 136(%rsp)
        vmovdqu %ymm5, 168(%rsp)
        vmovdqu %ymm6, 200(%rsp)
        vmovdqu %ymm7, 232(%rsp)
        jmp     .Lpush_done\@
.Lpush_avx512\@:
        pop     %rbx
        # AVX512 frame, additional 592 bytes (total 648):
        #   0 mode=3
        #   8/72/136/200/264/328/392/456 zmm0..zmm7 (64 bytes each)
        #   520..576 k0..k7 (8 bytes each), 584 pad
        #   592 saved GP block (56 bytes)
        sub     $592, %rsp
        movq    $3, 0(%rsp)
        vmovdqu64 %zmm0, 8(%rsp)
        vmovdqu64 %zmm1, 72(%rsp)
        vmovdqu64 %zmm2, 136(%rsp)
        vmovdqu64 %zmm3, 200(%rsp)
        vmovdqu64 %zmm4, 264(%rsp)
        vmovdqu64 %zmm5, 328(%rsp)
        vmovdqu64 %zmm6, 392(%rsp)
        vmovdqu64 %zmm7, 456(%rsp)
        kmovq   %k0, 520(%rsp)          # kmovq requires AVX512BW (checked)
        kmovq   %k1, 528(%rsp)
        kmovq   %k2, 536(%rsp)
        kmovq   %k3, 544(%rsp)
        kmovq   %k4, 552(%rsp)
        kmovq   %k5, 560(%rsp)
        kmovq   %k6, 568(%rsp)
        kmovq   %k7, 576(%rsp)
        jmp     .Lpush_done\@
.Lpush_sse\@:
        pop     %rbx
        # SSE frame, additional 144 bytes (total 200):
        #   0 mode=1
        #   8/24/40/56/72/88/104/120 xmm0..xmm7 (16 bytes each)
        #   136 pad, 144 saved GP block (56 bytes)
        sub     $144, %rsp
        movq    $1, 0(%rsp)
        movdqu  %xmm0, 8(%rsp)          # movdqu: slots are 8 mod 16
        movdqu  %xmm1, 24(%rsp)
        movdqu  %xmm2, 40(%rsp)
        movdqu  %xmm3, 56(%rsp)
        movdqu  %xmm4, 72(%rsp)
        movdqu  %xmm5, 88(%rsp)
        movdqu  %xmm6, 104(%rsp)
        movdqu  %xmm7, 120(%rsp)
.Lpush_done\@:
.endm
# ============================================================
# POP_ARGS_AND_JUMP
#
# Expects:
# r11 = final jump target returned by __core_Logger
#
# Restores according to mode tag, then jumps through r11.
# ============================================================
.macro POP_ARGS_AND_JUMP
# Counterpart of PUSH_ARGS_AND_MODE: dispatch on the mode tag that
# macro stored at 0(%rsp), restore the saved vector + GP argument
# state, release the frame, then tail-jump to the target in r11.
# r11 is volatile (caller-saved) in both SysV and Microsoft x64, so
# clobbering it is invisible to both the caller and the target.
# The offsets below must stay in byte-exact lockstep with the three
# PUSH layouts (SSE: 200-byte frame, AVX: 328, AVX512: 648).
cmpq $1, 0(%rsp)
je .Lpop_sse\@
cmpq $2, 0(%rsp)
je .Lpop_avx\@
cmpq $3, 0(%rsp)
je .Lpop_avx512\@
ud2 # unreachable unless the mode tag was corrupted
.Lpop_sse\@:
# Mode 1: restore xmm0..xmm7 from 8..120, GP block at 144..192.
movdqu 8(%rsp), %xmm0
movdqu 24(%rsp), %xmm1
movdqu 40(%rsp), %xmm2
movdqu 56(%rsp), %xmm3
movdqu 72(%rsp), %xmm4
movdqu 88(%rsp), %xmm5
movdqu 104(%rsp), %xmm6
movdqu 120(%rsp), %xmm7
mov 144(%rsp), %rax
mov 152(%rsp), %rdi
mov 160(%rsp), %rsi
mov 168(%rsp), %rdx
mov 176(%rsp), %rcx
mov 184(%rsp), %r8
mov 192(%rsp), %r9
add $200, %rsp # drop entire SSE frame
# Stack is now exactly as at interception entry: the original
# caller's return address is on top, so the target returns to it.
jmp *%r11
.Lpop_avx\@:
# Mode 2: restore ymm0..ymm7 from 8..232, GP block at 272..320.
vmovdqu 8(%rsp), %ymm0
vmovdqu 40(%rsp), %ymm1
vmovdqu 72(%rsp), %ymm2
vmovdqu 104(%rsp), %ymm3
vmovdqu 136(%rsp), %ymm4
vmovdqu 168(%rsp), %ymm5
vmovdqu 200(%rsp), %ymm6
vmovdqu 232(%rsp), %ymm7
mov 272(%rsp), %rax
mov 280(%rsp), %rdi
mov 288(%rsp), %rsi
mov 296(%rsp), %rdx
mov 304(%rsp), %rcx
mov 312(%rsp), %r8
mov 320(%rsp), %r9
add $328, %rsp # drop entire AVX frame
jmp *%r11
.Lpop_avx512\@:
# Mode 3: restore zmm0..zmm7 from 8..456, k0..k7 from 520..576,
# GP block at 592..640.
# NOTE(review): kmovq needs AVX512BW, not just AVX512F — confirm the
# detection in PUSH_ARGS_AND_MODE gates mode 3 on CPUID.7.0:EBX bit 30,
# otherwise this path #UDs on AVX512F-only parts.
vmovdqu64 8(%rsp), %zmm0
vmovdqu64 72(%rsp), %zmm1
vmovdqu64 136(%rsp), %zmm2
vmovdqu64 200(%rsp), %zmm3
vmovdqu64 264(%rsp), %zmm4
vmovdqu64 328(%rsp), %zmm5
vmovdqu64 392(%rsp), %zmm6
vmovdqu64 456(%rsp), %zmm7
kmovq 520(%rsp), %k0
kmovq 528(%rsp), %k1
kmovq 536(%rsp), %k2
kmovq 544(%rsp), %k3
kmovq 552(%rsp), %k4
kmovq 560(%rsp), %k5
kmovq 568(%rsp), %k6
kmovq 576(%rsp), %k7
mov 592(%rsp), %rax
mov 600(%rsp), %rdi
mov 608(%rsp), %rsi
mov 616(%rsp), %rdx
mov 624(%rsp), %rcx
mov 632(%rsp), %r8
mov 640(%rsp), %r9
add $648, %rsp # drop entire AVX512 frame
jmp *%r11
.endm
# ============================================================
# Public stub
#
# Microsoft x64 requires 32 bytes of shadow space for the call.
# Reserving it here is harmless on SysV if alignment stays correct.
# ============================================================
Logger:
        PUSH_ARGS_AND_MODE              # spill GP + vector arg state, tag mode
        lea     -32(%rsp), %rsp         # Win64 shadow space; 32 is a multiple
                                        # of 16 so SysV alignment is unaffected
        call    __core_Logger           # returns the real target address in rax
        mov     %rax, %r11              # POP_ARGS_AND_JUMP expects target in r11
        lea     32(%rsp), %rsp          # release shadow space
        POP_ARGS_AND_JUMP               # restore arg state, then jmp *%r11
So I think I've got a decent plan here for the implementation. This will
successfully intercept any function call on x86_64 (even if binaries are
mixing calling conventions).
Received on 2026-04-14 09:42:24
