If the compiler processes it, there is no need for you to provide assembler, or to define how implementations do it.
It is simply compiled, and even optimized, like normal C++ code.
You could (with a C++ standard proposal) just declare two template functions (templates, to stay flexible about the parameters and overloads), one called before a call to the marked function and one called after it; see the sketch below. (The open question is whether the marking may be an attribute, since it changes the behaviour of the program.)
Then you
- get notified about the call,
- can set a debug breakpoint (at a location that is not optimized away),
- can log the parameters,
- get notified about the return and the return value (and hence its type).
Something like _penter, but inserted at the call site:
https://learn.microsoft.com/en-us/cpp/build/reference/gh-enable-penter-hook-function
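
A minimal sketch of the idea in today's C++ (all names here, call_prologue/call_epilogue and the hand-written expansion in caller(), are hypothetical placeholders, not proposed wording):

#include <cstdio>

// Hook templates the user provides; being templates keeps them flexible
// about the parameter types and the return type.
template<class... Args>
void call_prologue(const char* name, const Args&... /*args*/)
{
    std::printf("about to call %s\n", name);   // notification / breakpoint / logging point
}

template<class Ret>
void call_epilogue(const char* name, const Ret& /*result*/)
{
    std::printf("returned from %s\n", name);   // notification about the return (the value carries the type)
}

int RotateMatrix(int degrees) { return degrees % 360; }   // stand-in for a marked function

int caller(int d)
{
    // What the compiler would insert around a call to the marked function:
    call_prologue("RotateMatrix", d);
    int r = RotateMatrix(d);
    call_epilogue("RotateMatrix", r);
    return r;
}

int main() { return caller(450) == 90 ? 0 : 1; }

The compiler would generate the prologue/epilogue calls itself at every call site of a marked function; the sketch just writes them out by hand.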
-----Original Message-----
From: Frederick Virchanza Gotham via Std-Proposals <std-proposals@lists.isocpp.org>
Sent: Tue 14.04.2026 11:42
Subject: Re: [std-proposals] Interceptor Function (preserve stack and all registers)
To: std-proposals@lists.isocpp.org;
CC: Frederick Virchanza Gotham <cauldwell.thomas@gmail.com>;
Forget about all the [[musttail]] stuff. I think I've settled on a better way of doing this.

Firstly, here's how you'd write an interceptor:

[[interceptor]] void Logger(void)
{
    puts("Function called");
    goto -> GetProcAddress( LoadLibraryA("graphics.dll"), "RotateMatrix" );
}

When the compiler encounters the above, it will pretend a few things:

(1) The function name will have "__core_" prepended to it (i.e. __core_Logger). The name will be mangled, so it can be a template function inside a namespace.
(2) The signature will be changed to "auto (*)(void) -> void(*)(void)"
(3) The "goto ->" will be changed to "return"

So here's what you'll have:

auto __core__Z6Loggerv(void) -> void(*)(void)
{
    puts("Function called");
    return dlsym( dlopen("graphics", RTLD_NOW), "RotateMatrix" );
}

Next the compiler will emit a second function in assembler named '_Z6Loggerv' as follows:

_Z6Loggerv:
    save_call_state
    call __core__Z6Loggerv
    stash_address_in_scratch_register
    restore_call_state
    jmp *scratch_register

Of course, 'save_call_state' will differ by architecture (e.g. x86_64 vs aarch64), and also by calling convention (e.g. SystemV vs msabi). While I can't write 'save_call_state' once for every architecture, I *can* write it so that it works for all known calling conventions on the selected architecture (so for example on x86_32, it will work for cdecl, stdcall and thiscall).

I am implementing this first of all for x86_64. The registers needed for SystemV are a superset of the registers needed for msabi, so I just save all the SystemV registers. Msabi requires 32 bytes of shadow space on the stack before a function call, so I've added that in (it's harmless on SystemV).

Some x86_64 processors only have SSE, some have AVX-256, and some have AVX-512, so at runtime I need to execute the CPUID instruction to find out which.
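
Just to show that dispatch in C++ terms first, here's a sketch (assuming the GCC/Clang __builtin_cpu_supports builtin; the stub itself obviously can't use it and does the equivalent checks in assembler, including the OSXSAVE/XGETBV test for OS support of the wide registers):

#include <cstdio>

// Sketch only: picks the mode tag used by the stub below
// (1 = SSE, 2 = AVX, 3 = AVX-512) via the compiler's CPUID helper.
static int select_save_mode()
{
    if (__builtin_cpu_supports("avx512f")) return 3;
    if (__builtin_cpu_supports("avx"))     return 2;
    return 1;   // baseline SSE is always available on x86_64
}

int main()
{
    std::printf("mode = %d\n", select_save_mode());
}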
Here's the assembler I've got for all this at the moment:

# ============================================================
# PUSH_ARGS_AND_MODE
#
# Saves:
#   GP: rax, rdi, rsi, rdx, rcx, r8, r9
#   mode tag:
#     1 = SSE    (xmm0..xmm7)
#     2 = AVX    (ymm0..ymm7)
#     3 = AVX512 (zmm0..zmm7 + k0..k7)
#
# Notes:
#   - CPUID is executed every entry.
#   - RBX is preserved around CPUID because CPUID clobbers it.
#   - RSP starts 8 mod 16 on both SysV and Microsoft x64 entry.
#   - The chosen frame sizes keep RSP 0 mod 16 before calling __core_Logger
# ============================================================
.macro PUSH_ARGS_AND_MODE
    # Save inbound GP argument state first.
    # Offsets:
    #   0  rax
    #   8  rdi
    #   16 rsi
    #   24 rdx
    #   32 rcx
    #   40 r8
    #   48 r9
    sub $56, %rsp
    mov %rax,  0(%rsp)
    mov %rdi,  8(%rsp)
    mov %rsi, 16(%rsp)
    mov %rdx, 24(%rsp)
    mov %rcx, 32(%rsp)
    mov %r8,  40(%rsp)
    mov %r9,  48(%rsp)

    # Preserve RBX around CPUID.
    push %rbx

    # Detect AVX / AVX-512 each time.
    # Default fallback is SSE.
    mov $1, %eax
    cpuid
    bt $27, %ecx          # OSXSAVE
    jnc .Lpush_sse\@
    bt $28, %ecx          # AVX
    jnc .Lpush_sse\@
    xor %ecx, %ecx
    xgetbv                # XCR0 -> edx:eax
    mov %eax, %r10d
    and $0x6, %r10d       # bits 1,2 => XMM+YMM
    cmp $0x6, %r10d
    jne .Lpush_sse\@
    mov $7, %eax
    xor %ecx, %ecx
    cpuid
    bt $16, %ebx          # AVX512F
    jnc .Lpush_avx\@
    xor %ecx, %ecx
    xgetbv
    and $0xE6, %eax       # bits 1,2,5,6,7
    cmp $0xE6, %eax
    je .Lpush_avx512\@

.Lpush_avx\@:
    pop %rbx
    # Layout after second subtraction:
    #   0   mode = 2
    #   8   ymm0
    #   40  ymm1
    #   72  ymm2
    #   104 ymm3
    #   136 ymm4
    #   168 ymm5
    #   200 ymm6
    #   232 ymm7
    #   264 pad
    #   272 saved GP block (56 bytes)
    #
    # Additional size = 272, total frame = 328 bytes.
    sub $272, %rsp
    movq $2, 0(%rsp)
    vmovdqu %ymm0,   8(%rsp)
    vmovdqu %ymm1,  40(%rsp)
    vmovdqu %ymm2,  72(%rsp)
    vmovdqu %ymm3, 104(%rsp)
    vmovdqu %ymm4, 136(%rsp)
    vmovdqu %ymm5, 168(%rsp)
    vmovdqu %ymm6, 200(%rsp)
    vmovdqu %ymm7, 232(%rsp)
    jmp .Lpush_done\@

.Lpush_avx512\@:
    pop %rbx
    # Layout after second subtraction:
    #   0   mode = 3
    #   8   zmm0
    #   72  zmm1
    #   136 zmm2
    #   200 zmm3
    #   264 zmm4
    #   328 zmm5
    #   392 zmm6
    #   456 zmm7
    #   520 k0
    #   528 k1
    #   536 k2
    #   544 k3
    #   552 k4
    #   560 k5
    #   568 k6
    #   576 k7
    #   584 pad
    #   592 saved GP block (56 bytes)
    #
    # Additional size = 592, total frame = 648 bytes.
    sub $592, %rsp
    movq $3, 0(%rsp)
    vmovdqu64 %zmm0,   8(%rsp)
    vmovdqu64 %zmm1,  72(%rsp)
    vmovdqu64 %zmm2, 136(%rsp)
    vmovdqu64 %zmm3, 200(%rsp)
    vmovdqu64 %zmm4, 264(%rsp)
    vmovdqu64 %zmm5, 328(%rsp)
    vmovdqu64 %zmm6, 392(%rsp)
    vmovdqu64 %zmm7, 456(%rsp)
    kmovq %k0, 520(%rsp)
    kmovq %k1, 528(%rsp)
    kmovq %k2, 536(%rsp)
    kmovq %k3, 544(%rsp)
    kmovq %k4, 552(%rsp)
    kmovq %k5, 560(%rsp)
    kmovq %k6, 568(%rsp)
    kmovq %k7, 576(%rsp)
    jmp .Lpush_done\@

.Lpush_sse\@:
    pop %rbx
    # Layout after second subtraction:
    #   0   mode = 1
    #   8   xmm0
    #   24  xmm1
    #   40  xmm2
    #   56  xmm3
    #   72  xmm4
    #   88  xmm5
    #   104 xmm6
    #   120 xmm7
    #   136 pad
    #   144 saved GP block (56 bytes)
    #
    # Additional size = 144, total frame = 200 bytes.
    sub $144, %rsp
    movq $1, 0(%rsp)
    movdqu %xmm0,   8(%rsp)
    movdqu %xmm1,  24(%rsp)
    movdqu %xmm2,  40(%rsp)
    movdqu %xmm3,  56(%rsp)
    movdqu %xmm4,  72(%rsp)
    movdqu %xmm5,  88(%rsp)
    movdqu %xmm6, 104(%rsp)
    movdqu %xmm7, 120(%rsp)

.Lpush_done\@:
.endm

# ============================================================
# POP_ARGS_AND_JUMP
#
# Expects:
#   r11 = final jump target returned by __core_Logger
#
# Restores according to mode tag, then jumps through r11.
# ============================================================
.macro POP_ARGS_AND_JUMP
    cmpq $1, 0(%rsp)
    je .Lpop_sse\@
    cmpq $2, 0(%rsp)
    je .Lpop_avx\@
    cmpq $3, 0(%rsp)
    je .Lpop_avx512\@
    ud2

.Lpop_sse\@:
    movdqu   8(%rsp), %xmm0
    movdqu  24(%rsp), %xmm1
    movdqu  40(%rsp), %xmm2
    movdqu  56(%rsp), %xmm3
    movdqu  72(%rsp), %xmm4
    movdqu  88(%rsp), %xmm5
    movdqu 104(%rsp), %xmm6
    movdqu 120(%rsp), %xmm7
    mov 144(%rsp), %rax
    mov 152(%rsp), %rdi
    mov 160(%rsp), %rsi
    mov 168(%rsp), %rdx
    mov 176(%rsp), %rcx
    mov 184(%rsp), %r8
    mov 192(%rsp), %r9
    add $200, %rsp
    jmp *%r11

.Lpop_avx\@:
    vmovdqu   8(%rsp), %ymm0
    vmovdqu  40(%rsp), %ymm1
    vmovdqu  72(%rsp), %ymm2
    vmovdqu 104(%rsp), %ymm3
    vmovdqu 136(%rsp), %ymm4
    vmovdqu 168(%rsp), %ymm5
    vmovdqu 200(%rsp), %ymm6
    vmovdqu 232(%rsp), %ymm7
    mov 272(%rsp), %rax
    mov 280(%rsp), %rdi
    mov 288(%rsp), %rsi
    mov 296(%rsp), %rdx
    mov 304(%rsp), %rcx
    mov 312(%rsp), %r8
    mov 320(%rsp), %r9
    add $328, %rsp
    jmp *%r11

.Lpop_avx512\@:
    vmovdqu64   8(%rsp), %zmm0
    vmovdqu64  72(%rsp), %zmm1
    vmovdqu64 136(%rsp), %zmm2
    vmovdqu64 200(%rsp), %zmm3
    vmovdqu64 264(%rsp), %zmm4
    vmovdqu64 328(%rsp), %zmm5
    vmovdqu64 392(%rsp), %zmm6
    vmovdqu64 456(%rsp), %zmm7
    kmovq 520(%rsp), %k0
    kmovq 528(%rsp), %k1
    kmovq 536(%rsp), %k2
    kmovq 544(%rsp), %k3
    kmovq 552(%rsp), %k4
    kmovq 560(%rsp), %k5
    kmovq 568(%rsp), %k6
    kmovq 576(%rsp), %k7
    mov 592(%rsp), %rax
    mov 600(%rsp), %rdi
    mov 608(%rsp), %rsi
    mov 616(%rsp), %rdx
    mov 624(%rsp), %rcx
    mov 632(%rsp), %r8
    mov 640(%rsp), %r9
    add $648, %rsp
    jmp *%r11
.endm

# ============================================================
# Public stub
#
# Microsoft x64 requires 32 bytes of shadow space for the call.
# Reserving it here is harmless on SysV if alignment stays correct.
# ============================================================
Logger:
    PUSH_ARGS_AND_MODE
    sub $32, %rsp
    call __core_Logger
    add $32, %rsp
    mov %rax, %r11
    POP_ARGS_AND_JUMP

So I think I've got a decent plan here for the implementation. This will successfully intercept any function call on x86_64 (even if binaries are mixing calling conventions).

--
Std-Proposals mailing list
Std-Proposals@lists.isocpp.org
https://lists.isocpp.org/mailman/listinfo.cgi/std-proposals