C++ Logo

std-proposals

Advanced search

Re: [std-proposals] Interceptor Function (preserve stack and all registers)

From: Sebastian Wittmeier <wittmeier_at_[hidden]>
Date: Tue, 14 Apr 2026 13:52:18 +0200
If the compiler processes it, there is no need for you to provide assembler. Or for defining how the implementations do it. It is just compiled or even optimized like normal C++ code.   You can (with a C++ standard proposal) just declare two template functions (to be flexible about the parameters/overload), which are called before a call to the marked function and after a call to the marked function. (The question is, whether the marking may be attributes, as it changes the program)   Then you  - get notified about the call,  - can set a debug breakpoint (if you have some location, which is not optimized away)  - can log the parameters  - get notified about the return and return type     Something like _penter, but inserted on the call site. https://learn.microsoft.com/en-us/cpp/build/reference/gh-enable-penter-hook-function   -----Ursprüngliche Nachricht----- Von:Frederick Virchanza Gotham via Std-Proposals <std-proposals_at_[hidden]> Gesendet:Di 14.04.2026 11:42 Betreff:Re: [std-proposals] Interceptor Function (preserve stack and all registers) An:std-proposals_at_[hidden]; CC:Frederick Virchanza Gotham <cauldwell.thomas_at_[hidden]>;  Forget about all the [[musttail]] stuff. I think I've settled on a better way of doing this.  Firstly, here's how you'd write an interceptor:  [[interceptor]] void Logger(void) {   puts("Function called");   goto -> GetProcAddress( LoadLibraryA("graphics.dll"), "RotateMatrix" ); }  When the compiler encounters the above, it will pretend a few things: (1) The function name will have "__core_" prepended to it (i.e. __core_Logger). The name will be mangled, so it can be a template function inside a namespace. 
(2) The signature will be changed to "auto (*)(void) -> void(*)(void)" (3) The "goto ->" will be changed to "return"  So here's what you'll have:  auto __core__Z6Loggerv(void) -> void(*)(void) {   puts("Function called");   return dlsym( dlopen("graphics", RTLD_NOW), "RotateMatrix" ); }  Next the compiler will emit a second function in assembler named '_Z6Loggerv' as follows:  _Z6Loggerv:     save_call_state     call __core__Z6Loggerv     stash_address_in_scratch_register     restore_call_state     jmp *scratch_register  Of course, 'save_call_state' will differ by architecture (e.g. x86_64 Vs aarch64), and also differ by convention (e.g. SystemV Vs msabi). While I can't write 'save_call_state' once for every architecture, I *can* write it so that it works for all known calling conventions for the selected architecture (so for example on x86_32, it will work for cdecl, stdcall, thiscall).  I am implementing this first of all for x86_64. The registers needed for SystemV are a superset of the registers needed for msabi, so I just save all the SystemV registers. Msabi requires 32 bytes of shadow space on the stack before a function call, so I've added that in (it's harmless on SystemV).  Some x86_64 processors only have SSE, some have AVX-256, and some have AVX-512, so at runtime I need to execute the CPUID instruction to find out. Here's the assembler I've got for all this at the moment:      # ============================================================     # PUSH_ARGS_AND_MODE     #     # Saves:     # GP: rax, rdi, rsi, rdx, rcx, r8, r9     # mode tag:     # 1 = SSE (xmm0..xmm7)     # 2 = AVX (ymm0..ymm7)     # 3 = AVX512 (zmm0..zmm7 + k0..k7)     #     # Notes:     # - CPUID is executed every entry.     # - RBX is preserved around CPUID because CPUID clobbers it.     # - RSP starts 8 mod 16 on both SysV and Microsoft x64 entry.     
# - The chosen frame sizes keep RSP 0 mod 16 before calling __core_Logger     # ============================================================      .macro PUSH_ARGS_AND_MODE         # Save inbound GP argument state first.         # Offsets:         # 0 rax         # 8 rdi         # 16 rsi         # 24 rdx         # 32 rcx         # 40 r8         # 48 r9         sub $56, %rsp          mov %rax, 0(%rsp)         mov %rdi, 8(%rsp)         mov %rsi, 16(%rsp)         mov %rdx, 24(%rsp)         mov %rcx, 32(%rsp)         mov %r8, 40(%rsp)         mov %r9, 48(%rsp)          # Preserve RBX around CPUID.         push %rbx          # Detect AVX / AVX-512 each time.         # Default fallback is SSE.         mov $1, %eax         cpuid          bt $27, %ecx # OSXSAVE         jnc .Lpush_sse\@         bt $28, %ecx # AVX         jnc .Lpush_sse\@          xor %ecx, %ecx         xgetbv # XCR0 -> edx:eax         mov %eax, %r10d         and $0x6, %r10d # bits 1,2 => XMM+YMM         cmp $0x6, %r10d         jne .Lpush_sse\@          mov $7, %eax         xor %ecx, %ecx         cpuid          bt $16, %ebx # AVX512F         jnc .Lpush_avx\@          xor %ecx, %ecx         xgetbv         and $0xE6, %eax # bits 1,2,5,6,7         cmp $0xE6, %eax         je .Lpush_avx512\@  .Lpush_avx\@:         pop %rbx          # Layout after second subtraction:         # 0 mode = 2         # 8 ymm0         # 40 ymm1         # 72 ymm2         # 104 ymm3         # 136 ymm4         # 168 ymm5         # 200 ymm6         # 232 ymm7         # 264 pad         # 272 saved GP block (56 bytes)         #         # Additional size = 272, total frame = 328 bytes.         
sub $272, %rsp         movq $2, 0(%rsp)          vmovdqu %ymm0, 8(%rsp)         vmovdqu %ymm1, 40(%rsp)         vmovdqu %ymm2, 72(%rsp)         vmovdqu %ymm3, 104(%rsp)         vmovdqu %ymm4, 136(%rsp)         vmovdqu %ymm5, 168(%rsp)         vmovdqu %ymm6, 200(%rsp)         vmovdqu %ymm7, 232(%rsp)         jmp .Lpush_done\@  .Lpush_avx512\@:         pop %rbx          # Layout after second subtraction:         # 0 mode = 3         # 8 zmm0         # 72 zmm1         # 136 zmm2         # 200 zmm3         # 264 zmm4         # 328 zmm5         # 392 zmm6         # 456 zmm7         # 520 k0         # 528 k1         # 536 k2         # 544 k3         # 552 k4         # 560 k5         # 568 k6         # 576 k7         # 584 pad         # 592 saved GP block (56 bytes)         #         # Additional size = 592, total frame = 648 bytes.         sub $592, %rsp         movq $3, 0(%rsp)          vmovdqu64 %zmm0, 8(%rsp)         vmovdqu64 %zmm1, 72(%rsp)         vmovdqu64 %zmm2, 136(%rsp)         vmovdqu64 %zmm3, 200(%rsp)         vmovdqu64 %zmm4, 264(%rsp)         vmovdqu64 %zmm5, 328(%rsp)         vmovdqu64 %zmm6, 392(%rsp)         vmovdqu64 %zmm7, 456(%rsp)          kmovq %k0, 520(%rsp)         kmovq %k1, 528(%rsp)         kmovq %k2, 536(%rsp)         kmovq %k3, 544(%rsp)         kmovq %k4, 552(%rsp)         kmovq %k5, 560(%rsp)         kmovq %k6, 568(%rsp)         kmovq %k7, 576(%rsp)         jmp .Lpush_done\@  .Lpush_sse\@:         pop %rbx          # Layout after second subtraction:         # 0 mode = 1         # 8 xmm0         # 24 xmm1         # 40 xmm2         # 56 xmm3         # 72 xmm4         # 88 xmm5         # 104 xmm6         # 120 xmm7         # 136 pad         # 144 saved GP block (56 bytes)         #         # Additional size = 144, total frame = 200 bytes.         
sub $144, %rsp         movq $1, 0(%rsp)          movdqu %xmm0, 8(%rsp)         movdqu %xmm1, 24(%rsp)         movdqu %xmm2, 40(%rsp)         movdqu %xmm3, 56(%rsp)         movdqu %xmm4, 72(%rsp)         movdqu %xmm5, 88(%rsp)         movdqu %xmm6, 104(%rsp)         movdqu %xmm7, 120(%rsp)  .Lpush_done\@:     .endm      # ============================================================     # POP_ARGS_AND_JUMP     #     # Expects:     # r11 = final jump target returned by __core_Logger     #     # Restores according to mode tag, then jumps through r11.     # ============================================================      .macro POP_ARGS_AND_JUMP         cmpq $1, 0(%rsp)         je .Lpop_sse\@         cmpq $2, 0(%rsp)         je .Lpop_avx\@         cmpq $3, 0(%rsp)         je .Lpop_avx512\@         ud2  .Lpop_sse\@:         movdqu 8(%rsp), %xmm0         movdqu 24(%rsp), %xmm1         movdqu 40(%rsp), %xmm2         movdqu 56(%rsp), %xmm3         movdqu 72(%rsp), %xmm4         movdqu 88(%rsp), %xmm5         movdqu 104(%rsp), %xmm6         movdqu 120(%rsp), %xmm7          mov 144(%rsp), %rax         mov 152(%rsp), %rdi         mov 160(%rsp), %rsi         mov 168(%rsp), %rdx         mov 176(%rsp), %rcx         mov 184(%rsp), %r8         mov 192(%rsp), %r9          add $200, %rsp         jmp *%r11  .Lpop_avx\@:         vmovdqu 8(%rsp), %ymm0         vmovdqu 40(%rsp), %ymm1         vmovdqu 72(%rsp), %ymm2         vmovdqu 104(%rsp), %ymm3         vmovdqu 136(%rsp), %ymm4         vmovdqu 168(%rsp), %ymm5         vmovdqu 200(%rsp), %ymm6         vmovdqu 232(%rsp), %ymm7          mov 272(%rsp), %rax         mov 280(%rsp), %rdi         mov 288(%rsp), %rsi         mov 296(%rsp), %rdx         mov 304(%rsp), %rcx         mov 312(%rsp), %r8         mov 320(%rsp), %r9          add $328, %rsp         jmp *%r11  .Lpop_avx512\@:         vmovdqu64 8(%rsp), %zmm0         vmovdqu64 72(%rsp), %zmm1         vmovdqu64 136(%rsp), %zmm2         vmovdqu64 200(%rsp), %zmm3         vmovdqu64 
264(%rsp), %zmm4         vmovdqu64 328(%rsp), %zmm5         vmovdqu64 392(%rsp), %zmm6         vmovdqu64 456(%rsp), %zmm7          kmovq 520(%rsp), %k0         kmovq 528(%rsp), %k1         kmovq 536(%rsp), %k2         kmovq 544(%rsp), %k3         kmovq 552(%rsp), %k4         kmovq 560(%rsp), %k5         kmovq 568(%rsp), %k6         kmovq 576(%rsp), %k7          mov 592(%rsp), %rax         mov 600(%rsp), %rdi         mov 608(%rsp), %rsi         mov 616(%rsp), %rdx         mov 624(%rsp), %rcx         mov 632(%rsp), %r8         mov 640(%rsp), %r9          add $648, %rsp         jmp *%r11     .endm      # ============================================================     # Public stub     #     # Microsoft x64 requires 32 bytes of shadow space for the call.     # Reserving it here is harmless on SysV if alignment stays correct.     # ============================================================  Logger:     PUSH_ARGS_AND_MODE     sub $32, %rsp     call __core_Logger     add $32, %rsp     mov %rax, %r11     POP_ARGS_AND_JUMP  So I think I've got a decent plan here for the implementation. This will successfully intercept any function call on x86_64 (even if binaries are mixing calling conventions). -- Std-Proposals mailing list Std-Proposals_at_[hidden] https://lists.isocpp.org/mailman/listinfo.cgi/std-proposals

Received on 2026-04-14 11:53:43