Index | Thread | Search

From:
Yuichiro NAITO <naito.yuichiro@gmail.com>
Subject:
Re: lang/luajit add support of IBT for amd64
To:
stu@spacehopper.org
Cc:
ports@openbsd.org
Date:
Wed, 15 Oct 2025 08:52:47 +0900

Download raw body.

Thread
From: Stuart Henderson <stu@spacehopper.org>
Subject: Re: lang/luajit add support of IBT for amd64
Date: Fri, 10 Oct 2025 11:00:55 +0100

> On 2025/10/10 10:11, Stuart Henderson wrote:
>> On 2025/10/10 17:27, Yuichiro NAITO wrote:
>> > Hi, I see that the LuaJIT fails to run on a processor that enables
>> > the IBT (Indirect Branch Tracking) feature since OpenBSD 7.4.
>> 
>> luajit in the ports tree is built with enforcement disabled (see
>> USE_NOBTCFI in the Makefile).

Sorry, I forgot to mention that I had removed the USE_NOBTCFI option in my
ports tree. I was uneasy about LuaJIT running without the IBT feature,
so I removed the option and began inserting endbr instructions.

>> It would be good to be able to remove that, but it would be preferable
>> to get this landed upstream if possible, rather than as patches in
>> the ports tree, which may end up needing to get removed if they
>> conflict with future upstream code changes. (I'll consider adding
>> as patches if there's no interest upstream, but I think that should
>> be tried first).

I sent a pull request upstream, but I haven't received a response yet.

https://github.com/LuaJIT/LuaJIT/pull/1391

> a ports diff would look like this

Thanks for the diff. It works for me on amd64. I don't have an IBT-capable
test environment for i386, but I confirmed that it builds successfully there.
During the build, I saw the following warning message.

```
In file included from lj_asm.c:221:
./lj_emit_x86.h:73:13: warning: unused function 'emit_endbr' [-Wunused-function]
   73 | static void emit_endbr(ASMState *as)
      |             ^~~~~~~~~~
1 warning generated.
```

I would like to update your patch as follows to suppress the warning.

```
diff --git a/lang/luajit/patches/patch-src_lj_asm_c b/lang/luajit/patches/patch-src_lj_asm_c
index cca418187dc..790b1ac5a9a 100644
--- a/lang/luajit/patches/patch-src_lj_asm_c
+++ b/lang/luajit/patches/patch-src_lj_asm_c
@@ -5,7 +5,7 @@ Index: src/lj_asm.c
    spadj = asm_stack_adjust(as);
    as->T->spadjust = (uint16_t)spadj;
    emit_spsub(as, spadj);
-+#if LJ_TARGET_X64
++#if LJ_TARGET_X86ORX64
 +  emit_endbr(as);
 +#endif
    /* Root traces assume a checked stack for the starting proto. */
@@ -16,7 +16,7 @@ Index: src/lj_asm.c
      /* Continue with coalescing to fix up the broken cycle(s). */
    }
 -
-+#if LJ_TARGET_X64
++#if LJ_TARGET_X86ORX64
 +  emit_endbr(as);
 +#endif
    /* Inherit top stack slot already checked by parent trace. */
diff --git a/lang/luajit/patches/patch-src_lj_emit_x86_h b/lang/luajit/patches/patch-src_lj_emit_x86_h
index e706d833607..015356e9cef 100644
--- a/lang/luajit/patches/patch-src_lj_emit_x86_h
+++ b/lang/luajit/patches/patch-src_lj_emit_x86_h
@@ -1,7 +1,7 @@
 Index: src/lj_emit_x86.h
 --- src/lj_emit_x86.h.orig
 +++ src/lj_emit_x86.h
-@@ -70,6 +70,15 @@ static LJ_AINLINE MCode *emit_op(x86Op xo, Reg rr, Reg
+@@ -70,6 +70,13 @@ static LJ_AINLINE MCode *emit_op(x86Op xo, Reg rr, Reg
    return p;
  }
  
@@ -9,8 +9,6 @@ Index: src/lj_emit_x86.h
 +{
 +#if LJ_64
 +  emit_u32(as, 0xfa1e0ff3);      // endbr64
-+#else
-+  emit_u32(as, 0xfb1e0ff3);      // endbr32
 +#endif
 +}
 +
```

The rest of your patch looks OK to me.

> other ports would need syncs to change USE_NOBTCFI and bump revision
> (benchmarks/wrk, games/luanti, games/luasteam, games/openmw,
> games/powder-toy, games/solarus/solarus, mail/rspamd, net/hexchat,
> net/snort, www/luakit, x11/kde-applications/cantor)
> 
> Index: Makefile
> ===================================================================
> RCS file: /cvs/ports/lang/luajit/Makefile,v
> diff -u -p -r1.38 Makefile
> --- Makefile	24 Jul 2025 14:40:51 -0000	1.38
> +++ Makefile	10 Oct 2025 09:53:14 -0000
> @@ -1,6 +1,7 @@
>  # keep arch-defines.mk LUAJIT_ARCHS in sync
>  # bump ports which use PROPERTIES:Mluajit if changing
>  ONLY_FOR_ARCHS = aarch64 arm amd64 i386 powerpc
> +USE_NOBTCFI-aarch64 = Yes
>  #
>  # games/tome4 is using embedded copy of luajit
>  
> @@ -18,6 +19,7 @@ GH_COMMIT =	871db2c84ecefd70a850e03a6c34
>  # epoch time of the commit; easiest found in ${WRKSRC}/.relver of the
>  # git-archive tar
>  V =		2.1.1753364724
> +REVISION=	0
>  
>  COMMENT =	just-in-time compiler for Lua
>  DISTNAME =	LuaJIT-${V}
> @@ -29,8 +31,6 @@ HOMEPAGE =	https://luajit.org/
>  
>  # MIT
>  PERMIT_PACKAGE =	Yes
> -
> -USE_NOBTCFI =	Yes
>  
>  WANTLIB =	c m
>  
> Index: patches/patch-src_lj_asm_c
> ===================================================================
> RCS file: patches/patch-src_lj_asm_c
> diff -N patches/patch-src_lj_asm_c
> --- /dev/null	1 Jan 1970 00:00:00 -0000
> +++ patches/patch-src_lj_asm_c	10 Oct 2025 09:53:14 -0000
> @@ -0,0 +1,26 @@
> +amd64 bti fixes
> +
> +Index: src/lj_asm.c
> +--- src/lj_asm.c.orig
> ++++ src/lj_asm.c
> +@@ -1917,6 +1917,9 @@ static void asm_head_root(ASMState *as)
> +   spadj = asm_stack_adjust(as);
> +   as->T->spadjust = (uint16_t)spadj;
> +   emit_spsub(as, spadj);
> ++#if LJ_TARGET_X64
> ++  emit_endbr(as);
> ++#endif
> +   /* Root traces assume a checked stack for the starting proto. */
> +   as->T->topslot = gcref(as->T->startpt)->pt.framesize;
> + }
> +@@ -2085,7 +2088,9 @@ static void asm_head_side(ASMState *as)
> +     checkmclim(as);
> +     /* Continue with coalescing to fix up the broken cycle(s). */
> +   }
> +-
> ++#if LJ_TARGET_X64
> ++  emit_endbr(as);
> ++#endif
> +   /* Inherit top stack slot already checked by parent trace. */
> +   as->T->topslot = as->parent->topslot;
> +   if (as->topslot > as->T->topslot) {  /* Need to check for higher slot? */
> Index: patches/patch-src_lj_emit_x86_h
> ===================================================================
> RCS file: patches/patch-src_lj_emit_x86_h
> diff -N patches/patch-src_lj_emit_x86_h
> --- /dev/null	1 Jan 1970 00:00:00 -0000
> +++ patches/patch-src_lj_emit_x86_h	10 Oct 2025 09:53:14 -0000
> @@ -0,0 +1,21 @@
> +amd64 bti fixes
> +
> +Index: src/lj_emit_x86.h
> +--- src/lj_emit_x86.h.orig
> ++++ src/lj_emit_x86.h
> +@@ -70,6 +70,15 @@ static LJ_AINLINE MCode *emit_op(x86Op xo, Reg rr, Reg
> +   return p;
> + }
> + 
> ++static void emit_endbr(ASMState *as)
> ++{
> ++#if LJ_64
> ++  emit_u32(as, 0xfa1e0ff3);      // endbr64
> ++#else
> ++  emit_u32(as, 0xfb1e0ff3);      // endbr32
> ++#endif
> ++}
> ++
> + /* op + modrm */
> + #define emit_opm(xo, mode, rr, rb, p, delta) \
> +   (p[(delta)-1] = MODRM((mode), (rr), (rb)), \
> Index: patches/patch-src_vm_x64_dasc
> ===================================================================
> RCS file: patches/patch-src_vm_x64_dasc
> diff -N patches/patch-src_vm_x64_dasc
> --- /dev/null	1 Jan 1970 00:00:00 -0000
> +++ patches/patch-src_vm_x64_dasc	10 Oct 2025 09:53:14 -0000
> @@ -0,0 +1,826 @@
> +amd64 bti fixes
> +
> +Index: src/vm_x64.dasc
> +--- src/vm_x64.dasc.orig
> ++++ src/vm_x64.dasc
> +@@ -192,13 +192,13 @@
> + |//-----------------------------------------------------------------------
> + |
> + |// Instruction headers.
> +-|.macro ins_A; .endmacro
> +-|.macro ins_AD; .endmacro
> +-|.macro ins_AJ; .endmacro
> +-|.macro ins_ABC; movzx RBd, RCH; movzx RCd, RCL; .endmacro
> +-|.macro ins_AB_; movzx RBd, RCH; .endmacro
> +-|.macro ins_A_C; movzx RCd, RCL; .endmacro
> +-|.macro ins_AND; not RD; .endmacro
> ++|.macro ins_A; endbr64; .endmacro
> ++|.macro ins_AD; endbr64; .endmacro
> ++|.macro ins_AJ; endbr64; .endmacro
> ++|.macro ins_ABC; endbr64; movzx RBd, RCH; movzx RCd, RCL; .endmacro
> ++|.macro ins_AB_; endbr64; movzx RBd, RCH; .endmacro
> ++|.macro ins_A_C; endbr64; movzx RCd, RCL; .endmacro
> ++|.macro ins_AND; endbr64; not RD; .endmacro
> + |
> + |// Instruction decode+dispatch. Carefully tuned (nope, lodsd is not faster).
> + |.macro ins_NEXT
> +@@ -387,6 +387,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |//-----------------------------------------------------------------------
> +   |
> +   |->vm_returnp:
> ++  |  endbr64
> +   |  test PCd, FRAME_P
> +   |  jz ->cont_dispatch
> +   |
> +@@ -400,6 +401,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  mov aword [BASE+RA], ITYPE		// Prepend true to results.
> +   |
> +   |->vm_returnc:
> ++  |  endbr64
> +   |  add RDd, 1				// RD = nresults+1
> +   |  jz ->vm_unwind_yield
> +   |  mov MULTRES, RDd
> +@@ -407,6 +409,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  jz ->BC_RET_Z			// Handle regular return to Lua.
> +   |
> +   |->vm_return:
> ++  |  endbr64
> +   |  // BASE = base, RA = resultofs, RD = nresults+1 (= MULTRES), PC = return
> +   |  xor PC, FRAME_C
> +   |  test PCd, FRAME_TYPE
> +@@ -440,11 +443,13 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  mov L:RB->top, BASE
> +   |
> +   |->vm_leave_cp:
> ++  |  endbr64
> +   |  mov RA, SAVE_CFRAME		// Restore previous C frame.
> +   |  mov L:RB->cframe, RA
> +   |  xor eax, eax			// Ok return status for vm_pcall.
> +   |
> +   |->vm_leave_unw:
> ++  |  endbr64
> +   |  restoreregs
> +   |  ret
> +   |
> +@@ -479,20 +484,24 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  jmp <3
> +   |
> +   |->vm_unwind_yield:
> ++  |  endbr64
> +   |  mov al, LUA_YIELD
> +   |  jmp ->vm_unwind_c_eh
> +   |
> +   |->vm_unwind_c:			// Unwind C stack, return from vm_pcall.
> ++  |  endbr64
> +   |  // (void *cframe, int errcode)
> +   |  mov eax, CARG2d			// Error return status for vm_pcall.
> +   |  mov rsp, CARG1
> +   |->vm_unwind_c_eh:			// Landing pad for external unwinder.
> ++  |  endbr64
> +   |  mov L:RB, SAVE_L
> +   |  mov GL:RB, L:RB->glref
> +   |  mov dword GL:RB->vmstate, ~LJ_VMST_C
> +   |  jmp ->vm_leave_unw
> +   |
> +   |->vm_unwind_rethrow:
> ++  |  endbr64
> +   |.if not X64WIN
> +   |  mov CARG1, SAVE_L
> +   |  mov CARG2d, eax
> +@@ -501,10 +510,12 @@ static void build_subroutines(BuildCtx *ctx)
> +   |.endif
> +   |
> +   |->vm_unwind_ff:			// Unwind C stack, return from ff pcall.
> ++  |  endbr64
> +   |  // (void *cframe)
> +   |  and CARG1, CFRAME_RAWMASK
> +   |  mov rsp, CARG1
> +   |->vm_unwind_ff_eh:			// Landing pad for external unwinder.
> ++  |  endbr64
> +   |  mov L:RB, SAVE_L
> +   |  mov RDd, 1+1			// Really 1+2 results, incr. later.
> +   |  mov BASE, L:RB->base
> +@@ -524,14 +535,17 @@ static void build_subroutines(BuildCtx *ctx)
> +   |//-----------------------------------------------------------------------
> +   |
> +   |->vm_growstack_c:			// Grow stack for C function.
> ++  |  endbr64
> +   |  mov CARG2d, LUA_MINSTACK
> +   |  jmp >2
> +   |
> +   |->vm_growstack_v:			// Grow stack for vararg Lua function.
> ++  |  endbr64
> +   |  sub RD, 16				// LJ_FR2
> +   |  jmp >1
> +   |
> +   |->vm_growstack_f:			// Grow stack for fixarg Lua function.
> ++  |  endbr64
> +   |  // BASE = new base, RD = nargs+1, RB = L, PC = first PC
> +   |  lea RD, [BASE+NARGS:RD*8-8]
> +   |1:
> +@@ -560,6 +574,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |//-----------------------------------------------------------------------
> +   |
> +   |->vm_resume:				// Setup C frame and resume thread.
> ++  |  endbr64
> +   |  // (lua_State *L, TValue *base, int nres1 = 0, ptrdiff_t ef = 0)
> +   |  saveregs
> +   |  mov L:RB, CARG1			// Caveat: CARG1 may be RA.
> +@@ -595,6 +610,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  jmp ->vm_return
> +   |
> +   |->vm_pcall:				// Setup protected C frame and enter VM.
> ++  |  endbr64
> +   |  // (lua_State *L, TValue *base, int nres1, ptrdiff_t ef)
> +   |  saveregs
> +   |  mov PCd, FRAME_CP
> +@@ -602,6 +618,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  jmp >1
> +   |
> +   |->vm_call:				// Setup C frame and enter VM.
> ++  |  endbr64
> +   |  // (lua_State *L, TValue *base, int nres1)
> +   |  saveregs
> +   |  mov PCd, FRAME_C
> +@@ -632,15 +649,18 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  add NARGS:RDd, 1			// RD = nargs+1
> +   |
> +   |->vm_call_dispatch:
> ++  |  endbr64
> +   |  mov LFUNC:RB, [RA-16]
> +   |  checkfunc LFUNC:RB, ->vmeta_call	// Ensure KBASE defined and != BASE.
> +   |
> +   |->vm_call_dispatch_f:
> ++  |  endbr64
> +   |  mov BASE, RA
> +   |  ins_call
> +   |  // BASE = new base, RB = func, RD = nargs+1, PC = caller PC
> +   |
> +   |->vm_cpcall:				// Setup protected C frame, call C.
> ++  |  endbr64
> +   |  // (lua_State *L, lua_CFunction func, void *ud, lua_CPFunction cp)
> +   |  saveregs
> +   |  mov L:RB, CARG1			// Caveat: CARG1 may be RA.
> +@@ -675,6 +695,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |//-- Continuation dispatch ----------------------------------------------
> +   |
> +   |->cont_dispatch:
> ++  |  endbr64
> +   |  // BASE = meta base, RA = resultofs, RD = nresults+1 (also in MULTRES)
> +   |  add RA, BASE
> +   |  and PC, -8
> +@@ -706,6 +727,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |.endif
> +   |
> +   |->cont_cat:				// BASE = base, RC = result, RB = mbase
> ++  |  endbr64
> +   |  movzx RAd, PC_RB
> +   |  sub RB, 32
> +   |  lea RA, [BASE+RA*8]
> +@@ -733,6 +755,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |//-- Table indexing metamethods -----------------------------------------
> +   |
> +   |->vmeta_tgets:
> ++  |  endbr64
> +   |  settp STR:RC, LJ_TSTR		// STR:RC = GCstr *
> +   |  mov TMP1, STR:RC
> +   |  lea RC, TMP1
> +@@ -744,6 +767,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  jmp >2
> +   |
> +   |->vmeta_tgetb:
> ++  |  endbr64
> +   |  movzx RCd, PC_RC
> +   |.if DUALNUM
> +   |  setint RC
> +@@ -756,6 +780,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  jmp >1
> +   |
> +   |->vmeta_tgetv:
> ++  |  endbr64
> +   |  movzx RCd, PC_RC			// Reload TValue *k from RC.
> +   |  lea RC, [BASE+RC*8]
> +   |1:
> +@@ -774,6 +799,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  test RC, RC
> +   |  jz >3
> +   |->cont_ra:				// BASE = base, RC = result
> ++  |  endbr64
> +   |  movzx RAd, PC_RA
> +   |  mov RB, [RC]
> +   |  mov [BASE+RA*8], RB
> +@@ -791,6 +817,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  jmp ->vm_call_dispatch_f
> +   |
> +   |->vmeta_tgetr:
> ++  |  endbr64
> +   |  mov CARG1, TAB:RB
> +   |  mov RB, BASE			// Save BASE.
> +   |  mov CARG2d, RCd			// Caveat: CARG2 == BASE
> +@@ -806,6 +833,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |//-----------------------------------------------------------------------
> +   |
> +   |->vmeta_tsets:
> ++  |  endbr64
> +   |  settp STR:RC, LJ_TSTR		// STR:RC = GCstr *
> +   |  mov TMP1, STR:RC
> +   |  lea RC, TMP1
> +@@ -817,6 +845,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  jmp >2
> +   |
> +   |->vmeta_tsetb:
> ++  |  endbr64
> +   |  movzx RCd, PC_RC
> +   |.if DUALNUM
> +   |  setint RC
> +@@ -829,6 +858,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  jmp >1
> +   |
> +   |->vmeta_tsetv:
> ++  |  endbr64
> +   |  movzx RCd, PC_RC			// Reload TValue *k from RC.
> +   |  lea RC, [BASE+RC*8]
> +   |1:
> +@@ -851,6 +881,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  mov RB, [BASE+RA*8]
> +   |  mov [RC], RB
> +   |->cont_nop:				// BASE = base, (RC = result)
> ++  |  endbr64
> +   |  ins_next
> +   |
> +   |3:  // Call __newindex metamethod.
> +@@ -869,6 +900,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  jmp ->vm_call_dispatch_f
> +   |
> +   |->vmeta_tsetr:
> ++  |  endbr64
> +   |.if X64WIN
> +   |  mov L:CARG1, SAVE_L
> +   |  mov CARG3d, RCd
> +@@ -891,6 +923,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |//-- Comparison metamethods ---------------------------------------------
> +   |
> +   |->vmeta_comp:
> ++  |  endbr64
> +   |  movzx RDd, PC_RD
> +   |  movzx RAd, PC_RA
> +   |  mov L:RB, SAVE_L
> +@@ -921,6 +954,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  ins_next
> +   |
> +   |->cont_condt:			// BASE = base, RC = result
> ++  |  endbr64
> +   |  add PC, 4
> +   |  mov ITYPE, [RC]
> +   |  sar ITYPE, 47
> +@@ -929,12 +963,14 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  jmp <6
> +   |
> +   |->cont_condf:			// BASE = base, RC = result
> ++  |  endbr64
> +   |  mov ITYPE, [RC]
> +   |  sar ITYPE, 47
> +   |  cmp ITYPEd, LJ_TISTRUECOND		// Branch if result is false.
> +   |  jmp <4
> +   |
> +   |->vmeta_equal:
> ++  |  endbr64
> +   |  cleartp TAB:RD
> +   |  sub PC, 4
> +   |.if X64WIN
> +@@ -958,6 +994,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  jmp <3
> +   |
> +   |->vmeta_equal_cd:
> ++  |  endbr64
> +   |.if FFI
> +   |  sub PC, 4
> +   |  mov L:RB, SAVE_L
> +@@ -971,6 +1008,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |.endif
> +   |
> +   |->vmeta_istype:
> ++  |  endbr64
> +   |  mov L:RB, SAVE_L
> +   |  mov L:RB->base, BASE		// Caveat: CARG2/CARG3 may be BASE.
> +   |  mov CARG2d, RAd
> +@@ -984,36 +1022,43 @@ static void build_subroutines(BuildCtx *ctx)
> +   |//-- Arithmetic metamethods ---------------------------------------------
> +   |
> +   |->vmeta_arith_vno:
> ++  |  endbr64
> +   |.if DUALNUM
> +   |  movzx RBd, PC_RB
> +   |  movzx RCd, PC_RC
> +   |.endif
> +   |->vmeta_arith_vn:
> ++  |  endbr64
> +   |  lea RC, [KBASE+RC*8]
> +   |  jmp >1
> +   |
> +   |->vmeta_arith_nvo:
> ++  |  endbr64
> +   |.if DUALNUM
> +   |  movzx RBd, PC_RB
> +   |  movzx RCd, PC_RC
> +   |.endif
> +   |->vmeta_arith_nv:
> ++  |  endbr64
> +   |  lea TMPR, [KBASE+RC*8]
> +   |  lea RC, [BASE+RB*8]
> +   |  mov RB, TMPR
> +   |  jmp >2
> +   |
> +   |->vmeta_unm:
> ++  |  endbr64
> +   |  lea RC, [BASE+RD*8]
> +   |  mov RB, RC
> +   |  jmp >2
> +   |
> +   |->vmeta_arith_vvo:
> ++  |  endbr64
> +   |.if DUALNUM
> +   |  movzx RBd, PC_RB
> +   |  movzx RCd, PC_RC
> +   |.endif
> +   |->vmeta_arith_vv:
> ++  |  endbr64
> +   |  lea RC, [BASE+RC*8]
> +   |1:
> +   |  lea RB, [BASE+RB*8]
> +@@ -1046,6 +1091,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |
> +   |  // Call metamethod for binary op.
> +   |->vmeta_binop:
> ++  |  endbr64
> +   |  // BASE = base, RC = new base, stack = cont/func/o1/o2
> +   |  mov RA, RC
> +   |  sub RC, BASE
> +@@ -1055,6 +1101,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  jmp ->vm_call_dispatch
> +   |
> +   |->vmeta_len:
> ++  |  endbr64
> +   |  movzx RDd, PC_RD
> +   |  mov L:RB, SAVE_L
> +   |  mov L:RB->base, BASE
> +@@ -1078,8 +1125,10 @@ static void build_subroutines(BuildCtx *ctx)
> +   |//-- Call metamethod ----------------------------------------------------
> +   |
> +   |->vmeta_call_ra:
> ++  |  endbr64
> +   |  lea RA, [BASE+RA*8+16]
> +   |->vmeta_call:			// Resolve and call __call metamethod.
> ++  |  endbr64
> +   |  // BASE = old base, RA = new base, RC = nargs+1, PC = return
> +   |  mov TMP1d, NARGS:RDd		// Save RA, RC for us.
> +   |  mov RB, RA
> +@@ -1113,6 +1162,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |//-- Argument coercion for 'for' statement ------------------------------
> +   |
> +   |->vmeta_for:
> ++  |  endbr64
> +   |  mov L:RB, SAVE_L
> +   |  mov L:RB->base, BASE
> +   |  mov CARG2, RA			// Caveat: CARG2 == BASE
> +@@ -1132,16 +1182,17 @@ static void build_subroutines(BuildCtx *ctx)
> +   |
> +   |.macro .ffunc, name
> +   |->ff_ .. name:
> ++  | endbr64
> +   |.endmacro
> +   |
> +   |.macro .ffunc_1, name
> +   |->ff_ .. name:
> +-  |  cmp NARGS:RDd, 1+1;  jb ->fff_fallback
> ++  |  endbr64; cmp NARGS:RDd, 1+1;  jb ->fff_fallback
> +   |.endmacro
> +   |
> +   |.macro .ffunc_2, name
> +   |->ff_ .. name:
> +-  |  cmp NARGS:RDd, 2+1;  jb ->fff_fallback
> ++  |  endbr64; cmp NARGS:RDd, 2+1;  jb ->fff_fallback
> +   |.endmacro
> +   |
> +   |.macro .ffunc_n, name, op
> +@@ -1414,6 +1465,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  mov RB, [RD]
> +   |  mov [BASE-8], RB
> +   |->fff_res2:
> ++  |  endbr64
> +   |  mov RDd, 1+2
> +   |  jmp ->fff_res
> +   |2:  // Check for empty hash part first. Otherwise call C function.
> +@@ -1434,6 +1486,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  test RD, RD
> +   |  jnz <1
> +   |->fff_res0:
> ++  |  endbr64
> +   |  mov RDd, 1+0
> +   |  jmp ->fff_res
> +   |
> +@@ -1665,8 +1718,10 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  neg RBd; js >2
> +   |->fff_resbit:
> +   |->fff_resi:
> ++  |  endbr64
> +   |  setint RB
> +   |->fff_resRB:
> ++  |  endbr64
> +   |  mov PC, [BASE-8]
> +   |  mov [BASE-16], RB
> +   |  jmp ->fff_res1
> +@@ -1686,15 +1741,19 @@ static void build_subroutines(BuildCtx *ctx)
> +   |
> +   |.ffunc_n math_sqrt, sqrtsd
> +   |->fff_resxmm0:
> ++  |  endbr64
> +   |  mov PC, [BASE-8]
> +   |  movsd qword [BASE-16], xmm0
> +   |  // fallthrough
> +   |
> +   |->fff_res1:
> ++  |  endbr64
> +   |  mov RDd, 1+1
> +   |->fff_res:
> ++  |  endbr64
> +   |  mov MULTRES, RDd
> +   |->fff_res_:
> ++  |  endbr64
> +   |  test PCd, FRAME_TYPE
> +   |  jnz >7
> +   |5:
> +@@ -1907,6 +1966,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  mov TMPRd, 1
> +   |  lea RD, TMP1			// Points to stack. Little-endian.
> +   |->fff_newstr:
> ++  |  endbr64
> +   |  mov L:RB, SAVE_L
> +   |  mov L:RB->base, BASE
> +   |  mov CARG3d, TMPRd			// Zero-extended to size_t.
> +@@ -1915,6 +1975,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  mov SAVE_PC, PC
> +   |  call extern lj_str_new		// (lua_State *L, char *str, size_t l)
> +   |->fff_resstr:
> ++  |  endbr64
> +   |  // GCstr * returned in eax (RD).
> +   |  mov BASE, L:RB->base
> +   |  mov PC, [BASE-8]
> +@@ -1979,6 +2040,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  jmp <3
> +   |
> +   |->fff_emptystr:  // Range underflow.
> ++  |  endbr64
> +   |  xor TMPRd, TMPRd			// Zero length. Any ptr in RD is ok.
> +   |  jmp <4
> +   |
> +@@ -2090,11 +2152,13 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  jmp ->fff_resbit
> +   |.else
> +   |->fff_resbit:
> ++  |  endbr64
> +   |  cvtsi2sd xmm0, RBd
> +   |  jmp ->fff_resxmm0
> +   |.endif
> +   |
> +   |->fff_fallback_bit_op:
> ++  |  endbr64
> +   |  mov NARGS:RDd, TMPRd		// Restore for fallback
> +   |  jmp ->fff_fallback
> +   |
> +@@ -2125,11 +2189,14 @@ static void build_subroutines(BuildCtx *ctx)
> +   |//-----------------------------------------------------------------------
> +   |
> +   |->fff_fallback_2:
> ++  |  endbr64
> +   |  mov NARGS:RDd, 1+2			// Other args are ignored, anyway.
> +   |  jmp ->fff_fallback
> +   |->fff_fallback_1:
> ++  |  endbr64
> +   |  mov NARGS:RDd, 1+1			// Other args are ignored, anyway.
> +   |->fff_fallback:			// Call fast function fallback handler.
> ++  |  endbr64
> +   |  // BASE = new base, RD = nargs+1
> +   |  mov L:RB, SAVE_L
> +   |  mov PC, [BASE-8]			// Fallback may overwrite PC.
> +@@ -2160,6 +2227,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |
> +   |// Reconstruct previous base for vmeta_call during tailcall.
> +   |->vm_call_tail:
> ++  |  endbr64
> +   |  mov RA, BASE
> +   |  test PCd, FRAME_TYPE
> +   |  jnz >3
> +@@ -2182,6 +2250,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  jmp <1				// Dumb retry (goes through ff first).
> +   |
> +   |->fff_gcstep:			// Call GC step function.
> ++  |  endbr64
> +   |  // BASE = new base, RD = nargs+1
> +   |  pop RB				// Must keep stack at same level.
> +   |  mov TMP1, RB			// Save return address
> +@@ -2207,6 +2276,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |
> +   |->vm_record:				// Dispatch target for recording phase.
> +   |.if JIT
> ++  |  endbr64
> +   |  movzx RDd, byte [DISPATCH+DISPATCH_GL(hookmask)]
> +   |  test RDL, HOOK_VMEVENT		// No recording while in vmevent.
> +   |  jnz >5
> +@@ -2220,12 +2290,14 @@ static void build_subroutines(BuildCtx *ctx)
> +   |.endif
> +   |
> +   |->vm_rethook:			// Dispatch target for return hooks.
> ++  |  endbr64
> +   |  movzx RDd, byte [DISPATCH+DISPATCH_GL(hookmask)]
> +   |  test RDL, HOOK_ACTIVE		// Hook already active?
> +   |  jnz >5
> +   |  jmp >1
> +   |
> +   |->vm_inshook:			// Dispatch target for instr/line hooks.
> ++  |  endbr64
> +   |  movzx RDd, byte [DISPATCH+DISPATCH_GL(hookmask)]
> +   |  test RDL, HOOK_ACTIVE		// Hook already active?
> +   |  jnz >5
> +@@ -2253,6 +2325,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  jmp aword [DISPATCH+OP*8+GG_DISP2STATIC]	// Re-dispatch to static ins.
> +   |
> +   |->cont_hook:				// Continue from hook yield.
> ++  |  endbr64
> +   |  add PC, 4
> +   |  mov RA, [RB-40]
> +   |  mov MULTRES, RAd			// Restore MULTRES for *M ins.
> +@@ -2260,6 +2333,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |
> +   |->vm_hotloop:			// Hot loop counter underflow.
> +   |.if JIT
> ++  |  endbr64
> +   |  mov LFUNC:RB, [BASE-16]		// Same as curr_topL(L).
> +   |  cleartp LFUNC:RB
> +   |  mov RB, LFUNC:RB->pc
> +@@ -2277,6 +2351,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |.endif
> +   |
> +   |->vm_callhook:			// Dispatch target for call hooks.
> ++  |  endbr64
> +   |  mov SAVE_PC, PC
> +   |.if JIT
> +   |  jmp >1
> +@@ -2284,6 +2359,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |
> +   |->vm_hotcall:			// Hot call counter underflow.
> +   |.if JIT
> ++  |  endbr64
> +   |  mov SAVE_PC, PC
> +   |  or PC, 1				// Marker for hot call.
> +   |1:
> +@@ -2312,6 +2388,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |
> +   |->cont_stitch:			// Trace stitching.
> +   |.if JIT
> ++  |  endbr64
> +   |  // BASE = base, RC = result, RB = mbase
> +   |  mov TRACE:ITYPE, [RB-40]		// Save previous trace.
> +   |  cleartp TRACE:ITYPE
> +@@ -2364,6 +2441,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |
> +   |->vm_profhook:			// Dispatch target for profiler hook.
> + #if LJ_HASPROFILE
> ++  |  endbr64
> +   |  mov L:RB, SAVE_L
> +   |  mov L:RB->base, BASE
> +   |  mov CARG2, PC			// Caveat: CARG2 == BASE
> +@@ -2383,6 +2461,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |// The 16 bit exit number is stored with two (sign-extended) push imm8.
> +   |->vm_exit_handler:
> +   |.if JIT
> ++  |  endbr64
> +   |  push r13; push r12
> +   |  push r11; push r10; push r9; push r8
> +   |  push rdi; push rsi; push rbp; lea rbp, [rsp+88]; push rbp
> +@@ -2431,6 +2510,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  jmp >1
> +   |.endif
> +   |->vm_exit_interp:
> ++  |  endbr64
> +   |  // RD = MULTRES or negated error code, BASE, PC and DISPATCH set.
> +   |.if JIT
> +   |  // Restore additional callee-save registers only used in compiled code.
> +@@ -2524,6 +2604,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |.macro vm_round, name, mode, cond
> +   |->name:
> +   |->name .. _sse:
> ++  |  endbr64
> +   |  sseconst_abs xmm2, RD
> +   |  sseconst_2p52 xmm3, RD
> +   |  movaps xmm1, xmm0
> +@@ -2569,6 +2650,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |->vm_mod:
> +   |// Args in xmm0/xmm1, return value in xmm0.
> +   |// Caveat: xmm0-xmm5 and RC (eax) modified!
> ++  |  endbr64
> +   |  movaps xmm5, xmm0
> +   |  divsd xmm0, xmm1
> +   |  sseconst_abs xmm2, RD
> +@@ -2601,6 +2683,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |
> +   |// int lj_vm_cpuid(uint32_t f, uint32_t res[4])
> +   |->vm_cpuid:
> ++  |  endbr64
> +   |  mov eax, CARG1d
> +   |  .if X64WIN; push rsi; mov rsi, CARG2; .endif
> +   |  push rbx
> +@@ -2634,6 +2717,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |// Next idx returned in edx.
> +   |->vm_next:
> +   |.if JIT
> ++  |  endbr64
> +   |  mov NEXT_ASIZE, NEXT_TAB->asize
> +   |1:  // Traverse array part.
> +   |  cmp NEXT_IDX, NEXT_ASIZE;  jae >5
> +@@ -2680,6 +2764,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |//-----------------------------------------------------------------------
> +   |
> +   |->assert_bad_for_arg_type:
> ++  |  endbr64
> + #ifdef LUA_USE_ASSERT
> +   |  int3
> + #endif
> +@@ -2693,6 +2778,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |->vm_ffi_callback:
> +   |.if FFI
> +   |.type CTSTATE, CTState, PC
> ++  |  endbr64
> +   |  saveregs_	// ebp/rbp already saved. ebp now holds global_State *.
> +   |  lea DISPATCH, [ebp+GG_G2DISP]
> +   |  mov CTSTATE, GL:ebp->ctype_state
> +@@ -2736,6 +2822,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |
> +   |->cont_ffi_callback:			// Return from FFI callback.
> +   |.if FFI
> ++  |  endbr64
> +   |  mov L:RA, SAVE_L
> +   |  mov CTSTATE, [DISPATCH+DISPATCH_GL(ctype_state)]
> +   |  mov aword CTSTATE->L, L:RA
> +@@ -2753,7 +2840,7 @@ static void build_subroutines(BuildCtx *ctx)
> +   |  // Caveat: needs special frame unwinding, see below.
> +   |.if FFI
> +   |  .type CCSTATE, CCallState, rbx
> +-  |  push rbp; mov rbp, rsp; push rbx; mov CCSTATE, CARG1
> ++  |  endbr64; push rbp; mov rbp, rsp; push rbx; mov CCSTATE, CARG1
> +   |
> +   |  // Readjust stack.
> +   |  mov eax, CCSTATE->spadj
> +@@ -3221,6 +3308,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo
> +     |3:
> + #endif
> +     |->BC_LEN_Z:
> ++    |  endbr64
> +     |  mov RB, BASE			// Save BASE.
> +     |  call extern lj_tab_len		// (GCtab *t)
> +     |  // Length of table returned in eax (RD).
> +@@ -3341,6 +3429,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo
> +   case BC_MODVN:
> +     |  ins_arithpre movsd, xmm1
> +     |->BC_MODVN_Z:
> ++    |  endbr64
> +     |  call ->vm_mod
> +     |  ins_arithpost
> +     |  ins_next
> +@@ -3367,6 +3456,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo
> +     |  mov CARG3d, RCd
> +     |  sub CARG3d, RBd
> +     |->BC_CAT_Z:
> ++    |  endbr64
> +     |  mov L:RB, L:CARG1
> +     |  mov SAVE_PC, PC
> +     |  call extern lj_meta_cat		// (lua_State *L, TValue *top, int left)
> +@@ -3701,6 +3791,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo
> +     |  mov STR:RC, [KBASE+RC*8]
> +     |  checktab TAB:RB, ->vmeta_tgets
> +     |->BC_TGETS_Z:	// RB = GCtab *, RC = GCstr *
> ++    |  endbr64
> +     |  mov TMPRd, TAB:RB->hmask
> +     |  and TMPRd, STR:RC->sid
> +     |  imul TMPRd, #NODE
> +@@ -3771,8 +3862,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo
> +     |  add RC, TAB:RB->array
> +     |  // Get array slot.
> +     |->BC_TGETR_Z:
> ++    |  endbr64
> +     |  mov ITYPE, [RC]
> +     |->BC_TGETR2_Z:
> ++    |  endbr64
> +     |  mov [BASE+RA*8], ITYPE
> +     |  ins_next
> +     break;
> +@@ -3833,6 +3926,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo
> +     |  mov STR:RC, [KBASE+RC*8]
> +     |  checktab TAB:RB, ->vmeta_tsets
> +     |->BC_TSETS_Z:	// RB = GCtab *, RC = GCstr *
> ++    |  endbr64
> +     |  mov TMPRd, TAB:RB->hmask
> +     |  and TMPRd, STR:RC->sid
> +     |  imul TMPRd, #NODE
> +@@ -3940,6 +4034,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo
> +     |  add RC, TAB:RB->array
> +     |  // Set array slot.
> +     |->BC_TSETR_Z:
> ++    |  endbr64
> +     |  mov ITYPE, [BASE+RA*8]
> +     |  mov [RC], ITYPE
> +     |  ins_next
> +@@ -4021,6 +4116,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo
> +     |  mov LFUNC:RB, [RA-16]
> +     |  checktp_nc LFUNC:RB, LJ_TFUNC, ->vmeta_call
> +     |->BC_CALLT_Z:
> ++    |  endbr64
> +     |  mov PC, [BASE-8]
> +     |  test PCd, FRAME_TYPE
> +     |  jnz >7
> +@@ -4087,6 +4183,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo
> + 
> +   case BC_ITERN:
> +     |.if JIT
> ++    |  endbr64
> +     |  hotloop RBd
> +     |.endif
> +     |->vm_IITERN:
> +@@ -4267,6 +4364,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo
> +     switch (op) {
> +     case BC_RET:
> +       |->BC_RET_Z:
> ++      |  endbr64
> +       |  mov KBASE, BASE		// Use KBASE for result move.
> +       |  sub RDd, 1
> +       |  jz >3
> +@@ -4284,10 +4382,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo
> +       |  ja >6
> +       break;
> +     case BC_RET1:
> ++      |  endbr64
> +       |  mov RB, [BASE+RA]
> +       |  mov [BASE-16], RB
> +       /* fallthrough */
> +     case BC_RET0:
> ++      |  endbr64
> +       |5:
> +       |  cmp PC_RB, RDL			// More results expected?
> +       |  ja >6
> +@@ -4334,6 +4434,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo
> + 
> +   case BC_FORL:
> +     |.if JIT
> ++    |  endbr64
> +     |  hotloop RBd
> +     |.endif
> +     | // Fall through. Assumes BC_IFORL follows and ins_AJ is a no-op.
> +@@ -4342,6 +4443,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo
> +   case BC_JFORI:
> +   case BC_JFORL:
> + #if !LJ_HASJIT
> ++    |  endbr64
> +     break;
> + #endif
> +   case BC_FORI:
> +@@ -4485,6 +4587,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo
> + 
> +   case BC_ITERL:
> +     |.if JIT
> ++    |  endbr64
> +     |  hotloop RBd
> +     |.endif
> +     | // Fall through. Assumes BC_IITERL follows and ins_AJ is a no-op.
> +@@ -4492,6 +4595,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo
> + 
> +   case BC_JITERL:
> + #if !LJ_HASJIT
> ++    |  endbr64
> +     break;
> + #endif
> +   case BC_IITERL:
> +@@ -4578,13 +4682,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo
> + 
> +   case BC_FUNCF:
> +     |.if JIT
> ++    |  endbr64
> +     |  hotcall RBd
> +     |.endif
> +   case BC_FUNCV:  /* NYI: compiled vararg functions. */
> +     | // Fall through. Assumes BC_IFUNCF/BC_IFUNCV follow and ins_AD is a no-op.
> ++    |  endbr64
> +     break;
> + 
> +   case BC_JFUNCF:
> ++    |  endbr64
> + #if !LJ_HASJIT
> +     break;
> + #endif
> +@@ -4615,6 +4722,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo
> +     break;
> + 
> +   case BC_JFUNCV:
> ++    |  endbr64
> + #if !LJ_HASJIT
> +     break;
> + #endif

-- 
Yuichiro NAITO (naito.yuichiro@gmail.com)