From: Yuichiro NAITO Subject: Re: lang/luajit add support of IBT for amd64 To: stu@spacehopper.org Cc: ports@openbsd.org Date: Wed, 15 Oct 2025 08:52:47 +0900 From: Stuart Henderson Subject: Re: lang/luajit add support of IBT for amd64 Date: Fri, 10 Oct 2025 11:00:55 +0100 > On 2025/10/10 10:11, Stuart Henderson wrote: >> On 2025/10/10 17:27, Yuichiro NAITO wrote: >> > Hi, I see that the LuaJIT fails to run on a processor that enables >> > the IBT (Indirect Branch Tracking) feature since OpenBSD 7.4. >> >> luajit in the ports tree is built with enforcement disabled (see >> USE_NOBTCFI in the Makefile). Sorry, I forgot to mention that I had removed the USE_NOBTCFI option in my ports tree. I was uneasy about LuaJIT running without the IBT feature, so I removed the option and began inserting endbr instructions. >> It would be good to be able to remove that, but it would be preferable >> to get this landed upstream if possible, rather than as patches in >> the ports tree, which may end up needing to get removed if they >> conflict with future upstream code changes. (I'll consider adding >> as patches if there's no interest upstream, but I think that should >> be tried first). I have sent a pull request upstream, but I have not received a response yet. https://github.com/LuaJIT/LuaJIT/pull/1391 > a ports diff would look like this Thanks for the diff. It works for me on amd64. I don't have an IBT-capable test environment for i386, but I checked that it builds successfully there. During the build, I saw the following warning message. ``` In file included from lj_asm.c:221: ./lj_emit_x86.h:73:13: warning: unused function 'emit_endbr' [-Wunused-function] 73 | static void emit_endbr(ASMState *as) | ^~~~~~~~~~ 1 warning generated. ``` I would like to update your patch as follows to suppress the warning. 
``` diff --git a/lang/luajit/patches/patch-src_lj_asm_c b/lang/luajit/patches/patch-src_lj_asm_c index cca418187dc..790b1ac5a9a 100644 --- a/lang/luajit/patches/patch-src_lj_asm_c +++ b/lang/luajit/patches/patch-src_lj_asm_c @@ -5,7 +5,7 @@ Index: src/lj_asm.c spadj = asm_stack_adjust(as); as->T->spadjust = (uint16_t)spadj; emit_spsub(as, spadj); -+#if LJ_TARGET_X64 ++#if LJ_TARGET_X86ORX64 + emit_endbr(as); +#endif /* Root traces assume a checked stack for the starting proto. */ @@ -16,7 +16,7 @@ Index: src/lj_asm.c /* Continue with coalescing to fix up the broken cycle(s). */ } - -+#if LJ_TARGET_X64 ++#if LJ_TARGET_X86ORX64 + emit_endbr(as); +#endif /* Inherit top stack slot already checked by parent trace. */ diff --git a/lang/luajit/patches/patch-src_lj_emit_x86_h b/lang/luajit/patches/patch-src_lj_emit_x86_h index e706d833607..015356e9cef 100644 --- a/lang/luajit/patches/patch-src_lj_emit_x86_h +++ b/lang/luajit/patches/patch-src_lj_emit_x86_h @@ -1,7 +1,7 @@ Index: src/lj_emit_x86.h --- src/lj_emit_x86.h.orig +++ src/lj_emit_x86.h -@@ -70,6 +70,15 @@ static LJ_AINLINE MCode *emit_op(x86Op xo, Reg rr, Reg +@@ -70,6 +70,13 @@ static LJ_AINLINE MCode *emit_op(x86Op xo, Reg rr, Reg return p; } @@ -9,8 +9,6 @@ Index: src/lj_emit_x86.h +{ +#if LJ_64 + emit_u32(as, 0xfa1e0ff3); // endbr64 -+#else -+ emit_u32(as, 0xfb1e0ff3); // endbr32 +#endif +} + ``` The rest of your patch looks OK to me. 
> other ports would need syncs to change USE_NOBTCFI and bump revision > (benchmarks/wrk, games/luanti, games/luasteam, games/openmw, > games/powder-toy, games/solarus/solarus, mail/rspamd, net/hexchat, > net/snort, www/luakit, x11/kde-applications/cantor) > > Index: Makefile > =================================================================== > RCS file: /cvs/ports/lang/luajit/Makefile,v > diff -u -p -r1.38 Makefile > --- Makefile 24 Jul 2025 14:40:51 -0000 1.38 > +++ Makefile 10 Oct 2025 09:53:14 -0000 > @@ -1,6 +1,7 @@ > # keep arch-defines.mk LUAJIT_ARCHS in sync > # bump ports which use PROPERTIES:Mluajit if changing > ONLY_FOR_ARCHS = aarch64 arm amd64 i386 powerpc > +USE_NOBTCFI-aarch64 = Yes > # > # games/tome4 is using embedded copy of luajit > > @@ -18,6 +19,7 @@ GH_COMMIT = 871db2c84ecefd70a850e03a6c34 > # epoch time of the commit; easiest found in ${WRKSRC}/.relver of the > # git-archive tar > V = 2.1.1753364724 > +REVISION= 0 > > COMMENT = just-in-time compiler for Lua > DISTNAME = LuaJIT-${V} > @@ -29,8 +31,6 @@ HOMEPAGE = https://luajit.org/ > > # MIT > PERMIT_PACKAGE = Yes > - > -USE_NOBTCFI = Yes > > WANTLIB = c m > > Index: patches/patch-src_lj_asm_c > =================================================================== > RCS file: patches/patch-src_lj_asm_c > diff -N patches/patch-src_lj_asm_c > --- /dev/null 1 Jan 1970 00:00:00 -0000 > +++ patches/patch-src_lj_asm_c 10 Oct 2025 09:53:14 -0000 > @@ -0,0 +1,26 @@ > +amd64 bti fixes > + > +Index: src/lj_asm.c > +--- src/lj_asm.c.orig > ++++ src/lj_asm.c > +@@ -1917,6 +1917,9 @@ static void asm_head_root(ASMState *as) > + spadj = asm_stack_adjust(as); > + as->T->spadjust = (uint16_t)spadj; > + emit_spsub(as, spadj); > ++#if LJ_TARGET_X64 > ++ emit_endbr(as); > ++#endif > + /* Root traces assume a checked stack for the starting proto. 
*/ > + as->T->topslot = gcref(as->T->startpt)->pt.framesize; > + } > +@@ -2085,7 +2088,9 @@ static void asm_head_side(ASMState *as) > + checkmclim(as); > + /* Continue with coalescing to fix up the broken cycle(s). */ > + } > +- > ++#if LJ_TARGET_X64 > ++ emit_endbr(as); > ++#endif > + /* Inherit top stack slot already checked by parent trace. */ > + as->T->topslot = as->parent->topslot; > + if (as->topslot > as->T->topslot) { /* Need to check for higher slot? */ > Index: patches/patch-src_lj_emit_x86_h > =================================================================== > RCS file: patches/patch-src_lj_emit_x86_h > diff -N patches/patch-src_lj_emit_x86_h > --- /dev/null 1 Jan 1970 00:00:00 -0000 > +++ patches/patch-src_lj_emit_x86_h 10 Oct 2025 09:53:14 -0000 > @@ -0,0 +1,21 @@ > +amd64 bti fixes > + > +Index: src/lj_emit_x86.h > +--- src/lj_emit_x86.h.orig > ++++ src/lj_emit_x86.h > +@@ -70,6 +70,15 @@ static LJ_AINLINE MCode *emit_op(x86Op xo, Reg rr, Reg > + return p; > + } > + > ++static void emit_endbr(ASMState *as) > ++{ > ++#if LJ_64 > ++ emit_u32(as, 0xfa1e0ff3); // endbr64 > ++#else > ++ emit_u32(as, 0xfb1e0ff3); // endbr32 > ++#endif > ++} > ++ > + /* op + modrm */ > + #define emit_opm(xo, mode, rr, rb, p, delta) \ > + (p[(delta)-1] = MODRM((mode), (rr), (rb)), \ > Index: patches/patch-src_vm_x64_dasc > =================================================================== > RCS file: patches/patch-src_vm_x64_dasc > diff -N patches/patch-src_vm_x64_dasc > --- /dev/null 1 Jan 1970 00:00:00 -0000 > +++ patches/patch-src_vm_x64_dasc 10 Oct 2025 09:53:14 -0000 > @@ -0,0 +1,826 @@ > +amd64 bti fixes > + > +Index: src/vm_x64.dasc > +--- src/vm_x64.dasc.orig > ++++ src/vm_x64.dasc > +@@ -192,13 +192,13 @@ > + |//----------------------------------------------------------------------- > + | > + |// Instruction headers. 
> +-|.macro ins_A; .endmacro > +-|.macro ins_AD; .endmacro > +-|.macro ins_AJ; .endmacro > +-|.macro ins_ABC; movzx RBd, RCH; movzx RCd, RCL; .endmacro > +-|.macro ins_AB_; movzx RBd, RCH; .endmacro > +-|.macro ins_A_C; movzx RCd, RCL; .endmacro > +-|.macro ins_AND; not RD; .endmacro > ++|.macro ins_A; endbr64; .endmacro > ++|.macro ins_AD; endbr64; .endmacro > ++|.macro ins_AJ; endbr64; .endmacro > ++|.macro ins_ABC; endbr64; movzx RBd, RCH; movzx RCd, RCL; .endmacro > ++|.macro ins_AB_; endbr64; movzx RBd, RCH; .endmacro > ++|.macro ins_A_C; endbr64; movzx RCd, RCL; .endmacro > ++|.macro ins_AND; endbr64; not RD; .endmacro > + | > + |// Instruction decode+dispatch. Carefully tuned (nope, lodsd is not faster). > + |.macro ins_NEXT > +@@ -387,6 +387,7 @@ static void build_subroutines(BuildCtx *ctx) > + |//----------------------------------------------------------------------- > + | > + |->vm_returnp: > ++ | endbr64 > + | test PCd, FRAME_P > + | jz ->cont_dispatch > + | > +@@ -400,6 +401,7 @@ static void build_subroutines(BuildCtx *ctx) > + | mov aword [BASE+RA], ITYPE // Prepend true to results. > + | > + |->vm_returnc: > ++ | endbr64 > + | add RDd, 1 // RD = nresults+1 > + | jz ->vm_unwind_yield > + | mov MULTRES, RDd > +@@ -407,6 +409,7 @@ static void build_subroutines(BuildCtx *ctx) > + | jz ->BC_RET_Z // Handle regular return to Lua. > + | > + |->vm_return: > ++ | endbr64 > + | // BASE = base, RA = resultofs, RD = nresults+1 (= MULTRES), PC = return > + | xor PC, FRAME_C > + | test PCd, FRAME_TYPE > +@@ -440,11 +443,13 @@ static void build_subroutines(BuildCtx *ctx) > + | mov L:RB->top, BASE > + | > + |->vm_leave_cp: > ++ | endbr64 > + | mov RA, SAVE_CFRAME // Restore previous C frame. > + | mov L:RB->cframe, RA > + | xor eax, eax // Ok return status for vm_pcall. 
> + | > + |->vm_leave_unw: > ++ | endbr64 > + | restoreregs > + | ret > + | > +@@ -479,20 +484,24 @@ static void build_subroutines(BuildCtx *ctx) > + | jmp <3 > + | > + |->vm_unwind_yield: > ++ | endbr64 > + | mov al, LUA_YIELD > + | jmp ->vm_unwind_c_eh > + | > + |->vm_unwind_c: // Unwind C stack, return from vm_pcall. > ++ | endbr64 > + | // (void *cframe, int errcode) > + | mov eax, CARG2d // Error return status for vm_pcall. > + | mov rsp, CARG1 > + |->vm_unwind_c_eh: // Landing pad for external unwinder. > ++ | endbr64 > + | mov L:RB, SAVE_L > + | mov GL:RB, L:RB->glref > + | mov dword GL:RB->vmstate, ~LJ_VMST_C > + | jmp ->vm_leave_unw > + | > + |->vm_unwind_rethrow: > ++ | endbr64 > + |.if not X64WIN > + | mov CARG1, SAVE_L > + | mov CARG2d, eax > +@@ -501,10 +510,12 @@ static void build_subroutines(BuildCtx *ctx) > + |.endif > + | > + |->vm_unwind_ff: // Unwind C stack, return from ff pcall. > ++ | endbr64 > + | // (void *cframe) > + | and CARG1, CFRAME_RAWMASK > + | mov rsp, CARG1 > + |->vm_unwind_ff_eh: // Landing pad for external unwinder. > ++ | endbr64 > + | mov L:RB, SAVE_L > + | mov RDd, 1+1 // Really 1+2 results, incr. later. > + | mov BASE, L:RB->base > +@@ -524,14 +535,17 @@ static void build_subroutines(BuildCtx *ctx) > + |//----------------------------------------------------------------------- > + | > + |->vm_growstack_c: // Grow stack for C function. > ++ | endbr64 > + | mov CARG2d, LUA_MINSTACK > + | jmp >2 > + | > + |->vm_growstack_v: // Grow stack for vararg Lua function. > ++ | endbr64 > + | sub RD, 16 // LJ_FR2 > + | jmp >1 > + | > + |->vm_growstack_f: // Grow stack for fixarg Lua function. > ++ | endbr64 > + | // BASE = new base, RD = nargs+1, RB = L, PC = first PC > + | lea RD, [BASE+NARGS:RD*8-8] > + |1: > +@@ -560,6 +574,7 @@ static void build_subroutines(BuildCtx *ctx) > + |//----------------------------------------------------------------------- > + | > + |->vm_resume: // Setup C frame and resume thread. 
> ++ | endbr64 > + | // (lua_State *L, TValue *base, int nres1 = 0, ptrdiff_t ef = 0) > + | saveregs > + | mov L:RB, CARG1 // Caveat: CARG1 may be RA. > +@@ -595,6 +610,7 @@ static void build_subroutines(BuildCtx *ctx) > + | jmp ->vm_return > + | > + |->vm_pcall: // Setup protected C frame and enter VM. > ++ | endbr64 > + | // (lua_State *L, TValue *base, int nres1, ptrdiff_t ef) > + | saveregs > + | mov PCd, FRAME_CP > +@@ -602,6 +618,7 @@ static void build_subroutines(BuildCtx *ctx) > + | jmp >1 > + | > + |->vm_call: // Setup C frame and enter VM. > ++ | endbr64 > + | // (lua_State *L, TValue *base, int nres1) > + | saveregs > + | mov PCd, FRAME_C > +@@ -632,15 +649,18 @@ static void build_subroutines(BuildCtx *ctx) > + | add NARGS:RDd, 1 // RD = nargs+1 > + | > + |->vm_call_dispatch: > ++ | endbr64 > + | mov LFUNC:RB, [RA-16] > + | checkfunc LFUNC:RB, ->vmeta_call // Ensure KBASE defined and != BASE. > + | > + |->vm_call_dispatch_f: > ++ | endbr64 > + | mov BASE, RA > + | ins_call > + | // BASE = new base, RB = func, RD = nargs+1, PC = caller PC > + | > + |->vm_cpcall: // Setup protected C frame, call C. > ++ | endbr64 > + | // (lua_State *L, lua_CFunction func, void *ud, lua_CPFunction cp) > + | saveregs > + | mov L:RB, CARG1 // Caveat: CARG1 may be RA. 
> +@@ -675,6 +695,7 @@ static void build_subroutines(BuildCtx *ctx) > + |//-- Continuation dispatch ---------------------------------------------- > + | > + |->cont_dispatch: > ++ | endbr64 > + | // BASE = meta base, RA = resultofs, RD = nresults+1 (also in MULTRES) > + | add RA, BASE > + | and PC, -8 > +@@ -706,6 +727,7 @@ static void build_subroutines(BuildCtx *ctx) > + |.endif > + | > + |->cont_cat: // BASE = base, RC = result, RB = mbase > ++ | endbr64 > + | movzx RAd, PC_RB > + | sub RB, 32 > + | lea RA, [BASE+RA*8] > +@@ -733,6 +755,7 @@ static void build_subroutines(BuildCtx *ctx) > + |//-- Table indexing metamethods ----------------------------------------- > + | > + |->vmeta_tgets: > ++ | endbr64 > + | settp STR:RC, LJ_TSTR // STR:RC = GCstr * > + | mov TMP1, STR:RC > + | lea RC, TMP1 > +@@ -744,6 +767,7 @@ static void build_subroutines(BuildCtx *ctx) > + | jmp >2 > + | > + |->vmeta_tgetb: > ++ | endbr64 > + | movzx RCd, PC_RC > + |.if DUALNUM > + | setint RC > +@@ -756,6 +780,7 @@ static void build_subroutines(BuildCtx *ctx) > + | jmp >1 > + | > + |->vmeta_tgetv: > ++ | endbr64 > + | movzx RCd, PC_RC // Reload TValue *k from RC. > + | lea RC, [BASE+RC*8] > + |1: > +@@ -774,6 +799,7 @@ static void build_subroutines(BuildCtx *ctx) > + | test RC, RC > + | jz >3 > + |->cont_ra: // BASE = base, RC = result > ++ | endbr64 > + | movzx RAd, PC_RA > + | mov RB, [RC] > + | mov [BASE+RA*8], RB > +@@ -791,6 +817,7 @@ static void build_subroutines(BuildCtx *ctx) > + | jmp ->vm_call_dispatch_f > + | > + |->vmeta_tgetr: > ++ | endbr64 > + | mov CARG1, TAB:RB > + | mov RB, BASE // Save BASE. 
> + | mov CARG2d, RCd // Caveat: CARG2 == BASE > +@@ -806,6 +833,7 @@ static void build_subroutines(BuildCtx *ctx) > + |//----------------------------------------------------------------------- > + | > + |->vmeta_tsets: > ++ | endbr64 > + | settp STR:RC, LJ_TSTR // STR:RC = GCstr * > + | mov TMP1, STR:RC > + | lea RC, TMP1 > +@@ -817,6 +845,7 @@ static void build_subroutines(BuildCtx *ctx) > + | jmp >2 > + | > + |->vmeta_tsetb: > ++ | endbr64 > + | movzx RCd, PC_RC > + |.if DUALNUM > + | setint RC > +@@ -829,6 +858,7 @@ static void build_subroutines(BuildCtx *ctx) > + | jmp >1 > + | > + |->vmeta_tsetv: > ++ | endbr64 > + | movzx RCd, PC_RC // Reload TValue *k from RC. > + | lea RC, [BASE+RC*8] > + |1: > +@@ -851,6 +881,7 @@ static void build_subroutines(BuildCtx *ctx) > + | mov RB, [BASE+RA*8] > + | mov [RC], RB > + |->cont_nop: // BASE = base, (RC = result) > ++ | endbr64 > + | ins_next > + | > + |3: // Call __newindex metamethod. > +@@ -869,6 +900,7 @@ static void build_subroutines(BuildCtx *ctx) > + | jmp ->vm_call_dispatch_f > + | > + |->vmeta_tsetr: > ++ | endbr64 > + |.if X64WIN > + | mov L:CARG1, SAVE_L > + | mov CARG3d, RCd > +@@ -891,6 +923,7 @@ static void build_subroutines(BuildCtx *ctx) > + |//-- Comparison metamethods --------------------------------------------- > + | > + |->vmeta_comp: > ++ | endbr64 > + | movzx RDd, PC_RD > + | movzx RAd, PC_RA > + | mov L:RB, SAVE_L > +@@ -921,6 +954,7 @@ static void build_subroutines(BuildCtx *ctx) > + | ins_next > + | > + |->cont_condt: // BASE = base, RC = result > ++ | endbr64 > + | add PC, 4 > + | mov ITYPE, [RC] > + | sar ITYPE, 47 > +@@ -929,12 +963,14 @@ static void build_subroutines(BuildCtx *ctx) > + | jmp <6 > + | > + |->cont_condf: // BASE = base, RC = result > ++ | endbr64 > + | mov ITYPE, [RC] > + | sar ITYPE, 47 > + | cmp ITYPEd, LJ_TISTRUECOND // Branch if result is false. 
> + | jmp <4 > + | > + |->vmeta_equal: > ++ | endbr64 > + | cleartp TAB:RD > + | sub PC, 4 > + |.if X64WIN > +@@ -958,6 +994,7 @@ static void build_subroutines(BuildCtx *ctx) > + | jmp <3 > + | > + |->vmeta_equal_cd: > ++ | endbr64 > + |.if FFI > + | sub PC, 4 > + | mov L:RB, SAVE_L > +@@ -971,6 +1008,7 @@ static void build_subroutines(BuildCtx *ctx) > + |.endif > + | > + |->vmeta_istype: > ++ | endbr64 > + | mov L:RB, SAVE_L > + | mov L:RB->base, BASE // Caveat: CARG2/CARG3 may be BASE. > + | mov CARG2d, RAd > +@@ -984,36 +1022,43 @@ static void build_subroutines(BuildCtx *ctx) > + |//-- Arithmetic metamethods --------------------------------------------- > + | > + |->vmeta_arith_vno: > ++ | endbr64 > + |.if DUALNUM > + | movzx RBd, PC_RB > + | movzx RCd, PC_RC > + |.endif > + |->vmeta_arith_vn: > ++ | endbr64 > + | lea RC, [KBASE+RC*8] > + | jmp >1 > + | > + |->vmeta_arith_nvo: > ++ | endbr64 > + |.if DUALNUM > + | movzx RBd, PC_RB > + | movzx RCd, PC_RC > + |.endif > + |->vmeta_arith_nv: > ++ | endbr64 > + | lea TMPR, [KBASE+RC*8] > + | lea RC, [BASE+RB*8] > + | mov RB, TMPR > + | jmp >2 > + | > + |->vmeta_unm: > ++ | endbr64 > + | lea RC, [BASE+RD*8] > + | mov RB, RC > + | jmp >2 > + | > + |->vmeta_arith_vvo: > ++ | endbr64 > + |.if DUALNUM > + | movzx RBd, PC_RB > + | movzx RCd, PC_RC > + |.endif > + |->vmeta_arith_vv: > ++ | endbr64 > + | lea RC, [BASE+RC*8] > + |1: > + | lea RB, [BASE+RB*8] > +@@ -1046,6 +1091,7 @@ static void build_subroutines(BuildCtx *ctx) > + | > + | // Call metamethod for binary op. 
> + |->vmeta_binop: > ++ | endbr64 > + | // BASE = base, RC = new base, stack = cont/func/o1/o2 > + | mov RA, RC > + | sub RC, BASE > +@@ -1055,6 +1101,7 @@ static void build_subroutines(BuildCtx *ctx) > + | jmp ->vm_call_dispatch > + | > + |->vmeta_len: > ++ | endbr64 > + | movzx RDd, PC_RD > + | mov L:RB, SAVE_L > + | mov L:RB->base, BASE > +@@ -1078,8 +1125,10 @@ static void build_subroutines(BuildCtx *ctx) > + |//-- Call metamethod ---------------------------------------------------- > + | > + |->vmeta_call_ra: > ++ | endbr64 > + | lea RA, [BASE+RA*8+16] > + |->vmeta_call: // Resolve and call __call metamethod. > ++ | endbr64 > + | // BASE = old base, RA = new base, RC = nargs+1, PC = return > + | mov TMP1d, NARGS:RDd // Save RA, RC for us. > + | mov RB, RA > +@@ -1113,6 +1162,7 @@ static void build_subroutines(BuildCtx *ctx) > + |//-- Argument coercion for 'for' statement ------------------------------ > + | > + |->vmeta_for: > ++ | endbr64 > + | mov L:RB, SAVE_L > + | mov L:RB->base, BASE > + | mov CARG2, RA // Caveat: CARG2 == BASE > +@@ -1132,16 +1182,17 @@ static void build_subroutines(BuildCtx *ctx) > + | > + |.macro .ffunc, name > + |->ff_ .. name: > ++ | endbr64 > + |.endmacro > + | > + |.macro .ffunc_1, name > + |->ff_ .. name: > +- | cmp NARGS:RDd, 1+1; jb ->fff_fallback > ++ | endbr64; cmp NARGS:RDd, 1+1; jb ->fff_fallback > + |.endmacro > + | > + |.macro .ffunc_2, name > + |->ff_ .. name: > +- | cmp NARGS:RDd, 2+1; jb ->fff_fallback > ++ | endbr64; cmp NARGS:RDd, 2+1; jb ->fff_fallback > + |.endmacro > + | > + |.macro .ffunc_n, name, op > +@@ -1414,6 +1465,7 @@ static void build_subroutines(BuildCtx *ctx) > + | mov RB, [RD] > + | mov [BASE-8], RB > + |->fff_res2: > ++ | endbr64 > + | mov RDd, 1+2 > + | jmp ->fff_res > + |2: // Check for empty hash part first. Otherwise call C function. 
> +@@ -1434,6 +1486,7 @@ static void build_subroutines(BuildCtx *ctx) > + | test RD, RD > + | jnz <1 > + |->fff_res0: > ++ | endbr64 > + | mov RDd, 1+0 > + | jmp ->fff_res > + | > +@@ -1665,8 +1718,10 @@ static void build_subroutines(BuildCtx *ctx) > + | neg RBd; js >2 > + |->fff_resbit: > + |->fff_resi: > ++ | endbr64 > + | setint RB > + |->fff_resRB: > ++ | endbr64 > + | mov PC, [BASE-8] > + | mov [BASE-16], RB > + | jmp ->fff_res1 > +@@ -1686,15 +1741,19 @@ static void build_subroutines(BuildCtx *ctx) > + | > + |.ffunc_n math_sqrt, sqrtsd > + |->fff_resxmm0: > ++ | endbr64 > + | mov PC, [BASE-8] > + | movsd qword [BASE-16], xmm0 > + | // fallthrough > + | > + |->fff_res1: > ++ | endbr64 > + | mov RDd, 1+1 > + |->fff_res: > ++ | endbr64 > + | mov MULTRES, RDd > + |->fff_res_: > ++ | endbr64 > + | test PCd, FRAME_TYPE > + | jnz >7 > + |5: > +@@ -1907,6 +1966,7 @@ static void build_subroutines(BuildCtx *ctx) > + | mov TMPRd, 1 > + | lea RD, TMP1 // Points to stack. Little-endian. > + |->fff_newstr: > ++ | endbr64 > + | mov L:RB, SAVE_L > + | mov L:RB->base, BASE > + | mov CARG3d, TMPRd // Zero-extended to size_t. > +@@ -1915,6 +1975,7 @@ static void build_subroutines(BuildCtx *ctx) > + | mov SAVE_PC, PC > + | call extern lj_str_new // (lua_State *L, char *str, size_t l) > + |->fff_resstr: > ++ | endbr64 > + | // GCstr * returned in eax (RD). > + | mov BASE, L:RB->base > + | mov PC, [BASE-8] > +@@ -1979,6 +2040,7 @@ static void build_subroutines(BuildCtx *ctx) > + | jmp <3 > + | > + |->fff_emptystr: // Range underflow. > ++ | endbr64 > + | xor TMPRd, TMPRd // Zero length. Any ptr in RD is ok. 
> + | jmp <4 > + | > +@@ -2090,11 +2152,13 @@ static void build_subroutines(BuildCtx *ctx) > + | jmp ->fff_resbit > + |.else > + |->fff_resbit: > ++ | endbr64 > + | cvtsi2sd xmm0, RBd > + | jmp ->fff_resxmm0 > + |.endif > + | > + |->fff_fallback_bit_op: > ++ | endbr64 > + | mov NARGS:RDd, TMPRd // Restore for fallback > + | jmp ->fff_fallback > + | > +@@ -2125,11 +2189,14 @@ static void build_subroutines(BuildCtx *ctx) > + |//----------------------------------------------------------------------- > + | > + |->fff_fallback_2: > ++ | endbr64 > + | mov NARGS:RDd, 1+2 // Other args are ignored, anyway. > + | jmp ->fff_fallback > + |->fff_fallback_1: > ++ | endbr64 > + | mov NARGS:RDd, 1+1 // Other args are ignored, anyway. > + |->fff_fallback: // Call fast function fallback handler. > ++ | endbr64 > + | // BASE = new base, RD = nargs+1 > + | mov L:RB, SAVE_L > + | mov PC, [BASE-8] // Fallback may overwrite PC. > +@@ -2160,6 +2227,7 @@ static void build_subroutines(BuildCtx *ctx) > + | > + |// Reconstruct previous base for vmeta_call during tailcall. > + |->vm_call_tail: > ++ | endbr64 > + | mov RA, BASE > + | test PCd, FRAME_TYPE > + | jnz >3 > +@@ -2182,6 +2250,7 @@ static void build_subroutines(BuildCtx *ctx) > + | jmp <1 // Dumb retry (goes through ff first). > + | > + |->fff_gcstep: // Call GC step function. > ++ | endbr64 > + | // BASE = new base, RD = nargs+1 > + | pop RB // Must keep stack at same level. > + | mov TMP1, RB // Save return address > +@@ -2207,6 +2276,7 @@ static void build_subroutines(BuildCtx *ctx) > + | > + |->vm_record: // Dispatch target for recording phase. > + |.if JIT > ++ | endbr64 > + | movzx RDd, byte [DISPATCH+DISPATCH_GL(hookmask)] > + | test RDL, HOOK_VMEVENT // No recording while in vmevent. > + | jnz >5 > +@@ -2220,12 +2290,14 @@ static void build_subroutines(BuildCtx *ctx) > + |.endif > + | > + |->vm_rethook: // Dispatch target for return hooks. 
> ++ | endbr64 > + | movzx RDd, byte [DISPATCH+DISPATCH_GL(hookmask)] > + | test RDL, HOOK_ACTIVE // Hook already active? > + | jnz >5 > + | jmp >1 > + | > + |->vm_inshook: // Dispatch target for instr/line hooks. > ++ | endbr64 > + | movzx RDd, byte [DISPATCH+DISPATCH_GL(hookmask)] > + | test RDL, HOOK_ACTIVE // Hook already active? > + | jnz >5 > +@@ -2253,6 +2325,7 @@ static void build_subroutines(BuildCtx *ctx) > + | jmp aword [DISPATCH+OP*8+GG_DISP2STATIC] // Re-dispatch to static ins. > + | > + |->cont_hook: // Continue from hook yield. > ++ | endbr64 > + | add PC, 4 > + | mov RA, [RB-40] > + | mov MULTRES, RAd // Restore MULTRES for *M ins. > +@@ -2260,6 +2333,7 @@ static void build_subroutines(BuildCtx *ctx) > + | > + |->vm_hotloop: // Hot loop counter underflow. > + |.if JIT > ++ | endbr64 > + | mov LFUNC:RB, [BASE-16] // Same as curr_topL(L). > + | cleartp LFUNC:RB > + | mov RB, LFUNC:RB->pc > +@@ -2277,6 +2351,7 @@ static void build_subroutines(BuildCtx *ctx) > + |.endif > + | > + |->vm_callhook: // Dispatch target for call hooks. > ++ | endbr64 > + | mov SAVE_PC, PC > + |.if JIT > + | jmp >1 > +@@ -2284,6 +2359,7 @@ static void build_subroutines(BuildCtx *ctx) > + | > + |->vm_hotcall: // Hot call counter underflow. > + |.if JIT > ++ | endbr64 > + | mov SAVE_PC, PC > + | or PC, 1 // Marker for hot call. > + |1: > +@@ -2312,6 +2388,7 @@ static void build_subroutines(BuildCtx *ctx) > + | > + |->cont_stitch: // Trace stitching. > + |.if JIT > ++ | endbr64 > + | // BASE = base, RC = result, RB = mbase > + | mov TRACE:ITYPE, [RB-40] // Save previous trace. > + | cleartp TRACE:ITYPE > +@@ -2364,6 +2441,7 @@ static void build_subroutines(BuildCtx *ctx) > + | > + |->vm_profhook: // Dispatch target for profiler hook. 
> + #if LJ_HASPROFILE > ++ | endbr64 > + | mov L:RB, SAVE_L > + | mov L:RB->base, BASE > + | mov CARG2, PC // Caveat: CARG2 == BASE > +@@ -2383,6 +2461,7 @@ static void build_subroutines(BuildCtx *ctx) > + |// The 16 bit exit number is stored with two (sign-extended) push imm8. > + |->vm_exit_handler: > + |.if JIT > ++ | endbr64 > + | push r13; push r12 > + | push r11; push r10; push r9; push r8 > + | push rdi; push rsi; push rbp; lea rbp, [rsp+88]; push rbp > +@@ -2431,6 +2510,7 @@ static void build_subroutines(BuildCtx *ctx) > + | jmp >1 > + |.endif > + |->vm_exit_interp: > ++ | endbr64 > + | // RD = MULTRES or negated error code, BASE, PC and DISPATCH set. > + |.if JIT > + | // Restore additional callee-save registers only used in compiled code. > +@@ -2524,6 +2604,7 @@ static void build_subroutines(BuildCtx *ctx) > + |.macro vm_round, name, mode, cond > + |->name: > + |->name .. _sse: > ++ | endbr64 > + | sseconst_abs xmm2, RD > + | sseconst_2p52 xmm3, RD > + | movaps xmm1, xmm0 > +@@ -2569,6 +2650,7 @@ static void build_subroutines(BuildCtx *ctx) > + |->vm_mod: > + |// Args in xmm0/xmm1, return value in xmm0. > + |// Caveat: xmm0-xmm5 and RC (eax) modified! > ++ | endbr64 > + | movaps xmm5, xmm0 > + | divsd xmm0, xmm1 > + | sseconst_abs xmm2, RD > +@@ -2601,6 +2683,7 @@ static void build_subroutines(BuildCtx *ctx) > + | > + |// int lj_vm_cpuid(uint32_t f, uint32_t res[4]) > + |->vm_cpuid: > ++ | endbr64 > + | mov eax, CARG1d > + | .if X64WIN; push rsi; mov rsi, CARG2; .endif > + | push rbx > +@@ -2634,6 +2717,7 @@ static void build_subroutines(BuildCtx *ctx) > + |// Next idx returned in edx. > + |->vm_next: > + |.if JIT > ++ | endbr64 > + | mov NEXT_ASIZE, NEXT_TAB->asize > + |1: // Traverse array part. 
> + | cmp NEXT_IDX, NEXT_ASIZE; jae >5 > +@@ -2680,6 +2764,7 @@ static void build_subroutines(BuildCtx *ctx) > + |//----------------------------------------------------------------------- > + | > + |->assert_bad_for_arg_type: > ++ | endbr64 > + #ifdef LUA_USE_ASSERT > + | int3 > + #endif > +@@ -2693,6 +2778,7 @@ static void build_subroutines(BuildCtx *ctx) > + |->vm_ffi_callback: > + |.if FFI > + |.type CTSTATE, CTState, PC > ++ | endbr64 > + | saveregs_ // ebp/rbp already saved. ebp now holds global_State *. > + | lea DISPATCH, [ebp+GG_G2DISP] > + | mov CTSTATE, GL:ebp->ctype_state > +@@ -2736,6 +2822,7 @@ static void build_subroutines(BuildCtx *ctx) > + | > + |->cont_ffi_callback: // Return from FFI callback. > + |.if FFI > ++ | endbr64 > + | mov L:RA, SAVE_L > + | mov CTSTATE, [DISPATCH+DISPATCH_GL(ctype_state)] > + | mov aword CTSTATE->L, L:RA > +@@ -2753,7 +2840,7 @@ static void build_subroutines(BuildCtx *ctx) > + | // Caveat: needs special frame unwinding, see below. > + |.if FFI > + | .type CCSTATE, CCallState, rbx > +- | push rbp; mov rbp, rsp; push rbx; mov CCSTATE, CARG1 > ++ | endbr64; push rbp; mov rbp, rsp; push rbx; mov CCSTATE, CARG1 > + | > + | // Readjust stack. > + | mov eax, CCSTATE->spadj > +@@ -3221,6 +3308,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo > + |3: > + #endif > + |->BC_LEN_Z: > ++ | endbr64 > + | mov RB, BASE // Save BASE. > + | call extern lj_tab_len // (GCtab *t) > + | // Length of table returned in eax (RD). 
> +@@ -3341,6 +3429,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo > + case BC_MODVN: > + | ins_arithpre movsd, xmm1 > + |->BC_MODVN_Z: > ++ | endbr64 > + | call ->vm_mod > + | ins_arithpost > + | ins_next > +@@ -3367,6 +3456,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo > + | mov CARG3d, RCd > + | sub CARG3d, RBd > + |->BC_CAT_Z: > ++ | endbr64 > + | mov L:RB, L:CARG1 > + | mov SAVE_PC, PC > + | call extern lj_meta_cat // (lua_State *L, TValue *top, int left) > +@@ -3701,6 +3791,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo > + | mov STR:RC, [KBASE+RC*8] > + | checktab TAB:RB, ->vmeta_tgets > + |->BC_TGETS_Z: // RB = GCtab *, RC = GCstr * > ++ | endbr64 > + | mov TMPRd, TAB:RB->hmask > + | and TMPRd, STR:RC->sid > + | imul TMPRd, #NODE > +@@ -3771,8 +3862,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo > + | add RC, TAB:RB->array > + | // Get array slot. > + |->BC_TGETR_Z: > ++ | endbr64 > + | mov ITYPE, [RC] > + |->BC_TGETR2_Z: > ++ | endbr64 > + | mov [BASE+RA*8], ITYPE > + | ins_next > + break; > +@@ -3833,6 +3926,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo > + | mov STR:RC, [KBASE+RC*8] > + | checktab TAB:RB, ->vmeta_tsets > + |->BC_TSETS_Z: // RB = GCtab *, RC = GCstr * > ++ | endbr64 > + | mov TMPRd, TAB:RB->hmask > + | and TMPRd, STR:RC->sid > + | imul TMPRd, #NODE > +@@ -3940,6 +4034,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo > + | add RC, TAB:RB->array > + | // Set array slot. 
> + |->BC_TSETR_Z: > ++ | endbr64 > + | mov ITYPE, [BASE+RA*8] > + | mov [RC], ITYPE > + | ins_next > +@@ -4021,6 +4116,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo > + | mov LFUNC:RB, [RA-16] > + | checktp_nc LFUNC:RB, LJ_TFUNC, ->vmeta_call > + |->BC_CALLT_Z: > ++ | endbr64 > + | mov PC, [BASE-8] > + | test PCd, FRAME_TYPE > + | jnz >7 > +@@ -4087,6 +4183,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo > + > + case BC_ITERN: > + |.if JIT > ++ | endbr64 > + | hotloop RBd > + |.endif > + |->vm_IITERN: > +@@ -4267,6 +4364,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo > + switch (op) { > + case BC_RET: > + |->BC_RET_Z: > ++ | endbr64 > + | mov KBASE, BASE // Use KBASE for result move. > + | sub RDd, 1 > + | jz >3 > +@@ -4284,10 +4382,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo > + | ja >6 > + break; > + case BC_RET1: > ++ | endbr64 > + | mov RB, [BASE+RA] > + | mov [BASE-16], RB > + /* fallthrough */ > + case BC_RET0: > ++ | endbr64 > + |5: > + | cmp PC_RB, RDL // More results expected? > + | ja >6 > +@@ -4334,6 +4434,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo > + > + case BC_FORL: > + |.if JIT > ++ | endbr64 > + | hotloop RBd > + |.endif > + | // Fall through. Assumes BC_IFORL follows and ins_AJ is a no-op. > +@@ -4342,6 +4443,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo > + case BC_JFORI: > + case BC_JFORL: > + #if !LJ_HASJIT > ++ | endbr64 > + break; > + #endif > + case BC_FORI: > +@@ -4485,6 +4587,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo > + > + case BC_ITERL: > + |.if JIT > ++ | endbr64 > + | hotloop RBd > + |.endif > + | // Fall through. Assumes BC_IITERL follows and ins_AJ is a no-op. 
> +@@ -4492,6 +4595,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo > + > + case BC_JITERL: > + #if !LJ_HASJIT > ++ | endbr64 > + break; > + #endif > + case BC_IITERL: > +@@ -4578,13 +4682,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo > + > + case BC_FUNCF: > + |.if JIT > ++ | endbr64 > + | hotcall RBd > + |.endif > + case BC_FUNCV: /* NYI: compiled vararg functions. */ > + | // Fall through. Assumes BC_IFUNCF/BC_IFUNCV follow and ins_AD is a no-op. > ++ | endbr64 > + break; > + > + case BC_JFUNCF: > ++ | endbr64 > + #if !LJ_HASJIT > + break; > + #endif > +@@ -4615,6 +4722,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defo > + break; > + > + case BC_JFUNCV: > ++ | endbr64 > + #if !LJ_HASJIT > + break; > + #endif -- Yuichiro NAITO (naito.yuichiro@gmail.com)