From: Klemens Nanni
Subject: Re: multimedia/openh264: Fix arm64 asm
To: Mark Kettenis
Cc: kettenis@openbsd.org, ports@openbsd.org
Date: Tue, 5 Mar 2024 16:41:55 +0000

On Tue, Mar 05, 2024 at 05:35:34PM +0100, Mark Kettenis wrote:
> This has been broken for a while since there was no BTI support, but
> also no X-only support.
>
> With this fixed, make test passes.

No arm64 to test here, but you did (:
OK kn

>
> ok?
>
>
> Index: multimedia/openh264/Makefile
> ===================================================================
> RCS file: /cvs/ports/multimedia/openh264/Makefile,v
> retrieving revision 1.10
> diff -u -p -r1.10 Makefile
> --- multimedia/openh264/Makefile 21 Feb 2024 11:02:35 -0000 1.10
> +++ multimedia/openh264/Makefile 5 Mar 2024 16:32:14 -0000
> @@ -3,7 +3,7 @@ GH_ACCOUNT = cisco
> GH_PROJECT = openh264
> GH_TAGNAME = v2.4.1
> CATEGORIES = multimedia
> -REVISION = 0
> +REVISION = 1
>
> SHARED_LIBS = openh264 1.0 # 7.0
>
> Index: multimedia/openh264/patches/patch-codec_common_arm64_arm_arch64_common_macro_S
> ===================================================================
> RCS file: multimedia/openh264/patches/patch-codec_common_arm64_arm_arch64_common_macro_S
> diff -N multimedia/openh264/patches/patch-codec_common_arm64_arm_arch64_common_macro_S
> --- /dev/null 1 Jan 1970 00:00:00 -0000
> +++ multimedia/openh264/patches/patch-codec_common_arm64_arm_arch64_common_macro_S 5 Mar 2024 16:32:14 -0000
> @@ -0,0 +1,11 @@
> +Index: codec/common/arm64/arm_arch64_common_macro.S
> +--- codec/common/arm64/arm_arch64_common_macro.S.orig
> ++++ codec/common/arm64/arm_arch64_common_macro.S
> +@@ -60,6 +60,7 @@ ret
> + .func \funcName
> + #endif
> + \funcName:
> ++ bti c
> + .endm
> +
> + .macro WELS_ASM_AARCH64_FUNC_END
> Index: multimedia/openh264/patches/patch-codec_common_arm64_mc_aarch64_neon_S
> ===================================================================
> RCS file: multimedia/openh264/patches/patch-codec_common_arm64_mc_aarch64_neon_S
> diff -N multimedia/openh264/patches/patch-codec_common_arm64_mc_aarch64_neon_S
> --- /dev/null 1 Jan 1970 00:00:00 -0000
> +++ multimedia/openh264/patches/patch-codec_common_arm64_mc_aarch64_neon_S 5 Mar 2024 16:32:14 -0000
> @@ -0,0 +1,64 @@
> +Index: codec/common/arm64/mc_aarch64_neon.S
> +--- codec/common/arm64/mc_aarch64_neon.S.orig
> ++++ codec/common/arm64/mc_aarch64_neon.S
> +@@ -32,8 +32,10 @@
> +
> + #ifdef HAVE_NEON_AARCH64
> + #include "arm_arch64_common_macro.S"
> ++.rodata
> + .align 4
> + filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
> ++.previous
> +
> + .macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
> + // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
> +@@ -1912,7 +1914,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20Width17_AArch64_
> + mov x5, #16
> + movi v0.8h, #20, lsl #0
> + movi v1.8h, #5, lsl #0
> +- ldr q22, filter_para
> ++ adrp x6, filter_para
> ++ ldr q22, [x6, #:lo12:filter_para]
> + w17_h_mc_luma_loop:
> + ld1 {v2.16b, v3.16b}, [x0], x1 //only use 22(17+5); v2=src[-2]
> +
> +@@ -1946,7 +1949,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20Width9_AArch64_n
> + mov x5, #8
> + movi v0.8h, #20, lsl #0
> + movi v1.8h, #5, lsl #0
> +- ldr q22, filter_para
> ++ adrp x6, filter_para
> ++ ldr q22, [x6, #:lo12:filter_para]
> + w9_h_mc_luma_loop:
> + ld1 {v2.16b}, [x0], x1 //only use 14(9+5); v2=src[-2]
> + mov v3.d[0], v2.d[1]
> +@@ -2012,7 +2016,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width17_AArch64_
> + movi v1.8h, #5, lsl #0
> + sub x3, x3, #16
> + mov x5, #16
> +- ldr q29, filter_para
> ++ adrp x6, filter_para
> ++ ldr q29, [x6, #:lo12:filter_para]
> +
> + sub x4, x4, #1
> +
> +@@ -2215,7 +2220,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width9_AArch64_n
> + movi v1.8h, #5, lsl #0
> + sub x3, x3, #8
> + mov x5, #8
> +- ldr q29, filter_para
> ++ adrp x6, filter_para
> ++ ldr q29, [x6, #:lo12:filter_para]
> + sub x4, x4, #1
> +
> + //prfm pldl1strm, [x0]
> +@@ -2315,7 +2321,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width5_AArch64_n
> + movi v1.8h, #5, lsl #0
> + sub x3, x3, #4
> + mov x5, #4
> +- ldr q29, filter_para
> ++ adrp x6, filter_para
> ++ ldr q29, [x6, #:lo12:filter_para]
> + sub x4, x4, #1
> +
> + //prfm pldl1strm, [x0]
> Index: multimedia/openh264/patches/patch-codec_decoder_core_arm64_intra_pred_aarch64_neon_S
> ===================================================================
> RCS file: multimedia/openh264/patches/patch-codec_decoder_core_arm64_intra_pred_aarch64_neon_S
> diff -N multimedia/openh264/patches/patch-codec_decoder_core_arm64_intra_pred_aarch64_neon_S
> --- /dev/null 1 Jan 1970 00:00:00 -0000
> +++ multimedia/openh264/patches/patch-codec_decoder_core_arm64_intra_pred_aarch64_neon_S 5 Mar 2024 16:32:14 -0000
> @@ -0,0 +1,63 @@
> +Index: codec/decoder/core/arm64/intra_pred_aarch64_neon.S
> +--- codec/decoder/core/arm64/intra_pred_aarch64_neon.S.orig
> ++++ codec/decoder/core/arm64/intra_pred_aarch64_neon.S
> +@@ -307,9 +307,11 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderIChromaPredDcTo
> + .endr
> + WELS_ASM_AARCH64_FUNC_END
> +
> ++.rodata
> + .align 4
> + intra_1_to_4: .short 17*1, 17*2, 17*3, 17*4, 17*1, 17*2, 17*3, 17*4
> + intra_m3_to_p4: .short -3, -2, -1, 0, 1, 2, 3, 4
> ++.previous
> +
> + WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderIChromaPredPlane_AArch64_neon
> + sxtw x1, w1
> +@@ -339,8 +341,10 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderIChromaPredPlan
> +
> + uxtl v1.8h, v1.8b
> + uxtl v0.8h, v0.8b
> +- ldr q2, intra_1_to_4
> +- ldr q3, intra_m3_to_p4
> ++ adrp x4, intra_1_to_4
> ++ adrp x5, intra_m3_to_p4
> ++ ldr q2, [x4, #:lo12:intra_1_to_4]
> ++ ldr q3, [x5, #:lo12:intra_m3_to_p4]
> + dup v4.8h, v0.h[3]
> + dup v5.8h, v0.h[7]
> + add v4.8h, v4.8h, v5.8h
> +@@ -456,9 +460,11 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredD
> + WELS_ASM_AARCH64_FUNC_END
> +
> +
> ++.rodata
> + .align 4
> + intra_1_to_8: .short 5, 10, 15, 20, 25, 30, 35, 40
> + intra_m7_to_p8: .short -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8
> ++.previous
> +
> + WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_AArch64_neon
> + sxtw x1, w1
> +@@ -492,7 +498,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredP
> + uxtl v3.8h, v3.8b
> + sub v0.8h, v1.8h, v0.8h
> + sub v2.8h, v3.8h, v2.8h
> +- ldr q4, intra_1_to_8
> ++ adrp x4, intra_1_to_8
> ++ ldr q4, [x4, #:lo12:intra_1_to_8]
> + mul v0.8h, v0.8h, v4.8h
> + mul v2.8h, v2.8h, v4.8h
> + saddlv s0, v0.8h
> +@@ -501,8 +508,10 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredP
> + sqrshrn v0.4h, v0.4S, #6 // b is in v0.h[0]
> + sqrshrn v2.4h, v2.4S, #6 // c is in v2.h[0]
> + shl v1.8h, v1.8h, #4 // a is in v1.h[7]
> +- ldr q4, intra_m7_to_p8
> +- ldr q5, intra_m7_to_p8 + 16
> ++ adrp x4, intra_m7_to_p8
> ++ add x5, x4, 16
> ++ ldr q4, [x4, #:lo12:intra_m7_to_p8]
> ++ ldr q5, [x5, #:lo12:intra_m7_to_p8]
> + dup v1.8h, v1.h[7]
> + dup v3.8h, v1.h[7]
> + mla v1.8h, v4.8h, v0.h[0]
> Index: multimedia/openh264/patches/patch-codec_encoder_core_arm64_intra_pred_aarch64_neon_S
> ===================================================================
> RCS file: multimedia/openh264/patches/patch-codec_encoder_core_arm64_intra_pred_aarch64_neon_S
> diff -N multimedia/openh264/patches/patch-codec_encoder_core_arm64_intra_pred_aarch64_neon_S
> --- /dev/null 1 Jan 1970 00:00:00 -0000
> +++ multimedia/openh264/patches/patch-codec_encoder_core_arm64_intra_pred_aarch64_neon_S 5 Mar 2024 16:32:14 -0000
> @@ -0,0 +1,63 @@
> +Index: codec/encoder/core/arm64/intra_pred_aarch64_neon.S
> +--- codec/encoder/core/arm64/intra_pred_aarch64_neon.S.orig
> ++++ codec/encoder/core/arm64/intra_pred_aarch64_neon.S
> +@@ -307,9 +307,11 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredDcTop_AArch
> + .endr
> + WELS_ASM_AARCH64_FUNC_END
> +
> ++.rodata
> + .align 4
> + intra_1_to_4: .short 17*1, 17*2, 17*3, 17*4, 17*1, 17*2, 17*3, 17*4
> + intra_m3_to_p4: .short -3, -2, -1, 0, 1, 2, 3, 4
> ++.previous
> +
> + WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredPlane_AArch64_neon
> + SIGN_EXTENSION x2,w2
> +@@ -339,8 +341,10 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredPlane_AArch
> +
> + uxtl v1.8h, v1.8b
> + uxtl v0.8h, v0.8b
> +- ldr q2, intra_1_to_4
> +- ldr q3, intra_m3_to_p4
> ++ adrp x4, intra_1_to_4
> ++ adrp x5, intra_m3_to_p4
> ++ ldr q2, [x4, #:lo12:intra_1_to_4]
> ++ ldr q3, [x5, #:lo12:intra_m3_to_p4]
> + dup v4.8h, v0.h[3]
> + dup v5.8h, v0.h[7]
> + add v4.8h, v4.8h, v5.8h
> +@@ -437,9 +441,11 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredDcLeft_A
> + WELS_ASM_AARCH64_FUNC_END
> +
> +
> ++.rodata
> + .align 4
> + intra_1_to_8: .short 5, 10, 15, 20, 25, 30, 35, 40
> + intra_m7_to_p8: .short -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8
> ++.previous
> + //void WelsI16x16LumaPredPlane_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
> + WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredPlane_AArch64_neon
> + SIGN_EXTENSION x2,w2
> +@@ -473,7 +479,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredPlane_AA
> + uxtl v3.8h, v3.8b
> + sub v0.8h, v1.8h, v0.8h
> + sub v2.8h, v3.8h, v2.8h
> +- ldr q4, intra_1_to_8
> ++ adrp x4, intra_1_to_8
> ++ ldr q4, [x4, #:lo12:intra_1_to_8]
> + mul v0.8h, v0.8h, v4.8h
> + mul v2.8h, v2.8h, v4.8h
> + saddlv s0, v0.8h
> +@@ -482,8 +489,10 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredPlane_AA
> + sqrshrn v0.4h, v0.4S, #6 // b is in v0.h[0]
> + sqrshrn v2.4h, v2.4S, #6 // c is in v2.h[0]
> + shl v1.8h, v1.8h, #4 // a is in v1.h[7]
> +- ldr q4, intra_m7_to_p8
> +- ldr q5, intra_m7_to_p8 + 16
> ++ adrp x4, intra_m7_to_p8
> ++ add x5, x4, 16
> ++ ldr q4, [x4, #:lo12:intra_m7_to_p8]
> ++ ldr q5, [x5, #:lo12:intra_m7_to_p8]
> + dup v1.8h, v1.h[7]
> + dup v3.8h, v1.h[7]
> + mla v1.8h, v4.8h, v0.h[0]
> Index: multimedia/openh264/patches/patch-codec_encoder_core_arm64_svc_motion_estimation_aarch64_neon_S
> ===================================================================
> RCS file: multimedia/openh264/patches/patch-codec_encoder_core_arm64_svc_motion_estimation_aarch64_neon_S
> diff -N multimedia/openh264/patches/patch-codec_encoder_core_arm64_svc_motion_estimation_aarch64_neon_S
> --- /dev/null 1 Jan 1970 00:00:00 -0000
> +++ multimedia/openh264/patches/patch-codec_encoder_core_arm64_svc_motion_estimation_aarch64_neon_S 5 Mar 2024 16:32:14 -0000
> @@ -0,0 +1,28 @@
> +Index: codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
> +--- codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S.orig
> ++++ codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
> +@@ -283,16 +283,21 @@ _hash_assign_loop_x4_rem:
> + _hash_assign_end:
> + WELS_ASM_AARCH64_FUNC_END
> +
> ++.rodata
> + .align 4
> + mv_x_inc_x4: .short 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00
> + mv_y_inc_x4: .short 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00
> + mx_x_offset_x4: .short 0x00, 0x04, 0x08, 0x0c, 0x00, 0x00, 0x00, 0x00
> ++.previous
> +
> + WELS_ASM_AARCH64_FUNC_BEGIN FillQpelLocationByFeatureValue_AArch64_neon
> + // void (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
> +- ldr q7, mv_x_inc_x4
> +- ldr q6, mv_y_inc_x4
> +- ldr q5, mx_x_offset_x4
> ++ adrp x4, mv_x_inc_x4
> ++ adrp x5, mv_y_inc_x4
> ++ adrp x6, mx_x_offset_x4
> ++ ldr q7, [x4, #:lo12:mv_x_inc_x4]
> ++ ldr q6, [x5, #:lo12:mv_y_inc_x4]
> ++ ldr q5, [x6, #:lo12:mx_x_offset_x4]
> + SIGN_EXTENSION x1,w1
> + SIGN_EXTENSION x2,w2
> + eor v4.16b, v4.16b, v4.16b
>