Index | Thread | Search

From:
Klemens Nanni <kn@openbsd.org>
Subject:
Re: multimedia/openh264: Fix arm64 asm
To:
Mark Kettenis <mark.kettenis@xs4all.nl>
Cc:
kettenis@openbsd.org, ports@openbsd.org
Date:
Tue, 5 Mar 2024 16:41:55 +0000

Download raw body.

Thread
On Tue, Mar 05, 2024 at 05:35:34PM +0100, Mark Kettenis wrote:
> This has been broken for a while: the arm64 assembly had no BTI support
> and no X-only (execute-only) support either.
> 
> With this fixed, make test passes.

I have no arm64 machine to test on here, but you did (:
OK kn

> 
> ok?
> 
> 
> Index: multimedia/openh264/Makefile
> ===================================================================
> RCS file: /cvs/ports/multimedia/openh264/Makefile,v
> retrieving revision 1.10
> diff -u -p -r1.10 Makefile
> --- multimedia/openh264/Makefile	21 Feb 2024 11:02:35 -0000	1.10
> +++ multimedia/openh264/Makefile	5 Mar 2024 16:32:14 -0000
> @@ -3,7 +3,7 @@ GH_ACCOUNT =		cisco
>  GH_PROJECT =		openh264
>  GH_TAGNAME =		v2.4.1
>  CATEGORIES =		multimedia
> -REVISION =		0
> +REVISION =		1
>  
>  SHARED_LIBS =		openh264	1.0	# 7.0
>  
> Index: multimedia/openh264/patches/patch-codec_common_arm64_arm_arch64_common_macro_S
> ===================================================================
> RCS file: multimedia/openh264/patches/patch-codec_common_arm64_arm_arch64_common_macro_S
> diff -N multimedia/openh264/patches/patch-codec_common_arm64_arm_arch64_common_macro_S
> --- /dev/null	1 Jan 1970 00:00:00 -0000
> +++ multimedia/openh264/patches/patch-codec_common_arm64_arm_arch64_common_macro_S	5 Mar 2024 16:32:14 -0000
> @@ -0,0 +1,11 @@
> +Index: codec/common/arm64/arm_arch64_common_macro.S
> +--- codec/common/arm64/arm_arch64_common_macro.S.orig
> ++++ codec/common/arm64/arm_arch64_common_macro.S
> +@@ -60,6 +60,7 @@ ret
> + .func \funcName
> + #endif
> + \funcName:
> ++  bti c
> + .endm
> + 
> + .macro WELS_ASM_AARCH64_FUNC_END
> Index: multimedia/openh264/patches/patch-codec_common_arm64_mc_aarch64_neon_S
> ===================================================================
> RCS file: multimedia/openh264/patches/patch-codec_common_arm64_mc_aarch64_neon_S
> diff -N multimedia/openh264/patches/patch-codec_common_arm64_mc_aarch64_neon_S
> --- /dev/null	1 Jan 1970 00:00:00 -0000
> +++ multimedia/openh264/patches/patch-codec_common_arm64_mc_aarch64_neon_S	5 Mar 2024 16:32:14 -0000
> @@ -0,0 +1,64 @@
> +Index: codec/common/arm64/mc_aarch64_neon.S
> +--- codec/common/arm64/mc_aarch64_neon.S.orig
> ++++ codec/common/arm64/mc_aarch64_neon.S
> +@@ -32,8 +32,10 @@
> + 
> + #ifdef HAVE_NEON_AARCH64
> + #include "arm_arch64_common_macro.S"
> ++.rodata
> + .align 4
> + filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
> ++.previous
> + 
> + .macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
> + //  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
> +@@ -1912,7 +1914,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20Width17_AArch64_
> +     mov x5, #16
> +     movi v0.8h, #20, lsl #0
> +     movi v1.8h, #5, lsl #0
> +-    ldr q22, filter_para
> ++    adrp x6, filter_para
> ++    ldr q22, [x6, #:lo12:filter_para]
> + w17_h_mc_luma_loop:
> +     ld1 {v2.16b, v3.16b}, [x0], x1 //only use 22(17+5); v2=src[-2]
> + 
> +@@ -1946,7 +1949,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20Width9_AArch64_n
> +     mov x5, #8
> +     movi v0.8h, #20, lsl #0
> +     movi v1.8h, #5, lsl #0
> +-    ldr q22, filter_para
> ++    adrp x6, filter_para
> ++    ldr q22, [x6, #:lo12:filter_para]
> + w9_h_mc_luma_loop:
> +     ld1 {v2.16b}, [x0], x1 //only use 14(9+5); v2=src[-2]
> +     mov v3.d[0], v2.d[1]
> +@@ -2012,7 +2016,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width17_AArch64_
> +     movi v1.8h, #5, lsl #0
> +     sub x3, x3, #16
> +     mov x5, #16
> +-    ldr q29, filter_para
> ++    adrp x6, filter_para
> ++    ldr q29, [x6, #:lo12:filter_para]
> + 
> +     sub x4, x4, #1
> + 
> +@@ -2215,7 +2220,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width9_AArch64_n
> +     movi v1.8h, #5, lsl #0
> +     sub x3, x3, #8
> +     mov x5, #8
> +-    ldr q29, filter_para
> ++    adrp x6, filter_para
> ++    ldr q29, [x6, #:lo12:filter_para]
> +     sub x4, x4, #1
> + 
> +     //prfm pldl1strm, [x0]
> +@@ -2315,7 +2321,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width5_AArch64_n
> +     movi v1.8h, #5, lsl #0
> +     sub x3, x3, #4
> +     mov x5, #4
> +-    ldr q29, filter_para
> ++    adrp x6, filter_para
> ++    ldr q29, [x6, #:lo12:filter_para]
> +     sub x4, x4, #1
> + 
> +     //prfm pldl1strm, [x0]
> Index: multimedia/openh264/patches/patch-codec_decoder_core_arm64_intra_pred_aarch64_neon_S
> ===================================================================
> RCS file: multimedia/openh264/patches/patch-codec_decoder_core_arm64_intra_pred_aarch64_neon_S
> diff -N multimedia/openh264/patches/patch-codec_decoder_core_arm64_intra_pred_aarch64_neon_S
> --- /dev/null	1 Jan 1970 00:00:00 -0000
> +++ multimedia/openh264/patches/patch-codec_decoder_core_arm64_intra_pred_aarch64_neon_S	5 Mar 2024 16:32:14 -0000
> @@ -0,0 +1,63 @@
> +Index: codec/decoder/core/arm64/intra_pred_aarch64_neon.S
> +--- codec/decoder/core/arm64/intra_pred_aarch64_neon.S.orig
> ++++ codec/decoder/core/arm64/intra_pred_aarch64_neon.S
> +@@ -307,9 +307,11 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderIChromaPredDcTo
> + .endr
> + WELS_ASM_AARCH64_FUNC_END
> + 
> ++.rodata
> + .align 4
> + intra_1_to_4: .short 17*1, 17*2, 17*3, 17*4, 17*1, 17*2, 17*3, 17*4
> + intra_m3_to_p4: .short -3, -2, -1, 0, 1, 2, 3, 4
> ++.previous
> + 
> + WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderIChromaPredPlane_AArch64_neon
> +     sxtw    x1, w1
> +@@ -339,8 +341,10 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderIChromaPredPlan
> + 
> +     uxtl    v1.8h, v1.8b
> +     uxtl    v0.8h, v0.8b
> +-    ldr     q2, intra_1_to_4
> +-    ldr     q3, intra_m3_to_p4
> ++    adrp    x4, intra_1_to_4
> ++    adrp    x5, intra_m3_to_p4
> ++    ldr     q2, [x4, #:lo12:intra_1_to_4]
> ++    ldr     q3, [x5, #:lo12:intra_m3_to_p4]
> +     dup     v4.8h, v0.h[3]
> +     dup     v5.8h, v0.h[7]
> +     add     v4.8h, v4.8h, v5.8h
> +@@ -456,9 +460,11 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredD
> + WELS_ASM_AARCH64_FUNC_END
> + 
> + 
> ++.rodata
> + .align 4
> + intra_1_to_8: .short 5, 10, 15, 20, 25, 30, 35, 40
> + intra_m7_to_p8: .short -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8
> ++.previous
> + 
> + WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_AArch64_neon
> +     sxtw    x1, w1
> +@@ -492,7 +498,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredP
> +     uxtl    v3.8h, v3.8b
> +     sub     v0.8h, v1.8h, v0.8h
> +     sub     v2.8h, v3.8h, v2.8h
> +-    ldr     q4, intra_1_to_8
> ++    adrp    x4, intra_1_to_8
> ++    ldr     q4, [x4, #:lo12:intra_1_to_8]
> +     mul     v0.8h, v0.8h, v4.8h
> +     mul     v2.8h, v2.8h, v4.8h
> +     saddlv  s0, v0.8h
> +@@ -501,8 +508,10 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredP
> +     sqrshrn v0.4h, v0.4S, #6  // b is in v0.h[0]
> +     sqrshrn v2.4h, v2.4S, #6  // c is in v2.h[0]
> +     shl     v1.8h, v1.8h, #4   // a is in v1.h[7]
> +-    ldr     q4, intra_m7_to_p8
> +-    ldr     q5, intra_m7_to_p8 + 16
> ++    adrp    x4, intra_m7_to_p8
> ++    add     x5, x4, 16
> ++    ldr     q4, [x4, #:lo12:intra_m7_to_p8]
> ++    ldr     q5, [x5, #:lo12:intra_m7_to_p8]
> +     dup     v1.8h, v1.h[7]
> +     dup     v3.8h, v1.h[7]
> +     mla     v1.8h, v4.8h, v0.h[0]
> Index: multimedia/openh264/patches/patch-codec_encoder_core_arm64_intra_pred_aarch64_neon_S
> ===================================================================
> RCS file: multimedia/openh264/patches/patch-codec_encoder_core_arm64_intra_pred_aarch64_neon_S
> diff -N multimedia/openh264/patches/patch-codec_encoder_core_arm64_intra_pred_aarch64_neon_S
> --- /dev/null	1 Jan 1970 00:00:00 -0000
> +++ multimedia/openh264/patches/patch-codec_encoder_core_arm64_intra_pred_aarch64_neon_S	5 Mar 2024 16:32:14 -0000
> @@ -0,0 +1,63 @@
> +Index: codec/encoder/core/arm64/intra_pred_aarch64_neon.S
> +--- codec/encoder/core/arm64/intra_pred_aarch64_neon.S.orig
> ++++ codec/encoder/core/arm64/intra_pred_aarch64_neon.S
> +@@ -307,9 +307,11 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredDcTop_AArch
> + .endr
> + WELS_ASM_AARCH64_FUNC_END
> + 
> ++.rodata
> + .align 4
> + intra_1_to_4: .short 17*1, 17*2, 17*3, 17*4, 17*1, 17*2, 17*3, 17*4
> + intra_m3_to_p4: .short -3, -2, -1, 0, 1, 2, 3, 4
> ++.previous
> + 
> + WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredPlane_AArch64_neon
> +     SIGN_EXTENSION x2,w2
> +@@ -339,8 +341,10 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredPlane_AArch
> + 
> +     uxtl    v1.8h, v1.8b
> +     uxtl    v0.8h, v0.8b
> +-    ldr     q2, intra_1_to_4
> +-    ldr     q3, intra_m3_to_p4
> ++    adrp    x4, intra_1_to_4
> ++    adrp    x5, intra_m3_to_p4
> ++    ldr     q2, [x4, #:lo12:intra_1_to_4]
> ++    ldr     q3, [x5, #:lo12:intra_m3_to_p4]
> +     dup     v4.8h, v0.h[3]
> +     dup     v5.8h, v0.h[7]
> +     add     v4.8h, v4.8h, v5.8h
> +@@ -437,9 +441,11 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredDcLeft_A
> + WELS_ASM_AARCH64_FUNC_END
> + 
> + 
> ++.rodata
> + .align 4
> + intra_1_to_8: .short 5, 10, 15, 20, 25, 30, 35, 40
> + intra_m7_to_p8: .short -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8
> ++.previous
> + //void WelsI16x16LumaPredPlane_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
> + WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredPlane_AArch64_neon
> +     SIGN_EXTENSION x2,w2
> +@@ -473,7 +479,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredPlane_AA
> +     uxtl    v3.8h, v3.8b
> +     sub     v0.8h, v1.8h, v0.8h
> +     sub     v2.8h, v3.8h, v2.8h
> +-    ldr     q4, intra_1_to_8
> ++    adrp    x4, intra_1_to_8
> ++    ldr     q4, [x4, #:lo12:intra_1_to_8]
> +     mul     v0.8h, v0.8h, v4.8h
> +     mul     v2.8h, v2.8h, v4.8h
> +     saddlv  s0, v0.8h
> +@@ -482,8 +489,10 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredPlane_AA
> +     sqrshrn v0.4h, v0.4S, #6  // b is in v0.h[0]
> +     sqrshrn v2.4h, v2.4S, #6  // c is in v2.h[0]
> +     shl     v1.8h, v1.8h, #4   // a is in v1.h[7]
> +-    ldr     q4, intra_m7_to_p8
> +-    ldr     q5, intra_m7_to_p8 + 16
> ++    adrp    x4, intra_m7_to_p8
> ++    add     x5, x4, 16
> ++    ldr     q4, [x4, #:lo12:intra_m7_to_p8]
> ++    ldr     q5, [x5, #:lo12:intra_m7_to_p8]
> +     dup     v1.8h, v1.h[7]
> +     dup     v3.8h, v1.h[7]
> +     mla     v1.8h, v4.8h, v0.h[0]
> Index: multimedia/openh264/patches/patch-codec_encoder_core_arm64_svc_motion_estimation_aarch64_neon_S
> ===================================================================
> RCS file: multimedia/openh264/patches/patch-codec_encoder_core_arm64_svc_motion_estimation_aarch64_neon_S
> diff -N multimedia/openh264/patches/patch-codec_encoder_core_arm64_svc_motion_estimation_aarch64_neon_S
> --- /dev/null	1 Jan 1970 00:00:00 -0000
> +++ multimedia/openh264/patches/patch-codec_encoder_core_arm64_svc_motion_estimation_aarch64_neon_S	5 Mar 2024 16:32:14 -0000
> @@ -0,0 +1,28 @@
> +Index: codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
> +--- codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S.orig
> ++++ codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
> +@@ -283,16 +283,21 @@ _hash_assign_loop_x4_rem:
> + _hash_assign_end:
> + WELS_ASM_AARCH64_FUNC_END
> + 
> ++.rodata
> + .align 4
> + mv_x_inc_x4: .short 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00
> + mv_y_inc_x4: .short 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00
> + mx_x_offset_x4: .short 0x00, 0x04, 0x08, 0x0c, 0x00, 0x00, 0x00, 0x00
> ++.previous
> + 
> + WELS_ASM_AARCH64_FUNC_BEGIN FillQpelLocationByFeatureValue_AArch64_neon
> + // void  (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
> +-    ldr q7, mv_x_inc_x4
> +-    ldr q6, mv_y_inc_x4
> +-    ldr q5, mx_x_offset_x4
> ++    adrp x4, mv_x_inc_x4
> ++    adrp x5, mv_y_inc_x4
> ++    adrp x6, mx_x_offset_x4
> ++    ldr q7, [x4, #:lo12:mv_x_inc_x4]
> ++    ldr q6, [x5, #:lo12:mv_y_inc_x4]
> ++    ldr q5, [x6, #:lo12:mx_x_offset_x4]
> +     SIGN_EXTENSION x1,w1
> +     SIGN_EXTENSION x2,w2
> +     eor v4.16b, v4.16b, v4.16b
>