dav1d: add missing IBT landing pads for AVX-512
A few landing pads were missed in the AVX-512 code. There's no chance of
hitting those on OpenBSD since we don't support AVX-512 yet, but I have a
diff for that as well.
Passes regress on a machine with AVX-512 enabled. Allows me to watch
YouTube videos on that machine.
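
For anyone skimming the diff: these routines dispatch on block width
through a computed jump, so every .wN/.hN/.sN label is an indirect-branch
target and needs a landing pad once IBT is enforced. A minimal sketch of
the pattern (illustrative only, not taken verbatim from dav1d; _CET_ENDBR
is the macro the port's existing x86inc patch provides, which I assume
expands to endbr64 on amd64 and to nothing elsewhere):

	add	wq, r6		; wq = address of the .wN entry for this width
	jmp	wq		; indirect jump; with IBT the CPU requires the
				; target to begin with endbr64
    .w4:
	_CET_ENDBR		; landing pad; a no-op where IBT is off
	...			; 4-wide code path continues
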
ok?
Index: multimedia/dav1d/Makefile
===================================================================
RCS file: /cvs/ports/multimedia/dav1d/Makefile,v
retrieving revision 1.37
diff -u -p -r1.37 Makefile
--- multimedia/dav1d/Makefile 27 Sep 2023 10:10:19 -0000 1.37
+++ multimedia/dav1d/Makefile 18 Feb 2024 18:51:06 -0000
@@ -6,7 +6,7 @@ COMMENT= small and fast AV1 decoder
VER= 1.2.1
DISTNAME= dav1d-${VER}
-REVISION= 1
+REVISION= 2
CATEGORIES= multimedia
SITES= https://downloads.videolan.org/pub/videolan/dav1d/${VER}/
EXTRACT_SUFX= .tar.xz
Index: multimedia/dav1d/patches/patch-src_x86_ipred16_avx512_asm
===================================================================
RCS file: multimedia/dav1d/patches/patch-src_x86_ipred16_avx512_asm
diff -N multimedia/dav1d/patches/patch-src_x86_ipred16_avx512_asm
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ multimedia/dav1d/patches/patch-src_x86_ipred16_avx512_asm 18 Feb 2024 18:51:06 -0000
@@ -0,0 +1,203 @@
+Index: src/x86/ipred16_avx512.asm
+--- src/x86/ipred16_avx512.asm.orig
++++ src/x86/ipred16_avx512.asm
+@@ -104,6 +104,7 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl,
+ add wq, r6
+ jmp wq
+ .w4:
++ _CET_ENDBR
+ vpbroadcastq m4, [tlq+2] ; top
+ movsldup m7, [base+ipred_shuf]
+ lea r6, [strideq*3]
+@@ -133,6 +134,7 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl,
+ .w4_end:
+ RET
+ .w8:
++ _CET_ENDBR
+ vbroadcasti32x4 m4, [tlq+2]
+ movsldup m7, [base+ipred_shuf]
+ lea r6, [strideq*3]
+@@ -152,6 +154,7 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl,
+ jg .w8_loop
+ RET
+ .w16:
++ _CET_ENDBR
+ vbroadcasti32x8 m4, [tlq+2]
+ movsldup m7, [base+ipred_shuf]
+ psubw m5, m4, m3
+@@ -168,6 +171,7 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl,
+ jg .w16_loop
+ RET
+ .w32:
++ _CET_ENDBR
+ movu m4, [tlq+2]
+ psubw m5, m4, m3
+ pabsw m6, m5
+@@ -181,6 +185,7 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl,
+ jg .w32_loop
+ RET
+ .w64:
++ _CET_ENDBR
+ movu m4, [tlq+ 2]
+ movu m7, [tlq+66]
+ psubw m5, m4, m3
+@@ -212,6 +217,7 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl
+ lea stride3q, [strideq*3]
+ jmp wq
+ .w4:
++ _CET_ENDBR
+ vpbroadcastq m5, [tlq+2] ; top
+ movsldup m4, [ipred_shuf]
+ psubw m5, m6 ; top - bottom
+@@ -239,6 +245,7 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl
+ .end:
+ RET
+ .w8:
++ _CET_ENDBR
+ vbroadcasti32x4 m5, [tlq+2] ; top
+ movsldup m4, [ipred_shuf]
+ psubw m5, m6 ; top - bottom
+@@ -256,6 +263,7 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl
+ jl .w8_loop
+ RET
+ .w16:
++ _CET_ENDBR
+ vbroadcasti32x8 m5, [tlq+2] ; top
+ movsldup m4, [ipred_shuf]
+ psubw m5, m6 ; top - bottom
+@@ -277,6 +285,7 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl
+ jl .w16_loop
+ RET
+ .w32:
++ _CET_ENDBR
+ movu m5, [tlq+2]
+ psubw m5, m6
+ .w32_loop:
+@@ -295,6 +304,7 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl
+ jl .w32_loop
+ RET
+ .w64:
++ _CET_ENDBR
+ movu m4, [tlq+ 2]
+ movu m5, [tlq+66]
+ psubw m4, m6
+@@ -329,6 +339,7 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl
+ lea wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq]
+ jmp wq
+ .w4:
++ _CET_ENDBR
+ movsldup m4, [base+ipred_shuf]
+ vpbroadcastq m5, [base+smooth_weights_1d_16bpc+4*2]
+ .w4_loop:
+@@ -356,6 +367,7 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl
+ .end:
+ RET
+ .w8:
++ _CET_ENDBR
+ movsldup m4, [base+ipred_shuf]
+ vbroadcasti32x4 m5, [base+smooth_weights_1d_16bpc+8*2]
+ .w8_loop:
+@@ -373,6 +385,7 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl
+ jg .w8_loop
+ RET
+ .w16:
++ _CET_ENDBR
+ movsldup m4, [base+ipred_shuf]
+ vbroadcasti32x8 m5, [base+smooth_weights_1d_16bpc+16*2]
+ .w16_loop:
+@@ -395,6 +408,7 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl
+ jg .w16_loop
+ RET
+ .w32:
++ _CET_ENDBR
+ movu m5, [base+smooth_weights_1d_16bpc+32*2]
+ .w32_loop:
+ vpbroadcastq m3, [tlq+hq-8]
+@@ -415,6 +429,7 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl
+ jg .w32_loop
+ RET
+ .w64:
++ _CET_ENDBR
+ movu m4, [base+smooth_weights_1d_16bpc+64*2]
+ movu m5, [base+smooth_weights_1d_16bpc+64*3]
+ .w64_loop:
+@@ -456,6 +471,7 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl,
+ lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*2]
+ jmp wq
+ .w4:
++ _CET_ENDBR
+ vpbroadcastq m5, [tlq+hq+2]
+ movshdup m3, [base+ipred_shuf]
+ movsldup m4, [base+ipred_shuf]
+@@ -483,6 +499,7 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl,
+ jg .w4_loop
+ RET
+ .w8:
++ _CET_ENDBR
+ vbroadcasti32x4 ym5, [tlq+hq+2]
+ movshdup m6, [base+ipred_shuf]
+ movsldup m7, [base+ipred_shuf]
+@@ -517,6 +534,7 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl,
+ jg .w8_loop
+ RET
+ .w16:
++ _CET_ENDBR
+ pmovzxwd m5, [tlq+hq+2]
+ mova m6, [base+smooth_weights_2d_16bpc+16*4]
+ vpblendmw m5{k1}, m0, m5 ; top, bottom
+@@ -541,6 +559,7 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl,
+ jg .w16_loop
+ RET
+ .w32:
++ _CET_ENDBR
+ pmovzxwd m5, [tlq+hq+ 2]
+ pmovzxwd m6, [tlq+hq+34]
+ mova m7, [base+smooth_weights_2d_16bpc+32*4]
+@@ -574,6 +593,7 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl,
+ jg .w32_loop
+ RET
+ .w64:
++ _CET_ENDBR
+ pmovzxwd m5, [tlq+hq+ 2]
+ pmovzxwd m6, [tlq+hq+34]
+ pmovzxwd m7, [tlq+hq+66]
+@@ -621,6 +641,7 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx
+ lea stride3q, [strideq*3]
+ jmp wq
+ .w4:
++ _CET_ENDBR
+ pmovzxbw ym0, [idxq]
+ add idxq, 16
+ vpermw ym0, ym0, ym3
+@@ -634,6 +655,7 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx
+ jg .w4
+ RET
+ .w8:
++ _CET_ENDBR
+ pmovzxbw m0, [idxq]
+ add idxq, 32
+ vpermw m0, m0, m3
+@@ -646,6 +668,7 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx
+ jg .w8
+ RET
+ .w16:
++ _CET_ENDBR
+ vpermb m1, m2, [idxq]
+ add idxq, 64
+ vpermw m0, m1, m3
+@@ -660,6 +683,7 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx
+ jg .w16
+ RET
+ .w32:
++ _CET_ENDBR
+ vpermb m1, m2, [idxq]
+ add idxq, 64
+ vpermw m0, m1, m3
+@@ -672,6 +696,7 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx
+ jg .w32
+ RET
+ .w64:
++ _CET_ENDBR
+ vpermb m1, m2, [idxq]
+ add idxq, 64
+ vpermw m0, m1, m3
Index: multimedia/dav1d/patches/patch-src_x86_ipred_avx512_asm
===================================================================
RCS file: multimedia/dav1d/patches/patch-src_x86_ipred_avx512_asm
diff -N multimedia/dav1d/patches/patch-src_x86_ipred_avx512_asm
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ multimedia/dav1d/patches/patch-src_x86_ipred_avx512_asm 18 Feb 2024 18:51:06 -0000
@@ -0,0 +1,374 @@
+Index: src/x86/ipred_avx512.asm
+--- src/x86/ipred_avx512.asm.orig
++++ src/x86/ipred_avx512.asm
+@@ -168,18 +168,23 @@ cglobal ipred_dc_left_8bpc, 3, 7, 5, dst, stride, tl,
+ add wq, r5
+ jmp r6
+ .h64:
++ _CET_ENDBR
+ movu ym1, [tlq+32] ; unaligned when jumping here from dc_top
+ vpdpbusd ym0, ym1, ym2
+ .h32:
++ _CET_ENDBR
+ vextracti32x4 xm1, ym0, 1
+ paddd xm0, xm1
+ .h16:
++ _CET_ENDBR
+ punpckhqdq xm1, xm0, xm0
+ paddd xm0, xm1
+ .h8:
++ _CET_ENDBR
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ .h4:
++ _CET_ENDBR
+ vpsrlvd xm0, xmm3
+ lea stride3q, [strideq*3]
+ vpbroadcastb m0, xm0
+@@ -204,10 +209,12 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h,
+ lea stride3q, [strideq*3]
+ jmp r6
+ .h4:
++ _CET_ENDBR
+ movd xmm1, [tlq-4]
+ vpdpbusd xm0, xmm1, xm3
+ jmp wq
+ .w4:
++ _CET_ENDBR
+ movd xmm1, [tlq+1]
+ vpdpbusd xm0, xmm1, xm3
+ cmp hd, 4
+@@ -228,6 +235,7 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h,
+ .w4_end:
+ vpbroadcastb xm0, xmm0
+ .s4:
++ _CET_ENDBR
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm0
+ movd [dstq+strideq*2], xm0
+@@ -237,10 +245,12 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h,
+ jg .s4
+ RET
+ .h8:
++ _CET_ENDBR
+ movq xmm1, [tlq-8]
+ vpdpbusd xm0, xmm1, xm3
+ jmp wq
+ .w8:
++ _CET_ENDBR
+ movq xmm1, [tlq+1]
+ vextracti32x4 xm2, ym0, 1
+ vpdpbusd xm0, xmm1, xm3
+@@ -261,6 +271,7 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h,
+ .w8_end:
+ vpbroadcastb xm0, xmm0
+ .s8:
++ _CET_ENDBR
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm0
+@@ -270,10 +281,12 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h,
+ jg .s8
+ RET
+ .h16:
++ _CET_ENDBR
+ mova xmm1, [tlq-16]
+ vpdpbusd xm0, xmm1, xm3
+ jmp wq
+ .w16:
++ _CET_ENDBR
+ movu xmm1, [tlq+1]
+ vextracti32x4 xm2, ym0, 1
+ vpdpbusd xm0, xmm1, xm3
+@@ -294,6 +307,7 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h,
+ .w16_end:
+ vpbroadcastb xm0, xmm0
+ .s16:
++ _CET_ENDBR
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm0
+ mova [dstq+strideq*2], xm0
+@@ -303,10 +317,12 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h,
+ jg .s16
+ RET
+ .h32:
++ _CET_ENDBR
+ mova ym1, [tlq-32]
+ vpdpbusd ym0, ym1, ym3
+ jmp wq
+ .w32:
++ _CET_ENDBR
+ movu ym1, [tlq+1]
+ vpdpbusd ym0, ym1, ym3
+ vextracti32x4 xm1, ym0, 1
+@@ -326,6 +342,7 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h,
+ .w32_end:
+ vpbroadcastb ym0, xmm0
+ .s32:
++ _CET_ENDBR
+ mova [dstq+strideq*0], ym0
+ mova [dstq+strideq*1], ym0
+ mova [dstq+strideq*2], ym0
+@@ -335,12 +352,14 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h,
+ jg .s32
+ RET
+ .h64:
++ _CET_ENDBR
+ mova ym1, [tlq-64]
+ mova ym2, [tlq-32]
+ vpdpbusd ym0, ym1, ym3
+ vpdpbusd ym0, ym2, ym3
+ jmp wq
+ .w64:
++ _CET_ENDBR
+ movu ym1, [tlq+ 1]
+ movu ym2, [tlq+33]
+ vpdpbusd ym0, ym1, ym3
+@@ -361,6 +380,7 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h,
+ .w64_end:
+ vpbroadcastb m0, xmm0
+ .s64:
++ _CET_ENDBR
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+@@ -401,6 +421,7 @@ cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h,
+ add wq, r6
+ jmp wq
+ .w4:
++ _CET_ENDBR
+ mova xmm1, [base+ipred_h_shuf+16]
+ .w4_loop:
+ movd xmm0, [tlq+hq-4]
+@@ -414,6 +435,7 @@ cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h,
+ jg .w4_loop
+ RET
+ .w8:
++ _CET_ENDBR
+ movsldup xmm2, [base+ipred_h_shuf+16]
+ movshdup xmm3, [base+ipred_h_shuf+16]
+ .w8_loop:
+@@ -429,6 +451,7 @@ cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h,
+ jg .w8_loop
+ RET
+ .w16:
++ _CET_ENDBR
+ movsldup m1, [base+smooth_shuf]
+ .w16_loop:
+ vpbroadcastd m0, [tlq+hq-4]
+@@ -442,6 +465,7 @@ cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h,
+ jg .w16
+ RET
+ .w32:
++ _CET_ENDBR
+ vpbroadcastd ym3, [base+pb_1]
+ vpord m2, m3, [base+pb_2] {1to16}
+ .w32_loop:
+@@ -457,6 +481,7 @@ cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h,
+ jg .w32_loop
+ RET
+ .w64:
++ _CET_ENDBR
+ vpbroadcastd m4, [base+pb_3]
+ vpbroadcastd m5, [base+pb_2]
+ vpbroadcastd m6, [base+pb_1]
+@@ -509,6 +534,7 @@ cglobal ipred_paeth_8bpc, 3, 7, 10, dst, stride, tl, w
+ jmp wq
+ INIT_YMM avx512icl
+ .w4:
++ _CET_ENDBR
+ vpbroadcastd m6, [topq]
+ mova m9, [ipred_h_shuf]
+ psubusb m7, m5, m6
+@@ -536,6 +562,7 @@ INIT_YMM avx512icl
+ RET
+ INIT_ZMM avx512icl
+ .w8:
++ _CET_ENDBR
+ vpbroadcastq m6, [topq]
+ movsldup m9, [smooth_shuf]
+ psubusb m7, m5, m6
+@@ -564,6 +591,7 @@ INIT_ZMM avx512icl
+ .w8_ret:
+ RET
+ .w16:
++ _CET_ENDBR
+ vbroadcasti32x4 m6, [topq]
+ movsldup m9, [smooth_shuf]
+ psubusb m7, m5, m6
+@@ -582,6 +610,7 @@ INIT_ZMM avx512icl
+ jg .w16_loop
+ RET
+ .w32:
++ _CET_ENDBR
+ vbroadcasti32x8 m6, [topq]
+ mova ym9, ym8
+ psubusb m7, m5, m6
+@@ -598,6 +627,7 @@ INIT_ZMM avx512icl
+ jg .w32_loop
+ RET
+ .w64:
++ _CET_ENDBR
+ movu m6, [topq]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+@@ -626,6 +656,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl,
+ lea stride3q, [strideq*3]
+ jmp wq
+ .w4:
++ _CET_ENDBR
+ vpbroadcastd m2, [tlq+1]
+ movshdup m5, [smooth_shuf]
+ mova ym6, [smooth_endA]
+@@ -656,6 +687,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl,
+ .ret:
+ RET
+ .w8:
++ _CET_ENDBR
+ vpbroadcastq m2, [tlq+1]
+ movshdup m5, [smooth_shuf]
+ mova ym6, [smooth_endA]
+@@ -679,6 +711,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl,
+ jl .w8_loop
+ RET
+ .w16:
++ _CET_ENDBR
+ vbroadcasti32x4 m3, [tlq+1]
+ movshdup m6, [smooth_shuf]
+ mova m7, [smooth_endB]
+@@ -707,6 +740,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl,
+ jl .w16_loop
+ RET
+ .w32:
++ _CET_ENDBR
+ vbroadcasti32x8 m3, [tlq+1]
+ movshdup m6, [smooth_shuf]
+ mova m7, [smooth_endB]
+@@ -733,6 +767,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl,
+ jl .w32_loop
+ RET
+ .w64:
++ _CET_ENDBR
+ movu m3, [tlq+1]
+ mova m6, [smooth_endB]
+ punpcklbw m2, m3, m4
+@@ -772,6 +807,7 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl
+ lea stride3q, [strideq*3]
+ jmp wq
+ .w4:
++ _CET_ENDBR
+ movsldup m3, [smooth_shuf]
+ vpbroadcastq m7, [smooth_weights+4*2]
+ mova ym8, [smooth_endA]
+@@ -802,6 +838,7 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl
+ .ret:
+ RET
+ .w8:
++ _CET_ENDBR
+ movsldup m3, [smooth_shuf]
+ vbroadcasti32x4 m7, [smooth_weights+8*2]
+ mova ym8, [smooth_endA]
+@@ -825,6 +862,7 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl
+ jg .w8_loop
+ RET
+ .w16:
++ _CET_ENDBR
+ movsldup m7, [smooth_shuf]
+ vbroadcasti32x4 m8, [smooth_weights+16*2]
+ vbroadcasti32x4 m9, [smooth_weights+16*3]
+@@ -850,6 +888,7 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl
+ jg .w16_loop
+ RET
+ .w32:
++ _CET_ENDBR
+ mova m10, [smooth_endA]
+ vpbroadcastd ym7, [pb_1]
+ vbroadcasti32x8 m8, [smooth_weights+32*2]
+@@ -874,6 +913,7 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl
+ jg .w32_loop
+ RET
+ .w64:
++ _CET_ENDBR
+ mova m7, [smooth_weights+64*2]
+ mova m8, [smooth_weights+64*3]
+ mova m9, [smooth_endA]
+@@ -912,6 +952,7 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl,
+ lea stride3q, [strideq*3]
+ jmp wq
+ .w4:
++ _CET_ENDBR
+ vpbroadcastd m8, [tlq+hq+1]
+ movsldup m4, [smooth_shuf]
+ movshdup m5, [smooth_shuf]
+@@ -954,6 +995,7 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl,
+ .ret:
+ RET
+ .w8:
++ _CET_ENDBR
+ vpbroadcastq m8, [tlq+hq+1]
+ movsldup m4, [smooth_shuf]
+ movshdup m5, [smooth_shuf]
+@@ -988,6 +1030,7 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl,
+ jg .w8_loop
+ RET
+ .w16:
++ _CET_ENDBR
+ vbroadcasti32x4 m9, [tlq+hq+1]
+ movsldup m5, [smooth_shuf]
+ movshdup m10, [smooth_shuf]
+@@ -1031,6 +1074,7 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl,
+ jg .w16_loop
+ RET
+ .w32:
++ _CET_ENDBR
+ vbroadcasti32x8 m9, [tlq+hq+1]
+ movshdup m10, [smooth_shuf]
+ mova m12, [smooth_weights+32*2]
+@@ -1073,6 +1117,7 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl,
+ jg .w32_loop
+ RET
+ .w64:
++ _CET_ENDBR
+ movu m9, [tlq+hq+1]
+ mova m11, [smooth_weights+64*2]
+ mova m2, [smooth_weights+64*3]
+@@ -1122,6 +1167,7 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx,
+ lea stride3q, [strideq*3]
+ jmp wq
+ .w4:
++ _CET_ENDBR
+ pshufb xmm0, xm4, [idxq]
+ add idxq, 16
+ movd [dstq+strideq*0], xmm0
+@@ -1133,6 +1179,7 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx,
+ jg .w4
+ RET
+ .w8:
++ _CET_ENDBR
+ pshufb xmm0, xm4, [idxq+16*0]
+ pshufb xmm1, xm4, [idxq+16*1]
+ add idxq, 16*2
+@@ -1145,6 +1192,7 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx,
+ jg .w8
+ RET
+ .w16:
++ _CET_ENDBR
+ pshufb m0, m4, [idxq]
+ add idxq, 64
+ mova [dstq+strideq*0], xm0
+@@ -1156,6 +1204,7 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx,
+ jg .w16
+ RET
+ .w32:
++ _CET_ENDBR
+ pshufb m0, m4, [idxq+64*0]
+ pshufb m1, m4, [idxq+64*1]
+ add idxq, 64*2
+@@ -1168,6 +1217,7 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx,
+ jg .w32
+ RET
+ .w64:
++ _CET_ENDBR
+ pshufb m0, m4, [idxq+64*0]
+ pshufb m1, m4, [idxq+64*1]
+ pshufb m2, m4, [idxq+64*2]
Index: multimedia/dav1d/patches/patch-src_x86_itx_avx512_asm
===================================================================
RCS file: /cvs/ports/multimedia/dav1d/patches/patch-src_x86_itx_avx512_asm,v
retrieving revision 1.1
diff -u -p -r1.1 patch-src_x86_itx_avx512_asm
--- multimedia/dav1d/patches/patch-src_x86_itx_avx512_asm 13 Jul 2023 12:36:36 -0000 1.1
+++ multimedia/dav1d/patches/patch-src_x86_itx_avx512_asm 18 Feb 2024 18:51:06 -0000
@@ -49,7 +49,15 @@ Index: src/x86/itx_avx512.asm
vextracti32x4 xm2, m0, 1
vextracti32x4 xm3, m1, 1
pshufd xm4, xm0, q1032
-@@ -818,6 +824,7 @@ cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, str
+@@ -787,6 +793,7 @@ cglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, str
+ punpckhwd m1, m3
+ jmp tx2q
+ .pass2:
++ _CET_ENDBR
+ vextracti32x4 xm2, m0, 1
+ vextracti32x4 xm3, m1, 1
+ pshufd xm4, xm0, q1032
+@@ -818,6 +825,7 @@ cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, str
vextracti32x8 ym1, m0, 1
jmp tx2q
.pass2:
@@ -57,7 +65,7 @@ Index: src/x86/itx_avx512.asm
vpbroadcastd ym4, [o(pw_4096)]
jmp m(iadst_4x8_internal_8bpc).end2
-@@ -935,6 +942,7 @@ cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride,
+@@ -935,6 +943,7 @@ cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride,
pmulhrsw m1, m4
jmp tx2q
.pass2:
@@ -65,7 +73,7 @@ Index: src/x86/itx_avx512.asm
vextracti32x4 xm2, ym0, 1
vextracti32x4 xm3, ym1, 1
vextracti32x4 xm4, m0, 2
-@@ -975,6 +983,7 @@ cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride
+@@ -975,6 +984,7 @@ cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride
punpcklwd m0, m2
jmp tx2q
.pass2:
@@ -73,7 +81,7 @@ Index: src/x86/itx_avx512.asm
call .main
vpbroadcastd m5, [o(pw_2048)]
psrlq m10, 4
-@@ -1082,6 +1091,7 @@ cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, st
+@@ -1082,6 +1092,7 @@ cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, st
punpckhwd m1, m2
jmp tx2q
.pass2:
@@ -81,7 +89,7 @@ Index: src/x86/itx_avx512.asm
call m(iadst_4x16_internal_8bpc).main
vpbroadcastd m6, [o(pw_2048)]
psrlq m10, 12
-@@ -1109,6 +1119,7 @@ cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, st
+@@ -1109,6 +1120,7 @@ cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, st
punpckhdq m1, m2
jmp tx2q
.pass2:
@@ -89,7 +97,7 @@ Index: src/x86/itx_avx512.asm
vpbroadcastd m3, [o(pw_1697x16)]
vpbroadcastd m5, [o(pw_2048)]
pmulhrsw m2, m3, m0
-@@ -1181,6 +1192,7 @@ cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride,
+@@ -1181,6 +1193,7 @@ cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride,
pshufb m1, m4
jmp tx2q
.pass2:
@@ -97,7 +105,7 @@ Index: src/x86/itx_avx512.asm
IDCT4_1D_PACKED
vpermq m0, m0, q3120
vpermq m1, m1, q2031
-@@ -1210,6 +1222,7 @@ cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride,
+@@ -1210,6 +1223,7 @@ cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride,
punpcklwd m0, m3
jmp tx2q
.pass2:
@@ -105,7 +113,7 @@ Index: src/x86/itx_avx512.asm
call .main
.end:
vpermq m0, m0, q3120
-@@ -1253,6 +1266,7 @@ cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, str
+@@ -1253,6 +1267,7 @@ cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, str
punpcklwd m0, m3
jmp tx2q
.pass2:
@@ -113,7 +121,7 @@ Index: src/x86/itx_avx512.asm
call m(iadst_8x4_internal_8bpc).main
mova m2, m1
vpermq m1, m0, q2031
-@@ -1280,6 +1294,7 @@ cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, str
+@@ -1280,6 +1295,7 @@ cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, str
paddsw m1, m1
jmp tx2q
.pass2:
@@ -121,7 +129,7 @@ Index: src/x86/itx_avx512.asm
vpbroadcastd m3, [o(pw_1697x8)]
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
-@@ -1349,6 +1364,7 @@ cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride,
+@@ -1349,6 +1365,7 @@ cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride,
vshufi32x4 m3, m5, m3, 0x03
jmp tx2q
.pass2:
@@ -129,7 +137,7 @@ Index: src/x86/itx_avx512.asm
call .main
vpbroadcastd m4, [o(pw_2048)]
vpermq m0, m0, q3120
-@@ -1388,6 +1404,7 @@ cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride,
+@@ -1388,6 +1405,7 @@ cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride,
vinserti32x4 m1, m4, xm1, 1
jmp tx2q
.pass2:
@@ -137,7 +145,7 @@ Index: src/x86/itx_avx512.asm
pshufd m4, m0, q1032
pshufd m5, m1, q1032
call .main_pass2
-@@ -1455,6 +1472,7 @@ cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, str
+@@ -1455,6 +1473,7 @@ cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, str
vshufi32x4 m2, m4, m2, 0x03
jmp tx2q
.pass2:
@@ -145,7 +153,7 @@ Index: src/x86/itx_avx512.asm
pshufd m4, m0, q1032
pshufd m5, m1, q1032
call m(iadst_8x8_internal_8bpc).main_pass2
-@@ -1493,6 +1511,7 @@ cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, str
+@@ -1493,6 +1512,7 @@ cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, str
punpckhdq m3, m4
jmp tx2q
.pass2:
@@ -153,7 +161,7 @@ Index: src/x86/itx_avx512.asm
vpbroadcastd m4, [o(pw_4096)]
jmp m(iadst_8x8_internal_8bpc).end
-@@ -1553,6 +1572,7 @@ cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride,
+@@ -1553,6 +1573,7 @@ cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride,
punpckhdq m3, m4 ; 3 7 11 15
jmp tx2q
.pass2:
@@ -161,7 +169,7 @@ Index: src/x86/itx_avx512.asm
vprord m5, [o(int16_perm)], 16
vshufi32x4 m2, m2, q1320 ; 2 10 14 6
vshufi32x4 m4, m1, m3, q2310 ; 1 5 15 11
-@@ -1686,6 +1706,7 @@ cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride
+@@ -1686,6 +1707,7 @@ cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride
punpckhqdq m3, m5
jmp tx2q
.pass2:
@@ -169,7 +177,7 @@ Index: src/x86/itx_avx512.asm
call .main_pass2
vpbroadcastd m6, [o(pw_2048)]
psrlq m10, 4
-@@ -1794,6 +1815,7 @@ cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, st
+@@ -1794,6 +1816,7 @@ cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, st
pshufb m2, m1, m6 ; e0 f0 e1 f1 e2 f2 e3 f3
jmp m(iadst_8x16_internal_8bpc).pass1_end
.pass2:
@@ -177,7 +185,7 @@ Index: src/x86/itx_avx512.asm
call m(iadst_8x16_internal_8bpc).main_pass2
vpbroadcastd m7, [o(pw_2048)]
psrlq m10, 36
-@@ -1823,6 +1845,7 @@ cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, st
+@@ -1823,6 +1846,7 @@ cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, st
punpckhqdq m3, m4 ; a3 b3 c3 d3 e3 f3 g3 h3
jmp tx2q
.pass2:
@@ -185,7 +193,7 @@ Index: src/x86/itx_avx512.asm
vpbroadcastd m7, [o(pw_1697x16)]
mova ym8, [o(gather8b)]
lea r3, [dstq+strideq*2]
-@@ -1897,6 +1920,7 @@ cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride,
+@@ -1897,6 +1921,7 @@ cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride,
punpcklwd m0, m2
jmp tx2q
.pass2:
@@ -193,7 +201,7 @@ Index: src/x86/itx_avx512.asm
IDCT4_1D_PACKED
mova m2, [o(permA)]
jmp m(iadst_16x4_internal_8bpc).end
-@@ -1936,6 +1960,7 @@ cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride
+@@ -1936,6 +1961,7 @@ cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride
pmulhrsw m1, m6
jmp tx2q
.pass2:
@@ -201,7 +209,7 @@ Index: src/x86/itx_avx512.asm
call .main
movu m2, [o(permA+1)]
.end:
-@@ -1986,6 +2011,7 @@ cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, st
+@@ -1986,6 +2012,7 @@ cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, st
psrlq m10, 16
jmp m(iadst_16x4_internal_8bpc).pass1_end
.pass2:
@@ -209,7 +217,7 @@ Index: src/x86/itx_avx512.asm
call m(iadst_16x4_internal_8bpc).main
movu m2, [o(permA+2)]
jmp m(iadst_16x4_internal_8bpc).end
-@@ -2013,6 +2039,7 @@ cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, st
+@@ -2013,6 +2040,7 @@ cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, st
vpermb m1, m5, m1
jmp tx2q
.pass2:
@@ -217,7 +225,7 @@ Index: src/x86/itx_avx512.asm
vpbroadcastd m3, [o(pw_1697x8)]
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
-@@ -2112,6 +2139,7 @@ cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride,
+@@ -2112,6 +2140,7 @@ cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride,
punpckhdq m5, m6, m7 ; i2 j2 k2 l2 i3 j3 k3 l3
jmp tx2q
.pass2:
@@ -225,7 +233,7 @@ Index: src/x86/itx_avx512.asm
vshufi32x4 m0, m2, m4, q2020 ; 0 1
vshufi32x4 m2, m4, q3131 ; 4 5
vshufi32x4 m1, m3, m5, q2020 ; 2 3
-@@ -2211,6 +2239,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride
+@@ -2211,6 +2240,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride
REPX {pmulhrsw x, m7}, m2, m3, m4, m5
jmp tx2q
.pass2:
@@ -233,7 +241,7 @@ Index: src/x86/itx_avx512.asm
vshufi32x4 m0, m2, m4, q2020
vshufi32x4 m2, m4, q3131 ; 4 5
vshufi32x4 m1, m3, m5, q2020
-@@ -2265,6 +2294,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, st
+@@ -2265,6 +2295,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, st
psrlq m10, 20
jmp m(iadst_16x8_internal_8bpc).pass1_end
.pass2:
@@ -241,7 +249,7 @@ Index: src/x86/itx_avx512.asm
vshufi32x4 m0, m2, m4, q2020
vshufi32x4 m2, m4, q3131 ; 4 5
vshufi32x4 m1, m3, m5, q2020
-@@ -2314,6 +2344,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, st
+@@ -2314,6 +2345,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, st
REPX {vpermb x, m9, x}, m2, m3, m4, m5
jmp tx2q
.pass2:
@@ -249,7 +257,7 @@ Index: src/x86/itx_avx512.asm
mova m7, [o(permB)]
vpbroadcastd m6, [o(pw_4096)]
vpermq m0, m7, m2
-@@ -2373,6 +2404,7 @@ cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride
+@@ -2373,6 +2405,7 @@ cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride
punpckldq m6, m11
jmp tx2q
.pass2:
@@ -257,7 +265,7 @@ Index: src/x86/itx_avx512.asm
vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc
vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4
vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec
-@@ -2538,6 +2570,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, strid
+@@ -2538,6 +2571,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, strid
REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
jmp tx2q
.pass2:
@@ -265,7 +273,7 @@ Index: src/x86/itx_avx512.asm
call .main_pass2
mova m10, [o(permD)]
psrlq m8, m10, 8
-@@ -2720,6 +2753,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, s
+@@ -2720,6 +2754,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, s
punpckhwd m5, m8, m9 ; i2 j2 k2 l2 i3 j3 k3 l3
jmp m(iadst_16x16_internal_8bpc).pass1_end
.pass2:
@@ -273,7 +281,7 @@ Index: src/x86/itx_avx512.asm
call m(iadst_16x16_internal_8bpc).main_pass2
mova m10, [o(permD)]
psrlq m8, m10, 8
-@@ -2789,6 +2823,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, s
+@@ -2789,6 +2824,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, s
jmp tx2q
ALIGN function_align
.pass2:
@@ -281,7 +289,7 @@ Index: src/x86/itx_avx512.asm
vpbroadcastd m11, [o(pw_1697x16)]
pmulhrsw m12, m11, m0
pmulhrsw m13, m11, m1
-@@ -3131,6 +3166,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst,
+@@ -3131,6 +3167,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst,
call m(idct_8x16_internal_8bpc).main
call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
.pass2:
@@ -289,7 +297,7 @@ Index: src/x86/itx_avx512.asm
vpbroadcastd m10, [o(pw_8192)]
vpermt2q m0, m15, m4 ; t0 t1 t9 t8
vpermt2q m20, m15, m18 ; t31 t30a t23a t22
-@@ -3586,6 +3622,7 @@ cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst
+@@ -3586,6 +3623,7 @@ cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst
punpckhwd m17, m17
call .main_oddhalf_fast
.pass2:
@@ -297,7 +305,7 @@ Index: src/x86/itx_avx512.asm
vpbroadcastd m10, [o(pw_2048)]
mova m11, [o(end_16x32p)]
lea r3, [strideq*3]
-@@ -3798,6 +3835,7 @@ cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst
+@@ -3798,6 +3836,7 @@ cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst
punpckhwd m17, m17 ; 15
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
.pass2:
@@ -305,7 +313,7 @@ Index: src/x86/itx_avx512.asm
vpbroadcastd m9, [o(pw_16384)]
call .transpose_round
vshufi32x4 m16, m14, m2, q3131 ; 5
-@@ -5683,6 +5721,7 @@ ALIGN function_align
+@@ -5683,6 +5722,7 @@ ALIGN function_align
vinserti32x8 m17, ym21, 1 ; c30 c31 d30 d31
ret
.pass2:
Index: multimedia/dav1d/patches/patch-src_x86_mc16_avx512_asm
===================================================================
RCS file: multimedia/dav1d/patches/patch-src_x86_mc16_avx512_asm
diff -N multimedia/dav1d/patches/patch-src_x86_mc16_avx512_asm
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ multimedia/dav1d/patches/patch-src_x86_mc16_avx512_asm 18 Feb 2024 18:51:06 -0000
@@ -0,0 +1,867 @@
+Index: src/x86/mc16_avx512.asm
+--- src/x86/mc16_avx512.asm.orig
++++ src/x86/mc16_avx512.asm
+@@ -276,6 +276,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ add t0, r7
+ jmp t0
+ .put_w2:
++ _CET_ENDBR
+ mov r6d, [srcq+ssq*0]
+ mov r7d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+@@ -286,6 +287,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ jg .put_w2
+ RET
+ .put_w4:
++ _CET_ENDBR
+ mov r6, [srcq+ssq*0]
+ mov r7, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+@@ -296,6 +298,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ jg .put_w4
+ RET
+ .put_w8:
++ _CET_ENDBR
+ movu xmm0, [srcq+ssq*0]
+ movu xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+@@ -306,6 +309,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ jg .put_w8
+ RET
+ .put_w16:
++ _CET_ENDBR
+ movu ym0, [srcq+ssq*0]
+ movu ym1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+@@ -316,6 +320,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ jg .put_w16
+ RET
+ .put_w32:
++ _CET_ENDBR
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+@@ -326,6 +331,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ jg .put_w32
+ RET
+ .put_w64:
++ _CET_ENDBR
+ movu m0, [srcq+ssq*0+64*0]
+ movu m1, [srcq+ssq*0+64*1]
+ movu m2, [srcq+ssq*1+64*0]
+@@ -340,6 +346,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ jg .put_w64
+ RET
+ .put_w128:
++ _CET_ENDBR
+ movu m0, [srcq+64*0]
+ movu m1, [srcq+64*1]
+ movu m2, [srcq+64*2]
+@@ -368,6 +375,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ vpbroadcastd m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4]
+ jmp t0
+ .h_w2:
++ _CET_ENDBR
+ movq xmm1, [srcq+ssq*0]
+ movhps xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+@@ -384,6 +392,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ jg .h_w2
+ RET
+ .h_w4:
++ _CET_ENDBR
+ movq xmm0, [srcq+ssq*0+0]
+ movhps xmm0, [srcq+ssq*1+0]
+ movq xmm1, [srcq+ssq*0+2]
+@@ -401,6 +410,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ jg .h_w4
+ RET
+ .h_w8:
++ _CET_ENDBR
+ movu xm0, [srcq+ssq*0+0]
+ vinserti32x4 ym0, [srcq+ssq*1+0], 1
+ movu xm1, [srcq+ssq*0+2]
+@@ -418,6 +428,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ jg .h_w8
+ RET
+ .h_w16:
++ _CET_ENDBR
+ movu ym0, [srcq+ssq*0+0]
+ vinserti32x8 m0, [srcq+ssq*1+0], 1
+ movu ym1, [srcq+ssq*0+2]
+@@ -435,6 +446,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ jg .h_w16
+ RET
+ .h_w32:
++ _CET_ENDBR
+ pmullw m0, m4, [srcq+ssq*0+0]
+ pmullw m2, m5, [srcq+ssq*0+2]
+ pmullw m1, m4, [srcq+ssq*1+0]
+@@ -453,6 +465,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ jg .h_w32
+ RET
+ .h_w64:
++ _CET_ENDBR
+ pmullw m0, m4, [srcq+64*0+0]
+ pmullw m2, m5, [srcq+64*0+2]
+ pmullw m1, m4, [srcq+64*1+0]
+@@ -471,6 +484,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ jg .h_w64
+ RET
+ .h_w128:
++ _CET_ENDBR
+ pmullw m0, m4, [srcq+64*0+0]
+ pmullw m7, m5, [srcq+64*0+2]
+ pmullw m1, m4, [srcq+64*1+0]
+@@ -501,6 +515,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ add t0, r7
+ jmp t0
+ .v_w2:
++ _CET_ENDBR
+ movd xmm0, [srcq+ssq*0]
+ .v_w2_loop:
+ movd xmm1, [srcq+ssq*1]
+@@ -518,6 +533,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ jg .v_w2_loop
+ RET
+ .v_w4:
++ _CET_ENDBR
+ movq xmm0, [srcq+ssq*0]
+ .v_w4_loop:
+ movq xmm1, [srcq+ssq*1]
+@@ -535,6 +551,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ jg .v_w4_loop
+ RET
+ .v_w8:
++ _CET_ENDBR
+ movu xmm0, [srcq+ssq*0]
+ .v_w8_loop:
+ vbroadcasti128 ymm1, [srcq+ssq*1]
+@@ -553,6 +570,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ vzeroupper
+ RET
+ .v_w16:
++ _CET_ENDBR
+ movu ym0, [srcq+ssq*0]
+ .v_w16_loop:
+ movu ym3, [srcq+ssq*1]
+@@ -571,6 +589,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ jg .v_w16_loop
+ RET
+ .v_w32:
++ _CET_ENDBR
+ movu m0, [srcq+ssq*0]
+ .v_w32_loop:
+ movu m3, [srcq+ssq*1]
+@@ -589,6 +608,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ jg .v_w32_loop
+ RET
+ .v_w64:
++ _CET_ENDBR
+ movu m0, [srcq+ssq*0+64*0]
+ movu m1, [srcq+ssq*0+64*1]
+ .v_w64_loop:
+@@ -618,6 +638,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ jg .v_w64_loop
+ RET
+ .v_w128:
++ _CET_ENDBR
+ movu m0, [srcq+ssq*0+64*0]
+ movu m1, [srcq+ssq*0+64*1]
+ movu m2, [srcq+ssq*0+64*2]
+@@ -683,6 +704,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ .hv_12bpc:
+ jmp t0
+ .hv_w2:
++ _CET_ENDBR
+ vpbroadcastq xmm1, [srcq+ssq*0]
+ pmullw xmm0, xmm1, xm4
+ psrlq xmm1, 16
+@@ -714,6 +736,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ jg .hv_w2_loop
+ RET
+ .hv_w4:
++ _CET_ENDBR
+ pmullw xmm0, xm4, [srcq+ssq*0-8]
+ pmullw xmm1, xm5, [srcq+ssq*0-6]
+ paddw xmm0, xm6
+@@ -744,6 +767,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ jg .hv_w4_loop
+ RET
+ .hv_w8:
++ _CET_ENDBR
+ pmullw xmm0, xm4, [srcq+ssq*0+0]
+ pmullw xmm1, xm5, [srcq+ssq*0+2]
+ paddw xmm0, xm6
+@@ -775,6 +799,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ jg .hv_w8_loop
+ RET
+ .hv_w16:
++ _CET_ENDBR
+ pmullw ym0, ym4, [srcq+ssq*0+0]
+ pmullw ym1, ym5, [srcq+ssq*0+2]
+ paddw ym0, ym6
+@@ -808,6 +833,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w
+ .hv_w32:
+ .hv_w64:
+ .hv_w128:
++ _CET_ENDBR
+ movifnidn wd, wm
+ lea r6d, [hq+wq*8-256]
+ mov r4, srcq
+@@ -874,6 +900,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ lea stride3q, [strideq*3]
+ jmp wq
+ .prep_w4:
++ _CET_ENDBR
+ movq xmm0, [srcq+strideq*0]
+ movhps xmm0, [srcq+strideq*1]
+ vpbroadcastq ymm1, [srcq+strideq*2]
+@@ -890,6 +917,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ vzeroupper
+ RET
+ .prep_w8:
++ _CET_ENDBR
+ movu xm0, [srcq+strideq*0]
+ vinserti32x4 ym0, [srcq+strideq*1], 1
+ vinserti32x4 m0, [srcq+strideq*2], 2
+@@ -903,6 +931,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ jg .prep_w8
+ RET
+ .prep_w16:
++ _CET_ENDBR
+ movu ym0, [srcq+strideq*0]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ movu ym1, [srcq+strideq*2]
+@@ -919,6 +948,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ jg .prep_w16
+ RET
+ .prep_w32:
++ _CET_ENDBR
+ pmullw m0, m4, [srcq+strideq*0]
+ pmullw m1, m4, [srcq+strideq*1]
+ pmullw m2, m4, [srcq+strideq*2]
+@@ -934,6 +964,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ jg .prep_w32
+ RET
+ .prep_w64:
++ _CET_ENDBR
+ pmullw m0, m4, [srcq+strideq*0+64*0]
+ pmullw m1, m4, [srcq+strideq*0+64*1]
+ pmullw m2, m4, [srcq+strideq*1+64*0]
+@@ -949,6 +980,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ jg .prep_w64
+ RET
+ .prep_w128:
++ _CET_ENDBR
+ pmullw m0, m4, [srcq+64*0]
+ pmullw m1, m4, [srcq+64*1]
+ pmullw m2, m4, [srcq+64*2]
+@@ -981,6 +1013,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ lea stride3q, [strideq*3]
+ jmp wq
+ .h_w4:
++ _CET_ENDBR
+ movu xm1, [srcq+strideq*0]
+ vinserti32x4 ym1, [srcq+strideq*2], 1
+ movu xm2, [srcq+strideq*1]
+@@ -1001,6 +1034,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ jg .h_w4
+ RET
+ .h_w8:
++ _CET_ENDBR
+ movu xm0, [srcq+strideq*0+0]
+ movu xm1, [srcq+strideq*0+2]
+ vinserti32x4 ym0, [srcq+strideq*1+0], 1
+@@ -1021,6 +1055,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ jg .h_w8
+ RET
+ .h_w16:
++ _CET_ENDBR
+ movu ym0, [srcq+strideq*0+0]
+ vinserti32x8 m0, [srcq+strideq*1+0], 1
+ movu ym1, [srcq+strideq*0+2]
+@@ -1037,6 +1072,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ jg .h_w16
+ RET
+ .h_w32:
++ _CET_ENDBR
+ pmullw m0, m4, [srcq+strideq*0+0]
+ pmullw m2, m5, [srcq+strideq*0+2]
+ pmullw m1, m4, [srcq+strideq*1+0]
+@@ -1055,6 +1091,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ jg .h_w32
+ RET
+ .h_w64:
++ _CET_ENDBR
+ pmullw m0, m4, [srcq+ 0]
+ pmullw m2, m5, [srcq+ 2]
+ pmullw m1, m4, [srcq+64]
+@@ -1073,6 +1110,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ jg .h_w64
+ RET
+ .h_w128:
++ _CET_ENDBR
+ pmullw m0, m4, [srcq+ 0]
+ pmullw m7, m5, [srcq+ 2]
+ pmullw m1, m4, [srcq+ 64]
+@@ -1111,6 +1149,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ .v_12bpc:
+ jmp wq
+ .v_w4:
++ _CET_ENDBR
+ movq xmm0, [srcq+strideq*0]
+ .v_w4_loop:
+ vpbroadcastq xmm2, [srcq+strideq*1]
+@@ -1134,6 +1173,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ vzeroupper
+ RET
+ .v_w8:
++ _CET_ENDBR
+ movu xm0, [srcq+strideq*0]
+ .v_w8_loop:
+ vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
+@@ -1153,6 +1193,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ jg .v_w8_loop
+ RET
+ .v_w16:
++ _CET_ENDBR
+ movu ym0, [srcq+strideq*0]
+ .v_w16_loop:
+ vinserti32x8 m1, m0, [srcq+strideq*1], 1 ; 0 1
+@@ -1179,6 +1220,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ jg .v_w16_loop
+ RET
+ .v_w32:
++ _CET_ENDBR
+ movu m0, [srcq+strideq*0]
+ .v_w32_loop:
+ movu m3, [srcq+strideq*1]
+@@ -1201,6 +1243,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ jg .v_w32_loop
+ RET
+ .v_w64:
++ _CET_ENDBR
+ movu m0, [srcq+64*0]
+ movu m1, [srcq+64*1]
+ .v_w64_loop:
+@@ -1224,6 +1267,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ jg .v_w64_loop
+ RET
+ .v_w128:
++ _CET_ENDBR
+ movu m0, [srcq+64*0]
+ movu m1, [srcq+64*1]
+ movu m2, [srcq+64*2]
+@@ -1264,6 +1308,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ lea stride3q, [strideq*3]
+ jmp wq
+ .hv_w4:
++ _CET_ENDBR
+ movq xmm0, [srcq+strideq*0+0]
+ movq xmm1, [srcq+strideq*0+2]
+ pmullw xmm0, xm4
+@@ -1298,6 +1343,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ jg .hv_w4_loop
+ RET
+ .hv_w8:
++ _CET_ENDBR
+ pmullw xm0, xm4, [srcq+strideq*0+0]
+ pmullw xm1, xm5, [srcq+strideq*0+2]
+ psubw xm0, xm6
+@@ -1330,6 +1376,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ jg .hv_w8_loop
+ RET
+ .hv_w16:
++ _CET_ENDBR
+ pmullw ym0, ym4, [srcq+strideq*0+0]
+ pmullw ym1, ym5, [srcq+strideq*0+2]
+ psubw ym0, ym6
+@@ -1358,6 +1405,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ jg .hv_w16_loop
+ RET
+ .hv_w32:
++ _CET_ENDBR
+ pmullw m0, m4, [srcq+strideq*0+0]
+ pmullw m1, m5, [srcq+strideq*0+2]
+ psubw m0, m6
+@@ -1388,6 +1436,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ jg .hv_w32_loop
+ RET
+ .hv_w64:
++ _CET_ENDBR
+ pmullw m0, m4, [srcq+ 0]
+ pmullw m2, m5, [srcq+ 2]
+ pmullw m1, m4, [srcq+64]
+@@ -1425,6 +1474,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride,
+ jg .hv_w64_loop
+ RET
+ .hv_w128:
++ _CET_ENDBR
+ pmullw m0, m4, [srcq+ 0]
+ pmullw m8, m5, [srcq+ 2]
+ pmullw m1, m4, [srcq+ 64]
+@@ -1534,6 +1584,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w,
+ %endif
+ jmp wq
+ .h_w2:
++ _CET_ENDBR
+ movzx mxd, mxb
+ sub srcq, 2
+ mova ym2, [spel_h_shuf2a]
+@@ -1559,6 +1610,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w,
+ jg .h_w2_loop
+ RET
+ .h_w4:
++ _CET_ENDBR
+ movzx mxd, mxb
+ sub srcq, 2
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+@@ -1608,6 +1660,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w,
+ je .h_w16
+ jg .h_w32
+ .h_w8:
++ _CET_ENDBR
+ mova m4, [spel_h_shufA]
+ movu m5, [spel_h_shufB]
+ movu m6, [spel_h_shufC]
+@@ -1636,6 +1689,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w,
+ jg .h_w8_loop
+ RET
+ .h_w16:
++ _CET_ENDBR
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+ .h_w16_loop:
+@@ -1672,6 +1726,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w,
+ jg .h_w16_loop
+ RET
+ .h_w32:
++ _CET_ENDBR
+ lea srcq, [srcq+wq*2]
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ lea dstq, [dstq+wq*2]
+@@ -1731,6 +1786,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w,
+ vpbroadcastd m15, [rsp+stack_offset+20]
+ jmp r7
+ .v_w2:
++ _CET_ENDBR
+ movd xmm2, [srcq+ssq*0]
+ pinsrd xmm2, [srcq+ssq*1], 1
+ pinsrd xmm2, [srcq+ssq*2], 2
+@@ -1770,6 +1826,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w,
+ jg .v_w2_loop
+ RET
+ .v_w4:
++ _CET_ENDBR
+ movq xmm1, [srcq+ssq*0]
+ vpbroadcastq ymm0, [srcq+ssq*1]
+ vpbroadcastq ymm2, [srcq+ssq*2]
+@@ -1814,6 +1871,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w,
+ vzeroupper
+ RET
+ .v_w8:
++ _CET_ENDBR
+ vbroadcasti32x4 m2, [srcq+ssq*2]
+ vinserti32x4 m1, m2, [srcq+ssq*0], 0
+ vinserti32x4 m1, [srcq+ssq*1], 1 ; 0 1 2
+@@ -1852,6 +1910,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w,
+ jg .v_w8_loop
+ RET
+ .v_w16:
++ _CET_ENDBR
+ vbroadcasti32x8 m1, [srcq+ssq*1]
+ vinserti32x8 m0, m1, [srcq+ssq*0], 0
+ vinserti32x8 m1, [srcq+ssq*2], 1
+@@ -1904,6 +1963,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w,
+ .v_w32:
+ .v_w64:
+ .v_w128:
++ _CET_ENDBR
+ %if WIN64
+ movaps [rsp+stack_offset+8], xmm6
+ %endif
+@@ -2595,6 +2655,7 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w
+ %endif
+ jmp wq
+ .h_w4:
++ _CET_ENDBR
+ movzx mxd, mxb
+ sub srcq, 2
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+@@ -2646,6 +2707,7 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w
+ je .h_w16
+ jg .h_w32
+ .h_w8:
++ _CET_ENDBR
+ mova m6, [spel_h_shufA]
+ movu m7, [spel_h_shufB]
+ movu m8, [spel_h_shufC]
+@@ -2682,6 +2744,7 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w
+ jg .h_w8_loop
+ RET
+ .h_w16:
++ _CET_ENDBR
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+ mova m11, [prep_endC]
+@@ -2715,6 +2778,7 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w
+ jg .h_w16_loop
+ RET
+ .h_w32:
++ _CET_ENDBR
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ lea srcq, [srcq+wq*2]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+@@ -2773,6 +2837,7 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w
+ vpbroadcastd m15, [tmpq+12]
+ jmp r7
+ .v_w4:
++ _CET_ENDBR
+ movq xmm1, [srcq+strideq*0]
+ vpbroadcastq ymm0, [srcq+strideq*1]
+ vpbroadcastq ymm2, [srcq+strideq*2]
+@@ -2814,6 +2879,7 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w
+ vzeroupper
+ RET
+ .v_w8:
++ _CET_ENDBR
+ vbroadcasti32x4 m2, [srcq+strideq*2]
+ vinserti32x4 m1, m2, [srcq+strideq*0], 0
+ vinserti32x4 m1, [srcq+strideq*1], 1 ; 0 1 2
+@@ -2849,6 +2915,7 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w
+ jg .v_w8_loop
+ RET
+ .v_w16:
++ _CET_ENDBR
+ vbroadcasti32x8 m1, [srcq+strideq*1]
+ vinserti32x8 m0, m1, [srcq+strideq*0], 0
+ vinserti32x8 m1, [srcq+strideq*2], 1
+@@ -2896,6 +2963,7 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w
+ .v_w32:
+ .v_w64:
+ .v_w128:
++ _CET_ENDBR
+ %if WIN64
+ PUSH r8
+ movaps [rsp+stack_offset+8], xmm6
+@@ -3613,6 +3681,7 @@ ALIGN function_align
+ lea stride3q, [strideq*3]
+ jmp wq
+ .w4:
++ _CET_ENDBR
+ movq [dstq ], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm2, ym0, 1
+@@ -3647,6 +3716,7 @@ ALIGN function_align
+ call .main
+ lea dstq, [dstq+strideq*4]
+ .w8:
++ _CET_ENDBR
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+@@ -3665,6 +3735,7 @@ ALIGN function_align
+ call .main
+ lea dstq, [dstq+strideq*4]
+ .w16:
++ _CET_ENDBR
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+@@ -3676,6 +3747,7 @@ ALIGN function_align
+ call .main
+ lea dstq, [dstq+strideq*2]
+ .w32:
++ _CET_ENDBR
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+@@ -3685,6 +3757,7 @@ ALIGN function_align
+ call .main
+ add dstq, strideq
+ .w64:
++ _CET_ENDBR
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ dec hd
+@@ -3694,6 +3767,7 @@ ALIGN function_align
+ call .main
+ add dstq, strideq
+ .w128:
++ _CET_ENDBR
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ call .main
+@@ -3853,6 +3927,7 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1,
+ lea stride3q, [strideq*3]
+ jmp wq
+ .w4:
++ _CET_ENDBR
+ mova m4, [w_mask_shuf4]
+ vpermt2b m2, m4, m3
+ mova m3, m14
+@@ -3890,6 +3965,7 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1,
+ .w4_end:
+ RET
+ .w8:
++ _CET_ENDBR
+ mova m8, [w_mask_shuf8]
+ vpbroadcastd m9, [pb_64]
+ jmp .w8_start
+@@ -3918,6 +3994,7 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1,
+ .w8_end:
+ RET
+ .w16:
++ _CET_ENDBR
+ mova m8, [w_mask_shuf16]
+ vpbroadcastd m9, [pb_64]
+ jmp .w16_start
+@@ -3943,6 +4020,7 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1,
+ lea dstq, [dstq+strideq*4]
+ add maskq, 32
+ .w32:
++ _CET_ENDBR
+ paddw m2, m3
+ mova m8, m14
+ vpdpwssd m8, m11, m2
+@@ -3964,6 +4042,7 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1,
+ lea dstq, [dstq+strideq*2]
+ add maskq, 32
+ .w64:
++ _CET_ENDBR
+ mova m8, m2
+ mova m9, m3
+ mova [dstq+strideq*0+64*0], m0
+@@ -3987,6 +4066,7 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1,
+ lea dstq, [dstq+strideq*2]
+ add maskq, 64
+ .w128:
++ _CET_ENDBR
+ mova m16, m2
+ mova m8, m3
+ mova [dstq+strideq*0+64*0], m0
+@@ -4088,6 +4168,7 @@ cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1,
+ lea stride3q, [strideq*3]
+ jmp wq
+ .w4:
++ _CET_ENDBR
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm2, ym0, 1
+@@ -4122,6 +4203,7 @@ cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1,
+ call .main
+ lea dstq, [dstq+strideq*4]
+ .w8:
++ _CET_ENDBR
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+@@ -4140,6 +4222,7 @@ cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1,
+ call .main
+ lea dstq, [dstq+strideq*4]
+ .w16:
++ _CET_ENDBR
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+@@ -4151,6 +4234,7 @@ cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1,
+ call .main
+ lea dstq, [dstq+strideq*2]
+ .w32:
++ _CET_ENDBR
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+@@ -4160,6 +4244,7 @@ cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1,
+ call .main
+ add dstq, strideq
+ .w64:
++ _CET_ENDBR
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ dec hd
+@@ -4169,6 +4254,7 @@ cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1,
+ call .main
+ add dstq, strideq
+ .w128:
++ _CET_ENDBR
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ call .main
+@@ -4247,6 +4333,7 @@ cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1,
+ lea stride3q, [strideq*3]
+ jmp wq
+ .w4:
++ _CET_ENDBR
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm2, ym0, 1
+@@ -4281,6 +4368,7 @@ cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1,
+ call .main
+ lea dstq, [dstq+strideq*4]
+ .w8:
++ _CET_ENDBR
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+@@ -4299,6 +4387,7 @@ cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1,
+ call .main
+ lea dstq, [dstq+strideq*4]
+ .w16:
++ _CET_ENDBR
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+@@ -4310,6 +4399,7 @@ cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1,
+ call .main
+ lea dstq, [dstq+strideq*2]
+ .w32:
++ _CET_ENDBR
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+@@ -4319,6 +4409,7 @@ cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1,
+ call .main
+ add dstq, strideq
+ .w64:
++ _CET_ENDBR
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ dec hd
+@@ -4328,6 +4419,7 @@ cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1,
+ call .main
+ add dstq, strideq
+ .w128:
++ _CET_ENDBR
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ call .main
+@@ -4395,6 +4487,7 @@ cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
+ lea r6, [dsq*3]
+ jmp wq
+ .w4:
++ _CET_ENDBR
+ pmovzxbw ym19, [maskq]
+ movq xm16, [dstq+dsq*0]
+ movhps xm16, [dstq+dsq*1]
+@@ -4419,6 +4512,7 @@ cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
+ vzeroupper
+ RET
+ .w8:
++ _CET_ENDBR
+ pmovzxbw m2, [maskq]
+ mova xm0, [dstq+dsq*0]
+ vinserti32x4 ym0, [dstq+dsq*1], 1
+@@ -4439,6 +4533,7 @@ cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
+ jg .w8
+ RET
+ .w16:
++ _CET_ENDBR
+ pmovzxbw m4, [maskq+32*0]
+ pmovzxbw m5, [maskq+32*1]
+ mova ym0, [dstq+dsq*0]
+@@ -4464,6 +4559,7 @@ cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
+ jg .w16
+ RET
+ .w32:
++ _CET_ENDBR
+ pmovzxbw m4, [maskq+32*0]
+ pmovzxbw m5, [maskq+32*1]
+ mova m0, [dstq+dsq*0]
+@@ -4493,6 +4589,7 @@ cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h
+ add wq, r5
+ jmp wq
+ .w2:
++ _CET_ENDBR
+ vpbroadcastd xmm2, [obmc_masks_avx2+2*2]
+ .w2_loop:
+ movd xmm0, [dstq+dsq*0]
+@@ -4509,6 +4606,7 @@ cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h
+ jg .w2_loop
+ RET
+ .w4:
++ _CET_ENDBR
+ vpbroadcastq xmm2, [obmc_masks_avx2+4*2]
+ .w4_loop:
+ movq xmm0, [dstq+dsq*0]
+@@ -4524,6 +4622,7 @@ cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h
+ jg .w4_loop
+ RET
+ .w8:
++ _CET_ENDBR
+ vbroadcasti32x4 ym2, [obmc_masks_avx2+8*2]
+ .w8_loop:
+ mova xm0, [dstq+dsq*0]
+@@ -4539,6 +4638,7 @@ cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h
+ jg .w8_loop
+ RET
+ .w16:
++ _CET_ENDBR
+ vbroadcasti32x8 m2, [obmc_masks_avx2+16*2]
+ .w16_loop:
+ mova ym0, [dstq+dsq*0]
+@@ -4554,6 +4654,7 @@ cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h
+ jg .w16_loop
+ RET
+ .w32:
++ _CET_ENDBR
+ mova m4, [obmc_masks_avx2+32*2]
+ .w32_loop:
+ mova m0, [dstq+dsq*0]
+@@ -4586,6 +4687,7 @@ cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, ma
+ neg hq
+ jmp wq
+ .w2:
++ _CET_ENDBR
+ movd xmm0, [dstq+dsq*0]
+ pinsrd xmm0, [dstq+dsq*1], 1
+ movd xmm2, [maskq+hq*2]
+@@ -4602,6 +4704,7 @@ cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, ma
+ jl .w2
+ RET
+ .w4:
++ _CET_ENDBR
+ mova xmm3, [blend_shuf]
+ .w4_loop:
+ movq xmm0, [dstq+dsq*0]
+@@ -4619,6 +4722,7 @@ cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, ma
+ jl .w4_loop
+ RET
+ .w8:
++ _CET_ENDBR
+ vbroadcasti32x4 ym3, [blend_shuf]
+ shufpd ym3, ym3, 0x0c
+ .w8_loop:
+@@ -4637,6 +4741,7 @@ cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, ma
+ jl .w8_loop
+ RET
+ .w16:
++ _CET_ENDBR
+ vbroadcasti32x4 m3, [blend_shuf]
+ shufpd m3, m3, 0xf0
+ .w16_loop:
+@@ -4655,6 +4760,7 @@ cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, ma
+ jl .w16_loop
+ RET
+ .w32:
++ _CET_ENDBR
+ vpbroadcastw m4, [maskq+hq*2]
+ vpbroadcastw m5, [maskq+hq*2+2]
+ mova m0, [dstq+dsq*0]
+@@ -4673,6 +4779,7 @@ cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, ma
+ jl .w32
+ RET
+ .w64:
++ _CET_ENDBR
+ vpbroadcastw m4, [maskq+hq*2]
+ mova m0, [dstq+64*0]
+ psubw m2, m0, [tmpq+64*0]
+@@ -4690,6 +4797,7 @@ cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, ma
+ jl .w64
+ RET
+ .w128:
++ _CET_ENDBR
+ vpbroadcastw m8, [maskq+hq*2]
+ mova m0, [dstq+64*0]
+ psubw m4, m0, [tmpq+64*0]
Index: multimedia/dav1d/patches/patch-src_x86_mc_avx512_asm
===================================================================
RCS file: /cvs/ports/multimedia/dav1d/patches/patch-src_x86_mc_avx512_asm,v
retrieving revision 1.1
diff -u -p -r1.1 patch-src_x86_mc_avx512_asm
--- multimedia/dav1d/patches/patch-src_x86_mc_avx512_asm 13 Jul 2023 12:36:37 -0000 1.1
+++ multimedia/dav1d/patches/patch-src_x86_mc_avx512_asm 18 Feb 2024 18:51:06 -0000
@@ -904,7 +904,23 @@ Index: src/x86/mc_avx512.asm
pmovzxbq m5, [pb_02461357]
.w32_loop:
W_MASK 0, 4, 0, 1
-@@ -3930,6 +4046,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
+@@ -3874,6 +3990,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1,
+ jg .w32_loop
+ RET
+ .w64:
++ _CET_ENDBR
+ pmovzxbq m5, [pb_02461357]
+ .w64_loop:
+ W_MASK 0, 4, 0, 1
+@@ -3892,6 +4009,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1,
+ jg .w64_loop
+ RET
+ .w128:
++ _CET_ENDBR
+ pmovzxbq m13, [pb_02461357]
+ .w128_loop:
+ W_MASK 0, 4, 0, 1
+@@ -3930,6 +4048,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
lea stride3q, [strideq*3]
jmp wq
.w4:
@@ -912,7 +928,7 @@ Index: src/x86/mc_avx512.asm
cmp hd, 8
jg .w4_h16
WRAP_YMM W_MASK 0, 4, 0, 1, 1
-@@ -3959,6 +4076,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
+@@ -3959,6 +4078,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
vpscatterdd [dstq+m9]{k1}, m0
RET
.w8:
@@ -920,7 +936,7 @@ Index: src/x86/mc_avx512.asm
cmp hd, 4
jne .w8_h8
WRAP_YMM W_MASK 0, 4, 0, 1, 1
-@@ -4001,6 +4119,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
+@@ -4001,6 +4121,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
add maskq, 64
lea dstq, [dstq+strideq*4]
.w16:
@@ -928,7 +944,7 @@ Index: src/x86/mc_avx512.asm
W_MASK 0, 4, 0, 1, 1
vpermb m4, m8, m4
vpermq m0, m0, q3120
-@@ -4013,6 +4132,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
+@@ -4013,6 +4134,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
jg .w16_loop
RET
.w32:
@@ -936,7 +952,23 @@ Index: src/x86/mc_avx512.asm
pmovzxbq m9, [pb_02461357]
.w32_loop:
W_MASK 0, 4, 0, 1, 1
-@@ -4078,6 +4198,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
+@@ -4029,6 +4151,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
+ jg .w32_loop
+ RET
+ .w64:
++ _CET_ENDBR
+ pmovzxbq m9, [pb_02461357]
+ .w64_loop:
+ W_MASK 0, 4, 0, 1, 1
+@@ -4044,6 +4167,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
+ jg .w64_loop
+ RET
+ .w128:
++ _CET_ENDBR
+ pmovzxbq m11, [pb_02461357]
+ .w128_loop:
+ W_MASK 0, 4, 0, 1, 1
+@@ -4078,6 +4202,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
lea r6, [dsq*3]
jmp wq
.w4:
@@ -944,7 +976,7 @@ Index: src/x86/mc_avx512.asm
movd xmm0, [dstq+dsq*0]
pinsrd xmm0, [dstq+dsq*1], 1
vpbroadcastd xmm1, [dstq+dsq*2]
-@@ -4104,6 +4225,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
+@@ -4104,6 +4229,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
jg .w4
RET
.w8:
@@ -952,7 +984,7 @@ Index: src/x86/mc_avx512.asm
movq xmm0, [dstq+dsq*0]
vpbroadcastq xmm1, [dstq+dsq*1]
vpbroadcastq ymm2, [dstq+dsq*2]
-@@ -4134,6 +4256,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
+@@ -4134,6 +4260,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
vzeroupper
RET
.w16:
@@ -960,7 +992,7 @@ Index: src/x86/mc_avx512.asm
mova xm1, [dstq+dsq*0]
vinserti32x4 ym1, [dstq+dsq*1], 1
vinserti32x4 m1, [dstq+dsq*2], 2
-@@ -4160,6 +4283,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
+@@ -4160,6 +4287,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
jg .w16
RET
.w32:
@@ -968,7 +1000,7 @@ Index: src/x86/mc_avx512.asm
mova ym1, [dstq+dsq*0]
vinserti32x8 m1, [dstq+dsq*1], 1
mova m4, [maskq]
-@@ -4193,6 +4317,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
+@@ -4193,6 +4321,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
add maskq, obmc_masks-blend_v_avx512icl_table
jmp wq
.w2:
@@ -976,7 +1008,7 @@ Index: src/x86/mc_avx512.asm
vpbroadcastd xmm2, [maskq+2*2]
.w2_s0_loop:
movd xmm0, [dstq+dsq*0]
-@@ -4210,6 +4335,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
+@@ -4210,6 +4339,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
jg .w2_s0_loop
RET
.w4:
@@ -984,7 +1016,7 @@ Index: src/x86/mc_avx512.asm
vpbroadcastq xmm2, [maskq+4*2]
.w4_loop:
movd xmm0, [dstq+dsq*0]
-@@ -4227,6 +4353,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
+@@ -4227,6 +4357,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
jg .w4_loop
RET
.w8:
@@ -992,7 +1024,7 @@ Index: src/x86/mc_avx512.asm
mova xmm3, [maskq+8*2]
.w8_loop:
movq xmm0, [dstq+dsq*0]
-@@ -4247,6 +4374,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
+@@ -4247,6 +4378,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
jg .w8_loop
RET
.w16:
@@ -1000,7 +1032,7 @@ Index: src/x86/mc_avx512.asm
vbroadcasti32x4 ym3, [maskq+16*2]
vbroadcasti32x4 ym4, [maskq+16*3]
.w16_loop:
-@@ -4268,6 +4396,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
+@@ -4268,6 +4400,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
jg .w16_loop
RET
.w32:
@@ -1008,7 +1040,7 @@ Index: src/x86/mc_avx512.asm
mova m4, [maskq+32*2]
vshufi32x4 m3, m4, m4, q2020
vshufi32x4 m4, m4, q3131
-@@ -4305,6 +4434,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
+@@ -4305,6 +4438,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
neg hq
jmp wq
.w2:
@@ -1016,7 +1048,7 @@ Index: src/x86/mc_avx512.asm
movd xmm0, [dstq+dsq*0]
pinsrw xmm0, [dstq+dsq*1], 1
movd xmm2, [maskq+hq*2]
-@@ -4322,6 +4452,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
+@@ -4322,6 +4456,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
jl .w2
RET
.w4:
@@ -1024,7 +1056,7 @@ Index: src/x86/mc_avx512.asm
mova xmm3, [blend_shuf]
.w4_loop:
movd xmm0, [dstq+dsq*0]
-@@ -4341,6 +4472,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
+@@ -4341,6 +4476,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
jl .w4_loop
RET
.w8:
@@ -1032,7 +1064,7 @@ Index: src/x86/mc_avx512.asm
vbroadcasti128 ymm4, [blend_shuf]
shufpd ymm4, ymm4, 0x03
.w8_loop:
-@@ -4365,6 +4497,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
+@@ -4365,6 +4501,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
vzeroupper
RET
.w16:
@@ -1040,7 +1072,7 @@ Index: src/x86/mc_avx512.asm
vbroadcasti32x4 ym4, [blend_shuf]
shufpd ym4, ym4, 0x0c
.w16_loop:
-@@ -4388,6 +4521,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
+@@ -4388,6 +4525,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
jl .w16_loop
RET
.w32:
@@ -1048,7 +1080,7 @@ Index: src/x86/mc_avx512.asm
vbroadcasti32x4 m4, [blend_shuf]
shufpd m4, m4, 0xf0
.w32_loop:
-@@ -4411,6 +4545,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
+@@ -4411,6 +4549,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
jl .w32_loop
RET
.w64:
@@ -1056,7 +1088,7 @@ Index: src/x86/mc_avx512.asm
vpbroadcastw m3, [maskq+hq*2]
mova m1, [dstq]
mova m2, [tmpq]
-@@ -4428,6 +4563,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
+@@ -4428,6 +4567,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
jl .w64
RET
.w128: