From: Kirill A. Korinsky Subject: devel/libggml: fix for llvm22 To: OpenBSD ports Date: Wed, 20 May 2026 11:16:51 +0200 ports@, here is a fix for devel/libggml's compiler issue with llvm22. I can't test it myself because I have no access to a Sapphire Rapids CPU, but it should be safe. Can someone test it? Thanks. Index: Makefile =================================================================== RCS file: /home/cvs/ports/devel/libggml/Makefile,v diff -u -p -r1.15 Makefile --- Makefile 17 May 2026 20:30:51 -0000 1.15 +++ Makefile 20 May 2026 09:10:03 -0000 @@ -3,6 +3,7 @@ COMMENT= tensor library for machine lea GH_ACCOUNT= ggml-org GH_PROJECT= ggml GH_TAGNAME = v0.12.0 +REVISION = 0 PKGNAME= lib${DISTNAME} SHARED_LIBS += ggml 3.8 Index: patches/patch-src_ggml-cpu_amx_mmq_cpp =================================================================== RCS file: patches/patch-src_ggml-cpu_amx_mmq_cpp diff -N patches/patch-src_ggml-cpu_amx_mmq_cpp --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ patches/patch-src_ggml-cpu_amx_mmq_cpp 20 May 2026 00:22:06 -0000 @@ -0,0 +1,61 @@ +https://github.com/ggml-org/ggml/issues/1499 + +Index: src/ggml-cpu/amx/mmq.cpp +--- src/ggml-cpu/amx/mmq.cpp.orig ++++ src/ggml-cpu/amx/mmq.cpp +@@ -1510,18 +1510,15 @@ struct tinygemm_kernel_vnni(_B); + + __m512i va[8]; +- __m512i vb[8]; + __m512 vc[COLS]; + __m512 vd1; + + // Notes: s8s8 igemm compensation in avx512-vnni + // change s8s8 to u8s8 with compensate +- // a * b = (a + 128) * b - 128 * b ++ // a * b = (b + 128) * a - 128 * a + // s s u s u s +- // +- // (128 * b is pre-computed when packing B to vnni formats) +- // + const __m512i off = _mm512_set1_epi8(static_cast(0x80)); ++ __m512i vcomp; + + auto loadc = [&](auto col) { + vc[col] = _mm512_setzero_ps(); +@@ -1529,29 +1526,25 @@ struct tinygemm_kernel_vnni{}(loadc); + + auto compute = [&](auto col, auto i) { +- // load a and add offset 128 ++ // load a and compute compensation + if constexpr (col == 0) { + const int32_t * a_ptr = reinterpret_cast(A[0 * KB + i].qs); ++ vcomp 
= _mm512_setzero_si512(); + for (int k = 0; k < 8; ++k) { + va[k] = _mm512_set1_epi32(a_ptr[k]); +- va[k] = _mm512_add_epi8(va[k], off); ++ vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]); + } + vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d)); + } + +- // load b + const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE); +- for (int k = 0; k < 8; ++k) { +- vb[k] = _mm512_loadu_si512((const __m512i *)(b_ptr + k * 64)); +- } + const int offset = TILE_N * TILE_K; + const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset))); +- const int offset2 = TILE_N * TILE_K + TILE_N * sizeof(ggml_half); +- const __m512i vcomp = _mm512_loadu_si512((const __m512i *)(b_ptr + offset2)); + + __m512i vsum = _mm512_setzero_si512(); + for (int k = 0; k < 8; ++k) { +- vsum = _mm512_dpbusd_epi32(vsum, va[k], vb[k]); ++ const __m512i vb = _mm512_add_epi8(_mm512_loadu_si512((const __m512i *)(b_ptr + k * 64)), off); ++ vsum = _mm512_dpbusd_epi32(vsum, vb, va[k]); + } + vsum = _mm512_sub_epi32(vsum, vcomp); + -- wbr, Kirill