chromium-beta/ppc-skia-revert-3.patch

From 68e53210b76cadb14e98aaea58e30388f94bc9a1 Mon Sep 17 00:00:00 2001
From: Kaylee Lubick <kjlubick@google.com>
Date: Mon, 21 Apr 2025 14:19:04 -0400
Subject: [PATCH] Avoid assumption of 32 bit aligned pixel data in RP's lowp
 gather()

The attached bug has most of the context, but the problem boils down
to the compiler assuming that a uint32_t* was aligned to 32 bits
and generated instructions like:
    vld1.32	{d16[0]}, [r4:32]
where the :32 means "aligned to 32 bits" [1]. Pixel data is usually
aligned to the so-called natural alignment (pointer size) because
we allocate it with `new` or `calloc`. However, if we were to call
drawRect and use a source starting at x = 1, this eventually leads
to SkBitmap::extractSubset creating a SkPixelRef that starts at
the original pixel* + 1 (which is now no longer evenly divisible
by 4).

Certain ARM devices were crashing on this generated assembly. Rather
than trying to write the assembly code by hand or use intrinsics,
we can tell GCC and Clang that the pointer might be unaligned and
then the generated instructions lack the :32 modifier (and make these
devices not crash anymore):
    vld1.32	{d16[0]}, [r4]

This fix is in our low precision pipeline code but may also be needed
in our high precision code too (which uses similar code). I wanted
to be careful with this change because it's pretty critical to
performance, so I kept the aligned version for cases where we know
the data is aligned (e.g. reading factors and biases for our gradient
stages).

This solution was inspired by Open CV
https://github.com/opencv/opencv/issues/25265

[1] https://developer.arm.com/documentation/ddi0597/2025-03/SIMD-FP-Instructions/VLD1--multiple-single-elements---Load-multiple-single-1-element-structures-to-one--two--three--or-four-registers-

Change-Id: I2892740acbb9db7434aab897e11fa41c3548a196
Bug: b/409859319
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/981638
Commit-Queue: Kaylee Lubick <kjlubick@google.com>
Commit-Queue: Daniel Dilan <danieldilan@google.com>
Auto-Submit: Kaylee Lubick <kjlubick@google.com>
Reviewed-by: Daniel Dilan <danieldilan@google.com>

diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 9573b84baf..d14df3677c 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -5877,6 +5877,10 @@ SI void store(T* ptr, V v) {
         return (U32)_mm512_i32gather_epi32((__m512i)ix, ptr, 4);
     }

+    template <typename V, typename T>
+    SI V gather_unaligned(const T* ptr, U32 ix) {
+        return gather<V, T>(ptr, ix);
+    }
 #elif defined(SKRP_CPU_HSW)
     template <typename V, typename T>
     SI V gather(const T* ptr, U32 ix) {
@@ -5903,6 +5907,11 @@ SI void store(T* ptr, V v) {
         return join<U32>(_mm256_i32gather_epi32((const int*)ptr, lo, 4),
                          _mm256_i32gather_epi32((const int*)ptr, hi, 4));
     }
+
+    template <typename V, typename T>
+    SI V gather_unaligned(const T* ptr, U32 ix) {
+        return gather<V, T>(ptr, ix);
+    }
 #elif defined(SKRP_CPU_LASX)
     template <typename V, typename T>
     SI V gather(const T* ptr, U32 ix) {
@@ -5911,12 +5920,43 @@ SI void store(T* ptr, V v) {
                   ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
                   ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
     }
+
+    template <typename V, typename T>
+    SI V gather_unaligned(const T* ptr, U32 ix) {
+        return gather<V, T>(ptr, ix);
+    }
+#elif defined(SKRP_CPU_NEON)
+    template <typename V, typename T>
+    SI V gather(const T* ptr, U32 ix) {
+        // The compiler assumes ptr is aligned, which caused crashes on some
+        // arm32 chips because a register was marked as "aligned to 32 bits"
+        // incorrectly. https://crbug.com/skia/409859319
+        SkASSERTF(reinterpret_cast<uintptr_t>(ptr) % alignof(T) == 0,
+                 "Should use gather_unaligned");
+        return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
+                  ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
+    }
+
+    template <typename V, typename T>
+    SI V gather_unaligned(const T* ptr, U32 ix) {
+        // This tells the compiler ptr might not be aligned appropriately, so
+        // it generates better assembly.
+        typedef T __attribute__ ((aligned (1))) unaligned_ptr;
+        const unaligned_ptr* uptr = static_cast<const unaligned_ptr*>(ptr);
+        return V{ uptr[ix[ 0]], uptr[ix[ 1]], uptr[ix[ 2]], uptr[ix[ 3]],
+                  uptr[ix[ 4]], uptr[ix[ 5]], uptr[ix[ 6]], uptr[ix[ 7]], };
+    }
 #else
     template <typename V, typename T>
     SI V gather(const T* ptr, U32 ix) {
         return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
                   ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
     }
+
+    template <typename V, typename T>
+    SI V gather_unaligned(const T* ptr, U32 ix) {
+        return gather<V, T>(ptr, ix);
+    }
 #endif


@@ -6049,7 +6089,7 @@ LOWP_STAGE_PP(store_8888, const SkRasterPipelineContexts::MemoryCtx* ctx) {
 LOWP_STAGE_GP(gather_8888, const SkRasterPipelineContexts::GatherCtx* ctx) {
     const uint32_t* ptr;
     U32 ix = ix_and_ptr(&ptr, ctx, x,y);
-    from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
+    from_8888(gather_unaligned<U32>(ptr, ix), &r, &g, &b, &a);
 }

 // ~~~~~~ 16-bit memory loads and stores ~~~~~~ //
@@ -6099,7 +6139,7 @@ LOWP_STAGE_PP(store_565, const SkRasterPipelineContexts::MemoryCtx* ctx) {
 LOWP_STAGE_GP(gather_565, const SkRasterPipelineContexts::GatherCtx* ctx) {
     const uint16_t* ptr;
     U32 ix = ix_and_ptr(&ptr, ctx, x,y);
-    from_565(gather<U16>(ptr, ix), &r, &g, &b);
+    from_565(gather_unaligned<U16>(ptr, ix), &r, &g, &b);
     a = U16_255;
 }

@@ -6149,7 +6189,7 @@ LOWP_STAGE_PP(store_4444, const SkRasterPipelineContexts::MemoryCtx* ctx) {
 LOWP_STAGE_GP(gather_4444, const SkRasterPipelineContexts::GatherCtx* ctx) {
     const uint16_t* ptr;
     U32 ix = ix_and_ptr(&ptr, ctx, x,y);
-    from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a);
+    from_4444(gather_unaligned<U16>(ptr, ix), &r,&g,&b,&a);
 }

 SI void from_88(U16 rg, U16* r, U16* g) {
@@ -6198,7 +6238,7 @@ LOWP_STAGE_PP(store_rg88, const SkRasterPipelineContexts::MemoryCtx* ctx) {
 LOWP_STAGE_GP(gather_rg88, const SkRasterPipelineContexts::GatherCtx* ctx) {
     const uint16_t* ptr;
     U32 ix = ix_and_ptr(&ptr, ctx, x, y);
-    from_88(gather<U16>(ptr, ix), &r, &g);
+    from_88(gather_unaligned<U16>(ptr, ix), &r, &g);
     b = U16_0;
     a = U16_255;
 }
@@ -6625,11 +6665,11 @@ LOWP_STAGE_GP(bilerp_clamp_8888, const SkRasterPipelineContexts::GatherCtx* ctx)
     const uint32_t* ptr;
     U32 ix = ix_and_ptr(&ptr, ctx, sx, sy);
     U16 leftR, leftG, leftB, leftA;
-    from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);
+    from_8888(gather_unaligned<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);

     ix = ix_and_ptr(&ptr, ctx, sx+1, sy);
     U16 rightR, rightG, rightB, rightA;
-    from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);
+    from_8888(gather_unaligned<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);

     U16 topR = lerpX(leftR, rightR),
         topG = lerpX(leftG, rightG),
@@ -6637,10 +6677,10 @@ LOWP_STAGE_GP(bilerp_clamp_8888, const SkRasterPipelineContexts::GatherCtx* ctx)
         topA = lerpX(leftA, rightA);

     ix = ix_and_ptr(&ptr, ctx, sx, sy+1);
-    from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);
+    from_8888(gather_unaligned<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);

     ix = ix_and_ptr(&ptr, ctx, sx+1, sy+1);
-    from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);
+    from_8888(gather_unaligned<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);

     U16 bottomR = lerpX(leftR, rightR),
         bottomG = lerpX(leftG, rightG),