forked from chromium/chromium-beta
180 lines
7.7 KiB
Diff
180 lines
7.7 KiB
Diff
From 68e53210b76cadb14e98aaea58e30388f94bc9a1 Mon Sep 17 00:00:00 2001
|
|
From: Kaylee Lubick <kjlubick@google.com>
|
|
Date: Mon, 21 Apr 2025 14:19:04 -0400
|
|
Subject: [PATCH] Avoid assumption of 32 bit aligned pixel data in RP's lowp
|
|
gather()
|
|
|
|
The attached bug has most of the context, but the problem boils down
|
|
to the compiler assuming that a uint32_t* was aligned to 32 bits
|
|
and generated instructions like:
|
|
vld1.32 {d16[0]}, [r4:32]
|
|
where the :32 means "aligned to 32 bits" [1]. Pixel data is usually
|
|
aligned to the so-called natural alignment (pointer size) because
|
|
we allocate it with `new` or `calloc`. However, if we were to call
|
|
drawRect and use a source starting at x = 1, this eventually leads
|
|
to SkBitmap::extractSubset creating a SkPixelRef that starts at
|
|
the original pixel* + 1 (which is now no longer evenly divisible
|
|
by 4).
|
|
|
|
Certain ARM devices were crashing on this generated assembly. Rather
|
|
than trying to write the assembly code by hand or use intrinsics,
|
|
we can tell GCC and Clang that the pointer might be unaligned and
|
|
then the generated instructions lack the :32 modifier (and make these
|
|
devices not crash anymore):
|
|
vld1.32 {d16[0]}, [r4]
|
|
|
|
This fix is in our low precision pipeline code but may also be needed
|
|
in our high precision code too (which uses similar code). I wanted
|
|
to be careful with this change because it's pretty critical to
|
|
performance, so I kept the aligned version for cases where we know
|
|
the data is aligned (e.g. reading factors and biases for our gradient
|
|
stages).
|
|
|
|
This solution was inspired by Open CV
|
|
https://github.com/opencv/opencv/issues/25265
|
|
|
|
[1] https://developer.arm.com/documentation/ddi0597/2025-03/SIMD-FP-Instructions/VLD1--multiple-single-elements---Load-multiple-single-1-element-structures-to-one--two--three--or-four-registers-
|
|
|
|
Change-Id: I2892740acbb9db7434aab897e11fa41c3548a196
|
|
Bug: b/409859319
|
|
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/981638
|
|
Commit-Queue: Kaylee Lubick <kjlubick@google.com>
|
|
Commit-Queue: Daniel Dilan <danieldilan@google.com>
|
|
Auto-Submit: Kaylee Lubick <kjlubick@google.com>
|
|
Reviewed-by: Daniel Dilan <danieldilan@google.com>
|
|
|
|
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
|
|
index 9573b84baf..d14df3677c 100644
|
|
--- a/src/opts/SkRasterPipeline_opts.h
|
|
+++ b/src/opts/SkRasterPipeline_opts.h
|
|
@@ -5877,6 +5877,10 @@ SI void store(T* ptr, V v) {
|
|
return (U32)_mm512_i32gather_epi32((__m512i)ix, ptr, 4);
|
|
}
|
|
|
|
+ template <typename V, typename T>
|
|
+ SI V gather_unaligned(const T* ptr, U32 ix) {
|
|
+ return gather<V, T>(ptr, ix);
|
|
+ }
|
|
#elif defined(SKRP_CPU_HSW)
|
|
template <typename V, typename T>
|
|
SI V gather(const T* ptr, U32 ix) {
|
|
@@ -5903,6 +5907,11 @@ SI void store(T* ptr, V v) {
|
|
return join<U32>(_mm256_i32gather_epi32((const int*)ptr, lo, 4),
|
|
_mm256_i32gather_epi32((const int*)ptr, hi, 4));
|
|
}
|
|
+
|
|
+ template <typename V, typename T>
|
|
+ SI V gather_unaligned(const T* ptr, U32 ix) {
|
|
+ return gather<V, T>(ptr, ix);
|
|
+ }
|
|
#elif defined(SKRP_CPU_LASX)
|
|
template <typename V, typename T>
|
|
SI V gather(const T* ptr, U32 ix) {
|
|
@@ -5911,12 +5920,43 @@ SI void store(T* ptr, V v) {
|
|
ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
|
|
ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
|
|
}
|
|
+
|
|
+ template <typename V, typename T>
|
|
+ SI V gather_unaligned(const T* ptr, U32 ix) {
|
|
+ return gather<V, T>(ptr, ix);
|
|
+ }
|
|
+#elif defined(SKRP_CPU_NEON)
|
|
+ template <typename V, typename T>
|
|
+ SI V gather(const T* ptr, U32 ix) {
|
|
+ // The compiler assumes ptr is aligned, which caused crashes on some
|
|
+ // arm32 chips because a register was marked as "aligned to 32 bits"
|
|
+ // incorrectly. https://crbug.com/skia/409859319
|
|
+ SkASSERTF(reinterpret_cast<uintptr_t>(ptr) % alignof(T) == 0,
|
|
+ "Should use gather_unaligned");
|
|
+ return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
|
|
+ ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
|
|
+ }
|
|
+
|
|
+ template <typename V, typename T>
|
|
+ SI V gather_unaligned(const T* ptr, U32 ix) {
|
|
+ // This tells the compiler ptr might not be aligned appropriately, so
|
|
+ // it generates better assembly.
|
|
+ typedef T __attribute__ ((aligned (1))) unaligned_ptr;
|
|
+ const unaligned_ptr* uptr = static_cast<const unaligned_ptr*>(ptr);
|
|
+ return V{ uptr[ix[ 0]], uptr[ix[ 1]], uptr[ix[ 2]], uptr[ix[ 3]],
|
|
+ uptr[ix[ 4]], uptr[ix[ 5]], uptr[ix[ 6]], uptr[ix[ 7]], };
|
|
+ }
|
|
#else
|
|
template <typename V, typename T>
|
|
SI V gather(const T* ptr, U32 ix) {
|
|
return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
|
|
ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
|
|
}
|
|
+
|
|
+ template <typename V, typename T>
|
|
+ SI V gather_unaligned(const T* ptr, U32 ix) {
|
|
+ return gather<V, T>(ptr, ix);
|
|
+ }
|
|
#endif
|
|
|
|
|
|
@@ -6049,7 +6089,7 @@ LOWP_STAGE_PP(store_8888, const SkRasterPipelineContexts::MemoryCtx* ctx) {
|
|
LOWP_STAGE_GP(gather_8888, const SkRasterPipelineContexts::GatherCtx* ctx) {
|
|
const uint32_t* ptr;
|
|
U32 ix = ix_and_ptr(&ptr, ctx, x,y);
|
|
- from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
|
|
+ from_8888(gather_unaligned<U32>(ptr, ix), &r, &g, &b, &a);
|
|
}
|
|
|
|
// ~~~~~~ 16-bit memory loads and stores ~~~~~~ //
|
|
@@ -6099,7 +6139,7 @@ LOWP_STAGE_PP(store_565, const SkRasterPipelineContexts::MemoryCtx* ctx) {
|
|
LOWP_STAGE_GP(gather_565, const SkRasterPipelineContexts::GatherCtx* ctx) {
|
|
const uint16_t* ptr;
|
|
U32 ix = ix_and_ptr(&ptr, ctx, x,y);
|
|
- from_565(gather<U16>(ptr, ix), &r, &g, &b);
|
|
+ from_565(gather_unaligned<U16>(ptr, ix), &r, &g, &b);
|
|
a = U16_255;
|
|
}
|
|
|
|
@@ -6149,7 +6189,7 @@ LOWP_STAGE_PP(store_4444, const SkRasterPipelineContexts::MemoryCtx* ctx) {
|
|
LOWP_STAGE_GP(gather_4444, const SkRasterPipelineContexts::GatherCtx* ctx) {
|
|
const uint16_t* ptr;
|
|
U32 ix = ix_and_ptr(&ptr, ctx, x,y);
|
|
- from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a);
|
|
+ from_4444(gather_unaligned<U16>(ptr, ix), &r,&g,&b,&a);
|
|
}
|
|
|
|
SI void from_88(U16 rg, U16* r, U16* g) {
|
|
@@ -6198,7 +6238,7 @@ LOWP_STAGE_PP(store_rg88, const SkRasterPipelineContexts::MemoryCtx* ctx) {
|
|
LOWP_STAGE_GP(gather_rg88, const SkRasterPipelineContexts::GatherCtx* ctx) {
|
|
const uint16_t* ptr;
|
|
U32 ix = ix_and_ptr(&ptr, ctx, x, y);
|
|
- from_88(gather<U16>(ptr, ix), &r, &g);
|
|
+ from_88(gather_unaligned<U16>(ptr, ix), &r, &g);
|
|
b = U16_0;
|
|
a = U16_255;
|
|
}
|
|
@@ -6625,11 +6665,11 @@ LOWP_STAGE_GP(bilerp_clamp_8888, const SkRasterPipelineContexts::GatherCtx* ctx)
|
|
const uint32_t* ptr;
|
|
U32 ix = ix_and_ptr(&ptr, ctx, sx, sy);
|
|
U16 leftR, leftG, leftB, leftA;
|
|
- from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);
|
|
+ from_8888(gather_unaligned<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);
|
|
|
|
ix = ix_and_ptr(&ptr, ctx, sx+1, sy);
|
|
U16 rightR, rightG, rightB, rightA;
|
|
- from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);
|
|
+ from_8888(gather_unaligned<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);
|
|
|
|
U16 topR = lerpX(leftR, rightR),
|
|
topG = lerpX(leftG, rightG),
|
|
@@ -6637,10 +6677,10 @@ LOWP_STAGE_GP(bilerp_clamp_8888, const SkRasterPipelineContexts::GatherCtx* ctx)
|
|
topA = lerpX(leftA, rightA);
|
|
|
|
ix = ix_and_ptr(&ptr, ctx, sx, sy+1);
|
|
- from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);
|
|
+ from_8888(gather_unaligned<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);
|
|
|
|
ix = ix_and_ptr(&ptr, ctx, sx+1, sy+1);
|
|
- from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);
|
|
+ from_8888(gather_unaligned<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);
|
|
|
|
U16 bottomR = lerpX(leftR, rightR),
|
|
bottomG = lerpX(leftG, rightG),
|