- U_draw-use-SoA-fetch-not-AoS-one.patch

* reverse-apply this patch to fix OpenGL support on s390x (bsc#1032272)

OBS-URL: https://build.opensuse.org/package/show/X11:XOrg/Mesa?expand=0&rev=592
2017-04-07 15:06:33 +00:00 · 2017-04-07 15:06:33 +00:00 · 607719c047
commit 607719c047
parent fd56f9108c
3 changed files with 148 additions and 0 deletions
--- a/Mesa.changes
+++ b/Mesa.changes
@@ -1,3 +1,10 @@
 -------------------------------------------------------------------
 Fri Apr  7 14:51:09 UTC 2017 - sndirsch@suse.com
 - U_draw-use-SoA-fetch-not-AoS-one.patch 
  * reverse-apply this patch to fix OpenGL support on s390x
    (bsc#1032272)
 -------------------------------------------------------------------
 Wed Apr  5 11:32:26 UTC 2017 - afaerber@suse.de
--- a/Mesa.spec
+++ b/Mesa.spec
@@ -90,6 +90,8 @@ Patch32:        archlinux_glvnd-fix-gl-dot-pc.patch
 Patch33:        archlinux_0001-EGL-Implement-the-libglvnd-interface-for-EGL-v2.patch
 Patch34:        archlinux_0002-fixup-EGL-Implement-the-libglvnd-interface-for-EGL-v.patch
 Patch35:        fedora_0001-glxglvnddispatch-Add-missing-dispatch-for-GetDriverC.patch
 # reverse-apply this to fix OpenGL support on s390x (bsc#1032272)
 Patch40:        U_draw-use-SoA-fetch-not-AoS-one.patch
 # Nouveau multithreading workarounds from https://github.com/imirkin/mesa/commits/locking
 Patch61:        N_01-WIP-nouveau-add-locking.patch
@@ -667,6 +669,9 @@ rm -rf docs/README.{VMS,WIN32,OS2}
 %patch35 -p1
 %endif
 # reverse-apply this patch to fix OpenGL support on s390x (bsc#1032272)
 %patch40 -R -p1
 %if %{use_broken_nouveau_locking_patches}
 %patch61 -p1
 %patch62 -p1
--- a/U_draw-use-SoA-fetch-not-AoS-one.patch
+++ b/U_draw-use-SoA-fetch-not-AoS-one.patch
@@ -0,0 +1,136 @@
 From e827d9175675aaa6cfc0b981e2a80685fb7b3a74 Mon Sep 17 00:00:00 2001
 From: Roland Scheidegger <sroland@vmware.com>
 Date: Wed, 21 Dec 2016 04:43:07 +0100
 Subject: [PATCH] draw: use SoA fetch, not AoS one
 Now that there's some SoA fetch which never falls back, we should always get
 results which are better or at least not worse (something like rgba32f will
 stay the same).
 For cases which get way better, think something like R16_UNORM with 8-wide
 vectors: this was 8 sign-extend fetches, 8 cvt, 8 muls, followed by
 a couple of shuffles to stitch things together (if it is smart enough,
 6 unpacks) and then a (8-wide) transpose (not sure if llvm could even
 optimize the shuffles + transpose, since the 16bit values were actually
 sign-extended to 128bit before being cast to a float vec, so that would be
 another 8 unpacks). Now that is just 8 fetches (directly inserted into
 vector, albeit there's one 128bit insert needed), 1 cvt, 1 mul.
 v2: ditch the old AoS code instead of just disabling it.
 Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
 ---
 src/gallium/auxiliary/draw/draw_llvm.c | 71 +++++++++++-----------------------
 1 file changed, 23 insertions(+), 48 deletions(-)
 diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
 index 19b75a5003..8952dc8d3b 100644
 --- a/src/gallium/auxiliary/draw/draw_llvm.c
 +++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -713,39 +713,6 @@ fetch_instanced(struct gallivm_state *gallivm,
 static void
 -convert_to_soa(struct gallivm_state *gallivm,
 -               LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
 -               LLVMValueRef dst_soa[TGSI_NUM_CHANNELS],
 -               const struct lp_type soa_type)
 -{
 -   unsigned j, k;
 -   struct lp_type aos_channel_type = soa_type;
 -
 -   LLVMValueRef aos_channels[TGSI_NUM_CHANNELS];
 -   unsigned pixels_per_channel = soa_type.length / TGSI_NUM_CHANNELS;
 -
 -   debug_assert(TGSI_NUM_CHANNELS == 4);
 -   debug_assert((soa_type.length % TGSI_NUM_CHANNELS) == 0);
 -
 -   aos_channel_type.length >>= 1;
 -
 -   for (j = 0; j < TGSI_NUM_CHANNELS; ++j) {
 -      LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
 -
 -      assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
 -
 -      for (k = 0; k < pixels_per_channel; ++k) {
 -         channel[k] = src_aos[j + TGSI_NUM_CHANNELS * k];
 -      }
 -
 -      aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
 -   }
 -
 -   lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
 -}
 -
 -
 -static void
 fetch_vector(struct gallivm_state *gallivm,
              const struct util_format_description *format_desc,
              struct lp_type vs_type,
@@ -755,11 +722,10 @@ fetch_vector(struct gallivm_state *gallivm,
              LLVMValueRef *inputs,
              LLVMValueRef indices)
 {
 -   LLVMValueRef zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_build_context blduivec;
 +   struct lp_type fetch_type = vs_type;
    LLVMValueRef offset, valid_mask;
 -   LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];
    unsigned i;
    lp_build_context_init(&blduivec, gallivm, lp_uint_type(vs_type));
@@ -783,28 +749,37 @@ fetch_vector(struct gallivm_state *gallivm,
    }
    /*
 -    * Note: we probably really want to use SoA fetch, not AoS one (albeit
 -    * for most formats it will amount to the same as this isn't very
 -    * optimized). But looks dangerous since it assumes alignment.
 +    * Unlike fetch_instanced, use SoA fetch instead of multiple AoS fetches.
 +    * This should always produce better code.
     */
 -   for (i = 0; i < vs_type.length; i++) {
 -      LLVMValueRef offset1, elem;
 -      elem = lp_build_const_int32(gallivm, i);
 -      offset1 = LLVMBuildExtractElement(builder, offset, elem, "");
 -      aos_fetch[i] = lp_build_fetch_rgba_aos(gallivm, format_desc,
 -                                             lp_float32_vec4_type(),
 -                                             FALSE, map_ptr, offset1,
 -                                             zero, zero, NULL);
 +   /* The type handling is annoying here... */
 +   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
 +       format_desc->channel[0].pure_integer) {
 +      if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
 +         fetch_type = lp_type_int_vec(vs_type.width, vs_type.width * vs_type.length);
 +      }
 +      else if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
 +         fetch_type = lp_type_uint_vec(vs_type.width, vs_type.width * vs_type.length);
 +      }
    }
 -   convert_to_soa(gallivm, aos_fetch, inputs, vs_type);
 +
 +   lp_build_fetch_rgba_soa(gallivm, format_desc,
 +                           fetch_type, FALSE, map_ptr, offset,
 +                           blduivec.zero, blduivec.zero,
 +                           NULL, inputs);
    for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
 +      inputs[i] = LLVMBuildBitCast(builder, inputs[i],
 +                                   lp_build_vec_type(gallivm, vs_type), "");
 +   }
 +
 +   /* out-of-bound fetches return all zeros */
 +   for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
       inputs[i] = LLVMBuildBitCast(builder, inputs[i], blduivec.vec_type, "");
       inputs[i] = LLVMBuildAnd(builder, inputs[i], valid_mask, "");
       inputs[i] = LLVMBuildBitCast(builder, inputs[i],
                                    lp_build_vec_type(gallivm, vs_type), "");
 -
    }
 }
 -- 
 2.12.0