diff --git a/Mesa.changes b/Mesa.changes
index 321a7e7..6ce5cbf 100644
--- a/Mesa.changes
+++ b/Mesa.changes
@@ -1,3 +1,10 @@
+-------------------------------------------------------------------
+Fri Apr  7 14:51:09 UTC 2017 - sndirsch@suse.com
+
+- U_draw-use-SoA-fetch-not-AoS-one.patch
+  * reverse-apply this patch to fix OpenGL support on s390x
+    (bsc#1032272)
+
 -------------------------------------------------------------------
 Wed Apr  5 11:32:26 UTC 2017 - afaerber@suse.de
 
diff --git a/Mesa.spec b/Mesa.spec
index 3344262..d3167fb 100644
--- a/Mesa.spec
+++ b/Mesa.spec
@@ -90,6 +90,8 @@ Patch32:        archlinux_glvnd-fix-gl-dot-pc.patch
 Patch33:        archlinux_0001-EGL-Implement-the-libglvnd-interface-for-EGL-v2.patch
 Patch34:        archlinux_0002-fixup-EGL-Implement-the-libglvnd-interface-for-EGL-v.patch
 Patch35:        fedora_0001-glxglvnddispatch-Add-missing-dispatch-for-GetDriverC.patch
+# upstream commit, reverse-applied (-R) in %prep to fix OpenGL support on s390x (bsc#1032272)
+Patch40:        U_draw-use-SoA-fetch-not-AoS-one.patch
 
 # Nouveau multithreading workarounds from https://github.com/imirkin/mesa/commits/locking
 Patch61:        N_01-WIP-nouveau-add-locking.patch
@@ -667,6 +669,9 @@ rm -rf docs/README.{VMS,WIN32,OS2}
 %patch35 -p1
 %endif
 
+# reverse-apply this patch to fix OpenGL support on s390x (bsc#1032272)
+%patch40 -R -p1
+
 %if %{use_broken_nouveau_locking_patches}
 %patch61 -p1
 %patch62 -p1
diff --git a/U_draw-use-SoA-fetch-not-AoS-one.patch b/U_draw-use-SoA-fetch-not-AoS-one.patch
new file mode 100644
index 0000000..8d1bc51
--- /dev/null
+++ b/U_draw-use-SoA-fetch-not-AoS-one.patch
@@ -0,0 +1,136 @@
+From e827d9175675aaa6cfc0b981e2a80685fb7b3a74 Mon Sep 17 00:00:00 2001
+From: Roland Scheidegger <sroland@vmware.com>
+Date: Wed, 21 Dec 2016 04:43:07 +0100
+Subject: [PATCH] draw: use SoA fetch, not AoS one
+
+Now that there's some SoA fetch which never falls back, we should always get
+results which are better or at least not worse (something like rgba32f will
+stay the same).
+
+For cases which get way better, think something like R16_UNORM with 8-wide
+vectors: this was 8 sign-extend fetches, 8 cvt, 8 muls, followed by
+a couple of shuffles to stitch things together (if it is smart enough,
+6 unpacks) and then a (8-wide) transpose (not sure if llvm could even
+optimize the shuffles + transpose, since the 16bit values were actually
+sign-extended to 128bit before being cast to a float vec, so that would be
+another 8 unpacks). Now that is just 8 fetches (directly inserted into
+vector, albeit there's one 128bit insert needed), 1 cvt, 1 mul.
+
+v2: ditch the old AoS code instead of just disabling it.
+
+Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
+---
+ src/gallium/auxiliary/draw/draw_llvm.c | 71 +++++++++++-----------------------
+ 1 file changed, 23 insertions(+), 48 deletions(-)
+
+diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
+index 19b75a5003..8952dc8d3b 100644
+--- a/src/gallium/auxiliary/draw/draw_llvm.c
++++ b/src/gallium/auxiliary/draw/draw_llvm.c
+@@ -713,39 +713,6 @@ fetch_instanced(struct gallivm_state *gallivm,
+ 
+ 
+ static void
+-convert_to_soa(struct gallivm_state *gallivm,
+-               LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
+-               LLVMValueRef dst_soa[TGSI_NUM_CHANNELS],
+-               const struct lp_type soa_type)
+-{
+-   unsigned j, k;
+-   struct lp_type aos_channel_type = soa_type;
+-
+-   LLVMValueRef aos_channels[TGSI_NUM_CHANNELS];
+-   unsigned pixels_per_channel = soa_type.length / TGSI_NUM_CHANNELS;
+-
+-   debug_assert(TGSI_NUM_CHANNELS == 4);
+-   debug_assert((soa_type.length % TGSI_NUM_CHANNELS) == 0);
+-
+-   aos_channel_type.length >>= 1;
+-
+-   for (j = 0; j < TGSI_NUM_CHANNELS; ++j) {
+-      LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
+-
+-      assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
+-
+-      for (k = 0; k < pixels_per_channel; ++k) {
+-         channel[k] = src_aos[j + TGSI_NUM_CHANNELS * k];
+-      }
+-
+-      aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
+-   }
+-
+-   lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
+-}
+-
+-
+-static void
+ fetch_vector(struct gallivm_state *gallivm,
+              const struct util_format_description *format_desc,
+              struct lp_type vs_type,
+@@ -755,11 +722,10 @@ fetch_vector(struct gallivm_state *gallivm,
+              LLVMValueRef *inputs,
+              LLVMValueRef indices)
+ {
+-   LLVMValueRef zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
+    LLVMBuilderRef builder = gallivm->builder;
+    struct lp_build_context blduivec;
++   struct lp_type fetch_type = vs_type;
+    LLVMValueRef offset, valid_mask;
+-   LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];
+    unsigned i;
+ 
+    lp_build_context_init(&blduivec, gallivm, lp_uint_type(vs_type));
+@@ -783,28 +749,37 @@ fetch_vector(struct gallivm_state *gallivm,
+    }
+ 
+    /*
+-    * Note: we probably really want to use SoA fetch, not AoS one (albeit
+-    * for most formats it will amount to the same as this isn't very
+-    * optimized). But looks dangerous since it assumes alignment.
++    * Unlike fetch_instanced, use SoA fetch instead of multiple AoS fetches.
++    * This should always produce better code.
+     */
+-   for (i = 0; i < vs_type.length; i++) {
+-      LLVMValueRef offset1, elem;
+-      elem = lp_build_const_int32(gallivm, i);
+-      offset1 = LLVMBuildExtractElement(builder, offset, elem, "");
+ 
+-      aos_fetch[i] = lp_build_fetch_rgba_aos(gallivm, format_desc,
+-                                             lp_float32_vec4_type(),
+-                                             FALSE, map_ptr, offset1,
+-                                             zero, zero, NULL);
++   /* The type handling is annoying here... */
++   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
++       format_desc->channel[0].pure_integer) {
++      if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
++         fetch_type = lp_type_int_vec(vs_type.width, vs_type.width * vs_type.length);
++      }
++      else if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
++         fetch_type = lp_type_uint_vec(vs_type.width, vs_type.width * vs_type.length);
++      }
+    }
+-   convert_to_soa(gallivm, aos_fetch, inputs, vs_type);
++
++   lp_build_fetch_rgba_soa(gallivm, format_desc,
++                           fetch_type, FALSE, map_ptr, offset,
++                           blduivec.zero, blduivec.zero,
++                           NULL, inputs);
+ 
+    for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
++      inputs[i] = LLVMBuildBitCast(builder, inputs[i],
++                                   lp_build_vec_type(gallivm, vs_type), "");
++   }
++
++   /* out-of-bound fetches return all zeros */
++   for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
+       inputs[i] = LLVMBuildBitCast(builder, inputs[i], blduivec.vec_type, "");
+       inputs[i] = LLVMBuildAnd(builder, inputs[i], valid_mask, "");
+       inputs[i] = LLVMBuildBitCast(builder, inputs[i],
+                                    lp_build_vec_type(gallivm, vs_type), "");
+-
+    }
+ }
+ 
+-- 
+2.12.0
+