From f2a81390aa237e34f9f45b8149e8f25fba2674fc076519a579e3a6ba232f4937 Mon Sep 17 00:00:00 2001
From: Guillaume GARDET
Date: Wed, 21 Sep 2022 12:12:47 +0000
Subject: [PATCH] Accepting request 1005196 from home:Guillaume_G:branches:science:machinelearning

- Add patch to fix build with latest Arm Compute Library:
 * 1428.patch
 * fa93750.patch (dep for 1428.patch)

OBS-URL: https://build.opensuse.org/request/show/1005196
OBS-URL: https://build.opensuse.org/package/show/science:machinelearning/onednn?expand=0&rev=18
---
 1428.patch | 34 ++
 fa93750.patch | 1222 ++++++++++++++++++++++++++++++++++++++++++++++++
 onednn.changes | 7 +
 onednn.spec | 4 +
 4 files changed, 1267 insertions(+)
 create mode 100644 1428.patch
 create mode 100644 fa93750.patch

diff --git a/1428.patch b/1428.patch
new file mode 100644
index 0000000..2021fa9
--- /dev/null
+++ b/1428.patch
@@ -0,0 +1,34 @@
+From 24d4c1379d3c4cbf423f979fa25283b1914159c7 Mon Sep 17 00:00:00 2001
+From: Diana Bite
+Date: Thu, 21 Jul 2022 12:27:17 +0100
+Subject: [PATCH] cpu: aarch64: acl: disallow large padding in ACL wino to be
+ consistent with oneDNN
+
+---
+ src/cpu/aarch64/acl_convolution_utils.cpp | 11 +++++++++--
+ 1 file changed, 9 insertions(+), 2 deletions(-)
+
+diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
+index db3bde6a48..542033df0e 100644
+--- a/src/cpu/aarch64/acl_convolution_utils.cpp
++++ b/src/cpu/aarch64/acl_convolution_utils.cpp
+@@ -307,10 +307,17 @@ status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
+ // General Compute Library checks, memory tags are also set there
+ CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
+
+- const bool wino_shape_ok // unit strides only, no dilations
++ const bool shape_ok
++ // only unit strides allowed
+ = (acp.padstride_info.stride() == std::pair<unsigned int, unsigned int> {1, 1})
++ // Note: Compute Library supports arbitrary padding for wino kernels
++ // but we only allow small padding to be consistent with oneDNN
++ && (acp.padstride_info.pad().first <= 1) // padding left/right
++ && (acp.padstride_info.pad().second <= 1) // padding top/bottom
++ // only non-dilated convolutions allowed
+ && (acp.dilation_info == arm_compute::Size2D(1, 1));
+- if (!wino_shape_ok) return status::unimplemented;
++
++ ACL_CHECK_SUPPORT(!shape_ok, "shape not supported by winograd kernels");
+
+ // clang-format off
+ // Validate convolution manually to check for return status
diff --git a/fa93750.patch b/fa93750.patch
new file mode 100644
index 0000000..e3a03b0
--- /dev/null
+++ b/fa93750.patch
@@ -0,0 +1,1222 @@
+From fa93750bfb821fe05e3190b36f52b5bd88a57110 Mon Sep 17 00:00:00 2001
+From: Diana Bite
+Date: Thu, 24 Feb 2022 14:25:49 +0000
+Subject: [PATCH] cpu: aarch64: acl: fix inner_prod test failure and improve
+ validation
+
+---
+ src/cpu/aarch64/acl_binary.hpp | 8 +-
+ src/cpu/aarch64/acl_convolution_utils.cpp | 65 ++-----
+ src/cpu/aarch64/acl_eltwise.hpp | 4 +-
+ src/cpu/aarch64/acl_eltwise_utils.cpp | 17 +-
+ src/cpu/aarch64/acl_gemm_convolution.hpp | 6 +-
+ .../aarch64/acl_indirect_gemm_convolution.hpp | 6 +-
+ src/cpu/aarch64/acl_inner_product.cpp | 13 +-
+ src/cpu/aarch64/acl_inner_product.hpp | 179 +++++++++++++++--
+ src/cpu/aarch64/acl_inner_product_utils.cpp | 181 ------------------
+ src/cpu/aarch64/acl_inner_product_utils.hpp | 62 ------
+ src/cpu/aarch64/acl_softmax.hpp | 17 +-
+ src/cpu/aarch64/acl_utils.cpp | 4 +-
+ src/cpu/aarch64/acl_utils.hpp | 32 +++-
+ src/cpu/aarch64/acl_winograd_convolution.hpp | 6 +-
+
src/cpu/aarch64/matmul/acl_matmul.cpp | 4 +- + src/cpu/aarch64/matmul/acl_matmul.hpp | 4 +- + src/cpu/aarch64/matmul/acl_matmul_utils.cpp | 45 ++--- + src/cpu/aarch64/matmul/acl_matmul_utils.hpp | 6 +- + tests/benchdnn/inputs/ip/test_ip_acl | 26 +++ + 19 files changed, 281 insertions(+), 404 deletions(-) + delete mode 100644 src/cpu/aarch64/acl_inner_product_utils.cpp + delete mode 100644 src/cpu/aarch64/acl_inner_product_utils.hpp + create mode 100644 tests/benchdnn/inputs/ip/test_ip_acl + +diff --git a/src/cpu/aarch64/acl_binary.hpp b/src/cpu/aarch64/acl_binary.hpp +index 77adb45bef..122b094587 100644 +--- a/src/cpu/aarch64/acl_binary.hpp ++++ b/src/cpu/aarch64/acl_binary.hpp +@@ -125,7 +125,7 @@ struct acl_binary_t : public primitive_t { + + status_t init(engine_t *engine) { + +- using namespace acl_common_utils; ++ using namespace acl_utils; + + // Only support f32 and s32 for now + data_type_t ddt = dst_md(0)->data_type; +@@ -179,11 +179,7 @@ struct acl_binary_t : public primitive_t { + } + + // Call operator specific validate function to check support +- arm_compute::Status acl_st = validate(asp_); +- if (acl_st.error_code() != arm_compute::ErrorCode::OK) { +- MAYBE_REPORT_ACL_ERROR(acl_st.error_description().c_str()); +- return status::unimplemented; +- } ++ ACL_CHECK_VALID(validate(asp_)); + + // Initialize the ACL threads + acl_thread_bind(); +diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp +index ca91de49e3..e072dc5490 100644 +--- a/src/cpu/aarch64/acl_convolution_utils.cpp ++++ b/src/cpu/aarch64/acl_convolution_utils.cpp +@@ -1,5 +1,5 @@ + /******************************************************************************* +-* Copyright 2020-2021 Arm Ltd. and affiliates ++* Copyright 2020-2022 Arm Ltd. and affiliates + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. +@@ -156,10 +156,10 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md, + const auto acl_layout = is_nspc ? 
arm_compute::DataLayout::NHWC + : arm_compute::DataLayout::NCHW; + +- auto acl_src_data_t = acl_common_utils::get_acl_data_t(src_d.data_type()); +- auto acl_wei_data_t = acl_common_utils::get_acl_data_t(wei_d.data_type()); +- auto acl_dst_data_t = acl_common_utils::get_acl_data_t(dst_d.data_type()); +- auto acl_bia_data_t = acl_common_utils::get_acl_data_t(bia_d.data_type()); ++ auto acl_src_data_t = acl_utils::get_acl_data_t(src_d.data_type()); ++ auto acl_wei_data_t = acl_utils::get_acl_data_t(wei_d.data_type()); ++ auto acl_dst_data_t = acl_utils::get_acl_data_t(dst_d.data_type()); ++ auto acl_bia_data_t = acl_utils::get_acl_data_t(bia_d.data_type()); + + if (acl_bia_data_t == arm_compute::DataType::UNKNOWN) + acl_bia_data_t = arm_compute::DataType::F32; +@@ -212,33 +212,14 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md, + // is_eltwise(true) here stands for eltwise.scale == 1.f check + acp.sum_with_eltwise = (post_ops.len() == 2) && post_ops.entry_[0].is_sum() + && post_ops.entry_[1].is_eltwise(true); +- acp.act_info = acl_common_utils::get_acl_act(attr); ++ acp.act_info = acl_utils::get_acl_act(attr); + + if (acp.sum_with_eltwise) { +- // clang-format off +- // Validate activation layer manually to check for return status +- auto acl_al_st = arm_compute::NEActivationLayer::validate( +- &acp.dst_info, +- &acp.dst_info, +- acp.act_info); +- // clang-format on +- if (acl_al_st.error_code() != arm_compute::ErrorCode::OK) { +- MAYBE_REPORT_ACL_ERROR(acl_al_st.error_description().c_str()); +- return status::unimplemented; +- } +- +- // clang-format off +- // Validate arithmetic addition manually to check for return status +- auto acl_aa_st = arm_compute::NEArithmeticAddition::validate( +- &acp.dst_info, +- &acp.dst_info, +- &acp.dst_info, +- arm_compute::ConvertPolicy::SATURATE); +- // clang-format on +- if (acl_aa_st.error_code() != arm_compute::ErrorCode::OK) { +- MAYBE_REPORT_ACL_ERROR(acl_aa_st.error_description().c_str()); +- return status::unimplemented; +- } ++ ACL_CHECK_VALID(arm_compute::NEActivationLayer::validate( // eltwise ++ &acp.dst_info, &acp.dst_info, acp.act_info)); ++ ACL_CHECK_VALID(arm_compute::NEArithmeticAddition::validate( // sum ++ &acp.dst_info, &acp.dst_info, &acp.dst_info, ++ arm_compute::ConvertPolicy::SATURATE)); + } + + return status::success; +@@ -254,7 +235,7 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md, + + // clang-format off + // Validate convolution manually to check for return status +- auto acl_st = arm_compute::NEGEMMConvolutionLayer::validate( ++ ACL_CHECK_VALID(arm_compute::NEGEMMConvolutionLayer::validate( + &acp.src_info, + &acp.wei_info, + acp.with_bias ? &acp.bia_info : nullptr, +@@ -263,12 +244,8 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md, + acp.weights_info, + acp.dilation_info, + acp.act_info, +- acp.fast_math); ++ acp.fast_math)); + // clang-format on +- if (acl_st.error_code() != arm_compute::ErrorCode::OK) { +- MAYBE_REPORT_ACL_ERROR(acl_st.error_description().c_str()); +- return status::unimplemented; +- } + + return status::success; + } +@@ -289,7 +266,7 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md, + + // clang-format off + // NOTE: indirect convolution method supports only nhwc layout. +- auto acl_st = arm_compute::NEGEMMConv2d::validate( ++ ACL_CHECK_VALID(arm_compute::NEGEMMConv2d::validate( + &acp.src_info, + &acp.wei_info, + acp.with_bias ? 
&acp.bia_info : nullptr, +@@ -298,12 +275,8 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md, + acp.dilation_info, + acp.act_info, + acp.fast_math, +- 1)); ++ 1))); + // clang-format on +- if (acl_st.error_code() != arm_compute::ErrorCode::OK) { +- MAYBE_REPORT_ACL_ERROR(acl_st.error_description().c_str()); +- return status::unimplemented; +- } + + return status::success; + } +@@ -336,19 +309,15 @@ status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md, + + // clang-format off + // Validate convolution manually to check for return status +- auto acl_st = arm_compute::NEWinogradConvolutionLayer::validate( ++ ACL_CHECK_VALID(arm_compute::NEWinogradConvolutionLayer::validate( + &acp.src_info, + &acp.wei_info, + acp.with_bias ? &acp.bia_info : nullptr, + &acp.dst_info, + acp.padstride_info, + acp.act_info, +- true); // enable_fast_math flag in ACL Winograd ++ true)); // enable_fast_math flag in ACL Winograd + // clang-format on +- if (acl_st.error_code() != arm_compute::ErrorCode::OK) { +- MAYBE_REPORT_ACL_ERROR(acl_st.error_description().c_str()); +- return status::unimplemented; +- } + + return status::success; + } +diff --git a/src/cpu/aarch64/acl_eltwise.hpp b/src/cpu/aarch64/acl_eltwise.hpp +index a55b89272c..381368aabb 100644 +--- a/src/cpu/aarch64/acl_eltwise.hpp ++++ b/src/cpu/aarch64/acl_eltwise.hpp +@@ -1,5 +1,5 @@ + /******************************************************************************* +-* Copyright 2021 Arm Ltd. and affiliates ++* Copyright 2021-2022 Arm Ltd. and affiliates + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. +@@ -78,7 +78,7 @@ struct acl_eltwise_fwd_t : public primitive_t { + aep_, data_md_, *desc(), *attr()); + if (conf_status != status::success) return status::unimplemented; + +- acl_common_utils::acl_thread_bind(); ++ acl_utils::acl_thread_bind(); + + return status::success; + } +diff --git a/src/cpu/aarch64/acl_eltwise_utils.cpp b/src/cpu/aarch64/acl_eltwise_utils.cpp +index 35e809e042..880b6aeaae 100644 +--- a/src/cpu/aarch64/acl_eltwise_utils.cpp ++++ b/src/cpu/aarch64/acl_eltwise_utils.cpp +@@ -1,5 +1,5 @@ + /******************************************************************************* +-* Copyright 2021 Arm Ltd. and affiliates ++* Copyright 2021-2022 Arm Ltd. and affiliates + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. +@@ -46,7 +46,7 @@ status_t acl_eltwise_check(acl_eltwise_conf_t &aep, memory_desc_t &data_md, + + const alg_kind_t eltwise_alg = ed.alg_kind; + +- bool activation_supported = acl_common_utils::acl_act_ok(eltwise_alg); ++ bool activation_supported = acl_utils::acl_act_ok(eltwise_alg); + if (!activation_supported) { return status::unimplemented; } + + // batch size +@@ -69,8 +69,8 @@ status_t acl_eltwise_check(acl_eltwise_conf_t &aep, memory_desc_t &data_md, + const auto acl_layout = is_nspc ? 
arm_compute::DataLayout::NHWC + : arm_compute::DataLayout::NCHW; + +- auto acl_src_data_t = acl_common_utils::get_acl_data_t(data_d.data_type()); +- auto acl_dst_data_t = acl_common_utils::get_acl_data_t(data_d.data_type()); ++ auto acl_src_data_t = acl_utils::get_acl_data_t(data_d.data_type()); ++ auto acl_dst_data_t = acl_utils::get_acl_data_t(data_d.data_type()); + + // clang-format off + aep.src_info = arm_compute::TensorInfo( +@@ -93,7 +93,7 @@ status_t acl_eltwise_check(acl_eltwise_conf_t &aep, memory_desc_t &data_md, + aep.dst_info.set_quantization_info(arm_compute::QuantizationInfo(1, 0)); + } + +- aep.act_info = acl_common_utils::get_acl_act(ed); ++ aep.act_info = acl_utils::get_acl_act(ed); + + return status::success; + } +@@ -105,14 +105,11 @@ status_t init_conf_eltwise(acl_eltwise_conf_t &aep, memory_desc_t &data_md, + CHECK(acl_eltwise_check(aep, data_md, ed, attr)); + + // clang-format off +- auto acl_st = arm_compute::NEActivationLayer::validate( ++ ACL_CHECK_VALID(arm_compute::NEActivationLayer::validate( + &aep.src_info, + &aep.dst_info, +- aep.act_info); ++ aep.act_info)); + // clang-format on +- if (acl_st.error_code() != arm_compute::ErrorCode::OK) { +- return status::unimplemented; +- } + + return status::success; + } +diff --git a/src/cpu/aarch64/acl_gemm_convolution.hpp b/src/cpu/aarch64/acl_gemm_convolution.hpp +index 3e7542b6bf..496f501211 100644 +--- a/src/cpu/aarch64/acl_gemm_convolution.hpp ++++ b/src/cpu/aarch64/acl_gemm_convolution.hpp +@@ -1,5 +1,5 @@ + /******************************************************************************* +-* Copyright 2020-2021 Arm Ltd. and affiliates ++* Copyright 2020-2022 Arm Ltd. and affiliates + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. +@@ -108,7 +108,7 @@ struct acl_gemm_convolution_fwd_t : public primitive_t { + src_md_, weights_md_, dst_md_, bias_md_, *desc(), *attr()); + if (conf_status != status::success) return status::unimplemented; + +- acl_common_utils::acl_thread_bind(); ++ acl_utils::acl_thread_bind(); + + return status::success; + } +@@ -146,7 +146,7 @@ struct acl_gemm_convolution_fwd_t : public primitive_t { + // sum+eltwise post-ops + if (eltwise_only || sum_with_eltwise) { + const auto act_type = po.entry_[sum_with_eltwise].eltwise.alg; +- eltwise_ok = acl_common_utils::acl_act_ok(act_type); ++ eltwise_ok = acl_utils::acl_act_ok(act_type); + } + + return eltwise_ok || (po.len() == 0); +diff --git a/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp b/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp +index 0a0021aeee..18e757a2c9 100644 +--- a/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp ++++ b/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp +@@ -1,5 +1,5 @@ + /******************************************************************************* +-* Copyright 2021 Arm Ltd. and affiliates ++* Copyright 2021-2022 Arm Ltd. and affiliates + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
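Side note for readers of this series: every call-site conversion above and below replaces the same hand-rolled idiom, namely calling an operator's static validate() and inspecting the returned arm_compute::Status before anything is configured. A minimal standalone sketch of that pre-patch boilerplate, modelled on the hunks above (check_act is an illustrative name, not code from the patch; the TensorInfo arguments stand for whatever the caller prepared):

    // Sketch of the validate-before-configure boilerplate this series removes.
    #include "arm_compute/runtime/NEON/NEFunctions.h"

    static dnnl::impl::status_t check_act(const arm_compute::TensorInfo &src,
            const arm_compute::TensorInfo &dst,
            const arm_compute::ActivationLayerInfo &act) {
        arm_compute::Status st
                = arm_compute::NEActivationLayer::validate(&src, &dst, act);
        if (st.error_code() != arm_compute::ErrorCode::OK) {
            // previously surfaced through MAYBE_REPORT_ACL_ERROR
            return dnnl::impl::status::unimplemented;
        }
        return dnnl::impl::status::success;
    }

The patch collapses each such block into a single ACL_CHECK_VALID(...) line; the macro itself is added to acl_utils.hpp further down.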
+@@ -109,7 +109,7 @@ struct acl_indirect_gemm_convolution_fwd_t : public primitive_t {
+ *attr());
+ if (conf_status != status::success) return status::unimplemented;
+
+- acl_common_utils::acl_thread_bind();
++ acl_utils::acl_thread_bind();
+
+ return status::success;
+ }
+@@ -134,7 +134,7 @@ struct acl_indirect_gemm_convolution_fwd_t : public primitive_t {
+ // sum+eltwise post-ops
+ if (eltwise_only || sum_with_eltwise) {
+ const auto act_type = po.entry_[sum_with_eltwise].eltwise.alg;
+- eltwise_ok = acl_common_utils::acl_act_ok(act_type);
++ eltwise_ok = acl_utils::acl_act_ok(act_type);
+ }
+
+ return eltwise_ok || (po.len() == 0);
+diff --git a/src/cpu/aarch64/acl_inner_product.cpp b/src/cpu/aarch64/acl_inner_product.cpp
+index 7a316135f8..f355a657c7 100644
+--- a/src/cpu/aarch64/acl_inner_product.cpp
++++ b/src/cpu/aarch64/acl_inner_product.cpp
+@@ -1,5 +1,5 @@
+ /*******************************************************************************
+-* Copyright 2021 Arm Ltd. and affiliates
++* Copyright 2021-2022 Arm Ltd. and affiliates
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+@@ -21,23 +21,18 @@ namespace impl {
+ namespace cpu {
+ namespace aarch64 {
+
+-using namespace dnnl::impl::status;
+-using namespace dnnl::impl::memory_tracking::names;
+-using namespace dnnl::impl::utils;
+-
+ status_t acl_inner_product_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
+ // Lock here is needed because resource_mapper does not support
+ // concurrent multithreaded access.
+ std::lock_guard<std::mutex> _lock {this->mtx};
+
+- status_t status = status::success;
+ auto src_base = CTX_IN_MEM(const data_t *, DNNL_ARG_SRC);
+ auto wei_base = CTX_IN_MEM(const data_t *, DNNL_ARG_WEIGHTS);
+ auto bia_base = CTX_IN_MEM(const data_t *, DNNL_ARG_BIAS);
+ auto dst_base = CTX_OUT_MEM(data_t *, DNNL_ARG_DST);
+
+- bool with_bias = pd()->aip_.with_bias;
+- bool with_sum = pd()->aip_.with_sum;
++ bool with_bias = pd()->aip.with_bias;
++ bool with_sum = pd()->aip.with_sum;
+
+ // Retrieve primitive resource and configured Compute Library objects
+ auto *acl_resource
+@@ -64,7 +59,7 @@ status_t acl_inner_product_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
+ acl_obj.dst_tensor.allocator()->free();
+ if (with_bias) { acl_obj.bia_tensor.allocator()->free(); }
+
+- return status;
++ return status::success;
+ }
+
+ } // namespace aarch64
+diff --git a/src/cpu/aarch64/acl_inner_product.hpp b/src/cpu/aarch64/acl_inner_product.hpp
+index dd742ea0bc..e5a9bdcc8a 100644
+--- a/src/cpu/aarch64/acl_inner_product.hpp
++++ b/src/cpu/aarch64/acl_inner_product.hpp
+@@ -1,5 +1,5 @@
+ /*******************************************************************************
+-* Copyright 2021 Arm Ltd. and affiliates
++* Copyright 2021-2022 Arm Ltd. and affiliates
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+@@ -17,15 +17,34 @@
+ #ifndef CPU_AARCH64_ACL_INNER_PRODUCT_HPP
+ #define CPU_AARCH64_ACL_INNER_PRODUCT_HPP
+
++#include "cpu/aarch64/acl_utils.hpp"
+ #include "cpu/cpu_inner_product_pd.hpp"
+
+-#include "cpu/aarch64/acl_inner_product_utils.hpp"
+-
+ namespace dnnl {
+ namespace impl {
+ namespace cpu {
+ namespace aarch64 {
+
++struct acl_ip_obj_t {
++ arm_compute::NEFullyConnectedLayer fc;
++ arm_compute::NEArithmeticAddition add;
++ arm_compute::Tensor src_tensor;
++ arm_compute::Tensor wei_tensor;
++ arm_compute::Tensor bia_tensor;
++ arm_compute::Tensor dst_tensor;
++ arm_compute::Tensor dst_acc_tensor;
++};
++
++struct acl_ip_conf_t {
++ bool with_bias;
++ bool with_sum;
++ arm_compute::TensorInfo src_info;
++ arm_compute::TensorInfo wei_info;
++ arm_compute::TensorInfo bia_info;
++ arm_compute::TensorInfo dst_info;
++ arm_compute::FullyConnectedLayerInfo fc_info;
++};
++
+ struct acl_ip_resource_t : public resource_t {
+ acl_ip_resource_t() : acl_ip_obj_(utils::make_unique<acl_ip_obj_t>()) {}
+
+@@ -71,33 +90,26 @@ struct acl_inner_product_fwd_t : public primitive_t {
+ struct pd_t : public cpu_inner_product_fwd_pd_t {
+ using cpu_inner_product_fwd_pd_t::cpu_inner_product_fwd_pd_t;
+
+- DECLARE_COMMON_PD_T("inner_product:acl", acl_inner_product_fwd_t);
++ DECLARE_COMMON_PD_T("acl", acl_inner_product_fwd_t);
+
+ status_t init(engine_t *engine) {
+- using namespace utils;
+-
+ const bool ok = is_fwd() && !has_zero_dim_memory()
+ && expect_data_types(data_type::f32, data_type::f32,
+ data_type::f32, data_type::f32, data_type::f32)
+ && attr()->has_default_values(
+ primitive_attr_t::skip_mask_t::post_ops,
+ data_type::f32)
+- && (set_default_params() == status::success)
+- && post_ops_ok();
++ && set_default_params() == status::success && post_ops_ok();
+
+ if (!ok) return status::unimplemented;
+
+- auto conf_status = acl_inner_product_utils::init_conf_ip(aip_,
+- src_md_, weights_md_, dst_md_, bias_md_, *desc(), *attr());
+- // conf_status here can be either status::success or status::unimplemented
+- if (conf_status != status::success) return conf_status;
+-
+- acl_common_utils::acl_thread_bind();
++ CHECK(init_conf_ip(aip, src_md_, weights_md_, dst_md_, bias_md_,
++ *desc(), *attr()));
+
+ return status::success;
+ }
+
+- acl_ip_conf_t aip_;
++ acl_ip_conf_t aip;
+
+ protected:
+ bool post_ops_ok() const {
+@@ -111,16 +123,149 @@ struct acl_inner_product_fwd_t : public primitive_t {
+ // Compute Library supports here only one eltwise post-op or sum
+ if (po.len() == 1 && is_eltwise(0)) {
+ const auto act_type = po.entry_[0].eltwise.alg;
+- eltwise_ok = acl_common_utils::acl_act_ok(act_type);
++ eltwise_ok = acl_utils::acl_act_ok(act_type);
+ }
+
+ return eltwise_ok || (po.len() == 1 && is_sum(0))
+ || (po.len() == 0);
+ }
++
++ status_t init_conf_ip(acl_ip_conf_t &aip, memory_desc_t &src_md,
++ memory_desc_t &wei_md, memory_desc_t &dst_md,
++ memory_desc_t &bias_md, const inner_product_desc_t &ipd,
++ const primitive_attr_t &attr) {
++
++ ACL_CHECK_SUPPORT(src_md.ndims != wei_md.ndims,
++ "source and weights dimensions must match");
++
++ const int ndims = src_md.ndims;
++
++ const bool is_2d = (ndims == 2);
++ const bool is_4d = (ndims == 4);
++
++ ACL_CHECK_SUPPORT(
++ !(is_2d || is_4d), "ACL supports only 2d or 4d cases");
++
++ // batch size
++ const int n = src_md.dims[0];
++
++ // input and output channels
++ const int ic = src_md.dims[1];
++ const int oc = dst_md.dims[1];
++
++ // source spatial dimensions
++ const int ih = is_4d ? src_md.dims[ndims - 2] : 0;
++ const int iw = is_4d ?
src_md.dims[ndims - 1] : 0; ++ ++ // weights spatial dimensions ++ const int kh = is_4d ? wei_md.dims[ndims - 2] : 0; ++ const int kw = is_4d ? wei_md.dims[ndims - 1] : 0; ++ ++ // Only NCHW or NHWC derivatives supported by ACL kernels ++ using namespace format_tag; ++ auto src_tag = memory_desc_matches_one_of_tag( ++ src_md, nhwc, nchw, nc, cn); ++ auto wei_tag = memory_desc_matches_one_of_tag( ++ wei_md, ohwi, oihw, oi, io); ++ auto dst_tag = memory_desc_matches_one_of_tag(dst_md, nc, cn); ++ ++ ACL_CHECK_SUPPORT( ++ utils::one_of(format_tag::undef, src_tag, wei_tag, dst_tag), ++ "unsupported memory layout"); ++ ++ ACL_CHECK_SUPPORT(is_2d && src_tag != dst_tag, ++ "for src and dst layouts must match"); ++ ++ arm_compute::TensorShape src_shape, wei_shape; ++ if (is_2d) { ++ src_shape = (src_tag == nc) ? arm_compute::TensorShape(ic, n) ++ : arm_compute::TensorShape(n, ic); ++ ++ wei_shape = (wei_tag == io) ? arm_compute::TensorShape(oc, ic) ++ : arm_compute::TensorShape(ic, oc); ++ } ++ if (is_4d) { ++ src_shape = (src_tag == nhwc) ++ ? arm_compute::TensorShape(ic, iw, ih, n) ++ : arm_compute::TensorShape(iw, ih, ic, n); ++ ++ // ACL requires the weights to be in 2D flattened shape ++ const int flattened_ic = is_4d ? ic * kh * kw : ic; ++ wei_shape = arm_compute::TensorShape(flattened_ic, oc); ++ } ++ ++ arm_compute::DataLayout src_layout = (src_tag == nhwc) ++ ? arm_compute::DataLayout::NHWC ++ : arm_compute::DataLayout::NCHW; ++ ++ arm_compute::DataLayout wei_layout = (wei_tag == ohwi) ++ ? arm_compute::DataLayout::NHWC ++ : arm_compute::DataLayout::NCHW; ++ ++ aip.src_info = arm_compute::TensorInfo( ++ src_shape, 1, arm_compute::DataType::F32, src_layout); ++ ++ aip.wei_info = arm_compute::TensorInfo( ++ wei_shape, 1, arm_compute::DataType::F32, wei_layout); ++ ++ aip.dst_info ++ = arm_compute::TensorInfo(arm_compute::TensorShape(oc, n), ++ 1, arm_compute::DataType::F32); ++ ++ aip.with_bias = ipd.bias_desc.format_kind != format_kind::undef; ++ aip.bia_info = arm_compute::TensorInfo(aip.with_bias ++ ? arm_compute::TensorShape(oc) ++ : arm_compute::TensorShape(), ++ 1, arm_compute::DataType::F32); ++ ++ aip.fc_info.weights_trained_layout = wei_layout; ++ if (is_2d && wei_tag != src_tag) { ++ // weights are already transposed ++ aip.fc_info.transpose_weights = false; ++ } ++ ++ // Either activation or sum is supported as post-op at the moment ++ aip.fc_info.activation_info = acl_utils::get_acl_act(attr); ++ const auto &post_ops = attr.post_ops_; ++ aip.with_sum = (post_ops.len() == 1) && post_ops.entry_[0].is_sum(); ++ ++ // Fast math mode ++ auto math_mode = get_fpmath_mode(); ++ bool is_fastmath_enabled = utils::one_of( ++ math_mode, fpmath_mode::bf16, fpmath_mode::any); ++ aip.fc_info.enable_fast_math = is_fastmath_enabled; ++ ++ // clang-format off ++ // Validate fully connected layer manually to check for return status ++ ACL_CHECK_VALID(arm_compute::NEFullyConnectedLayer::validate( ++ &aip.src_info, ++ &aip.wei_info, ++ aip.with_bias ? 
&aip.bia_info : nullptr, ++ &aip.dst_info, ++ aip.fc_info)); ++ ++ if (aip.with_sum) { ++ // Validate arithmetic addition manually to check for return status ++ ACL_CHECK_VALID(arm_compute::NEArithmeticAddition::validate( ++ &aip.dst_info, ++ &aip.dst_info, ++ &aip.dst_info, ++ arm_compute::ConvertPolicy::SATURATE)); ++ // clang-format on ++ } ++ ++ return status::success; ++ } + }; // pd_t + + acl_inner_product_fwd_t(const pd_t *apd) : primitive_t(apd) {} + ++ status_t init(engine_t *engine) override { ++ acl_utils::acl_thread_bind(); ++ ++ return status::success; ++ } ++ + status_t create_resource( + engine_t *engine, resource_mapper_t &mapper) const override { + if (mapper.has_resource(this)) return status::success; +@@ -129,7 +274,7 @@ struct acl_inner_product_fwd_t : public primitive_t { + if (!r) return status::out_of_memory; + + // Configure the resource based on information from primitive descriptor +- auto st = r->configure(pd()->aip_); ++ auto st = r->configure(pd()->aip); + if (st == status::success) { mapper.add(this, std::move(r)); } + + return st; +diff --git a/src/cpu/aarch64/acl_inner_product_utils.cpp b/src/cpu/aarch64/acl_inner_product_utils.cpp +deleted file mode 100644 +index c8fab86f22..0000000000 +--- a/src/cpu/aarch64/acl_inner_product_utils.cpp ++++ /dev/null +@@ -1,181 +0,0 @@ +-/******************************************************************************* +-* Copyright 2021 Arm Ltd. and affiliates +-* +-* Licensed under the Apache License, Version 2.0 (the "License"); +-* you may not use this file except in compliance with the License. +-* You may obtain a copy of the License at +-* +-* http://www.apache.org/licenses/LICENSE-2.0 +-* +-* Unless required by applicable law or agreed to in writing, software +-* distributed under the License is distributed on an "AS IS" BASIS, +-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-* See the License for the specific language governing permissions and +-* limitations under the License. 
+-*******************************************************************************/ +- +-#include "cpu/aarch64/acl_inner_product_utils.hpp" +- +-namespace dnnl { +-namespace impl { +-namespace cpu { +-namespace aarch64 { +- +-namespace acl_inner_product_utils { +- +-using namespace format_tag; +-using namespace utils; +-using namespace status; +- +-status_t init_conf_ip(acl_ip_conf_t &aip, memory_desc_t &src_md, +- memory_desc_t &wei_md, memory_desc_t &dst_md, memory_desc_t &bias_md, +- const inner_product_desc_t &ipd, const primitive_attr_t &attr) { +- const memory_desc_wrapper src_d(&src_md); +- const memory_desc_wrapper wei_d(&wei_md); +- const memory_desc_wrapper dst_d(&dst_md); +- const memory_desc_wrapper bia_d(&bias_md); +- +- // Compute Library currently supports forward propagation only +- const prop_kind_t prop_kind = ipd.prop_kind; +- const bool is_fwd = (prop_kind == dnnl_forward_training) +- || (prop_kind == dnnl_forward_inference); +- if (!is_fwd) return status::unimplemented; +- +- const int with_groups = wei_d.ndims() == src_d.ndims() + 1; +- const int ndims = src_d.ndims(); +- +- // There are two sub-cases: src & wei tensors are either 2- or 4-dimensional +- const bool is_2d = (ndims == 2) && (wei_d.ndims() == 2); +- const bool is_4d = (ndims == 4) && (wei_d.ndims() == 4); +- +- // Compute Library unsupported shape scenarios +- // FP32 only is supported at the moment +- if (one_of(true, !(is_4d || is_2d), with_groups)) { return unimplemented; } +- +- // batch size +- const int mb = src_d.dims()[0]; +- +- // src/input channels, height, width +- const int ic = src_d.dims()[1]; +- const int ih = is_4d ? src_d.dims()[ndims - 2] : 0; +- const int iw = is_4d ? src_d.dims()[ndims - 1] : 0; +- +- // dst/output channels +- const int oc = dst_d.dims()[1]; +- +- // weights height, width +- const int kh = is_4d ? wei_d.dims()[with_groups + ndims - 2] : 0; +- const int kw = is_4d ? wei_d.dims()[with_groups + ndims - 1] : 0; +- +- aip.with_bias = ipd.bias_desc.format_kind != format_kind::undef; +- +- // Data layout is already defined thus should only be checked +- auto src_tag = memory_desc_matches_one_of_tag(src_md, nhwc, nchw, nc, cn); +- auto wei_tag = memory_desc_matches_one_of_tag(wei_md, ohwi, oihw, oi, io); +- auto dst_tag = memory_desc_matches_one_of_tag(dst_md, nc, cn); +- if (one_of(format_tag::undef, src_tag, wei_tag, dst_tag)) { +- return status::unimplemented; +- } +- +- arm_compute::TensorShape src_shape {(src_tag == nc) +- ? arm_compute::TensorShape(ic, mb) +- : arm_compute::TensorShape(mb, ic)}; +- if (is_4d) { +- src_shape = (src_tag == nhwc) +- ? arm_compute::TensorShape(ic, iw, ih, mb) +- : arm_compute::TensorShape(iw, ih, ic, mb); +- } +- +- // Compute Library requires the weights to be 2-dimensional for FC layer +- arm_compute::TensorShape wei_shape { +- arm_compute::TensorShape(is_4d ? ic * kh * kw : ic, oc)}; +- if (is_2d && wei_tag == io) { +- wei_shape = arm_compute::TensorShape(oc, ic); +- } +- +- arm_compute::DataLayout wei_layout {(wei_tag == ohwi || wei_tag == oi) +- ? arm_compute::DataLayout::NHWC +- : arm_compute::DataLayout::NCHW}; +- +- // clang-format off +- aip.src_info = arm_compute::TensorInfo( +- src_shape, +- 1, +- arm_compute::DataType::F32, +- (src_tag == nhwc || src_tag == nc) ? +- arm_compute::DataLayout::NHWC : arm_compute::DataLayout::NCHW); +- +- aip.wei_info = arm_compute::TensorInfo( +- wei_shape, +- 1, +- arm_compute::DataType::F32, +- wei_layout); +- +- aip.dst_info = arm_compute::TensorInfo( +- (dst_tag == nhwc || dst_tag == nc) ? 
+- arm_compute::TensorShape(oc, mb) : arm_compute::TensorShape(mb, oc), +- 1, +- arm_compute::DataType::F32, +- (dst_tag == nhwc || dst_tag == nc) ? +- arm_compute::DataLayout::NHWC : arm_compute::DataLayout::NCHW); +- +- aip.bia_info = arm_compute::TensorInfo( +- aip.with_bias ? +- arm_compute::TensorShape(oc) : arm_compute::TensorShape(), +- 1, +- arm_compute::DataType::F32); +- // clang-format on +- +- aip.fc_info.weights_trained_layout = wei_layout; +- if (is_2d && wei_tag != src_tag) { aip.fc_info.transpose_weights = false; } +- +- // Either activation or sum is supported as post-op at the moment +- aip.fc_info.activation_info = acl_common_utils::get_acl_act(attr); +- const auto &post_ops = attr.post_ops_; +- aip.with_sum = (post_ops.len() == 1) && post_ops.entry_[0].is_sum(); +- +- // Fast math mode +- auto math_mode = get_fpmath_mode(); +- bool is_fastmath_enabled +- = one_of(math_mode, fpmath_mode::bf16, fpmath_mode::any); +- aip.fc_info.enable_fast_math = is_fastmath_enabled; +- +- // clang-format off +- // Validate fully connected layer manually to check for return status +- auto acl_st = arm_compute::NEFullyConnectedLayer::validate( +- &aip.src_info, +- &aip.wei_info, +- aip.with_bias ? &aip.bia_info : nullptr, +- &aip.dst_info, +- aip.fc_info); +- // clang-format on +- if (acl_st.error_code() != arm_compute::ErrorCode::OK) { +- MAYBE_REPORT_ACL_ERROR(acl_st.error_description().c_str()); +- return status::unimplemented; +- } +- +- if (aip.with_sum) { +- // clang-format off +- // Validate arithmetic addition manually to check for return status +- auto acl_aa_st = arm_compute::NEArithmeticAddition::validate( +- &aip.dst_info, +- &aip.dst_info, +- &aip.dst_info, +- arm_compute::ConvertPolicy::SATURATE); +- +- // clang-format on +- if (acl_aa_st.error_code() != arm_compute::ErrorCode::OK) { +- MAYBE_REPORT_ACL_ERROR(acl_aa_st.error_description().c_str()); +- return status::unimplemented; +- } +- } +- +- return status::success; +-} +- +-} // namespace acl_inner_product_utils +- +-} // namespace aarch64 +-} // namespace cpu +-} // namespace impl +-} // namespace dnnl +diff --git a/src/cpu/aarch64/acl_inner_product_utils.hpp b/src/cpu/aarch64/acl_inner_product_utils.hpp +deleted file mode 100644 +index 022d0e3349..0000000000 +--- a/src/cpu/aarch64/acl_inner_product_utils.hpp ++++ /dev/null +@@ -1,62 +0,0 @@ +-/******************************************************************************* +-* Copyright 2021 Arm Ltd. and affiliates +-* +-* Licensed under the Apache License, Version 2.0 (the "License"); +-* you may not use this file except in compliance with the License. +-* You may obtain a copy of the License at +-* +-* http://www.apache.org/licenses/LICENSE-2.0 +-* +-* Unless required by applicable law or agreed to in writing, software +-* distributed under the License is distributed on an "AS IS" BASIS, +-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-* See the License for the specific language governing permissions and +-* limitations under the License. 
+-*******************************************************************************/ +- +-#ifndef CPU_AARCH64_ACL_INNER_PRODUCT_UTILS_HPP +-#define CPU_AARCH64_ACL_INNER_PRODUCT_UTILS_HPP +- +-#include "cpu/cpu_inner_product_pd.hpp" +- +-#include "cpu/aarch64/acl_utils.hpp" +- +-namespace dnnl { +-namespace impl { +-namespace cpu { +-namespace aarch64 { +- +-struct acl_ip_obj_t { +- arm_compute::NEFullyConnectedLayer fc; +- arm_compute::NEArithmeticAddition add; +- arm_compute::Tensor src_tensor; +- arm_compute::Tensor wei_tensor; +- arm_compute::Tensor bia_tensor; +- arm_compute::Tensor dst_tensor; +- arm_compute::Tensor dst_acc_tensor; +-}; +- +-struct acl_ip_conf_t { +- bool with_bias; +- bool with_sum; +- arm_compute::TensorInfo src_info; +- arm_compute::TensorInfo wei_info; +- arm_compute::TensorInfo bia_info; +- arm_compute::TensorInfo dst_info; +- arm_compute::FullyConnectedLayerInfo fc_info; +-}; +- +-namespace acl_inner_product_utils { +- +-status_t init_conf_ip(acl_ip_conf_t &aip, memory_desc_t &src_md, +- memory_desc_t &wei_md, memory_desc_t &dst_md, memory_desc_t &bias_md, +- const inner_product_desc_t &ipd, const primitive_attr_t &attr); +- +-} // namespace acl_inner_product_utils +- +-} // namespace aarch64 +-} // namespace cpu +-} // namespace impl +-} // namespace dnnl +- +-#endif // CPU_AARCH64_ACL_INNER_PRODUCT_UTILS_HPP +diff --git a/src/cpu/aarch64/acl_softmax.hpp b/src/cpu/aarch64/acl_softmax.hpp +index a4bfd0c3bd..4ee7139a93 100644 +--- a/src/cpu/aarch64/acl_softmax.hpp ++++ b/src/cpu/aarch64/acl_softmax.hpp +@@ -129,7 +129,7 @@ struct acl_softmax_fwd_t : public primitive_t { + arm_compute::DataLayout acl_layout = arm_compute::DataLayout::NHWC; + + const arm_compute::DataType acl_data_t +- = acl_common_utils::get_acl_data_t(data_type); ++ = acl_utils::get_acl_data_t(data_type); + + const int threads = dnnl_get_max_threads(); + if (inner_size_ == 1) { +@@ -189,20 +189,15 @@ struct acl_softmax_fwd_t : public primitive_t { + } + + // Validate manually to check for return status +- arm_compute::Status acl_st; + if (asp_.is_logsoftmax) { +- acl_st = arm_compute::NELogSoftmaxLayer::validate( +- &asp_.src_info, &asp_.dst_info, asp_.beta, asp_.axis); ++ ACL_CHECK_VALID(arm_compute::NELogSoftmaxLayer::validate( ++ &asp_.src_info, &asp_.dst_info, asp_.beta, asp_.axis)); + } else { +- acl_st = arm_compute::NESoftmaxLayer::validate( +- &asp_.src_info, &asp_.dst_info, asp_.beta, asp_.axis); +- } +- if (acl_st.error_code() != arm_compute::ErrorCode::OK) { +- MAYBE_REPORT_ACL_ERROR(acl_st.error_description().c_str()); +- return status::unimplemented; ++ ACL_CHECK_VALID(arm_compute::NESoftmaxLayer::validate( ++ &asp_.src_info, &asp_.dst_info, asp_.beta, asp_.axis)); + } + +- acl_common_utils::acl_thread_bind(); ++ acl_utils::acl_thread_bind(); + + return status::success; + } +diff --git a/src/cpu/aarch64/acl_utils.cpp b/src/cpu/aarch64/acl_utils.cpp +index a69f14b6f9..098217f50e 100644 +--- a/src/cpu/aarch64/acl_utils.cpp ++++ b/src/cpu/aarch64/acl_utils.cpp +@@ -21,7 +21,7 @@ namespace impl { + namespace cpu { + namespace aarch64 { + +-namespace acl_common_utils { ++namespace acl_utils { + + using namespace dnnl::impl::alg_kind; + using namespace data_type; +@@ -247,7 +247,7 @@ status_t permute_common_dense_dimension_to_last(memory_desc_t *d0_permed, + return status::success; + } + +-} // namespace acl_common_utils ++} // namespace acl_utils + + } // namespace aarch64 + } // namespace cpu +diff --git a/src/cpu/aarch64/acl_utils.hpp b/src/cpu/aarch64/acl_utils.hpp +index 
565cde66a9..bb8efc998c 100644 +--- a/src/cpu/aarch64/acl_utils.hpp ++++ b/src/cpu/aarch64/acl_utils.hpp +@@ -21,13 +21,10 @@ + + #include "oneapi/dnnl/dnnl_types.h" + +-#include "common/bfloat16.hpp" +-#include "common/c_types_map.hpp" + #include "common/dnnl_thread.hpp" + #include "common/memory_tracking.hpp" + #include "common/primitive.hpp" + #include "common/utils.hpp" +- + #include "cpu/cpu_engine.hpp" + + #include "arm_compute/runtime/NEON/NEFunctions.h" +@@ -38,7 +35,7 @@ namespace impl { + namespace cpu { + namespace aarch64 { + +-namespace acl_common_utils { ++namespace acl_utils { + + arm_compute::DataType get_acl_data_t(const dnnl_data_type_t dt); + arm_compute::ActivationLayerInfo get_acl_act(const primitive_attr_t &attr); +@@ -68,12 +65,33 @@ status_t permute_common_dense_dimension_to_last(memory_desc_t *d0_permed, + const memory_desc_t *d0, const memory_desc_t *d1, + const memory_desc_t *d2); + +-#define MAYBE_REPORT_ACL_ERROR(msg) \ ++// Logs a custom 'info' line describing an unsupported case ++#define LOG_ACL_UNSUPPORTED(msg) \ ++ do { \ ++ if (get_verbose() >= 2) \ ++ printf("onednn_verbose,cpu,acl,unsupported: %s\n", (msg)); \ ++ } while (0) ++ ++// Returns unimplemented if error code x is NOT OK ++#define ACL_CHECK_VALID(x) \ ++ do { \ ++ arm_compute::Status s = x; \ ++ if (s.error_code() != arm_compute::ErrorCode::OK) { \ ++ LOG_ACL_UNSUPPORTED(s.error_description().c_str()); \ ++ return dnnl::impl::status::unimplemented; \ ++ } \ ++ } while (0) ++ ++// Returns unimplemented on condition x == true ++#define ACL_CHECK_SUPPORT(x, msg) \ + do { \ +- if (get_verbose()) printf("onednn_verbose,cpu,error,acl,%s\n", (msg)); \ ++ if (x) { \ ++ LOG_ACL_UNSUPPORTED(msg); \ ++ return dnnl::impl::status::unimplemented; \ ++ } \ + } while (0) + +-} // namespace acl_common_utils ++} // namespace acl_utils + + } // namespace aarch64 + } // namespace cpu +diff --git a/src/cpu/aarch64/acl_winograd_convolution.hpp b/src/cpu/aarch64/acl_winograd_convolution.hpp +index 29e44eb189..223b3bc9b8 100644 +--- a/src/cpu/aarch64/acl_winograd_convolution.hpp ++++ b/src/cpu/aarch64/acl_winograd_convolution.hpp +@@ -1,5 +1,5 @@ + /******************************************************************************* +-* Copyright 2020-2021 Arm Ltd. and affiliates ++* Copyright 2020-2022 Arm Ltd. and affiliates + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. +@@ -107,7 +107,7 @@ struct acl_wino_convolution_fwd_t : public primitive_t { + + set_default_alg_kind(alg_kind::convolution_winograd); + +- acl_common_utils::acl_thread_bind(); ++ acl_utils::acl_thread_bind(); + + return status::success; + } +@@ -130,7 +130,7 @@ struct acl_wino_convolution_fwd_t : public primitive_t { + // sum+eltwise post-ops + if (eltwise_only || sum_with_eltwise) { + const auto act_type = po.entry_[sum_with_eltwise].eltwise.alg; +- eltwise_ok = acl_common_utils::acl_act_ok(act_type); ++ eltwise_ok = acl_utils::acl_act_ok(act_type); + } + + return eltwise_ok || (po.len() == 0); +diff --git a/src/cpu/aarch64/matmul/acl_matmul.cpp b/src/cpu/aarch64/matmul/acl_matmul.cpp +index 3945fda6fc..6f9bb9b9ad 100644 +--- a/src/cpu/aarch64/matmul/acl_matmul.cpp ++++ b/src/cpu/aarch64/matmul/acl_matmul.cpp +@@ -1,5 +1,5 @@ + /******************************************************************************* +-* Copyright 2021 Arm Ltd. and affiliates ++* Copyright 2021-2022 Arm Ltd. 
and affiliates + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. +@@ -84,4 +84,4 @@ status_t acl_matmul_t::execute_forward(const exec_ctx_t &ctx) const { + } // namespace aarch64 + } // namespace cpu + } // namespace impl +-} // namespace dnnl +\ No newline at end of file ++} // namespace dnnl +diff --git a/src/cpu/aarch64/matmul/acl_matmul.hpp b/src/cpu/aarch64/matmul/acl_matmul.hpp +index 6ba17e86dd..e69f4d9592 100644 +--- a/src/cpu/aarch64/matmul/acl_matmul.hpp ++++ b/src/cpu/aarch64/matmul/acl_matmul.hpp +@@ -1,5 +1,5 @@ + /******************************************************************************* +-* Copyright 2021 Arm Ltd. and affiliates ++* Copyright 2021-2022 Arm Ltd. and affiliates + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. +@@ -87,7 +87,7 @@ struct acl_matmul_t : public primitive_t { + if (conf_status != status::success) return status::unimplemented; + // Number of threads in Compute Library is set by OMP_NUM_THREADS + // dnnl_get_max_threads() == OMP_NUM_THREADS +- acl_common_utils::acl_thread_bind(); ++ acl_utils::acl_thread_bind(); + + return status::success; + } +diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp +index 76599d8bb1..ba266b4303 100644 +--- a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp ++++ b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp +@@ -1,5 +1,5 @@ + /******************************************************************************* +-* Copyright 2021 Arm Ltd. and affiliates ++* Copyright 2021-2022 Arm Ltd. and affiliates + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
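The matmul refactor below is the heaviest user of the two helpers added to acl_utils.hpp above. For orientation, a call site written against them reads roughly as follows (sketch only: init_example and its f32 condition are invented for illustration; the macros and the ACL validate() call are the ones this patch defines and uses):

    // On failure both macros log at verbose level >= 2 and make the
    // enclosing function return status::unimplemented.
    static dnnl::impl::status_t init_example(
            const arm_compute::TensorInfo &src,
            const arm_compute::TensorInfo &dst) {
        ACL_CHECK_SUPPORT(src.data_type() != arm_compute::DataType::F32,
                "example: only f32 handled here");
        ACL_CHECK_VALID(arm_compute::NEActivationLayer::validate(&src, &dst,
                arm_compute::ActivationLayerInfo(
                        arm_compute::ActivationLayerInfo::ActivationFunction::
                                RELU)));
        return dnnl::impl::status::success;
    }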
+@@ -22,14 +22,10 @@ namespace dnnl {
+ namespace impl {
+ namespace cpu {
+ namespace aarch64 {
+-namespace matmul {
+
+-using namespace dnnl::impl::status;
+-using namespace dnnl::impl::utils;
+-using namespace dnnl::impl::cpu::matmul;
+-using namespace prop_kind;
++using namespace alg_kind;
++using namespace cpu::matmul;
+ using namespace format_tag;
+-using namespace dnnl::impl::alg_kind;
+
+ namespace acl_matmul_utils {
+
+@@ -58,7 +54,7 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
+ wei_md, abcd, abdc, abc, acb, ab, ba);
+ auto dst_tag = memory_desc_matches_one_of_tag(
+ dst_md, abcd, abdc, abc, acb, ab, ba);
+- if (one_of(format_tag::undef, src_tag, wei_tag, dst_tag)) {
++ if (utils::one_of(format_tag::undef, src_tag, wei_tag, dst_tag)) {
+ return status::unimplemented;
+ }
+ amp.is_transA = helper.transA() == 'T';
+@@ -85,7 +81,7 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
+ // Fast-math mode
+ auto math_mode = get_fpmath_mode();
+ bool is_fastmath_enabled
+- = one_of(math_mode, fpmath_mode::bf16, fpmath_mode::any);
++ = utils::one_of(math_mode, fpmath_mode::bf16, fpmath_mode::any);
+ amp.gemm_info.set_fast_math(is_fastmath_enabled);
+
+ // Fused ReLU activation
+@@ -95,29 +91,15 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
+ amp.alpha = attr.output_scales_.scales_[0];
+
+ // Validate ACL transpose
+- if (amp.is_transA) {
+- auto acl_transA_st = arm_compute::NETranspose::validate(
+- &amp.src_acc_info, &amp.src_info);
+- if (acl_transA_st.error_code() != arm_compute::ErrorCode::OK) {
+- MAYBE_REPORT_ACL_ERROR(acl_transA_st.error_description().c_str());
+- return status::unimplemented;
+- }
+- }
+- if (amp.is_transB) {
+- auto acl_transB_st = arm_compute::NETranspose::validate(
+- &amp.wei_acc_info, &amp.wei_info);
+- if (acl_transB_st.error_code() != arm_compute::ErrorCode::OK) {
+- MAYBE_REPORT_ACL_ERROR(acl_transB_st.error_description().c_str());
+- return status::unimplemented;
+- }
+- }
++ if (amp.is_transA)
++ ACL_CHECK_VALID(arm_compute::NETranspose::validate(
++ &amp.src_acc_info, &amp.src_info));
++ if (amp.is_transB)
++ ACL_CHECK_VALID(arm_compute::NETranspose::validate(
++ &amp.wei_acc_info, &amp.wei_info));
+ // Validate ACL GEMM
+- auto acl_st = arm_compute::NEGEMM::validate(&amp.src_info, &amp.wei_info,
+- nullptr, &amp.dst_info, amp.alpha, 0.0f, amp.gemm_info);
+- if (acl_st.error_code() != arm_compute::ErrorCode::OK) {
+- MAYBE_REPORT_ACL_ERROR(acl_st.error_description().c_str());
+- return status::unimplemented;
+- }
++ ACL_CHECK_VALID(arm_compute::NEGEMM::validate(&amp.src_info, &amp.wei_info,
++ nullptr, &amp.dst_info, amp.alpha, 0.0f, amp.gemm_info));
+
+ return status::success;
+ }
+@@ -175,7 +157,6 @@ bool acl_act_ok(alg_kind_t eltwise_activation) {
+
+ } // namespace acl_matmul_utils
+
+-} // namespace matmul
+ } // namespace aarch64
+ } // namespace cpu
+ } // namespace impl
+diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
+index 1411dc4f4b..248dbe5a09 100644
+--- a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
++++ b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
+@@ -1,5 +1,5 @@
+ /*******************************************************************************
+-* Copyright 2021 Arm Ltd. and affiliates
++* Copyright 2021-2022 Arm Ltd. and affiliates
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+@@ -25,7 +25,6 @@ namespace dnnl { + namespace impl { + namespace cpu { + namespace aarch64 { +-namespace matmul { + + struct acl_matmul_obj_t { + arm_compute::NEGEMM gemm; +@@ -61,10 +60,9 @@ arm_compute::ActivationLayerInfo get_acl_act(const primitive_attr_t &attr); + bool acl_act_ok(alg_kind_t eltwise_activation); + } // namespace acl_matmul_utils + +-} // namespace matmul + } // namespace aarch64 + } // namespace cpu + } // namespace impl + } // namespace dnnl + +-#endif // CPU_AARCH64_ACL_MATMUL_UTILS_HPP +\ No newline at end of file ++#endif // CPU_AARCH64_ACL_MATMUL_UTILS_HPP +diff --git a/tests/benchdnn/inputs/ip/test_ip_acl b/tests/benchdnn/inputs/ip/test_ip_acl +new file mode 100644 +index 0000000000..a8873c30a8 +--- /dev/null ++++ b/tests/benchdnn/inputs/ip/test_ip_acl +@@ -0,0 +1,26 @@ ++--reset ++ ++# do not test other implementations ++--skip-impl='ref,jit' ++ ++# test format::any ++--batch=shapes_ci ++ ++# only inference, with and without bias ++--dir=FWD_I,FWD_B ++# test all shapes ++--batch=set_all ++ ++# with and without batches ++--mb=0,2 ++ ++# test non-spatial layout combinations ++--stag=ab,ba ++--wtag=ab,ba ++--batch=shapes_0d ++ ++# test spatial layout combinations ++--stag=abx,axb ++--wtag=abx,axb ++# 2d-spatial dimensions ++--batch=shapes_googlenet_v1 diff --git a/onednn.changes b/onednn.changes index 6b834a3..73885c3 100644 --- a/onednn.changes +++ b/onednn.changes @@ -1,3 +1,10 @@ +------------------------------------------------------------------- +Tue Sep 20 08:26:43 UTC 2022 - Guillaume GARDET + +- Add patch to fix build with latest Arm Compute Library: + * 1428.patch + * fa93750.patch (dep for 1428.patch) + ------------------------------------------------------------------- Tue Sep 13 05:22:52 UTC 2022 - Paolo Stivanin diff --git a/onednn.spec b/onednn.spec index ae17253..0d5849a 100644 --- a/onednn.spec +++ b/onednn.spec @@ -37,6 +37,10 @@ Summary: Intel Math Kernel Library for Deep Neural Networks License: Apache-2.0 URL: https://01.org/onednn Source0: https://github.com/oneapi-src/oneDNN/archive/v%{version}/oneDNN-%{version}.tar.gz +# PATCH-FIX-UPSTREAM - deps for Patch2 +Patch1: fa93750.patch +# PATCH-FIX-UPSTREAM - Fix build with latest ACL - https://github.com/oneapi-src/oneDNN/pull/1428 +Patch2: 1428.patch BuildRequires: chrpath BuildRequires: cmake BuildRequires: doxygen
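Usage notes, outside the patch set itself: the new benchdnn input file is consumed by benchdnn's inner-product driver and can typically be run from tests/benchdnn in a build tree with ./benchdnn --ip --batch=inputs/ip/test_ip_acl. On the packaging side this diff only declares Patch1 and Patch2; the %prep section that applies them is not shown here, but with the usual %autosetup -p1 (or explicit %patch calls) they apply in numeric order, so fa93750.patch lands before 1428.patch, which depends on it.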