forked from pool/onednn
Guillaume GARDET
f2a81390aa
- Add patch to fix build with latest Arm Compute Library:
  * 1428.patch
  * fa93750.patch (dep for 1428.patch)

OBS-URL: https://build.opensuse.org/request/show/1005196
OBS-URL: https://build.opensuse.org/package/show/science:machinelearning/onednn?expand=0&rev=18
1223 lines
48 KiB
Diff
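
The patch below does two mechanical things across the aarch64 ACL sources: it renames the acl_common_utils namespace to acl_utils, and it collapses the repeated validate-and-report blocks around Compute Library validate() calls into two macros, ACL_CHECK_VALID and ACL_CHECK_SUPPORT, defined in src/cpu/aarch64/acl_utils.hpp. It also moves the inner product configuration into acl_inner_product.hpp with stricter layout validation, and adds a benchdnn input file for the inner product driver. Below is a minimal, self-contained C++ sketch of the macro pattern; the stub Status type and the int return codes are simplifications standing in for arm_compute::Status and dnnl::impl::status::unimplemented:

    #include <cstdio>

    // Stub standing in for arm_compute::Status (simplified for this sketch).
    enum class ErrorCode { OK, RUNTIME_ERROR };
    struct Status {
        ErrorCode code = ErrorCode::OK;
        const char *desc = "";
    };

    // Log an unsupported case (the real macro gates this on get_verbose() >= 2).
    #define LOG_ACL_UNSUPPORTED(msg) \
        printf("onednn_verbose,cpu,acl,unsupported: %s\n", (msg))

    // Bail out when a Compute Library validate() call reports an error.
    #define ACL_CHECK_VALID(x) \
        do { \
            Status s = (x); \
            if (s.code != ErrorCode::OK) { \
                LOG_ACL_UNSUPPORTED(s.desc); \
                return 1; /* stands in for status::unimplemented */ \
            } \
        } while (0)

    // Bail out when an unsupported condition holds.
    #define ACL_CHECK_SUPPORT(cond, msg) \
        do { \
            if (cond) { \
                LOG_ACL_UNSUPPORTED(msg); \
                return 1; \
            } \
        } while (0)

    // Toy validate() with the same shape as e.g. NEGEMM::validate(...).
    static Status validate_stub(bool ok) {
        return ok ? Status {} : Status {ErrorCode::RUNTIME_ERROR, "unsupported case"};
    }

    static int init_conf(bool supported) {
        ACL_CHECK_SUPPORT(!supported, "ACL supports only 2d or 4d cases");
        ACL_CHECK_VALID(validate_stub(supported)); // one line replaces five
        return 0;
    }

    int main() { return init_conf(true); }

The gain shows at the call sites: each validate() call drops from a five-line error_code()/report/return block to a single macro line, as the hunks below illustrate.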
From fa93750bfb821fe05e3190b36f52b5bd88a57110 Mon Sep 17 00:00:00 2001
From: Diana Bite <diana.bite@arm.com>
Date: Thu, 24 Feb 2022 14:25:49 +0000
Subject: [PATCH] cpu: aarch64: acl: fix inner_prod test failure and improve
 validation

---
src/cpu/aarch64/acl_binary.hpp | 8 +-
src/cpu/aarch64/acl_convolution_utils.cpp | 65 ++-----
src/cpu/aarch64/acl_eltwise.hpp | 4 +-
src/cpu/aarch64/acl_eltwise_utils.cpp | 17 +-
src/cpu/aarch64/acl_gemm_convolution.hpp | 6 +-
.../aarch64/acl_indirect_gemm_convolution.hpp | 6 +-
src/cpu/aarch64/acl_inner_product.cpp | 13 +-
src/cpu/aarch64/acl_inner_product.hpp | 179 +++++++++++++++--
src/cpu/aarch64/acl_inner_product_utils.cpp | 181 ------------------
src/cpu/aarch64/acl_inner_product_utils.hpp | 62 ------
src/cpu/aarch64/acl_softmax.hpp | 17 +-
src/cpu/aarch64/acl_utils.cpp | 4 +-
src/cpu/aarch64/acl_utils.hpp | 32 +++-
src/cpu/aarch64/acl_winograd_convolution.hpp | 6 +-
src/cpu/aarch64/matmul/acl_matmul.cpp | 4 +-
src/cpu/aarch64/matmul/acl_matmul.hpp | 4 +-
src/cpu/aarch64/matmul/acl_matmul_utils.cpp | 45 ++---
src/cpu/aarch64/matmul/acl_matmul_utils.hpp | 6 +-
tests/benchdnn/inputs/ip/test_ip_acl | 26 +++
19 files changed, 281 insertions(+), 404 deletions(-)
delete mode 100644 src/cpu/aarch64/acl_inner_product_utils.cpp
delete mode 100644 src/cpu/aarch64/acl_inner_product_utils.hpp
create mode 100644 tests/benchdnn/inputs/ip/test_ip_acl

diff --git a/src/cpu/aarch64/acl_binary.hpp b/src/cpu/aarch64/acl_binary.hpp
index 77adb45bef..122b094587 100644
--- a/src/cpu/aarch64/acl_binary.hpp
+++ b/src/cpu/aarch64/acl_binary.hpp
@@ -125,7 +125,7 @@ struct acl_binary_t : public primitive_t {

status_t init(engine_t *engine) {

- using namespace acl_common_utils;
+ using namespace acl_utils;

// Only support f32 and s32 for now
data_type_t ddt = dst_md(0)->data_type;
@@ -179,11 +179,7 @@ struct acl_binary_t : public primitive_t {
}

// Call operator specific validate function to check support
- arm_compute::Status acl_st = validate(asp_);
- if (acl_st.error_code() != arm_compute::ErrorCode::OK) {
- MAYBE_REPORT_ACL_ERROR(acl_st.error_description().c_str());
- return status::unimplemented;
- }
+ ACL_CHECK_VALID(validate(asp_));

// Initialize the ACL threads
acl_thread_bind();
diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
index ca91de49e3..e072dc5490 100644
--- a/src/cpu/aarch64/acl_convolution_utils.cpp
+++ b/src/cpu/aarch64/acl_convolution_utils.cpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2020-2021 Arm Ltd. and affiliates
+* Copyright 2020-2022 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -156,10 +156,10 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
const auto acl_layout = is_nspc ? arm_compute::DataLayout::NHWC
: arm_compute::DataLayout::NCHW;

- auto acl_src_data_t = acl_common_utils::get_acl_data_t(src_d.data_type());
- auto acl_wei_data_t = acl_common_utils::get_acl_data_t(wei_d.data_type());
- auto acl_dst_data_t = acl_common_utils::get_acl_data_t(dst_d.data_type());
- auto acl_bia_data_t = acl_common_utils::get_acl_data_t(bia_d.data_type());
+ auto acl_src_data_t = acl_utils::get_acl_data_t(src_d.data_type());
+ auto acl_wei_data_t = acl_utils::get_acl_data_t(wei_d.data_type());
+ auto acl_dst_data_t = acl_utils::get_acl_data_t(dst_d.data_type());
+ auto acl_bia_data_t = acl_utils::get_acl_data_t(bia_d.data_type());

if (acl_bia_data_t == arm_compute::DataType::UNKNOWN)
acl_bia_data_t = arm_compute::DataType::F32;
@@ -212,33 +212,14 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
// is_eltwise(true) here stands for eltwise.scale == 1.f check
acp.sum_with_eltwise = (post_ops.len() == 2) && post_ops.entry_[0].is_sum()
&& post_ops.entry_[1].is_eltwise(true);
- acp.act_info = acl_common_utils::get_acl_act(attr);
+ acp.act_info = acl_utils::get_acl_act(attr);

if (acp.sum_with_eltwise) {
- // clang-format off
- // Validate activation layer manually to check for return status
- auto acl_al_st = arm_compute::NEActivationLayer::validate(
- &acp.dst_info,
- &acp.dst_info,
- acp.act_info);
- // clang-format on
- if (acl_al_st.error_code() != arm_compute::ErrorCode::OK) {
- MAYBE_REPORT_ACL_ERROR(acl_al_st.error_description().c_str());
- return status::unimplemented;
- }
-
- // clang-format off
- // Validate arithmetic addition manually to check for return status
- auto acl_aa_st = arm_compute::NEArithmeticAddition::validate(
- &acp.dst_info,
- &acp.dst_info,
- &acp.dst_info,
- arm_compute::ConvertPolicy::SATURATE);
- // clang-format on
- if (acl_aa_st.error_code() != arm_compute::ErrorCode::OK) {
- MAYBE_REPORT_ACL_ERROR(acl_aa_st.error_description().c_str());
- return status::unimplemented;
- }
+ ACL_CHECK_VALID(arm_compute::NEActivationLayer::validate( // eltwise
+ &acp.dst_info, &acp.dst_info, acp.act_info));
+ ACL_CHECK_VALID(arm_compute::NEArithmeticAddition::validate( // sum
+ &acp.dst_info, &acp.dst_info, &acp.dst_info,
+ arm_compute::ConvertPolicy::SATURATE));
}

return status::success;
@@ -254,7 +235,7 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,

// clang-format off
// Validate convolution manually to check for return status
- auto acl_st = arm_compute::NEGEMMConvolutionLayer::validate(
+ ACL_CHECK_VALID(arm_compute::NEGEMMConvolutionLayer::validate(
&acp.src_info,
&acp.wei_info,
acp.with_bias ? &acp.bia_info : nullptr,
@@ -263,12 +244,8 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
acp.weights_info,
acp.dilation_info,
acp.act_info,
- acp.fast_math);
+ acp.fast_math));
// clang-format on
- if (acl_st.error_code() != arm_compute::ErrorCode::OK) {
- MAYBE_REPORT_ACL_ERROR(acl_st.error_description().c_str());
- return status::unimplemented;
- }

return status::success;
}
@@ -289,7 +266,7 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,

// clang-format off
// NOTE: indirect convolution method supports only nhwc layout.
- auto acl_st = arm_compute::NEGEMMConv2d::validate(
+ ACL_CHECK_VALID(arm_compute::NEGEMMConv2d::validate(
&acp.src_info,
&acp.wei_info,
acp.with_bias ? &acp.bia_info : nullptr,
@@ -298,12 +275,8 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
acp.dilation_info,
acp.act_info,
acp.fast_math,
- 1));
+ 1)));
// clang-format on
- if (acl_st.error_code() != arm_compute::ErrorCode::OK) {
- MAYBE_REPORT_ACL_ERROR(acl_st.error_description().c_str());
- return status::unimplemented;
- }

return status::success;
}
@@ -336,19 +309,15 @@ status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,

// clang-format off
// Validate convolution manually to check for return status
- auto acl_st = arm_compute::NEWinogradConvolutionLayer::validate(
+ ACL_CHECK_VALID(arm_compute::NEWinogradConvolutionLayer::validate(
&acp.src_info,
&acp.wei_info,
acp.with_bias ? &acp.bia_info : nullptr,
&acp.dst_info,
acp.padstride_info,
acp.act_info,
- true); // enable_fast_math flag in ACL Winograd
+ true)); // enable_fast_math flag in ACL Winograd
// clang-format on
- if (acl_st.error_code() != arm_compute::ErrorCode::OK) {
- MAYBE_REPORT_ACL_ERROR(acl_st.error_description().c_str());
- return status::unimplemented;
- }

return status::success;
}
diff --git a/src/cpu/aarch64/acl_eltwise.hpp b/src/cpu/aarch64/acl_eltwise.hpp
index a55b89272c..381368aabb 100644
--- a/src/cpu/aarch64/acl_eltwise.hpp
+++ b/src/cpu/aarch64/acl_eltwise.hpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2021 Arm Ltd. and affiliates
+* Copyright 2021-2022 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -78,7 +78,7 @@ struct acl_eltwise_fwd_t : public primitive_t {
aep_, data_md_, *desc(), *attr());
if (conf_status != status::success) return status::unimplemented;

- acl_common_utils::acl_thread_bind();
+ acl_utils::acl_thread_bind();

return status::success;
}
diff --git a/src/cpu/aarch64/acl_eltwise_utils.cpp b/src/cpu/aarch64/acl_eltwise_utils.cpp
index 35e809e042..880b6aeaae 100644
--- a/src/cpu/aarch64/acl_eltwise_utils.cpp
+++ b/src/cpu/aarch64/acl_eltwise_utils.cpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2021 Arm Ltd. and affiliates
+* Copyright 2021-2022 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -46,7 +46,7 @@ status_t acl_eltwise_check(acl_eltwise_conf_t &aep, memory_desc_t &data_md,

const alg_kind_t eltwise_alg = ed.alg_kind;

- bool activation_supported = acl_common_utils::acl_act_ok(eltwise_alg);
+ bool activation_supported = acl_utils::acl_act_ok(eltwise_alg);
if (!activation_supported) { return status::unimplemented; }

// batch size
@@ -69,8 +69,8 @@ status_t acl_eltwise_check(acl_eltwise_conf_t &aep, memory_desc_t &data_md,
const auto acl_layout = is_nspc ? arm_compute::DataLayout::NHWC
: arm_compute::DataLayout::NCHW;

- auto acl_src_data_t = acl_common_utils::get_acl_data_t(data_d.data_type());
- auto acl_dst_data_t = acl_common_utils::get_acl_data_t(data_d.data_type());
+ auto acl_src_data_t = acl_utils::get_acl_data_t(data_d.data_type());
+ auto acl_dst_data_t = acl_utils::get_acl_data_t(data_d.data_type());

// clang-format off
aep.src_info = arm_compute::TensorInfo(
@@ -93,7 +93,7 @@ status_t acl_eltwise_check(acl_eltwise_conf_t &aep, memory_desc_t &data_md,
aep.dst_info.set_quantization_info(arm_compute::QuantizationInfo(1, 0));
}

- aep.act_info = acl_common_utils::get_acl_act(ed);
+ aep.act_info = acl_utils::get_acl_act(ed);

return status::success;
}
@@ -105,14 +105,11 @@ status_t init_conf_eltwise(acl_eltwise_conf_t &aep, memory_desc_t &data_md,
CHECK(acl_eltwise_check(aep, data_md, ed, attr));

// clang-format off
- auto acl_st = arm_compute::NEActivationLayer::validate(
+ ACL_CHECK_VALID(arm_compute::NEActivationLayer::validate(
&aep.src_info,
&aep.dst_info,
- aep.act_info);
+ aep.act_info));
// clang-format on
- if (acl_st.error_code() != arm_compute::ErrorCode::OK) {
- return status::unimplemented;
- }

return status::success;
}
diff --git a/src/cpu/aarch64/acl_gemm_convolution.hpp b/src/cpu/aarch64/acl_gemm_convolution.hpp
index 3e7542b6bf..496f501211 100644
--- a/src/cpu/aarch64/acl_gemm_convolution.hpp
+++ b/src/cpu/aarch64/acl_gemm_convolution.hpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2020-2021 Arm Ltd. and affiliates
+* Copyright 2020-2022 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -108,7 +108,7 @@ struct acl_gemm_convolution_fwd_t : public primitive_t {
src_md_, weights_md_, dst_md_, bias_md_, *desc(), *attr());
if (conf_status != status::success) return status::unimplemented;

- acl_common_utils::acl_thread_bind();
+ acl_utils::acl_thread_bind();

return status::success;
}
@@ -146,7 +146,7 @@ struct acl_gemm_convolution_fwd_t : public primitive_t {
// sum+eltwise post-ops
if (eltwise_only || sum_with_eltwise) {
const auto act_type = po.entry_[sum_with_eltwise].eltwise.alg;
- eltwise_ok = acl_common_utils::acl_act_ok(act_type);
+ eltwise_ok = acl_utils::acl_act_ok(act_type);
}

return eltwise_ok || (po.len() == 0);
diff --git a/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp b/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
index 0a0021aeee..18e757a2c9 100644
--- a/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
+++ b/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2021 Arm Ltd. and affiliates
+* Copyright 2021-2022 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -109,7 +109,7 @@ struct acl_indirect_gemm_convolution_fwd_t : public primitive_t {
*attr());
if (conf_status != status::success) return status::unimplemented;

- acl_common_utils::acl_thread_bind();
+ acl_utils::acl_thread_bind();

return status::success;
}
@@ -134,7 +134,7 @@ struct acl_indirect_gemm_convolution_fwd_t : public primitive_t {
// sum+eltwise post-ops
if (eltwise_only || sum_with_eltwise) {
const auto act_type = po.entry_[sum_with_eltwise].eltwise.alg;
- eltwise_ok = acl_common_utils::acl_act_ok(act_type);
+ eltwise_ok = acl_utils::acl_act_ok(act_type);
}

return eltwise_ok || (po.len() == 0);
diff --git a/src/cpu/aarch64/acl_inner_product.cpp b/src/cpu/aarch64/acl_inner_product.cpp
index 7a316135f8..f355a657c7 100644
--- a/src/cpu/aarch64/acl_inner_product.cpp
+++ b/src/cpu/aarch64/acl_inner_product.cpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2021 Arm Ltd. and affiliates
+* Copyright 2021-2022 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -21,23 +21,18 @@ namespace impl {
namespace cpu {
namespace aarch64 {

-using namespace dnnl::impl::status;
-using namespace dnnl::impl::memory_tracking::names;
-using namespace dnnl::impl::utils;
-
status_t acl_inner_product_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
// Lock here is needed because resource_mapper does not support
// concurrent multithreaded access.
std::lock_guard<std::mutex> _lock {this->mtx};

- status_t status = status::success;
auto src_base = CTX_IN_MEM(const data_t *, DNNL_ARG_SRC);
auto wei_base = CTX_IN_MEM(const data_t *, DNNL_ARG_WEIGHTS);
auto bia_base = CTX_IN_MEM(const data_t *, DNNL_ARG_BIAS);
auto dst_base = CTX_OUT_MEM(data_t *, DNNL_ARG_DST);

- bool with_bias = pd()->aip_.with_bias;
- bool with_sum = pd()->aip_.with_sum;
+ bool with_bias = pd()->aip.with_bias;
+ bool with_sum = pd()->aip.with_sum;

// Retrieve primitive resource and configured Compute Library objects
auto *acl_resource
@@ -64,7 +59,7 @@ status_t acl_inner_product_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
acl_obj.dst_tensor.allocator()->free();
if (with_bias) { acl_obj.bia_tensor.allocator()->free(); }

- return status;
+ return status::success;
}

} // namespace aarch64
diff --git a/src/cpu/aarch64/acl_inner_product.hpp b/src/cpu/aarch64/acl_inner_product.hpp
index dd742ea0bc..e5a9bdcc8a 100644
--- a/src/cpu/aarch64/acl_inner_product.hpp
+++ b/src/cpu/aarch64/acl_inner_product.hpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2021 Arm Ltd. and affiliates
+* Copyright 2021-2022 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -17,15 +17,34 @@
#ifndef CPU_AARCH64_ACL_INNER_PRODUCT_HPP
#define CPU_AARCH64_ACL_INNER_PRODUCT_HPP

+#include "cpu/aarch64/acl_utils.hpp"
#include "cpu/cpu_inner_product_pd.hpp"

-#include "cpu/aarch64/acl_inner_product_utils.hpp"
-
namespace dnnl {
namespace impl {
namespace cpu {
namespace aarch64 {

+struct acl_ip_obj_t {
+ arm_compute::NEFullyConnectedLayer fc;
+ arm_compute::NEArithmeticAddition add;
+ arm_compute::Tensor src_tensor;
+ arm_compute::Tensor wei_tensor;
+ arm_compute::Tensor bia_tensor;
+ arm_compute::Tensor dst_tensor;
+ arm_compute::Tensor dst_acc_tensor;
+};
+
+struct acl_ip_conf_t {
+ bool with_bias;
+ bool with_sum;
+ arm_compute::TensorInfo src_info;
+ arm_compute::TensorInfo wei_info;
+ arm_compute::TensorInfo bia_info;
+ arm_compute::TensorInfo dst_info;
+ arm_compute::FullyConnectedLayerInfo fc_info;
+};
+
struct acl_ip_resource_t : public resource_t {
acl_ip_resource_t() : acl_ip_obj_(utils::make_unique<acl_ip_obj_t>()) {}

@@ -71,33 +90,26 @@ struct acl_inner_product_fwd_t : public primitive_t {
struct pd_t : public cpu_inner_product_fwd_pd_t {
using cpu_inner_product_fwd_pd_t::cpu_inner_product_fwd_pd_t;

- DECLARE_COMMON_PD_T("inner_product:acl", acl_inner_product_fwd_t);
+ DECLARE_COMMON_PD_T("acl", acl_inner_product_fwd_t);

status_t init(engine_t *engine) {
- using namespace utils;
-
const bool ok = is_fwd() && !has_zero_dim_memory()
&& expect_data_types(data_type::f32, data_type::f32,
data_type::f32, data_type::f32, data_type::f32)
&& attr()->has_default_values(
primitive_attr_t::skip_mask_t::post_ops,
data_type::f32)
- && (set_default_params() == status::success)
- && post_ops_ok();
+ && set_default_params() == status::success && post_ops_ok();

if (!ok) return status::unimplemented;

- auto conf_status = acl_inner_product_utils::init_conf_ip(aip_,
- src_md_, weights_md_, dst_md_, bias_md_, *desc(), *attr());
- // conf_status here can be either status::success or status::unimplemented
- if (conf_status != status::success) return conf_status;
-
- acl_common_utils::acl_thread_bind();
+ CHECK(init_conf_ip(aip, src_md_, weights_md_, dst_md_, bias_md_,
+ *desc(), *attr()));

return status::success;
}

- acl_ip_conf_t aip_;
+ acl_ip_conf_t aip;

protected:
bool post_ops_ok() const {
@@ -111,16 +123,149 @@ struct acl_inner_product_fwd_t : public primitive_t {
// Compute Library supports here only one eltwise post-op or sum
if (po.len() == 1 && is_eltwise(0)) {
const auto act_type = po.entry_[0].eltwise.alg;
- eltwise_ok = acl_common_utils::acl_act_ok(act_type);
+ eltwise_ok = acl_utils::acl_act_ok(act_type);
}

return eltwise_ok || (po.len() == 1 && is_sum(0))
|| (po.len() == 0);
}
+
+ status_t init_conf_ip(acl_ip_conf_t &aip, memory_desc_t &src_md,
+ memory_desc_t &wei_md, memory_desc_t &dst_md,
+ memory_desc_t &bias_md, const inner_product_desc_t &ipd,
+ const primitive_attr_t &attr) {
+
+ ACL_CHECK_SUPPORT(src_md.ndims != wei_md.ndims,
+ "source and weights dimensions must match");
+
+ const int ndims = src_md.ndims;
+
+ const bool is_2d = (ndims == 2);
+ const bool is_4d = (ndims == 4);
+
+ ACL_CHECK_SUPPORT(
+ !(is_2d || is_4d), "ACL supports only 2d or 4d cases");
+
+ // batch size
+ const int n = src_md.dims[0];
+
+ // input and output channels
+ const int ic = src_md.dims[1];
+ const int oc = dst_md.dims[1];
+
+ // source spatial dimensions
+ const int ih = is_4d ? src_md.dims[ndims - 2] : 0;
+ const int iw = is_4d ? src_md.dims[ndims - 1] : 0;
+
+ // weights spatial dimensions
+ const int kh = is_4d ? wei_md.dims[ndims - 2] : 0;
+ const int kw = is_4d ? wei_md.dims[ndims - 1] : 0;
+
+ // Only NCHW or NHWC derivatives supported by ACL kernels
+ using namespace format_tag;
+ auto src_tag = memory_desc_matches_one_of_tag(
+ src_md, nhwc, nchw, nc, cn);
+ auto wei_tag = memory_desc_matches_one_of_tag(
+ wei_md, ohwi, oihw, oi, io);
+ auto dst_tag = memory_desc_matches_one_of_tag(dst_md, nc, cn);
+
+ ACL_CHECK_SUPPORT(
+ utils::one_of(format_tag::undef, src_tag, wei_tag, dst_tag),
+ "unsupported memory layout");
+
+ ACL_CHECK_SUPPORT(is_2d && src_tag != dst_tag,
+ "for src and dst layouts must match");
+
+ arm_compute::TensorShape src_shape, wei_shape;
+ if (is_2d) {
+ src_shape = (src_tag == nc) ? arm_compute::TensorShape(ic, n)
+ : arm_compute::TensorShape(n, ic);
+
+ wei_shape = (wei_tag == io) ? arm_compute::TensorShape(oc, ic)
+ : arm_compute::TensorShape(ic, oc);
+ }
+ if (is_4d) {
+ src_shape = (src_tag == nhwc)
+ ? arm_compute::TensorShape(ic, iw, ih, n)
+ : arm_compute::TensorShape(iw, ih, ic, n);
+
+ // ACL requires the weights to be in 2D flattened shape
+ const int flattened_ic = is_4d ? ic * kh * kw : ic;
+ wei_shape = arm_compute::TensorShape(flattened_ic, oc);
+ }
+
+ arm_compute::DataLayout src_layout = (src_tag == nhwc)
+ ? arm_compute::DataLayout::NHWC
+ : arm_compute::DataLayout::NCHW;
+
+ arm_compute::DataLayout wei_layout = (wei_tag == ohwi)
+ ? arm_compute::DataLayout::NHWC
+ : arm_compute::DataLayout::NCHW;
+
+ aip.src_info = arm_compute::TensorInfo(
+ src_shape, 1, arm_compute::DataType::F32, src_layout);
+
+ aip.wei_info = arm_compute::TensorInfo(
+ wei_shape, 1, arm_compute::DataType::F32, wei_layout);
+
+ aip.dst_info
+ = arm_compute::TensorInfo(arm_compute::TensorShape(oc, n),
+ 1, arm_compute::DataType::F32);
+
+ aip.with_bias = ipd.bias_desc.format_kind != format_kind::undef;
+ aip.bia_info = arm_compute::TensorInfo(aip.with_bias
+ ? arm_compute::TensorShape(oc)
+ : arm_compute::TensorShape(),
+ 1, arm_compute::DataType::F32);
+
+ aip.fc_info.weights_trained_layout = wei_layout;
+ if (is_2d && wei_tag != src_tag) {
+ // weights are already transposed
+ aip.fc_info.transpose_weights = false;
+ }
+
+ // Either activation or sum is supported as post-op at the moment
+ aip.fc_info.activation_info = acl_utils::get_acl_act(attr);
+ const auto &post_ops = attr.post_ops_;
+ aip.with_sum = (post_ops.len() == 1) && post_ops.entry_[0].is_sum();
+
+ // Fast math mode
+ auto math_mode = get_fpmath_mode();
+ bool is_fastmath_enabled = utils::one_of(
+ math_mode, fpmath_mode::bf16, fpmath_mode::any);
+ aip.fc_info.enable_fast_math = is_fastmath_enabled;
+
+ // clang-format off
+ // Validate fully connected layer manually to check for return status
+ ACL_CHECK_VALID(arm_compute::NEFullyConnectedLayer::validate(
+ &aip.src_info,
+ &aip.wei_info,
+ aip.with_bias ? &aip.bia_info : nullptr,
+ &aip.dst_info,
+ aip.fc_info));
+
+ if (aip.with_sum) {
+ // Validate arithmetic addition manually to check for return status
+ ACL_CHECK_VALID(arm_compute::NEArithmeticAddition::validate(
+ &aip.dst_info,
+ &aip.dst_info,
+ &aip.dst_info,
+ arm_compute::ConvertPolicy::SATURATE));
+ // clang-format on
+ }
+
+ return status::success;
+ }
}; // pd_t

acl_inner_product_fwd_t(const pd_t *apd) : primitive_t(apd) {}

+ status_t init(engine_t *engine) override {
+ acl_utils::acl_thread_bind();
+
+ return status::success;
+ }
+
status_t create_resource(
engine_t *engine, resource_mapper_t &mapper) const override {
if (mapper.has_resource(this)) return status::success;
@@ -129,7 +274,7 @@ struct acl_inner_product_fwd_t : public primitive_t {
if (!r) return status::out_of_memory;

// Configure the resource based on information from primitive descriptor
- auto st = r->configure(pd()->aip_);
+ auto st = r->configure(pd()->aip);
if (st == status::success) { mapper.add(this, std::move(r)); }

return st;
diff --git a/src/cpu/aarch64/acl_inner_product_utils.cpp b/src/cpu/aarch64/acl_inner_product_utils.cpp
deleted file mode 100644
index c8fab86f22..0000000000
--- a/src/cpu/aarch64/acl_inner_product_utils.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Arm Ltd. and affiliates
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#include "cpu/aarch64/acl_inner_product_utils.hpp"
-
-namespace dnnl {
-namespace impl {
-namespace cpu {
-namespace aarch64 {
-
-namespace acl_inner_product_utils {
-
-using namespace format_tag;
-using namespace utils;
-using namespace status;
-
-status_t init_conf_ip(acl_ip_conf_t &aip, memory_desc_t &src_md,
- memory_desc_t &wei_md, memory_desc_t &dst_md, memory_desc_t &bias_md,
- const inner_product_desc_t &ipd, const primitive_attr_t &attr) {
- const memory_desc_wrapper src_d(&src_md);
- const memory_desc_wrapper wei_d(&wei_md);
- const memory_desc_wrapper dst_d(&dst_md);
- const memory_desc_wrapper bia_d(&bias_md);
-
- // Compute Library currently supports forward propagation only
- const prop_kind_t prop_kind = ipd.prop_kind;
- const bool is_fwd = (prop_kind == dnnl_forward_training)
- || (prop_kind == dnnl_forward_inference);
- if (!is_fwd) return status::unimplemented;
-
- const int with_groups = wei_d.ndims() == src_d.ndims() + 1;
- const int ndims = src_d.ndims();
-
- // There are two sub-cases: src & wei tensors are either 2- or 4-dimensional
- const bool is_2d = (ndims == 2) && (wei_d.ndims() == 2);
- const bool is_4d = (ndims == 4) && (wei_d.ndims() == 4);
-
- // Compute Library unsupported shape scenarios
- // FP32 only is supported at the moment
- if (one_of(true, !(is_4d || is_2d), with_groups)) { return unimplemented; }
-
- // batch size
- const int mb = src_d.dims()[0];
-
- // src/input channels, height, width
- const int ic = src_d.dims()[1];
- const int ih = is_4d ? src_d.dims()[ndims - 2] : 0;
- const int iw = is_4d ? src_d.dims()[ndims - 1] : 0;
-
- // dst/output channels
- const int oc = dst_d.dims()[1];
-
- // weights height, width
- const int kh = is_4d ? wei_d.dims()[with_groups + ndims - 2] : 0;
- const int kw = is_4d ? wei_d.dims()[with_groups + ndims - 1] : 0;
-
- aip.with_bias = ipd.bias_desc.format_kind != format_kind::undef;
-
- // Data layout is already defined thus should only be checked
- auto src_tag = memory_desc_matches_one_of_tag(src_md, nhwc, nchw, nc, cn);
- auto wei_tag = memory_desc_matches_one_of_tag(wei_md, ohwi, oihw, oi, io);
- auto dst_tag = memory_desc_matches_one_of_tag(dst_md, nc, cn);
- if (one_of(format_tag::undef, src_tag, wei_tag, dst_tag)) {
- return status::unimplemented;
- }
-
- arm_compute::TensorShape src_shape {(src_tag == nc)
- ? arm_compute::TensorShape(ic, mb)
- : arm_compute::TensorShape(mb, ic)};
- if (is_4d) {
- src_shape = (src_tag == nhwc)
- ? arm_compute::TensorShape(ic, iw, ih, mb)
- : arm_compute::TensorShape(iw, ih, ic, mb);
- }
-
- // Compute Library requires the weights to be 2-dimensional for FC layer
- arm_compute::TensorShape wei_shape {
- arm_compute::TensorShape(is_4d ? ic * kh * kw : ic, oc)};
- if (is_2d && wei_tag == io) {
- wei_shape = arm_compute::TensorShape(oc, ic);
- }
-
- arm_compute::DataLayout wei_layout {(wei_tag == ohwi || wei_tag == oi)
- ? arm_compute::DataLayout::NHWC
- : arm_compute::DataLayout::NCHW};
-
- // clang-format off
- aip.src_info = arm_compute::TensorInfo(
- src_shape,
- 1,
- arm_compute::DataType::F32,
- (src_tag == nhwc || src_tag == nc) ?
- arm_compute::DataLayout::NHWC : arm_compute::DataLayout::NCHW);
-
- aip.wei_info = arm_compute::TensorInfo(
- wei_shape,
- 1,
- arm_compute::DataType::F32,
- wei_layout);
-
- aip.dst_info = arm_compute::TensorInfo(
- (dst_tag == nhwc || dst_tag == nc) ?
- arm_compute::TensorShape(oc, mb) : arm_compute::TensorShape(mb, oc),
- 1,
- arm_compute::DataType::F32,
- (dst_tag == nhwc || dst_tag == nc) ?
- arm_compute::DataLayout::NHWC : arm_compute::DataLayout::NCHW);
-
- aip.bia_info = arm_compute::TensorInfo(
- aip.with_bias ?
- arm_compute::TensorShape(oc) : arm_compute::TensorShape(),
- 1,
- arm_compute::DataType::F32);
- // clang-format on
-
- aip.fc_info.weights_trained_layout = wei_layout;
- if (is_2d && wei_tag != src_tag) { aip.fc_info.transpose_weights = false; }
-
- // Either activation or sum is supported as post-op at the moment
- aip.fc_info.activation_info = acl_common_utils::get_acl_act(attr);
- const auto &post_ops = attr.post_ops_;
- aip.with_sum = (post_ops.len() == 1) && post_ops.entry_[0].is_sum();
-
- // Fast math mode
- auto math_mode = get_fpmath_mode();
- bool is_fastmath_enabled
- = one_of(math_mode, fpmath_mode::bf16, fpmath_mode::any);
- aip.fc_info.enable_fast_math = is_fastmath_enabled;
-
- // clang-format off
- // Validate fully connected layer manually to check for return status
- auto acl_st = arm_compute::NEFullyConnectedLayer::validate(
- &aip.src_info,
- &aip.wei_info,
- aip.with_bias ? &aip.bia_info : nullptr,
- &aip.dst_info,
- aip.fc_info);
- // clang-format on
- if (acl_st.error_code() != arm_compute::ErrorCode::OK) {
- MAYBE_REPORT_ACL_ERROR(acl_st.error_description().c_str());
- return status::unimplemented;
- }
-
- if (aip.with_sum) {
- // clang-format off
- // Validate arithmetic addition manually to check for return status
- auto acl_aa_st = arm_compute::NEArithmeticAddition::validate(
- &aip.dst_info,
- &aip.dst_info,
- &aip.dst_info,
- arm_compute::ConvertPolicy::SATURATE);
-
- // clang-format on
- if (acl_aa_st.error_code() != arm_compute::ErrorCode::OK) {
- MAYBE_REPORT_ACL_ERROR(acl_aa_st.error_description().c_str());
- return status::unimplemented;
- }
- }
-
- return status::success;
-}
-
-} // namespace acl_inner_product_utils
-
-} // namespace aarch64
-} // namespace cpu
-} // namespace impl
-} // namespace dnnl
diff --git a/src/cpu/aarch64/acl_inner_product_utils.hpp b/src/cpu/aarch64/acl_inner_product_utils.hpp
deleted file mode 100644
index 022d0e3349..0000000000
--- a/src/cpu/aarch64/acl_inner_product_utils.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Arm Ltd. and affiliates
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#ifndef CPU_AARCH64_ACL_INNER_PRODUCT_UTILS_HPP
-#define CPU_AARCH64_ACL_INNER_PRODUCT_UTILS_HPP
-
-#include "cpu/cpu_inner_product_pd.hpp"
-
-#include "cpu/aarch64/acl_utils.hpp"
-
-namespace dnnl {
-namespace impl {
-namespace cpu {
-namespace aarch64 {
-
-struct acl_ip_obj_t {
- arm_compute::NEFullyConnectedLayer fc;
- arm_compute::NEArithmeticAddition add;
- arm_compute::Tensor src_tensor;
- arm_compute::Tensor wei_tensor;
- arm_compute::Tensor bia_tensor;
- arm_compute::Tensor dst_tensor;
- arm_compute::Tensor dst_acc_tensor;
-};
-
-struct acl_ip_conf_t {
- bool with_bias;
- bool with_sum;
- arm_compute::TensorInfo src_info;
- arm_compute::TensorInfo wei_info;
- arm_compute::TensorInfo bia_info;
- arm_compute::TensorInfo dst_info;
- arm_compute::FullyConnectedLayerInfo fc_info;
-};
-
-namespace acl_inner_product_utils {
-
-status_t init_conf_ip(acl_ip_conf_t &aip, memory_desc_t &src_md,
- memory_desc_t &wei_md, memory_desc_t &dst_md, memory_desc_t &bias_md,
- const inner_product_desc_t &ipd, const primitive_attr_t &attr);
-
-} // namespace acl_inner_product_utils
-
-} // namespace aarch64
-} // namespace cpu
-} // namespace impl
-} // namespace dnnl
-
-#endif // CPU_AARCH64_ACL_INNER_PRODUCT_UTILS_HPP
diff --git a/src/cpu/aarch64/acl_softmax.hpp b/src/cpu/aarch64/acl_softmax.hpp
index a4bfd0c3bd..4ee7139a93 100644
--- a/src/cpu/aarch64/acl_softmax.hpp
+++ b/src/cpu/aarch64/acl_softmax.hpp
@@ -129,7 +129,7 @@ struct acl_softmax_fwd_t : public primitive_t {
arm_compute::DataLayout acl_layout = arm_compute::DataLayout::NHWC;

const arm_compute::DataType acl_data_t
- = acl_common_utils::get_acl_data_t(data_type);
+ = acl_utils::get_acl_data_t(data_type);

const int threads = dnnl_get_max_threads();
if (inner_size_ == 1) {
@@ -189,20 +189,15 @@ struct acl_softmax_fwd_t : public primitive_t {
}

// Validate manually to check for return status
- arm_compute::Status acl_st;
if (asp_.is_logsoftmax) {
- acl_st = arm_compute::NELogSoftmaxLayer::validate(
- &asp_.src_info, &asp_.dst_info, asp_.beta, asp_.axis);
+ ACL_CHECK_VALID(arm_compute::NELogSoftmaxLayer::validate(
+ &asp_.src_info, &asp_.dst_info, asp_.beta, asp_.axis));
} else {
- acl_st = arm_compute::NESoftmaxLayer::validate(
- &asp_.src_info, &asp_.dst_info, asp_.beta, asp_.axis);
- }
- if (acl_st.error_code() != arm_compute::ErrorCode::OK) {
- MAYBE_REPORT_ACL_ERROR(acl_st.error_description().c_str());
- return status::unimplemented;
+ ACL_CHECK_VALID(arm_compute::NESoftmaxLayer::validate(
+ &asp_.src_info, &asp_.dst_info, asp_.beta, asp_.axis));
}

- acl_common_utils::acl_thread_bind();
+ acl_utils::acl_thread_bind();

return status::success;
}
diff --git a/src/cpu/aarch64/acl_utils.cpp b/src/cpu/aarch64/acl_utils.cpp
index a69f14b6f9..098217f50e 100644
--- a/src/cpu/aarch64/acl_utils.cpp
+++ b/src/cpu/aarch64/acl_utils.cpp
@@ -21,7 +21,7 @@ namespace impl {
namespace cpu {
namespace aarch64 {

-namespace acl_common_utils {
+namespace acl_utils {

using namespace dnnl::impl::alg_kind;
using namespace data_type;
@@ -247,7 +247,7 @@ status_t permute_common_dense_dimension_to_last(memory_desc_t *d0_permed,
return status::success;
}

-} // namespace acl_common_utils
+} // namespace acl_utils

} // namespace aarch64
} // namespace cpu
diff --git a/src/cpu/aarch64/acl_utils.hpp b/src/cpu/aarch64/acl_utils.hpp
index 565cde66a9..bb8efc998c 100644
--- a/src/cpu/aarch64/acl_utils.hpp
+++ b/src/cpu/aarch64/acl_utils.hpp
@@ -21,13 +21,10 @@

#include "oneapi/dnnl/dnnl_types.h"

-#include "common/bfloat16.hpp"
-#include "common/c_types_map.hpp"
#include "common/dnnl_thread.hpp"
#include "common/memory_tracking.hpp"
#include "common/primitive.hpp"
#include "common/utils.hpp"
-
#include "cpu/cpu_engine.hpp"

#include "arm_compute/runtime/NEON/NEFunctions.h"
@@ -38,7 +35,7 @@ namespace impl {
namespace cpu {
namespace aarch64 {

-namespace acl_common_utils {
+namespace acl_utils {

arm_compute::DataType get_acl_data_t(const dnnl_data_type_t dt);
arm_compute::ActivationLayerInfo get_acl_act(const primitive_attr_t &attr);
@@ -68,12 +65,33 @@ status_t permute_common_dense_dimension_to_last(memory_desc_t *d0_permed,
const memory_desc_t *d0, const memory_desc_t *d1,
const memory_desc_t *d2);

-#define MAYBE_REPORT_ACL_ERROR(msg) \
+// Logs a custom 'info' line describing an unsupported case
+#define LOG_ACL_UNSUPPORTED(msg) \
+ do { \
+ if (get_verbose() >= 2) \
+ printf("onednn_verbose,cpu,acl,unsupported: %s\n", (msg)); \
+ } while (0)
+
+// Returns unimplemented if error code x is NOT OK
+#define ACL_CHECK_VALID(x) \
+ do { \
+ arm_compute::Status s = x; \
+ if (s.error_code() != arm_compute::ErrorCode::OK) { \
+ LOG_ACL_UNSUPPORTED(s.error_description().c_str()); \
+ return dnnl::impl::status::unimplemented; \
+ } \
+ } while (0)
+
+// Returns unimplemented on condition x == true
+#define ACL_CHECK_SUPPORT(x, msg) \
do { \
- if (get_verbose()) printf("onednn_verbose,cpu,error,acl,%s\n", (msg)); \
+ if (x) { \
+ LOG_ACL_UNSUPPORTED(msg); \
+ return dnnl::impl::status::unimplemented; \
+ } \
} while (0)

-} // namespace acl_common_utils
+} // namespace acl_utils

} // namespace aarch64
} // namespace cpu
diff --git a/src/cpu/aarch64/acl_winograd_convolution.hpp b/src/cpu/aarch64/acl_winograd_convolution.hpp
index 29e44eb189..223b3bc9b8 100644
--- a/src/cpu/aarch64/acl_winograd_convolution.hpp
+++ b/src/cpu/aarch64/acl_winograd_convolution.hpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2020-2021 Arm Ltd. and affiliates
+* Copyright 2020-2022 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -107,7 +107,7 @@ struct acl_wino_convolution_fwd_t : public primitive_t {

set_default_alg_kind(alg_kind::convolution_winograd);

- acl_common_utils::acl_thread_bind();
+ acl_utils::acl_thread_bind();

return status::success;
}
@@ -130,7 +130,7 @@ struct acl_wino_convolution_fwd_t : public primitive_t {
// sum+eltwise post-ops
if (eltwise_only || sum_with_eltwise) {
const auto act_type = po.entry_[sum_with_eltwise].eltwise.alg;
- eltwise_ok = acl_common_utils::acl_act_ok(act_type);
+ eltwise_ok = acl_utils::acl_act_ok(act_type);
}

return eltwise_ok || (po.len() == 0);
diff --git a/src/cpu/aarch64/matmul/acl_matmul.cpp b/src/cpu/aarch64/matmul/acl_matmul.cpp
index 3945fda6fc..6f9bb9b9ad 100644
--- a/src/cpu/aarch64/matmul/acl_matmul.cpp
+++ b/src/cpu/aarch64/matmul/acl_matmul.cpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2021 Arm Ltd. and affiliates
+* Copyright 2021-2022 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -84,4 +84,4 @@ status_t acl_matmul_t::execute_forward(const exec_ctx_t &ctx) const {
} // namespace aarch64
} // namespace cpu
} // namespace impl
-} // namespace dnnl
\ No newline at end of file
+} // namespace dnnl
diff --git a/src/cpu/aarch64/matmul/acl_matmul.hpp b/src/cpu/aarch64/matmul/acl_matmul.hpp
index 6ba17e86dd..e69f4d9592 100644
--- a/src/cpu/aarch64/matmul/acl_matmul.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul.hpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2021 Arm Ltd. and affiliates
+* Copyright 2021-2022 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -87,7 +87,7 @@ struct acl_matmul_t : public primitive_t {
if (conf_status != status::success) return status::unimplemented;
// Number of threads in Compute Library is set by OMP_NUM_THREADS
// dnnl_get_max_threads() == OMP_NUM_THREADS
- acl_common_utils::acl_thread_bind();
+ acl_utils::acl_thread_bind();

return status::success;
}
diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
index 76599d8bb1..ba266b4303 100644
--- a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2021 Arm Ltd. and affiliates
+* Copyright 2021-2022 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -22,14 +22,10 @@ namespace dnnl {
namespace impl {
namespace cpu {
namespace aarch64 {
-namespace matmul {

-using namespace dnnl::impl::status;
-using namespace dnnl::impl::utils;
-using namespace dnnl::impl::cpu::matmul;
-using namespace prop_kind;
+using namespace alg_kind;
+using namespace cpu::matmul;
using namespace format_tag;
-using namespace dnnl::impl::alg_kind;

namespace acl_matmul_utils {

@@ -58,7 +54,7 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
wei_md, abcd, abdc, abc, acb, ab, ba);
auto dst_tag = memory_desc_matches_one_of_tag(
dst_md, abcd, abdc, abc, acb, ab, ba);
- if (one_of(format_tag::undef, src_tag, wei_tag, dst_tag)) {
+ if (utils::one_of(format_tag::undef, src_tag, wei_tag, dst_tag)) {
return status::unimplemented;
}
amp.is_transA = helper.transA() == 'T';
@@ -85,7 +81,7 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
// Fast-math mode
auto math_mode = get_fpmath_mode();
bool is_fastmath_enabled
- = one_of(math_mode, fpmath_mode::bf16, fpmath_mode::any);
+ = utils::one_of(math_mode, fpmath_mode::bf16, fpmath_mode::any);
amp.gemm_info.set_fast_math(is_fastmath_enabled);

// Fused ReLU activation
@@ -95,29 +91,15 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
amp.alpha = attr.output_scales_.scales_[0];

// Validate ACL transpose
- if (amp.is_transA) {
- auto acl_transA_st = arm_compute::NETranspose::validate(
- &amp.src_acc_info, &amp.src_info);
- if (acl_transA_st.error_code() != arm_compute::ErrorCode::OK) {
- MAYBE_REPORT_ACL_ERROR(acl_transA_st.error_description().c_str());
- return status::unimplemented;
- }
- }
- if (amp.is_transB) {
- auto acl_transB_st = arm_compute::NETranspose::validate(
- &amp.wei_acc_info, &amp.wei_info);
- if (acl_transB_st.error_code() != arm_compute::ErrorCode::OK) {
- MAYBE_REPORT_ACL_ERROR(acl_transB_st.error_description().c_str());
- return status::unimplemented;
- }
- }
+ if (amp.is_transA)
+ ACL_CHECK_VALID(arm_compute::NETranspose::validate(
+ &amp.src_acc_info, &amp.src_info));
+ if (amp.is_transB)
+ ACL_CHECK_VALID(arm_compute::NETranspose::validate(
+ &amp.wei_acc_info, &amp.wei_info));
// Validate ACL GEMM
- auto acl_st = arm_compute::NEGEMM::validate(&amp.src_info, &amp.wei_info,
- nullptr, &amp.dst_info, amp.alpha, 0.0f, amp.gemm_info);
- if (acl_st.error_code() != arm_compute::ErrorCode::OK) {
- MAYBE_REPORT_ACL_ERROR(acl_st.error_description().c_str());
- return status::unimplemented;
- }
+ ACL_CHECK_VALID(arm_compute::NEGEMM::validate(&amp.src_info, &amp.wei_info,
+ nullptr, &amp.dst_info, amp.alpha, 0.0f, amp.gemm_info));

return status::success;
}
@@ -175,7 +157,6 @@ bool acl_act_ok(alg_kind_t eltwise_activation) {

} // namespace acl_matmul_utils

-} // namespace matmul
} // namespace aarch64
} // namespace cpu
} // namespace impl
diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
index 1411dc4f4b..248dbe5a09 100644
--- a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
@@ -1,5 +1,5 @@
/*******************************************************************************
-* Copyright 2021 Arm Ltd. and affiliates
+* Copyright 2021-2022 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -25,7 +25,6 @@ namespace dnnl {
namespace impl {
namespace cpu {
namespace aarch64 {
-namespace matmul {

struct acl_matmul_obj_t {
arm_compute::NEGEMM gemm;
@@ -61,10 +60,9 @@ arm_compute::ActivationLayerInfo get_acl_act(const primitive_attr_t &attr);
bool acl_act_ok(alg_kind_t eltwise_activation);
} // namespace acl_matmul_utils

-} // namespace matmul
} // namespace aarch64
} // namespace cpu
} // namespace impl
} // namespace dnnl

-#endif // CPU_AARCH64_ACL_MATMUL_UTILS_HPP
\ No newline at end of file
+#endif // CPU_AARCH64_ACL_MATMUL_UTILS_HPP
diff --git a/tests/benchdnn/inputs/ip/test_ip_acl b/tests/benchdnn/inputs/ip/test_ip_acl
new file mode 100644
index 0000000000..a8873c30a8
--- /dev/null
+++ b/tests/benchdnn/inputs/ip/test_ip_acl
@@ -0,0 +1,26 @@
+--reset
+
+# do not test other implementations
+--skip-impl='ref,jit'
+
+# test format::any
+--batch=shapes_ci
+
+# only inference, with and without bias
+--dir=FWD_I,FWD_B
+# test all shapes
+--batch=set_all
+
+# with and without batches
+--mb=0,2
+
+# test non-spatial layout combinations
+--stag=ab,ba
+--wtag=ab,ba
+--batch=shapes_0d
+
+# test spatial layout combinations
+--stag=abx,axb
+--wtag=abx,axb
+# 2d-spatial dimensions
+--batch=shapes_googlenet_v1
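
A usage note on the new batch file: it feeds benchdnn's inner-product driver. Assuming a oneDNN build with Compute Library support (DNNL_AARCH64_USE_ACL=ON) and the benchdnn directory as working directory, an invocation along these lines should exercise it:

    ./benchdnn --ip --batch=inputs/ip/test_ip_acl

The --skip-impl='ref,jit' line makes benchdnn skip cases that dispatch to the reference or JIT implementations, so only the ACL inner product kernels are tested.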