diff --git a/9_4_4-bootstrap-sources.tar.gz b/9_4_4-bootstrap-sources.tar.gz
deleted file mode 100644
index 0b5cf96..0000000
--- a/9_4_4-bootstrap-sources.tar.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3de754bb3e45d4b3980ed238781cfd808f4e7c84ceae73213662f1bf5ea2d8fa
-size 1526663
diff --git a/9_8_2-bootstrap-sources.tar.gz b/9_8_2-bootstrap-sources.tar.gz
new file mode 100644
index 0000000..adf4dab
--- /dev/null
+++ b/9_8_2-bootstrap-sources.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b8e6fae1aa500de67fcc88aa6fe49690f0dd5c87f6710592db4cda54c8086f7
+size 1446589
diff --git a/cabal-riscv64.patch b/cabal-riscv64.patch
deleted file mode 100644
index c9278db..0000000
--- a/cabal-riscv64.patch
+++ /dev/null
@@ -1,41 +0,0 @@
-Index: ghc-9.4.5/libraries/Cabal/Cabal-syntax/src/Distribution/System.hs
-===================================================================
---- ghc-9.4.5.orig/libraries/Cabal/Cabal-syntax/src/Distribution/System.hs
-+++ ghc-9.4.5/libraries/Cabal/Cabal-syntax/src/Distribution/System.hs
-@@ -159,7 +159,7 @@ buildOS = classifyOS Permissive System.I
- 
- -- | These are the known Arches: I386, X86_64, PPC, PPC64, Sparc,
- -- Arm, AArch64, Mips, SH, IA64, S390, S390X, Alpha, Hppa, Rs6000,
---- M68k, Vax, JavaScript and Wasm32.
-+-- M68k, Vax, RISCV64, JavaScript and Wasm32.
- --
- -- The following aliases can also be used:
- --    * PPC alias: powerpc
-@@ -174,6 +174,7 @@ data Arch = I386  | X86_64  | PPC  | PPC
-           | IA64  | S390    | S390X
-           | Alpha | Hppa    | Rs6000
-           | M68k  | Vax
-+          | RISCV64
-           | JavaScript
-           | Wasm32
-           | OtherArch String
-@@ -189,6 +190,7 @@ knownArches = [I386, X86_64, PPC, PPC64,
-               ,IA64, S390, S390X
-               ,Alpha, Hppa, Rs6000
-               ,M68k, Vax
-+              ,RISCV64
-               ,JavaScript
-               ,Wasm32]
- 
-Index: ghc-9.4.5/libraries/Cabal/Cabal/src/Distribution/Simple/PreProcess.hs
-===================================================================
---- ghc-9.4.5.orig/libraries/Cabal/Cabal/src/Distribution/Simple/PreProcess.hs
-+++ ghc-9.4.5/libraries/Cabal/Cabal/src/Distribution/Simple/PreProcess.hs
-@@ -729,6 +729,7 @@ platformDefines lbi =
-       Rs6000      -> ["rs6000"]
-       M68k        -> ["m68k"]
-       Vax         -> ["vax"]
-+      RISCV64     -> ["riscv64"]
-       JavaScript  -> ["javascript"]
-       Wasm32      -> ["wasm32"]
-       OtherArch _ -> []
diff --git a/ghc-9.10.1-src.tar.xz b/ghc-9.10.1-src.tar.xz
new file mode 100644
index 0000000..d0165a9
--- /dev/null
+++ b/ghc-9.10.1-src.tar.xz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf386a302d4ee054791ffd51748900f15d71760fd199157922d120cc1f89e2f7
+size 32828552
diff --git a/ghc-9.8.3-src.tar.xz b/ghc-9.8.3-src.tar.xz
deleted file mode 100644
index 60d6354..0000000
--- a/ghc-9.8.3-src.tar.xz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:99e40d729ec8831a633b75fd85d65dd10e31a0133dec9d198d686a273679ab70
-size 32565948
diff --git a/ghc-Cabal-install-PATH-warning.patch b/ghc-Cabal-install-PATH-warning.patch
index e59d86d..87f0448 100644
--- a/ghc-Cabal-install-PATH-warning.patch
+++ b/ghc-Cabal-install-PATH-warning.patch
@@ -1,14 +1,18 @@
-Index: ghc-9.2.0.20210331/libraries/Cabal/Cabal/src/Distribution/Simple/Install.hs
+Index: ghc-9.10.1/libraries/Cabal/Cabal/src/Distribution/Simple/Install.hs
 ===================================================================
---- ghc-9.2.0.20210331.orig/libraries/Cabal/Cabal/src/Distribution/Simple/Install.hs
-+++ ghc-9.2.0.20210331/libraries/Cabal/Cabal/src/Distribution/Simple/Install.hs
-@@ -206,8 +206,7 @@ copyComponent verbosity pkg_descr lbi (C
-                       ++ " in " ++ binPref)
-     inPath <- isInSearchPath binPref
-     when (not inPath) $
--      warn verbosity ("The directory " ++ binPref
--                      ++ " is not in the system search path.")
-+      warn verbosity ("Executable installed in " ++ binPref)
-     case compilerFlavor (compiler lbi) of
-       GHC   -> GHC.installExe   verbosity lbi binPref buildPref progFix pkg_descr exe
-       GHCJS -> GHCJS.installExe verbosity lbi binPref buildPref progFix pkg_descr exe
+--- ghc-9.10.1.orig/libraries/Cabal/Cabal/src/Distribution/Simple/Install.hs
++++ ghc-9.10.1/libraries/Cabal/Cabal/src/Distribution/Simple/Install.hs
+@@ -261,12 +261,7 @@ copyComponent verbosity pkg_descr lbi (C
+     )
+   inPath <- isInSearchPath binPref
+   when (not inPath) $
+-    warn
+-      verbosity
+-      ( "The directory "
+-          ++ binPref
+-          ++ " is not in the system search path."
+-      )
++    warn verbosity ("Executable installed in " ++ binPref)
+   case compilerFlavor (compiler lbi) of
+     GHC -> GHC.installExe verbosity lbi binPref buildPref progFix pkg_descr exe
+     GHCJS -> GHCJS.installExe verbosity lbi binPref buildPref progFix pkg_descr exe
diff --git a/ghc-with-interpreter.patch b/ghc-with-interpreter.patch
deleted file mode 100644
index ad2dd2e..0000000
--- a/ghc-with-interpreter.patch
+++ /dev/null
@@ -1,14 +0,0 @@
-Index: ghc-9.4.5/hadrian/src/Oracles/Setting.hs
-===================================================================
---- ghc-9.4.5.orig/hadrian/src/Oracles/Setting.hs
-+++ ghc-9.4.5/hadrian/src/Oracles/Setting.hs
-@@ -292,7 +292,8 @@ ghcWithInterpreter = do
-                           , "darwin", "kfreebsdgnu" ]
-     goodArch <- anyTargetArch [ "i386", "x86_64", "powerpc"
-                               , "arm", "aarch64", "s390x"
--                              , "powerpc64", "powerpc64le" ]
-+                              , "powerpc64", "powerpc64le"
-+                              , "riscv64" ]
-     return $ goodOs && goodArch
- 
- -- | Variants of the ARM architecture.
diff --git a/ghc.changes b/ghc.changes
index 5870b0a..2b7135b 100644
--- a/ghc.changes
+++ b/ghc.changes
@@ -1,3 +1,26 @@
+-------------------------------------------------------------------
+Mon Nov 25 14:46:09 UTC 2024 - Andreas Schwab <schwab@suse.de>
+
+- riscv64-ncg.patch: Backport of MR 13105, NCG for RISCV64
+- Don't require on llvm on riscv64
+
+-------------------------------------------------------------------
+Fri Nov 22 14:54:22 UTC 2024 - Ondřej Súkup <mimi.vx@gmail.com>
+
+- Update to version 9.10.1. The change log for this release is at:
+  https://downloads.haskell.org/~ghc/9.10.1/docs/users_guide/9.10.1-notes.html
+- use ghc-bootstrap 9.8.2
+- enable numa on all architectures
+- cleanup dead parts of specfile
+- patch changes:
+ * added:
+  - hadrian-9.10-deps.patch
+  - os-string-be.patch
+  - ppc64le-miscompilation.patch
+ * dropped:
+  - cabal-riscv64.patch
+  - ghc-with-interpreter.patch
+
 -------------------------------------------------------------------
 Mon Oct 21 10:36:19 UTC 2024 - Ondřej Súkup <mimi.vx@gmail.com>
 
diff --git a/ghc.spec b/ghc.spec
index 61a887f..5a446e7 100644
--- a/ghc.spec
+++ b/ghc.spec
@@ -1,7 +1,7 @@
 #
 # spec file for package ghc
 #
-# Copyright (c) 2024 SUSE LLC
+# Copyright (c) 2023 SUSE LLC
 #
 # All modifications and additions to the file contributed by third parties
 # remain the property of their copyright owners, unless otherwise agreed
@@ -11,31 +11,19 @@
 # case the license is the MIT License). An "Open Source License" is a
 # license that conforms to the Open Source Definition (Version 1.9)
 # published by the Open Source Initiative.
-
 # Please submit bugfixes or comments via https://bugs.opensuse.org/
 #
 
+%define full_version 9.10.1
+%define short_version 9.10.1
 
-%define full_version 9.8.3
-%define short_version 9.8.3
-
-%ifnarch s390x
-%define with_libnuma 1
-%else
-%define with_libnuma 0
-%endif
-
-%global llvm_major 14
+%global llvm_major 15
 
 # conditionals
-
 # disable prof, docs, perf build
 # bcond_with for production builds: disable quick build
 %bcond_with quickbuild
 
-# make sure ghc libraries' ABI hashes unchanged (ghcX.Y not supported yet)
-%bcond_with abicheck
-
 # bcond_without for production builds: use Hadrian buildsystem
 %bcond_without hadrian
 
@@ -70,14 +58,15 @@
 %{?with_haddock:%bcond_without manual}
 %endif
 
-%global ghc_llvm_archs s390x riscv64
+%global ghc_llvm_archs s390x
 %global ghc_unregisterized_arches noarch
 
-%global base_ver 4.19.2.0
+%global base_ver 4.20.0.0
 %global ghc_compact_ver 0.1.0.0
-%global hpc_ver 0.7.0.0
+%global hpc_ver 0.7.0.1
 %global hsc2hs_ver 0.68.8
 %global ghc_bignum_ver 1.3
+%global xhtml_ver 3000.2.2.1
 
 Name:           ghc
 Version:        %{short_version}
@@ -87,24 +76,31 @@ License:        BSD-3-Clause
 URL:            https://www.haskell.org/ghc/
 Source:         https://downloads.haskell.org/~ghc/%{full_version}/ghc-%{version}-src.tar.xz
 Source2:        ghc-rpmlintrc
-Source4:        9_4_4-bootstrap-sources.tar.gz
+Source4:        9_8_2-bootstrap-sources.tar.gz
 Source5:        ghc-pkg.man
 Source6:        haddock.man
 Source7:        runghc.man
+
 Patch1:         ghc-gen_contents_index-haddock-path.patch
 # https://ghc.haskell.org/trac/ghc/ticket/15689
 Patch2:         ghc-Cabal-install-PATH-warning.patch
+
 # PATCH-FIX-UPSTREAM Disable-unboxed-arrays.patch ptrommler@icloud.com -- Do not use unboxed arrays on big-endian platforms. See Haskell Trac #15411.
 Patch3:         Disable-unboxed-arrays.patch
-# PATCH-FIX-UPSTREAM Hadrian: enable GHCi support on riscv64 (dd38aca95a)
-Patch100:       ghc-with-interpreter.patch
-# PATCH-FIX-UPSTREAM libraries/Cabal: Add support for the 64-bit RISC-V architecture (d89526f98)
-Patch101:       cabal-riscv64.patch
+
+Patch5:         ppc64le-miscompilation.patch
+
+Patch100:       os-string-be.patch 
 Patch200:       ghc-hadrian-s390x-rts--qg.patch
+Patch300:       hadrian-9.10-deps.patch
+
+# Backport of MR 13105 (NCG for RISCV64)
+Patch243:       riscv64-ncg.patch
+
 BuildRequires:  binutils-devel
 BuildRequires:  gcc-PIE
 BuildRequires:  gcc-c++
-BuildRequires:  ghc-bootstrap >= 9.4
+BuildRequires:  ghc-bootstrap >= 9.8
 BuildRequires:  ghc-bootstrap-helpers >= 1.3
 BuildRequires:  ghc-rpm-macros-extra => 2.6.1
 BuildRequires:  glibc-devel
@@ -112,6 +108,7 @@ BuildRequires:  gmp-devel
 BuildRequires:  libdw-devel
 BuildRequires:  libelf-devel
 BuildRequires:  libffi-devel
+BuildRequires:  libdwarf-devel
 BuildRequires:  libtool
 %ifarch %{ghc_llvm_archs}
 BuildRequires:  clang%{llvm_major}
@@ -138,9 +135,7 @@ Recommends:     %{name}-compiler-default = %{version}-%{release}
 %if %{with manual}
 BuildRequires:  python3-Sphinx
 %endif
-%if %{with_libnuma}
 BuildRequires:  libnuma-devel
-%endif
 %if %{with haddock}
 Suggests:       %{name}-doc = %{version}-%{release}
 Suggests:       %{name}-doc-index = %{version}-%{release}
@@ -172,11 +167,11 @@ Haskell home page at <http://www.haskell.org/>.
 %package compiler
 Summary:        GHC compiler and utilities
 License:        BSD-3-Clause
-Requires:       %{name}-filesystem = %{version}-%{release}
 Requires:       gcc
 Requires:       ghc-base-devel = %{base_ver}-%{release}
+Requires:       %{name}-filesystem = %{version}-%{release}
 Provides:       hsc2hs = %{hsc2hs_ver}-%{release}
-%ifarch riscv64 s390x
+%ifarch %{ghc_llvm_archs}
 Requires:       clang%{llvm_major}
 Requires:       llvm%{llvm_major}
 %endif
@@ -216,6 +211,7 @@ This package provides some common directories used for
 Haskell libraries documentation.
 %endif
 
+
 %if %{with manual}
 %package manual
 Summary:        GHC manual
@@ -227,48 +223,51 @@ BuildArch:      noarch
 This package provides the User Guide and Haddock manual.
 %endif
 
+
 %global ghc_version_override %{version}
 %global ghc_pkg_c_deps ghc-compiler = %{ghc_version_override}-%{release}
 %global version %{ghc_version_override}
-%if %{with_libnuma}
-%define libnuma_dep ,libnuma-devel
-%else
-%define libnuma_dep %{nil}
-%endif
 #!ForceMultiversion
-%ghc_lib_subpackage -d Cabal-3.10.3.0
-%ghc_lib_subpackage -d Cabal-syntax-3.10.3.0
-%ghc_lib_subpackage -d array-0.5.8.0
-%ghc_lib_subpackage -d -c gmp-devel,libffi-devel,libdw-devel,libelf-devel%{libnuma_dep} base-%{base_ver}
-%ghc_lib_subpackage -d binary-0.8.9.1
-%ghc_lib_subpackage -d -x ghc-bignum-%{ghc_bignum_ver}
+%ghc_lib_subpackage -d Cabal-3.12.0.0
+%ghc_lib_subpackage -d Cabal-syntax-3.12.0.0
+%ghc_lib_subpackage -d array-0.5.7.0
+%ghc_lib_subpackage -d -c gmp-devel,libffi-devel,libdw-devel,libelf-devel,libnuma-devel base-%{base_ver}
+%ghc_lib_subpackage -d binary-0.8.9.2
 %ghc_lib_subpackage -d bytestring-0.12.1.0
-%ghc_lib_subpackage -d containers-0.6.8
-%ghc_lib_subpackage -d deepseq-1.5.1.0
-%ghc_lib_subpackage -d directory-1.3.8.5
+%ghc_lib_subpackage -d containers-0.7
+%ghc_lib_subpackage -d deepseq-1.5.0.0
+%ghc_lib_subpackage -d directory-1.3.8.3
 %ghc_lib_subpackage -d exceptions-0.10.7
-%ghc_lib_subpackage -d filepath-1.4.200.1
+%ghc_lib_subpackage -d filepath-1.5.2.0
 %ghc_lib_subpackage -d -x ghc-%{ghc_version_override}
+%ghc_lib_subpackage -d -x ghc-bignum-%{ghc_bignum_ver}
 %ghc_lib_subpackage -d -x ghc-boot-%{ghc_version_override}
 %ghc_lib_subpackage -d ghc-boot-th-%{ghc_version_override}
 %ghc_lib_subpackage -d -x ghc-compact-%{ghc_compact_ver}
+%ghc_lib_subpackage -d ghc-experimental-0.1.0.0
 %ghc_lib_subpackage -d -x ghc-heap-%{ghc_version_override}
+%ghc_lib_subpackage -d ghc-internal-9.1001.0
 %ghc_lib_subpackage -d -x ghci-%{ghc_version_override}
 %ghc_lib_subpackage -d haskeline-0.8.2.1
 %ghc_lib_subpackage -d -x hpc-%{hpc_ver}
 %ghc_lib_subpackage -d mtl-2.3.1
 %ghc_lib_subpackage -d parsec-3.1.17.0
 %ghc_lib_subpackage -d pretty-1.1.3.6
-%ghc_lib_subpackage -d process-1.6.25.0
-%ghc_lib_subpackage -d semaphore-compat-1.0.0
+%ghc_lib_subpackage -d process-1.6.19.0
 %ghc_lib_subpackage -d stm-2.5.3.1
-%ghc_lib_subpackage -d template-haskell-2.21.0.0
+%ghc_lib_subpackage -d semaphore-compat-1.0.0
+%ghc_lib_subpackage -d template-haskell-2.22.0.0
 %ghc_lib_subpackage -d -c ncurses-devel terminfo-0.4.1.6
 %ghc_lib_subpackage -d text-2.1.1
 %ghc_lib_subpackage -d time-1.12.2
-%ghc_lib_subpackage -d transformers-0.6.1.0
-%ghc_lib_subpackage -d unix-2.8.4.0
-%ghc_lib_subpackage -d xhtml-3000.2.2.1
+%ghc_lib_subpackage -d transformers-0.6.1.1
+%ghc_lib_subpackage -d unix-2.8.5.1
+%ghc_lib_subpackage -d xhtml-%{xhtml_ver}
+
+# new in 9.10 
+%ghc_lib_subpackage -d os-string-2.0.2
+%ghc_lib_subpackage -d ghc-toolchain-0.1.0.0
+%ghc_lib_subpackage -d ghc-platform-0.1.0.0
 
 %global version %{ghc_version_override}
 
@@ -301,17 +300,24 @@ Installing this package causes %{name}-*-prof packages corresponding to
 %ifarch s390x
 %patch -P 3 -p1
 %endif
+
+%patch -P 5 -p1
+
 %patch -P 100 -p1
-%patch -P 101 -p1
+
 %ifarch ppc64le s390x riscv64
 %patch -P 200 -p1
 %endif
 
+%patch -P 300 -p1
+
+%patch -P 243 -p1
+
 rm libffi-tarballs/libffi-*.tar.gz
 
 %build
 cp %{SOURCE4} ./
-hadrian/bootstrap/bootstrap.py --bootstrap-sources 9_4_4-bootstrap-sources.tar.gz
+hadrian/bootstrap/bootstrap.py --bootstrap-sources 9_8_2-bootstrap-sources.tar.gz
 
 %global hadrian _build/bin/hadrian
 
@@ -352,7 +358,7 @@ python3 boot.source --hadrian
 %endif
 %global jobs_nr %{?_smp_mflags}
 %else
-%global jobs_nr -j1
+%global jobs_nr -j1 
 %endif
 %{hadrian} %{jobs_nr} --flavour=%{?with_quickbuild:quick+no_profiled_libs}%{!?with_quickbuild:perf%{!?with_ghc_prof:+no_profiled_libs}}%{?hadrian_llvm} %{hadrian_docs} binary-dist-dir --hash-unit-ids
 
@@ -371,21 +377,25 @@ for i in $(find %{buildroot} -type f -executable -exec sh -c "file {} | grep -q
   chrpath -d $i
 done
 
+
 %if %{with haddock}
 # remove short hashes
 for d in %{buildroot}%{ghc_html_libraries_dir}/*/; do
 mv $d $(echo $d | sed -e "s/\(.*\)-.*/\\1/")
 done
 %endif
-
+ 
 # containers src moved to a subdir
 cp -p libraries/containers/containers/LICENSE libraries/containers/LICENSE
 # hack for Cabal-syntax/LICENSE
 mkdir -p libraries/Cabal-syntax
 cp -p libraries/Cabal/Cabal-syntax/LICENSE libraries/Cabal-syntax
-
+# hack for ghc-toolchain
+mkdir -p libraries/ghc-toolchain
+cp -p LICENSE libraries/ghc-toolchain
 rm -f %{name}-*.files
 
+
 # FIXME replace with ghc_subpackages_list
 for i in %{ghc_packages_list}; do
 name=$(echo $i | sed -e "s/\(.*\)-.*/\1/")
@@ -397,8 +407,8 @@ done
 echo "%%dir %{ghclibdir}" >> %{name}-base%{?_ghcdynlibdir:-devel}.files
 
 %ghc_gen_filelists ghc %{ghc_version_override}
-%ghc_gen_filelists ghc-boot %{ghc_version_override}
 %ghc_gen_filelists ghc-bignum %{ghc_bignum_ver}
+%ghc_gen_filelists ghc-boot %{ghc_version_override}
 %ghc_gen_filelists ghc-compact %{ghc_compact_ver}
 %ghc_gen_filelists ghc-heap %{ghc_version_override}
 %ghc_gen_filelists ghci %{ghc_version_override}
@@ -423,13 +433,13 @@ fi
 
 %if %{with ghc_prof}
 ls %{buildroot}%{ghclibdir}/bin/ghc-iserv-prof* >> %{name}-base-prof.files
-ls %{buildroot}%{ghclibdir}/lib/bin/ghc-iserv-prof >> %{name}-base-prof.files
 echo "%%dir %{ghcliblib}/bin"
 %endif
 
 sed -i -e "s|^%{buildroot}||g" %{name}-base*.files
 sed -i -e "s|%{buildroot}||g" %{buildroot}%{_bindir}/*
 
+
 %if %{with haddock}
 rm %{buildroot}%{_docdir}/ghc-%{version}/archives/libraries.html.tar.xz
 %endif
@@ -438,25 +448,27 @@ rm %{buildroot}%{_docdir}/ghc-%{version}/archives/Haddock.html.tar.xz
 rm %{buildroot}%{_docdir}/ghc-%{version}/archives/users_guide.html.tar.xz
 %endif
 
+
 mkdir -p %{buildroot}%{_mandir}/man1
 install -p -m 0644 %{SOURCE5} %{buildroot}%{_mandir}/man1/ghc-pkg.1
 install -p -m 0644 %{SOURCE6} %{buildroot}%{_mandir}/man1/haddock.1
 install -p -m 0644 %{SOURCE7} %{buildroot}%{_mandir}/man1/runghc.1
 
+
 rm %{buildroot}%{ghclibdir}/lib/package.conf.d/.stamp
 rm %{buildroot}%{ghclibdir}/lib/package.conf.d/*.conf.copy
 
 # https://gitlab.haskell.org/ghc/ghc/-/issues/24121
 rm %{buildroot}%{ghclibdir}/share/doc/%{ghcplatform}/*/LICENSE
 
-(
-cd %{buildroot}%{ghclibdir}/lib/bin
-for i in *; do
-  if [ -f %{buildroot}%{ghclibdir}/bin/$i ]; then
-     ln -sf ../../bin/$i
-  fi
-done
-)
+#(
+#cd %{buildroot}%{ghclibdir}/lib/bin
+#for i in *; do
+#  if [ -f %{buildroot}%{ghclibdir}/bin/$i ]; then
+#     ln -sf ../../bin/$i
+#  fi
+#done
+#)
 
 %check
 # Actually, I took this from Jens Petersen's Fedora package
@@ -518,17 +530,18 @@ $GHC --info
 %dir %{ghcliblib}
 %dir %{ghcliblib}/%{ghcplatform}
 %dir %{ghclibdir}/bin
-%dir %{ghcliblib}/bin
 %{ghclibdir}/bin/ghc
 %{ghclibdir}/bin/ghc-iserv
 %{ghclibdir}/bin/ghc-iserv-dyn
 %{ghclibdir}/bin/ghc-pkg
+%{ghclibdir}/bin/ghc-toolchain-bin
 %{ghclibdir}/bin/hpc
 %{ghclibdir}/bin/hsc2hs
 %{ghclibdir}/bin/runghc
 %{ghclibdir}/bin/hp2ps
 %{ghclibdir}/bin/unlit
 %{ghclibdir}/bin/ghc-%{version}
+%{ghclibdir}/bin/ghc-toolchain-bin-ghc-%{version}
 %{ghclibdir}/bin/ghc-iserv-ghc-%{version}
 %{ghclibdir}/bin/ghc-iserv-dyn-ghc-%{version}
 %{ghclibdir}/bin/ghc-pkg-%{version}
@@ -541,14 +554,13 @@ $GHC --info
 %{ghclibdir}/bin/runhaskell
 %{ghclibdir}/bin/runhaskell-%{version}
 %{ghclibdir}/bin/unlit-ghc-%{version}
-%{ghclibdir}/lib/bin/ghc-iserv
-%{ghclibdir}/lib/bin/ghc-iserv-dyn
-%{ghclibdir}/lib/bin/unlit
 %{ghcliblib}/ghc-interp.js
 %{ghcliblib}/ghc-usage.txt
 %{ghcliblib}/ghci-usage.txt
 %{ghcliblib}/llvm-passes
 %{ghcliblib}/llvm-targets
+%{ghcliblib}/post-link.mjs
+%{ghcliblib}/prelude.js
 %dir %{ghcliblib}/package.conf.d
 %ghost %{ghcliblib}/package.conf.d/package.cache
 %{ghcliblib}/package.conf.d/package.cache.lock
diff --git a/hadrian-9.10-deps.patch b/hadrian-9.10-deps.patch
new file mode 100644
index 0000000..e9dc0d6
--- /dev/null
+++ b/hadrian-9.10-deps.patch
@@ -0,0 +1,74 @@
+--- ghc-9.10.0.20240313/hadrian/hadrian.cabal.orig	2024-03-13 21:23:13.000000000 +0800
++++ ghc-9.10.0.20240313/hadrian/hadrian.cabal	2024-03-14 21:53:00.180444498 +0800
+@@ -36,6 +36,8 @@
+     main-is:             Main.hs
+     hs-source-dirs:      .
+                        , src
++                       , ../libraries/ghc-platform/src
++                       , ../utils/ghc-toolchain/src
+     other-modules:       Base
+                        , Builder
+                        , CommandLine
+@@ -152,6 +154,7 @@
+                        , directory            >= 1.3.1.0 && < 1.4
+                        , extra                >= 1.4.7
+                        , filepath
++                       , process
+                        , time
+                        , mtl                  >= 2.2     && < 2.4
+                        , parsec               >= 3.1     && < 3.2
+@@ -168,8 +171,6 @@
+                        , text                 >= 1.2     && < 3
+                        , cryptohash-sha256    >= 0.11    && < 0.12
+                        , base16-bytestring    >= 0.1.1 && < 1.1.0.0
+-                       , ghc-platform
+-                       , ghc-toolchain
+     ghc-options:       -Wall
+                        -Wincomplete-record-updates
+                        -Wredundant-constraints
+--- ghc-9.10.0.20240313/utils/ghc-toolchain/src/GHC/Toolchain/Program.hs.orig	2024-03-13 21:23:14.000000000 +0800
++++ ghc-9.10.0.20240313/utils/ghc-toolchain/src/GHC/Toolchain/Program.hs	2024-03-14 22:06:00.298622371 +0800
+@@ -1,3 +1,5 @@
++{-# LANGUAGE NoImplicitPrelude #-}
++
+ module GHC.Toolchain.Program
+     ( Program(..)
+     , shProgram
+--- ghc-9.10.0.20240313/utils/ghc-toolchain/src/GHC/Toolchain/Tools/Cc.hs~	2024-03-13 21:23:14.000000000 +0800
++++ ghc-9.10.0.20240313/utils/ghc-toolchain/src/GHC/Toolchain/Tools/Cc.hs	2024-03-14 22:06:23.356740644 +0800
+@@ -1,3 +1,4 @@
++{-# LANGUAGE NoImplicitPrelude #-}
+ {-# LANGUAGE NamedFieldPuns #-}
+ {-# LANGUAGE ViewPatterns #-}
+ 
+diff -up ghc-9.10.0.20240313/utils/ghc-toolchain/src/GHC/Toolchain/Tools/Ar.hs~ ghc-9.10.0.20240313/utils/ghc-toolchain/src/GHC/Toolchain/Tools/Ar.hs
+--- ghc-9.10.0.20240313/utils/ghc-toolchain/src/GHC/Toolchain/Tools/Ar.hs~	2024-03-13 21:23:14.000000000 +0800
++++ ghc-9.10.0.20240313/utils/ghc-toolchain/src/GHC/Toolchain/Tools/Ar.hs	2024-03-14 22:09:48.872794751 +0800
+@@ -1,3 +1,4 @@
++{-# LANGUAGE NoImplicitPrelude #-}
+ {-# LANGUAGE NamedFieldPuns #-}
+ {-# LANGUAGE RecordWildCards #-}
+ 
+diff -up ghc-9.10.0.20240313/utils/ghc-toolchain/src/GHC/Toolchain/Tools/Cpp.hs~ ghc-9.10.0.20240313/utils/ghc-toolchain/src/GHC/Toolchain/Tools/Cpp.hs
+--- ghc-9.10.0.20240313/utils/ghc-toolchain/src/GHC/Toolchain/Tools/Cpp.hs~	2024-03-13 21:23:14.000000000 +0800
++++ ghc-9.10.0.20240313/utils/ghc-toolchain/src/GHC/Toolchain/Tools/Cpp.hs	2024-03-14 22:10:18.872948622 +0800
+@@ -1,3 +1,4 @@
++{-# LANGUAGE NoImplicitPrelude #-}
+ {-# LANGUAGE NamedFieldPuns #-}
+ 
+ module GHC.Toolchain.Tools.Cpp (HsCpp(..), findHsCpp, Cpp(..), findCpp) where
+diff -up ghc-9.10.0.20240313/utils/ghc-toolchain/src/GHC/Toolchain/Tools/Link.hs~ ghc-9.10.0.20240313/utils/ghc-toolchain/src/GHC/Toolchain/Tools/Link.hs
+--- ghc-9.10.0.20240313/utils/ghc-toolchain/src/GHC/Toolchain/Tools/Link.hs~	2024-03-13 21:23:14.000000000 +0800
++++ ghc-9.10.0.20240313/utils/ghc-toolchain/src/GHC/Toolchain/Tools/Link.hs	2024-03-14 22:10:12.954918270 +0800
+@@ -1,3 +1,4 @@
++{-# LANGUAGE NoImplicitPrelude #-}
+ {-# OPTIONS_GHC -Wno-name-shadowing #-}
+ {-# LANGUAGE NamedFieldPuns #-}
+ {-# LANGUAGE RecordWildCards #-}
+--- ghc-9.10.0.20240313/utils/ghc-toolchain/src/GHC/Toolchain/Tools/MergeObjs.hs~	2024-03-13 21:23:14.000000000 +0800
++++ ghc-9.10.0.20240313/utils/ghc-toolchain/src/GHC/Toolchain/Tools/MergeObjs.hs	2024-03-14 22:13:08.399807784 +0800
+@@ -1,3 +1,4 @@
++{-# LANGUAGE NoImplicitPrelude #-}
+ {-# LANGUAGE NamedFieldPuns #-}
+ {-# LANGUAGE CPP #-}
+ 
diff --git a/os-string-be.patch b/os-string-be.patch
new file mode 100644
index 0000000..105a155
--- /dev/null
+++ b/os-string-be.patch
@@ -0,0 +1,25 @@
+From bafe87d871399b58ce4a50592b980c990a3eac39 Mon Sep 17 00:00:00 2001
+From: Bodigrim <andrew.lelechenko@gmail.com>
+Date: Thu, 9 May 2024 01:02:44 +0100
+Subject: [PATCH] Fix compilation on big-endian arches
+
+---
+ libraries/os-string/System/OsString/Data/ByteString/Short/Internal.hs | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/libraries/os-string/System/OsString/Data/ByteString/Short/Internal.hs b/libraries/os-string/System/OsString/Data/ByteString/Short/Internal.hs
+index fedc199..f7ddcd8 100644
+--- a/libraries/os-string/System/OsString/Data/ByteString/Short/Internal.hs
++++ b/libraries/os-string/System/OsString/Data/ByteString/Short/Internal.hs
+@@ -311,7 +311,11 @@ word16ToLE#, word16FromLE# :: Word16# -> Word16#
+ word16ToLE#, word16FromLE# :: Word# -> Word#
+ #endif
+ #ifdef WORDS_BIGENDIAN
++#if MIN_VERSION_base(4,16,0)
++word16ToLE# w = wordToWord16# (byteSwap16# (word16ToWord# w))
++#else
+ word16ToLE# = byteSwap16#
++#endif
+ #else
+ word16ToLE# w# = w#
+ #endif
diff --git a/ppc64le-miscompilation.patch b/ppc64le-miscompilation.patch
new file mode 100644
index 0000000..539acce
--- /dev/null
+++ b/ppc64le-miscompilation.patch
@@ -0,0 +1,25 @@
+From 3f128c7d6c145985e3e12fda173e7e9a5a9c03f7 Mon Sep 17 00:00:00 2001
+From: Peter Trommler <ptrommler@acm.org>
+Date: Sat, 15 Jun 2024 08:55:30 +0200
+Subject: [PATCH 4/4] PPC NCG: Fix sign hints in C calls
+
+---
+ compiler/GHC/CmmToAsm/PPC/CodeGen.hs | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/compiler/GHC/CmmToAsm/PPC/CodeGen.hs b/compiler/GHC/CmmToAsm/PPC/CodeGen.hs
+index cca47f7bac2..060bd7159eb 100644
+--- a/compiler/GHC/CmmToAsm/PPC/CodeGen.hs
++++ b/compiler/GHC/CmmToAsm/PPC/CodeGen.hs
+@@ -1770,7 +1770,7 @@ genCCall' config gcp target dest_regs args
+                                 _ -> panic "genCall': unknown calling conv."
+ 
+         argReps = map (cmmExprType platform) args
+-        (argHints, _) = foreignTargetHints target
++        (_, argHints) = foreignTargetHints target
+ 
+         roundTo a x | x `mod` a == 0 = x
+                     | otherwise = x + a - (x `mod` a)
+-- 
+GitLab
+
diff --git a/riscv64-ncg.patch b/riscv64-ncg.patch
new file mode 100644
index 0000000..b71a015
--- /dev/null
+++ b/riscv64-ncg.patch
@@ -0,0 +1,6217 @@
+Index: ghc-9.10.1/CODEOWNERS
+===================================================================
+--- ghc-9.10.1.orig/CODEOWNERS
++++ ghc-9.10.1/CODEOWNERS
+@@ -40,6 +40,7 @@
+ /compiler/GHC/HsToCore/Foreign/Wasm.hs @TerrorJack
+ /compiler/GHC/Tc/Deriv/            @RyanGlScott
+ /compiler/GHC/CmmToAsm/            @simonmar @bgamari @AndreasK
++/compiler/GHC/CmmToAsm/RV64/       @supersven @angerman
+ /compiler/GHC/CmmToAsm/Wasm/       @TerrorJack
+ /compiler/GHC/CmmToLlvm/           @angerman
+ /compiler/GHC/StgToCmm/            @simonmar @osa1
+Index: ghc-9.10.1/compiler/CodeGen.Platform.h
+===================================================================
+--- ghc-9.10.1.orig/compiler/CodeGen.Platform.h
++++ ghc-9.10.1/compiler/CodeGen.Platform.h
+@@ -1,7 +1,8 @@
+ 
+ import GHC.Cmm.Expr
+ #if !(defined(MACHREGS_i386) || defined(MACHREGS_x86_64) \
+-    || defined(MACHREGS_powerpc) || defined(MACHREGS_aarch64))
++    || defined(MACHREGS_powerpc) || defined(MACHREGS_aarch64) \
++    || defined(MACHREGS_riscv64))
+ import GHC.Utils.Panic.Plain
+ #endif
+ import GHC.Platform.Reg
+@@ -1041,6 +1042,105 @@ freeReg 18 = False
+ 
+ # if defined(REG_Base)
+ freeReg REG_Base  = False
++# endif
++# if defined(REG_Sp)
++freeReg REG_Sp    = False
++# endif
++# if defined(REG_SpLim)
++freeReg REG_SpLim = False
++# endif
++# if defined(REG_Hp)
++freeReg REG_Hp    = False
++# endif
++# if defined(REG_HpLim)
++freeReg REG_HpLim = False
++# endif
++
++# if defined(REG_R1)
++freeReg REG_R1    = False
++# endif
++# if defined(REG_R2)
++freeReg REG_R2    = False
++# endif
++# if defined(REG_R3)
++freeReg REG_R3    = False
++# endif
++# if defined(REG_R4)
++freeReg REG_R4    = False
++# endif
++# if defined(REG_R5)
++freeReg REG_R5    = False
++# endif
++# if defined(REG_R6)
++freeReg REG_R6    = False
++# endif
++# if defined(REG_R7)
++freeReg REG_R7    = False
++# endif
++# if defined(REG_R8)
++freeReg REG_R8    = False
++# endif
++
++# if defined(REG_F1)
++freeReg REG_F1    = False
++# endif
++# if defined(REG_F2)
++freeReg REG_F2    = False
++# endif
++# if defined(REG_F3)
++freeReg REG_F3    = False
++# endif
++# if defined(REG_F4)
++freeReg REG_F4    = False
++# endif
++# if defined(REG_F5)
++freeReg REG_F5    = False
++# endif
++# if defined(REG_F6)
++freeReg REG_F6    = False
++# endif
++
++# if defined(REG_D1)
++freeReg REG_D1    = False
++# endif
++# if defined(REG_D2)
++freeReg REG_D2    = False
++# endif
++# if defined(REG_D3)
++freeReg REG_D3    = False
++# endif
++# if defined(REG_D4)
++freeReg REG_D4    = False
++# endif
++# if defined(REG_D5)
++freeReg REG_D5    = False
++# endif
++# if defined(REG_D6)
++freeReg REG_D6    = False
++# endif
++
++freeReg _ = True
++
++#elif defined(MACHREGS_riscv64)
++
++-- zero reg
++freeReg 0 = False
++-- link register
++freeReg 1 = False
++-- stack pointer
++freeReg 2 = False
++-- global pointer
++freeReg 3 = False
++-- thread pointer
++freeReg 4 = False
++-- frame pointer
++freeReg 8 = False
++-- made-up inter-procedural (ip) register
++-- See Note [The made-up RISCV64 TMP (IP) register]
++freeReg 31 = False
++
++# if defined(REG_Base)
++freeReg REG_Base  = False
+ # endif
+ # if defined(REG_Sp)
+ freeReg REG_Sp    = False
+Index: ghc-9.10.1/compiler/GHC/Cmm/CLabel.hs
+===================================================================
+--- ghc-9.10.1.orig/compiler/GHC/Cmm/CLabel.hs
++++ ghc-9.10.1/compiler/GHC/Cmm/CLabel.hs
+@@ -1720,6 +1720,8 @@ pprDynamicLinkerAsmLabel !platform dllIn
+       | platformArch platform == ArchAArch64
+       = ppLbl
+ 
++      | platformArch platform == ArchRISCV64
++      = ppLbl
+ 
+       | platformArch platform == ArchX86_64
+       = case dllInfo of
+Index: ghc-9.10.1/compiler/GHC/CmmToAsm.hs
+===================================================================
+--- ghc-9.10.1.orig/compiler/GHC/CmmToAsm.hs
++++ ghc-9.10.1/compiler/GHC/CmmToAsm.hs
+@@ -67,6 +67,7 @@ import qualified GHC.CmmToAsm.X86   as X
+ import qualified GHC.CmmToAsm.PPC   as PPC
+ import qualified GHC.CmmToAsm.AArch64 as AArch64
+ import qualified GHC.CmmToAsm.Wasm as Wasm32
++import qualified GHC.CmmToAsm.RV64  as RV64
+ 
+ import GHC.CmmToAsm.Reg.Liveness
+ import qualified GHC.CmmToAsm.Reg.Linear                as Linear
+@@ -148,7 +149,7 @@ nativeCodeGen logger ts config modLoc h
+       ArchAlpha     -> panic "nativeCodeGen: No NCG for Alpha"
+       ArchMipseb    -> panic "nativeCodeGen: No NCG for mipseb"
+       ArchMipsel    -> panic "nativeCodeGen: No NCG for mipsel"
+-      ArchRISCV64   -> panic "nativeCodeGen: No NCG for RISCV64"
++      ArchRISCV64   -> nCG' (RV64.ncgRV64 config)
+       ArchLoongArch64->panic "nativeCodeGen: No NCG for LoongArch64"
+       ArchUnknown   -> panic "nativeCodeGen: No NCG for unknown arch"
+       ArchJavaScript-> panic "nativeCodeGen: No NCG for JavaScript"
+Index: ghc-9.10.1/compiler/GHC/CmmToAsm/Dwarf/Constants.hs
+===================================================================
+--- ghc-9.10.1.orig/compiler/GHC/CmmToAsm/Dwarf/Constants.hs
++++ ghc-9.10.1/compiler/GHC/CmmToAsm/Dwarf/Constants.hs
+@@ -240,6 +240,7 @@ dwarfRegNo p r = case platformArch p of
+     | r == xmm15 -> 32
+   ArchPPC_64 _ -> fromIntegral $ toRegNo r
+   ArchAArch64  -> fromIntegral $ toRegNo r
++  ArchRISCV64  -> fromIntegral $ toRegNo r
+   _other -> error "dwarfRegNo: Unsupported platform or unknown register!"
+ 
+ -- | Virtual register number to use for return address.
+@@ -252,5 +253,6 @@ dwarfReturnRegNo p
+     ArchX86    -> 8  -- eip
+     ArchX86_64 -> 16 -- rip
+     ArchPPC_64 ELF_V2 -> 65 -- lr (link register)
+-    ArchAArch64-> 30
++    ArchAArch64 -> 30
++    ArchRISCV64 -> 1 -- ra (return address)
+     _other     -> error "dwarfReturnRegNo: Unsupported platform!"
+Index: ghc-9.10.1/compiler/GHC/CmmToAsm/PIC.hs
+===================================================================
+--- ghc-9.10.1.orig/compiler/GHC/CmmToAsm/PIC.hs
++++ ghc-9.10.1/compiler/GHC/CmmToAsm/PIC.hs
+@@ -132,6 +132,11 @@ cmmMakeDynamicReference config reference
+               addImport symbolPtr
+               return $ cmmMakePicReference config symbolPtr
+ 
++        AccessViaSymbolPtr | ArchRISCV64 <- platformArch platform -> do
++              let symbolPtr = mkDynamicLinkerLabel SymbolPtr lbl
++              addImport symbolPtr
++              return $ cmmMakePicReference config symbolPtr
++
+         AccessViaSymbolPtr -> do
+               let symbolPtr = mkDynamicLinkerLabel SymbolPtr lbl
+               addImport symbolPtr
+@@ -164,6 +169,10 @@ cmmMakePicReference config lbl
+   | ArchAArch64 <- platformArch platform
+   = CmmLit $ CmmLabel lbl
+ 
++  -- as on AArch64, there's no pic base register.
++  | ArchRISCV64 <- platformArch platform
++  = CmmLit $ CmmLabel lbl
++
+   | OSAIX <- platformOS platform
+   = CmmMachOp (MO_Add W32)
+           [ CmmReg (CmmGlobal $ GlobalRegUse PicBaseReg (bWord platform))
+Index: ghc-9.10.1/compiler/GHC/CmmToAsm/RV64.hs
+===================================================================
+--- /dev/null
++++ ghc-9.10.1/compiler/GHC/CmmToAsm/RV64.hs
+@@ -0,0 +1,57 @@
++{-# OPTIONS_GHC -fno-warn-orphans #-}
++
++-- | Native code generator for RiscV64 architectures
++module GHC.CmmToAsm.RV64 (ncgRV64) where
++
++import GHC.CmmToAsm.Config
++import GHC.CmmToAsm.Instr
++import GHC.CmmToAsm.Monad
++import GHC.CmmToAsm.RV64.CodeGen qualified as RV64
++import GHC.CmmToAsm.RV64.Instr qualified as RV64
++import GHC.CmmToAsm.RV64.Ppr qualified as RV64
++import GHC.CmmToAsm.RV64.RegInfo qualified as RV64
++import GHC.CmmToAsm.RV64.Regs qualified as RV64
++import GHC.CmmToAsm.Types
++import GHC.Prelude
++import GHC.Utils.Outputable (ftext)
++
++ncgRV64 :: NCGConfig -> NcgImpl RawCmmStatics RV64.Instr RV64.JumpDest
++ncgRV64 config =
++  NcgImpl
++    { ncgConfig = config,
++      cmmTopCodeGen = RV64.cmmTopCodeGen,
++      generateJumpTableForInstr = RV64.generateJumpTableForInstr config,
++      getJumpDestBlockId = RV64.getJumpDestBlockId,
++      canShortcut = RV64.canShortcut,
++      shortcutStatics = RV64.shortcutStatics,
++      shortcutJump = RV64.shortcutJump,
++      pprNatCmmDeclS = RV64.pprNatCmmDecl config,
++      pprNatCmmDeclH = RV64.pprNatCmmDecl config,
++      maxSpillSlots = RV64.maxSpillSlots config,
++      allocatableRegs = RV64.allocatableRegs platform,
++      ncgAllocMoreStack = RV64.allocMoreStack platform,
++      ncgMakeFarBranches = RV64.makeFarBranches,
++      extractUnwindPoints = const [],
++      invertCondBranches = \_ _ -> id
++    }
++  where
++    platform = ncgPlatform config
++
++-- | `Instruction` instance for RV64
++instance Instruction RV64.Instr where
++  regUsageOfInstr = RV64.regUsageOfInstr
++  patchRegsOfInstr = RV64.patchRegsOfInstr
++  isJumpishInstr = RV64.isJumpishInstr
++  jumpDestsOfInstr = RV64.jumpDestsOfInstr
++  patchJumpInstr = RV64.patchJumpInstr
++  mkSpillInstr = RV64.mkSpillInstr
++  mkLoadInstr = RV64.mkLoadInstr
++  takeDeltaInstr = RV64.takeDeltaInstr
++  isMetaInstr = RV64.isMetaInstr
++  mkRegRegMoveInstr _ = RV64.mkRegRegMoveInstr
++  takeRegRegMoveInstr = RV64.takeRegRegMoveInstr
++  mkJumpInstr = RV64.mkJumpInstr
++  mkStackAllocInstr = RV64.mkStackAllocInstr
++  mkStackDeallocInstr = RV64.mkStackDeallocInstr
++  mkComment = pure . RV64.COMMENT . ftext
++  pprInstr = RV64.pprInstr
+Index: ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/CodeGen.hs
+===================================================================
+--- /dev/null
++++ ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/CodeGen.hs
+@@ -0,0 +1,2207 @@
++{-# LANGUAGE BangPatterns #-}
++{-# LANGUAGE BinaryLiterals #-}
++{-# LANGUAGE GADTs #-}
++{-# LANGUAGE OverloadedStrings #-}
++
++module GHC.CmmToAsm.RV64.CodeGen
++  ( cmmTopCodeGen,
++    generateJumpTableForInstr,
++    makeFarBranches,
++  )
++where
++
++import Control.Monad
++import Data.Maybe
++import Data.Word
++import GHC.Cmm
++import GHC.Cmm.BlockId
++import GHC.Cmm.CLabel
++import GHC.Cmm.Dataflow.Block
++import GHC.Cmm.Dataflow.Graph
++import GHC.Cmm.Dataflow.Label
++import GHC.Cmm.DebugBlock
++import GHC.Cmm.Switch
++import GHC.Cmm.Utils
++import GHC.CmmToAsm.CPrim
++import GHC.CmmToAsm.Config
++import GHC.CmmToAsm.Format
++import GHC.CmmToAsm.Monad
++  ( NatM,
++    getBlockIdNat,
++    getConfig,
++    getDebugBlock,
++    getFileId,
++    getNewLabelNat,
++    getNewRegNat,
++    getPicBaseMaybeNat,
++    getPlatform,
++  )
++import GHC.CmmToAsm.PIC
++import GHC.CmmToAsm.RV64.Cond
++import GHC.CmmToAsm.RV64.Instr
++import GHC.CmmToAsm.RV64.Regs
++import GHC.CmmToAsm.Types
++import GHC.Data.FastString
++import GHC.Data.OrdList
++import GHC.Float
++import GHC.Platform
++import GHC.Platform.Reg
++import GHC.Platform.Regs
++import GHC.Prelude hiding (EQ)
++import GHC.Types.Basic
++import GHC.Types.ForeignCall
++import GHC.Types.SrcLoc (srcSpanFile, srcSpanStartCol, srcSpanStartLine)
++import GHC.Types.Tickish (GenTickish (..))
++import GHC.Types.Unique.Supply
++import GHC.Utils.Constants (debugIsOn)
++import GHC.Utils.Misc
++import GHC.Utils.Monad
++import GHC.Utils.Outputable
++import GHC.Utils.Panic
++
++-- For an overview of an NCG's structure, see Note [General layout of an NCG]
++
++cmmTopCodeGen ::
++  RawCmmDecl ->
++  NatM [NatCmmDecl RawCmmStatics Instr]
++-- Thus we'll have to deal with either CmmProc ...
++cmmTopCodeGen _cmm@(CmmProc info lab live graph) = do
++  picBaseMb <- getPicBaseMaybeNat
++  when (isJust picBaseMb) $ panic "RV64.cmmTopCodeGen: Unexpected PIC base register (RISCV ISA does not define one)"
++
++  let blocks = toBlockListEntryFirst graph
++  (nat_blocks, statics) <- mapAndUnzipM basicBlockCodeGen blocks
++
++  let proc = CmmProc info lab live (ListGraph $ concat nat_blocks)
++      tops = proc : concat statics
++
++  pure tops
++
++-- ... or CmmData.
++cmmTopCodeGen (CmmData sec dat) = pure [CmmData sec dat] -- no translation, we just use CmmStatic
++
++basicBlockCodeGen ::
++  Block CmmNode C C ->
++  NatM
++    ( [NatBasicBlock Instr],
++      [NatCmmDecl RawCmmStatics Instr]
++    )
++basicBlockCodeGen block = do
++  config <- getConfig
++  let (_, nodes, tail) = blockSplit block
++      id = entryLabel block
++      stmts = blockToList nodes
++
++      header_comment_instr
++        | debugIsOn =
++            unitOL
++              $ MULTILINE_COMMENT
++                ( text "-- --------------------------- basicBlockCodeGen --------------------------- --\n"
++                    $+$ withPprStyle defaultDumpStyle (pdoc (ncgPlatform config) block)
++                )
++        | otherwise = nilOL
++
++  -- Generate location directive `.loc` (DWARF debug location info)
++  loc_instrs <- genLocInstrs
++
++  -- Generate other instructions
++  mid_instrs <- stmtsToInstrs stmts
++  (!tail_instrs) <- stmtToInstrs tail
++
++  let instrs = header_comment_instr `appOL` loc_instrs `appOL` mid_instrs `appOL` tail_instrs
++
++      -- TODO: Then x86 backend runs @verifyBasicBlock@ here. How important it is to
++      -- have a valid CFG is an open question: This and the AArch64 and PPC NCGs
++      -- work fine without it.
++
++      -- Code generation may introduce new basic block boundaries, which are
++      -- indicated by the NEWBLOCK instruction. We must split up the instruction
++      -- stream into basic blocks again. Also, we extract LDATAs here too.
++      (top, other_blocks, statics) = foldrOL mkBlocks ([], [], []) instrs
++
++  return (BasicBlock id top : other_blocks, statics)
++  where
++    genLocInstrs :: NatM (OrdList Instr)
++    genLocInstrs = do
++      dbg <- getDebugBlock (entryLabel block)
++      case dblSourceTick =<< dbg of
++        Just (SourceNote span name) ->
++          do
++            fileId <- getFileId (srcSpanFile span)
++            let line = srcSpanStartLine span; col = srcSpanStartCol span
++            pure $ unitOL $ LOCATION fileId line col name
++        _ -> pure nilOL
++
++mkBlocks ::
++  Instr ->
++  ([Instr], [GenBasicBlock Instr], [GenCmmDecl RawCmmStatics h g]) ->
++  ([Instr], [GenBasicBlock Instr], [GenCmmDecl RawCmmStatics h g])
++mkBlocks (NEWBLOCK id) (instrs, blocks, statics) =
++  ([], BasicBlock id instrs : blocks, statics)
++mkBlocks (LDATA sec dat) (instrs, blocks, statics) =
++  (instrs, blocks, CmmData sec dat : statics)
++mkBlocks instr (instrs, blocks, statics) =
++  (instr : instrs, blocks, statics)
++
++-- -----------------------------------------------------------------------------
++
++-- | Utilities
++
++-- | Annotate an `Instr` with a `SDoc` comment
++ann :: SDoc -> Instr -> Instr
++ann doc instr {- debugIsOn -} = ANN doc instr
++{-# INLINE ann #-}
++
++-- Using pprExpr will hide the AST, @ANN@ will end up in the assembly with
++-- -dppr-debug.  The idea is that we can trivially see how a cmm expression
++-- ended up producing the assembly we see.  By having the verbatim AST printed
++-- we can simply check the patterns that were matched to arrive at the assembly
++-- we generated.
++--
++-- pprExpr will hide a lot of noise of the underlying data structure and print
++-- the expression into something that can be easily read by a human. However
++-- going back to the exact CmmExpr representation can be laborious and adds
++-- indirections to find the matches that lead to the assembly.
++--
++-- An improvement could be to have
++--
++--    (pprExpr genericPlatform e) <> parens (text. show e)
++--
++-- to have the best of both worlds.
++--
++-- Note: debugIsOn is too restrictive, it only works for debug compilers.
++-- However, we do not only want to inspect this for debug compilers. Ideally
++-- we'd have a check for -dppr-debug here already, such that we don't even
++-- generate the ANN expressions. However, as they are lazy, they shouldn't be
++-- forced until we actually force them, and without -dppr-debug they should
++-- never end up being forced.
++annExpr :: CmmExpr -> Instr -> Instr
++annExpr e {- debugIsOn -} = ANN (text . show $ e)
++-- annExpr e instr {- debugIsOn -} = ANN (pprExpr genericPlatform e) instr
++-- annExpr _ instr = instr
++{-# INLINE annExpr #-}
++
++-- -----------------------------------------------------------------------------
++-- Generating a table-branch
++
++-- Note [RISCV64 Jump Tables]
++-- ~~~~~~~~~~~~~~~~~~~~~~~~~~
++--
++-- Jump tables are implemented by generating a table of relative addresses,
++-- where each entry is the relative offset to the target block from the first
++-- entry / table label (`generateJumpTableForInstr`). Using the jump table means
++-- loading the entry's value and jumping to the calculated absolute address
++-- (`genSwitch`).
++--
++-- For example, this Cmm switch
++--
++--   switch [1 .. 10] _s2wn::I64 {
++--       case 1 : goto c347;
++--       case 2 : goto c348;
++--       case 3 : goto c349;
++--       case 4 : goto c34a;
++--       case 5 : goto c34b;
++--       case 6 : goto c34c;
++--       case 7 : goto c34d;
++--       case 8 : goto c34e;
++--       case 9 : goto c34f;
++--       case 10 : goto c34g;
++--   }   // CmmSwitch
++--
++-- leads to this jump table in Assembly
++--
++--   .section .rodata
++--           .balign 8
++--   .Ln34G:
++--           .quad   0
++--           .quad   .Lc347-(.Ln34G)+0
++--           .quad   .Lc348-(.Ln34G)+0
++--           .quad   .Lc349-(.Ln34G)+0
++--           .quad   .Lc34a-(.Ln34G)+0
++--           .quad   .Lc34b-(.Ln34G)+0
++--           .quad   .Lc34c-(.Ln34G)+0
++--           .quad   .Lc34d-(.Ln34G)+0
++--           .quad   .Lc34e-(.Ln34G)+0
++--           .quad   .Lc34f-(.Ln34G)+0
++--           .quad   .Lc34g-(.Ln34G)+0
++--
++-- and this indexing code where the jump should be done (register t0 contains
++-- the index)
++--
++--           addi t0, t0, 0 // silly move (ignore it)
++--           la t1, .Ln34G // load the table's address
++--           sll t0, t0, 3 // index * 8 -> offset in bytes
++--           add t0, t0, t1 // address of the table's entry
++--           ld t0, 0(t0) // load entry
++--           add t0, t0, t1 // relative to absolute address
++--           jalr zero, t0, 0 // jump to the block
++--
++-- In object code (disassembled) the table looks like
++--
++--   0000000000000000 <.Ln34G>:
++--        ...
++--        8: R_RISCV_ADD64        .Lc347
++--        8: R_RISCV_SUB64        .Ln34G
++--        10: R_RISCV_ADD64       .Lc348
++--        10: R_RISCV_SUB64       .Ln34G
++--        18: R_RISCV_ADD64       .Lc349
++--        18: R_RISCV_SUB64       .Ln34G
++--        20: R_RISCV_ADD64       .Lc34a
++--        20: R_RISCV_SUB64       .Ln34G
++--        28: R_RISCV_ADD64       .Lc34b
++--        28: R_RISCV_SUB64       .Ln34G
++--        30: R_RISCV_ADD64       .Lc34c
++--        30: R_RISCV_SUB64       .Ln34G
++--        38: R_RISCV_ADD64       .Lc34d
++--        38: R_RISCV_SUB64       .Ln34G
++--        40: R_RISCV_ADD64       .Lc34e
++--        40: R_RISCV_SUB64       .Ln34G
++--        48: R_RISCV_ADD64       .Lc34f
++--        48: R_RISCV_SUB64       .Ln34G
++--        50: R_RISCV_ADD64       .Lc34g
++--        50: R_RISCV_SUB64       .Ln34G
++--
++-- I.e. the relative offset calculations are done by the linker via relocations.
++-- This seems to be PIC compatible; at least `scanelf` (pax-utils) does not
++-- complain.
++
++
++-- | Generate jump to jump table target
++--
++-- The index into the jump table is calulated by evaluating @expr@. The
++-- corresponding table entry contains the relative address to jump to (relative
++-- to the jump table's first entry / the table's own label).
++genSwitch :: NCGConfig -> CmmExpr -> SwitchTargets -> NatM InstrBlock
++genSwitch config expr targets = do
++  (reg, fmt1, e_code) <- getSomeReg indexExpr
++  let fmt = II64
++  targetReg <- getNewRegNat fmt
++  lbl <- getNewLabelNat
++  dynRef <- cmmMakeDynamicReference config DataReference lbl
++  (tableReg, fmt2, t_code) <- getSomeReg dynRef
++  let code =
++        toOL
++          [ COMMENT (text "indexExpr" <+> (text . show) indexExpr),
++            COMMENT (text "dynRef" <+> (text . show) dynRef)
++          ]
++          `appOL` e_code
++          `appOL` t_code
++          `appOL` toOL
++            [ COMMENT (ftext "Jump table for switch"),
++              -- index to offset into the table (relative to tableReg)
++              annExpr expr (SLL (OpReg (formatToWidth fmt1) reg) (OpReg (formatToWidth fmt1) reg) (OpImm (ImmInt 3))),
++              -- calculate table entry address
++              ADD (OpReg W64 targetReg) (OpReg (formatToWidth fmt1) reg) (OpReg (formatToWidth fmt2) tableReg),
++              -- load table entry (relative offset from tableReg (first entry) to target label)
++              LDRU II64 (OpReg W64 targetReg) (OpAddr (AddrRegImm targetReg (ImmInt 0))),
++              -- calculate absolute address of the target label
++              ADD (OpReg W64 targetReg) (OpReg W64 targetReg) (OpReg W64 tableReg),
++              -- prepare jump to target label
++              J_TBL ids (Just lbl) targetReg
++            ]
++  return code
++  where
++    -- See Note [Sub-word subtlety during jump-table indexing] in
++    -- GHC.CmmToAsm.X86.CodeGen for why we must first offset, then widen.
++    indexExpr0 = cmmOffset platform expr offset
++    -- We widen to a native-width register to sanitize the high bits
++    indexExpr =
++      CmmMachOp
++        (MO_UU_Conv expr_w (platformWordWidth platform))
++        [indexExpr0]
++    expr_w = cmmExprWidth platform expr
++    (offset, ids) = switchTargetsToTable targets
++    platform = ncgPlatform config
++
++-- | Generate jump table data (if required)
++--
++-- The idea is to emit one table entry per case. The entry is the relative
++-- address of the block to jump to (relative to the table's first entry /
++-- table's own label.) The calculation itself is done by the linker.
++generateJumpTableForInstr ::
++  NCGConfig ->
++  Instr ->
++  Maybe (NatCmmDecl RawCmmStatics Instr)
++generateJumpTableForInstr config (J_TBL ids (Just lbl) _) =
++  let jumpTable =
++        map jumpTableEntryRel ids
++        where
++          jumpTableEntryRel Nothing =
++            CmmStaticLit (CmmInt 0 (ncgWordWidth config))
++          jumpTableEntryRel (Just blockid) =
++            CmmStaticLit
++              ( CmmLabelDiffOff
++                  blockLabel
++                  lbl
++                  0
++                  (ncgWordWidth config)
++              )
++            where
++              blockLabel = blockLbl blockid
++   in Just (CmmData (Section ReadOnlyData lbl) (CmmStaticsRaw lbl jumpTable))
++generateJumpTableForInstr _ _ = Nothing
++
++-- -----------------------------------------------------------------------------
++-- Top-level of the instruction selector
++
++stmtsToInstrs ::
++  -- | Cmm Statements
++  [CmmNode O O] ->
++  -- | Resulting instruction
++  NatM InstrBlock
++stmtsToInstrs stmts = concatOL <$> mapM stmtToInstrs stmts
++
++stmtToInstrs ::
++  CmmNode e x ->
++  -- | Resulting instructions
++  NatM InstrBlock
++stmtToInstrs stmt = do
++  config <- getConfig
++  platform <- getPlatform
++  case stmt of
++    CmmUnsafeForeignCall target result_regs args ->
++      genCCall target result_regs args
++    CmmComment s -> pure (unitOL (COMMENT (ftext s)))
++    CmmTick {} -> pure nilOL
++    CmmAssign reg src
++      | isFloatType ty -> assignReg_FltCode format reg src
++      | otherwise -> assignReg_IntCode format reg src
++      where
++        ty = cmmRegType reg
++        format = cmmTypeFormat ty
++    CmmStore addr src _alignment
++      | isFloatType ty -> assignMem_FltCode format addr src
++      | otherwise -> assignMem_IntCode format addr src
++      where
++        ty = cmmExprType platform src
++        format = cmmTypeFormat ty
++    CmmBranch id -> genBranch id
++    -- We try to arrange blocks such that the likely branch is the fallthrough
++    -- in GHC.Cmm.ContFlowOpt. So we can assume the condition is likely false here.
++    CmmCondBranch arg true false _prediction ->
++      genCondBranch true false arg
++    CmmSwitch arg ids -> genSwitch config arg ids
++    CmmCall {cml_target = arg} -> genJump arg
++    CmmUnwind _regs -> pure nilOL
++    -- Intentionally not have a default case here: If anybody adds a
++    -- constructor, the compiler should force them to think about this here.
++    CmmForeignCall {} -> pprPanic "stmtToInstrs: statement should have been cps'd away" (pdoc platform stmt)
++    CmmEntry {} -> pprPanic "stmtToInstrs: statement should have been cps'd away" (pdoc platform stmt)
++
++--------------------------------------------------------------------------------
++
++-- | 'InstrBlock's are the insn sequences generated by the insn selectors.
++--
++-- They are really trees of insns to facilitate fast appending, where a
++-- left-to-right traversal yields the insns in the correct order.
++type InstrBlock =
++  OrdList Instr
++
++-- | Register's passed up the tree.
++--
++-- If the stix code forces the register to live in a pre-decided machine
++-- register, it comes out as @Fixed@; otherwise, it comes out as @Any@, and the
++-- parent can decide which register to put it in.
++data Register
++  = Fixed Format Reg InstrBlock
++  | Any Format (Reg -> InstrBlock)
++
++-- | Sometimes we need to change the Format of a register. Primarily during
++-- conversion.
++swizzleRegisterRep :: Format -> Register -> Register
++swizzleRegisterRep format' (Fixed _format reg code) = Fixed format' reg code
++swizzleRegisterRep format' (Any _format codefn) = Any format' codefn
++
++-- | Grab a `Reg` for a `CmmReg`
++--
++-- `LocalReg`s are assigned virtual registers (`RegVirtual`), `GlobalReg`s are
++-- assigned real registers (`RegReal`). It is an error if a `GlobalReg` is not a
++-- STG register.
++getRegisterReg :: Platform -> CmmReg -> Reg
++getRegisterReg _ (CmmLocal (LocalReg u pk)) =
++  RegVirtual $ mkVirtualReg u (cmmTypeFormat pk)
++getRegisterReg platform (CmmGlobal mid) =
++  case globalRegMaybe platform (globalRegUseGlobalReg mid) of
++    Just reg -> RegReal reg
++    Nothing -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid)
++
++-- -----------------------------------------------------------------------------
++-- General things for putting together code sequences
++
++-- | Compute an expression into any register
++getSomeReg :: CmmExpr -> NatM (Reg, Format, InstrBlock)
++getSomeReg expr = do
++  r <- getRegister expr
++  case r of
++    Any rep code -> do
++      newReg <- getNewRegNat rep
++      return (newReg, rep, code newReg)
++    Fixed rep reg code ->
++      return (reg, rep, code)
++
++-- | Compute an expression into any floating-point register
++--
++-- If the initial expression is not a floating-point expression, finally move
++-- the result into a floating-point register.
++getFloatReg :: (HasCallStack) => CmmExpr -> NatM (Reg, Format, InstrBlock)
++getFloatReg expr = do
++  r <- getRegister expr
++  case r of
++    Any rep code | isFloatFormat rep -> do
++      newReg <- getNewRegNat rep
++      return (newReg, rep, code newReg)
++    Any II32 code -> do
++      newReg <- getNewRegNat FF32
++      return (newReg, FF32, code newReg)
++    Any II64 code -> do
++      newReg <- getNewRegNat FF64
++      return (newReg, FF64, code newReg)
++    Any _w _code -> do
++      config <- getConfig
++      pprPanic "can't do getFloatReg on" (pdoc (ncgPlatform config) expr)
++    -- can't do much for fixed.
++    Fixed rep reg code ->
++      return (reg, rep, code)
++
++-- | Map `CmmLit` to `OpImm`
++--
++-- N.B. this is a partial function, because not all `CmmLit`s have an immediate
++-- representation.
++litToImm' :: CmmLit -> Operand
++litToImm' = OpImm . litToImm
++
++-- | Compute a `CmmExpr` into a `Register`
++getRegister :: CmmExpr -> NatM Register
++getRegister e = do
++  config <- getConfig
++  getRegister' config (ncgPlatform config) e
++
++-- | The register width to be used for an operation on the given width
++-- operand.
++opRegWidth :: Width -> Width
++opRegWidth W64 = W64
++opRegWidth W32 = W32
++opRegWidth W16 = W32
++opRegWidth W8 = W32
++opRegWidth w = pprPanic "opRegWidth" (text "Unsupported width" <+> ppr w)
++
++-- Note [Signed arithmetic on RISCV64]
++-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++-- Handling signed arithmetic on sub-word-size values on RISCV64 is a bit
++-- tricky as Cmm's type system does not capture signedness. While 32-bit values
++-- are fairly easy to handle due to RISCV64's 32-bit instruction variants
++-- (denoted by use of %wN registers), 16- and 8-bit values require quite some
++-- care.
++--
++-- We handle 16-and 8-bit values by using the 32-bit operations and
++-- sign-/zero-extending operands and truncate results as necessary. For
++-- simplicity we maintain the invariant that a register containing a
++-- sub-word-size value always contains the zero-extended form of that value
++-- in between operations.
++--
++-- For instance, consider the program,
++--
++--    test(bits64 buffer)
++--      bits8 a = bits8[buffer];
++--      bits8 b = %mul(a, 42);
++--      bits8 c = %not(b);
++--      bits8 d = %shrl(c, 4::bits8);
++--      return (d);
++--    }
++--
++-- This program begins by loading `a` from memory, for which we use a
++-- zero-extended byte-size load.  We next sign-extend `a` to 32-bits, and use a
++-- 32-bit multiplication to compute `b`, and truncate the result back down to
++-- 8-bits.
++--
++-- Next we compute `c`: The `%not` requires no extension of its operands, but
++-- we must still truncate the result back down to 8-bits. Finally the `%shrl`
++-- requires no extension and no truncate since we can assume that
++-- `c` is zero-extended.
++--
++-- The "RISC-V Sign Extension Optimizations" LLVM tech talk presentation by
++-- Craig Topper covers possible future improvements
++-- (https://llvm.org/devmtg/2022-11/slides/TechTalk21-RISC-VSignExtensionOptimizations.pdf)
++--
++--
++-- Note [Handling PIC on RV64]
++-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++-- RV64 does not have a special PIC register, the general approach is to simply
++-- do PC-relative addressing or go through the GOT. There is assembly support
++-- for both.
++--
++-- rv64 assembly has a `la` (load address) pseudo-instruction, that allows
++-- loading a label's address into a register. The instruction is desugared into
++-- different addressing modes, e.g. PC-relative addressing:
++--
++-- 1: lui  rd1, %pcrel_hi(label)
++--    addi rd1, %pcrel_lo(1b)
++--
++-- See https://sourceware.org/binutils/docs/as/RISC_002dV_002dModifiers.html,
++-- PIC can be enabled/disabled through
++--
++--  .option pic
++--
++-- See https://sourceware.org/binutils/docs/as/RISC_002dV_002dDirectives.html#RISC_002dV_002dDirectives
++--
++-- CmmGlobal @PicBaseReg@'s are generated in @GHC.CmmToAsm.PIC@ in the
++-- @cmmMakePicReference@.  This is in turn called from @cmmMakeDynamicReference@
++-- also in @Cmm.CmmToAsm.PIC@ from where it is also exported.  There are two
++-- callsites for this. One is in this module to produce the @target@ in @genCCall@
++-- the other is in @GHC.CmmToAsm@ in @cmmExprNative@.
++--
++-- Conceptually we do not want any special PicBaseReg to be used on RV64. If
++-- we want to distinguish between symbol loading, we need to address this through
++-- the way we load it, not through a register.
++--
++
++getRegister' :: NCGConfig -> Platform -> CmmExpr -> NatM Register
++-- OPTIMIZATION WARNING: CmmExpr rewrites
++-- 1. Rewrite: Reg + (-n) => Reg - n
++--    TODO: this expression shouldn't even be generated to begin with.
++getRegister' config plat (CmmMachOp (MO_Add w0) [x, CmmLit (CmmInt i w1)])
++  | i < 0 =
++      getRegister' config plat (CmmMachOp (MO_Sub w0) [x, CmmLit (CmmInt (-i) w1)])
++getRegister' config plat (CmmMachOp (MO_Sub w0) [x, CmmLit (CmmInt i w1)])
++  | i < 0 =
++      getRegister' config plat (CmmMachOp (MO_Add w0) [x, CmmLit (CmmInt (-i) w1)])
++-- Generic case.
++getRegister' config plat expr =
++  case expr of
++    CmmReg (CmmGlobal (GlobalRegUse PicBaseReg _)) ->
++      -- See Note [Handling PIC on RV64]
++      pprPanic "getRegister': There's no PIC base register on RISCV" (ppr PicBaseReg)
++    CmmLit lit ->
++      case lit of
++        CmmInt 0 w -> pure $ Fixed (intFormat w) zeroReg nilOL
++        CmmInt i w ->
++          -- narrowU is important: Negative immediates may be
++          -- sign-extended on load!
++          let imm = OpImm . ImmInteger $ narrowU w i
++           in pure (Any (intFormat w) (\dst -> unitOL $ annExpr expr (MOV (OpReg w dst) imm)))
++        CmmFloat 0 w -> do
++          let op = litToImm' lit
++          pure (Any (floatFormat w) (\dst -> unitOL $ annExpr expr (MOV (OpReg w dst) op)))
++        CmmFloat _f W8 -> pprPanic "getRegister' (CmmLit:CmmFloat), no support for bytes" (pdoc plat expr)
++        CmmFloat _f W16 -> pprPanic "getRegister' (CmmLit:CmmFloat), no support for halfs" (pdoc plat expr)
++        CmmFloat f W32 -> do
++          let word = castFloatToWord32 (fromRational f) :: Word32
++          intReg <- getNewRegNat (intFormat W32)
++          return
++            ( Any
++                (floatFormat W32)
++                ( \dst ->
++                    toOL
++                      [ annExpr expr
++                          $ MOV (OpReg W32 intReg) (OpImm (ImmInteger (fromIntegral word))),
++                        MOV (OpReg W32 dst) (OpReg W32 intReg)
++                      ]
++                )
++            )
++        CmmFloat f W64 -> do
++          let word = castDoubleToWord64 (fromRational f) :: Word64
++          intReg <- getNewRegNat (intFormat W64)
++          return
++            ( Any
++                (floatFormat W64)
++                ( \dst ->
++                    toOL
++                      [ annExpr expr
++                          $ MOV (OpReg W64 intReg) (OpImm (ImmInteger (fromIntegral word))),
++                        MOV (OpReg W64 dst) (OpReg W64 intReg)
++                      ]
++                )
++            )
++        CmmFloat _f _w -> pprPanic "getRegister' (CmmLit:CmmFloat), unsupported float lit" (pdoc plat expr)
++        CmmVec _lits -> pprPanic "getRegister' (CmmLit:CmmVec): " (pdoc plat expr)
++        CmmLabel lbl -> do
++          let op = OpImm (ImmCLbl lbl)
++              rep = cmmLitType plat lit
++              format = cmmTypeFormat rep
++          return (Any format (\dst -> unitOL $ annExpr expr (LDR format (OpReg (formatToWidth format) dst) op)))
++        CmmLabelOff lbl off | isNbitEncodeable 12 (fromIntegral off) -> do
++          let op = OpImm (ImmIndex lbl off)
++              rep = cmmLitType plat lit
++              format = cmmTypeFormat rep
++          return (Any format (\dst -> unitOL $ LDR format (OpReg (formatToWidth format) dst) op))
++        CmmLabelOff lbl off -> do
++          let op = litToImm' (CmmLabel lbl)
++              rep = cmmLitType plat lit
++              format = cmmTypeFormat rep
++              width = typeWidth rep
++          (off_r, _off_format, off_code) <- getSomeReg $ CmmLit (CmmInt (fromIntegral off) width)
++          return
++            ( Any
++                format
++                ( \dst ->
++                    off_code
++                      `snocOL` LDR format (OpReg (formatToWidth format) dst) op
++                      `snocOL` ADD (OpReg width dst) (OpReg width dst) (OpReg width off_r)
++                )
++            )
++        CmmLabelDiffOff {} -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr)
++        CmmBlock _ -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr)
++        CmmHighStackMark -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr)
++    CmmLoad mem rep _ -> do
++      let format = cmmTypeFormat rep
++          width = typeWidth rep
++      Amode addr addr_code <- getAmode plat width mem
++      case width of
++        w
++          | w <= W64 ->
++              -- Load without sign-extension. See Note [Signed arithmetic on RISCV64]
++              pure
++                ( Any
++                    format
++                    ( \dst ->
++                        addr_code
++                          `snocOL` LDRU format (OpReg width dst) (OpAddr addr)
++                    )
++                )
++        _ ->
++          pprPanic ("Width too big! Cannot load: " ++ show width) (pdoc plat expr)
++    CmmStackSlot _ _ ->
++      pprPanic "getRegister' (CmmStackSlot): " (pdoc plat expr)
++    CmmReg reg ->
++      return
++        ( Fixed
++            (cmmTypeFormat (cmmRegType reg))
++            (getRegisterReg plat reg)
++            nilOL
++        )
++    CmmRegOff reg off | isNbitEncodeable 12 (fromIntegral off) -> do
++      getRegister' config plat
++        $ CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)]
++      where
++        width = typeWidth (cmmRegType reg)
++    CmmRegOff reg off -> do
++      (off_r, _off_format, off_code) <- getSomeReg $ CmmLit (CmmInt (fromIntegral off) width)
++      (reg, _format, code) <- getSomeReg $ CmmReg reg
++      return
++        $ Any
++          (intFormat width)
++          ( \dst ->
++              off_code
++                `appOL` code
++                `snocOL` ADD (OpReg width dst) (OpReg width reg) (OpReg width off_r)
++          )
++      where
++        width = typeWidth (cmmRegType reg)
++
++    -- Handle MO_RelaxedRead as a normal CmmLoad, to allow
++    -- non-trivial addressing modes to be used.
++    CmmMachOp (MO_RelaxedRead w) [e] ->
++      getRegister (CmmLoad e (cmmBits w) NaturallyAligned)
++    -- for MachOps, see GHC.Cmm.MachOp
++    -- For CmmMachOp, see GHC.Cmm.Expr
++    CmmMachOp op [e] -> do
++      (reg, _format, code) <- getSomeReg e
++      case op of
++        MO_Not w -> return $ Any (intFormat w) $ \dst ->
++          let w' = opRegWidth w
++           in code
++                `snocOL`
++                -- pseudo instruction `not` is `xori rd, rs, -1`
++                ann (text "not") (XORI (OpReg w' dst) (OpReg w' reg) (OpImm (ImmInt (-1))))
++                `appOL` truncateReg w' w dst -- See Note [Signed arithmetic on RISCV64]
++        MO_S_Neg w -> negate code w reg
++        MO_F_Neg w ->
++          return
++            $ Any
++              (floatFormat w)
++              ( \dst ->
++                  code
++                    `snocOL` NEG (OpReg w dst) (OpReg w reg)
++              )
++        -- TODO: Can this case happen?
++        MO_SF_Conv from to | from < W32 -> do
++          -- extend to the smallest available representation
++          (reg_x, code_x) <- signExtendReg from W32 reg
++          pure
++            $ Any
++              (floatFormat to)
++              ( \dst ->
++                  code
++                    `appOL` code_x
++                    `snocOL` annExpr expr (FCVT IntToFloat (OpReg to dst) (OpReg from reg_x)) -- (Signed ConVerT Float)
++              )
++        MO_SF_Conv from to ->
++          pure
++            $ Any
++              (floatFormat to)
++              ( \dst ->
++                  code
++                    `snocOL` annExpr expr (FCVT IntToFloat (OpReg to dst) (OpReg from reg)) -- (Signed ConVerT Float)
++              )
++        MO_FS_Conv from to
++          | to < W32 ->
++              pure
++                $ Any
++                  (intFormat to)
++                  ( \dst ->
++                      code
++                        `snocOL`
++                        -- W32 is the smallest width to convert to. Decrease width afterwards.
++                        annExpr expr (FCVT FloatToInt (OpReg W32 dst) (OpReg from reg))
++                        `appOL` signExtendAdjustPrecission W32 to dst dst -- (float convert (-> zero) signed)
++                  )
++        MO_FS_Conv from to ->
++          pure
++            $ Any
++              (intFormat to)
++              ( \dst ->
++                  code
++                    `snocOL` annExpr expr (FCVT FloatToInt (OpReg to dst) (OpReg from reg))
++                    `appOL` truncateReg from to dst -- (float convert (-> zero) signed)
++              )
++        MO_UU_Conv from to
++          | from <= to ->
++              pure
++                $ Any
++                  (intFormat to)
++                  ( \dst ->
++                      code
++                        `snocOL` annExpr e (MOV (OpReg to dst) (OpReg from reg))
++                  )
++        MO_UU_Conv from to ->
++          pure
++            $ Any
++              (intFormat to)
++              ( \dst ->
++                  code
++                    `snocOL` annExpr e (MOV (OpReg from dst) (OpReg from reg))
++                    `appOL` truncateReg from to dst
++              )
++        MO_SS_Conv from to -> ss_conv from to reg code
++        MO_FF_Conv from to -> return $ Any (floatFormat to) (\dst -> code `snocOL` annExpr e (FCVT FloatToFloat (OpReg to dst) (OpReg from reg)))
++        -- Conversions
++        -- TODO: Duplication with MO_UU_Conv
++        MO_XX_Conv from to
++          | to < from ->
++              pure
++                $ Any
++                  (intFormat to)
++                  ( \dst ->
++                      code
++                        `snocOL` annExpr e (MOV (OpReg from dst) (OpReg from reg))
++                        `appOL` truncateReg from to dst
++                  )
++        MO_XX_Conv _from to -> swizzleRegisterRep (intFormat to) <$> getRegister e
++        MO_AlignmentCheck align wordWidth -> do
++          reg <- getRegister' config plat e
++          addAlignmentCheck align wordWidth reg
++        x -> pprPanic ("getRegister' (monadic CmmMachOp): " ++ show x) (pdoc plat expr)
++      where
++        -- In the case of 16- or 8-bit values we need to sign-extend to 32-bits
++        -- See Note [Signed arithmetic on RISCV64].
++        negate code w reg = do
++          let w' = opRegWidth w
++          (reg', code_sx) <- signExtendReg w w' reg
++          return $ Any (intFormat w) $ \dst ->
++            code
++              `appOL` code_sx
++              `snocOL` NEG (OpReg w' dst) (OpReg w' reg')
++              `appOL` truncateReg w' w dst
++
++        ss_conv from to reg code
++          | from < to = do
++              pure $ Any (intFormat to) $ \dst ->
++                code
++                  `appOL` signExtend from to reg dst
++                  `appOL` truncateReg from to dst
++          | from > to =
++              pure $ Any (intFormat to) $ \dst ->
++                code
++                  `appOL` toOL
++                    [ ann
++                        (text "MO_SS_Conv: narrow register signed" <+> ppr reg <+> ppr from <> text "->" <> ppr to)
++                        (SLL (OpReg to dst) (OpReg from reg) (OpImm (ImmInt shift))),
++                      -- signed right shift
++                      SRA (OpReg to dst) (OpReg to dst) (OpImm (ImmInt shift))
++                    ]
++                  `appOL` truncateReg from to dst
++          | otherwise =
++              -- No conversion necessary: Just copy.
++              pure $ Any (intFormat from) $ \dst ->
++                code `snocOL` MOV (OpReg from dst) (OpReg from reg)
++          where
++            shift = 64 - (widthInBits from - widthInBits to)
++
++    -- Dyadic machops:
++    --
++    -- The general idea is:
++    -- compute x<i> <- x
++    -- compute x<j> <- y
++    -- OP x<r>, x<i>, x<j>
++    --
++    -- TODO: for now we'll only implement the 64bit versions. And rely on the
++    --      fallthrough to alert us if things go wrong!
++    -- OPTIMIZATION WARNING: Dyadic CmmMachOp destructuring
++    -- 0. TODO This should not exist! Rewrite: Reg +- 0 -> Reg
++    CmmMachOp (MO_Add _) [expr'@(CmmReg (CmmGlobal _r)), CmmLit (CmmInt 0 _)] -> getRegister' config plat expr'
++    CmmMachOp (MO_Sub _) [expr'@(CmmReg (CmmGlobal _r)), CmmLit (CmmInt 0 _)] -> getRegister' config plat expr'
++    -- 1. Compute Reg +/- n directly.
++    --    For Add/Sub we can directly encode 12bits, or 12bits lsl #12.
++    CmmMachOp (MO_Add w) [CmmReg reg, CmmLit (CmmInt n _)]
++      | fitsIn12bitImm n -> return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (ADD (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n))))
++      where
++        -- TODO: 12bits lsl #12; e.g. lower 12 bits of n are 0; shift n >> 12, and set lsl to #12.
++        w' = formatToWidth (cmmTypeFormat (cmmRegType reg))
++        r' = getRegisterReg plat reg
++    CmmMachOp (MO_Sub w) [CmmReg reg, CmmLit (CmmInt n _)]
++      | fitsIn12bitImm n -> return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (SUB (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n))))
++      where
++        -- TODO: 12bits lsl #12; e.g. lower 12 bits of n are 0; shift n >> 12, and set lsl to #12.
++        w' = formatToWidth (cmmTypeFormat (cmmRegType reg))
++        r' = getRegisterReg plat reg
++    CmmMachOp (MO_U_Quot w) [x, y] | w == W8 || w == W16 -> do
++      (reg_x, format_x, code_x) <- getSomeReg x
++      (reg_y, format_y, code_y) <- getSomeReg y
++      return
++        $ Any
++          (intFormat w)
++          ( \dst ->
++              code_x
++                `appOL` truncateReg (formatToWidth format_x) w reg_x
++                `appOL` code_y
++                `appOL` truncateReg (formatToWidth format_y) w reg_y
++                `snocOL` annExpr expr (DIVU (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y))
++          )
++
++    -- 2. Shifts. x << n, x >> n.
++    CmmMachOp (MO_Shl w) [x, CmmLit (CmmInt n _)]
++      | w == W32,
++        0 <= n,
++        n < 32 -> do
++          (reg_x, _format_x, code_x) <- getSomeReg x
++          return
++            $ Any
++              (intFormat w)
++              ( \dst ->
++                  code_x
++                    `snocOL` annExpr expr (SLL (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n)))
++                    `appOL` truncateReg w w dst
++              )
++    CmmMachOp (MO_Shl w) [x, CmmLit (CmmInt n _)]
++      | w == W64,
++        0 <= n,
++        n < 64 -> do
++          (reg_x, _format_x, code_x) <- getSomeReg x
++          return
++            $ Any
++              (intFormat w)
++              ( \dst ->
++                  code_x
++                    `snocOL` annExpr expr (SLL (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n)))
++                    `appOL` truncateReg w w dst
++              )
++    CmmMachOp (MO_S_Shr w) [x, CmmLit (CmmInt n _)] | fitsIn12bitImm n -> do
++      (reg_x, format_x, code_x) <- getSomeReg x
++      (reg_x', code_x') <- signExtendReg (formatToWidth format_x) w reg_x
++      return
++        $ Any
++          (intFormat w)
++          ( \dst ->
++              code_x
++                `appOL` code_x'
++                `snocOL` annExpr expr (SRA (OpReg w dst) (OpReg w reg_x') (OpImm (ImmInteger n)))
++          )
++    CmmMachOp (MO_S_Shr w) [x, y] -> do
++      (reg_x, format_x, code_x) <- getSomeReg x
++      (reg_y, _format_y, code_y) <- getSomeReg y
++      (reg_x', code_x') <- signExtendReg (formatToWidth format_x) w reg_x
++      return
++        $ Any
++          (intFormat w)
++          ( \dst ->
++              code_x
++                `appOL` code_x'
++                `appOL` code_y
++                `snocOL` annExpr expr (SRA (OpReg w dst) (OpReg w reg_x') (OpReg w reg_y))
++          )
++    CmmMachOp (MO_U_Shr w) [x, CmmLit (CmmInt n _)]
++      | w == W8,
++        0 <= n,
++        n < 8 -> do
++          (reg_x, format_x, code_x) <- getSomeReg x
++          return
++            $ Any
++              (intFormat w)
++              ( \dst ->
++                  code_x
++                    `appOL` truncateReg (formatToWidth format_x) w reg_x
++                    `snocOL` annExpr expr (SRL (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n)))
++              )
++    CmmMachOp (MO_U_Shr w) [x, CmmLit (CmmInt n _)]
++      | w == W16,
++        0 <= n,
++        n < 16 -> do
++          (reg_x, format_x, code_x) <- getSomeReg x
++          return
++            $ Any
++              (intFormat w)
++              ( \dst ->
++                  code_x
++                    `appOL` truncateReg (formatToWidth format_x) w reg_x
++                    `snocOL` annExpr expr (SRL (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n)))
++              )
++    CmmMachOp (MO_U_Shr w) [x, y] | w == W8 || w == W16 -> do
++      (reg_x, format_x, code_x) <- getSomeReg x
++      (reg_y, _format_y, code_y) <- getSomeReg y
++      return
++        $ Any
++          (intFormat w)
++          ( \dst ->
++              code_x
++                `appOL` code_y
++                `appOL` truncateReg (formatToWidth format_x) w reg_x
++                `snocOL` annExpr expr (SRL (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y))
++          )
++    CmmMachOp (MO_U_Shr w) [x, CmmLit (CmmInt n _)]
++      | w == W32,
++        0 <= n,
++        n < 32 -> do
++          (reg_x, _format_x, code_x) <- getSomeReg x
++          return
++            $ Any
++              (intFormat w)
++              ( \dst ->
++                  code_x
++                    `snocOL` annExpr expr (SRL (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n)))
++              )
++    CmmMachOp (MO_U_Shr w) [x, CmmLit (CmmInt n _)]
++      | w == W64,
++        0 <= n,
++        n < 64 -> do
++          (reg_x, _format_x, code_x) <- getSomeReg x
++          return
++            $ Any
++              (intFormat w)
++              ( \dst ->
++                  code_x
++                    `snocOL` annExpr expr (SRL (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n)))
++              )
++
++    -- 3. Logic &&, ||
++    CmmMachOp (MO_And w) [CmmReg reg, CmmLit (CmmInt n _)]
++      | fitsIn12bitImm n ->
++          return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (AND (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n))))
++      where
++        w' = formatToWidth (cmmTypeFormat (cmmRegType reg))
++        r' = getRegisterReg plat reg
++    CmmMachOp (MO_Or w) [CmmReg reg, CmmLit (CmmInt n _)]
++      | fitsIn12bitImm n ->
++          return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (ORI (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n))))
++      where
++        w' = formatToWidth (cmmTypeFormat (cmmRegType reg))
++        r' = getRegisterReg plat reg
++
++    -- Generic binary case.
++    CmmMachOp op [x, y] -> do
++      let -- A "plain" operation.
++          bitOp w op = do
++            -- compute x<m> <- x
++            -- compute x<o> <- y
++            -- <OP> x<n>, x<m>, x<o>
++            (reg_x, format_x, code_x) <- getSomeReg x
++            (reg_y, format_y, code_y) <- getSomeReg y
++            massertPpr (isIntFormat format_x == isIntFormat format_y) $ text "bitOp: incompatible"
++            return
++              $ Any
++                (intFormat w)
++                ( \dst ->
++                    code_x
++                      `appOL` code_y
++                      `appOL` op (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y)
++                )
++
++          -- A (potentially signed) integer operation.
++          -- In the case of 8- and 16-bit signed arithmetic we must first
++          -- sign-extend both arguments to 32-bits.
++          -- See Note [Signed arithmetic on RISCV64].
++          intOp is_signed w op = do
++            -- compute x<m> <- x
++            -- compute x<o> <- y
++            -- <OP> x<n>, x<m>, x<o>
++            (reg_x, format_x, code_x) <- getSomeReg x
++            (reg_y, format_y, code_y) <- getSomeReg y
++            massertPpr (isIntFormat format_x && isIntFormat format_y) $ text "intOp: non-int"
++            -- This is the width of the registers on which the operation
++            -- should be performed.
++            let w' = opRegWidth w
++                signExt r
++                  | not is_signed = return (r, nilOL)
++                  | otherwise = signExtendReg w w' r
++            (reg_x_sx, code_x_sx) <- signExt reg_x
++            (reg_y_sx, code_y_sx) <- signExt reg_y
++            return $ Any (intFormat w) $ \dst ->
++              code_x
++                `appOL` code_y
++                `appOL`
++                -- sign-extend both operands
++                code_x_sx
++                `appOL` code_y_sx
++                `appOL` op (OpReg w' dst) (OpReg w' reg_x_sx) (OpReg w' reg_y_sx)
++                `appOL` truncateReg w' w dst -- truncate back to the operand's original width
++          floatOp w op = do
++            (reg_fx, format_x, code_fx) <- getFloatReg x
++            (reg_fy, format_y, code_fy) <- getFloatReg y
++            massertPpr (isFloatFormat format_x && isFloatFormat format_y) $ text "floatOp: non-float"
++            return
++              $ Any
++                (floatFormat w)
++                ( \dst ->
++                    code_fx
++                      `appOL` code_fy
++                      `appOL` op (OpReg w dst) (OpReg w reg_fx) (OpReg w reg_fy)
++                )
++
++          -- need a special one for conditionals, as they return ints
++          floatCond w op = do
++            (reg_fx, format_x, code_fx) <- getFloatReg x
++            (reg_fy, format_y, code_fy) <- getFloatReg y
++            massertPpr (isFloatFormat format_x && isFloatFormat format_y) $ text "floatCond: non-float"
++            return
++              $ Any
++                (intFormat w)
++                ( \dst ->
++                    code_fx
++                      `appOL` code_fy
++                      `appOL` op (OpReg w dst) (OpReg w reg_fx) (OpReg w reg_fy)
++                )
++
++      case op of
++        -- Integer operations
++        -- Add/Sub should only be Integer Options.
++        MO_Add w -> intOp False w (\d x y -> unitOL $ annExpr expr (ADD d x y))
++        -- TODO: Handle sub-word case
++        MO_Sub w -> intOp False w (\d x y -> unitOL $ annExpr expr (SUB d x y))
++        -- N.B. We needn't sign-extend sub-word size (in)equality comparisons
++        -- since we don't care about ordering.
++        MO_Eq w -> bitOp w (\d x y -> unitOL $ annExpr expr (CSET d x y EQ))
++        MO_Ne w -> bitOp w (\d x y -> unitOL $ annExpr expr (CSET d x y NE))
++        -- Signed multiply/divide
++        MO_Mul w -> intOp True w (\d x y -> unitOL $ annExpr expr (MUL d x y))
++        MO_S_MulMayOflo w -> do_mul_may_oflo w x y
++        MO_S_Quot w -> intOp True w (\d x y -> unitOL $ annExpr expr (DIV d x y))
++        MO_S_Rem w -> intOp True w (\d x y -> unitOL $ annExpr expr (REM d x y))
++        -- Unsigned multiply/divide
++        MO_U_Quot w -> intOp False w (\d x y -> unitOL $ annExpr expr (DIVU d x y))
++        MO_U_Rem w -> intOp False w (\d x y -> unitOL $ annExpr expr (REMU d x y))
++        -- Signed comparisons
++        MO_S_Ge w -> intOp True w (\d x y -> unitOL $ annExpr expr (CSET d x y SGE))
++        MO_S_Le w -> intOp True w (\d x y -> unitOL $ annExpr expr (CSET d x y SLE))
++        MO_S_Gt w -> intOp True w (\d x y -> unitOL $ annExpr expr (CSET d x y SGT))
++        MO_S_Lt w -> intOp True w (\d x y -> unitOL $ annExpr expr (CSET d x y SLT))
++        -- Unsigned comparisons
++        MO_U_Ge w -> intOp False w (\d x y -> unitOL $ annExpr expr (CSET d x y UGE))
++        MO_U_Le w -> intOp False w (\d x y -> unitOL $ annExpr expr (CSET d x y ULE))
++        MO_U_Gt w -> intOp False w (\d x y -> unitOL $ annExpr expr (CSET d x y UGT))
++        MO_U_Lt w -> intOp False w (\d x y -> unitOL $ annExpr expr (CSET d x y ULT))
++        -- Floating point arithmetic
++        MO_F_Add w -> floatOp w (\d x y -> unitOL $ annExpr expr (ADD d x y))
++        MO_F_Sub w -> floatOp w (\d x y -> unitOL $ annExpr expr (SUB d x y))
++        MO_F_Mul w -> floatOp w (\d x y -> unitOL $ annExpr expr (MUL d x y))
++        MO_F_Quot w -> floatOp w (\d x y -> unitOL $ annExpr expr (DIV d x y))
++        -- Floating point comparison
++        MO_F_Eq w -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET d x y EQ))
++        MO_F_Ne w -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET d x y NE))
++        MO_F_Ge w -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET d x y FGE))
++        MO_F_Le w -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET d x y FLE)) -- x <= y <=> y > x
++        MO_F_Gt w -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET d x y FGT))
++        MO_F_Lt w -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET d x y FLT)) -- x < y <=> y >= x
++
++        -- Bitwise operations
++        MO_And w -> bitOp w (\d x y -> unitOL $ annExpr expr (AND d x y))
++        MO_Or w -> bitOp w (\d x y -> unitOL $ annExpr expr (OR d x y))
++        MO_Xor w -> bitOp w (\d x y -> unitOL $ annExpr expr (XOR d x y))
++        MO_Shl w -> intOp False w (\d x y -> unitOL $ annExpr expr (SLL d x y))
++        MO_U_Shr w -> intOp False w (\d x y -> unitOL $ annExpr expr (SRL d x y))
++        MO_S_Shr w -> intOp True w (\d x y -> unitOL $ annExpr expr (SRA d x y))
++        op -> pprPanic "getRegister' (unhandled dyadic CmmMachOp): " $ pprMachOp op <+> text "in" <+> pdoc plat expr
++
++    -- Generic ternary case.
++    CmmMachOp op [x, y, z] ->
++      case op of
++        -- Floating-point fused multiply-add operations
++        --
++        -- x86 fmadd    x * y + z <=> RISCV64 fmadd : d =   r1 * r2 + r3
++        -- x86 fmsub    x * y - z <=> RISCV64 fnmsub: d =   r1 * r2 - r3
++        -- x86 fnmadd - x * y + z <=> RISCV64 fmsub : d = - r1 * r2 + r3
++        -- x86 fnmsub - x * y - z <=> RISCV64 fnmadd: d = - r1 * r2 - r3
++        MO_FMA var w -> case var of
++          FMAdd -> float3Op w (\d n m a -> unitOL $ FMA FMAdd d n m a)
++          FMSub -> float3Op w (\d n m a -> unitOL $ FMA FMSub d n m a)
++          FNMAdd -> float3Op w (\d n m a -> unitOL $ FMA FNMSub d n m a)
++          FNMSub -> float3Op w (\d n m a -> unitOL $ FMA FNMAdd d n m a)
++        _ ->
++          pprPanic "getRegister' (unhandled ternary CmmMachOp): "
++            $ pprMachOp op
++            <+> text "in"
++            <+> pdoc plat expr
++      where
++        float3Op w op = do
++          (reg_fx, format_x, code_fx) <- getFloatReg x
++          (reg_fy, format_y, code_fy) <- getFloatReg y
++          (reg_fz, format_z, code_fz) <- getFloatReg z
++          massertPpr (isFloatFormat format_x && isFloatFormat format_y && isFloatFormat format_z)
++            $ text "float3Op: non-float"
++          pure
++            $ Any (floatFormat w)
++            $ \dst ->
++              code_fx
++                `appOL` code_fy
++                `appOL` code_fz
++                `appOL` op (OpReg w dst) (OpReg w reg_fx) (OpReg w reg_fy) (OpReg w reg_fz)
++    CmmMachOp _op _xs ->
++      pprPanic "getRegister' (variadic CmmMachOp): " (pdoc plat expr)
++  where
++    isNbitEncodeable :: Int -> Integer -> Bool
++    isNbitEncodeable n i = let shift = n - 1 in (-1 `shiftL` shift) <= i && i < (1 `shiftL` shift)
++    -- N.B. MUL does not set the overflow flag.
++    -- Return 0 when the operation cannot overflow, /= 0 otherwise
++    do_mul_may_oflo :: Width -> CmmExpr -> CmmExpr -> NatM Register
++    do_mul_may_oflo w _x _y | w > W64 = pprPanic "Cannot multiply larger than 64bit" (ppr w)
++    do_mul_may_oflo w@W64 x y = do
++      (reg_x, format_x, code_x) <- getSomeReg x
++      (reg_y, format_y, code_y) <- getSomeReg y
++      -- TODO: Can't we clobber reg_x and reg_y to save registers?
++      lo <- getNewRegNat II64
++      hi <- getNewRegNat II64
++      -- TODO: Overhaul CSET: 3rd operand isn't needed for SNEZ
++      let nonSense = OpImm (ImmInt 0)
++      pure
++        $ Any
++          (intFormat w)
++          ( \dst ->
++              code_x
++                `appOL` signExtend (formatToWidth format_x) W64 reg_x reg_x
++                `appOL` code_y
++                `appOL` signExtend (formatToWidth format_y) W64 reg_y reg_y
++                `appOL` toOL
++                  [ annExpr expr (MULH (OpReg w hi) (OpReg w reg_x) (OpReg w reg_y)),
++                    MUL (OpReg w lo) (OpReg w reg_x) (OpReg w reg_y),
++                    SRA (OpReg w lo) (OpReg w lo) (OpImm (ImmInt (widthInBits W64 - 1))),
++                    ann
++                      (text "Set flag if result of MULH contains more than sign bits.")
++                      (XOR (OpReg w hi) (OpReg w hi) (OpReg w lo)),
++                    CSET (OpReg w dst) (OpReg w hi) nonSense NE
++                  ]
++          )
++    do_mul_may_oflo w x y = do
++      (reg_x, format_x, code_x) <- getSomeReg x
++      (reg_y, format_y, code_y) <- getSomeReg y
++      let width_x = formatToWidth format_x
++          width_y = formatToWidth format_y
++      if w > width_x && w > width_y
++        then
++          pure
++            $ Any
++              (intFormat w)
++              ( \dst ->
++                  -- 8bit * 8bit cannot overflow 16bit
++                  -- 16bit * 16bit cannot overflow 32bit
++                  -- 32bit * 32bit cannot overflow 64bit
++                  unitOL $ annExpr expr (ADD (OpReg w dst) zero (OpImm (ImmInt 0)))
++              )
++        else do
++          let use32BitMul = w <= W32 && width_x <= W32 && width_y <= W32
++              nonSense = OpImm (ImmInt 0)
++          if use32BitMul
++            then do
++              narrowedReg <- getNewRegNat II64
++              pure
++                $ Any
++                  (intFormat w)
++                  ( \dst ->
++                      code_x
++                        `appOL` signExtend (formatToWidth format_x) W32 reg_x reg_x
++                        `appOL` code_y
++                        `appOL` signExtend (formatToWidth format_y) W32 reg_y reg_y
++                        `snocOL` annExpr expr (MUL (OpReg W32 dst) (OpReg W32 reg_x) (OpReg W32 reg_y))
++                        `appOL` signExtendAdjustPrecission W32 w dst narrowedReg
++                        `appOL` toOL
++                          [ ann
++                              (text "Check if the multiplied value fits in the narrowed register")
++                              (SUB (OpReg w dst) (OpReg w dst) (OpReg w narrowedReg)),
++                            CSET (OpReg w dst) (OpReg w dst) nonSense NE
++                          ]
++                  )
++            else
++              pure
++                $ Any
++                  (intFormat w)
++                  ( \dst ->
++                      -- Do not handle this unlikely case. Just tell that it may overflow.
++                      unitOL $ annExpr expr (ADD (OpReg w dst) zero (OpImm (ImmInt 1)))
++                  )
++
++-- | Instructions to sign-extend the value in the given register from width @w@
++-- up to width @w'@.
++signExtendReg :: Width -> Width -> Reg -> NatM (Reg, OrdList Instr)
++signExtendReg w _w' r | w == W64 = pure (r, nilOL)
++signExtendReg w w' r = do
++  r' <- getNewRegNat (intFormat w')
++  let instrs = signExtend w w' r r'
++  pure (r', instrs)
++
++-- | Sign extends to 64bit, if needed
++--
++-- Source `Reg` @r@ stays untouched, while the conversion happens on destination
++-- `Reg` @r'@.
++signExtend :: Width -> Width -> Reg -> Reg -> OrdList Instr
++signExtend w w' _r _r' | w > w' = pprPanic "This is not a sign extension, but a truncation." $ ppr w <> text "->" <+> ppr w'
++signExtend w w' _r _r' | w > W64 || w' > W64 = pprPanic "Unexpected width (max is 64bit):" $ ppr w <> text "->" <+> ppr w'
++signExtend w w' r r' | w == W64 && w' == W64 && r == r' = nilOL
++signExtend w w' r r' | w == W64 && w' == W64 = unitOL $ MOV (OpReg w' r') (OpReg w r)
++signExtend w w' r r'
++  | w == W32 && w' == W64 =
++      unitOL
++        $ ann
++          (text "sign-extend register (SEXT.W)" <+> ppr r <+> ppr w <> text "->" <> ppr w')
++          -- `ADDIW r r 0` is the pseudo-op SEXT.W
++          (ADD (OpReg w' r') (OpReg w r) (OpImm (ImmInt 0)))
++signExtend w w' r r' =
++  toOL
++    [ ann
++        (text "narrow register signed" <+> ppr r <> char ':' <> ppr w <> text "->" <> ppr r <> char ':' <> ppr w')
++        (SLL (OpReg w' r') (OpReg w r) (OpImm (ImmInt shift))),
++      -- signed (arithmetic) right shift
++      SRA (OpReg w' r') (OpReg w' r') (OpImm (ImmInt shift))
++    ]
++  where
++    shift = 64 - widthInBits w
++
++-- | Sign extends to 64bit, if needed and reduces the precission to the target `Width` (@w'@)
++--
++-- Source `Reg` @r@ stays untouched, while the conversion happens on destination
++-- `Reg` @r'@.
++signExtendAdjustPrecission :: Width -> Width -> Reg -> Reg -> OrdList Instr
++signExtendAdjustPrecission w w' _r _r' | w > W64 || w' > W64 = pprPanic "Unexpected width (max is 64bit):" $ ppr w <> text "->" <+> ppr w'
++signExtendAdjustPrecission w w' r r' | w == W64 && w' == W64 && r == r' = nilOL
++signExtendAdjustPrecission w w' r r' | w == W64 && w' == W64 = unitOL $ MOV (OpReg w' r') (OpReg w r)
++signExtendAdjustPrecission w w' r r'
++  | w == W32 && w' == W64 =
++      unitOL
++        $ ann
++          (text "sign-extend register (SEXT.W)" <+> ppr r <+> ppr w <> text "->" <> ppr w')
++          -- `ADDIW r r 0` is the pseudo-op SEXT.W
++          (ADD (OpReg w' r') (OpReg w r) (OpImm (ImmInt 0)))
++signExtendAdjustPrecission w w' r r'
++  | w > w' =
++      toOL
++        [ ann
++            (text "narrow register signed" <+> ppr r <> char ':' <> ppr w <> text "->" <> ppr r <> char ':' <> ppr w')
++            (SLL (OpReg w' r') (OpReg w r) (OpImm (ImmInt shift))),
++          -- signed (arithmetic) right shift
++          SRA (OpReg w' r') (OpReg w' r') (OpImm (ImmInt shift))
++        ]
++  where
++    shift = 64 - widthInBits w'
++signExtendAdjustPrecission w w' r r' =
++  toOL
++    [ ann
++        (text "sign extend register" <+> ppr r <> char ':' <> ppr w <> text "->" <> ppr r <> char ':' <> ppr w')
++        (SLL (OpReg w' r') (OpReg w r) (OpImm (ImmInt shift))),
++      -- signed (arithmetic) right shift
++      SRA (OpReg w' r') (OpReg w' r') (OpImm (ImmInt shift))
++    ]
++  where
++    shift = 64 - widthInBits w
++
++-- | Instructions to truncate the value in the given register from width @w@
++-- to width @w'@.
++--
++-- In other words, it just cuts the width out of the register. N.B.: This
++-- ignores signedness (no sign extension takes place)!
++truncateReg :: Width -> Width -> Reg -> OrdList Instr
++truncateReg _w w' _r | w' == W64 = nilOL
++truncateReg _w w' r | w' > W64 = pprPanic "Cannot truncate to width bigger than register size (max is 64bit):" $ text (show r) <> char ':' <+> ppr w'
++truncateReg w _w' r | w > W64 = pprPanic "Unexpected register size (max is 64bit):" $ text (show r) <> char ':' <+> ppr w
++truncateReg w w' r =
++  toOL
++    [ ann
++        (text "truncate register" <+> ppr r <+> ppr w <> text "->" <> ppr w')
++        (SLL (OpReg w' r) (OpReg w r) (OpImm (ImmInt shift))),
++      -- SHL ignores signedness!
++      SRL (OpReg w' r) (OpReg w r) (OpImm (ImmInt shift))
++    ]
++  where
++    shift = 64 - widthInBits w'
++
++-- | Given a 'Register', produce a new 'Register' with an instruction block
++-- which will check the value for alignment. Used for @-falignment-sanitisation@.
++addAlignmentCheck :: Int -> Width -> Register -> NatM Register
++addAlignmentCheck align wordWidth reg = do
++  jumpReg <- getNewRegNat II64
++  cmpReg <- getNewRegNat II64
++  okayLblId <- getBlockIdNat
++
++  pure $ case reg of
++    Fixed fmt reg code -> Fixed fmt reg (code `appOL` check fmt jumpReg cmpReg okayLblId reg)
++    Any fmt f -> Any fmt (\reg -> f reg `appOL` check fmt jumpReg cmpReg okayLblId reg)
++  where
++    check :: Format -> Reg -> Reg -> BlockId -> Reg -> InstrBlock
++    check fmt jumpReg cmpReg okayLblId reg =
++      let width = formatToWidth fmt
++       in assert (not $ isFloatFormat fmt)
++            $ toOL
++              [ ann
++                  (text "Alignment check - alignment: " <> int align <> text ", word width: " <> text (show wordWidth))
++                  (AND (OpReg width cmpReg) (OpReg width reg) (OpImm $ ImmInt $ align - 1)),
++                BCOND EQ (OpReg width cmpReg) zero (TBlock okayLblId),
++                COMMENT (text "Alignment check failed"),
++                LDR II64 (OpReg W64 jumpReg) (OpImm $ ImmCLbl mkBadAlignmentLabel),
++                B (TReg jumpReg),
++                NEWBLOCK okayLblId
++              ]
++
++-- -----------------------------------------------------------------------------
++--  The 'Amode' type: Memory addressing modes passed up the tree.
++data Amode = Amode AddrMode InstrBlock
++
++-- | Provide the value of a `CmmExpr` with an `Amode`
++--
++-- N.B. this function should be used to provide operands to load and store
++-- instructions with signed 12bit wide immediates (S & I types). For other
++-- immediate sizes and formats (e.g. B type uses multiples of 2) this function
++-- would need to be adjusted.
++getAmode ::
++  Platform ->
++  -- | width of loaded value
++  Width ->
++  CmmExpr ->
++  NatM Amode
++-- TODO: Specialize stuff we can destructure here.
++
++-- LDR/STR: Immediate can be represented with 12bits
++getAmode platform w (CmmRegOff reg off)
++  | w <= W64,
++    fitsIn12bitImm off =
++      return $ Amode (AddrRegImm reg' off') nilOL
++  where
++    reg' = getRegisterReg platform reg
++    off' = ImmInt off
++
++-- For Stores we often see something like this:
++-- CmmStore (CmmMachOp (MO_Add w) [CmmLoad expr, CmmLit (CmmInt n w')]) (expr2)
++-- E.g. a CmmStoreOff really. This can be translated to `str $expr2, [$expr, #n ]
++-- for `n` in range.
++getAmode _platform _ (CmmMachOp (MO_Add _w) [expr, CmmLit (CmmInt off _w')])
++  | fitsIn12bitImm off =
++      do
++        (reg, _format, code) <- getSomeReg expr
++        return $ Amode (AddrRegImm reg (ImmInteger off)) code
++getAmode _platform _ (CmmMachOp (MO_Sub _w) [expr, CmmLit (CmmInt off _w')])
++  | fitsIn12bitImm (-off) =
++      do
++        (reg, _format, code) <- getSomeReg expr
++        return $ Amode (AddrRegImm reg (ImmInteger (-off))) code
++
++-- Generic case
++getAmode _platform _ expr =
++  do
++    (reg, _format, code) <- getSomeReg expr
++    return $ Amode (AddrReg reg) code
++
++-- -----------------------------------------------------------------------------
++-- Generating assignments
++
++-- Assignments are really at the heart of the whole code generation
++-- business.  Almost all top-level nodes of any real importance are
++-- assignments, which correspond to loads, stores, or register
++-- transfers.  If we're really lucky, some of the register transfers
++-- will go away, because we can use the destination register to
++-- complete the code generation for the right hand side.  This only
++-- fails when the right hand side is forced into a fixed register
++-- (e.g. the result of a call).
++
++assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
++assignReg_IntCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
++assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
++assignReg_FltCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
++assignMem_IntCode rep addrE srcE =
++  do
++    (src_reg, _format, code) <- getSomeReg srcE
++    platform <- getPlatform
++    let w = formatToWidth rep
++    Amode addr addr_code <- getAmode platform w addrE
++    return $ COMMENT (text "CmmStore" <+> parens (text (show addrE)) <+> parens (text (show srcE)))
++      `consOL` ( code
++                   `appOL` addr_code
++                   `snocOL` STR rep (OpReg w src_reg) (OpAddr addr)
++               )
++
++assignReg_IntCode _ reg src =
++  do
++    platform <- getPlatform
++    let dst = getRegisterReg platform reg
++    r <- getRegister src
++    return $ case r of
++      Any _ code ->
++        COMMENT (text "CmmAssign" <+> parens (text (show reg)) <+> parens (text (show src)))
++          `consOL` code dst
++      Fixed format freg fcode ->
++        COMMENT (text "CmmAssign" <+> parens (text (show reg)) <+> parens (text (show src)))
++          `consOL` ( fcode
++                       `snocOL` MOV (OpReg (formatToWidth format) dst) (OpReg (formatToWidth format) freg)
++                   )
++
++-- Let's treat Floating point stuff
++-- as integer code for now. Opaque.
++assignMem_FltCode = assignMem_IntCode
++
++assignReg_FltCode = assignReg_IntCode
++
++-- -----------------------------------------------------------------------------
++-- Jumps
++-- AArch64 has 26bits for targets, whereas RiscV only has 20.
++-- Thus we need to distinguish between far (outside of the)
++-- current compilation unit. And regular branches.
++-- RiscV has ±2MB of displacement, whereas AArch64 has ±128MB.
++-- Thus for most branches we can get away with encoding it
++-- directly in the instruction rather than always loading the
++-- address into a register and then using that to jump.
++-- Under the assumption that our linked build product is less than
++-- ~2*128MB of TEXT, and there are no jump that span the whole
++-- TEXT segment.
++-- Something where riscv's compressed instruction might come in
++-- handy.
++genJump :: CmmExpr {-the branch target-} -> NatM InstrBlock
++genJump expr = do
++  (target, _format, code) <- getSomeReg expr
++  return (code `appOL` unitOL (annExpr expr (B (TReg target))))
++
++-- -----------------------------------------------------------------------------
++--  Unconditional branches
++genBranch :: BlockId -> NatM InstrBlock
++genBranch = return . toOL . mkJumpInstr
++
++-- -----------------------------------------------------------------------------
++-- Conditional branches
++genCondJump ::
++  BlockId ->
++  CmmExpr ->
++  NatM InstrBlock
++genCondJump bid expr = do
++  case expr of
++    -- Optimized == 0 case.
++    CmmMachOp (MO_Eq w) [x, CmmLit (CmmInt 0 _)] -> do
++      (reg_x, _format_x, code_x) <- getSomeReg x
++      return $ code_x `snocOL` annExpr expr (BCOND EQ zero (OpReg w reg_x) (TBlock bid))
++
++    -- Optimized /= 0 case.
++    CmmMachOp (MO_Ne w) [x, CmmLit (CmmInt 0 _)] -> do
++      (reg_x, _format_x, code_x) <- getSomeReg x
++      return $ code_x `snocOL` annExpr expr (BCOND NE zero (OpReg w reg_x) (TBlock bid))
++
++    -- Generic case.
++    CmmMachOp mop [x, y] -> do
++      let ubcond w cmp = do
++            -- compute both sides.
++            (reg_x, format_x, code_x) <- getSomeReg x
++            (reg_y, format_y, code_y) <- getSomeReg y
++            let x' = OpReg w reg_x
++                y' = OpReg w reg_y
++            return $ case w of
++              w
++                | w == W8 || w == W16 ->
++                    code_x
++                      `appOL` truncateReg (formatToWidth format_x) w reg_x
++                      `appOL` code_y
++                      `appOL` truncateReg (formatToWidth format_y) w reg_y
++                      `appOL` code_y
++                      `snocOL` annExpr expr (BCOND cmp x' y' (TBlock bid))
++              _ ->
++                code_x
++                  `appOL` code_y
++                  `snocOL` annExpr expr (BCOND cmp x' y' (TBlock bid))
++
++          sbcond w cmp = do
++            -- compute both sides.
++            (reg_x, format_x, code_x) <- getSomeReg x
++            (reg_y, format_y, code_y) <- getSomeReg y
++            let x' = OpReg w reg_x
++                y' = OpReg w reg_y
++            return $ case w of
++              w
++                | w `elem` [W8, W16, W32] ->
++                    code_x
++                      `appOL` signExtend (formatToWidth format_x) W64 reg_x reg_x
++                      `appOL` code_y
++                      `appOL` signExtend (formatToWidth format_y) W64 reg_y reg_y
++                      `appOL` unitOL (annExpr expr (BCOND cmp x' y' (TBlock bid)))
++              _ -> code_x `appOL` code_y `appOL` unitOL (annExpr expr (BCOND cmp x' y' (TBlock bid)))
++
++          fbcond w cmp = do
++            -- ensure we get float regs
++            (reg_fx, _format_fx, code_fx) <- getFloatReg x
++            (reg_fy, _format_fy, code_fy) <- getFloatReg y
++            condOpReg <- OpReg W64 <$> getNewRegNat II64
++            oneReg <- getNewRegNat II64
++            return $ code_fx
++              `appOL` code_fy
++              `snocOL` annExpr expr (CSET condOpReg (OpReg w reg_fx) (OpReg w reg_fy) cmp)
++              `snocOL` MOV (OpReg W64 oneReg) (OpImm (ImmInt 1))
++              `snocOL` BCOND EQ condOpReg (OpReg w oneReg) (TBlock bid)
++
++      case mop of
++        MO_F_Eq w -> fbcond w EQ
++        MO_F_Ne w -> fbcond w NE
++        MO_F_Gt w -> fbcond w FGT
++        MO_F_Ge w -> fbcond w FGE
++        MO_F_Lt w -> fbcond w FLT
++        MO_F_Le w -> fbcond w FLE
++        MO_Eq w -> sbcond w EQ
++        MO_Ne w -> sbcond w NE
++        MO_S_Gt w -> sbcond w SGT
++        MO_S_Ge w -> sbcond w SGE
++        MO_S_Lt w -> sbcond w SLT
++        MO_S_Le w -> sbcond w SLE
++        MO_U_Gt w -> ubcond w UGT
++        MO_U_Ge w -> ubcond w UGE
++        MO_U_Lt w -> ubcond w ULT
++        MO_U_Le w -> ubcond w ULE
++        _ -> pprPanic "RV64.genCondJump:case mop: " (text $ show expr)
++    _ -> pprPanic "RV64.genCondJump: " (text $ show expr)
++
++-- | Generate conditional branching instructions
++--
++-- This is basically an "if with else" statement.
++genCondBranch ::
++  -- | the true branch target
++  BlockId ->
++  -- | the false branch target
++  BlockId ->
++  -- | the condition on which to branch
++  CmmExpr ->
++  -- | Instructions
++  NatM InstrBlock
++genCondBranch true false expr =
++  appOL
++    <$> genCondJump true expr
++    <*> genBranch false
++
++-- -----------------------------------------------------------------------------
++--  Generating C calls
++
++-- | Generate a call to a C function.
++--
++-- - Integer values are passed in GP registers a0-a7.
++-- - Floating point values are passed in FP registers fa0-fa7.
++-- - If there are no free floating point registers, the FP values are passed in GP registers.
++-- - If all GP registers are taken, the values are spilled as whole words (!) onto the stack.
++-- - For integers/words, the return value is in a0.
++-- - The return value is in fa0 if the return type is a floating point value.
++genCCall ::
++  ForeignTarget -> -- function to call
++  [CmmFormal] -> -- where to put the result
++  [CmmActual] -> -- arguments (of mixed type)
++  NatM InstrBlock
++-- TODO: Specialize where we can.
++-- Generic impl
++genCCall target@(ForeignTarget expr _cconv) dest_regs arg_regs = do
++  -- we want to pass arg_regs into allArgRegs
++  -- The target :: ForeignTarget call can either
++  -- be a foreign procedure with an address expr
++  -- and a calling convention.
++  (call_target_reg, call_target_code) <-
++    -- Compute the address of the call target into a register. This
++    -- addressing enables us to jump through the whole address space
++    -- without further ado. PC-relative addressing would involve
++    -- instructions to do similar, though.
++    do
++      (reg, _format, reg_code) <- getSomeReg expr
++      pure (reg, reg_code)
++  -- compute the code and register logic for all arg_regs.
++  -- this will give us the format information to match on.
++  arg_regs' <- mapM getSomeReg arg_regs
++
++  -- Now this is stupid.  Our Cmm expressions doesn't carry the proper sizes
++  -- so while in Cmm we might get W64 incorrectly for an int, that is W32 in
++  -- STG; this then breaks packing of stack arguments, if we need to pack
++  -- for the pcs, e.g. darwinpcs.  Option one would be to fix the Int type
++  -- in Cmm proper. Option two, which we choose here is to use extended Hint
++  -- information to contain the size information and use that when packing
++  -- arguments, spilled onto the stack.
++  let (_res_hints, arg_hints) = foreignTargetHints target
++      arg_regs'' = zipWith (\(r, f, c) h -> (r, f, h, c)) arg_regs' arg_hints
++
++  (stackSpaceWords, passRegs, passArgumentsCode) <- passArguments allGpArgRegs allFpArgRegs arg_regs'' 0 [] nilOL
++
++  readResultsCode <- readResults allGpArgRegs allFpArgRegs dest_regs [] nilOL
++
++  let moveStackDown 0 =
++        toOL
++          [ PUSH_STACK_FRAME,
++            DELTA (-16)
++          ]
++      moveStackDown i | odd i = moveStackDown (i + 1)
++      moveStackDown i =
++        toOL
++          [ PUSH_STACK_FRAME,
++            SUB (OpReg W64 spMachReg) (OpReg W64 spMachReg) (OpImm (ImmInt (8 * i))),
++            DELTA (-8 * i - 16)
++          ]
++      moveStackUp 0 =
++        toOL
++          [ POP_STACK_FRAME,
++            DELTA 0
++          ]
++      moveStackUp i | odd i = moveStackUp (i + 1)
++      moveStackUp i =
++        toOL
++          [ ADD (OpReg W64 spMachReg) (OpReg W64 spMachReg) (OpImm (ImmInt (8 * i))),
++            POP_STACK_FRAME,
++            DELTA 0
++          ]
++
++  let code =
++        call_target_code -- compute the label (possibly into a register)
++          `appOL` moveStackDown stackSpaceWords
++          `appOL` passArgumentsCode -- put the arguments into x0, ...
++          `snocOL` BL call_target_reg passRegs -- branch and link (C calls aren't tail calls, but return)
++          `appOL` readResultsCode -- parse the results into registers
++          `appOL` moveStackUp stackSpaceWords
++  return code
++  where
++    -- Implementiation of the RISCV ABI calling convention.
++    -- https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/948463cd5dbebea7c1869e20146b17a2cc8fda2f/riscv-cc.adoc#integer-calling-convention
++    passArguments :: [Reg] -> [Reg] -> [(Reg, Format, ForeignHint, InstrBlock)] -> Int -> [Reg] -> InstrBlock -> NatM (Int, [Reg], InstrBlock)
++    -- Base case: no more arguments to pass (left)
++    passArguments _ _ [] stackSpaceWords accumRegs accumCode = return (stackSpaceWords, accumRegs, accumCode)
++    -- Still have GP regs, and we want to pass an GP argument.
++    passArguments (gpReg : gpRegs) fpRegs ((r, format, hint, code_r) : args) stackSpaceWords accumRegs accumCode | isIntFormat format = do
++      -- RISCV64 Integer Calling Convention: "When passed in registers or on the
++      -- stack, integer scalars narrower than XLEN bits are widened according to
++      -- the sign of their type up to 32 bits, then sign-extended to XLEN bits."
++      let w = formatToWidth format
++          assignArg =
++            if hint == SignedHint
++              then
++                COMMENT (text "Pass gp argument sign-extended (SignedHint): " <> ppr r)
++                  `consOL` signExtend w W64 r gpReg
++              else
++                toOL
++                  [ COMMENT (text "Pass gp argument sign-extended (SignedHint): " <> ppr r),
++                    MOV (OpReg w gpReg) (OpReg w r)
++                  ]
++          accumCode' =
++            accumCode
++              `appOL` code_r
++              `appOL` assignArg
++      passArguments gpRegs fpRegs args stackSpaceWords (gpReg : accumRegs) accumCode'
++
++    -- Still have FP regs, and we want to pass an FP argument.
++    passArguments gpRegs (fpReg : fpRegs) ((r, format, _hint, code_r) : args) stackSpaceWords accumRegs accumCode | isFloatFormat format = do
++      let w = formatToWidth format
++          mov = MOV (OpReg w fpReg) (OpReg w r)
++          accumCode' =
++            accumCode
++              `appOL` code_r
++              `snocOL` ann (text "Pass fp argument: " <> ppr r) mov
++      passArguments gpRegs fpRegs args stackSpaceWords (fpReg : accumRegs) accumCode'
++
++    -- No mor regs left to pass. Must pass on stack.
++    passArguments [] [] ((r, format, hint, code_r) : args) stackSpaceWords accumRegs accumCode = do
++      let w = formatToWidth format
++          spOffet = 8 * stackSpaceWords
++          str = STR format (OpReg w r) (OpAddr (AddrRegImm spMachReg (ImmInt spOffet)))
++          stackCode =
++            if hint == SignedHint
++              then
++                code_r
++                  `appOL` signExtend w W64 r tmpReg
++                  `snocOL` ann (text "Pass signed argument (size " <> ppr w <> text ") on the stack: " <> ppr tmpReg) str
++              else
++                code_r
++                  `snocOL` ann (text "Pass unsigned argument (size " <> ppr w <> text ") on the stack: " <> ppr r) str
++      passArguments [] [] args (stackSpaceWords + 1) accumRegs (stackCode `appOL` accumCode)
++
++    -- Still have fpRegs left, but want to pass a GP argument. Must be passed on the stack then.
++    passArguments [] fpRegs ((r, format, _hint, code_r) : args) stackSpaceWords accumRegs accumCode | isIntFormat format = do
++      let w = formatToWidth format
++          spOffet = 8 * stackSpaceWords
++          str = STR format (OpReg w r) (OpAddr (AddrRegImm spMachReg (ImmInt spOffet)))
++          stackCode =
++            code_r
++              `snocOL` ann (text "Pass argument (size " <> ppr w <> text ") on the stack: " <> ppr r) str
++      passArguments [] fpRegs args (stackSpaceWords + 1) accumRegs (stackCode `appOL` accumCode)
++
++    -- Still have gpRegs left, but want to pass a FP argument. Must be passed in gpReg then.
++    passArguments (gpReg : gpRegs) [] ((r, format, _hint, code_r) : args) stackSpaceWords accumRegs accumCode | isFloatFormat format = do
++      let w = formatToWidth format
++          mov = MOV (OpReg w gpReg) (OpReg w r)
++          accumCode' =
++            accumCode
++              `appOL` code_r
++              `snocOL` ann (text "Pass fp argument in gpReg: " <> ppr r) mov
++      passArguments gpRegs [] args stackSpaceWords (gpReg : accumRegs) accumCode'
++    passArguments _ _ _ _ _ _ = pprPanic "passArguments" (text "invalid state")
++
++    readResults :: [Reg] -> [Reg] -> [LocalReg] -> [Reg] -> InstrBlock -> NatM InstrBlock
++    readResults _ _ [] _ accumCode = return accumCode
++    readResults [] _ _ _ _ = do
++      platform <- getPlatform
++      pprPanic "genCCall, out of gp registers when reading results" (pdoc platform target)
++    readResults _ [] _ _ _ = do
++      platform <- getPlatform
++      pprPanic "genCCall, out of fp registers when reading results" (pdoc platform target)
++    readResults (gpReg : gpRegs) (fpReg : fpRegs) (dst : dsts) accumRegs accumCode = do
++      -- gp/fp reg -> dst
++      platform <- getPlatform
++      let rep = cmmRegType (CmmLocal dst)
++          format = cmmTypeFormat rep
++          w = cmmRegWidth (CmmLocal dst)
++          r_dst = getRegisterReg platform (CmmLocal dst)
++      if isFloatFormat format
++        then readResults (gpReg : gpRegs) fpRegs dsts (fpReg : accumRegs) (accumCode `snocOL` MOV (OpReg w r_dst) (OpReg w fpReg))
++        else
++          readResults gpRegs (fpReg : fpRegs) dsts (gpReg : accumRegs)
++            $ accumCode
++            `snocOL` MOV (OpReg w r_dst) (OpReg w gpReg)
++            `appOL`
++            -- truncate, otherwise an unexpectedly big value might be used in upfollowing calculations
++            truncateReg W64 w r_dst
++genCCall (PrimTarget mop) dest_regs arg_regs = do
++  case mop of
++    MO_F32_Fabs
++      | [arg_reg] <- arg_regs,
++        [dest_reg] <- dest_regs ->
++          unaryFloatOp W32 (\d x -> unitOL $ FABS d x) arg_reg dest_reg
++    MO_F64_Fabs
++      | [arg_reg] <- arg_regs,
++        [dest_reg] <- dest_regs ->
++          unaryFloatOp W64 (\d x -> unitOL $ FABS d x) arg_reg dest_reg
++    -- 64 bit float ops
++    MO_F64_Pwr -> mkCCall "pow"
++    MO_F64_Sin -> mkCCall "sin"
++    MO_F64_Cos -> mkCCall "cos"
++    MO_F64_Tan -> mkCCall "tan"
++    MO_F64_Sinh -> mkCCall "sinh"
++    MO_F64_Cosh -> mkCCall "cosh"
++    MO_F64_Tanh -> mkCCall "tanh"
++    MO_F64_Asin -> mkCCall "asin"
++    MO_F64_Acos -> mkCCall "acos"
++    MO_F64_Atan -> mkCCall "atan"
++    MO_F64_Asinh -> mkCCall "asinh"
++    MO_F64_Acosh -> mkCCall "acosh"
++    MO_F64_Atanh -> mkCCall "atanh"
++    MO_F64_Log -> mkCCall "log"
++    MO_F64_Log1P -> mkCCall "log1p"
++    MO_F64_Exp -> mkCCall "exp"
++    MO_F64_ExpM1 -> mkCCall "expm1"
++    MO_F64_Fabs -> mkCCall "fabs"
++    MO_F64_Sqrt -> mkCCall "sqrt"
++    -- 32 bit float ops
++    MO_F32_Pwr -> mkCCall "powf"
++    MO_F32_Sin -> mkCCall "sinf"
++    MO_F32_Cos -> mkCCall "cosf"
++    MO_F32_Tan -> mkCCall "tanf"
++    MO_F32_Sinh -> mkCCall "sinhf"
++    MO_F32_Cosh -> mkCCall "coshf"
++    MO_F32_Tanh -> mkCCall "tanhf"
++    MO_F32_Asin -> mkCCall "asinf"
++    MO_F32_Acos -> mkCCall "acosf"
++    MO_F32_Atan -> mkCCall "atanf"
++    MO_F32_Asinh -> mkCCall "asinhf"
++    MO_F32_Acosh -> mkCCall "acoshf"
++    MO_F32_Atanh -> mkCCall "atanhf"
++    MO_F32_Log -> mkCCall "logf"
++    MO_F32_Log1P -> mkCCall "log1pf"
++    MO_F32_Exp -> mkCCall "expf"
++    MO_F32_ExpM1 -> mkCCall "expm1f"
++    MO_F32_Fabs -> mkCCall "fabsf"
++    MO_F32_Sqrt -> mkCCall "sqrtf"
++    -- 64-bit primops
++    MO_I64_ToI -> mkCCall "hs_int64ToInt"
++    MO_I64_FromI -> mkCCall "hs_intToInt64"
++    MO_W64_ToW -> mkCCall "hs_word64ToWord"
++    MO_W64_FromW -> mkCCall "hs_wordToWord64"
++    MO_x64_Neg -> mkCCall "hs_neg64"
++    MO_x64_Add -> mkCCall "hs_add64"
++    MO_x64_Sub -> mkCCall "hs_sub64"
++    MO_x64_Mul -> mkCCall "hs_mul64"
++    MO_I64_Quot -> mkCCall "hs_quotInt64"
++    MO_I64_Rem -> mkCCall "hs_remInt64"
++    MO_W64_Quot -> mkCCall "hs_quotWord64"
++    MO_W64_Rem -> mkCCall "hs_remWord64"
++    MO_x64_And -> mkCCall "hs_and64"
++    MO_x64_Or -> mkCCall "hs_or64"
++    MO_x64_Xor -> mkCCall "hs_xor64"
++    MO_x64_Not -> mkCCall "hs_not64"
++    MO_x64_Shl -> mkCCall "hs_uncheckedShiftL64"
++    MO_I64_Shr -> mkCCall "hs_uncheckedIShiftRA64"
++    MO_W64_Shr -> mkCCall "hs_uncheckedShiftRL64"
++    MO_x64_Eq -> mkCCall "hs_eq64"
++    MO_x64_Ne -> mkCCall "hs_ne64"
++    MO_I64_Ge -> mkCCall "hs_geInt64"
++    MO_I64_Gt -> mkCCall "hs_gtInt64"
++    MO_I64_Le -> mkCCall "hs_leInt64"
++    MO_I64_Lt -> mkCCall "hs_ltInt64"
++    MO_W64_Ge -> mkCCall "hs_geWord64"
++    MO_W64_Gt -> mkCCall "hs_gtWord64"
++    MO_W64_Le -> mkCCall "hs_leWord64"
++    MO_W64_Lt -> mkCCall "hs_ltWord64"
++    -- Conversion
++    MO_UF_Conv w -> mkCCall (word2FloatLabel w)
++    -- Optional MachOps
++    -- These are enabled/disabled by backend flags: GHC.StgToCmm.Config
++    MO_S_Mul2 _w -> unsupported mop
++    MO_S_QuotRem _w -> unsupported mop
++    MO_U_QuotRem _w -> unsupported mop
++    MO_U_QuotRem2 _w -> unsupported mop
++    MO_Add2 _w -> unsupported mop
++    MO_AddWordC _w -> unsupported mop
++    MO_SubWordC _w -> unsupported mop
++    MO_AddIntC _w -> unsupported mop
++    MO_SubIntC _w -> unsupported mop
++    MO_U_Mul2 _w -> unsupported mop
++    -- Memory Ordering
++    -- The related C functions are:
++    -- #include <stdatomic.h>
++    -- atomic_thread_fence(memory_order_acquire);
++    -- atomic_thread_fence(memory_order_release);
++    -- atomic_thread_fence(memory_order_seq_cst);
++    MO_AcquireFence -> pure (unitOL (FENCE FenceRead FenceReadWrite))
++    MO_ReleaseFence -> pure (unitOL (FENCE FenceReadWrite FenceWrite))
++    MO_SeqCstFence -> pure (unitOL (FENCE FenceReadWrite FenceReadWrite))
++    MO_Touch -> pure nilOL -- Keep variables live (when using interior pointers)
++    -- Prefetch
++    MO_Prefetch_Data _n -> pure nilOL -- Prefetch hint.
++
++    -- Memory copy/set/move/cmp, with alignment for optimization
++    MO_Memcpy _align -> mkCCall "memcpy"
++    MO_Memset _align -> mkCCall "memset"
++    MO_Memmove _align -> mkCCall "memmove"
++    MO_Memcmp _align -> mkCCall "memcmp"
++    MO_SuspendThread -> mkCCall "suspendThread"
++    MO_ResumeThread -> mkCCall "resumeThread"
++    MO_PopCnt w -> mkCCall (popCntLabel w)
++    MO_Pdep w -> mkCCall (pdepLabel w)
++    MO_Pext w -> mkCCall (pextLabel w)
++    MO_Clz w -> mkCCall (clzLabel w)
++    MO_Ctz w -> mkCCall (ctzLabel w)
++    MO_BSwap w -> mkCCall (bSwapLabel w)
++    MO_BRev w -> mkCCall (bRevLabel w)
++    -- Atomic read-modify-write.
++    mo@(MO_AtomicRead w ord)
++      | [p_reg] <- arg_regs,
++        [dst_reg] <- dest_regs -> do
++          (p, _fmt_p, code_p) <- getSomeReg p_reg
++          platform <- getPlatform
++          -- Analog to the related MachOps (above)
++          -- The related C functions are:
++          -- #include <stdatomic.h>
++          -- __atomic_load_n(&a, __ATOMIC_ACQUIRE);
++          -- __atomic_load_n(&a, __ATOMIC_SEQ_CST);
++          let instrs = case ord of
++                MemOrderRelaxed -> unitOL $ ann moDescr (LDR (intFormat w) (OpReg w dst) (OpAddr $ AddrReg p))
++                MemOrderAcquire ->
++                  toOL
++                    [ ann moDescr (LDR (intFormat w) (OpReg w dst) (OpAddr $ AddrReg p)),
++                      FENCE FenceRead FenceReadWrite
++                    ]
++                MemOrderSeqCst ->
++                  toOL
++                    [ ann moDescr (FENCE FenceReadWrite FenceReadWrite),
++                      LDR (intFormat w) (OpReg w dst) (OpAddr $ AddrReg p),
++                      FENCE FenceRead FenceReadWrite
++                    ]
++                MemOrderRelease -> panic $ "Unexpected MemOrderRelease on an AtomicRead: " ++ show mo
++              dst = getRegisterReg platform (CmmLocal dst_reg)
++              moDescr = (text . show) mo
++              code = code_p `appOL` instrs
++          return code
++      | otherwise -> panic "mal-formed AtomicRead"
++    mo@(MO_AtomicWrite w ord)
++      | [p_reg, val_reg] <- arg_regs -> do
++          (p, _fmt_p, code_p) <- getSomeReg p_reg
++          (val, fmt_val, code_val) <- getSomeReg val_reg
++          -- Analog to the related MachOps (above)
++          -- The related C functions are:
++          -- #include <stdatomic.h>
++          -- __atomic_store_n(&a, 23, __ATOMIC_SEQ_CST);
++          -- __atomic_store_n(&a, 23, __ATOMIC_RELEASE);
++          let instrs = case ord of
++                MemOrderRelaxed -> unitOL $ ann moDescr (STR fmt_val (OpReg w val) (OpAddr $ AddrReg p))
++                MemOrderSeqCst ->
++                  toOL
++                    [ ann moDescr (FENCE FenceReadWrite FenceWrite),
++                      STR fmt_val (OpReg w val) (OpAddr $ AddrReg p),
++                      FENCE FenceReadWrite FenceReadWrite
++                    ]
++                MemOrderRelease ->
++                  toOL
++                    [ ann moDescr (FENCE FenceReadWrite FenceWrite),
++                      STR fmt_val (OpReg w val) (OpAddr $ AddrReg p)
++                    ]
++                MemOrderAcquire -> panic $ "Unexpected MemOrderAcquire on an AtomicWrite" ++ show mo
++              moDescr = (text . show) mo
++              code =
++                code_p
++                  `appOL` code_val
++                  `appOL` instrs
++          pure code
++      | otherwise -> panic "mal-formed AtomicWrite"
++    MO_AtomicRMW w amop -> mkCCall (atomicRMWLabel w amop)
++    MO_Cmpxchg w -> mkCCall (cmpxchgLabel w)
++    -- -- Should be an AtomicRMW variant eventually.
++    -- -- Sequential consistent.
++    -- TODO: this should be implemented properly!
++    MO_Xchg w -> mkCCall (xchgLabel w)
++  where
++    unsupported :: (Show a) => a -> b
++    unsupported mop =
++      panic
++        ( "outOfLineCmmOp: "
++            ++ show mop
++            ++ " not supported here"
++        )
++    mkCCall :: FastString -> NatM InstrBlock
++    mkCCall name = do
++      config <- getConfig
++      target <-
++        cmmMakeDynamicReference config CallReference
++          $ mkForeignLabel name Nothing ForeignLabelInThisPackage IsFunction
++      let cconv = ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn
++      genCCall (ForeignTarget target cconv) dest_regs arg_regs
++
++    unaryFloatOp w op arg_reg dest_reg = do
++      platform <- getPlatform
++      (reg_fx, _format_x, code_fx) <- getFloatReg arg_reg
++      let dst = getRegisterReg platform (CmmLocal dest_reg)
++      let code = code_fx `appOL` op (OpReg w dst) (OpReg w reg_fx)
++      pure code
++
++{- Note [RISCV64 far jumps]
++~~~~~~~~~~~~~~~~~~~~~~~~~~~
++
++RISCV64 conditional jump instructions can only encode an offset of +/-4KiB
++(12bits) which is usually enough but can be exceeded in edge cases. In these
++cases we will replace:
++
++  b.cond <cond> foo
++
++with the sequence:
++
++  b.cond <cond> <lbl_true>
++  b <lbl_false>
++  <lbl_true>:
++  la reg foo
++  b reg
++  <lbl_false>:
++
++and
++
++  b foo
++
++with the sequence:
++
++  la reg foo
++  b reg
++
++Compared to AArch64 the target label is loaded to a register, because
++unconditional jump instructions can only address +/-1MiB. The LA
++pseudo-instruction will be replaced by up to two real instructions, ensuring
++correct addressing.
++
++One could surely find more efficient replacements, taking PC-relative addressing
++into account. This could be a future improvement. (As far branches are pretty
++rare, one might question and measure the value of such improvement.)
++
++RISCV has many pseudo-instructions which emit more than one real instructions.
++Thus, we count the real instructions after the Assembler has seen them.
++
++We make some simplifications in the name of performance which can result in
++overestimating jump <-> label offsets:
++
++\* To avoid having to recalculate the label offsets once we replaced a jump we simply
++  assume all label jumps will be expanded to a three instruction far jump sequence.
++\* For labels associated with a info table we assume the info table is 64byte large.
++  Most info tables are smaller than that but it means we don't have to distinguish
++  between multiple types of info tables.
++
++In terms of implementation we walk the instruction stream at least once calculating
++label offsets, and if we determine during this that the functions body is big enough
++to potentially contain out of range jumps we walk the instructions a second time, replacing
++out of range jumps with the sequence of instructions described above.
++
++-}
++
++-- | A conditional jump to a far target
++--
++-- By loading the far target into a register for the jump, we can address the
++-- whole memory range.
++genCondFarJump :: (MonadUnique m) => Cond -> Operand -> Operand -> BlockId -> m InstrBlock
++genCondFarJump cond op1 op2 far_target = do
++  skip_lbl_id <- newBlockId
++  jmp_lbl_id <- newBlockId
++
++  -- TODO: We can improve this by inverting the condition
++  -- but it's not quite trivial since we don't know if we
++  -- need to consider float orderings.
++  -- So we take the hit of the additional jump in the false
++  -- case for now.
++  return
++    $ toOL
++      [ ann (text "Conditional far jump to: " <> ppr far_target)
++          $ BCOND cond op1 op2 (TBlock jmp_lbl_id),
++        B (TBlock skip_lbl_id),
++        NEWBLOCK jmp_lbl_id,
++        LDR II64 (OpReg W64 tmpReg) (OpImm (ImmCLbl (blockLbl far_target))),
++        B (TReg tmpReg),
++        NEWBLOCK skip_lbl_id
++      ]
++
++-- | An unconditional jump to a far target
++--
++-- By loading the far target into a register for the jump, we can address the
++-- whole memory range.
++genFarJump :: (MonadUnique m) => BlockId -> m InstrBlock
++genFarJump far_target =
++  return
++    $ toOL
++      [ ann (text "Unconditional far jump to: " <> ppr far_target)
++          $ LDR II64 (OpReg W64 tmpReg) (OpImm (ImmCLbl (blockLbl far_target))),
++        B (TReg tmpReg)
++      ]
++
++-- See Note [RISCV64 far jumps]
++data BlockInRange = InRange | NotInRange BlockId
++
++-- See Note [RISCV64 far jumps]
++makeFarBranches ::
++  Platform ->
++  LabelMap RawCmmStatics ->
++  [NatBasicBlock Instr] ->
++  UniqSM [NatBasicBlock Instr]
++makeFarBranches {- only used when debugging -} _platform statics basic_blocks = do
++  -- All offsets/positions are counted in multiples of 4 bytes (the size of RISCV64 instructions)
++  -- That is an offset of 1 represents a 4-byte/one instruction offset.
++  let (func_size, lblMap) = foldl' calc_lbl_positions (0, mapEmpty) basic_blocks
++  if func_size < max_jump_dist
++    then pure basic_blocks
++    else do
++      (_, blocks) <- mapAccumLM (replace_blk lblMap) 0 basic_blocks
++      pure $ concat blocks
++  where
++    -- pprTrace "lblMap" (ppr lblMap) $ basic_blocks
++
++    -- 2^11, 12 bit immediate with one bit is reserved for the sign
++    max_jump_dist = 2 ^ (11 :: Int) - 1 :: Int
++    -- Currently all inline info tables fit into 64 bytes.
++    max_info_size = 16 :: Int
++    long_bc_jump_size = 5 :: Int
++    long_b_jump_size = 2 :: Int
++
++    -- Replace out of range conditional jumps with unconditional jumps.
++    replace_blk :: LabelMap Int -> Int -> GenBasicBlock Instr -> UniqSM (Int, [GenBasicBlock Instr])
++    replace_blk !m !pos (BasicBlock lbl instrs) = do
++      -- Account for a potential info table before the label.
++      let !block_pos = pos + infoTblSize_maybe lbl
++      (!pos', instrs') <- mapAccumLM (replace_jump m) block_pos instrs
++      let instrs'' = concat instrs'
++      -- We might have introduced new labels, so split the instructions into basic blocks again if neccesary.
++      let (top, split_blocks, no_data) = foldr mkBlocks ([], [], []) instrs''
++      -- There should be no data in the instruction stream at this point
++      massert (null no_data)
++
++      let final_blocks = BasicBlock lbl top : split_blocks
++      pure (pos', final_blocks)
++
++    replace_jump :: LabelMap Int -> Int -> Instr -> UniqSM (Int, [Instr])
++    replace_jump !m !pos instr = do
++      case instr of
++        ANN ann instr -> do
++          (idx, instr' : instrs') <- replace_jump m pos instr
++          pure (idx, ANN ann instr' : instrs')
++        BCOND cond op1 op2 t ->
++          case target_in_range m t pos of
++            InRange -> pure (pos + instr_size instr, [instr])
++            NotInRange far_target -> do
++              jmp_code <- genCondFarJump cond op1 op2 far_target
++              pure (pos + instr_size instr, fromOL jmp_code)
++        B t ->
++          case target_in_range m t pos of
++            InRange -> pure (pos + instr_size instr, [instr])
++            NotInRange far_target -> do
++              jmp_code <- genFarJump far_target
++              pure (pos + instr_size instr, fromOL jmp_code)
++        _ -> pure (pos + instr_size instr, [instr])
++
++    target_in_range :: LabelMap Int -> Target -> Int -> BlockInRange
++    target_in_range m target src =
++      case target of
++        (TReg {}) -> InRange
++        (TBlock bid) -> block_in_range m src bid
++
++    block_in_range :: LabelMap Int -> Int -> BlockId -> BlockInRange
++    block_in_range m src_pos dest_lbl =
++      case mapLookup dest_lbl m of
++        Nothing ->
++          pprTrace "not in range" (ppr dest_lbl)
++            $ NotInRange dest_lbl
++        Just dest_pos ->
++          if abs (dest_pos - src_pos) < max_jump_dist
++            then InRange
++            else NotInRange dest_lbl
++
++    calc_lbl_positions :: (Int, LabelMap Int) -> GenBasicBlock Instr -> (Int, LabelMap Int)
++    calc_lbl_positions (pos, m) (BasicBlock lbl instrs) =
++      let !pos' = pos + infoTblSize_maybe lbl
++       in foldl' instr_pos (pos', mapInsert lbl pos' m) instrs
++
++    instr_pos :: (Int, LabelMap Int) -> Instr -> (Int, LabelMap Int)
++    instr_pos (pos, m) instr = (pos + instr_size instr, m)
++
++    infoTblSize_maybe bid =
++      case mapLookup bid statics of
++        Nothing -> 0 :: Int
++        Just _info_static -> max_info_size
++
++    instr_size :: Instr -> Int
++    instr_size i = case i of
++      COMMENT {} -> 0
++      MULTILINE_COMMENT {} -> 0
++      ANN _ instr -> instr_size instr
++      LOCATION {} -> 0
++      DELTA {} -> 0
++      -- At this point there should be no NEWBLOCK in the instruction stream (pos, mapInsert bid pos m)
++      NEWBLOCK {} -> panic "mkFarBranched - Unexpected"
++      LDATA {} -> panic "mkFarBranched - Unexpected"
++      PUSH_STACK_FRAME -> 4
++      POP_STACK_FRAME -> 4
++      ADD {} -> 1
++      MUL {} -> 1
++      MULH {} -> 1
++      NEG {} -> 1
++      DIV {} -> 1
++      REM {} -> 1
++      REMU {} -> 1
++      SUB {} -> 1
++      DIVU {} -> 1
++      AND {} -> 1
++      OR {} -> 1
++      SRA {} -> 1
++      XOR {} -> 1
++      SLL {} -> 1
++      SRL {} -> 1
++      MOV {} -> 2
++      ORI {} -> 1
++      XORI {} -> 1
++      CSET {} -> 2
++      STR {} -> 1
++      LDR {} -> 3
++      LDRU {} -> 1
++      FENCE {} -> 1
++      FCVT {} -> 1
++      FABS {} -> 1
++      FMA {} -> 1
++      -- estimate the subsituted size for jumps to lables
++      -- jumps to registers have size 1
++      BCOND {} -> long_bc_jump_size
++      B (TBlock _) -> long_b_jump_size
++      B (TReg _) -> 1
++      BL _ _ -> 1
++      J_TBL {} -> 1
+Index: ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/Cond.hs
+===================================================================
+--- /dev/null
++++ ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/Cond.hs
+@@ -0,0 +1,42 @@
++module GHC.CmmToAsm.RV64.Cond
++  ( Cond (..),
++  )
++where
++
++import GHC.Prelude hiding (EQ)
++
++-- | Condition codes.
++--
++-- Used in conditional branches and bit setters. According to the available
++-- instruction set, some conditions are encoded as their negated opposites. I.e.
++-- these are logical things that don't necessarily map 1:1 to hardware/ISA.
++data Cond
++  = -- | int and float
++    EQ
++  | -- | int and float
++    NE
++  | -- | signed less than
++    SLT
++  | -- | signed less than or equal
++    SLE
++  | -- | signed greater than or equal
++    SGE
++  | -- | signed greater than
++    SGT
++  | -- | unsigned less than
++    ULT
++  | -- | unsigned less than or equal
++    ULE
++  | -- | unsigned greater than or equal
++    UGE
++  | -- | unsigned greater than
++    UGT
++  | -- | floating point instruction @flt@
++    FLT
++  | -- | floating point instruction @fle@
++    FLE
++  | -- | floating point instruction @fge@
++    FGE
++  | -- | floating point instruction @fgt@
++    FGT
++  deriving (Eq, Show)
+Index: ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/Instr.hs
+===================================================================
+--- /dev/null
++++ ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/Instr.hs
+@@ -0,0 +1,823 @@
++-- All instructions will be rendered eventually. Thus, there's no benefit in
++-- being lazy in data types.
++{-# LANGUAGE StrictData #-}
++{-# OPTIONS_GHC -fno-warn-orphans #-}
++
++module GHC.CmmToAsm.RV64.Instr where
++
++import Data.Maybe
++import GHC.Cmm
++import GHC.Cmm.BlockId
++import GHC.Cmm.CLabel
++import GHC.Cmm.Dataflow.Label
++import GHC.CmmToAsm.Config
++import GHC.CmmToAsm.Format
++import GHC.CmmToAsm.Instr (RegUsage (..))
++import GHC.CmmToAsm.RV64.Cond
++import GHC.CmmToAsm.RV64.Regs
++import GHC.CmmToAsm.Types
++import GHC.CmmToAsm.Utils
++import GHC.Data.FastString (LexicalFastString)
++import GHC.Platform
++import GHC.Platform.Reg
++import GHC.Platform.Regs
++import GHC.Prelude
++import GHC.Stack
++import GHC.Types.Unique.Supply
++import GHC.Utils.Outputable
++import GHC.Utils.Panic
++
++-- | Stack frame header size in bytes.
++--
++-- The stack frame header is made of the values that are always saved
++-- (regardless of the context.) It consists of the saved return address and a
++-- pointer to the previous frame. Thus, its size is two stack frame slots which
++-- equals two addresses/words (2 * 8 byte).
++stackFrameHeaderSize :: Int
++stackFrameHeaderSize = 2 * spillSlotSize
++
++-- | All registers are 8 byte wide.
++spillSlotSize :: Int
++spillSlotSize = 8
++
++-- | The number of bytes that the stack pointer should be aligned to.
++stackAlign :: Int
++stackAlign = 16
++
++-- | The number of spill slots available without allocating more.
++maxSpillSlots :: NCGConfig -> Int
++maxSpillSlots config =
++  ( (ncgSpillPreallocSize config - stackFrameHeaderSize)
++      `div` spillSlotSize
++  )
++    - 1
++
++-- | Convert a spill slot number to a *byte* offset.
++spillSlotToOffset :: Int -> Int
++spillSlotToOffset slot =
++  stackFrameHeaderSize + spillSlotSize * slot
++
++instance Outputable RegUsage where
++  ppr (RU reads writes) = text "RegUsage(reads:" <+> ppr reads <> comma <+> text "writes:" <+> ppr writes <> char ')'
++
++-- | Get the registers that are being used by this instruction.
++-- regUsage doesn't need to do any trickery for jumps and such.
++-- Just state precisely the regs read and written by that insn.
++-- The consequences of control flow transfers, as far as register
++-- allocation goes, are taken care of by the register allocator.
++--
++-- RegUsage = RU [<read regs>] [<write regs>]
++regUsageOfInstr :: Platform -> Instr -> RegUsage
++regUsageOfInstr platform instr = case instr of
++  ANN _ i -> regUsageOfInstr platform i
++  COMMENT {} -> usage ([], [])
++  MULTILINE_COMMENT {} -> usage ([], [])
++  PUSH_STACK_FRAME -> usage ([], [])
++  POP_STACK_FRAME -> usage ([], [])
++  LOCATION {} -> usage ([], [])
++  DELTA {} -> usage ([], [])
++  ADD dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
++  MUL dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
++  NEG dst src -> usage (regOp src, regOp dst)
++  MULH dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
++  DIV dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
++  REM dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
++  REMU dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
++  SUB dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
++  DIVU dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
++  AND dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
++  OR dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
++  SRA dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
++  XOR dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
++  SLL dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
++  SRL dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
++  MOV dst src -> usage (regOp src, regOp dst)
++  -- ORI's third operand is always an immediate
++  ORI dst src1 _ -> usage (regOp src1, regOp dst)
++  XORI dst src1 _ -> usage (regOp src1, regOp dst)
++  J_TBL _ _ t -> usage ([t], [])
++  B t -> usage (regTarget t, [])
++  BCOND _ l r t -> usage (regTarget t ++ regOp l ++ regOp r, [])
++  BL t ps -> usage (t : ps, callerSavedRegisters)
++  CSET dst l r _ -> usage (regOp l ++ regOp r, regOp dst)
++  STR _ src dst -> usage (regOp src ++ regOp dst, [])
++  LDR _ dst src -> usage (regOp src, regOp dst)
++  LDRU _ dst src -> usage (regOp src, regOp dst)
++  FENCE _ _ -> usage ([], [])
++  FCVT _variant dst src -> usage (regOp src, regOp dst)
++  FABS dst src -> usage (regOp src, regOp dst)
++  FMA _ dst src1 src2 src3 ->
++    usage (regOp src1 ++ regOp src2 ++ regOp src3, regOp dst)
++  _ -> panic $ "regUsageOfInstr: " ++ instrCon instr
++  where
++    -- filtering the usage is necessary, otherwise the register
++    -- allocator will try to allocate pre-defined fixed stg
++    -- registers as well, as they show up.
++    usage :: ([Reg], [Reg]) -> RegUsage
++    usage (srcRegs, dstRegs) =
++      RU
++        (filter (interesting platform) srcRegs)
++        (filter (interesting platform) dstRegs)
++
++    regAddr :: AddrMode -> [Reg]
++    regAddr (AddrRegImm r1 _imm) = [r1]
++    regAddr (AddrReg r1) = [r1]
++
++    regOp :: Operand -> [Reg]
++    regOp (OpReg _w r1) = [r1]
++    regOp (OpAddr a) = regAddr a
++    regOp (OpImm _imm) = []
++
++    regTarget :: Target -> [Reg]
++    regTarget (TBlock _bid) = []
++    regTarget (TReg r1) = [r1]
++
++    -- Is this register interesting for the register allocator?
++    interesting :: Platform -> Reg -> Bool
++    interesting _ (RegVirtual _) = True
++    interesting platform (RegReal (RealRegSingle i)) = freeReg platform i
++
++-- | Caller-saved registers (according to calling convention)
++--
++-- These registers may be clobbered after a jump.
++callerSavedRegisters :: [Reg]
++callerSavedRegisters =
++  [regSingle raRegNo]
++    ++ map regSingle [t0RegNo .. t2RegNo]
++    ++ map regSingle [a0RegNo .. a7RegNo]
++    ++ map regSingle [t3RegNo .. t6RegNo]
++    ++ map regSingle [ft0RegNo .. ft7RegNo]
++    ++ map regSingle [fa0RegNo .. fa7RegNo]
++
++-- | Apply a given mapping to all the register references in this instruction.
++patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr
++patchRegsOfInstr instr env = case instr of
++  ANN d i -> ANN d (patchRegsOfInstr i env)
++  COMMENT {} -> instr
++  MULTILINE_COMMENT {} -> instr
++  PUSH_STACK_FRAME -> instr
++  POP_STACK_FRAME -> instr
++  LOCATION {} -> instr
++  DELTA {} -> instr
++  ADD o1 o2 o3 -> ADD (patchOp o1) (patchOp o2) (patchOp o3)
++  MUL o1 o2 o3 -> MUL (patchOp o1) (patchOp o2) (patchOp o3)
++  NEG o1 o2 -> NEG (patchOp o1) (patchOp o2)
++  MULH o1 o2 o3 -> MULH (patchOp o1) (patchOp o2) (patchOp o3)
++  DIV o1 o2 o3 -> DIV (patchOp o1) (patchOp o2) (patchOp o3)
++  REM o1 o2 o3 -> REM (patchOp o1) (patchOp o2) (patchOp o3)
++  REMU o1 o2 o3 -> REMU (patchOp o1) (patchOp o2) (patchOp o3)
++  SUB o1 o2 o3 -> SUB (patchOp o1) (patchOp o2) (patchOp o3)
++  DIVU o1 o2 o3 -> DIVU (patchOp o1) (patchOp o2) (patchOp o3)
++  AND o1 o2 o3 -> AND (patchOp o1) (patchOp o2) (patchOp o3)
++  OR o1 o2 o3 -> OR (patchOp o1) (patchOp o2) (patchOp o3)
++  SRA o1 o2 o3 -> SRA (patchOp o1) (patchOp o2) (patchOp o3)
++  XOR o1 o2 o3 -> XOR (patchOp o1) (patchOp o2) (patchOp o3)
++  SLL o1 o2 o3 -> SLL (patchOp o1) (patchOp o2) (patchOp o3)
++  SRL o1 o2 o3 -> SRL (patchOp o1) (patchOp o2) (patchOp o3)
++  MOV o1 o2 -> MOV (patchOp o1) (patchOp o2)
++  -- o3 cannot be a register for ORI (always an immediate)
++  ORI o1 o2 o3 -> ORI (patchOp o1) (patchOp o2) (patchOp o3)
++  XORI o1 o2 o3 -> XORI (patchOp o1) (patchOp o2) (patchOp o3)
++  J_TBL ids mbLbl t -> J_TBL ids mbLbl (env t)
++  B t -> B (patchTarget t)
++  BL t ps -> BL (patchReg t) ps
++  BCOND c o1 o2 t -> BCOND c (patchOp o1) (patchOp o2) (patchTarget t)
++  CSET o l r c -> CSET (patchOp o) (patchOp l) (patchOp r) c
++  STR f o1 o2 -> STR f (patchOp o1) (patchOp o2)
++  LDR f o1 o2 -> LDR f (patchOp o1) (patchOp o2)
++  LDRU f o1 o2 -> LDRU f (patchOp o1) (patchOp o2)
++  FENCE o1 o2 -> FENCE o1 o2
++  FCVT variant o1 o2 -> FCVT variant (patchOp o1) (patchOp o2)
++  FABS o1 o2 -> FABS (patchOp o1) (patchOp o2)
++  FMA s o1 o2 o3 o4 ->
++    FMA s (patchOp o1) (patchOp o2) (patchOp o3) (patchOp o4)
++  _ -> panic $ "patchRegsOfInstr: " ++ instrCon instr
++  where
++    patchOp :: Operand -> Operand
++    patchOp (OpReg w r) = OpReg w (env r)
++    patchOp (OpAddr a) = OpAddr (patchAddr a)
++    patchOp opImm = opImm
++
++    patchTarget :: Target -> Target
++    patchTarget (TReg r) = TReg (env r)
++    patchTarget tBlock = tBlock
++
++    patchAddr :: AddrMode -> AddrMode
++    patchAddr (AddrRegImm r1 imm) = AddrRegImm (env r1) imm
++    patchAddr (AddrReg r) = AddrReg (env r)
++
++    patchReg :: Reg -> Reg
++    patchReg = env
++
++-- | Checks whether this instruction is a jump/branch instruction.
++--
++-- One that can change the flow of control in a way that the
++-- register allocator needs to worry about.
++isJumpishInstr :: Instr -> Bool
++isJumpishInstr instr = case instr of
++  ANN _ i -> isJumpishInstr i
++  J_TBL {} -> True
++  B {} -> True
++  BL {} -> True
++  BCOND {} -> True
++  _ -> False
++
++-- | Get the `BlockId`s of the jump destinations (if any)
++jumpDestsOfInstr :: Instr -> [BlockId]
++jumpDestsOfInstr (ANN _ i) = jumpDestsOfInstr i
++jumpDestsOfInstr (J_TBL ids _mbLbl _r) = catMaybes ids
++jumpDestsOfInstr (B t) = [id | TBlock id <- [t]]
++jumpDestsOfInstr (BCOND _ _ _ t) = [id | TBlock id <- [t]]
++jumpDestsOfInstr _ = []
++
++-- | Change the destination of this (potential) jump instruction.
++--
++-- Used in the linear allocator when adding fixup blocks for join
++-- points.
++patchJumpInstr :: Instr -> (BlockId -> BlockId) -> Instr
++patchJumpInstr instr patchF =
++  case instr of
++    ANN d i -> ANN d (patchJumpInstr i patchF)
++    J_TBL ids mbLbl r -> J_TBL (map (fmap patchF) ids) mbLbl r
++    B (TBlock bid) -> B (TBlock (patchF bid))
++    BCOND c o1 o2 (TBlock bid) -> BCOND c o1 o2 (TBlock (patchF bid))
++    _ -> panic $ "patchJumpInstr: " ++ instrCon instr
++
++-- -----------------------------------------------------------------------------
++-- Note [RISCV64 Spills and Reloads]
++-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++--
++-- We reserve @RESERVED_C_STACK_BYTES@ on the C stack for spilling and reloading
++-- registers. The load and store instructions of RISCV64 address with a signed
++-- 12-bit immediate + a register; machine stackpointer (sp/x2) in this case.
++--
++-- The @RESERVED_C_STACK_BYTES@ is 16k, so we can't always address into it in a
++-- single load/store instruction. There are offsets to sp (not to be confused
++-- with STG's SP!) which need a register to be calculated.
++--
++-- Using sp to compute the offset would violate assumptions about the stack pointer
++-- pointing to the top of the stack during signal handling.  As we can't force
++-- every signal to use its own stack, we have to ensure that the stack pointer
++-- always points to the top of the stack, and we can't use it for computation.
++--
++-- So, we reserve one register (TMP) for this purpose (and other, unrelated
++-- intermediate operations.) See Note [The made-up RISCV64 TMP (IP) register]
++
++-- | Generate instructions to spill a register into a spill slot.
++mkSpillInstr ::
++  (HasCallStack) =>
++  NCGConfig ->
++  -- | register to spill
++  Reg ->
++  -- | current stack delta
++  Int ->
++  -- | spill slot to use
++  Int ->
++  [Instr]
++mkSpillInstr _config reg delta slot =
++  case off - delta of
++    imm | fitsIn12bitImm imm -> [mkStrSpImm imm]
++    imm ->
++      [ movImmToTmp imm,
++        addSpToTmp,
++        mkStrTmp
++      ]
++  where
++    fmt = case reg of
++      RegReal (RealRegSingle n) | n < d0RegNo -> II64
++      _ -> FF64
++    mkStrSpImm imm =
++      ANN (text "Spill@" <> int (off - delta))
++        $ STR fmt (OpReg W64 reg) (OpAddr (AddrRegImm spMachReg (ImmInt imm)))
++    movImmToTmp imm =
++      ANN (text "Spill: TMP <- " <> int imm)
++        $ MOV tmp (OpImm (ImmInt imm))
++    addSpToTmp =
++      ANN (text "Spill: TMP <- SP + TMP ")
++        $ ADD tmp tmp sp
++    mkStrTmp =
++      ANN (text "Spill@" <> int (off - delta))
++        $ STR fmt (OpReg W64 reg) (OpAddr (AddrReg tmpReg))
++
++    off = spillSlotToOffset slot
++
++-- | Generate instructions to load a register from a spill slot.
++mkLoadInstr ::
++  NCGConfig ->
++  -- | register to load
++  Reg ->
++  -- | current stack delta
++  Int ->
++  -- | spill slot to use
++  Int ->
++  [Instr]
++mkLoadInstr _config reg delta slot =
++  case off - delta of
++    imm | fitsIn12bitImm imm -> [mkLdrSpImm imm]
++    imm ->
++      [ movImmToTmp imm,
++        addSpToTmp,
++        mkLdrTmp
++      ]
++  where
++    fmt = case reg of
++      RegReal (RealRegSingle n) | n < d0RegNo -> II64
++      _ -> FF64
++    mkLdrSpImm imm =
++      ANN (text "Reload@" <> int (off - delta))
++        $ LDR fmt (OpReg W64 reg) (OpAddr (AddrRegImm spMachReg (ImmInt imm)))
++    movImmToTmp imm =
++      ANN (text "Reload: TMP <- " <> int imm)
++        $ MOV tmp (OpImm (ImmInt imm))
++    addSpToTmp =
++      ANN (text "Reload: TMP <- SP + TMP ")
++        $ ADD tmp tmp sp
++    mkLdrTmp =
++      ANN (text "Reload@" <> int (off - delta))
++        $ LDR fmt (OpReg W64 reg) (OpAddr (AddrReg tmpReg))
++
++    off = spillSlotToOffset slot
++
++-- | See if this instruction is telling us the current C stack delta
++takeDeltaInstr :: Instr -> Maybe Int
++takeDeltaInstr (ANN _ i) = takeDeltaInstr i
++takeDeltaInstr (DELTA i) = Just i
++takeDeltaInstr _ = Nothing
++
++-- | Not real instructions.  Just meta data
++isMetaInstr :: Instr -> Bool
++isMetaInstr instr =
++  case instr of
++    ANN _ i -> isMetaInstr i
++    COMMENT {} -> True
++    MULTILINE_COMMENT {} -> True
++    LOCATION {} -> True
++    LDATA {} -> True
++    NEWBLOCK {} -> True
++    DELTA {} -> True
++    PUSH_STACK_FRAME -> True
++    POP_STACK_FRAME -> True
++    _ -> False
++
++-- | Copy the value in a register to another one.
++--
++-- Must work for all register classes.
++mkRegRegMoveInstr :: Reg -> Reg -> Instr
++mkRegRegMoveInstr src dst = ANN desc instr
++  where
++    desc = text "Reg->Reg Move: " <> ppr src <> text " -> " <> ppr dst
++    instr = MOV (operandFromReg dst) (operandFromReg src)
++
++-- | Take the source and destination from this (potential) reg -> reg move instruction
++--
++-- We have to be a bit careful here: A `MOV` can also mean an implicit
++-- conversion. This case is filtered out.
++takeRegRegMoveInstr :: Instr -> Maybe (Reg, Reg)
++takeRegRegMoveInstr (MOV (OpReg width dst) (OpReg width' src))
++  | width == width' && (isFloatReg dst == isFloatReg src) = pure (src, dst)
++takeRegRegMoveInstr _ = Nothing
++
++-- | Make an unconditional jump instruction.
++mkJumpInstr :: BlockId -> [Instr]
++mkJumpInstr = pure . B . TBlock
++
++-- | Decrement @sp@ to allocate stack space.
++--
++-- The stack grows downwards, so we decrement the stack pointer by @n@ (bytes).
++-- This is dual to `mkStackDeallocInstr`. @sp@ is the RISCV stack pointer, not
++-- to be confused with the STG stack pointer.
++mkStackAllocInstr :: Platform -> Int -> [Instr]
++mkStackAllocInstr _platform = moveSp . negate
++
++-- | Increment SP to deallocate stack space.
++--
++-- The stack grows downwards, so we increment the stack pointer by @n@ (bytes).
++-- This is dual to `mkStackAllocInstr`. @sp@ is the RISCV stack pointer, not to
++-- be confused with the STG stack pointer.
++mkStackDeallocInstr :: Platform -> Int -> [Instr]
++mkStackDeallocInstr _platform = moveSp
++
++moveSp :: Int -> [Instr]
++moveSp n
++  | n == 0 = []
++  | n /= 0 && fitsIn12bitImm n = pure . ANN desc $ ADD sp sp (OpImm (ImmInt n))
++  | otherwise =
++      -- This ends up in three effective instructions. We could get away with
++      -- two for intMax12bit < n < 3 * intMax12bit by recursing once. However,
++      -- this way is likely less surprising.
++      [ ANN desc (MOV tmp (OpImm (ImmInt n))),
++        ADD sp sp tmp
++      ]
++  where
++    desc = text "Move SP:" <+> int n
++
++--
++-- See Note [extra spill slots] in X86/Instr.hs
++--
++allocMoreStack ::
++  Platform ->
++  Int ->
++  NatCmmDecl statics GHC.CmmToAsm.RV64.Instr.Instr ->
++  UniqSM (NatCmmDecl statics GHC.CmmToAsm.RV64.Instr.Instr, [(BlockId, BlockId)])
++allocMoreStack _ _ top@(CmmData _ _) = return (top, [])
++allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do
++  let entries = entryBlocks proc
++
++  uniqs <- getUniquesM
++
++  let delta = ((x + stackAlign - 1) `quot` stackAlign) * stackAlign -- round up
++        where
++          x = slots * spillSlotSize -- sp delta
++      alloc = mkStackAllocInstr platform delta
++      dealloc = mkStackDeallocInstr platform delta
++
++      retargetList = zip entries (map mkBlockId uniqs)
++
++      new_blockmap :: LabelMap BlockId
++      new_blockmap = mapFromList retargetList
++
++      insert_stack_insn (BasicBlock id insns)
++        | Just new_blockid <- mapLookup id new_blockmap =
++            [ BasicBlock id $ alloc ++ [B (TBlock new_blockid)],
++              BasicBlock new_blockid block'
++            ]
++        | otherwise =
++            [BasicBlock id block']
++        where
++          block' = foldr insert_dealloc [] insns
++
++      insert_dealloc insn r = case insn of
++        J_TBL {} -> dealloc ++ (insn : r)
++        ANN _ e -> insert_dealloc e r
++        _other
++          | jumpDestsOfInstr insn /= [] ->
++              patchJumpInstr insn retarget : r
++        _other -> insn : r
++        where
++          retarget b = fromMaybe b (mapLookup b new_blockmap)
++
++      new_code = concatMap insert_stack_insn code
++  return (CmmProc info lbl live (ListGraph new_code), retargetList)
++
++data Instr
++  = -- | Comment pseudo-op
++    COMMENT SDoc
++  | -- | Multi-line comment pseudo-op
++    MULTILINE_COMMENT SDoc
++  | -- | Annotated instruction. Should print <instr> # <doc>
++    ANN SDoc Instr
++  | -- | Location pseudo-op @.loc@ (file, line, col, name)
++    LOCATION Int Int Int LexicalFastString
++  | -- | Static data spat out during code generation.
++    LDATA Section RawCmmStatics
++  | -- | Start a new basic block.
++    --
++    -- Useful during codegen, removed later. Preceding instruction should be a
++    -- jump, as per the invariants for a BasicBlock (see Cmm).
++    NEWBLOCK BlockId
++  | -- | Specify current stack offset for benefit of subsequent passes
++    DELTA Int
++  | -- | Push a minimal stack frame consisting of the return address (RA) and the frame pointer (FP).
++    PUSH_STACK_FRAME
++  | -- | Pop the minimal stack frame of prior `PUSH_STACK_FRAME`.
++    POP_STACK_FRAME
++  | -- | Arithmetic addition (both integer and floating point)
++    --
++    -- @rd = rs1 + rs2@
++    ADD Operand Operand Operand
++  | -- | Arithmetic subtraction (both integer and floating point)
++    --
++    -- @rd = rs1 - rs2@
++    SUB Operand Operand Operand
++  | -- | Logical AND (integer only)
++    --
++    -- @rd = rs1 & rs2@
++    AND Operand Operand Operand
++  | -- | Logical OR (integer only)
++    --
++    -- @rd = rs1 | rs2@
++    OR Operand Operand Operand
++  | -- | Logical left shift (zero extened, integer only)
++    --
++    -- @rd = rs1 << rs2@
++    SLL Operand Operand Operand
++  | -- | Logical right shift (zero extened, integer only)
++    --
++    -- @rd = rs1 >> rs2@
++    SRL Operand Operand Operand
++  | -- | Arithmetic right shift (sign-extened, integer only)
++    --
++    -- @rd = rs1 >> rs2@
++    SRA Operand Operand Operand
++  | -- | Store to memory (both, integer and floating point)
++    STR Format Operand Operand
++  | -- | Load from memory (sign-extended, integer and floating point)
++    LDR Format Operand Operand
++  | -- | Load from memory (unsigned, integer and floating point)
++    LDRU Format Operand Operand
++  | -- | Arithmetic multiplication (both, integer and floating point)
++    --
++    -- @rd = rn × rm@
++    MUL Operand Operand Operand
++  | -- | Negation (both, integer and floating point)
++    --
++    -- @rd = -op2@
++    NEG Operand Operand
++  | -- | Division (both, integer and floating point)
++    --
++    -- @rd = rn ÷ rm@
++    DIV Operand Operand Operand
++  | -- | Remainder (integer only, signed)
++    --
++    -- @rd = rn % rm@
++    REM Operand Operand Operand --
++  | -- | Remainder (integer only, unsigned)
++    --
++    -- @rd = |rn % rm|@
++    REMU Operand Operand Operand
++  | -- | High part of a multiplication that doesn't fit into 64bits (integer only)
++    --
++    -- E.g. for a multiplication with 64bits width: @rd = (rs1 * rs2) >> 64@.
++    MULH Operand Operand Operand
++  | -- | Unsigned division (integer only)
++    --
++    -- @rd = |rn ÷ rm|@
++    DIVU Operand Operand Operand
++  | -- | XOR (integer only)
++    --
++    -- @rd = rn ⊕ op2@
++    XOR Operand Operand Operand
++  | -- | ORI with immediate (integer only)
++    --
++    -- @rd = rn | op2@
++    ORI Operand Operand Operand
++  | -- | OR with immediate (integer only)
++    --
++    -- @rd = rn ⊕ op2@
++    XORI Operand Operand Operand
++  | -- | Move to register (integer and floating point)
++    --
++    -- @rd = rn@  or  @rd = #imm@
++    MOV Operand Operand
++  | -- | Pseudo-op for conditional setting of a register.
++    --
++    -- @if(o2 cond o3) op <- 1 else op <- 0@
++    CSET Operand Operand Operand Cond
++  | -- | A jump instruction with data for switch/jump tables
++    J_TBL [Maybe BlockId] (Maybe CLabel) Reg
++  | -- | Unconditional jump (no linking)
++    B Target
++  | -- | Unconditional jump, links return address (sets @ra@/@x1@)
++    BL Reg [Reg]
++  | -- | branch with condition (integer only)
++    BCOND Cond Operand Operand Target
++  | -- | Fence instruction
++    --
++    -- Memory barrier.
++    FENCE FenceType FenceType
++  | -- | Floating point conversion
++    FCVT FcvtVariant Operand Operand
++  | -- | Floating point ABSolute value
++    FABS Operand Operand
++  | -- | Floating-point fused multiply-add instructions
++    --
++    -- - fmadd : d =   r1 * r2 + r3
++    -- - fnmsub: d =   r1 * r2 - r3
++    -- - fmsub : d = - r1 * r2 + r3
++    -- - fnmadd: d = - r1 * r2 - r3
++    FMA FMASign Operand Operand Operand Operand
++
++-- | Operand of a FENCE instruction (@r@, @w@ or @rw@)
++data FenceType = FenceRead | FenceWrite | FenceReadWrite
++
++-- | Variant of a floating point conversion instruction
++data FcvtVariant = FloatToFloat | IntToFloat | FloatToInt
++
++instrCon :: Instr -> String
++instrCon i =
++  case i of
++    COMMENT {} -> "COMMENT"
++    MULTILINE_COMMENT {} -> "COMMENT"
++    ANN {} -> "ANN"
++    LOCATION {} -> "LOCATION"
++    LDATA {} -> "LDATA"
++    NEWBLOCK {} -> "NEWBLOCK"
++    DELTA {} -> "DELTA"
++    PUSH_STACK_FRAME {} -> "PUSH_STACK_FRAME"
++    POP_STACK_FRAME {} -> "POP_STACK_FRAME"
++    ADD {} -> "ADD"
++    OR {} -> "OR"
++    MUL {} -> "MUL"
++    NEG {} -> "NEG"
++    DIV {} -> "DIV"
++    REM {} -> "REM"
++    REMU {} -> "REMU"
++    MULH {} -> "MULH"
++    SUB {} -> "SUB"
++    DIVU {} -> "DIVU"
++    AND {} -> "AND"
++    SRA {} -> "SRA"
++    XOR {} -> "XOR"
++    SLL {} -> "SLL"
++    SRL {} -> "SRL"
++    MOV {} -> "MOV"
++    ORI {} -> "ORI"
++    XORI {} -> "ORI"
++    STR {} -> "STR"
++    LDR {} -> "LDR"
++    LDRU {} -> "LDRU"
++    CSET {} -> "CSET"
++    J_TBL {} -> "J_TBL"
++    B {} -> "B"
++    BL {} -> "BL"
++    BCOND {} -> "BCOND"
++    FENCE {} -> "FENCE"
++    FCVT {} -> "FCVT"
++    FABS {} -> "FABS"
++    FMA variant _ _ _ _ ->
++      case variant of
++        FMAdd -> "FMADD"
++        FMSub -> "FMSUB"
++        FNMAdd -> "FNMADD"
++        FNMSub -> "FNMSUB"
++
++data Target
++  = TBlock BlockId
++  | TReg Reg
++
++data Operand
++  = -- | register
++    OpReg Width Reg
++  | -- | immediate value
++    OpImm Imm
++  | -- | memory reference
++    OpAddr AddrMode
++  deriving (Eq, Show)
++
++operandFromReg :: Reg -> Operand
++operandFromReg = OpReg W64
++
++operandFromRegNo :: RegNo -> Operand
++operandFromRegNo = operandFromReg . regSingle
++
++zero, ra, sp, gp, tp, fp, tmp :: Operand
++zero = operandFromReg zeroReg
++ra = operandFromReg raReg
++sp = operandFromReg spMachReg
++gp = operandFromRegNo 3
++tp = operandFromRegNo 4
++fp = operandFromRegNo 8
++tmp = operandFromReg tmpReg
++
++x0, x1, x2, x3, x4, x5, x6, x7 :: Operand
++x8, x9, x10, x11, x12, x13, x14, x15 :: Operand
++x16, x17, x18, x19, x20, x21, x22, x23 :: Operand
++x24, x25, x26, x27, x28, x29, x30, x31 :: Operand
++x0 = operandFromRegNo x0RegNo
++x1 = operandFromRegNo 1
++x2 = operandFromRegNo 2
++x3 = operandFromRegNo 3
++x4 = operandFromRegNo 4
++x5 = operandFromRegNo x5RegNo
++x6 = operandFromRegNo 6
++x7 = operandFromRegNo x7RegNo
++
++x8 = operandFromRegNo 8
++
++x9 = operandFromRegNo 9
++
++x10 = operandFromRegNo x10RegNo
++
++x11 = operandFromRegNo 11
++
++x12 = operandFromRegNo 12
++
++x13 = operandFromRegNo 13
++
++x14 = operandFromRegNo 14
++
++x15 = operandFromRegNo 15
++
++x16 = operandFromRegNo 16
++
++x17 = operandFromRegNo x17RegNo
++
++x18 = operandFromRegNo 18
++
++x19 = operandFromRegNo 19
++
++x20 = operandFromRegNo 20
++
++x21 = operandFromRegNo 21
++
++x22 = operandFromRegNo 22
++
++x23 = operandFromRegNo 23
++
++x24 = operandFromRegNo 24
++
++x25 = operandFromRegNo 25
++
++x26 = operandFromRegNo 26
++
++x27 = operandFromRegNo 27
++
++x28 = operandFromRegNo x28RegNo
++
++x29 = operandFromRegNo 29
++
++x30 = operandFromRegNo 30
++
++x31 = operandFromRegNo x31RegNo
++
++d0, d1, d2, d3, d4, d5, d6, d7 :: Operand
++d8, d9, d10, d11, d12, d13, d14, d15 :: Operand
++d16, d17, d18, d19, d20, d21, d22, d23 :: Operand
++d24, d25, d26, d27, d28, d29, d30, d31 :: Operand
++d0 = operandFromRegNo d0RegNo
++d1 = operandFromRegNo 33
++d2 = operandFromRegNo 34
++d3 = operandFromRegNo 35
++d4 = operandFromRegNo 36
++d5 = operandFromRegNo 37
++d6 = operandFromRegNo 38
++d7 = operandFromRegNo d7RegNo
++
++d8 = operandFromRegNo 40
++
++d9 = operandFromRegNo 41
++
++d10 = operandFromRegNo d10RegNo
++
++d11 = operandFromRegNo 43
++
++d12 = operandFromRegNo 44
++
++d13 = operandFromRegNo 45
++
++d14 = operandFromRegNo 46
++
++d15 = operandFromRegNo 47
++
++d16 = operandFromRegNo 48
++
++d17 = operandFromRegNo d17RegNo
++
++d18 = operandFromRegNo 50
++
++d19 = operandFromRegNo 51
++
++d20 = operandFromRegNo 52
++
++d21 = operandFromRegNo 53
++
++d22 = operandFromRegNo 54
++
++d23 = operandFromRegNo 55
++
++d24 = operandFromRegNo 56
++
++d25 = operandFromRegNo 57
++
++d26 = operandFromRegNo 58
++
++d27 = operandFromRegNo 59
++
++d28 = operandFromRegNo 60
++
++d29 = operandFromRegNo 61
++
++d30 = operandFromRegNo 62
++
++d31 = operandFromRegNo d31RegNo
++
++fitsIn12bitImm :: (Num a, Ord a) => a -> Bool
++fitsIn12bitImm off = off >= intMin12bit && off <= intMax12bit
++
++intMin12bit :: (Num a) => a
++intMin12bit = -2048
++
++intMax12bit :: (Num a) => a
++intMax12bit = 2047
++
++fitsIn32bits :: (Num a, Ord a, Bits a) => a -> Bool
++fitsIn32bits i = (-1 `shiftL` 31) <= i && i <= (1 `shiftL` 31 - 1)
++
++isNbitEncodeable :: Int -> Integer -> Bool
++isNbitEncodeable n i = let shift = n - 1 in (-1 `shiftL` shift) <= i && i < (1 `shiftL` shift)
++
++isEncodeableInWidth :: Width -> Integer -> Bool
++isEncodeableInWidth = isNbitEncodeable . widthInBits
++
++isIntOp :: Operand -> Bool
++isIntOp = not . isFloatOp
++
++isFloatOp :: Operand -> Bool
++isFloatOp (OpReg _ reg) | isFloatReg reg = True
++isFloatOp _ = False
++
++isFloatReg :: Reg -> Bool
++isFloatReg (RegReal (RealRegSingle i)) | i > 31 = True
++isFloatReg (RegVirtual (VirtualRegF _)) = True
++isFloatReg (RegVirtual (VirtualRegD _)) = True
++isFloatReg _ = False
+Index: ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/Ppr.hs
+===================================================================
+--- /dev/null
++++ ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/Ppr.hs
+@@ -0,0 +1,715 @@
++{-# LANGUAGE ScopedTypeVariables #-}
++
++module GHC.CmmToAsm.RV64.Ppr (pprNatCmmDecl, pprInstr) where
++
++import GHC.Cmm hiding (topInfoTable)
++import GHC.Cmm.BlockId
++import GHC.Cmm.CLabel
++import GHC.Cmm.Dataflow.Label
++import GHC.CmmToAsm.Config
++import GHC.CmmToAsm.Format
++import GHC.CmmToAsm.Ppr
++import GHC.CmmToAsm.RV64.Cond
++import GHC.CmmToAsm.RV64.Instr
++import GHC.CmmToAsm.RV64.Regs
++import GHC.CmmToAsm.Types
++import GHC.CmmToAsm.Utils
++import GHC.Platform
++import GHC.Platform.Reg
++import GHC.Prelude hiding (EQ)
++import GHC.Types.Basic (Alignment, alignmentBytes, mkAlignment)
++import GHC.Types.Unique (getUnique, pprUniqueAlways)
++import GHC.Utils.Outputable
++import GHC.Utils.Panic
++
++pprNatCmmDecl :: forall doc. (IsDoc doc) => NCGConfig -> NatCmmDecl RawCmmStatics Instr -> doc
++pprNatCmmDecl config (CmmData section dats) =
++  pprSectionAlign config section $$ pprDatas config dats
++pprNatCmmDecl config proc@(CmmProc top_info lbl _ (ListGraph blocks)) =
++  let platform = ncgPlatform config
++
++      pprProcAlignment :: doc
++      pprProcAlignment = maybe empty (pprAlign . mkAlignment) (ncgProcAlignment config)
++   in pprProcAlignment
++        $$ case topInfoTable proc of
++          Nothing ->
++            -- special case for code without info table:
++            pprSectionAlign config (Section Text lbl)
++              $$
++              -- do not
++              -- pprProcAlignment config $$
++              pprLabel platform lbl
++              $$ vcat (map (pprBasicBlock config top_info) blocks) -- blocks guaranteed not null, so label needed
++              $$ ppWhen
++                (ncgDwarfEnabled config)
++                (line (pprBlockEndLabel platform lbl) $$ line (pprProcEndLabel platform lbl))
++              $$ pprSizeDecl platform lbl
++          Just (CmmStaticsRaw info_lbl _) ->
++            pprSectionAlign config (Section Text info_lbl)
++              $$
++              -- pprProcAlignment config $$
++              ( if platformHasSubsectionsViaSymbols platform
++                  then line (pprAsmLabel platform (mkDeadStripPreventer info_lbl) <> char ':')
++                  else empty
++              )
++              $$ vcat (map (pprBasicBlock config top_info) blocks)
++              $$ ppWhen (ncgDwarfEnabled config) (line (pprProcEndLabel platform info_lbl))
++              $$
++              -- above: Even the first block gets a label, because with branch-chain
++              -- elimination, it might be the target of a goto.
++              ( if platformHasSubsectionsViaSymbols platform
++                  then -- See Note [Subsections Via Symbols]
++
++                    line
++                      $ text "\t.long "
++                      <+> pprAsmLabel platform info_lbl
++                      <+> char '-'
++                      <+> pprAsmLabel platform (mkDeadStripPreventer info_lbl)
++                  else empty
++              )
++              $$ pprSizeDecl platform info_lbl
++{-# SPECIALIZE pprNatCmmDecl :: NCGConfig -> NatCmmDecl RawCmmStatics Instr -> SDoc #-}
++{-# SPECIALIZE pprNatCmmDecl :: NCGConfig -> NatCmmDecl RawCmmStatics Instr -> HDoc #-} -- see Note [SPECIALIZE to HDoc] in GHC.Utils.Outputable
++
++pprLabel :: (IsDoc doc) => Platform -> CLabel -> doc
++pprLabel platform lbl =
++  pprGloblDecl platform lbl
++    $$ pprTypeDecl platform lbl
++    $$ line (pprAsmLabel platform lbl <> char ':')
++
++pprAlign :: (IsDoc doc) => Alignment -> doc
++pprAlign alignment =
++  -- "The .align directive for RISC-V is an alias to .p2align, which aligns to a
++  -- power of two, so .align 2 means align to 4 bytes. Because the definition of
++  -- the .align directive varies by architecture, it is recommended to use the
++  -- unambiguous .p2align or .balign directives instead."
++  -- (https://github.com/riscv-non-isa/riscv-asm-manual/blob/main/riscv-asm.md#-align)
++  line $ text "\t.balign " <> int (alignmentBytes alignment)
++
++-- | Print appropriate alignment for the given section type.
++--
++-- Currently, this always aligns to a full machine word (8 byte.) A future
++-- improvement could be to really do this per section type (though, it's
++-- probably not a big gain.)
++pprAlignForSection :: (IsDoc doc) => SectionType -> doc
++pprAlignForSection _seg = pprAlign . mkAlignment $ 8
++
++-- | Print section header and appropriate alignment for that section.
++--
++-- This will e.g. emit a header like:
++--
++--     .section .text
++--     .balign 8
++pprSectionAlign :: (IsDoc doc) => NCGConfig -> Section -> doc
++pprSectionAlign _config (Section (OtherSection _) _) =
++  panic "RV64.Ppr.pprSectionAlign: unknown section"
++pprSectionAlign config sec@(Section seg _) =
++  line (pprSectionHeader config sec)
++    $$ pprAlignForSection seg
++
++pprProcEndLabel ::
++  (IsLine doc) =>
++  Platform ->
++  -- | Procedure name
++  CLabel ->
++  doc
++pprProcEndLabel platform lbl =
++  pprAsmLabel platform (mkAsmTempProcEndLabel lbl) <> colon
++
++pprBlockEndLabel ::
++  (IsLine doc) =>
++  Platform ->
++  -- | Block name
++  CLabel ->
++  doc
++pprBlockEndLabel platform lbl =
++  pprAsmLabel platform (mkAsmTempEndLabel lbl) <> colon
++
++-- | Output the ELF .size directive (if needed.)
++pprSizeDecl :: (IsDoc doc) => Platform -> CLabel -> doc
++pprSizeDecl platform lbl
++  | osElfTarget (platformOS platform) =
++      line $ text "\t.size" <+> asmLbl <> text ", .-" <> asmLbl
++  where
++    asmLbl = pprAsmLabel platform lbl
++pprSizeDecl _ _ = empty
++
++pprBasicBlock ::
++  (IsDoc doc) =>
++  NCGConfig ->
++  LabelMap RawCmmStatics ->
++  NatBasicBlock Instr ->
++  doc
++pprBasicBlock config info_env (BasicBlock blockid instrs) =
++  maybe_infotable
++    $ pprLabel platform asmLbl
++    $$ vcat (map (pprInstr platform) (id {-detectTrivialDeadlock-} optInstrs))
++    $$ ppWhen
++      (ncgDwarfEnabled config)
++      ( -- Emit both end labels since this may end up being a standalone
++        -- top-level block
++        line
++          ( pprBlockEndLabel platform asmLbl
++              <> pprProcEndLabel platform asmLbl
++          )
++      )
++  where
++    -- TODO: Check if we can  filter more instructions here.
++    -- TODO: Shouldn't this be a more general check on a higher level?
++    -- Filter out identity moves. E.g. mov x18, x18 will be dropped.
++    optInstrs = filter f instrs
++      where
++        f (MOV o1 o2) | o1 == o2 = False
++        f _ = True
++
++    asmLbl = blockLbl blockid
++    platform = ncgPlatform config
++    maybe_infotable c = case mapLookup blockid info_env of
++      Nothing -> c
++      Just (CmmStaticsRaw info_lbl info) ->
++        --  pprAlignForSection platform Text $$
++        infoTableLoc
++          $$ vcat (map (pprData config) info)
++          $$ pprLabel platform info_lbl
++          $$ c
++          $$ ppWhen
++            (ncgDwarfEnabled config)
++            (line (pprBlockEndLabel platform info_lbl))
++    -- Make sure the info table has the right .loc for the block
++    -- coming right after it. See Note [Info Offset]
++    infoTableLoc = case instrs of
++      (l@LOCATION {} : _) -> pprInstr platform l
++      _other -> empty
++
++pprDatas :: (IsDoc doc) => NCGConfig -> RawCmmStatics -> doc
++-- See Note [emit-time elimination of static indirections] in "GHC.Cmm.CLabel".
++pprDatas config (CmmStaticsRaw alias [CmmStaticLit (CmmLabel lbl), CmmStaticLit ind, _, _])
++  | lbl == mkIndStaticInfoLabel,
++    let labelInd (CmmLabelOff l _) = Just l
++        labelInd (CmmLabel l) = Just l
++        labelInd _ = Nothing,
++    Just ind' <- labelInd ind,
++    alias `mayRedirectTo` ind' =
++      pprGloblDecl (ncgPlatform config) alias
++        $$ line (text ".equiv" <+> pprAsmLabel (ncgPlatform config) alias <> comma <> pprAsmLabel (ncgPlatform config) ind')
++pprDatas config (CmmStaticsRaw lbl dats) =
++  vcat (pprLabel platform lbl : map (pprData config) dats)
++  where
++    platform = ncgPlatform config
++
++pprData :: (IsDoc doc) => NCGConfig -> CmmStatic -> doc
++pprData _config (CmmString str) = line (pprString str)
++pprData _config (CmmFileEmbed path _) = line (pprFileEmbed path)
++-- TODO: AFAIK there no Darwin for RISCV, so we may consider to simplify this.
++pprData config (CmmUninitialised bytes) =
++  line
++    $ let platform = ncgPlatform config
++       in if platformOS platform == OSDarwin
++            then text ".space " <> int bytes
++            else text ".skip " <> int bytes
++pprData config (CmmStaticLit lit) = pprDataItem config lit
++
++pprGloblDecl :: (IsDoc doc) => Platform -> CLabel -> doc
++pprGloblDecl platform lbl
++  | not (externallyVisibleCLabel lbl) = empty
++  | otherwise = line (text "\t.globl " <> pprAsmLabel platform lbl)
++
++-- Note [Always use objects for info tables]
++-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++-- See discussion in X86.Ppr for why this is necessary.  Essentially we need to
++-- ensure that we never pass function symbols when we might want to lookup the
++-- info table.  If we did, we could end up with procedure linking tables
++-- (PLT)s, and thus the lookup wouldn't point to the function, but into the
++-- jump table.
++--
++-- Fun fact: The LLVMMangler exists to patch this issue on the LLVM side as
++-- well.
++pprLabelType' :: (IsLine doc) => Platform -> CLabel -> doc
++pprLabelType' platform lbl =
++  if isCFunctionLabel lbl || functionOkInfoTable
++    then text "@function"
++    else text "@object"
++  where
++    functionOkInfoTable =
++      platformTablesNextToCode platform
++        && isInfoTableLabel lbl
++        && not (isCmmInfoTableLabel lbl)
++        && not (isConInfoTableLabel lbl)
++
++-- this is called pprTypeAndSizeDecl in PPC.Ppr
++pprTypeDecl :: (IsDoc doc) => Platform -> CLabel -> doc
++pprTypeDecl platform lbl =
++  if osElfTarget (platformOS platform) && externallyVisibleCLabel lbl
++    then line (text ".type " <> pprAsmLabel platform lbl <> text ", " <> pprLabelType' platform lbl)
++    else empty
++
++pprDataItem :: (IsDoc doc) => NCGConfig -> CmmLit -> doc
++pprDataItem config lit =
++  lines_ (ppr_item (cmmTypeFormat $ cmmLitType platform lit) lit)
++  where
++    platform = ncgPlatform config
++
++    imm = litToImm lit
++
++    ppr_item II8 _ = [text "\t.byte\t" <> pprDataImm platform imm]
++    ppr_item II16 _ = [text "\t.short\t" <> pprDataImm platform imm]
++    ppr_item II32 _ = [text "\t.long\t" <> pprDataImm platform imm]
++    ppr_item II64 _ = [text "\t.quad\t" <> pprDataImm platform imm]
++    ppr_item FF32 (CmmFloat r _) =
++      let bs = floatToBytes (fromRational r)
++       in map (\b -> text "\t.byte\t" <> int (fromIntegral b)) bs
++    ppr_item FF64 (CmmFloat r _) =
++      let bs = doubleToBytes (fromRational r)
++       in map (\b -> text "\t.byte\t" <> int (fromIntegral b)) bs
++    ppr_item _ _ = pprPanic "pprDataItem:ppr_item" (text $ show lit)
++
++-- | Pretty print an immediate value in the @data@ section
++--
++-- This does not include any checks. We rely on the Assembler to check for
++-- errors. Use `pprOpImm` for immediates in instructions (operands.)
++pprDataImm :: (IsLine doc) => Platform -> Imm -> doc
++pprDataImm _ (ImmInt i) = int i
++pprDataImm _ (ImmInteger i) = integer i
++pprDataImm p (ImmCLbl l) = pprAsmLabel p l
++pprDataImm p (ImmIndex l i) = pprAsmLabel p l <> char '+' <> int i
++pprDataImm _ (ImmLit s) = ftext s
++pprDataImm _ (ImmFloat f) = float (fromRational f)
++pprDataImm _ (ImmDouble d) = double (fromRational d)
++pprDataImm p (ImmConstantSum a b) = pprDataImm p a <> char '+' <> pprDataImm p b
++pprDataImm p (ImmConstantDiff a b) =
++  pprDataImm p a
++    <> char '-'
++    <> lparen
++    <> pprDataImm p b
++    <> rparen
++
++-- | Comment @c@ with @# c@
++asmComment :: SDoc -> SDoc
++asmComment c = text "#" <+> c
++
++-- | Commen @c@ with @// c@
++asmDoubleslashComment :: SDoc -> SDoc
++asmDoubleslashComment c = text "//" <+> c
++
++-- | Comment @c@ with @/* c */@ (multiline comment)
++asmMultilineComment :: SDoc -> SDoc
++asmMultilineComment c = text "/*" $+$ c $+$ text "*/"
++
++-- | Pretty print an immediate operand of an instruction
++--
++-- The kinds of immediates we can use here is pretty limited: RISCV doesn't
++-- support index expressions (as e.g. Aarch64 does.) Floating points need to
++-- fit in range. As we don't need them, forbit them to save us from future
++-- troubles.
++pprOpImm :: (IsLine doc) => Platform -> Imm -> doc
++pprOpImm platform im = case im of
++  ImmInt i -> int i
++  ImmInteger i -> integer i
++  ImmCLbl l -> char '=' <> pprAsmLabel platform l
++  _ -> pprPanic "RV64.Ppr.pprOpImm" (text "Unsupported immediate for instruction operands" <> colon <+> (text . show) im)
++
++-- | Negate integer immediate operand
++--
++-- This function is partial and will panic if the operand is not an integer.
++negOp :: Operand -> Operand
++negOp (OpImm (ImmInt i)) = OpImm (ImmInt (negate i))
++negOp (OpImm (ImmInteger i)) = OpImm (ImmInteger (negate i))
++negOp op = pprPanic "RV64.negOp" (text $ show op)
++
++-- | Pretty print an operand
++pprOp :: (IsLine doc) => Platform -> Operand -> doc
++pprOp plat op = case op of
++  OpReg w r -> pprReg w r
++  OpImm im -> pprOpImm plat im
++  OpAddr (AddrRegImm r1 im) -> pprOpImm plat im <> char '(' <> pprReg W64 r1 <> char ')'
++  OpAddr (AddrReg r1) -> text "0(" <+> pprReg W64 r1 <+> char ')'
++
++-- | Pretty print register with calling convention name
++--
++-- This representation makes it easier to reason about the emitted assembly
++-- code.
++pprReg :: forall doc. (IsLine doc) => Width -> Reg -> doc
++pprReg w r = case r of
++  RegReal (RealRegSingle i) -> ppr_reg_no i
++  -- virtual regs should not show up, but this is helpful for debugging.
++  RegVirtual (VirtualRegI u) -> text "%vI_" <> pprUniqueAlways u
++  RegVirtual (VirtualRegF u) -> text "%vF_" <> pprUniqueAlways u
++  RegVirtual (VirtualRegD u) -> text "%vD_" <> pprUniqueAlways u
++  _ -> pprPanic "RiscV64.pprReg" (text (show r) <+> ppr w)
++  where
++    ppr_reg_no :: Int -> doc
++    -- General Purpose Registers
++    ppr_reg_no 0 = text "zero"
++    ppr_reg_no 1 = text "ra"
++    ppr_reg_no 2 = text "sp"
++    ppr_reg_no 3 = text "gp"
++    ppr_reg_no 4 = text "tp"
++    ppr_reg_no 5 = text "t0"
++    ppr_reg_no 6 = text "t1"
++    ppr_reg_no 7 = text "t2"
++    ppr_reg_no 8 = text "s0"
++    ppr_reg_no 9 = text "s1"
++    ppr_reg_no 10 = text "a0"
++    ppr_reg_no 11 = text "a1"
++    ppr_reg_no 12 = text "a2"
++    ppr_reg_no 13 = text "a3"
++    ppr_reg_no 14 = text "a4"
++    ppr_reg_no 15 = text "a5"
++    ppr_reg_no 16 = text "a6"
++    ppr_reg_no 17 = text "a7"
++    ppr_reg_no 18 = text "s2"
++    ppr_reg_no 19 = text "s3"
++    ppr_reg_no 20 = text "s4"
++    ppr_reg_no 21 = text "s5"
++    ppr_reg_no 22 = text "s6"
++    ppr_reg_no 23 = text "s7"
++    ppr_reg_no 24 = text "s8"
++    ppr_reg_no 25 = text "s9"
++    ppr_reg_no 26 = text "s10"
++    ppr_reg_no 27 = text "s11"
++    ppr_reg_no 28 = text "t3"
++    ppr_reg_no 29 = text "t4"
++    ppr_reg_no 30 = text "t5"
++    ppr_reg_no 31 = text "t6"
++    -- Floating Point Registers
++    ppr_reg_no 32 = text "ft0"
++    ppr_reg_no 33 = text "ft1"
++    ppr_reg_no 34 = text "ft2"
++    ppr_reg_no 35 = text "ft3"
++    ppr_reg_no 36 = text "ft4"
++    ppr_reg_no 37 = text "ft5"
++    ppr_reg_no 38 = text "ft6"
++    ppr_reg_no 39 = text "ft7"
++    ppr_reg_no 40 = text "fs0"
++    ppr_reg_no 41 = text "fs1"
++    ppr_reg_no 42 = text "fa0"
++    ppr_reg_no 43 = text "fa1"
++    ppr_reg_no 44 = text "fa2"
++    ppr_reg_no 45 = text "fa3"
++    ppr_reg_no 46 = text "fa4"
++    ppr_reg_no 47 = text "fa5"
++    ppr_reg_no 48 = text "fa6"
++    ppr_reg_no 49 = text "fa7"
++    ppr_reg_no 50 = text "fs2"
++    ppr_reg_no 51 = text "fs3"
++    ppr_reg_no 52 = text "fs4"
++    ppr_reg_no 53 = text "fs5"
++    ppr_reg_no 54 = text "fs6"
++    ppr_reg_no 55 = text "fs7"
++    ppr_reg_no 56 = text "fs8"
++    ppr_reg_no 57 = text "fs9"
++    ppr_reg_no 58 = text "fs10"
++    ppr_reg_no 59 = text "fs11"
++    ppr_reg_no 60 = text "ft8"
++    ppr_reg_no 61 = text "ft9"
++    ppr_reg_no 62 = text "ft10"
++    ppr_reg_no 63 = text "ft11"
++    ppr_reg_no i
++      | i < 0 = pprPanic "Unexpected register number (min is 0)" (ppr w <+> int i)
++      | i > 63 = pprPanic "Unexpected register number (max is 63)" (ppr w <+> int i)
++      -- no support for widths > W64.
++      | otherwise = pprPanic "Unsupported width in register (max is 64)" (ppr w <+> int i)
++
++-- | Single precission `Operand` (floating-point)
++isSingleOp :: Operand -> Bool
++isSingleOp (OpReg W32 _) = True
++isSingleOp _ = False
++
++-- | Double precission `Operand` (floating-point)
++isDoubleOp :: Operand -> Bool
++isDoubleOp (OpReg W64 _) = True
++isDoubleOp _ = False
++
++-- | `Operand` is an immediate value
++isImmOp :: Operand -> Bool
++isImmOp (OpImm _) = True
++isImmOp _ = False
++
++-- | `Operand` is an immediate @0@ value
++isImmZero :: Operand -> Bool
++isImmZero (OpImm (ImmFloat 0)) = True
++isImmZero (OpImm (ImmDouble 0)) = True
++isImmZero (OpImm (ImmInt 0)) = True
++isImmZero _ = False
++
++-- | `Target` represents a label
++isLabel :: Target -> Bool
++isLabel (TBlock _) = True
++isLabel _ = False
++
++-- | Get the pretty-printed label from a `Target`
++--
++-- This function is partial and will panic if the `Target` is not a label.
++getLabel :: (IsLine doc) => Platform -> Target -> doc
++getLabel platform (TBlock bid) = pprBlockId platform bid
++  where
++    pprBlockId :: (IsLine doc) => Platform -> BlockId -> doc
++    pprBlockId platform bid = pprAsmLabel platform (mkLocalBlockLabel (getUnique bid))
++getLabel _platform _other = panic "Cannot turn this into a label"
++
++-- | Pretty-print an `Instr`
++--
++-- This function is partial and will panic if the `Instr` is not supported. This
++-- can happen due to invalid operands or unexpected meta instructions.
++pprInstr :: (IsDoc doc) => Platform -> Instr -> doc
++pprInstr platform instr = case instr of
++  -- see Note [dualLine and dualDoc] in GHC.Utils.Outputable
++  COMMENT s -> dualDoc (asmComment s) empty
++  MULTILINE_COMMENT s -> dualDoc (asmMultilineComment s) empty
++  ANN d i -> dualDoc (pprInstr platform i <+> asmDoubleslashComment d) (pprInstr platform i)
++  LOCATION file line' col _name ->
++    line (text "\t.loc" <+> int file <+> int line' <+> int col)
++  DELTA d -> dualDoc (asmComment $ text "\tdelta = " <> int d) empty
++  NEWBLOCK _ -> panic "PprInstr: NEWBLOCK"
++  LDATA _ _ -> panic "pprInstr: LDATA"
++  PUSH_STACK_FRAME ->
++    lines_
++      [ text "\taddi sp, sp, -16",
++        text "\tsd x1, 8(sp)", -- store RA
++        text "\tsd x8, 0(sp)", -- store FP/s0
++        text "\taddi x8, sp, 16"
++      ]
++  POP_STACK_FRAME ->
++    lines_
++      [ text "\tld x8, 0(sp)", -- restore FP/s0
++        text "\tld x1, 8(sp)", -- restore RA
++        text "\taddi sp, sp, 16"
++      ]
++  ADD o1 o2 o3
++    | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> op3 (text "\tfadd." <> if isSingleOp o1 then text "s" else text "d") o1 o2 o3
++    -- This case is used for sign extension: SEXT.W op
++    | OpReg W64 _ <- o1, OpReg W32 _ <- o2, isImmOp o3 -> op3 (text "\taddiw") o1 o2 o3
++    | otherwise -> op3 (text "\tadd") o1 o2 o3
++  MUL o1 o2 o3
++    | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> op3 (text "\tfmul." <> if isSingleOp o1 then text "s" else text "d") o1 o2 o3
++    | otherwise -> op3 (text "\tmul") o1 o2 o3
++  MULH o1 o2 o3 -> op3 (text "\tmulh") o1 o2 o3
++  NEG o1 o2 | isFloatOp o1 && isFloatOp o2 && isSingleOp o2 -> op2 (text "\tfneg.s") o1 o2
++  NEG o1 o2 | isFloatOp o1 && isFloatOp o2 && isDoubleOp o2 -> op2 (text "\tfneg.d") o1 o2
++  NEG o1 o2 -> op2 (text "\tneg") o1 o2
++  DIV o1 o2 o3
++    | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 ->
++        -- TODO: This must (likely) be refined regarding width
++        op3 (text "\tfdiv." <> if isSingleOp o1 then text "s" else text "d") o1 o2 o3
++  DIV o1 o2 o3 -> op3 (text "\tdiv") o1 o2 o3
++  REM o1 o2 o3
++    | isFloatOp o1 || isFloatOp o2 || isFloatOp o3 ->
++        panic "pprInstr - REM not implemented for floats (yet)"
++  REM o1 o2 o3 -> op3 (text "\trem") o1 o2 o3
++  REMU o1 o2 o3 -> op3 (text "\tremu") o1 o2 o3
++  SUB o1 o2 o3
++    | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> op3 (text "\tfsub." <> if isSingleOp o1 then text "s" else text "d") o1 o2 o3
++    | isImmOp o3 -> op3 (text "\taddi") o1 o2 (negOp o3)
++    | otherwise -> op3 (text "\tsub") o1 o2 o3
++  DIVU o1 o2 o3 -> op3 (text "\tdivu") o1 o2 o3
++  AND o1 o2 o3
++    | isImmOp o3 -> op3 (text "\tandi") o1 o2 o3
++    | otherwise -> op3 (text "\tand") o1 o2 o3
++  OR o1 o2 o3 -> op3 (text "\tor") o1 o2 o3
++  SRA o1 o2 o3 | isImmOp o3 -> op3 (text "\tsrai") o1 o2 o3
++  SRA o1 o2 o3 -> op3 (text "\tsra") o1 o2 o3
++  XOR o1 o2 o3 -> op3 (text "\txor") o1 o2 o3
++  SLL o1 o2 o3 -> op3 (text "\tsll") o1 o2 o3
++  SRL o1 o2 o3 -> op3 (text "\tsrl") o1 o2 o3
++  MOV o1 o2
++    | isFloatOp o1 && isFloatOp o2 && isDoubleOp o2 -> op2 (text "\tfmv.d") o1 o2 -- fmv.d rd, rs is pseudo op fsgnj.d rd, rs, rs
++    | isFloatOp o1 && isFloatOp o2 && isSingleOp o2 -> op2 (text "\tfmv.s") o1 o2 -- fmv.s rd, rs is pseudo op fsgnj.s rd, rs, rs
++    | isFloatOp o1 && isImmZero o2 && isDoubleOp o1 -> op2 (text "\tfcvt.d.w") o1 zero
++    | isFloatOp o1 && isImmZero o2 && isSingleOp o1 -> op2 (text "\tfcvt.s.w") o1 zero
++    | isFloatOp o1 && not (isFloatOp o2) && isSingleOp o1 -> op2 (text "\tfmv.w.x") o1 o2
++    | isFloatOp o1 && not (isFloatOp o2) && isDoubleOp o1 -> op2 (text "\tfmv.d.x") o1 o2
++    | not (isFloatOp o1) && isFloatOp o2 && isSingleOp o2 -> op2 (text "\tfmv.x.w") o1 o2
++    | not (isFloatOp o1) && isFloatOp o2 && isDoubleOp o2 -> op2 (text "\tfmv.x.d") o1 o2
++    | (OpImm (ImmInteger i)) <- o2,
++      fitsIn12bitImm i ->
++        lines_ [text "\taddi" <+> pprOp platform o1 <> comma <+> pprOp platform x0 <> comma <+> pprOp platform o2]
++    | (OpImm (ImmInt i)) <- o2,
++      fitsIn12bitImm i ->
++        lines_ [text "\taddi" <+> pprOp platform o1 <> comma <+> pprOp platform x0 <> comma <+> pprOp platform o2]
++    | (OpImm (ImmInteger i)) <- o2,
++      fitsIn32bits i ->
++        lines_
++          [ text "\tlui" <+> pprOp platform o1 <> comma <+> text "%hi(" <> pprOp platform o2 <> text ")",
++            text "\taddw" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> text "%lo(" <> pprOp platform o2 <> text ")"
++          ]
++    | (OpImm (ImmInt i)) <- o2,
++      fitsIn32bits i ->
++        lines_
++          [ text "\tlui" <+> pprOp platform o1 <> comma <+> text "%hi(" <> pprOp platform o2 <> text ")",
++            text "\taddw" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> text "%lo(" <> pprOp platform o2 <> text ")"
++          ]
++    | isImmOp o2 ->
++        -- Surrender! Let the assembler figure out the right expressions with pseudo-op LI.
++        lines_ [text "\tli" <+> pprOp platform o1 <> comma <+> pprOp platform o2]
++    | otherwise -> op3 (text "\taddi") o1 o2 (OpImm (ImmInt 0))
++  ORI o1 o2 o3 -> op3 (text "\tori") o1 o2 o3
++  XORI o1 o2 o3 -> op3 (text "\txori") o1 o2 o3
++  J_TBL _ _ r -> pprInstr platform (B (TReg r))
++  B l | isLabel l -> line $ text "\tjal" <+> pprOp platform x0 <> comma <+> getLabel platform l
++  B (TReg r) -> line $ text "\tjalr" <+> pprOp platform x0 <> comma <+> pprReg W64 r <> comma <+> text "0"
++  BL r _ -> line $ text "\tjalr" <+> text "x1" <> comma <+> pprReg W64 r <> comma <+> text "0"
++  BCOND c l r t
++    | isLabel t ->
++        line $ text "\t" <> pprBcond c <+> pprOp platform l <> comma <+> pprOp platform r <> comma <+> getLabel platform t
++  BCOND _ _ _ (TReg _) -> panic "RV64.ppr: No conditional branching to registers!"
++  CSET o l r c -> case c of
++    EQ
++      | isIntOp l && isIntOp r ->
++          lines_
++            [ subFor l r,
++              text "\tseqz" <+> pprOp platform o <> comma <+> pprOp platform o
++            ]
++    EQ | isFloatOp l && isFloatOp r -> line $ binOp ("\tfeq." ++ floatOpPrecision platform l r)
++    NE
++      | isIntOp l && isIntOp r ->
++          lines_
++            [ subFor l r,
++              text "\tsnez" <+> pprOp platform o <> comma <+> pprOp platform o
++            ]
++    NE
++      | isFloatOp l && isFloatOp r ->
++          lines_
++            [ binOp ("\tfeq." ++ floatOpPrecision platform l r),
++              text "\txori" <+> pprOp platform o <> comma <+> pprOp platform o <> comma <+> text "1"
++            ]
++    SLT -> lines_ [sltFor l r <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform r]
++    SLE ->
++      lines_
++        [ sltFor l r <+> pprOp platform o <> comma <+> pprOp platform r <> comma <+> pprOp platform l,
++          text "\txori" <+> pprOp platform o <> comma <+> pprOp platform o <> comma <+> text "1"
++        ]
++    SGE ->
++      lines_
++        [ sltFor l r <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform r,
++          text "\txori" <+> pprOp platform o <> comma <+> pprOp platform o <> comma <+> text "1"
++        ]
++    SGT -> lines_ [sltFor l r <+> pprOp platform o <> comma <+> pprOp platform r <> comma <+> pprOp platform l]
++    ULT -> lines_ [sltuFor l r <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform r]
++    ULE ->
++      lines_
++        [ sltuFor l r <+> pprOp platform o <> comma <+> pprOp platform r <> comma <+> pprOp platform l,
++          text "\txori" <+> pprOp platform o <> comma <+> pprOp platform o <> comma <+> text "1"
++        ]
++    UGE ->
++      lines_
++        [ sltuFor l r <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform r,
++          text "\txori" <+> pprOp platform o <> comma <+> pprOp platform o <> comma <+> text "1"
++        ]
++    UGT -> lines_ [sltuFor l r <+> pprOp platform o <> comma <+> pprOp platform r <> comma <+> pprOp platform l]
++    FLT | isFloatOp l && isFloatOp r -> line $ binOp ("\tflt." ++ floatOpPrecision platform l r)
++    FLE | isFloatOp l && isFloatOp r -> line $ binOp ("\tfle." ++ floatOpPrecision platform l r)
++    FGT | isFloatOp l && isFloatOp r -> line $ binOp ("\tfgt." ++ floatOpPrecision platform l r)
++    FGE | isFloatOp l && isFloatOp r -> line $ binOp ("\tfge." ++ floatOpPrecision platform l r)
++    x -> pprPanic "RV64.ppr: unhandled CSET conditional" (text (show x) <+> pprOp platform o <> comma <+> pprOp platform r <> comma <+> pprOp platform l)
++    where
++      subFor l r
++        | (OpImm _) <- r = text "\taddi" <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform (negOp r)
++        | (OpImm _) <- l = panic "RV64.ppr: Cannot SUB IMM _"
++        | otherwise = text "\tsub" <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform r
++      sltFor l r
++        | (OpImm _) <- r = text "\tslti"
++        | (OpImm _) <- l = panic "PV64.ppr: Cannot SLT IMM _"
++        | otherwise = text "\tslt"
++      sltuFor l r
++        | (OpImm _) <- r = text "\tsltui"
++        | (OpImm _) <- l = panic "PV64.ppr: Cannot SLTU IMM _"
++        | otherwise = text "\tsltu"
++      binOp :: (IsLine doc) => String -> doc
++      binOp op = text op <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform r
++  STR II8 o1 o2 -> op2 (text "\tsb") o1 o2
++  STR II16 o1 o2 -> op2 (text "\tsh") o1 o2
++  STR II32 o1 o2 -> op2 (text "\tsw") o1 o2
++  STR II64 o1 o2 -> op2 (text "\tsd") o1 o2
++  STR FF32 o1 o2 -> op2 (text "\tfsw") o1 o2
++  STR FF64 o1 o2 -> op2 (text "\tfsd") o1 o2
++  LDR _f o1 (OpImm (ImmIndex lbl off)) ->
++    lines_
++      [ text "\tla" <+> pprOp platform o1 <> comma <+> pprAsmLabel platform lbl,
++        text "\taddi" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> int off
++      ]
++  LDR _f o1 (OpImm (ImmCLbl lbl)) ->
++    line $ text "\tla" <+> pprOp platform o1 <> comma <+> pprAsmLabel platform lbl
++  LDR II8 o1 o2 -> op2 (text "\tlb") o1 o2
++  LDR II16 o1 o2 -> op2 (text "\tlh") o1 o2
++  LDR II32 o1 o2 -> op2 (text "\tlw") o1 o2
++  LDR II64 o1 o2 -> op2 (text "\tld") o1 o2
++  LDR FF32 o1 o2 -> op2 (text "\tflw") o1 o2
++  LDR FF64 o1 o2 -> op2 (text "\tfld") o1 o2
++  LDRU II8 o1 o2 -> op2 (text "\tlbu") o1 o2
++  LDRU II16 o1 o2 -> op2 (text "\tlhu") o1 o2
++  LDRU II32 o1 o2 -> op2 (text "\tlwu") o1 o2
++  -- double words (64bit) cannot be sign extended by definition
++  LDRU II64 o1 o2 -> op2 (text "\tld") o1 o2
++  LDRU FF32 o1 o2@(OpAddr (AddrReg _)) -> op2 (text "\tflw") o1 o2
++  LDRU FF32 o1 o2@(OpAddr (AddrRegImm _ _)) -> op2 (text "\tflw") o1 o2
++  LDRU FF64 o1 o2@(OpAddr (AddrReg _)) -> op2 (text "\tfld") o1 o2
++  LDRU FF64 o1 o2@(OpAddr (AddrRegImm _ _)) -> op2 (text "\tfld") o1 o2
++  LDRU f o1 o2 -> pprPanic "Unsupported unsigned load" ((text . show) f <+> pprOp platform o1 <+> pprOp platform o2)
++  FENCE r w -> line $ text "\tfence" <+> pprFenceType r <> char ',' <+> pprFenceType w
++  FCVT FloatToFloat o1@(OpReg W32 _) o2@(OpReg W64 _) -> op2 (text "\tfcvt.s.d") o1 o2
++  FCVT FloatToFloat o1@(OpReg W64 _) o2@(OpReg W32 _) -> op2 (text "\tfcvt.d.s") o1 o2
++  FCVT FloatToFloat o1 o2 ->
++    pprPanic "RV64.pprInstr - impossible float to float conversion"
++      $ line (pprOp platform o1 <> text "->" <> pprOp platform o2)
++  FCVT IntToFloat o1@(OpReg W32 _) o2@(OpReg W32 _) -> op2 (text "\tfcvt.s.w") o1 o2
++  FCVT IntToFloat o1@(OpReg W32 _) o2@(OpReg W64 _) -> op2 (text "\tfcvt.s.l") o1 o2
++  FCVT IntToFloat o1@(OpReg W64 _) o2@(OpReg W32 _) -> op2 (text "\tfcvt.d.w") o1 o2
++  FCVT IntToFloat o1@(OpReg W64 _) o2@(OpReg W64 _) -> op2 (text "\tfcvt.d.l") o1 o2
++  FCVT IntToFloat o1 o2 ->
++    pprPanic "RV64.pprInstr - impossible integer to float conversion"
++      $ line (pprOp platform o1 <> text "->" <> pprOp platform o2)
++  FCVT FloatToInt o1@(OpReg W32 _) o2@(OpReg W32 _) -> op2 (text "\tfcvt.w.s") o1 o2
++  FCVT FloatToInt o1@(OpReg W32 _) o2@(OpReg W64 _) -> op2 (text "\tfcvt.w.d") o1 o2
++  FCVT FloatToInt o1@(OpReg W64 _) o2@(OpReg W32 _) -> op2 (text "\tfcvt.l.s") o1 o2
++  FCVT FloatToInt o1@(OpReg W64 _) o2@(OpReg W64 _) -> op2 (text "\tfcvt.l.d") o1 o2
++  FCVT FloatToInt o1 o2 ->
++    pprPanic "RV64.pprInstr - impossible float to integer conversion"
++      $ line (pprOp platform o1 <> text "->" <> pprOp platform o2)
++  FABS o1 o2 | isSingleOp o2 -> op2 (text "\tfabs.s") o1 o2
++  FABS o1 o2 | isDoubleOp o2 -> op2 (text "\tfabs.d") o1 o2
++  FMA variant d r1 r2 r3 ->
++    let fma = case variant of
++          FMAdd -> text "\tfmadd" <> dot <> floatPrecission d
++          FMSub -> text "\tfmsub" <> dot <> floatPrecission d
++          FNMAdd -> text "\tfnmadd" <> dot <> floatPrecission d
++          FNMSub -> text "\tfnmsub" <> dot <> floatPrecission d
++     in op4 fma d r1 r2 r3
++  instr -> panic $ "RV64.pprInstr - Unknown instruction: " ++ instrCon instr
++  where
++    op2 op o1 o2 = line $ op <+> pprOp platform o1 <> comma <+> pprOp platform o2
++    op3 op o1 o2 o3 = line $ op <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
++    op4 op o1 o2 o3 o4 = line $ op <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3 <> comma <+> pprOp platform o4
++    pprFenceType FenceRead = text "r"
++    pprFenceType FenceWrite = text "w"
++    pprFenceType FenceReadWrite = text "rw"
++    floatPrecission o
++      | isSingleOp o = text "s"
++      | isDoubleOp o = text "d"
++      | otherwise = pprPanic "Impossible floating point precission: " (pprOp platform o)
++
++floatOpPrecision :: Platform -> Operand -> Operand -> String
++floatOpPrecision _p l r | isFloatOp l && isFloatOp r && isSingleOp l && isSingleOp r = "s" -- single precision
++floatOpPrecision _p l r | isFloatOp l && isFloatOp r && isDoubleOp l && isDoubleOp r = "d" -- double precision
++floatOpPrecision p l r = pprPanic "Cannot determine floating point precission" (text "op1" <+> pprOp p l <+> text "op2" <+> pprOp p r)
++
++-- | Pretty print a conditional branch
++--
++-- This function is partial and will panic if the conditional is not supported;
++-- i.e. if its floating point related.
++pprBcond :: (IsLine doc) => Cond -> doc
++pprBcond c = text "b" <> pprCond c
++  where
++    pprCond :: (IsLine doc) => Cond -> doc
++    pprCond c = case c of
++      EQ -> text "eq"
++      NE -> text "ne"
++      SLT -> text "lt"
++      SLE -> text "le"
++      SGE -> text "ge"
++      SGT -> text "gt"
++      ULT -> text "ltu"
++      ULE -> text "leu"
++      UGE -> text "geu"
++      UGT -> text "gtu"
++      -- BCOND cannot handle floating point comparisons / registers
++      _ -> panic $ "RV64.ppr: unhandled BCOND conditional: " ++ show c
+Index: ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/RegInfo.hs
+===================================================================
+--- /dev/null
++++ ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/RegInfo.hs
+@@ -0,0 +1,41 @@
++-- | Minimum viable implementation of jump short-cutting: No short-cutting.
++--
++-- The functions here simply implement the no-short-cutting case. Implementing
++-- the real behaviour would be a great optimization in future.
++module GHC.CmmToAsm.RV64.RegInfo
++  ( getJumpDestBlockId,
++    canShortcut,
++    shortcutStatics,
++    shortcutJump,
++    JumpDest (..),
++  )
++where
++
++import GHC.Cmm
++import GHC.Cmm.BlockId
++import GHC.CmmToAsm.RV64.Instr
++import GHC.Prelude
++import GHC.Utils.Outputable
++
++newtype JumpDest = DestBlockId BlockId
++
++instance Outputable JumpDest where
++  ppr (DestBlockId bid) = text "jd<blk>:" <> ppr bid
++
++-- | Extract BlockId
++--
++-- Never `Nothing` for Riscv64 NCG.
++getJumpDestBlockId :: JumpDest -> Maybe BlockId
++getJumpDestBlockId (DestBlockId bid) = Just bid
++
++-- No `Instr`s can bet shortcut (for now)
++canShortcut :: Instr -> Maybe JumpDest
++canShortcut _ = Nothing
++
++-- Identity of the provided `RawCmmStatics`
++shortcutStatics :: (BlockId -> Maybe JumpDest) -> RawCmmStatics -> RawCmmStatics
++shortcutStatics _ other_static = other_static
++
++-- Identity of the provided `Instr`
++shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr
++shortcutJump _ other = other
+Index: ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/Regs.hs
+===================================================================
+--- /dev/null
++++ ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/Regs.hs
+@@ -0,0 +1,230 @@
++module GHC.CmmToAsm.RV64.Regs where
++
++import GHC.Cmm
++import GHC.Cmm.CLabel (CLabel)
++import GHC.CmmToAsm.Format
++import GHC.Data.FastString
++import GHC.Platform
++import GHC.Platform.Reg
++import GHC.Platform.Reg.Class
++import GHC.Platform.Regs
++import GHC.Prelude
++import GHC.Types.Unique
++import GHC.Utils.Outputable
++import GHC.Utils.Panic
++
++-- * Registers
++
++-- | First integer register number. @zero@ register.
++x0RegNo :: RegNo
++x0RegNo = 0
++
++-- | return address register
++x1RegNo, raRegNo :: RegNo
++x1RegNo = 1
++raRegNo = x1RegNo
++
++x5RegNo, t0RegNo :: RegNo
++x5RegNo = 5
++t0RegNo = x5RegNo
++
++x7RegNo, t2RegNo :: RegNo
++x7RegNo = 7
++t2RegNo = x7RegNo
++
++x28RegNo, t3RegNo :: RegNo
++x28RegNo = 28
++t3RegNo = x28RegNo
++
++-- | Last integer register number. Used as TMP (IP) register.
++x31RegNo, t6RegNo, tmpRegNo :: RegNo
++x31RegNo = 31
++t6RegNo = x31RegNo
++tmpRegNo = x31RegNo
++
++-- | First floating point register.
++d0RegNo, ft0RegNo :: RegNo
++d0RegNo = 32
++ft0RegNo = d0RegNo
++
++d7RegNo, ft7RegNo :: RegNo
++d7RegNo = 39
++ft7RegNo = d7RegNo
++
++-- | Last floating point register.
++d31RegNo :: RegNo
++d31RegNo = 63
++
++a0RegNo, x10RegNo :: RegNo
++x10RegNo = 10
++a0RegNo = x10RegNo
++
++a7RegNo, x17RegNo :: RegNo
++x17RegNo = 17
++a7RegNo = x17RegNo
++
++fa0RegNo, d10RegNo :: RegNo
++d10RegNo = 42
++fa0RegNo = d10RegNo
++
++fa7RegNo, d17RegNo :: RegNo
++d17RegNo = 49
++fa7RegNo = d17RegNo
++
++-- Note [The made-up RISCV64 TMP (IP) register]
++-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++--
++-- RISCV64 has no inter-procedural register in its ABI. However, we need one to
++-- make register spills/loads to/from high number slots. I.e. slot numbers that
++-- do not fit in a 12bit integer which is used as immediate in the arithmetic
++-- operations. Thus, we're marking one additional register (x31) as permanently
++-- non-free and call it TMP.
++--
++-- TMP can be used as temporary register in all operations. Just be aware that
++-- it may be clobbered as soon as you loose direct control over it (i.e. using
++-- TMP by-passes the register allocation/spilling mechanisms.) It should be fine
++-- to use it as temporary register in a MachOp translation as long as you don't
++-- rely on its value beyond this limited scope.
++--
++-- X31 is a caller-saved register. I.e. there are no guarantees about what the
++-- callee does with it. That's exactly what we want here.
++
++zeroReg, raReg, spMachReg, tmpReg :: Reg
++zeroReg = regSingle x0RegNo
++raReg = regSingle 1
++
++-- | Not to be confused with the `CmmReg` `spReg`
++spMachReg = regSingle 2
++
++tmpReg = regSingle tmpRegNo
++
++-- | All machine register numbers.
++allMachRegNos :: [RegNo]
++allMachRegNos = intRegs ++ fpRegs
++  where
++    intRegs = [x0RegNo .. x31RegNo]
++    fpRegs = [d0RegNo .. d31RegNo]
++
++-- | Registers available to the register allocator.
++--
++-- These are all registers minus those with a fixed role in RISCV ABI (zero, lr,
++-- sp, gp, tp, fp, tmp) and GHC RTS (Base, Sp, Hp, HpLim, R1..R8, F1..F6,
++-- D1..D6.)
++allocatableRegs :: Platform -> [RealReg]
++allocatableRegs platform =
++  let isFree = freeReg platform
++   in map RealRegSingle $ filter isFree allMachRegNos
++
++-- | Integer argument registers according to the calling convention
++allGpArgRegs :: [Reg]
++allGpArgRegs = map regSingle [a0RegNo .. a7RegNo]
++
++-- | Floating point argument registers according to the calling convention
++allFpArgRegs :: [Reg]
++allFpArgRegs = map regSingle [fa0RegNo .. fa7RegNo]
++
++-- * Addressing modes
++
++-- | Addressing modes
++data AddrMode
++  = -- | A register plus some immediate integer, e.g. @8(sp)@ or @-16(sp)@. The
++    -- offset needs to fit into 12bits.
++    AddrRegImm Reg Imm
++  | -- | A register
++    AddrReg Reg
++  deriving (Eq, Show)
++
++-- * Immediates
++
++data Imm
++  = ImmInt Int
++  | ImmInteger Integer -- Sigh.
++  | ImmCLbl CLabel -- AbstractC Label (with baggage)
++  | ImmLit FastString
++  | ImmIndex CLabel Int
++  | ImmFloat Rational
++  | ImmDouble Rational
++  | ImmConstantSum Imm Imm
++  | ImmConstantDiff Imm Imm
++  deriving (Eq, Show)
++
++-- | Map `CmmLit` to `Imm`
++--
++-- N.B. this is a partial function, because not all `CmmLit`s have an immediate
++-- representation.
++litToImm :: CmmLit -> Imm
++litToImm (CmmInt i w) = ImmInteger (narrowS w i)
++-- narrow to the width: a CmmInt might be out of
++-- range, but we assume that ImmInteger only contains
++-- in-range values.  A signed value should be fine here.
++litToImm (CmmFloat f W32) = ImmFloat f
++litToImm (CmmFloat f W64) = ImmDouble f
++litToImm (CmmLabel l) = ImmCLbl l
++litToImm (CmmLabelOff l off) = ImmIndex l off
++litToImm (CmmLabelDiffOff l1 l2 off _) =
++  ImmConstantSum
++    (ImmConstantDiff (ImmCLbl l1) (ImmCLbl l2))
++    (ImmInt off)
++litToImm l = panic $ "RV64.Regs.litToImm: no match for " ++ show l
++
++-- == To satisfy GHC.CmmToAsm.Reg.Target =======================================
++
++-- squeese functions for the graph allocator -----------------------------------
++
++-- | regSqueeze_class reg
++--      Calculate the maximum number of register colors that could be
++--      denied to a node of this class due to having this reg
++--      as a neighbour.
++{-# INLINE virtualRegSqueeze #-}
++virtualRegSqueeze :: RegClass -> VirtualReg -> Int
++virtualRegSqueeze cls vr =
++  case cls of
++    RcInteger ->
++      case vr of
++        VirtualRegI {} -> 1
++        VirtualRegHi {} -> 1
++        _other -> 0
++    RcDouble ->
++      case vr of
++        VirtualRegD {} -> 1
++        VirtualRegF {} -> 0
++        _other -> 0
++    _other -> 0
++
++{-# INLINE realRegSqueeze #-}
++realRegSqueeze :: RegClass -> RealReg -> Int
++realRegSqueeze cls rr =
++  case cls of
++    RcInteger ->
++      case rr of
++        RealRegSingle regNo
++          | regNo < d0RegNo -> 1
++          | otherwise -> 0
++    RcDouble ->
++      case rr of
++        RealRegSingle regNo
++          | regNo < d0RegNo -> 0
++          | otherwise -> 1
++    _other -> 0
++
++mkVirtualReg :: Unique -> Format -> VirtualReg
++mkVirtualReg u format
++  | not (isFloatFormat format) = VirtualRegI u
++  | otherwise =
++      case format of
++        FF32 -> VirtualRegD u
++        FF64 -> VirtualRegD u
++        _ -> panic "RV64.mkVirtualReg"
++
++{-# INLINE classOfRealReg #-}
++classOfRealReg :: RealReg -> RegClass
++classOfRealReg (RealRegSingle i)
++  | i < d0RegNo = RcInteger
++  | otherwise = RcDouble
++
++regDotColor :: RealReg -> SDoc
++regDotColor reg =
++  case classOfRealReg reg of
++    RcInteger -> text "blue"
++    RcFloat -> text "red"
++    RcDouble -> text "green"
+Index: ghc-9.10.1/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs
+===================================================================
+--- ghc-9.10.1.orig/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs
++++ ghc-9.10.1/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs
+@@ -119,7 +119,7 @@ trivColorable platform virtualRegSqueeze
+                             ArchMipseb    -> panic "trivColorable ArchMipseb"
+                             ArchMipsel    -> panic "trivColorable ArchMipsel"
+                             ArchS390X     -> panic "trivColorable ArchS390X"
+-                            ArchRISCV64   -> panic "trivColorable ArchRISCV64"
++                            ArchRISCV64   -> 14
+                             ArchLoongArch64->panic "trivColorable ArchLoongArch64"
+                             ArchJavaScript-> panic "trivColorable ArchJavaScript"
+                             ArchWasm32    -> panic "trivColorable ArchWasm32"
+@@ -154,7 +154,7 @@ trivColorable platform virtualRegSqueeze
+                             ArchMipseb    -> panic "trivColorable ArchMipseb"
+                             ArchMipsel    -> panic "trivColorable ArchMipsel"
+                             ArchS390X     -> panic "trivColorable ArchS390X"
+-                            ArchRISCV64   -> panic "trivColorable ArchRISCV64"
++                            ArchRISCV64   -> 0
+                             ArchLoongArch64->panic "trivColorable ArchLoongArch64"
+                             ArchJavaScript-> panic "trivColorable ArchJavaScript"
+                             ArchWasm32    -> panic "trivColorable ArchWasm32"
+@@ -188,7 +188,7 @@ trivColorable platform virtualRegSqueeze
+                             ArchMipseb    -> panic "trivColorable ArchMipseb"
+                             ArchMipsel    -> panic "trivColorable ArchMipsel"
+                             ArchS390X     -> panic "trivColorable ArchS390X"
+-                            ArchRISCV64   -> panic "trivColorable ArchRISCV64"
++                            ArchRISCV64   -> 20
+                             ArchLoongArch64->panic "trivColorable ArchLoongArch64"
+                             ArchJavaScript-> panic "trivColorable ArchJavaScript"
+                             ArchWasm32    -> panic "trivColorable ArchWasm32"
+Index: ghc-9.10.1/compiler/GHC/CmmToAsm/Reg/Linear.hs
+===================================================================
+--- ghc-9.10.1.orig/compiler/GHC/CmmToAsm/Reg/Linear.hs
++++ ghc-9.10.1/compiler/GHC/CmmToAsm/Reg/Linear.hs
+@@ -112,6 +112,7 @@ import qualified GHC.CmmToAsm.Reg.Linear
+ import qualified GHC.CmmToAsm.Reg.Linear.X86     as X86
+ import qualified GHC.CmmToAsm.Reg.Linear.X86_64  as X86_64
+ import qualified GHC.CmmToAsm.Reg.Linear.AArch64 as AArch64
++import qualified GHC.CmmToAsm.Reg.Linear.RV64    as RV64
+ import GHC.CmmToAsm.Reg.Target
+ import GHC.CmmToAsm.Reg.Liveness
+ import GHC.CmmToAsm.Reg.Utils
+@@ -221,7 +222,7 @@ linearRegAlloc config entry_ids block_li
+       ArchAlpha      -> panic "linearRegAlloc ArchAlpha"
+       ArchMipseb     -> panic "linearRegAlloc ArchMipseb"
+       ArchMipsel     -> panic "linearRegAlloc ArchMipsel"
+-      ArchRISCV64    -> panic "linearRegAlloc ArchRISCV64"
++      ArchRISCV64    -> go (frInitFreeRegs platform :: RV64.FreeRegs)
+       ArchLoongArch64-> panic "linearRegAlloc ArchLoongArch64"
+       ArchJavaScript -> panic "linearRegAlloc ArchJavaScript"
+       ArchWasm32     -> panic "linearRegAlloc ArchWasm32"
+Index: ghc-9.10.1/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs
+===================================================================
+--- ghc-9.10.1.orig/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs
++++ ghc-9.10.1/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs
+@@ -29,10 +29,12 @@ import qualified GHC.CmmToAsm.Reg.Linear
+ import qualified GHC.CmmToAsm.Reg.Linear.X86     as X86
+ import qualified GHC.CmmToAsm.Reg.Linear.X86_64  as X86_64
+ import qualified GHC.CmmToAsm.Reg.Linear.AArch64 as AArch64
++import qualified GHC.CmmToAsm.Reg.Linear.RV64    as RV64
+ 
+ import qualified GHC.CmmToAsm.PPC.Instr     as PPC.Instr
+ import qualified GHC.CmmToAsm.X86.Instr     as X86.Instr
+ import qualified GHC.CmmToAsm.AArch64.Instr as AArch64.Instr
++import qualified GHC.CmmToAsm.RV64.Instr    as RV64.Instr
+ 
+ class Show freeRegs => FR freeRegs where
+     frAllocateReg :: Platform -> RealReg -> freeRegs -> freeRegs
+@@ -64,6 +66,12 @@ instance FR AArch64.FreeRegs where
+     frInitFreeRegs = AArch64.initFreeRegs
+     frReleaseReg = \_ -> AArch64.releaseReg
+ 
++instance FR RV64.FreeRegs where
++    frAllocateReg = const RV64.allocateReg
++    frGetFreeRegs = const RV64.getFreeRegs
++    frInitFreeRegs = RV64.initFreeRegs
++    frReleaseReg = const RV64.releaseReg
++
+ maxSpillSlots :: NCGConfig -> Int
+ maxSpillSlots config = case platformArch (ncgPlatform config) of
+    ArchX86       -> X86.Instr.maxSpillSlots config
+@@ -76,7 +84,7 @@ maxSpillSlots config = case platformArch
+    ArchAlpha     -> panic "maxSpillSlots ArchAlpha"
+    ArchMipseb    -> panic "maxSpillSlots ArchMipseb"
+    ArchMipsel    -> panic "maxSpillSlots ArchMipsel"
+-   ArchRISCV64   -> panic "maxSpillSlots ArchRISCV64"
++   ArchRISCV64   -> RV64.Instr.maxSpillSlots config
+    ArchLoongArch64->panic "maxSpillSlots ArchLoongArch64"
+    ArchJavaScript-> panic "maxSpillSlots ArchJavaScript"
+    ArchWasm32    -> panic "maxSpillSlots ArchWasm32"
+Index: ghc-9.10.1/compiler/GHC/CmmToAsm/Reg/Linear/RV64.hs
+===================================================================
+--- /dev/null
++++ ghc-9.10.1/compiler/GHC/CmmToAsm/Reg/Linear/RV64.hs
+@@ -0,0 +1,96 @@
++-- | Functions to implement the @FR@ (as in "free regs") type class.
++--
++-- For LLVM GHC calling convention (used registers), see
++-- https://github.com/llvm/llvm-project/blob/6ab900f8746e7d8e24afafb5886a40801f6799f4/llvm/lib/Target/RISCV/RISCVISelLowering.cpp#L13638-L13685
++module GHC.CmmToAsm.Reg.Linear.RV64
++  ( allocateReg,
++    getFreeRegs,
++    initFreeRegs,
++    releaseReg,
++    FreeRegs (..),
++  )
++where
++
++import Data.Word
++import GHC.CmmToAsm.RV64.Regs
++import GHC.Platform
++import GHC.Platform.Reg
++import GHC.Platform.Reg.Class
++import GHC.Prelude
++import GHC.Stack
++import GHC.Utils.Outputable
++import GHC.Utils.Panic
++
++-- | Bitmaps to indicate which registers are free (currently unused)
++--
++-- The bit index represents the `RegNo`, in case of floating point registers
++-- with an offset of 32. The register is free when the bit is set.
++data FreeRegs
++  = FreeRegs
++      -- | integer/general purpose registers (`RcInteger`)
++      !Word32
++      -- | floating point registers (`RcDouble`)
++      !Word32
++
++instance Show FreeRegs where
++  show (FreeRegs g f) = "FreeRegs 0b" ++ showBits g ++ " 0b" ++ showBits f
++
++-- | Show bits as a `String` of @1@s and @0@s
++showBits :: Word32 -> String
++showBits w = map (\i -> if testBit w i then '1' else '0') [0 .. 31]
++
++instance Outputable FreeRegs where
++  ppr (FreeRegs g f) =
++    text "   "
++      <+> foldr (\i x -> pad_int i <+> x) (text "") [0 .. 31]
++      $$ text "GPR"
++      <+> foldr (\i x -> show_bit g i <+> x) (text "") [0 .. 31]
++      $$ text "FPR"
++      <+> foldr (\i x -> show_bit f i <+> x) (text "") [0 .. 31]
++    where
++      pad_int i | i < 10 = char ' ' <> int i
++      pad_int i = int i
++      -- remember bit = 1 means it's available.
++      show_bit bits bit | testBit bits bit = text "  "
++      show_bit _ _ = text " x"
++
++-- | Set bits of all allocatable registers to 1
++initFreeRegs :: Platform -> FreeRegs
++initFreeRegs platform = foldl' (flip releaseReg) noFreeRegs (allocatableRegs platform)
++  where
++    noFreeRegs :: FreeRegs
++    noFreeRegs = FreeRegs 0 0
++
++-- | Get all free `RealReg`s (i.e. those where the corresponding bit is 1)
++getFreeRegs :: RegClass -> FreeRegs -> [RealReg]
++getFreeRegs cls (FreeRegs g f)
++  | RcFloat <- cls = [] -- For now we only support double and integer registers, floats will need to be promoted.
++  | RcDouble <- cls = go 32 f allocatableDoubleRegs
++  | RcInteger <- cls = go 0 g allocatableIntRegs
++  where
++    go _ _ [] = []
++    go off x (i : is)
++      | testBit x i = RealRegSingle (off + i) : (go off x $! is)
++      | otherwise = go off x $! is
++    -- The lists of allocatable registers are manually crafted: Register
++    -- allocation is pretty hot code. We don't want to iterate and map like
++    -- `initFreeRegs` all the time! (The register mappings aren't supposed to
++    -- change often.)
++    allocatableIntRegs = [5 .. 7] ++ [10 .. 17] ++ [28 .. 30]
++    allocatableDoubleRegs = [0 .. 7] ++ [10 .. 17] ++ [28 .. 31]
++
++-- | Set corresponding register bit to 0
++allocateReg :: (HasCallStack) => RealReg -> FreeRegs -> FreeRegs
++allocateReg (RealRegSingle r) (FreeRegs g f)
++  | r > 31 && testBit f (r - 32) = FreeRegs g (clearBit f (r - 32))
++  | r < 32 && testBit g r = FreeRegs (clearBit g r) f
++  | r > 31 = panic $ "Linear.RV64.allocReg: double allocation of float reg v" ++ show (r - 32) ++ "; " ++ showBits f
++  | otherwise = pprPanic "Linear.RV64.allocReg" $ text ("double allocation of gp reg x" ++ show r ++ "; " ++ showBits g)
++
++-- | Set corresponding register bit to 1
++releaseReg :: (HasCallStack) => RealReg -> FreeRegs -> FreeRegs
++releaseReg (RealRegSingle r) (FreeRegs g f)
++  | r > 31 && testBit f (r - 32) = pprPanic "Linear.RV64.releaseReg" (text "can't release non-allocated reg v" <> int (r - 32))
++  | r < 32 && testBit g r = pprPanic "Linear.RV64.releaseReg" (text "can't release non-allocated reg x" <> int r)
++  | r > 31 = FreeRegs g (setBit f (r - 32))
++  | otherwise = FreeRegs (setBit g r) f
+Index: ghc-9.10.1/compiler/GHC/CmmToAsm/Reg/Target.hs
+===================================================================
+--- ghc-9.10.1.orig/compiler/GHC/CmmToAsm/Reg/Target.hs
++++ ghc-9.10.1/compiler/GHC/CmmToAsm/Reg/Target.hs
+@@ -34,7 +34,7 @@ import qualified GHC.CmmToAsm.X86.Regs
+ import qualified GHC.CmmToAsm.X86.RegInfo    as X86
+ import qualified GHC.CmmToAsm.PPC.Regs       as PPC
+ import qualified GHC.CmmToAsm.AArch64.Regs   as AArch64
+-
++import qualified GHC.CmmToAsm.RV64.Regs   as RV64
+ 
+ targetVirtualRegSqueeze :: Platform -> RegClass -> VirtualReg -> Int
+ targetVirtualRegSqueeze platform
+@@ -49,7 +49,7 @@ targetVirtualRegSqueeze platform
+       ArchAlpha     -> panic "targetVirtualRegSqueeze ArchAlpha"
+       ArchMipseb    -> panic "targetVirtualRegSqueeze ArchMipseb"
+       ArchMipsel    -> panic "targetVirtualRegSqueeze ArchMipsel"
+-      ArchRISCV64   -> panic "targetVirtualRegSqueeze ArchRISCV64"
++      ArchRISCV64   -> RV64.virtualRegSqueeze
+       ArchLoongArch64->panic "targetVirtualRegSqueeze ArchLoongArch64"
+       ArchJavaScript-> panic "targetVirtualRegSqueeze ArchJavaScript"
+       ArchWasm32    -> panic "targetVirtualRegSqueeze ArchWasm32"
+@@ -69,7 +69,7 @@ targetRealRegSqueeze platform
+       ArchAlpha     -> panic "targetRealRegSqueeze ArchAlpha"
+       ArchMipseb    -> panic "targetRealRegSqueeze ArchMipseb"
+       ArchMipsel    -> panic "targetRealRegSqueeze ArchMipsel"
+-      ArchRISCV64   -> panic "targetRealRegSqueeze ArchRISCV64"
++      ArchRISCV64   -> RV64.realRegSqueeze
+       ArchLoongArch64->panic "targetRealRegSqueeze ArchLoongArch64"
+       ArchJavaScript-> panic "targetRealRegSqueeze ArchJavaScript"
+       ArchWasm32    -> panic "targetRealRegSqueeze ArchWasm32"
+@@ -88,7 +88,7 @@ targetClassOfRealReg platform
+       ArchAlpha     -> panic "targetClassOfRealReg ArchAlpha"
+       ArchMipseb    -> panic "targetClassOfRealReg ArchMipseb"
+       ArchMipsel    -> panic "targetClassOfRealReg ArchMipsel"
+-      ArchRISCV64   -> panic "targetClassOfRealReg ArchRISCV64"
++      ArchRISCV64   -> RV64.classOfRealReg
+       ArchLoongArch64->panic "targetClassOfRealReg ArchLoongArch64"
+       ArchJavaScript-> panic "targetClassOfRealReg ArchJavaScript"
+       ArchWasm32    -> panic "targetClassOfRealReg ArchWasm32"
+@@ -107,7 +107,7 @@ targetMkVirtualReg platform
+       ArchAlpha     -> panic "targetMkVirtualReg ArchAlpha"
+       ArchMipseb    -> panic "targetMkVirtualReg ArchMipseb"
+       ArchMipsel    -> panic "targetMkVirtualReg ArchMipsel"
+-      ArchRISCV64   -> panic "targetMkVirtualReg ArchRISCV64"
++      ArchRISCV64   -> RV64.mkVirtualReg
+       ArchLoongArch64->panic "targetMkVirtualReg ArchLoongArch64"
+       ArchJavaScript-> panic "targetMkVirtualReg ArchJavaScript"
+       ArchWasm32    -> panic "targetMkVirtualReg ArchWasm32"
+@@ -126,7 +126,7 @@ targetRegDotColor platform
+       ArchAlpha     -> panic "targetRegDotColor ArchAlpha"
+       ArchMipseb    -> panic "targetRegDotColor ArchMipseb"
+       ArchMipsel    -> panic "targetRegDotColor ArchMipsel"
+-      ArchRISCV64   -> panic "targetRegDotColor ArchRISCV64"
++      ArchRISCV64   -> RV64.regDotColor
+       ArchLoongArch64->panic "targetRegDotColor ArchLoongArch64"
+       ArchJavaScript-> panic "targetRegDotColor ArchJavaScript"
+       ArchWasm32    -> panic "targetRegDotColor ArchWasm32"
+Index: ghc-9.10.1/compiler/GHC/Driver/Backend.hs
+===================================================================
+--- ghc-9.10.1.orig/compiler/GHC/Driver/Backend.hs
++++ ghc-9.10.1/compiler/GHC/Driver/Backend.hs
+@@ -213,6 +213,7 @@ platformNcgSupported platform = if
+          ArchPPC_64 {} -> True
+          ArchAArch64   -> True
+          ArchWasm32    -> True
++         ArchRISCV64   -> True
+          _             -> False
+ 
+ -- | Is the platform supported by the JS backend?
+Index: ghc-9.10.1/compiler/GHC/Driver/DynFlags.hs
+===================================================================
+--- ghc-9.10.1.orig/compiler/GHC/Driver/DynFlags.hs
++++ ghc-9.10.1/compiler/GHC/Driver/DynFlags.hs
+@@ -1325,6 +1325,7 @@ default_PIC platform =
+     (OSDarwin,  ArchAArch64) -> [Opt_PIC]
+     (OSLinux,   ArchAArch64) -> [Opt_PIC, Opt_ExternalDynamicRefs]
+     (OSLinux,   ArchARM {})  -> [Opt_PIC, Opt_ExternalDynamicRefs]
++    (OSLinux,   ArchRISCV64 {}) -> [Opt_PIC, Opt_ExternalDynamicRefs]
+     (OSOpenBSD, ArchX86_64)  -> [Opt_PIC] -- Due to PIE support in
+                                          -- OpenBSD since 5.3 release
+                                          -- (1 May 2013) we need to
+Index: ghc-9.10.1/compiler/GHC/Platform.hs
+===================================================================
+--- ghc-9.10.1.orig/compiler/GHC/Platform.hs
++++ ghc-9.10.1/compiler/GHC/Platform.hs
+@@ -250,7 +250,6 @@ platformHasRTSLinker p = case archOS_arc
+   ArchPPC_64 ELF_V1 -> False -- powerpc64
+   ArchPPC_64 ELF_V2 -> False -- powerpc64le
+   ArchS390X         -> False
+-  ArchRISCV64       -> False
+   ArchLoongArch64   -> False
+   ArchJavaScript    -> False
+   ArchWasm32        -> False
+Index: ghc-9.10.1/compiler/ghc.cabal.in
+===================================================================
+--- ghc-9.10.1.orig/compiler/ghc.cabal.in
++++ ghc-9.10.1/compiler/ghc.cabal.in
+@@ -290,6 +290,7 @@ Library
+         GHC.CmmToAsm.Reg.Linear.FreeRegs
+         GHC.CmmToAsm.Reg.Linear.JoinToTargets
+         GHC.CmmToAsm.Reg.Linear.PPC
++        GHC.CmmToAsm.Reg.Linear.RV64
+         GHC.CmmToAsm.Reg.Linear.StackMap
+         GHC.CmmToAsm.Reg.Linear.State
+         GHC.CmmToAsm.Reg.Linear.Stats
+@@ -298,6 +299,13 @@ Library
+         GHC.CmmToAsm.Reg.Liveness
+         GHC.CmmToAsm.Reg.Target
+         GHC.CmmToAsm.Reg.Utils
++        GHC.CmmToAsm.RV64
++        GHC.CmmToAsm.RV64.CodeGen
++        GHC.CmmToAsm.RV64.Cond
++        GHC.CmmToAsm.RV64.Instr
++        GHC.CmmToAsm.RV64.Ppr
++        GHC.CmmToAsm.RV64.RegInfo
++        GHC.CmmToAsm.RV64.Regs
+         GHC.CmmToAsm.Types
+         GHC.CmmToAsm.Utils
+         GHC.CmmToAsm.X86
+Index: ghc-9.10.1/hadrian/bindist/config.mk.in
+===================================================================
+--- ghc-9.10.1.orig/hadrian/bindist/config.mk.in
++++ ghc-9.10.1/hadrian/bindist/config.mk.in
+@@ -152,7 +152,7 @@ GhcWithSMP := $(strip $(if $(filter YESN
+ # Whether to include GHCi in the compiler.  Depends on whether the RTS linker
+ # has support for this OS/ARCH combination.
+ OsSupportsGHCi=$(strip $(patsubst $(TargetOS_CPP), YES, $(findstring $(TargetOS_CPP), mingw32 linux solaris2 freebsd dragonfly netbsd openbsd darwin kfreebsdgnu)))
+-ArchSupportsGHCi=$(strip $(patsubst $(TargetArch_CPP), YES, $(findstring $(TargetArch_CPP), i386 x86_64 powerpc powerpc64 powerpc64le sparc sparc64 arm aarch64)))
++ArchSupportsGHCi=$(strip $(patsubst $(TargetArch_CPP), YES, $(findstring $(TargetArch_CPP), i386 x86_64 powerpc powerpc64 powerpc64le sparc sparc64 arm aarch64 riscv64)))
+ 
+ ifeq "$(OsSupportsGHCi)$(ArchSupportsGHCi)" "YESYES"
+ GhcWithInterpreter=YES
+Index: ghc-9.10.1/hadrian/src/Settings/Builders/RunTest.hs
+===================================================================
+--- ghc-9.10.1.orig/hadrian/src/Settings/Builders/RunTest.hs
++++ ghc-9.10.1/hadrian/src/Settings/Builders/RunTest.hs
+@@ -118,7 +118,7 @@ inTreeCompilerArgs stg = do
+ 
+     os          <- queryHostTarget queryOS
+     arch        <- queryTargetTarget queryArch
+-    let codegen_arches = ["x86_64", "i386", "powerpc", "powerpc64", "powerpc64le", "aarch64", "wasm32"]
++    let codegen_arches = ["x86_64", "i386", "powerpc", "powerpc64", "powerpc64le", "aarch64", "wasm32", "riscv64"]
+     let withNativeCodeGen
+           | unregisterised = False
+           | arch `elem` codegen_arches = True
+@@ -139,7 +139,7 @@ inTreeCompilerArgs stg = do
+     -- For this information, we need to query ghc --info, however, that would
+     -- require building ghc, which we don't want to do here. Therefore, the
+     -- logic from `platformHasRTSLinker` is duplicated here.
+-    let rtsLinker = not $ arch `elem` ["powerpc", "powerpc64", "powerpc64le", "s390x", "riscv64", "loongarch64", "javascript", "wasm32"]
++    let rtsLinker = not $ arch `elem` ["powerpc", "powerpc64", "powerpc64le", "s390x", "loongarch64", "javascript", "wasm32"]
+ 
+     return TestCompilerArgs{..}
+ 
+Index: ghc-9.10.1/rts/LinkerInternals.h
+===================================================================
+--- ghc-9.10.1.orig/rts/LinkerInternals.h
++++ ghc-9.10.1/rts/LinkerInternals.h
+@@ -208,7 +208,7 @@ typedef struct _Segment {
+     int n_sections;
+ } Segment;
+ 
+-#if defined(powerpc_HOST_ARCH) || defined(x86_64_HOST_ARCH) || defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH)
++#if defined(powerpc_HOST_ARCH) || defined(x86_64_HOST_ARCH) || defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH) || defined(riscv64_HOST_ARCH)
+ #define NEED_SYMBOL_EXTRAS 1
+ #endif
+ 
+@@ -220,8 +220,9 @@ typedef struct _Segment {
+ #define NEED_M32 1
+ #endif
+ 
+-/* Jump Islands are sniplets of machine code required for relative
+- * address relocations on the PowerPC, x86_64 and ARM.
++/* Jump Islands are sniplets of machine code required for relative address
++ * relocations on the PowerPC, x86_64 and ARM. On RISCV64 we use symbolextras
++ * like a GOT for locals where SymbolExtra represents one entry.
+  */
+ typedef struct {
+ #if defined(powerpc_HOST_ARCH)
+@@ -237,6 +238,8 @@ typedef struct {
+     uint8_t     jumpIsland[8];
+ #elif defined(arm_HOST_ARCH)
+     uint8_t     jumpIsland[16];
++#elif defined(riscv64_HOST_ARCH)
++    uint64_t    addr;
+ #endif
+ } SymbolExtra;
+ 
+Index: ghc-9.10.1/rts/RtsSymbols.c
+===================================================================
+--- ghc-9.10.1.orig/rts/RtsSymbols.c
++++ ghc-9.10.1/rts/RtsSymbols.c
+@@ -980,6 +980,17 @@ extern char **environ;
+ #define RTS_LIBGCC_SYMBOLS
+ #endif
+ 
++#if defined(riscv64_HOST_ARCH)
++// See https://gcc.gnu.org/onlinedocs/gccint/Integer-library-routines.html as
++// reference for the following built-ins. __clzdi2 and __ctzdi2 probably relate
++// to __builtin-s in libraries/ghc-prim/cbits/ctz.c.
++#define RTS_ARCH_LIBGCC_SYMBOLS \
++  SymI_NeedsProto(__clzdi2) \
++  SymI_NeedsProto(__ctzdi2)
++#else
++#define RTS_ARCH_LIBGCC_SYMBOLS
++#endif
++
+ // Symbols defined by libgcc/compiler-rt for AArch64's outline atomics.
+ #if defined(HAVE_ARM_OUTLINE_ATOMICS)
+ #include "ARMOutlineAtomicsSymbols.h"
+@@ -1032,6 +1043,7 @@ RTS_DARWIN_ONLY_SYMBOLS
+ RTS_OPENBSD_ONLY_SYMBOLS
+ RTS_LIBC_SYMBOLS
+ RTS_LIBGCC_SYMBOLS
++RTS_ARCH_LIBGCC_SYMBOLS
+ RTS_FINI_ARRAY_SYMBOLS
+ RTS_LIBFFI_SYMBOLS
+ RTS_ARM_OUTLINE_ATOMIC_SYMBOLS
+@@ -1074,6 +1086,7 @@ RtsSymbolVal rtsSyms[] = {
+       RTS_DARWIN_ONLY_SYMBOLS
+       RTS_OPENBSD_ONLY_SYMBOLS
+       RTS_LIBGCC_SYMBOLS
++      RTS_ARCH_LIBGCC_SYMBOLS
+       RTS_FINI_ARRAY_SYMBOLS
+       RTS_LIBFFI_SYMBOLS
+       RTS_ARM_OUTLINE_ATOMIC_SYMBOLS
+Index: ghc-9.10.1/rts/adjustor/LibffiAdjustor.c
+===================================================================
+--- ghc-9.10.1.orig/rts/adjustor/LibffiAdjustor.c
++++ ghc-9.10.1/rts/adjustor/LibffiAdjustor.c
+@@ -12,6 +12,7 @@
+ #include "Adjustor.h"
+ 
+ #include "rts/ghc_ffi.h"
++#include <stdint.h>
+ #include <string.h>
+ 
+ // Note that ffi_alloc_prep_closure is a non-standard libffi closure
+@@ -187,5 +188,21 @@ createAdjustor (int cconv,
+         barf("createAdjustor: failed to allocate memory");
+     }
+ 
+-    return (void*)code;
++#if defined(riscv64_HOST_ARCH)
++    // Synchronize the memory and instruction cache to prevent illegal
++    // instruction exceptions.
++
++    // We expect two instructions for address loading, one for the jump.
++    int instrCount = 3;
++    // On Linux the parameters of __builtin___clear_cache are currently unused.
++    // Add them anyways for future compatibility. (I.e. the parameters couldn't
++    // be checked during development.)
++    // TODO: Check the upper boundary e.g. with a debugger.
++    __builtin___clear_cache((void *)code,
++                            (void *)((uint64_t *) code + instrCount));
++    // Memory barrier to ensure nothing circumvents the fence.i / cache flush.
++    SEQ_CST_FENCE();
++#endif
++
++    return (void *)code;
+ }
+Index: ghc-9.10.1/rts/linker/Elf.c
+===================================================================
+--- ghc-9.10.1.orig/rts/linker/Elf.c
++++ ghc-9.10.1/rts/linker/Elf.c
+@@ -103,7 +103,8 @@
+ 
+ #include "elf_got.h"
+ 
+-#if defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH)
++#if defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH) || defined (riscv64_HOST_ARCH)
++#  define NEED_GOT
+ #  define NEED_PLT
+ #  include "elf_plt.h"
+ #  include "elf_reloc.h"
+@@ -430,10 +431,7 @@ ocVerifyImage_ELF ( ObjectCode* oc )
+       case EM_AARCH64: IF_DEBUG(linker,debugBelch( "aarch64" )); break;
+ #endif
+ #if defined(EM_RISCV)
+-      case EM_RISCV:  IF_DEBUG(linker,debugBelch( "riscv" ));
+-          errorBelch("%s: RTS linker not implemented on riscv",
+-                     oc->fileName);
+-          return 0;
++      case EM_RISCV:  IF_DEBUG(linker,debugBelch( "riscv" )); break;
+ #endif
+ #if defined(EM_LOONGARCH)
+       case EM_LOONGARCH:  IF_DEBUG(linker,debugBelch( "loongarch64" ));
+@@ -1130,9 +1128,10 @@ end:
+    return result;
+ }
+ 
+-// the aarch64 linker uses relocacteObjectCodeAarch64,
+-// see elf_reloc_aarch64.{h,c}
+-#if !defined(aarch64_HOST_ARCH)
++// the aarch64 and riscv64 linkers use relocateObjectCodeAarch64() and
++// relocateObjectCodeRISCV64() (respectively), see elf_reloc_aarch64.{h,c} and
++// elf_reloc_riscv64.{h,c}
++#if !defined(aarch64_HOST_ARCH) && !defined(riscv64_HOST_ARCH)
+ 
+ /* Do ELF relocations which lack an explicit addend.  All x86-linux
+    and arm-linux relocations appear to be of this form. */
+@@ -1359,7 +1358,7 @@ do_Elf_Rel_relocations ( ObjectCode* oc,
+                /* try to locate an existing stub for this target */
+                if(findStub(&oc->sections[target_shndx], (void**)&S, 0)) {
+                    /* didn't find any. Need to create one */
+-                   if(makeStub(&oc->sections[target_shndx], (void**)&S, 0)) {
++                   if(makeStub(&oc->sections[target_shndx], (void**)&S, NULL, 0)) {
+                        errorBelch("Unable to create veneer for ARM_CALL\n");
+                        return 0;
+                    }
+@@ -1451,7 +1450,7 @@ do_Elf_Rel_relocations ( ObjectCode* oc,
+                /* try to locate an existing stub for this target */
+                if(findStub(&oc->sections[target_shndx], (void**)&S, 1)) {
+                    /* didn't find any. Need to create one */
+-                   if(makeStub(&oc->sections[target_shndx], (void**)&S, 1)) {
++                   if(makeStub(&oc->sections[target_shndx], (void**)&S, NULL, 1)) {
+                        errorBelch("Unable to create veneer for ARM_THM_CALL\n");
+                        return 0;
+                    }
+@@ -1991,7 +1990,7 @@ ocResolve_ELF ( ObjectCode* oc )
+     (void) shnum;
+     (void) shdr;
+ 
+-#if defined(aarch64_HOST_ARCH)
++#if defined(aarch64_HOST_ARCH) || defined(riscv64_HOST_ARCH)
+     /* use new relocation design */
+     if(relocateObjectCode( oc ))
+         return 0;
+@@ -2014,6 +2013,9 @@ ocResolve_ELF ( ObjectCode* oc )
+ 
+ #if defined(powerpc_HOST_ARCH)
+     ocFlushInstructionCache( oc );
++#elif defined(riscv64_HOST_ARCH)
++    /* New-style pseudo-polymorph (by architecture) call */
++    flushInstructionCache( oc );
+ #endif
+ 
+     return ocMprotect_Elf(oc);
+Index: ghc-9.10.1/rts/linker/ElfTypes.h
+===================================================================
+--- ghc-9.10.1.orig/rts/linker/ElfTypes.h
++++ ghc-9.10.1/rts/linker/ElfTypes.h
+@@ -150,6 +150,7 @@ typedef
+ struct _Stub {
+     void * addr;
+     void * target;
++    void* got_addr;
+     /* flags can hold architecture specific information they are used during
+      * lookup of stubs as well. Thus two stubs for the same target with
+      * different flags are considered unequal.
+Index: ghc-9.10.1/rts/linker/SymbolExtras.c
+===================================================================
+--- ghc-9.10.1.orig/rts/linker/SymbolExtras.c
++++ ghc-9.10.1/rts/linker/SymbolExtras.c
+@@ -153,7 +153,7 @@ void ocProtectExtras(ObjectCode* oc)
+ }
+ 
+ 
+-#if defined(powerpc_HOST_ARCH) || defined(x86_64_HOST_ARCH)
++#if defined(powerpc_HOST_ARCH) || defined(x86_64_HOST_ARCH) || defined(riscv64_HOST_ARCH)
+ SymbolExtra* makeSymbolExtra( ObjectCode const* oc,
+                               unsigned long symbolNumber,
+                               unsigned long target )
+@@ -189,9 +189,12 @@ SymbolExtra* makeSymbolExtra( ObjectCode
+     extra->addr = target;
+     memcpy(extra->jumpIsland, jmp, 8);
+ #endif /* x86_64_HOST_ARCH */
+-
++#if defined(riscv64_HOST_ARCH)
++    // Fake GOT entry (used like GOT, but located in symbol extras)
++    extra->addr = target;
++#endif
+     return extra;
+ }
+-#endif /* powerpc_HOST_ARCH || x86_64_HOST_ARCH */
++#endif /* powerpc_HOST_ARCH || x86_64_HOST_ARCH || riscv64_HOST_ARCH */
+ #endif /* !x86_64_HOST_ARCH) || !mingw32_HOST_OS */
+ #endif // NEED_SYMBOL_EXTRAS
+Index: ghc-9.10.1/rts/linker/SymbolExtras.h
+===================================================================
+--- ghc-9.10.1.orig/rts/linker/SymbolExtras.h
++++ ghc-9.10.1/rts/linker/SymbolExtras.h
+@@ -16,7 +16,7 @@ SymbolExtra* makeArmSymbolExtra( ObjectC
+                                  unsigned long target,
+                                  bool fromThumb,
+                                  bool toThumb );
+-#elif defined(powerpc_HOST_ARCH) || defined(x86_64_HOST_ARCH)
++#elif defined(powerpc_HOST_ARCH) || defined(x86_64_HOST_ARCH) || defined(riscv64_HOST_ARCH)
+ SymbolExtra* makeSymbolExtra( ObjectCode const* oc,
+                               unsigned long symbolNumber,
+                               unsigned long target );
+Index: ghc-9.10.1/rts/linker/elf_plt.c
+===================================================================
+--- ghc-9.10.1.orig/rts/linker/elf_plt.c
++++ ghc-9.10.1/rts/linker/elf_plt.c
+@@ -5,7 +5,7 @@
+ #include <stdint.h>
+ #include <stdlib.h>
+ 
+-#if defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH)
++#if defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH) || defined(riscv64_HOST_ARCH)
+ #if defined(OBJFORMAT_ELF)
+ 
+ #define STRINGIFY(x) #x
+@@ -49,11 +49,13 @@ findStub(Section * section,
+ bool
+ makeStub(Section * section,
+           void* * addr,
++          void* got_addr,
+           uint8_t flags) {
+ 
+     Stub * s = calloc(1, sizeof(Stub));
+     ASSERT(s != NULL);
+     s->target = *addr;
++    s->got_addr = got_addr;
+     s->flags  = flags;
+     s->next = NULL;
+     s->addr = (uint8_t *)section->info->stub_offset + 8
+Index: ghc-9.10.1/rts/linker/elf_plt.h
+===================================================================
+--- ghc-9.10.1.orig/rts/linker/elf_plt.h
++++ ghc-9.10.1/rts/linker/elf_plt.h
+@@ -4,8 +4,9 @@
+ 
+ #include "elf_plt_arm.h"
+ #include "elf_plt_aarch64.h"
++#include "elf_plt_riscv64.h"
+ 
+-#if defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH)
++#if defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH) || defined (riscv64_HOST_ARCH)
+ 
+ #if defined(OBJFORMAT_ELF)
+ 
+@@ -21,6 +22,8 @@
+ #define __suffix__ Arm
+ #elif defined(__mips__)
+ #define __suffix__ Mips
++#elif defined(__riscv)
++#define __suffix__ RISCV64
+ #else
+ #error "unknown architecture"
+ #endif
+@@ -34,10 +37,10 @@ unsigned  numberOfStubsForSection( Objec
+ #define STUB_SIZE          ADD_SUFFIX(stubSize)
+ 
+ bool findStub(Section * section, void* * addr, uint8_t flags);
+-bool makeStub(Section * section, void* * addr, uint8_t flags);
++bool makeStub(Section * section, void* * addr, void* got_addr, uint8_t flags);
+ 
+ void freeStubs(Section * section);
+ 
+ #endif // OBJECTFORMAT_ELF
+ 
+-#endif // arm/aarch64_HOST_ARCH
++#endif // arm/aarch64_HOST_ARCH/riscv64_HOST_ARCH
+Index: ghc-9.10.1/rts/linker/elf_plt_riscv64.c
+===================================================================
+--- /dev/null
++++ ghc-9.10.1/rts/linker/elf_plt_riscv64.c
+@@ -0,0 +1,90 @@
++#include "Rts.h"
++#include "elf_compat.h"
++#include "elf_plt_riscv64.h"
++#include "rts/Messages.h"
++#include "linker/ElfTypes.h"
++
++#include <stdint.h>
++#include <stdlib.h>
++
++#if defined(riscv64_HOST_ARCH)
++
++#if defined(OBJFORMAT_ELF)
++
++const size_t instSizeRISCV64 = 4;
++const size_t stubSizeRISCV64 = 3 * instSizeRISCV64;
++
++bool needStubForRelRISCV64(Elf_Rel *rel) {
++  switch (ELF64_R_TYPE(rel->r_info)) {
++  case R_RISCV_CALL:
++  case R_RISCV_CALL_PLT:
++    return true;
++  default:
++    return false;
++  }
++}
++
++bool needStubForRelaRISCV64(Elf_Rela *rela) {
++  switch (ELF64_R_TYPE(rela->r_info)) {
++  case R_RISCV_CALL:
++  case R_RISCV_CALL_PLT:
++    return true;
++  default:
++    return false;
++  }
++}
++
++// After the global offset table (GOT) has been set up, we can use these three
++// instructions to jump to the target address / function:
++//
++//  1. AUIPC ip, %pcrel_hi(addr)
++//  2. LD ip, %pcrel_lo(addr)(ip)
++//  3. JARL x0, ip, 0
++//
++// We could use the absolute address of the target (because we know it), but
++// that would require loading a 64-bit constant which is a nightmare to do in
++// riscv64 assembly. (See
++// https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/5ffe5b5aeedb37b1c1c0c3d94641267d9ad4795a/riscv-elf.adoc#procedure-linkage-table)
++//
++// So far, PC-relative addressing seems to be good enough. If it ever turns out
++// to be not, one could (additionally for out-of-range cases?) encode absolute
++// addressing here.
++bool makeStubRISCV64(Stub *s) {
++  uint32_t *P = (uint32_t *)s->addr;
++  int32_t addr = (uint64_t)s->got_addr - (uint64_t)P;
++
++  uint64_t hi = (addr + 0x800) >> 12;
++  uint64_t lo = addr - (hi << 12);
++
++  IF_DEBUG(
++      linker,
++      debugBelch(
++          "makeStubRISCV64: P = %p, got_addr = %p, target = %p, addr = 0x%x "
++          ", hi = 0x%lx, lo = 0x%lx\n",
++          P, s->got_addr, s->target, addr, hi, lo));
++
++  // AUIPC ip, %pcrel_hi(addr)
++  uint32_t auipcInst = 0b0010111; // opcode
++  auipcInst |= 0x1f << 7;         // rd = ip (x31)
++  auipcInst |= hi << 12;          // imm[31:12]
++
++  // LD ip, %pcrel_lo(addr)(ip)
++  uint32_t ldInst = 0b0000011; // opcode
++  ldInst |= 0x1f << 7;         // rd = ip (x31)
++  ldInst |= 0x1f << 15;        // rs = ip (x31)
++  ldInst |= 0b11 << 12;        // funct3 = 0x3 (LD)
++  ldInst |= lo << 20;          // imm[11:0]
++
++  // JARL x0, ip, 0
++  uint32_t jalrInst = 0b1100111; // opcode
++  jalrInst |= 0x1f << 15;        // rs = ip (x31)
++
++  P[0] = auipcInst;
++  P[1] = ldInst;
++  P[2] = jalrInst;
++
++  return EXIT_SUCCESS;
++}
++
++#endif
++#endif
+Index: ghc-9.10.1/rts/linker/elf_plt_riscv64.h
+===================================================================
+--- /dev/null
++++ ghc-9.10.1/rts/linker/elf_plt_riscv64.h
+@@ -0,0 +1,12 @@
++#pragma once
++
++#include "LinkerInternals.h"
++
++#if defined(OBJFORMAT_ELF)
++
++extern const size_t stubSizeRISCV64;
++bool needStubForRelRISCV64(Elf_Rel * rel);
++bool needStubForRelaRISCV64(Elf_Rela * rel);
++bool makeStubRISCV64(Stub * s);
++
++#endif
+Index: ghc-9.10.1/rts/linker/elf_reloc.c
+===================================================================
+--- ghc-9.10.1.orig/rts/linker/elf_reloc.c
++++ ghc-9.10.1/rts/linker/elf_reloc.c
+@@ -4,13 +4,18 @@
+ 
+ #if defined(OBJFORMAT_ELF)
+ 
+-/* we currently only use this abstraction for elf/aarch64 */
+-#if defined(aarch64_HOST_ARCH)
++/* we currently only use this abstraction for elf/aarch64 and elf/riscv64 */
++#if defined(aarch64_HOST_ARCH) | defined(riscv64_HOST_ARCH)
+ 
+ bool
+ relocateObjectCode(ObjectCode * oc) {
+     return ADD_SUFFIX(relocateObjectCode)(oc);
+ }
++
++
++void flushInstructionCache(ObjectCode * oc){
++   return ADD_SUFFIX(flushInstructionCache)(oc);
++}
+ #endif
+ 
+ #endif
+Index: ghc-9.10.1/rts/linker/elf_reloc.h
+===================================================================
+--- ghc-9.10.1.orig/rts/linker/elf_reloc.h
++++ ghc-9.10.1/rts/linker/elf_reloc.h
+@@ -5,9 +5,10 @@
+ #if defined(OBJFORMAT_ELF)
+ 
+ #include "elf_reloc_aarch64.h"
++#include "elf_reloc_riscv64.h"
+ 
+ bool
+ relocateObjectCode(ObjectCode * oc);
+ 
+-
++void flushInstructionCache(ObjectCode *oc);
+ #endif /* OBJETFORMAT_ELF */
+Index: ghc-9.10.1/rts/linker/elf_reloc_aarch64.c
+===================================================================
+--- ghc-9.10.1.orig/rts/linker/elf_reloc_aarch64.c
++++ ghc-9.10.1/rts/linker/elf_reloc_aarch64.c
+@@ -240,7 +240,7 @@ computeAddend(Section * section, Elf_Rel
+                 /* check if we already have that stub */
+                 if(findStub(section, (void**)&S, 0)) {
+                     /* did not find it. Crete a new stub. */
+-                    if(makeStub(section, (void**)&S, 0)) {
++                    if(makeStub(section, (void**)&S, NULL, 0)) {
+                         abort(/* could not find or make stub */);
+                     }
+                 }
+@@ -339,5 +339,10 @@ relocateObjectCodeAarch64(ObjectCode * o
+     return EXIT_SUCCESS;
+ }
+ 
++void flushInstructionCacheAarch64(ObjectCode * oc STG_UNUSED) {
++  // Looks like we don't need this on Aarch64.
++  /* no-op */
++}
++
+ #endif /* OBJECTFORMAT_ELF */
+ #endif /* aarch64_HOST_ARCH */
+Index: ghc-9.10.1/rts/linker/elf_reloc_aarch64.h
+===================================================================
+--- ghc-9.10.1.orig/rts/linker/elf_reloc_aarch64.h
++++ ghc-9.10.1/rts/linker/elf_reloc_aarch64.h
+@@ -7,4 +7,5 @@
+ bool
+ relocateObjectCodeAarch64(ObjectCode * oc);
+ 
++void flushInstructionCacheAarch64(ObjectCode *oc);
+ #endif /* OBJETFORMAT_ELF */
+Index: ghc-9.10.1/rts/linker/elf_reloc_riscv64.c
+===================================================================
+--- /dev/null
++++ ghc-9.10.1/rts/linker/elf_reloc_riscv64.c
+@@ -0,0 +1,693 @@
++#include "elf_reloc_riscv64.h"
++#include "LinkerInternals.h"
++#include "Rts.h"
++#include "Stg.h"
++#include "SymbolExtras.h"
++#include "linker/ElfTypes.h"
++#include "elf_plt.h"
++#include "elf_util.h"
++#include "rts/Messages.h"
++#include "util.h"
++
++#include <stdint.h>
++#include <stdlib.h>
++
++#if defined(riscv64_HOST_ARCH)
++
++#if defined(OBJFORMAT_ELF)
++
++typedef uint64_t addr_t;
++
++/* regular instructions are 32bit */
++typedef uint32_t inst_t;
++
++/* compressed instructions are 16bit */
++typedef uint16_t cinst_t;
++
++// TODO: These instances could be static. They are not yet, because we might
++// need their debugging symbols.
++char *relocationTypeToString(Elf64_Xword type);
++int32_t decodeAddendRISCV64(Section *section, Elf_Rel *rel);
++bool encodeAddendRISCV64(Section *section, Elf_Rel *rel, int32_t addend);
++void write8le(uint8_t *p, uint8_t v);
++uint8_t read8le(const uint8_t *P);
++void write16le(cinst_t *p, uint16_t v);
++uint16_t read16le(const cinst_t *P);
++uint32_t read32le(const inst_t *P);
++void write32le(inst_t *p, uint32_t v);
++uint64_t read64le(const uint64_t *P);
++void write64le(uint64_t *p, uint64_t v);
++uint32_t extractBits(uint64_t v, uint32_t begin, uint32_t end);
++void setCJType(cinst_t *loc, uint32_t val);
++void setCBType(cinst_t *loc, uint32_t val);
++void setBType(inst_t *loc, uint32_t val);
++void setSType(inst_t *loc, uint32_t val);
++int32_t computeAddend(ElfRelocationATable * relaTab, unsigned relNo, Elf_Rel *rel, ElfSymbol *symbol,
++                      int64_t addend, ObjectCode *oc);
++void setJType(inst_t *loc, uint32_t val);
++void setIType(inst_t *loc, int32_t val);
++void checkInt(inst_t *loc, int32_t v, int n);
++void setUType(inst_t *loc, int32_t val);
++
++
++char *relocationTypeToString(Elf64_Xword type) {
++  switch (ELF64_R_TYPE(type)) {
++  case R_RISCV_NONE:
++    return "R_RISCV_NONE";
++  case R_RISCV_32:
++    return "R_RISCV_32";
++  case R_RISCV_64:
++    return "R_RISCV_64";
++  case R_RISCV_RELATIVE:
++    return "R_RISCV_RELATIVE";
++  case R_RISCV_COPY:
++    return "R_RISCV_COPY";
++  case R_RISCV_JUMP_SLOT:
++    return "R_RISCV_JUMP_SLOT";
++  case R_RISCV_TLS_DTPMOD32:
++    return "R_RISCV_TLS_DTPMOD32";
++  case R_RISCV_TLS_DTPMOD64:
++    return "R_RISCV_TLS_DTPMOD64";
++  case R_RISCV_TLS_DTPREL32:
++    return "R_RISCV_TLS_DTPREL32";
++  case R_RISCV_TLS_DTPREL64:
++    return "R_RISCV_TLS_DTPREL64";
++  case R_RISCV_TLS_TPREL32:
++    return "R_RISCV_TLS_TPREL32";
++  case R_RISCV_TLS_TPREL64:
++    return "R_RISCV_TLS_TPREL64";
++  case R_RISCV_BRANCH:
++    return "R_RISCV_BRANCH";
++  case R_RISCV_JAL:
++    return "R_RISCV_JAL";
++  case R_RISCV_CALL:
++    return "R_RISCV_CALL";
++  case R_RISCV_CALL_PLT:
++    return "R_RISCV_CALL_PLT";
++  case R_RISCV_GOT_HI20:
++    return "R_RISCV_GOT_HI20";
++  case R_RISCV_PCREL_HI20:
++    return "R_RISCV_PCREL_HI20";
++  case R_RISCV_LO12_I:
++    return "R_RISCV_LO12_I";
++  case R_RISCV_PCREL_LO12_I:
++    return "R_RISCV_PCREL_LO12_I";
++  case R_RISCV_HI20:
++    return "R_RISCV_HI20";
++  case R_RISCV_LO12_S:
++    return "R_RISCV_LO12_S";
++  case R_RISCV_PCREL_LO12_S:
++    return "R_RISCV_PCREL_LO12_S";
++  case R_RISCV_RELAX:
++    return "R_RISCV_RELAX";
++  case R_RISCV_RVC_BRANCH:
++    return "R_RISCV_RVC_BRANCH";
++  case R_RISCV_RVC_JUMP:
++    return "R_RISCV_RVC_JUMP";
++  default:
++    return "Unknown relocation type";
++  }
++}
++
++STG_NORETURN
++int32_t decodeAddendRISCV64(Section *section STG_UNUSED,
++                            Elf_Rel *rel STG_UNUSED) {
++  barf("decodeAddendRISCV64: Relocations with explicit addend are not supported."
++       " Please open a ticket; providing the causing code/binary.");
++}
++
++// Make sure that V can be represented as an N bit signed integer.
++void checkInt(inst_t *loc, int32_t v, int n) {
++  if (!isInt(n, v)) {
++    barf("Relocation at 0x%x is out of range. value: 0x%x (%d), "
++               "sign-extended value: 0x%x (%d), max bits 0x%x (%d)\n",
++               *loc, v, v, signExtend32(v, n), signExtend32(v, n), n, n);
++  }
++}
++
++// RISCV is little-endian by definition: We can rely on (implicit) casts.
++void write8le(uint8_t *p, uint8_t v) { *p = v; }
++
++// RISCV is little-endian by definition: We can rely on (implicit) casts.
++uint8_t read8le(const uint8_t *p) { return *p; }
++
++// RISCV is little-endian by definition: We can rely on (implicit) casts.
++void write16le(cinst_t *p, uint16_t v) { *p = v; }
++
++// RISCV is little-endian by definition: We can rely on (implicit) casts.
++uint16_t read16le(const cinst_t *p) { return *p; }
++
++// RISCV is little-endian by definition: We can rely on (implicit) casts.
++uint32_t read32le(const inst_t *p) { return *p; }
++
++// RISCV is little-endian by definition: We can rely on (implicit) casts.
++void write32le(inst_t *p, uint32_t v) { *p = v; }
++
++// RISCV is little-endian by definition: We can rely on (implicit) casts.
++uint64_t read64le(const uint64_t *p) { return *p; }
++
++// RISCV is little-endian by definition: We can rely on (implicit) casts.
++void write64le(uint64_t *p, uint64_t v) { *p = v; }
++
++uint32_t extractBits(uint64_t v, uint32_t begin, uint32_t end) {
++  return (v & ((1ULL << (begin + 1)) - 1)) >> end;
++}
++
++// Set immediate val in the instruction at *loc. In U-type instructions the
++// upper 20bits carry the upper 20bits of the immediate.
++void setUType(inst_t *loc, int32_t val) {
++  const unsigned bits = 32;
++  uint32_t hi = val + 0x800;
++  checkInt(loc, signExtend32(hi, bits) >> 12, 20);
++  IF_DEBUG(linker, debugBelch("setUType: hi 0x%x val 0x%x\n", hi, val));
++
++  uint32_t imm = hi & 0xFFFFF000;
++  write32le(loc, (read32le(loc) & 0xFFF) | imm);
++}
++
++// Set immediate val in the instruction at *loc. In I-type instructions the
++// upper 12bits carry the lower 12bit of the immediate.
++void setIType(inst_t *loc, int32_t val) {
++  uint64_t hi = (val + 0x800) >> 12;
++  uint64_t lo = val - (hi << 12);
++
++  IF_DEBUG(linker, debugBelch("setIType: hi 0x%lx lo 0x%lx\n", hi, lo));
++  IF_DEBUG(linker, debugBelch("setIType: loc %p  *loc 0x%x  val 0x%x\n", loc,
++                              *loc, val));
++
++  uint32_t imm = lo & 0xfff;
++  uint32_t instr = (read32le(loc) & 0xfffff) | (imm << 20);
++
++  IF_DEBUG(linker, debugBelch("setIType: insn 0x%x\n", instr));
++  write32le(loc, instr);
++  IF_DEBUG(linker, debugBelch("setIType: loc %p  *loc' 0x%x  val 0x%x\n", loc,
++                              *loc, val));
++}
++
++// Set immediate val in the instruction at *loc. In S-type instructions the
++// lower 12 bits of the immediate are at bits 7 to 11 ([0:4]) and 25 to 31
++// ([5:11]).
++void setSType(inst_t *loc, uint32_t val) {
++  uint64_t hi = (val + 0x800) >> 12;
++  uint64_t lo = val - (hi << 12);
++
++  uint32_t imm = lo;
++  uint32_t instr = (read32le(loc) & 0x1fff07f) | (extractBits(imm, 11, 5) << 25) |
++         (extractBits(imm, 4, 0) << 7);
++
++  write32le(loc, instr);
++}
++
++// Set immediate val in the instruction at *loc. In J-type instructions the
++// immediate has 20bits which are pretty scattered:
++// instr bit -> imm bit
++// 31 -> 20
++// [30:21] -> [10:1]
++// 20 -> 11
++// [19:12] -> [19:12]
++//
++// N.B. bit 0 of the immediate is missing!
++void setJType(inst_t *loc, uint32_t val) {
++  checkInt(loc, val, 21);
++
++  uint32_t insn = read32le(loc) & 0xFFF;
++  uint32_t imm20 = extractBits(val, 20, 20) << 31;
++  uint32_t imm10_1 = extractBits(val, 10, 1) << 21;
++  uint32_t imm11 = extractBits(val, 11, 11) << 20;
++  uint32_t imm19_12 = extractBits(val, 19, 12) << 12;
++  insn |= imm20 | imm10_1 | imm11 | imm19_12;
++
++  write32le(loc, insn);
++}
++
++// Set immediate val in the instruction at *loc. In B-type instructions the
++// immediate has 12bits which are pretty scattered:
++// instr bit -> imm bit
++// 31 -> 12
++// [30:25] -> [10:5]
++// [11:8] -> [4:1]
++// 7 -> 11
++//
++// N.B. bit 0 of the immediate is missing!
++void setBType(inst_t *loc, uint32_t val) {
++  checkInt(loc, val, 13);
++
++  uint32_t insn = read32le(loc) & 0x1FFF07F;
++  uint32_t imm12 = extractBits(val, 12, 12) << 31;
++  uint32_t imm10_5 = extractBits(val, 10, 5) << 25;
++  uint32_t imm4_1 = extractBits(val, 4, 1) << 8;
++  uint32_t imm11 = extractBits(val, 11, 11) << 7;
++  insn |= imm12 | imm10_5 | imm4_1 | imm11;
++
++  write32le(loc, insn);
++}
++
++
++// Set immediate val in the instruction at *loc. CB-type instructions have a
++// lenght of 16 bits (half-word, compared to the usual 32bit/word instructions.)
++// The immediate has 8bits which are pretty scattered:
++// instr bit -> imm bit
++// 12 -> 8
++// [11:10] -> [4:3]
++// [6:5] -> [7:6]
++// [4:3] -> [2:1]
++// 2 -> 5
++//
++// N.B. bit 0 of the immediate is missing!
++void setCBType(cinst_t *loc, uint32_t val) {
++  checkInt((inst_t *)loc, val, 9);
++  uint16_t insn = read16le(loc) & 0xE383;
++  uint16_t imm8 = extractBits(val, 8, 8) << 12;
++  uint16_t imm4_3 = extractBits(val, 4, 3) << 10;
++  uint16_t imm7_6 = extractBits(val, 7, 6) << 5;
++  uint16_t imm2_1 = extractBits(val, 2, 1) << 3;
++  uint16_t imm5 = extractBits(val, 5, 5) << 2;
++  insn |= imm8 | imm4_3 | imm7_6 | imm2_1 | imm5;
++
++  write16le(loc, insn);
++}
++
++// Set immediate val in the instruction at *loc. CJ-type instructions have a
++// lenght of 16 bits (half-word, compared to the usual 32bit/word instructions.)
++// The immediate has 11bits which are pretty scattered:
++// instr bit -> imm bit
++// 12 -> 11
++// 11 -> 4
++// [10:9] ->[9:8]
++// 8 -> 10
++// 7 -> 6
++// 6 -> 7
++// [5:3] -> [3:1]
++// 2 -> 5
++//
++// N.B. bit 0 of the immediate is missing!
++void setCJType(cinst_t *loc, uint32_t val) {
++  checkInt((inst_t *)loc, val, 12);
++  uint16_t insn = read16le(loc) & 0xE003;
++  uint16_t imm11 = extractBits(val, 11, 11) << 12;
++  uint16_t imm4 = extractBits(val, 4, 4) << 11;
++  uint16_t imm9_8 = extractBits(val, 9, 8) << 9;
++  uint16_t imm10 = extractBits(val, 10, 10) << 8;
++  uint16_t imm6 = extractBits(val, 6, 6) << 7;
++  uint16_t imm7 = extractBits(val, 7, 7) << 6;
++  uint16_t imm3_1 = extractBits(val, 3, 1) << 3;
++  uint16_t imm5 = extractBits(val, 5, 5) << 2;
++  insn |= imm11 | imm4 | imm9_8 | imm10 | imm6 | imm7 | imm3_1 | imm5;
++
++  write16le(loc, insn);
++}
++
++// Encode the addend according to the relocaction into the instruction.
++bool encodeAddendRISCV64(Section *section, Elf_Rel *rel, int32_t addend) {
++  // instruction to rewrite (P: Position of the relocation)
++  addr_t P = (addr_t)((uint8_t *)section->start + rel->r_offset);
++  IF_DEBUG(linker,
++           debugBelch(
++               "Relocation type %s 0x%lx (%lu) symbol 0x%lx addend 0x%x (%u / "
++               "%d) P 0x%lx\n",
++               relocationTypeToString(rel->r_info), ELF64_R_TYPE(rel->r_info),
++               ELF64_R_TYPE(rel->r_info), ELF64_R_SYM(rel->r_info), addend,
++               addend, addend, P));
++  switch (ELF64_R_TYPE(rel->r_info)) {
++  case R_RISCV_32_PCREL:
++  case R_RISCV_32:
++    write32le((inst_t *)P, addend);
++    break;
++  case R_RISCV_64:
++    write64le((uint64_t *)P, addend);
++    break;
++  case R_RISCV_GOT_HI20:
++  case R_RISCV_PCREL_HI20:
++  case R_RISCV_HI20: {
++    setUType((inst_t *)P, addend);
++    break;
++  }
++  case R_RISCV_PCREL_LO12_I:
++  case R_RISCV_LO12_I: {
++    setIType((inst_t *)P, addend);
++    break;
++  }
++  case R_RISCV_RVC_JUMP: {
++    setCJType((cinst_t *)P, addend);
++    break;
++  }
++  case R_RISCV_RVC_BRANCH: {
++    setCBType((cinst_t *)P, addend);
++    break;
++  }
++  case R_RISCV_BRANCH: {
++    setBType((inst_t *)P, addend);
++    break;
++  }
++  case R_RISCV_CALL:
++  case R_RISCV_CALL_PLT: {
++    // We could relax more (in some cases) but right now most important is to
++    // make it work.
++    setUType((inst_t *)P, addend);
++    setIType(((inst_t *)P) + 1, addend);
++    break;
++  }
++  case R_RISCV_JAL: {
++    setJType((inst_t *)P, addend);
++    break;
++  }
++  case R_RISCV_ADD8:
++    write8le((uint8_t *)P, read8le((uint8_t *)P) + addend);
++    break;
++  case R_RISCV_ADD16:
++    write16le((cinst_t *)P, read16le((cinst_t *)P) + addend);
++    break;
++  case R_RISCV_ADD32:
++    write32le((inst_t *)P, read32le((inst_t *)P) + addend);
++    break;
++  case R_RISCV_ADD64:
++    write64le((uint64_t *)P, read64le((uint64_t *)P) + addend);
++    break;
++  case R_RISCV_SUB6: {
++    uint8_t keep = *((uint8_t *)P) & 0xc0;
++    uint8_t imm = (((*(uint8_t *)P) & 0x3f) - addend) & 0x3f;
++
++    write8le((uint8_t *)P, keep | imm);
++    break;
++  }
++  case R_RISCV_SUB8:
++    write8le((uint8_t *)P, read8le((uint8_t *)P) - addend);
++    break;
++  case R_RISCV_SUB16:
++    write16le((cinst_t *)P, read16le((cinst_t *)P) - addend);
++    break;
++  case R_RISCV_SUB32:
++    write32le((inst_t *)P, read32le((inst_t *)P) - addend);
++    break;
++  case R_RISCV_SUB64:
++    write64le((uint64_t *)P, read64le((uint64_t *)P) - addend);
++    break;
++  case R_RISCV_SET6: {
++    uint8_t keep = *((uint8_t *)P) & 0xc0;
++    uint8_t imm = (addend & 0x3f) & 0x3f;
++
++    write8le((uint8_t *)P, keep | imm);
++    break;
++  }
++  case R_RISCV_SET8:
++    write8le((uint8_t *)P, addend);
++    break;
++  case R_RISCV_SET16:
++    write16le((cinst_t *)P, addend);
++    break;
++  case R_RISCV_SET32:
++    write32le((inst_t *)P, addend);
++    break;
++  case R_RISCV_PCREL_LO12_S:
++  case R_RISCV_TPREL_LO12_S:
++  case R_RISCV_LO12_S: {
++    setSType((inst_t *)P, addend);
++    break;
++  }
++  case R_RISCV_RELAX:
++  case R_RISCV_ALIGN:
++    // Implementing relaxations (rewriting instructions to more efficient ones)
++    // could be implemented in future. As the code already is aligned and we do
++    // not change the instruction sizes, we should get away with not aligning
++    // (though, that is cheating.) To align or change the instruction count, we
++    // would need machinery to squeeze or extend memory at the current location.
++    break;
++  default:
++    barf("Missing relocation 0x%lx\n", ELF64_R_TYPE(rel->r_info));
++  }
++  return EXIT_SUCCESS;
++}
++
++/**
++ * Compute the *new* addend for a relocation, given a pre-existing addend.
++ * @param section The section the relocation is in.
++ * @param rel     The Relocation struct.
++ * @param symbol  The target symbol.
++ * @param addend  The existing addend. Either explicit or implicit.
++ * @return The new computed addend.
++ */
++int32_t computeAddend(ElfRelocationATable * relaTab, unsigned relNo, Elf_Rel *rel, ElfSymbol *symbol,
++                      int64_t addend, ObjectCode *oc) {
++  Section * section = &oc->sections[relaTab->targetSectionIndex];
++
++  // instruction to rewrite (P: Position of the relocation)
++  addr_t P = (addr_t)((uint8_t *)section->start + rel->r_offset);
++
++  CHECK(0x0 != P);
++  CHECK((uint64_t)section->start <= P);
++  CHECK(P <= (uint64_t)section->start + section->size);
++  // S: Value of the symbol in the symbol table
++  addr_t S = (addr_t)symbol->addr;
++  /* GOT slot for the symbol (G + GOT) */
++  addr_t GOT_S = (addr_t)symbol->got_addr;
++
++  // A: Addend field in the relocation entry associated with the symbol
++  int64_t A = addend;
++
++  IF_DEBUG(linker, debugBelch("%s: P 0x%lx S 0x%lx %s GOT_S 0x%lx A 0x%lx relNo %u\n",
++                              relocationTypeToString(rel->r_info), P, S,
++                              symbol->name, GOT_S, A, relNo));
++  switch (ELF64_R_TYPE(rel->r_info)) {
++  case R_RISCV_32:
++    return S + A;
++  case R_RISCV_64:
++    return S + A;
++  case R_RISCV_HI20:
++    return S + A;
++  case R_RISCV_JUMP_SLOT:
++    return S;
++  case R_RISCV_JAL:
++    return S + A - P;
++  case R_RISCV_PCREL_HI20:
++    return S + A - P;
++  case R_RISCV_LO12_I:
++    return S + A;
++    // Quoting LLVM docs: For R_RISCV_PC_INDIRECT (R_RISCV_PCREL_LO12_{I,S}),
++    // the symbol actually points the corresponding R_RISCV_PCREL_HI20
++    // relocation, and the target VA is calculated using PCREL_HI20's symbol.
++  case R_RISCV_PCREL_LO12_S:
++    FALLTHROUGH;
++  case R_RISCV_PCREL_LO12_I: {
++    // Lookup related HI20 relocation and use that value. I'm still confused why
++    // relocations aren't self-contained, but this is how LLVM does it. And,
++    // calculating the lower 12 bit without any relationship to the GOT entry's
++    // address makes no sense either.
++      for (int64_t i = relNo; i >= 0 ; i--) {
++        Elf_Rela *rel_prime = &relaTab->relocations[i];
++
++        addr_t P_prime =
++            (addr_t)((uint8_t *)section->start + rel_prime->r_offset);
++
++        if (P_prime != S) {
++          // S points to the P of the corresponding *_HI20 relocation.
++          continue;
++        }
++
++        ElfSymbol *symbol_prime =
++            findSymbol(oc, relaTab->sectionHeader->sh_link,
++                       ELF64_R_SYM((Elf64_Xword)rel_prime->r_info));
++
++        CHECK(0x0 != symbol_prime);
++
++        /* take explicit addend */
++        int64_t addend_prime = rel_prime->r_addend;
++
++        uint64_t type_prime = ELF64_R_TYPE(rel_prime->r_info);
++
++        if (type_prime == R_RISCV_PCREL_HI20 ||
++            type_prime == R_RISCV_GOT_HI20 ||
++            type_prime == R_RISCV_TLS_GD_HI20 ||
++            type_prime == R_RISCV_TLS_GOT_HI20) {
++          IF_DEBUG(linker,
++                   debugBelch(
++                       "Found matching relocation: %s (P: 0x%lx, S: 0x%lx, "
++                       "sym-name: %s) -> %s (P: 0x%lx, S: %p, sym-name: %s, relNo: %ld)",
++                       relocationTypeToString(rel->r_info), P, S, symbol->name,
++                       relocationTypeToString(rel_prime->r_info), P_prime,
++                       symbol_prime->addr, symbol_prime->name, i));
++          int32_t result = computeAddend(relaTab, i, (Elf_Rel *)rel_prime,
++                                         symbol_prime, addend_prime, oc);
++          IF_DEBUG(linker, debugBelch("Result of computeAddend: 0x%x (%d)\n",
++                                      result, result));
++          return result;
++        }
++    }
++    debugBelch("Missing HI relocation for %s: P 0x%lx S 0x%lx %s\n",
++               relocationTypeToString(rel->r_info), P, S, symbol->name);
++    abort();
++  }
++
++  case R_RISCV_RVC_JUMP:
++    return S + A - P;
++  case R_RISCV_RVC_BRANCH:
++    return S + A - P;
++  case R_RISCV_BRANCH:
++    return S + A - P;
++  case R_RISCV_CALL:
++  case R_RISCV_CALL_PLT: {
++    addr_t GOT_Target;
++    if (GOT_S != 0) {
++      // 1. Public symbol with GOT entry.
++      GOT_Target = GOT_S;
++    } else {
++      // 2. Fake GOT entry with symbol extra entry.
++      SymbolExtra *symbolExtra = makeSymbolExtra(oc, ELF_R_SYM(rel->r_info), S);
++      addr_t* FAKE_GOT_S = &symbolExtra->addr;
++      IF_DEBUG(linker, debugBelch("R_RISCV_CALL_PLT w/ SymbolExtra = %p , "
++                                  "entry = %p\n",
++                                  symbolExtra, FAKE_GOT_S));
++      GOT_Target = (addr_t) FAKE_GOT_S;
++    }
++
++    if (findStub(section, (void **)&S, 0)) {
++      /* did not find it. Crete a new stub. */
++      if (makeStub(section, (void **)&S, (void *)GOT_Target, 0)) {
++        abort(/* could not find or make stub */);
++      }
++    }
++    IF_DEBUG(linker, debugBelch("R_RISCV_CALL_PLT: S = 0x%lx A = 0x%lx P = "
++                                "0x%lx (S + A) - P = 0x%lx \n",
++                                S, A, P, (S + A) - P));
++    return (S + A) - P;
++  }
++  case R_RISCV_ADD8:
++    FALLTHROUGH;
++  case R_RISCV_ADD16:
++    FALLTHROUGH;
++  case R_RISCV_ADD32:
++    FALLTHROUGH;
++  case R_RISCV_ADD64:
++    return S + A; // Add V when the value is set
++  case R_RISCV_SUB6:
++    FALLTHROUGH;
++  case R_RISCV_SUB8:
++    FALLTHROUGH;
++  case R_RISCV_SUB16:
++    FALLTHROUGH;
++  case R_RISCV_SUB32:
++    FALLTHROUGH;
++  case R_RISCV_SUB64:
++    return S + A; // Subtract from V when value is set
++  case R_RISCV_SET6:
++    FALLTHROUGH;
++  case R_RISCV_SET8:
++    FALLTHROUGH;
++  case R_RISCV_SET16:
++    FALLTHROUGH;
++  case R_RISCV_SET32:
++    return S + A;
++  case R_RISCV_RELAX:
++    // This "relocation" has no addend.
++    FALLTHROUGH;
++  case R_RISCV_ALIGN:
++    // I guess we don't need to implement this relaxation. Otherwise, this
++    // should return the number of blank bytes to insert via NOPs.
++    return 0;
++  case R_RISCV_32_PCREL:
++    return S + A - P;
++  case R_RISCV_GOT_HI20: {
++    // TODO: Allocating extra memory for every symbol just to play this trick
++    // seems to be a bit obscene. (GOT relocations hitting local symbols
++    // happens, but not very often.) It would be better to allocate only what we
++    // really need.
++
++    // There are two cases here: 1. The symbol is public and has an entry in the
++    // GOT. 2. It's local and has no corresponding GOT entry. The first case is
++    // easy: We simply calculate the addend with the GOT address. In the second
++    // case we create a symbol extra entry and pretend it's the GOT.
++    if (GOT_S != 0) {
++      // 1. Public symbol with GOT entry.
++      return GOT_S + A - P;
++    } else {
++      // 2. Fake GOT entry with symbol extra entry.
++      SymbolExtra *symbolExtra = makeSymbolExtra(oc, ELF_R_SYM(rel->r_info), S);
++      addr_t* FAKE_GOT_S = &symbolExtra->addr;
++      addr_t res = (addr_t) FAKE_GOT_S + A - P;
++      IF_DEBUG(linker, debugBelch("R_RISCV_GOT_HI20 w/ SymbolExtra = %p , "
++                                  "entry = %p , reloc-addend = 0x%lu ",
++                                  symbolExtra, FAKE_GOT_S, res));
++      return res;
++    }
++  }
++  default:
++    barf("Unimplemented relocation: 0x%lx\n (%lu)",
++               ELF64_R_TYPE(rel->r_info), ELF64_R_TYPE(rel->r_info));
++  }
++  barf("This should never happen!");
++}
++
++// Iterate over all relocations and perform them.
++bool relocateObjectCodeRISCV64(ObjectCode *oc) {
++  for (ElfRelocationTable *relTab = oc->info->relTable; relTab != NULL;
++       relTab = relTab->next) {
++    /* only relocate interesting sections */
++    if (SECTIONKIND_OTHER == oc->sections[relTab->targetSectionIndex].kind)
++      continue;
++
++    Section *targetSection = &oc->sections[relTab->targetSectionIndex];
++
++    for (unsigned i = 0; i < relTab->n_relocations; i++) {
++      Elf_Rel *rel = &relTab->relocations[i];
++
++      ElfSymbol *symbol = findSymbol(oc, relTab->sectionHeader->sh_link,
++                                     ELF64_R_SYM((Elf64_Xword)rel->r_info));
++
++      CHECK(0x0 != symbol);
++
++      // This always fails, because we don't support Rel locations, yet: Do we
++      // need this case? Leaving it in to spot the potential bug when it
++      // appears.
++      /* decode implicit addend */
++      int64_t addend = decodeAddendRISCV64(targetSection, rel);
++
++      addend = computeAddend((ElfRelocationATable*) relTab, i, rel, symbol, addend, oc);
++      encodeAddendRISCV64(targetSection, rel, addend);
++    }
++  }
++  for (ElfRelocationATable *relaTab = oc->info->relaTable; relaTab != NULL;
++       relaTab = relaTab->next) {
++    /* only relocate interesting sections */
++    if (SECTIONKIND_OTHER == oc->sections[relaTab->targetSectionIndex].kind)
++      continue;
++
++    Section *targetSection = &oc->sections[relaTab->targetSectionIndex];
++
++    for (unsigned i = 0; i < relaTab->n_relocations; i++) {
++
++      Elf_Rela *rel = &relaTab->relocations[i];
++
++      ElfSymbol *symbol = findSymbol(oc, relaTab->sectionHeader->sh_link,
++                                     ELF64_R_SYM((Elf64_Xword)rel->r_info));
++
++      CHECK(0x0 != symbol);
++
++      /* take explicit addend */
++      int64_t addend = rel->r_addend;
++
++      addend = computeAddend(relaTab, i, (Elf_Rel *)rel, symbol, addend, oc);
++      encodeAddendRISCV64(targetSection, (Elf_Rel *)rel, addend);
++    }
++  }
++  return EXIT_SUCCESS;
++}
++
++void flushInstructionCacheRISCV64(ObjectCode *oc) {
++  // Synchronize the memory and instruction cache to prevent illegal instruction
++  // exceptions. On Linux the parameters of __builtin___clear_cache are
++  // currently unused. Add them anyways for future compatibility. (I.e. the
++  // parameters couldn't be checked during development.)
++
++  /* The main object code */
++  void *codeBegin = oc->image + oc->misalignment;
++  __builtin___clear_cache(codeBegin, (void*) ((uint64_t*) codeBegin + oc->fileSize));
++
++  /* Jump Islands */
++  __builtin___clear_cache((void *)oc->symbol_extras,
++                          (void *)(oc->symbol_extras + oc->n_symbol_extras));
++
++  // Memory barrier to ensure nothing circumvents the fence.i / cache flushes.
++  SEQ_CST_FENCE();
++}
++
++#endif /* OBJECTFORMAT_ELF */
++#endif /* riscv64_HOST_ARCH */
+Index: ghc-9.10.1/rts/linker/elf_reloc_riscv64.h
+===================================================================
+--- /dev/null
++++ ghc-9.10.1/rts/linker/elf_reloc_riscv64.h
+@@ -0,0 +1,11 @@
++#pragma once
++
++#include "LinkerInternals.h"
++
++#if defined(OBJFORMAT_ELF)
++
++bool
++relocateObjectCodeRISCV64(ObjectCode * oc);
++
++void flushInstructionCacheRISCV64(ObjectCode *oc);
++#endif /* OBJETFORMAT_ELF */
+Index: ghc-9.10.1/rts/rts.cabal
+===================================================================
+--- ghc-9.10.1.orig/rts/rts.cabal
++++ ghc-9.10.1/rts/rts.cabal
+@@ -468,9 +468,11 @@ library
+                  linker/elf_got.c
+                  linker/elf_plt.c
+                  linker/elf_plt_aarch64.c
++                 linker/elf_plt_riscv64.c
+                  linker/elf_plt_arm.c
+                  linker/elf_reloc.c
+                  linker/elf_reloc_aarch64.c
++                 linker/elf_reloc_riscv64.c
+                  linker/elf_tlsgd.c
+                  linker/elf_util.c
+                  sm/BlockAlloc.c
+Index: ghc-9.10.1/testsuite/tests/codeGen/should_run/CCallConv.hs
+===================================================================
+--- /dev/null
++++ ghc-9.10.1/testsuite/tests/codeGen/should_run/CCallConv.hs
+@@ -0,0 +1,123 @@
++{-# LANGUAGE ForeignFunctionInterface #-}
++{-# LANGUAGE GHCForeignImportPrim #-}
++{-# LANGUAGE MagicHash #-}
++{-# LANGUAGE UnboxedTuples #-}
++{-# LANGUAGE UnliftedFFITypes #-}
++
++-- | This test ensures that sub-word signed and unsigned parameters are correctly
++-- handed over to C functions. I.e. it asserts the calling-convention.
++--
++-- The number of parameters is currently shaped for the RISCV64 calling-convention.
++-- You may need to add more parameters to the C functions in case there are more
++-- registers reserved for parameters in your architecture.
++module Main where
++
++import Data.Word
++import GHC.Exts
++import GHC.Int
++
++foreign import ccall "fun8"
++  fun8 ::
++    Int8# -> -- a0
++    Word8# -> -- a1
++    Int8# -> -- a2
++    Int8# -> -- a3
++    Int8# -> -- a4
++    Int8# -> -- a5
++    Int8# -> -- a6
++    Int8# -> -- a7
++    Word8# -> -- s0
++    Int8# -> -- s1
++    Int64# -- result
++
++foreign import ccall "fun16"
++  fun16 ::
++    Int16# -> -- a0
++    Word16# -> -- a1
++    Int16# -> -- a2
++    Int16# -> -- a3
++    Int16# -> -- a4
++    Int16# -> -- a5
++    Int16# -> -- a6
++    Int16# -> -- a7
++    Word16# -> -- s0
++    Int16# -> -- s1
++    Int64# -- result
++
++foreign import ccall "fun32"
++  fun32 ::
++    Int32# -> -- a0
++    Word32# -> -- a1
++    Int32# -> -- a2
++    Int32# -> -- a3
++    Int32# -> -- a4
++    Int32# -> -- a5
++    Int32# -> -- a6
++    Int32# -> -- a7
++    Word32# -> -- s0
++    Int32# -> -- s1
++    Int64# -- result
++
++foreign import ccall "funFloat"
++  funFloat ::
++    Float# -> -- a0
++    Float# -> -- a1
++    Float# -> -- a2
++    Float# -> -- a3
++    Float# -> -- a4
++    Float# -> -- a5
++    Float# -> -- a6
++    Float# -> -- a7
++    Float# -> -- s0
++    Float# -> -- s1
++    Float# -- result
++
++foreign import ccall "funDouble"
++  funDouble ::
++    Double# -> -- a0
++    Double# -> -- a1
++    Double# -> -- a2
++    Double# -> -- a3
++    Double# -> -- a4
++    Double# -> -- a5
++    Double# -> -- a6
++    Double# -> -- a7
++    Double# -> -- s0
++    Double# -> -- s1
++    Double# -- result
++
++main :: IO ()
++main =
++  -- N.B. the values here aren't choosen by accident: -1 means all bits one in
++  -- twos-complement, which is the same as the max word value.
++  let i8 :: Int8# = intToInt8# (-1#)
++      w8 :: Word8# = wordToWord8# (255##)
++      res8 :: Int64# = fun8 i8 w8 i8 i8 i8 i8 i8 i8 w8 i8
++      expected_res8 :: Int64 = 2 * (fromInteger . fromIntegral) (maxBound :: Word8) + 8 * (-1)
++      i16 :: Int16# = intToInt16# (-1#)
++      w16 :: Word16# = wordToWord16# (65535##)
++      res16 :: Int64# = fun16 i16 w16 i16 i16 i16 i16 i16 i16 w16 i16
++      expected_res16 :: Int64 = 2 * (fromInteger . fromIntegral) (maxBound :: Word16) + 8 * (-1)
++      i32 :: Int32# = intToInt32# (-1#)
++      w32 :: Word32# = wordToWord32# (4294967295##)
++      res32 :: Int64# = fun32 i32 w32 i32 i32 i32 i32 i32 i32 w32 i32
++      expected_res32 :: Int64 = 2 * (fromInteger . fromIntegral) (maxBound :: Word32) + 8 * (-1)
++      resFloat :: Float = F# (funFloat 1.0# 1.1# 1.2# 1.3# 1.4# 1.5# 1.6# 1.7# 1.8# 1.9#)
++      resDouble :: Double = D# (funDouble 1.0## 1.1## 1.2## 1.3## 1.4## 1.5## 1.6## 1.7## 1.8## 1.9##)
++   in do
++        print $ "fun8 result:" ++ show (I64# res8)
++        assertEqual expected_res8 (I64# res8)
++        print $ "fun16 result:" ++ show (I64# res16)
++        assertEqual expected_res16 (I64# res16)
++        print $ "fun32 result:" ++ show (I64# res32)
++        assertEqual expected_res32 (I64# res32)
++        print $ "funFloat result:" ++ show resFloat
++        assertEqual (14.5 :: Float) resFloat
++        print $ "funDouble result:" ++ show resDouble
++        assertEqual (14.5 :: Double) resDouble
++
++assertEqual :: (Eq a, Show a) => a -> a -> IO ()
++assertEqual a b =
++  if a == b
++    then pure ()
++    else error $ show a ++ " =/= " ++ show b
+Index: ghc-9.10.1/testsuite/tests/codeGen/should_run/CCallConv.stdout
+===================================================================
+--- /dev/null
++++ ghc-9.10.1/testsuite/tests/codeGen/should_run/CCallConv.stdout
+@@ -0,0 +1,60 @@
++"fun8 result:502"
++"fun16 result:131062"
++"fun32 result:8589934582"
++"funFloat result:14.5"
++"funDouble result:14.5"
++fun32:
++a0: 0xffffffff -1
++a1: 0xffffffff 4294967295
++a2: 0xffffffff -1
++a3: 0xffffffff -1
++a4: 0xffffffff -1
++a5: 0xffffffff -1
++a6: 0xffffffff -1
++a7: 0xffffffff -1
++s0: 0xffffffff -1
++s1: 0xffffffff 4294967295
++fun16:
++a0: 0xffffffff -1
++a1: 0xffff 65535
++a2: 0xffffffff -1
++a3: 0xffffffff -1
++a4: 0xffffffff -1
++a5: 0xffffffff -1
++a6: 0xffffffff -1
++a7: 0xffffffff -1
++s0: 0xffffffff -1
++s1: 0xffff 65535
++fun8:
++a0: 0xffffffff -1
++a1: 0xff 255
++a2: 0xffffffff -1
++a3: 0xffffffff -1
++a4: 0xffffffff -1
++a5: 0xffffffff -1
++a6: 0xffffffff -1
++a7: 0xffffffff -1
++s0: 0xffffffff -1
++s1: 0xff 255
++funFloat:
++a0: 1.000000
++a1: 1.100000
++a2: 1.200000
++a3: 1.300000
++a4: 1.400000
++a5: 1.500000
++a6: 1.600000
++a7: 1.700000
++s0: 1.800000
++s1: 1.900000
++funDouble:
++a0: 1.000000
++a1: 1.100000
++a2: 1.200000
++a3: 1.300000
++a4: 1.400000
++a5: 1.500000
++a6: 1.600000
++a7: 1.700000
++s0: 1.800000
++s1: 1.900000
+Index: ghc-9.10.1/testsuite/tests/codeGen/should_run/CCallConv_c.c
+===================================================================
+--- /dev/null
++++ ghc-9.10.1/testsuite/tests/codeGen/should_run/CCallConv_c.c
+@@ -0,0 +1,91 @@
++#include "stdint.h"
++#include "stdio.h"
++
++int64_t fun8(int8_t a0, uint8_t a1, int8_t a2, int8_t a3, int8_t a4, int8_t a5,
++             int8_t a6, int8_t a7, int8_t s0, uint8_t s1) {
++  printf("fun8:\n");
++  printf("a0: %#x %hhd\n", a0, a0);
++  printf("a1: %#x %hhu\n", a1, a1);
++  printf("a2: %#x %hhd\n", a2, a2);
++  printf("a3: %#x %hhd\n", a3, a3);
++  printf("a4: %#x %hhd\n", a4, a4);
++  printf("a5: %#x %hhd\n", a5, a5);
++  printf("a6: %#x %hhd\n", a6, a6);
++  printf("a7: %#x %hhd\n", a7, a7);
++  printf("s0: %#x %hhd\n", s0, s0);
++  printf("s1: %#x %hhu\n", s1, s1);
++
++  return a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + s0 + s1;
++}
++
++int64_t fun16(int16_t a0, uint16_t a1, int16_t a2, int16_t a3, int16_t a4,
++              int16_t a5, int16_t a6, int16_t a7, int16_t s0, uint16_t s1) {
++  printf("fun16:\n");
++  printf("a0: %#x %hd\n", a0, a0);
++  printf("a1: %#x %hu\n", a1, a1);
++  printf("a2: %#x %hd\n", a2, a2);
++  printf("a3: %#x %hd\n", a3, a3);
++  printf("a4: %#x %hd\n", a4, a4);
++  printf("a5: %#x %hd\n", a5, a5);
++  printf("a6: %#x %hd\n", a6, a6);
++  printf("a7: %#x %hd\n", a7, a7);
++  printf("s0: %#x %hd\n", s0, s0);
++  printf("s1: %#x %hu\n", s1, s1);
++
++  return a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + s0 + s1;
++}
++
++int64_t fun32(int32_t a0, uint32_t a1, int32_t a2, int32_t a3, int32_t a4,
++              int32_t a5, int32_t a6, int32_t a7, int32_t s0, uint32_t s1) {
++  printf("fun32:\n");
++  printf("a0: %#x %d\n", a0, a0);
++  printf("a1: %#x %u\n", a1, a1);
++  printf("a2: %#x %d\n", a2, a2);
++  printf("a3: %#x %d\n", a3, a3);
++  printf("a4: %#x %d\n", a4, a4);
++  printf("a5: %#x %d\n", a5, a5);
++  printf("a6: %#x %d\n", a6, a6);
++  printf("a7: %#x %d\n", a7, a7);
++  printf("s0: %#x %d\n", s0, s0);
++  printf("s1: %#x %u\n", s1, s1);
++
++  // Ensure the addition happens in long int (not just int) precission.
++  // Otherwise, the result is truncated during the operation.
++  int64_t force_int64_precission = 0;
++  return force_int64_precission + a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + s0 +
++         s1;
++}
++
++float funFloat(float a0, float a1, float a2, float a3, float a4, float a5,
++             float a6, float a7, float s0, float s1) {
++  printf("funFloat:\n");
++  printf("a0: %f\n", a0);
++  printf("a1: %f\n", a1);
++  printf("a2: %f\n", a2);
++  printf("a3: %f\n", a3);
++  printf("a4: %f\n", a4);
++  printf("a5: %f\n", a5);
++  printf("a6: %f\n", a6);
++  printf("a7: %f\n", a7);
++  printf("s0: %f\n", s0);
++  printf("s1: %f\n", s1);
++
++  return a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + s0 + s1;
++}
++
++double funDouble(double a0, double a1, double a2, double a3, double a4, double a5,
++             double a6, double a7, double s0, double s1) {
++  printf("funDouble:\n");
++  printf("a0: %f\n", a0);
++  printf("a1: %f\n", a1);
++  printf("a2: %f\n", a2);
++  printf("a3: %f\n", a3);
++  printf("a4: %f\n", a4);
++  printf("a5: %f\n", a5);
++  printf("a6: %f\n", a6);
++  printf("a7: %f\n", a7);
++  printf("s0: %f\n", s0);
++  printf("s1: %f\n", s1);
++
++  return a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + s0 + s1;
++}