Index: ghc-9.10.1/CODEOWNERS =================================================================== --- ghc-9.10.1.orig/CODEOWNERS +++ ghc-9.10.1/CODEOWNERS @@ -40,6 +40,7 @@ /compiler/GHC/HsToCore/Foreign/Wasm.hs @TerrorJack /compiler/GHC/Tc/Deriv/ @RyanGlScott /compiler/GHC/CmmToAsm/ @simonmar @bgamari @AndreasK +/compiler/GHC/CmmToAsm/RV64/ @supersven @angerman /compiler/GHC/CmmToAsm/Wasm/ @TerrorJack /compiler/GHC/CmmToLlvm/ @angerman /compiler/GHC/StgToCmm/ @simonmar @osa1 Index: ghc-9.10.1/compiler/CodeGen.Platform.h =================================================================== --- ghc-9.10.1.orig/compiler/CodeGen.Platform.h +++ ghc-9.10.1/compiler/CodeGen.Platform.h @@ -1,7 +1,8 @@ import GHC.Cmm.Expr #if !(defined(MACHREGS_i386) || defined(MACHREGS_x86_64) \ - || defined(MACHREGS_powerpc) || defined(MACHREGS_aarch64)) + || defined(MACHREGS_powerpc) || defined(MACHREGS_aarch64) \ + || defined(MACHREGS_riscv64)) import GHC.Utils.Panic.Plain #endif import GHC.Platform.Reg @@ -1041,6 +1042,105 @@ freeReg 18 = False # if defined(REG_Base) freeReg REG_Base = False +# endif +# if defined(REG_Sp) +freeReg REG_Sp = False +# endif +# if defined(REG_SpLim) +freeReg REG_SpLim = False +# endif +# if defined(REG_Hp) +freeReg REG_Hp = False +# endif +# if defined(REG_HpLim) +freeReg REG_HpLim = False +# endif + +# if defined(REG_R1) +freeReg REG_R1 = False +# endif +# if defined(REG_R2) +freeReg REG_R2 = False +# endif +# if defined(REG_R3) +freeReg REG_R3 = False +# endif +# if defined(REG_R4) +freeReg REG_R4 = False +# endif +# if defined(REG_R5) +freeReg REG_R5 = False +# endif +# if defined(REG_R6) +freeReg REG_R6 = False +# endif +# if defined(REG_R7) +freeReg REG_R7 = False +# endif +# if defined(REG_R8) +freeReg REG_R8 = False +# endif + +# if defined(REG_F1) +freeReg REG_F1 = False +# endif +# if defined(REG_F2) +freeReg REG_F2 = False +# endif +# if defined(REG_F3) +freeReg REG_F3 = False +# endif +# if defined(REG_F4) +freeReg REG_F4 = False +# endif +# if defined(REG_F5) +freeReg REG_F5 = False +# endif +# if defined(REG_F6) +freeReg REG_F6 = False +# endif + +# if defined(REG_D1) +freeReg REG_D1 = False +# endif +# if defined(REG_D2) +freeReg REG_D2 = False +# endif +# if defined(REG_D3) +freeReg REG_D3 = False +# endif +# if defined(REG_D4) +freeReg REG_D4 = False +# endif +# if defined(REG_D5) +freeReg REG_D5 = False +# endif +# if defined(REG_D6) +freeReg REG_D6 = False +# endif + +freeReg _ = True + +#elif defined(MACHREGS_riscv64) + +-- zero reg +freeReg 0 = False +-- link register +freeReg 1 = False +-- stack pointer +freeReg 2 = False +-- global pointer +freeReg 3 = False +-- thread pointer +freeReg 4 = False +-- frame pointer +freeReg 8 = False +-- made-up inter-procedural (ip) register +-- See Note [The made-up RISCV64 TMP (IP) register] +freeReg 31 = False + +# if defined(REG_Base) +freeReg REG_Base = False # endif # if defined(REG_Sp) freeReg REG_Sp = False Index: ghc-9.10.1/compiler/GHC/Cmm/CLabel.hs =================================================================== --- ghc-9.10.1.orig/compiler/GHC/Cmm/CLabel.hs +++ ghc-9.10.1/compiler/GHC/Cmm/CLabel.hs @@ -1720,6 +1720,8 @@ pprDynamicLinkerAsmLabel !platform dllIn | platformArch platform == ArchAArch64 = ppLbl + | platformArch platform == ArchRISCV64 + = ppLbl | platformArch platform == ArchX86_64 = case dllInfo of Index: ghc-9.10.1/compiler/GHC/CmmToAsm.hs =================================================================== --- ghc-9.10.1.orig/compiler/GHC/CmmToAsm.hs +++ ghc-9.10.1/compiler/GHC/CmmToAsm.hs @@ -67,6 
+67,7 @@ import qualified GHC.CmmToAsm.X86 as X import qualified GHC.CmmToAsm.PPC as PPC import qualified GHC.CmmToAsm.AArch64 as AArch64 import qualified GHC.CmmToAsm.Wasm as Wasm32 +import qualified GHC.CmmToAsm.RV64 as RV64 import GHC.CmmToAsm.Reg.Liveness import qualified GHC.CmmToAsm.Reg.Linear as Linear @@ -148,7 +149,7 @@ nativeCodeGen logger ts config modLoc h ArchAlpha -> panic "nativeCodeGen: No NCG for Alpha" ArchMipseb -> panic "nativeCodeGen: No NCG for mipseb" ArchMipsel -> panic "nativeCodeGen: No NCG for mipsel" - ArchRISCV64 -> panic "nativeCodeGen: No NCG for RISCV64" + ArchRISCV64 -> nCG' (RV64.ncgRV64 config) ArchLoongArch64->panic "nativeCodeGen: No NCG for LoongArch64" ArchUnknown -> panic "nativeCodeGen: No NCG for unknown arch" ArchJavaScript-> panic "nativeCodeGen: No NCG for JavaScript" Index: ghc-9.10.1/compiler/GHC/CmmToAsm/Dwarf/Constants.hs =================================================================== --- ghc-9.10.1.orig/compiler/GHC/CmmToAsm/Dwarf/Constants.hs +++ ghc-9.10.1/compiler/GHC/CmmToAsm/Dwarf/Constants.hs @@ -240,6 +240,7 @@ dwarfRegNo p r = case platformArch p of | r == xmm15 -> 32 ArchPPC_64 _ -> fromIntegral $ toRegNo r ArchAArch64 -> fromIntegral $ toRegNo r + ArchRISCV64 -> fromIntegral $ toRegNo r _other -> error "dwarfRegNo: Unsupported platform or unknown register!" -- | Virtual register number to use for return address. @@ -252,5 +253,6 @@ dwarfReturnRegNo p ArchX86 -> 8 -- eip ArchX86_64 -> 16 -- rip ArchPPC_64 ELF_V2 -> 65 -- lr (link register) - ArchAArch64-> 30 + ArchAArch64 -> 30 + ArchRISCV64 -> 1 -- ra (return address) _other -> error "dwarfReturnRegNo: Unsupported platform!" Index: ghc-9.10.1/compiler/GHC/CmmToAsm/PIC.hs =================================================================== --- ghc-9.10.1.orig/compiler/GHC/CmmToAsm/PIC.hs +++ ghc-9.10.1/compiler/GHC/CmmToAsm/PIC.hs @@ -132,6 +132,11 @@ cmmMakeDynamicReference config reference addImport symbolPtr return $ cmmMakePicReference config symbolPtr + AccessViaSymbolPtr | ArchRISCV64 <- platformArch platform -> do + let symbolPtr = mkDynamicLinkerLabel SymbolPtr lbl + addImport symbolPtr + return $ cmmMakePicReference config symbolPtr + AccessViaSymbolPtr -> do let symbolPtr = mkDynamicLinkerLabel SymbolPtr lbl addImport symbolPtr @@ -164,6 +169,10 @@ cmmMakePicReference config lbl | ArchAArch64 <- platformArch platform = CmmLit $ CmmLabel lbl + -- as on AArch64, there's no pic base register. 
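+  -- See Note [Handling PIC on RV64] in GHC.CmmToAsm.RV64.CodeGen for why a
+  -- plain label reference suffices here.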
+ | ArchRISCV64 <- platformArch platform + = CmmLit $ CmmLabel lbl + | OSAIX <- platformOS platform = CmmMachOp (MO_Add W32) [ CmmReg (CmmGlobal $ GlobalRegUse PicBaseReg (bWord platform)) Index: ghc-9.10.1/compiler/GHC/CmmToAsm/RV64.hs =================================================================== --- /dev/null +++ ghc-9.10.1/compiler/GHC/CmmToAsm/RV64.hs @@ -0,0 +1,57 @@ +{-# OPTIONS_GHC -fno-warn-orphans #-} + +-- | Native code generator for RiscV64 architectures +module GHC.CmmToAsm.RV64 (ncgRV64) where + +import GHC.CmmToAsm.Config +import GHC.CmmToAsm.Instr +import GHC.CmmToAsm.Monad +import GHC.CmmToAsm.RV64.CodeGen qualified as RV64 +import GHC.CmmToAsm.RV64.Instr qualified as RV64 +import GHC.CmmToAsm.RV64.Ppr qualified as RV64 +import GHC.CmmToAsm.RV64.RegInfo qualified as RV64 +import GHC.CmmToAsm.RV64.Regs qualified as RV64 +import GHC.CmmToAsm.Types +import GHC.Prelude +import GHC.Utils.Outputable (ftext) + +ncgRV64 :: NCGConfig -> NcgImpl RawCmmStatics RV64.Instr RV64.JumpDest +ncgRV64 config = + NcgImpl + { ncgConfig = config, + cmmTopCodeGen = RV64.cmmTopCodeGen, + generateJumpTableForInstr = RV64.generateJumpTableForInstr config, + getJumpDestBlockId = RV64.getJumpDestBlockId, + canShortcut = RV64.canShortcut, + shortcutStatics = RV64.shortcutStatics, + shortcutJump = RV64.shortcutJump, + pprNatCmmDeclS = RV64.pprNatCmmDecl config, + pprNatCmmDeclH = RV64.pprNatCmmDecl config, + maxSpillSlots = RV64.maxSpillSlots config, + allocatableRegs = RV64.allocatableRegs platform, + ncgAllocMoreStack = RV64.allocMoreStack platform, + ncgMakeFarBranches = RV64.makeFarBranches, + extractUnwindPoints = const [], + invertCondBranches = \_ _ -> id + } + where + platform = ncgPlatform config + +-- | `Instruction` instance for RV64 +instance Instruction RV64.Instr where + regUsageOfInstr = RV64.regUsageOfInstr + patchRegsOfInstr = RV64.patchRegsOfInstr + isJumpishInstr = RV64.isJumpishInstr + jumpDestsOfInstr = RV64.jumpDestsOfInstr + patchJumpInstr = RV64.patchJumpInstr + mkSpillInstr = RV64.mkSpillInstr + mkLoadInstr = RV64.mkLoadInstr + takeDeltaInstr = RV64.takeDeltaInstr + isMetaInstr = RV64.isMetaInstr + mkRegRegMoveInstr _ = RV64.mkRegRegMoveInstr + takeRegRegMoveInstr = RV64.takeRegRegMoveInstr + mkJumpInstr = RV64.mkJumpInstr + mkStackAllocInstr = RV64.mkStackAllocInstr + mkStackDeallocInstr = RV64.mkStackDeallocInstr + mkComment = pure . RV64.COMMENT . 
ftext + pprInstr = RV64.pprInstr Index: ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/CodeGen.hs =================================================================== --- /dev/null +++ ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/CodeGen.hs @@ -0,0 +1,2207 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE BinaryLiterals #-} +{-# LANGUAGE GADTs #-} +{-# LANGUAGE OverloadedStrings #-} + +module GHC.CmmToAsm.RV64.CodeGen + ( cmmTopCodeGen, + generateJumpTableForInstr, + makeFarBranches, + ) +where + +import Control.Monad +import Data.Maybe +import Data.Word +import GHC.Cmm +import GHC.Cmm.BlockId +import GHC.Cmm.CLabel +import GHC.Cmm.Dataflow.Block +import GHC.Cmm.Dataflow.Graph +import GHC.Cmm.Dataflow.Label +import GHC.Cmm.DebugBlock +import GHC.Cmm.Switch +import GHC.Cmm.Utils +import GHC.CmmToAsm.CPrim +import GHC.CmmToAsm.Config +import GHC.CmmToAsm.Format +import GHC.CmmToAsm.Monad + ( NatM, + getBlockIdNat, + getConfig, + getDebugBlock, + getFileId, + getNewLabelNat, + getNewRegNat, + getPicBaseMaybeNat, + getPlatform, + ) +import GHC.CmmToAsm.PIC +import GHC.CmmToAsm.RV64.Cond +import GHC.CmmToAsm.RV64.Instr +import GHC.CmmToAsm.RV64.Regs +import GHC.CmmToAsm.Types +import GHC.Data.FastString +import GHC.Data.OrdList +import GHC.Float +import GHC.Platform +import GHC.Platform.Reg +import GHC.Platform.Regs +import GHC.Prelude hiding (EQ) +import GHC.Types.Basic +import GHC.Types.ForeignCall +import GHC.Types.SrcLoc (srcSpanFile, srcSpanStartCol, srcSpanStartLine) +import GHC.Types.Tickish (GenTickish (..)) +import GHC.Types.Unique.Supply +import GHC.Utils.Constants (debugIsOn) +import GHC.Utils.Misc +import GHC.Utils.Monad +import GHC.Utils.Outputable +import GHC.Utils.Panic + +-- For an overview of an NCG's structure, see Note [General layout of an NCG] + +cmmTopCodeGen :: + RawCmmDecl -> + NatM [NatCmmDecl RawCmmStatics Instr] +-- Thus we'll have to deal with either CmmProc ... +cmmTopCodeGen _cmm@(CmmProc info lab live graph) = do + picBaseMb <- getPicBaseMaybeNat + when (isJust picBaseMb) $ panic "RV64.cmmTopCodeGen: Unexpected PIC base register (RISCV ISA does not define one)" + + let blocks = toBlockListEntryFirst graph + (nat_blocks, statics) <- mapAndUnzipM basicBlockCodeGen blocks + + let proc = CmmProc info lab live (ListGraph $ concat nat_blocks) + tops = proc : concat statics + + pure tops + +-- ... or CmmData. +cmmTopCodeGen (CmmData sec dat) = pure [CmmData sec dat] -- no translation, we just use CmmStatic + +basicBlockCodeGen :: + Block CmmNode C C -> + NatM + ( [NatBasicBlock Instr], + [NatCmmDecl RawCmmStatics Instr] + ) +basicBlockCodeGen block = do + config <- getConfig + let (_, nodes, tail) = blockSplit block + id = entryLabel block + stmts = blockToList nodes + + header_comment_instr + | debugIsOn = + unitOL + $ MULTILINE_COMMENT + ( text "-- --------------------------- basicBlockCodeGen --------------------------- --\n" + $+$ withPprStyle defaultDumpStyle (pdoc (ncgPlatform config) block) + ) + | otherwise = nilOL + + -- Generate location directive `.loc` (DWARF debug location info) + loc_instrs <- genLocInstrs + + -- Generate other instructions + mid_instrs <- stmtsToInstrs stmts + (!tail_instrs) <- stmtToInstrs tail + + let instrs = header_comment_instr `appOL` loc_instrs `appOL` mid_instrs `appOL` tail_instrs + + -- TODO: Then x86 backend runs @verifyBasicBlock@ here. How important it is to + -- have a valid CFG is an open question: This and the AArch64 and PPC NCGs + -- work fine without it. 
+ + -- Code generation may introduce new basic block boundaries, which are + -- indicated by the NEWBLOCK instruction. We must split up the instruction + -- stream into basic blocks again. Also, we extract LDATAs here too. + (top, other_blocks, statics) = foldrOL mkBlocks ([], [], []) instrs + + return (BasicBlock id top : other_blocks, statics) + where + genLocInstrs :: NatM (OrdList Instr) + genLocInstrs = do + dbg <- getDebugBlock (entryLabel block) + case dblSourceTick =<< dbg of + Just (SourceNote span name) -> + do + fileId <- getFileId (srcSpanFile span) + let line = srcSpanStartLine span; col = srcSpanStartCol span + pure $ unitOL $ LOCATION fileId line col name + _ -> pure nilOL + +mkBlocks :: + Instr -> + ([Instr], [GenBasicBlock Instr], [GenCmmDecl RawCmmStatics h g]) -> + ([Instr], [GenBasicBlock Instr], [GenCmmDecl RawCmmStatics h g]) +mkBlocks (NEWBLOCK id) (instrs, blocks, statics) = + ([], BasicBlock id instrs : blocks, statics) +mkBlocks (LDATA sec dat) (instrs, blocks, statics) = + (instrs, blocks, CmmData sec dat : statics) +mkBlocks instr (instrs, blocks, statics) = + (instr : instrs, blocks, statics) + +-- ----------------------------------------------------------------------------- + +-- | Utilities + +-- | Annotate an `Instr` with a `SDoc` comment +ann :: SDoc -> Instr -> Instr +ann doc instr {- debugIsOn -} = ANN doc instr +{-# INLINE ann #-} + +-- Using pprExpr will hide the AST, @ANN@ will end up in the assembly with +-- -dppr-debug. The idea is that we can trivially see how a cmm expression +-- ended up producing the assembly we see. By having the verbatim AST printed +-- we can simply check the patterns that were matched to arrive at the assembly +-- we generated. +-- +-- pprExpr will hide a lot of noise of the underlying data structure and print +-- the expression into something that can be easily read by a human. However +-- going back to the exact CmmExpr representation can be laborious and adds +-- indirections to find the matches that lead to the assembly. +-- +-- An improvement could be to have +-- +-- (pprExpr genericPlatform e) <> parens (text. show e) +-- +-- to have the best of both worlds. +-- +-- Note: debugIsOn is too restrictive, it only works for debug compilers. +-- However, we do not only want to inspect this for debug compilers. Ideally +-- we'd have a check for -dppr-debug here already, such that we don't even +-- generate the ANN expressions. However, as they are lazy, they shouldn't be +-- forced until we actually force them, and without -dppr-debug they should +-- never end up being forced. +annExpr :: CmmExpr -> Instr -> Instr +annExpr e {- debugIsOn -} = ANN (text . show $ e) +-- annExpr e instr {- debugIsOn -} = ANN (pprExpr genericPlatform e) instr +-- annExpr _ instr = instr +{-# INLINE annExpr #-} + +-- ----------------------------------------------------------------------------- +-- Generating a table-branch + +-- Note [RISCV64 Jump Tables] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- Jump tables are implemented by generating a table of relative addresses, +-- where each entry is the relative offset to the target block from the first +-- entry / table label (`generateJumpTableForInstr`). Using the jump table means +-- loading the entry's value and jumping to the calculated absolute address +-- (`genSwitch`). +-- +-- For example, this Cmm switch +-- +-- switch [1 .. 
10] _s2wn::I64 {
+-- case 1 : goto c347;
+-- case 2 : goto c348;
+-- case 3 : goto c349;
+-- case 4 : goto c34a;
+-- case 5 : goto c34b;
+-- case 6 : goto c34c;
+-- case 7 : goto c34d;
+-- case 8 : goto c34e;
+-- case 9 : goto c34f;
+-- case 10 : goto c34g;
+-- } // CmmSwitch
+--
+-- leads to this jump table in assembly
+--
+-- .section .rodata
+-- .balign 8
+-- .Ln34G:
+-- .quad 0
+-- .quad .Lc347-(.Ln34G)+0
+-- .quad .Lc348-(.Ln34G)+0
+-- .quad .Lc349-(.Ln34G)+0
+-- .quad .Lc34a-(.Ln34G)+0
+-- .quad .Lc34b-(.Ln34G)+0
+-- .quad .Lc34c-(.Ln34G)+0
+-- .quad .Lc34d-(.Ln34G)+0
+-- .quad .Lc34e-(.Ln34G)+0
+-- .quad .Lc34f-(.Ln34G)+0
+-- .quad .Lc34g-(.Ln34G)+0
+--
+-- and this indexing code where the jump should be done (register t0 contains
+-- the index)
+--
+-- addi t0, t0, 0 // silly move (ignore it)
+-- la t1, .Ln34G // load the table's address
+-- sll t0, t0, 3 // index * 8 -> offset in bytes
+-- add t0, t0, t1 // address of the table's entry
+-- ld t0, 0(t0) // load entry
+-- add t0, t0, t1 // relative to absolute address
+-- jalr zero, t0, 0 // jump to the block
+--
+-- In object code (disassembled) the table looks like
+--
+-- 0000000000000000 <.Ln34G>:
+-- ...
+-- 8: R_RISCV_ADD64 .Lc347
+-- 8: R_RISCV_SUB64 .Ln34G
+-- 10: R_RISCV_ADD64 .Lc348
+-- 10: R_RISCV_SUB64 .Ln34G
+-- 18: R_RISCV_ADD64 .Lc349
+-- 18: R_RISCV_SUB64 .Ln34G
+-- 20: R_RISCV_ADD64 .Lc34a
+-- 20: R_RISCV_SUB64 .Ln34G
+-- 28: R_RISCV_ADD64 .Lc34b
+-- 28: R_RISCV_SUB64 .Ln34G
+-- 30: R_RISCV_ADD64 .Lc34c
+-- 30: R_RISCV_SUB64 .Ln34G
+-- 38: R_RISCV_ADD64 .Lc34d
+-- 38: R_RISCV_SUB64 .Ln34G
+-- 40: R_RISCV_ADD64 .Lc34e
+-- 40: R_RISCV_SUB64 .Ln34G
+-- 48: R_RISCV_ADD64 .Lc34f
+-- 48: R_RISCV_SUB64 .Ln34G
+-- 50: R_RISCV_ADD64 .Lc34g
+-- 50: R_RISCV_SUB64 .Ln34G
+--
+-- I.e. the relative offset calculations are done by the linker via relocations.
+-- This seems to be PIC compatible; at least `scanelf` (pax-utils) does not
+-- complain.
+
+
+-- | Generate jump to jump table target
+--
+-- The index into the jump table is calculated by evaluating @expr@. The
+-- corresponding table entry contains the relative address to jump to (relative
+-- to the jump table's first entry / the table's own label).
+genSwitch :: NCGConfig -> CmmExpr -> SwitchTargets -> NatM InstrBlock
+genSwitch config expr targets = do
+  (reg, fmt1, e_code) <- getSomeReg indexExpr
+  let fmt = II64
+  targetReg <- getNewRegNat fmt
+  lbl <- getNewLabelNat
+  dynRef <- cmmMakeDynamicReference config DataReference lbl
+  (tableReg, fmt2, t_code) <- getSomeReg dynRef
+  let code =
+        toOL
+          [ COMMENT (text "indexExpr" <+> (text . show) indexExpr),
+            COMMENT (text "dynRef" <+> (text .
show) dynRef) + ] + `appOL` e_code + `appOL` t_code + `appOL` toOL + [ COMMENT (ftext "Jump table for switch"), + -- index to offset into the table (relative to tableReg) + annExpr expr (SLL (OpReg (formatToWidth fmt1) reg) (OpReg (formatToWidth fmt1) reg) (OpImm (ImmInt 3))), + -- calculate table entry address + ADD (OpReg W64 targetReg) (OpReg (formatToWidth fmt1) reg) (OpReg (formatToWidth fmt2) tableReg), + -- load table entry (relative offset from tableReg (first entry) to target label) + LDRU II64 (OpReg W64 targetReg) (OpAddr (AddrRegImm targetReg (ImmInt 0))), + -- calculate absolute address of the target label + ADD (OpReg W64 targetReg) (OpReg W64 targetReg) (OpReg W64 tableReg), + -- prepare jump to target label + J_TBL ids (Just lbl) targetReg + ] + return code + where + -- See Note [Sub-word subtlety during jump-table indexing] in + -- GHC.CmmToAsm.X86.CodeGen for why we must first offset, then widen. + indexExpr0 = cmmOffset platform expr offset + -- We widen to a native-width register to sanitize the high bits + indexExpr = + CmmMachOp + (MO_UU_Conv expr_w (platformWordWidth platform)) + [indexExpr0] + expr_w = cmmExprWidth platform expr + (offset, ids) = switchTargetsToTable targets + platform = ncgPlatform config + +-- | Generate jump table data (if required) +-- +-- The idea is to emit one table entry per case. The entry is the relative +-- address of the block to jump to (relative to the table's first entry / +-- table's own label.) The calculation itself is done by the linker. +generateJumpTableForInstr :: + NCGConfig -> + Instr -> + Maybe (NatCmmDecl RawCmmStatics Instr) +generateJumpTableForInstr config (J_TBL ids (Just lbl) _) = + let jumpTable = + map jumpTableEntryRel ids + where + jumpTableEntryRel Nothing = + CmmStaticLit (CmmInt 0 (ncgWordWidth config)) + jumpTableEntryRel (Just blockid) = + CmmStaticLit + ( CmmLabelDiffOff + blockLabel + lbl + 0 + (ncgWordWidth config) + ) + where + blockLabel = blockLbl blockid + in Just (CmmData (Section ReadOnlyData lbl) (CmmStaticsRaw lbl jumpTable)) +generateJumpTableForInstr _ _ = Nothing + +-- ----------------------------------------------------------------------------- +-- Top-level of the instruction selector + +stmtsToInstrs :: + -- | Cmm Statements + [CmmNode O O] -> + -- | Resulting instruction + NatM InstrBlock +stmtsToInstrs stmts = concatOL <$> mapM stmtToInstrs stmts + +stmtToInstrs :: + CmmNode e x -> + -- | Resulting instructions + NatM InstrBlock +stmtToInstrs stmt = do + config <- getConfig + platform <- getPlatform + case stmt of + CmmUnsafeForeignCall target result_regs args -> + genCCall target result_regs args + CmmComment s -> pure (unitOL (COMMENT (ftext s))) + CmmTick {} -> pure nilOL + CmmAssign reg src + | isFloatType ty -> assignReg_FltCode format reg src + | otherwise -> assignReg_IntCode format reg src + where + ty = cmmRegType reg + format = cmmTypeFormat ty + CmmStore addr src _alignment + | isFloatType ty -> assignMem_FltCode format addr src + | otherwise -> assignMem_IntCode format addr src + where + ty = cmmExprType platform src + format = cmmTypeFormat ty + CmmBranch id -> genBranch id + -- We try to arrange blocks such that the likely branch is the fallthrough + -- in GHC.Cmm.ContFlowOpt. So we can assume the condition is likely false here. 
+ CmmCondBranch arg true false _prediction -> + genCondBranch true false arg + CmmSwitch arg ids -> genSwitch config arg ids + CmmCall {cml_target = arg} -> genJump arg + CmmUnwind _regs -> pure nilOL + -- Intentionally not have a default case here: If anybody adds a + -- constructor, the compiler should force them to think about this here. + CmmForeignCall {} -> pprPanic "stmtToInstrs: statement should have been cps'd away" (pdoc platform stmt) + CmmEntry {} -> pprPanic "stmtToInstrs: statement should have been cps'd away" (pdoc platform stmt) + +-------------------------------------------------------------------------------- + +-- | 'InstrBlock's are the insn sequences generated by the insn selectors. +-- +-- They are really trees of insns to facilitate fast appending, where a +-- left-to-right traversal yields the insns in the correct order. +type InstrBlock = + OrdList Instr + +-- | Register's passed up the tree. +-- +-- If the stix code forces the register to live in a pre-decided machine +-- register, it comes out as @Fixed@; otherwise, it comes out as @Any@, and the +-- parent can decide which register to put it in. +data Register + = Fixed Format Reg InstrBlock + | Any Format (Reg -> InstrBlock) + +-- | Sometimes we need to change the Format of a register. Primarily during +-- conversion. +swizzleRegisterRep :: Format -> Register -> Register +swizzleRegisterRep format' (Fixed _format reg code) = Fixed format' reg code +swizzleRegisterRep format' (Any _format codefn) = Any format' codefn + +-- | Grab a `Reg` for a `CmmReg` +-- +-- `LocalReg`s are assigned virtual registers (`RegVirtual`), `GlobalReg`s are +-- assigned real registers (`RegReal`). It is an error if a `GlobalReg` is not a +-- STG register. +getRegisterReg :: Platform -> CmmReg -> Reg +getRegisterReg _ (CmmLocal (LocalReg u pk)) = + RegVirtual $ mkVirtualReg u (cmmTypeFormat pk) +getRegisterReg platform (CmmGlobal mid) = + case globalRegMaybe platform (globalRegUseGlobalReg mid) of + Just reg -> RegReal reg + Nothing -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid) + +-- ----------------------------------------------------------------------------- +-- General things for putting together code sequences + +-- | Compute an expression into any register +getSomeReg :: CmmExpr -> NatM (Reg, Format, InstrBlock) +getSomeReg expr = do + r <- getRegister expr + case r of + Any rep code -> do + newReg <- getNewRegNat rep + return (newReg, rep, code newReg) + Fixed rep reg code -> + return (reg, rep, code) + +-- | Compute an expression into any floating-point register +-- +-- If the initial expression is not a floating-point expression, finally move +-- the result into a floating-point register. +getFloatReg :: (HasCallStack) => CmmExpr -> NatM (Reg, Format, InstrBlock) +getFloatReg expr = do + r <- getRegister expr + case r of + Any rep code | isFloatFormat rep -> do + newReg <- getNewRegNat rep + return (newReg, rep, code newReg) + Any II32 code -> do + newReg <- getNewRegNat FF32 + return (newReg, FF32, code newReg) + Any II64 code -> do + newReg <- getNewRegNat FF64 + return (newReg, FF64, code newReg) + Any _w _code -> do + config <- getConfig + pprPanic "can't do getFloatReg on" (pdoc (ncgPlatform config) expr) + -- can't do much for fixed. + Fixed rep reg code -> + return (reg, rep, code) + +-- | Map `CmmLit` to `OpImm` +-- +-- N.B. this is a partial function, because not all `CmmLit`s have an immediate +-- representation. +litToImm' :: CmmLit -> Operand +litToImm' = OpImm . 
litToImm + +-- | Compute a `CmmExpr` into a `Register` +getRegister :: CmmExpr -> NatM Register +getRegister e = do + config <- getConfig + getRegister' config (ncgPlatform config) e + +-- | The register width to be used for an operation on the given width +-- operand. +opRegWidth :: Width -> Width +opRegWidth W64 = W64 +opRegWidth W32 = W32 +opRegWidth W16 = W32 +opRegWidth W8 = W32 +opRegWidth w = pprPanic "opRegWidth" (text "Unsupported width" <+> ppr w) + +-- Note [Signed arithmetic on RISCV64] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- Handling signed arithmetic on sub-word-size values on RISCV64 is a bit +-- tricky as Cmm's type system does not capture signedness. While 32-bit values +-- are fairly easy to handle due to RISCV64's 32-bit instruction variants +-- (denoted by use of %wN registers), 16- and 8-bit values require quite some +-- care. +-- +-- We handle 16-and 8-bit values by using the 32-bit operations and +-- sign-/zero-extending operands and truncate results as necessary. For +-- simplicity we maintain the invariant that a register containing a +-- sub-word-size value always contains the zero-extended form of that value +-- in between operations. +-- +-- For instance, consider the program, +-- +-- test(bits64 buffer) +-- bits8 a = bits8[buffer]; +-- bits8 b = %mul(a, 42); +-- bits8 c = %not(b); +-- bits8 d = %shrl(c, 4::bits8); +-- return (d); +-- } +-- +-- This program begins by loading `a` from memory, for which we use a +-- zero-extended byte-size load. We next sign-extend `a` to 32-bits, and use a +-- 32-bit multiplication to compute `b`, and truncate the result back down to +-- 8-bits. +-- +-- Next we compute `c`: The `%not` requires no extension of its operands, but +-- we must still truncate the result back down to 8-bits. Finally the `%shrl` +-- requires no extension and no truncate since we can assume that +-- `c` is zero-extended. +-- +-- The "RISC-V Sign Extension Optimizations" LLVM tech talk presentation by +-- Craig Topper covers possible future improvements +-- (https://llvm.org/devmtg/2022-11/slides/TechTalk21-RISC-VSignExtensionOptimizations.pdf) +-- +-- +-- Note [Handling PIC on RV64] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- RV64 does not have a special PIC register, the general approach is to simply +-- do PC-relative addressing or go through the GOT. There is assembly support +-- for both. +-- +-- rv64 assembly has a `la` (load address) pseudo-instruction, that allows +-- loading a label's address into a register. The instruction is desugared into +-- different addressing modes, e.g. PC-relative addressing: +-- +-- 1: lui rd1, %pcrel_hi(label) +-- addi rd1, %pcrel_lo(1b) +-- +-- See https://sourceware.org/binutils/docs/as/RISC_002dV_002dModifiers.html, +-- PIC can be enabled/disabled through +-- +-- .option pic +-- +-- See https://sourceware.org/binutils/docs/as/RISC_002dV_002dDirectives.html#RISC_002dV_002dDirectives +-- +-- CmmGlobal @PicBaseReg@'s are generated in @GHC.CmmToAsm.PIC@ in the +-- @cmmMakePicReference@. This is in turn called from @cmmMakeDynamicReference@ +-- also in @Cmm.CmmToAsm.PIC@ from where it is also exported. There are two +-- callsites for this. One is in this module to produce the @target@ in @genCCall@ +-- the other is in @GHC.CmmToAsm@ in @cmmExprNative@. +-- +-- Conceptually we do not want any special PicBaseReg to be used on RV64. If +-- we want to distinguish between symbol loading, we need to address this through +-- the way we load it, not through a register. 
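+--
+-- With PIC enabled, the assembler expands `la` through the GOT instead; a
+-- sketch of the standard expansion (per the RISC-V psABI, not GHC-specific):
+--
+-- 1: auipc rd1, %got_pcrel_hi(label)
+--    ld rd1, %pcrel_lo(1b)(rd1)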
+-- + +getRegister' :: NCGConfig -> Platform -> CmmExpr -> NatM Register +-- OPTIMIZATION WARNING: CmmExpr rewrites +-- 1. Rewrite: Reg + (-n) => Reg - n +-- TODO: this expression shouldn't even be generated to begin with. +getRegister' config plat (CmmMachOp (MO_Add w0) [x, CmmLit (CmmInt i w1)]) + | i < 0 = + getRegister' config plat (CmmMachOp (MO_Sub w0) [x, CmmLit (CmmInt (-i) w1)]) +getRegister' config plat (CmmMachOp (MO_Sub w0) [x, CmmLit (CmmInt i w1)]) + | i < 0 = + getRegister' config plat (CmmMachOp (MO_Add w0) [x, CmmLit (CmmInt (-i) w1)]) +-- Generic case. +getRegister' config plat expr = + case expr of + CmmReg (CmmGlobal (GlobalRegUse PicBaseReg _)) -> + -- See Note [Handling PIC on RV64] + pprPanic "getRegister': There's no PIC base register on RISCV" (ppr PicBaseReg) + CmmLit lit -> + case lit of + CmmInt 0 w -> pure $ Fixed (intFormat w) zeroReg nilOL + CmmInt i w -> + -- narrowU is important: Negative immediates may be + -- sign-extended on load! + let imm = OpImm . ImmInteger $ narrowU w i + in pure (Any (intFormat w) (\dst -> unitOL $ annExpr expr (MOV (OpReg w dst) imm))) + CmmFloat 0 w -> do + let op = litToImm' lit + pure (Any (floatFormat w) (\dst -> unitOL $ annExpr expr (MOV (OpReg w dst) op))) + CmmFloat _f W8 -> pprPanic "getRegister' (CmmLit:CmmFloat), no support for bytes" (pdoc plat expr) + CmmFloat _f W16 -> pprPanic "getRegister' (CmmLit:CmmFloat), no support for halfs" (pdoc plat expr) + CmmFloat f W32 -> do + let word = castFloatToWord32 (fromRational f) :: Word32 + intReg <- getNewRegNat (intFormat W32) + return + ( Any + (floatFormat W32) + ( \dst -> + toOL + [ annExpr expr + $ MOV (OpReg W32 intReg) (OpImm (ImmInteger (fromIntegral word))), + MOV (OpReg W32 dst) (OpReg W32 intReg) + ] + ) + ) + CmmFloat f W64 -> do + let word = castDoubleToWord64 (fromRational f) :: Word64 + intReg <- getNewRegNat (intFormat W64) + return + ( Any + (floatFormat W64) + ( \dst -> + toOL + [ annExpr expr + $ MOV (OpReg W64 intReg) (OpImm (ImmInteger (fromIntegral word))), + MOV (OpReg W64 dst) (OpReg W64 intReg) + ] + ) + ) + CmmFloat _f _w -> pprPanic "getRegister' (CmmLit:CmmFloat), unsupported float lit" (pdoc plat expr) + CmmVec _lits -> pprPanic "getRegister' (CmmLit:CmmVec): " (pdoc plat expr) + CmmLabel lbl -> do + let op = OpImm (ImmCLbl lbl) + rep = cmmLitType plat lit + format = cmmTypeFormat rep + return (Any format (\dst -> unitOL $ annExpr expr (LDR format (OpReg (formatToWidth format) dst) op))) + CmmLabelOff lbl off | isNbitEncodeable 12 (fromIntegral off) -> do + let op = OpImm (ImmIndex lbl off) + rep = cmmLitType plat lit + format = cmmTypeFormat rep + return (Any format (\dst -> unitOL $ LDR format (OpReg (formatToWidth format) dst) op)) + CmmLabelOff lbl off -> do + let op = litToImm' (CmmLabel lbl) + rep = cmmLitType plat lit + format = cmmTypeFormat rep + width = typeWidth rep + (off_r, _off_format, off_code) <- getSomeReg $ CmmLit (CmmInt (fromIntegral off) width) + return + ( Any + format + ( \dst -> + off_code + `snocOL` LDR format (OpReg (formatToWidth format) dst) op + `snocOL` ADD (OpReg width dst) (OpReg width dst) (OpReg width off_r) + ) + ) + CmmLabelDiffOff {} -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr) + CmmBlock _ -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr) + CmmHighStackMark -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr) + CmmLoad mem rep _ -> do + let format = cmmTypeFormat rep + width = typeWidth rep + Amode addr addr_code <- getAmode plat width mem + case width of + w 
+ | w <= W64 -> + -- Load without sign-extension. See Note [Signed arithmetic on RISCV64] + pure + ( Any + format + ( \dst -> + addr_code + `snocOL` LDRU format (OpReg width dst) (OpAddr addr) + ) + ) + _ -> + pprPanic ("Width too big! Cannot load: " ++ show width) (pdoc plat expr) + CmmStackSlot _ _ -> + pprPanic "getRegister' (CmmStackSlot): " (pdoc plat expr) + CmmReg reg -> + return + ( Fixed + (cmmTypeFormat (cmmRegType reg)) + (getRegisterReg plat reg) + nilOL + ) + CmmRegOff reg off | isNbitEncodeable 12 (fromIntegral off) -> do + getRegister' config plat + $ CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)] + where + width = typeWidth (cmmRegType reg) + CmmRegOff reg off -> do + (off_r, _off_format, off_code) <- getSomeReg $ CmmLit (CmmInt (fromIntegral off) width) + (reg, _format, code) <- getSomeReg $ CmmReg reg + return + $ Any + (intFormat width) + ( \dst -> + off_code + `appOL` code + `snocOL` ADD (OpReg width dst) (OpReg width reg) (OpReg width off_r) + ) + where + width = typeWidth (cmmRegType reg) + + -- Handle MO_RelaxedRead as a normal CmmLoad, to allow + -- non-trivial addressing modes to be used. + CmmMachOp (MO_RelaxedRead w) [e] -> + getRegister (CmmLoad e (cmmBits w) NaturallyAligned) + -- for MachOps, see GHC.Cmm.MachOp + -- For CmmMachOp, see GHC.Cmm.Expr + CmmMachOp op [e] -> do + (reg, _format, code) <- getSomeReg e + case op of + MO_Not w -> return $ Any (intFormat w) $ \dst -> + let w' = opRegWidth w + in code + `snocOL` + -- pseudo instruction `not` is `xori rd, rs, -1` + ann (text "not") (XORI (OpReg w' dst) (OpReg w' reg) (OpImm (ImmInt (-1)))) + `appOL` truncateReg w' w dst -- See Note [Signed arithmetic on RISCV64] + MO_S_Neg w -> negate code w reg + MO_F_Neg w -> + return + $ Any + (floatFormat w) + ( \dst -> + code + `snocOL` NEG (OpReg w dst) (OpReg w reg) + ) + -- TODO: Can this case happen? + MO_SF_Conv from to | from < W32 -> do + -- extend to the smallest available representation + (reg_x, code_x) <- signExtendReg from W32 reg + pure + $ Any + (floatFormat to) + ( \dst -> + code + `appOL` code_x + `snocOL` annExpr expr (FCVT IntToFloat (OpReg to dst) (OpReg from reg_x)) -- (Signed ConVerT Float) + ) + MO_SF_Conv from to -> + pure + $ Any + (floatFormat to) + ( \dst -> + code + `snocOL` annExpr expr (FCVT IntToFloat (OpReg to dst) (OpReg from reg)) -- (Signed ConVerT Float) + ) + MO_FS_Conv from to + | to < W32 -> + pure + $ Any + (intFormat to) + ( \dst -> + code + `snocOL` + -- W32 is the smallest width to convert to. Decrease width afterwards. 
+ annExpr expr (FCVT FloatToInt (OpReg W32 dst) (OpReg from reg)) + `appOL` signExtendAdjustPrecission W32 to dst dst -- (float convert (-> zero) signed) + ) + MO_FS_Conv from to -> + pure + $ Any + (intFormat to) + ( \dst -> + code + `snocOL` annExpr expr (FCVT FloatToInt (OpReg to dst) (OpReg from reg)) + `appOL` truncateReg from to dst -- (float convert (-> zero) signed) + ) + MO_UU_Conv from to + | from <= to -> + pure + $ Any + (intFormat to) + ( \dst -> + code + `snocOL` annExpr e (MOV (OpReg to dst) (OpReg from reg)) + ) + MO_UU_Conv from to -> + pure + $ Any + (intFormat to) + ( \dst -> + code + `snocOL` annExpr e (MOV (OpReg from dst) (OpReg from reg)) + `appOL` truncateReg from to dst + ) + MO_SS_Conv from to -> ss_conv from to reg code + MO_FF_Conv from to -> return $ Any (floatFormat to) (\dst -> code `snocOL` annExpr e (FCVT FloatToFloat (OpReg to dst) (OpReg from reg))) + -- Conversions + -- TODO: Duplication with MO_UU_Conv + MO_XX_Conv from to + | to < from -> + pure + $ Any + (intFormat to) + ( \dst -> + code + `snocOL` annExpr e (MOV (OpReg from dst) (OpReg from reg)) + `appOL` truncateReg from to dst + ) + MO_XX_Conv _from to -> swizzleRegisterRep (intFormat to) <$> getRegister e + MO_AlignmentCheck align wordWidth -> do + reg <- getRegister' config plat e + addAlignmentCheck align wordWidth reg + x -> pprPanic ("getRegister' (monadic CmmMachOp): " ++ show x) (pdoc plat expr) + where + -- In the case of 16- or 8-bit values we need to sign-extend to 32-bits + -- See Note [Signed arithmetic on RISCV64]. + negate code w reg = do + let w' = opRegWidth w + (reg', code_sx) <- signExtendReg w w' reg + return $ Any (intFormat w) $ \dst -> + code + `appOL` code_sx + `snocOL` NEG (OpReg w' dst) (OpReg w' reg') + `appOL` truncateReg w' w dst + + ss_conv from to reg code + | from < to = do + pure $ Any (intFormat to) $ \dst -> + code + `appOL` signExtend from to reg dst + `appOL` truncateReg from to dst + | from > to = + pure $ Any (intFormat to) $ \dst -> + code + `appOL` toOL + [ ann + (text "MO_SS_Conv: narrow register signed" <+> ppr reg <+> ppr from <> text "->" <> ppr to) + (SLL (OpReg to dst) (OpReg from reg) (OpImm (ImmInt shift))), + -- signed right shift + SRA (OpReg to dst) (OpReg to dst) (OpImm (ImmInt shift)) + ] + `appOL` truncateReg from to dst + | otherwise = + -- No conversion necessary: Just copy. + pure $ Any (intFormat from) $ \dst -> + code `snocOL` MOV (OpReg from dst) (OpReg from reg) + where + shift = 64 - (widthInBits from - widthInBits to) + + -- Dyadic machops: + -- + -- The general idea is: + -- compute x <- x + -- compute x <- y + -- OP x, x, x + -- + -- TODO: for now we'll only implement the 64bit versions. And rely on the + -- fallthrough to alert us if things go wrong! + -- OPTIMIZATION WARNING: Dyadic CmmMachOp destructuring + -- 0. TODO This should not exist! Rewrite: Reg +- 0 -> Reg + CmmMachOp (MO_Add _) [expr'@(CmmReg (CmmGlobal _r)), CmmLit (CmmInt 0 _)] -> getRegister' config plat expr' + CmmMachOp (MO_Sub _) [expr'@(CmmReg (CmmGlobal _r)), CmmLit (CmmInt 0 _)] -> getRegister' config plat expr' + -- 1. Compute Reg +/- n directly. + -- For Add/Sub we can directly encode 12bits, or 12bits lsl #12. + CmmMachOp (MO_Add w) [CmmReg reg, CmmLit (CmmInt n _)] + | fitsIn12bitImm n -> return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (ADD (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n)))) + where + -- TODO: 12bits lsl #12; e.g. lower 12 bits of n are 0; shift n >> 12, and set lsl to #12. 
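+        -- Assuming fitsIn12bitImm means a signed 12-bit (I-type) immediate,
+        -- this covers -2048 <= n <= 2047: e.g. `reg + 2047` is encoded in a
+        -- single ADDI, while `reg + 2048` falls through to the generic case.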
+ w' = formatToWidth (cmmTypeFormat (cmmRegType reg)) + r' = getRegisterReg plat reg + CmmMachOp (MO_Sub w) [CmmReg reg, CmmLit (CmmInt n _)] + | fitsIn12bitImm n -> return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (SUB (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n)))) + where + -- TODO: 12bits lsl #12; e.g. lower 12 bits of n are 0; shift n >> 12, and set lsl to #12. + w' = formatToWidth (cmmTypeFormat (cmmRegType reg)) + r' = getRegisterReg plat reg + CmmMachOp (MO_U_Quot w) [x, y] | w == W8 || w == W16 -> do + (reg_x, format_x, code_x) <- getSomeReg x + (reg_y, format_y, code_y) <- getSomeReg y + return + $ Any + (intFormat w) + ( \dst -> + code_x + `appOL` truncateReg (formatToWidth format_x) w reg_x + `appOL` code_y + `appOL` truncateReg (formatToWidth format_y) w reg_y + `snocOL` annExpr expr (DIVU (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y)) + ) + + -- 2. Shifts. x << n, x >> n. + CmmMachOp (MO_Shl w) [x, CmmLit (CmmInt n _)] + | w == W32, + 0 <= n, + n < 32 -> do + (reg_x, _format_x, code_x) <- getSomeReg x + return + $ Any + (intFormat w) + ( \dst -> + code_x + `snocOL` annExpr expr (SLL (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))) + `appOL` truncateReg w w dst + ) + CmmMachOp (MO_Shl w) [x, CmmLit (CmmInt n _)] + | w == W64, + 0 <= n, + n < 64 -> do + (reg_x, _format_x, code_x) <- getSomeReg x + return + $ Any + (intFormat w) + ( \dst -> + code_x + `snocOL` annExpr expr (SLL (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))) + `appOL` truncateReg w w dst + ) + CmmMachOp (MO_S_Shr w) [x, CmmLit (CmmInt n _)] | fitsIn12bitImm n -> do + (reg_x, format_x, code_x) <- getSomeReg x + (reg_x', code_x') <- signExtendReg (formatToWidth format_x) w reg_x + return + $ Any + (intFormat w) + ( \dst -> + code_x + `appOL` code_x' + `snocOL` annExpr expr (SRA (OpReg w dst) (OpReg w reg_x') (OpImm (ImmInteger n))) + ) + CmmMachOp (MO_S_Shr w) [x, y] -> do + (reg_x, format_x, code_x) <- getSomeReg x + (reg_y, _format_y, code_y) <- getSomeReg y + (reg_x', code_x') <- signExtendReg (formatToWidth format_x) w reg_x + return + $ Any + (intFormat w) + ( \dst -> + code_x + `appOL` code_x' + `appOL` code_y + `snocOL` annExpr expr (SRA (OpReg w dst) (OpReg w reg_x') (OpReg w reg_y)) + ) + CmmMachOp (MO_U_Shr w) [x, CmmLit (CmmInt n _)] + | w == W8, + 0 <= n, + n < 8 -> do + (reg_x, format_x, code_x) <- getSomeReg x + return + $ Any + (intFormat w) + ( \dst -> + code_x + `appOL` truncateReg (formatToWidth format_x) w reg_x + `snocOL` annExpr expr (SRL (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))) + ) + CmmMachOp (MO_U_Shr w) [x, CmmLit (CmmInt n _)] + | w == W16, + 0 <= n, + n < 16 -> do + (reg_x, format_x, code_x) <- getSomeReg x + return + $ Any + (intFormat w) + ( \dst -> + code_x + `appOL` truncateReg (formatToWidth format_x) w reg_x + `snocOL` annExpr expr (SRL (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))) + ) + CmmMachOp (MO_U_Shr w) [x, y] | w == W8 || w == W16 -> do + (reg_x, format_x, code_x) <- getSomeReg x + (reg_y, _format_y, code_y) <- getSomeReg y + return + $ Any + (intFormat w) + ( \dst -> + code_x + `appOL` code_y + `appOL` truncateReg (formatToWidth format_x) w reg_x + `snocOL` annExpr expr (SRL (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y)) + ) + CmmMachOp (MO_U_Shr w) [x, CmmLit (CmmInt n _)] + | w == W32, + 0 <= n, + n < 32 -> do + (reg_x, _format_x, code_x) <- getSomeReg x + return + $ Any + (intFormat w) + ( \dst -> + code_x + `snocOL` annExpr expr (SRL (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))) + ) + CmmMachOp (MO_U_Shr w) [x, CmmLit 
(CmmInt n _)] + | w == W64, + 0 <= n, + n < 64 -> do + (reg_x, _format_x, code_x) <- getSomeReg x + return + $ Any + (intFormat w) + ( \dst -> + code_x + `snocOL` annExpr expr (SRL (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))) + ) + + -- 3. Logic &&, || + CmmMachOp (MO_And w) [CmmReg reg, CmmLit (CmmInt n _)] + | fitsIn12bitImm n -> + return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (AND (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n)))) + where + w' = formatToWidth (cmmTypeFormat (cmmRegType reg)) + r' = getRegisterReg plat reg + CmmMachOp (MO_Or w) [CmmReg reg, CmmLit (CmmInt n _)] + | fitsIn12bitImm n -> + return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (ORI (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n)))) + where + w' = formatToWidth (cmmTypeFormat (cmmRegType reg)) + r' = getRegisterReg plat reg + + -- Generic binary case. + CmmMachOp op [x, y] -> do + let -- A "plain" operation. + bitOp w op = do + -- compute x <- x + -- compute x <- y + -- x, x, x + (reg_x, format_x, code_x) <- getSomeReg x + (reg_y, format_y, code_y) <- getSomeReg y + massertPpr (isIntFormat format_x == isIntFormat format_y) $ text "bitOp: incompatible" + return + $ Any + (intFormat w) + ( \dst -> + code_x + `appOL` code_y + `appOL` op (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y) + ) + + -- A (potentially signed) integer operation. + -- In the case of 8- and 16-bit signed arithmetic we must first + -- sign-extend both arguments to 32-bits. + -- See Note [Signed arithmetic on RISCV64]. + intOp is_signed w op = do + -- compute x <- x + -- compute x <- y + -- x, x, x + (reg_x, format_x, code_x) <- getSomeReg x + (reg_y, format_y, code_y) <- getSomeReg y + massertPpr (isIntFormat format_x && isIntFormat format_y) $ text "intOp: non-int" + -- This is the width of the registers on which the operation + -- should be performed. + let w' = opRegWidth w + signExt r + | not is_signed = return (r, nilOL) + | otherwise = signExtendReg w w' r + (reg_x_sx, code_x_sx) <- signExt reg_x + (reg_y_sx, code_y_sx) <- signExt reg_y + return $ Any (intFormat w) $ \dst -> + code_x + `appOL` code_y + `appOL` + -- sign-extend both operands + code_x_sx + `appOL` code_y_sx + `appOL` op (OpReg w' dst) (OpReg w' reg_x_sx) (OpReg w' reg_y_sx) + `appOL` truncateReg w' w dst -- truncate back to the operand's original width + floatOp w op = do + (reg_fx, format_x, code_fx) <- getFloatReg x + (reg_fy, format_y, code_fy) <- getFloatReg y + massertPpr (isFloatFormat format_x && isFloatFormat format_y) $ text "floatOp: non-float" + return + $ Any + (floatFormat w) + ( \dst -> + code_fx + `appOL` code_fy + `appOL` op (OpReg w dst) (OpReg w reg_fx) (OpReg w reg_fy) + ) + + -- need a special one for conditionals, as they return ints + floatCond w op = do + (reg_fx, format_x, code_fx) <- getFloatReg x + (reg_fy, format_y, code_fy) <- getFloatReg y + massertPpr (isFloatFormat format_x && isFloatFormat format_y) $ text "floatCond: non-float" + return + $ Any + (intFormat w) + ( \dst -> + code_fx + `appOL` code_fy + `appOL` op (OpReg w dst) (OpReg w reg_fx) (OpReg w reg_fy) + ) + + case op of + -- Integer operations + -- Add/Sub should only be Integer Options. + MO_Add w -> intOp False w (\d x y -> unitOL $ annExpr expr (ADD d x y)) + -- TODO: Handle sub-word case + MO_Sub w -> intOp False w (\d x y -> unitOL $ annExpr expr (SUB d x y)) + -- N.B. We needn't sign-extend sub-word size (in)equality comparisons + -- since we don't care about ordering. 
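+          -- (Two sub-word values are equal exactly when their zero-extended
+          -- forms are equal, so EQ/NE on the extended values is sound.)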
+ MO_Eq w -> bitOp w (\d x y -> unitOL $ annExpr expr (CSET d x y EQ)) + MO_Ne w -> bitOp w (\d x y -> unitOL $ annExpr expr (CSET d x y NE)) + -- Signed multiply/divide + MO_Mul w -> intOp True w (\d x y -> unitOL $ annExpr expr (MUL d x y)) + MO_S_MulMayOflo w -> do_mul_may_oflo w x y + MO_S_Quot w -> intOp True w (\d x y -> unitOL $ annExpr expr (DIV d x y)) + MO_S_Rem w -> intOp True w (\d x y -> unitOL $ annExpr expr (REM d x y)) + -- Unsigned multiply/divide + MO_U_Quot w -> intOp False w (\d x y -> unitOL $ annExpr expr (DIVU d x y)) + MO_U_Rem w -> intOp False w (\d x y -> unitOL $ annExpr expr (REMU d x y)) + -- Signed comparisons + MO_S_Ge w -> intOp True w (\d x y -> unitOL $ annExpr expr (CSET d x y SGE)) + MO_S_Le w -> intOp True w (\d x y -> unitOL $ annExpr expr (CSET d x y SLE)) + MO_S_Gt w -> intOp True w (\d x y -> unitOL $ annExpr expr (CSET d x y SGT)) + MO_S_Lt w -> intOp True w (\d x y -> unitOL $ annExpr expr (CSET d x y SLT)) + -- Unsigned comparisons + MO_U_Ge w -> intOp False w (\d x y -> unitOL $ annExpr expr (CSET d x y UGE)) + MO_U_Le w -> intOp False w (\d x y -> unitOL $ annExpr expr (CSET d x y ULE)) + MO_U_Gt w -> intOp False w (\d x y -> unitOL $ annExpr expr (CSET d x y UGT)) + MO_U_Lt w -> intOp False w (\d x y -> unitOL $ annExpr expr (CSET d x y ULT)) + -- Floating point arithmetic + MO_F_Add w -> floatOp w (\d x y -> unitOL $ annExpr expr (ADD d x y)) + MO_F_Sub w -> floatOp w (\d x y -> unitOL $ annExpr expr (SUB d x y)) + MO_F_Mul w -> floatOp w (\d x y -> unitOL $ annExpr expr (MUL d x y)) + MO_F_Quot w -> floatOp w (\d x y -> unitOL $ annExpr expr (DIV d x y)) + -- Floating point comparison + MO_F_Eq w -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET d x y EQ)) + MO_F_Ne w -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET d x y NE)) + MO_F_Ge w -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET d x y FGE)) + MO_F_Le w -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET d x y FLE)) -- x <= y <=> y > x + MO_F_Gt w -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET d x y FGT)) + MO_F_Lt w -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET d x y FLT)) -- x < y <=> y >= x + + -- Bitwise operations + MO_And w -> bitOp w (\d x y -> unitOL $ annExpr expr (AND d x y)) + MO_Or w -> bitOp w (\d x y -> unitOL $ annExpr expr (OR d x y)) + MO_Xor w -> bitOp w (\d x y -> unitOL $ annExpr expr (XOR d x y)) + MO_Shl w -> intOp False w (\d x y -> unitOL $ annExpr expr (SLL d x y)) + MO_U_Shr w -> intOp False w (\d x y -> unitOL $ annExpr expr (SRL d x y)) + MO_S_Shr w -> intOp True w (\d x y -> unitOL $ annExpr expr (SRA d x y)) + op -> pprPanic "getRegister' (unhandled dyadic CmmMachOp): " $ pprMachOp op <+> text "in" <+> pdoc plat expr + + -- Generic ternary case. 
+ CmmMachOp op [x, y, z] -> + case op of + -- Floating-point fused multiply-add operations + -- + -- x86 fmadd x * y + z <=> RISCV64 fmadd : d = r1 * r2 + r3 + -- x86 fmsub x * y - z <=> RISCV64 fnmsub: d = r1 * r2 - r3 + -- x86 fnmadd - x * y + z <=> RISCV64 fmsub : d = - r1 * r2 + r3 + -- x86 fnmsub - x * y - z <=> RISCV64 fnmadd: d = - r1 * r2 - r3 + MO_FMA var w -> case var of + FMAdd -> float3Op w (\d n m a -> unitOL $ FMA FMAdd d n m a) + FMSub -> float3Op w (\d n m a -> unitOL $ FMA FMSub d n m a) + FNMAdd -> float3Op w (\d n m a -> unitOL $ FMA FNMSub d n m a) + FNMSub -> float3Op w (\d n m a -> unitOL $ FMA FNMAdd d n m a) + _ -> + pprPanic "getRegister' (unhandled ternary CmmMachOp): " + $ pprMachOp op + <+> text "in" + <+> pdoc plat expr + where + float3Op w op = do + (reg_fx, format_x, code_fx) <- getFloatReg x + (reg_fy, format_y, code_fy) <- getFloatReg y + (reg_fz, format_z, code_fz) <- getFloatReg z + massertPpr (isFloatFormat format_x && isFloatFormat format_y && isFloatFormat format_z) + $ text "float3Op: non-float" + pure + $ Any (floatFormat w) + $ \dst -> + code_fx + `appOL` code_fy + `appOL` code_fz + `appOL` op (OpReg w dst) (OpReg w reg_fx) (OpReg w reg_fy) (OpReg w reg_fz) + CmmMachOp _op _xs -> + pprPanic "getRegister' (variadic CmmMachOp): " (pdoc plat expr) + where + isNbitEncodeable :: Int -> Integer -> Bool + isNbitEncodeable n i = let shift = n - 1 in (-1 `shiftL` shift) <= i && i < (1 `shiftL` shift) + -- N.B. MUL does not set the overflow flag. + -- Return 0 when the operation cannot overflow, /= 0 otherwise + do_mul_may_oflo :: Width -> CmmExpr -> CmmExpr -> NatM Register + do_mul_may_oflo w _x _y | w > W64 = pprPanic "Cannot multiply larger than 64bit" (ppr w) + do_mul_may_oflo w@W64 x y = do + (reg_x, format_x, code_x) <- getSomeReg x + (reg_y, format_y, code_y) <- getSomeReg y + -- TODO: Can't we clobber reg_x and reg_y to save registers? 
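+      -- What the instruction sequence below computes, in pseudocode (a
+      -- sketch of the code generated further down, nothing extra):
+      --   hi  <- MULH x y    -- upper 64 bits of the 128-bit product
+      --   lo  <- MUL x y     -- lower 64 bits
+      --   lo  <- SRA lo 63   -- 0 or -1: the sign-extension of lo
+      --   hi  <- XOR hi lo   -- 0 iff hi holds only the sign bits of lo
+      --   dst <- hi /= 0     -- non-zero signals possible overflow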
+ lo <- getNewRegNat II64 + hi <- getNewRegNat II64 + -- TODO: Overhaul CSET: 3rd operand isn't needed for SNEZ + let nonSense = OpImm (ImmInt 0) + pure + $ Any + (intFormat w) + ( \dst -> + code_x + `appOL` signExtend (formatToWidth format_x) W64 reg_x reg_x + `appOL` code_y + `appOL` signExtend (formatToWidth format_y) W64 reg_y reg_y + `appOL` toOL + [ annExpr expr (MULH (OpReg w hi) (OpReg w reg_x) (OpReg w reg_y)), + MUL (OpReg w lo) (OpReg w reg_x) (OpReg w reg_y), + SRA (OpReg w lo) (OpReg w lo) (OpImm (ImmInt (widthInBits W64 - 1))), + ann + (text "Set flag if result of MULH contains more than sign bits.") + (XOR (OpReg w hi) (OpReg w hi) (OpReg w lo)), + CSET (OpReg w dst) (OpReg w hi) nonSense NE + ] + ) + do_mul_may_oflo w x y = do + (reg_x, format_x, code_x) <- getSomeReg x + (reg_y, format_y, code_y) <- getSomeReg y + let width_x = formatToWidth format_x + width_y = formatToWidth format_y + if w > width_x && w > width_y + then + pure + $ Any + (intFormat w) + ( \dst -> + -- 8bit * 8bit cannot overflow 16bit + -- 16bit * 16bit cannot overflow 32bit + -- 32bit * 32bit cannot overflow 64bit + unitOL $ annExpr expr (ADD (OpReg w dst) zero (OpImm (ImmInt 0))) + ) + else do + let use32BitMul = w <= W32 && width_x <= W32 && width_y <= W32 + nonSense = OpImm (ImmInt 0) + if use32BitMul + then do + narrowedReg <- getNewRegNat II64 + pure + $ Any + (intFormat w) + ( \dst -> + code_x + `appOL` signExtend (formatToWidth format_x) W32 reg_x reg_x + `appOL` code_y + `appOL` signExtend (formatToWidth format_y) W32 reg_y reg_y + `snocOL` annExpr expr (MUL (OpReg W32 dst) (OpReg W32 reg_x) (OpReg W32 reg_y)) + `appOL` signExtendAdjustPrecission W32 w dst narrowedReg + `appOL` toOL + [ ann + (text "Check if the multiplied value fits in the narrowed register") + (SUB (OpReg w dst) (OpReg w dst) (OpReg w narrowedReg)), + CSET (OpReg w dst) (OpReg w dst) nonSense NE + ] + ) + else + pure + $ Any + (intFormat w) + ( \dst -> + -- Do not handle this unlikely case. Just tell that it may overflow. + unitOL $ annExpr expr (ADD (OpReg w dst) zero (OpImm (ImmInt 1))) + ) + +-- | Instructions to sign-extend the value in the given register from width @w@ +-- up to width @w'@. +signExtendReg :: Width -> Width -> Reg -> NatM (Reg, OrdList Instr) +signExtendReg w _w' r | w == W64 = pure (r, nilOL) +signExtendReg w w' r = do + r' <- getNewRegNat (intFormat w') + let instrs = signExtend w w' r r' + pure (r', instrs) + +-- | Sign extends to 64bit, if needed +-- +-- Source `Reg` @r@ stays untouched, while the conversion happens on destination +-- `Reg` @r'@. +signExtend :: Width -> Width -> Reg -> Reg -> OrdList Instr +signExtend w w' _r _r' | w > w' = pprPanic "This is not a sign extension, but a truncation." 
$ ppr w <> text "->" <+> ppr w'
+signExtend w w' _r _r' | w > W64 || w' > W64 = pprPanic "Unexpected width (max is 64bit):" $ ppr w <> text "->" <+> ppr w'
+signExtend w w' r r' | w == W64 && w' == W64 && r == r' = nilOL
+signExtend w w' r r' | w == W64 && w' == W64 = unitOL $ MOV (OpReg w' r') (OpReg w r)
+signExtend w w' r r'
+  | w == W32 && w' == W64 =
+      unitOL
+        $ ann
+          (text "sign-extend register (SEXT.W)" <+> ppr r <+> ppr w <> text "->" <> ppr w')
+          -- `ADDIW r r 0` is the pseudo-op SEXT.W
+          (ADD (OpReg w' r') (OpReg w r) (OpImm (ImmInt 0)))
+signExtend w w' r r' =
+  toOL
+    [ ann
+        (text "narrow register signed" <+> ppr r <> char ':' <> ppr w <> text "->" <> ppr r <> char ':' <> ppr w')
+        (SLL (OpReg w' r') (OpReg w r) (OpImm (ImmInt shift))),
+      -- signed (arithmetic) right shift
+      SRA (OpReg w' r') (OpReg w' r') (OpImm (ImmInt shift))
+    ]
+  where
+    shift = 64 - widthInBits w
+
+-- | Sign-extends to 64 bit, if needed, and reduces the precision to the target `Width` (@w'@)
+--
+-- Source `Reg` @r@ stays untouched, while the conversion happens on destination
+-- `Reg` @r'@.
+signExtendAdjustPrecission :: Width -> Width -> Reg -> Reg -> OrdList Instr
+signExtendAdjustPrecission w w' _r _r' | w > W64 || w' > W64 = pprPanic "Unexpected width (max is 64bit):" $ ppr w <> text "->" <+> ppr w'
+signExtendAdjustPrecission w w' r r' | w == W64 && w' == W64 && r == r' = nilOL
+signExtendAdjustPrecission w w' r r' | w == W64 && w' == W64 = unitOL $ MOV (OpReg w' r') (OpReg w r)
+signExtendAdjustPrecission w w' r r'
+  | w == W32 && w' == W64 =
+      unitOL
+        $ ann
+          (text "sign-extend register (SEXT.W)" <+> ppr r <+> ppr w <> text "->" <> ppr w')
+          -- `ADDIW r r 0` is the pseudo-op SEXT.W
+          (ADD (OpReg w' r') (OpReg w r) (OpImm (ImmInt 0)))
+signExtendAdjustPrecission w w' r r'
+  | w > w' =
+      toOL
+        [ ann
+            (text "narrow register signed" <+> ppr r <> char ':' <> ppr w <> text "->" <> ppr r <> char ':' <> ppr w')
+            (SLL (OpReg w' r') (OpReg w r) (OpImm (ImmInt shift))),
+          -- signed (arithmetic) right shift
+          SRA (OpReg w' r') (OpReg w' r') (OpImm (ImmInt shift))
+        ]
+  where
+    shift = 64 - widthInBits w'
+signExtendAdjustPrecission w w' r r' =
+  toOL
+    [ ann
+        (text "sign extend register" <+> ppr r <> char ':' <> ppr w <> text "->" <> ppr r <> char ':' <> ppr w')
+        (SLL (OpReg w' r') (OpReg w r) (OpImm (ImmInt shift))),
+      -- signed (arithmetic) right shift
+      SRA (OpReg w' r') (OpReg w' r') (OpImm (ImmInt shift))
+    ]
+  where
+    shift = 64 - widthInBits w
+
+-- | Instructions to truncate the value in the given register from width @w@
+-- to width @w'@.
+--
+-- In other words, it zeroes all bits above width @w'@. N.B.: This
+-- ignores signedness (no sign extension takes place)!
+truncateReg :: Width -> Width -> Reg -> OrdList Instr
+truncateReg _w w' _r | w' == W64 = nilOL
+truncateReg _w w' r | w' > W64 = pprPanic "Cannot truncate to width bigger than register size (max is 64bit):" $ text (show r) <> char ':' <+> ppr w'
+truncateReg w _w' r | w > W64 = pprPanic "Unexpected register size (max is 64bit):" $ text (show r) <> char ':' <+> ppr w
+truncateReg w w' r =
+  toOL
+    [ ann
+        (text "truncate register" <+> ppr r <+> ppr w <> text "->" <> ppr w')
+        (SLL (OpReg w' r) (OpReg w r) (OpImm (ImmInt shift))),
+      -- SRL ignores signedness!
+      SRL (OpReg w' r) (OpReg w r) (OpImm (ImmInt shift))
+    ]
+  where
+    shift = 64 - widthInBits w'
+
+-- | Given a 'Register', produce a new 'Register' with an instruction block
+-- which will check the value for alignment. Used for @-falignment-sanitisation@.
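+--
+-- A sketch of the emitted check (register names made up, not literal output):
+--
+--   andi cmp, reg, align-1  // isolate the low bits of the pointer
+--   beq cmp, zero, .Lokay   // all zero: properly aligned
+--   (load mkBadAlignmentLabel into a register and jump to it)
+-- .Lokay: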
+addAlignmentCheck :: Int -> Width -> Register -> NatM Register +addAlignmentCheck align wordWidth reg = do + jumpReg <- getNewRegNat II64 + cmpReg <- getNewRegNat II64 + okayLblId <- getBlockIdNat + + pure $ case reg of + Fixed fmt reg code -> Fixed fmt reg (code `appOL` check fmt jumpReg cmpReg okayLblId reg) + Any fmt f -> Any fmt (\reg -> f reg `appOL` check fmt jumpReg cmpReg okayLblId reg) + where + check :: Format -> Reg -> Reg -> BlockId -> Reg -> InstrBlock + check fmt jumpReg cmpReg okayLblId reg = + let width = formatToWidth fmt + in assert (not $ isFloatFormat fmt) + $ toOL + [ ann + (text "Alignment check - alignment: " <> int align <> text ", word width: " <> text (show wordWidth)) + (AND (OpReg width cmpReg) (OpReg width reg) (OpImm $ ImmInt $ align - 1)), + BCOND EQ (OpReg width cmpReg) zero (TBlock okayLblId), + COMMENT (text "Alignment check failed"), + LDR II64 (OpReg W64 jumpReg) (OpImm $ ImmCLbl mkBadAlignmentLabel), + B (TReg jumpReg), + NEWBLOCK okayLblId + ] + +-- ----------------------------------------------------------------------------- +-- The 'Amode' type: Memory addressing modes passed up the tree. +data Amode = Amode AddrMode InstrBlock + +-- | Provide the value of a `CmmExpr` with an `Amode` +-- +-- N.B. this function should be used to provide operands to load and store +-- instructions with signed 12bit wide immediates (S & I types). For other +-- immediate sizes and formats (e.g. B type uses multiples of 2) this function +-- would need to be adjusted. +getAmode :: + Platform -> + -- | width of loaded value + Width -> + CmmExpr -> + NatM Amode +-- TODO: Specialize stuff we can destructure here. + +-- LDR/STR: Immediate can be represented with 12bits +getAmode platform w (CmmRegOff reg off) + | w <= W64, + fitsIn12bitImm off = + return $ Amode (AddrRegImm reg' off') nilOL + where + reg' = getRegisterReg platform reg + off' = ImmInt off + +-- For Stores we often see something like this: +-- CmmStore (CmmMachOp (MO_Add w) [CmmLoad expr, CmmLit (CmmInt n w')]) (expr2) +-- E.g. a CmmStoreOff really. This can be translated to `str $expr2, [$expr, #n ] +-- for `n` in range. +getAmode _platform _ (CmmMachOp (MO_Add _w) [expr, CmmLit (CmmInt off _w')]) + | fitsIn12bitImm off = + do + (reg, _format, code) <- getSomeReg expr + return $ Amode (AddrRegImm reg (ImmInteger off)) code +getAmode _platform _ (CmmMachOp (MO_Sub _w) [expr, CmmLit (CmmInt off _w')]) + | fitsIn12bitImm (-off) = + do + (reg, _format, code) <- getSomeReg expr + return $ Amode (AddrRegImm reg (ImmInteger (-off))) code + +-- Generic case +getAmode _platform _ expr = + do + (reg, _format, code) <- getSomeReg expr + return $ Amode (AddrReg reg) code + +-- ----------------------------------------------------------------------------- +-- Generating assignments + +-- Assignments are really at the heart of the whole code generation +-- business. Almost all top-level nodes of any real importance are +-- assignments, which correspond to loads, stores, or register +-- transfers. If we're really lucky, some of the register transfers +-- will go away, because we can use the destination register to +-- complete the code generation for the right hand side. This only +-- fails when the right hand side is forced into a fixed register +-- (e.g. the result of a call). 
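+--
+-- For example (a sketch, not real compiler output), a Cmm store like
+--
+--   I64[Sp - 8] = x;
+--
+-- computes `x` into some register, derives an addressing mode for `Sp - 8`
+-- (here a 12-bit immediate offset, see getAmode) and emits a single STR,
+-- with no extra register-to-register moves.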
+ +assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock +assignReg_IntCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock +assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock +assignReg_FltCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock +assignMem_IntCode rep addrE srcE = + do + (src_reg, _format, code) <- getSomeReg srcE + platform <- getPlatform + let w = formatToWidth rep + Amode addr addr_code <- getAmode platform w addrE + return $ COMMENT (text "CmmStore" <+> parens (text (show addrE)) <+> parens (text (show srcE))) + `consOL` ( code + `appOL` addr_code + `snocOL` STR rep (OpReg w src_reg) (OpAddr addr) + ) + +assignReg_IntCode _ reg src = + do + platform <- getPlatform + let dst = getRegisterReg platform reg + r <- getRegister src + return $ case r of + Any _ code -> + COMMENT (text "CmmAssign" <+> parens (text (show reg)) <+> parens (text (show src))) + `consOL` code dst + Fixed format freg fcode -> + COMMENT (text "CmmAssign" <+> parens (text (show reg)) <+> parens (text (show src))) + `consOL` ( fcode + `snocOL` MOV (OpReg (formatToWidth format) dst) (OpReg (formatToWidth format) freg) + ) + +-- Let's treat Floating point stuff +-- as integer code for now. Opaque. +assignMem_FltCode = assignMem_IntCode + +assignReg_FltCode = assignReg_IntCode + +-- ----------------------------------------------------------------------------- +-- Jumps +-- AArch64 has 26 bits for jump targets, whereas RISC-V only has 20. +-- Thus we need to distinguish between far branches (outside of the +-- current compilation unit) and regular branches. +-- RISC-V has ±1MiB of displacement, whereas AArch64 has ±128MiB. +-- Thus for most branches we can get away with encoding the target +-- directly in the instruction rather than always loading the +-- address into a register and then using that to jump. +-- This works under the assumption that our linked build product is +-- less than ~2*128MiB of TEXT, and there are no jumps that span the +-- whole TEXT segment. +-- This is something where RISC-V's compressed instructions might +-- come in handy. +genJump :: CmmExpr {-the branch target-} -> NatM InstrBlock +genJump expr = do + (target, _format, code) <- getSomeReg expr + return (code `appOL` unitOL (annExpr expr (B (TReg target)))) + +-- ----------------------------------------------------------------------------- +-- Unconditional branches +genBranch :: BlockId -> NatM InstrBlock +genBranch = return . toOL . mkJumpInstr + +-- ----------------------------------------------------------------------------- +-- Conditional branches +genCondJump :: + BlockId -> + CmmExpr -> + NatM InstrBlock +genCondJump bid expr = do + case expr of + -- Optimized == 0 case. + CmmMachOp (MO_Eq w) [x, CmmLit (CmmInt 0 _)] -> do + (reg_x, _format_x, code_x) <- getSomeReg x + return $ code_x `snocOL` annExpr expr (BCOND EQ zero (OpReg w reg_x) (TBlock bid)) + + -- Optimized /= 0 case. + CmmMachOp (MO_Ne w) [x, CmmLit (CmmInt 0 _)] -> do + (reg_x, _format_x, code_x) <- getSomeReg x + return $ code_x `snocOL` annExpr expr (BCOND NE zero (OpReg w reg_x) (TBlock bid)) + + -- Generic case. + CmmMachOp mop [x, y] -> do + let ubcond w cmp = do + -- compute both sides.
+ (reg_x, format_x, code_x) <- getSomeReg x + (reg_y, format_y, code_y) <- getSomeReg y + let x' = OpReg w reg_x + y' = OpReg w reg_y + return $ case w of + w + | w == W8 || w == W16 -> + code_x + `appOL` truncateReg (formatToWidth format_x) w reg_x + `appOL` code_y + `appOL` truncateReg (formatToWidth format_y) w reg_y + `snocOL` annExpr expr (BCOND cmp x' y' (TBlock bid)) + _ -> + code_x + `appOL` code_y + `snocOL` annExpr expr (BCOND cmp x' y' (TBlock bid)) + + sbcond w cmp = do + -- compute both sides. + (reg_x, format_x, code_x) <- getSomeReg x + (reg_y, format_y, code_y) <- getSomeReg y + let x' = OpReg w reg_x + y' = OpReg w reg_y + return $ case w of + w + | w `elem` [W8, W16, W32] -> + code_x + `appOL` signExtend (formatToWidth format_x) W64 reg_x reg_x + `appOL` code_y + `appOL` signExtend (formatToWidth format_y) W64 reg_y reg_y + `appOL` unitOL (annExpr expr (BCOND cmp x' y' (TBlock bid))) + _ -> code_x `appOL` code_y `appOL` unitOL (annExpr expr (BCOND cmp x' y' (TBlock bid))) + + fbcond w cmp = do + -- ensure we get float regs + (reg_fx, _format_fx, code_fx) <- getFloatReg x + (reg_fy, _format_fy, code_fy) <- getFloatReg y + condOpReg <- OpReg W64 <$> getNewRegNat II64 + oneReg <- getNewRegNat II64 + return $ code_fx + `appOL` code_fy + `snocOL` annExpr expr (CSET condOpReg (OpReg w reg_fx) (OpReg w reg_fy) cmp) + `snocOL` MOV (OpReg W64 oneReg) (OpImm (ImmInt 1)) + `snocOL` BCOND EQ condOpReg (OpReg w oneReg) (TBlock bid) + + case mop of + MO_F_Eq w -> fbcond w EQ + MO_F_Ne w -> fbcond w NE + MO_F_Gt w -> fbcond w FGT + MO_F_Ge w -> fbcond w FGE + MO_F_Lt w -> fbcond w FLT + MO_F_Le w -> fbcond w FLE + MO_Eq w -> sbcond w EQ + MO_Ne w -> sbcond w NE + MO_S_Gt w -> sbcond w SGT + MO_S_Ge w -> sbcond w SGE + MO_S_Lt w -> sbcond w SLT + MO_S_Le w -> sbcond w SLE + MO_U_Gt w -> ubcond w UGT + MO_U_Ge w -> ubcond w UGE + MO_U_Lt w -> ubcond w ULT + MO_U_Le w -> ubcond w ULE + _ -> pprPanic "RV64.genCondJump:case mop: " (text $ show expr) + _ -> pprPanic "RV64.genCondJump: " (text $ show expr) + +-- | Generate conditional branching instructions +-- +-- This is basically an "if with else" statement. +genCondBranch :: + -- | the true branch target + BlockId -> + -- | the false branch target + BlockId -> + -- | the condition on which to branch + CmmExpr -> + -- | Instructions + NatM InstrBlock +genCondBranch true false expr = + appOL + <$> genCondJump true expr + <*> genBranch false + +-- ----------------------------------------------------------------------------- +-- Generating C calls + +-- | Generate a call to a C function. +-- +-- - Integer values are passed in GP registers a0-a7. +-- - Floating point values are passed in FP registers fa0-fa7. +-- - If there are no free floating point registers, the FP values are passed in GP registers. +-- - If all GP registers are taken, the values are spilled as whole words (!) onto the stack. +-- - For integers/words, the return value is in a0. +-- - The return value is in fa0 if the return type is a floating point value. +genCCall :: + ForeignTarget -> -- function to call + [CmmFormal] -> -- where to put the result + [CmmActual] -> -- arguments (of mixed type) + NatM InstrBlock +-- TODO: Specialize where we can. +-- Generic impl +genCCall target@(ForeignTarget expr _cconv) dest_regs arg_regs = do + -- we want to pass arg_regs into allArgRegs + -- The target :: ForeignTarget call can either + -- be a foreign procedure with an address expr + -- and a calling convention.
+ (call_target_reg, call_target_code) <- + -- Compute the address of the call target into a register. This + -- addressing enables us to jump through the whole address space + -- without further ado. PC-relative addressing would involve + -- instructions to do similar, though. + do + (reg, _format, reg_code) <- getSomeReg expr + pure (reg, reg_code) + -- compute the code and register logic for all arg_regs. + -- this will give us the format information to match on. + arg_regs' <- mapM getSomeReg arg_regs + + -- Now this is stupid. Our Cmm expressions don't carry the proper sizes, + -- so while in Cmm we might get W64 incorrectly for an int, that is W32 in + -- STG; this then breaks packing of stack arguments, if we need to pack + -- for the pcs, e.g. darwinpcs. Option one would be to fix the Int type + -- in Cmm proper. Option two, which we choose here, is to use extended Hint + -- information to contain the size information and use that when packing + -- arguments, spilled onto the stack. + let (_res_hints, arg_hints) = foreignTargetHints target + arg_regs'' = zipWith (\(r, f, c) h -> (r, f, h, c)) arg_regs' arg_hints + + (stackSpaceWords, passRegs, passArgumentsCode) <- passArguments allGpArgRegs allFpArgRegs arg_regs'' 0 [] nilOL + + readResultsCode <- readResults allGpArgRegs allFpArgRegs dest_regs [] nilOL + + let moveStackDown 0 = + toOL + [ PUSH_STACK_FRAME, + DELTA (-16) + ] + moveStackDown i | odd i = moveStackDown (i + 1) + moveStackDown i = + toOL + [ PUSH_STACK_FRAME, + SUB (OpReg W64 spMachReg) (OpReg W64 spMachReg) (OpImm (ImmInt (8 * i))), + DELTA (-8 * i - 16) + ] + moveStackUp 0 = + toOL + [ POP_STACK_FRAME, + DELTA 0 + ] + moveStackUp i | odd i = moveStackUp (i + 1) + moveStackUp i = + toOL + [ ADD (OpReg W64 spMachReg) (OpReg W64 spMachReg) (OpImm (ImmInt (8 * i))), + POP_STACK_FRAME, + DELTA 0 + ] + + let code = + call_target_code -- compute the label (possibly into a register) + `appOL` moveStackDown stackSpaceWords + `appOL` passArgumentsCode -- put the arguments into a0, ... + `snocOL` BL call_target_reg passRegs -- branch and link (C calls aren't tail calls, but return) + `appOL` readResultsCode -- parse the results into registers + `appOL` moveStackUp stackSpaceWords + return code + where + -- Implementation of the RISC-V ABI calling convention. + -- https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/948463cd5dbebea7c1869e20146b17a2cc8fda2f/riscv-cc.adoc#integer-calling-convention + passArguments :: [Reg] -> [Reg] -> [(Reg, Format, ForeignHint, InstrBlock)] -> Int -> [Reg] -> InstrBlock -> NatM (Int, [Reg], InstrBlock) + -- Base case: no more arguments to pass (left) + passArguments _ _ [] stackSpaceWords accumRegs accumCode = return (stackSpaceWords, accumRegs, accumCode) + -- Still have GP regs, and we want to pass a GP argument. + passArguments (gpReg : gpRegs) fpRegs ((r, format, hint, code_r) : args) stackSpaceWords accumRegs accumCode | isIntFormat format = do + -- RISCV64 Integer Calling Convention: "When passed in registers or on the + -- stack, integer scalars narrower than XLEN bits are widened according to + -- the sign of their type up to 32 bits, then sign-extended to XLEN bits."
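+    -- For example (sketch): a W8 argument passed with 'SignedHint' goes
+    -- through @signExtend W8 W64 r gpReg@, so a C callee reading an
+    -- @int8_t@ sees a properly sign-extended 64-bit register, roughly:
+    --
+    -- > signExtend W8 W64 r a0  -- slli a0, r, 56 ; srai a0, a0, 56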
+ let w = formatToWidth format + assignArg = + if hint == SignedHint + then + COMMENT (text "Pass gp argument sign-extended (SignedHint): " <> ppr r) + `consOL` signExtend w W64 r gpReg + else + toOL + [ COMMENT (text "Pass gp argument: " <> ppr r), + MOV (OpReg w gpReg) (OpReg w r) + ] + accumCode' = + accumCode + `appOL` code_r + `appOL` assignArg + passArguments gpRegs fpRegs args stackSpaceWords (gpReg : accumRegs) accumCode' + + -- Still have FP regs, and we want to pass an FP argument. + passArguments gpRegs (fpReg : fpRegs) ((r, format, _hint, code_r) : args) stackSpaceWords accumRegs accumCode | isFloatFormat format = do + let w = formatToWidth format + mov = MOV (OpReg w fpReg) (OpReg w r) + accumCode' = + accumCode + `appOL` code_r + `snocOL` ann (text "Pass fp argument: " <> ppr r) mov + passArguments gpRegs fpRegs args stackSpaceWords (fpReg : accumRegs) accumCode' + + -- No more regs left to pass. Must pass on the stack. + passArguments [] [] ((r, format, hint, code_r) : args) stackSpaceWords accumRegs accumCode = do + let w = formatToWidth format + spOffset = 8 * stackSpaceWords + str = STR format (OpReg w r) (OpAddr (AddrRegImm spMachReg (ImmInt spOffset))) + stackCode = + if hint == SignedHint + then + code_r + `appOL` signExtend w W64 r tmpReg + `snocOL` ann (text "Pass signed argument (size " <> ppr w <> text ") on the stack: " <> ppr tmpReg) str + else + code_r + `snocOL` ann (text "Pass unsigned argument (size " <> ppr w <> text ") on the stack: " <> ppr r) str + passArguments [] [] args (stackSpaceWords + 1) accumRegs (stackCode `appOL` accumCode) + + -- Still have fpRegs left, but want to pass a GP argument. Must be passed on the stack then. + passArguments [] fpRegs ((r, format, _hint, code_r) : args) stackSpaceWords accumRegs accumCode | isIntFormat format = do + let w = formatToWidth format + spOffset = 8 * stackSpaceWords + str = STR format (OpReg w r) (OpAddr (AddrRegImm spMachReg (ImmInt spOffset))) + stackCode = + code_r + `snocOL` ann (text "Pass argument (size " <> ppr w <> text ") on the stack: " <> ppr r) str + passArguments [] fpRegs args (stackSpaceWords + 1) accumRegs (stackCode `appOL` accumCode) + + -- Still have gpRegs left, but want to pass a FP argument. Must be passed in gpReg then.
+ passArguments (gpReg : gpRegs) [] ((r, format, _hint, code_r) : args) stackSpaceWords accumRegs accumCode | isFloatFormat format = do + let w = formatToWidth format + mov = MOV (OpReg w gpReg) (OpReg w r) + accumCode' = + accumCode + `appOL` code_r + `snocOL` ann (text "Pass fp argument in gpReg: " <> ppr r) mov + passArguments gpRegs [] args stackSpaceWords (gpReg : accumRegs) accumCode' + passArguments _ _ _ _ _ _ = pprPanic "passArguments" (text "invalid state") + + readResults :: [Reg] -> [Reg] -> [LocalReg] -> [Reg] -> InstrBlock -> NatM InstrBlock + readResults _ _ [] _ accumCode = return accumCode + readResults [] _ _ _ _ = do + platform <- getPlatform + pprPanic "genCCall, out of gp registers when reading results" (pdoc platform target) + readResults _ [] _ _ _ = do + platform <- getPlatform + pprPanic "genCCall, out of fp registers when reading results" (pdoc platform target) + readResults (gpReg : gpRegs) (fpReg : fpRegs) (dst : dsts) accumRegs accumCode = do + -- gp/fp reg -> dst + platform <- getPlatform + let rep = cmmRegType (CmmLocal dst) + format = cmmTypeFormat rep + w = cmmRegWidth (CmmLocal dst) + r_dst = getRegisterReg platform (CmmLocal dst) + if isFloatFormat format + then readResults (gpReg : gpRegs) fpRegs dsts (fpReg : accumRegs) (accumCode `snocOL` MOV (OpReg w r_dst) (OpReg w fpReg)) + else + readResults gpRegs (fpReg : fpRegs) dsts (gpReg : accumRegs) + $ accumCode + `snocOL` MOV (OpReg w r_dst) (OpReg w gpReg) + `appOL` + -- truncate, otherwise an unexpectedly big value might be used in subsequent calculations + truncateReg W64 w r_dst +genCCall (PrimTarget mop) dest_regs arg_regs = do + case mop of + MO_F32_Fabs + | [arg_reg] <- arg_regs, + [dest_reg] <- dest_regs -> + unaryFloatOp W32 (\d x -> unitOL $ FABS d x) arg_reg dest_reg + MO_F64_Fabs + | [arg_reg] <- arg_regs, + [dest_reg] <- dest_regs -> + unaryFloatOp W64 (\d x -> unitOL $ FABS d x) arg_reg dest_reg + -- 64 bit float ops + MO_F64_Pwr -> mkCCall "pow" + MO_F64_Sin -> mkCCall "sin" + MO_F64_Cos -> mkCCall "cos" + MO_F64_Tan -> mkCCall "tan" + MO_F64_Sinh -> mkCCall "sinh" + MO_F64_Cosh -> mkCCall "cosh" + MO_F64_Tanh -> mkCCall "tanh" + MO_F64_Asin -> mkCCall "asin" + MO_F64_Acos -> mkCCall "acos" + MO_F64_Atan -> mkCCall "atan" + MO_F64_Asinh -> mkCCall "asinh" + MO_F64_Acosh -> mkCCall "acosh" + MO_F64_Atanh -> mkCCall "atanh" + MO_F64_Log -> mkCCall "log" + MO_F64_Log1P -> mkCCall "log1p" + MO_F64_Exp -> mkCCall "exp" + MO_F64_ExpM1 -> mkCCall "expm1" + MO_F64_Fabs -> mkCCall "fabs" + MO_F64_Sqrt -> mkCCall "sqrt" + -- 32 bit float ops + MO_F32_Pwr -> mkCCall "powf" + MO_F32_Sin -> mkCCall "sinf" + MO_F32_Cos -> mkCCall "cosf" + MO_F32_Tan -> mkCCall "tanf" + MO_F32_Sinh -> mkCCall "sinhf" + MO_F32_Cosh -> mkCCall "coshf" + MO_F32_Tanh -> mkCCall "tanhf" + MO_F32_Asin -> mkCCall "asinf" + MO_F32_Acos -> mkCCall "acosf" + MO_F32_Atan -> mkCCall "atanf" + MO_F32_Asinh -> mkCCall "asinhf" + MO_F32_Acosh -> mkCCall "acoshf" + MO_F32_Atanh -> mkCCall "atanhf" + MO_F32_Log -> mkCCall "logf" + MO_F32_Log1P -> mkCCall "log1pf" + MO_F32_Exp -> mkCCall "expf" + MO_F32_ExpM1 -> mkCCall "expm1f" + MO_F32_Fabs -> mkCCall "fabsf" + MO_F32_Sqrt -> mkCCall "sqrtf" + -- 64-bit primops + MO_I64_ToI -> mkCCall "hs_int64ToInt" + MO_I64_FromI -> mkCCall "hs_intToInt64" + MO_W64_ToW -> mkCCall "hs_word64ToWord" + MO_W64_FromW -> mkCCall "hs_wordToWord64" + MO_x64_Neg -> mkCCall "hs_neg64" + MO_x64_Add -> mkCCall "hs_add64" + MO_x64_Sub -> mkCCall "hs_sub64" + MO_x64_Mul -> mkCCall "hs_mul64" + MO_I64_Quot -> mkCCall
"hs_quotInt64" + MO_I64_Rem -> mkCCall "hs_remInt64" + MO_W64_Quot -> mkCCall "hs_quotWord64" + MO_W64_Rem -> mkCCall "hs_remWord64" + MO_x64_And -> mkCCall "hs_and64" + MO_x64_Or -> mkCCall "hs_or64" + MO_x64_Xor -> mkCCall "hs_xor64" + MO_x64_Not -> mkCCall "hs_not64" + MO_x64_Shl -> mkCCall "hs_uncheckedShiftL64" + MO_I64_Shr -> mkCCall "hs_uncheckedIShiftRA64" + MO_W64_Shr -> mkCCall "hs_uncheckedShiftRL64" + MO_x64_Eq -> mkCCall "hs_eq64" + MO_x64_Ne -> mkCCall "hs_ne64" + MO_I64_Ge -> mkCCall "hs_geInt64" + MO_I64_Gt -> mkCCall "hs_gtInt64" + MO_I64_Le -> mkCCall "hs_leInt64" + MO_I64_Lt -> mkCCall "hs_ltInt64" + MO_W64_Ge -> mkCCall "hs_geWord64" + MO_W64_Gt -> mkCCall "hs_gtWord64" + MO_W64_Le -> mkCCall "hs_leWord64" + MO_W64_Lt -> mkCCall "hs_ltWord64" + -- Conversion + MO_UF_Conv w -> mkCCall (word2FloatLabel w) + -- Optional MachOps + -- These are enabled/disabled by backend flags: GHC.StgToCmm.Config + MO_S_Mul2 _w -> unsupported mop + MO_S_QuotRem _w -> unsupported mop + MO_U_QuotRem _w -> unsupported mop + MO_U_QuotRem2 _w -> unsupported mop + MO_Add2 _w -> unsupported mop + MO_AddWordC _w -> unsupported mop + MO_SubWordC _w -> unsupported mop + MO_AddIntC _w -> unsupported mop + MO_SubIntC _w -> unsupported mop + MO_U_Mul2 _w -> unsupported mop + -- Memory Ordering + -- The related C functions are: + -- #include + -- atomic_thread_fence(memory_order_acquire); + -- atomic_thread_fence(memory_order_release); + -- atomic_thread_fence(memory_order_seq_cst); + MO_AcquireFence -> pure (unitOL (FENCE FenceRead FenceReadWrite)) + MO_ReleaseFence -> pure (unitOL (FENCE FenceReadWrite FenceWrite)) + MO_SeqCstFence -> pure (unitOL (FENCE FenceReadWrite FenceReadWrite)) + MO_Touch -> pure nilOL -- Keep variables live (when using interior pointers) + -- Prefetch + MO_Prefetch_Data _n -> pure nilOL -- Prefetch hint. + + -- Memory copy/set/move/cmp, with alignment for optimization + MO_Memcpy _align -> mkCCall "memcpy" + MO_Memset _align -> mkCCall "memset" + MO_Memmove _align -> mkCCall "memmove" + MO_Memcmp _align -> mkCCall "memcmp" + MO_SuspendThread -> mkCCall "suspendThread" + MO_ResumeThread -> mkCCall "resumeThread" + MO_PopCnt w -> mkCCall (popCntLabel w) + MO_Pdep w -> mkCCall (pdepLabel w) + MO_Pext w -> mkCCall (pextLabel w) + MO_Clz w -> mkCCall (clzLabel w) + MO_Ctz w -> mkCCall (ctzLabel w) + MO_BSwap w -> mkCCall (bSwapLabel w) + MO_BRev w -> mkCCall (bRevLabel w) + -- Atomic read-modify-write. + mo@(MO_AtomicRead w ord) + | [p_reg] <- arg_regs, + [dst_reg] <- dest_regs -> do + (p, _fmt_p, code_p) <- getSomeReg p_reg + platform <- getPlatform + -- Analog to the related MachOps (above) + -- The related C functions are: + -- #include + -- __atomic_load_n(&a, __ATOMIC_ACQUIRE); + -- __atomic_load_n(&a, __ATOMIC_SEQ_CST); + let instrs = case ord of + MemOrderRelaxed -> unitOL $ ann moDescr (LDR (intFormat w) (OpReg w dst) (OpAddr $ AddrReg p)) + MemOrderAcquire -> + toOL + [ ann moDescr (LDR (intFormat w) (OpReg w dst) (OpAddr $ AddrReg p)), + FENCE FenceRead FenceReadWrite + ] + MemOrderSeqCst -> + toOL + [ ann moDescr (FENCE FenceReadWrite FenceReadWrite), + LDR (intFormat w) (OpReg w dst) (OpAddr $ AddrReg p), + FENCE FenceRead FenceReadWrite + ] + MemOrderRelease -> panic $ "Unexpected MemOrderRelease on an AtomicRead: " ++ show mo + dst = getRegisterReg platform (CmmLocal dst_reg) + moDescr = (text . 
show) mo + code = code_p `appOL` instrs + return code + | otherwise -> panic "mal-formed AtomicRead" + mo@(MO_AtomicWrite w ord) + | [p_reg, val_reg] <- arg_regs -> do + (p, _fmt_p, code_p) <- getSomeReg p_reg + (val, fmt_val, code_val) <- getSomeReg val_reg + -- Analog to the related MachOps (above) + -- The related C functions are: + -- #include <stdatomic.h> + -- __atomic_store_n(&a, 23, __ATOMIC_SEQ_CST); + -- __atomic_store_n(&a, 23, __ATOMIC_RELEASE); + let instrs = case ord of + MemOrderRelaxed -> unitOL $ ann moDescr (STR fmt_val (OpReg w val) (OpAddr $ AddrReg p)) + MemOrderSeqCst -> + toOL + [ ann moDescr (FENCE FenceReadWrite FenceWrite), + STR fmt_val (OpReg w val) (OpAddr $ AddrReg p), + FENCE FenceReadWrite FenceReadWrite + ] + MemOrderRelease -> + toOL + [ ann moDescr (FENCE FenceReadWrite FenceWrite), + STR fmt_val (OpReg w val) (OpAddr $ AddrReg p) + ] + MemOrderAcquire -> panic $ "Unexpected MemOrderAcquire on an AtomicWrite: " ++ show mo + moDescr = (text . show) mo + code = + code_p + `appOL` code_val + `appOL` instrs + pure code + | otherwise -> panic "mal-formed AtomicWrite" + MO_AtomicRMW w amop -> mkCCall (atomicRMWLabel w amop) + MO_Cmpxchg w -> mkCCall (cmpxchgLabel w) + -- Should be an AtomicRMW variant eventually. + -- Sequentially consistent. + -- TODO: this should be implemented properly! + MO_Xchg w -> mkCCall (xchgLabel w) + where + unsupported :: (Show a) => a -> b + unsupported mop = + panic + ( "outOfLineCmmOp: " + ++ show mop + ++ " not supported here" + ) + mkCCall :: FastString -> NatM InstrBlock + mkCCall name = do + config <- getConfig + target <- + cmmMakeDynamicReference config CallReference + $ mkForeignLabel name Nothing ForeignLabelInThisPackage IsFunction + let cconv = ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn + genCCall (ForeignTarget target cconv) dest_regs arg_regs + + unaryFloatOp w op arg_reg dest_reg = do + platform <- getPlatform + (reg_fx, _format_x, code_fx) <- getFloatReg arg_reg + let dst = getRegisterReg platform (CmmLocal dest_reg) + let code = code_fx `appOL` op (OpReg w dst) (OpReg w reg_fx) + pure code + +{- Note [RISCV64 far jumps] +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +RISCV64 conditional jump instructions can only encode an offset of +/-4KiB +(12 bits) which is usually enough but can be exceeded in edge cases. In these +cases we will replace: + + b.cond <cond> foo + +with the sequence: + + b.cond <cond> <jmp_lbl> + b <skip_lbl> + <jmp_lbl>: + la reg foo + b reg + <skip_lbl>: + +and + + b foo + +with the sequence: + + la reg foo + b reg + +Compared to AArch64 the target label is loaded into a register, because +unconditional jump instructions can only address +/-1MiB. The LA +pseudo-instruction will be replaced by up to two real instructions, ensuring +correct addressing. + +One could surely find more efficient replacements, taking PC-relative addressing +into account. This could be a future improvement. (As far branches are pretty +rare, one might question and measure the value of such an improvement.) + +RISCV has many pseudo-instructions which emit more than one real instruction. +Thus, we count the real instructions after the assembler has seen them. + +We make some simplifications in the name of performance which can result in +overestimating jump <-> label offsets: + +* To avoid having to recalculate the label offsets once we have replaced a jump, we simply + assume all label jumps will be expanded to a three instruction far jump sequence. +* For labels associated with an info table we assume the info table is 64 bytes large.
+ Most info tables are smaller than that but it means we don't have to distinguish + between multiple types of info tables. + +In terms of implementation we walk the instruction stream at least once, calculating +label offsets; if we determine during this that the function's body is big enough +to potentially contain out of range jumps, we walk the instructions a second time, replacing +out of range jumps with the sequence of instructions described above. + +-} + +-- | A conditional jump to a far target +-- +-- By loading the far target into a register for the jump, we can address the +-- whole memory range. +genCondFarJump :: (MonadUnique m) => Cond -> Operand -> Operand -> BlockId -> m InstrBlock +genCondFarJump cond op1 op2 far_target = do + skip_lbl_id <- newBlockId + jmp_lbl_id <- newBlockId + + -- TODO: We can improve this by inverting the condition + -- but it's not quite trivial since we don't know if we + -- need to consider float orderings. + -- So we take the hit of the additional jump in the false + -- case for now. + return + $ toOL + [ ann (text "Conditional far jump to: " <> ppr far_target) + $ BCOND cond op1 op2 (TBlock jmp_lbl_id), + B (TBlock skip_lbl_id), + NEWBLOCK jmp_lbl_id, + LDR II64 (OpReg W64 tmpReg) (OpImm (ImmCLbl (blockLbl far_target))), + B (TReg tmpReg), + NEWBLOCK skip_lbl_id + ] + +-- | An unconditional jump to a far target +-- +-- By loading the far target into a register for the jump, we can address the +-- whole memory range. +genFarJump :: (MonadUnique m) => BlockId -> m InstrBlock +genFarJump far_target = + return + $ toOL + [ ann (text "Unconditional far jump to: " <> ppr far_target) + $ LDR II64 (OpReg W64 tmpReg) (OpImm (ImmCLbl (blockLbl far_target))), + B (TReg tmpReg) + ] + +-- See Note [RISCV64 far jumps] +data BlockInRange = InRange | NotInRange BlockId + +-- See Note [RISCV64 far jumps] +makeFarBranches :: + Platform -> + LabelMap RawCmmStatics -> + [NatBasicBlock Instr] -> + UniqSM [NatBasicBlock Instr] +makeFarBranches {- only used when debugging -} _platform statics basic_blocks = do + -- All offsets/positions are counted in multiples of 4 bytes (the size of RISCV64 instructions) + -- That is, an offset of 1 represents a 4-byte/one instruction offset. + let (func_size, lblMap) = foldl' calc_lbl_positions (0, mapEmpty) basic_blocks + if func_size < max_jump_dist + then pure basic_blocks + else do + (_, blocks) <- mapAccumLM (replace_blk lblMap) 0 basic_blocks + pure $ concat blocks + where + -- pprTrace "lblMap" (ppr lblMap) $ basic_blocks + + -- 2^11 - 1: signed 12-bit immediate, with one bit reserved for the sign + max_jump_dist = 2 ^ (11 :: Int) - 1 :: Int + -- Currently all inline info tables fit into 64 bytes (16 units of 4 bytes). + max_info_size = 16 :: Int + long_bc_jump_size = 5 :: Int + long_b_jump_size = 2 :: Int + + -- Replace out-of-range jumps with far jump sequences. + replace_blk :: LabelMap Int -> Int -> GenBasicBlock Instr -> UniqSM (Int, [GenBasicBlock Instr]) + replace_blk !m !pos (BasicBlock lbl instrs) = do + -- Account for a potential info table before the label. + let !block_pos = pos + infoTblSize_maybe lbl + (!pos', instrs') <- mapAccumLM (replace_jump m) block_pos instrs + let instrs'' = concat instrs' + -- We might have introduced new labels, so split the instructions into basic blocks again if necessary.
+ let (top, split_blocks, no_data) = foldr mkBlocks ([], [], []) instrs'' + -- There should be no data in the instruction stream at this point + massert (null no_data) + + let final_blocks = BasicBlock lbl top : split_blocks + pure (pos', final_blocks) + + replace_jump :: LabelMap Int -> Int -> Instr -> UniqSM (Int, [Instr]) + replace_jump !m !pos instr = do + case instr of + ANN ann instr -> do + (idx, instr' : instrs') <- replace_jump m pos instr + pure (idx, ANN ann instr' : instrs') + BCOND cond op1 op2 t -> + case target_in_range m t pos of + InRange -> pure (pos + instr_size instr, [instr]) + NotInRange far_target -> do + jmp_code <- genCondFarJump cond op1 op2 far_target + pure (pos + instr_size instr, fromOL jmp_code) + B t -> + case target_in_range m t pos of + InRange -> pure (pos + instr_size instr, [instr]) + NotInRange far_target -> do + jmp_code <- genFarJump far_target + pure (pos + instr_size instr, fromOL jmp_code) + _ -> pure (pos + instr_size instr, [instr]) + + target_in_range :: LabelMap Int -> Target -> Int -> BlockInRange + target_in_range m target src = + case target of + (TReg {}) -> InRange + (TBlock bid) -> block_in_range m src bid + + block_in_range :: LabelMap Int -> Int -> BlockId -> BlockInRange + block_in_range m src_pos dest_lbl = + case mapLookup dest_lbl m of + Nothing -> + pprTrace "not in range" (ppr dest_lbl) + $ NotInRange dest_lbl + Just dest_pos -> + if abs (dest_pos - src_pos) < max_jump_dist + then InRange + else NotInRange dest_lbl + + calc_lbl_positions :: (Int, LabelMap Int) -> GenBasicBlock Instr -> (Int, LabelMap Int) + calc_lbl_positions (pos, m) (BasicBlock lbl instrs) = + let !pos' = pos + infoTblSize_maybe lbl + in foldl' instr_pos (pos', mapInsert lbl pos' m) instrs + + instr_pos :: (Int, LabelMap Int) -> Instr -> (Int, LabelMap Int) + instr_pos (pos, m) instr = (pos + instr_size instr, m) + + infoTblSize_maybe bid = + case mapLookup bid statics of + Nothing -> 0 :: Int + Just _info_static -> max_info_size + + instr_size :: Instr -> Int + instr_size i = case i of + COMMENT {} -> 0 + MULTILINE_COMMENT {} -> 0 + ANN _ instr -> instr_size instr + LOCATION {} -> 0 + DELTA {} -> 0 + -- At this point there should be no NEWBLOCK in the instruction stream + NEWBLOCK {} -> panic "makeFarBranches - Unexpected" + LDATA {} -> panic "makeFarBranches - Unexpected" + PUSH_STACK_FRAME -> 4 + POP_STACK_FRAME -> 4 + ADD {} -> 1 + MUL {} -> 1 + MULH {} -> 1 + NEG {} -> 1 + DIV {} -> 1 + REM {} -> 1 + REMU {} -> 1 + SUB {} -> 1 + DIVU {} -> 1 + AND {} -> 1 + OR {} -> 1 + SRA {} -> 1 + XOR {} -> 1 + SLL {} -> 1 + SRL {} -> 1 + MOV {} -> 2 + ORI {} -> 1 + XORI {} -> 1 + CSET {} -> 2 + STR {} -> 1 + LDR {} -> 3 + LDRU {} -> 1 + FENCE {} -> 1 + FCVT {} -> 1 + FABS {} -> 1 + FMA {} -> 1 + -- estimate the substituted size for jumps to labels + -- jumps to registers have size 1 + BCOND {} -> long_bc_jump_size + B (TBlock _) -> long_b_jump_size + B (TReg _) -> 1 + BL _ _ -> 1 + J_TBL {} -> 1 Index: ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/Cond.hs =================================================================== --- /dev/null +++ ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/Cond.hs @@ -0,0 +1,42 @@ +module GHC.CmmToAsm.RV64.Cond + ( Cond (..), + ) +where + +import GHC.Prelude hiding (EQ) + +-- | Condition codes. +-- +-- Used in conditional branches and bit setters. According to the available +-- instruction set, some conditions are encoded as their negated opposites. I.e. +-- these are logical things that don't necessarily map 1:1 to hardware/ISA.
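+--
+-- For example (a sketch of the usual RISC-V encoding; the concrete mapping
+-- lives in the pretty-printer): @SGT x y@ can be emitted as @blt y, x@, and
+-- @FGT@ by swapping the operands of @flt@, since the ISA only provides the
+-- "less than" direction natively.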
+data Cond + = -- | int and float + EQ + | -- | int and float + NE + | -- | signed less than + SLT + | -- | signed less than or equal + SLE + | -- | signed greater than or equal + SGE + | -- | signed greater than + SGT + | -- | unsigned less than + ULT + | -- | unsigned less than or equal + ULE + | -- | unsigned greater than or equal + UGE + | -- | unsigned greater than + UGT + | -- | floating point instruction @flt@ + FLT + | -- | floating point instruction @fle@ + FLE + | -- | floating point instruction @fge@ + FGE + | -- | floating point instruction @fgt@ + FGT + deriving (Eq, Show) Index: ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/Instr.hs =================================================================== --- /dev/null +++ ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/Instr.hs @@ -0,0 +1,823 @@ +-- All instructions will be rendered eventually. Thus, there's no benefit in +-- being lazy in data types. +{-# LANGUAGE StrictData #-} +{-# OPTIONS_GHC -fno-warn-orphans #-} + +module GHC.CmmToAsm.RV64.Instr where + +import Data.Maybe +import GHC.Cmm +import GHC.Cmm.BlockId +import GHC.Cmm.CLabel +import GHC.Cmm.Dataflow.Label +import GHC.CmmToAsm.Config +import GHC.CmmToAsm.Format +import GHC.CmmToAsm.Instr (RegUsage (..)) +import GHC.CmmToAsm.RV64.Cond +import GHC.CmmToAsm.RV64.Regs +import GHC.CmmToAsm.Types +import GHC.CmmToAsm.Utils +import GHC.Data.FastString (LexicalFastString) +import GHC.Platform +import GHC.Platform.Reg +import GHC.Platform.Regs +import GHC.Prelude +import GHC.Stack +import GHC.Types.Unique.Supply +import GHC.Utils.Outputable +import GHC.Utils.Panic + +-- | Stack frame header size in bytes. +-- +-- The stack frame header is made of the values that are always saved +-- (regardless of the context). It consists of the saved return address and a +-- pointer to the previous frame. Thus, its size is two stack frame slots which +-- equals two addresses/words (2 * 8 bytes). +stackFrameHeaderSize :: Int +stackFrameHeaderSize = 2 * spillSlotSize + +-- | All registers are 8 byte wide. +spillSlotSize :: Int +spillSlotSize = 8 + +-- | The number of bytes that the stack pointer should be aligned to. +stackAlign :: Int +stackAlign = 16 + +-- | The number of spill slots available without allocating more. +maxSpillSlots :: NCGConfig -> Int +maxSpillSlots config = + ( (ncgSpillPreallocSize config - stackFrameHeaderSize) + `div` spillSlotSize + ) + - 1 + +-- | Convert a spill slot number to a *byte* offset. +spillSlotToOffset :: Int -> Int +spillSlotToOffset slot = + stackFrameHeaderSize + spillSlotSize * slot + +instance Outputable RegUsage where + ppr (RU reads writes) = text "RegUsage(reads:" <+> ppr reads <> comma <+> text "writes:" <+> ppr writes <> char ')' + +-- | Get the registers that are being used by this instruction. +-- regUsage doesn't need to do any trickery for jumps and such. +-- Just state precisely the regs read and written by that insn. +-- The consequences of control flow transfers, as far as register +-- allocation goes, are taken care of by the register allocator.
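+--
+-- For example (sketch): for an addition both sources are read and the
+-- destination is written, i.e.
+--
+-- > regUsageOfInstr p (ADD dst s1 s2) = usage (regOp s1 ++ regOp s2, regOp dst)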
+-- +-- RegUsage = RU [] [] +regUsageOfInstr :: Platform -> Instr -> RegUsage +regUsageOfInstr platform instr = case instr of + ANN _ i -> regUsageOfInstr platform i + COMMENT {} -> usage ([], []) + MULTILINE_COMMENT {} -> usage ([], []) + PUSH_STACK_FRAME -> usage ([], []) + POP_STACK_FRAME -> usage ([], []) + LOCATION {} -> usage ([], []) + DELTA {} -> usage ([], []) + ADD dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + MUL dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + NEG dst src -> usage (regOp src, regOp dst) + MULH dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + DIV dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + REM dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + REMU dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + SUB dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + DIVU dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + AND dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + OR dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + SRA dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + XOR dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + SLL dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + SRL dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + MOV dst src -> usage (regOp src, regOp dst) + -- ORI's third operand is always an immediate + ORI dst src1 _ -> usage (regOp src1, regOp dst) + XORI dst src1 _ -> usage (regOp src1, regOp dst) + J_TBL _ _ t -> usage ([t], []) + B t -> usage (regTarget t, []) + BCOND _ l r t -> usage (regTarget t ++ regOp l ++ regOp r, []) + BL t ps -> usage (t : ps, callerSavedRegisters) + CSET dst l r _ -> usage (regOp l ++ regOp r, regOp dst) + STR _ src dst -> usage (regOp src ++ regOp dst, []) + LDR _ dst src -> usage (regOp src, regOp dst) + LDRU _ dst src -> usage (regOp src, regOp dst) + FENCE _ _ -> usage ([], []) + FCVT _variant dst src -> usage (regOp src, regOp dst) + FABS dst src -> usage (regOp src, regOp dst) + FMA _ dst src1 src2 src3 -> + usage (regOp src1 ++ regOp src2 ++ regOp src3, regOp dst) + _ -> panic $ "regUsageOfInstr: " ++ instrCon instr + where + -- filtering the usage is necessary, otherwise the register + -- allocator will try to allocate pre-defined fixed stg + -- registers as well, as they show up. + usage :: ([Reg], [Reg]) -> RegUsage + usage (srcRegs, dstRegs) = + RU + (filter (interesting platform) srcRegs) + (filter (interesting platform) dstRegs) + + regAddr :: AddrMode -> [Reg] + regAddr (AddrRegImm r1 _imm) = [r1] + regAddr (AddrReg r1) = [r1] + + regOp :: Operand -> [Reg] + regOp (OpReg _w r1) = [r1] + regOp (OpAddr a) = regAddr a + regOp (OpImm _imm) = [] + + regTarget :: Target -> [Reg] + regTarget (TBlock _bid) = [] + regTarget (TReg r1) = [r1] + + -- Is this register interesting for the register allocator? + interesting :: Platform -> Reg -> Bool + interesting _ (RegVirtual _) = True + interesting platform (RegReal (RealRegSingle i)) = freeReg platform i + +-- | Caller-saved registers (according to calling convention) +-- +-- These registers may be clobbered after a jump. +callerSavedRegisters :: [Reg] +callerSavedRegisters = + [regSingle raRegNo] + ++ map regSingle [t0RegNo .. t2RegNo] + ++ map regSingle [a0RegNo .. a7RegNo] + ++ map regSingle [t3RegNo .. t6RegNo] + ++ map regSingle [ft0RegNo .. ft7RegNo] + ++ map regSingle [fa0RegNo .. fa7RegNo] + +-- | Apply a given mapping to all the register references in this instruction. 
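+--
+-- This is how the register allocator rewrites virtual registers into real
+-- ones. A sketch:
+--
+-- > patchRegsOfInstr (MOV (OpReg W64 v1) (OpReg W64 v2)) env
+-- >   == MOV (OpReg W64 (env v1)) (OpReg W64 (env v2))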
+patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr +patchRegsOfInstr instr env = case instr of + ANN d i -> ANN d (patchRegsOfInstr i env) + COMMENT {} -> instr + MULTILINE_COMMENT {} -> instr + PUSH_STACK_FRAME -> instr + POP_STACK_FRAME -> instr + LOCATION {} -> instr + DELTA {} -> instr + ADD o1 o2 o3 -> ADD (patchOp o1) (patchOp o2) (patchOp o3) + MUL o1 o2 o3 -> MUL (patchOp o1) (patchOp o2) (patchOp o3) + NEG o1 o2 -> NEG (patchOp o1) (patchOp o2) + MULH o1 o2 o3 -> MULH (patchOp o1) (patchOp o2) (patchOp o3) + DIV o1 o2 o3 -> DIV (patchOp o1) (patchOp o2) (patchOp o3) + REM o1 o2 o3 -> REM (patchOp o1) (patchOp o2) (patchOp o3) + REMU o1 o2 o3 -> REMU (patchOp o1) (patchOp o2) (patchOp o3) + SUB o1 o2 o3 -> SUB (patchOp o1) (patchOp o2) (patchOp o3) + DIVU o1 o2 o3 -> DIVU (patchOp o1) (patchOp o2) (patchOp o3) + AND o1 o2 o3 -> AND (patchOp o1) (patchOp o2) (patchOp o3) + OR o1 o2 o3 -> OR (patchOp o1) (patchOp o2) (patchOp o3) + SRA o1 o2 o3 -> SRA (patchOp o1) (patchOp o2) (patchOp o3) + XOR o1 o2 o3 -> XOR (patchOp o1) (patchOp o2) (patchOp o3) + SLL o1 o2 o3 -> SLL (patchOp o1) (patchOp o2) (patchOp o3) + SRL o1 o2 o3 -> SRL (patchOp o1) (patchOp o2) (patchOp o3) + MOV o1 o2 -> MOV (patchOp o1) (patchOp o2) + -- o3 cannot be a register for ORI (always an immediate) + ORI o1 o2 o3 -> ORI (patchOp o1) (patchOp o2) (patchOp o3) + XORI o1 o2 o3 -> XORI (patchOp o1) (patchOp o2) (patchOp o3) + J_TBL ids mbLbl t -> J_TBL ids mbLbl (env t) + B t -> B (patchTarget t) + BL t ps -> BL (patchReg t) ps + BCOND c o1 o2 t -> BCOND c (patchOp o1) (patchOp o2) (patchTarget t) + CSET o l r c -> CSET (patchOp o) (patchOp l) (patchOp r) c + STR f o1 o2 -> STR f (patchOp o1) (patchOp o2) + LDR f o1 o2 -> LDR f (patchOp o1) (patchOp o2) + LDRU f o1 o2 -> LDRU f (patchOp o1) (patchOp o2) + FENCE o1 o2 -> FENCE o1 o2 + FCVT variant o1 o2 -> FCVT variant (patchOp o1) (patchOp o2) + FABS o1 o2 -> FABS (patchOp o1) (patchOp o2) + FMA s o1 o2 o3 o4 -> + FMA s (patchOp o1) (patchOp o2) (patchOp o3) (patchOp o4) + _ -> panic $ "patchRegsOfInstr: " ++ instrCon instr + where + patchOp :: Operand -> Operand + patchOp (OpReg w r) = OpReg w (env r) + patchOp (OpAddr a) = OpAddr (patchAddr a) + patchOp opImm = opImm + + patchTarget :: Target -> Target + patchTarget (TReg r) = TReg (env r) + patchTarget tBlock = tBlock + + patchAddr :: AddrMode -> AddrMode + patchAddr (AddrRegImm r1 imm) = AddrRegImm (env r1) imm + patchAddr (AddrReg r) = AddrReg (env r) + + patchReg :: Reg -> Reg + patchReg = env + +-- | Checks whether this instruction is a jump/branch instruction. +-- +-- One that can change the flow of control in a way that the +-- register allocator needs to worry about. +isJumpishInstr :: Instr -> Bool +isJumpishInstr instr = case instr of + ANN _ i -> isJumpishInstr i + J_TBL {} -> True + B {} -> True + BL {} -> True + BCOND {} -> True + _ -> False + +-- | Get the `BlockId`s of the jump destinations (if any) +jumpDestsOfInstr :: Instr -> [BlockId] +jumpDestsOfInstr (ANN _ i) = jumpDestsOfInstr i +jumpDestsOfInstr (J_TBL ids _mbLbl _r) = catMaybes ids +jumpDestsOfInstr (B t) = [id | TBlock id <- [t]] +jumpDestsOfInstr (BCOND _ _ _ t) = [id | TBlock id <- [t]] +jumpDestsOfInstr _ = [] + +-- | Change the destination of this (potential) jump instruction. +-- +-- Used in the linear allocator when adding fixup blocks for join +-- points. 
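+--
+-- A sketch: @patchJumpInstr (B (TBlock old)) f == B (TBlock (f old))@.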
+patchJumpInstr :: Instr -> (BlockId -> BlockId) -> Instr +patchJumpInstr instr patchF = + case instr of + ANN d i -> ANN d (patchJumpInstr i patchF) + J_TBL ids mbLbl r -> J_TBL (map (fmap patchF) ids) mbLbl r + B (TBlock bid) -> B (TBlock (patchF bid)) + BCOND c o1 o2 (TBlock bid) -> BCOND c o1 o2 (TBlock (patchF bid)) + _ -> panic $ "patchJumpInstr: " ++ instrCon instr + +-- ----------------------------------------------------------------------------- +-- Note [RISCV64 Spills and Reloads] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- We reserve @RESERVED_C_STACK_BYTES@ on the C stack for spilling and reloading +-- registers. The load and store instructions of RISCV64 address with a signed +-- 12-bit immediate + a register; the machine stack pointer (sp/x2) in this case. +-- +-- The @RESERVED_C_STACK_BYTES@ is 16k, so we can't always address into it in a +-- single load/store instruction. There are offsets from sp (not to be confused +-- with STG's SP!) which need a register to be computed. +-- +-- Using sp to compute the offset would violate assumptions about the stack pointer +-- pointing to the top of the stack during signal handling. As we can't force +-- every signal to use its own stack, we have to ensure that the stack pointer +-- always points to the top of the stack, and we can't use it for computation. +-- +-- So, we reserve one register (TMP) for this purpose (and other, unrelated +-- intermediate operations). See Note [The made-up RISCV64 TMP (IP) register] + +-- | Generate instructions to spill a register into a spill slot. +mkSpillInstr :: + (HasCallStack) => + NCGConfig -> + -- | register to spill + Reg -> + -- | current stack delta + Int -> + -- | spill slot to use + Int -> + [Instr] +mkSpillInstr _config reg delta slot = + case off - delta of + imm | fitsIn12bitImm imm -> [mkStrSpImm imm] + imm -> + [ movImmToTmp imm, + addSpToTmp, + mkStrTmp + ] + where + fmt = case reg of + RegReal (RealRegSingle n) | n < d0RegNo -> II64 + _ -> FF64 + mkStrSpImm imm = + ANN (text "Spill@" <> int (off - delta)) + $ STR fmt (OpReg W64 reg) (OpAddr (AddrRegImm spMachReg (ImmInt imm))) + movImmToTmp imm = + ANN (text "Spill: TMP <- " <> int imm) + $ MOV tmp (OpImm (ImmInt imm)) + addSpToTmp = + ANN (text "Spill: TMP <- SP + TMP ") + $ ADD tmp tmp sp + mkStrTmp = + ANN (text "Spill@" <> int (off - delta)) + $ STR fmt (OpReg W64 reg) (OpAddr (AddrReg tmpReg)) + + off = spillSlotToOffset slot + +-- | Generate instructions to load a register from a spill slot.
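+--
+-- Mirrors 'mkSpillInstr' above: if the slot offset fits into the signed
+-- 12-bit immediate, a single load off @sp@ suffices; otherwise the offset is
+-- materialised into TMP first. A sketch of the two shapes (pseudo-assembly):
+--
+-- > ldr reg, [sp, #imm]                                 -- fitsIn12bitImm imm
+-- > mov tmp, #imm ; add tmp, tmp, sp ; ldr reg, [tmp]   -- otherwise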
+mkLoadInstr :: + NCGConfig -> + -- | register to load + Reg -> + -- | current stack delta + Int -> + -- | spill slot to use + Int -> + [Instr] +mkLoadInstr _config reg delta slot = + case off - delta of + imm | fitsIn12bitImm imm -> [mkLdrSpImm imm] + imm -> + [ movImmToTmp imm, + addSpToTmp, + mkLdrTmp + ] + where + fmt = case reg of + RegReal (RealRegSingle n) | n < d0RegNo -> II64 + _ -> FF64 + mkLdrSpImm imm = + ANN (text "Reload@" <> int (off - delta)) + $ LDR fmt (OpReg W64 reg) (OpAddr (AddrRegImm spMachReg (ImmInt imm))) + movImmToTmp imm = + ANN (text "Reload: TMP <- " <> int imm) + $ MOV tmp (OpImm (ImmInt imm)) + addSpToTmp = + ANN (text "Reload: TMP <- SP + TMP ") + $ ADD tmp tmp sp + mkLdrTmp = + ANN (text "Reload@" <> int (off - delta)) + $ LDR fmt (OpReg W64 reg) (OpAddr (AddrReg tmpReg)) + + off = spillSlotToOffset slot + +-- | See if this instruction is telling us the current C stack delta +takeDeltaInstr :: Instr -> Maybe Int +takeDeltaInstr (ANN _ i) = takeDeltaInstr i +takeDeltaInstr (DELTA i) = Just i +takeDeltaInstr _ = Nothing + +-- | Not real instructions. Just meta data +isMetaInstr :: Instr -> Bool +isMetaInstr instr = + case instr of + ANN _ i -> isMetaInstr i + COMMENT {} -> True + MULTILINE_COMMENT {} -> True + LOCATION {} -> True + LDATA {} -> True + NEWBLOCK {} -> True + DELTA {} -> True + PUSH_STACK_FRAME -> True + POP_STACK_FRAME -> True + _ -> False + +-- | Copy the value in a register to another one. +-- +-- Must work for all register classes. +mkRegRegMoveInstr :: Reg -> Reg -> Instr +mkRegRegMoveInstr src dst = ANN desc instr + where + desc = text "Reg->Reg Move: " <> ppr src <> text " -> " <> ppr dst + instr = MOV (operandFromReg dst) (operandFromReg src) + +-- | Take the source and destination from this (potential) reg -> reg move instruction +-- +-- We have to be a bit careful here: A `MOV` can also mean an implicit +-- conversion. This case is filtered out. +takeRegRegMoveInstr :: Instr -> Maybe (Reg, Reg) +takeRegRegMoveInstr (MOV (OpReg width dst) (OpReg width' src)) + | width == width' && (isFloatReg dst == isFloatReg src) = pure (src, dst) +takeRegRegMoveInstr _ = Nothing + +-- | Make an unconditional jump instruction. +mkJumpInstr :: BlockId -> [Instr] +mkJumpInstr = pure . B . TBlock + +-- | Decrement @sp@ to allocate stack space. +-- +-- The stack grows downwards, so we decrement the stack pointer by @n@ (bytes). +-- This is dual to `mkStackDeallocInstr`. @sp@ is the RISCV stack pointer, not +-- to be confused with the STG stack pointer. +mkStackAllocInstr :: Platform -> Int -> [Instr] +mkStackAllocInstr _platform = moveSp . negate + +-- | Increment SP to deallocate stack space. +-- +-- The stack grows downwards, so we increment the stack pointer by @n@ (bytes). +-- This is dual to `mkStackAllocInstr`. @sp@ is the RISCV stack pointer, not to +-- be confused with the STG stack pointer. +mkStackDeallocInstr :: Platform -> Int -> [Instr] +mkStackDeallocInstr _platform = moveSp + +moveSp :: Int -> [Instr] +moveSp n + | n == 0 = [] + | n /= 0 && fitsIn12bitImm n = pure . ANN desc $ ADD sp sp (OpImm (ImmInt n)) + | otherwise = + -- This ends up in three effective instructions. We could get away with + -- two for intMax12bit < n < 3 * intMax12bit by recursing once. However, + -- this way is likely less surprising. 
+ [ ANN desc (MOV tmp (OpImm (ImmInt n))), + ADD sp sp tmp + ] + where + desc = text "Move SP:" <+> int n + +-- +-- See Note [extra spill slots] in X86/Instr.hs +-- +allocMoreStack :: + Platform -> + Int -> + NatCmmDecl statics GHC.CmmToAsm.RV64.Instr.Instr -> + UniqSM (NatCmmDecl statics GHC.CmmToAsm.RV64.Instr.Instr, [(BlockId, BlockId)]) +allocMoreStack _ _ top@(CmmData _ _) = return (top, []) +allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do + let entries = entryBlocks proc + + uniqs <- getUniquesM + + let delta = ((x + stackAlign - 1) `quot` stackAlign) * stackAlign -- round up + where + x = slots * spillSlotSize -- sp delta + alloc = mkStackAllocInstr platform delta + dealloc = mkStackDeallocInstr platform delta + + retargetList = zip entries (map mkBlockId uniqs) + + new_blockmap :: LabelMap BlockId + new_blockmap = mapFromList retargetList + + insert_stack_insn (BasicBlock id insns) + | Just new_blockid <- mapLookup id new_blockmap = + [ BasicBlock id $ alloc ++ [B (TBlock new_blockid)], + BasicBlock new_blockid block' + ] + | otherwise = + [BasicBlock id block'] + where + block' = foldr insert_dealloc [] insns + + insert_dealloc insn r = case insn of + J_TBL {} -> dealloc ++ (insn : r) + ANN _ e -> insert_dealloc e r + _other + | jumpDestsOfInstr insn /= [] -> + patchJumpInstr insn retarget : r + _other -> insn : r + where + retarget b = fromMaybe b (mapLookup b new_blockmap) + + new_code = concatMap insert_stack_insn code + return (CmmProc info lbl live (ListGraph new_code), retargetList) + +data Instr + = -- | Comment pseudo-op + COMMENT SDoc + | -- | Multi-line comment pseudo-op + MULTILINE_COMMENT SDoc + | -- | Annotated instruction. Should print <instr> # <doc> + ANN SDoc Instr + | -- | Location pseudo-op @.loc@ (file, line, col, name) + LOCATION Int Int Int LexicalFastString + | -- | Static data spat out during code generation. + LDATA Section RawCmmStatics + | -- | Start a new basic block. + -- + -- Useful during codegen, removed later. Preceding instruction should be a + -- jump, as per the invariants for a BasicBlock (see Cmm). + NEWBLOCK BlockId + | -- | Specify current stack offset for benefit of subsequent passes + DELTA Int + | -- | Push a minimal stack frame consisting of the return address (RA) and the frame pointer (FP). + PUSH_STACK_FRAME + | -- | Pop the minimal stack frame of prior `PUSH_STACK_FRAME`.
+ POP_STACK_FRAME + | -- | Arithmetic addition (both integer and floating point) + -- + -- @rd = rs1 + rs2@ + ADD Operand Operand Operand + | -- | Arithmetic subtraction (both integer and floating point) + -- + -- @rd = rs1 - rs2@ + SUB Operand Operand Operand + | -- | Logical AND (integer only) + -- + -- @rd = rs1 & rs2@ + AND Operand Operand Operand + | -- | Logical OR (integer only) + -- + -- @rd = rs1 | rs2@ + OR Operand Operand Operand + | -- | Logical left shift (zero-extended, integer only) + -- + -- @rd = rs1 << rs2@ + SLL Operand Operand Operand + | -- | Logical right shift (zero-extended, integer only) + -- + -- @rd = rs1 >> rs2@ + SRL Operand Operand Operand + | -- | Arithmetic right shift (sign-extended, integer only) + -- + -- @rd = rs1 >> rs2@ + SRA Operand Operand Operand + | -- | Store to memory (both, integer and floating point) + STR Format Operand Operand + | -- | Load from memory (sign-extended, integer and floating point) + LDR Format Operand Operand + | -- | Load from memory (unsigned, integer and floating point) + LDRU Format Operand Operand + | -- | Arithmetic multiplication (both, integer and floating point) + -- + -- @rd = rn × rm@ + MUL Operand Operand Operand + | -- | Negation (both, integer and floating point) + -- + -- @rd = -op2@ + NEG Operand Operand + | -- | Division (both, integer and floating point) + -- + -- @rd = rn ÷ rm@ + DIV Operand Operand Operand + | -- | Remainder (integer only, signed) + -- + -- @rd = rn % rm@ + REM Operand Operand Operand + | -- | Remainder (integer only, unsigned) + -- + -- @rd = |rn % rm|@ + REMU Operand Operand Operand + | -- | High part of a multiplication that doesn't fit into 64bits (integer only) + -- + -- E.g. for a multiplication with 64bits width: @rd = (rs1 * rs2) >> 64@. + MULH Operand Operand Operand + | -- | Unsigned division (integer only) + -- + -- @rd = |rn ÷ rm|@ + DIVU Operand Operand Operand + | -- | XOR (integer only) + -- + -- @rd = rn ⊕ op2@ + XOR Operand Operand Operand + | -- | OR with immediate (integer only) + -- + -- @rd = rn | op2@ + ORI Operand Operand Operand + | -- | XOR with immediate (integer only) + -- + -- @rd = rn ⊕ op2@ + XORI Operand Operand Operand + | -- | Move to register (integer and floating point) + -- + -- @rd = rn@ or @rd = #imm@ + MOV Operand Operand + | -- | Pseudo-op for conditional setting of a register. + -- + -- @if(o2 cond o3) op <- 1 else op <- 0@ + CSET Operand Operand Operand Cond + | -- | A jump instruction with data for switch/jump tables + J_TBL [Maybe BlockId] (Maybe CLabel) Reg + | -- | Unconditional jump (no linking) + B Target + | -- | Unconditional jump, links return address (sets @ra@/@x1@) + BL Reg [Reg] + | -- | Branch with condition (integer only) + BCOND Cond Operand Operand Target + | -- | Fence instruction + -- + -- Memory barrier.
+ FENCE FenceType FenceType + | -- | Floating point conversion + FCVT FcvtVariant Operand Operand + | -- | Floating point ABSolute value + FABS Operand Operand + | -- | Floating-point fused multiply-add instructions + -- + -- - fmadd : d = r1 * r2 + r3 + -- - fnmsub: d = r1 * r2 - r3 + -- - fmsub : d = - r1 * r2 + r3 + -- - fnmadd: d = - r1 * r2 - r3 + FMA FMASign Operand Operand Operand Operand + +-- | Operand of a FENCE instruction (@r@, @w@ or @rw@) +data FenceType = FenceRead | FenceWrite | FenceReadWrite + +-- | Variant of a floating point conversion instruction +data FcvtVariant = FloatToFloat | IntToFloat | FloatToInt + +instrCon :: Instr -> String +instrCon i = + case i of + COMMENT {} -> "COMMENT" + MULTILINE_COMMENT {} -> "COMMENT" + ANN {} -> "ANN" + LOCATION {} -> "LOCATION" + LDATA {} -> "LDATA" + NEWBLOCK {} -> "NEWBLOCK" + DELTA {} -> "DELTA" + PUSH_STACK_FRAME {} -> "PUSH_STACK_FRAME" + POP_STACK_FRAME {} -> "POP_STACK_FRAME" + ADD {} -> "ADD" + OR {} -> "OR" + MUL {} -> "MUL" + NEG {} -> "NEG" + DIV {} -> "DIV" + REM {} -> "REM" + REMU {} -> "REMU" + MULH {} -> "MULH" + SUB {} -> "SUB" + DIVU {} -> "DIVU" + AND {} -> "AND" + SRA {} -> "SRA" + XOR {} -> "XOR" + SLL {} -> "SLL" + SRL {} -> "SRL" + MOV {} -> "MOV" + ORI {} -> "ORI" + XORI {} -> "XORI" + STR {} -> "STR" + LDR {} -> "LDR" + LDRU {} -> "LDRU" + CSET {} -> "CSET" + J_TBL {} -> "J_TBL" + B {} -> "B" + BL {} -> "BL" + BCOND {} -> "BCOND" + FENCE {} -> "FENCE" + FCVT {} -> "FCVT" + FABS {} -> "FABS" + FMA variant _ _ _ _ -> + case variant of + FMAdd -> "FMADD" + FMSub -> "FMSUB" + FNMAdd -> "FNMADD" + FNMSub -> "FNMSUB" + +data Target + = TBlock BlockId + | TReg Reg + +data Operand + = -- | register + OpReg Width Reg + | -- | immediate value + OpImm Imm + | -- | memory reference + OpAddr AddrMode + deriving (Eq, Show) + +operandFromReg :: Reg -> Operand +operandFromReg = OpReg W64 + +operandFromRegNo :: RegNo -> Operand +operandFromRegNo = operandFromReg .
regSingle + +zero, ra, sp, gp, tp, fp, tmp :: Operand +zero = operandFromReg zeroReg +ra = operandFromReg raReg +sp = operandFromReg spMachReg +gp = operandFromRegNo 3 +tp = operandFromRegNo 4 +fp = operandFromRegNo 8 +tmp = operandFromReg tmpReg + +x0, x1, x2, x3, x4, x5, x6, x7 :: Operand +x8, x9, x10, x11, x12, x13, x14, x15 :: Operand +x16, x17, x18, x19, x20, x21, x22, x23 :: Operand +x24, x25, x26, x27, x28, x29, x30, x31 :: Operand +x0 = operandFromRegNo x0RegNo +x1 = operandFromRegNo 1 +x2 = operandFromRegNo 2 +x3 = operandFromRegNo 3 +x4 = operandFromRegNo 4 +x5 = operandFromRegNo x5RegNo +x6 = operandFromRegNo 6 +x7 = operandFromRegNo x7RegNo + +x8 = operandFromRegNo 8 + +x9 = operandFromRegNo 9 + +x10 = operandFromRegNo x10RegNo + +x11 = operandFromRegNo 11 + +x12 = operandFromRegNo 12 + +x13 = operandFromRegNo 13 + +x14 = operandFromRegNo 14 + +x15 = operandFromRegNo 15 + +x16 = operandFromRegNo 16 + +x17 = operandFromRegNo x17RegNo + +x18 = operandFromRegNo 18 + +x19 = operandFromRegNo 19 + +x20 = operandFromRegNo 20 + +x21 = operandFromRegNo 21 + +x22 = operandFromRegNo 22 + +x23 = operandFromRegNo 23 + +x24 = operandFromRegNo 24 + +x25 = operandFromRegNo 25 + +x26 = operandFromRegNo 26 + +x27 = operandFromRegNo 27 + +x28 = operandFromRegNo x28RegNo + +x29 = operandFromRegNo 29 + +x30 = operandFromRegNo 30 + +x31 = operandFromRegNo x31RegNo + +d0, d1, d2, d3, d4, d5, d6, d7 :: Operand +d8, d9, d10, d11, d12, d13, d14, d15 :: Operand +d16, d17, d18, d19, d20, d21, d22, d23 :: Operand +d24, d25, d26, d27, d28, d29, d30, d31 :: Operand +d0 = operandFromRegNo d0RegNo +d1 = operandFromRegNo 33 +d2 = operandFromRegNo 34 +d3 = operandFromRegNo 35 +d4 = operandFromRegNo 36 +d5 = operandFromRegNo 37 +d6 = operandFromRegNo 38 +d7 = operandFromRegNo d7RegNo + +d8 = operandFromRegNo 40 + +d9 = operandFromRegNo 41 + +d10 = operandFromRegNo d10RegNo + +d11 = operandFromRegNo 43 + +d12 = operandFromRegNo 44 + +d13 = operandFromRegNo 45 + +d14 = operandFromRegNo 46 + +d15 = operandFromRegNo 47 + +d16 = operandFromRegNo 48 + +d17 = operandFromRegNo d17RegNo + +d18 = operandFromRegNo 50 + +d19 = operandFromRegNo 51 + +d20 = operandFromRegNo 52 + +d21 = operandFromRegNo 53 + +d22 = operandFromRegNo 54 + +d23 = operandFromRegNo 55 + +d24 = operandFromRegNo 56 + +d25 = operandFromRegNo 57 + +d26 = operandFromRegNo 58 + +d27 = operandFromRegNo 59 + +d28 = operandFromRegNo 60 + +d29 = operandFromRegNo 61 + +d30 = operandFromRegNo 62 + +d31 = operandFromRegNo d31RegNo + +fitsIn12bitImm :: (Num a, Ord a) => a -> Bool +fitsIn12bitImm off = off >= intMin12bit && off <= intMax12bit + +intMin12bit :: (Num a) => a +intMin12bit = -2048 + +intMax12bit :: (Num a) => a +intMax12bit = 2047 + +fitsIn32bits :: (Num a, Ord a, Bits a) => a -> Bool +fitsIn32bits i = (-1 `shiftL` 31) <= i && i <= (1 `shiftL` 31 - 1) + +isNbitEncodeable :: Int -> Integer -> Bool +isNbitEncodeable n i = let shift = n - 1 in (-1 `shiftL` shift) <= i && i < (1 `shiftL` shift) + +isEncodeableInWidth :: Width -> Integer -> Bool +isEncodeableInWidth = isNbitEncodeable . widthInBits + +isIntOp :: Operand -> Bool +isIntOp = not . 
isFloatOp + +isFloatOp :: Operand -> Bool +isFloatOp (OpReg _ reg) | isFloatReg reg = True +isFloatOp _ = False + +isFloatReg :: Reg -> Bool +isFloatReg (RegReal (RealRegSingle i)) | i > 31 = True +isFloatReg (RegVirtual (VirtualRegF _)) = True +isFloatReg (RegVirtual (VirtualRegD _)) = True +isFloatReg _ = False Index: ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/Ppr.hs =================================================================== --- /dev/null +++ ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/Ppr.hs @@ -0,0 +1,715 @@ +{-# LANGUAGE ScopedTypeVariables #-} + +module GHC.CmmToAsm.RV64.Ppr (pprNatCmmDecl, pprInstr) where + +import GHC.Cmm hiding (topInfoTable) +import GHC.Cmm.BlockId +import GHC.Cmm.CLabel +import GHC.Cmm.Dataflow.Label +import GHC.CmmToAsm.Config +import GHC.CmmToAsm.Format +import GHC.CmmToAsm.Ppr +import GHC.CmmToAsm.RV64.Cond +import GHC.CmmToAsm.RV64.Instr +import GHC.CmmToAsm.RV64.Regs +import GHC.CmmToAsm.Types +import GHC.CmmToAsm.Utils +import GHC.Platform +import GHC.Platform.Reg +import GHC.Prelude hiding (EQ) +import GHC.Types.Basic (Alignment, alignmentBytes, mkAlignment) +import GHC.Types.Unique (getUnique, pprUniqueAlways) +import GHC.Utils.Outputable +import GHC.Utils.Panic + +pprNatCmmDecl :: forall doc. (IsDoc doc) => NCGConfig -> NatCmmDecl RawCmmStatics Instr -> doc +pprNatCmmDecl config (CmmData section dats) = + pprSectionAlign config section $$ pprDatas config dats +pprNatCmmDecl config proc@(CmmProc top_info lbl _ (ListGraph blocks)) = + let platform = ncgPlatform config + + pprProcAlignment :: doc + pprProcAlignment = maybe empty (pprAlign . mkAlignment) (ncgProcAlignment config) + in pprProcAlignment + $$ case topInfoTable proc of + Nothing -> + -- special case for code without info table: + pprSectionAlign config (Section Text lbl) + $$ + -- do not + -- pprProcAlignment config $$ + pprLabel platform lbl + $$ vcat (map (pprBasicBlock config top_info) blocks) -- blocks guaranteed not null, so label needed + $$ ppWhen + (ncgDwarfEnabled config) + (line (pprBlockEndLabel platform lbl) $$ line (pprProcEndLabel platform lbl)) + $$ pprSizeDecl platform lbl + Just (CmmStaticsRaw info_lbl _) -> + pprSectionAlign config (Section Text info_lbl) + $$ + -- pprProcAlignment config $$ + ( if platformHasSubsectionsViaSymbols platform + then line (pprAsmLabel platform (mkDeadStripPreventer info_lbl) <> char ':') + else empty + ) + $$ vcat (map (pprBasicBlock config top_info) blocks) + $$ ppWhen (ncgDwarfEnabled config) (line (pprProcEndLabel platform info_lbl)) + $$ + -- above: Even the first block gets a label, because with branch-chain + -- elimination, it might be the target of a goto. 
+ ( if platformHasSubsectionsViaSymbols platform + then -- See Note [Subsections Via Symbols] + + line + $ text "\t.long " + <+> pprAsmLabel platform info_lbl + <+> char '-' + <+> pprAsmLabel platform (mkDeadStripPreventer info_lbl) + else empty + ) + $$ pprSizeDecl platform info_lbl +{-# SPECIALIZE pprNatCmmDecl :: NCGConfig -> NatCmmDecl RawCmmStatics Instr -> SDoc #-} +{-# SPECIALIZE pprNatCmmDecl :: NCGConfig -> NatCmmDecl RawCmmStatics Instr -> HDoc #-} -- see Note [SPECIALIZE to HDoc] in GHC.Utils.Outputable + +pprLabel :: (IsDoc doc) => Platform -> CLabel -> doc +pprLabel platform lbl = + pprGloblDecl platform lbl + $$ pprTypeDecl platform lbl + $$ line (pprAsmLabel platform lbl <> char ':') + +pprAlign :: (IsDoc doc) => Alignment -> doc +pprAlign alignment = + -- "The .align directive for RISC-V is an alias to .p2align, which aligns to a + -- power of two, so .align 2 means align to 4 bytes. Because the definition of + -- the .align directive varies by architecture, it is recommended to use the + -- unambiguous .p2align or .balign directives instead." + -- (https://github.com/riscv-non-isa/riscv-asm-manual/blob/main/riscv-asm.md#-align) + line $ text "\t.balign " <> int (alignmentBytes alignment) + +-- | Print appropriate alignment for the given section type. +-- +-- Currently, this always aligns to a full machine word (8 bytes.) A future +-- improvement could be to really do this per section type (though, it's +-- probably not a big gain.) +pprAlignForSection :: (IsDoc doc) => SectionType -> doc +pprAlignForSection _seg = pprAlign . mkAlignment $ 8 + +-- | Print section header and appropriate alignment for that section. +-- +-- This will e.g. emit a header like: +-- +-- .section .text +-- .balign 8 +pprSectionAlign :: (IsDoc doc) => NCGConfig -> Section -> doc +pprSectionAlign _config (Section (OtherSection _) _) = + panic "RV64.Ppr.pprSectionAlign: unknown section" +pprSectionAlign config sec@(Section seg _) = + line (pprSectionHeader config sec) + $$ pprAlignForSection seg + +pprProcEndLabel :: + (IsLine doc) => + Platform -> + -- | Procedure name + CLabel -> + doc +pprProcEndLabel platform lbl = + pprAsmLabel platform (mkAsmTempProcEndLabel lbl) <> colon + +pprBlockEndLabel :: + (IsLine doc) => + Platform -> + -- | Block name + CLabel -> + doc +pprBlockEndLabel platform lbl = + pprAsmLabel platform (mkAsmTempEndLabel lbl) <> colon + +-- | Output the ELF .size directive (if needed.) +pprSizeDecl :: (IsDoc doc) => Platform -> CLabel -> doc +pprSizeDecl platform lbl + | osElfTarget (platformOS platform) = + line $ text "\t.size" <+> asmLbl <> text ", .-" <> asmLbl + where + asmLbl = pprAsmLabel platform lbl +pprSizeDecl _ _ = empty + +pprBasicBlock :: + (IsDoc doc) => + NCGConfig -> + LabelMap RawCmmStatics -> + NatBasicBlock Instr -> + doc +pprBasicBlock config info_env (BasicBlock blockid instrs) = + maybe_infotable + $ pprLabel platform asmLbl + $$ vcat (map (pprInstr platform) (id {-detectTrivialDeadlock-} optInstrs)) + $$ ppWhen + (ncgDwarfEnabled config) + ( -- Emit both end labels since this may end up being a standalone + -- top-level block + line + ( pprBlockEndLabel platform asmLbl + <> pprProcEndLabel platform asmLbl + ) + ) + where + -- TODO: Check if we can filter more instructions here. + -- TODO: Shouldn't this be a more general check on a higher level? + -- Filter out identity moves. E.g. mov x18, x18 will be dropped. 
+ optInstrs = filter f instrs + where + f (MOV o1 o2) | o1 == o2 = False + f _ = True + + asmLbl = blockLbl blockid + platform = ncgPlatform config + maybe_infotable c = case mapLookup blockid info_env of + Nothing -> c + Just (CmmStaticsRaw info_lbl info) -> + -- pprAlignForSection platform Text $$ + infoTableLoc + $$ vcat (map (pprData config) info) + $$ pprLabel platform info_lbl + $$ c + $$ ppWhen + (ncgDwarfEnabled config) + (line (pprBlockEndLabel platform info_lbl)) + -- Make sure the info table has the right .loc for the block + -- coming right after it. See Note [Info Offset] + infoTableLoc = case instrs of + (l@LOCATION {} : _) -> pprInstr platform l + _other -> empty + +pprDatas :: (IsDoc doc) => NCGConfig -> RawCmmStatics -> doc +-- See Note [emit-time elimination of static indirections] in "GHC.Cmm.CLabel". +pprDatas config (CmmStaticsRaw alias [CmmStaticLit (CmmLabel lbl), CmmStaticLit ind, _, _]) + | lbl == mkIndStaticInfoLabel, + let labelInd (CmmLabelOff l _) = Just l + labelInd (CmmLabel l) = Just l + labelInd _ = Nothing, + Just ind' <- labelInd ind, + alias `mayRedirectTo` ind' = + pprGloblDecl (ncgPlatform config) alias + $$ line (text ".equiv" <+> pprAsmLabel (ncgPlatform config) alias <> comma <> pprAsmLabel (ncgPlatform config) ind') +pprDatas config (CmmStaticsRaw lbl dats) = + vcat (pprLabel platform lbl : map (pprData config) dats) + where + platform = ncgPlatform config + +pprData :: (IsDoc doc) => NCGConfig -> CmmStatic -> doc +pprData _config (CmmString str) = line (pprString str) +pprData _config (CmmFileEmbed path _) = line (pprFileEmbed path) +-- TODO: AFAIK there is no Darwin for RISCV, so we may consider simplifying this. +pprData config (CmmUninitialised bytes) = + line + $ let platform = ncgPlatform config + in if platformOS platform == OSDarwin + then text ".space " <> int bytes + else text ".skip " <> int bytes +pprData config (CmmStaticLit lit) = pprDataItem config lit + +pprGloblDecl :: (IsDoc doc) => Platform -> CLabel -> doc +pprGloblDecl platform lbl + | not (externallyVisibleCLabel lbl) = empty + | otherwise = line (text "\t.globl " <> pprAsmLabel platform lbl) + +-- Note [Always use objects for info tables] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- See discussion in X86.Ppr for why this is necessary. Essentially we need to +-- ensure that we never pass function symbols when we might want to look up the +-- info table. If we did, we could end up with procedure linkage tables +-- (PLTs), and thus the lookup wouldn't point to the function, but into the +-- jump table. +-- +-- Fun fact: The LLVMMangler exists to patch this issue on the LLVM side as +-- well. 
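+
+-- Editorial sketch (not part of the original patch): the 32-bit constant
+-- case of MOV in `pprInstr` below leans on the assembler's %hi/%lo
+-- operators, and the ELF stub generator in rts/linker/elf_plt_riscv64.c
+-- (later in this patch) performs the same split by hand. A minimal,
+-- self-contained version of that split; `hiLoImm` is a hypothetical name.
+-- The +0x800 rounding compensates for the sign-extension of the low
+-- 12 bits.
+hiLoImm :: Int -> (Int, Int)
+hiLoImm n = (hi, lo)
+  where
+    hi = (n + 0x800) `div` 4096 -- `div` rounds like an arithmetic shift right
+    lo = n - hi * 4096 -- always in [-2048 .. 2047]
+
+-- Invariant: hi * 4096 + lo == n; e.g. hiLoImm 4097 == (1, 1).
+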
+pprLabelType' :: (IsLine doc) => Platform -> CLabel -> doc +pprLabelType' platform lbl = + if isCFunctionLabel lbl || functionOkInfoTable + then text "@function" + else text "@object" + where + functionOkInfoTable = + platformTablesNextToCode platform + && isInfoTableLabel lbl + && not (isCmmInfoTableLabel lbl) + && not (isConInfoTableLabel lbl) + +-- this is called pprTypeAndSizeDecl in PPC.Ppr +pprTypeDecl :: (IsDoc doc) => Platform -> CLabel -> doc +pprTypeDecl platform lbl = + if osElfTarget (platformOS platform) && externallyVisibleCLabel lbl + then line (text ".type " <> pprAsmLabel platform lbl <> text ", " <> pprLabelType' platform lbl) + else empty + +pprDataItem :: (IsDoc doc) => NCGConfig -> CmmLit -> doc +pprDataItem config lit = + lines_ (ppr_item (cmmTypeFormat $ cmmLitType platform lit) lit) + where + platform = ncgPlatform config + + imm = litToImm lit + + ppr_item II8 _ = [text "\t.byte\t" <> pprDataImm platform imm] + ppr_item II16 _ = [text "\t.short\t" <> pprDataImm platform imm] + ppr_item II32 _ = [text "\t.long\t" <> pprDataImm platform imm] + ppr_item II64 _ = [text "\t.quad\t" <> pprDataImm platform imm] + ppr_item FF32 (CmmFloat r _) = + let bs = floatToBytes (fromRational r) + in map (\b -> text "\t.byte\t" <> int (fromIntegral b)) bs + ppr_item FF64 (CmmFloat r _) = + let bs = doubleToBytes (fromRational r) + in map (\b -> text "\t.byte\t" <> int (fromIntegral b)) bs + ppr_item _ _ = pprPanic "pprDataItem:ppr_item" (text $ show lit) + +-- | Pretty print an immediate value in the @data@ section +-- +-- This does not include any checks. We rely on the Assembler to check for +-- errors. Use `pprOpImm` for immediates in instructions (operands.) +pprDataImm :: (IsLine doc) => Platform -> Imm -> doc +pprDataImm _ (ImmInt i) = int i +pprDataImm _ (ImmInteger i) = integer i +pprDataImm p (ImmCLbl l) = pprAsmLabel p l +pprDataImm p (ImmIndex l i) = pprAsmLabel p l <> char '+' <> int i +pprDataImm _ (ImmLit s) = ftext s +pprDataImm _ (ImmFloat f) = float (fromRational f) +pprDataImm _ (ImmDouble d) = double (fromRational d) +pprDataImm p (ImmConstantSum a b) = pprDataImm p a <> char '+' <> pprDataImm p b +pprDataImm p (ImmConstantDiff a b) = + pprDataImm p a + <> char '-' + <> lparen + <> pprDataImm p b + <> rparen + +-- | Comment @c@ with @# c@ +asmComment :: SDoc -> SDoc +asmComment c = text "#" <+> c + +-- | Comment @c@ with @// c@ +asmDoubleslashComment :: SDoc -> SDoc +asmDoubleslashComment c = text "//" <+> c + +-- | Comment @c@ with @/* c */@ (multiline comment) +asmMultilineComment :: SDoc -> SDoc +asmMultilineComment c = text "/*" $+$ c $+$ text "*/" + +-- | Pretty print an immediate operand of an instruction +-- +-- The kinds of immediates we can use here are pretty limited: RISCV doesn't +-- support index expressions (as e.g. Aarch64 does.) Floating points need to +-- fit in range. As we don't need them, forbid them to save us from future +-- troubles. +pprOpImm :: (IsLine doc) => Platform -> Imm -> doc +pprOpImm platform im = case im of + ImmInt i -> int i + ImmInteger i -> integer i + ImmCLbl l -> char '=' <> pprAsmLabel platform l + _ -> pprPanic "RV64.Ppr.pprOpImm" (text "Unsupported immediate for instruction operands" <> colon <+> (text . show) im) + +-- | Negate integer immediate operand +-- +-- This function is partial and will panic if the operand is not an integer. 
+negOp :: Operand -> Operand +negOp (OpImm (ImmInt i)) = OpImm (ImmInt (negate i)) +negOp (OpImm (ImmInteger i)) = OpImm (ImmInteger (negate i)) +negOp op = pprPanic "RV64.negOp" (text $ show op) + +-- | Pretty print an operand +pprOp :: (IsLine doc) => Platform -> Operand -> doc +pprOp plat op = case op of + OpReg w r -> pprReg w r + OpImm im -> pprOpImm plat im + OpAddr (AddrRegImm r1 im) -> pprOpImm plat im <> char '(' <> pprReg W64 r1 <> char ')' + OpAddr (AddrReg r1) -> text "0(" <+> pprReg W64 r1 <+> char ')' + +-- | Pretty print register with calling convention name +-- +-- This representation makes it easier to reason about the emitted assembly +-- code. +pprReg :: forall doc. (IsLine doc) => Width -> Reg -> doc +pprReg w r = case r of + RegReal (RealRegSingle i) -> ppr_reg_no i + -- virtual regs should not show up, but this is helpful for debugging. + RegVirtual (VirtualRegI u) -> text "%vI_" <> pprUniqueAlways u + RegVirtual (VirtualRegF u) -> text "%vF_" <> pprUniqueAlways u + RegVirtual (VirtualRegD u) -> text "%vD_" <> pprUniqueAlways u + _ -> pprPanic "RiscV64.pprReg" (text (show r) <+> ppr w) + where + ppr_reg_no :: Int -> doc + -- General Purpose Registers + ppr_reg_no 0 = text "zero" + ppr_reg_no 1 = text "ra" + ppr_reg_no 2 = text "sp" + ppr_reg_no 3 = text "gp" + ppr_reg_no 4 = text "tp" + ppr_reg_no 5 = text "t0" + ppr_reg_no 6 = text "t1" + ppr_reg_no 7 = text "t2" + ppr_reg_no 8 = text "s0" + ppr_reg_no 9 = text "s1" + ppr_reg_no 10 = text "a0" + ppr_reg_no 11 = text "a1" + ppr_reg_no 12 = text "a2" + ppr_reg_no 13 = text "a3" + ppr_reg_no 14 = text "a4" + ppr_reg_no 15 = text "a5" + ppr_reg_no 16 = text "a6" + ppr_reg_no 17 = text "a7" + ppr_reg_no 18 = text "s2" + ppr_reg_no 19 = text "s3" + ppr_reg_no 20 = text "s4" + ppr_reg_no 21 = text "s5" + ppr_reg_no 22 = text "s6" + ppr_reg_no 23 = text "s7" + ppr_reg_no 24 = text "s8" + ppr_reg_no 25 = text "s9" + ppr_reg_no 26 = text "s10" + ppr_reg_no 27 = text "s11" + ppr_reg_no 28 = text "t3" + ppr_reg_no 29 = text "t4" + ppr_reg_no 30 = text "t5" + ppr_reg_no 31 = text "t6" + -- Floating Point Registers + ppr_reg_no 32 = text "ft0" + ppr_reg_no 33 = text "ft1" + ppr_reg_no 34 = text "ft2" + ppr_reg_no 35 = text "ft3" + ppr_reg_no 36 = text "ft4" + ppr_reg_no 37 = text "ft5" + ppr_reg_no 38 = text "ft6" + ppr_reg_no 39 = text "ft7" + ppr_reg_no 40 = text "fs0" + ppr_reg_no 41 = text "fs1" + ppr_reg_no 42 = text "fa0" + ppr_reg_no 43 = text "fa1" + ppr_reg_no 44 = text "fa2" + ppr_reg_no 45 = text "fa3" + ppr_reg_no 46 = text "fa4" + ppr_reg_no 47 = text "fa5" + ppr_reg_no 48 = text "fa6" + ppr_reg_no 49 = text "fa7" + ppr_reg_no 50 = text "fs2" + ppr_reg_no 51 = text "fs3" + ppr_reg_no 52 = text "fs4" + ppr_reg_no 53 = text "fs5" + ppr_reg_no 54 = text "fs6" + ppr_reg_no 55 = text "fs7" + ppr_reg_no 56 = text "fs8" + ppr_reg_no 57 = text "fs9" + ppr_reg_no 58 = text "fs10" + ppr_reg_no 59 = text "fs11" + ppr_reg_no 60 = text "ft8" + ppr_reg_no 61 = text "ft9" + ppr_reg_no 62 = text "ft10" + ppr_reg_no 63 = text "ft11" + ppr_reg_no i + | i < 0 = pprPanic "Unexpected register number (min is 0)" (ppr w <+> int i) + | i > 63 = pprPanic "Unexpected register number (max is 63)" (ppr w <+> int i) + -- no support for widths > W64. 
+ | otherwise = pprPanic "Unsupported width in register (max is 64)" (ppr w <+> int i) + +-- | Single precision `Operand` (floating-point) +isSingleOp :: Operand -> Bool +isSingleOp (OpReg W32 _) = True +isSingleOp _ = False + +-- | Double precision `Operand` (floating-point) +isDoubleOp :: Operand -> Bool +isDoubleOp (OpReg W64 _) = True +isDoubleOp _ = False + +-- | `Operand` is an immediate value +isImmOp :: Operand -> Bool +isImmOp (OpImm _) = True +isImmOp _ = False + +-- | `Operand` is an immediate @0@ value +isImmZero :: Operand -> Bool +isImmZero (OpImm (ImmFloat 0)) = True +isImmZero (OpImm (ImmDouble 0)) = True +isImmZero (OpImm (ImmInt 0)) = True +isImmZero _ = False + +-- | `Target` represents a label +isLabel :: Target -> Bool +isLabel (TBlock _) = True +isLabel _ = False + +-- | Get the pretty-printed label from a `Target` +-- +-- This function is partial and will panic if the `Target` is not a label. +getLabel :: (IsLine doc) => Platform -> Target -> doc +getLabel platform (TBlock bid) = pprBlockId platform bid + where + pprBlockId :: (IsLine doc) => Platform -> BlockId -> doc + pprBlockId platform bid = pprAsmLabel platform (mkLocalBlockLabel (getUnique bid)) +getLabel _platform _other = panic "Cannot turn this into a label" + +-- | Pretty-print an `Instr` +-- +-- This function is partial and will panic if the `Instr` is not supported. This +-- can happen due to invalid operands or unexpected meta instructions. +pprInstr :: (IsDoc doc) => Platform -> Instr -> doc +pprInstr platform instr = case instr of + -- see Note [dualLine and dualDoc] in GHC.Utils.Outputable + COMMENT s -> dualDoc (asmComment s) empty + MULTILINE_COMMENT s -> dualDoc (asmMultilineComment s) empty + ANN d i -> dualDoc (pprInstr platform i <+> asmDoubleslashComment d) (pprInstr platform i) + LOCATION file line' col _name -> + line (text "\t.loc" <+> int file <+> int line' <+> int col) + DELTA d -> dualDoc (asmComment $ text "\tdelta = " <> int d) empty + NEWBLOCK _ -> panic "PprInstr: NEWBLOCK" + LDATA _ _ -> panic "pprInstr: LDATA" + PUSH_STACK_FRAME -> + lines_ + [ text "\taddi sp, sp, -16", + text "\tsd x1, 8(sp)", -- store RA + text "\tsd x8, 0(sp)", -- store FP/s0 + text "\taddi x8, sp, 16" + ] + POP_STACK_FRAME -> + lines_ + [ text "\tld x8, 0(sp)", -- restore FP/s0 + text "\tld x1, 8(sp)", -- restore RA + text "\taddi sp, sp, 16" + ] + ADD o1 o2 o3 + | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> op3 (text "\tfadd." <> if isSingleOp o1 then text "s" else text "d") o1 o2 o3 + -- This case is used for sign extension: SEXT.W op + | OpReg W64 _ <- o1, OpReg W32 _ <- o2, isImmOp o3 -> op3 (text "\taddiw") o1 o2 o3 + | otherwise -> op3 (text "\tadd") o1 o2 o3 + MUL o1 o2 o3 + | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> op3 (text "\tfmul." <> if isSingleOp o1 then text "s" else text "d") o1 o2 o3 + | otherwise -> op3 (text "\tmul") o1 o2 o3 + MULH o1 o2 o3 -> op3 (text "\tmulh") o1 o2 o3 + NEG o1 o2 | isFloatOp o1 && isFloatOp o2 && isSingleOp o2 -> op2 (text "\tfneg.s") o1 o2 + NEG o1 o2 | isFloatOp o1 && isFloatOp o2 && isDoubleOp o2 -> op2 (text "\tfneg.d") o1 o2 + NEG o1 o2 -> op2 (text "\tneg") o1 o2 + DIV o1 o2 o3 + | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> + -- TODO: This must (likely) be refined regarding width + op3 (text "\tfdiv." 
<> if isSingleOp o1 then text "s" else text "d") o1 o2 o3 + DIV o1 o2 o3 -> op3 (text "\tdiv") o1 o2 o3 + REM o1 o2 o3 + | isFloatOp o1 || isFloatOp o2 || isFloatOp o3 -> + panic "pprInstr - REM not implemented for floats (yet)" + REM o1 o2 o3 -> op3 (text "\trem") o1 o2 o3 + REMU o1 o2 o3 -> op3 (text "\tremu") o1 o2 o3 + SUB o1 o2 o3 + | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> op3 (text "\tfsub." <> if isSingleOp o1 then text "s" else text "d") o1 o2 o3 + | isImmOp o3 -> op3 (text "\taddi") o1 o2 (negOp o3) + | otherwise -> op3 (text "\tsub") o1 o2 o3 + DIVU o1 o2 o3 -> op3 (text "\tdivu") o1 o2 o3 + AND o1 o2 o3 + | isImmOp o3 -> op3 (text "\tandi") o1 o2 o3 + | otherwise -> op3 (text "\tand") o1 o2 o3 + OR o1 o2 o3 -> op3 (text "\tor") o1 o2 o3 + SRA o1 o2 o3 | isImmOp o3 -> op3 (text "\tsrai") o1 o2 o3 + SRA o1 o2 o3 -> op3 (text "\tsra") o1 o2 o3 + XOR o1 o2 o3 -> op3 (text "\txor") o1 o2 o3 + SLL o1 o2 o3 -> op3 (text "\tsll") o1 o2 o3 + SRL o1 o2 o3 -> op3 (text "\tsrl") o1 o2 o3 + MOV o1 o2 + | isFloatOp o1 && isFloatOp o2 && isDoubleOp o2 -> op2 (text "\tfmv.d") o1 o2 -- fmv.d rd, rs is pseudo op fsgnj.d rd, rs, rs + | isFloatOp o1 && isFloatOp o2 && isSingleOp o2 -> op2 (text "\tfmv.s") o1 o2 -- fmv.s rd, rs is pseudo op fsgnj.s rd, rs, rs + | isFloatOp o1 && isImmZero o2 && isDoubleOp o1 -> op2 (text "\tfcvt.d.w") o1 zero + | isFloatOp o1 && isImmZero o2 && isSingleOp o1 -> op2 (text "\tfcvt.s.w") o1 zero + | isFloatOp o1 && not (isFloatOp o2) && isSingleOp o1 -> op2 (text "\tfmv.w.x") o1 o2 + | isFloatOp o1 && not (isFloatOp o2) && isDoubleOp o1 -> op2 (text "\tfmv.d.x") o1 o2 + | not (isFloatOp o1) && isFloatOp o2 && isSingleOp o2 -> op2 (text "\tfmv.x.w") o1 o2 + | not (isFloatOp o1) && isFloatOp o2 && isDoubleOp o2 -> op2 (text "\tfmv.x.d") o1 o2 + | (OpImm (ImmInteger i)) <- o2, + fitsIn12bitImm i -> + lines_ [text "\taddi" <+> pprOp platform o1 <> comma <+> pprOp platform x0 <> comma <+> pprOp platform o2] + | (OpImm (ImmInt i)) <- o2, + fitsIn12bitImm i -> + lines_ [text "\taddi" <+> pprOp platform o1 <> comma <+> pprOp platform x0 <> comma <+> pprOp platform o2] + | (OpImm (ImmInteger i)) <- o2, + fitsIn32bits i -> + lines_ + [ text "\tlui" <+> pprOp platform o1 <> comma <+> text "%hi(" <> pprOp platform o2 <> text ")", + text "\taddw" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> text "%lo(" <> pprOp platform o2 <> text ")" + ] + | (OpImm (ImmInt i)) <- o2, + fitsIn32bits i -> + lines_ + [ text "\tlui" <+> pprOp platform o1 <> comma <+> text "%hi(" <> pprOp platform o2 <> text ")", + text "\taddw" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> text "%lo(" <> pprOp platform o2 <> text ")" + ] + | isImmOp o2 -> + -- Surrender! Let the assembler figure out the right expressions with pseudo-op LI. 
+ lines_ [text "\tli" <+> pprOp platform o1 <> comma <+> pprOp platform o2] + | otherwise -> op3 (text "\taddi") o1 o2 (OpImm (ImmInt 0)) + ORI o1 o2 o3 -> op3 (text "\tori") o1 o2 o3 + XORI o1 o2 o3 -> op3 (text "\txori") o1 o2 o3 + J_TBL _ _ r -> pprInstr platform (B (TReg r)) + B l | isLabel l -> line $ text "\tjal" <+> pprOp platform x0 <> comma <+> getLabel platform l + B (TReg r) -> line $ text "\tjalr" <+> pprOp platform x0 <> comma <+> pprReg W64 r <> comma <+> text "0" + BL r _ -> line $ text "\tjalr" <+> text "x1" <> comma <+> pprReg W64 r <> comma <+> text "0" + BCOND c l r t + | isLabel t -> + line $ text "\t" <> pprBcond c <+> pprOp platform l <> comma <+> pprOp platform r <> comma <+> getLabel platform t + BCOND _ _ _ (TReg _) -> panic "RV64.ppr: No conditional branching to registers!" + CSET o l r c -> case c of + EQ + | isIntOp l && isIntOp r -> + lines_ + [ subFor l r, + text "\tseqz" <+> pprOp platform o <> comma <+> pprOp platform o + ] + EQ | isFloatOp l && isFloatOp r -> line $ binOp ("\tfeq." ++ floatOpPrecision platform l r) + NE + | isIntOp l && isIntOp r -> + lines_ + [ subFor l r, + text "\tsnez" <+> pprOp platform o <> comma <+> pprOp platform o + ] + NE + | isFloatOp l && isFloatOp r -> + lines_ + [ binOp ("\tfeq." ++ floatOpPrecision platform l r), + text "\txori" <+> pprOp platform o <> comma <+> pprOp platform o <> comma <+> text "1" + ] + SLT -> lines_ [sltFor l r <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform r] + SLE -> + lines_ + [ sltFor l r <+> pprOp platform o <> comma <+> pprOp platform r <> comma <+> pprOp platform l, + text "\txori" <+> pprOp platform o <> comma <+> pprOp platform o <> comma <+> text "1" + ] + SGE -> + lines_ + [ sltFor l r <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform r, + text "\txori" <+> pprOp platform o <> comma <+> pprOp platform o <> comma <+> text "1" + ] + SGT -> lines_ [sltFor l r <+> pprOp platform o <> comma <+> pprOp platform r <> comma <+> pprOp platform l] + ULT -> lines_ [sltuFor l r <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform r] + ULE -> + lines_ + [ sltuFor l r <+> pprOp platform o <> comma <+> pprOp platform r <> comma <+> pprOp platform l, + text "\txori" <+> pprOp platform o <> comma <+> pprOp platform o <> comma <+> text "1" + ] + UGE -> + lines_ + [ sltuFor l r <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform r, + text "\txori" <+> pprOp platform o <> comma <+> pprOp platform o <> comma <+> text "1" + ] + UGT -> lines_ [sltuFor l r <+> pprOp platform o <> comma <+> pprOp platform r <> comma <+> pprOp platform l] + FLT | isFloatOp l && isFloatOp r -> line $ binOp ("\tflt." ++ floatOpPrecision platform l r) + FLE | isFloatOp l && isFloatOp r -> line $ binOp ("\tfle." ++ floatOpPrecision platform l r) + FGT | isFloatOp l && isFloatOp r -> line $ binOp ("\tfgt." ++ floatOpPrecision platform l r) + FGE | isFloatOp l && isFloatOp r -> line $ binOp ("\tfge." 
++ floatOpPrecision platform l r) + x -> pprPanic "RV64.ppr: unhandled CSET conditional" (text (show x) <+> pprOp platform o <> comma <+> pprOp platform r <> comma <+> pprOp platform l) + where + subFor l r + | (OpImm _) <- r = text "\taddi" <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform (negOp r) + | (OpImm _) <- l = panic "RV64.ppr: Cannot SUB IMM _" + | otherwise = text "\tsub" <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform r + sltFor l r + | (OpImm _) <- r = text "\tslti" + | (OpImm _) <- l = panic "RV64.ppr: Cannot SLT IMM _" + | otherwise = text "\tslt" + sltuFor l r + | (OpImm _) <- r = text "\tsltiu" + | (OpImm _) <- l = panic "RV64.ppr: Cannot SLTU IMM _" + | otherwise = text "\tsltu" + binOp :: (IsLine doc) => String -> doc + binOp op = text op <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform r + STR II8 o1 o2 -> op2 (text "\tsb") o1 o2 + STR II16 o1 o2 -> op2 (text "\tsh") o1 o2 + STR II32 o1 o2 -> op2 (text "\tsw") o1 o2 + STR II64 o1 o2 -> op2 (text "\tsd") o1 o2 + STR FF32 o1 o2 -> op2 (text "\tfsw") o1 o2 + STR FF64 o1 o2 -> op2 (text "\tfsd") o1 o2 + LDR _f o1 (OpImm (ImmIndex lbl off)) -> + lines_ + [ text "\tla" <+> pprOp platform o1 <> comma <+> pprAsmLabel platform lbl, + text "\taddi" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> int off + ] + LDR _f o1 (OpImm (ImmCLbl lbl)) -> + line $ text "\tla" <+> pprOp platform o1 <> comma <+> pprAsmLabel platform lbl + LDR II8 o1 o2 -> op2 (text "\tlb") o1 o2 + LDR II16 o1 o2 -> op2 (text "\tlh") o1 o2 + LDR II32 o1 o2 -> op2 (text "\tlw") o1 o2 + LDR II64 o1 o2 -> op2 (text "\tld") o1 o2 + LDR FF32 o1 o2 -> op2 (text "\tflw") o1 o2 + LDR FF64 o1 o2 -> op2 (text "\tfld") o1 o2 + LDRU II8 o1 o2 -> op2 (text "\tlbu") o1 o2 + LDRU II16 o1 o2 -> op2 (text "\tlhu") o1 o2 + LDRU II32 o1 o2 -> op2 (text "\tlwu") o1 o2 + -- double words (64bit) cannot be sign extended by definition + LDRU II64 o1 o2 -> op2 (text "\tld") o1 o2 + LDRU FF32 o1 o2@(OpAddr (AddrReg _)) -> op2 (text "\tflw") o1 o2 + LDRU FF32 o1 o2@(OpAddr (AddrRegImm _ _)) -> op2 (text "\tflw") o1 o2 + LDRU FF64 o1 o2@(OpAddr (AddrReg _)) -> op2 (text "\tfld") o1 o2 + LDRU FF64 o1 o2@(OpAddr (AddrRegImm _ _)) -> op2 (text "\tfld") o1 o2 + LDRU f o1 o2 -> pprPanic "Unsupported unsigned load" ((text . 
show) f <+> pprOp platform o1 <+> pprOp platform o2) + FENCE r w -> line $ text "\tfence" <+> pprFenceType r <> char ',' <+> pprFenceType w + FCVT FloatToFloat o1@(OpReg W32 _) o2@(OpReg W64 _) -> op2 (text "\tfcvt.s.d") o1 o2 + FCVT FloatToFloat o1@(OpReg W64 _) o2@(OpReg W32 _) -> op2 (text "\tfcvt.d.s") o1 o2 + FCVT FloatToFloat o1 o2 -> + pprPanic "RV64.pprInstr - impossible float to float conversion" + $ line (pprOp platform o1 <> text "->" <> pprOp platform o2) + FCVT IntToFloat o1@(OpReg W32 _) o2@(OpReg W32 _) -> op2 (text "\tfcvt.s.w") o1 o2 + FCVT IntToFloat o1@(OpReg W32 _) o2@(OpReg W64 _) -> op2 (text "\tfcvt.s.l") o1 o2 + FCVT IntToFloat o1@(OpReg W64 _) o2@(OpReg W32 _) -> op2 (text "\tfcvt.d.w") o1 o2 + FCVT IntToFloat o1@(OpReg W64 _) o2@(OpReg W64 _) -> op2 (text "\tfcvt.d.l") o1 o2 + FCVT IntToFloat o1 o2 -> + pprPanic "RV64.pprInstr - impossible integer to float conversion" + $ line (pprOp platform o1 <> text "->" <> pprOp platform o2) + FCVT FloatToInt o1@(OpReg W32 _) o2@(OpReg W32 _) -> op2 (text "\tfcvt.w.s") o1 o2 + FCVT FloatToInt o1@(OpReg W32 _) o2@(OpReg W64 _) -> op2 (text "\tfcvt.w.d") o1 o2 + FCVT FloatToInt o1@(OpReg W64 _) o2@(OpReg W32 _) -> op2 (text "\tfcvt.l.s") o1 o2 + FCVT FloatToInt o1@(OpReg W64 _) o2@(OpReg W64 _) -> op2 (text "\tfcvt.l.d") o1 o2 + FCVT FloatToInt o1 o2 -> + pprPanic "RV64.pprInstr - impossible float to integer conversion" + $ line (pprOp platform o1 <> text "->" <> pprOp platform o2) + FABS o1 o2 | isSingleOp o2 -> op2 (text "\tfabs.s") o1 o2 + FABS o1 o2 | isDoubleOp o2 -> op2 (text "\tfabs.d") o1 o2 + FMA variant d r1 r2 r3 -> + let fma = case variant of + FMAdd -> text "\tfmadd" <> dot <> floatPrecision d + FMSub -> text "\tfmsub" <> dot <> floatPrecision d + FNMAdd -> text "\tfnmadd" <> dot <> floatPrecision d + FNMSub -> text "\tfnmsub" <> dot <> floatPrecision d + in op4 fma d r1 r2 r3 + instr -> panic $ "RV64.pprInstr - Unknown instruction: " ++ instrCon instr + where + op2 op o1 o2 = line $ op <+> pprOp platform o1 <> comma <+> pprOp platform o2 + op3 op o1 o2 o3 = line $ op <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3 + op4 op o1 o2 o3 o4 = line $ op <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3 <> comma <+> pprOp platform o4 + pprFenceType FenceRead = text "r" + pprFenceType FenceWrite = text "w" + pprFenceType FenceReadWrite = text "rw" + floatPrecision o + | isSingleOp o = text "s" + | isDoubleOp o = text "d" + | otherwise = pprPanic "Impossible floating point precision: " (pprOp platform o) + +floatOpPrecision :: Platform -> Operand -> Operand -> String +floatOpPrecision _p l r | isFloatOp l && isFloatOp r && isSingleOp l && isSingleOp r = "s" -- single precision +floatOpPrecision _p l r | isFloatOp l && isFloatOp r && isDoubleOp l && isDoubleOp r = "d" -- double precision +floatOpPrecision p l r = pprPanic "Cannot determine floating point precision" (text "op1" <+> pprOp p l <+> text "op2" <+> pprOp p r)
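+
+-- Editorial sketch (not part of the original patch): RISC-V only offers
+-- SLT/SLTU (and their immediate forms) for materializing comparison
+-- results, so the CSET cases above derive the remaining orderings by
+-- swapping operands and/or inverting the 0/1 result (the `xori ..., 1`
+-- instructions). The primed names are hypothetical:
+slt', sle', sge', sgt' :: Int -> Int -> Int
+slt' a b = if a < b then 1 else 0 -- the slt primitive
+sle' a b = 1 - slt' b a -- slt with swapped operands, then xori 1
+sge' a b = 1 - slt' a b -- slt, then xori 1
+sgt' a b = slt' b a -- slt with swapped operands
+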
+-- | Pretty print a conditional branch +-- +-- This function is partial and will panic if the conditional is not supported; +-- i.e. if it's floating-point related. +pprBcond :: (IsLine doc) => Cond -> doc +pprBcond c = text "b" <> pprCond c + where + pprCond :: (IsLine doc) => Cond -> doc + pprCond c = case c of + EQ -> text "eq" + NE -> text "ne" + SLT -> text "lt" + SLE -> text "le" + SGE -> text "ge" + SGT -> text "gt" + ULT -> text "ltu" + ULE -> text "leu" + UGE -> text "geu" + UGT -> text "gtu" + -- BCOND cannot handle floating point comparisons / registers + _ -> panic $ "RV64.ppr: unhandled BCOND conditional: " ++ show c Index: ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/RegInfo.hs =================================================================== --- /dev/null +++ ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/RegInfo.hs @@ -0,0 +1,41 @@ +-- | Minimum viable implementation of jump short-cutting: No short-cutting. +-- +-- The functions here simply implement the no-short-cutting case. Implementing +-- the real behaviour would be a great optimization in the future. +module GHC.CmmToAsm.RV64.RegInfo + ( getJumpDestBlockId, + canShortcut, + shortcutStatics, + shortcutJump, + JumpDest (..), + ) +where + +import GHC.Cmm +import GHC.Cmm.BlockId +import GHC.CmmToAsm.RV64.Instr +import GHC.Prelude +import GHC.Utils.Outputable + +newtype JumpDest = DestBlockId BlockId + +instance Outputable JumpDest where + ppr (DestBlockId bid) = text "jd:" <> ppr bid + +-- | Extract BlockId +-- +-- Never `Nothing` for Riscv64 NCG. +getJumpDestBlockId :: JumpDest -> Maybe BlockId +getJumpDestBlockId (DestBlockId bid) = Just bid + +-- No `Instr`s can be shortcut (for now) +canShortcut :: Instr -> Maybe JumpDest +canShortcut _ = Nothing + +-- Identity of the provided `RawCmmStatics` +shortcutStatics :: (BlockId -> Maybe JumpDest) -> RawCmmStatics -> RawCmmStatics +shortcutStatics _ other_static = other_static + +-- Identity of the provided `Instr` +shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr +shortcutJump _ other = other Index: ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/Regs.hs =================================================================== --- /dev/null +++ ghc-9.10.1/compiler/GHC/CmmToAsm/RV64/Regs.hs @@ -0,0 +1,230 @@ +module GHC.CmmToAsm.RV64.Regs where + +import GHC.Cmm +import GHC.Cmm.CLabel (CLabel) +import GHC.CmmToAsm.Format +import GHC.Data.FastString +import GHC.Platform +import GHC.Platform.Reg +import GHC.Platform.Reg.Class +import GHC.Platform.Regs +import GHC.Prelude +import GHC.Types.Unique +import GHC.Utils.Outputable +import GHC.Utils.Panic + +-- * Registers + +-- | First integer register number. @zero@ register. +x0RegNo :: RegNo +x0RegNo = 0 + +-- | return address register +x1RegNo, raRegNo :: RegNo +x1RegNo = 1 +raRegNo = x1RegNo + +x5RegNo, t0RegNo :: RegNo +x5RegNo = 5 +t0RegNo = x5RegNo + +x7RegNo, t2RegNo :: RegNo +x7RegNo = 7 +t2RegNo = x7RegNo + +x28RegNo, t3RegNo :: RegNo +x28RegNo = 28 +t3RegNo = x28RegNo + +-- | Last integer register number. Used as TMP (IP) register. +x31RegNo, t6RegNo, tmpRegNo :: RegNo +x31RegNo = 31 +t6RegNo = x31RegNo +tmpRegNo = x31RegNo + +-- | First floating point register. +d0RegNo, ft0RegNo :: RegNo +d0RegNo = 32 +ft0RegNo = d0RegNo + +d7RegNo, ft7RegNo :: RegNo +d7RegNo = 39 +ft7RegNo = d7RegNo
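+
+-- Editorial sketch (not part of the original patch): the NCG numbers all
+-- machine registers in one flat range, integer registers at 0..31 mapping
+-- to x0..x31 and floating point registers at 32..63 mapping to f0..f31.
+-- A hypothetical helper making the +32 offset explicit:
+fpRegIndex :: RegNo -> Maybe Int
+fpRegIndex r
+  | r >= d0RegNo && r <= 63 = Just (r - d0RegNo) -- e.g. 42 (fa0) -> f10
+  | otherwise = Nothing -- an integer register (or out of range)
+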
+-- | Last floating point register. +d31RegNo :: RegNo +d31RegNo = 63 + +a0RegNo, x10RegNo :: RegNo +x10RegNo = 10 +a0RegNo = x10RegNo + +a7RegNo, x17RegNo :: RegNo +x17RegNo = 17 +a7RegNo = x17RegNo + +fa0RegNo, d10RegNo :: RegNo +d10RegNo = 42 +fa0RegNo = d10RegNo + +fa7RegNo, d17RegNo :: RegNo +d17RegNo = 49 +fa7RegNo = d17RegNo + +-- Note [The made-up RISCV64 TMP (IP) register] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- RISCV64 has no inter-procedural register in its ABI. However, we need one to +-- make register spills/loads to/from high number slots. I.e. slot numbers that +-- do not fit in a 12bit integer which is used as immediate in the arithmetic +-- operations. Thus, we're marking one additional register (x31) as permanently +-- non-free and call it TMP. +-- +-- TMP can be used as temporary register in all operations. Just be aware that +-- it may be clobbered as soon as you lose direct control over it (i.e. using +-- TMP bypasses the register allocation/spilling mechanisms.) It should be fine +-- to use it as temporary register in a MachOp translation as long as you don't +-- rely on its value beyond this limited scope. +-- +-- X31 is a caller-saved register. I.e. there are no guarantees about what the +-- callee does with it. That's exactly what we want here. + +zeroReg, raReg, spMachReg, tmpReg :: Reg +zeroReg = regSingle x0RegNo +raReg = regSingle 1 + +-- | Not to be confused with the `CmmReg` `spReg` +spMachReg = regSingle 2 + +tmpReg = regSingle tmpRegNo + +-- | All machine register numbers. +allMachRegNos :: [RegNo] +allMachRegNos = intRegs ++ fpRegs + where + intRegs = [x0RegNo .. x31RegNo] + fpRegs = [d0RegNo .. d31RegNo] + +-- | Registers available to the register allocator. +-- +-- These are all registers minus those with a fixed role in RISCV ABI (zero, lr, +-- sp, gp, tp, fp, tmp) and GHC RTS (Base, Sp, Hp, HpLim, R1..R8, F1..F6, +-- D1..D6.) +allocatableRegs :: Platform -> [RealReg] +allocatableRegs platform = + let isFree = freeReg platform + in map RealRegSingle $ filter isFree allMachRegNos + +-- | Integer argument registers according to the calling convention +allGpArgRegs :: [Reg] +allGpArgRegs = map regSingle [a0RegNo .. a7RegNo] + +-- | Floating point argument registers according to the calling convention +allFpArgRegs :: [Reg] +allFpArgRegs = map regSingle [fa0RegNo .. fa7RegNo] + +-- * Addressing modes + +-- | Addressing modes +data AddrMode + = -- | A register plus some immediate integer, e.g. @8(sp)@ or @-16(sp)@. The + -- offset needs to fit into 12bits. + AddrRegImm Reg Imm + | -- | A register + AddrReg Reg + deriving (Eq, Show) + +-- * Immediates + +data Imm + = ImmInt Int + | ImmInteger Integer -- Sigh. + | ImmCLbl CLabel -- AbstractC Label (with baggage) + | ImmLit FastString + | ImmIndex CLabel Int + | ImmFloat Rational + | ImmDouble Rational + | ImmConstantSum Imm Imm + | ImmConstantDiff Imm Imm + deriving (Eq, Show)
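+
+-- Editorial sketch (not part of the original patch): `AddrRegImm` offsets
+-- must be valid signed 12-bit immediates (cf. `fitsIn12bitImm` in
+-- GHC.CmmToAsm.RV64.Instr); a hypothetical smart constructor making the
+-- range check explicit. Larger offsets have to be materialized through a
+-- scratch register instead.
+mkAddrRegImm :: Reg -> Int -> Maybe AddrMode
+mkAddrRegImm r off
+  | -2048 <= off && off <= 2047 = Just (AddrRegImm r (ImmInt off))
+  | otherwise = Nothing
+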
+-- | Map `CmmLit` to `Imm` +-- +-- N.B. this is a partial function, because not all `CmmLit`s have an immediate +-- representation. +litToImm :: CmmLit -> Imm +litToImm (CmmInt i w) = ImmInteger (narrowS w i) +-- narrow to the width: a CmmInt might be out of +-- range, but we assume that ImmInteger only contains +-- in-range values. A signed value should be fine here. +litToImm (CmmFloat f W32) = ImmFloat f +litToImm (CmmFloat f W64) = ImmDouble f +litToImm (CmmLabel l) = ImmCLbl l +litToImm (CmmLabelOff l off) = ImmIndex l off +litToImm (CmmLabelDiffOff l1 l2 off _) = + ImmConstantSum + (ImmConstantDiff (ImmCLbl l1) (ImmCLbl l2)) + (ImmInt off) +litToImm l = panic $ "RV64.Regs.litToImm: no match for " ++ show l + +-- == To satisfy GHC.CmmToAsm.Reg.Target ======================================= + +-- squeeze functions for the graph allocator ----------------------------------- + +-- | regSqueeze_class reg +-- Calculate the maximum number of register colors that could be +-- denied to a node of this class due to having this reg +-- as a neighbour. +{-# INLINE virtualRegSqueeze #-} +virtualRegSqueeze :: RegClass -> VirtualReg -> Int +virtualRegSqueeze cls vr = + case cls of + RcInteger -> + case vr of + VirtualRegI {} -> 1 + VirtualRegHi {} -> 1 + _other -> 0 + RcDouble -> + case vr of + VirtualRegD {} -> 1 + VirtualRegF {} -> 0 + _other -> 0 + _other -> 0 + +{-# INLINE realRegSqueeze #-} +realRegSqueeze :: RegClass -> RealReg -> Int +realRegSqueeze cls rr = + case cls of + RcInteger -> + case rr of + RealRegSingle regNo + | regNo < d0RegNo -> 1 + | otherwise -> 0 + RcDouble -> + case rr of + RealRegSingle regNo + | regNo < d0RegNo -> 0 + | otherwise -> 1 + _other -> 0 + +mkVirtualReg :: Unique -> Format -> VirtualReg +mkVirtualReg u format + | not (isFloatFormat format) = VirtualRegI u + | otherwise = + case format of + FF32 -> VirtualRegD u + FF64 -> VirtualRegD u + _ -> panic "RV64.mkVirtualReg" + +{-# INLINE classOfRealReg #-} +classOfRealReg :: RealReg -> RegClass +classOfRealReg (RealRegSingle i) + | i < d0RegNo = RcInteger + | otherwise = RcDouble + +regDotColor :: RealReg -> SDoc +regDotColor reg = + case classOfRealReg reg of + RcInteger -> text "blue" + RcFloat -> text "red" + RcDouble -> text "green" Index: ghc-9.10.1/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs =================================================================== --- ghc-9.10.1.orig/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs +++ ghc-9.10.1/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs @@ -119,7 +119,7 @@ trivColorable platform virtualRegSqueeze ArchMipseb -> panic "trivColorable ArchMipseb" ArchMipsel -> panic "trivColorable ArchMipsel" ArchS390X -> panic "trivColorable ArchS390X" - ArchRISCV64 -> panic "trivColorable ArchRISCV64" + ArchRISCV64 -> 14 ArchLoongArch64->panic "trivColorable ArchLoongArch64" ArchJavaScript-> panic "trivColorable ArchJavaScript" ArchWasm32 -> panic "trivColorable ArchWasm32" @@ -154,7 +154,7 @@ trivColorable platform virtualRegSqueeze ArchMipseb -> panic "trivColorable ArchMipseb" ArchMipsel -> panic "trivColorable ArchMipsel" ArchS390X -> panic "trivColorable ArchS390X" - ArchRISCV64 -> panic "trivColorable ArchRISCV64" + ArchRISCV64 -> 0 ArchLoongArch64->panic "trivColorable ArchLoongArch64" ArchJavaScript-> panic "trivColorable ArchJavaScript" ArchWasm32 -> panic "trivColorable ArchWasm32" @@ -188,7 +188,7 @@ trivColorable platform virtualRegSqueeze ArchMipseb -> panic "trivColorable ArchMipseb" ArchMipsel -> panic "trivColorable ArchMipsel" ArchS390X -> panic "trivColorable ArchS390X" - ArchRISCV64 -> panic "trivColorable ArchRISCV64" + ArchRISCV64 -> 20 ArchLoongArch64->panic "trivColorable ArchLoongArch64" ArchJavaScript-> panic "trivColorable ArchJavaScript" ArchWasm32 -> panic "trivColorable ArchWasm32" Index: ghc-9.10.1/compiler/GHC/CmmToAsm/Reg/Linear.hs =================================================================== ---
ghc-9.10.1.orig/compiler/GHC/CmmToAsm/Reg/Linear.hs +++ ghc-9.10.1/compiler/GHC/CmmToAsm/Reg/Linear.hs @@ -112,6 +112,7 @@ import qualified GHC.CmmToAsm.Reg.Linear import qualified GHC.CmmToAsm.Reg.Linear.X86 as X86 import qualified GHC.CmmToAsm.Reg.Linear.X86_64 as X86_64 import qualified GHC.CmmToAsm.Reg.Linear.AArch64 as AArch64 +import qualified GHC.CmmToAsm.Reg.Linear.RV64 as RV64 import GHC.CmmToAsm.Reg.Target import GHC.CmmToAsm.Reg.Liveness import GHC.CmmToAsm.Reg.Utils @@ -221,7 +222,7 @@ linearRegAlloc config entry_ids block_li ArchAlpha -> panic "linearRegAlloc ArchAlpha" ArchMipseb -> panic "linearRegAlloc ArchMipseb" ArchMipsel -> panic "linearRegAlloc ArchMipsel" - ArchRISCV64 -> panic "linearRegAlloc ArchRISCV64" + ArchRISCV64 -> go (frInitFreeRegs platform :: RV64.FreeRegs) ArchLoongArch64-> panic "linearRegAlloc ArchLoongArch64" ArchJavaScript -> panic "linearRegAlloc ArchJavaScript" ArchWasm32 -> panic "linearRegAlloc ArchWasm32" Index: ghc-9.10.1/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs =================================================================== --- ghc-9.10.1.orig/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs +++ ghc-9.10.1/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs @@ -29,10 +29,12 @@ import qualified GHC.CmmToAsm.Reg.Linear import qualified GHC.CmmToAsm.Reg.Linear.X86 as X86 import qualified GHC.CmmToAsm.Reg.Linear.X86_64 as X86_64 import qualified GHC.CmmToAsm.Reg.Linear.AArch64 as AArch64 +import qualified GHC.CmmToAsm.Reg.Linear.RV64 as RV64 import qualified GHC.CmmToAsm.PPC.Instr as PPC.Instr import qualified GHC.CmmToAsm.X86.Instr as X86.Instr import qualified GHC.CmmToAsm.AArch64.Instr as AArch64.Instr +import qualified GHC.CmmToAsm.RV64.Instr as RV64.Instr class Show freeRegs => FR freeRegs where frAllocateReg :: Platform -> RealReg -> freeRegs -> freeRegs @@ -64,6 +66,12 @@ instance FR AArch64.FreeRegs where frInitFreeRegs = AArch64.initFreeRegs frReleaseReg = \_ -> AArch64.releaseReg +instance FR RV64.FreeRegs where + frAllocateReg = const RV64.allocateReg + frGetFreeRegs = const RV64.getFreeRegs + frInitFreeRegs = RV64.initFreeRegs + frReleaseReg = const RV64.releaseReg + maxSpillSlots :: NCGConfig -> Int maxSpillSlots config = case platformArch (ncgPlatform config) of ArchX86 -> X86.Instr.maxSpillSlots config @@ -76,7 +84,7 @@ maxSpillSlots config = case platformArch ArchAlpha -> panic "maxSpillSlots ArchAlpha" ArchMipseb -> panic "maxSpillSlots ArchMipseb" ArchMipsel -> panic "maxSpillSlots ArchMipsel" - ArchRISCV64 -> panic "maxSpillSlots ArchRISCV64" + ArchRISCV64 -> RV64.Instr.maxSpillSlots config ArchLoongArch64->panic "maxSpillSlots ArchLoongArch64" ArchJavaScript-> panic "maxSpillSlots ArchJavaScript" ArchWasm32 -> panic "maxSpillSlots ArchWasm32" Index: ghc-9.10.1/compiler/GHC/CmmToAsm/Reg/Linear/RV64.hs =================================================================== --- /dev/null +++ ghc-9.10.1/compiler/GHC/CmmToAsm/Reg/Linear/RV64.hs @@ -0,0 +1,96 @@ +-- | Functions to implement the @FR@ (as in "free regs") type class. 
+-- +-- For LLVM GHC calling convention (used registers), see +-- https://github.com/llvm/llvm-project/blob/6ab900f8746e7d8e24afafb5886a40801f6799f4/llvm/lib/Target/RISCV/RISCVISelLowering.cpp#L13638-L13685 +module GHC.CmmToAsm.Reg.Linear.RV64 + ( allocateReg, + getFreeRegs, + initFreeRegs, + releaseReg, + FreeRegs (..), + ) +where + +import Data.Word +import GHC.CmmToAsm.RV64.Regs +import GHC.Platform +import GHC.Platform.Reg +import GHC.Platform.Reg.Class +import GHC.Prelude +import GHC.Stack +import GHC.Utils.Outputable +import GHC.Utils.Panic + +-- | Bitmaps to indicate which registers are free (currently unused) +-- +-- The bit index represents the `RegNo`, in the case of floating point registers +-- with an offset of 32. The register is free when the bit is set. +data FreeRegs + = FreeRegs + -- | integer/general purpose registers (`RcInteger`) + !Word32 + -- | floating point registers (`RcDouble`) + !Word32 + +instance Show FreeRegs where + show (FreeRegs g f) = "FreeRegs 0b" ++ showBits g ++ " 0b" ++ showBits f + +-- | Show bits as a `String` of @1@s and @0@s +showBits :: Word32 -> String +showBits w = map (\i -> if testBit w i then '1' else '0') [0 .. 31] + +instance Outputable FreeRegs where + ppr (FreeRegs g f) = + text " " + <+> foldr (\i x -> pad_int i <+> x) (text "") [0 .. 31] + $$ text "GPR" + <+> foldr (\i x -> show_bit g i <+> x) (text "") [0 .. 31] + $$ text "FPR" + <+> foldr (\i x -> show_bit f i <+> x) (text "") [0 .. 31] + where + pad_int i | i < 10 = char ' ' <> int i + pad_int i = int i + -- remember bit = 1 means it's available. + show_bit bits bit | testBit bits bit = text " " + show_bit _ _ = text " x" + +-- | Set bits of all allocatable registers to 1 +initFreeRegs :: Platform -> FreeRegs +initFreeRegs platform = foldl' (flip releaseReg) noFreeRegs (allocatableRegs platform) + where + noFreeRegs :: FreeRegs + noFreeRegs = FreeRegs 0 0 + +-- | Get all free `RealReg`s (i.e. those where the corresponding bit is 1) +getFreeRegs :: RegClass -> FreeRegs -> [RealReg] +getFreeRegs cls (FreeRegs g f) + | RcFloat <- cls = [] -- For now we only support double and integer registers; floats will need to be promoted. + | RcDouble <- cls = go 32 f allocatableDoubleRegs + | RcInteger <- cls = go 0 g allocatableIntRegs + where + go _ _ [] = [] + go off x (i : is) + | testBit x i = RealRegSingle (off + i) : (go off x $! is) + | otherwise = go off x $! is + -- The lists of allocatable registers are manually crafted: Register + -- allocation is pretty hot code. We don't want to iterate and map like + -- `initFreeRegs` all the time! (The register mappings aren't supposed to + -- change often.) + allocatableIntRegs = [5 .. 7] ++ [10 .. 17] ++ [28 .. 30] + allocatableDoubleRegs = [0 .. 7] ++ [10 .. 17] ++ [28 .. 31]
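+
+-- Editorial usage sketch (not part of the original patch): allocation
+-- clears a register's bit, release sets it again. `RealRegSingle 5` is t0;
+-- `RealRegSingle 42` is fa0 (float registers carry the +32 offset). The
+-- name `exampleRoundTrip` is hypothetical.
+exampleRoundTrip :: Platform -> FreeRegs
+exampleRoundTrip platform =
+  let free0 = initFreeRegs platform -- all allocatable bits set
+      free1 = allocateReg (RealRegSingle 5) free0 -- take t0 (GP bit 5)
+      free2 = allocateReg (RealRegSingle 42) free1 -- take fa0 (FP bit 10)
+   in releaseReg (RealRegSingle 5) free2 -- t0 free again; fa0 still taken
+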
+-- | Set corresponding register bit to 0 +allocateReg :: (HasCallStack) => RealReg -> FreeRegs -> FreeRegs +allocateReg (RealRegSingle r) (FreeRegs g f) + | r > 31 && testBit f (r - 32) = FreeRegs g (clearBit f (r - 32)) + | r < 32 && testBit g r = FreeRegs (clearBit g r) f + | r > 31 = panic $ "Linear.RV64.allocateReg: double allocation of float reg v" ++ show (r - 32) ++ "; " ++ showBits f + | otherwise = pprPanic "Linear.RV64.allocateReg" $ text ("double allocation of gp reg x" ++ show r ++ "; " ++ showBits g) + +-- | Set corresponding register bit to 1 +releaseReg :: (HasCallStack) => RealReg -> FreeRegs -> FreeRegs +releaseReg (RealRegSingle r) (FreeRegs g f) + | r > 31 && testBit f (r - 32) = pprPanic "Linear.RV64.releaseReg" (text "can't release non-allocated reg v" <> int (r - 32)) + | r < 32 && testBit g r = pprPanic "Linear.RV64.releaseReg" (text "can't release non-allocated reg x" <> int r) + | r > 31 = FreeRegs g (setBit f (r - 32)) + | otherwise = FreeRegs (setBit g r) f Index: ghc-9.10.1/compiler/GHC/CmmToAsm/Reg/Target.hs =================================================================== --- ghc-9.10.1.orig/compiler/GHC/CmmToAsm/Reg/Target.hs +++ ghc-9.10.1/compiler/GHC/CmmToAsm/Reg/Target.hs @@ -34,7 +34,7 @@ import qualified GHC.CmmToAsm.X86.Regs import qualified GHC.CmmToAsm.X86.RegInfo as X86 import qualified GHC.CmmToAsm.PPC.Regs as PPC import qualified GHC.CmmToAsm.AArch64.Regs as AArch64 - +import qualified GHC.CmmToAsm.RV64.Regs as RV64 targetVirtualRegSqueeze :: Platform -> RegClass -> VirtualReg -> Int targetVirtualRegSqueeze platform @@ -49,7 +49,7 @@ targetVirtualRegSqueeze platform ArchAlpha -> panic "targetVirtualRegSqueeze ArchAlpha" ArchMipseb -> panic "targetVirtualRegSqueeze ArchMipseb" ArchMipsel -> panic "targetVirtualRegSqueeze ArchMipsel" - ArchRISCV64 -> panic "targetVirtualRegSqueeze ArchRISCV64" + ArchRISCV64 -> RV64.virtualRegSqueeze ArchLoongArch64->panic "targetVirtualRegSqueeze ArchLoongArch64" ArchJavaScript-> panic "targetVirtualRegSqueeze ArchJavaScript" ArchWasm32 -> panic "targetVirtualRegSqueeze ArchWasm32" @@ -69,7 +69,7 @@ targetRealRegSqueeze platform ArchAlpha -> panic "targetRealRegSqueeze ArchAlpha" ArchMipseb -> panic "targetRealRegSqueeze ArchMipseb" ArchMipsel -> panic "targetRealRegSqueeze ArchMipsel" - ArchRISCV64 -> panic "targetRealRegSqueeze ArchRISCV64" + ArchRISCV64 -> RV64.realRegSqueeze ArchLoongArch64->panic "targetRealRegSqueeze ArchLoongArch64" ArchJavaScript-> panic "targetRealRegSqueeze ArchJavaScript" ArchWasm32 -> panic "targetRealRegSqueeze ArchWasm32" @@ -88,7 +88,7 @@ targetClassOfRealReg platform ArchAlpha -> panic "targetClassOfRealReg ArchAlpha" ArchMipseb -> panic "targetClassOfRealReg ArchMipseb" ArchMipsel -> panic "targetClassOfRealReg ArchMipsel" - ArchRISCV64 -> panic "targetClassOfRealReg ArchRISCV64" + ArchRISCV64 -> RV64.classOfRealReg ArchLoongArch64->panic "targetClassOfRealReg ArchLoongArch64" ArchJavaScript-> panic "targetClassOfRealReg ArchJavaScript" ArchWasm32 -> panic "targetClassOfRealReg ArchWasm32" @@ -107,7 +107,7 @@ targetMkVirtualReg platform ArchAlpha -> panic "targetMkVirtualReg ArchAlpha" ArchMipseb -> panic "targetMkVirtualReg ArchMipseb" ArchMipsel -> panic "targetMkVirtualReg ArchMipsel" - ArchRISCV64 -> panic "targetMkVirtualReg ArchRISCV64" + ArchRISCV64 -> RV64.mkVirtualReg ArchLoongArch64->panic "targetMkVirtualReg ArchLoongArch64" ArchJavaScript-> panic "targetMkVirtualReg ArchJavaScript" ArchWasm32 -> panic "targetMkVirtualReg ArchWasm32" @@ -126,7 +126,7 @@ 
targetRegDotColor platform ArchAlpha -> panic "targetRegDotColor ArchAlpha" ArchMipseb -> panic "targetRegDotColor ArchMipseb" ArchMipsel -> panic "targetRegDotColor ArchMipsel" - ArchRISCV64 -> panic "targetRegDotColor ArchRISCV64" + ArchRISCV64 -> RV64.regDotColor ArchLoongArch64->panic "targetRegDotColor ArchLoongArch64" ArchJavaScript-> panic "targetRegDotColor ArchJavaScript" ArchWasm32 -> panic "targetRegDotColor ArchWasm32" Index: ghc-9.10.1/compiler/GHC/Driver/Backend.hs =================================================================== --- ghc-9.10.1.orig/compiler/GHC/Driver/Backend.hs +++ ghc-9.10.1/compiler/GHC/Driver/Backend.hs @@ -213,6 +213,7 @@ platformNcgSupported platform = if ArchPPC_64 {} -> True ArchAArch64 -> True ArchWasm32 -> True + ArchRISCV64 -> True _ -> False -- | Is the platform supported by the JS backend? Index: ghc-9.10.1/compiler/GHC/Driver/DynFlags.hs =================================================================== --- ghc-9.10.1.orig/compiler/GHC/Driver/DynFlags.hs +++ ghc-9.10.1/compiler/GHC/Driver/DynFlags.hs @@ -1325,6 +1325,7 @@ default_PIC platform = (OSDarwin, ArchAArch64) -> [Opt_PIC] (OSLinux, ArchAArch64) -> [Opt_PIC, Opt_ExternalDynamicRefs] (OSLinux, ArchARM {}) -> [Opt_PIC, Opt_ExternalDynamicRefs] + (OSLinux, ArchRISCV64 {}) -> [Opt_PIC, Opt_ExternalDynamicRefs] (OSOpenBSD, ArchX86_64) -> [Opt_PIC] -- Due to PIE support in -- OpenBSD since 5.3 release -- (1 May 2013) we need to Index: ghc-9.10.1/compiler/GHC/Platform.hs =================================================================== --- ghc-9.10.1.orig/compiler/GHC/Platform.hs +++ ghc-9.10.1/compiler/GHC/Platform.hs @@ -250,7 +250,6 @@ platformHasRTSLinker p = case archOS_arc ArchPPC_64 ELF_V1 -> False -- powerpc64 ArchPPC_64 ELF_V2 -> False -- powerpc64le ArchS390X -> False - ArchRISCV64 -> False ArchLoongArch64 -> False ArchJavaScript -> False ArchWasm32 -> False Index: ghc-9.10.1/compiler/ghc.cabal.in =================================================================== --- ghc-9.10.1.orig/compiler/ghc.cabal.in +++ ghc-9.10.1/compiler/ghc.cabal.in @@ -290,6 +290,7 @@ Library GHC.CmmToAsm.Reg.Linear.FreeRegs GHC.CmmToAsm.Reg.Linear.JoinToTargets GHC.CmmToAsm.Reg.Linear.PPC + GHC.CmmToAsm.Reg.Linear.RV64 GHC.CmmToAsm.Reg.Linear.StackMap GHC.CmmToAsm.Reg.Linear.State GHC.CmmToAsm.Reg.Linear.Stats @@ -298,6 +299,13 @@ Library GHC.CmmToAsm.Reg.Liveness GHC.CmmToAsm.Reg.Target GHC.CmmToAsm.Reg.Utils + GHC.CmmToAsm.RV64 + GHC.CmmToAsm.RV64.CodeGen + GHC.CmmToAsm.RV64.Cond + GHC.CmmToAsm.RV64.Instr + GHC.CmmToAsm.RV64.Ppr + GHC.CmmToAsm.RV64.RegInfo + GHC.CmmToAsm.RV64.Regs GHC.CmmToAsm.Types GHC.CmmToAsm.Utils GHC.CmmToAsm.X86 Index: ghc-9.10.1/hadrian/bindist/config.mk.in =================================================================== --- ghc-9.10.1.orig/hadrian/bindist/config.mk.in +++ ghc-9.10.1/hadrian/bindist/config.mk.in @@ -152,7 +152,7 @@ GhcWithSMP := $(strip $(if $(filter YESN # Whether to include GHCi in the compiler. Depends on whether the RTS linker # has support for this OS/ARCH combination. 
OsSupportsGHCi=$(strip $(patsubst $(TargetOS_CPP), YES, $(findstring $(TargetOS_CPP), mingw32 linux solaris2 freebsd dragonfly netbsd openbsd darwin kfreebsdgnu))) -ArchSupportsGHCi=$(strip $(patsubst $(TargetArch_CPP), YES, $(findstring $(TargetArch_CPP), i386 x86_64 powerpc powerpc64 powerpc64le sparc sparc64 arm aarch64))) +ArchSupportsGHCi=$(strip $(patsubst $(TargetArch_CPP), YES, $(findstring $(TargetArch_CPP), i386 x86_64 powerpc powerpc64 powerpc64le sparc sparc64 arm aarch64 riscv64))) ifeq "$(OsSupportsGHCi)$(ArchSupportsGHCi)" "YESYES" GhcWithInterpreter=YES Index: ghc-9.10.1/hadrian/src/Settings/Builders/RunTest.hs =================================================================== --- ghc-9.10.1.orig/hadrian/src/Settings/Builders/RunTest.hs +++ ghc-9.10.1/hadrian/src/Settings/Builders/RunTest.hs @@ -118,7 +118,7 @@ inTreeCompilerArgs stg = do os <- queryHostTarget queryOS arch <- queryTargetTarget queryArch - let codegen_arches = ["x86_64", "i386", "powerpc", "powerpc64", "powerpc64le", "aarch64", "wasm32"] + let codegen_arches = ["x86_64", "i386", "powerpc", "powerpc64", "powerpc64le", "aarch64", "wasm32", "riscv64"] let withNativeCodeGen | unregisterised = False | arch `elem` codegen_arches = True @@ -139,7 +139,7 @@ inTreeCompilerArgs stg = do -- For this information, we need to query ghc --info, however, that would -- require building ghc, which we don't want to do here. Therefore, the -- logic from `platformHasRTSLinker` is duplicated here. - let rtsLinker = not $ arch `elem` ["powerpc", "powerpc64", "powerpc64le", "s390x", "riscv64", "loongarch64", "javascript", "wasm32"] + let rtsLinker = not $ arch `elem` ["powerpc", "powerpc64", "powerpc64le", "s390x", "loongarch64", "javascript", "wasm32"] return TestCompilerArgs{..} Index: ghc-9.10.1/rts/LinkerInternals.h =================================================================== --- ghc-9.10.1.orig/rts/LinkerInternals.h +++ ghc-9.10.1/rts/LinkerInternals.h @@ -208,7 +208,7 @@ typedef struct _Segment { int n_sections; } Segment; -#if defined(powerpc_HOST_ARCH) || defined(x86_64_HOST_ARCH) || defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH) +#if defined(powerpc_HOST_ARCH) || defined(x86_64_HOST_ARCH) || defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH) || defined(riscv64_HOST_ARCH) #define NEED_SYMBOL_EXTRAS 1 #endif @@ -220,8 +220,9 @@ typedef struct _Segment { #define NEED_M32 1 #endif -/* Jump Islands are sniplets of machine code required for relative - * address relocations on the PowerPC, x86_64 and ARM. +/* Jump Islands are snippets of machine code required for relative address + * relocations on the PowerPC, x86_64 and ARM. On RISCV64 we use symbol extras + * like a GOT for locals, where one SymbolExtra represents one entry. */ typedef struct { #if defined(powerpc_HOST_ARCH) @@ -237,6 +238,8 @@ typedef struct { uint8_t jumpIsland[8]; #elif defined(arm_HOST_ARCH) uint8_t jumpIsland[16]; +#elif defined(riscv64_HOST_ARCH) + uint64_t addr; #endif } SymbolExtra; Index: ghc-9.10.1/rts/RtsSymbols.c =================================================================== --- ghc-9.10.1.orig/rts/RtsSymbols.c +++ ghc-9.10.1/rts/RtsSymbols.c @@ -980,6 +980,17 @@ extern char **environ; #define RTS_LIBGCC_SYMBOLS #endif
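+
+// Editorial note (not part of the original patch): __clzdi2/__ctzdi2 are
+// the libgcc routines for counting leading/trailing zero bits of a 64-bit
+// value, e.g.
+//
+//   __clzdi2(1) == 63   // only bit 0 set: 63 leading zeros
+//   __ctzdi2(8) == 3    // 0b1000: 3 trailing zeros
+//
+// They are registered below so the RTS linker can resolve them in loaded
+// object code (presumably originating from the clz/ctz primop helpers).
+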
+#if defined(riscv64_HOST_ARCH) +// See https://gcc.gnu.org/onlinedocs/gccint/Integer-library-routines.html as +// reference for the following built-ins. __clzdi2 and __ctzdi2 probably relate +// to the __builtin functions in libraries/ghc-prim/cbits/ctz.c. +#define RTS_ARCH_LIBGCC_SYMBOLS \ + SymI_NeedsProto(__clzdi2) \ + SymI_NeedsProto(__ctzdi2) +#else +#define RTS_ARCH_LIBGCC_SYMBOLS +#endif + // Symbols defined by libgcc/compiler-rt for AArch64's outline atomics. #if defined(HAVE_ARM_OUTLINE_ATOMICS) #include "ARMOutlineAtomicsSymbols.h" @@ -1032,6 +1043,7 @@ RTS_DARWIN_ONLY_SYMBOLS RTS_OPENBSD_ONLY_SYMBOLS RTS_LIBC_SYMBOLS RTS_LIBGCC_SYMBOLS +RTS_ARCH_LIBGCC_SYMBOLS RTS_FINI_ARRAY_SYMBOLS RTS_LIBFFI_SYMBOLS RTS_ARM_OUTLINE_ATOMIC_SYMBOLS @@ -1074,6 +1086,7 @@ RtsSymbolVal rtsSyms[] = { RTS_DARWIN_ONLY_SYMBOLS RTS_OPENBSD_ONLY_SYMBOLS RTS_LIBGCC_SYMBOLS + RTS_ARCH_LIBGCC_SYMBOLS RTS_FINI_ARRAY_SYMBOLS RTS_LIBFFI_SYMBOLS RTS_ARM_OUTLINE_ATOMIC_SYMBOLS Index: ghc-9.10.1/rts/adjustor/LibffiAdjustor.c =================================================================== --- ghc-9.10.1.orig/rts/adjustor/LibffiAdjustor.c +++ ghc-9.10.1/rts/adjustor/LibffiAdjustor.c @@ -12,6 +12,7 @@ #include "Adjustor.h" #include "rts/ghc_ffi.h" +#include #include // Note that ffi_alloc_prep_closure is a non-standard libffi closure @@ -187,5 +188,21 @@ createAdjustor (int cconv, barf("createAdjustor: failed to allocate memory"); } - return (void*)code; +#if defined(riscv64_HOST_ARCH) + // Synchronize the memory and instruction cache to prevent illegal + // instruction exceptions. + + // We expect two instructions for address loading, one for the jump. + int instrCount = 3; + // On Linux the parameters of __builtin___clear_cache are currently unused. + // Add them anyway for future compatibility. (I.e. the parameters couldn't + // be checked during development.) + // TODO: Check the upper boundary e.g. with a debugger. + __builtin___clear_cache((void *)code, + (void *)((uint64_t *) code + instrCount)); + // Memory barrier to ensure nothing circumvents the fence.i / cache flush. + SEQ_CST_FENCE(); +#endif + + return (void *)code; } Index: ghc-9.10.1/rts/linker/Elf.c =================================================================== --- ghc-9.10.1.orig/rts/linker/Elf.c +++ ghc-9.10.1/rts/linker/Elf.c @@ -103,7 +103,8 @@ #include "elf_got.h" -#if defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH) +#if defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH) || defined (riscv64_HOST_ARCH) +# define NEED_GOT # define NEED_PLT # include "elf_plt.h" # include "elf_reloc.h" @@ -430,10 +431,7 @@ ocVerifyImage_ELF ( ObjectCode* oc ) case EM_AARCH64: IF_DEBUG(linker,debugBelch( "aarch64" )); break; #endif #if defined(EM_RISCV) - case EM_RISCV: IF_DEBUG(linker,debugBelch( "riscv" )); - errorBelch("%s: RTS linker not implemented on riscv", - oc->fileName); - return 0; + case EM_RISCV: IF_DEBUG(linker,debugBelch( "riscv" )); break; #endif #if defined(EM_LOONGARCH) case EM_LOONGARCH: IF_DEBUG(linker,debugBelch( "loongarch64" )); @@ -1130,9 +1128,10 @@ end: return result; } -// the aarch64 linker uses relocacteObjectCodeAarch64, -// see elf_reloc_aarch64.{h,c} -#if !defined(aarch64_HOST_ARCH) +// the aarch64 and riscv64 linkers use relocateObjectCodeAarch64() and +// relocateObjectCodeRISCV64() (respectively), see elf_reloc_aarch64.{h,c} and +// elf_reloc_riscv64.{h,c} +#if !defined(aarch64_HOST_ARCH) && !defined(riscv64_HOST_ARCH) /* Do ELF relocations which lack an explicit addend. All x86-linux and arm-linux relocations appear to be of this form. */ @@ -1359,7 +1358,7 @@ do_Elf_Rel_relocations ( ObjectCode* oc, /* try to locate an existing stub for this target */ if(findStub(&oc->sections[target_shndx], (void**)&S, 0)) { /* didn't find any. 
Need to create one */ - if(makeStub(&oc->sections[target_shndx], (void**)&S, 0)) { + if(makeStub(&oc->sections[target_shndx], (void**)&S, NULL, 0)) { errorBelch("Unable to create veneer for ARM_CALL\n"); return 0; } @@ -1451,7 +1450,7 @@ do_Elf_Rel_relocations ( ObjectCode* oc, /* try to locate an existing stub for this target */ if(findStub(&oc->sections[target_shndx], (void**)&S, 1)) { /* didn't find any. Need to create one */ - if(makeStub(&oc->sections[target_shndx], (void**)&S, 1)) { + if(makeStub(&oc->sections[target_shndx], (void**)&S, NULL, 1)) { errorBelch("Unable to create veneer for ARM_THM_CALL\n"); return 0; } @@ -1991,7 +1990,7 @@ ocResolve_ELF ( ObjectCode* oc ) (void) shnum; (void) shdr; -#if defined(aarch64_HOST_ARCH) +#if defined(aarch64_HOST_ARCH) || defined(riscv64_HOST_ARCH) /* use new relocation design */ if(relocateObjectCode( oc )) return 0; @@ -2014,6 +2013,9 @@ ocResolve_ELF ( ObjectCode* oc ) #if defined(powerpc_HOST_ARCH) ocFlushInstructionCache( oc ); +#elif defined(riscv64_HOST_ARCH) + /* New-style pseudo-polymorph (by architecture) call */ + flushInstructionCache( oc ); #endif return ocMprotect_Elf(oc); Index: ghc-9.10.1/rts/linker/ElfTypes.h =================================================================== --- ghc-9.10.1.orig/rts/linker/ElfTypes.h +++ ghc-9.10.1/rts/linker/ElfTypes.h @@ -150,6 +150,7 @@ typedef struct _Stub { void * addr; void * target; + void* got_addr; /* flags can hold architecture specific information they are used during * lookup of stubs as well. Thus two stubs for the same target with * different flags are considered unequal. Index: ghc-9.10.1/rts/linker/SymbolExtras.c =================================================================== --- ghc-9.10.1.orig/rts/linker/SymbolExtras.c +++ ghc-9.10.1/rts/linker/SymbolExtras.c @@ -153,7 +153,7 @@ void ocProtectExtras(ObjectCode* oc) } -#if defined(powerpc_HOST_ARCH) || defined(x86_64_HOST_ARCH) +#if defined(powerpc_HOST_ARCH) || defined(x86_64_HOST_ARCH) || defined(riscv64_HOST_ARCH) SymbolExtra* makeSymbolExtra( ObjectCode const* oc, unsigned long symbolNumber, unsigned long target ) @@ -189,9 +189,12 @@ SymbolExtra* makeSymbolExtra( ObjectCode extra->addr = target; memcpy(extra->jumpIsland, jmp, 8); #endif /* x86_64_HOST_ARCH */ - +#if defined(riscv64_HOST_ARCH) + // Fake GOT entry (used like GOT, but located in symbol extras) + extra->addr = target; +#endif return extra; } -#endif /* powerpc_HOST_ARCH || x86_64_HOST_ARCH */ +#endif /* powerpc_HOST_ARCH || x86_64_HOST_ARCH || riscv64_HOST_ARCH */ #endif /* !x86_64_HOST_ARCH) || !mingw32_HOST_OS */ #endif // NEED_SYMBOL_EXTRAS Index: ghc-9.10.1/rts/linker/SymbolExtras.h =================================================================== --- ghc-9.10.1.orig/rts/linker/SymbolExtras.h +++ ghc-9.10.1/rts/linker/SymbolExtras.h @@ -16,7 +16,7 @@ SymbolExtra* makeArmSymbolExtra( ObjectC unsigned long target, bool fromThumb, bool toThumb ); -#elif defined(powerpc_HOST_ARCH) || defined(x86_64_HOST_ARCH) +#elif defined(powerpc_HOST_ARCH) || defined(x86_64_HOST_ARCH) || defined(riscv64_HOST_ARCH) SymbolExtra* makeSymbolExtra( ObjectCode const* oc, unsigned long symbolNumber, unsigned long target ); Index: ghc-9.10.1/rts/linker/elf_plt.c =================================================================== --- ghc-9.10.1.orig/rts/linker/elf_plt.c +++ ghc-9.10.1/rts/linker/elf_plt.c @@ -5,7 +5,7 @@ #include #include -#if defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH) +#if defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH) || 
Index: ghc-9.10.1/rts/linker/elf_plt.c
===================================================================
--- ghc-9.10.1.orig/rts/linker/elf_plt.c
+++ ghc-9.10.1/rts/linker/elf_plt.c
@@ -5,7 +5,7 @@
 #include <stdint.h>
 #include <stdlib.h>
 
-#if defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH)
+#if defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH) || defined(riscv64_HOST_ARCH)
 #if defined(OBJFORMAT_ELF)
 
 #define STRINGIFY(x) #x
@@ -49,11 +49,13 @@ findStub(Section * section,
 bool
 makeStub(Section * section,
          void* * addr,
+         void* got_addr,
          uint8_t flags) {
     Stub * s = calloc(1, sizeof(Stub));
     ASSERT(s != NULL);
     s->target = *addr;
+    s->got_addr = got_addr;
     s->flags = flags;
     s->next = NULL;
     s->addr = (uint8_t *)section->info->stub_offset + 8
Index: ghc-9.10.1/rts/linker/elf_plt.h
===================================================================
--- ghc-9.10.1.orig/rts/linker/elf_plt.h
+++ ghc-9.10.1/rts/linker/elf_plt.h
@@ -4,8 +4,9 @@
 
 #include "elf_plt_arm.h"
 #include "elf_plt_aarch64.h"
+#include "elf_plt_riscv64.h"
 
-#if defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH)
+#if defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH) || defined (riscv64_HOST_ARCH)
 
 #if defined(OBJFORMAT_ELF)
 
@@ -21,6 +22,8 @@
 #define __suffix__ Arm
 #elif defined(__mips__)
 #define __suffix__ Mips
+#elif defined(__riscv)
+#define __suffix__ RISCV64
 #else
 #error "unknown architecture"
 #endif
@@ -34,10 +37,10 @@ unsigned  numberOfStubsForSection( Objec
 #define STUB_SIZE          ADD_SUFFIX(stubSize)
 
 bool findStub(Section * section, void* * addr, uint8_t flags);
-bool makeStub(Section * section, void* * addr, uint8_t flags);
+bool makeStub(Section * section, void* * addr, void* got_addr, uint8_t flags);
 
 void freeStubs(Section * section);
 
 #endif // OBJECTFORMAT_ELF
-#endif // arm/aarch64_HOST_ARCH
+#endif // arm/aarch64_HOST_ARCH/riscv64_HOST_ARCH
Index: ghc-9.10.1/rts/linker/elf_plt_riscv64.c
===================================================================
--- /dev/null
+++ ghc-9.10.1/rts/linker/elf_plt_riscv64.c
@@ -0,0 +1,90 @@
+#include "Rts.h"
+#include "elf_compat.h"
+#include "elf_plt_riscv64.h"
+#include "rts/Messages.h"
+#include "linker/ElfTypes.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#if defined(riscv64_HOST_ARCH)
+
+#if defined(OBJFORMAT_ELF)
+
+const size_t instSizeRISCV64 = 4;
+const size_t stubSizeRISCV64 = 3 * instSizeRISCV64;
+
+bool needStubForRelRISCV64(Elf_Rel *rel) {
+  switch (ELF64_R_TYPE(rel->r_info)) {
+  case R_RISCV_CALL:
+  case R_RISCV_CALL_PLT:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool needStubForRelaRISCV64(Elf_Rela *rela) {
+  switch (ELF64_R_TYPE(rela->r_info)) {
+  case R_RISCV_CALL:
+  case R_RISCV_CALL_PLT:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// After the global offset table (GOT) has been set up, we can use these three
+// instructions to jump to the target address / function:
+//
+// 1. AUIPC ip, %pcrel_hi(addr)
+// 2. LD ip, %pcrel_lo(addr)(ip)
+// 3. JALR x0, ip, 0
+//
+// We could use the absolute address of the target (because we know it), but
+// that would require loading a 64-bit constant which is a nightmare to do in
+// riscv64 assembly. (See
+// https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/5ffe5b5aeedb37b1c1c0c3d94641267d9ad4795a/riscv-elf.adoc#procedure-linkage-table)
+//
+// So far, PC-relative addressing seems to be good enough. If it ever turns out
+// not to be, one could (additionally, for out-of-range cases?) encode absolute
+// addressing here.
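As a cross-check of the hand-rolled encodings in makeStubRISCV64 below: for a zero pc-relative displacement (got_addr equal to the stub address, so hi = lo = 0) the three stub words should equal the hand-assembled instructions. A minimal, hypothetical self-test, not part of the patch:

    #include <assert.h>
    #include <stdint.h>

    // Hand-assembled RV64 encodings for a zero displacement:
    //   auipc x31, 0      -> 0x00000f97
    //   ld    x31, 0(x31) -> 0x000fbf83
    //   jalr  x0, 0(x31)  -> 0x000f8067
    int main(void) {
      uint32_t auipc = 0x17 | (31u << 7);                            // opcode | rd=x31
      uint32_t ld    = 0x03 | (31u << 7) | (3u << 12) | (31u << 15); // opcode | rd | funct3=LD | rs1
      uint32_t jalr  = 0x67 | (31u << 15);                           // opcode | rd=x0 | rs1=x31
      assert(auipc == 0x00000f97 && ld == 0x000fbf83 && jalr == 0x000f8067);
      return 0;
    }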
+bool makeStubRISCV64(Stub *s) {
+  uint32_t *P = (uint32_t *)s->addr;
+  int32_t addr = (uint64_t)s->got_addr - (uint64_t)P;
+
+  uint64_t hi = (addr + 0x800) >> 12;
+  uint64_t lo = addr - (hi << 12);
+
+  IF_DEBUG(
+      linker,
+      debugBelch(
+          "makeStubRISCV64: P = %p, got_addr = %p, target = %p, addr = 0x%x, "
+          "hi = 0x%lx, lo = 0x%lx\n",
+          P, s->got_addr, s->target, addr, hi, lo));
+
+  // AUIPC ip, %pcrel_hi(addr)
+  uint32_t auipcInst = 0b0010111; // opcode
+  auipcInst |= 0x1f << 7;         // rd = ip (x31)
+  auipcInst |= hi << 12;          // imm[31:12]
+
+  // LD ip, %pcrel_lo(addr)(ip)
+  uint32_t ldInst = 0b0000011; // opcode
+  ldInst |= 0x1f << 7;         // rd = ip (x31)
+  ldInst |= 0x1f << 15;        // rs = ip (x31)
+  ldInst |= 0b11 << 12;        // funct3 = 0x3 (LD)
+  ldInst |= lo << 20;          // imm[11:0]
+
+  // JALR x0, ip, 0
+  uint32_t jalrInst = 0b1100111; // opcode
+  jalrInst |= 0x1f << 15;        // rs = ip (x31)
+
+  P[0] = auipcInst;
+  P[1] = ldInst;
+  P[2] = jalrInst;
+
+  return EXIT_SUCCESS;
+}
+
+#endif
+#endif
Index: ghc-9.10.1/rts/linker/elf_plt_riscv64.h
===================================================================
--- /dev/null
+++ ghc-9.10.1/rts/linker/elf_plt_riscv64.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include "LinkerInternals.h"
+
+#if defined(OBJFORMAT_ELF)
+
+extern const size_t stubSizeRISCV64;
+bool needStubForRelRISCV64(Elf_Rel * rel);
+bool needStubForRelaRISCV64(Elf_Rela * rel);
+bool makeStubRISCV64(Stub * s);
+
+#endif
Index: ghc-9.10.1/rts/linker/elf_reloc.c
===================================================================
--- ghc-9.10.1.orig/rts/linker/elf_reloc.c
+++ ghc-9.10.1/rts/linker/elf_reloc.c
@@ -4,13 +4,18 @@
 
 #if defined(OBJFORMAT_ELF)
 
-/* we currently only use this abstraction for elf/aarch64 */
-#if defined(aarch64_HOST_ARCH)
+/* we currently only use this abstraction for elf/aarch64 and elf/riscv64 */
+#if defined(aarch64_HOST_ARCH) || defined(riscv64_HOST_ARCH)
 
 bool
 relocateObjectCode(ObjectCode * oc) {
     return ADD_SUFFIX(relocateObjectCode)(oc);
 }
+
+
+void flushInstructionCache(ObjectCode * oc){
+    return ADD_SUFFIX(flushInstructionCache)(oc);
+}
 #endif
 
 #endif
Index: ghc-9.10.1/rts/linker/elf_reloc.h
===================================================================
--- ghc-9.10.1.orig/rts/linker/elf_reloc.h
+++ ghc-9.10.1/rts/linker/elf_reloc.h
@@ -5,9 +5,10 @@
 #if defined(OBJFORMAT_ELF)
 
 #include "elf_reloc_aarch64.h"
+#include "elf_reloc_riscv64.h"
 
 bool
 relocateObjectCode(ObjectCode * oc);
-
+void flushInstructionCache(ObjectCode *oc);
 #endif /* OBJETFORMAT_ELF */
Index: ghc-9.10.1/rts/linker/elf_reloc_aarch64.c
===================================================================
--- ghc-9.10.1.orig/rts/linker/elf_reloc_aarch64.c
+++ ghc-9.10.1/rts/linker/elf_reloc_aarch64.c
@@ -240,7 +240,7 @@ computeAddend(Section * section, Elf_Rel
         /* check if we already have that stub */
         if(findStub(section, (void**)&S, 0)) {
             /* did not find it. Crete a new stub. */
-            if(makeStub(section, (void**)&S, 0)) {
+            if(makeStub(section, (void**)&S, NULL, 0)) {
                 abort(/* could not find or make stub */);
             }
         }
@@ -339,5 +339,10 @@ relocateObjectCodeAarch64(ObjectCode * o
     return EXIT_SUCCESS;
 }
 
+void flushInstructionCacheAarch64(ObjectCode * oc STG_UNUSED) {
+  // Looks like we don't need this on Aarch64.
+  /* no-op */
+}
+
 #endif /* OBJECTFORMAT_ELF */
 #endif /* aarch64_HOST_ARCH */
Index: ghc-9.10.1/rts/linker/elf_reloc_aarch64.h
===================================================================
--- ghc-9.10.1.orig/rts/linker/elf_reloc_aarch64.h
+++ ghc-9.10.1/rts/linker/elf_reloc_aarch64.h
@@ -7,4 +7,5 @@
 bool
 relocateObjectCodeAarch64(ObjectCode * oc);
 
+void flushInstructionCacheAarch64(ObjectCode *oc);
 #endif /* OBJETFORMAT_ELF */
Index: ghc-9.10.1/rts/linker/elf_reloc_riscv64.c
===================================================================
--- /dev/null
+++ ghc-9.10.1/rts/linker/elf_reloc_riscv64.c
@@ -0,0 +1,693 @@
+#include "elf_reloc_riscv64.h"
+#include "LinkerInternals.h"
+#include "Rts.h"
+#include "Stg.h"
+#include "SymbolExtras.h"
+#include "linker/ElfTypes.h"
+#include "elf_plt.h"
+#include "elf_util.h"
+#include "rts/Messages.h"
+#include "util.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#if defined(riscv64_HOST_ARCH)
+
+#if defined(OBJFORMAT_ELF)
+
+typedef uint64_t addr_t;
+
+/* regular instructions are 32bit */
+typedef uint32_t inst_t;
+
+/* compressed instructions are 16bit */
+typedef uint16_t cinst_t;
+
+// TODO: These functions could be static. They are not yet, because we might
+// need their debugging symbols.
+char *relocationTypeToString(Elf64_Xword type);
+int32_t decodeAddendRISCV64(Section *section, Elf_Rel *rel);
+bool encodeAddendRISCV64(Section *section, Elf_Rel *rel, int32_t addend);
+void write8le(uint8_t *p, uint8_t v);
+uint8_t read8le(const uint8_t *P);
+void write16le(cinst_t *p, uint16_t v);
+uint16_t read16le(const cinst_t *P);
+uint32_t read32le(const inst_t *P);
+void write32le(inst_t *p, uint32_t v);
+uint64_t read64le(const uint64_t *P);
+void write64le(uint64_t *p, uint64_t v);
+uint32_t extractBits(uint64_t v, uint32_t begin, uint32_t end);
+void setCJType(cinst_t *loc, uint32_t val);
+void setCBType(cinst_t *loc, uint32_t val);
+void setBType(inst_t *loc, uint32_t val);
+void setSType(inst_t *loc, uint32_t val);
+int32_t computeAddend(ElfRelocationATable * relaTab, unsigned relNo, Elf_Rel *rel, ElfSymbol *symbol,
+                      int64_t addend, ObjectCode *oc);
+void setJType(inst_t *loc, uint32_t val);
+void setIType(inst_t *loc, int32_t val);
+void checkInt(inst_t *loc, int32_t v, int n);
+void setUType(inst_t *loc, int32_t val);
+
+
+char *relocationTypeToString(Elf64_Xword type) {
+  switch (ELF64_R_TYPE(type)) {
+  case R_RISCV_NONE:
+    return "R_RISCV_NONE";
+  case R_RISCV_32:
+    return "R_RISCV_32";
+  case R_RISCV_64:
+    return "R_RISCV_64";
+  case R_RISCV_RELATIVE:
+    return "R_RISCV_RELATIVE";
+  case R_RISCV_COPY:
+    return "R_RISCV_COPY";
+  case R_RISCV_JUMP_SLOT:
+    return "R_RISCV_JUMP_SLOT";
+  case R_RISCV_TLS_DTPMOD32:
+    return "R_RISCV_TLS_DTPMOD32";
+  case R_RISCV_TLS_DTPMOD64:
+    return "R_RISCV_TLS_DTPMOD64";
+  case R_RISCV_TLS_DTPREL32:
+    return "R_RISCV_TLS_DTPREL32";
+  case R_RISCV_TLS_DTPREL64:
+    return "R_RISCV_TLS_DTPREL64";
+  case R_RISCV_TLS_TPREL32:
+    return "R_RISCV_TLS_TPREL32";
+  case R_RISCV_TLS_TPREL64:
+    return "R_RISCV_TLS_TPREL64";
+  case R_RISCV_BRANCH:
+    return "R_RISCV_BRANCH";
+  case R_RISCV_JAL:
+    return "R_RISCV_JAL";
+  case R_RISCV_CALL:
+    return "R_RISCV_CALL";
+  case R_RISCV_CALL_PLT:
+    return "R_RISCV_CALL_PLT";
+  case R_RISCV_GOT_HI20:
+    return "R_RISCV_GOT_HI20";
+  case R_RISCV_PCREL_HI20:
+    return "R_RISCV_PCREL_HI20";
+  case R_RISCV_LO12_I:
+    return "R_RISCV_LO12_I";
+  case R_RISCV_PCREL_LO12_I:
+    return "R_RISCV_PCREL_LO12_I";
+  case R_RISCV_HI20:
+    return "R_RISCV_HI20";
+  case R_RISCV_LO12_S:
+    return "R_RISCV_LO12_S";
+  case R_RISCV_PCREL_LO12_S:
+    return "R_RISCV_PCREL_LO12_S";
+  case R_RISCV_RELAX:
+    return "R_RISCV_RELAX";
+  case R_RISCV_RVC_BRANCH:
+    return "R_RISCV_RVC_BRANCH";
+  case R_RISCV_RVC_JUMP:
+    return "R_RISCV_RVC_JUMP";
+  default:
+    return "Unknown relocation type";
+  }
+}
+
+STG_NORETURN
+int32_t decodeAddendRISCV64(Section *section STG_UNUSED,
+                            Elf_Rel *rel STG_UNUSED) {
+  barf("decodeAddendRISCV64: Relocations with implicit addend are not supported."
+       " Please open a ticket and provide the offending code/binary.");
+}
+
+// Make sure that V can be represented as an N bit signed integer.
+void checkInt(inst_t *loc, int32_t v, int n) {
+  if (!isInt(n, v)) {
+    barf("Relocation at 0x%x is out of range. value: 0x%x (%d), "
+         "sign-extended value: 0x%x (%d), max bits 0x%x (%d)\n",
+         *loc, v, v, signExtend32(v, n), signExtend32(v, n), n, n);
+  }
+}
+
+// RISCV is little-endian by definition: We can rely on (implicit) casts.
+void write8le(uint8_t *p, uint8_t v) { *p = v; }
+
+// RISCV is little-endian by definition: We can rely on (implicit) casts.
+uint8_t read8le(const uint8_t *p) { return *p; }
+
+// RISCV is little-endian by definition: We can rely on (implicit) casts.
+void write16le(cinst_t *p, uint16_t v) { *p = v; }
+
+// RISCV is little-endian by definition: We can rely on (implicit) casts.
+uint16_t read16le(const cinst_t *p) { return *p; }
+
+// RISCV is little-endian by definition: We can rely on (implicit) casts.
+uint32_t read32le(const inst_t *p) { return *p; }
+
+// RISCV is little-endian by definition: We can rely on (implicit) casts.
+void write32le(inst_t *p, uint32_t v) { *p = v; }
+
+// RISCV is little-endian by definition: We can rely on (implicit) casts.
+uint64_t read64le(const uint64_t *p) { return *p; }
+
+// RISCV is little-endian by definition: We can rely on (implicit) casts.
+void write64le(uint64_t *p, uint64_t v) { *p = v; }
+
+uint32_t extractBits(uint64_t v, uint32_t begin, uint32_t end) {
+  return (v & ((1ULL << (begin + 1)) - 1)) >> end;
+}
+
+// Set immediate val in the instruction at *loc. In U-type instructions the
+// upper 20 bits of the instruction carry the upper 20 bits of the immediate.
+void setUType(inst_t *loc, int32_t val) {
+  const unsigned bits = 32;
+  uint32_t hi = val + 0x800;
+  checkInt(loc, signExtend32(hi, bits) >> 12, 20);
+  IF_DEBUG(linker, debugBelch("setUType: hi 0x%x val 0x%x\n", hi, val));
+
+  uint32_t imm = hi & 0xFFFFF000;
+  write32le(loc, (read32le(loc) & 0xFFF) | imm);
+}
+
+// Set immediate val in the instruction at *loc. In I-type instructions the
+// upper 12 bits of the instruction carry the lower 12 bits of the immediate.
+void setIType(inst_t *loc, int32_t val) {
+  uint64_t hi = (val + 0x800) >> 12;
+  uint64_t lo = val - (hi << 12);
+
+  IF_DEBUG(linker, debugBelch("setIType: hi 0x%lx lo 0x%lx\n", hi, lo));
+  IF_DEBUG(linker, debugBelch("setIType: loc %p *loc 0x%x val 0x%x\n", loc,
+                              *loc, val));
+
+  uint32_t imm = lo & 0xfff;
+  uint32_t instr = (read32le(loc) & 0xfffff) | (imm << 20);
+
+  IF_DEBUG(linker, debugBelch("setIType: insn 0x%x\n", instr));
+  write32le(loc, instr);
+  IF_DEBUG(linker, debugBelch("setIType: loc %p *loc' 0x%x val 0x%x\n", loc,
+                              *loc, val));
+}
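The (val + 0x800) >> 12 split used by setIType/setUType (and by makeStubRISCV64) compensates for the hardware sign-extension of the low 12 bits: lo always lands in [-0x800, 0x7ff], so hi must be rounded up whenever bit 11 of the value is set. A small standalone check of that invariant, illustrative only and not part of the patch:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      int32_t samples[] = {0, 0x7ff, 0x800, 0x1abc, -0x544};
      for (unsigned i = 0; i < sizeof samples / sizeof *samples; i++) {
        int32_t val = samples[i];
        int32_t hi = (val + 0x800) >> 12; // upper 20 bits, rounded
        int32_t lo = val - (hi << 12);    // remainder, sign-extended by the CPU
        assert(lo >= -0x800 && lo <= 0x7ff);
        assert((hi << 12) + lo == val);   // AUIPC/LUI part + I-type immediate
      }
      return 0;
    }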
+// Set immediate val in the instruction at *loc. In S-type instructions the
+// lower 12 bits of the immediate are at bits 7 to 11 ([4:0]) and 25 to 31
+// ([11:5]).
+void setSType(inst_t *loc, uint32_t val) {
+  uint64_t hi = (val + 0x800) >> 12;
+  uint64_t lo = val - (hi << 12);
+
+  uint32_t imm = lo;
+  uint32_t instr = (read32le(loc) & 0x1fff07f) | (extractBits(imm, 11, 5) << 25) |
+                   (extractBits(imm, 4, 0) << 7);
+
+  write32le(loc, instr);
+}
+
+// Set immediate val in the instruction at *loc. In J-type instructions the
+// immediate has 20bits which are pretty scattered:
+// instr bit -> imm bit
+// 31 -> 20
+// [30:21] -> [10:1]
+// 20 -> 11
+// [19:12] -> [19:12]
+//
+// N.B. bit 0 of the immediate is missing!
+void setJType(inst_t *loc, uint32_t val) {
+  checkInt(loc, val, 21);
+
+  uint32_t insn = read32le(loc) & 0xFFF;
+  uint32_t imm20 = extractBits(val, 20, 20) << 31;
+  uint32_t imm10_1 = extractBits(val, 10, 1) << 21;
+  uint32_t imm11 = extractBits(val, 11, 11) << 20;
+  uint32_t imm19_12 = extractBits(val, 19, 12) << 12;
+  insn |= imm20 | imm10_1 | imm11 | imm19_12;
+
+  write32le(loc, insn);
+}
+
+// Set immediate val in the instruction at *loc. In B-type instructions the
+// immediate has 12bits which are pretty scattered:
+// instr bit -> imm bit
+// 31 -> 12
+// [30:25] -> [10:5]
+// [11:8] -> [4:1]
+// 7 -> 11
+//
+// N.B. bit 0 of the immediate is missing!
+void setBType(inst_t *loc, uint32_t val) {
+  checkInt(loc, val, 13);
+
+  uint32_t insn = read32le(loc) & 0x1FFF07F;
+  uint32_t imm12 = extractBits(val, 12, 12) << 31;
+  uint32_t imm10_5 = extractBits(val, 10, 5) << 25;
+  uint32_t imm4_1 = extractBits(val, 4, 1) << 8;
+  uint32_t imm11 = extractBits(val, 11, 11) << 7;
+  insn |= imm12 | imm10_5 | imm4_1 | imm11;
+
+  write32le(loc, insn);
+}
+
+
+// Set immediate val in the instruction at *loc. CB-type instructions have a
+// length of 16 bits (half-word, compared to the usual 32bit/word instructions.)
+// The immediate has 8bits which are pretty scattered:
+// instr bit -> imm bit
+// 12 -> 8
+// [11:10] -> [4:3]
+// [6:5] -> [7:6]
+// [4:3] -> [2:1]
+// 2 -> 5
+//
+// N.B. bit 0 of the immediate is missing!
+void setCBType(cinst_t *loc, uint32_t val) {
+  checkInt((inst_t *)loc, val, 9);
+  uint16_t insn = read16le(loc) & 0xE383;
+  uint16_t imm8 = extractBits(val, 8, 8) << 12;
+  uint16_t imm4_3 = extractBits(val, 4, 3) << 10;
+  uint16_t imm7_6 = extractBits(val, 7, 6) << 5;
+  uint16_t imm2_1 = extractBits(val, 2, 1) << 3;
+  uint16_t imm5 = extractBits(val, 5, 5) << 2;
+  insn |= imm8 | imm4_3 | imm7_6 | imm2_1 | imm5;
+
+  write16le(loc, insn);
+}
+
+// Set immediate val in the instruction at *loc. CJ-type instructions have a
+// length of 16 bits (half-word, compared to the usual 32bit/word instructions.)
+// The immediate has 11bits which are pretty scattered:
+// instr bit -> imm bit
+// 12 -> 11
+// 11 -> 4
+// [10:9] -> [9:8]
+// 8 -> 10
+// 7 -> 6
+// 6 -> 7
+// [5:3] -> [3:1]
+// 2 -> 5
+//
+// N.B. bit 0 of the immediate is missing!
+void setCJType(cinst_t *loc, uint32_t val) {
+  checkInt((inst_t *)loc, val, 12);
+  uint16_t insn = read16le(loc) & 0xE003;
+  uint16_t imm11 = extractBits(val, 11, 11) << 12;
+  uint16_t imm4 = extractBits(val, 4, 4) << 11;
+  uint16_t imm9_8 = extractBits(val, 9, 8) << 9;
+  uint16_t imm10 = extractBits(val, 10, 10) << 8;
+  uint16_t imm6 = extractBits(val, 6, 6) << 7;
+  uint16_t imm7 = extractBits(val, 7, 7) << 6;
+  uint16_t imm3_1 = extractBits(val, 3, 1) << 3;
+  uint16_t imm5 = extractBits(val, 5, 5) << 2;
+  insn |= imm11 | imm4 | imm9_8 | imm10 | imm6 | imm7 | imm3_1 | imm5;
+
+  write16le(loc, insn);
+}
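All of these setters slice immediates with extractBits(v, begin, end), where begin is the most significant and end the least significant bit of the slice, both inclusive. A tiny illustration (not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    // Same definition as above: keep bits [begin..end] of v, shifted down.
    static uint32_t extractBits(uint64_t v, uint32_t begin, uint32_t end) {
      return (v & ((1ULL << (begin + 1)) - 1)) >> end;
    }

    int main(void) {
      assert(extractBits(0x2d, 5, 2) == 0xb);        // 0b101101 -> bits 5..2 = 0b1011
      assert(extractBits(0xdeadbeef, 31, 28) == 0xd);
      assert(extractBits(0xdeadbeef, 3, 0) == 0xf);
      return 0;
    }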
+// Encode the addend according to the relocation into the instruction.
+bool encodeAddendRISCV64(Section *section, Elf_Rel *rel, int32_t addend) {
+  // instruction to rewrite (P: Position of the relocation)
+  addr_t P = (addr_t)((uint8_t *)section->start + rel->r_offset);
+  IF_DEBUG(
+      linker,
+      debugBelch(
+          "Relocation type %s 0x%lx (%lu) symbol 0x%lx addend 0x%x (%u / "
+          "%d) P 0x%lx\n",
+          relocationTypeToString(rel->r_info), ELF64_R_TYPE(rel->r_info),
+          ELF64_R_TYPE(rel->r_info), ELF64_R_SYM(rel->r_info), addend,
+          addend, addend, P));
+  switch (ELF64_R_TYPE(rel->r_info)) {
+  case R_RISCV_32_PCREL:
+  case R_RISCV_32:
+    write32le((inst_t *)P, addend);
+    break;
+  case R_RISCV_64:
+    write64le((uint64_t *)P, addend);
+    break;
+  case R_RISCV_GOT_HI20:
+  case R_RISCV_PCREL_HI20:
+  case R_RISCV_HI20: {
+    setUType((inst_t *)P, addend);
+    break;
+  }
+  case R_RISCV_PCREL_LO12_I:
+  case R_RISCV_LO12_I: {
+    setIType((inst_t *)P, addend);
+    break;
+  }
+  case R_RISCV_RVC_JUMP: {
+    setCJType((cinst_t *)P, addend);
+    break;
+  }
+  case R_RISCV_RVC_BRANCH: {
+    setCBType((cinst_t *)P, addend);
+    break;
+  }
+  case R_RISCV_BRANCH: {
+    setBType((inst_t *)P, addend);
+    break;
+  }
+  case R_RISCV_CALL:
+  case R_RISCV_CALL_PLT: {
+    // We could relax more (in some cases), but right now the most important
+    // thing is to make it work.
+    setUType((inst_t *)P, addend);
+    setIType(((inst_t *)P) + 1, addend);
+    break;
+  }
+  case R_RISCV_JAL: {
+    setJType((inst_t *)P, addend);
+    break;
+  }
+  case R_RISCV_ADD8:
+    write8le((uint8_t *)P, read8le((uint8_t *)P) + addend);
+    break;
+  case R_RISCV_ADD16:
+    write16le((cinst_t *)P, read16le((cinst_t *)P) + addend);
+    break;
+  case R_RISCV_ADD32:
+    write32le((inst_t *)P, read32le((inst_t *)P) + addend);
+    break;
+  case R_RISCV_ADD64:
+    write64le((uint64_t *)P, read64le((uint64_t *)P) + addend);
+    break;
+  case R_RISCV_SUB6: {
+    uint8_t keep = *((uint8_t *)P) & 0xc0;
+    uint8_t imm = (((*(uint8_t *)P) & 0x3f) - addend) & 0x3f;
+
+    write8le((uint8_t *)P, keep | imm);
+    break;
+  }
+  case R_RISCV_SUB8:
+    write8le((uint8_t *)P, read8le((uint8_t *)P) - addend);
+    break;
+  case R_RISCV_SUB16:
+    write16le((cinst_t *)P, read16le((cinst_t *)P) - addend);
+    break;
+  case R_RISCV_SUB32:
+    write32le((inst_t *)P, read32le((inst_t *)P) - addend);
+    break;
+  case R_RISCV_SUB64:
+    write64le((uint64_t *)P, read64le((uint64_t *)P) - addend);
+    break;
+  case R_RISCV_SET6: {
+    uint8_t keep = *((uint8_t *)P) & 0xc0;
+    uint8_t imm = addend & 0x3f;
+
+    write8le((uint8_t *)P, keep | imm);
+    break;
+  }
+  case R_RISCV_SET8:
+    write8le((uint8_t *)P, addend);
+    break;
+  case R_RISCV_SET16:
+    write16le((cinst_t *)P, addend);
+    break;
+  case R_RISCV_SET32:
+    write32le((inst_t *)P, addend);
+    break;
+  case R_RISCV_PCREL_LO12_S:
+  case R_RISCV_TPREL_LO12_S:
+  case R_RISCV_LO12_S: {
+    setSType((inst_t *)P, addend);
+    break;
+  }
+  case R_RISCV_RELAX:
+  case R_RISCV_ALIGN:
+    // Relaxations (rewriting instructions into more efficient ones) could be
+    // implemented in the future. As the code already is aligned and we do
+    // not change the instruction sizes, we should get away with not aligning
+    // (though, that is cheating.) To align or change the instruction count, we
+    // would need machinery to squeeze or extend memory at the current location.
+    break;
+  default:
+    barf("Unhandled relocation 0x%lx\n", ELF64_R_TYPE(rel->r_info));
+  }
+  return EXIT_SUCCESS;
+}
+
+/**
+ * Compute the *new* addend for a relocation, given a pre-existing addend.
+ * @param relaTab The relocation table the relocation lives in.
+ * @param relNo The index of the relocation in its table.
+ * @param rel The relocation struct.
+ * @param symbol The target symbol.
+ * @param addend The existing addend. Either explicit or implicit.
+ * @param oc The object code.
+ * @return The new computed addend.
+ */
+int32_t computeAddend(ElfRelocationATable * relaTab, unsigned relNo, Elf_Rel *rel, ElfSymbol *symbol,
+                      int64_t addend, ObjectCode *oc) {
+  Section * section = &oc->sections[relaTab->targetSectionIndex];
+
+  // instruction to rewrite (P: Position of the relocation)
+  addr_t P = (addr_t)((uint8_t *)section->start + rel->r_offset);
+
+  CHECK(0x0 != P);
+  CHECK((uint64_t)section->start <= P);
+  CHECK(P <= (uint64_t)section->start + section->size);
+  // S: Value of the symbol in the symbol table
+  addr_t S = (addr_t)symbol->addr;
+  /* GOT slot for the symbol (G + GOT) */
+  addr_t GOT_S = (addr_t)symbol->got_addr;
+
+  // A: Addend field in the relocation entry associated with the symbol
+  int64_t A = addend;
+
+  IF_DEBUG(linker, debugBelch("%s: P 0x%lx S 0x%lx %s GOT_S 0x%lx A 0x%lx relNo %u\n",
+                              relocationTypeToString(rel->r_info), P, S,
+                              symbol->name, GOT_S, A, relNo));
+  switch (ELF64_R_TYPE(rel->r_info)) {
+  case R_RISCV_32:
+    return S + A;
+  case R_RISCV_64:
+    return S + A;
+  case R_RISCV_HI20:
+    return S + A;
+  case R_RISCV_JUMP_SLOT:
+    return S;
+  case R_RISCV_JAL:
+    return S + A - P;
+  case R_RISCV_PCREL_HI20:
+    return S + A - P;
+  case R_RISCV_LO12_I:
+    return S + A;
+  // Quoting LLVM docs: For R_RISCV_PC_INDIRECT (R_RISCV_PCREL_LO12_{I,S}),
+  // the symbol actually points to the corresponding R_RISCV_PCREL_HI20
+  // relocation, and the target VA is calculated using PCREL_HI20's symbol.
+  case R_RISCV_PCREL_LO12_S:
+    FALLTHROUGH;
+  case R_RISCV_PCREL_LO12_I: {
+    // Look up the related HI20 relocation and use that value. I'm still
+    // confused why relocations aren't self-contained, but this is how LLVM
+    // does it. And, calculating the lower 12 bits without any relationship to
+    // the GOT entry's address makes no sense either.
+    for (int64_t i = relNo; i >= 0 ; i--) {
+      Elf_Rela *rel_prime = &relaTab->relocations[i];
+
+      addr_t P_prime =
+          (addr_t)((uint8_t *)section->start + rel_prime->r_offset);
+
+      if (P_prime != S) {
+        // S points to the P of the corresponding *_HI20 relocation.
+        continue;
+      }
+
+      ElfSymbol *symbol_prime =
+          findSymbol(oc, relaTab->sectionHeader->sh_link,
+                     ELF64_R_SYM((Elf64_Xword)rel_prime->r_info));
+
+      CHECK(0x0 != symbol_prime);
+
+      /* take explicit addend */
+      int64_t addend_prime = rel_prime->r_addend;
+
+      uint64_t type_prime = ELF64_R_TYPE(rel_prime->r_info);
+
+      if (type_prime == R_RISCV_PCREL_HI20 ||
+          type_prime == R_RISCV_GOT_HI20 ||
+          type_prime == R_RISCV_TLS_GD_HI20 ||
+          type_prime == R_RISCV_TLS_GOT_HI20) {
+        IF_DEBUG(linker,
+                 debugBelch(
+                     "Found matching relocation: %s (P: 0x%lx, S: 0x%lx, "
+                     "sym-name: %s) -> %s (P: 0x%lx, S: %p, sym-name: %s, relNo: %ld)",
+                     relocationTypeToString(rel->r_info), P, S, symbol->name,
+                     relocationTypeToString(rel_prime->r_info), P_prime,
+                     symbol_prime->addr, symbol_prime->name, i));
+        int32_t result = computeAddend(relaTab, i, (Elf_Rel *)rel_prime,
+                                       symbol_prime, addend_prime, oc);
+        IF_DEBUG(linker, debugBelch("Result of computeAddend: 0x%x (%d)\n",
+                                    result, result));
+        return result;
+      }
+    }
+    debugBelch("Missing HI relocation for %s: P 0x%lx S 0x%lx %s\n",
+               relocationTypeToString(rel->r_info), P, S, symbol->name);
+    abort();
+  }
+
+  case R_RISCV_RVC_JUMP:
+    return S + A - P;
+  case R_RISCV_RVC_BRANCH:
+    return S + A - P;
+  case R_RISCV_BRANCH:
+    return S + A - P;
+  case R_RISCV_CALL:
+  case R_RISCV_CALL_PLT: {
+    addr_t GOT_Target;
+    if (GOT_S != 0) {
+      // 1. Public symbol with GOT entry.
+      GOT_Target = GOT_S;
+    } else {
+      // 2. Fake GOT entry with symbol extra entry.
+      SymbolExtra *symbolExtra = makeSymbolExtra(oc, ELF_R_SYM(rel->r_info), S);
+      addr_t* FAKE_GOT_S = &symbolExtra->addr;
+      IF_DEBUG(linker, debugBelch("R_RISCV_CALL_PLT w/ SymbolExtra = %p , "
+                                  "entry = %p\n",
+                                  symbolExtra, FAKE_GOT_S));
+      GOT_Target = (addr_t) FAKE_GOT_S;
+    }
+
+    if (findStub(section, (void **)&S, 0)) {
+      /* did not find it. Create a new stub. */
+      if (makeStub(section, (void **)&S, (void *)GOT_Target, 0)) {
+        abort(/* could not find or make stub */);
+      }
+    }
+    IF_DEBUG(linker, debugBelch("R_RISCV_CALL_PLT: S = 0x%lx A = 0x%lx P = "
+                                "0x%lx (S + A) - P = 0x%lx \n",
+                                S, A, P, (S + A) - P));
+    return (S + A) - P;
+  }
+  case R_RISCV_ADD8:
+    FALLTHROUGH;
+  case R_RISCV_ADD16:
+    FALLTHROUGH;
+  case R_RISCV_ADD32:
+    FALLTHROUGH;
+  case R_RISCV_ADD64:
+    return S + A; // Add V when the value is set
+  case R_RISCV_SUB6:
+    FALLTHROUGH;
+  case R_RISCV_SUB8:
+    FALLTHROUGH;
+  case R_RISCV_SUB16:
+    FALLTHROUGH;
+  case R_RISCV_SUB32:
+    FALLTHROUGH;
+  case R_RISCV_SUB64:
+    return S + A; // Subtract from V when value is set
+  case R_RISCV_SET6:
+    FALLTHROUGH;
+  case R_RISCV_SET8:
+    FALLTHROUGH;
+  case R_RISCV_SET16:
+    FALLTHROUGH;
+  case R_RISCV_SET32:
+    return S + A;
+  case R_RISCV_RELAX:
+    // This "relocation" has no addend.
+    FALLTHROUGH;
+  case R_RISCV_ALIGN:
+    // I guess we don't need to implement this relaxation. Otherwise, this
+    // should return the number of blank bytes to insert via NOPs.
+    return 0;
+  case R_RISCV_32_PCREL:
+    return S + A - P;
+  case R_RISCV_GOT_HI20: {
+    // TODO: Allocating extra memory for every symbol just to play this trick
+    // seems to be a bit obscene. (GOT relocations hitting local symbols
+    // happen, but not very often.) It would be better to allocate only what we
+    // really need.
+
+    // There are two cases here: 1. The symbol is public and has an entry in the
+    // GOT. 2. It's local and has no corresponding GOT entry. The first case is
+    // easy: We simply calculate the addend with the GOT address. In the second
+    // case we create a symbol extra entry and pretend it's the GOT.
+    if (GOT_S != 0) {
+      // 1. Public symbol with GOT entry.
+      return GOT_S + A - P;
+    } else {
+      // 2. Fake GOT entry with symbol extra entry.
+      SymbolExtra *symbolExtra = makeSymbolExtra(oc, ELF_R_SYM(rel->r_info), S);
+      addr_t* FAKE_GOT_S = &symbolExtra->addr;
+      addr_t res = (addr_t) FAKE_GOT_S + A - P;
+      IF_DEBUG(linker, debugBelch("R_RISCV_GOT_HI20 w/ SymbolExtra = %p , "
+                                  "entry = %p , reloc-addend = 0x%lx ",
+                                  symbolExtra, FAKE_GOT_S, res));
+      return res;
+    }
+  }
+  default:
+    barf("Unimplemented relocation: 0x%lx (%lu)\n",
+         ELF64_R_TYPE(rel->r_info), ELF64_R_TYPE(rel->r_info));
+  }
+  barf("This should never happen!");
+}
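The PCREL_HI20/PCREL_LO12 pairing handled above is easiest to see with concrete numbers: the LO12 relocation's symbol points at the site of the matching HI20 relocation (the auipc), and both immediate halves are derived from that site's offset to the real target. A worked sketch with hypothetical addresses (not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      uint64_t P_hi   = 0x10000; // site of the auipc carrying R_RISCV_PCREL_HI20
      uint64_t target = 0x12468; // what the auipc/addi pair must materialize
      int64_t  off    = (int64_t)(target - P_hi);

      int64_t hi = (off + 0x800) >> 12;
      int64_t lo = off - (hi << 12);

      // auipc rd, hi            => rd = P_hi + (hi << 12)
      // addi  rd, rd, lo        (R_RISCV_PCREL_LO12_I, symbol = P_hi)
      assert(P_hi + (hi << 12) + lo == target);
      return 0;
    }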
+// Iterate over all relocations and perform them.
+bool relocateObjectCodeRISCV64(ObjectCode *oc) {
+  for (ElfRelocationTable *relTab = oc->info->relTable; relTab != NULL;
+       relTab = relTab->next) {
+    /* only relocate interesting sections */
+    if (SECTIONKIND_OTHER == oc->sections[relTab->targetSectionIndex].kind)
+      continue;
+
+    Section *targetSection = &oc->sections[relTab->targetSectionIndex];
+
+    for (unsigned i = 0; i < relTab->n_relocations; i++) {
+      Elf_Rel *rel = &relTab->relocations[i];
+
+      ElfSymbol *symbol = findSymbol(oc, relTab->sectionHeader->sh_link,
+                                     ELF64_R_SYM((Elf64_Xword)rel->r_info));
+
+      CHECK(0x0 != symbol);
+
+      // This always fails, because we don't support Rel relocations yet. Do we
+      // need this case? Leaving it in to spot the potential bug when it appears.
+      /* decode implicit addend */
+      int64_t addend = decodeAddendRISCV64(targetSection, rel);
+
+      addend = computeAddend((ElfRelocationATable*) relTab, i, rel, symbol, addend, oc);
+      encodeAddendRISCV64(targetSection, rel, addend);
+    }
+  }
+  for (ElfRelocationATable *relaTab = oc->info->relaTable; relaTab != NULL;
+       relaTab = relaTab->next) {
+    /* only relocate interesting sections */
+    if (SECTIONKIND_OTHER == oc->sections[relaTab->targetSectionIndex].kind)
+      continue;
+
+    Section *targetSection = &oc->sections[relaTab->targetSectionIndex];
+
+    for (unsigned i = 0; i < relaTab->n_relocations; i++) {
+
+      Elf_Rela *rel = &relaTab->relocations[i];
+
+      ElfSymbol *symbol = findSymbol(oc, relaTab->sectionHeader->sh_link,
+                                     ELF64_R_SYM((Elf64_Xword)rel->r_info));
+
+      CHECK(0x0 != symbol);
+
+      /* take explicit addend */
+      int64_t addend = rel->r_addend;
+
+      addend = computeAddend(relaTab, i, (Elf_Rel *)rel, symbol, addend, oc);
+      encodeAddendRISCV64(targetSection, (Elf_Rel *)rel, addend);
+    }
+  }
+  return EXIT_SUCCESS;
+}
+
+void flushInstructionCacheRISCV64(ObjectCode *oc) {
+  // Synchronize the memory and instruction cache to prevent illegal instruction
+  // exceptions. On Linux the parameters of __builtin___clear_cache are
+  // currently unused. Add them anyway for future compatibility. (I.e. the
+  // parameters couldn't be checked during development.)
+
+  /* The main object code */
+  void *codeBegin = oc->image + oc->misalignment;
+  __builtin___clear_cache(codeBegin, (void*) ((uint64_t*) codeBegin + oc->fileSize));
+
+  /* Jump Islands */
+  __builtin___clear_cache((void *)oc->symbol_extras,
+                          (void *)(oc->symbol_extras + oc->n_symbol_extras));
+
+  // Memory barrier to ensure nothing circumvents the fence.i / cache flushes.
+  SEQ_CST_FENCE();
+}
+
+#endif /* OBJECTFORMAT_ELF */
+#endif /* riscv64_HOST_ARCH */
Index: ghc-9.10.1/rts/linker/elf_reloc_riscv64.h
===================================================================
--- /dev/null
+++ ghc-9.10.1/rts/linker/elf_reloc_riscv64.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include "LinkerInternals.h"
+
+#if defined(OBJFORMAT_ELF)
+
+bool
+relocateObjectCodeRISCV64(ObjectCode * oc);
+
+void flushInstructionCacheRISCV64(ObjectCode *oc);
+#endif /* OBJFORMAT_ELF */
Index: ghc-9.10.1/rts/rts.cabal
===================================================================
--- ghc-9.10.1.orig/rts/rts.cabal
+++ ghc-9.10.1/rts/rts.cabal
@@ -468,9 +468,11 @@ library
                  linker/elf_got.c
                  linker/elf_plt.c
                  linker/elf_plt_aarch64.c
+                 linker/elf_plt_riscv64.c
                  linker/elf_plt_arm.c
                  linker/elf_reloc.c
                  linker/elf_reloc_aarch64.c
+                 linker/elf_reloc_riscv64.c
                  linker/elf_tlsgd.c
                  linker/elf_util.c
                  sm/BlockAlloc.c
Index: ghc-9.10.1/testsuite/tests/codeGen/should_run/CCallConv.hs
===================================================================
--- /dev/null
+++ ghc-9.10.1/testsuite/tests/codeGen/should_run/CCallConv.hs
@@ -0,0 +1,123 @@
+{-# LANGUAGE ForeignFunctionInterface #-}
+{-# LANGUAGE GHCForeignImportPrim #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+{-# LANGUAGE UnliftedFFITypes #-}
+
+-- | This test ensures that sub-word signed and unsigned parameters are
+-- correctly handed over to C functions. I.e., it asserts the calling
+-- convention.
+--
+-- The number of parameters is currently shaped for the RISCV64 calling
+-- convention. You may need to add more parameters to the C functions in case
+-- there are more registers reserved for parameters in your architecture.
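Background for the test below: the RISC-V psABI widens sub-word integer arguments to register width at the call site, signed types sign-extended and unsigned types zero-extended. Summing an int8_t -1 with a uint8_t 255 therefore yields 254 per signed/unsigned pair, which is the arithmetic behind the expected results. A minimal illustration (not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    /* The callee receives sub-word arguments already extended: the signed -1
     * arrives as all-ones (sign-extended), the unsigned 255 as 0xff
     * (zero-extended). */
    static int64_t pair(int8_t s, uint8_t u) {
      return (int64_t)s + (int64_t)u; /* -1 + 255 = 254 */
    }

    int main(void) {
      printf("%lld\n", (long long)pair(-1, 255)); /* prints 254 */
      return 0;
    }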
+module Main where
+
+import Data.Word
+import GHC.Exts
+import GHC.Int
+
+foreign import ccall "fun8"
+  fun8 ::
+    Int8# -> -- a0
+    Word8# -> -- a1
+    Int8# -> -- a2
+    Int8# -> -- a3
+    Int8# -> -- a4
+    Int8# -> -- a5
+    Int8# -> -- a6
+    Int8# -> -- a7
+    Word8# -> -- s0
+    Int8# -> -- s1
+    Int64# -- result
+
+foreign import ccall "fun16"
+  fun16 ::
+    Int16# -> -- a0
+    Word16# -> -- a1
+    Int16# -> -- a2
+    Int16# -> -- a3
+    Int16# -> -- a4
+    Int16# -> -- a5
+    Int16# -> -- a6
+    Int16# -> -- a7
+    Word16# -> -- s0
+    Int16# -> -- s1
+    Int64# -- result
+
+foreign import ccall "fun32"
+  fun32 ::
+    Int32# -> -- a0
+    Word32# -> -- a1
+    Int32# -> -- a2
+    Int32# -> -- a3
+    Int32# -> -- a4
+    Int32# -> -- a5
+    Int32# -> -- a6
+    Int32# -> -- a7
+    Word32# -> -- s0
+    Int32# -> -- s1
+    Int64# -- result
+
+foreign import ccall "funFloat"
+  funFloat ::
+    Float# -> -- a0
+    Float# -> -- a1
+    Float# -> -- a2
+    Float# -> -- a3
+    Float# -> -- a4
+    Float# -> -- a5
+    Float# -> -- a6
+    Float# -> -- a7
+    Float# -> -- s0
+    Float# -> -- s1
+    Float# -- result
+
+foreign import ccall "funDouble"
+  funDouble ::
+    Double# -> -- a0
+    Double# -> -- a1
+    Double# -> -- a2
+    Double# -> -- a3
+    Double# -> -- a4
+    Double# -> -- a5
+    Double# -> -- a6
+    Double# -> -- a7
+    Double# -> -- s0
+    Double# -> -- s1
+    Double# -- result
+
+main :: IO ()
+main =
+  -- N.B. the values here aren't chosen by accident: -1 means all bits set in
+  -- two's complement, which is the same as the max word value.
+  let i8 :: Int8# = intToInt8# (-1#)
+      w8 :: Word8# = wordToWord8# (255##)
+      res8 :: Int64# = fun8 i8 w8 i8 i8 i8 i8 i8 i8 w8 i8
+      expected_res8 :: Int64 = 2 * (fromInteger . fromIntegral) (maxBound :: Word8) + 8 * (-1)
+      i16 :: Int16# = intToInt16# (-1#)
+      w16 :: Word16# = wordToWord16# (65535##)
+      res16 :: Int64# = fun16 i16 w16 i16 i16 i16 i16 i16 i16 w16 i16
+      expected_res16 :: Int64 = 2 * (fromInteger . fromIntegral) (maxBound :: Word16) + 8 * (-1)
+      i32 :: Int32# = intToInt32# (-1#)
+      w32 :: Word32# = wordToWord32# (4294967295##)
+      res32 :: Int64# = fun32 i32 w32 i32 i32 i32 i32 i32 i32 w32 i32
+      expected_res32 :: Int64 = 2 * (fromInteger . fromIntegral) (maxBound :: Word32) + 8 * (-1)
+      resFloat :: Float = F# (funFloat 1.0# 1.1# 1.2# 1.3# 1.4# 1.5# 1.6# 1.7# 1.8# 1.9#)
+      resDouble :: Double = D# (funDouble 1.0## 1.1## 1.2## 1.3## 1.4## 1.5## 1.6## 1.7## 1.8## 1.9##)
+   in do
+        print $ "fun8 result:" ++ show (I64# res8)
+        assertEqual expected_res8 (I64# res8)
+        print $ "fun16 result:" ++ show (I64# res16)
+        assertEqual expected_res16 (I64# res16)
+        print $ "fun32 result:" ++ show (I64# res32)
+        assertEqual expected_res32 (I64# res32)
+        print $ "funFloat result:" ++ show resFloat
+        assertEqual (14.5 :: Float) resFloat
+        print $ "funDouble result:" ++ show resDouble
+        assertEqual (14.5 :: Double) resDouble
+
+assertEqual :: (Eq a, Show a) => a -> a -> IO ()
+assertEqual a b =
+  if a == b
+    then pure ()
+    else error $ show a ++ " =/= " ++ show b
Index: ghc-9.10.1/testsuite/tests/codeGen/should_run/CCallConv.stdout
===================================================================
--- /dev/null
+++ ghc-9.10.1/testsuite/tests/codeGen/should_run/CCallConv.stdout
@@ -0,0 +1,60 @@
+"fun8 result:502"
+"fun16 result:131062"
+"fun32 result:8589934582"
+"funFloat result:14.5"
+"funDouble result:14.5"
+fun32:
+a0: 0xffffffff -1
+a1: 0xffffffff 4294967295
+a2: 0xffffffff -1
+a3: 0xffffffff -1
+a4: 0xffffffff -1
+a5: 0xffffffff -1
+a6: 0xffffffff -1
+a7: 0xffffffff -1
+s0: 0xffffffff -1
+s1: 0xffffffff 4294967295
+fun16:
+a0: 0xffffffff -1
+a1: 0xffff 65535
+a2: 0xffffffff -1
+a3: 0xffffffff -1
+a4: 0xffffffff -1
+a5: 0xffffffff -1
+a6: 0xffffffff -1
+a7: 0xffffffff -1
+s0: 0xffffffff -1
+s1: 0xffff 65535
+fun8:
+a0: 0xffffffff -1
+a1: 0xff 255
+a2: 0xffffffff -1
+a3: 0xffffffff -1
+a4: 0xffffffff -1
+a5: 0xffffffff -1
+a6: 0xffffffff -1
+a7: 0xffffffff -1
+s0: 0xffffffff -1
+s1: 0xff 255
+funFloat:
+a0: 1.000000
+a1: 1.100000
+a2: 1.200000
+a3: 1.300000
+a4: 1.400000
+a5: 1.500000
+a6: 1.600000
+a7: 1.700000
+s0: 1.800000
+s1: 1.900000
+funDouble:
+a0: 1.000000
+a1: 1.100000
+a2: 1.200000
+a3: 1.300000
+a4: 1.400000
+a5: 1.500000
+a6: 1.600000
+a7: 1.700000
+s0: 1.800000
+s1: 1.900000
Index: ghc-9.10.1/testsuite/tests/codeGen/should_run/CCallConv_c.c
===================================================================
--- /dev/null
+++ ghc-9.10.1/testsuite/tests/codeGen/should_run/CCallConv_c.c
@@ -0,0 +1,91 @@
+#include "stdint.h"
+#include "stdio.h"
+
+int64_t fun8(int8_t a0, uint8_t a1, int8_t a2, int8_t a3, int8_t a4, int8_t a5,
+             int8_t a6, int8_t a7, int8_t s0, uint8_t s1) {
+  printf("fun8:\n");
+  printf("a0: %#x %hhd\n", a0, a0);
+  printf("a1: %#x %hhu\n", a1, a1);
+  printf("a2: %#x %hhd\n", a2, a2);
+  printf("a3: %#x %hhd\n", a3, a3);
+  printf("a4: %#x %hhd\n", a4, a4);
+  printf("a5: %#x %hhd\n", a5, a5);
+  printf("a6: %#x %hhd\n", a6, a6);
+  printf("a7: %#x %hhd\n", a7, a7);
+  printf("s0: %#x %hhd\n", s0, s0);
+  printf("s1: %#x %hhu\n", s1, s1);
+
+  return a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + s0 + s1;
+}
+
+int64_t fun16(int16_t a0, uint16_t a1, int16_t a2, int16_t a3, int16_t a4,
+              int16_t a5, int16_t a6, int16_t a7, int16_t s0, uint16_t s1) {
+  printf("fun16:\n");
+  printf("a0: %#x %hd\n", a0, a0);
+  printf("a1: %#x %hu\n", a1, a1);
+  printf("a2: %#x %hd\n", a2, a2);
+  printf("a3: %#x %hd\n", a3, a3);
+  printf("a4: %#x %hd\n", a4, a4);
+  printf("a5: %#x %hd\n", a5, a5);
+  printf("a6: %#x %hd\n", a6, a6);
+  printf("a7: %#x %hd\n", a7, a7);
+  printf("s0: %#x %hd\n", s0, s0);
+  printf("s1: %#x %hu\n", s1, s1);
+
+  return a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + s0 + s1;
+}
+
+int64_t fun32(int32_t a0, uint32_t a1, int32_t a2, int32_t a3, int32_t a4,
+              int32_t a5, int32_t a6, int32_t a7, int32_t s0, uint32_t s1) {
+  printf("fun32:\n");
+  printf("a0: %#x %d\n", a0, a0);
+  printf("a1: %#x %u\n", a1, a1);
+  printf("a2: %#x %d\n", a2, a2);
+  printf("a3: %#x %d\n", a3, a3);
+  printf("a4: %#x %d\n", a4, a4);
+  printf("a5: %#x %d\n", a5, a5);
+  printf("a6: %#x %d\n", a6, a6);
+  printf("a7: %#x %d\n", a7, a7);
+  printf("s0: %#x %d\n", s0, s0);
+  printf("s1: %#x %u\n", s1, s1);
+
+  // Ensure the addition happens in long int (not just int) precision.
+  // Otherwise, the result is truncated during the operation.
+  int64_t force_int64_precision = 0;
+  return force_int64_precision + a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + s0 +
+         s1;
+}
+
+float funFloat(float a0, float a1, float a2, float a3, float a4, float a5,
+               float a6, float a7, float s0, float s1) {
+  printf("funFloat:\n");
+  printf("a0: %f\n", a0);
+  printf("a1: %f\n", a1);
+  printf("a2: %f\n", a2);
+  printf("a3: %f\n", a3);
+  printf("a4: %f\n", a4);
+  printf("a5: %f\n", a5);
+  printf("a6: %f\n", a6);
+  printf("a7: %f\n", a7);
+  printf("s0: %f\n", s0);
+  printf("s1: %f\n", s1);
+
+  return a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + s0 + s1;
+}
+
+double funDouble(double a0, double a1, double a2, double a3, double a4, double a5,
+                 double a6, double a7, double s0, double s1) {
+  printf("funDouble:\n");
+  printf("a0: %f\n", a0);
+  printf("a1: %f\n", a1);
+  printf("a2: %f\n", a2);
+  printf("a3: %f\n", a3);
+  printf("a4: %f\n", a4);
+  printf("a5: %f\n", a5);
+  printf("a6: %f\n", a6);
+  printf("a7: %f\n", a7);
+  printf("s0: %f\n", s0);
+  printf("s1: %f\n", s1);
+
+  return a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + s0 + s1;
+}
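For reference, the sums asserted by CCallConv.stdout can be recomputed directly: each function receives two unsigned maxima and eight -1 values. A small standalone check (not part of the patch):

    #include <stdio.h>

    int main(void) {
      printf("%lld\n", 2 * 255LL + 8 * -1LL);        /* 502        (fun8)  */
      printf("%lld\n", 2 * 65535LL + 8 * -1LL);      /* 131062     (fun16) */
      printf("%lld\n", 2 * 4294967295LL + 8 * -1LL); /* 8589934582 (fun32) */
      /* The float/double variants sum 1.0 .. 1.9, i.e. five pairs of 2.9: */
      printf("%.1f\n", 1.0 + 1.1 + 1.2 + 1.3 + 1.4 + 1.5 + 1.6 + 1.7 + 1.8 + 1.9); /* 14.5 */
      return 0;
    }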