Index: auxprogs/genoffsets.c =================================================================== --- auxprogs/genoffsets.c (.../tags/VEX_3_9_0) (revision 2863) +++ auxprogs/genoffsets.c (.../trunk) (revision 2863) @@ -51,6 +51,7 @@ #include "../pub/libvex_guest_ppc32.h" #include "../pub/libvex_guest_ppc64.h" #include "../pub/libvex_guest_arm.h" +#include "../pub/libvex_guest_arm64.h" #include "../pub/libvex_guest_s390x.h" #include "../pub/libvex_guest_mips32.h" #include "../pub/libvex_guest_mips64.h" @@ -159,6 +160,19 @@ GENOFFSET(ARM,arm,R14); GENOFFSET(ARM,arm,R15T); + // arm64 + GENOFFSET(ARM64,arm64,X0); + GENOFFSET(ARM64,arm64,X1); + GENOFFSET(ARM64,arm64,X2); + GENOFFSET(ARM64,arm64,X3); + GENOFFSET(ARM64,arm64,X4); + GENOFFSET(ARM64,arm64,X5); + GENOFFSET(ARM64,arm64,X6); + GENOFFSET(ARM64,arm64,X7); + GENOFFSET(ARM64,arm64,X8); + GENOFFSET(ARM64,arm64,XSP); + GENOFFSET(ARM64,arm64,PC); + // s390x GENOFFSET(S390X,s390x,r2); GENOFFSET(S390X,s390x,r3); Index: priv/guest_amd64_helpers.c =================================================================== --- priv/guest_amd64_helpers.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/guest_amd64_helpers.c (.../trunk) (revision 2863) @@ -3777,6 +3777,7 @@ vex_state->guest_DFLAG = 1; /* forwards */ vex_state->guest_IDFLAG = 0; + vex_state->guest_ACFLAG = 0; /* HACK: represent the offset associated with %fs==0. This assumes that %fs is only ever zero. */ @@ -3817,8 +3818,8 @@ /* These should not ever be either read or written, but we initialise them anyway. */ - vex_state->guest_TISTART = 0; - vex_state->guest_TILEN = 0; + vex_state->guest_CMSTART = 0; + vex_state->guest_CMLEN = 0; vex_state->guest_NRADDR = 0; vex_state->guest_SC_CLASS = 0; @@ -3923,8 +3924,8 @@ // /* */ ALWAYSDEFD(guest_GDT), /* 10 */ ALWAYSDEFD(guest_EMNOTE), /* 11 */ ALWAYSDEFD(guest_SSEROUND), - /* 12 */ ALWAYSDEFD(guest_TISTART), - /* 13 */ ALWAYSDEFD(guest_TILEN), + /* 12 */ ALWAYSDEFD(guest_CMSTART), + /* 13 */ ALWAYSDEFD(guest_CMLEN), /* 14 */ ALWAYSDEFD(guest_SC_CLASS), /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL) } Index: priv/guest_amd64_toIR.c =================================================================== --- priv/guest_amd64_toIR.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/guest_amd64_toIR.c (.../trunk) (revision 2863) @@ -51,10 +51,6 @@ float-to-float rounding. For all other operations, round-to-nearest is used, regardless. - * FP sin/cos/tan/sincos: C2 flag is always cleared. IOW the - simulation claims the argument is in-range (-2^63 <= arg <= 2^63) - even when it isn't. - * some of the FCOM cases could do with testing -- not convinced that the args are the right way round. @@ -436,8 +432,8 @@ #define OFFB_YMM16 offsetof(VexGuestAMD64State,guest_YMM16) #define OFFB_EMNOTE offsetof(VexGuestAMD64State,guest_EMNOTE) -#define OFFB_TISTART offsetof(VexGuestAMD64State,guest_TISTART) -#define OFFB_TILEN offsetof(VexGuestAMD64State,guest_TILEN) +#define OFFB_CMSTART offsetof(VexGuestAMD64State,guest_CMSTART) +#define OFFB_CMLEN offsetof(VexGuestAMD64State,guest_CMLEN) #define OFFB_NRADDR offsetof(VexGuestAMD64State,guest_NRADDR) @@ -771,10 +767,10 @@ return toBool( ! haveNo66noF2noF3(pfx) ); } -/* Return True iff pfx has 66 or F2 set */ -static Bool have66orF2 ( Prefix pfx ) +/* Return True iff pfx has 66 or F3 set */ +static Bool have66orF3 ( Prefix pfx ) { - return toBool((pfx & (PFX_66|PFX_F2)) > 0); + return toBool((pfx & (PFX_66|PFX_F3)) > 0); } /* Clear all the segment-override bits in a prefix. 
*/ @@ -4270,8 +4266,12 @@ modrm = getUChar(delta); if (epartIsReg(modrm)) { - /* F2/XACQ and F3/XREL are always invalid in the non-mem case. */ - if (haveF2orF3(pfx)) goto unhandledR; + /* F2/XACQ and F3/XREL are always invalid in the non-mem case. + F2/CALL and F2/JMP may have bnd prefix. */ + if (haveF2orF3(pfx) + && ! (haveF2(pfx) + && (gregLO3ofRM(modrm) == 2 || gregLO3ofRM(modrm) == 4))) + goto unhandledR; assign(t1, getIRegE(sz,pfx,modrm)); switch (gregLO3ofRM(modrm)) { case 0: /* INC */ @@ -4291,6 +4291,7 @@ case 2: /* call Ev */ /* Ignore any sz value and operate as if sz==8. */ if (!(sz == 4 || sz == 8)) goto unhandledR; + if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */ sz = 8; t3 = newTemp(Ity_I64); assign(t3, getIRegE(sz,pfx,modrm)); @@ -4306,6 +4307,7 @@ case 4: /* jmp Ev */ /* Ignore any sz value and operate as if sz==8. */ if (!(sz == 4 || sz == 8)) goto unhandledR; + if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */ sz = 8; t3 = newTemp(Ity_I64); assign(t3, getIRegE(sz,pfx,modrm)); @@ -4338,11 +4340,14 @@ showSz ? nameISize(sz) : ' ', nameIRegE(sz, pfx, modrm)); } else { - /* Decide if F2/XACQ or F3/XREL might be valid. */ + /* Decide if F2/XACQ, F3/XREL, F2/CALL or F2/JMP might be valid. */ Bool validF2orF3 = haveF2orF3(pfx) ? False : True; if ((gregLO3ofRM(modrm) == 0/*INC*/ || gregLO3ofRM(modrm) == 1/*DEC*/) && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) { validF2orF3 = True; + } else if ((gregLO3ofRM(modrm) == 2 || gregLO3ofRM(modrm) == 4) + && (haveF2(pfx) && !haveF3(pfx))) { + validF2orF3 = True; } if (!validF2orF3) goto unhandledM; /* */ @@ -4379,6 +4384,7 @@ case 2: /* call Ev */ /* Ignore any sz value and operate as if sz==8. */ if (!(sz == 4 || sz == 8)) goto unhandledM; + if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */ sz = 8; t3 = newTemp(Ity_I64); assign(t3, loadLE(Ity_I64,mkexpr(addr))); @@ -4394,6 +4400,7 @@ case 4: /* JMP Ev */ /* Ignore any sz value and operate as if sz==8. */ if (!(sz == 4 || sz == 8)) goto unhandledM; + if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */ sz = 8; t3 = newTemp(Ity_I64); assign(t3, loadLE(Ity_I64,mkexpr(addr))); @@ -5066,6 +5073,42 @@ } +/* Given i, and some expression e, and a condition cond, generate IR + which has the same effect as put_ST(i,e) when cond is true and has + no effect when cond is false. Given the lack of proper + if-then-else in the IR, this is pretty tricky. +*/ + +static void maybe_put_ST ( IRTemp cond, Int i, IRExpr* value ) +{ + // new_tag = if cond then FULL else old_tag + // new_val = if cond then (if old_tag==FULL then NaN else val) + // else old_val + + IRTemp old_tag = newTemp(Ity_I8); + assign(old_tag, get_ST_TAG(i)); + IRTemp new_tag = newTemp(Ity_I8); + assign(new_tag, + IRExpr_ITE(mkexpr(cond), mkU8(1)/*FULL*/, mkexpr(old_tag))); + + IRTemp old_val = newTemp(Ity_F64); + assign(old_val, get_ST_UNCHECKED(i)); + IRTemp new_val = newTemp(Ity_F64); + assign(new_val, + IRExpr_ITE(mkexpr(cond), + IRExpr_ITE(binop(Iop_CmpNE8, mkexpr(old_tag), mkU8(0)), + /* non-0 means full */ + mkQNaN64(), + /* 0 means empty */ + value), + mkexpr(old_val))); + + put_ST_UNCHECKED(i, mkexpr(new_val)); + // put_ST_UNCHECKED incorrectly sets tag(i) to always be FULL. So + // now set it to new_tag instead. + put_ST_TAG(i, mkexpr(new_tag)); +} + /* Adjust FTOP downwards by one register. */ static void fp_push ( void ) @@ -5073,6 +5116,14 @@ put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) ); } +/* Adjust FTOP downwards by one register when COND is 1:I1. Else + don't change it. 
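(The implementation simply subtracts the zero-extended condition bit from FTOP, so no conditional branch is needed in the IR.)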
*/ + +static void maybe_fp_push ( IRTemp cond ) +{ + put_ftop( binop(Iop_Sub32, get_ftop(), unop(Iop_1Uto32,mkexpr(cond))) ); +} + /* Adjust FTOP upwards by one register, and mark the vacated register as empty. */ @@ -5082,12 +5133,49 @@ put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) ); } -/* Clear the C2 bit of the FPU status register, for - sin/cos/tan/sincos. */ +/* Set the C2 bit of the FPU status register to e[0]. Assumes that + e[31:1] == 0. +*/ +static void set_C2 ( IRExpr* e ) +{ + IRExpr* cleared = binop(Iop_And64, get_C3210(), mkU64(~AMD64G_FC_MASK_C2)); + put_C3210( binop(Iop_Or64, + cleared, + binop(Iop_Shl64, e, mkU8(AMD64G_FC_SHIFT_C2))) ); +} -static void clear_C2 ( void ) +/* Generate code to check that abs(d64) < 2^63 and is finite. This is + used to do the range checks for FSIN, FCOS, FSINCOS and FPTAN. The + test is simple, but the derivation of it is not so simple. + + The exponent field for an IEEE754 double is 11 bits. That means it + can take values 0 through 0x7FF. If the exponent has value 0x7FF, + the number is either a NaN or an Infinity and so is not finite. + Furthermore, a finite value of exactly 2^63 is the smallest value + that has exponent value 0x43E. Hence, what we need to do is + extract the exponent, ignoring the sign bit and mantissa, and check + it is < 0x43E, or <= 0x43D. + + To make this easily applicable to 32- and 64-bit targets, a + roundabout approach is used. First the number is converted to I64, + then the top 32 bits are taken. Shifting them right by 20 bits + places the sign bit and exponent in the bottom 12 bits. Anding + with 0x7FF gets rid of the sign bit, leaving just the exponent + available for comparison. +*/ +static IRTemp math_IS_TRIG_ARG_FINITE_AND_IN_RANGE ( IRTemp d64 ) { - put_C3210( binop(Iop_And64, get_C3210(), mkU64(~AMD64G_FC_MASK_C2)) ); + IRTemp i64 = newTemp(Ity_I64); + assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(d64)) ); + IRTemp exponent = newTemp(Ity_I32); + assign(exponent, + binop(Iop_And32, + binop(Iop_Shr32, unop(Iop_64HIto32, mkexpr(i64)), mkU8(20)), + mkU32(0x7FF))); + IRTemp in_range_and_finite = newTemp(Ity_I1); + assign(in_range_and_finite, + binop(Iop_CmpLE32U, mkexpr(exponent), mkU32(0x43D))); + return in_range_and_finite; } /* Invent a plausible-looking FPU status word value: @@ -5717,16 +5805,31 @@ fp_pop(); break; - case 0xF2: /* FPTAN */ - DIP("ftan\n"); - put_ST_UNCHECKED(0, - binop(Iop_TanF64, - get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */ - get_ST(0))); - fp_push(); - put_ST(0, IRExpr_Const(IRConst_F64(1.0))); - clear_C2(); /* HACK */ + case 0xF2: { /* FPTAN */ + DIP("fptan\n"); + IRTemp argD = newTemp(Ity_F64); + assign(argD, get_ST(0)); + IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD); + IRTemp resD = newTemp(Ity_F64); + assign(resD, + IRExpr_ITE( + mkexpr(argOK), + binop(Iop_TanF64, + get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */ + mkexpr(argD)), + mkexpr(argD)) + ); + put_ST_UNCHECKED(0, mkexpr(resD)); + /* Conditionally push 1.0 on the stack, if the arg is + in range */ + maybe_fp_push(argOK); + maybe_put_ST(argOK, 0, + IRExpr_Const(IRConst_F64(1.0))); + set_C2( binop(Iop_Xor64, + unop(Iop_1Uto64, mkexpr(argOK)), + mkU64(1)) ); break; + } case 0xF3: /* FPATAN */ DIP("fpatan\n"); @@ -5842,19 +5945,30 @@ break; case 0xFB: { /* FSINCOS */ - IRTemp a1 = newTemp(Ity_F64); - assign( a1, get_ST(0) ); DIP("fsincos\n"); - put_ST_UNCHECKED(0, - binop(Iop_SinF64, + IRTemp argD = newTemp(Ity_F64); + assign(argD, get_ST(0)); + IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD); + 
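/* As with FPTAN above, argOK is 1:I1 iff the argument is finite with |arg| < 2^63 (IEEE754 exponent field <= 0x43D); C2 is set from its negation below. */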
IRTemp resD = newTemp(Ity_F64); + assign(resD, + IRExpr_ITE( + mkexpr(argOK), + binop(Iop_SinF64, + get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */ + mkexpr(argD)), + mkexpr(argD)) + ); + put_ST_UNCHECKED(0, mkexpr(resD)); + /* Conditionally push the cos value on the stack, if + the arg is in range */ + maybe_fp_push(argOK); + maybe_put_ST(argOK, 0, + binop(Iop_CosF64, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */ - mkexpr(a1))); - fp_push(); - put_ST(0, - binop(Iop_CosF64, - get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */ - mkexpr(a1))); - clear_C2(); /* HACK */ + mkexpr(argD))); + set_C2( binop(Iop_Xor64, + unop(Iop_1Uto64, mkexpr(argOK)), + mkU64(1)) ); break; } @@ -5873,24 +5987,29 @@ get_ST(1))); break; - case 0xFE: /* FSIN */ - DIP("fsin\n"); - put_ST_UNCHECKED(0, - binop(Iop_SinF64, - get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */ - get_ST(0))); - clear_C2(); /* HACK */ + case 0xFE: /* FSIN */ + case 0xFF: { /* FCOS */ + Bool isSIN = modrm == 0xFE; + DIP("%s\n", isSIN ? "fsin" : "fcos"); + IRTemp argD = newTemp(Ity_F64); + assign(argD, get_ST(0)); + IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD); + IRTemp resD = newTemp(Ity_F64); + assign(resD, + IRExpr_ITE( + mkexpr(argOK), + binop(isSIN ? Iop_SinF64 : Iop_CosF64, + get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */ + mkexpr(argD)), + mkexpr(argD)) + ); + put_ST_UNCHECKED(0, mkexpr(resD)); + set_C2( binop(Iop_Xor64, + unop(Iop_1Uto64, mkexpr(argOK)), + mkU64(1)) ); break; + } - case 0xFF: /* FCOS */ - DIP("fcos\n"); - put_ST_UNCHECKED(0, - binop(Iop_CosF64, - get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */ - get_ST(0))); - clear_C2(); /* HACK */ - break; - default: goto decode_fail; } @@ -8548,6 +8667,32 @@ /*--- SSE/SSE2/SSE3 helpers ---*/ /*------------------------------------------------------------*/ +/* Indicates whether the op requires a rounding-mode argument. Note + that this covers only vector floating point arithmetic ops, and + omits the scalar ones that need rounding modes. Note also that + inconsistencies here will get picked up later by the IR sanity + checker, so this isn't correctness-critical. */ +static Bool requiresRMode ( IROp op ) +{ + switch (op) { + /* 128 bit ops */ + case Iop_Add32Fx4: case Iop_Sub32Fx4: + case Iop_Mul32Fx4: case Iop_Div32Fx4: + case Iop_Add64Fx2: case Iop_Sub64Fx2: + case Iop_Mul64Fx2: case Iop_Div64Fx2: + /* 256 bit ops */ + case Iop_Add32Fx8: case Iop_Sub32Fx8: + case Iop_Mul32Fx8: case Iop_Div32Fx8: + case Iop_Add64Fx4: case Iop_Sub64Fx4: + case Iop_Mul64Fx4: case Iop_Div64Fx4: + return True; + default: + break; + } + return False; +} + + /* Worker function; do not call directly. Handles full width G = G `op` E and G = (not G) `op` E. */ @@ -8563,13 +8708,20 @@ Int alen; IRTemp addr; UChar rm = getUChar(delta); + Bool needsRMode = requiresRMode(op); IRExpr* gpart = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRexRM(pfx,rm))) : getXMMReg(gregOfRexRM(pfx,rm)); if (epartIsReg(rm)) { - putXMMReg( gregOfRexRM(pfx,rm), - binop(op, gpart, - getXMMReg(eregOfRexRM(pfx,rm))) ); + putXMMReg( + gregOfRexRM(pfx,rm), + needsRMode + ? 
triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */ + gpart, + getXMMReg(eregOfRexRM(pfx,rm))) + : binop(op, gpart, + getXMMReg(eregOfRexRM(pfx,rm))) + ); DIP("%s %s,%s\n", opname, nameXMMReg(eregOfRexRM(pfx,rm)), nameXMMReg(gregOfRexRM(pfx,rm)) ); @@ -8576,9 +8728,15 @@ return delta+1; } else { addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); - putXMMReg( gregOfRexRM(pfx,rm), - binop(op, gpart, - loadLE(Ity_V128, mkexpr(addr))) ); + putXMMReg( + gregOfRexRM(pfx,rm), + needsRMode + ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */ + gpart, + loadLE(Ity_V128, mkexpr(addr))) + : binop(op, gpart, + loadLE(Ity_V128, mkexpr(addr))) + ); DIP("%s %s,%s\n", opname, dis_buf, nameXMMReg(gregOfRexRM(pfx,rm)) ); @@ -10982,9 +11140,11 @@ IRTemp subV = newTemp(Ity_V128); IRTemp a1 = newTemp(Ity_I64); IRTemp s0 = newTemp(Ity_I64); + IRTemp rm = newTemp(Ity_I32); - assign( addV, binop(Iop_Add64Fx2, mkexpr(dV), mkexpr(sV)) ); - assign( subV, binop(Iop_Sub64Fx2, mkexpr(dV), mkexpr(sV)) ); + assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */ + assign( addV, triop(Iop_Add64Fx2, mkexpr(rm), mkexpr(dV), mkexpr(sV)) ); + assign( subV, triop(Iop_Sub64Fx2, mkexpr(rm), mkexpr(dV), mkexpr(sV)) ); assign( a1, unop(Iop_V128HIto64, mkexpr(addV) )); assign( s0, unop(Iop_V128to64, mkexpr(subV) )); @@ -11000,10 +11160,12 @@ IRTemp a3, a2, a1, a0, s3, s2, s1, s0; IRTemp addV = newTemp(Ity_V256); IRTemp subV = newTemp(Ity_V256); + IRTemp rm = newTemp(Ity_I32); a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID; - assign( addV, binop(Iop_Add64Fx4, mkexpr(dV), mkexpr(sV)) ); - assign( subV, binop(Iop_Sub64Fx4, mkexpr(dV), mkexpr(sV)) ); + assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */ + assign( addV, triop(Iop_Add64Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) ); + assign( subV, triop(Iop_Sub64Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) ); breakupV256to64s( addV, &a3, &a2, &a1, &a0 ); breakupV256to64s( subV, &s3, &s2, &s1, &s0 ); @@ -11019,10 +11181,12 @@ IRTemp a3, a2, a1, a0, s3, s2, s1, s0; IRTemp addV = newTemp(Ity_V128); IRTemp subV = newTemp(Ity_V128); + IRTemp rm = newTemp(Ity_I32); a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID; - assign( addV, binop(Iop_Add32Fx4, mkexpr(dV), mkexpr(sV)) ); - assign( subV, binop(Iop_Sub32Fx4, mkexpr(dV), mkexpr(sV)) ); + assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */ + assign( addV, triop(Iop_Add32Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) ); + assign( subV, triop(Iop_Sub32Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) ); breakupV128to32s( addV, &a3, &a2, &a1, &a0 ); breakupV128to32s( subV, &s3, &s2, &s1, &s0 ); @@ -11039,11 +11203,13 @@ IRTemp s7, s6, s5, s4, s3, s2, s1, s0; IRTemp addV = newTemp(Ity_V256); IRTemp subV = newTemp(Ity_V256); + IRTemp rm = newTemp(Ity_I32); a7 = a6 = a5 = a4 = a3 = a2 = a1 = a0 = IRTemp_INVALID; s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID; - assign( addV, binop(Iop_Add32Fx8, mkexpr(dV), mkexpr(sV)) ); - assign( subV, binop(Iop_Sub32Fx8, mkexpr(dV), mkexpr(sV)) ); + assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */ + assign( addV, triop(Iop_Add32Fx8, mkexpr(rm), mkexpr(dV), mkexpr(sV)) ); + assign( subV, triop(Iop_Sub32Fx8, mkexpr(rm), mkexpr(dV), mkexpr(sV)) ); breakupV256to32s( addV, &a7, &a6, &a5, &a4, &a3, &a2, &a1, &a0 ); breakupV256to32s( subV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 ); @@ -13361,14 +13527,14 @@ /* Round addr down to the start of the containing block. 
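For example, with a 64-byte line an address of 0x1234 is rounded down to 0x1200 by ANDing with ~(lineszB-1).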
*/ stmt( IRStmt_Put( - OFFB_TISTART, + OFFB_CMSTART, binop( Iop_And64, mkexpr(addr), mkU64( ~(lineszB-1) ))) ); - stmt( IRStmt_Put(OFFB_TILEN, mkU64(lineszB) ) ); + stmt( IRStmt_Put(OFFB_CMLEN, mkU64(lineszB) ) ); - jmp_lit(dres, Ijk_TInval, (Addr64)(guest_RIP_bbstart+delta)); + jmp_lit(dres, Ijk_InvalICache, (Addr64)(guest_RIP_bbstart+delta)); DIP("clflush %s\n", dis_buf); goto decode_success; @@ -14594,6 +14760,7 @@ IRTemp s3, s2, s1, s0, d3, d2, d1, d0; IRTemp leftV = newTemp(Ity_V128); IRTemp rightV = newTemp(Ity_V128); + IRTemp rm = newTemp(Ity_I32); s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID; breakupV128to32s( sV, &s3, &s2, &s1, &s0 ); @@ -14603,8 +14770,9 @@ assign( rightV, mkV128from32s( s3, s1, d3, d1 ) ); IRTemp res = newTemp(Ity_V128); - assign( res, binop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4, - mkexpr(leftV), mkexpr(rightV) ) ); + assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */ + assign( res, triop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4, + mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) ); return res; } @@ -14614,6 +14782,7 @@ IRTemp s1, s0, d1, d0; IRTemp leftV = newTemp(Ity_V128); IRTemp rightV = newTemp(Ity_V128); + IRTemp rm = newTemp(Ity_I32); s1 = s0 = d1 = d0 = IRTemp_INVALID; breakupV128to64s( sV, &s1, &s0 ); @@ -14623,8 +14792,9 @@ assign( rightV, binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) ); IRTemp res = newTemp(Ity_V128); - assign( res, binop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2, - mkexpr(leftV), mkexpr(rightV) ) ); + assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */ + assign( res, triop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2, + mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) ); return res; } @@ -18271,8 +18441,11 @@ UShort imm8_perms[4] = { 0x0000, 0x00FF, 0xFF00, 0xFFFF }; IRTemp and_vec = newTemp(Ity_V128); IRTemp sum_vec = newTemp(Ity_V128); + IRTemp rm = newTemp(Ity_I32); + assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */ assign( and_vec, binop( Iop_AndV128, - binop( Iop_Mul64Fx2, + triop( Iop_Mul64Fx2, + mkexpr(rm), mkexpr(dst_vec), mkexpr(src_vec) ), mkV128( imm8_perms[ ((imm8 >> 4) & 3) ] ) ) ); @@ -18296,6 +18469,7 @@ IRTemp tmp_prod_vec = newTemp(Ity_V128); IRTemp prod_vec = newTemp(Ity_V128); IRTemp sum_vec = newTemp(Ity_V128); + IRTemp rm = newTemp(Ity_I32); IRTemp v3, v2, v1, v0; v3 = v2 = v1 = v0 = IRTemp_INVALID; UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00, @@ -18303,15 +18477,17 @@ 0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0, 0xFFFF }; + assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */ assign( tmp_prod_vec, binop( Iop_AndV128, - binop( Iop_Mul32Fx4, mkexpr(dst_vec), - mkexpr(src_vec) ), + triop( Iop_Mul32Fx4, + mkexpr(rm), mkexpr(dst_vec), mkexpr(src_vec) ), mkV128( imm8_perms[((imm8 >> 4)& 15)] ) ) ); breakupV128to32s( tmp_prod_vec, &v3, &v2, &v1, &v0 ); assign( prod_vec, mkV128from32s( v3, v1, v2, v0 ) ); - assign( sum_vec, binop( Iop_Add32Fx4, + assign( sum_vec, triop( Iop_Add32Fx4, + mkexpr(rm), binop( Iop_InterleaveHI32x4, mkexpr(prod_vec), mkexpr(prod_vec) ), binop( Iop_InterleaveLO32x4, @@ -18319,7 +18495,8 @@ IRTemp res = newTemp(Ity_V128); assign( res, binop( Iop_AndV128, - binop( Iop_Add32Fx4, + triop( Iop_Add32Fx4, + mkexpr(rm), binop( Iop_InterleaveHI32x4, mkexpr(sum_vec), mkexpr(sum_vec) ), binop( Iop_InterleaveLO32x4, @@ -19550,7 +19727,8 @@ case 0x7F: { /* JGb/JNLEb (jump greater) */ Long jmpDelta; const HChar* comment = ""; - if (haveF2orF3(pfx)) goto decode_failure; + if (haveF3(pfx)) goto decode_failure; + if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. 
*/ jmpDelta = getSDisp8(delta); vassert(-128 <= jmpDelta && jmpDelta < 128); d64 = (guest_RIP_bbstart+delta+1) + jmpDelta; @@ -20203,7 +20381,8 @@ } case 0xC2: /* RET imm16 */ - if (have66orF2orF3(pfx)) goto decode_failure; + if (have66orF3(pfx)) goto decode_failure; + if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */ d64 = getUDisp16(delta); delta += 2; dis_ret(dres, vbi, d64); @@ -20211,8 +20390,9 @@ return delta; case 0xC3: /* RET */ - if (have66orF2(pfx)) goto decode_failure; + if (have66(pfx)) goto decode_failure; /* F3 is acceptable on AMD. */ + if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */ dis_ret(dres, vbi, 0); DIP(haveF3(pfx) ? "rep ; ret\n" : "ret\n"); return delta; @@ -20251,7 +20431,7 @@ return delta; } /* BEGIN HACKY SUPPORT FOR xbegin */ - if (modrm == 0xF8 && !have66orF2orF3(pfx) && sz == 4 + if (opc == 0xC7 && modrm == 0xF8 && !have66orF2orF3(pfx) && sz == 4 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) { delta++; /* mod/rm byte */ d64 = getSDisp(4,delta); @@ -20270,6 +20450,16 @@ return delta; } /* END HACKY SUPPORT FOR xbegin */ + /* BEGIN HACKY SUPPORT FOR xabort */ + if (opc == 0xC6 && modrm == 0xF8 && !have66orF2orF3(pfx) && sz == 1 + && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) { + delta++; /* mod/rm byte */ + abyte = getUChar(delta); delta++; + /* There is never a real transaction in progress, so do nothing. */ + DIP("xabort $%d", (Int)abyte); + return delta; + } + /* END HACKY SUPPORT FOR xabort */ goto decode_failure; case 0xC8: /* ENTER */ @@ -20606,7 +20796,8 @@ } case 0xE8: /* CALL J4 */ - if (haveF2orF3(pfx)) goto decode_failure; + if (haveF3(pfx)) goto decode_failure; + if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */ d64 = getSDisp32(delta); delta += 4; d64 += (guest_RIP_bbstart+delta); /* (guest_RIP_bbstart+delta) == return-to addr, d64 == call-to addr */ @@ -20629,9 +20820,10 @@ return delta; case 0xE9: /* Jv (jump, 16/32 offset) */ - if (haveF2orF3(pfx)) goto decode_failure; + if (haveF3(pfx)) goto decode_failure; if (sz != 4) goto decode_failure; /* JRS added 2004 July 11 */ + if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */ d64 = (guest_RIP_bbstart+delta+sz) + getSDisp(sz,delta); delta += sz; if (resteerOkFn(callback_opaque,d64)) { @@ -20645,9 +20837,10 @@ return delta; case 0xEB: /* Jb (jump, byte offset) */ - if (haveF2orF3(pfx)) goto decode_failure; + if (haveF3(pfx)) goto decode_failure; if (sz != 4) goto decode_failure; /* JRS added 2004 July 11 */ + if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */ d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta); delta++; if (resteerOkFn(callback_opaque,d64)) { @@ -21065,7 +21258,8 @@ case 0x8F: { /* JGb/JNLEb (jump greater) */ Long jmpDelta; const HChar* comment = ""; - if (haveF2orF3(pfx)) goto decode_failure; + if (haveF3(pfx)) goto decode_failure; + if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */ jmpDelta = getSDisp32(delta); d64 = (guest_RIP_bbstart+delta+4) + jmpDelta; delta += 4; @@ -21156,6 +21350,66 @@ } return delta; + case 0x1A: + case 0x1B: { /* Future MPX instructions, currently NOPs. + BNDMK b, m F3 0F 1B + BNDCL b, r/m F3 0F 1A + BNDCU b, r/m F2 0F 1A + BNDCN b, r/m F2 0F 1B + BNDMOV b, b/m 66 0F 1A + BNDMOV b/m, b 66 0F 1B + BNDLDX b, mib 0F 1A + BNDSTX mib, b 0F 1B */ + + /* All instructions have two operands. One operand is always the + bnd register number (bnd0-bnd3, other register numbers are + ignored when MPX isn't enabled, but should generate an + exception if MPX is enabled) given by gregOfRexRM. 
The other + operand is either a ModRM:reg, ModRM:r/m or a SIB encoded + address, all of which can be decoded by using either + eregOfRexRM or disAMode. */ + + modrm = getUChar(delta); + int bnd = gregOfRexRM(pfx,modrm); + const HChar *oper; + if (epartIsReg(modrm)) { + oper = nameIReg64 (eregOfRexRM(pfx,modrm)); + delta += 1; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + oper = dis_buf; + } + + if (haveF3no66noF2 (pfx)) { + if (opc == 0x1B) { + DIP ("bndmk %s, %%bnd%d\n", oper, bnd); + } else /* opc == 0x1A */ { + DIP ("bndcl %s, %%bnd%d\n", oper, bnd); + } + } else if (haveF2no66noF3 (pfx)) { + if (opc == 0x1A) { + DIP ("bndcu %s, %%bnd%d\n", oper, bnd); + } else /* opc == 0x1B */ { + DIP ("bndcn %s, %%bnd%d\n", oper, bnd); + } + } else if (have66noF2noF3 (pfx)) { + if (opc == 0x1A) { + DIP ("bndmov %s, %%bnd%d\n", oper, bnd); + } else /* opc == 0x1B */ { + DIP ("bndmov %%bnd%d, %s\n", bnd, oper); + } + } else if (haveNo66noF2noF3 (pfx)) { + if (opc == 0x1A) { + DIP ("bndldx %s, %%bnd%d\n", oper, bnd); + } else /* opc == 0x1B */ { + DIP ("bndstx %%bnd%d, %s\n", bnd, oper); + } + } else goto decode_failure; + + return delta; + } + case 0xA2: { /* CPUID */ /* Uses dirty helper: void amd64g_dirtyhelper_CPUID ( VexGuestAMD64State* ) @@ -21888,8 +22142,17 @@ if (op != Iop_INVALID) { vassert(opFn == NULL); res = newTemp(Ity_V128); - assign(res, swapArgs ? binop(op, mkexpr(tSR), mkexpr(tSL)) - : binop(op, mkexpr(tSL), mkexpr(tSR))); + if (requiresRMode(op)) { + IRTemp rm = newTemp(Ity_I32); + assign(rm, get_FAKE_roundingmode()); /* XXXROUNDINGFIXME */ + assign(res, swapArgs + ? triop(op, mkexpr(rm), mkexpr(tSR), mkexpr(tSL)) + : triop(op, mkexpr(rm), mkexpr(tSL), mkexpr(tSR))); + } else { + assign(res, swapArgs + ? binop(op, mkexpr(tSR), mkexpr(tSL)) + : binop(op, mkexpr(tSL), mkexpr(tSR))); + } } else { vassert(opFn != NULL); res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR); @@ -22792,8 +23055,17 @@ if (op != Iop_INVALID) { vassert(opFn == NULL); res = newTemp(Ity_V256); - assign(res, swapArgs ? binop(op, mkexpr(tSR), mkexpr(tSL)) - : binop(op, mkexpr(tSL), mkexpr(tSR))); + if (requiresRMode(op)) { + IRTemp rm = newTemp(Ity_I32); + assign(rm, get_FAKE_roundingmode()); /* XXXROUNDINGFIXME */ + assign(res, swapArgs + ? triop(op, mkexpr(rm), mkexpr(tSR), mkexpr(tSL)) + : triop(op, mkexpr(rm), mkexpr(tSL), mkexpr(tSR))); + } else { + assign(res, swapArgs + ? binop(op, mkexpr(tSR), mkexpr(tSL)) + : binop(op, mkexpr(tSL), mkexpr(tSR))); + } } else { vassert(opFn != NULL); res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR); @@ -30936,14 +31208,14 @@ // injecting here can change. In which case the translation has to // be redone. For ease of handling, we simply invalidate all the // time. - stmt(IRStmt_Put(OFFB_TISTART, mkU64(guest_RIP_curr_instr))); - stmt(IRStmt_Put(OFFB_TILEN, mkU64(19))); + stmt(IRStmt_Put(OFFB_CMSTART, mkU64(guest_RIP_curr_instr))); + stmt(IRStmt_Put(OFFB_CMLEN, mkU64(19))); delta += 19; stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_bbstart + delta) ) ); dres.whatNext = Dis_StopHere; - dres.jk_StopHere = Ijk_TInval; + dres.jk_StopHere = Ijk_InvalICache; goto decode_success; } /* We don't know what it is. 
*/ Index: priv/guest_arm64_defs.h =================================================================== --- priv/guest_arm64_defs.h (.../tags/VEX_3_9_0) (revision 0) +++ priv/guest_arm64_defs.h (.../trunk) (revision 2863) @@ -0,0 +1,244 @@ + +/*---------------------------------------------------------------*/ +/*--- begin guest_arm64_defs.h ---*/ +/*---------------------------------------------------------------*/ +/* + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2013-2013 OpenWorks + info@open-works.net + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + + The GNU General Public License is contained in the file COPYING. +*/ + +#ifndef __VEX_GUEST_ARM64_DEFS_H +#define __VEX_GUEST_ARM64_DEFS_H + +#include "libvex_basictypes.h" +#include "guest_generic_bb_to_IR.h" // DisResult + +/*---------------------------------------------------------*/ +/*--- arm64 to IR conversion ---*/ +/*---------------------------------------------------------*/ + +/* Convert one ARM64 insn to IR. See the type DisOneInstrFn in + bb_to_IR.h. */ +extern +DisResult disInstr_ARM64 ( IRSB* irbb, + Bool (*resteerOkFn) ( void*, Addr64 ), + Bool resteerCisOk, + void* callback_opaque, + UChar* guest_code, + Long delta, + Addr64 guest_IP, + VexArch guest_arch, + VexArchInfo* archinfo, + VexAbiInfo* abiinfo, + Bool host_bigendian, + Bool sigill_diag ); + +/* Used by the optimiser to specialise calls to helpers. */ +extern +IRExpr* guest_arm64_spechelper ( const HChar* function_name, + IRExpr** args, + IRStmt** precedingStmts, + Int n_precedingStmts ); + +/* Describes to the optimser which part of the guest state require + precise memory exceptions. This is logically part of the guest + state description. */ +extern +Bool guest_arm64_state_requires_precise_mem_exns ( Int, Int ); + +extern +VexGuestLayout arm64Guest_layout; + + +/*---------------------------------------------------------*/ +/*--- arm64 guest helpers ---*/ +/*---------------------------------------------------------*/ + +/* --- CLEAN HELPERS --- */ + +/* Calculate NZCV from the supplied thunk components, in the positions + they appear in the CPSR, viz bits 31:28 for N Z C V respectively. + Returned bits 63:32 and 27:0 are zero. */ +extern +ULong arm64g_calculate_flags_nzcv ( ULong cc_op, ULong cc_dep1, + ULong cc_dep2, ULong cc_dep3 ); + +//ZZ /* Calculate the C flag from the thunk components, in the lowest bit +//ZZ of the word (bit 0). */ +//ZZ extern +//ZZ UInt armg_calculate_flag_c ( UInt cc_op, UInt cc_dep1, +//ZZ UInt cc_dep2, UInt cc_dep3 ); +//ZZ +//ZZ /* Calculate the V flag from the thunk components, in the lowest bit +//ZZ of the word (bit 0). 
*/ +//ZZ extern +//ZZ UInt armg_calculate_flag_v ( UInt cc_op, UInt cc_dep1, +//ZZ UInt cc_dep2, UInt cc_dep3 ); +//ZZ +/* Calculate the specified condition from the thunk components, in the + lowest bit of the word (bit 0). */ +extern +ULong arm64g_calculate_condition ( /* ARM64Condcode << 4 | cc_op */ + ULong cond_n_op , + ULong cc_dep1, + ULong cc_dep2, ULong cc_dep3 ); + +//ZZ /* Calculate the QC flag from the thunk components, in the lowest bit +//ZZ of the word (bit 0). */ +//ZZ extern +//ZZ UInt armg_calculate_flag_qc ( UInt resL1, UInt resL2, +//ZZ UInt resR1, UInt resR2 ); + + +/*---------------------------------------------------------*/ +/*--- Condition code stuff ---*/ +/*---------------------------------------------------------*/ + +/* Flag masks. Defines positions of flag bits in the NZCV + register. */ +#define ARM64G_CC_SHIFT_N 31 +#define ARM64G_CC_SHIFT_Z 30 +#define ARM64G_CC_SHIFT_C 29 +#define ARM64G_CC_SHIFT_V 28 +//ZZ #define ARMG_CC_SHIFT_Q 27 +//ZZ +//ZZ #define ARMG_CC_MASK_N (1 << ARMG_CC_SHIFT_N) +//ZZ #define ARMG_CC_MASK_Z (1 << ARMG_CC_SHIFT_Z) +//ZZ #define ARMG_CC_MASK_C (1 << ARMG_CC_SHIFT_C) +//ZZ #define ARMG_CC_MASK_V (1 << ARMG_CC_SHIFT_V) +//ZZ #define ARMG_CC_MASK_Q (1 << ARMG_CC_SHIFT_Q) + +/* Flag thunk descriptors. A four-word thunk is used to record + details of the most recent flag-setting operation, so NZCV can + be computed later if needed. + + The four words are: + + CC_OP, which describes the operation. + + CC_DEP1, CC_DEP2, CC_NDEP. These are arguments to the + operation. We want set up the mcx_masks in flag helper calls + involving these fields so that Memcheck "believes" that the + resulting flags are data-dependent on both CC_DEP1 and + CC_DEP2. Hence the name DEP. + + When building the thunk, it is always necessary to write words into + CC_DEP1/2 and NDEP, even if those args are not used given the CC_OP + field. This is important because otherwise Memcheck could give + false positives as it does not understand the relationship between + the CC_OP field and CC_DEP1/2/NDEP, and so believes that the + definedness of the stored flags always depends on all 3 DEP values. 
+ + A summary of the field usages is: + + OP DEP1 DEP2 DEP3 + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + OP_COPY curr_NZCV:28x0 unused unused + OP_ADD32 argL argR unused + OP_ADD64 argL argR unused + OP_SUB32 argL argR unused + OP_SUB64 argL argR unused +//ZZ OP_ADC argL argR 31x0:old_C +//ZZ OP_SBB argL argR 31x0:old_C + OP_LOGIC32 result unused unused + OP_LOGIC64 result unused unused +//ZZ OP_MUL result unused 30x0:old_C:old_V +//ZZ OP_MULL resLO32 resHI32 30x0:old_C:old_V +//ZZ */ + +enum { + ARM64G_CC_OP_COPY=0, /* DEP1 = NZCV in 31:28, DEP2 = 0, DEP3 = 0 + just copy DEP1 to output */ + + ARM64G_CC_OP_ADD32, /* DEP1 = argL (Rn), DEP2 = argR (shifter_op), + DEP3 = 0 */ + + ARM64G_CC_OP_ADD64, /* DEP1 = argL (Rn), DEP2 = argR (shifter_op), + DEP3 = 0 */ + + ARM64G_CC_OP_SUB32, /* DEP1 = argL (Rn), DEP2 = argR (shifter_op), + DEP3 = 0 */ + + ARM64G_CC_OP_SUB64, /* DEP1 = argL (Rn), DEP2 = argR (shifter_op), + DEP3 = 0 */ + +//ZZ ARMG_CC_OP_ADC, /* DEP1 = argL (Rn), DEP2 = arg2 (shifter_op), +//ZZ DEP3 = oldC (in LSB) */ +//ZZ +//ZZ ARMG_CC_OP_SBB, /* DEP1 = argL (Rn), DEP2 = arg2 (shifter_op), +//ZZ DEP3 = oldC (in LSB) */ + + ARM64G_CC_OP_LOGIC32, /* DEP1 = result, DEP2 = 0, DEP3 = 0 */ + ARM64G_CC_OP_LOGIC64, /* DEP1 = result, DEP2 = 0, DEP3 = 0 */ + +//ZZ ARMG_CC_OP_MUL, /* DEP1 = result, DEP2 = 0, DEP3 = oldC:old_V +//ZZ (in bits 1:0) */ +//ZZ +//ZZ ARMG_CC_OP_MULL, /* DEP1 = resLO32, DEP2 = resHI32, DEP3 = oldC:old_V +//ZZ (in bits 1:0) */ + + ARM64G_CC_OP_NUMBER +}; + +/* XXXX because of the calling conventions for + arm64g_calculate_condition, all these OP values MUST be in the range + 0 .. 15 only (viz, 4-bits). */ + + + +/* Defines conditions which we can ask for */ + +typedef + enum { + ARM64CondEQ = 0, /* equal : Z=1 */ + ARM64CondNE = 1, /* not equal : Z=0 */ + + ARM64CondCS = 2, /* >=u (higher or same) (aka HS) : C=1 */ + ARM64CondCC = 3, /* u (higher) : C=1 && Z=0 */ + ARM64CondLS = 9, /* <=u (lower or same) : C=0 || Z=1 */ + + ARM64CondGE = 10, /* >=s (signed greater or equal) : N=V */ + ARM64CondLT = 11, /* s (signed greater) : Z=0 && N=V */ + ARM64CondLE = 13, /* <=s (signed less or equal) : Z=1 || N!=V */ + + ARM64CondAL = 14, /* always (unconditional) : 1 */ + ARM64CondNV = 15 /* always (unconditional) : 1 */ + } + ARM64Condcode; + +#endif /* ndef __VEX_GUEST_ARM64_DEFS_H */ + +/*---------------------------------------------------------------*/ +/*--- end guest_arm64_defs.h ---*/ +/*---------------------------------------------------------------*/ Index: priv/guest_arm64_helpers.c =================================================================== --- priv/guest_arm64_helpers.c (.../tags/VEX_3_9_0) (revision 0) +++ priv/guest_arm64_helpers.c (.../trunk) (revision 2863) @@ -0,0 +1,1292 @@ + +/*---------------------------------------------------------------*/ +/*--- begin guest_arm64_helpers.c ---*/ +/*---------------------------------------------------------------*/ + +/* + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2013-2013 OpenWorks + info@open-works.net + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "libvex_basictypes.h" +#include "libvex_emnote.h" +#include "libvex_guest_arm64.h" +#include "libvex_ir.h" +#include "libvex.h" + +#include "main_util.h" +#include "main_globals.h" +#include "guest_generic_bb_to_IR.h" +#include "guest_arm64_defs.h" + + +/* This file contains helper functions for arm guest code. Calls to + these functions are generated by the back end. These calls are of + course in the host machine code and this file will be compiled to + host machine code, so that all makes sense. + + Only change the signatures of these helper functions very + carefully. If you change the signature here, you'll have to change + the parameters passed to it in the IR calls constructed by + guest_arm64_toIR.c. +*/ + + +/* Set to 1 to get detailed profiling info about individual N, Z, C + and V flag evaluation. */ +#define PROFILE_NZCV_FLAGS 0 + +#if PROFILE_NZCV_FLAGS + +static UInt tab_eval[ARM64G_CC_OP_NUMBER][16]; +static UInt initted = 0; +static UInt tot_evals = 0; + +static void initCounts ( void ) +{ + UInt i, j; + for (i = 0; i < ARM64G_CC_OP_NUMBER; i++) { + for (j = 0; j < 16; j++) { + tab_eval[i][j] = 0; + } + } + initted = 1; +} + +static void showCounts ( void ) +{ + const HChar* nameCC[16] + = { "EQ", "NE", "CS", "CC", "MI", "PL", "VS", "VC", + "HI", "LS", "GE", "LT", "GT", "LE", "AL", "NV" }; + UInt i, j; + ULong sum = 0; + vex_printf("\nCC_OP 0 1 2 3 " + " 4 5 6\n"); + vex_printf( "--------------------------------------------------" + "--------------------------\n"); + for (j = 0; j < 16; j++) { + vex_printf("%2d %s ", j, nameCC[j]); + for (i = 0; i < ARM64G_CC_OP_NUMBER; i++) { + vex_printf("%9d ", tab_eval[i][j]); + sum += tab_eval[i][j]; + } + vex_printf("\n"); + } + vex_printf("(In total %llu calls)\n", sum); +} + +#define NOTE_EVAL(_cc_op, _cond) \ + do { \ + if (!initted) initCounts(); \ + vassert( ((UInt)(_cc_op)) < ARM64G_CC_OP_NUMBER); \ + vassert( ((UInt)(_cond)) < 16); \ + tab_eval[(UInt)(_cc_op)][(UInt)(cond)]++; \ + tot_evals++; \ + if (0 == (tot_evals & 0x7FFF)) \ + showCounts(); \ + } while (0) + +#endif /* PROFILE_NZCV_FLAGS */ + + +/* Calculate the N flag from the supplied thunk components, in the + least significant bit of the word. Returned bits 63:1 are zero. 
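For the ADD/SUB cases N is recomputed as the sign bit of argL +/- argR; for the LOGIC cases it is the sign bit of the stored result.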
*/ +static +ULong arm64g_calculate_flag_n ( ULong cc_op, ULong cc_dep1, + ULong cc_dep2, ULong cc_dep3 ) +{ + switch (cc_op) { + case ARM64G_CC_OP_COPY: { + /* (nzcv:28x0, unused, unused) */ + ULong nf = (cc_dep1 >> ARM64G_CC_SHIFT_N) & 1; + return nf; + } + case ARM64G_CC_OP_ADD32: { + /* (argL, argR, unused) */ + UInt argL = (UInt)cc_dep1; + UInt argR = (UInt)cc_dep2; + UInt res = argL + argR; + ULong nf = (ULong)(res >> 31); + return nf; + } + case ARM64G_CC_OP_ADD64: { + /* (argL, argR, unused) */ + ULong argL = cc_dep1; + ULong argR = cc_dep2; + ULong res = argL + argR; + ULong nf = (ULong)(res >> 63); + return nf; + } + case ARM64G_CC_OP_SUB32: { + /* (argL, argR, unused) */ + UInt argL = (UInt)cc_dep1; + UInt argR = (UInt)cc_dep2; + UInt res = argL - argR; + ULong nf = (ULong)(res >> 31); + return nf; + } + case ARM64G_CC_OP_SUB64: { + /* (argL, argR, unused) */ + ULong argL = cc_dep1; + ULong argR = cc_dep2; + ULong res = argL - argR; + ULong nf = res >> 63; + return nf; + } +//ZZ case ARMG_CC_OP_ADC: { +//ZZ /* (argL, argR, oldC) */ +//ZZ UInt argL = cc_dep1; +//ZZ UInt argR = cc_dep2; +//ZZ UInt oldC = cc_dep3; +//ZZ vassert((oldC & ~1) == 0); +//ZZ UInt res = argL + argR + oldC; +//ZZ UInt nf = res >> 31; +//ZZ return nf; +//ZZ } +//ZZ case ARMG_CC_OP_SBB: { +//ZZ /* (argL, argR, oldC) */ +//ZZ UInt argL = cc_dep1; +//ZZ UInt argR = cc_dep2; +//ZZ UInt oldC = cc_dep3; +//ZZ vassert((oldC & ~1) == 0); +//ZZ UInt res = argL - argR - (oldC ^ 1); +//ZZ UInt nf = res >> 31; +//ZZ return nf; +//ZZ } + case ARM64G_CC_OP_LOGIC32: { + /* (res, unused, unused) */ + UInt res = (UInt)cc_dep1; + ULong nf = res >> 31; + return nf; + } + case ARM64G_CC_OP_LOGIC64: { + /* (res, unused, unused) */ + ULong res = cc_dep1; + ULong nf = res >> 63; + return nf; + } +//ZZ case ARMG_CC_OP_MUL: { +//ZZ /* (res, unused, oldC:oldV) */ +//ZZ UInt res = cc_dep1; +//ZZ UInt nf = res >> 31; +//ZZ return nf; +//ZZ } +//ZZ case ARMG_CC_OP_MULL: { +//ZZ /* (resLo32, resHi32, oldC:oldV) */ +//ZZ UInt resHi32 = cc_dep2; +//ZZ UInt nf = resHi32 >> 31; +//ZZ return nf; +//ZZ } + default: + /* shouldn't really make these calls from generated code */ + vex_printf("arm64g_calculate_flag_n" + "( op=%llu, dep1=0x%llx, dep2=0x%llx, dep3=0x%llx )\n", + cc_op, cc_dep1, cc_dep2, cc_dep3 ); + vpanic("arm64g_calculate_flag_n"); + } +} + + +/* Calculate the Z flag from the supplied thunk components, in the + least significant bit of the word. Returned bits 63:1 are zero. 
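For the ADD/SUB cases Z is recomputed as (argL +/- argR) == 0; for the LOGIC cases it is (result == 0).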
*/ +static +ULong arm64g_calculate_flag_z ( ULong cc_op, ULong cc_dep1, + ULong cc_dep2, ULong cc_dep3 ) +{ + switch (cc_op) { + case ARM64G_CC_OP_COPY: { + /* (nzcv:28x0, unused, unused) */ + ULong zf = (cc_dep1 >> ARM64G_CC_SHIFT_Z) & 1; + return zf; + } + case ARM64G_CC_OP_ADD32: { + /* (argL, argR, unused) */ + UInt argL = (UInt)cc_dep1; + UInt argR = (UInt)cc_dep2; + UInt res = argL + argR; + ULong zf = res == 0; + return zf; + } + case ARM64G_CC_OP_ADD64: { + /* (argL, argR, unused) */ + ULong argL = cc_dep1; + ULong argR = cc_dep2; + ULong res = argL + argR; + ULong zf = res == 0; + return zf; + } + case ARM64G_CC_OP_SUB32: { + /* (argL, argR, unused) */ + UInt argL = (UInt)cc_dep1; + UInt argR = (UInt)cc_dep2; + UInt res = argL - argR; + ULong zf = res == 0; + return zf; + } + case ARM64G_CC_OP_SUB64: { + /* (argL, argR, unused) */ + ULong argL = cc_dep1; + ULong argR = cc_dep2; + ULong res = argL - argR; + ULong zf = res == 0; + return zf; + } +//ZZ case ARMG_CC_OP_ADC: { +//ZZ /* (argL, argR, oldC) */ +//ZZ UInt argL = cc_dep1; +//ZZ UInt argR = cc_dep2; +//ZZ UInt oldC = cc_dep3; +//ZZ vassert((oldC & ~1) == 0); +//ZZ UInt res = argL + argR + oldC; +//ZZ UInt zf = res == 0; +//ZZ return zf; +//ZZ } +//ZZ case ARMG_CC_OP_SBB: { +//ZZ /* (argL, argR, oldC) */ +//ZZ UInt argL = cc_dep1; +//ZZ UInt argR = cc_dep2; +//ZZ UInt oldC = cc_dep3; +//ZZ vassert((oldC & ~1) == 0); +//ZZ UInt res = argL - argR - (oldC ^ 1); +//ZZ UInt zf = res == 0; +//ZZ return zf; +//ZZ } + case ARM64G_CC_OP_LOGIC32: { + /* (res, unused, unused) */ + UInt res = (UInt)cc_dep1; + ULong zf = res == 0; + return zf; + } + case ARM64G_CC_OP_LOGIC64: { + /* (res, unused, unused) */ + ULong res = cc_dep1; + ULong zf = res == 0; + return zf; + } +//ZZ case ARMG_CC_OP_MUL: { +//ZZ /* (res, unused, oldC:oldV) */ +//ZZ UInt res = cc_dep1; +//ZZ UInt zf = res == 0; +//ZZ return zf; +//ZZ } +//ZZ case ARMG_CC_OP_MULL: { +//ZZ /* (resLo32, resHi32, oldC:oldV) */ +//ZZ UInt resLo32 = cc_dep1; +//ZZ UInt resHi32 = cc_dep2; +//ZZ UInt zf = (resHi32|resLo32) == 0; +//ZZ return zf; +//ZZ } + default: + /* shouldn't really make these calls from generated code */ + vex_printf("arm64g_calculate_flag_z" + "( op=%llu, dep1=0x%llx, dep2=0x%llx, dep3=0x%llx )\n", + cc_op, cc_dep1, cc_dep2, cc_dep3 ); + vpanic("arm64g_calculate_flag_z"); + } +} + + +/* CALLED FROM GENERATED CODE: CLEAN HELPER */ +/* Calculate the C flag from the supplied thunk components, in the + least significant bit of the word. Returned bits 63:1 are zero. 
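For ADD the carry is (res < argL); for SUB it is the no-borrow condition (argL >= argR); the LOGIC ops always give C = 0.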
*/ +static +ULong arm64g_calculate_flag_c ( ULong cc_op, ULong cc_dep1, + ULong cc_dep2, ULong cc_dep3 ) +{ + switch (cc_op) { + case ARM64G_CC_OP_COPY: { + /* (nzcv:28x0, unused, unused) */ + ULong cf = (cc_dep1 >> ARM64G_CC_SHIFT_C) & 1; + return cf; + } + case ARM64G_CC_OP_ADD32: { + /* (argL, argR, unused) */ + UInt argL = (UInt)cc_dep1; + UInt argR = (UInt)cc_dep2; + UInt res = argL + argR; + ULong cf = res < argL; + return cf; + } + case ARM64G_CC_OP_ADD64: { + /* (argL, argR, unused) */ + ULong argL = cc_dep1; + ULong argR = cc_dep2; + ULong res = argL + argR; + ULong cf = res < argL; + return cf; + } + case ARM64G_CC_OP_SUB32: { + /* (argL, argR, unused) */ + UInt argL = (UInt)cc_dep1; + UInt argR = (UInt)cc_dep2; + ULong cf = argL >= argR; + return cf; + } + case ARM64G_CC_OP_SUB64: { + /* (argL, argR, unused) */ + ULong argL = cc_dep1; + ULong argR = cc_dep2; + ULong cf = argL >= argR; + return cf; + } +//ZZ case ARMG_CC_OP_ADC: { +//ZZ /* (argL, argR, oldC) */ +//ZZ UInt argL = cc_dep1; +//ZZ UInt argR = cc_dep2; +//ZZ UInt oldC = cc_dep3; +//ZZ vassert((oldC & ~1) == 0); +//ZZ UInt res = argL + argR + oldC; +//ZZ UInt cf = oldC ? (res <= argL) : (res < argL); +//ZZ return cf; +//ZZ } +//ZZ case ARMG_CC_OP_SBB: { +//ZZ /* (argL, argR, oldC) */ +//ZZ UInt argL = cc_dep1; +//ZZ UInt argR = cc_dep2; +//ZZ UInt oldC = cc_dep3; +//ZZ vassert((oldC & ~1) == 0); +//ZZ UInt cf = oldC ? (argL >= argR) : (argL > argR); +//ZZ return cf; +//ZZ } + case ARM64G_CC_OP_LOGIC32: + case ARM64G_CC_OP_LOGIC64: { + /* (res, unused, unused) */ + return 0; // C after logic is zero on arm64 + } +//ZZ case ARMG_CC_OP_MUL: { +//ZZ /* (res, unused, oldC:oldV) */ +//ZZ UInt oldC = (cc_dep3 >> 1) & 1; +//ZZ vassert((cc_dep3 & ~3) == 0); +//ZZ UInt cf = oldC; +//ZZ return cf; +//ZZ } +//ZZ case ARMG_CC_OP_MULL: { +//ZZ /* (resLo32, resHi32, oldC:oldV) */ +//ZZ UInt oldC = (cc_dep3 >> 1) & 1; +//ZZ vassert((cc_dep3 & ~3) == 0); +//ZZ UInt cf = oldC; +//ZZ return cf; +//ZZ } + default: + /* shouldn't really make these calls from generated code */ + vex_printf("arm64g_calculate_flag_c" + "( op=%llu, dep1=0x%llx, dep2=0x%llx, dep3=0x%llx )\n", + cc_op, cc_dep1, cc_dep2, cc_dep3 ); + vpanic("arm64g_calculate_flag_c"); + } +} + + +/* CALLED FROM GENERATED CODE: CLEAN HELPER */ +/* Calculate the V flag from the supplied thunk components, in the + least significant bit of the word. Returned bits 63:1 are zero. 
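For ADD the overflow is the sign bit of ((res ^ argL) & (res ^ argR)); for SUB it is the sign bit of ((argL ^ argR) & (argL ^ res)); the LOGIC ops always give V = 0.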
*/ +static +ULong arm64g_calculate_flag_v ( ULong cc_op, ULong cc_dep1, + ULong cc_dep2, ULong cc_dep3 ) +{ + switch (cc_op) { + case ARM64G_CC_OP_COPY: { + /* (nzcv:28x0, unused, unused) */ + ULong vf = (cc_dep1 >> ARM64G_CC_SHIFT_V) & 1; + return vf; + } + case ARM64G_CC_OP_ADD32: { + /* (argL, argR, unused) */ + UInt argL = (UInt)cc_dep1; + UInt argR = (UInt)cc_dep2; + UInt res = argL + argR; + ULong vf = (ULong)(((res ^ argL) & (res ^ argR)) >> 31); + return vf; + } + case ARM64G_CC_OP_ADD64: { + /* (argL, argR, unused) */ + ULong argL = cc_dep1; + ULong argR = cc_dep2; + ULong res = argL + argR; + ULong vf = ((res ^ argL) & (res ^ argR)) >> 63; + return vf; + } + case ARM64G_CC_OP_SUB32: { + /* (argL, argR, unused) */ + UInt argL = (UInt)cc_dep1; + UInt argR = (UInt)cc_dep2; + UInt res = argL - argR; + ULong vf = (ULong)(((argL ^ argR) & (argL ^ res)) >> 31); + return vf; + } + case ARM64G_CC_OP_SUB64: { + /* (argL, argR, unused) */ + ULong argL = cc_dep1; + ULong argR = cc_dep2; + ULong res = argL - argR; + ULong vf = (((argL ^ argR) & (argL ^ res))) >> 63; + return vf; + } +//ZZ case ARMG_CC_OP_ADC: { +//ZZ /* (argL, argR, oldC) */ +//ZZ UInt argL = cc_dep1; +//ZZ UInt argR = cc_dep2; +//ZZ UInt oldC = cc_dep3; +//ZZ vassert((oldC & ~1) == 0); +//ZZ UInt res = argL + argR + oldC; +//ZZ UInt vf = ((res ^ argL) & (res ^ argR)) >> 31; +//ZZ return vf; +//ZZ } +//ZZ case ARMG_CC_OP_SBB: { +//ZZ /* (argL, argR, oldC) */ +//ZZ UInt argL = cc_dep1; +//ZZ UInt argR = cc_dep2; +//ZZ UInt oldC = cc_dep3; +//ZZ vassert((oldC & ~1) == 0); +//ZZ UInt res = argL - argR - (oldC ^ 1); +//ZZ UInt vf = ((argL ^ argR) & (argL ^ res)) >> 31; +//ZZ return vf; +//ZZ } + case ARM64G_CC_OP_LOGIC32: + case ARM64G_CC_OP_LOGIC64: { + /* (res, unused, unused) */ + return 0; // V after logic is zero on arm64 + } +//ZZ case ARMG_CC_OP_MUL: { +//ZZ /* (res, unused, oldC:oldV) */ +//ZZ UInt oldV = (cc_dep3 >> 0) & 1; +//ZZ vassert((cc_dep3 & ~3) == 0); +//ZZ UInt vf = oldV; +//ZZ return vf; +//ZZ } +//ZZ case ARMG_CC_OP_MULL: { +//ZZ /* (resLo32, resHi32, oldC:oldV) */ +//ZZ UInt oldV = (cc_dep3 >> 0) & 1; +//ZZ vassert((cc_dep3 & ~3) == 0); +//ZZ UInt vf = oldV; +//ZZ return vf; +//ZZ } + default: + /* shouldn't really make these calls from generated code */ + vex_printf("arm64g_calculate_flag_v" + "( op=%llu, dep1=0x%llx, dep2=0x%llx, dep3=0x%llx )\n", + cc_op, cc_dep1, cc_dep2, cc_dep3 ); + vpanic("arm64g_calculate_flag_v"); + } +} + + +/* CALLED FROM GENERATED CODE: CLEAN HELPER */ +/* Calculate NZCV from the supplied thunk components, in the positions + they appear in the CPSR, viz bits 31:28 for N Z C V respectively. + Returned bits 27:0 are zero. */ +ULong arm64g_calculate_flags_nzcv ( ULong cc_op, ULong cc_dep1, + ULong cc_dep2, ULong cc_dep3 ) +{ + ULong f; + ULong res = 0; + f = 1 & arm64g_calculate_flag_n(cc_op, cc_dep1, cc_dep2, cc_dep3); + res |= (f << ARM64G_CC_SHIFT_N); + f = 1 & arm64g_calculate_flag_z(cc_op, cc_dep1, cc_dep2, cc_dep3); + res |= (f << ARM64G_CC_SHIFT_Z); + f = 1 & arm64g_calculate_flag_c(cc_op, cc_dep1, cc_dep2, cc_dep3); + res |= (f << ARM64G_CC_SHIFT_C); + f = 1 & arm64g_calculate_flag_v(cc_op, cc_dep1, cc_dep2, cc_dep3); + res |= (f << ARM64G_CC_SHIFT_V); + return res; +} + +//ZZ +//ZZ /* CALLED FROM GENERATED CODE: CLEAN HELPER */ +//ZZ /* Calculate the QC flag from the arguments, in the lowest bit +//ZZ of the word (bit 0). Urr, having this out of line is bizarre. +//ZZ Push back inline. 
*/ +//ZZ UInt armg_calculate_flag_qc ( UInt resL1, UInt resL2, +//ZZ UInt resR1, UInt resR2 ) +//ZZ { +//ZZ if (resL1 != resR1 || resL2 != resR2) +//ZZ return 1; +//ZZ else +//ZZ return 0; +//ZZ } + +/* CALLED FROM GENERATED CODE: CLEAN HELPER */ +/* Calculate the specified condition from the thunk components, in the + lowest bit of the word (bit 0). Returned bits 63:1 are zero. */ +ULong arm64g_calculate_condition ( /* ARM64Condcode << 4 | cc_op */ + ULong cond_n_op , + ULong cc_dep1, + ULong cc_dep2, ULong cc_dep3 ) +{ + ULong cond = cond_n_op >> 4; + ULong cc_op = cond_n_op & 0xF; + ULong inv = cond & 1; + ULong nf, zf, vf, cf; + +# if PROFILE_NZCV_FLAGS + NOTE_EVAL(cc_op, cond); +# endif + + // vex_printf("XXXXXXXX %llx %llx %llx %llx\n", + // cond_n_op, cc_dep1, cc_dep2, cc_dep3); + + switch (cond) { + case ARM64CondEQ: // Z=1 => z + case ARM64CondNE: // Z=0 + zf = arm64g_calculate_flag_z(cc_op, cc_dep1, cc_dep2, cc_dep3); + return inv ^ zf; + + case ARM64CondCS: // C=1 => c + case ARM64CondCC: // C=0 + cf = arm64g_calculate_flag_c(cc_op, cc_dep1, cc_dep2, cc_dep3); + return inv ^ cf; + + case ARM64CondMI: // N=1 => n + case ARM64CondPL: // N=0 + nf = arm64g_calculate_flag_n(cc_op, cc_dep1, cc_dep2, cc_dep3); + return inv ^ nf; + + case ARM64CondVS: // V=1 => v + case ARM64CondVC: // V=0 + vf = arm64g_calculate_flag_v(cc_op, cc_dep1, cc_dep2, cc_dep3); + return inv ^ vf; + + case ARM64CondHI: // C=1 && Z=0 => c & ~z + case ARM64CondLS: // C=0 || Z=1 + cf = arm64g_calculate_flag_c(cc_op, cc_dep1, cc_dep2, cc_dep3); + zf = arm64g_calculate_flag_z(cc_op, cc_dep1, cc_dep2, cc_dep3); + return inv ^ (1 & (cf & ~zf)); + + case ARM64CondGE: // N=V => ~(n^v) + case ARM64CondLT: // N!=V + nf = arm64g_calculate_flag_n(cc_op, cc_dep1, cc_dep2, cc_dep3); + vf = arm64g_calculate_flag_v(cc_op, cc_dep1, cc_dep2, cc_dep3); + return inv ^ (1 & ~(nf ^ vf)); + + case ARM64CondGT: // Z=0 && N=V => ~z & ~(n^v) => ~(z | (n^v)) + case ARM64CondLE: // Z=1 || N!=V + nf = arm64g_calculate_flag_n(cc_op, cc_dep1, cc_dep2, cc_dep3); + vf = arm64g_calculate_flag_v(cc_op, cc_dep1, cc_dep2, cc_dep3); + zf = arm64g_calculate_flag_z(cc_op, cc_dep1, cc_dep2, cc_dep3); + return inv ^ (1 & ~(zf | (nf ^ vf))); + + case ARM64CondAL: // 1 + case ARM64CondNV: // 1 + return 1; + + default: + /* shouldn't really make these calls from generated code */ + vex_printf("arm64g_calculate_condition(ARM64)" + "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n", + cond, cc_op, cc_dep1, cc_dep2, cc_dep3 ); + vpanic("armg_calculate_condition(ARM64)"); + } +} + + +/*---------------------------------------------------------------*/ +/*--- Flag-helpers translation-time function specialisers. ---*/ +/*--- These help iropt specialise calls the above run-time ---*/ +/*--- flags functions. ---*/ +/*---------------------------------------------------------------*/ + +/* Used by the optimiser to try specialisations. Returns an + equivalent expression, or NULL if none. 
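For example, a call encoding "EQ after SUB64" is folded below to unop(Iop_1Uto64, binop(Iop_CmpEQ64, argL, argR)).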
*/ + +static Bool isU64 ( IRExpr* e, ULong n ) +{ + return + toBool( e->tag == Iex_Const + && e->Iex.Const.con->tag == Ico_U64 + && e->Iex.Const.con->Ico.U64 == n ); +} + +IRExpr* guest_arm64_spechelper ( const HChar* function_name, + IRExpr** args, + IRStmt** precedingStmts, + Int n_precedingStmts ) +{ +# define unop(_op,_a1) IRExpr_Unop((_op),(_a1)) +# define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2)) +# define mkU64(_n) IRExpr_Const(IRConst_U64(_n)) +# define mkU8(_n) IRExpr_Const(IRConst_U8(_n)) + + Int i, arity = 0; + for (i = 0; args[i]; i++) + arity++; +//ZZ # if 0 +//ZZ vex_printf("spec request:\n"); +//ZZ vex_printf(" %s ", function_name); +//ZZ for (i = 0; i < arity; i++) { +//ZZ vex_printf(" "); +//ZZ ppIRExpr(args[i]); +//ZZ } +//ZZ vex_printf("\n"); +//ZZ # endif + + /* --------- specialising "arm64g_calculate_condition" --------- */ + + if (vex_streq(function_name, "arm64g_calculate_condition")) { + + /* specialise calls to the "arm64g_calculate_condition" function. + Not sure whether this is strictly necessary, but: the + replacement IR must produce only the values 0 or 1. Bits + 63:1 are required to be zero. */ + IRExpr *cond_n_op, *cc_dep1, *cc_dep2 ; //, *cc_ndep; + vassert(arity == 4); + cond_n_op = args[0]; /* (ARM64Condcode << 4) | ARM64G_CC_OP_* */ + cc_dep1 = args[1]; + cc_dep2 = args[2]; + //cc_ndep = args[3]; + + /*---------------- SUB64 ----------------*/ + + /* 0, 1 */ + if (isU64(cond_n_op, (ARM64CondEQ << 4) | ARM64G_CC_OP_SUB64)) { + /* EQ after SUB --> test argL == argR */ + return unop(Iop_1Uto64, + binop(Iop_CmpEQ64, cc_dep1, cc_dep2)); + } + if (isU64(cond_n_op, (ARM64CondNE << 4) | ARM64G_CC_OP_SUB64)) { + /* NE after SUB --> test argL != argR */ + return unop(Iop_1Uto64, + binop(Iop_CmpNE64, cc_dep1, cc_dep2)); + } + + /* 2, 3 */ + if (isU64(cond_n_op, (ARM64CondCS << 4) | ARM64G_CC_OP_SUB64)) { + /* CS after SUB --> test argL >=u argR + --> test argR <=u argL */ + return unop(Iop_1Uto64, + binop(Iop_CmpLE64U, cc_dep2, cc_dep1)); + } + if (isU64(cond_n_op, (ARM64CondCC << 4) | ARM64G_CC_OP_SUB64)) { + /* CC after SUB --> test argL test argL <=u argR */ + return unop(Iop_1Uto64, + binop(Iop_CmpLE64U, cc_dep1, cc_dep2)); + } + if (isU64(cond_n_op, (ARM64CondHI << 4) | ARM64G_CC_OP_SUB64)) { + /* HI after SUB --> test argL >u argR + --> test argR test argL test argL >=s argR + --> test argR <=s argL */ + return unop(Iop_1Uto64, + binop(Iop_CmpLE64S, cc_dep2, cc_dep1)); + } + + /* 12, 13 */ + if (isU64(cond_n_op, (ARM64CondGT << 4) | ARM64G_CC_OP_SUB64)) { + /* GT after SUB --> test argL >s argR + --> test argR test argL <=s argR */ + return unop(Iop_1Uto64, + binop(Iop_CmpLE64S, cc_dep1, cc_dep2)); + } + + /*---------------- SUB32 ----------------*/ + + /* 0, 1 */ + if (isU64(cond_n_op, (ARM64CondEQ << 4) | ARM64G_CC_OP_SUB32)) { + /* EQ after SUB --> test argL == argR */ + return unop(Iop_1Uto64, + binop(Iop_CmpEQ32, unop(Iop_64to32, cc_dep1), + unop(Iop_64to32, cc_dep2))); + } + if (isU64(cond_n_op, (ARM64CondNE << 4) | ARM64G_CC_OP_SUB32)) { + /* NE after SUB --> test argL != argR */ + return unop(Iop_1Uto64, + binop(Iop_CmpNE32, unop(Iop_64to32, cc_dep1), + unop(Iop_64to32, cc_dep2))); + } + + /* 2, 3 */ + if (isU64(cond_n_op, (ARM64CondCS << 4) | ARM64G_CC_OP_SUB32)) { + /* CS after SUB --> test argL >=u argR + --> test argR <=u argL */ + return unop(Iop_1Uto64, + binop(Iop_CmpLE32U, unop(Iop_64to32, cc_dep2), + unop(Iop_64to32, cc_dep1))); + } + if (isU64(cond_n_op, (ARM64CondCC << 4) | ARM64G_CC_OP_SUB32)) { + /* CC after SUB --> test argL 
test argL <=u argR */ + return unop(Iop_1Uto64, + binop(Iop_CmpLE32U, unop(Iop_64to32, cc_dep1), + unop(Iop_64to32, cc_dep2))); + } + if (isU64(cond_n_op, (ARM64CondHI << 4) | ARM64G_CC_OP_SUB32)) { + /* HI after SUB --> test argL >u argR + --> test argR test argL test argL >=s argR + --> test argR <=s argL */ + return unop(Iop_1Uto64, + binop(Iop_CmpLE32S, unop(Iop_64to32, cc_dep2), + unop(Iop_64to32, cc_dep1))); + } + + /* 12, 13 */ + if (isU64(cond_n_op, (ARM64CondGT << 4) | ARM64G_CC_OP_SUB32)) { + /* GT after SUB --> test argL >s argR + --> test argR test argL <=s argR */ + return unop(Iop_1Uto64, + binop(Iop_CmpLE32S, unop(Iop_64to32, cc_dep1), + unop(Iop_64to32, cc_dep2))); + } + +//ZZ /*---------------- SBB ----------------*/ +//ZZ +//ZZ if (isU32(cond_n_op, (ARMCondHS << 4) | ARMG_CC_OP_SBB)) { +//ZZ /* This seems to happen a lot in softfloat code, eg __divdf3+140 */ +//ZZ /* thunk is: (dep1=argL, dep2=argR, ndep=oldC) */ +//ZZ /* HS after SBB (same as C after SBB below) +//ZZ --> oldC ? (argL >=u argR) : (argL >u argR) +//ZZ --> oldC ? (argR <=u argL) : (argR test res == 0 */ +//ZZ return unop(Iop_1Uto32, +//ZZ binop(Iop_CmpEQ32, cc_dep1, mkU32(0))); +//ZZ } +//ZZ if (isU32(cond_n_op, (ARMCondNE << 4) | ARMG_CC_OP_LOGIC)) { +//ZZ /* NE after LOGIC --> test res != 0 */ +//ZZ return unop(Iop_1Uto32, +//ZZ binop(Iop_CmpNE32, cc_dep1, mkU32(0))); +//ZZ } +//ZZ +//ZZ if (isU32(cond_n_op, (ARMCondPL << 4) | ARMG_CC_OP_LOGIC)) { +//ZZ /* PL after LOGIC --> test (res >> 31) == 0 */ +//ZZ return unop(Iop_1Uto32, +//ZZ binop(Iop_CmpEQ32, +//ZZ binop(Iop_Shr32, cc_dep1, mkU8(31)), +//ZZ mkU32(0))); +//ZZ } +//ZZ if (isU32(cond_n_op, (ARMCondMI << 4) | ARMG_CC_OP_LOGIC)) { +//ZZ /* MI after LOGIC --> test (res >> 31) == 1 */ +//ZZ return unop(Iop_1Uto32, +//ZZ binop(Iop_CmpEQ32, +//ZZ binop(Iop_Shr32, cc_dep1, mkU8(31)), +//ZZ mkU32(1))); +//ZZ } + + /*---------------- COPY ----------------*/ + + if (isU64(cond_n_op, (ARM64CondEQ << 4) | ARM64G_CC_OP_COPY)) { + /* EQ after COPY --> (cc_dep1 >> ARM64G_CC_SHIFT_Z) & 1 */ + return binop(Iop_And64, + binop(Iop_Shr64, cc_dep1, + mkU8(ARM64G_CC_SHIFT_Z)), + mkU64(1)); + } + if (isU64(cond_n_op, (ARM64CondNE << 4) | ARM64G_CC_OP_COPY)) { + /* NE after COPY --> ((cc_dep1 >> ARM64G_CC_SHIFT_Z) ^ 1) & 1 */ + return binop(Iop_And64, + binop(Iop_Xor64, + binop(Iop_Shr64, cc_dep1, + mkU8(ARM64G_CC_SHIFT_Z)), + mkU64(1)), + mkU64(1)); + } + +//ZZ /*----------------- AL -----------------*/ +//ZZ +//ZZ /* A critically important case for Thumb code. +//ZZ +//ZZ What we're trying to spot is the case where cond_n_op is an +//ZZ expression of the form Or32(..., 0xE0) since that means the +//ZZ caller is asking for CondAL and we can simply return 1 +//ZZ without caring what the ... part is. This is a potentially +//ZZ dodgy kludge in that it assumes that the ... part has zeroes +//ZZ in bits 7:4, so that the result of the Or32 is guaranteed to +//ZZ be 0xE in bits 7:4. Given that the places where this first +//ZZ arg are constructed (in guest_arm_toIR.c) are very +//ZZ constrained, we can get away with this. To make this +//ZZ guaranteed safe would require to have a new primop, Slice44 +//ZZ or some such, thusly +//ZZ +//ZZ Slice44(arg1, arg2) = 0--(24)--0 arg1[7:4] arg2[3:0] +//ZZ +//ZZ and we would then look for Slice44(0xE0, ...) +//ZZ which would give the required safety property. 
+//ZZ +//ZZ It would be infeasibly expensive to scan backwards through +//ZZ the entire block looking for an assignment to the temp, so +//ZZ just look at the previous 16 statements. That should find it +//ZZ if it is an interesting case, as a result of how the +//ZZ boilerplate guff at the start of each Thumb insn translation +//ZZ is made. +//ZZ */ +//ZZ if (cond_n_op->tag == Iex_RdTmp) { +//ZZ Int j; +//ZZ IRTemp look_for = cond_n_op->Iex.RdTmp.tmp; +//ZZ Int limit = n_precedingStmts - 16; +//ZZ if (limit < 0) limit = 0; +//ZZ if (0) vex_printf("scanning %d .. %d\n", n_precedingStmts-1, limit); +//ZZ for (j = n_precedingStmts - 1; j >= limit; j--) { +//ZZ IRStmt* st = precedingStmts[j]; +//ZZ if (st->tag == Ist_WrTmp +//ZZ && st->Ist.WrTmp.tmp == look_for +//ZZ && st->Ist.WrTmp.data->tag == Iex_Binop +//ZZ && st->Ist.WrTmp.data->Iex.Binop.op == Iop_Or32 +//ZZ && isU32(st->Ist.WrTmp.data->Iex.Binop.arg2, (ARMCondAL << 4))) +//ZZ return mkU32(1); +//ZZ } +//ZZ /* Didn't find any useful binding to the first arg +//ZZ in the previous 16 stmts. */ +//ZZ } + } + +//ZZ /* --------- specialising "armg_calculate_flag_c" --------- */ +//ZZ +//ZZ else +//ZZ if (vex_streq(function_name, "armg_calculate_flag_c")) { +//ZZ +//ZZ /* specialise calls to the "armg_calculate_flag_c" function. +//ZZ Note that the returned value must be either 0 or 1; nonzero +//ZZ bits 31:1 are not allowed. In turn, incoming oldV and oldC +//ZZ values (from the thunk) are assumed to have bits 31:1 +//ZZ clear. */ +//ZZ IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep; +//ZZ vassert(arity == 4); +//ZZ cc_op = args[0]; /* ARMG_CC_OP_* */ +//ZZ cc_dep1 = args[1]; +//ZZ cc_dep2 = args[2]; +//ZZ cc_ndep = args[3]; +//ZZ +//ZZ if (isU32(cc_op, ARMG_CC_OP_LOGIC)) { +//ZZ /* Thunk args are (result, shco, oldV) */ +//ZZ /* C after LOGIC --> shco */ +//ZZ return cc_dep2; +//ZZ } +//ZZ +//ZZ if (isU32(cc_op, ARMG_CC_OP_SUB)) { +//ZZ /* Thunk args are (argL, argR, unused) */ +//ZZ /* C after SUB --> argL >=u argR +//ZZ --> argR <=u argL */ +//ZZ return unop(Iop_1Uto32, +//ZZ binop(Iop_CmpLE32U, cc_dep2, cc_dep1)); +//ZZ } +//ZZ +//ZZ if (isU32(cc_op, ARMG_CC_OP_SBB)) { +//ZZ /* This happens occasionally in softfloat code, eg __divdf3+140 */ +//ZZ /* thunk is: (dep1=argL, dep2=argR, ndep=oldC) */ +//ZZ /* C after SBB (same as HS after SBB above) +//ZZ --> oldC ? (argL >=u argR) : (argL >u argR) +//ZZ --> oldC ? 
(argR <=u argL) : (argR oldV */ +//ZZ return cc_ndep; +//ZZ } +//ZZ +//ZZ if (isU32(cc_op, ARMG_CC_OP_SUB)) { +//ZZ /* Thunk args are (argL, argR, unused) */ +//ZZ /* V after SUB +//ZZ --> let res = argL - argR +//ZZ in ((argL ^ argR) & (argL ^ res)) >> 31 +//ZZ --> ((argL ^ argR) & (argL ^ (argL - argR))) >> 31 +//ZZ */ +//ZZ IRExpr* argL = cc_dep1; +//ZZ IRExpr* argR = cc_dep2; +//ZZ return +//ZZ binop(Iop_Shr32, +//ZZ binop(Iop_And32, +//ZZ binop(Iop_Xor32, argL, argR), +//ZZ binop(Iop_Xor32, argL, binop(Iop_Sub32, argL, argR)) +//ZZ ), +//ZZ mkU8(31) +//ZZ ); +//ZZ } +//ZZ +//ZZ if (isU32(cc_op, ARMG_CC_OP_SBB)) { +//ZZ /* This happens occasionally in softfloat code, eg __divdf3+140 */ +//ZZ /* thunk is: (dep1=argL, dep2=argR, ndep=oldC) */ +//ZZ /* V after SBB +//ZZ --> let res = argL - argR - (oldC ^ 1) +//ZZ in (argL ^ argR) & (argL ^ res) & 1 +//ZZ */ +//ZZ return +//ZZ binop( +//ZZ Iop_And32, +//ZZ binop( +//ZZ Iop_And32, +//ZZ // argL ^ argR +//ZZ binop(Iop_Xor32, cc_dep1, cc_dep2), +//ZZ // argL ^ (argL - argR - (oldC ^ 1)) +//ZZ binop(Iop_Xor32, +//ZZ cc_dep1, +//ZZ binop(Iop_Sub32, +//ZZ binop(Iop_Sub32, cc_dep1, cc_dep2), +//ZZ binop(Iop_Xor32, cc_ndep, mkU32(1))) +//ZZ ) +//ZZ ), +//ZZ mkU32(1) +//ZZ ); +//ZZ } +//ZZ +//ZZ } + +# undef unop +# undef binop +# undef mkU64 +# undef mkU8 + + return NULL; +} + + +/*----------------------------------------------*/ +/*--- The exported fns .. ---*/ +/*----------------------------------------------*/ + +//ZZ /* VISIBLE TO LIBVEX CLIENT */ +//ZZ #if 0 +//ZZ void LibVEX_GuestARM_put_flags ( UInt flags_native, +//ZZ /*OUT*/VexGuestARMState* vex_state ) +//ZZ { +//ZZ vassert(0); // FIXME +//ZZ +//ZZ /* Mask out everything except N Z V C. */ +//ZZ flags_native +//ZZ &= (ARMG_CC_MASK_N | ARMG_CC_MASK_Z | ARMG_CC_MASK_V | ARMG_CC_MASK_C); +//ZZ +//ZZ vex_state->guest_CC_OP = ARMG_CC_OP_COPY; +//ZZ vex_state->guest_CC_DEP1 = flags_native; +//ZZ vex_state->guest_CC_DEP2 = 0; +//ZZ vex_state->guest_CC_NDEP = 0; +//ZZ } +//ZZ #endif + +/* VISIBLE TO LIBVEX CLIENT */ +ULong LibVEX_GuestARM64_get_nzcv ( /*IN*/const VexGuestARM64State* vex_state ) +{ + ULong nzcv = 0; + // NZCV + nzcv |= arm64g_calculate_flags_nzcv( + vex_state->guest_CC_OP, + vex_state->guest_CC_DEP1, + vex_state->guest_CC_DEP2, + vex_state->guest_CC_NDEP + ); + vassert(0 == (nzcv & 0xFFFFFFFF0FFFFFFFULL)); +//ZZ // Q +//ZZ if (vex_state->guest_QFLAG32 > 0) +//ZZ cpsr |= (1 << 27); +//ZZ // GE +//ZZ if (vex_state->guest_GEFLAG0 > 0) +//ZZ cpsr |= (1 << 16); +//ZZ if (vex_state->guest_GEFLAG1 > 0) +//ZZ cpsr |= (1 << 17); +//ZZ if (vex_state->guest_GEFLAG2 > 0) +//ZZ cpsr |= (1 << 18); +//ZZ if (vex_state->guest_GEFLAG3 > 0) +//ZZ cpsr |= (1 << 19); +//ZZ // M +//ZZ cpsr |= (1 << 4); // 0b10000 means user-mode +//ZZ // J,T J (bit 24) is zero by initialisation above +//ZZ // T we copy from R15T[0] +//ZZ if (vex_state->guest_R15T & 1) +//ZZ cpsr |= (1 << 5); +//ZZ // ITSTATE we punt on for the time being. Could compute it +//ZZ // if needed though. +//ZZ // E, endianness, 0 (littleendian) from initialisation above +//ZZ // A,I,F disable some async exceptions. Not sure about these. +//ZZ // Leave as zero for the time being. 
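+   /* Note on the vassert above: the mask 0xFFFFFFFF0FFFFFFF has zero
+      bits only at positions 31:28, so the check insists that the
+      helper delivers N,Z,C,V in bits 31:28 and nothing anywhere
+      else. */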
+ return nzcv; +} + +/* VISIBLE TO LIBVEX CLIENT */ +void LibVEX_GuestARM64_initialise ( /*OUT*/VexGuestARM64State* vex_state ) +{ + vex_bzero(vex_state, sizeof(*vex_state)); +//ZZ vex_state->host_EvC_FAILADDR = 0; +//ZZ vex_state->host_EvC_COUNTER = 0; +//ZZ +//ZZ vex_state->guest_R0 = 0; +//ZZ vex_state->guest_R1 = 0; +//ZZ vex_state->guest_R2 = 0; +//ZZ vex_state->guest_R3 = 0; +//ZZ vex_state->guest_R4 = 0; +//ZZ vex_state->guest_R5 = 0; +//ZZ vex_state->guest_R6 = 0; +//ZZ vex_state->guest_R7 = 0; +//ZZ vex_state->guest_R8 = 0; +//ZZ vex_state->guest_R9 = 0; +//ZZ vex_state->guest_R10 = 0; +//ZZ vex_state->guest_R11 = 0; +//ZZ vex_state->guest_R12 = 0; +//ZZ vex_state->guest_R13 = 0; +//ZZ vex_state->guest_R14 = 0; +//ZZ vex_state->guest_R15T = 0; /* NB: implies ARM mode */ +//ZZ + vex_state->guest_CC_OP = ARM64G_CC_OP_COPY; +//ZZ vex_state->guest_CC_DEP1 = 0; +//ZZ vex_state->guest_CC_DEP2 = 0; +//ZZ vex_state->guest_CC_NDEP = 0; +//ZZ vex_state->guest_QFLAG32 = 0; +//ZZ vex_state->guest_GEFLAG0 = 0; +//ZZ vex_state->guest_GEFLAG1 = 0; +//ZZ vex_state->guest_GEFLAG2 = 0; +//ZZ vex_state->guest_GEFLAG3 = 0; +//ZZ +//ZZ vex_state->guest_EMNOTE = EmNote_NONE; +//ZZ vex_state->guest_CMSTART = 0; +//ZZ vex_state->guest_CMLEN = 0; +//ZZ vex_state->guest_NRADDR = 0; +//ZZ vex_state->guest_IP_AT_SYSCALL = 0; +//ZZ +//ZZ vex_state->guest_D0 = 0; +//ZZ vex_state->guest_D1 = 0; +//ZZ vex_state->guest_D2 = 0; +//ZZ vex_state->guest_D3 = 0; +//ZZ vex_state->guest_D4 = 0; +//ZZ vex_state->guest_D5 = 0; +//ZZ vex_state->guest_D6 = 0; +//ZZ vex_state->guest_D7 = 0; +//ZZ vex_state->guest_D8 = 0; +//ZZ vex_state->guest_D9 = 0; +//ZZ vex_state->guest_D10 = 0; +//ZZ vex_state->guest_D11 = 0; +//ZZ vex_state->guest_D12 = 0; +//ZZ vex_state->guest_D13 = 0; +//ZZ vex_state->guest_D14 = 0; +//ZZ vex_state->guest_D15 = 0; +//ZZ vex_state->guest_D16 = 0; +//ZZ vex_state->guest_D17 = 0; +//ZZ vex_state->guest_D18 = 0; +//ZZ vex_state->guest_D19 = 0; +//ZZ vex_state->guest_D20 = 0; +//ZZ vex_state->guest_D21 = 0; +//ZZ vex_state->guest_D22 = 0; +//ZZ vex_state->guest_D23 = 0; +//ZZ vex_state->guest_D24 = 0; +//ZZ vex_state->guest_D25 = 0; +//ZZ vex_state->guest_D26 = 0; +//ZZ vex_state->guest_D27 = 0; +//ZZ vex_state->guest_D28 = 0; +//ZZ vex_state->guest_D29 = 0; +//ZZ vex_state->guest_D30 = 0; +//ZZ vex_state->guest_D31 = 0; +//ZZ +//ZZ /* ARM encoded; zero is the default as it happens (result flags +//ZZ (NZCV) cleared, FZ disabled, round to nearest, non-vector mode, +//ZZ all exns masked, all exn sticky bits cleared). */ +//ZZ vex_state->guest_FPSCR = 0; +//ZZ +//ZZ vex_state->guest_TPIDRURO = 0; +//ZZ +//ZZ /* Not in a Thumb IT block. */ +//ZZ vex_state->guest_ITSTATE = 0; +//ZZ +//ZZ vex_state->padding1 = 0; +//ZZ vex_state->padding2 = 0; +//ZZ vex_state->padding3 = 0; +//ZZ vex_state->padding4 = 0; +//ZZ vex_state->padding5 = 0; +} + + +/*-----------------------------------------------------------*/ +/*--- Describing the arm guest state, for the benefit ---*/ +/*--- of iropt and instrumenters. ---*/ +/*-----------------------------------------------------------*/ + +/* Figure out if any part of the guest state contained in minoff + .. maxoff requires precise memory exceptions. If in doubt return + True (but this generates significantly slower code). + + We enforce precise exns for guest SP, PC, 29(FP), 30(LR). + That might be overkill (for 29 and 30); I don't know. 
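+
+   As a worked illustration (the offsets here are made up purely for
+   the example): if guest_XSP were at offset 256, the watched byte
+   range would be [256 .. 263]; a Put covering [264 .. 271] satisfies
+   "minoff > xsp_max", so by itself it does not force precise
+   exceptions, whereas a Put touching any of those eight bytes does.
+   PC, X29 and X30 are tested with the same interval check.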
+*/ +Bool guest_arm64_state_requires_precise_mem_exns ( Int minoff, + Int maxoff) +{ + Int xsp_min = offsetof(VexGuestARM64State, guest_XSP); + Int xsp_max = xsp_min + 8 - 1; + Int pc_min = offsetof(VexGuestARM64State, guest_PC); + Int pc_max = pc_min + 8 - 1; + + if (maxoff < xsp_min || minoff > xsp_max) { + /* no overlap with xsp */ + if (vex_control.iropt_register_updates == VexRegUpdSpAtMemAccess) + return False; // We only need to check stack pointer. + } else { + return True; + } + + if (maxoff < pc_min || minoff > pc_max) { + /* no overlap with pc */ + } else { + return True; + } + + /* Guessing that we need PX for FP, but I don't really know. */ + Int x29_min = offsetof(VexGuestARM64State, guest_X29); + Int x29_max = x29_min + 8 - 1; + + if (maxoff < x29_min || minoff > x29_max) { + /* no overlap with x29 */ + } else { + return True; + } + + /* Guessing that we need PX for LR, but I don't really know. */ + Int x30_min = offsetof(VexGuestARM64State, guest_X30); + Int x30_max = x30_min + 8 - 1; + + if (maxoff < x30_min || minoff > x30_max) { + /* no overlap with r30 */ + } else { + return True; + } + + return False; +} + + +#define ALWAYSDEFD(field) \ + { offsetof(VexGuestARM64State, field), \ + (sizeof ((VexGuestARM64State*)0)->field) } +VexGuestLayout + arm64Guest_layout + = { + /* Total size of the guest state, in bytes. */ + .total_sizeB = sizeof(VexGuestARM64State), + + /* Describe the stack pointer. */ + .offset_SP = offsetof(VexGuestARM64State,guest_XSP), + .sizeof_SP = 8, + + /* Describe the instruction pointer. */ + .offset_IP = offsetof(VexGuestARM64State,guest_PC), + .sizeof_IP = 8, + + /* Describe any sections to be regarded by Memcheck as + 'always-defined'. */ + .n_alwaysDefd = 10, + + /* flags thunk: OP is always defd, whereas DEP1 and DEP2 + have to be tracked. See detailed comment in gdefs.h on + meaning of thunk fields. */ + .alwaysDefd + = { /* 0 */ ALWAYSDEFD(guest_PC), + /* 1 */ ALWAYSDEFD(guest_CC_OP), + /* 2 */ ALWAYSDEFD(guest_CC_NDEP), + /* 3 */ ALWAYSDEFD(guest_EMNOTE), + /* 4 */ ALWAYSDEFD(guest_CMSTART), + /* 5 */ ALWAYSDEFD(guest_CMLEN), + /* 6 */ ALWAYSDEFD(guest_NRADDR), + /* 7 */ ALWAYSDEFD(guest_IP_AT_SYSCALL), + /* 8 */ ALWAYSDEFD(guest_FPCR), + /* 9 */ ALWAYSDEFD(guest_FPSR) + } + }; + + +/*---------------------------------------------------------------*/ +/*--- end guest_arm64_helpers.c ---*/ +/*---------------------------------------------------------------*/ Index: priv/guest_arm64_toIR.c =================================================================== --- priv/guest_arm64_toIR.c (.../tags/VEX_3_9_0) (revision 0) +++ priv/guest_arm64_toIR.c (.../trunk) (revision 2863) @@ -0,0 +1,7856 @@ +/* -*- mode: C; c-basic-offset: 3; -*- */ + +/*--------------------------------------------------------------------*/ +/*--- begin guest_arm64_toIR.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2013-2013 OpenWorks + info@open-works.net + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + + The GNU General Public License is contained in the file COPYING. +*/ + +//ZZ /* XXXX thumb to check: +//ZZ that all cases where putIRegT writes r15, we generate a jump. +//ZZ +//ZZ All uses of newTemp assign to an IRTemp and not a UInt +//ZZ +//ZZ For all thumb loads and stores, including VFP ones, new-ITSTATE is +//ZZ backed out before the memory op, and restored afterwards. This +//ZZ needs to happen even after we go uncond. (and for sure it doesn't +//ZZ happen for VFP loads/stores right now). +//ZZ +//ZZ VFP on thumb: check that we exclude all r13/r15 cases that we +//ZZ should. +//ZZ +//ZZ XXXX thumb to do: improve the ITSTATE-zeroing optimisation by +//ZZ taking into account the number of insns guarded by an IT. +//ZZ +//ZZ remove the nasty hack, in the spechelper, of looking for Or32(..., +//ZZ 0xE0) in as the first arg to armg_calculate_condition, and instead +//ZZ use Slice44 as specified in comments in the spechelper. +//ZZ +//ZZ add specialisations for armg_calculate_flag_c and _v, as they +//ZZ are moderately often needed in Thumb code. +//ZZ +//ZZ Correctness: ITSTATE handling in Thumb SVCs is wrong. +//ZZ +//ZZ Correctness (obscure): in m_transtab, when invalidating code +//ZZ address ranges, invalidate up to 18 bytes after the end of the +//ZZ range. This is because the ITSTATE optimisation at the top of +//ZZ _THUMB_WRK below analyses up to 18 bytes before the start of any +//ZZ given instruction, and so might depend on the invalidated area. +//ZZ */ +//ZZ +//ZZ /* Limitations, etc +//ZZ +//ZZ - pretty dodgy exception semantics for {LD,ST}Mxx and {LD,ST}RD. +//ZZ These instructions are non-restartable in the case where the +//ZZ transfer(s) fault. +//ZZ +//ZZ - SWP: the restart jump back is Ijk_Boring; it should be +//ZZ Ijk_NoRedir but that's expensive. See comments on casLE() in +//ZZ guest_x86_toIR.c. +//ZZ */ + +/* "Special" instructions. + + This instruction decoder can decode four special instructions + which mean nothing natively (are no-ops as far as regs/mem are + concerned) but have meaning for supporting Valgrind. A special + instruction is flagged by a 16-byte preamble: + + 93CC0D8C 93CC358C 93CCCD8C 93CCF58C + (ror x12, x12, #3; ror x12, x12, #13 + ror x12, x12, #51; ror x12, x12, #61) + + Following that, one of the following 3 are allowed + (standard interpretation in parentheses): + + AA0A014A (orr x10,x10,x10) X3 = client_request ( X4 ) + AA0B016B (orr x11,x11,x11) X3 = guest_NRADDR + AA0C018C (orr x12,x12,x12) branch-and-link-to-noredir X8 + AA090129 (orr x9,x9,x9) IR injection + + Any other bytes following the 16-byte preamble are illegal and + constitute a failure in instruction decoding. This all assumes + that the preamble will never occur except in specific code + fragments designed for Valgrind to catch. +*/ + +/* Translates ARM64 code to IR. 
*/ + +#include "libvex_basictypes.h" +#include "libvex_ir.h" +#include "libvex.h" +#include "libvex_guest_arm64.h" + +#include "main_util.h" +#include "main_globals.h" +#include "guest_generic_bb_to_IR.h" +#include "guest_arm64_defs.h" + + +/*------------------------------------------------------------*/ +/*--- Globals ---*/ +/*------------------------------------------------------------*/ + +/* These are set at the start of the translation of a instruction, so + that we don't have to pass them around endlessly. CONST means does + not change during translation of the instruction. +*/ + +/* CONST: is the host bigendian? We need to know this in order to do + sub-register accesses to the SIMD/FP registers correctly. */ +static Bool host_is_bigendian; + +/* CONST: The guest address for the instruction currently being + translated. */ +static Addr64 guest_PC_curr_instr; + +/* MOD: The IRSB* into which we're generating code. */ +static IRSB* irsb; + + +/*------------------------------------------------------------*/ +/*--- Debugging output ---*/ +/*------------------------------------------------------------*/ + +#define DIP(format, args...) \ + if (vex_traceflags & VEX_TRACE_FE) \ + vex_printf(format, ## args) + +#define DIS(buf, format, args...) \ + if (vex_traceflags & VEX_TRACE_FE) \ + vex_sprintf(buf, format, ## args) + + +/*------------------------------------------------------------*/ +/*--- Helper bits and pieces for deconstructing the ---*/ +/*--- arm insn stream. ---*/ +/*------------------------------------------------------------*/ + +/* Do a little-endian load of a 32-bit word, regardless of the + endianness of the underlying host. */ +static inline UInt getUIntLittleEndianly ( UChar* p ) +{ + UInt w = 0; + w = (w << 8) | p[3]; + w = (w << 8) | p[2]; + w = (w << 8) | p[1]; + w = (w << 8) | p[0]; + return w; +} + +/* Sign extend a N-bit value up to 64 bits, by copying + bit N-1 into all higher positions. */ +static ULong sx_to_64 ( ULong x, UInt n ) +{ + vassert(n > 1 && n < 64); + Long r = (Long)x; + r = (r << (64-n)) >> (64-n); + return (ULong)r; +} + +//ZZ /* Do a little-endian load of a 16-bit word, regardless of the +//ZZ endianness of the underlying host. 
*/ +//ZZ static inline UShort getUShortLittleEndianly ( UChar* p ) +//ZZ { +//ZZ UShort w = 0; +//ZZ w = (w << 8) | p[1]; +//ZZ w = (w << 8) | p[0]; +//ZZ return w; +//ZZ } +//ZZ +//ZZ static UInt ROR32 ( UInt x, UInt sh ) { +//ZZ vassert(sh >= 0 && sh < 32); +//ZZ if (sh == 0) +//ZZ return x; +//ZZ else +//ZZ return (x << (32-sh)) | (x >> sh); +//ZZ } +//ZZ +//ZZ static Int popcount32 ( UInt x ) +//ZZ { +//ZZ Int res = 0, i; +//ZZ for (i = 0; i < 32; i++) { +//ZZ res += (x & 1); +//ZZ x >>= 1; +//ZZ } +//ZZ return res; +//ZZ } +//ZZ +//ZZ static UInt setbit32 ( UInt x, Int ix, UInt b ) +//ZZ { +//ZZ UInt mask = 1 << ix; +//ZZ x &= ~mask; +//ZZ x |= ((b << ix) & mask); +//ZZ return x; +//ZZ } + +#define BITS2(_b1,_b0) \ + (((_b1) << 1) | (_b0)) + +#define BITS3(_b2,_b1,_b0) \ + (((_b2) << 2) | ((_b1) << 1) | (_b0)) + +#define BITS4(_b3,_b2,_b1,_b0) \ + (((_b3) << 3) | ((_b2) << 2) | ((_b1) << 1) | (_b0)) + +#define BITS8(_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \ + ((BITS4((_b7),(_b6),(_b5),(_b4)) << 4) \ + | BITS4((_b3),(_b2),(_b1),(_b0))) + +#define BITS5(_b4,_b3,_b2,_b1,_b0) \ + (BITS8(0,0,0,(_b4),(_b3),(_b2),(_b1),(_b0))) +#define BITS6(_b5,_b4,_b3,_b2,_b1,_b0) \ + (BITS8(0,0,(_b5),(_b4),(_b3),(_b2),(_b1),(_b0))) +#define BITS7(_b6,_b5,_b4,_b3,_b2,_b1,_b0) \ + (BITS8(0,(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0))) + +#define BITS9(_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \ + (((_b8) << 8) \ + | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0))) + +#define BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \ + (((_b9) << 9) | ((_b8) << 8) \ + | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0))) + +#define BITS11(_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \ + (((_b10) << 10) \ + | BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0)) + +#define BITS12(_b11, _b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \ + (((_b11) << 11) \ + | BITS11(_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0)) + +// produces _uint[_bMax:_bMin] +#define SLICE_UInt(_uint,_bMax,_bMin) \ + (( ((UInt)(_uint)) >> (_bMin)) \ + & (UInt)((1ULL << ((_bMax) - (_bMin) + 1)) - 1ULL)) + + +/*------------------------------------------------------------*/ +/*--- Helper bits and pieces for creating IR fragments. ---*/ +/*------------------------------------------------------------*/ + +static IRExpr* mkV128 ( UShort w ) +{ + return IRExpr_Const(IRConst_V128(w)); +} + +static IRExpr* mkU64 ( ULong i ) +{ + return IRExpr_Const(IRConst_U64(i)); +} + +static IRExpr* mkU32 ( UInt i ) +{ + return IRExpr_Const(IRConst_U32(i)); +} + +static IRExpr* mkU8 ( UInt i ) +{ + vassert(i < 256); + return IRExpr_Const(IRConst_U8( (UChar)i )); +} + +static IRExpr* mkexpr ( IRTemp tmp ) +{ + return IRExpr_RdTmp(tmp); +} + +static IRExpr* unop ( IROp op, IRExpr* a ) +{ + return IRExpr_Unop(op, a); +} + +static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 ) +{ + return IRExpr_Binop(op, a1, a2); +} + +static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 ) +{ + return IRExpr_Triop(op, a1, a2, a3); +} + +static IRExpr* loadLE ( IRType ty, IRExpr* addr ) +{ + return IRExpr_Load(Iend_LE, ty, addr); +} + +/* Add a statement to the list held by "irbb". 
*/ +static void stmt ( IRStmt* st ) +{ + addStmtToIRSB( irsb, st ); +} + +static void assign ( IRTemp dst, IRExpr* e ) +{ + stmt( IRStmt_WrTmp(dst, e) ); +} + +static void storeLE ( IRExpr* addr, IRExpr* data ) +{ + stmt( IRStmt_Store(Iend_LE, addr, data) ); +} + +//ZZ static void storeGuardedLE ( IRExpr* addr, IRExpr* data, IRTemp guardT ) +//ZZ { +//ZZ if (guardT == IRTemp_INVALID) { +//ZZ /* unconditional */ +//ZZ storeLE(addr, data); +//ZZ } else { +//ZZ stmt( IRStmt_StoreG(Iend_LE, addr, data, +//ZZ binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0))) ); +//ZZ } +//ZZ } +//ZZ +//ZZ static void loadGuardedLE ( IRTemp dst, IRLoadGOp cvt, +//ZZ IRExpr* addr, IRExpr* alt, +//ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ ) +//ZZ { +//ZZ if (guardT == IRTemp_INVALID) { +//ZZ /* unconditional */ +//ZZ IRExpr* loaded = NULL; +//ZZ switch (cvt) { +//ZZ case ILGop_Ident32: +//ZZ loaded = loadLE(Ity_I32, addr); break; +//ZZ case ILGop_8Uto32: +//ZZ loaded = unop(Iop_8Uto32, loadLE(Ity_I8, addr)); break; +//ZZ case ILGop_8Sto32: +//ZZ loaded = unop(Iop_8Sto32, loadLE(Ity_I8, addr)); break; +//ZZ case ILGop_16Uto32: +//ZZ loaded = unop(Iop_16Uto32, loadLE(Ity_I16, addr)); break; +//ZZ case ILGop_16Sto32: +//ZZ loaded = unop(Iop_16Sto32, loadLE(Ity_I16, addr)); break; +//ZZ default: +//ZZ vassert(0); +//ZZ } +//ZZ vassert(loaded != NULL); +//ZZ assign(dst, loaded); +//ZZ } else { +//ZZ /* Generate a guarded load into 'dst', but apply 'cvt' to the +//ZZ loaded data before putting the data in 'dst'. If the load +//ZZ does not take place, 'alt' is placed directly in 'dst'. */ +//ZZ stmt( IRStmt_LoadG(Iend_LE, cvt, dst, addr, alt, +//ZZ binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0))) ); +//ZZ } +//ZZ } + +/* Generate a new temporary of the given type. */ +static IRTemp newTemp ( IRType ty ) +{ + vassert(isPlausibleIRType(ty)); + return newIRTemp( irsb->tyenv, ty ); +} + +//ZZ /* Produces a value in 0 .. 3, which is encoded as per the type +//ZZ IRRoundingMode. */ +//ZZ static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void ) +//ZZ { +//ZZ return mkU32(Irrm_NEAREST); +//ZZ } +//ZZ +//ZZ /* Generate an expression for SRC rotated right by ROT. */ +//ZZ static IRExpr* genROR32( IRTemp src, Int rot ) +//ZZ { +//ZZ vassert(rot >= 0 && rot < 32); +//ZZ if (rot == 0) +//ZZ return mkexpr(src); +//ZZ return +//ZZ binop(Iop_Or32, +//ZZ binop(Iop_Shl32, mkexpr(src), mkU8(32 - rot)), +//ZZ binop(Iop_Shr32, mkexpr(src), mkU8(rot))); +//ZZ } +//ZZ +//ZZ static IRExpr* mkU128 ( ULong i ) +//ZZ { +//ZZ return binop(Iop_64HLtoV128, mkU64(i), mkU64(i)); +//ZZ } +//ZZ +//ZZ /* Generate a 4-aligned version of the given expression if +//ZZ the given condition is true. Else return it unchanged. */ +//ZZ static IRExpr* align4if ( IRExpr* e, Bool b ) +//ZZ { +//ZZ if (b) +//ZZ return binop(Iop_And32, e, mkU32(~3)); +//ZZ else +//ZZ return e; +//ZZ } + +/* Other IR construction helpers. 
*/
+static IROp mkAND ( IRType ty ) {
+   switch (ty) {
+      case Ity_I32: return Iop_And32;
+      case Ity_I64: return Iop_And64;
+      default: vpanic("mkAND");
+   }
+}
+
+static IROp mkOR ( IRType ty ) {
+   switch (ty) {
+      case Ity_I32: return Iop_Or32;
+      case Ity_I64: return Iop_Or64;
+      default: vpanic("mkOR");
+   }
+}
+
+static IROp mkXOR ( IRType ty ) {
+   switch (ty) {
+      case Ity_I32: return Iop_Xor32;
+      case Ity_I64: return Iop_Xor64;
+      default: vpanic("mkXOR");
+   }
+}
+
+static IROp mkSHL ( IRType ty ) {
+   switch (ty) {
+      case Ity_I32: return Iop_Shl32;
+      case Ity_I64: return Iop_Shl64;
+      default: vpanic("mkSHL");
+   }
+}
+
+static IROp mkSHR ( IRType ty ) {
+   switch (ty) {
+      case Ity_I32: return Iop_Shr32;
+      case Ity_I64: return Iop_Shr64;
+      default: vpanic("mkSHR");
+   }
+}
+
+static IROp mkSAR ( IRType ty ) {
+   switch (ty) {
+      case Ity_I32: return Iop_Sar32;
+      case Ity_I64: return Iop_Sar64;
+      default: vpanic("mkSAR");
+   }
+}
+
+static IROp mkNOT ( IRType ty ) {
+   switch (ty) {
+      case Ity_I32: return Iop_Not32;
+      case Ity_I64: return Iop_Not64;
+      default: vpanic("mkNOT");
+   }
+}
+
+static IROp mkADD ( IRType ty ) {
+   switch (ty) {
+      case Ity_I32: return Iop_Add32;
+      case Ity_I64: return Iop_Add64;
+      default: vpanic("mkADD");
+   }
+}
+
+static IROp mkSUB ( IRType ty ) {
+   switch (ty) {
+      case Ity_I32: return Iop_Sub32;
+      case Ity_I64: return Iop_Sub64;
+      default: vpanic("mkSUB");
+   }
+}
+
+static IROp mkADDF ( IRType ty ) {
+   switch (ty) {
+      case Ity_F32: return Iop_AddF32;
+      case Ity_F64: return Iop_AddF64;
+      default: vpanic("mkADDF");
+   }
+}
+
+static IROp mkSUBF ( IRType ty ) {
+   switch (ty) {
+      case Ity_F32: return Iop_SubF32;
+      case Ity_F64: return Iop_SubF64;
+      default: vpanic("mkSUBF");
+   }
+}
+
+static IROp mkMULF ( IRType ty ) {
+   switch (ty) {
+      case Ity_F32: return Iop_MulF32;
+      case Ity_F64: return Iop_MulF64;
+      default: vpanic("mkMULF");
+   }
+}
+
+static IROp mkDIVF ( IRType ty ) {
+   switch (ty) {
+      case Ity_F32: return Iop_DivF32;
+      case Ity_F64: return Iop_DivF64;
+      default: vpanic("mkDIVF");
+   }
+}
+
+static IROp mkNEGF ( IRType ty ) {
+   switch (ty) {
+      case Ity_F32: return Iop_NegF32;
+      case Ity_F64: return Iop_NegF64;
+      default: vpanic("mkNEGF");
+   }
+}
+
+static IROp mkABSF ( IRType ty ) {
+   switch (ty) {
+      case Ity_F32: return Iop_AbsF32;
+      case Ity_F64: return Iop_AbsF64;
+      default: vpanic("mkABSF");
+   }
+}
+
+static IROp mkSQRTF ( IRType ty ) {
+   switch (ty) {
+      case Ity_F32: return Iop_SqrtF32;
+      case Ity_F64: return Iop_SqrtF64;
+      default: vpanic("mkSQRTF");
+   }
+}
+
+static IRExpr* mkU ( IRType ty, ULong imm ) {
+   switch (ty) {
+      case Ity_I32: return mkU32((UInt)(imm & 0xFFFFFFFFULL));
+      case Ity_I64: return mkU64(imm);
+      default: vpanic("mkU");
+   }
+}
+
+/* Generate IR to create 'arg rotated right by imm', for sane values
+   of 'ty' and 'imm'. */
+static IRTemp mathROR ( IRType ty, IRTemp arg, UInt imm )
+{
+   UInt w = 0;
+   if (ty == Ity_I64) {
+      w = 64;
+   } else {
+      vassert(ty == Ity_I32);
+      w = 32;
+   }
+   vassert(w != 0);
+   vassert(imm < w);
+   if (imm == 0) {
+      return arg;
+   }
+   IRTemp res = newTemp(ty);
+   assign(res, binop(mkOR(ty),
+                     binop(mkSHL(ty), mkexpr(arg), mkU8(w - imm)),
+                     binop(mkSHR(ty), mkexpr(arg), mkU8(imm)) ));
+   return res;
+}
+
+/* Generate IR to set the returned temp to either all-zeroes or
+   all ones, as a copy of arg<imm>.
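+   For example, with w == 32 and imm == 15: shifting left by
+   w-1-imm == 16 moves bit 15 of 'arg' into the sign position, and the
+   arithmetic shift right by w-1 == 31 then smears that single bit
+   across all 32 positions, giving 0x00000000 or 0xFFFFFFFF.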
*/ +static IRTemp mathREPLICATE ( IRType ty, IRTemp arg, UInt imm ) +{ + UInt w = 0; + if (ty == Ity_I64) { + w = 64; + } else { + vassert(ty == Ity_I32); + w = 32; + } + vassert(w != 0); + vassert(imm < w); + IRTemp res = newTemp(ty); + assign(res, binop(mkSAR(ty), + binop(mkSHL(ty), mkexpr(arg), mkU8(w - 1 - imm)), + mkU8(w - 1))); + return res; +} + +/* U-widen 8/16/32/64 bit int expr to 64. */ +static IRExpr* widenUto64 ( IRType srcTy, IRExpr* e ) +{ + switch (srcTy) { + case Ity_I64: return e; + case Ity_I32: return unop(Iop_32Uto64, e); + case Ity_I16: return unop(Iop_16Uto64, e); + case Ity_I8: return unop(Iop_8Uto64, e); + default: vpanic("widenUto64(arm64)"); + } +} + +/* Narrow 64 bit int expr to 8/16/32/64. Clearly only some + of these combinations make sense. */ +static IRExpr* narrowFrom64 ( IRType dstTy, IRExpr* e ) +{ + switch (dstTy) { + case Ity_I64: return e; + case Ity_I32: return unop(Iop_64to32, e); + case Ity_I16: return unop(Iop_64to16, e); + case Ity_I8: return unop(Iop_64to8, e); + default: vpanic("narrowFrom64(arm64)"); + } +} + + +/*------------------------------------------------------------*/ +/*--- Helpers for accessing guest registers. ---*/ +/*------------------------------------------------------------*/ + +#define OFFB_X0 offsetof(VexGuestARM64State,guest_X0) +#define OFFB_X1 offsetof(VexGuestARM64State,guest_X1) +#define OFFB_X2 offsetof(VexGuestARM64State,guest_X2) +#define OFFB_X3 offsetof(VexGuestARM64State,guest_X3) +#define OFFB_X4 offsetof(VexGuestARM64State,guest_X4) +#define OFFB_X5 offsetof(VexGuestARM64State,guest_X5) +#define OFFB_X6 offsetof(VexGuestARM64State,guest_X6) +#define OFFB_X7 offsetof(VexGuestARM64State,guest_X7) +#define OFFB_X8 offsetof(VexGuestARM64State,guest_X8) +#define OFFB_X9 offsetof(VexGuestARM64State,guest_X9) +#define OFFB_X10 offsetof(VexGuestARM64State,guest_X10) +#define OFFB_X11 offsetof(VexGuestARM64State,guest_X11) +#define OFFB_X12 offsetof(VexGuestARM64State,guest_X12) +#define OFFB_X13 offsetof(VexGuestARM64State,guest_X13) +#define OFFB_X14 offsetof(VexGuestARM64State,guest_X14) +#define OFFB_X15 offsetof(VexGuestARM64State,guest_X15) +#define OFFB_X16 offsetof(VexGuestARM64State,guest_X16) +#define OFFB_X17 offsetof(VexGuestARM64State,guest_X17) +#define OFFB_X18 offsetof(VexGuestARM64State,guest_X18) +#define OFFB_X19 offsetof(VexGuestARM64State,guest_X19) +#define OFFB_X20 offsetof(VexGuestARM64State,guest_X20) +#define OFFB_X21 offsetof(VexGuestARM64State,guest_X21) +#define OFFB_X22 offsetof(VexGuestARM64State,guest_X22) +#define OFFB_X23 offsetof(VexGuestARM64State,guest_X23) +#define OFFB_X24 offsetof(VexGuestARM64State,guest_X24) +#define OFFB_X25 offsetof(VexGuestARM64State,guest_X25) +#define OFFB_X26 offsetof(VexGuestARM64State,guest_X26) +#define OFFB_X27 offsetof(VexGuestARM64State,guest_X27) +#define OFFB_X28 offsetof(VexGuestARM64State,guest_X28) +#define OFFB_X29 offsetof(VexGuestARM64State,guest_X29) +#define OFFB_X30 offsetof(VexGuestARM64State,guest_X30) + +#define OFFB_XSP offsetof(VexGuestARM64State,guest_XSP) +#define OFFB_PC offsetof(VexGuestARM64State,guest_PC) + +#define OFFB_CC_OP offsetof(VexGuestARM64State,guest_CC_OP) +#define OFFB_CC_DEP1 offsetof(VexGuestARM64State,guest_CC_DEP1) +#define OFFB_CC_DEP2 offsetof(VexGuestARM64State,guest_CC_DEP2) +#define OFFB_CC_NDEP offsetof(VexGuestARM64State,guest_CC_NDEP) + +#define OFFB_TPIDR_EL0 offsetof(VexGuestARM64State,guest_TPIDR_EL0) +#define OFFB_NRADDR offsetof(VexGuestARM64State,guest_NRADDR) + +#define OFFB_Q0 
offsetof(VexGuestARM64State,guest_Q0) +#define OFFB_Q1 offsetof(VexGuestARM64State,guest_Q1) +#define OFFB_Q2 offsetof(VexGuestARM64State,guest_Q2) +#define OFFB_Q3 offsetof(VexGuestARM64State,guest_Q3) +#define OFFB_Q4 offsetof(VexGuestARM64State,guest_Q4) +#define OFFB_Q5 offsetof(VexGuestARM64State,guest_Q5) +#define OFFB_Q6 offsetof(VexGuestARM64State,guest_Q6) +#define OFFB_Q7 offsetof(VexGuestARM64State,guest_Q7) +#define OFFB_Q8 offsetof(VexGuestARM64State,guest_Q8) +#define OFFB_Q9 offsetof(VexGuestARM64State,guest_Q9) +#define OFFB_Q10 offsetof(VexGuestARM64State,guest_Q10) +#define OFFB_Q11 offsetof(VexGuestARM64State,guest_Q11) +#define OFFB_Q12 offsetof(VexGuestARM64State,guest_Q12) +#define OFFB_Q13 offsetof(VexGuestARM64State,guest_Q13) +#define OFFB_Q14 offsetof(VexGuestARM64State,guest_Q14) +#define OFFB_Q15 offsetof(VexGuestARM64State,guest_Q15) +#define OFFB_Q16 offsetof(VexGuestARM64State,guest_Q16) +#define OFFB_Q17 offsetof(VexGuestARM64State,guest_Q17) +#define OFFB_Q18 offsetof(VexGuestARM64State,guest_Q18) +#define OFFB_Q19 offsetof(VexGuestARM64State,guest_Q19) +#define OFFB_Q20 offsetof(VexGuestARM64State,guest_Q20) +#define OFFB_Q21 offsetof(VexGuestARM64State,guest_Q21) +#define OFFB_Q22 offsetof(VexGuestARM64State,guest_Q22) +#define OFFB_Q23 offsetof(VexGuestARM64State,guest_Q23) +#define OFFB_Q24 offsetof(VexGuestARM64State,guest_Q24) +#define OFFB_Q25 offsetof(VexGuestARM64State,guest_Q25) +#define OFFB_Q26 offsetof(VexGuestARM64State,guest_Q26) +#define OFFB_Q27 offsetof(VexGuestARM64State,guest_Q27) +#define OFFB_Q28 offsetof(VexGuestARM64State,guest_Q28) +#define OFFB_Q29 offsetof(VexGuestARM64State,guest_Q29) +#define OFFB_Q30 offsetof(VexGuestARM64State,guest_Q30) +#define OFFB_Q31 offsetof(VexGuestARM64State,guest_Q31) + +#define OFFB_FPCR offsetof(VexGuestARM64State,guest_FPCR) +#define OFFB_FPSR offsetof(VexGuestARM64State,guest_FPSR) +//ZZ #define OFFB_TPIDRURO offsetof(VexGuestARMState,guest_TPIDRURO) +//ZZ #define OFFB_ITSTATE offsetof(VexGuestARMState,guest_ITSTATE) +//ZZ #define OFFB_QFLAG32 offsetof(VexGuestARMState,guest_QFLAG32) +//ZZ #define OFFB_GEFLAG0 offsetof(VexGuestARMState,guest_GEFLAG0) +//ZZ #define OFFB_GEFLAG1 offsetof(VexGuestARMState,guest_GEFLAG1) +//ZZ #define OFFB_GEFLAG2 offsetof(VexGuestARMState,guest_GEFLAG2) +//ZZ #define OFFB_GEFLAG3 offsetof(VexGuestARMState,guest_GEFLAG3) + +#define OFFB_CMSTART offsetof(VexGuestARM64State,guest_CMSTART) +#define OFFB_CMLEN offsetof(VexGuestARM64State,guest_CMLEN) + + +/* ---------------- Integer registers ---------------- */ + +static Int offsetIReg64 ( UInt iregNo ) +{ + /* Do we care about endianness here? We do if sub-parts of integer + registers are accessed. 
*/ + switch (iregNo) { + case 0: return OFFB_X0; + case 1: return OFFB_X1; + case 2: return OFFB_X2; + case 3: return OFFB_X3; + case 4: return OFFB_X4; + case 5: return OFFB_X5; + case 6: return OFFB_X6; + case 7: return OFFB_X7; + case 8: return OFFB_X8; + case 9: return OFFB_X9; + case 10: return OFFB_X10; + case 11: return OFFB_X11; + case 12: return OFFB_X12; + case 13: return OFFB_X13; + case 14: return OFFB_X14; + case 15: return OFFB_X15; + case 16: return OFFB_X16; + case 17: return OFFB_X17; + case 18: return OFFB_X18; + case 19: return OFFB_X19; + case 20: return OFFB_X20; + case 21: return OFFB_X21; + case 22: return OFFB_X22; + case 23: return OFFB_X23; + case 24: return OFFB_X24; + case 25: return OFFB_X25; + case 26: return OFFB_X26; + case 27: return OFFB_X27; + case 28: return OFFB_X28; + case 29: return OFFB_X29; + case 30: return OFFB_X30; + /* but not 31 */ + default: vassert(0); + } +} + +static Int offsetIReg64orSP ( UInt iregNo ) +{ + return iregNo == 31 ? OFFB_XSP : offsetIReg64(iregNo); +} + +static const HChar* nameIReg64orZR ( UInt iregNo ) +{ + vassert(iregNo < 32); + static const HChar* names[32] + = { "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", + "x24", "x25", "x26", "x27", "x28", "x29", "x30", "xzr" }; + return names[iregNo]; +} + +static const HChar* nameIReg64orSP ( UInt iregNo ) +{ + if (iregNo == 31) { + return "sp"; + } + vassert(iregNo < 31); + return nameIReg64orZR(iregNo); +} + +static IRExpr* getIReg64orSP ( UInt iregNo ) +{ + vassert(iregNo < 32); + return IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 ); +} + +static IRExpr* getIReg64orZR ( UInt iregNo ) +{ + if (iregNo == 31) { + return mkU64(0); + } + vassert(iregNo < 31); + return IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 ); +} + +static void putIReg64orSP ( UInt iregNo, IRExpr* e ) +{ + vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64); + stmt( IRStmt_Put(offsetIReg64orSP(iregNo), e) ); +} + +static void putIReg64orZR ( UInt iregNo, IRExpr* e ) +{ + vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64); + if (iregNo == 31) { + return; + } + vassert(iregNo < 31); + stmt( IRStmt_Put(offsetIReg64orSP(iregNo), e) ); +} + +static const HChar* nameIReg32orZR ( UInt iregNo ) +{ + vassert(iregNo < 32); + static const HChar* names[32] + = { "w0", "w1", "w2", "w3", "w4", "w5", "w6", "w7", + "w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15", + "w16", "w17", "w18", "w19", "w20", "w21", "w22", "w23", + "w24", "w25", "w26", "w27", "w28", "w29", "w30", "wzr" }; + return names[iregNo]; +} + +static const HChar* nameIReg32orSP ( UInt iregNo ) +{ + if (iregNo == 31) { + return "wsp"; + } + vassert(iregNo < 31); + return nameIReg32orZR(iregNo); +} + +static IRExpr* getIReg32orSP ( UInt iregNo ) +{ + vassert(iregNo < 32); + return unop(Iop_64to32, + IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 )); +} + +static IRExpr* getIReg32orZR ( UInt iregNo ) +{ + if (iregNo == 31) { + return mkU32(0); + } + vassert(iregNo < 31); + return unop(Iop_64to32, + IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 )); +} + +static void putIReg32orSP ( UInt iregNo, IRExpr* e ) +{ + vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32); + stmt( IRStmt_Put(offsetIReg64orSP(iregNo), unop(Iop_32Uto64, e)) ); +} + +static void putIReg32orZR ( UInt iregNo, IRExpr* e ) +{ + vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32); + if (iregNo == 31) { + return; + } + vassert(iregNo < 31); + stmt( IRStmt_Put(offsetIReg64orSP(iregNo), 
unop(Iop_32Uto64, e)) ); +} + +static const HChar* nameIRegOrSP ( Bool is64, UInt iregNo ) +{ + vassert(is64 == True || is64 == False); + return is64 ? nameIReg64orSP(iregNo) : nameIReg32orSP(iregNo); +} + +static const HChar* nameIRegOrZR ( Bool is64, UInt iregNo ) +{ + vassert(is64 == True || is64 == False); + return is64 ? nameIReg64orZR(iregNo) : nameIReg32orZR(iregNo); +} + +static IRExpr* getIRegOrZR ( Bool is64, UInt iregNo ) +{ + vassert(is64 == True || is64 == False); + return is64 ? getIReg64orZR(iregNo) : getIReg32orZR(iregNo); +} + +static void putIRegOrZR ( Bool is64, UInt iregNo, IRExpr* e ) +{ + vassert(is64 == True || is64 == False); + if (is64) putIReg64orZR(iregNo, e); else putIReg32orZR(iregNo, e); +} + +static void putPC ( IRExpr* e ) +{ + vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64); + stmt( IRStmt_Put(OFFB_PC, e) ); +} + + +/* ---------------- Vector (Q) registers ---------------- */ + +static Int offsetQReg128 ( UInt qregNo ) +{ + /* We don't care about endianness at this point. It only becomes + relevant when dealing with sections of these registers.*/ + switch (qregNo) { + case 0: return OFFB_Q0; + case 1: return OFFB_Q1; + case 2: return OFFB_Q2; + case 3: return OFFB_Q3; + case 4: return OFFB_Q4; + case 5: return OFFB_Q5; + case 6: return OFFB_Q6; + case 7: return OFFB_Q7; + case 8: return OFFB_Q8; + case 9: return OFFB_Q9; + case 10: return OFFB_Q10; + case 11: return OFFB_Q11; + case 12: return OFFB_Q12; + case 13: return OFFB_Q13; + case 14: return OFFB_Q14; + case 15: return OFFB_Q15; + case 16: return OFFB_Q16; + case 17: return OFFB_Q17; + case 18: return OFFB_Q18; + case 19: return OFFB_Q19; + case 20: return OFFB_Q20; + case 21: return OFFB_Q21; + case 22: return OFFB_Q22; + case 23: return OFFB_Q23; + case 24: return OFFB_Q24; + case 25: return OFFB_Q25; + case 26: return OFFB_Q26; + case 27: return OFFB_Q27; + case 28: return OFFB_Q28; + case 29: return OFFB_Q29; + case 30: return OFFB_Q30; + case 31: return OFFB_Q31; + default: vassert(0); + } +} + +/* Write to a complete Qreg. */ +static void putQReg128 ( UInt qregNo, IRExpr* e ) +{ + vassert(qregNo < 32); + vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_V128); + stmt( IRStmt_Put(offsetQReg128(qregNo), e) ); +} + +/* Read a complete Qreg. */ +static IRExpr* getQReg128 ( UInt qregNo ) +{ + vassert(qregNo < 32); + return IRExpr_Get(offsetQReg128(qregNo), Ity_V128); +} + +/* Produce the IR type for some sub-part of a vector. For 32- and 64- + bit sub-parts we can choose either integer or float types, and + choose float on the basis that that is the common use case and so + will give least interference with Put-to-Get forwarding later + on. */ +static IRType preferredVectorSubTypeFromSize ( UInt szB ) +{ + switch (szB) { + case 1: return Ity_I8; + case 2: return Ity_I16; + case 4: return Ity_I32; //Ity_F32; + case 8: return Ity_F64; + case 16: return Ity_V128; + default: vassert(0); + } +} + +/* Find the offset of the laneNo'th lane of type laneTy in the given + Qreg. Since the host is little-endian, the least significant lane + has the lowest offset. */ +static Int offsetQRegLane ( UInt qregNo, IRType laneTy, UInt laneNo ) +{ + vassert(!host_is_bigendian); + Int base = offsetQReg128(qregNo); + /* Since the host is little-endian, the least significant lane + will be at the lowest address. */ + /* Restrict this to known types, so as to avoid silently accepting + stupid types. 
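+      As an example of the arithmetic below: lane 2 of an Ity_I16 value
+      in q7 lives at offsetQReg128(7) + 2*2, and its last byte (offset
+      5 within the register) still passes the maxOff < 16 sanity check.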
*/ + UInt laneSzB = 0; + switch (laneTy) { + case Ity_I8: laneSzB = 1; break; + case Ity_I16: laneSzB = 2; break; + case Ity_F32: case Ity_I32: laneSzB = 4; break; + case Ity_F64: case Ity_I64: laneSzB = 8; break; + case Ity_V128: laneSzB = 16; break; + default: break; + } + vassert(laneSzB > 0); + UInt minOff = laneNo * laneSzB; + UInt maxOff = minOff + laneSzB - 1; + vassert(maxOff < 16); + return base + minOff; +} + +/* Put to the least significant lane of a Qreg. */ +static void putQRegLO ( UInt qregNo, IRExpr* e ) +{ + IRType ty = typeOfIRExpr(irsb->tyenv, e); + Int off = offsetQRegLane(qregNo, ty, 0); + switch (ty) { + case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64: + case Ity_F32: case Ity_F64: case Ity_V128: + break; + default: + vassert(0); // Other cases are probably invalid + } + stmt(IRStmt_Put(off, e)); +} + +/* Get from the least significant lane of a Qreg. */ +static IRExpr* getQRegLO ( UInt qregNo, IRType ty ) +{ + Int off = offsetQRegLane(qregNo, ty, 0); + switch (ty) { + case Ity_I8: + case Ity_I16: + case Ity_I32: case Ity_I64: + case Ity_F32: case Ity_F64: case Ity_V128: + break; + default: + vassert(0); // Other cases are ATC + } + return IRExpr_Get(off, ty); +} + +static const HChar* nameQRegLO ( UInt qregNo, IRType laneTy ) +{ + static const HChar* namesQ[32] + = { "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", + "q16", "q17", "q18", "q19", "q20", "q21", "q22", "q23", + "q24", "q25", "q26", "q27", "q28", "q29", "q30", "q31" }; + static const HChar* namesD[32] + = { "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", + "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", + "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", + "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" }; + static const HChar* namesS[32] + = { "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", + "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", + "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", + "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31" }; + static const HChar* namesH[32] + = { "h0", "h1", "h2", "h3", "h4", "h5", "h6", "h7", + "h8", "h9", "h10", "h11", "h12", "h13", "h14", "h15", + "h16", "h17", "h18", "h19", "h20", "h21", "h22", "h23", + "h24", "h25", "h26", "h27", "h28", "h29", "h30", "h31" }; + static const HChar* namesB[32] + = { "b0", "b1", "b2", "b3", "b4", "b5", "b6", "b7", + "b8", "b9", "b10", "b11", "b12", "b13", "b14", "b15", + "b16", "b17", "b18", "b19", "b20", "b21", "b22", "b23", + "b24", "b25", "b26", "b27", "b28", "b29", "b30", "b31" }; + vassert(qregNo < 32); + switch (sizeofIRType(laneTy)) { + case 1: return namesB[qregNo]; + case 2: return namesH[qregNo]; + case 4: return namesS[qregNo]; + case 8: return namesD[qregNo]; + case 16: return namesQ[qregNo]; + default: vassert(0); + } + /*NOTREACHED*/ +} + +static const HChar* nameQReg128 ( UInt qregNo ) +{ + return nameQRegLO(qregNo, Ity_V128); +} + +/* Find the offset of the most significant half (8 bytes) of the given + Qreg. This requires knowing the endianness of the host. 
*/ +static Int offsetQRegHI64 ( UInt qregNo ) +{ + return offsetQRegLane(qregNo, Ity_I64, 1); +} + +static IRExpr* getQRegHI64 ( UInt qregNo ) +{ + return IRExpr_Get(offsetQRegHI64(qregNo), Ity_I64); +} + +static void putQRegHI64 ( UInt qregNo, IRExpr* e ) +{ + IRType ty = typeOfIRExpr(irsb->tyenv, e); + Int off = offsetQRegHI64(qregNo); + switch (ty) { + case Ity_I64: case Ity_F64: + break; + default: + vassert(0); // Other cases are plain wrong + } + stmt(IRStmt_Put(off, e)); +} + +/* Put to a specified lane of a Qreg. */ +static void putQRegLane ( UInt qregNo, UInt laneNo, IRExpr* e ) +{ + IRType laneTy = typeOfIRExpr(irsb->tyenv, e); + Int off = offsetQRegLane(qregNo, laneTy, laneNo); + switch (laneTy) { + case Ity_F64: case Ity_I64: + case Ity_I32: case Ity_F32: + case Ity_I16: + case Ity_I8: + break; + default: + vassert(0); // Other cases are ATC + } + stmt(IRStmt_Put(off, e)); +} + +/* Get from a specified lane of a Qreg. */ +static IRExpr* getQRegLane ( UInt qregNo, UInt laneNo, IRType laneTy ) +{ + Int off = offsetQRegLane(qregNo, laneTy, laneNo); + switch (laneTy) { + case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: + case Ity_F64: + break; + default: + vassert(0); // Other cases are ATC + } + return IRExpr_Get(off, laneTy); +} + + +//ZZ /* ---------------- Misc registers ---------------- */ +//ZZ +//ZZ static void putMiscReg32 ( UInt gsoffset, +//ZZ IRExpr* e, /* :: Ity_I32 */ +//ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */) +//ZZ { +//ZZ switch (gsoffset) { +//ZZ case OFFB_FPSCR: break; +//ZZ case OFFB_QFLAG32: break; +//ZZ case OFFB_GEFLAG0: break; +//ZZ case OFFB_GEFLAG1: break; +//ZZ case OFFB_GEFLAG2: break; +//ZZ case OFFB_GEFLAG3: break; +//ZZ default: vassert(0); /* awaiting more cases */ +//ZZ } +//ZZ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32); +//ZZ +//ZZ if (guardT == IRTemp_INVALID) { +//ZZ /* unconditional write */ +//ZZ stmt(IRStmt_Put(gsoffset, e)); +//ZZ } else { +//ZZ stmt(IRStmt_Put( +//ZZ gsoffset, +//ZZ IRExpr_ITE( binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0)), +//ZZ e, IRExpr_Get(gsoffset, Ity_I32) ) +//ZZ )); +//ZZ } +//ZZ } +//ZZ +//ZZ static IRTemp get_ITSTATE ( void ) +//ZZ { +//ZZ ASSERT_IS_THUMB; +//ZZ IRTemp t = newTemp(Ity_I32); +//ZZ assign(t, IRExpr_Get( OFFB_ITSTATE, Ity_I32)); +//ZZ return t; +//ZZ } +//ZZ +//ZZ static void put_ITSTATE ( IRTemp t ) +//ZZ { +//ZZ ASSERT_IS_THUMB; +//ZZ stmt( IRStmt_Put( OFFB_ITSTATE, mkexpr(t)) ); +//ZZ } +//ZZ +//ZZ static IRTemp get_QFLAG32 ( void ) +//ZZ { +//ZZ IRTemp t = newTemp(Ity_I32); +//ZZ assign(t, IRExpr_Get( OFFB_QFLAG32, Ity_I32)); +//ZZ return t; +//ZZ } +//ZZ +//ZZ static void put_QFLAG32 ( IRTemp t, IRTemp condT ) +//ZZ { +//ZZ putMiscReg32( OFFB_QFLAG32, mkexpr(t), condT ); +//ZZ } +//ZZ +//ZZ /* Stickily set the 'Q' flag (APSR bit 27) of the APSR (Application Program +//ZZ Status Register) to indicate that overflow or saturation occurred. +//ZZ Nb: t must be zero to denote no saturation, and any nonzero +//ZZ value to indicate saturation. */ +//ZZ static void or_into_QFLAG32 ( IRExpr* e, IRTemp condT ) +//ZZ { +//ZZ IRTemp old = get_QFLAG32(); +//ZZ IRTemp nyu = newTemp(Ity_I32); +//ZZ assign(nyu, binop(Iop_Or32, mkexpr(old), e) ); +//ZZ put_QFLAG32(nyu, condT); +//ZZ } + + +/* ---------------- FPCR stuff ---------------- */ + +/* Generate IR to get hold of the rounding mode bits in FPCR, and + convert them to IR format. Bind the final result to the + returned temp. 
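+
+   The net effect is equivalent to this plain-C sketch (illustration
+   only; the helper name is invented here and the FPCR/IR encodings
+   are the ones described in the function body):
+
+      UInt fpcr_rm_to_ir_rm ( UInt fpcr )
+      {
+         UInt rm = (fpcr >> 22) & 3;               /* ARM RMode field */
+         return ((rm & 1) << 1) | ((rm >> 1) & 1); /* swap bits 0, 1  */
+      }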
*/ +static IRTemp /* :: Ity_I32 */ mk_get_IR_rounding_mode ( void ) +{ + /* The ARMvfp encoding for rounding mode bits is: + 00 to nearest + 01 to +infinity + 10 to -infinity + 11 to zero + We need to convert that to the IR encoding: + 00 to nearest (the default) + 10 to +infinity + 01 to -infinity + 11 to zero + Which can be done by swapping bits 0 and 1. + The rmode bits are at 23:22 in FPSCR. + */ + IRTemp armEncd = newTemp(Ity_I32); + IRTemp swapped = newTemp(Ity_I32); + /* Fish FPCR[23:22] out, and slide to bottom. Doesn't matter that + we don't zero out bits 24 and above, since the assignment to + 'swapped' will mask them out anyway. */ + assign(armEncd, + binop(Iop_Shr32, IRExpr_Get(OFFB_FPCR, Ity_I32), mkU8(22))); + /* Now swap them. */ + assign(swapped, + binop(Iop_Or32, + binop(Iop_And32, + binop(Iop_Shl32, mkexpr(armEncd), mkU8(1)), + mkU32(2)), + binop(Iop_And32, + binop(Iop_Shr32, mkexpr(armEncd), mkU8(1)), + mkU32(1)) + )); + return swapped; +} + + +/*------------------------------------------------------------*/ +/*--- Helpers for flag handling and conditional insns ---*/ +/*------------------------------------------------------------*/ + +static const HChar* nameARM64Condcode ( ARM64Condcode cond ) +{ + switch (cond) { + case ARM64CondEQ: return "eq"; + case ARM64CondNE: return "ne"; + case ARM64CondCS: return "cs"; // or 'hs' + case ARM64CondCC: return "cc"; // or 'lo' + case ARM64CondMI: return "mi"; + case ARM64CondPL: return "pl"; + case ARM64CondVS: return "vs"; + case ARM64CondVC: return "vc"; + case ARM64CondHI: return "hi"; + case ARM64CondLS: return "ls"; + case ARM64CondGE: return "ge"; + case ARM64CondLT: return "lt"; + case ARM64CondGT: return "gt"; + case ARM64CondLE: return "le"; + case ARM64CondAL: return "al"; + case ARM64CondNV: return "nv"; + default: vpanic("name_ARM64Condcode"); + } +} + +/* and a handy shorthand for it */ +static const HChar* nameCC ( ARM64Condcode cond ) { + return nameARM64Condcode(cond); +} + + +/* Build IR to calculate some particular condition from stored + CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression of type + Ity_I64, suitable for narrowing. Although the return type is + Ity_I64, the returned value is either 0 or 1. 'cond' must be + :: Ity_I64 and must denote the condition to compute in + bits 7:4, and be zero everywhere else. +*/ +static IRExpr* mk_arm64g_calculate_condition_dyn ( IRExpr* cond ) +{ + vassert(typeOfIRExpr(irsb->tyenv, cond) == Ity_I64); + /* And 'cond' had better produce a value in which only bits 7:4 are + nonzero. However, obviously we can't assert for that. */ + + /* So what we're constructing for the first argument is + "(cond << 4) | stored-operation". + However, as per comments above, 'cond' must be supplied + pre-shifted to this function. + + This pairing scheme requires that the ARM64_CC_OP_ values all fit + in 4 bits. Hence we are passing a (COND, OP) pair in the lowest + 8 bits of the first argument. */ + IRExpr** args + = mkIRExprVec_4( + binop(Iop_Or64, IRExpr_Get(OFFB_CC_OP, Ity_I64), cond), + IRExpr_Get(OFFB_CC_DEP1, Ity_I64), + IRExpr_Get(OFFB_CC_DEP2, Ity_I64), + IRExpr_Get(OFFB_CC_NDEP, Ity_I64) + ); + IRExpr* call + = mkIRExprCCall( + Ity_I64, + 0/*regparm*/, + "arm64g_calculate_condition", &arm64g_calculate_condition, + args + ); + + /* Exclude the requested condition, OP and NDEP from definedness + checking. We're only interested in DEP1 and DEP2. 
*/ + call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3); + return call; +} + + +/* Build IR to calculate some particular condition from stored + CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression of type + Ity_I64, suitable for narrowing. Although the return type is + Ity_I64, the returned value is either 0 or 1. +*/ +static IRExpr* mk_arm64g_calculate_condition ( ARM64Condcode cond ) +{ + /* First arg is "(cond << 4) | condition". This requires that the + ARM64_CC_OP_ values all fit in 4 bits. Hence we are passing a + (COND, OP) pair in the lowest 8 bits of the first argument. */ + vassert(cond >= 0 && cond <= 15); + return mk_arm64g_calculate_condition_dyn( mkU64(cond << 4) ); +} + + +//ZZ /* Build IR to calculate just the carry flag from stored +//ZZ CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression :: +//ZZ Ity_I32. */ +//ZZ static IRExpr* mk_armg_calculate_flag_c ( void ) +//ZZ { +//ZZ IRExpr** args +//ZZ = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I32), +//ZZ IRExpr_Get(OFFB_CC_DEP1, Ity_I32), +//ZZ IRExpr_Get(OFFB_CC_DEP2, Ity_I32), +//ZZ IRExpr_Get(OFFB_CC_NDEP, Ity_I32) ); +//ZZ IRExpr* call +//ZZ = mkIRExprCCall( +//ZZ Ity_I32, +//ZZ 0/*regparm*/, +//ZZ "armg_calculate_flag_c", &armg_calculate_flag_c, +//ZZ args +//ZZ ); +//ZZ /* Exclude OP and NDEP from definedness checking. We're only +//ZZ interested in DEP1 and DEP2. */ +//ZZ call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3); +//ZZ return call; +//ZZ } +//ZZ +//ZZ +//ZZ /* Build IR to calculate just the overflow flag from stored +//ZZ CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression :: +//ZZ Ity_I32. */ +//ZZ static IRExpr* mk_armg_calculate_flag_v ( void ) +//ZZ { +//ZZ IRExpr** args +//ZZ = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I32), +//ZZ IRExpr_Get(OFFB_CC_DEP1, Ity_I32), +//ZZ IRExpr_Get(OFFB_CC_DEP2, Ity_I32), +//ZZ IRExpr_Get(OFFB_CC_NDEP, Ity_I32) ); +//ZZ IRExpr* call +//ZZ = mkIRExprCCall( +//ZZ Ity_I32, +//ZZ 0/*regparm*/, +//ZZ "armg_calculate_flag_v", &armg_calculate_flag_v, +//ZZ args +//ZZ ); +//ZZ /* Exclude OP and NDEP from definedness checking. We're only +//ZZ interested in DEP1 and DEP2. */ +//ZZ call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3); +//ZZ return call; +//ZZ } + + +/* Build IR to calculate N Z C V in bits 31:28 of the + returned word. */ +static IRExpr* mk_arm64g_calculate_flags_nzcv ( void ) +{ + IRExpr** args + = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I64), + IRExpr_Get(OFFB_CC_DEP1, Ity_I64), + IRExpr_Get(OFFB_CC_DEP2, Ity_I64), + IRExpr_Get(OFFB_CC_NDEP, Ity_I64) ); + IRExpr* call + = mkIRExprCCall( + Ity_I64, + 0/*regparm*/, + "arm64g_calculate_flags_nzcv", &arm64g_calculate_flags_nzcv, + args + ); + /* Exclude OP and NDEP from definedness checking. We're only + interested in DEP1 and DEP2. */ + call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3); + return call; +} + + +/* Build IR to set the flags thunk, in the most general case. */ +static +void setFlags_D1_D2_ND ( UInt cc_op, + IRTemp t_dep1, IRTemp t_dep2, IRTemp t_ndep ) +{ + vassert(typeOfIRTemp(irsb->tyenv, t_dep1 == Ity_I64)); + vassert(typeOfIRTemp(irsb->tyenv, t_dep2 == Ity_I64)); + vassert(typeOfIRTemp(irsb->tyenv, t_ndep == Ity_I64)); + vassert(cc_op >= ARM64G_CC_OP_COPY && cc_op < ARM64G_CC_OP_NUMBER); + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(cc_op) )); + stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t_dep1) )); + stmt( IRStmt_Put( OFFB_CC_DEP2, mkexpr(t_dep2) )); + stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(t_ndep) )); +} + +/* Build IR to set the flags thunk after ADD or SUB. 
*/ +static +void setFlags_ADD_SUB ( Bool is64, Bool isSUB, IRTemp argL, IRTemp argR ) +{ + IRTemp argL64 = IRTemp_INVALID; + IRTemp argR64 = IRTemp_INVALID; + IRTemp z64 = newTemp(Ity_I64); + if (is64) { + argL64 = argL; + argR64 = argR; + } else { + argL64 = newTemp(Ity_I64); + argR64 = newTemp(Ity_I64); + assign(argL64, unop(Iop_32Uto64, mkexpr(argL))); + assign(argR64, unop(Iop_32Uto64, mkexpr(argR))); + } + assign(z64, mkU64(0)); + UInt cc_op = ARM64G_CC_OP_NUMBER; + /**/ if ( isSUB && is64) { cc_op = ARM64G_CC_OP_SUB64; } + else if ( isSUB && !is64) { cc_op = ARM64G_CC_OP_SUB32; } + else if (!isSUB && is64) { cc_op = ARM64G_CC_OP_ADD64; } + else if (!isSUB && !is64) { cc_op = ARM64G_CC_OP_ADD32; } + else { vassert(0); } + setFlags_D1_D2_ND(cc_op, argL64, argR64, z64); +} + +/* Build IR to set the flags thunk after ADD or SUB, if the given + condition evaluates to True at run time. If not, the flags are set + to the specified NZCV value. */ +static +void setFlags_ADD_SUB_conditionally ( + Bool is64, Bool isSUB, + IRTemp cond, IRTemp argL, IRTemp argR, UInt nzcv + ) +{ + /* Generate IR as follows: + CC_OP = ITE(cond, OP_{ADD,SUB}{32,64}, OP_COPY) + CC_DEP1 = ITE(cond, argL64, nzcv << 28) + CC_DEP2 = ITE(cond, argR64, 0) + CC_NDEP = 0 + */ + + IRTemp z64 = newTemp(Ity_I64); + assign(z64, mkU64(0)); + + /* Establish the operation and operands for the True case. */ + IRTemp t_dep1 = IRTemp_INVALID; + IRTemp t_dep2 = IRTemp_INVALID; + UInt t_op = ARM64G_CC_OP_NUMBER; + /**/ if ( isSUB && is64) { t_op = ARM64G_CC_OP_SUB64; } + else if ( isSUB && !is64) { t_op = ARM64G_CC_OP_SUB32; } + else if (!isSUB && is64) { t_op = ARM64G_CC_OP_ADD64; } + else if (!isSUB && !is64) { t_op = ARM64G_CC_OP_ADD32; } + else { vassert(0); } + /* */ + if (is64) { + t_dep1 = argL; + t_dep2 = argR; + } else { + t_dep1 = newTemp(Ity_I64); + t_dep2 = newTemp(Ity_I64); + assign(t_dep1, unop(Iop_32Uto64, mkexpr(argL))); + assign(t_dep2, unop(Iop_32Uto64, mkexpr(argR))); + } + + /* Establish the operation and operands for the False case. */ + IRTemp f_dep1 = newTemp(Ity_I64); + IRTemp f_dep2 = z64; + UInt f_op = ARM64G_CC_OP_COPY; + assign(f_dep1, mkU64(nzcv << 28)); + + /* Final thunk values */ + IRTemp dep1 = newTemp(Ity_I64); + IRTemp dep2 = newTemp(Ity_I64); + IRTemp op = newTemp(Ity_I64); + + assign(op, IRExpr_ITE(mkexpr(cond), mkU64(t_op), mkU64(f_op))); + assign(dep1, IRExpr_ITE(mkexpr(cond), mkexpr(t_dep1), mkexpr(f_dep1))); + assign(dep2, IRExpr_ITE(mkexpr(cond), mkexpr(t_dep2), mkexpr(f_dep2))); + + /* finally .. */ + stmt( IRStmt_Put( OFFB_CC_OP, mkexpr(op) )); + stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(dep1) )); + stmt( IRStmt_Put( OFFB_CC_DEP2, mkexpr(dep2) )); + stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(z64) )); +} + +/* Build IR to set the flags thunk after AND/OR/XOR or variants thereof. */ +static +void setFlags_LOGIC ( Bool is64, IRTemp res ) +{ + IRTemp res64 = IRTemp_INVALID; + IRTemp z64 = newTemp(Ity_I64); + UInt cc_op = ARM64G_CC_OP_NUMBER; + if (is64) { + res64 = res; + cc_op = ARM64G_CC_OP_LOGIC64; + } else { + res64 = newTemp(Ity_I64); + assign(res64, unop(Iop_32Uto64, mkexpr(res))); + cc_op = ARM64G_CC_OP_LOGIC32; + } + assign(z64, mkU64(0)); + setFlags_D1_D2_ND(cc_op, res64, z64, z64); +} + +/* Build IR to set the flags thunk to a given NZCV value. NZCV is + located in bits 31:28 of the supplied value. 
*/ +static +void setFlags_COPY ( IRTemp nzcv_28x0 ) +{ + IRTemp z64 = newTemp(Ity_I64); + assign(z64, mkU64(0)); + setFlags_D1_D2_ND(ARM64G_CC_OP_COPY, nzcv_28x0, z64, z64); +} + + +//ZZ /* Minor variant of the above that sets NDEP to zero (if it +//ZZ sets it at all) */ +//ZZ static void setFlags_D1_D2 ( UInt cc_op, IRTemp t_dep1, +//ZZ IRTemp t_dep2, +//ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ ) +//ZZ { +//ZZ IRTemp z32 = newTemp(Ity_I32); +//ZZ assign( z32, mkU32(0) ); +//ZZ setFlags_D1_D2_ND( cc_op, t_dep1, t_dep2, z32, guardT ); +//ZZ } +//ZZ +//ZZ +//ZZ /* Minor variant of the above that sets DEP2 to zero (if it +//ZZ sets it at all) */ +//ZZ static void setFlags_D1_ND ( UInt cc_op, IRTemp t_dep1, +//ZZ IRTemp t_ndep, +//ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ ) +//ZZ { +//ZZ IRTemp z32 = newTemp(Ity_I32); +//ZZ assign( z32, mkU32(0) ); +//ZZ setFlags_D1_D2_ND( cc_op, t_dep1, z32, t_ndep, guardT ); +//ZZ } +//ZZ +//ZZ +//ZZ /* Minor variant of the above that sets DEP2 and NDEP to zero (if it +//ZZ sets them at all) */ +//ZZ static void setFlags_D1 ( UInt cc_op, IRTemp t_dep1, +//ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ ) +//ZZ { +//ZZ IRTemp z32 = newTemp(Ity_I32); +//ZZ assign( z32, mkU32(0) ); +//ZZ setFlags_D1_D2_ND( cc_op, t_dep1, z32, z32, guardT ); +//ZZ } + + +/*------------------------------------------------------------*/ +/*--- Misc math helpers ---*/ +/*------------------------------------------------------------*/ + +/* Generate IR for ((x & mask) >>u sh) | ((x << sh) & mask) */ +static IRTemp math_SWAPHELPER ( IRTemp x, ULong mask, Int sh ) +{ + IRTemp maskT = newTemp(Ity_I64); + IRTemp res = newTemp(Ity_I64); + vassert(sh >= 1 && sh <= 63); + assign(maskT, mkU64(mask)); + assign( res, + binop(Iop_Or64, + binop(Iop_Shr64, + binop(Iop_And64,mkexpr(x),mkexpr(maskT)), + mkU8(sh)), + binop(Iop_And64, + binop(Iop_Shl64,mkexpr(x),mkU8(sh)), + mkexpr(maskT)) + ) + ); + return res; +} + +/* Generates byte swaps within 32-bit lanes. */ +static IRTemp math_UINTSWAP64 ( IRTemp src ) +{ + IRTemp res; + res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8); + res = math_SWAPHELPER(res, 0xFFFF0000FFFF0000ULL, 16); + return res; +} + +/* Generates byte swaps within 16-bit lanes. */ +static IRTemp math_USHORTSWAP64 ( IRTemp src ) +{ + IRTemp res; + res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8); + return res; +} + +/* Generates a 64-bit byte swap. */ +static IRTemp math_BYTESWAP64 ( IRTemp src ) +{ + IRTemp res; + res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8); + res = math_SWAPHELPER(res, 0xFFFF0000FFFF0000ULL, 16); + res = math_SWAPHELPER(res, 0xFFFFFFFF00000000ULL, 32); + return res; +} + +/* Generates a 64-bit bit swap. */ +static IRTemp math_BITSWAP64 ( IRTemp src ) +{ + IRTemp res; + res = math_SWAPHELPER(src, 0xAAAAAAAAAAAAAAAAULL, 1); + res = math_SWAPHELPER(res, 0xCCCCCCCCCCCCCCCCULL, 2); + res = math_SWAPHELPER(res, 0xF0F0F0F0F0F0F0F0ULL, 4); + return math_BYTESWAP64(res); +} + +/* Duplicates the bits at the bottom of the given word to fill the + whole word. src :: Ity_I64 is assumed to have zeroes everywhere + except for the bottom bits. 
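+
+   For example (illustrative): an Ity_I8 source of 0xAB is widened in
+   three doubling steps,
+
+      0xAB -> 0xABAB -> 0xABABABAB -> 0xABABABABABABABAB,
+
+   each step OR-ing the value with a shifted copy of itself.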
*/ +static IRTemp math_DUP_TO_64 ( IRTemp src, IRType srcTy ) +{ + if (srcTy == Ity_I8) { + IRTemp t16 = newTemp(Ity_I64); + assign(t16, binop(Iop_Or64, mkexpr(src), + binop(Iop_Shl64, mkexpr(src), mkU8(8)))); + IRTemp t32 = newTemp(Ity_I64); + assign(t32, binop(Iop_Or64, mkexpr(t16), + binop(Iop_Shl64, mkexpr(t16), mkU8(16)))); + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_Or64, mkexpr(t32), + binop(Iop_Shl64, mkexpr(t32), mkU8(32)))); + return t64; + } + if (srcTy == Ity_I16) { + IRTemp t32 = newTemp(Ity_I64); + assign(t32, binop(Iop_Or64, mkexpr(src), + binop(Iop_Shl64, mkexpr(src), mkU8(16)))); + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_Or64, mkexpr(t32), + binop(Iop_Shl64, mkexpr(t32), mkU8(32)))); + return t64; + } + if (srcTy == Ity_I32) { + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_Or64, mkexpr(src), + binop(Iop_Shl64, mkexpr(src), mkU8(32)))); + return t64; + } + if (srcTy == Ity_I64) { + return src; + } + vassert(0); +} + + +/*------------------------------------------------------------*/ +/*--- FP comparison helpers ---*/ +/*------------------------------------------------------------*/ + +/* irRes :: Ity_I32 holds a floating point comparison result encoded + as an IRCmpF64Result. Generate code to convert it to an + ARM64-encoded (N,Z,C,V) group in the lowest 4 bits of an I64 value. + Assign a new temp to hold that value, and return the temp. */ +static +IRTemp mk_convert_IRCmpF64Result_to_NZCV ( IRTemp irRes32 ) +{ + IRTemp ix = newTemp(Ity_I64); + IRTemp termL = newTemp(Ity_I64); + IRTemp termR = newTemp(Ity_I64); + IRTemp nzcv = newTemp(Ity_I64); + IRTemp irRes = newTemp(Ity_I64); + + /* This is where the fun starts. We have to convert 'irRes' from + an IR-convention return result (IRCmpF64Result) to an + ARM-encoded (N,Z,C,V) group. The final result is in the bottom + 4 bits of 'nzcv'. */ + /* Map compare result from IR to ARM(nzcv) */ + /* + FP cmp result | IR | ARM(nzcv) + -------------------------------- + UN 0x45 0011 + LT 0x01 1000 + GT 0x00 0010 + EQ 0x40 0110 + */ + /* Now since you're probably wondering WTF .. + + ix fishes the useful bits out of the IR value, bits 6 and 0, and + places them side by side, giving a number which is 0, 1, 2 or 3. + + termL is a sequence cooked up by GNU superopt. It converts ix + into an almost correct value NZCV value (incredibly), except + for the case of UN, where it produces 0100 instead of the + required 0011. + + termR is therefore a correction term, also computed from ix. It + is 1 in the UN case and 0 for LT, GT and UN. Hence, to get + the final correct value, we subtract termR from termL. + + Don't take my word for it. There's a test program at the bottom + of guest_arm_toIR.c, to try this out with. 
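+
+      A by-hand check of the four cases, using 64-bit unsigned
+      arithmetic (illustrative, worked out here rather than taken from
+      the generated IR):
+
+         FP cmp  irRes  ix  termL  termR  termL-termR = nzcv
+         GT      0x00    0    2      0        0010
+         LT      0x01    1    8      0        1000
+         EQ      0x40    2    6      0        0110
+         UN      0x45    3    4      1        0011
+
+      where ix    = ((irRes >> 5) & 3) | (irRes & 1),
+            termL = ((((ix ^ 1) << 62) - 1) >> 61) + 1   (logical shifts),
+            termR = ix & (ix >> 1) & 1.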
+ */ + assign(irRes, unop(Iop_32Uto64, mkexpr(irRes32))); + + assign( + ix, + binop(Iop_Or64, + binop(Iop_And64, + binop(Iop_Shr64, mkexpr(irRes), mkU8(5)), + mkU64(3)), + binop(Iop_And64, mkexpr(irRes), mkU64(1)))); + + assign( + termL, + binop(Iop_Add64, + binop(Iop_Shr64, + binop(Iop_Sub64, + binop(Iop_Shl64, + binop(Iop_Xor64, mkexpr(ix), mkU64(1)), + mkU8(62)), + mkU64(1)), + mkU8(61)), + mkU64(1))); + + assign( + termR, + binop(Iop_And64, + binop(Iop_And64, + mkexpr(ix), + binop(Iop_Shr64, mkexpr(ix), mkU8(1))), + mkU64(1))); + + assign(nzcv, binop(Iop_Sub64, mkexpr(termL), mkexpr(termR))); + return nzcv; +} + + +/*------------------------------------------------------------*/ +/*--- Data processing (immediate) ---*/ +/*------------------------------------------------------------*/ + +/* Helper functions for supporting "DecodeBitMasks" */ + +static ULong dbm_ROR ( Int width, ULong x, Int rot ) +{ + vassert(width > 0 && width <= 64); + vassert(rot >= 0 && rot < width); + if (rot == 0) return x; + ULong res = x >> rot; + res |= (x << (width - rot)); + if (width < 64) + res &= ((1ULL << width) - 1); + return res; +} + +static ULong dbm_RepTo64( Int esize, ULong x ) +{ + switch (esize) { + case 64: + return x; + case 32: + x &= 0xFFFFFFFF; x |= (x << 32); + return x; + case 16: + x &= 0xFFFF; x |= (x << 16); x |= (x << 32); + return x; + case 8: + x &= 0xFF; x |= (x << 8); x |= (x << 16); x |= (x << 32); + return x; + case 4: + x &= 0xF; x |= (x << 4); x |= (x << 8); + x |= (x << 16); x |= (x << 32); + return x; + case 2: + x &= 0x3; x |= (x << 2); x |= (x << 4); x |= (x << 8); + x |= (x << 16); x |= (x << 32); + return x; + default: + break; + } + vpanic("dbm_RepTo64"); + /*NOTREACHED*/ + return 0; +} + +static Int dbm_highestSetBit ( ULong x ) +{ + Int i; + for (i = 63; i >= 0; i--) { + if (x & (1ULL << i)) + return i; + } + vassert(x == 0); + return -1; +} + +static +Bool dbm_DecodeBitMasks ( /*OUT*/ULong* wmask, /*OUT*/ULong* tmask, + ULong immN, ULong imms, ULong immr, Bool immediate, + UInt M /*32 or 64*/) +{ + vassert(immN < (1ULL << 1)); + vassert(imms < (1ULL << 6)); + vassert(immr < (1ULL << 6)); + vassert(immediate == False || immediate == True); + vassert(M == 32 || M == 64); + + Int len = dbm_highestSetBit( ((immN << 6) & 64) | ((~imms) & 63) ); + if (len < 1) { /* printf("fail1\n"); */ return False; } + vassert(len <= 6); + vassert(M >= (1 << len)); + + vassert(len >= 1 && len <= 6); + ULong levels = // (zeroes(6 - len) << (6-len)) | ones(len); + (1 << len) - 1; + vassert(levels >= 1 && levels <= 63); + + if (immediate && ((imms & levels) == levels)) { + /* printf("fail2 imms %llu levels %llu len %d\n", imms, levels, len); */ + return False; + } + + ULong S = imms & levels; + ULong R = immr & levels; + Int diff = S - R; + diff &= 63; + Int esize = 1 << len; + vassert(2 <= esize && esize <= 64); + + /* Be careful of these (1ULL << (S+1)) - 1 expressions, and the + same below with d. S can be 63 in which case we have an out of + range and hence undefined shift. 
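+
+      A hand-worked decode (illustrative): immN=0, immr=0b000000,
+      imms=0b100111 gives len=4, hence esize=16, levels=15, S=7, R=0,
+      d=7; so elem_s = elem_d = 0xFF and
+
+         wmask = tmask = 0x00FF00FF00FF00FF,
+
+      i.e. this is the encoding of the 64-bit logical immediate
+      0x00FF00FF00FF00FF.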
*/ + vassert(S >= 0 && S <= 63); + vassert(esize >= (S+1)); + ULong elem_s = // Zeroes(esize-(S+1)):Ones(S+1) + //(1ULL << (S+1)) - 1; + ((1ULL << S) - 1) + (1ULL << S); + + Int d = // diff + diff & ((1 << len)-1); + vassert(esize >= (d+1)); + vassert(d >= 0 && d <= 63); + + ULong elem_d = // Zeroes(esize-(d+1)):Ones(d+1) + //(1ULL << (d+1)) - 1; + ((1ULL << d) - 1) + (1ULL << d); + + if (esize != 64) vassert(elem_s < (1ULL << esize)); + if (esize != 64) vassert(elem_d < (1ULL << esize)); + + if (wmask) *wmask = dbm_RepTo64(esize, dbm_ROR(esize, elem_s, R)); + if (tmask) *tmask = dbm_RepTo64(esize, elem_d); + + return True; +} + + +static +Bool dis_ARM64_data_processing_immediate(/*MB_OUT*/DisResult* dres, + UInt insn) +{ +# define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin)) + + /* insn[28:23] + 10000x PC-rel addressing + 10001x Add/subtract (immediate) + 100100 Logical (immediate) + 100101 Move Wide (immediate) + 100110 Bitfield + 100111 Extract + */ + + /* ------------------ ADD/SUB{,S} imm12 ------------------ */ + if (INSN(28,24) == BITS5(1,0,0,0,1)) { + Bool is64 = INSN(31,31) == 1; + Bool isSub = INSN(30,30) == 1; + Bool setCC = INSN(29,29) == 1; + UInt sh = INSN(23,22); + UInt uimm12 = INSN(21,10); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + const HChar* nm = isSub ? "sub" : "add"; + if (sh >= 2) { + /* Invalid; fall through */ + } else { + vassert(sh <= 1); + uimm12 <<= (12 * sh); + if (is64) { + IRTemp argL = newTemp(Ity_I64); + IRTemp argR = newTemp(Ity_I64); + IRTemp res = newTemp(Ity_I64); + assign(argL, getIReg64orSP(nn)); + assign(argR, mkU64(uimm12)); + assign(res, binop(isSub ? Iop_Sub64 : Iop_Add64, + mkexpr(argL), mkexpr(argR))); + if (setCC) { + putIReg64orZR(dd, mkexpr(res)); + setFlags_ADD_SUB(True/*is64*/, isSub, argL, argR); + DIP("%ss %s, %s, 0x%x\n", + nm, nameIReg64orZR(dd), nameIReg64orSP(nn), uimm12); + } else { + putIReg64orSP(dd, mkexpr(res)); + DIP("%s %s, %s, 0x%x\n", + nm, nameIReg64orSP(dd), nameIReg64orSP(nn), uimm12); + } + } else { + IRTemp argL = newTemp(Ity_I32); + IRTemp argR = newTemp(Ity_I32); + IRTemp res = newTemp(Ity_I32); + assign(argL, getIReg32orSP(nn)); + assign(argR, mkU32(uimm12)); + assign(res, binop(isSub ? Iop_Sub32 : Iop_Add32, + mkexpr(argL), mkexpr(argR))); + if (setCC) { + putIReg32orZR(dd, mkexpr(res)); + setFlags_ADD_SUB(False/*!is64*/, isSub, argL, argR); + DIP("%ss %s, %s, 0x%x\n", + nm, nameIReg32orZR(dd), nameIReg32orSP(nn), uimm12); + } else { + putIReg32orSP(dd, mkexpr(res)); + DIP("%s %s, %s, 0x%x\n", + nm, nameIReg32orSP(dd), nameIReg32orSP(nn), uimm12); + } + } + return True; + } + } + + /* -------------------- ADR/ADRP -------------------- */ + if (INSN(28,24) == BITS5(1,0,0,0,0)) { + UInt bP = INSN(31,31); + UInt immLo = INSN(30,29); + UInt immHi = INSN(23,5); + UInt rD = INSN(4,0); + ULong uimm = (immHi << 2) | immLo; + ULong simm = sx_to_64(uimm, 21); + ULong val; + if (bP) { + val = (guest_PC_curr_instr & 0xFFFFFFFFFFFFF000ULL) + (simm << 12); + } else { + val = guest_PC_curr_instr + simm; + } + putIReg64orZR(rD, mkU64(val)); + DIP("adr%s %s, 0x%llx\n", bP ? 
"p" : "", nameIReg64orZR(rD), val); + return True; + } + + /* -------------------- LOGIC(imm) -------------------- */ + if (INSN(28,23) == BITS6(1,0,0,1,0,0)) { + /* 31 30 28 22 21 15 9 4 + sf op 100100 N immr imms Rn Rd + op=00: AND Rd|SP, Rn, #imm + op=01: ORR Rd|SP, Rn, #imm + op=10: EOR Rd|SP, Rn, #imm + op=11: ANDS Rd|ZR, Rn, #imm + */ + Bool is64 = INSN(31,31) == 1; + UInt op = INSN(30,29); + UInt N = INSN(22,22); + UInt immR = INSN(21,16); + UInt immS = INSN(15,10); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + ULong imm = 0; + Bool ok; + if (N == 1 && !is64) + goto after_logic_imm; /* not allowed; fall through */ + ok = dbm_DecodeBitMasks(&imm, NULL, + N, immS, immR, True, is64 ? 64 : 32); + if (!ok) + goto after_logic_imm; + + const HChar* names[4] = { "and", "orr", "eor", "ands" }; + const IROp ops64[4] = { Iop_And64, Iop_Or64, Iop_Xor64, Iop_And64 }; + const IROp ops32[4] = { Iop_And32, Iop_Or32, Iop_Xor32, Iop_And32 }; + + vassert(op < 4); + if (is64) { + IRExpr* argL = getIReg64orZR(nn); + IRExpr* argR = mkU64(imm); + IRTemp res = newTemp(Ity_I64); + assign(res, binop(ops64[op], argL, argR)); + if (op < 3) { + putIReg64orSP(dd, mkexpr(res)); + DIP("%s %s, %s, 0x%llx\n", names[op], + nameIReg64orSP(dd), nameIReg64orZR(nn), imm); + } else { + putIReg64orZR(dd, mkexpr(res)); + setFlags_LOGIC(True/*is64*/, res); + DIP("%s %s, %s, 0x%llx\n", names[op], + nameIReg64orZR(dd), nameIReg64orZR(nn), imm); + } + } else { + IRExpr* argL = getIReg32orZR(nn); + IRExpr* argR = mkU32((UInt)imm); + IRTemp res = newTemp(Ity_I32); + assign(res, binop(ops32[op], argL, argR)); + if (op < 3) { + putIReg32orSP(dd, mkexpr(res)); + DIP("%s %s, %s, 0x%x\n", names[op], + nameIReg32orSP(dd), nameIReg32orZR(nn), (UInt)imm); + } else { + putIReg32orZR(dd, mkexpr(res)); + setFlags_LOGIC(False/*!is64*/, res); + DIP("%s %s, %s, 0x%x\n", names[op], + nameIReg32orZR(dd), nameIReg32orZR(nn), (UInt)imm); + } + } + return True; + } + after_logic_imm: + + /* -------------------- MOV{Z,N,K} -------------------- */ + if (INSN(28,23) == BITS6(1,0,0,1,0,1)) { + /* 31 30 28 22 20 4 + | | | | | | + sf 10 100 101 hw imm16 Rd MOV(Z) Rd, (imm16 << (16*hw)) + sf 00 100 101 hw imm16 Rd MOV(N) Rd, ~(imm16 << (16*hw)) + sf 11 100 101 hw imm16 Rd MOV(K) Rd, (imm16 << (16*hw)) + */ + Bool is64 = INSN(31,31) == 1; + UInt subopc = INSN(30,29); + UInt hw = INSN(22,21); + UInt imm16 = INSN(20,5); + UInt dd = INSN(4,0); + if (subopc == BITS2(0,1) || (!is64 && hw >= 2)) { + /* invalid; fall through */ + } else { + ULong imm64 = ((ULong)imm16) << (16 * hw); + if (!is64) + vassert(imm64 < 0x100000000ULL); + switch (subopc) { + case BITS2(1,0): // MOVZ + putIRegOrZR(is64, dd, is64 ? mkU64(imm64) : mkU32((UInt)imm64)); + DIP("movz %s, 0x%llx\n", nameIRegOrZR(is64, dd), imm64); + break; + case BITS2(0,0): // MOVN + imm64 = ~imm64; + if (!is64) + imm64 &= 0xFFFFFFFFULL; + putIRegOrZR(is64, dd, is64 ? mkU64(imm64) : mkU32((UInt)imm64)); + DIP("movn %s, 0x%llx\n", nameIRegOrZR(is64, dd), imm64); + break; + case BITS2(1,1): // MOVK + /* This is more complex. We are inserting a slice into + the destination register, so we need to have the old + value of it. 
*/ + if (is64) { + IRTemp old = newTemp(Ity_I64); + assign(old, getIReg64orZR(dd)); + ULong mask = 0xFFFFULL << (16 * hw); + IRExpr* res + = binop(Iop_Or64, + binop(Iop_And64, mkexpr(old), mkU64(~mask)), + mkU64(imm64)); + putIReg64orZR(dd, res); + DIP("movk %s, 0x%x, lsl %u\n", + nameIReg64orZR(dd), imm16, 16*hw); + } else { + IRTemp old = newTemp(Ity_I32); + assign(old, getIReg32orZR(dd)); + vassert(hw <= 1); + UInt mask = 0xFFFF << (16 * hw); + IRExpr* res + = binop(Iop_Or32, + binop(Iop_And32, mkexpr(old), mkU32(~mask)), + mkU32((UInt)imm64)); + putIReg32orZR(dd, res); + DIP("movk %s, 0x%x, lsl %u\n", + nameIReg32orZR(dd), imm16, 16*hw); + } + break; + default: + vassert(0); + } + return True; + } + } + + /* -------------------- {U,S,}BFM -------------------- */ + /* 30 28 22 21 15 9 4 + + sf 10 100110 N immr imms nn dd + UBFM Wd, Wn, #immr, #imms when sf=0, N=0, immr[5]=0, imms[5]=0 + UBFM Xd, Xn, #immr, #imms when sf=1, N=1 + + sf 00 100110 N immr imms nn dd + SBFM Wd, Wn, #immr, #imms when sf=0, N=0, immr[5]=0, imms[5]=0 + SBFM Xd, Xn, #immr, #imms when sf=1, N=1 + + sf 01 100110 N immr imms nn dd + BFM Wd, Wn, #immr, #imms when sf=0, N=0, immr[5]=0, imms[5]=0 + BFM Xd, Xn, #immr, #imms when sf=1, N=1 + */ + if (INSN(28,23) == BITS6(1,0,0,1,1,0)) { + UInt sf = INSN(31,31); + UInt opc = INSN(30,29); + UInt N = INSN(22,22); + UInt immR = INSN(21,16); + UInt immS = INSN(15,10); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + Bool inZero = False; + Bool extend = False; + const HChar* nm = "???"; + /* skip invalid combinations */ + switch (opc) { + case BITS2(0,0): + inZero = True; extend = True; nm = "sbfm"; break; + case BITS2(0,1): + inZero = False; extend = False; nm = "bfm"; break; + case BITS2(1,0): + inZero = True; extend = False; nm = "ubfm"; break; + case BITS2(1,1): + goto after_bfm; /* invalid */ + default: + vassert(0); + } + if (sf == 1 && N != 1) goto after_bfm; + if (sf == 0 && (N != 0 || ((immR >> 5) & 1) != 0 + || ((immS >> 5) & 1) != 0)) goto after_bfm; + ULong wmask = 0, tmask = 0; + Bool ok = dbm_DecodeBitMasks(&wmask, &tmask, + N, immS, immR, False, sf == 1 ? 64 : 32); + if (!ok) goto after_bfm; /* hmmm */ + + Bool is64 = sf == 1; + IRType ty = is64 ? Ity_I64 : Ity_I32; + + IRTemp dst = newTemp(ty); + IRTemp src = newTemp(ty); + IRTemp bot = newTemp(ty); + IRTemp top = newTemp(ty); + IRTemp res = newTemp(ty); + assign(dst, inZero ? mkU(ty,0) : getIRegOrZR(is64, dd)); + assign(src, getIRegOrZR(is64, nn)); + /* perform bitfield move on low bits */ + assign(bot, binop(mkOR(ty), + binop(mkAND(ty), mkexpr(dst), mkU(ty, ~wmask)), + binop(mkAND(ty), mkexpr(mathROR(ty, src, immR)), + mkU(ty, wmask)))); + /* determine extension bits (sign, zero or dest register) */ + assign(top, mkexpr(extend ? 
mathREPLICATE(ty, src, immS) : dst)); + /* combine extension bits and result bits */ + assign(res, binop(mkOR(ty), + binop(mkAND(ty), mkexpr(top), mkU(ty, ~tmask)), + binop(mkAND(ty), mkexpr(bot), mkU(ty, tmask)))); + putIRegOrZR(is64, dd, mkexpr(res)); + DIP("%s %s, %s, immR=%u, immS=%u\n", + nm, nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), immR, immS); + return True; + } + after_bfm: + + /* ---------------------- EXTR ---------------------- */ + /* 30 28 22 20 15 9 4 + 1 00 100111 10 m imm6 n d EXTR Xd, Xn, Xm, #imm6 + 0 00 100111 00 m imm6 n d EXTR Wd, Wn, Wm, #imm6 when #imm6 < 32 + */ + if (INSN(30,23) == BITS8(0,0,1,0,0,1,1,1) && INSN(21,21) == 0) { + Bool is64 = INSN(31,31) == 1; + UInt mm = INSN(20,16); + UInt imm6 = INSN(15,10); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + Bool valid = True; + if (INSN(31,31) != INSN(22,22)) + valid = False; + if (!is64 && imm6 >= 32) + valid = False; + if (!valid) goto after_extr; + IRType ty = is64 ? Ity_I64 : Ity_I32; + IRTemp srcHi = newTemp(ty); + IRTemp srcLo = newTemp(ty); + IRTemp res = newTemp(ty); + assign(srcHi, getIRegOrZR(is64, nn)); + assign(srcLo, getIRegOrZR(is64, mm)); + if (imm6 == 0) { + assign(res, mkexpr(srcLo)); + } else { + UInt szBits = 8 * sizeofIRType(ty); + vassert(imm6 > 0 && imm6 < szBits); + assign(res, binop(mkOR(ty), + binop(mkSHL(ty), mkexpr(srcHi), mkU8(szBits-imm6)), + binop(mkSHR(ty), mkexpr(srcLo), mkU8(imm6)))); + } + putIRegOrZR(is64, dd, mkexpr(res)); + DIP("extr %s, %s, %s, #%u\n", + nameIRegOrZR(is64,dd), + nameIRegOrZR(is64,nn), nameIRegOrZR(is64,mm), imm6); + return True; + } + after_extr: + + vex_printf("ARM64 front end: data_processing_immediate\n"); + return False; +# undef INSN +} + + +/*------------------------------------------------------------*/ +/*--- Data processing (register) instructions ---*/ +/*------------------------------------------------------------*/ + +static const HChar* nameSH ( UInt sh ) { + switch (sh) { + case 0: return "lsl"; + case 1: return "lsr"; + case 2: return "asr"; + case 3: return "ror"; + default: vassert(0); + } +} + +/* Generate IR to get a register value, possibly shifted by an + immediate. Returns either a 32- or 64-bit temporary holding the + result. After the shift, the value can optionally be NOT-ed + too. + + sh_how coding: 00=SHL, 01=SHR, 10=SAR, 11=ROR. sh_amt may only be + in the range 0 to (is64 ? 64 : 32)-1. For some instructions, ROR + isn't allowed, but it's the job of the caller to check that. +*/ +static IRTemp getShiftedIRegOrZR ( Bool is64, + UInt sh_how, UInt sh_amt, UInt regNo, + Bool invert ) +{ + vassert(sh_how < 4); + vassert(sh_amt < (is64 ? 64 : 32)); + IRType ty = is64 ? 
Ity_I64 : Ity_I32; + IRTemp t0 = newTemp(ty); + assign(t0, getIRegOrZR(is64, regNo)); + IRTemp t1 = newTemp(ty); + switch (sh_how) { + case BITS2(0,0): + assign(t1, binop(mkSHL(ty), mkexpr(t0), mkU8(sh_amt))); + break; + case BITS2(0,1): + assign(t1, binop(mkSHR(ty), mkexpr(t0), mkU8(sh_amt))); + break; + case BITS2(1,0): + assign(t1, binop(mkSAR(ty), mkexpr(t0), mkU8(sh_amt))); + break; + case BITS2(1,1): + assign(t1, mkexpr(mathROR(ty, t0, sh_amt))); + break; + default: + vassert(0); + } + if (invert) { + IRTemp t2 = newTemp(ty); + assign(t2, unop(mkNOT(ty), mkexpr(t1))); + return t2; + } else { + return t1; + } +} + + +static +Bool dis_ARM64_data_processing_register(/*MB_OUT*/DisResult* dres, + UInt insn) +{ +# define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin)) + + /* ------------------- ADD/SUB(reg) ------------------- */ + /* x==0 => 32 bit op x==1 => 64 bit op + sh: 00=LSL, 01=LSR, 10=ASR, 11=ROR(NOT ALLOWED) + + 31 30 29 28 23 21 20 15 9 4 + | | | | | | | | | | + x 0 0 01011 sh 0 Rm imm6 Rn Rd ADD Rd,Rn, sh(Rm,imm6) + x 0 1 01011 sh 0 Rm imm6 Rn Rd ADDS Rd,Rn, sh(Rm,imm6) + x 1 0 01011 sh 0 Rm imm6 Rn Rd SUB Rd,Rn, sh(Rm,imm6) + x 1 1 01011 sh 0 Rm imm6 Rn Rd SUBS Rd,Rn, sh(Rm,imm6) + */ + if (INSN(28,24) == BITS5(0,1,0,1,1) && INSN(21,21) == 0) { + UInt bX = INSN(31,31); + UInt bOP = INSN(30,30); /* 0: ADD, 1: SUB */ + UInt bS = INSN(29, 29); /* set flags? */ + UInt sh = INSN(23,22); + UInt rM = INSN(20,16); + UInt imm6 = INSN(15,10); + UInt rN = INSN(9,5); + UInt rD = INSN(4,0); + Bool isSUB = bOP == 1; + Bool is64 = bX == 1; + IRType ty = is64 ? Ity_I64 : Ity_I32; + if ((!is64 && imm6 > 31) || sh == BITS2(1,1)) { + /* invalid; fall through */ + } else { + IRTemp argL = newTemp(ty); + assign(argL, getIRegOrZR(is64, rN)); + IRTemp argR = getShiftedIRegOrZR(is64, sh, imm6, rM, False); + IROp op = isSUB ? mkSUB(ty) : mkADD(ty); + IRTemp res = newTemp(ty); + assign(res, binop(op, mkexpr(argL), mkexpr(argR))); + if (rD != 31) putIRegOrZR(is64, rD, mkexpr(res)); + if (bS) { + setFlags_ADD_SUB(is64, isSUB, argL, argR); + } + DIP("%s%s %s, %s, %s, %s #%u\n", + bOP ? "sub" : "add", bS ? "s" : "", + nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN), + nameIRegOrZR(is64, rM), nameSH(sh), imm6); + return True; + } + } + + /* -------------------- LOGIC(reg) -------------------- */ + /* x==0 => 32 bit op x==1 => 64 bit op + N==0 => inv? is no-op (no inversion) + N==1 => inv? is NOT + sh: 00=LSL, 01=LSR, 10=ASR, 11=ROR + + 31 30 28 23 21 20 15 9 4 + | | | | | | | | | + x 00 01010 sh N Rm imm6 Rn Rd AND Rd,Rn, inv?(sh(Rm,imm6)) + x 01 01010 sh N Rm imm6 Rn Rd ORR Rd,Rn, inv?(sh(Rm,imm6)) + x 10 01010 sh N Rm imm6 Rn Rd EOR Rd,Rn, inv?(sh(Rm,imm6)) + x 11 01010 sh N Rm imm6 Rn Rd ANDS Rd,Rn, inv?(sh(Rm,imm6)) + With N=1, the names are: BIC ORN EON BICS + */ + if (INSN(28,24) == BITS5(0,1,0,1,0)) { + UInt bX = INSN(31,31); + UInt sh = INSN(23,22); + UInt bN = INSN(21,21); + UInt rM = INSN(20,16); + UInt imm6 = INSN(15,10); + UInt rN = INSN(9,5); + UInt rD = INSN(4,0); + Bool is64 = bX == 1; + IRType ty = is64 ? 
Ity_I64 : Ity_I32; + if (!is64 && imm6 > 31) { + /* invalid; fall though */ + } else { + IRTemp argL = newTemp(ty); + assign(argL, getIRegOrZR(is64, rN)); + IRTemp argR = getShiftedIRegOrZR(is64, sh, imm6, rM, bN == 1); + IROp op = Iop_INVALID; + switch (INSN(30,29)) { + case BITS2(0,0): case BITS2(1,1): op = mkAND(ty); break; + case BITS2(0,1): op = mkOR(ty); break; + case BITS2(1,0): op = mkXOR(ty); break; + default: vassert(0); + } + IRTemp res = newTemp(ty); + assign(res, binop(op, mkexpr(argL), mkexpr(argR))); + if (INSN(30,29) == BITS2(1,1)) { + setFlags_LOGIC(is64, res); + } + putIRegOrZR(is64, rD, mkexpr(res)); + + static const HChar* names_op[8] + = { "and", "orr", "eor", "ands", "bic", "orn", "eon", "bics" }; + vassert(((bN << 2) | INSN(30,29)) < 8); + const HChar* nm_op = names_op[(bN << 2) | INSN(30,29)]; + /* Special-case the printing of "MOV" */ + if (rN == 31/*zr*/ && sh == 0/*LSL*/ && imm6 == 0 && bN == 0) { + DIP("mov %s, %s\n", nameIRegOrZR(is64, rD), + nameIRegOrZR(is64, rM)); + } else { + DIP("%s %s, %s, %s, %s #%u\n", nm_op, + nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN), + nameIRegOrZR(is64, rM), nameSH(sh), imm6); + } + return True; + } + } + + /* -------------------- {U,S}MULH -------------------- */ + /* 31 23 22 20 15 9 4 + 10011011 1 10 Rm 011111 Rn Rd UMULH Xd,Xn,Xm + 10011011 0 10 Rm 011111 Rn Rd SMULH Xd,Xn,Xm + */ + if (INSN(31,24) == BITS8(1,0,0,1,1,0,1,1) + && INSN(22,21) == BITS2(1,0) && INSN(15,10) == BITS6(0,1,1,1,1,1)) { + Bool isU = INSN(23,23) == 1; + UInt mm = INSN(20,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + putIReg64orZR(dd, unop(Iop_128HIto64, + binop(isU ? Iop_MullU64 : Iop_MullS64, + getIReg64orZR(nn), getIReg64orZR(mm)))); + DIP("%cmulh %s, %s, %s\n", + isU ? 'u' : 's', + nameIReg64orZR(dd), nameIReg64orZR(nn), nameIReg64orZR(mm)); + return True; + } + + /* -------------------- M{ADD,SUB} -------------------- */ + /* 31 30 20 15 14 9 4 + sf 00 11011 000 m 0 a n r MADD Rd,Rn,Rm,Ra d = a+m*n + sf 00 11011 000 m 1 a n r MADD Rd,Rn,Rm,Ra d = a-m*n + */ + if (INSN(30,21) == BITS10(0,0,1,1,0,1,1,0,0,0)) { + Bool is64 = INSN(31,31) == 1; + UInt mm = INSN(20,16); + Bool isAdd = INSN(15,15) == 0; + UInt aa = INSN(14,10); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + if (is64) { + putIReg64orZR( + dd, + binop(isAdd ? Iop_Add64 : Iop_Sub64, + getIReg64orZR(aa), + binop(Iop_Mul64, getIReg64orZR(mm), getIReg64orZR(nn)))); + } else { + putIReg32orZR( + dd, + binop(isAdd ? Iop_Add32 : Iop_Sub32, + getIReg32orZR(aa), + binop(Iop_Mul32, getIReg32orZR(mm), getIReg32orZR(nn)))); + } + DIP("%s %s, %s, %s, %s\n", + isAdd ? "madd" : "msub", + nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), + nameIRegOrZR(is64, mm), nameIRegOrZR(is64, aa)); + return True; + } + + /* ---------------- CS{EL,INC,INV,NEG} ---------------- */ + /* 31 30 28 20 15 11 9 4 + sf 00 1101 0100 mm cond 00 nn dd CSEL Rd,Rn,Rm + sf 00 1101 0100 mm cond 01 nn dd CSINC Rd,Rn,Rm + sf 10 1101 0100 mm cond 00 nn dd CSINV Rd,Rn,Rm + sf 10 1101 0100 mm cond 01 nn dd CSNEG Rd,Rn,Rm + In all cases, the operation is: Rd = if cond then Rn else OP(Rm) + */ + if (INSN(29,21) == BITS9(0, 1,1,0,1, 0,1,0,0) && INSN(11,11) == 0) { + Bool is64 = INSN(31,31) == 1; + UInt b30 = INSN(30,30); + UInt mm = INSN(20,16); + UInt cond = INSN(15,12); + UInt b10 = INSN(10,10); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + UInt op = (b30 << 1) | b10; /* 00=id 01=inc 10=inv 11=neg */ + IRType ty = is64 ? 
Ity_I64 : Ity_I32; + IRExpr* argL = getIRegOrZR(is64, nn); + IRExpr* argR = getIRegOrZR(is64, mm); + switch (op) { + case BITS2(0,0): + break; + case BITS2(0,1): + argR = binop(mkADD(ty), argR, mkU(ty,1)); + break; + case BITS2(1,0): + argR = unop(mkNOT(ty), argR); + break; + case BITS2(1,1): + argR = binop(mkSUB(ty), mkU(ty,0), argR); + break; + default: + vassert(0); + } + putIRegOrZR( + is64, dd, + IRExpr_ITE(unop(Iop_64to1, mk_arm64g_calculate_condition(cond)), + argL, argR) + ); + const HChar* op_nm[4] = { "csel", "csinc", "csinv", "csneg" }; + DIP("%s %s, %s, %s, %s\n", op_nm[op], + nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), + nameIRegOrZR(is64, mm), nameCC(cond)); + return True; + } + + /* -------------- ADD/SUB(extended reg) -------------- */ + /* 28 20 15 12 9 4 + 000 01011 00 1 m opt imm3 n d ADD Wd|SP, Wn|SP, Wm ext&lsld + 100 01011 00 1 m opt imm3 n d ADD Xd|SP, Xn|SP, Rm ext&lsld + + 001 01011 00 1 m opt imm3 n d ADDS Wd, Wn|SP, Wm ext&lsld + 101 01011 00 1 m opt imm3 n d ADDS Xd, Xn|SP, Rm ext&lsld + + 010 01011 00 1 m opt imm3 n d SUB Wd|SP, Wn|SP, Wm ext&lsld + 110 01011 00 1 m opt imm3 n d SUB Xd|SP, Xn|SP, Rm ext&lsld + + 011 01011 00 1 m opt imm3 n d SUBS Wd, Wn|SP, Wm ext&lsld + 111 01011 00 1 m opt imm3 n d SUBS Xd, Xn|SP, Rm ext&lsld + + The 'm' operand is extended per opt, thusly: + + 000 Xm & 0xFF UXTB + 001 Xm & 0xFFFF UXTH + 010 Xm & (2^32)-1 UXTW + 011 Xm UXTX + + 100 Xm sx from bit 7 SXTB + 101 Xm sx from bit 15 SXTH + 110 Xm sx from bit 31 SXTW + 111 Xm SXTX + + In the 64 bit case (bit31 == 1), UXTX and SXTX are the identity + operation on Xm. In the 32 bit case, UXTW, UXTX, SXTW and SXTX + are the identity operation on Wm. + + After extension, the value is shifted left by imm3 bits, which + may only be in the range 0 .. 4 inclusive. + */ + if (INSN(28,21) == BITS8(0,1,0,1,1,0,0,1) && INSN(12,10) <= 4) { + Bool is64 = INSN(31,31) == 1; + Bool isSub = INSN(30,30) == 1; + Bool setCC = INSN(29,29) == 1; + UInt mm = INSN(20,16); + UInt opt = INSN(15,13); + UInt imm3 = INSN(12,10); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + const HChar* nameExt[8] = { "uxtb", "uxth", "uxtw", "uxtx", + "sxtb", "sxth", "sxtw", "sxtx" }; + /* Do almost the same thing in the 32- and 64-bit cases. */ + IRTemp xN = newTemp(Ity_I64); + IRTemp xM = newTemp(Ity_I64); + assign(xN, getIReg64orSP(nn)); + assign(xM, getIReg64orZR(mm)); + IRExpr* xMw = mkexpr(xM); /* "xM widened" */ + Int shSX = 0; + /* widen Xm .. */ + switch (opt) { + case BITS3(0,0,0): // UXTB + xMw = binop(Iop_And64, xMw, mkU64(0xFF)); break; + case BITS3(0,0,1): // UXTH + xMw = binop(Iop_And64, xMw, mkU64(0xFFFF)); break; + case BITS3(0,1,0): // UXTW -- noop for the 32bit case + if (is64) { + xMw = unop(Iop_32Uto64, unop(Iop_64to32, xMw)); + } + break; + case BITS3(0,1,1): // UXTX -- always a noop + break; + case BITS3(1,0,0): // SXTB + shSX = 56; goto sxTo64; + case BITS3(1,0,1): // SXTH + shSX = 48; goto sxTo64; + case BITS3(1,1,0): // SXTW -- noop for the 32bit case + if (is64) { + shSX = 32; goto sxTo64; + } + break; + case BITS3(1,1,1): // SXTX -- always a noop + break; + sxTo64: + vassert(shSX >= 32); + xMw = binop(Iop_Sar64, binop(Iop_Shl64, xMw, mkU8(shSX)), + mkU8(shSX)); + break; + default: + vassert(0); + } + /* and now shift */ + IRTemp argL = xN; + IRTemp argR = newTemp(Ity_I64); + assign(argR, binop(Iop_Shl64, xMw, mkU8(imm3))); + IRTemp res = newTemp(Ity_I64); + assign(res, binop(isSub ? 
Iop_Sub64 : Iop_Add64, + mkexpr(argL), mkexpr(argR))); + if (is64) { + if (setCC) { + putIReg64orZR(dd, mkexpr(res)); + setFlags_ADD_SUB(True/*is64*/, isSub, argL, argR); + } else { + putIReg64orSP(dd, mkexpr(res)); + } + } else { + if (setCC) { + IRTemp argL32 = newTemp(Ity_I32); + IRTemp argR32 = newTemp(Ity_I32); + putIReg32orZR(dd, unop(Iop_64to32, mkexpr(res))); + assign(argL32, unop(Iop_64to32, mkexpr(argL))); + assign(argR32, unop(Iop_64to32, mkexpr(argR))); + setFlags_ADD_SUB(False/*!is64*/, isSub, argL32, argR32); + } else { + putIReg32orSP(dd, unop(Iop_64to32, mkexpr(res))); + } + } + DIP("%s%s %s, %s, %s %s lsl %u\n", + isSub ? "sub" : "add", setCC ? "s" : "", + setCC ? nameIRegOrZR(is64, dd) : nameIRegOrSP(is64, dd), + nameIRegOrSP(is64, nn), nameIRegOrSP(is64, mm), + nameExt[opt], imm3); + return True; + } + + /* ---------------- CCMP/CCMN(imm) ---------------- */ + /* Bizarrely, these appear in the "data processing register" + category, even though they are operations against an + immediate. */ + /* 31 29 20 15 11 9 3 + sf 1 111010010 imm5 cond 10 Rn 0 nzcv CCMP Rn, #imm5, #nzcv, cond + sf 0 111010010 imm5 cond 10 Rn 0 nzcv CCMN Rn, #imm5, #nzcv, cond + + Operation is: + (CCMP) flags = if cond then flags-after-sub(Rn,imm5) else nzcv + (CCMN) flags = if cond then flags-after-add(Rn,imm5) else nzcv + */ + if (INSN(29,21) == BITS9(1,1,1,0,1,0,0,1,0) + && INSN(11,10) == BITS2(1,0) && INSN(4,4) == 0) { + Bool is64 = INSN(31,31) == 1; + Bool isSUB = INSN(30,30) == 1; + UInt imm5 = INSN(20,16); + UInt cond = INSN(15,12); + UInt nn = INSN(9,5); + UInt nzcv = INSN(3,0); + + IRTemp condT = newTemp(Ity_I1); + assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond))); + + IRType ty = is64 ? Ity_I64 : Ity_I32; + IRTemp argL = newTemp(ty); + IRTemp argR = newTemp(ty); + + if (is64) { + assign(argL, getIReg64orZR(nn)); + assign(argR, mkU64(imm5)); + } else { + assign(argL, getIReg32orZR(nn)); + assign(argR, mkU32(imm5)); + } + setFlags_ADD_SUB_conditionally(is64, isSUB, condT, argL, argR, nzcv); + + DIP("ccm%c %s, #%u, #%u, %s\n", + isSUB ? 'p' : 'n', nameIRegOrZR(is64, nn), + imm5, nzcv, nameCC(cond)); + return True; + } + + /* ---------------- CCMP/CCMN(reg) ---------------- */ + /* 31 29 20 15 11 9 3 + sf 1 111010010 Rm cond 00 Rn 0 nzcv CCMP Rn, Rm, #nzcv, cond + sf 0 111010010 Rm cond 00 Rn 0 nzcv CCMN Rn, Rm, #nzcv, cond + Operation is: + (CCMP) flags = if cond then flags-after-sub(Rn,Rm) else nzcv + (CCMN) flags = if cond then flags-after-add(Rn,Rm) else nzcv + */ + if (INSN(29,21) == BITS9(1,1,1,0,1,0,0,1,0) + && INSN(11,10) == BITS2(0,0) && INSN(4,4) == 0) { + Bool is64 = INSN(31,31) == 1; + Bool isSUB = INSN(30,30) == 1; + UInt mm = INSN(20,16); + UInt cond = INSN(15,12); + UInt nn = INSN(9,5); + UInt nzcv = INSN(3,0); + + IRTemp condT = newTemp(Ity_I1); + assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond))); + + IRType ty = is64 ? Ity_I64 : Ity_I32; + IRTemp argL = newTemp(ty); + IRTemp argR = newTemp(ty); + + if (is64) { + assign(argL, getIReg64orZR(nn)); + assign(argR, getIReg64orZR(mm)); + } else { + assign(argL, getIReg32orZR(nn)); + assign(argR, getIReg32orZR(mm)); + } + setFlags_ADD_SUB_conditionally(is64, isSUB, condT, argL, argR, nzcv); + + DIP("ccm%c %s, %s, #%u, %s\n", + isSUB ? 
'p' : 'n', nameIRegOrZR(is64, nn), + nameIRegOrZR(is64, mm), nzcv, nameCC(cond)); + return True; + } + + + /* -------------- REV/REV16/REV32/RBIT -------------- */ + /* 31 30 28 20 15 11 9 4 + + 1 10 11010110 00000 0000 11 n d (1) REV Xd, Xn + 0 10 11010110 00000 0000 10 n d (2) REV Wd, Wn + + 1 10 11010110 00000 0000 00 n d (3) RBIT Xd, Xn + 0 10 11010110 00000 0000 00 n d (4) RBIT Wd, Wn + + 1 10 11010110 00000 0000 01 n d (5) REV16 Xd, Xn + 0 10 11010110 00000 0000 01 n d (6) REV16 Wd, Wn + + 1 10 11010110 00000 0000 10 n d (7) REV32 Xd, Xn + */ + if (INSN(30,21) == BITS10(1,0,1,1,0,1,0,1,1,0) + && INSN(20,12) == BITS9(0,0,0,0,0,0,0,0,0)) { + UInt b31 = INSN(31,31); + UInt opc = INSN(11,10); + + UInt ix = 0; + /**/ if (b31 == 1 && opc == BITS2(1,1)) ix = 1; + else if (b31 == 0 && opc == BITS2(1,0)) ix = 2; + else if (b31 == 1 && opc == BITS2(0,0)) ix = 3; + else if (b31 == 0 && opc == BITS2(0,0)) ix = 4; + else if (b31 == 1 && opc == BITS2(0,1)) ix = 5; + else if (b31 == 0 && opc == BITS2(0,1)) ix = 6; + else if (b31 == 1 && opc == BITS2(1,0)) ix = 7; + if (ix >= 1 && ix <= 7) { + Bool is64 = ix == 1 || ix == 3 || ix == 5 || ix == 7; + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + IRTemp src = newTemp(Ity_I64); + IRTemp dst = IRTemp_INVALID; + IRTemp (*math)(IRTemp) = NULL; + switch (ix) { + case 1: case 2: math = math_BYTESWAP64; break; + case 3: case 4: math = math_BITSWAP64; break; + case 5: case 6: math = math_USHORTSWAP64; break; + case 7: math = math_UINTSWAP64; break; + default: vassert(0); + } + const HChar* names[7] + = { "rev", "rev", "rbit", "rbit", "rev16", "rev16", "rev32" }; + const HChar* nm = names[ix-1]; + vassert(math); + if (ix == 6) { + /* This has to be special cased, since the logic below doesn't + handle it correctly. */ + assign(src, getIReg64orZR(nn)); + dst = math(src); + putIReg64orZR(dd, + unop(Iop_32Uto64, unop(Iop_64to32, mkexpr(dst)))); + } else if (is64) { + assign(src, getIReg64orZR(nn)); + dst = math(src); + putIReg64orZR(dd, mkexpr(dst)); + } else { + assign(src, binop(Iop_Shl64, getIReg64orZR(nn), mkU8(32))); + dst = math(src); + putIReg32orZR(dd, unop(Iop_64to32, mkexpr(dst))); + } + DIP("%s %s, %s\n", nm, + nameIRegOrZR(is64,dd), nameIRegOrZR(is64,nn)); + return True; + } + /* else fall through */ + } + + /* -------------------- CLZ/CLS -------------------- */ + /* 30 28 24 20 15 9 4 + sf 10 1101 0110 00000 00010 0 n d CLZ Rd, Rn + sf 10 1101 0110 00000 00010 1 n d CLS Rd, Rn + */ + if (INSN(30,21) == BITS10(1,0,1,1,0,1,0,1,1,0) + && INSN(20,11) == BITS10(0,0,0,0,0,0,0,0,1,0)) { + Bool is64 = INSN(31,31) == 1; + Bool isCLS = INSN(10,10) == 1; + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + IRTemp src = newTemp(Ity_I64); + IRTemp dst = newTemp(Ity_I64); + if (!isCLS) { // CLS not yet supported + if (is64) { + assign(src, getIReg64orZR(nn)); + assign(dst, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(src), mkU64(0)), + mkU64(64), + unop(Iop_Clz64, mkexpr(src)))); + putIReg64orZR(dd, mkexpr(dst)); + } else { + assign(src, binop(Iop_Shl64, + unop(Iop_32Uto64, getIReg32orZR(nn)), mkU8(32))); + assign(dst, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(src), mkU64(0)), + mkU64(32), + unop(Iop_Clz64, mkexpr(src)))); + putIReg32orZR(dd, unop(Iop_64to32, mkexpr(dst))); + } + DIP("cl%c %s, %s\n", + isCLS ? 
's' : 'z', nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn)); + return True; + } + } + + /* -------------------- LSLV/LSRV/ASRV -------------------- */ + /* 30 28 20 15 11 9 4 + sf 00 1101 0110 m 0010 00 n d LSLV Rd,Rn,Rm + sf 00 1101 0110 m 0010 01 n d LSRV Rd,Rn,Rm + sf 00 1101 0110 m 0010 10 n d ASRV Rd,Rn,Rm + */ + if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0) + && INSN(15,12) == BITS4(0,0,1,0) && INSN(11,10) < BITS2(1,1)) { + Bool is64 = INSN(31,31) == 1; + UInt mm = INSN(20,16); + UInt op = INSN(11,10); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + IRType ty = is64 ? Ity_I64 : Ity_I32; + IRTemp srcL = newTemp(ty); + IRTemp srcR = newTemp(Ity_I8); + IRTemp res = newTemp(ty); + IROp iop = Iop_INVALID; + assign(srcL, getIRegOrZR(is64, nn)); + assign(srcR, + unop(Iop_64to8, + binop(Iop_And64, + getIReg64orZR(mm), mkU64(is64 ? 63 : 31)))); + switch (op) { + case BITS2(0,0): iop = mkSHL(ty); break; + case BITS2(0,1): iop = mkSHR(ty); break; + case BITS2(1,0): iop = mkSAR(ty); break; + default: vassert(0); + } + assign(res, binop(iop, mkexpr(srcL), mkexpr(srcR))); + putIRegOrZR(is64, dd, mkexpr(res)); + vassert(op < 3); + const HChar* names[3] = { "lslv", "lsrv", "asrv" }; + DIP("%s %s, %s, %s\n", + names[op], nameIRegOrZR(is64,dd), + nameIRegOrZR(is64,nn), nameIRegOrZR(is64,mm)); + return True; + } + + /* -------------------- SDIV/UDIV -------------------- */ + /* 30 28 20 15 10 9 4 + sf 00 1101 0110 m 00001 1 n d SDIV Rd,Rn,Rm + sf 00 1101 0110 m 00001 0 n d UDIV Rd,Rn,Rm + */ + if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0) + && INSN(15,11) == BITS5(0,0,0,0,1)) { + Bool is64 = INSN(31,31) == 1; + UInt mm = INSN(20,16); + Bool isS = INSN(10,10) == 1; + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + if (isS) { + putIRegOrZR(is64, dd, binop(is64 ? Iop_DivS64 : Iop_DivS32, + getIRegOrZR(is64, nn), + getIRegOrZR(is64, mm))); + } else { + putIRegOrZR(is64, dd, binop(is64 ? Iop_DivU64 : Iop_DivU32, + getIRegOrZR(is64, nn), + getIRegOrZR(is64, mm))); + } + DIP("%cdiv %s, %s, %s\n", isS ? 's' : 'u', + nameIRegOrZR(is64, dd), + nameIRegOrZR(is64, nn), nameIRegOrZR(is64, mm)); + return True; + } + + /* ------------------ {S,U}M{ADD,SUB}L ------------------ */ + /* 31 23 20 15 14 9 4 + 1001 1011 101 m 0 a n d UMADDL Xd,Wn,Wm,Xa + 1001 1011 001 m 0 a n d SMADDL Xd,Wn,Wm,Xa + 1001 1011 101 m 1 a n d UMSUBL Xd,Wn,Wm,Xa + 1001 1011 001 m 1 a n d SMSUBL Xd,Wn,Wm,Xa + with operation + Xd = Xa +/- (Wn *u/s Wm) + */ + if (INSN(31,24) == BITS8(1,0,0,1,1,0,1,1) && INSN(22,21) == BITS2(0,1)) { + Bool isU = INSN(23,23) == 1; + UInt mm = INSN(20,16); + Bool isAdd = INSN(15,15) == 0; + UInt aa = INSN(14,10); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + IRTemp wN = newTemp(Ity_I32); + IRTemp wM = newTemp(Ity_I32); + IRTemp xA = newTemp(Ity_I64); + IRTemp muld = newTemp(Ity_I64); + IRTemp res = newTemp(Ity_I64); + assign(wN, getIReg32orZR(nn)); + assign(wM, getIReg32orZR(mm)); + assign(xA, getIReg64orZR(aa)); + assign(muld, binop(isU ? Iop_MullU32 : Iop_MullS32, + mkexpr(wN), mkexpr(wM))); + assign(res, binop(isAdd ? Iop_Add64 : Iop_Sub64, + mkexpr(xA), mkexpr(muld))); + putIReg64orZR(dd, mkexpr(res)); + DIP("%cm%sl %s, %s, %s, %s\n", isU ? 'u' : 's', isAdd ? 
"add" : "sub", + nameIReg64orZR(dd), nameIReg32orZR(nn), + nameIReg32orZR(mm), nameIReg64orZR(aa)); + return True; + } + vex_printf("ARM64 front end: data_processing_register\n"); + return False; +# undef INSN +} + + +/*------------------------------------------------------------*/ +/*--- Load and Store instructions ---*/ +/*------------------------------------------------------------*/ + +/* Generate the EA for a "reg + reg" style amode. This is done from + parts of the insn, but for sanity checking sake it takes the whole + insn. This appears to depend on insn[15:12], with opt=insn[15:13] + and S=insn[12]: + + The possible forms, along with their opt:S values, are: + 011:0 Xn|SP + Xm + 111:0 Xn|SP + Xm + 011:1 Xn|SP + Xm * transfer_szB + 111:1 Xn|SP + Xm * transfer_szB + 010:0 Xn|SP + 32Uto64(Wm) + 010:1 Xn|SP + 32Uto64(Wm) * transfer_szB + 110:0 Xn|SP + 32Sto64(Wm) + 110:1 Xn|SP + 32Sto64(Wm) * transfer_szB + + Rm is insn[20:16]. Rn is insn[9:5]. Rt is insn[4:0]. Log2 of + the transfer size is insn[23,31,30]. For integer loads/stores, + insn[23] is zero, hence szLg2 can be at most 3 in such cases. + + If the decoding fails, it returns IRTemp_INVALID. + + isInt is True iff this is decoding is for transfers to/from integer + registers. If False it is for transfers to/from vector registers. +*/ +static IRTemp gen_indexed_EA ( /*OUT*/HChar* buf, UInt insn, Bool isInt ) +{ + UInt optS = SLICE_UInt(insn, 15, 12); + UInt mm = SLICE_UInt(insn, 20, 16); + UInt nn = SLICE_UInt(insn, 9, 5); + UInt szLg2 = (isInt ? 0 : (SLICE_UInt(insn, 23, 23) << 2)) + | SLICE_UInt(insn, 31, 30); // Log2 of the size + + buf[0] = 0; + + /* Sanity checks, that this really is a load/store insn. */ + if (SLICE_UInt(insn, 11, 10) != BITS2(1,0)) + goto fail; + + if (isInt + && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,0,1,1)/*LDR*/ + && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,0,0,1)/*STR*/ + && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,1,0,1)/*LDRSbhw Xt*/ + && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,1,1,1))/*LDRSbhw Wt*/ + goto fail; + + if (!isInt + && SLICE_UInt(insn, 29, 24) != BITS6(1,1,1,1,0,0)) /*LDR/STR*/ + goto fail; + + /* Throw out non-verified but possibly valid cases. 
*/ + switch (szLg2) { + case BITS3(0,0,0): break; // 8 bit, valid for both int and vec + case BITS3(0,0,1): break; // 16 bit, valid for both int and vec + case BITS3(0,1,0): break; // 32 bit, valid for both int and vec + case BITS3(0,1,1): break; // 64 bit, valid for both int and vec + case BITS3(1,0,0): // can only ever be valid for the vector case + if (isInt) goto fail; else goto fail; + case BITS3(1,0,1): // these sizes are never valid + case BITS3(1,1,0): + case BITS3(1,1,1): goto fail; + + default: vassert(0); + } + + IRExpr* rhs = NULL; + switch (optS) { + case BITS4(1,1,1,0): goto fail; //ATC + case BITS4(0,1,1,0): + rhs = getIReg64orZR(mm); + vex_sprintf(buf, "[%s, %s]", + nameIReg64orZR(nn), nameIReg64orZR(mm)); + break; + case BITS4(1,1,1,1): goto fail; //ATC + case BITS4(0,1,1,1): + rhs = binop(Iop_Shl64, getIReg64orZR(mm), mkU8(szLg2)); + vex_sprintf(buf, "[%s, %s lsl %u]", + nameIReg64orZR(nn), nameIReg64orZR(mm), szLg2); + break; + case BITS4(0,1,0,0): + rhs = unop(Iop_32Uto64, getIReg32orZR(mm)); + vex_sprintf(buf, "[%s, %s uxtx]", + nameIReg64orZR(nn), nameIReg32orZR(mm)); + break; + case BITS4(0,1,0,1): + rhs = binop(Iop_Shl64, + unop(Iop_32Uto64, getIReg32orZR(mm)), mkU8(szLg2)); + vex_sprintf(buf, "[%s, %s uxtx, lsl %u]", + nameIReg64orZR(nn), nameIReg32orZR(mm), szLg2); + break; + case BITS4(1,1,0,0): + rhs = unop(Iop_32Sto64, getIReg32orZR(mm)); + vex_sprintf(buf, "[%s, %s sxtx]", + nameIReg64orZR(nn), nameIReg32orZR(mm)); + break; + case BITS4(1,1,0,1): + rhs = binop(Iop_Shl64, + unop(Iop_32Sto64, getIReg32orZR(mm)), mkU8(szLg2)); + vex_sprintf(buf, "[%s, %s sxtx, lsl %u]", + nameIReg64orZR(nn), nameIReg32orZR(mm), szLg2); + break; + default: + /* The rest appear to be genuinely invalid */ + goto fail; + } + + vassert(rhs); + IRTemp res = newTemp(Ity_I64); + assign(res, binop(Iop_Add64, getIReg64orSP(nn), rhs)); + return res; + + fail: + vex_printf("gen_indexed_EA: unhandled case optS == 0x%x\n", optS); + return IRTemp_INVALID; +} + + +/* Generate an 8/16/32/64 bit integer store to ADDR for the lowest + bits of DATAE :: Ity_I64. */ +static void gen_narrowing_store ( UInt szB, IRTemp addr, IRExpr* dataE ) +{ + IRExpr* addrE = mkexpr(addr); + switch (szB) { + case 8: + storeLE(addrE, dataE); + break; + case 4: + storeLE(addrE, unop(Iop_64to32, dataE)); + break; + case 2: + storeLE(addrE, unop(Iop_64to16, dataE)); + break; + case 1: + storeLE(addrE, unop(Iop_64to8, dataE)); + break; + default: + vassert(0); + } +} + + +/* Generate an 8/16/32/64 bit unsigned widening load from ADDR, + placing the result in an Ity_I64 temporary. 
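+
+   For example (illustrative): with szB == 2 and the addressed
+   halfword holding 0xBEEF, the result temporary ends up holding
+   0x000000000000BEEF.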
*/ +static IRTemp gen_zwidening_load ( UInt szB, IRTemp addr ) +{ + IRTemp res = newTemp(Ity_I64); + IRExpr* addrE = mkexpr(addr); + switch (szB) { + case 8: + assign(res, loadLE(Ity_I64,addrE)); + break; + case 4: + assign(res, unop(Iop_32Uto64, loadLE(Ity_I32,addrE))); + break; + case 2: + assign(res, unop(Iop_16Uto64, loadLE(Ity_I16,addrE))); + break; + case 1: + assign(res, unop(Iop_8Uto64, loadLE(Ity_I8,addrE))); + break; + default: + vassert(0); + } + return res; +} + + +static +Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) +{ +# define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin)) + + /* ------------ LDR,STR (immediate, uimm12) ----------- */ + /* uimm12 is scaled by the transfer size + + 31 29 26 21 9 4 + | | | | | | + 11 111 00100 imm12 nn tt STR Xt, [Xn|SP, #imm12 * 8] + 11 111 00101 imm12 nn tt LDR Xt, [Xn|SP, #imm12 * 8] + + 10 111 00100 imm12 nn tt STR Wt, [Xn|SP, #imm12 * 4] + 10 111 00101 imm12 nn tt LDR Wt, [Xn|SP, #imm12 * 4] + + 01 111 00100 imm12 nn tt STRH Wt, [Xn|SP, #imm12 * 2] + 01 111 00101 imm12 nn tt LDRH Wt, [Xn|SP, #imm12 * 2] + + 00 111 00100 imm12 nn tt STRB Wt, [Xn|SP, #imm12 * 1] + 00 111 00101 imm12 nn tt LDRB Wt, [Xn|SP, #imm12 * 1] + */ + if (INSN(29,23) == BITS7(1,1,1,0,0,1,0)) { + UInt szLg2 = INSN(31,30); + UInt szB = 1 << szLg2; + Bool isLD = INSN(22,22) == 1; + UInt offs = INSN(21,10) * szB; + UInt nn = INSN(9,5); + UInt tt = INSN(4,0); + IRTemp ta = newTemp(Ity_I64); + assign(ta, binop(Iop_Add64, getIReg64orSP(nn), mkU64(offs))); + if (nn == 31) { /* FIXME generate stack alignment check */ } + vassert(szLg2 < 4); + if (isLD) { + putIReg64orZR(tt, mkexpr(gen_zwidening_load(szB, ta))); + } else { + gen_narrowing_store(szB, ta, getIReg64orZR(tt)); + } + const HChar* ld_name[4] = { "ldrb", "ldrh", "ldr", "ldr" }; + const HChar* st_name[4] = { "strb", "strh", "str", "str" }; + DIP("%s %s, [%s, #%u]\n", + (isLD ? ld_name : st_name)[szLg2], nameIRegOrZR(szB == 8, tt), + nameIReg64orSP(nn), offs); + return True; + } + + /* ------------ LDUR,STUR (immediate, simm9) ----------- */ + /* + 31 29 26 20 11 9 4 + | | | | | | | + (at-Rn-then-Rn=EA) | | | + sz 111 00000 0 imm9 01 Rn Rt STR Rt, [Xn|SP], #simm9 + sz 111 00001 0 imm9 01 Rn Rt LDR Rt, [Xn|SP], #simm9 + + (at-EA-then-Rn=EA) + sz 111 00000 0 imm9 11 Rn Rt STR Rt, [Xn|SP, #simm9]! + sz 111 00001 0 imm9 11 Rn Rt LDR Rt, [Xn|SP, #simm9]! + + (at-EA) + sz 111 00000 0 imm9 00 Rn Rt STR Rt, [Xn|SP, #simm9] + sz 111 00001 0 imm9 00 Rn Rt LDR Rt, [Xn|SP, #simm9] + + simm9 is unscaled. + + The case 'wback && Rn == Rt && Rt != 31' is disallowed. In the + load case this is because would create two competing values for + Rt. In the store case the reason is unclear, but the spec + disallows it anyway. + + Stores are narrowing, loads are unsigned widening. sz encodes + the transfer size in the normal way: 00=1, 01=2, 10=4, 11=8. + */ + if ((INSN(29,21) & BITS9(1,1,1, 1,1,1,1,0, 1)) + == BITS9(1,1,1, 0,0,0,0,0, 0)) { + UInt szLg2 = INSN(31,30); + UInt szB = 1 << szLg2; + Bool isLoad = INSN(22,22) == 1; + UInt imm9 = INSN(20,12); + UInt nn = INSN(9,5); + UInt tt = INSN(4,0); + Bool wBack = INSN(10,10) == 1; + UInt how = INSN(11,10); + if (how == BITS2(1,0) || (wBack && nn == tt && tt != 31)) { + /* undecodable; fall through */ + } else { + if (nn == 31) { /* FIXME generate stack alignment check */ } + + // Compute the transfer address TA and the writeback address WA. 
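+         // For example (illustrative), with X1 = 0x1000:
+         //    LDR  X0, [X1], #8    (how=01): TA = 0x1000, WA = 0x1008
+         //    LDR  X0, [X1, #8]!   (how=11): TA = WA = 0x1008
+         //    LDUR X0, [X1, #8]    (how=00): TA = 0x1008, no writeback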
+ IRTemp tRN = newTemp(Ity_I64); + assign(tRN, getIReg64orSP(nn)); + IRTemp tEA = newTemp(Ity_I64); + Long simm9 = (Long)sx_to_64(imm9, 9); + assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9))); + + IRTemp tTA = newTemp(Ity_I64); + IRTemp tWA = newTemp(Ity_I64); + switch (how) { + case BITS2(0,1): + assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break; + case BITS2(1,1): + assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break; + case BITS2(0,0): + assign(tTA, mkexpr(tEA)); /* tWA is unused */ break; + default: + vassert(0); /* NOTREACHED */ + } + + /* Normally rN would be updated after the transfer. However, in + the special case typifed by + str x30, [sp,#-16]! + it is necessary to update SP before the transfer, (1) + because Memcheck will otherwise complain about a write + below the stack pointer, and (2) because the segfault + stack extension mechanism will otherwise extend the stack + only down to SP before the instruction, which might not be + far enough, if the -16 bit takes the actual access + address to the next page. + */ + Bool earlyWBack + = wBack && simm9 < 0 && szB == 8 + && how == BITS2(1,1) && nn == 31 && !isLoad && tt != nn; + + if (wBack && earlyWBack) + putIReg64orSP(nn, mkexpr(tEA)); + + if (isLoad) { + putIReg64orZR(tt, mkexpr(gen_zwidening_load(szB, tTA))); + } else { + gen_narrowing_store(szB, tTA, getIReg64orZR(tt)); + } + + if (wBack && !earlyWBack) + putIReg64orSP(nn, mkexpr(tEA)); + + const HChar* ld_name[4] = { "ldurb", "ldurh", "ldur", "ldur" }; + const HChar* st_name[4] = { "sturb", "sturh", "stur", "stur" }; + const HChar* fmt_str = NULL; + switch (how) { + case BITS2(0,1): + fmt_str = "%s %s, [%s], #%lld (at-Rn-then-Rn=EA)\n"; + break; + case BITS2(1,1): + fmt_str = "%s %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n"; + break; + case BITS2(0,0): + fmt_str = "%s %s, [%s, #%lld] (at-Rn)\n"; + break; + default: + vassert(0); + } + DIP(fmt_str, (isLoad ? ld_name : st_name)[szLg2], + nameIRegOrZR(szB == 8, tt), + nameIReg64orSP(nn), simm9); + return True; + } + } + + /* -------- LDP,STP (immediate, simm7) (INT REGS) -------- */ + /* L==1 => mm==LD + L==0 => mm==ST + x==0 => 32 bit transfers, and zero extended loads + x==1 => 64 bit transfers + simm7 is scaled by the (single-register) transfer size + + (at-Rn-then-Rn=EA) + x0 101 0001 L imm7 Rt2 Rn Rt1 mmP Rt1,Rt2, [Xn|SP], #imm + + (at-EA-then-Rn=EA) + x0 101 0011 L imm7 Rt2 Rn Rt1 mmP Rt1,Rt2, [Xn|SP, #imm]! + + (at-EA) + x0 101 0010 L imm7 Rt2 Rn Rt1 mmP Rt1,Rt2, [Xn|SP, #imm] + */ + + UInt insn_30_23 = INSN(30,23); + if (insn_30_23 == BITS8(0,1,0,1,0,0,0,1) + || insn_30_23 == BITS8(0,1,0,1,0,0,1,1) + || insn_30_23 == BITS8(0,1,0,1,0,0,1,0)) { + UInt bL = INSN(22,22); + UInt bX = INSN(31,31); + UInt bWBack = INSN(23,23); + UInt rT1 = INSN(4,0); + UInt rN = INSN(9,5); + UInt rT2 = INSN(14,10); + Long simm7 = (Long)sx_to_64(INSN(21,15), 7); + if ((bWBack && (rT1 == rN || rT2 == rN) && rN != 31) + || (bL && rT1 == rT2)) { + /* undecodable; fall through */ + } else { + if (rN == 31) { /* FIXME generate stack alignment check */ } + + // Compute the transfer address TA and the writeback address WA. + IRTemp tRN = newTemp(Ity_I64); + assign(tRN, getIReg64orSP(rN)); + IRTemp tEA = newTemp(Ity_I64); + simm7 = (bX ? 
8 : 4) * simm7; + assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7))); + + IRTemp tTA = newTemp(Ity_I64); + IRTemp tWA = newTemp(Ity_I64); + switch (INSN(24,23)) { + case BITS2(0,1): + assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break; + case BITS2(1,1): + assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break; + case BITS2(1,0): + assign(tTA, mkexpr(tEA)); /* tWA is unused */ break; + default: + vassert(0); /* NOTREACHED */ + } + + /* Normally rN would be updated after the transfer. However, in + the special case typifed by + stp x29, x30, [sp,#-112]! + it is necessary to update SP before the transfer, (1) + because Memcheck will otherwise complain about a write + below the stack pointer, and (2) because the segfault + stack extension mechanism will otherwise extend the stack + only down to SP before the instruction, which might not be + far enough, if the -112 bit takes the actual access + address to the next page. + */ + Bool earlyWBack + = bWBack && simm7 < 0 + && INSN(24,23) == BITS2(1,1) && rN == 31 && bL == 0; + + if (bWBack && earlyWBack) + putIReg64orSP(rN, mkexpr(tEA)); + + /**/ if (bL == 1 && bX == 1) { + // 64 bit load + putIReg64orZR(rT1, loadLE(Ity_I64, + binop(Iop_Add64,mkexpr(tTA),mkU64(0)))); + putIReg64orZR(rT2, loadLE(Ity_I64, + binop(Iop_Add64,mkexpr(tTA),mkU64(8)))); + } else if (bL == 1 && bX == 0) { + // 32 bit load + putIReg32orZR(rT1, loadLE(Ity_I32, + binop(Iop_Add64,mkexpr(tTA),mkU64(0)))); + putIReg32orZR(rT2, loadLE(Ity_I32, + binop(Iop_Add64,mkexpr(tTA),mkU64(4)))); + } else if (bL == 0 && bX == 1) { + // 64 bit store + storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(0)), + getIReg64orZR(rT1)); + storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(8)), + getIReg64orZR(rT2)); + } else { + vassert(bL == 0 && bX == 0); + // 32 bit store + storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(0)), + getIReg32orZR(rT1)); + storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(4)), + getIReg32orZR(rT2)); + } + + if (bWBack && !earlyWBack) + putIReg64orSP(rN, mkexpr(tEA)); + + const HChar* fmt_str = NULL; + switch (INSN(24,23)) { + case BITS2(0,1): + fmt_str = "%sp %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n"; + break; + case BITS2(1,1): + fmt_str = "%sp %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n"; + break; + case BITS2(1,0): + fmt_str = "%sp %s, %s, [%s, #%lld] (at-Rn)\n"; + break; + default: + vassert(0); + } + DIP(fmt_str, bL == 0 ? "st" : "ld", + nameIRegOrZR(bX == 1, rT1), + nameIRegOrZR(bX == 1, rT2), + nameIReg64orSP(rN), simm7); + return True; + } + } + + /* ---------------- LDR (literal, int reg) ---------------- */ + /* 31 29 23 4 + 00 011 000 imm19 Rt LDR Wt, [PC + sxTo64(imm19 << 2)] + 01 011 000 imm19 Rt LDR Xt, [PC + sxTo64(imm19 << 2)] + 10 011 000 imm19 Rt LDRSW Xt, [PC + sxTo64(imm19 << 2)] + 11 011 000 imm19 Rt prefetch [PC + sxTo64(imm19 << 2)] + Just handles the first two cases for now. 
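+
+      For example (illustrative): imm19 = 0x7FFFF sign-extends (after
+      the << 2) to -4, so the load is from PC - 4; imm19 = 1 loads
+      from PC + 4.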
+ */ + if (INSN(29,24) == BITS6(0,1,1,0,0,0) && INSN(31,31) == 0) { + UInt imm19 = INSN(23,5); + UInt rT = INSN(4,0); + UInt bX = INSN(30,30); + ULong ea = guest_PC_curr_instr + sx_to_64(imm19 << 2, 21); + if (bX) { + putIReg64orZR(rT, loadLE(Ity_I64, mkU64(ea))); + } else { + putIReg32orZR(rT, loadLE(Ity_I32, mkU64(ea))); + } + DIP("ldr %s, 0x%llx (literal)\n", nameIRegOrZR(bX == 1, rT), ea); + return True; + } + + /* -------------- {LD,ST}R (integer register) --------------- */ + /* 31 29 20 15 12 11 9 4 + | | | | | | | | + 11 111000011 Rm option S 10 Rn Rt LDR Xt, [Xn|SP, R{ext/sh}] + 10 111000011 Rm option S 10 Rn Rt LDR Wt, [Xn|SP, R{ext/sh}] + 01 111000011 Rm option S 10 Rn Rt LDRH Wt, [Xn|SP, R{ext/sh}] + 00 111000011 Rm option S 10 Rn Rt LDRB Wt, [Xn|SP, R{ext/sh}] + + 11 111000001 Rm option S 10 Rn Rt STR Xt, [Xn|SP, R{ext/sh}] + 10 111000001 Rm option S 10 Rn Rt STR Wt, [Xn|SP, R{ext/sh}] + 01 111000001 Rm option S 10 Rn Rt STRH Wt, [Xn|SP, R{ext/sh}] + 00 111000001 Rm option S 10 Rn Rt STRB Wt, [Xn|SP, R{ext/sh}] + */ + if (INSN(29,23) == BITS7(1,1,1,0,0,0,0) + && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) { + HChar dis_buf[64]; + UInt szLg2 = INSN(31,30); + Bool isLD = INSN(22,22) == 1; + UInt tt = INSN(4,0); + IRTemp ea = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/); + if (ea != IRTemp_INVALID) { + switch (szLg2) { + case 3: /* 64 bit */ + if (isLD) { + putIReg64orZR(tt, loadLE(Ity_I64, mkexpr(ea))); + DIP("ldr %s, %s\n", nameIReg64orZR(tt), dis_buf); + } else { + storeLE(mkexpr(ea), getIReg64orZR(tt)); + DIP("str %s, %s\n", nameIReg64orZR(tt), dis_buf); + } + break; + case 2: /* 32 bit */ + if (isLD) { + putIReg32orZR(tt, loadLE(Ity_I32, mkexpr(ea))); + DIP("ldr %s, %s\n", nameIReg32orZR(tt), dis_buf); + } else { + storeLE(mkexpr(ea), getIReg32orZR(tt)); + DIP("str %s, %s\n", nameIReg32orZR(tt), dis_buf); + } + break; + case 1: /* 16 bit */ + if (isLD) { + putIReg64orZR(tt, unop(Iop_16Uto64, + loadLE(Ity_I16, mkexpr(ea)))); + DIP("ldruh %s, %s\n", nameIReg32orZR(tt), dis_buf); + } else { + storeLE(mkexpr(ea), unop(Iop_64to16, getIReg64orZR(tt))); + DIP("strh %s, %s\n", nameIReg32orZR(tt), dis_buf); + } + break; + case 0: /* 8 bit */ + if (isLD) { + putIReg64orZR(tt, unop(Iop_8Uto64, + loadLE(Ity_I8, mkexpr(ea)))); + DIP("ldrub %s, %s\n", nameIReg32orZR(tt), dis_buf); + } else { + storeLE(mkexpr(ea), unop(Iop_64to8, getIReg64orZR(tt))); + DIP("strb %s, %s\n", nameIReg32orZR(tt), dis_buf); + } + break; + default: + vassert(0); + } + return True; + } + } + + /* -------------- LDRS{B,H,W} (uimm12) -------------- */ + /* 31 29 26 23 21 9 4 + 10 111 001 10 imm12 n t LDRSW Xt, [Xn|SP, #pimm12 * 4] + 01 111 001 1x imm12 n t LDRSH Rt, [Xn|SP, #pimm12 * 2] + 00 111 001 1x imm12 n t LDRSB Rt, [Xn|SP, #pimm12 * 1] + where + Rt is Wt when x==1, Xt when x==0 + */ + if (INSN(29,23) == BITS7(1,1,1,0,0,1,1)) { + /* Further checks on bits 31:30 and 22 */ + Bool valid = False; + switch ((INSN(31,30) << 1) | INSN(22,22)) { + case BITS3(1,0,0): + case BITS3(0,1,0): case BITS3(0,1,1): + case BITS3(0,0,0): case BITS3(0,0,1): + valid = True; + break; + } + if (valid) { + UInt szLg2 = INSN(31,30); + UInt bitX = INSN(22,22); + UInt imm12 = INSN(21,10); + UInt nn = INSN(9,5); + UInt tt = INSN(4,0); + UInt szB = 1 << szLg2; + IRExpr* ea = binop(Iop_Add64, + getIReg64orSP(nn), mkU64(imm12 * szB)); + switch (szB) { + case 4: + vassert(bitX == 0); + putIReg64orZR(tt, unop(Iop_32Sto64, loadLE(Ity_I32, ea))); + DIP("ldrsw %s, [%s, #%u]\n", nameIReg64orZR(tt), + nameIReg64orSP(nn), imm12 * 
szB); + break; + case 2: + if (bitX == 1) { + putIReg32orZR(tt, unop(Iop_16Sto32, loadLE(Ity_I16, ea))); + } else { + putIReg64orZR(tt, unop(Iop_16Sto64, loadLE(Ity_I16, ea))); + } + DIP("ldrsh %s, [%s, #%u]\n", + nameIRegOrZR(bitX == 0, tt), + nameIReg64orSP(nn), imm12 * szB); + break; + case 1: + if (bitX == 1) { + putIReg32orZR(tt, unop(Iop_8Sto32, loadLE(Ity_I8, ea))); + } else { + putIReg64orZR(tt, unop(Iop_8Sto64, loadLE(Ity_I8, ea))); + } + DIP("ldrsb %s, [%s, #%u]\n", + nameIRegOrZR(bitX == 0, tt), + nameIReg64orSP(nn), imm12 * szB); + break; + default: + vassert(0); + } + return True; + } + /* else fall through */ + } + + /* -------------- LDRS{B,H,W} (simm9, upd) -------------- */ + /* (at-Rn-then-Rn=EA) + 31 29 23 21 20 11 9 4 + 00 111 000 1x 0 imm9 01 n t LDRSB Rt, [Xn|SP], #simm9 + 01 111 000 1x 0 imm9 01 n t LDRSH Rt, [Xn|SP], #simm9 + 10 111 000 10 0 imm9 01 n t LDRSW Xt, [Xn|SP], #simm9 + + (at-EA-then-Rn=EA) + 00 111 000 1x 0 imm9 11 n t LDRSB Rt, [Xn|SP, #simm9]! + 01 111 000 1x 0 imm9 11 n t LDRSH Rt, [Xn|SP, #simm9]! + 10 111 000 10 0 imm9 11 n t LDRSW Xt, [Xn|SP, #simm9]! + where + Rt is Wt when x==1, Xt when x==0 + transfer-at-Rn when [11]==0, at EA when [11]==1 + */ + if (INSN(29,23) == BITS7(1,1,1,0,0,0,1) + && INSN(21,21) == 0 && INSN(10,10) == 1) { + /* Further checks on bits 31:30 and 22 */ + Bool valid = False; + switch ((INSN(31,30) << 1) | INSN(22,22)) { + case BITS3(1,0,0): // LDRSW Xt + case BITS3(0,1,0): case BITS3(0,1,1): // LDRSH Xt, Wt + case BITS3(0,0,0): case BITS3(0,0,1): // LDRSB Xt, Wt + valid = True; + break; + } + if (valid) { + UInt szLg2 = INSN(31,30); + UInt imm9 = INSN(20,12); + Bool atRN = INSN(11,11) == 0; + UInt nn = INSN(9,5); + UInt tt = INSN(4,0); + IRTemp tRN = newTemp(Ity_I64); + IRTemp tEA = newTemp(Ity_I64); + IRTemp tTA = IRTemp_INVALID; + ULong simm9 = sx_to_64(imm9, 9); + Bool is64 = INSN(22,22) == 0; + assign(tRN, getIReg64orSP(nn)); + assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9))); + tTA = atRN ? tRN : tEA; + HChar ch = '?'; + /* There are 5 cases: + byte load, SX to 64 + byte load, SX to 32, ZX to 64 + halfword load, SX to 64 + halfword load, SX to 32, ZX to 64 + word load, SX to 64 + The ifs below handle them in the listed order. + */ + if (szLg2 == 0) { + ch = 'b'; + if (is64) { + putIReg64orZR(tt, unop(Iop_8Sto64, + loadLE(Ity_I8, mkexpr(tTA)))); + } else { + putIReg32orZR(tt, unop(Iop_8Sto32, + loadLE(Ity_I8, mkexpr(tTA)))); + } + } + else if (szLg2 == 1) { + ch = 'h'; + if (is64) { + putIReg64orZR(tt, unop(Iop_16Sto64, + loadLE(Ity_I16, mkexpr(tTA)))); + } else { + putIReg32orZR(tt, unop(Iop_16Sto32, + loadLE(Ity_I16, mkexpr(tTA)))); + } + } + else if (szLg2 == 2 && is64) { + ch = 'w'; + putIReg64orZR(tt, unop(Iop_32Sto64, + loadLE(Ity_I32, mkexpr(tTA)))); + } + else { + vassert(0); + } + putIReg64orSP(nn, mkexpr(tEA)); + DIP(atRN ? 
"ldrs%c %s, [%s], #%lld\n" : "ldrs%c %s, [%s, #%lld]!", + ch, nameIRegOrZR(is64, tt), nameIReg64orSP(nn), simm9); + return True; + } + /* else fall through */ + } + + /* -------------- LDRS{B,H,W} (simm9, noUpd) -------------- */ + /* 31 29 23 21 20 11 9 4 + 00 111 000 1x 0 imm9 00 n t LDURSB Rt, [Xn|SP, #simm9] + 01 111 000 1x 0 imm9 00 n t LDURSH Rt, [Xn|SP, #simm9] + 10 111 000 10 0 imm9 00 n t LDURSW Xt, [Xn|SP, #simm9] + where + Rt is Wt when x==1, Xt when x==0 + */ + if (INSN(29,23) == BITS7(1,1,1,0,0,0,1) + && INSN(21,21) == 0 && INSN(11,10) == BITS2(0,0)) { + /* Further checks on bits 31:30 and 22 */ + Bool valid = False; + switch ((INSN(31,30) << 1) | INSN(22,22)) { + case BITS3(1,0,0): // LDURSW Xt + case BITS3(0,1,0): case BITS3(0,1,1): // LDURSH Xt, Wt + case BITS3(0,0,0): case BITS3(0,0,1): // LDURSB Xt, Wt + valid = True; + break; + } + if (valid) { + UInt szLg2 = INSN(31,30); + UInt imm9 = INSN(20,12); + UInt nn = INSN(9,5); + UInt tt = INSN(4,0); + IRTemp tRN = newTemp(Ity_I64); + IRTemp tEA = newTemp(Ity_I64); + ULong simm9 = sx_to_64(imm9, 9); + Bool is64 = INSN(22,22) == 0; + assign(tRN, getIReg64orSP(nn)); + assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9))); + HChar ch = '?'; + /* There are 5 cases: + byte load, SX to 64 + byte load, SX to 32, ZX to 64 + halfword load, SX to 64 + halfword load, SX to 32, ZX to 64 + word load, SX to 64 + The ifs below handle them in the listed order. + */ + if (szLg2 == 0) { + ch = 'b'; + if (is64) { + putIReg64orZR(tt, unop(Iop_8Sto64, + loadLE(Ity_I8, mkexpr(tEA)))); + } else { + putIReg32orZR(tt, unop(Iop_8Sto32, + loadLE(Ity_I8, mkexpr(tEA)))); + } + } + else if (szLg2 == 1) { + ch = 'h'; + if (is64) { + putIReg64orZR(tt, unop(Iop_16Sto64, + loadLE(Ity_I16, mkexpr(tEA)))); + } else { + putIReg32orZR(tt, unop(Iop_16Sto32, + loadLE(Ity_I16, mkexpr(tEA)))); + } + } + else if (szLg2 == 2 && is64) { + ch = 'w'; + putIReg64orZR(tt, unop(Iop_32Sto64, + loadLE(Ity_I32, mkexpr(tEA)))); + } + else { + vassert(0); + } + DIP("ldurs%c %s, [%s, #%lld]", + ch, nameIRegOrZR(is64, tt), nameIReg64orSP(nn), simm9); + return True; + } + /* else fall through */ + } + + /* -------- LDP,STP (immediate, simm7) (FP&VEC) -------- */ + /* L==1 => mm==LD + L==0 => mm==ST + sz==00 => 32 bit (S) transfers + sz==01 => 64 bit (D) transfers + sz==10 => 128 bit (Q) transfers + sz==11 isn't allowed + simm7 is scaled by the (single-register) transfer size + + 31 29 22 21 14 9 4 + sz 101 1001 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP], #imm + (at-Rn-then-Rn=EA) + + sz 101 1011 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP, #imm]! + (at-EA-then-Rn=EA) + + sz 101 1010 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP, #imm] + (at-EA) + */ + + UInt insn_29_23 = INSN(29,23); + if (insn_29_23 == BITS7(1,0,1,1,0,0,1) + || insn_29_23 == BITS7(1,0,1,1,0,1,1) + || insn_29_23 == BITS7(1,0,1,1,0,1,0)) { + UInt szSlg2 = INSN(31,30); // log2 of the xfer size in 32-bit units + Bool isLD = INSN(22,22) == 1; + Bool wBack = INSN(23,23) == 1; + Long simm7 = (Long)sx_to_64(INSN(21,15), 7); + UInt tt2 = INSN(14,10); + UInt nn = INSN(9,5); + UInt tt1 = INSN(4,0); + if (szSlg2 == BITS2(1,1) || (isLD && tt1 == tt2)) { + /* undecodable; fall through */ + } else { + if (nn == 31) { /* FIXME generate stack alignment check */ } + + // Compute the transfer address TA and the writeback address WA. 
+ UInt szB = 4 << szSlg2; /* szB is the per-register size */ + IRTemp tRN = newTemp(Ity_I64); + assign(tRN, getIReg64orSP(nn)); + IRTemp tEA = newTemp(Ity_I64); + simm7 = szB * simm7; + assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7))); + + IRTemp tTA = newTemp(Ity_I64); + IRTemp tWA = newTemp(Ity_I64); + switch (INSN(24,23)) { + case BITS2(0,1): + assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break; + case BITS2(1,1): + assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break; + case BITS2(1,0): + assign(tTA, mkexpr(tEA)); /* tWA is unused */ break; + default: + vassert(0); /* NOTREACHED */ + } + + IRType ty = Ity_INVALID; + switch (szB) { + case 4: ty = Ity_F32; break; + case 8: ty = Ity_F64; break; + case 16: ty = Ity_V128; break; + default: vassert(0); + } + + /* Normally rN would be updated after the transfer. However, in + the special cases typifed by + stp q0, q1, [sp,#-512]! + stp d0, d1, [sp,#-512]! + stp s0, s1, [sp,#-512]! + it is necessary to update SP before the transfer, (1) + because Memcheck will otherwise complain about a write + below the stack pointer, and (2) because the segfault + stack extension mechanism will otherwise extend the stack + only down to SP before the instruction, which might not be + far enough, if the -512 bit takes the actual access + address to the next page. + */ + Bool earlyWBack + = wBack && simm7 < 0 + && INSN(24,23) == BITS2(1,1) && nn == 31 && !isLD; + + if (wBack && earlyWBack) + putIReg64orSP(nn, mkexpr(tEA)); + + if (isLD) { + if (szB < 16) { + putQReg128(tt1, mkV128(0x0000)); + } + putQRegLO(tt1, + loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(0)))); + if (szB < 16) { + putQReg128(tt2, mkV128(0x0000)); + } + putQRegLO(tt2, + loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(szB)))); + } else { + storeLE(binop(Iop_Add64, mkexpr(tTA), mkU64(0)), + getQRegLO(tt1, ty)); + storeLE(binop(Iop_Add64, mkexpr(tTA), mkU64(szB)), + getQRegLO(tt2, ty)); + } + + if (wBack && !earlyWBack) + putIReg64orSP(nn, mkexpr(tEA)); + + const HChar* fmt_str = NULL; + switch (INSN(24,23)) { + case BITS2(0,1): + fmt_str = "%sp %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n"; + break; + case BITS2(1,1): + fmt_str = "%sp %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n"; + break; + case BITS2(1,0): + fmt_str = "%sp %s, %s, [%s, #%lld] (at-Rn)\n"; + break; + default: + vassert(0); + } + DIP(fmt_str, isLD ? 
"ld" : "st", + nameQRegLO(tt1, ty), nameQRegLO(tt2, ty), + nameIReg64orSP(nn), simm7); + return True; + } + } + + /* -------------- {LD,ST}R (vector register) --------------- */ + /* 31 29 23 20 15 12 11 9 4 + | | | | | | | | | + 00 111100 011 Rm option S 10 Rn Rt LDR Bt, [Xn|SP, R{ext/sh}] + 01 111100 011 Rm option S 10 Rn Rt LDR Ht, [Xn|SP, R{ext/sh}] + 10 111100 011 Rm option S 10 Rn Rt LDR St, [Xn|SP, R{ext/sh}] + 11 111100 011 Rm option S 10 Rn Rt LDR Dt, [Xn|SP, R{ext/sh}] + 00 111100 111 Rm option S 10 Rn Rt LDR Qt, [Xn|SP, R{ext/sh}] + + 00 111100 001 Rm option S 10 Rn Rt STR Bt, [Xn|SP, R{ext/sh}] + 01 111100 001 Rm option S 10 Rn Rt STR Ht, [Xn|SP, R{ext/sh}] + 10 111100 001 Rm option S 10 Rn Rt STR St, [Xn|SP, R{ext/sh}] + 11 111100 001 Rm option S 10 Rn Rt STR Dt, [Xn|SP, R{ext/sh}] + 00 111100 101 Rm option S 10 Rn Rt STR Qt, [Xn|SP, R{ext/sh}] + */ + if (INSN(29,24) == BITS6(1,1,1,1,0,0) + && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) { + HChar dis_buf[64]; + UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30); + Bool isLD = INSN(22,22) == 1; + UInt tt = INSN(4,0); + if (szLg2 >= 4) goto after_LDR_STR_vector_register; + IRTemp ea = gen_indexed_EA(dis_buf, insn, False/*to/from vec regs*/); + if (ea == IRTemp_INVALID) goto after_LDR_STR_vector_register; + switch (szLg2) { + case 0: /* 8 bit */ + if (isLD) { + putQReg128(tt, mkV128(0x0000)); + putQRegLO(tt, loadLE(Ity_I8, mkexpr(ea))); + DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I8), dis_buf); + } else { + vassert(0); //ATC + storeLE(mkexpr(ea), getQRegLO(tt, Ity_I8)); + DIP("str %s, %s\n", nameQRegLO(tt, Ity_I8), dis_buf); + } + break; + case 1: + if (isLD) { + putQReg128(tt, mkV128(0x0000)); + putQRegLO(tt, loadLE(Ity_I16, mkexpr(ea))); + DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I16), dis_buf); + } else { + vassert(0); //ATC + storeLE(mkexpr(ea), getQRegLO(tt, Ity_I16)); + DIP("str %s, %s\n", nameQRegLO(tt, Ity_I16), dis_buf); + } + break; + case 2: /* 32 bit */ + if (isLD) { + putQReg128(tt, mkV128(0x0000)); + putQRegLO(tt, loadLE(Ity_I32, mkexpr(ea))); + DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I32), dis_buf); + } else { + storeLE(mkexpr(ea), getQRegLO(tt, Ity_I32)); + DIP("str %s, %s\n", nameQRegLO(tt, Ity_I32), dis_buf); + } + break; + case 3: /* 64 bit */ + if (isLD) { + putQReg128(tt, mkV128(0x0000)); + putQRegLO(tt, loadLE(Ity_I64, mkexpr(ea))); + DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf); + } else { + storeLE(mkexpr(ea), getQRegLO(tt, Ity_I64)); + DIP("str %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf); + } + break; + case 4: return False; //ATC + default: vassert(0); + } + return True; + } + after_LDR_STR_vector_register: + + /* ---------- LDRS{B,H,W} (integer register, SX) ---------- */ + /* 31 29 22 20 15 12 11 9 4 + | | | | | | | | | + 10 1110001 01 Rm opt S 10 Rn Rt LDRSW Xt, [Xn|SP, R{ext/sh}] + + 01 1110001 01 Rm opt S 10 Rn Rt LDRSH Xt, [Xn|SP, R{ext/sh}] + 01 1110001 11 Rm opt S 10 Rn Rt LDRSH Wt, [Xn|SP, R{ext/sh}] + + 00 1110001 01 Rm opt S 10 Rn Rt LDRSB Xt, [Xn|SP, R{ext/sh}] + 00 1110001 11 Rm opt S 10 Rn Rt LDRSB Wt, [Xn|SP, R{ext/sh}] + */ + if (INSN(29,23) == BITS7(1,1,1,0,0,0,1) + && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) { + HChar dis_buf[64]; + UInt szLg2 = INSN(31,30); + Bool sxTo64 = INSN(22,22) == 0; // else sx to 32 and zx to 64 + UInt tt = INSN(4,0); + if (szLg2 == 3) goto after_LDRS_integer_register; + IRTemp ea = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/); + if (ea == IRTemp_INVALID) goto after_LDRS_integer_register; + /* Enumerate the 5 variants explicitly. 
*/ + if (szLg2 == 2/*32 bit*/ && sxTo64) { + putIReg64orZR(tt, unop(Iop_32Sto64, loadLE(Ity_I32, mkexpr(ea)))); + DIP("ldrsw %s, %s\n", nameIReg64orZR(tt), dis_buf); + return True; + } + else + if (szLg2 == 1/*16 bit*/) { + if (sxTo64) { + putIReg64orZR(tt, unop(Iop_16Sto64, loadLE(Ity_I16, mkexpr(ea)))); + DIP("ldrsh %s, %s\n", nameIReg64orZR(tt), dis_buf); + } else { + putIReg32orZR(tt, unop(Iop_16Sto32, loadLE(Ity_I16, mkexpr(ea)))); + DIP("ldrsh %s, %s\n", nameIReg32orZR(tt), dis_buf); + } + return True; + } + else + if (szLg2 == 0/*8 bit*/) { + if (sxTo64) { + putIReg64orZR(tt, unop(Iop_8Sto64, loadLE(Ity_I8, mkexpr(ea)))); + DIP("ldrsb %s, %s\n", nameIReg64orZR(tt), dis_buf); + } else { + putIReg32orZR(tt, unop(Iop_8Sto32, loadLE(Ity_I8, mkexpr(ea)))); + DIP("ldrsb %s, %s\n", nameIReg32orZR(tt), dis_buf); + } + return True; + } + /* else it's an invalid combination */ + } + after_LDRS_integer_register: + + /* -------- LDR/STR (immediate, SIMD&FP, unsigned offset) -------- */ + /* This is the Unsigned offset variant only. The Post-Index and + Pre-Index variants are below. + + 31 29 23 21 9 4 + 00 111 101 01 imm12 n t LDR Bt, [Xn|SP + imm12 * 1] + 01 111 101 01 imm12 n t LDR Ht, [Xn|SP + imm12 * 2] + 10 111 101 01 imm12 n t LDR St, [Xn|SP + imm12 * 4] + 11 111 101 01 imm12 n t LDR Dt, [Xn|SP + imm12 * 8] + 00 111 101 11 imm12 n t LDR Qt, [Xn|SP + imm12 * 16] + + 00 111 101 00 imm12 n t STR Bt, [Xn|SP + imm12 * 1] + 01 111 101 00 imm12 n t STR Ht, [Xn|SP + imm12 * 2] + 10 111 101 00 imm12 n t STR St, [Xn|SP + imm12 * 4] + 11 111 101 00 imm12 n t STR Dt, [Xn|SP + imm12 * 8] + 00 111 101 10 imm12 n t STR Qt, [Xn|SP + imm12 * 16] + */ + if (INSN(29,24) == BITS6(1,1,1,1,0,1) + && ((INSN(23,23) << 2) | INSN(31,30)) <= 4) { + UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30); + Bool isLD = INSN(22,22) == 1; + UInt pimm12 = INSN(21,10) << szLg2; + UInt nn = INSN(9,5); + UInt tt = INSN(4,0); + IRTemp tEA = newTemp(Ity_I64); + IRType ty = preferredVectorSubTypeFromSize(1 << szLg2); + assign(tEA, binop(Iop_Add64, getIReg64orSP(nn), mkU64(pimm12))); + if (isLD) { + if (szLg2 < 4) { + putQReg128(tt, mkV128(0x0000)); + } + putQRegLO(tt, loadLE(ty, mkexpr(tEA))); + } else { + storeLE(mkexpr(tEA), getQRegLO(tt, ty)); + } + DIP("%s %s, [%s, #%u]\n", + isLD ? "ldr" : "str", + nameQRegLO(tt, ty), nameIReg64orSP(nn), pimm12); + return True; + } + + /* -------- LDR/STR (immediate, SIMD&FP, pre/post index) -------- */ + /* These are the Post-Index and Pre-Index variants. + + 31 29 23 20 11 9 4 + (at-Rn-then-Rn=EA) + 00 111 100 01 0 imm9 01 n t LDR Bt, [Xn|SP], #simm + 01 111 100 01 0 imm9 01 n t LDR Ht, [Xn|SP], #simm + 10 111 100 01 0 imm9 01 n t LDR St, [Xn|SP], #simm + 11 111 100 01 0 imm9 01 n t LDR Dt, [Xn|SP], #simm + 00 111 100 11 0 imm9 01 n t LDR Qt, [Xn|SP], #simm + + (at-EA-then-Rn=EA) + 00 111 100 01 0 imm9 11 n t LDR Bt, [Xn|SP, #simm]! + 01 111 100 01 0 imm9 11 n t LDR Ht, [Xn|SP, #simm]! + 10 111 100 01 0 imm9 11 n t LDR St, [Xn|SP, #simm]! + 11 111 100 01 0 imm9 11 n t LDR Dt, [Xn|SP, #simm]! + 00 111 100 11 0 imm9 11 n t LDR Qt, [Xn|SP, #simm]! + + Stores are the same except with bit 22 set to 0. 
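+      As with the integer versions above, the transfer is done at Rn for
+      the post-index form and at EA for the pre-index form, and in both
+      cases Rn is set to EA afterwards.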
+ */ + if (INSN(29,24) == BITS6(1,1,1,1,0,0) + && ((INSN(23,23) << 2) | INSN(31,30)) <= 4 + && INSN(21,21) == 0 && INSN(10,10) == 1) { + UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30); + Bool isLD = INSN(22,22) == 1; + UInt imm9 = INSN(20,12); + Bool atRN = INSN(11,11) == 0; + UInt nn = INSN(9,5); + UInt tt = INSN(4,0); + IRTemp tRN = newTemp(Ity_I64); + IRTemp tEA = newTemp(Ity_I64); + IRTemp tTA = IRTemp_INVALID; + IRType ty = preferredVectorSubTypeFromSize(1 << szLg2); + ULong simm9 = sx_to_64(imm9, 9); + assign(tRN, getIReg64orSP(nn)); + assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9))); + tTA = atRN ? tRN : tEA; + if (isLD) { + if (szLg2 < 4) { + putQReg128(tt, mkV128(0x0000)); + } + putQRegLO(tt, loadLE(ty, mkexpr(tTA))); + } else { + storeLE(mkexpr(tTA), getQRegLO(tt, ty)); + } + putIReg64orSP(nn, mkexpr(tEA)); + DIP(atRN ? "%s %s, [%s], #%lld\n" : "%s %s, [%s, #%lld]!\n", + isLD ? "ldr" : "str", + nameQRegLO(tt, ty), nameIReg64orSP(nn), simm9); + return True; + } + + /* -------- LDUR/STUR (unscaled offset, SIMD&FP) -------- */ + /* 31 29 23 20 11 9 4 + 00 111 100 01 0 imm9 00 n t LDR Bt, [Xn|SP, #simm] + 01 111 100 01 0 imm9 00 n t LDR Ht, [Xn|SP, #simm] + 10 111 100 01 0 imm9 00 n t LDR St, [Xn|SP, #simm] + 11 111 100 01 0 imm9 00 n t LDR Dt, [Xn|SP, #simm] + 00 111 100 11 0 imm9 00 n t LDR Qt, [Xn|SP, #simm] + + 00 111 100 00 0 imm9 00 n t STR Bt, [Xn|SP, #simm] + 01 111 100 00 0 imm9 00 n t STR Ht, [Xn|SP, #simm] + 10 111 100 00 0 imm9 00 n t STR St, [Xn|SP, #simm] + 11 111 100 00 0 imm9 00 n t STR Dt, [Xn|SP, #simm] + 00 111 100 10 0 imm9 00 n t STR Qt, [Xn|SP, #simm] + */ + if (INSN(29,24) == BITS6(1,1,1,1,0,0) + && ((INSN(23,23) << 2) | INSN(31,30)) <= 4 + && INSN(21,21) == 0 && INSN(11,10) == BITS2(0,0)) { + UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30); + Bool isLD = INSN(22,22) == 1; + UInt imm9 = INSN(20,12); + UInt nn = INSN(9,5); + UInt tt = INSN(4,0); + ULong simm9 = sx_to_64(imm9, 9); + IRTemp tEA = newTemp(Ity_I64); + IRType ty = preferredVectorSubTypeFromSize(1 << szLg2); + assign(tEA, binop(Iop_Add64, getIReg64orSP(nn), mkU64(simm9))); + if (isLD) { + if (szLg2 < 4) { + putQReg128(tt, mkV128(0x0000)); + } + putQRegLO(tt, loadLE(ty, mkexpr(tEA))); + } else { + storeLE(mkexpr(tEA), getQRegLO(tt, ty)); + } + DIP("%s %s, [%s, #%lld]\n", + isLD ? 
"ldur" : "stur", + nameQRegLO(tt, ty), nameIReg64orSP(nn), (Long)simm9); + return True; + } + + /* ---------------- LDR (literal, SIMD&FP) ---------------- */ + /* 31 29 23 4 + 00 011 100 imm19 t LDR St, [PC + sxTo64(imm19 << 2)] + 01 011 100 imm19 t LDR Dt, [PC + sxTo64(imm19 << 2)] + 10 011 100 imm19 t LDR Qt, [PC + sxTo64(imm19 << 2)] + */ + if (INSN(29,24) == BITS6(0,1,1,1,0,0) && INSN(31,30) < BITS2(1,1)) { + UInt szB = 4 << INSN(31,30); + UInt imm19 = INSN(23,5); + UInt tt = INSN(4,0); + ULong ea = guest_PC_curr_instr + sx_to_64(imm19 << 2, 21); + IRType ty = preferredVectorSubTypeFromSize(szB); + putQReg128(tt, mkV128(0x0000)); + putQRegLO(tt, loadLE(ty, mkU64(ea))); + DIP("ldr %s, 0x%llx (literal)\n", nameQRegLO(tt, ty), ea); + return True; + } + + /* ---------- LD1/ST1 (single structure, no offset) ---------- */ + /* 31 23 + 0100 1100 0100 0000 0111 11 N T LD1 {vT.2d}, [Xn|SP] + 0100 1100 0000 0000 0111 11 N T ST1 {vT.2d}, [Xn|SP] + 0100 1100 0100 0000 0111 10 N T LD1 {vT.4s}, [Xn|SP] + 0100 1100 0000 0000 0111 10 N T ST1 {vT.4s}, [Xn|SP] + 0100 1100 0100 0000 0111 01 N T LD1 {vT.8h}, [Xn|SP] + 0100 1100 0000 0000 0111 01 N T ST1 {vT.8h}, [Xn|SP] + 0100 1100 0100 0000 0111 00 N T LD1 {vT.16b}, [Xn|SP] + 0100 1100 0000 0000 0111 00 N T ST1 {vT.16b}, [Xn|SP] + FIXME does this assume that the host is little endian? + */ + if ( (insn & 0xFFFFF000) == 0x4C407000 // LD1 cases + || (insn & 0xFFFFF000) == 0x4C007000 // ST1 cases + ) { + Bool isLD = INSN(22,22) == 1; + UInt rN = INSN(9,5); + UInt vT = INSN(4,0); + IRTemp tEA = newTemp(Ity_I64); + const HChar* names[4] = { "2d", "4s", "8h", "16b" }; + const HChar* name = names[INSN(11,10)]; + assign(tEA, getIReg64orSP(rN)); + if (rN == 31) { /* FIXME generate stack alignment check */ } + if (isLD) { + putQReg128(vT, loadLE(Ity_V128, mkexpr(tEA))); + } else { + storeLE(mkexpr(tEA), getQReg128(vT)); + } + DIP("%s {v%u.%s}, [%s]\n", isLD ? "ld1" : "st1", + vT, name, nameIReg64orSP(rN)); + return True; + } + + /* 31 23 + 0000 1100 0100 0000 0111 11 N T LD1 {vT.1d}, [Xn|SP] + 0000 1100 0000 0000 0111 11 N T ST1 {vT.1d}, [Xn|SP] + 0000 1100 0100 0000 0111 10 N T LD1 {vT.2s}, [Xn|SP] + 0000 1100 0000 0000 0111 10 N T ST1 {vT.2s}, [Xn|SP] + 0000 1100 0100 0000 0111 01 N T LD1 {vT.4h}, [Xn|SP] + 0000 1100 0000 0000 0111 01 N T ST1 {vT.4h}, [Xn|SP] + 0000 1100 0100 0000 0111 00 N T LD1 {vT.8b}, [Xn|SP] + 0000 1100 0000 0000 0111 00 N T ST1 {vT.8b}, [Xn|SP] + FIXME does this assume that the host is little endian? + */ + if ( (insn & 0xFFFFF000) == 0x0C407000 // LD1 cases + || (insn & 0xFFFFF000) == 0x0C007000 // ST1 cases + ) { + Bool isLD = INSN(22,22) == 1; + UInt rN = INSN(9,5); + UInt vT = INSN(4,0); + IRTemp tEA = newTemp(Ity_I64); + const HChar* names[4] = { "1d", "2s", "4h", "8b" }; + const HChar* name = names[INSN(11,10)]; + assign(tEA, getIReg64orSP(rN)); + if (rN == 31) { /* FIXME generate stack alignment check */ } + if (isLD) { + putQRegLane(vT, 0, loadLE(Ity_I64, mkexpr(tEA))); + putQRegLane(vT, 1, mkU64(0)); + } else { + storeLE(mkexpr(tEA), getQRegLane(vT, 0, Ity_I64)); + } + DIP("%s {v%u.%s}, [%s]\n", isLD ? 
"ld1" : "st1", + vT, name, nameIReg64orSP(rN)); + return True; + } + + /* ---------- LD1/ST1 (single structure, post index) ---------- */ + /* 31 23 + 0100 1100 1001 1111 0111 11 N T ST1 {vT.2d}, [xN|SP], #16 + 0100 1100 1101 1111 0111 11 N T LD1 {vT.2d}, [xN|SP], #16 + 0100 1100 1001 1111 0111 10 N T ST1 {vT.4s}, [xN|SP], #16 + 0100 1100 1101 1111 0111 10 N T LD1 {vT.4s}, [xN|SP], #16 + 0100 1100 1001 1111 0111 01 N T ST1 {vT.8h}, [xN|SP], #16 + 0100 1100 1101 1111 0111 01 N T LD1 {vT.8h}, [xN|SP], #16 + 0100 1100 1001 1111 0111 00 N T ST1 {vT.16b}, [xN|SP], #16 + 0100 1100 1101 1111 0111 00 N T LD1 {vT.16b}, [xN|SP], #16 + Note that #16 is implied and cannot be any other value. + FIXME does this assume that the host is little endian? + */ + if ( (insn & 0xFFFFF000) == 0x4CDF7000 // LD1 cases + || (insn & 0xFFFFF000) == 0x4C9F7000 // ST1 cases + ) { + Bool isLD = INSN(22,22) == 1; + UInt rN = INSN(9,5); + UInt vT = INSN(4,0); + IRTemp tEA = newTemp(Ity_I64); + const HChar* names[4] = { "2d", "4s", "8h", "16b" }; + const HChar* name = names[INSN(11,10)]; + assign(tEA, getIReg64orSP(rN)); + if (rN == 31) { /* FIXME generate stack alignment check */ } + if (isLD) { + putQReg128(vT, loadLE(Ity_V128, mkexpr(tEA))); + } else { + storeLE(mkexpr(tEA), getQReg128(vT)); + } + putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(16))); + DIP("%s {v%u.%s}, [%s], #16\n", isLD ? "ld1" : "st1", + vT, name, nameIReg64orSP(rN)); + return True; + } + + /* 31 23 + 0000 1100 1001 1111 0111 11 N T ST1 {vT.1d}, [xN|SP], #8 + 0000 1100 1101 1111 0111 11 N T LD1 {vT.1d}, [xN|SP], #8 + 0000 1100 1001 1111 0111 10 N T ST1 {vT.2s}, [xN|SP], #8 + 0000 1100 1101 1111 0111 10 N T LD1 {vT.2s}, [xN|SP], #8 + 0000 1100 1001 1111 0111 01 N T ST1 {vT.4h}, [xN|SP], #8 + 0000 1100 1101 1111 0111 01 N T LD1 {vT.4h}, [xN|SP], #8 + 0000 1100 1001 1111 0111 00 N T ST1 {vT.8b}, [xN|SP], #8 + 0000 1100 1101 1111 0111 00 N T LD1 {vT.8b}, [xN|SP], #8 + Note that #8 is implied and cannot be any other value. + FIXME does this assume that the host is little endian? + */ + if ( (insn & 0xFFFFF000) == 0x0CDF7000 // LD1 cases + || (insn & 0xFFFFF000) == 0x0C9F7000 // ST1 cases + ) { + Bool isLD = INSN(22,22) == 1; + UInt rN = INSN(9,5); + UInt vT = INSN(4,0); + IRTemp tEA = newTemp(Ity_I64); + const HChar* names[4] = { "1d", "2s", "4h", "8b" }; + const HChar* name = names[INSN(11,10)]; + assign(tEA, getIReg64orSP(rN)); + if (rN == 31) { /* FIXME generate stack alignment check */ } + if (isLD) { + putQRegLane(vT, 0, loadLE(Ity_I64, mkexpr(tEA))); + putQRegLane(vT, 1, mkU64(0)); + } else { + storeLE(mkexpr(tEA), getQRegLane(vT, 0, Ity_I64)); + } + putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(8))); + DIP("%s {v%u.%s}, [%s], #8\n", isLD ? "ld1" : "st1", + vT, name, nameIReg64orSP(rN)); + return True; + } + + /* ---------- LD2/ST2 (multiple structures, post index) ---------- */ + /* Only a very few cases. 
*/ + /* 31 23 11 9 4 + 0100 1100 1101 1111 1000 11 n t LD2 {Vt.2d, V(t+1)%32.2d}, [Xn|SP], #32 + 0100 1100 1001 1111 1000 11 n t ST2 {Vt.2d, V(t+1)%32.2d}, [Xn|SP], #32 + 0100 1100 1101 1111 1000 10 n t LD2 {Vt.4s, V(t+1)%32.4s}, [Xn|SP], #32 + 0100 1100 1001 1111 1000 10 n t ST2 {Vt.4s, V(t+1)%32.4s}, [Xn|SP], #32 + */ + if ( (insn & 0xFFFFFC00) == 0x4CDF8C00 // LD2 .2d + || (insn & 0xFFFFFC00) == 0x4C9F8C00 // ST2 .2d + || (insn & 0xFFFFFC00) == 0x4CDF8800 // LD2 .4s + || (insn & 0xFFFFFC00) == 0x4C9F8800 // ST2 .4s + ) { + Bool isLD = INSN(22,22) == 1; + UInt rN = INSN(9,5); + UInt vT = INSN(4,0); + IRTemp tEA = newTemp(Ity_I64); + UInt sz = INSN(11,10); + const HChar* name = "??"; + assign(tEA, getIReg64orSP(rN)); + if (rN == 31) { /* FIXME generate stack alignment check */ } + IRExpr* tEA_0 = binop(Iop_Add64, mkexpr(tEA), mkU64(0)); + IRExpr* tEA_8 = binop(Iop_Add64, mkexpr(tEA), mkU64(8)); + IRExpr* tEA_16 = binop(Iop_Add64, mkexpr(tEA), mkU64(16)); + IRExpr* tEA_24 = binop(Iop_Add64, mkexpr(tEA), mkU64(24)); + if (sz == BITS2(1,1)) { + name = "2d"; + if (isLD) { + putQRegLane((vT+0) % 32, 0, loadLE(Ity_I64, tEA_0)); + putQRegLane((vT+0) % 32, 1, loadLE(Ity_I64, tEA_16)); + putQRegLane((vT+1) % 32, 0, loadLE(Ity_I64, tEA_8)); + putQRegLane((vT+1) % 32, 1, loadLE(Ity_I64, tEA_24)); + } else { + storeLE(tEA_0, getQRegLane((vT+0) % 32, 0, Ity_I64)); + storeLE(tEA_16, getQRegLane((vT+0) % 32, 1, Ity_I64)); + storeLE(tEA_8, getQRegLane((vT+1) % 32, 0, Ity_I64)); + storeLE(tEA_24, getQRegLane((vT+1) % 32, 1, Ity_I64)); + } + } + else if (sz == BITS2(1,0)) { + /* Uh, this is ugly. TODO: better. */ + name = "4s"; + IRExpr* tEA_4 = binop(Iop_Add64, mkexpr(tEA), mkU64(4)); + IRExpr* tEA_12 = binop(Iop_Add64, mkexpr(tEA), mkU64(12)); + IRExpr* tEA_20 = binop(Iop_Add64, mkexpr(tEA), mkU64(20)); + IRExpr* tEA_28 = binop(Iop_Add64, mkexpr(tEA), mkU64(28)); + if (isLD) { + putQRegLane((vT+0) % 32, 0, loadLE(Ity_I32, tEA_0)); + putQRegLane((vT+0) % 32, 1, loadLE(Ity_I32, tEA_8)); + putQRegLane((vT+0) % 32, 2, loadLE(Ity_I32, tEA_16)); + putQRegLane((vT+0) % 32, 3, loadLE(Ity_I32, tEA_24)); + putQRegLane((vT+1) % 32, 0, loadLE(Ity_I32, tEA_4)); + putQRegLane((vT+1) % 32, 1, loadLE(Ity_I32, tEA_12)); + putQRegLane((vT+1) % 32, 2, loadLE(Ity_I32, tEA_20)); + putQRegLane((vT+1) % 32, 3, loadLE(Ity_I32, tEA_28)); + } else { + storeLE(tEA_0, getQRegLane((vT+0) % 32, 0, Ity_I32)); + storeLE(tEA_8, getQRegLane((vT+0) % 32, 1, Ity_I32)); + storeLE(tEA_16, getQRegLane((vT+0) % 32, 2, Ity_I32)); + storeLE(tEA_24, getQRegLane((vT+0) % 32, 3, Ity_I32)); + storeLE(tEA_4, getQRegLane((vT+1) % 32, 0, Ity_I32)); + storeLE(tEA_12, getQRegLane((vT+1) % 32, 1, Ity_I32)); + storeLE(tEA_20, getQRegLane((vT+1) % 32, 2, Ity_I32)); + storeLE(tEA_28, getQRegLane((vT+1) % 32, 3, Ity_I32)); + } + } + else { + vassert(0); // Can't happen. + } + putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(32))); + DIP("%s {v%u.%s, v%u.%s}, [%s], #32\n", isLD ? "ld2" : "st2", + (vT+0) % 32, name, (vT+1) % 32, name, nameIReg64orSP(rN)); + return True; + } + + /* ---------- LD1/ST1 (multiple structures, no offset) ---------- */ + /* Only a very few cases. 
*/ + /* 31 23 + 0100 1100 0100 0000 1010 00 n t LD1 {Vt.16b, V(t+1)%32.16b}, [Xn|SP] + 0100 1100 0000 0000 1010 00 n t ST1 {Vt.16b, V(t+1)%32.16b}, [Xn|SP] + */ + if ( (insn & 0xFFFFFC00) == 0x4C40A000 // LD1 + || (insn & 0xFFFFFC00) == 0x4C00A000 // ST1 + ) { + Bool isLD = INSN(22,22) == 1; + UInt rN = INSN(9,5); + UInt vT = INSN(4,0); + IRTemp tEA = newTemp(Ity_I64); + const HChar* name = "16b"; + assign(tEA, getIReg64orSP(rN)); + if (rN == 31) { /* FIXME generate stack alignment check */ } + IRExpr* tEA_0 = binop(Iop_Add64, mkexpr(tEA), mkU64(0)); + IRExpr* tEA_16 = binop(Iop_Add64, mkexpr(tEA), mkU64(16)); + if (isLD) { + putQReg128((vT+0) % 32, loadLE(Ity_V128, tEA_0)); + putQReg128((vT+1) % 32, loadLE(Ity_V128, tEA_16)); + } else { + storeLE(tEA_0, getQReg128((vT+0) % 32)); + storeLE(tEA_16, getQReg128((vT+1) % 32)); + } + DIP("%s {v%u.%s, v%u.%s}, [%s], #32\n", isLD ? "ld1" : "st1", + (vT+0) % 32, name, (vT+1) % 32, name, nameIReg64orSP(rN)); + return True; + } + + /* ------------------ LD{,A}X{R,RH,RB} ------------------ */ + /* ------------------ ST{,L}X{R,RH,RB} ------------------ */ + /* 31 29 23 20 14 9 4 + sz 001000 010 11111 0 11111 n t LDX{R,RH,RB} Rt, [Xn|SP] + sz 001000 010 11111 1 11111 n t LDAX{R,RH,RB} Rt, [Xn|SP] + sz 001000 000 s 0 11111 n t STX{R,RH,RB} Ws, Rt, [Xn|SP] + sz 001000 000 s 1 11111 n t STLX{R,RH,RB} Ws, Rt, [Xn|SP] + */ + if (INSN(29,23) == BITS7(0,0,1,0,0,0,0) + && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,0) + && INSN(14,10) == BITS5(1,1,1,1,1)) { + UInt szBlg2 = INSN(31,30); + Bool isLD = INSN(22,22) == 1; + Bool isAcqOrRel = INSN(15,15) == 1; + UInt ss = INSN(20,16); + UInt nn = INSN(9,5); + UInt tt = INSN(4,0); + + vassert(szBlg2 < 4); + UInt szB = 1 << szBlg2; /* 1, 2, 4 or 8 */ + IRType ty = integerIRTypeOfSize(szB); + const HChar* suffix[4] = { "rb", "rh", "r", "r" }; + + IRTemp ea = newTemp(Ity_I64); + assign(ea, getIReg64orSP(nn)); + /* FIXME generate check that ea is szB-aligned */ + + if (isLD && ss == BITS5(1,1,1,1,1)) { + IRTemp res = newTemp(ty); + stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), NULL/*LL*/)); + putIReg64orZR(tt, widenUto64(ty, mkexpr(res))); + if (isAcqOrRel) { + stmt(IRStmt_MBE(Imbe_Fence)); + } + DIP("ld%sx%s %s, [%s]\n", isAcqOrRel ? "a" : "", suffix[szBlg2], + nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn)); + return True; + } + if (!isLD) { + if (isAcqOrRel) { + stmt(IRStmt_MBE(Imbe_Fence)); + } + IRTemp res = newTemp(Ity_I1); + IRExpr* data = narrowFrom64(ty, getIReg64orZR(tt)); + stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), data)); + /* IR semantics: res is 1 if store succeeds, 0 if it fails. + Need to set rS to 1 on failure, 0 on success. */ + putIReg64orZR(ss, binop(Iop_Xor64, unop(Iop_1Uto64, mkexpr(res)), + mkU64(1))); + DIP("st%sx%s %s, %s, [%s]\n", isAcqOrRel ? 
"a" : "", suffix[szBlg2], + nameIRegOrZR(False, ss), + nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn)); + return True; + } + /* else fall through */ + } + + /* ------------------ LDA{R,RH,RB} ------------------ */ + /* ------------------ STL{R,RH,RB} ------------------ */ + /* 31 29 23 20 14 9 4 + sz 001000 110 11111 1 11111 n t LDAR Rt, [Xn|SP] + sz 001000 100 11111 1 11111 n t STLR Rt, [Xn|SP] + */ + if (INSN(29,23) == BITS7(0,0,1,0,0,0,1) + && INSN(21,10) == BITS12(0,1,1,1,1,1,1,1,1,1,1,1)) { + UInt szBlg2 = INSN(31,30); + Bool isLD = INSN(22,22) == 1; + UInt nn = INSN(9,5); + UInt tt = INSN(4,0); + + vassert(szBlg2 < 4); + UInt szB = 1 << szBlg2; /* 1, 2, 4 or 8 */ + IRType ty = integerIRTypeOfSize(szB); + const HChar* suffix[4] = { "rb", "rh", "r", "r" }; + + IRTemp ea = newTemp(Ity_I64); + assign(ea, getIReg64orSP(nn)); + /* FIXME generate check that ea is szB-aligned */ + + if (isLD) { + IRTemp res = newTemp(ty); + assign(res, loadLE(ty, mkexpr(ea))); + putIReg64orZR(tt, widenUto64(ty, mkexpr(res))); + stmt(IRStmt_MBE(Imbe_Fence)); + DIP("lda%s %s, [%s]\n", suffix[szBlg2], + nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn)); + } else { + stmt(IRStmt_MBE(Imbe_Fence)); + IRExpr* data = narrowFrom64(ty, getIReg64orZR(tt)); + storeLE(mkexpr(ea), data); + DIP("stl%s %s, [%s]\n", suffix[szBlg2], + nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn)); + } + return True; + } + + vex_printf("ARM64 front end: load_store\n"); + return False; +# undef INSN +} + + +/*------------------------------------------------------------*/ +/*--- Control flow and misc instructions ---*/ +/*------------------------------------------------------------*/ + +static +Bool dis_ARM64_branch_etc(/*MB_OUT*/DisResult* dres, UInt insn, + VexArchInfo* archinfo) +{ +# define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin)) + + /* ---------------------- B cond ----------------------- */ + /* 31 24 4 3 + 0101010 0 imm19 0 cond */ + if (INSN(31,24) == BITS8(0,1,0,1,0,1,0,0) && INSN(4,4) == 0) { + UInt cond = INSN(3,0); + ULong uimm64 = INSN(23,5) << 2; + Long simm64 = (Long)sx_to_64(uimm64, 21); + vassert(dres->whatNext == Dis_Continue); + vassert(dres->len == 4); + vassert(dres->continueAt == 0); + vassert(dres->jk_StopHere == Ijk_INVALID); + stmt( IRStmt_Exit(unop(Iop_64to1, mk_arm64g_calculate_condition(cond)), + Ijk_Boring, + IRConst_U64(guest_PC_curr_instr + simm64), + OFFB_PC) ); + putPC(mkU64(guest_PC_curr_instr + 4)); + dres->whatNext = Dis_StopHere; + dres->jk_StopHere = Ijk_Boring; + DIP("b.%s 0x%llx\n", nameCC(cond), guest_PC_curr_instr + simm64); + return True; + } + + /* -------------------- B{L} uncond -------------------- */ + if (INSN(30,26) == BITS5(0,0,1,0,1)) { + /* 000101 imm26 B (PC + sxTo64(imm26 << 2)) + 100101 imm26 B (PC + sxTo64(imm26 << 2)) + */ + UInt bLink = INSN(31,31); + ULong uimm64 = INSN(25,0) << 2; + Long simm64 = (Long)sx_to_64(uimm64, 28); + if (bLink) { + putIReg64orSP(30, mkU64(guest_PC_curr_instr + 4)); + } + putPC(mkU64(guest_PC_curr_instr + simm64)); + dres->whatNext = Dis_StopHere; + dres->jk_StopHere = Ijk_Call; + DIP("b%s 0x%llx\n", bLink == 1 ? 
"l" : "", + guest_PC_curr_instr + simm64); + return True; + } + + /* --------------------- B{L} reg --------------------- */ + /* 31 24 22 20 15 9 4 + 1101011 00 10 11111 000000 nn 00000 RET Rn + 1101011 00 01 11111 000000 nn 00000 CALL Rn + 1101011 00 00 11111 000000 nn 00000 JMP Rn + */ + if (INSN(31,23) == BITS9(1,1,0,1,0,1,1,0,0) + && INSN(20,16) == BITS5(1,1,1,1,1) + && INSN(15,10) == BITS6(0,0,0,0,0,0) + && INSN(4,0) == BITS5(0,0,0,0,0)) { + UInt branch_type = INSN(22,21); + UInt nn = INSN(9,5); + if (branch_type == BITS2(1,0) /* RET */) { + putPC(getIReg64orZR(nn)); + dres->whatNext = Dis_StopHere; + dres->jk_StopHere = Ijk_Ret; + DIP("ret %s\n", nameIReg64orZR(nn)); + return True; + } + if (branch_type == BITS2(0,1) /* CALL */) { + IRTemp dst = newTemp(Ity_I64); + assign(dst, getIReg64orZR(nn)); + putIReg64orSP(30, mkU64(guest_PC_curr_instr + 4)); + putPC(mkexpr(dst)); + dres->whatNext = Dis_StopHere; + dres->jk_StopHere = Ijk_Call; + DIP("blr %s\n", nameIReg64orZR(nn)); + return True; + } + if (branch_type == BITS2(0,0) /* JMP */) { + putPC(getIReg64orZR(nn)); + dres->whatNext = Dis_StopHere; + dres->jk_StopHere = Ijk_Boring; + DIP("jmp %s\n", nameIReg64orZR(nn)); + return True; + } + } + + /* -------------------- CB{N}Z -------------------- */ + /* sf 011 010 1 imm19 Rt CBNZ Xt|Wt, (PC + sxTo64(imm19 << 2)) + sf 011 010 0 imm19 Rt CBZ Xt|Wt, (PC + sxTo64(imm19 << 2)) + */ + if (INSN(30,25) == BITS6(0,1,1,0,1,0)) { + Bool is64 = INSN(31,31) == 1; + Bool bIfZ = INSN(24,24) == 0; + ULong uimm64 = INSN(23,5) << 2; + UInt rT = INSN(4,0); + Long simm64 = (Long)sx_to_64(uimm64, 21); + IRExpr* cond = NULL; + if (is64) { + cond = binop(bIfZ ? Iop_CmpEQ64 : Iop_CmpNE64, + getIReg64orZR(rT), mkU64(0)); + } else { + cond = binop(bIfZ ? Iop_CmpEQ32 : Iop_CmpNE32, + getIReg32orZR(rT), mkU32(0)); + } + stmt( IRStmt_Exit(cond, + Ijk_Boring, + IRConst_U64(guest_PC_curr_instr + simm64), + OFFB_PC) ); + putPC(mkU64(guest_PC_curr_instr + 4)); + dres->whatNext = Dis_StopHere; + dres->jk_StopHere = Ijk_Boring; + DIP("cb%sz %s, 0x%llx\n", + bIfZ ? "" : "n", nameIRegOrZR(is64, rT), + guest_PC_curr_instr + simm64); + return True; + } + + /* -------------------- TB{N}Z -------------------- */ + /* 31 30 24 23 18 5 4 + b5 011 011 1 b40 imm14 t TBNZ Xt, #(b5:b40), (PC + sxTo64(imm14 << 2)) + b5 011 011 0 b40 imm14 t TBZ Xt, #(b5:b40), (PC + sxTo64(imm14 << 2)) + */ + if (INSN(30,25) == BITS6(0,1,1,0,1,1)) { + UInt b5 = INSN(31,31); + Bool bIfZ = INSN(24,24) == 0; + UInt b40 = INSN(23,19); + UInt imm14 = INSN(18,5); + UInt tt = INSN(4,0); + UInt bitNo = (b5 << 5) | b40; + ULong uimm64 = imm14 << 2; + Long simm64 = sx_to_64(uimm64, 16); + IRExpr* cond + = binop(bIfZ ? Iop_CmpEQ64 : Iop_CmpNE64, + binop(Iop_And64, + binop(Iop_Shr64, getIReg64orZR(tt), mkU8(bitNo)), + mkU64(1)), + mkU64(0)); + stmt( IRStmt_Exit(cond, + Ijk_Boring, + IRConst_U64(guest_PC_curr_instr + simm64), + OFFB_PC) ); + putPC(mkU64(guest_PC_curr_instr + 4)); + dres->whatNext = Dis_StopHere; + dres->jk_StopHere = Ijk_Boring; + DIP("tb%sz %s, #%u, 0x%llx\n", + bIfZ ? "" : "n", nameIReg64orZR(tt), bitNo, + guest_PC_curr_instr + simm64); + return True; + } + + /* -------------------- SVC -------------------- */ + /* 11010100 000 imm16 000 01 + Don't bother with anything except the imm16==0 case. 
+ */ + if (INSN(31,0) == 0xD4000001) { + putPC(mkU64(guest_PC_curr_instr + 4)); + dres->whatNext = Dis_StopHere; + dres->jk_StopHere = Ijk_Sys_syscall; + DIP("svc #0\n"); + return True; + } + + /* ------------------ M{SR,RS} ------------------ */ + /* Only handles the case where the system register is TPIDR_EL0. + 0xD51BD0 010 Rt MSR tpidr_el0, rT + 0xD53BD0 010 Rt MRS rT, tpidr_el0 + */ + if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51BD040 /*MSR*/ + || (INSN(31,0) & 0xFFFFFFE0) == 0xD53BD040 /*MRS*/) { + Bool toSys = INSN(21,21) == 0; + UInt tt = INSN(4,0); + if (toSys) { + stmt( IRStmt_Put( OFFB_TPIDR_EL0, getIReg64orZR(tt)) ); + DIP("msr tpidr_el0, %s\n", nameIReg64orZR(tt)); + } else { + putIReg64orZR(tt, IRExpr_Get( OFFB_TPIDR_EL0, Ity_I64 )); + DIP("mrs %s, tpidr_el0\n", nameIReg64orZR(tt)); + } + return True; + } + /* Cases for FPCR + 0xD51B44 000 Rt MSR fpcr, rT + 0xD53B44 000 Rt MSR rT, fpcr + */ + if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4400 /*MSR*/ + || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4400 /*MRS*/) { + Bool toSys = INSN(21,21) == 0; + UInt tt = INSN(4,0); + if (toSys) { + stmt( IRStmt_Put( OFFB_FPCR, getIReg32orZR(tt)) ); + DIP("msr fpcr, %s\n", nameIReg64orZR(tt)); + } else { + putIReg32orZR(tt, IRExpr_Get(OFFB_FPCR, Ity_I32)); + DIP("mrs %s, fpcr\n", nameIReg64orZR(tt)); + } + return True; + } + /* Cases for FPSR + 0xD51B44 001 Rt MSR fpsr, rT + 0xD53B44 001 Rt MSR rT, fpsr + */ + if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4420 /*MSR*/ + || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4420 /*MRS*/) { + Bool toSys = INSN(21,21) == 0; + UInt tt = INSN(4,0); + if (toSys) { + stmt( IRStmt_Put( OFFB_FPSR, getIReg32orZR(tt)) ); + DIP("msr fpsr, %s\n", nameIReg64orZR(tt)); + } else { + putIReg32orZR(tt, IRExpr_Get(OFFB_FPSR, Ity_I32)); + DIP("mrs %s, fpsr\n", nameIReg64orZR(tt)); + } + return True; + } + /* Cases for NZCV + D51B42 000 Rt MSR nzcv, rT + D53B42 000 Rt MRS rT, nzcv + */ + if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4200 /*MSR*/ + || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4200 /*MRS*/) { + Bool toSys = INSN(21,21) == 0; + UInt tt = INSN(4,0); + if (toSys) { + IRTemp t = newTemp(Ity_I64); + assign(t, binop(Iop_And64, getIReg64orZR(tt), mkU64(0xF0000000ULL))); + setFlags_COPY(t); + DIP("msr %s, nzcv\n", nameIReg32orZR(tt)); + } else { + IRTemp res = newTemp(Ity_I64); + assign(res, mk_arm64g_calculate_flags_nzcv()); + putIReg32orZR(tt, unop(Iop_64to32, mkexpr(res))); + DIP("mrs %s, nzcv\n", nameIReg64orZR(tt)); + } + return True; + } + /* Cases for DCZID_EL0 + Don't support arbitrary reads and writes to this register. Just + return the value 16, which indicates that the DC ZVA instruction + is not permitted, so we don't have to emulate it. + D5 3B 00 111 Rt MRS rT, dczid_el0 + */ + if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53B00E0) { + UInt tt = INSN(4,0); + putIReg64orZR(tt, mkU64(1<<4)); + DIP("mrs %s, dczid_el0 (FAKED)\n", nameIReg64orZR(tt)); + return True; + } + /* Cases for CTR_EL0 + We just handle reads, and make up a value from the D and I line + sizes in the VexArchInfo we are given, and patch in the following + fields that the Foundation model gives ("natively"): + CWG = 0b0100, ERG = 0b0100, L1Ip = 0b11 + D5 3B 00 001 Rt MRS rT, dczid_el0 + */ + if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53B0020) { + UInt tt = INSN(4,0); + /* Need to generate a value from dMinLine_lg2_szB and + dMinLine_lg2_szB. The value in the register is in 32-bit + units, so need to subtract 2 from the values in the + VexArchInfo. 
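+      (Worked example, assuming a 64-byte D-cache line: dMinLine_lg2_szB
+      is 6, so the DminLine field written below is 6 - 2 = 4, meaning
+      16 words.)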
We can assume that the values here are valid -- + disInstr_ARM64 checks them -- so there's no need to deal with + out-of-range cases. */ + vassert(archinfo->arm64_dMinLine_lg2_szB >= 2 + && archinfo->arm64_dMinLine_lg2_szB <= 17 + && archinfo->arm64_iMinLine_lg2_szB >= 2 + && archinfo->arm64_iMinLine_lg2_szB <= 17); + UInt val + = 0x8440c000 | ((0xF & (archinfo->arm64_dMinLine_lg2_szB - 2)) << 16) + | ((0xF & (archinfo->arm64_iMinLine_lg2_szB - 2)) << 0); + putIReg64orZR(tt, mkU64(val)); + DIP("mrs %s, ctr_el0\n", nameIReg64orZR(tt)); + return True; + } + + /* ------------------ IC_IVAU ------------------ */ + /* D5 0B 75 001 Rt ic ivau, rT + */ + if ((INSN(31,0) & 0xFFFFFFE0) == 0xD50B7520) { + /* We will always be provided with a valid iMinLine value. */ + vassert(archinfo->arm64_iMinLine_lg2_szB >= 2 + && archinfo->arm64_iMinLine_lg2_szB <= 17); + /* Round the requested address, in rT, down to the start of the + containing block. */ + UInt tt = INSN(4,0); + ULong lineszB = 1ULL << archinfo->arm64_iMinLine_lg2_szB; + IRTemp addr = newTemp(Ity_I64); + assign( addr, binop( Iop_And64, + getIReg64orZR(tt), + mkU64(~(lineszB - 1))) ); + /* Set the invalidation range, request exit-and-invalidate, with + continuation at the next instruction. */ + stmt(IRStmt_Put(OFFB_CMSTART, mkexpr(addr))); + stmt(IRStmt_Put(OFFB_CMLEN, mkU64(lineszB))); + /* be paranoid ... */ + stmt( IRStmt_MBE(Imbe_Fence) ); + putPC(mkU64( guest_PC_curr_instr + 4 )); + dres->whatNext = Dis_StopHere; + dres->jk_StopHere = Ijk_InvalICache; + DIP("ic ivau, %s\n", nameIReg64orZR(tt)); + return True; + } + + /* ------------------ DC_CVAU ------------------ */ + /* D5 0B 7B 001 Rt dc cvau, rT + */ + if ((INSN(31,0) & 0xFFFFFFE0) == 0xD50B7B20) { + /* Exactly the same scheme as for IC IVAU, except we observe the + dMinLine size, and request an Ijk_FlushDCache instead of + Ijk_InvalICache. */ + /* We will always be provided with a valid dMinLine value. */ + vassert(archinfo->arm64_dMinLine_lg2_szB >= 2 + && archinfo->arm64_dMinLine_lg2_szB <= 17); + /* Round the requested address, in rT, down to the start of the + containing block. */ + UInt tt = INSN(4,0); + ULong lineszB = 1ULL << archinfo->arm64_dMinLine_lg2_szB; + IRTemp addr = newTemp(Ity_I64); + assign( addr, binop( Iop_And64, + getIReg64orZR(tt), + mkU64(~(lineszB - 1))) ); + /* Set the flush range, request exit-and-flush, with + continuation at the next instruction. */ + stmt(IRStmt_Put(OFFB_CMSTART, mkexpr(addr))); + stmt(IRStmt_Put(OFFB_CMLEN, mkU64(lineszB))); + /* be paranoid ... 
*/ + stmt( IRStmt_MBE(Imbe_Fence) ); + putPC(mkU64( guest_PC_curr_instr + 4 )); + dres->whatNext = Dis_StopHere; + dres->jk_StopHere = Ijk_FlushDCache; + DIP("dc cvau, %s\n", nameIReg64orZR(tt)); + return True; + } + + /* ------------------ ISB, DMB, DSB ------------------ */ + if (INSN(31,0) == 0xD5033FDF) { + stmt(IRStmt_MBE(Imbe_Fence)); + DIP("isb\n"); + return True; + } + if (INSN(31,0) == 0xD5033BBF) { + stmt(IRStmt_MBE(Imbe_Fence)); + DIP("dmb ish\n"); + return True; + } + if (INSN(31,0) == 0xD5033B9F) { + stmt(IRStmt_MBE(Imbe_Fence)); + DIP("dsb ish\n"); + return True; + } + + /* -------------------- NOP -------------------- */ + if (INSN(31,0) == 0xD503201F) { + DIP("nop\n"); + return True; + } + + //fail: + vex_printf("ARM64 front end: branch_etc\n"); + return False; +# undef INSN +} + + +/*------------------------------------------------------------*/ +/*--- SIMD and FP instructions ---*/ +/*------------------------------------------------------------*/ + +/* begin FIXME -- rm temp scaffolding */ +static IRExpr* mk_CatEvenLanes64x2 ( IRTemp, IRTemp ); +static IRExpr* mk_CatOddLanes64x2 ( IRTemp, IRTemp ); + +static IRExpr* mk_CatEvenLanes32x4 ( IRTemp, IRTemp ); +static IRExpr* mk_CatOddLanes32x4 ( IRTemp, IRTemp ); +static IRExpr* mk_InterleaveLO32x4 ( IRTemp, IRTemp ); +static IRExpr* mk_InterleaveHI32x4 ( IRTemp, IRTemp ); + +static IRExpr* mk_CatEvenLanes16x8 ( IRTemp, IRTemp ); +static IRExpr* mk_CatOddLanes16x8 ( IRTemp, IRTemp ); +static IRExpr* mk_InterleaveLO16x8 ( IRTemp, IRTemp ); +static IRExpr* mk_InterleaveHI16x8 ( IRTemp, IRTemp ); + +static IRExpr* mk_CatEvenLanes8x16 ( IRTemp, IRTemp ); +static IRExpr* mk_CatOddLanes8x16 ( IRTemp, IRTemp ); +static IRExpr* mk_InterleaveLO8x16 ( IRTemp, IRTemp ); +static IRExpr* mk_InterleaveHI8x16 ( IRTemp, IRTemp ); +/* end FIXME -- rm temp scaffolding */ + +/* Generate N copies of |bit| in the bottom of a ULong. */ +static ULong Replicate ( ULong bit, Int N ) +{ + vassert(bit <= 1 && N >= 1 && N < 64); + if (bit == 0) { + return 0; + } else { + /* Careful. This won't work for N == 64. */ + return (1ULL << N) - 1; + } +} + +static ULong Replicate32x2 ( ULong bits32 ) +{ + vassert(0 == (bits32 & ~0xFFFFFFFFULL)); + return (bits32 << 32) | bits32; +} + +static ULong Replicate16x4 ( ULong bits16 ) +{ + vassert(0 == (bits16 & ~0xFFFFULL)); + return Replicate32x2((bits16 << 16) | bits16); +} + +static ULong Replicate8x8 ( ULong bits8 ) +{ + vassert(0 == (bits8 & ~0xFFULL)); + return Replicate16x4((bits8 << 8) | bits8); +} + +/* Expand the VFPExpandImm-style encoding in the bottom 8 bits of + |imm8| to either a 32-bit value if N is 32 or a 64 bit value if N + is 64. In the former case, the upper 32 bits of the returned value + are guaranteed to be zero. */ +static ULong VFPExpandImm ( ULong imm8, Int N ) +{ + vassert(imm8 <= 0xFF); + vassert(N == 32 || N == 64); + Int E = ((N == 32) ? 8 : 11) - 2; // The spec incorrectly omits the -2. + Int F = N - E - 1; + ULong imm8_6 = (imm8 >> 6) & 1; + /* sign: 1 bit */ + /* exp: E bits */ + /* frac: F bits */ + ULong sign = (imm8 >> 7) & 1; + ULong exp = ((imm8_6 ^ 1) << (E-1)) | Replicate(imm8_6, E-1); + ULong frac = ((imm8 & 63) << (F-6)) | Replicate(0, F-6); + vassert(sign < (1ULL << 1)); + vassert(exp < (1ULL << E)); + vassert(frac < (1ULL << F)); + vassert(1 + E + F == N); + ULong res = (sign << (E+F)) | (exp << F) | frac; + return res; +} + +/* Expand an AdvSIMDExpandImm-style encoding into a 64-bit value. + This might fail, as indicated by the returned Bool. Page 2530 of + the manual. 
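+   Hand-worked illustrations, not taken from the manual: op=0,
+   cmode=0b0000, imm8=0x55 expands to Replicate32x2(0x55) =
+   0x0000005500000055; op=0, cmode=0b1110, imm8=0xAB expands to
+   Replicate8x8(0xAB) = 0xABABABABABABABAB; op=0, cmode=0b0010, imm8=0
+   fails, since a zero immediate is rejected for the shifted forms.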
*/ +static Bool AdvSIMDExpandImm ( /*OUT*/ULong* res, + UInt op, UInt cmode, UInt imm8 ) +{ + vassert(op <= 1); + vassert(cmode <= 15); + vassert(imm8 <= 255); + + *res = 0; /* will overwrite iff returning True */ + + ULong imm64 = 0; + Bool testimm8 = False; + + switch (cmode >> 1) { + case 0: + testimm8 = False; imm64 = Replicate32x2(imm8); break; + case 1: + testimm8 = True; imm64 = Replicate32x2(imm8 << 8); break; + case 2: + testimm8 = True; imm64 = Replicate32x2(imm8 << 16); break; + case 3: + testimm8 = True; imm64 = Replicate32x2(imm8 << 24); break; + case 4: + testimm8 = False; imm64 = Replicate16x4(imm8); break; + case 5: + testimm8 = True; imm64 = Replicate16x4(imm8 << 8); break; + case 6: + testimm8 = True; + if ((cmode & 1) == 0) + imm64 = Replicate32x2((imm8 << 8) | 0xFF); + else + imm64 = Replicate32x2((imm8 << 16) | 0xFFFF); + break; + case 7: + testimm8 = False; + if ((cmode & 1) == 0 && op == 0) + imm64 = Replicate8x8(imm8); + if ((cmode & 1) == 0 && op == 1) { + imm64 = 0; imm64 |= (imm8 & 0x80) ? 0xFF : 0x00; + imm64 <<= 8; imm64 |= (imm8 & 0x40) ? 0xFF : 0x00; + imm64 <<= 8; imm64 |= (imm8 & 0x20) ? 0xFF : 0x00; + imm64 <<= 8; imm64 |= (imm8 & 0x10) ? 0xFF : 0x00; + imm64 <<= 8; imm64 |= (imm8 & 0x08) ? 0xFF : 0x00; + imm64 <<= 8; imm64 |= (imm8 & 0x04) ? 0xFF : 0x00; + imm64 <<= 8; imm64 |= (imm8 & 0x02) ? 0xFF : 0x00; + imm64 <<= 8; imm64 |= (imm8 & 0x01) ? 0xFF : 0x00; + } + if ((cmode & 1) == 1 && op == 0) { + ULong imm8_7 = (imm8 >> 7) & 1; + ULong imm8_6 = (imm8 >> 6) & 1; + ULong imm8_50 = imm8 & 63; + ULong imm32 = (imm8_7 << (1 + 5 + 6 + 19)) + | ((imm8_6 ^ 1) << (5 + 6 + 19)) + | (Replicate(imm8_6, 5) << (6 + 19)) + | (imm8_50 << 19); + imm64 = Replicate32x2(imm32); + } + if ((cmode & 1) == 1 && op == 1) { + // imm64 = imm8<7>:NOT(imm8<6>) + // :Replicate(imm8<6>,8):imm8<5:0>:Zeros(48); + ULong imm8_7 = (imm8 >> 7) & 1; + ULong imm8_6 = (imm8 >> 6) & 1; + ULong imm8_50 = imm8 & 63; + imm64 = (imm8_7 << 63) | ((imm8_6 ^ 1) << 62) + | (Replicate(imm8_6, 8) << 54) + | (imm8_50 << 48); + } + break; + default: + vassert(0); + } + + if (testimm8 && imm8 == 0) + return False; + + *res = imm64; + return True; +} + + +/* Help a bit for decoding laneage for vector operations that can be + of the form 4x32, 2x64 or 2x32-and-zero-upper-half, as encoded by Q + and SZ bits, typically for vector floating point. */ +static Bool getLaneInfo_Q_SZ ( /*OUT*/IRType* tyI, /*OUT*/IRType* tyF, + /*OUT*/UInt* nLanes, /*OUT*/Bool* zeroUpper, + /*OUT*/const HChar** arrSpec, + Bool bitQ, Bool bitSZ ) +{ + vassert(bitQ == True || bitQ == False); + vassert(bitSZ == True || bitSZ == False); + if (bitQ && bitSZ) { // 2x64 + if (tyI) *tyI = Ity_I64; + if (tyF) *tyF = Ity_F64; + if (nLanes) *nLanes = 2; + if (zeroUpper) *zeroUpper = False; + if (arrSpec) *arrSpec = "2d"; + return True; + } + if (bitQ && !bitSZ) { // 4x32 + if (tyI) *tyI = Ity_I32; + if (tyF) *tyF = Ity_F32; + if (nLanes) *nLanes = 4; + if (zeroUpper) *zeroUpper = False; + if (arrSpec) *arrSpec = "4s"; + return True; + } + if (!bitQ && !bitSZ) { // 2x32 + if (tyI) *tyI = Ity_I32; + if (tyF) *tyF = Ity_F32; + if (nLanes) *nLanes = 2; + if (zeroUpper) *zeroUpper = True; + if (arrSpec) *arrSpec = "2s"; + return True; + } + // Else impliedly 1x64, which isn't allowed. + return False; +} + +/* Helper for decoding laneage for simple vector operations, + eg integer add. 
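+   For example, szBlg2=2 with Q=1 selects the "4s" arrangement with no
+   upper-half zeroing, Q=0 selects "2s" with the upper 64 bits of the
+   destination zeroed, and szBlg2=3 with Q=0 (the 1x64 case) is
+   rejected.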
*/ +static Bool getLaneInfo_SIMPLE ( /*OUT*/Bool* zeroUpper, + /*OUT*/const HChar** arrSpec, + Bool bitQ, UInt szBlg2 ) +{ + vassert(bitQ == True || bitQ == False); + vassert(szBlg2 < 4); + Bool zu = False; + const HChar* as = NULL; + switch ((szBlg2 << 1) | (bitQ ? 1 : 0)) { + case 0: zu = True; as = "8b"; break; + case 1: zu = False; as = "16b"; break; + case 2: zu = True; as = "4h"; break; + case 3: zu = False; as = "8h"; break; + case 4: zu = True; as = "2s"; break; + case 5: zu = False; as = "4s"; break; + case 6: return False; // impliedly 1x64 + case 7: zu = False; as = "2d"; break; + default: vassert(0); + } + vassert(as); + if (arrSpec) *arrSpec = as; + if (zeroUpper) *zeroUpper = zu; + return True; +} + + +/* Helper for decoding laneage for shift-style vector operations + that involve an immediate shift amount. */ +static Bool getLaneInfo_IMMH_IMMB ( /*OUT*/UInt* shift, /*OUT*/UInt* szBlg2, + UInt immh, UInt immb ) +{ + vassert(immh < (1<<4)); + vassert(immb < (1<<3)); + UInt immhb = (immh << 3) | immb; + if (immh & 8) { + if (shift) *shift = 128 - immhb; + if (szBlg2) *szBlg2 = 3; + return True; + } + if (immh & 4) { + if (shift) *shift = 64 - immhb; + if (szBlg2) *szBlg2 = 2; + return True; + } + if (immh & 2) { + if (shift) *shift = 32 - immhb; + if (szBlg2) *szBlg2 = 1; + return True; + } + if (immh & 1) { + if (shift) *shift = 16 - immhb; + if (szBlg2) *szBlg2 = 0; + return True; + } + return False; +} + + +/* Generate IR to fold all lanes of the V128 value in 'src' as + characterised by the operator 'op', and return the result in the + bottom bits of a V128, with all other bits set to zero. */ +static IRTemp math_MINMAXV ( IRTemp src, IROp op ) +{ + /* The basic idea is to use repeated applications of Iop_CatEven* + and Iop_CatOdd* operators to 'src' so as to clone each lane into + a complete vector. Then fold all those vectors with 'op' and + zero out all but the least significant lane. */ + switch (op) { + case Iop_Min8Sx16: case Iop_Min8Ux16: + case Iop_Max8Sx16: case Iop_Max8Ux16: { + /* NB: temp naming here is misleading -- the naming is for 8 + lanes of 16 bit, whereas what is being operated on is 16 + lanes of 8 bits. 
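+            For instance, x76547654 is the upper 64-bit half of the source
+            duplicated into both halves -- 16-bit lanes 7..4 under the
+            naming scheme, but really byte lanes 15..8.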
*/ + IRTemp x76543210 = src; + IRTemp x76547654 = newTemp(Ity_V128); + IRTemp x32103210 = newTemp(Ity_V128); + assign(x76547654, mk_CatOddLanes64x2 (x76543210, x76543210)); + assign(x32103210, mk_CatEvenLanes64x2(x76543210, x76543210)); + IRTemp x76767676 = newTemp(Ity_V128); + IRTemp x54545454 = newTemp(Ity_V128); + IRTemp x32323232 = newTemp(Ity_V128); + IRTemp x10101010 = newTemp(Ity_V128); + assign(x76767676, mk_CatOddLanes32x4 (x76547654, x76547654)); + assign(x54545454, mk_CatEvenLanes32x4(x76547654, x76547654)); + assign(x32323232, mk_CatOddLanes32x4 (x32103210, x32103210)); + assign(x10101010, mk_CatEvenLanes32x4(x32103210, x32103210)); + IRTemp x77777777 = newTemp(Ity_V128); + IRTemp x66666666 = newTemp(Ity_V128); + IRTemp x55555555 = newTemp(Ity_V128); + IRTemp x44444444 = newTemp(Ity_V128); + IRTemp x33333333 = newTemp(Ity_V128); + IRTemp x22222222 = newTemp(Ity_V128); + IRTemp x11111111 = newTemp(Ity_V128); + IRTemp x00000000 = newTemp(Ity_V128); + assign(x77777777, mk_CatOddLanes16x8 (x76767676, x76767676)); + assign(x66666666, mk_CatEvenLanes16x8(x76767676, x76767676)); + assign(x55555555, mk_CatOddLanes16x8 (x54545454, x54545454)); + assign(x44444444, mk_CatEvenLanes16x8(x54545454, x54545454)); + assign(x33333333, mk_CatOddLanes16x8 (x32323232, x32323232)); + assign(x22222222, mk_CatEvenLanes16x8(x32323232, x32323232)); + assign(x11111111, mk_CatOddLanes16x8 (x10101010, x10101010)); + assign(x00000000, mk_CatEvenLanes16x8(x10101010, x10101010)); + /* Naming not misleading after here. */ + IRTemp xAllF = newTemp(Ity_V128); + IRTemp xAllE = newTemp(Ity_V128); + IRTemp xAllD = newTemp(Ity_V128); + IRTemp xAllC = newTemp(Ity_V128); + IRTemp xAllB = newTemp(Ity_V128); + IRTemp xAllA = newTemp(Ity_V128); + IRTemp xAll9 = newTemp(Ity_V128); + IRTemp xAll8 = newTemp(Ity_V128); + IRTemp xAll7 = newTemp(Ity_V128); + IRTemp xAll6 = newTemp(Ity_V128); + IRTemp xAll5 = newTemp(Ity_V128); + IRTemp xAll4 = newTemp(Ity_V128); + IRTemp xAll3 = newTemp(Ity_V128); + IRTemp xAll2 = newTemp(Ity_V128); + IRTemp xAll1 = newTemp(Ity_V128); + IRTemp xAll0 = newTemp(Ity_V128); + assign(xAllF, mk_CatOddLanes8x16 (x77777777, x77777777)); + assign(xAllE, mk_CatEvenLanes8x16(x77777777, x77777777)); + assign(xAllD, mk_CatOddLanes8x16 (x66666666, x66666666)); + assign(xAllC, mk_CatEvenLanes8x16(x66666666, x66666666)); + assign(xAllB, mk_CatOddLanes8x16 (x55555555, x55555555)); + assign(xAllA, mk_CatEvenLanes8x16(x55555555, x55555555)); + assign(xAll9, mk_CatOddLanes8x16 (x44444444, x44444444)); + assign(xAll8, mk_CatEvenLanes8x16(x44444444, x44444444)); + assign(xAll7, mk_CatOddLanes8x16 (x33333333, x33333333)); + assign(xAll6, mk_CatEvenLanes8x16(x33333333, x33333333)); + assign(xAll5, mk_CatOddLanes8x16 (x22222222, x22222222)); + assign(xAll4, mk_CatEvenLanes8x16(x22222222, x22222222)); + assign(xAll3, mk_CatOddLanes8x16 (x11111111, x11111111)); + assign(xAll2, mk_CatEvenLanes8x16(x11111111, x11111111)); + assign(xAll1, mk_CatOddLanes8x16 (x00000000, x00000000)); + assign(xAll0, mk_CatEvenLanes8x16(x00000000, x00000000)); + IRTemp maxFE = newTemp(Ity_V128); + IRTemp maxDC = newTemp(Ity_V128); + IRTemp maxBA = newTemp(Ity_V128); + IRTemp max98 = newTemp(Ity_V128); + IRTemp max76 = newTemp(Ity_V128); + IRTemp max54 = newTemp(Ity_V128); + IRTemp max32 = newTemp(Ity_V128); + IRTemp max10 = newTemp(Ity_V128); + assign(maxFE, binop(op, mkexpr(xAllF), mkexpr(xAllE))); + assign(maxDC, binop(op, mkexpr(xAllD), mkexpr(xAllC))); + assign(maxBA, binop(op, mkexpr(xAllB), mkexpr(xAllA))); + assign(max98, binop(op, 
mkexpr(xAll9), mkexpr(xAll8))); + assign(max76, binop(op, mkexpr(xAll7), mkexpr(xAll6))); + assign(max54, binop(op, mkexpr(xAll5), mkexpr(xAll4))); + assign(max32, binop(op, mkexpr(xAll3), mkexpr(xAll2))); + assign(max10, binop(op, mkexpr(xAll1), mkexpr(xAll0))); + IRTemp maxFEDC = newTemp(Ity_V128); + IRTemp maxBA98 = newTemp(Ity_V128); + IRTemp max7654 = newTemp(Ity_V128); + IRTemp max3210 = newTemp(Ity_V128); + assign(maxFEDC, binop(op, mkexpr(maxFE), mkexpr(maxDC))); + assign(maxBA98, binop(op, mkexpr(maxBA), mkexpr(max98))); + assign(max7654, binop(op, mkexpr(max76), mkexpr(max54))); + assign(max3210, binop(op, mkexpr(max32), mkexpr(max10))); + IRTemp maxFEDCBA98 = newTemp(Ity_V128); + IRTemp max76543210 = newTemp(Ity_V128); + assign(maxFEDCBA98, binop(op, mkexpr(maxFEDC), mkexpr(maxBA98))); + assign(max76543210, binop(op, mkexpr(max7654), mkexpr(max3210))); + IRTemp maxAllLanes = newTemp(Ity_V128); + assign(maxAllLanes, binop(op, mkexpr(maxFEDCBA98), + mkexpr(max76543210))); + IRTemp res = newTemp(Ity_V128); + assign(res, unop(Iop_ZeroHI120ofV128, mkexpr(maxAllLanes))); + return res; + } + case Iop_Min16Sx8: case Iop_Min16Ux8: + case Iop_Max16Sx8: case Iop_Max16Ux8: { + IRTemp x76543210 = src; + IRTemp x76547654 = newTemp(Ity_V128); + IRTemp x32103210 = newTemp(Ity_V128); + assign(x76547654, mk_CatOddLanes64x2 (x76543210, x76543210)); + assign(x32103210, mk_CatEvenLanes64x2(x76543210, x76543210)); + IRTemp x76767676 = newTemp(Ity_V128); + IRTemp x54545454 = newTemp(Ity_V128); + IRTemp x32323232 = newTemp(Ity_V128); + IRTemp x10101010 = newTemp(Ity_V128); + assign(x76767676, mk_CatOddLanes32x4 (x76547654, x76547654)); + assign(x54545454, mk_CatEvenLanes32x4(x76547654, x76547654)); + assign(x32323232, mk_CatOddLanes32x4 (x32103210, x32103210)); + assign(x10101010, mk_CatEvenLanes32x4(x32103210, x32103210)); + IRTemp x77777777 = newTemp(Ity_V128); + IRTemp x66666666 = newTemp(Ity_V128); + IRTemp x55555555 = newTemp(Ity_V128); + IRTemp x44444444 = newTemp(Ity_V128); + IRTemp x33333333 = newTemp(Ity_V128); + IRTemp x22222222 = newTemp(Ity_V128); + IRTemp x11111111 = newTemp(Ity_V128); + IRTemp x00000000 = newTemp(Ity_V128); + assign(x77777777, mk_CatOddLanes16x8 (x76767676, x76767676)); + assign(x66666666, mk_CatEvenLanes16x8(x76767676, x76767676)); + assign(x55555555, mk_CatOddLanes16x8 (x54545454, x54545454)); + assign(x44444444, mk_CatEvenLanes16x8(x54545454, x54545454)); + assign(x33333333, mk_CatOddLanes16x8 (x32323232, x32323232)); + assign(x22222222, mk_CatEvenLanes16x8(x32323232, x32323232)); + assign(x11111111, mk_CatOddLanes16x8 (x10101010, x10101010)); + assign(x00000000, mk_CatEvenLanes16x8(x10101010, x10101010)); + IRTemp max76 = newTemp(Ity_V128); + IRTemp max54 = newTemp(Ity_V128); + IRTemp max32 = newTemp(Ity_V128); + IRTemp max10 = newTemp(Ity_V128); + assign(max76, binop(op, mkexpr(x77777777), mkexpr(x66666666))); + assign(max54, binop(op, mkexpr(x55555555), mkexpr(x44444444))); + assign(max32, binop(op, mkexpr(x33333333), mkexpr(x22222222))); + assign(max10, binop(op, mkexpr(x11111111), mkexpr(x00000000))); + IRTemp max7654 = newTemp(Ity_V128); + IRTemp max3210 = newTemp(Ity_V128); + assign(max7654, binop(op, mkexpr(max76), mkexpr(max54))); + assign(max3210, binop(op, mkexpr(max32), mkexpr(max10))); + IRTemp max76543210 = newTemp(Ity_V128); + assign(max76543210, binop(op, mkexpr(max7654), mkexpr(max3210))); + IRTemp res = newTemp(Ity_V128); + assign(res, unop(Iop_ZeroHI112ofV128, mkexpr(max76543210))); + return res; + } + case Iop_Min32Sx4: case Iop_Min32Ux4: + case 
Iop_Max32Sx4: case Iop_Max32Ux4: { + IRTemp x3210 = src; + IRTemp x3232 = newTemp(Ity_V128); + IRTemp x1010 = newTemp(Ity_V128); + assign(x3232, mk_CatOddLanes64x2 (x3210, x3210)); + assign(x1010, mk_CatEvenLanes64x2(x3210, x3210)); + IRTemp x3333 = newTemp(Ity_V128); + IRTemp x2222 = newTemp(Ity_V128); + IRTemp x1111 = newTemp(Ity_V128); + IRTemp x0000 = newTemp(Ity_V128); + assign(x3333, mk_CatOddLanes32x4 (x3232, x3232)); + assign(x2222, mk_CatEvenLanes32x4(x3232, x3232)); + assign(x1111, mk_CatOddLanes32x4 (x1010, x1010)); + assign(x0000, mk_CatEvenLanes32x4(x1010, x1010)); + IRTemp max32 = newTemp(Ity_V128); + IRTemp max10 = newTemp(Ity_V128); + assign(max32, binop(op, mkexpr(x3333), mkexpr(x2222))); + assign(max10, binop(op, mkexpr(x1111), mkexpr(x0000))); + IRTemp max3210 = newTemp(Ity_V128); + assign(max3210, binop(op, mkexpr(max32), mkexpr(max10))); + IRTemp res = newTemp(Ity_V128); + assign(res, unop(Iop_ZeroHI96ofV128, mkexpr(max3210))); + return res; + } + default: + vassert(0); + } +} + + +/* Generate IR for TBL and TBX. This deals with the 128 bit case + only. */ +static IRTemp math_TBL_TBX ( IRTemp tab[4], UInt len, IRTemp src, + IRTemp oor_values ) +{ + vassert(len >= 0 && len <= 3); + + /* Generate some useful constants as concisely as possible. */ + IRTemp half15 = newTemp(Ity_I64); + assign(half15, mkU64(0x0F0F0F0F0F0F0F0FULL)); + IRTemp half16 = newTemp(Ity_I64); + assign(half16, mkU64(0x1010101010101010ULL)); + + /* A zero vector */ + IRTemp allZero = newTemp(Ity_V128); + assign(allZero, mkV128(0x0000)); + /* A vector containing 15 in each 8-bit lane */ + IRTemp all15 = newTemp(Ity_V128); + assign(all15, binop(Iop_64HLtoV128, mkexpr(half15), mkexpr(half15))); + /* A vector containing 16 in each 8-bit lane */ + IRTemp all16 = newTemp(Ity_V128); + assign(all16, binop(Iop_64HLtoV128, mkexpr(half16), mkexpr(half16))); + /* A vector containing 32 in each 8-bit lane */ + IRTemp all32 = newTemp(Ity_V128); + assign(all32, binop(Iop_Add8x16, mkexpr(all16), mkexpr(all16))); + /* A vector containing 48 in each 8-bit lane */ + IRTemp all48 = newTemp(Ity_V128); + assign(all48, binop(Iop_Add8x16, mkexpr(all16), mkexpr(all32))); + /* A vector containing 64 in each 8-bit lane */ + IRTemp all64 = newTemp(Ity_V128); + assign(all64, binop(Iop_Add8x16, mkexpr(all32), mkexpr(all32))); + + /* Group the 16/32/48/64 vectors so as to be indexable. */ + IRTemp allXX[4] = { all16, all32, all48, all64 }; + + /* Compute the result for each table vector, with zeroes in places + where the index values are out of range, and OR them into the + running vector. */ + IRTemp running_result = newTemp(Ity_V128); + assign(running_result, mkV128(0)); + + UInt tabent; + for (tabent = 0; tabent <= len; tabent++) { + vassert(tabent >= 0 && tabent < 4); + IRTemp bias = newTemp(Ity_V128); + assign(bias, + mkexpr(tabent == 0 ? 
allZero : allXX[tabent-1])); + IRTemp biased_indices = newTemp(Ity_V128); + assign(biased_indices, + binop(Iop_Sub8x16, mkexpr(src), mkexpr(bias))); + IRTemp valid_mask = newTemp(Ity_V128); + assign(valid_mask, + binop(Iop_CmpGT8Ux16, mkexpr(all16), mkexpr(biased_indices))); + IRTemp safe_biased_indices = newTemp(Ity_V128); + assign(safe_biased_indices, + binop(Iop_AndV128, mkexpr(biased_indices), mkexpr(all15))); + IRTemp results_or_junk = newTemp(Ity_V128); + assign(results_or_junk, + binop(Iop_Perm8x16, mkexpr(tab[tabent]), + mkexpr(safe_biased_indices))); + IRTemp results_or_zero = newTemp(Ity_V128); + assign(results_or_zero, + binop(Iop_AndV128, mkexpr(results_or_junk), mkexpr(valid_mask))); + /* And OR that into the running result. */ + IRTemp tmp = newTemp(Ity_V128); + assign(tmp, binop(Iop_OrV128, mkexpr(results_or_zero), + mkexpr(running_result))); + running_result = tmp; + } + + /* So now running_result holds the overall result where the indices + are in range, and zero in out-of-range lanes. Now we need to + compute an overall validity mask and use this to copy in the + lanes in the oor_values for out of range indices. This is + unnecessary for TBL but will get folded out by iropt, so we lean + on that and generate the same code for TBL and TBX here. */ + IRTemp overall_valid_mask = newTemp(Ity_V128); + assign(overall_valid_mask, + binop(Iop_CmpGT8Ux16, mkexpr(allXX[len]), mkexpr(src))); + IRTemp result = newTemp(Ity_V128); + assign(result, + binop(Iop_OrV128, + mkexpr(running_result), + binop(Iop_AndV128, + mkexpr(oor_values), + unop(Iop_NotV128, mkexpr(overall_valid_mask))))); + return result; +} + + +static +Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) +{ +# define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin)) + + /* ---------------- FMOV (general) ---------------- */ + /* case 30 23 20 18 15 9 4 + (1) 0 00 11110 00 1 00 111 000000 n d FMOV Sd, Wn + (2) 1 00 11110 01 1 00 111 000000 n d FMOV Dd, Xn + (3) 1 00 11110 10 1 01 111 000000 n d FMOV Vd.D[1], Xn + + (4) 0 00 11110 00 1 00 110 000000 n d FMOV Wd, Sn + (5) 1 00 11110 01 1 00 110 000000 n d FMOV Xd, Dn + (6) 1 00 11110 10 1 01 110 000000 n d FMOV Xd, Vn.D[1] + */ + if (INSN(30,24) == BITS7(0,0,1,1,1,1,0) + && INSN(21,21) == 1 && INSN(15,10) == BITS6(0,0,0,0,0,0)) { + UInt sf = INSN(31,31); + UInt ty = INSN(23,22); // type + UInt rm = INSN(20,19); // rmode + UInt op = INSN(18,16); // opcode + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + UInt ix = 0; // case + if (sf == 0) { + if (ty == BITS2(0,0) && rm == BITS2(0,0) && op == BITS3(1,1,1)) + ix = 1; + else + if (ty == BITS2(0,0) && rm == BITS2(0,0) && op == BITS3(1,1,0)) + ix = 4; + } else { + vassert(sf == 1); + if (ty == BITS2(0,1) && rm == BITS2(0,0) && op == BITS3(1,1,1)) + ix = 2; + else + if (ty == BITS2(0,1) && rm == BITS2(0,0) && op == BITS3(1,1,0)) + ix = 5; + else + if (ty == BITS2(1,0) && rm == BITS2(0,1) && op == BITS3(1,1,1)) + ix = 3; + else + if (ty == BITS2(1,0) && rm == BITS2(0,1) && op == BITS3(1,1,0)) + ix = 6; + } + if (ix > 0) { + switch (ix) { + case 1: + putQReg128(dd, mkV128(0)); + putQRegLO(dd, getIReg32orZR(nn)); + DIP("fmov s%u, w%u\n", dd, nn); + break; + case 2: + putQReg128(dd, mkV128(0)); + putQRegLO(dd, getIReg64orZR(nn)); + DIP("fmov d%u, x%u\n", dd, nn); + break; + case 3: + putQRegHI64(dd, getIReg64orZR(nn)); + DIP("fmov v%u.d[1], x%u\n", dd, nn); + break; + case 4: + putIReg32orZR(dd, getQRegLO(nn, Ity_I32)); + DIP("fmov w%u, s%u\n", dd, nn); + break; + case 5: + putIReg64orZR(dd, getQRegLO(nn, Ity_I64)); + 
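
math_TBL_TBX needs the per-register bias/compare/mask sequence because Iop_Perm8x16 can only index within a single 128-bit register, so each table register is consulted separately and the partial results are ORed together. Functionally the whole thing is just a byte table lookup with out-of-range handling; a scalar reference (illustrative helper, not in the tree):

   #include <stdint.h>

   /* nTab = len+1 table registers (1..4); 'tab' holds their bytes
      concatenated, lowest register first.  For TBL, oor[] is all
      zeroes; for TBX it is the previous destination value. */
   static void tbl_tbx_ref ( uint8_t dst[16],
                             const uint8_t tab[64], unsigned nTab,
                             const uint8_t idx[16], const uint8_t oor[16] )
   {
      for (int i = 0; i < 16; i++) {
         unsigned ix = idx[i];
         dst[i] = (ix < 16u * nTab) ? tab[ix] : oor[i];
      }
   }
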
DIP("fmov x%u, d%u\n", dd, nn); + break; + case 6: + putIReg64orZR(dd, getQRegHI64(nn)); + DIP("fmov x%u, v%u.d[1]\n", dd, nn); + break; + default: + vassert(0); + } + return True; + } + /* undecodable; fall through */ + } + + /* -------------- FMOV (scalar, immediate) -------------- */ + /* 31 28 23 20 12 9 4 + 000 11110 00 1 imm8 100 00000 d FMOV Sd, #imm + 000 11110 01 1 imm8 100 00000 d FMOV Dd, #imm + */ + if (INSN(31,23) == BITS9(0,0,0,1,1,1,1,0,0) + && INSN(21,21) == 1 && INSN(12,5) == BITS8(1,0,0,0,0,0,0,0)) { + Bool isD = INSN(22,22) == 1; + UInt imm8 = INSN(20,13); + UInt dd = INSN(4,0); + ULong imm = VFPExpandImm(imm8, isD ? 64 : 32); + if (!isD) { + vassert(0 == (imm & 0xFFFFFFFF00000000ULL)); + } + putQReg128(dd, mkV128(0)); + putQRegLO(dd, isD ? mkU64(imm) : mkU32(imm & 0xFFFFFFFFULL)); + DIP("fmov %s, #0x%llx\n", + nameQRegLO(dd, isD ? Ity_F64 : Ity_F32), imm); + return True; + } + + /* -------------- {FMOV,MOVI} (vector, immediate) -------------- */ + /* 31 28 18 15 11 9 4 + 0q op 01111 00000 abc cmode 01 defgh d MOV Dd, #imm (q=0) + MOV Vd.2d #imm (q=1) + Allowable op:cmode + FMOV = 1:1111 + MOVI = 0:xx00, 1:0x00, 1:10x0, 1:110x, 11110 + */ + if (INSN(31,31) == 0 + && INSN(28,19) == BITS10(0,1,1,1,1,0,0,0,0,0) + && INSN(11,10) == BITS2(0,1)) { + UInt bitQ = INSN(30,30); + UInt bitOP = INSN(29,29); + UInt cmode = INSN(15,12); + UInt imm8 = (INSN(18,16) << 5) | INSN(9,5); + UInt dd = INSN(4,0); + ULong imm64lo = 0; + UInt op_cmode = (bitOP << 4) | cmode; + Bool ok = False; + switch (op_cmode) { + case BITS5(1,1,1,1,1): // 1:1111 + case BITS5(0,0,0,0,0): case BITS5(0,0,1,0,0): + case BITS5(0,1,0,0,0): case BITS5(0,1,1,0,0): // 0:xx00 + case BITS5(1,0,0,0,0): case BITS5(1,0,1,0,0): // 1:0x00 + case BITS5(1,1,0,0,0): case BITS5(1,1,0,1,0): // 1:10x0 + case BITS5(1,1,1,0,0): case BITS5(1,1,1,0,1): // 1:110x + case BITS5(1,1,1,1,0): // 1:1110 + ok = True; break; + default: + break; + } + if (ok) { + ok = AdvSIMDExpandImm(&imm64lo, bitOP, cmode, imm8); + } + if (ok) { + ULong imm64hi = (bitQ == 0 && bitOP == 0) ? 0 : imm64lo; + putQReg128(dd, binop(Iop_64HLtoV128, mkU64(imm64hi), mkU64(imm64lo))); + DIP("mov %s, #0x%016llx'%016llx\n", nameQReg128(dd), imm64hi, imm64lo); + return True; + } + /* else fall through */ + } + + /* -------------- {S,U}CVTF (scalar, integer) -------------- */ + /* 31 28 23 21 20 18 15 9 4 ix + 000 11110 00 1 00 010 000000 n d SCVTF Sd, Wn 0 + 000 11110 01 1 00 010 000000 n d SCVTF Dd, Wn 1 + 100 11110 00 1 00 010 000000 n d SCVTF Sd, Xn 2 + 100 11110 01 1 00 010 000000 n d SCVTF Dd, Xn 3 + + 000 11110 00 1 00 011 000000 n d UCVTF Sd, Wn 4 + 000 11110 01 1 00 011 000000 n d UCVTF Dd, Wn 5 + 100 11110 00 1 00 011 000000 n d UCVTF Sd, Xn 6 + 100 11110 01 1 00 011 000000 n d UCVTF Dd, Xn 7 + + These are signed/unsigned conversion from integer registers to + FP registers, all 4 32/64-bit combinations, rounded per FPCR. + */ + if (INSN(30,23) == BITS8(0,0,1,1,1,1,0,0) && INSN(21,17) == BITS5(1,0,0,0,1) + && INSN(15,10) == BITS6(0,0,0,0,0,0)) { + Bool isI64 = INSN(31,31) == 1; + Bool isF64 = INSN(22,22) == 1; + Bool isU = INSN(16,16) == 1; + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + UInt ix = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0); + const IROp ops[8] + = { Iop_I32StoF32, Iop_I32StoF64, Iop_I64StoF32, Iop_I64StoF64, + Iop_I32UtoF32, Iop_I32UtoF64, Iop_I64UtoF32, Iop_I64UtoF64 }; + IRExpr* src = getIRegOrZR(isI64, nn); + IRExpr* res = (isF64 && !isI64) + ? 
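
VFPExpandImm itself is defined elsewhere in the tree; for orientation, here is a sketch of the usual ARM ARM bit layout it is assumed to implement (illustrative only, returning the raw bit pattern just as the decode above expects; the single-precision result sits in the low 32 bits, consistent with the vassert on the upper half):

   #include <stdint.h>

   static uint64_t vfp_expand_imm_sketch ( uint32_t imm8, int is64 )
   {
      uint64_t sign = (imm8 >> 7) & 1;
      uint64_t b6   = (imm8 >> 6) & 1;
      uint64_t b54  = (imm8 >> 4) & 3;          /* imm8<5:4> */
      uint64_t frac =  imm8 & 0xF;              /* imm8<3:0> */
      if (is64) {
         /* exp = NOT(b6) : Replicate(b6,8) : imm8<5:4>,  frac : Zeros(48) */
         uint64_t exp = ((b6 ^ 1) << 10) | ((b6 ? 0xFFull : 0) << 2) | b54;
         return (sign << 63) | (exp << 52) | (frac << 48);
      } else {
         /* exp = NOT(b6) : Replicate(b6,5) : imm8<5:4>,  frac : Zeros(19) */
         uint64_t exp = ((b6 ^ 1) << 7) | ((b6 ? 0x1Full : 0) << 2) | b54;
         return (sign << 31) | (exp << 23) | (frac << 19);
      }
   }

For example, imm8 == 0x70 expands to 0x3FF0000000000000 in the double case, i.e. 1.0.
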
unop(ops[ix], src) + : binop(ops[ix], mkexpr(mk_get_IR_rounding_mode()), src); + putQReg128(dd, mkV128(0)); + putQRegLO(dd, res); + DIP("%ccvtf %s, %s\n", + isU ? 'u' : 's', nameQRegLO(dd, isF64 ? Ity_F64 : Ity_F32), + nameIRegOrZR(isI64, nn)); + return True; + } + + /* ------------ F{ADD,SUB,MUL,DIV,NMUL} (scalar) ------------ */ + /* 31 23 20 15 11 9 4 + ---------------- 0000 ------ FMUL -------- + 000 11110 001 m 0001 10 n d FDIV Sd,Sn,Sm + 000 11110 011 m 0001 10 n d FDIV Dd,Dn,Dm + ---------------- 0010 ------ FADD -------- + ---------------- 0011 ------ FSUB -------- + ---------------- 1000 ------ FNMUL -------- + */ + if (INSN(31,23) == BITS9(0,0,0,1,1,1,1,0,0) + && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) { + Bool isD = INSN(22,22) == 1; + UInt mm = INSN(20,16); + UInt op = INSN(15,12); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + IROp iop = Iop_INVALID; + IRType ty = isD ? Ity_F64 : Ity_F32; + Bool neg = False; + const HChar* nm = "???"; + switch (op) { + case BITS4(0,0,0,0): nm = "fmul"; iop = mkMULF(ty); break; + case BITS4(0,0,0,1): nm = "fdiv"; iop = mkDIVF(ty); break; + case BITS4(0,0,1,0): nm = "fadd"; iop = mkADDF(ty); break; + case BITS4(0,0,1,1): nm = "fsub"; iop = mkSUBF(ty); break; + case BITS4(1,0,0,0): nm = "fnmul"; iop = mkMULF(ty); + neg = True; break; + default: return False; + } + vassert(iop != Iop_INVALID); + IRExpr* resE = triop(iop, mkexpr(mk_get_IR_rounding_mode()), + getQRegLO(nn, ty), getQRegLO(mm, ty)); + IRTemp res = newTemp(ty); + assign(res, neg ? unop(mkNEGF(ty),resE) : resE); + putQReg128(dd, mkV128(0)); + putQRegLO(dd, mkexpr(res)); + DIP("%s %s, %s, %s\n", + nm, nameQRegLO(dd, ty), nameQRegLO(nn, ty), nameQRegLO(mm, ty)); + return True; + } + + /* ------------ F{MOV,ABS,NEG,SQRT} D/D or S/S ------------ */ + /* 31 23 21 16 14 9 4 + 000 11110 00 10000 00 10000 n d FMOV Sd, Sn + 000 11110 01 10000 00 10000 n d FMOV Dd, Dn + ------------------ 01 --------- FABS ------ + ------------------ 10 --------- FNEG ------ + ------------------ 11 --------- FSQRT ----- + */ + if (INSN(31,23) == BITS9(0,0,0,1,1,1,1,0,0) + && INSN(21,17) == BITS5(1,0,0,0,0) + && INSN(14,10) == BITS5(1,0,0,0,0)) { + Bool isD = INSN(22,22) == 1; + UInt opc = INSN(16,15); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + IRType ty = isD ? Ity_F64 : Ity_F32; + IRTemp res = newTemp(ty); + if (opc == BITS2(0,0)) { + assign(res, getQRegLO(nn, ty)); + putQReg128(dd, mkV128(0x0000)); + putQRegLO(dd, mkexpr(res)); + DIP("fmov %s, %s\n", + nameQRegLO(dd, ty), nameQRegLO(nn, ty)); + return True; + } + if (opc == BITS2(1,0) || opc == BITS2(0,1)) { + Bool isAbs = opc == BITS2(0,1); + IROp op = isAbs ? mkABSF(ty) : mkNEGF(ty); + assign(res, unop(op, getQRegLO(nn, ty))); + putQReg128(dd, mkV128(0x0000)); + putQRegLO(dd, mkexpr(res)); + DIP("%s %s, %s\n", isAbs ? 
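
The (isF64 && !isI64) special case exists because a 32-bit integer always converts exactly to an F64 (a double's 53-bit significand holds any 32-bit value), so Iop_I32StoF64 / Iop_I32UtoF64 are unops with no rounding-mode argument, while the other six conversions can round and therefore take one. A quick standalone check of that claim (not VEX code):

   #include <stdio.h>
   #include <stdint.h>

   int main ( void )
   {
      int32_t x = 0x7FFFFFFF;
      printf("%.1f\n", (double)x);             /* 2147483647.0 -- exact    */
      printf("%.1f\n", (double)(float)x);      /* 2147483648.0 -- rounded  */
      int64_t y = (1LL << 53) + 1;             /* not representable in F64 */
      printf("%lld\n", (long long)(double)y);  /* 9007199254740992 -- rounded */
      return 0;
   }
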
"fabs" : "fneg", + nameQRegLO(dd, ty), nameQRegLO(nn, ty)); + return True; + } + if (opc == BITS2(1,1)) { + assign(res, + binop(mkSQRTF(ty), + mkexpr(mk_get_IR_rounding_mode()), getQRegLO(nn, ty))); + putQReg128(dd, mkV128(0x0000)); + putQRegLO(dd, mkexpr(res)); + DIP("fsqrt %s, %s\n", nameQRegLO(dd, ty), nameQRegLO(nn, ty)); + return True; + } + /* else fall through; other cases are ATC */ + } + + /* ---------------- F{ABS,NEG} (vector) ---------------- */ + /* 31 28 22 21 16 9 4 + 0q0 01110 1 sz 10000 01111 10 n d FABS Vd.T, Vn.T + 0q1 01110 1 sz 10000 01111 10 n d FNEG Vd.T, Vn.T + */ + if (INSN(31,31) == 0 && INSN(28,23) == BITS6(0,1,1,1,0,1) + && INSN(21,17) == BITS5(1,0,0,0,0) + && INSN(16,10) == BITS7(0,1,1,1,1,1,0)) { + UInt bitQ = INSN(30,30); + UInt bitSZ = INSN(22,22); + Bool isFNEG = INSN(29,29) == 1; + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + const HChar* ar = "??"; + IRType tyF = Ity_INVALID; + Bool zeroHI = False; + Bool ok = getLaneInfo_Q_SZ(NULL, &tyF, NULL, &zeroHI, &ar, + (Bool)bitQ, (Bool)bitSZ); + if (ok) { + vassert(tyF == Ity_F64 || tyF == Ity_F32); + IROp op = (tyF == Ity_F64) ? (isFNEG ? Iop_Neg64Fx2 : Iop_Abs64Fx2) + : (isFNEG ? Iop_Neg32Fx4 : Iop_Abs32Fx4); + IRTemp res = newTemp(Ity_V128); + assign(res, unop(op, getQReg128(nn))); + putQReg128(dd, zeroHI ? unop(Iop_ZeroHI64ofV128, mkexpr(res)) + : mkexpr(res)); + DIP("%s %s.%s, %s.%s\n", isFNEG ? "fneg" : "fabs", + nameQReg128(dd), ar, nameQReg128(nn), ar); + return True; + } + /* else fall through */ + } + + /* -------------------- FCMP,FCMPE -------------------- */ + /* 31 23 20 15 9 4 + 000 11110 01 1 m 00 1000 n 10 000 FCMPE Dn, Dm + 000 11110 01 1 00000 00 1000 n 11 000 FCMPE Dn, #0.0 + 000 11110 01 1 m 00 1000 n 00 000 FCMP Dn, Dm + 000 11110 01 1 00000 00 1000 n 01 000 FCMP Dn, #0.0 + + 000 11110 00 1 m 00 1000 n 10 000 FCMPE Sn, Sm + 000 11110 00 1 00000 00 1000 n 11 000 FCMPE Sn, #0.0 + 000 11110 00 1 m 00 1000 n 00 000 FCMP Sn, Sm + 000 11110 00 1 00000 00 1000 n 01 000 FCMP Sn, #0.0 + + FCMPE generates Invalid Operation exn if either arg is any kind + of NaN. FCMP generates Invalid Operation exn if either arg is a + signalling NaN. We ignore this detail here and produce the same + IR for both. + */ + if (INSN(31,23) == BITS9(0,0,0,1,1,1,1,0,0) && INSN(21,21) == 1 + && INSN(15,10) == BITS6(0,0,1,0,0,0) && INSN(2,0) == BITS3(0,0,0)) { + Bool isD = INSN(22,22) == 1; + UInt mm = INSN(20,16); + UInt nn = INSN(9,5); + Bool isCMPE = INSN(4,4) == 1; + Bool cmpZero = INSN(3,3) == 1; + IRType ty = isD ? Ity_F64 : Ity_F32; + Bool valid = True; + if (cmpZero && mm != 0) valid = False; + if (valid) { + IRTemp argL = newTemp(ty); + IRTemp argR = newTemp(ty); + IRTemp irRes = newTemp(Ity_I32); + assign(argL, getQRegLO(nn, ty)); + assign(argR, + cmpZero + ? (IRExpr_Const(isD ? IRConst_F64i(0) : IRConst_F32i(0))) + : getQRegLO(mm, ty)); + assign(irRes, binop(isD ? Iop_CmpF64 : Iop_CmpF32, + mkexpr(argL), mkexpr(argR))); + IRTemp nzcv = mk_convert_IRCmpF64Result_to_NZCV(irRes); + IRTemp nzcv_28x0 = newTemp(Ity_I64); + assign(nzcv_28x0, binop(Iop_Shl64, mkexpr(nzcv), mkU8(28))); + setFlags_COPY(nzcv_28x0); + DIP("fcmp%s %s, %s\n", isCMPE ? "e" : "", nameQRegLO(nn, ty), + cmpZero ? 
"#0.0" : nameQRegLO(mm, ty)); + return True; + } + } + + /* -------------------- F{N}M{ADD,SUB} -------------------- */ + /* 31 22 20 15 14 9 4 ix + 000 11111 0 sz 0 m 0 a n d 0 FMADD Fd,Fn,Fm,Fa + 000 11111 0 sz 0 m 1 a n d 1 FMSUB Fd,Fn,Fm,Fa + 000 11111 0 sz 1 m 0 a n d 2 FNMADD Fd,Fn,Fm,Fa + 000 11111 0 sz 1 m 1 a n d 3 FNMSUB Fd,Fn,Fm,Fa + where Fx=Dx when sz=1, Fx=Sx when sz=0 + + -----SPEC------ ----IMPL---- + fmadd a + n * m a + n * m + fmsub a + (-n) * m a - n * m + fnmadd (-a) + (-n) * m -(a + n * m) + fnmsub (-a) + n * m -(a - n * m) + */ + if (INSN(31,23) == BITS9(0,0,0,1,1,1,1,1,0)) { + Bool isD = INSN(22,22) == 1; + UInt mm = INSN(20,16); + UInt aa = INSN(14,10); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + UInt ix = (INSN(21,21) << 1) | INSN(15,15); + IRType ty = isD ? Ity_F64 : Ity_F32; + IROp opADD = mkADDF(ty); + IROp opSUB = mkSUBF(ty); + IROp opMUL = mkMULF(ty); + IROp opNEG = mkNEGF(ty); + IRTemp res = newTemp(ty); + IRExpr* eA = getQRegLO(aa, ty); + IRExpr* eN = getQRegLO(nn, ty); + IRExpr* eM = getQRegLO(mm, ty); + IRExpr* rm = mkexpr(mk_get_IR_rounding_mode()); + IRExpr* eNxM = triop(opMUL, rm, eN, eM); + switch (ix) { + case 0: assign(res, triop(opADD, rm, eA, eNxM)); break; + case 1: assign(res, triop(opSUB, rm, eA, eNxM)); break; + case 2: assign(res, unop(opNEG, triop(opADD, rm, eA, eNxM))); break; + case 3: assign(res, unop(opNEG, triop(opSUB, rm, eA, eNxM))); break; + default: vassert(0); + } + putQReg128(dd, mkV128(0x0000)); + putQRegLO(dd, mkexpr(res)); + const HChar* names[4] = { "fmadd", "fmsub", "fnmadd", "fnmsub" }; + DIP("%s %s, %s, %s, %s\n", + names[ix], nameQRegLO(dd, ty), nameQRegLO(nn, ty), + nameQRegLO(mm, ty), nameQRegLO(aa, ty)); + return True; + } + + /* -------- FCVT{N,P,M,Z}{S,U} (scalar, integer) -------- */ + /* 30 23 20 18 15 9 4 + sf 00 11110 0x 1 00 000 000000 n d FCVTNS Rd, Fn (round to + sf 00 11110 0x 1 00 001 000000 n d FCVTNU Rd, Fn nearest) + ---------------- 01 -------------- FCVTP-------- (round to +inf) + ---------------- 10 -------------- FCVTM-------- (round to -inf) + ---------------- 11 -------------- FCVTZ-------- (round to zero) + + Rd is Xd when sf==1, Wd when sf==0 + Fn is Dn when x==1, Sn when x==0 + 20:19 carry the rounding mode, using the same encoding as FPCR + */ + if (INSN(30,23) == BITS8(0,0,1,1,1,1,0,0) && INSN(21,21) == 1 + && INSN(18,17) == BITS2(0,0) && INSN(15,10) == BITS6(0,0,0,0,0,0)) { + Bool isI64 = INSN(31,31) == 1; + Bool isF64 = INSN(22,22) == 1; + UInt rm = INSN(20,19); + Bool isU = INSN(16,16) == 1; + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + /* Decide on the IR rounding mode to use. */ + IRRoundingMode irrm = 8; /*impossible*/ + HChar ch = '?'; + switch (rm) { + case BITS2(0,0): ch = 'n'; irrm = Irrm_NEAREST; break; + case BITS2(0,1): ch = 'p'; irrm = Irrm_PosINF; break; + case BITS2(1,0): ch = 'm'; irrm = Irrm_NegINF; break; + case BITS2(1,1): ch = 'z'; irrm = Irrm_ZERO; break; + default: vassert(0); + } + vassert(irrm != 8); + /* Decide on the conversion primop, based on the source size, + dest size and signedness (8 possibilities). Case coding: + F32 ->s I32 0 + F32 ->u I32 1 + F32 ->s I64 2 + F32 ->u I64 3 + F64 ->s I32 4 + F64 ->u I32 5 + F64 ->s I64 6 + F64 ->u I64 7 + */ + UInt ix = (isF64 ? 4 : 0) | (isI64 ? 2 : 0) | (isU ? 
1 : 0); + vassert(ix < 8); + const IROp ops[8] + = { Iop_F32toI32S, Iop_F32toI32U, Iop_F32toI64S, Iop_F32toI64U, + Iop_F64toI32S, Iop_F64toI32U, Iop_F64toI64S, Iop_F64toI64U }; + IROp op = ops[ix]; + // A bit of ATCery: bounce all cases we haven't seen an example of. + if (/* F32toI32S */ + (op == Iop_F32toI32S && irrm == Irrm_ZERO) /* FCVTZS Wd,Sn */ + || (op == Iop_F32toI32S && irrm == Irrm_NegINF) /* FCVTMS Wd,Sn */ + || (op == Iop_F32toI32S && irrm == Irrm_PosINF) /* FCVTPS Wd,Sn */ + /* F32toI32U */ + || (op == Iop_F32toI32U && irrm == Irrm_ZERO) /* FCVTZU Wd,Sn */ + || (op == Iop_F32toI32U && irrm == Irrm_NegINF) /* FCVTMU Wd,Sn */ + /* F32toI64S */ + || (op == Iop_F32toI64S && irrm == Irrm_ZERO) /* FCVTZS Xd,Sn */ + /* F32toI64U */ + || (op == Iop_F32toI64U && irrm == Irrm_ZERO) /* FCVTZU Xd,Sn */ + /* F64toI32S */ + || (op == Iop_F64toI32S && irrm == Irrm_ZERO) /* FCVTZS Wd,Dn */ + || (op == Iop_F64toI32S && irrm == Irrm_NegINF) /* FCVTMS Wd,Dn */ + || (op == Iop_F64toI32S && irrm == Irrm_PosINF) /* FCVTPS Wd,Dn */ + /* F64toI32U */ + || (op == Iop_F64toI32U && irrm == Irrm_ZERO) /* FCVTZU Wd,Dn */ + || (op == Iop_F64toI32U && irrm == Irrm_NegINF) /* FCVTMU Wd,Dn */ + || (op == Iop_F64toI32U && irrm == Irrm_PosINF) /* FCVTPU Wd,Dn */ + /* F64toI64S */ + || (op == Iop_F64toI64S && irrm == Irrm_ZERO) /* FCVTZS Xd,Dn */ + || (op == Iop_F64toI64S && irrm == Irrm_NegINF) /* FCVTMS Xd,Dn */ + || (op == Iop_F64toI64S && irrm == Irrm_PosINF) /* FCVTPS Xd,Dn */ + /* F64toI64U */ + || (op == Iop_F64toI64U && irrm == Irrm_ZERO) /* FCVTZU Xd,Dn */ + || (op == Iop_F64toI64U && irrm == Irrm_PosINF) /* FCVTPU Xd,Dn */ + ) { + /* validated */ + } else { + return False; + } + IRType srcTy = isF64 ? Ity_F64 : Ity_F32; + IRType dstTy = isI64 ? Ity_I64 : Ity_I32; + IRTemp src = newTemp(srcTy); + IRTemp dst = newTemp(dstTy); + assign(src, getQRegLO(nn, srcTy)); + assign(dst, binop(op, mkU32(irrm), mkexpr(src))); + putIRegOrZR(isI64, dd, mkexpr(dst)); + DIP("fcvt%c%c %s, %s\n", ch, isU ? 'u' : 's', + nameIRegOrZR(isI64, dd), nameQRegLO(nn, srcTy)); + return True; + } + + /* -------- FCVTAS (KLUDGED) (scalar, integer) -------- */ + /* 30 23 20 18 15 9 4 + 1 00 11110 0x 1 00 100 000000 n d FCVTAS Xd, Fn + 0 00 11110 0x 1 00 100 000000 n d FCVTAS Wd, Fn + Fn is Dn when x==1, Sn when x==0 + */ + if (INSN(30,23) == BITS8(0,0,1,1,1,1,0,0) + && INSN(21,16) == BITS6(1,0,0,1,0,0) + && INSN(15,10) == BITS6(0,0,0,0,0,0)) { + Bool isI64 = INSN(31,31) == 1; + Bool isF64 = INSN(22,22) == 1; + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + /* Decide on the IR rounding mode to use. */ + /* KLUDGE: should be Irrm_NEAREST_TIE_AWAY_0 */ + IRRoundingMode irrm = Irrm_NEAREST; + /* Decide on the conversion primop. */ + IROp op = isI64 ? (isF64 ? Iop_F64toI64S : Iop_F32toI64S) + : (isF64 ? Iop_F64toI32S : Iop_F32toI32S); + IRType srcTy = isF64 ? Ity_F64 : Ity_F32; + IRType dstTy = isI64 ? 
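
On the F{N}M{ADD,SUB} block above: the SPEC/IMPL table is the key detail, since the implementation negates the whole sum rather than negating the addend and the product separately. A scalar restatement of the IMPL column (the real instructions are fused, i.e. single-rounding; this mirrors the separate multiply-then-add the decoder currently emits):

   /* Double-precision restatement of the IMPL column. */
   static double fmadd_impl  ( double n, double m, double a ) { return   a + n * m;  }
   static double fmsub_impl  ( double n, double m, double a ) { return   a - n * m;  }
   static double fnmadd_impl ( double n, double m, double a ) { return -(a + n * m); }
   static double fnmsub_impl ( double n, double m, double a ) { return -(a - n * m); }
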
Ity_I64 : Ity_I32; + IRTemp src = newTemp(srcTy); + IRTemp dst = newTemp(dstTy); + assign(src, getQRegLO(nn, srcTy)); + assign(dst, binop(op, mkU32(irrm), mkexpr(src))); + putIRegOrZR(isI64, dd, mkexpr(dst)); + DIP("fcvtas %s, %s (KLUDGED)\n", + nameIRegOrZR(isI64, dd), nameQRegLO(nn, srcTy)); + return True; + } + + /* ---------------- FRINT{I,M,P,Z} (scalar) ---------------- */ + /* 31 23 21 17 14 9 4 + 000 11110 0x 1001 111 10000 n d FRINTI Fd, Fm (round per FPCR) + rm + x==0 => S-registers, x==1 => D-registers + rm (17:15) encodings: + 111 per FPCR (FRINTI) + 001 +inf (FRINTP) + 010 -inf (FRINTM) + 011 zero (FRINTZ) + 000 tieeven + 100 tieaway (FRINTA) -- !! FIXME KLUDGED !! + 110 per FPCR + "exact = TRUE" + 101 unallocated + */ + if (INSN(31,23) == BITS9(0,0,0,1,1,1,1,0,0) + && INSN(21,18) == BITS4(1,0,0,1) && INSN(14,10) == BITS5(1,0,0,0,0)) { + Bool isD = INSN(22,22) == 1; + UInt rm = INSN(17,15); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + IRType ty = isD ? Ity_F64 : Ity_F32; + IRExpr* irrmE = NULL; + UChar ch = '?'; + switch (rm) { + case BITS3(0,1,1): ch = 'z'; irrmE = mkU32(Irrm_ZERO); break; + case BITS3(0,1,0): ch = 'm'; irrmE = mkU32(Irrm_NegINF); break; + case BITS3(0,0,1): ch = 'p'; irrmE = mkU32(Irrm_PosINF); break; + // The following is a kludge. Should be: Irrm_NEAREST_TIE_AWAY_0 + case BITS3(1,0,0): ch = 'a'; irrmE = mkU32(Irrm_NEAREST); break; + default: break; + } + if (irrmE) { + IRTemp src = newTemp(ty); + IRTemp dst = newTemp(ty); + assign(src, getQRegLO(nn, ty)); + assign(dst, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt, + irrmE, mkexpr(src))); + putQReg128(dd, mkV128(0x0000)); + putQRegLO(dd, mkexpr(dst)); + DIP("frint%c %s, %s\n", + ch, nameQRegLO(dd, ty), nameQRegLO(nn, ty)); + return True; + } + /* else unhandled rounding mode case -- fall through */ + } + + /* ------------------ FCVT (scalar) ------------------ */ + /* 31 23 21 16 14 9 4 + 000 11110 11 10001 00 10000 n d FCVT Sd, Hn (unimp) + --------- 11 ----- 01 --------- FCVT Dd, Hn (unimp) + --------- 00 ----- 11 --------- FCVT Hd, Sn (unimp) + --------- 00 ----- 01 --------- FCVT Dd, Sn + --------- 01 ----- 11 --------- FCVT Hd, Dn (unimp) + --------- 01 ----- 00 --------- FCVT Sd, Dn + Rounding, when dst is smaller than src, is per the FPCR. 
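
The rm field in the FCVT{N,P,M,Z}{S,U} decode above maps onto the same four rounding directions C exposes through fenv.h, so the intended results are easy to sanity-check on the host (standalone demo, not VEX code):

   #include <fenv.h>
   #include <math.h>
   #include <stdio.h>

   #pragma STDC FENV_ACCESS ON

   int main ( void )
   {
      const double x = -2.5;
      const struct { int mode; const char* nm; } rms[4] = {
         { FE_TONEAREST,  "fcvtns" },   /* ties to even -> -2 */
         { FE_UPWARD,     "fcvtps" },   /* toward +inf  -> -2 */
         { FE_DOWNWARD,   "fcvtms" },   /* toward -inf  -> -3 */
         { FE_TOWARDZERO, "fcvtzs" }    /* toward zero  -> -2 */
      };
      for (int i = 0; i < 4; i++) {
         fesetround(rms[i].mode);
         printf("%s(%g) = %ld\n", rms[i].nm, x, lrint(x));
      }
      fesetround(FE_TONEAREST);
      return 0;
   }
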
+ */ + if (INSN(31,24) == BITS8(0,0,0,1,1,1,1,0) + && INSN(21,17) == BITS5(1,0,0,0,1) + && INSN(14,10) == BITS5(1,0,0,0,0)) { + UInt b2322 = INSN(23,22); + UInt b1615 = INSN(16,15); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + if (b2322 == BITS2(0,0) && b1615 == BITS2(0,1)) { + /* Convert S to D */ + IRTemp res = newTemp(Ity_F64); + assign(res, unop(Iop_F32toF64, getQRegLO(nn, Ity_F32))); + putQReg128(dd, mkV128(0x0000)); + putQRegLO(dd, mkexpr(res)); + DIP("fcvt %s, %s\n", + nameQRegLO(dd, Ity_F64), nameQRegLO(nn, Ity_F32)); + return True; + } + if (b2322 == BITS2(0,1) && b1615 == BITS2(0,0)) { + /* Convert D to S */ + IRTemp res = newTemp(Ity_F32); + assign(res, binop(Iop_F64toF32, mkexpr(mk_get_IR_rounding_mode()), + getQRegLO(nn, Ity_F64))); + putQReg128(dd, mkV128(0x0000)); + putQRegLO(dd, mkexpr(res)); + DIP("fcvt %s, %s\n", + nameQRegLO(dd, Ity_F32), nameQRegLO(nn, Ity_F64)); + return True; + } + /* else unhandled */ + } + + /* ------------------ FABD (scalar) ------------------ */ + /* 31 23 20 15 9 4 + 011 11110 111 m 110101 n d FABD Dd, Dn, Dm + 011 11110 101 m 110101 n d FABD Sd, Sn, Sm + */ + if (INSN(31,23) == BITS9(0,1,1,1,1,1,1,0,1) && INSN(21,21) == 1 + && INSN(15,10) == BITS6(1,1,0,1,0,1)) { + Bool isD = INSN(22,22) == 1; + UInt mm = INSN(20,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + IRType ty = isD ? Ity_F64 : Ity_F32; + IRTemp res = newTemp(ty); + assign(res, unop(mkABSF(ty), + triop(mkSUBF(ty), + mkexpr(mk_get_IR_rounding_mode()), + getQRegLO(nn,ty), getQRegLO(mm,ty)))); + putQReg128(dd, mkV128(0x0000)); + putQRegLO(dd, mkexpr(res)); + DIP("fabd %s, %s, %s\n", + nameQRegLO(dd, ty), nameQRegLO(nn, ty), nameQRegLO(mm, ty)); + return True; + } + + /* -------------- {S,U}CVTF (vector, integer) -------------- */ + /* 31 28 22 21 15 9 4 + 0q0 01110 0 sz 1 00001 110110 n d SCVTF Vd, Vn + 0q1 01110 0 sz 1 00001 110110 n d UCVTF Vd, Vn + with laneage: + case sz:Q of 00 -> 2S, zero upper, 01 -> 4S, 10 -> illegal, 11 -> 2D + */ + if (INSN(31,31) == 0 && INSN(28,23) == BITS6(0,1,1,1,0,0) + && INSN(21,16) == BITS6(1,0,0,0,0,1) + && INSN(15,10) == BITS6(1,1,0,1,1,0)) { + Bool isQ = INSN(30,30) == 1; + Bool isU = INSN(29,29) == 1; + Bool isF64 = INSN(22,22) == 1; + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + if (isQ || !isF64) { + IRType tyF = Ity_INVALID, tyI = Ity_INVALID; + UInt nLanes = 0; + Bool zeroHI = False; + const HChar* arrSpec = NULL; + Bool ok = getLaneInfo_Q_SZ(&tyI, &tyF, &nLanes, &zeroHI, &arrSpec, + isQ, isF64 ); + IROp op = isU ? (isF64 ? Iop_I64UtoF64 : Iop_I32UtoF32) + : (isF64 ? Iop_I64StoF64 : Iop_I32StoF32); + IRTemp rm = mk_get_IR_rounding_mode(); + UInt i; + vassert(ok); /* the 'if' above should ensure this */ + for (i = 0; i < nLanes; i++) { + putQRegLane(dd, i, + binop(op, mkexpr(rm), getQRegLane(nn, i, tyI))); + } + if (zeroHI) { + putQRegLane(dd, 1, mkU64(0)); + } + DIP("%ccvtf %s.%s, %s.%s\n", isU ? 
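
Regarding the two KLUDGE notes above (FCVTAS and the FRINTA case of FRINT): the architected rounding is ties-away-from-zero, and substituting Irrm_NEAREST (ties-to-even) only goes wrong on exact halfway cases. C's round() vs nearbyint() shows exactly which inputs are affected:

   #include <math.h>
   #include <stdio.h>

   int main ( void )
   {
      const double xs[5] = { 0.5, 1.5, 2.5, -0.5, -2.5 };
      for (int i = 0; i < 5; i++) {
         printf("x=%5.1f  ties-away=%5.1f  ties-even=%5.1f\n",
                xs[i],
                round(xs[i]),        /* what FCVTAS/FRINTA should do */
                nearbyint(xs[i]));   /* what the kludge does, assuming the
                                        default rounding mode */
      }
      return 0;
   }
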
'u' : 's', + nameQReg128(dd), arrSpec, nameQReg128(nn), arrSpec); + return True; + } + /* else fall through */ + } + + /* ---------- F{ADD,SUB,MUL,DIV,MLA,MLS} (vector) ---------- */ + /* 31 28 22 21 20 15 9 4 case + 0q0 01110 0 sz 1 m 110101 n d FADD Vd,Vn,Vm 1 + 0q0 01110 1 sz 1 m 110101 n d FSUB Vd,Vn,Vm 2 + 0q1 01110 0 sz 1 m 110111 n d FMUL Vd,Vn,Vm 3 + 0q1 01110 0 sz 1 m 111111 n d FDIV Vd,Vn,Vm 4 + 0q0 01110 0 sz 1 m 110011 n d FMLA Vd,Vn,Vm 5 + 0q0 01110 1 sz 1 m 110011 n d FMLS Vd,Vn,Vm 6 + 0q1 01110 1 sz 1 m 110101 n d FABD Vd,Vn,Vm 7 + */ + if (INSN(31,31) == 0 + && INSN(28,24) == BITS5(0,1,1,1,0) && INSN(21,21) == 1) { + Bool isQ = INSN(30,30) == 1; + UInt b29 = INSN(29,29); + UInt b23 = INSN(23,23); + Bool isF64 = INSN(22,22) == 1; + UInt mm = INSN(20,16); + UInt b1510 = INSN(15,10); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + UInt ix = 0; + /**/ if (b29 == 0 && b23 == 0 && b1510 == BITS6(1,1,0,1,0,1)) ix = 1; + else if (b29 == 0 && b23 == 1 && b1510 == BITS6(1,1,0,1,0,1)) ix = 2; + else if (b29 == 1 && b23 == 0 && b1510 == BITS6(1,1,0,1,1,1)) ix = 3; + else if (b29 == 1 && b23 == 0 && b1510 == BITS6(1,1,1,1,1,1)) ix = 4; + else if (b29 == 0 && b23 == 0 && b1510 == BITS6(1,1,0,0,1,1)) ix = 5; + else if (b29 == 0 && b23 == 1 && b1510 == BITS6(1,1,0,0,1,1)) ix = 6; + else if (b29 == 1 && b23 == 1 && b1510 == BITS6(1,1,0,1,0,1)) ix = 7; + IRType laneTy = Ity_INVALID; + Bool zeroHI = False; + const HChar* arr = "??"; + Bool ok + = getLaneInfo_Q_SZ(NULL, &laneTy, NULL, &zeroHI, &arr, isQ, isF64); + /* Skip MLA/MLS for the time being */ + if (ok && ix >= 1 && ix <= 4) { + const IROp ops64[4] + = { Iop_Add64Fx2, Iop_Sub64Fx2, Iop_Mul64Fx2, Iop_Div64Fx2 }; + const IROp ops32[4] + = { Iop_Add32Fx4, Iop_Sub32Fx4, Iop_Mul32Fx4, Iop_Div32Fx4 }; + const HChar* names[4] + = { "fadd", "fsub", "fmul", "fdiv" }; + IROp op = laneTy==Ity_F64 ? ops64[ix-1] : ops32[ix-1]; + IRTemp rm = mk_get_IR_rounding_mode(); + IRTemp t1 = newTemp(Ity_V128); + IRTemp t2 = newTemp(Ity_V128); + assign(t1, triop(op, mkexpr(rm), getQReg128(nn), getQReg128(mm))); + assign(t2, zeroHI ? unop(Iop_ZeroHI64ofV128, mkexpr(t1)) + : mkexpr(t1)); + putQReg128(dd, mkexpr(t2)); + DIP("%s %s.%s, %s.%s, %s.%s\n", names[ix-1], + nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr); + return True; + } + if (ok && ix >= 5 && ix <= 6) { + IROp opADD = laneTy==Ity_F64 ? Iop_Add64Fx2 : Iop_Add32Fx4; + IROp opSUB = laneTy==Ity_F64 ? Iop_Sub64Fx2 : Iop_Sub32Fx4; + IROp opMUL = laneTy==Ity_F64 ? Iop_Mul64Fx2 : Iop_Mul32Fx4; + IRTemp rm = mk_get_IR_rounding_mode(); + IRTemp t1 = newTemp(Ity_V128); + IRTemp t2 = newTemp(Ity_V128); + // FIXME: double rounding; use FMA primops instead + assign(t1, triop(opMUL, + mkexpr(rm), getQReg128(nn), getQReg128(mm))); + assign(t2, triop(ix == 5 ? opADD : opSUB, + mkexpr(rm), getQReg128(dd), mkexpr(t1))); + putQReg128(dd, zeroHI ? unop(Iop_ZeroHI64ofV128, mkexpr(t2)) + : mkexpr(t2)); + DIP("%s %s.%s, %s.%s, %s.%s\n", ix == 5 ? "fmla" : "fmls", + nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr); + return True; + } + if (ok && ix == 7) { + IROp opSUB = laneTy==Ity_F64 ? Iop_Sub64Fx2 : Iop_Sub32Fx4; + IROp opABS = laneTy==Ity_F64 ? Iop_Abs64Fx2 : Iop_Abs32Fx4; + IRTemp rm = mk_get_IR_rounding_mode(); + IRTemp t1 = newTemp(Ity_V128); + IRTemp t2 = newTemp(Ity_V128); + // FIXME: use Abd primop instead? + assign(t1, triop(opSUB, + mkexpr(rm), getQReg128(nn), getQReg128(mm))); + assign(t2, unop(opABS, mkexpr(t1))); + putQReg128(dd, zeroHI ? 
unop(Iop_ZeroHI64ofV128, mkexpr(t2)) + : mkexpr(t2)); + DIP("fabd %s.%s, %s.%s, %s.%s\n", + nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr); + return True; + } + } + + /* ------------ FCM{EQ,GE,GT}, FAC{GE,GT} (vector) ------------ */ + /* 31 28 22 20 15 9 4 case + 0q1 01110 0 sz 1 m 111011 n d FACGE Vd, Vn, Vm + 0q1 01110 1 sz 1 m 111011 n d FACGT Vd, Vn, Vm + 0q0 01110 0 sz 1 m 111001 n d FCMEQ Vd, Vn, Vm + 0q1 01110 0 sz 1 m 111001 n d FCMGE Vd, Vn, Vm + 0q1 01110 1 sz 1 m 111001 n d FCMGT Vd, Vn, Vm + */ + if (INSN(31,31) == 0 && INSN(28,24) == BITS5(0,1,1,1,0) && INSN(21,21) == 1 + && INSN(15,12) == BITS4(1,1,1,0) && INSN(10,10) == 1) { + Bool isQ = INSN(30,30) == 1; + UInt U = INSN(29,29); + UInt E = INSN(23,23); + Bool isF64 = INSN(22,22) == 1; + UInt ac = INSN(11,11); + UInt mm = INSN(20,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + /* */ + UInt EUac = (E << 2) | (U << 1) | ac; + IROp opABS = Iop_INVALID; + IROp opCMP = Iop_INVALID; + IRType laneTy = Ity_INVALID; + Bool zeroHI = False; + Bool swap = True; + const HChar* arr = "??"; + const HChar* nm = "??"; + Bool ok + = getLaneInfo_Q_SZ(NULL, &laneTy, NULL, &zeroHI, &arr, isQ, isF64); + if (ok) { + vassert((isF64 && laneTy == Ity_F64) || (!isF64 && laneTy == Ity_F32)); + switch (EUac) { + case BITS3(0,0,0): + nm = "fcmeq"; + opCMP = isF64 ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4; + swap = False; + break; + case BITS3(0,1,0): + nm = "fcmge"; + opCMP = isF64 ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4; + break; + case BITS3(0,1,1): + nm = "facge"; + opCMP = isF64 ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4; + opABS = isF64 ? Iop_Abs64Fx2 : Iop_Abs32Fx4; + break; + case BITS3(1,1,0): + nm = "fcmgt"; + opCMP = isF64 ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4; + break; + case BITS3(1,1,1): + nm = "fcagt"; + opCMP = isF64 ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4; + opABS = isF64 ? Iop_Abs64Fx2 : Iop_Abs32Fx4; + break; + default: + break; + } + } + if (opCMP != Iop_INVALID) { + IRExpr* argN = getQReg128(nn); + IRExpr* argM = getQReg128(mm); + if (opABS != Iop_INVALID) { + argN = unop(opABS, argN); + argM = unop(opABS, argM); + } + IRExpr* res = swap ? binop(opCMP, argM, argN) + : binop(opCMP, argN, argM); + if (zeroHI) { + res = unop(Iop_ZeroHI64ofV128, res); + } + putQReg128(dd, res); + DIP("%s %s.%s, %s.%s, %s.%s\n", nm, + nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr); + return True; + } + /* else fall through */ + } + + /* -------------------- FCVTN -------------------- */ + /* 31 28 23 20 15 9 4 + 0q0 01110 0s1 00001 011010 n d FCVTN Vd, Vn + where case q:s of 00: 16Fx4(lo) <- 32Fx4 + 01: 32Fx2(lo) <- 64Fx2 + 10: 16Fx4(hi) <- 32Fx4 + 11: 32Fx2(hi) <- 64Fx2 + Only deals with the 32Fx2 <- 64Fx2 version (s==1) + */ + if (INSN(31,31) == 0 && INSN(29,23) == BITS7(0,0,1,1,1,0,0) + && INSN(21,10) == BITS12(1,0,0,0,0,1,0,1,1,0,1,0)) { + UInt bQ = INSN(30,30); + UInt bS = INSN(22,22); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + if (bS == 1) { + IRTemp rm = mk_get_IR_rounding_mode(); + IRExpr* srcLo = getQRegLane(nn, 0, Ity_F64); + IRExpr* srcHi = getQRegLane(nn, 1, Ity_F64); + putQRegLane(dd, 2 * bQ + 0, binop(Iop_F64toF32, mkexpr(rm), srcLo)); + putQRegLane(dd, 2 * bQ + 1, binop(Iop_F64toF32, mkexpr(rm), srcHi)); + if (bQ == 0) { + putQRegLane(dd, 1, mkU64(0)); + } + DIP("fcvtn%s %s.%s, %s.2d\n", bQ ? "2" : "", + nameQReg128(dd), bQ ? 
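
On the "FIXME: double rounding; use FMA primops instead" note in the FMLA/FMLS case above: a concrete input where multiply-then-add (what is currently emitted) and a genuinely fused multiply-add (what the hardware does) disagree:

   #include <float.h>
   #include <math.h>
   #include <stdio.h>

   int main ( void )
   {
      double x = 1.0 + DBL_EPSILON;           /* 1 + 2^-52 */
      double y = 1.0 + DBL_EPSILON;
      double a = -(1.0 + 2.0 * DBL_EPSILON);
      /* Exact value of x*y + a is 2^-104.  The first rounding of x*y
         throws it away in the unfused form. */
      printf("mul-then-add : %g\n", x * y + a);      /* 0         */
      printf("fused fma    : %g\n", fma(x, y, a));   /* ~4.9e-32  */
      return 0;
   }
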
"4s" : "2s", nameQReg128(nn)); + return True; + } + /* else fall through */ + } + + /* ---------------- ADD/SUB (vector) ---------------- */ + /* 31 28 23 21 20 15 9 4 + 0q0 01110 size 1 m 100001 n d ADD Vd.T, Vn.T, Vm.T + 0q1 01110 size 1 m 100001 n d SUB Vd.T, Vn.T, Vm.T + */ + if (INSN(31,31) == 0 && INSN(28,24) == BITS5(0,1,1,1,0) + && INSN(21,21) == 1 && INSN(15,10) == BITS6(1,0,0,0,0,1)) { + Bool isQ = INSN(30,30) == 1; + UInt szBlg2 = INSN(23,22); + Bool isSUB = INSN(29,29) == 1; + UInt mm = INSN(20,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + Bool zeroHI = False; + const HChar* arrSpec = ""; + Bool ok = getLaneInfo_SIMPLE(&zeroHI, &arrSpec, isQ, szBlg2 ); + if (ok) { + const IROp opsADD[4] + = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 }; + const IROp opsSUB[4] + = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 }; + vassert(szBlg2 < 4); + IROp op = isSUB ? opsSUB[szBlg2] : opsADD[szBlg2]; + IRTemp t = newTemp(Ity_V128); + assign(t, binop(op, getQReg128(nn), getQReg128(mm))); + putQReg128(dd, zeroHI ? unop(Iop_ZeroHI64ofV128, mkexpr(t)) + : mkexpr(t)); + const HChar* nm = isSUB ? "sub" : "add"; + DIP("%s %s.%s, %s.%s, %s.%s\n", nm, + nameQReg128(dd), arrSpec, + nameQReg128(nn), arrSpec, nameQReg128(mm), arrSpec); + return True; + } + /* else fall through */ + } + + /* ---------------- ADD/SUB (scalar) ---------------- */ + /* 31 28 23 21 20 15 9 4 + 010 11110 11 1 m 100001 n d ADD Dd, Dn, Dm + 011 11110 11 1 m 100001 n d SUB Dd, Dn, Dm + */ + if (INSN(31,30) == BITS2(0,1) && INSN(28,21) == BITS8(1,1,1,1,0,1,1,1) + && INSN(15,10) == BITS6(1,0,0,0,0,1)) { + Bool isSUB = INSN(29,29) == 1; + UInt mm = INSN(20,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + IRTemp res = newTemp(Ity_I64); + assign(res, binop(isSUB ? Iop_Sub64 : Iop_Add64, + getQRegLane(nn, 0, Ity_I64), + getQRegLane(mm, 0, Ity_I64))); + putQRegLane(dd, 0, mkexpr(res)); + putQRegLane(dd, 1, mkU64(0)); + DIP("%s %s, %s, %s\n", isSUB ? "sub" : "add", + nameQRegLO(dd, Ity_I64), + nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64)); + return True; + } + + /* ------------ MUL/PMUL/MLA/MLS (vector) ------------ */ + /* 31 28 23 21 20 15 9 4 + 0q0 01110 size 1 m 100111 n d MUL Vd.T, Vn.T, Vm.T B/H/S only + 0q1 01110 size 1 m 100111 n d PMUL Vd.T, Vn.T, Vm.T B only + 0q0 01110 size 1 m 100101 n d MLA Vd.T, Vn.T, Vm.T B/H/S only + 0q1 01110 size 1 m 100101 n d MLS Vd.T, Vn.T, Vm.T B/H/S only + */ + if (INSN(31,31) == 0 && INSN(28,24) == BITS5(0,1,1,1,0) + && INSN(21,21) == 1 + && (INSN(15,10) & BITS6(1,1,1,1,0,1)) == BITS6(1,0,0,1,0,1)) { + Bool isQ = INSN(30,30) == 1; + UInt szBlg2 = INSN(23,22); + UInt bit29 = INSN(29,29); + UInt mm = INSN(20,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + Bool isMLAS = INSN(11,11) == 0; + const IROp opsADD[4] + = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4, Iop_INVALID }; + const IROp opsSUB[4] + = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_INVALID }; + const IROp opsMUL[4] + = { Iop_Mul8x16, Iop_Mul16x8, Iop_Mul32x4, Iop_INVALID }; + const IROp opsPMUL[4] + = { Iop_PolynomialMul8x16, Iop_INVALID, Iop_INVALID, Iop_INVALID }; + /* Set opMUL and, if necessary, opACC. A result value of + Iop_INVALID for opMUL indicates that the instruction is + invalid. */ + Bool zeroHI = False; + const HChar* arrSpec = ""; + Bool ok = getLaneInfo_SIMPLE(&zeroHI, &arrSpec, isQ, szBlg2 ); + vassert(szBlg2 < 4); + IROp opACC = Iop_INVALID; + IROp opMUL = Iop_INVALID; + if (ok) { + opMUL = (bit29 == 1 && !isMLAS) ? opsPMUL[szBlg2] + : opsMUL[szBlg2]; + opACC = isMLAS ? (bit29 == 1 ? 
opsSUB[szBlg2] : opsADD[szBlg2]) + : Iop_INVALID; + } + if (ok && opMUL != Iop_INVALID) { + IRTemp t1 = newTemp(Ity_V128); + assign(t1, binop(opMUL, getQReg128(nn), getQReg128(mm))); + IRTemp t2 = newTemp(Ity_V128); + assign(t2, opACC == Iop_INVALID + ? mkexpr(t1) + : binop(opACC, getQReg128(dd), mkexpr(t1))); + putQReg128(dd, zeroHI ? unop(Iop_ZeroHI64ofV128, mkexpr(t2)) + : mkexpr(t2)); + const HChar* nm = isMLAS ? (bit29 == 1 ? "mls" : "mla") + : (bit29 == 1 ? "pmul" : "mul"); + DIP("%s %s.%s, %s.%s, %s.%s\n", nm, + nameQReg128(dd), arrSpec, + nameQReg128(nn), arrSpec, nameQReg128(mm), arrSpec); + return True; + } + /* else fall through */ + } + + /* ---------------- {S,U}{MIN,MAX} (vector) ---------------- */ + /* 31 28 23 21 20 15 9 4 + 0q0 01110 size 1 m 011011 n d SMIN Vd.T, Vn.T, Vm.T + 0q1 01110 size 1 m 011011 n d UMIN Vd.T, Vn.T, Vm.T + 0q0 01110 size 1 m 011001 n d SMAX Vd.T, Vn.T, Vm.T + 0q1 01110 size 1 m 011001 n d UMAX Vd.T, Vn.T, Vm.T + */ + if (INSN(31,31) == 0 && INSN(28,24) == BITS5(0,1,1,1,0) + && INSN(21,21) == 1 + && ((INSN(15,10) & BITS6(1,1,1,1,0,1)) == BITS6(0,1,1,0,0,1))) { + Bool isQ = INSN(30,30) == 1; + Bool isU = INSN(29,29) == 1; + UInt szBlg2 = INSN(23,22); + Bool isMAX = INSN(11,11) == 0; + UInt mm = INSN(20,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + Bool zeroHI = False; + const HChar* arrSpec = ""; + Bool ok = getLaneInfo_SIMPLE(&zeroHI, &arrSpec, isQ, szBlg2 ); + if (ok) { + const IROp opMINS[4] + = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4, Iop_Min64Sx2 }; + const IROp opMINU[4] + = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4, Iop_Min64Ux2 }; + const IROp opMAXS[4] + = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4, Iop_Max64Sx2 }; + const IROp opMAXU[4] + = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4, Iop_Max64Ux2 }; + vassert(szBlg2 < 4); + IROp op = isMAX ? (isU ? opMAXU[szBlg2] : opMAXS[szBlg2]) + : (isU ? opMINU[szBlg2] : opMINS[szBlg2]); + IRTemp t = newTemp(Ity_V128); + assign(t, binop(op, getQReg128(nn), getQReg128(mm))); + putQReg128(dd, zeroHI ? unop(Iop_ZeroHI64ofV128, mkexpr(t)) + : mkexpr(t)); + const HChar* nm = isMAX ? (isU ? "umax" : "smax") + : (isU ? "umin" : "smin"); + DIP("%s %s.%s, %s.%s, %s.%s\n", nm, + nameQReg128(dd), arrSpec, + nameQReg128(nn), arrSpec, nameQReg128(mm), arrSpec); + return True; + } + /* else fall through */ + } + + /* -------------------- {S,U}{MIN,MAX}V -------------------- */ + /* 31 28 23 21 16 15 9 4 + 0q0 01110 size 11000 1 101010 n d SMINV Vd, Vn.T + 0q1 01110 size 11000 1 101010 n d UMINV Vd, Vn.T + 0q0 01110 size 11000 0 101010 n d SMAXV Vd, Vn.T + 0q1 01110 size 11000 0 101010 n d UMAXV Vd, Vn.T + */ + if (INSN(31,31) == 0 && INSN(28,24) == BITS5(0,1,1,1,0) + && INSN(21,17) == BITS5(1,1,0,0,0) + && INSN(15,10) == BITS6(1,0,1,0,1,0)) { + Bool isQ = INSN(30,30) == 1; + Bool isU = INSN(29,29) == 1; + UInt szBlg2 = INSN(23,22); + Bool isMAX = INSN(16,16) == 0; + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + Bool zeroHI = False; + const HChar* arrSpec = ""; + Bool ok = getLaneInfo_SIMPLE(&zeroHI, &arrSpec, isQ, szBlg2); + if (ok) { + if (szBlg2 == 3) ok = False; + if (szBlg2 == 2 && !isQ) ok = False; + } + if (ok) { + const IROp opMINS[3] + = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4 }; + const IROp opMINU[3] + = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4 }; + const IROp opMAXS[3] + = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4 }; + const IROp opMAXU[3] + = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4 }; + vassert(szBlg2 < 3); + IROp op = isMAX ? (isU ? opMAXU[szBlg2] : opMAXS[szBlg2]) + : (isU ? 
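
For the PMUL case above (Iop_PolynomialMul8x16): "polynomial" multiplication is carry-less multiplication over GF(2), with each 8-bit lane keeping only the low 8 bits of its 15-bit product. One lane of that, as a plain-C reference (illustrative only):

   #include <stdint.h>

   static uint8_t pmul8_ref ( uint8_t a, uint8_t b )
   {
      uint16_t acc = 0;
      for (int i = 0; i < 8; i++)
         if ((b >> i) & 1)
            acc ^= (uint16_t)a << i;   /* xor instead of add: no carries */
      return (uint8_t)acc;             /* PMUL truncates to the lane width */
   }
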
opMINU[szBlg2] : opMINS[szBlg2]); + IRTemp tN1 = newTemp(Ity_V128); + assign(tN1, getQReg128(nn)); + /* If Q == 0, we're just folding lanes in the lower half of + the value. In which case, copy the lower half of the + source into the upper half, so we can then treat it the + same as the full width case. */ + IRTemp tN2 = newTemp(Ity_V128); + assign(tN2, zeroHI ? mk_CatEvenLanes64x2(tN1,tN1) : mkexpr(tN1)); + IRTemp res = math_MINMAXV(tN2, op); + if (res == IRTemp_INVALID) + return False; /* means math_MINMAXV + doesn't handle this case yet */ + putQReg128(dd, mkexpr(res)); + const HChar* nm = isMAX ? (isU ? "umaxv" : "smaxv") + : (isU ? "uminv" : "sminv"); + const IRType tys[3] = { Ity_I8, Ity_I16, Ity_I32 }; + IRType laneTy = tys[szBlg2]; + DIP("%s %s, %s.%s\n", nm, + nameQRegLO(dd, laneTy), nameQReg128(nn), arrSpec); + return True; + } + /* else fall through */ + } + + /* ------------ {AND,BIC,ORR,ORN} (vector) ------------ */ + /* 31 28 23 20 15 9 4 + 0q0 01110 001 m 000111 n d AND Vd.T, Vn.T, Vm.T + 0q0 01110 011 m 000111 n d BIC Vd.T, Vn.T, Vm.T + 0q0 01110 101 m 000111 n d ORR Vd.T, Vn.T, Vm.T + 0q0 01110 111 m 000111 n d ORN Vd.T, Vn.T, Vm.T + T is 16b when q==1, 8b when q==0 + */ + if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,1,0) + && INSN(21,21) == 1 && INSN(15,10) == BITS6(0,0,0,1,1,1)) { + Bool isQ = INSN(30,30) == 1; + Bool isORR = INSN(23,23) == 1; + Bool invert = INSN(22,22) == 1; + UInt mm = INSN(20,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + IRTemp res = newTemp(Ity_V128); + assign(res, binop(isORR ? Iop_OrV128 : Iop_AndV128, + getQReg128(nn), + invert ? unop(Iop_NotV128, getQReg128(mm)) + : getQReg128(mm))); + putQReg128(dd, isQ ? mkexpr(res) + : unop(Iop_ZeroHI64ofV128, mkexpr(res))); + const HChar* names[4] = { "and", "bic", "orr", "orn" }; + const HChar* ar = isQ ? 
"16b" : "8b"; + DIP("%s %s.%s, %s.%s, %s.%s\n", names[INSN(23,22)], + nameQReg128(dd), ar, nameQReg128(nn), ar, nameQReg128(mm), ar); + return True; + } + + /* ---------- CM{EQ,HI,HS,GE,GT,TST,LE,LT} (vector) ---------- */ + /* 31 28 23 21 15 9 4 ix + 0q1 01110 size 1 m 100011 n d CMEQ Vd.T, Vn.T, Vm.T (1) == + 0q0 01110 size 1 m 100011 n d CMTST Vd.T, Vn.T, Vm.T (2) &, != 0 + + 0q1 01110 size 1 m 001101 n d CMHI Vd.T, Vn.T, Vm.T (3) >u + 0q0 01110 size 1 m 001101 n d CMGT Vd.T, Vn.T, Vm.T (4) >s + + 0q1 01110 size 1 m 001111 n d CMHS Vd.T, Vn.T, Vm.T (5) >=u + 0q0 01110 size 1 m 001111 n d CMGE Vd.T, Vn.T, Vm.T (6) >=s + + 0q1 01110 size 100000 100010 n d CMGE Vd.T, Vn.T, #0 (7) >=s 0 + 0q0 01110 size 100000 100010 n d CMGT Vd.T, Vn.T, #0 (8) >s 0 + + 0q1 01110 size 100000 100110 n d CMLE Vd.T, Vn.T, #0 (9) <=s 0 + 0q0 01110 size 100000 100110 n d CMEQ Vd.T, Vn.T, #0 (10) == 0 + + 0q0 01110 size 100000 101010 n d CMLT Vd.T, Vn.T, #0 (11) y can be expressed directly + x < y == y > x + x <= y == not (x > y) + x >= y == not (y > x) + */ + switch (ix) { + case 1: res = binop(opsEQ[szBlg2], argL, argR); break; + case 2: res = unop(Iop_NotV128, binop(opsEQ[szBlg2], + binop(Iop_AndV128, argL, argR), + mkV128(0x0000))); + break; + case 3: res = binop(opsGTU[szBlg2], argL, argR); break; + case 4: res = binop(opsGTS[szBlg2], argL, argR); break; + case 5: res = unop(Iop_NotV128, binop(opsGTU[szBlg2], argR, argL)); + break; + case 6: res = unop(Iop_NotV128, binop(opsGTS[szBlg2], argR, argL)); + break; + case 7: res = unop(Iop_NotV128, binop(opsGTS[szBlg2], argR, argL)); + break; + case 8: res = binop(opsGTS[szBlg2], argL, argR); break; + case 9: res = unop(Iop_NotV128, + binop(opsGTS[szBlg2], argL, argR)); + break; + case 10: res = binop(opsEQ[szBlg2], argL, argR); break; + case 11: res = binop(opsGTS[szBlg2], argR, argL); break; + default: vassert(0); + } + vassert(res); + putQReg128(dd, zeroHI ? 
unop(Iop_ZeroHI64ofV128, res) : res); + const HChar* nms[11] = { "eq", "tst", "hi", "gt", "hs", "ge", + "ge", "gt", "le", "eq", "lt" }; + if (ix <= 6) { + DIP("cm%s %s.%s, %s.%s, %s.%s\n", nms[ix-1], + nameQReg128(dd), arrSpec, + nameQReg128(nn), arrSpec, nameQReg128(mm), arrSpec); + } else { + DIP("cm%s %s.%s, %s.%s, #0\n", nms[ix-1], + nameQReg128(dd), arrSpec, nameQReg128(nn), arrSpec); + } + return True; + } + /* else fall through */ + } + + /* -------------- {EOR,BSL,BIT,BIF} (vector) -------------- */ + /* 31 28 23 20 15 9 4 + 0q1 01110 00 1 m 000111 n d EOR Vd.T, Vm.T, Vn.T + 0q1 01110 01 1 m 000111 n d BSL Vd.T, Vm.T, Vn.T + 0q1 01110 10 1 m 000111 n d BIT Vd.T, Vm.T, Vn.T + 0q1 01110 11 1 m 000111 n d BIF Vd.T, Vm.T, Vn.T + */ + if (INSN(31,31) == 0 && INSN(29,24) == BITS6(1,0,1,1,1,0) + && INSN(21,21) == 1 && INSN(15,10) == BITS6(0,0,0,1,1,1)) { + Bool isQ = INSN(30,30) == 1; + UInt op = INSN(23,22); + UInt mm = INSN(20,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + IRTemp argD = newTemp(Ity_V128); + IRTemp argN = newTemp(Ity_V128); + IRTemp argM = newTemp(Ity_V128); + assign(argD, getQReg128(dd)); + assign(argN, getQReg128(nn)); + assign(argM, getQReg128(mm)); + const IROp opXOR = Iop_XorV128; + const IROp opAND = Iop_AndV128; + const IROp opNOT = Iop_NotV128; + IRExpr* res = NULL; + switch (op) { + case BITS2(0,0): /* EOR */ + res = binop(opXOR, mkexpr(argM), mkexpr(argN)); + break; + case BITS2(0,1): /* BSL */ + res = binop(opXOR, mkexpr(argM), + binop(opAND, + binop(opXOR, mkexpr(argM), mkexpr(argN)), + mkexpr(argD))); + break; + case BITS2(1,0): /* BIT */ + res = binop(opXOR, mkexpr(argD), + binop(opAND, + binop(opXOR, mkexpr(argD), mkexpr(argN)), + mkexpr(argM))); + break; + case BITS2(1,1): /* BIF */ + res = binop(opXOR, mkexpr(argD), + binop(opAND, + binop(opXOR, mkexpr(argD), mkexpr(argN)), + unop(opNOT, mkexpr(argM)))); + break; + default: + vassert(0); + } + vassert(res); + putQReg128(dd, isQ ? res : unop(Iop_ZeroHI64ofV128, res)); + const HChar* nms[4] = { "eor", "bsl", "bit", "bif" }; + const HChar* arr = isQ ? "16b" : "8b"; + vassert(op < 4); + DIP("%s %s.%s, %s.%s, %s.%s\n", nms[op], + nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr); + return True; + } + + /* ------------ {USHR,SSHR,SHL} (vector, immediate) ------------ */ + /* 31 28 22 18 15 9 4 + 0q1 011110 immh immb 000001 n d USHR Vd.T, Vn.T, #shift (1) + 0q0 011110 immh immb 000001 n d SSHR Vd.T, Vn.T, #shift (2) + 0q0 011110 immh immb 010101 n d SHL Vd.T, Vn.T, #shift (3) + laneTy, shift = case immh:immb of + 0001:xxx -> B, SHR:8-xxx, SHL:xxx + 001x:xxx -> H, SHR:16-xxxx SHL:xxxx + 01xx:xxx -> S, SHR:32-xxxxx SHL:xxxxx + 1xxx:xxx -> D, SHR:64-xxxxxx SHL:xxxxxx + other -> invalid + As usual the case laneTy==D && q==0 is not allowed. 
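
The CM* cases above rely on two conventions: a vector compare writes an all-ones mask into each lane where the relation holds and all-zeroes where it does not, and only ==, signed > and unsigned > exist as primops, so <, <= and >= are synthesised by swapping operands and/or inverting the mask, as the identities in the comment describe. One 8-bit lane of that scheme, in plain C (reference only):

   #include <stdint.h>

   static uint8_t cm_gt_s ( int8_t x, int8_t y ) { return x >  y ? 0xFF : 0x00; }
   static uint8_t cm_eq   ( int8_t x, int8_t y ) { return x == y ? 0xFF : 0x00; }

   static uint8_t cm_lt_s ( int8_t x, int8_t y )
   {  return cm_gt_s(y, x);                 /* x <  y  ==  y > x       */ }

   static uint8_t cm_ge_s ( int8_t x, int8_t y )
   {  return (uint8_t)~cm_gt_s(y, x);       /* x >= y  ==  not (y > x) */ }

   static uint8_t cm_le_s ( int8_t x, int8_t y )
   {  return (uint8_t)~cm_gt_s(x, y);       /* x <= y  ==  not (x > y) */ }
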
+ */ + if (INSN(31,31) == 0 && INSN(28,23) == BITS6(0,1,1,1,1,0) + && INSN(10,10) == 1) { + UInt ix = 0; + /**/ if (INSN(29,29) == 1 && INSN(15,11) == BITS5(0,0,0,0,0)) ix = 1; + else if (INSN(29,29) == 0 && INSN(15,11) == BITS5(0,0,0,0,0)) ix = 2; + else if (INSN(29,29) == 0 && INSN(15,11) == BITS5(0,1,0,1,0)) ix = 3; + if (ix > 0) { + Bool isQ = INSN(30,30) == 1; + UInt immh = INSN(22,19); + UInt immb = INSN(18,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + const IROp opsSHRN[4] + = { Iop_ShrN8x16, Iop_ShrN16x8, Iop_ShrN32x4, Iop_ShrN64x2 }; + const IROp opsSARN[4] + = { Iop_SarN8x16, Iop_SarN16x8, Iop_SarN32x4, Iop_SarN64x2 }; + const IROp opsSHLN[4] + = { Iop_ShlN8x16, Iop_ShlN16x8, Iop_ShlN32x4, Iop_ShlN64x2 }; + UInt szBlg2 = 0; + UInt shift = 0; + Bool ok = getLaneInfo_IMMH_IMMB(&shift, &szBlg2, immh, immb); + if (ix == 3) { + /* The shift encoding has opposite sign for the leftwards + case. Adjust shift to compensate. */ + shift = (8 << szBlg2) - shift; + } + if (ok && szBlg2 < 4 && shift > 0 && shift < (8 << szBlg2) + && !(szBlg2 == 3/*64bit*/ && !isQ)) { + IROp op = Iop_INVALID; + const HChar* nm = NULL; + switch (ix) { + case 1: op = opsSHRN[szBlg2]; nm = "ushr"; break; + case 2: op = opsSARN[szBlg2]; nm = "sshr"; break; + case 3: op = opsSHLN[szBlg2]; nm = "shl"; break; + default: vassert(0); + } + IRExpr* src = getQReg128(nn); + IRExpr* res = binop(op, src, mkU8(shift)); + putQReg128(dd, isQ ? res : unop(Iop_ZeroHI64ofV128, res)); + HChar laneCh = "bhsd"[szBlg2]; + UInt nLanes = (isQ ? 128 : 64) / (8 << szBlg2); + DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm, + nameQReg128(dd), nLanes, laneCh, + nameQReg128(nn), nLanes, laneCh, shift); + return True; + } + /* else fall through */ + } + } + + /* -------------------- {U,S}SHLL{,2} -------------------- */ + /* 31 28 22 18 15 9 4 + 0q0 011110 immh immb 101001 n d SSHLL Vd.Ta, Vn.Tb, #sh + 0q1 011110 immh immb 101001 n d USHLL Vd.Ta, Vn.Tb, #sh + where Ta,Tb,sh + = case immh of 1xxx -> invalid + 01xx -> 2d, 2s(q0)/4s(q1), immh:immb - 32 (0..31) + 001x -> 4s, 4h(q0)/8h(q1), immh:immb - 16 (0..15) + 0001 -> 8h, 8b(q0)/16b(q1), immh:immb - 8 (0..7) + 0000 -> AdvSIMD modified immediate (???) + */ + if (INSN(31,31) == 0 && INSN(28,23) == BITS6(0,1,1,1,1,0) + && INSN(15,10) == BITS6(1,0,1,0,0,1)) { + Bool isQ = INSN(30,30) == 1; + Bool isU = INSN(29,29) == 1; + UInt immh = INSN(22,19); + UInt immb = INSN(18,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + UInt immhb = (immh << 3) | immb; + IRTemp src = newTemp(Ity_V128); + IRTemp zero = newTemp(Ity_V128); + IRExpr* res = NULL; + UInt sh = 0; + const HChar* ta = "??"; + const HChar* tb = "??"; + assign(src, getQReg128(nn)); + assign(zero, mkV128(0x0000)); + if (immh & 8) { + /* invalid; don't assign to res */ + } + else if (immh & 4) { + sh = immhb - 32; + vassert(sh < 32); /* so 32-sh is 1..32 */ + ta = "2d"; + tb = isQ ? "4s" : "2s"; + IRExpr* tmp = isQ ? mk_InterleaveHI32x4(src, zero) + : mk_InterleaveLO32x4(src, zero); + res = binop(isU ? Iop_ShrN64x2 : Iop_SarN64x2, tmp, mkU8(32-sh)); + } + else if (immh & 2) { + sh = immhb - 16; + vassert(sh < 16); /* so 16-sh is 1..16 */ + ta = "4s"; + tb = isQ ? "8h" : "4h"; + IRExpr* tmp = isQ ? mk_InterleaveHI16x8(src, zero) + : mk_InterleaveLO16x8(src, zero); + res = binop(isU ? Iop_ShrN32x4 : Iop_SarN32x4, tmp, mkU8(16-sh)); + } + else if (immh & 1) { + sh = immhb - 8; + vassert(sh < 8); /* so 8-sh is 1..8 */ + ta = "8h"; + tb = isQ ? "16b" : "8b"; + IRExpr* tmp = isQ ? 
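
The xor/and expressions in the {EOR,BSL,BIT,BIF} case above are the standard branch-free bit-select identities. Restated on plain 64-bit words (reference only), with sel(mask,x,y) meaning "take bits of x where mask is 1, else bits of y":

   #include <stdint.h>

   static uint64_t sel ( uint64_t mask, uint64_t x, uint64_t y )
   {  return (x & mask) | (y & ~mask);  }

   static uint64_t bsl ( uint64_t d, uint64_t n, uint64_t m )
   {  return m ^ ((m ^ n) & d);    /* == sel(d, n, m): old dest selects   */ }

   static uint64_t bit ( uint64_t d, uint64_t n, uint64_t m )
   {  return d ^ ((d ^ n) & m);    /* == sel(m, n, d): insert n where m=1 */ }

   static uint64_t bif ( uint64_t d, uint64_t n, uint64_t m )
   {  return d ^ ((d ^ n) & ~m);   /* == sel(m, d, n): insert n where m=0 */ }
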
mk_InterleaveHI8x16(src, zero) + : mk_InterleaveLO8x16(src, zero); + res = binop(isU ? Iop_ShrN16x8 : Iop_SarN16x8, tmp, mkU8(8-sh)); + } else { + vassert(immh == 0); + /* invalid; don't assign to res */ + } + /* */ + if (res) { + putQReg128(dd, res); + DIP("%cshll%s %s.%s, %s.%s, #%d\n", + isU ? 'u' : 's', isQ ? "2" : "", + nameQReg128(dd), ta, nameQReg128(nn), tb, sh); + return True; + } + /* else fall through */ + } + + /* -------------------- XTN{,2} -------------------- */ + /* 31 28 23 21 15 9 4 XTN{,2} Vd.Tb, Vn.Ta + 0q0 01110 size 100001 001010 n d + */ + if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,1,0) + && INSN(21,16) == BITS6(1,0,0,0,0,1) + && INSN(15,10) == BITS6(0,0,1,0,1,0)) { + Bool isQ = INSN(30,30) == 1; + UInt size = INSN(23,22); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + IROp op = Iop_INVALID; + const HChar* tb = NULL; + const HChar* ta = NULL; + switch ((size << 1) | (isQ ? 1 : 0)) { + case 0: tb = "8b"; ta = "8h"; op = Iop_NarrowUn16to8x8; break; + case 1: tb = "16b"; ta = "8h"; op = Iop_NarrowUn16to8x8; break; + case 2: tb = "4h"; ta = "4s"; op = Iop_NarrowUn32to16x4; break; + case 3: tb = "8h"; ta = "4s"; op = Iop_NarrowUn32to16x4; break; + case 4: tb = "2s"; ta = "2d"; op = Iop_NarrowUn64to32x2; break; + case 5: tb = "4s"; ta = "2d"; op = Iop_NarrowUn64to32x2; break; + case 6: break; + case 7: break; + default: vassert(0); + } + if (op != Iop_INVALID) { + if (!isQ) { + putQRegLane(dd, 1, mkU64(0)); + } + putQRegLane(dd, isQ ? 1 : 0, unop(op, getQReg128(nn))); + DIP("xtn%s %s.%s, %s.%s\n", isQ ? "2" : "", + nameQReg128(dd), tb, nameQReg128(nn), ta); + return True; + } + /* else fall through */ + } + + /* ---------------- DUP (element, vector) ---------------- */ + /* 31 28 20 15 9 4 + 0q0 01110000 imm5 000001 n d DUP Vd.T, Vn.Ts[index] + */ + if (INSN(31,31) == 0 && INSN(29,21) == BITS9(0,0,1,1,1,0,0,0,0) + && INSN(15,10) == BITS6(0,0,0,0,0,1)) { + Bool isQ = INSN(30,30) == 1; + UInt imm5 = INSN(20,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + IRTemp w0 = newTemp(Ity_I64); + const HChar* arT = "??"; + const HChar* arTs = "??"; + IRType laneTy = Ity_INVALID; + UInt laneNo = 16; /* invalid */ + if (imm5 & 1) { + arT = isQ ? "16b" : "8b"; + arTs = "b"; + laneNo = (imm5 >> 1) & 15; + laneTy = Ity_I8; + assign(w0, unop(Iop_8Uto64, getQRegLane(nn, laneNo, laneTy))); + } + else if (imm5 & 2) { + arT = isQ ? "8h" : "4h"; + arTs = "h"; + laneNo = (imm5 >> 2) & 7; + laneTy = Ity_I16; + assign(w0, unop(Iop_16Uto64, getQRegLane(nn, laneNo, laneTy))); + } + else if (imm5 & 4) { + arT = isQ ? "4s" : "2s"; + arTs = "s"; + laneNo = (imm5 >> 3) & 3; + laneTy = Ity_I32; + assign(w0, unop(Iop_32Uto64, getQRegLane(nn, laneNo, laneTy))); + } + else if ((imm5 & 8) && isQ) { + arT = "2d"; + arTs = "d"; + laneNo = (imm5 >> 4) & 1; + laneTy = Ity_I64; + assign(w0, getQRegLane(nn, laneNo, laneTy)); + } + else { + /* invalid; leave laneTy unchanged. */ + } + /* */ + if (laneTy != Ity_INVALID) { + vassert(laneNo < 16); + IRTemp w1 = math_DUP_TO_64(w0, laneTy); + putQReg128(dd, binop(Iop_64HLtoV128, + isQ ? 
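
The {U,S}SHLL implementation above widens by interleaving the source with zero, which parks each source lane in the top half of a double-width lane, and then shifts right by (laneWidth - sh): logically for the unsigned form, arithmetically for the signed one. The net effect is "widen, then shift left by sh". One 8-to-16-bit lane of that, in plain C (reference only, and relying on the usual arithmetic behaviour of >> on signed values):

   #include <stdint.h>

   static uint16_t ushll_lane8 ( uint8_t x, unsigned sh )   /* sh in 0..7 */
   {
      uint16_t t = (uint16_t)(x << 8);      /* interleave with zero       */
      return (uint16_t)(t >> (8 - sh));     /* Iop_ShrN16x8: zero-extend  */
   }

   static int16_t sshll_lane8 ( uint8_t x, unsigned sh )
   {
      int16_t t = (int16_t)(uint16_t)(x << 8);
      return (int16_t)(t >> (8 - sh));      /* Iop_SarN16x8: sign-extend  */
   }
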
mkexpr(w1) : mkU64(0), mkexpr(w1))); + DIP("dup %s.%s, %s.%s[%u]\n", + nameQReg128(dd), arT, nameQReg128(nn), arTs, laneNo); + return True; + } + /* else fall through */ + } + + /* ---------------- DUP (general, vector) ---------------- */ + /* 31 28 23 20 15 9 4 + 0q0 01110 000 imm5 000011 n d DUP Vd.T, Rn + Q=0 writes 64, Q=1 writes 128 + imm5: xxxx1 8B(q=0) or 16b(q=1), R=W + xxx10 4H(q=0) or 8H(q=1), R=W + xx100 2S(q=0) or 4S(q=1), R=W + x1000 Invalid(q=0) or 2D(q=1), R=X + x0000 Invalid(q=0) or Invalid(q=1) + */ + if (INSN(31,31) == 0 && INSN(29,21) == BITS9(0,0,1,1,1,0,0,0,0) + && INSN(15,10) == BITS6(0,0,0,0,1,1)) { + Bool isQ = INSN(30,30) == 1; + UInt imm5 = INSN(20,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + IRTemp w0 = newTemp(Ity_I64); + const HChar* arT = "??"; + IRType laneTy = Ity_INVALID; + if (imm5 & 1) { + arT = isQ ? "16b" : "8b"; + laneTy = Ity_I8; + assign(w0, unop(Iop_8Uto64, unop(Iop_64to8, getIReg64orZR(nn)))); + } + else if (imm5 & 2) { + arT = isQ ? "8h" : "4h"; + laneTy = Ity_I16; + assign(w0, unop(Iop_16Uto64, unop(Iop_64to16, getIReg64orZR(nn)))); + } + else if (imm5 & 4) { + arT = isQ ? "4s" : "2s"; + laneTy = Ity_I32; + assign(w0, unop(Iop_32Uto64, unop(Iop_64to32, getIReg64orZR(nn)))); + } + else if ((imm5 & 8) && isQ) { + arT = "2d"; + laneTy = Ity_I64; + assign(w0, getIReg64orZR(nn)); + } + else { + /* invalid; leave laneTy unchanged. */ + } + /* */ + if (laneTy != Ity_INVALID) { + IRTemp w1 = math_DUP_TO_64(w0, laneTy); + putQReg128(dd, binop(Iop_64HLtoV128, + isQ ? mkexpr(w1) : mkU64(0), mkexpr(w1))); + DIP("dup %s.%s, %s\n", + nameQReg128(dd), arT, nameIRegOrZR(laneTy == Ity_I64, nn)); + return True; + } + /* else fall through */ + } + + /* ---------------------- {S,U}MOV ---------------------- */ + /* 31 28 20 15 9 4 + 0q0 01110 000 imm5 001111 n d UMOV Xd/Wd, Vn.Ts[index] + 0q0 01110 000 imm5 001011 n d SMOV Xd/Wd, Vn.Ts[index] + dest is Xd when q==1, Wd when q==0 + UMOV: + Ts,index,ops = case q:imm5 of + 0:xxxx1 -> B, xxxx, 8Uto64 + 1:xxxx1 -> invalid + 0:xxx10 -> H, xxx, 16Uto64 + 1:xxx10 -> invalid + 0:xx100 -> S, xx, 32Uto64 + 1:xx100 -> invalid + 1:x1000 -> D, x, copy64 + other -> invalid + SMOV: + Ts,index,ops = case q:imm5 of + 0:xxxx1 -> B, xxxx, (32Uto64 . 8Sto32) + 1:xxxx1 -> B, xxxx, 8Sto64 + 0:xxx10 -> H, xxx, (32Uto64 . 16Sto32) + 1:xxx10 -> H, xxx, 16Sto64 + 0:xx100 -> invalid + 1:xx100 -> S, xx, 32Sto64 + 1:x1000 -> invalid + other -> invalid + */ + if (INSN(31,31) == 0 && INSN(29,21) == BITS9(0,0,1,1,1,0,0,0,0) + && (INSN(15,10) & BITS6(1,1,1,0,1,1)) == BITS6(0,0,1,0,1,1)) { + UInt bitQ = INSN(30,30) == 1; + UInt imm5 = INSN(20,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + Bool isU = INSN(12,12) == 1; + const HChar* arTs = "??"; + UInt laneNo = 16; /* invalid */ + // Setting 'res' to non-NULL determines valid/invalid + IRExpr* res = NULL; + if (!bitQ && (imm5 & 1)) { // 0:xxxx1 + laneNo = (imm5 >> 1) & 15; + IRExpr* lane = getQRegLane(nn, laneNo, Ity_I8); + res = isU ? unop(Iop_8Uto64, lane) + : unop(Iop_32Uto64, unop(Iop_8Sto32, lane)); + arTs = "b"; + } + else if (bitQ && (imm5 & 1)) { // 1:xxxx1 + laneNo = (imm5 >> 1) & 15; + IRExpr* lane = getQRegLane(nn, laneNo, Ity_I8); + res = isU ? NULL + : unop(Iop_8Sto64, lane); + arTs = "b"; + } + else if (!bitQ && (imm5 & 2)) { // 0:xxx10 + laneNo = (imm5 >> 2) & 7; + IRExpr* lane = getQRegLane(nn, laneNo, Ity_I16); + res = isU ? 
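
math_DUP_TO_64 is defined earlier in the file; whatever IR it emits, the effect the two DUP cases above need is simply "replicate the lane value across a 64-bit word", which the 64HLtoV128 above then doubles up (or pairs with zero when Q==0). A plain-C statement of that effect, for reference only:

   #include <stdint.h>

   /* laneSzB is 1, 2, 4 or 8, matching Ity_I8/I16/I32/I64 above. */
   static uint64_t dup_to_64_ref ( uint64_t lane, unsigned laneSzB )
   {
      switch (laneSzB) {
         case 1: return (lane & 0xFFull)       * 0x0101010101010101ull;
         case 2: return (lane & 0xFFFFull)     * 0x0001000100010001ull;
         case 4: return (lane & 0xFFFFFFFFull) * 0x0000000100000001ull;
         case 8: return lane;
         default: return 0;   /* invalid */
      }
   }
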
unop(Iop_16Uto64, lane) + : unop(Iop_32Uto64, unop(Iop_16Sto32, lane)); + arTs = "h"; + } + else if (bitQ && (imm5 & 2)) { // 1:xxx10 + laneNo = (imm5 >> 2) & 7; + IRExpr* lane = getQRegLane(nn, laneNo, Ity_I16); + res = isU ? NULL + : unop(Iop_16Sto64, lane); + arTs = "h"; + } + else if (!bitQ && (imm5 & 4)) { // 0:xx100 + laneNo = (imm5 >> 3) & 3; + IRExpr* lane = getQRegLane(nn, laneNo, Ity_I32); + res = isU ? unop(Iop_32Uto64, lane) + : NULL; + arTs = "s"; + } + else if (bitQ && (imm5 & 4)) { // 1:xxx10 + laneNo = (imm5 >> 3) & 3; + IRExpr* lane = getQRegLane(nn, laneNo, Ity_I32); + res = isU ? NULL + : unop(Iop_32Sto64, lane); + arTs = "s"; + } + else if (bitQ && (imm5 & 8)) { // 1:x1000 + laneNo = (imm5 >> 4) & 1; + IRExpr* lane = getQRegLane(nn, laneNo, Ity_I64); + res = isU ? lane + : NULL; + arTs = "d"; + } + /* */ + if (res) { + vassert(laneNo < 16); + putIReg64orZR(dd, res); + DIP("%cmov %s, %s.%s[%u]\n", isU ? 'u' : 's', + nameIRegOrZR(bitQ == 1, dd), + nameQReg128(nn), arTs, laneNo); + return True; + } + /* else fall through */ + } + + /* -------------------- INS (general) -------------------- */ + /* 31 28 20 15 9 4 + 010 01110000 imm5 000111 n d INS Vd.Ts[ix], Rn + where Ts,ix = case imm5 of xxxx1 -> B, xxxx + xxx10 -> H, xxx + xx100 -> S, xx + x1000 -> D, x + */ + if (INSN(31,21) == BITS11(0,1,0,0,1,1,1,0,0,0,0) + && INSN(15,10) == BITS6(0,0,0,1,1,1)) { + UInt imm5 = INSN(20,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + HChar ts = '?'; + UInt laneNo = 16; + IRExpr* src = NULL; + if (imm5 & 1) { + src = unop(Iop_64to8, getIReg64orZR(nn)); + laneNo = (imm5 >> 1) & 15; + ts = 'b'; + } + else if (imm5 & 2) { + src = unop(Iop_64to16, getIReg64orZR(nn)); + laneNo = (imm5 >> 2) & 7; + ts = 'h'; + } + else if (imm5 & 4) { + src = unop(Iop_64to32, getIReg64orZR(nn)); + laneNo = (imm5 >> 3) & 3; + ts = 's'; + } + else if (imm5 & 8) { + src = getIReg64orZR(nn); + laneNo = (imm5 >> 4) & 1; + ts = 'd'; + } + /* */ + if (src) { + vassert(laneNo < 16); + putQRegLane(dd, laneNo, src); + DIP("ins %s.%c[%u], %s\n", + nameQReg128(dd), ts, laneNo, nameIReg64orZR(nn)); + return True; + } + /* else invalid; fall through */ + } + + /* -------------------- NEG (vector) -------------------- */ + /* 31 28 23 21 16 9 4 + 0q1 01110 sz 10000 0101110 n d NEG Vd, Vn + sz is laneSz, q:sz == 011 is disallowed, as usual + */ + if (INSN(31,31) == 0 && INSN(29,24) == BITS6(1,0,1,1,1,0) + && INSN(21,10) == BITS12(1,0,0,0,0,0,1,0,1,1,1,0)) { + Bool isQ = INSN(30,30) == 1; + UInt szBlg2 = INSN(23,22); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + Bool zeroHI = False; + const HChar* arrSpec = ""; + Bool ok = getLaneInfo_SIMPLE(&zeroHI, &arrSpec, isQ, szBlg2 ); + if (ok) { + const IROp opSUB[4] + = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 }; + IRTemp res = newTemp(Ity_V128); + vassert(szBlg2 < 4); + assign(res, binop(opSUB[szBlg2], mkV128(0x0000), getQReg128(nn))); + putQReg128(dd, zeroHI ? unop(Iop_ZeroHI64ofV128, mkexpr(res)) + : mkexpr(res)); + DIP("neg %s.%s, %s.%s\n", + nameQReg128(dd), arrSpec, nameQReg128(nn), arrSpec); + return True; + } + /* else fall through */ + } + + /* -------------------- TBL, TBX -------------------- */ + /* 31 28 20 15 14 12 9 4 + 0q0 01110 000 m 0 len 000 n d TBL Vd.Ta, {Vn .. V(n+len)%32}, Vm.Ta + 0q0 01110 000 m 0 len 100 n d TBX Vd.Ta, {Vn .. 
V(n+len)%32}, Vm.Ta + where Ta = 16b(q=1) or 8b(q=0) + */ + if (INSN(31,31) == 0 && INSN(29,21) == BITS9(0,0,1,1,1,0,0,0,0) + && INSN(15,15) == 0 && INSN(11,10) == BITS2(0,0)) { + Bool isQ = INSN(30,30) == 1; + Bool isTBX = INSN(12,12) == 1; + UInt mm = INSN(20,16); + UInt len = INSN(14,13); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + /* The out-of-range values to use. */ + IRTemp oor_values = newTemp(Ity_V128); + assign(oor_values, isTBX ? getQReg128(dd) : mkV128(0)); + /* src value */ + IRTemp src = newTemp(Ity_V128); + assign(src, getQReg128(mm)); + /* The table values */ + IRTemp tab[4]; + UInt i; + for (i = 0; i <= len; i++) { + vassert(i < 4); + tab[i] = newTemp(Ity_V128); + assign(tab[i], getQReg128((nn + i) % 32)); + } + IRTemp res = math_TBL_TBX(tab, len, src, oor_values); + putQReg128(dd, isQ ? mkexpr(res) + : unop(Iop_ZeroHI64ofV128, mkexpr(res)) ); + const HChar* Ta = isQ ? "16b" : "8b"; + const HChar* nm = isTBX ? "tbx" : "tbl"; + DIP("%s %s.%s, {v%d.16b .. v%d.16b}, %s.%s\n", + nm, nameQReg128(dd), Ta, nn, (nn + len) % 32, nameQReg128(mm), Ta); + return True; + } + /* FIXME Temporary hacks to get through ld.so FIXME */ + + /* ------------------ movi vD.4s, #0x0 ------------------ */ + /* 0x4F 0x00 0x04 000 vD */ + if ((insn & 0xFFFFFFE0) == 0x4F000400) { + UInt vD = INSN(4,0); + putQReg128(vD, mkV128(0x0000)); + DIP("movi v%u.4s, #0x0\n", vD); + return True; + } + + /* ---------------- MOV vD.16b, vN.16b ---------------- */ + /* 31 23 20 15 9 4 + 010 01110 101 m 000111 n d ORR vD.16b, vN.16b, vM.16b + This only handles the N == M case. + */ + if (INSN(31,24) == BITS8(0,1,0,0,1,1,1,0) + && INSN(23,21) == BITS3(1,0,1) && INSN(15,10) == BITS6(0,0,0,1,1,1)) { + UInt mm = INSN(20,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + if (mm == nn) { + putQReg128(dd, getQReg128(nn)); + DIP("mov v%u.16b, v%u.16b\n", dd, nn); + return True; + } + /* else it's really an ORR; fall through. */ + } + + /* ---------------- CMEQ_d_d_#0 ---------------- */ + /* + 010 11110 11 10000 0100 110 n d CMEQ Dd, Dn, #0 + */ + if ((INSN(31,0) & 0xFFFFFC00) == 0x5EE09800) { + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + putQReg128(dd, unop(Iop_ZeroHI64ofV128, + binop(Iop_CmpEQ64x2, getQReg128(nn), + mkV128(0x0000)))); + DIP("cmeq d%u, d%u, #0\n", dd, nn); + return True; + } + + /* ---------------- SHL_d_d_#imm ---------------- */ + /* 31 22 21 18 15 9 4 + 010 111110 1 ih3 ib 010101 n d SHL Dd, Dn, #(ih3:ib) + */ + if (INSN(31,22) == BITS10(0,1,0,1,1,1,1,1,0,1) + && INSN(15,10) == BITS6(0,1,0,1,0,1)) { + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + UInt sh = INSN(21,16); + vassert(sh < 64); + putQReg128(dd, unop(Iop_ZeroHI64ofV128, + binop(Iop_ShlN64x2, getQReg128(nn), mkU8(sh)))); + DIP("shl d%u, d%u, #%u\n", dd, nn, sh); + return True; + } + + vex_printf("ARM64 front end: simd_and_fp\n"); + return False; +# undef INSN +} + + +/*------------------------------------------------------------*/ +/*--- Disassemble a single ARM64 instruction ---*/ +/*------------------------------------------------------------*/ + +/* Disassemble a single ARM64 instruction into IR. The instruction + has is located at |guest_instr| and has guest IP of + |guest_PC_curr_instr|, which will have been set before the call + here. Returns True iff the instruction was decoded, in which case + *dres will be set accordingly, or False, in which case *dres should + be ignored by the caller. 
*/ + +static +Bool disInstr_ARM64_WRK ( + /*MB_OUT*/DisResult* dres, + Bool (*resteerOkFn) ( /*opaque*/void*, Addr64 ), + Bool resteerCisOk, + void* callback_opaque, + UChar* guest_instr, + VexArchInfo* archinfo, + VexAbiInfo* abiinfo + ) +{ + // A macro to fish bits out of 'insn'. +# define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin)) + +//ZZ DisResult dres; +//ZZ UInt insn; +//ZZ //Bool allow_VFP = False; +//ZZ //UInt hwcaps = archinfo->hwcaps; +//ZZ IRTemp condT; /* :: Ity_I32 */ +//ZZ UInt summary; +//ZZ HChar dis_buf[128]; // big enough to hold LDMIA etc text +//ZZ +//ZZ /* What insn variants are we supporting today? */ +//ZZ //allow_VFP = (0 != (hwcaps & VEX_HWCAPS_ARM_VFP)); +//ZZ // etc etc + + /* Set result defaults. */ + dres->whatNext = Dis_Continue; + dres->len = 4; + dres->continueAt = 0; + dres->jk_StopHere = Ijk_INVALID; + + /* At least this is simple on ARM64: insns are all 4 bytes long, and + 4-aligned. So just fish the whole thing out of memory right now + and have done. */ + UInt insn = getUIntLittleEndianly( guest_instr ); + + if (0) vex_printf("insn: 0x%x\n", insn); + + DIP("\t(arm64) 0x%llx: ", (ULong)guest_PC_curr_instr); + + vassert(0 == (guest_PC_curr_instr & 3ULL)); + + /* ----------------------------------------------------------- */ + + /* Spot "Special" instructions (see comment at top of file). */ + { + UChar* code = (UChar*)guest_instr; + /* Spot the 16-byte preamble: + 93CC0D8C ror x12, x12, #3 + 93CC358C ror x12, x12, #13 + 93CCCD8C ror x12, x12, #51 + 93CCF58C ror x12, x12, #61 + */ + UInt word1 = 0x93CC0D8C; + UInt word2 = 0x93CC358C; + UInt word3 = 0x93CCCD8C; + UInt word4 = 0x93CCF58C; + if (getUIntLittleEndianly(code+ 0) == word1 && + getUIntLittleEndianly(code+ 4) == word2 && + getUIntLittleEndianly(code+ 8) == word3 && + getUIntLittleEndianly(code+12) == word4) { + /* Got a "Special" instruction preamble. Which one is it? */ + if (getUIntLittleEndianly(code+16) == 0xAA0A014A + /* orr x10,x10,x10 */) { + /* X3 = client_request ( X4 ) */ + DIP("x3 = client_request ( x4 )\n"); + putPC(mkU64( guest_PC_curr_instr + 20 )); + dres->jk_StopHere = Ijk_ClientReq; + dres->whatNext = Dis_StopHere; + return True; + } + else + if (getUIntLittleEndianly(code+16) == 0xAA0B016B + /* orr x11,x11,x11 */) { + /* X3 = guest_NRADDR */ + DIP("x3 = guest_NRADDR\n"); + dres->len = 20; + putIReg64orZR(3, IRExpr_Get( OFFB_NRADDR, Ity_I64 )); + return True; + } + else + if (getUIntLittleEndianly(code+16) == 0xAA0C018C + /* orr x12,x12,x12 */) { + /* branch-and-link-to-noredir X8 */ + DIP("branch-and-link-to-noredir x8\n"); + putIReg64orZR(30, mkU64(guest_PC_curr_instr + 20)); + putPC(getIReg64orZR(8)); + dres->jk_StopHere = Ijk_NoRedir; + dres->whatNext = Dis_StopHere; + return True; + } + else + if (getUIntLittleEndianly(code+16) == 0xAA090129 + /* orr x9,x9,x9 */) { + /* IR injection */ + DIP("IR injection\n"); + vex_inject_ir(irsb, Iend_LE); + // Invalidate the current insn. The reason is that the IRop we're + // injecting here can change. In which case the translation has to + // be redone. For ease of handling, we simply invalidate all the + // time. + stmt(IRStmt_Put(OFFB_CMSTART, mkU64(guest_PC_curr_instr))); + stmt(IRStmt_Put(OFFB_CMLEN, mkU64(20))); + putPC(mkU64( guest_PC_curr_instr + 20 )); + dres->whatNext = Dis_StopHere; + dres->jk_StopHere = Ijk_InvalICache; + return True; + } + /* We don't know what it is. 
*/ + return False; + /*NOTREACHED*/ + } + } + + /* ----------------------------------------------------------- */ + + /* Main ARM64 instruction decoder starts here. */ + + Bool ok = False; + + /* insn[28:25] determines the top-level grouping, so let's start + off with that. + + For all of these dis_ARM64_ functions, we pass *dres with the + normal default results "insn OK, 4 bytes long, keep decoding" so + they don't need to change it. However, decodes of control-flow + insns may cause *dres to change. + */ + switch (INSN(28,25)) { + case BITS4(1,0,0,0): case BITS4(1,0,0,1): + // Data processing - immediate + ok = dis_ARM64_data_processing_immediate(dres, insn); + break; + case BITS4(1,0,1,0): case BITS4(1,0,1,1): + // Branch, exception generation and system instructions + ok = dis_ARM64_branch_etc(dres, insn, archinfo); + break; + case BITS4(0,1,0,0): case BITS4(0,1,1,0): + case BITS4(1,1,0,0): case BITS4(1,1,1,0): + // Loads and stores + ok = dis_ARM64_load_store(dres, insn); + break; + case BITS4(0,1,0,1): case BITS4(1,1,0,1): + // Data processing - register + ok = dis_ARM64_data_processing_register(dres, insn); + break; + case BITS4(0,1,1,1): case BITS4(1,1,1,1): + // Data processing - SIMD and floating point + ok = dis_ARM64_simd_and_fp(dres, insn); + break; + case BITS4(0,0,0,0): case BITS4(0,0,0,1): + case BITS4(0,0,1,0): case BITS4(0,0,1,1): + // UNALLOCATED + break; + default: + vassert(0); /* Can't happen */ + } + + /* If the next-level down decoders failed, make sure |dres| didn't + get changed. */ + if (!ok) { + vassert(dres->whatNext == Dis_Continue); + vassert(dres->len == 4); + vassert(dres->continueAt == 0); + vassert(dres->jk_StopHere == Ijk_INVALID); + } + + return ok; + +# undef INSN +} + + +/*------------------------------------------------------------*/ +/*--- Top-level fn ---*/ +/*------------------------------------------------------------*/ + +/* Disassemble a single instruction into IR. The instruction + is located in host memory at &guest_code[delta]. */ + +DisResult disInstr_ARM64 ( IRSB* irsb_IN, + Bool (*resteerOkFn) ( void*, Addr64 ), + Bool resteerCisOk, + void* callback_opaque, + UChar* guest_code_IN, + Long delta_IN, + Addr64 guest_IP, + VexArch guest_arch, + VexArchInfo* archinfo, + VexAbiInfo* abiinfo, + Bool host_bigendian_IN, + Bool sigill_diag_IN ) +{ + DisResult dres; + vex_bzero(&dres, sizeof(dres)); + + /* Set globals (see top of this file) */ + vassert(guest_arch == VexArchARM64); + + irsb = irsb_IN; + host_is_bigendian = host_bigendian_IN; + guest_PC_curr_instr = (Addr64)guest_IP; + + /* Sanity checks */ + /* (x::UInt - 2) <= 15 === x >= 2 && x <= 17 (I hope) */ + vassert((archinfo->arm64_dMinLine_lg2_szB - 2) <= 15); + vassert((archinfo->arm64_iMinLine_lg2_szB - 2) <= 15); + + /* Try to decode */ + Bool ok = disInstr_ARM64_WRK( &dres, + resteerOkFn, resteerCisOk, callback_opaque, + (UChar*)&guest_code_IN[delta_IN], + archinfo, abiinfo ); + if (ok) { + /* All decode successes end up here. */ + vassert(dres.len == 4 || dres.len == 20); + switch (dres.whatNext) { + case Dis_Continue: + putPC( mkU64(dres.len + guest_PC_curr_instr) ); + break; + case Dis_ResteerU: + case Dis_ResteerC: + putPC(mkU64(dres.continueAt)); + break; + case Dis_StopHere: + break; + default: + vassert(0); + } + DIP("\n"); + } else { + /* All decode failures end up here. 
*/ + if (sigill_diag_IN) { + Int i, j; + UChar buf[64]; + UInt insn + = getUIntLittleEndianly( (UChar*)&guest_code_IN[delta_IN] ); + vex_bzero(buf, sizeof(buf)); + for (i = j = 0; i < 32; i++) { + if (i > 0) { + if ((i & 7) == 0) buf[j++] = ' '; + else if ((i & 3) == 0) buf[j++] = '\''; + } + buf[j++] = (insn & (1<<(31-i))) ? '1' : '0'; + } + vex_printf("disInstr(arm64): unhandled instruction 0x%08x\n", insn); + vex_printf("disInstr(arm64): %s\n", buf); + } + + /* Tell the dispatcher that this insn cannot be decoded, and so + has not been executed, and (is currently) the next to be + executed. PC should be up-to-date since it is made so at the + start of each insn, but nevertheless be paranoid and update + it again right now. */ + putPC( mkU64(guest_PC_curr_instr) ); + dres.whatNext = Dis_StopHere; + dres.len = 0; + dres.continueAt = 0; + dres.jk_StopHere = Ijk_NoDecode; + } + return dres; +} + +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// + +/* Spare code for doing reference implementations of various 128-bit + SIMD interleaves/deinterleaves/concatenation ops. For 64-bit + equivalents see the end of guest_arm_toIR.c. */ + +//////////////////////////////////////////////////////////////// +// 64x2 operations +// +static IRExpr* mk_CatEvenLanes64x2 ( IRTemp a10, IRTemp b10 ) +{ + // returns a0 b0 + return binop(Iop_64HLtoV128, unop(Iop_V128to64, mkexpr(a10)), + unop(Iop_V128to64, mkexpr(b10))); +} + +static IRExpr* mk_CatOddLanes64x2 ( IRTemp a10, IRTemp b10 ) +{ + // returns a1 b1 + return binop(Iop_64HLtoV128, unop(Iop_V128HIto64, mkexpr(a10)), + unop(Iop_V128HIto64, mkexpr(b10))); +} + + +//////////////////////////////////////////////////////////////// +// 32x4 operations +// + +// Split a 128 bit value into 4 32 bit ones, in 64-bit IRTemps with +// the top halves guaranteed to be zero. +static void breakV128to32s ( IRTemp* out3, IRTemp* out2, IRTemp* out1, + IRTemp* out0, IRTemp v128 ) +{ + if (out3) *out3 = newTemp(Ity_I64); + if (out2) *out2 = newTemp(Ity_I64); + if (out1) *out1 = newTemp(Ity_I64); + if (out0) *out0 = newTemp(Ity_I64); + IRTemp hi64 = newTemp(Ity_I64); + IRTemp lo64 = newTemp(Ity_I64); + assign(hi64, unop(Iop_V128HIto64, mkexpr(v128)) ); + assign(lo64, unop(Iop_V128to64, mkexpr(v128)) ); + if (out3) assign(*out3, binop(Iop_Shr64, mkexpr(hi64), mkU8(32))); + if (out2) assign(*out2, binop(Iop_And64, mkexpr(hi64), mkU64(0xFFFFFFFF))); + if (out1) assign(*out1, binop(Iop_Shr64, mkexpr(lo64), mkU8(32))); + if (out0) assign(*out0, binop(Iop_And64, mkexpr(lo64), mkU64(0xFFFFFFFF))); +} + +// Make a V128 bit value from 4 32 bit ones, each of which is in a 64 bit +// IRTemp. 
+static IRTemp mkV128from32s ( IRTemp in3, IRTemp in2, IRTemp in1, IRTemp in0 ) +{ + IRTemp hi64 = newTemp(Ity_I64); + IRTemp lo64 = newTemp(Ity_I64); + assign(hi64, + binop(Iop_Or64, + binop(Iop_Shl64, mkexpr(in3), mkU8(32)), + binop(Iop_And64, mkexpr(in2), mkU64(0xFFFFFFFF)))); + assign(lo64, + binop(Iop_Or64, + binop(Iop_Shl64, mkexpr(in1), mkU8(32)), + binop(Iop_And64, mkexpr(in0), mkU64(0xFFFFFFFF)))); + IRTemp res = newTemp(Ity_V128); + assign(res, binop(Iop_64HLtoV128, mkexpr(hi64), mkexpr(lo64))); + return res; +} + +static IRExpr* mk_CatEvenLanes32x4 ( IRTemp a3210, IRTemp b3210 ) +{ + // returns a2 a0 b2 b0 + IRTemp a2, a0, b2, b0; + breakV128to32s(NULL, &a2, NULL, &a0, a3210); + breakV128to32s(NULL, &b2, NULL, &b0, b3210); + return mkexpr(mkV128from32s(a2, a0, b2, b0)); +} + +static IRExpr* mk_CatOddLanes32x4 ( IRTemp a3210, IRTemp b3210 ) +{ + // returns a3 a1 b3 b1 + IRTemp a3, a1, b3, b1; + breakV128to32s(&a3, NULL, &a1, NULL, a3210); + breakV128to32s(&b3, NULL, &b1, NULL, b3210); + return mkexpr(mkV128from32s(a3, a1, b3, b1)); +} + +static IRExpr* mk_InterleaveLO32x4 ( IRTemp a3210, IRTemp b3210 ) +{ + // returns a1 b1 a0 b0 + IRTemp a1, a0, b1, b0; + breakV128to32s(NULL, NULL, &a1, &a0, a3210); + breakV128to32s(NULL, NULL, &b1, &b0, b3210); + return mkexpr(mkV128from32s(a1, b1, a0, b0)); +} + +static IRExpr* mk_InterleaveHI32x4 ( IRTemp a3210, IRTemp b3210 ) +{ + // returns a3 b3 a2 b2 + IRTemp a3, a2, b3, b2; + breakV128to32s(&a3, &a2, NULL, NULL, a3210); + breakV128to32s(&b3, &b2, NULL, NULL, b3210); + return mkexpr(mkV128from32s(a3, b3, a2, b2)); +} + +//////////////////////////////////////////////////////////////// +// 16x8 operations +// + +static void breakV128to16s ( IRTemp* out7, IRTemp* out6, IRTemp* out5, + IRTemp* out4, IRTemp* out3, IRTemp* out2, + IRTemp* out1,IRTemp* out0, IRTemp v128 ) +{ + if (out7) *out7 = newTemp(Ity_I64); + if (out6) *out6 = newTemp(Ity_I64); + if (out5) *out5 = newTemp(Ity_I64); + if (out4) *out4 = newTemp(Ity_I64); + if (out3) *out3 = newTemp(Ity_I64); + if (out2) *out2 = newTemp(Ity_I64); + if (out1) *out1 = newTemp(Ity_I64); + if (out0) *out0 = newTemp(Ity_I64); + IRTemp hi64 = newTemp(Ity_I64); + IRTemp lo64 = newTemp(Ity_I64); + assign(hi64, unop(Iop_V128HIto64, mkexpr(v128)) ); + assign(lo64, unop(Iop_V128to64, mkexpr(v128)) ); + if (out7) + assign(*out7, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(hi64), mkU8(48)), + mkU64(0xFFFF))); + if (out6) + assign(*out6, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(hi64), mkU8(32)), + mkU64(0xFFFF))); + if (out5) + assign(*out5, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(hi64), mkU8(16)), + mkU64(0xFFFF))); + if (out4) + assign(*out4, binop(Iop_And64, mkexpr(hi64), mkU64(0xFFFF))); + if (out3) + assign(*out3, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(lo64), mkU8(48)), + mkU64(0xFFFF))); + if (out2) + assign(*out2, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(lo64), mkU8(32)), + mkU64(0xFFFF))); + if (out1) + assign(*out1, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(lo64), mkU8(16)), + mkU64(0xFFFF))); + if (out0) + assign(*out0, binop(Iop_And64, mkexpr(lo64), mkU64(0xFFFF))); +} + +static IRTemp mkV128from16s ( IRTemp in7, IRTemp in6, IRTemp in5, IRTemp in4, + IRTemp in3, IRTemp in2, IRTemp in1, IRTemp in0 ) +{ + IRTemp hi64 = newTemp(Ity_I64); + IRTemp lo64 = newTemp(Ity_I64); + assign(hi64, + binop(Iop_Or64, + binop(Iop_Or64, + binop(Iop_Shl64, + binop(Iop_And64, mkexpr(in7), mkU64(0xFFFF)), + mkU8(48)), + binop(Iop_Shl64, + binop(Iop_And64, mkexpr(in6), mkU64(0xFFFF)), + mkU8(32))), + 
binop(Iop_Or64, + binop(Iop_Shl64, + binop(Iop_And64, mkexpr(in5), mkU64(0xFFFF)), + mkU8(16)), + binop(Iop_And64, + mkexpr(in4), mkU64(0xFFFF))))); + assign(lo64, + binop(Iop_Or64, + binop(Iop_Or64, + binop(Iop_Shl64, + binop(Iop_And64, mkexpr(in3), mkU64(0xFFFF)), + mkU8(48)), + binop(Iop_Shl64, + binop(Iop_And64, mkexpr(in2), mkU64(0xFFFF)), + mkU8(32))), + binop(Iop_Or64, + binop(Iop_Shl64, + binop(Iop_And64, mkexpr(in1), mkU64(0xFFFF)), + mkU8(16)), + binop(Iop_And64, + mkexpr(in0), mkU64(0xFFFF))))); + IRTemp res = newTemp(Ity_V128); + assign(res, binop(Iop_64HLtoV128, mkexpr(hi64), mkexpr(lo64))); + return res; +} + +static IRExpr* mk_CatEvenLanes16x8 ( IRTemp a76543210, IRTemp b76543210 ) +{ + // returns a6 a4 a2 a0 b6 b4 b2 b0 + IRTemp a6, a4, a2, a0, b6, b4, b2, b0; + breakV128to16s(NULL, &a6, NULL, &a4, NULL, &a2, NULL, &a0, a76543210); + breakV128to16s(NULL, &b6, NULL, &b4, NULL, &b2, NULL, &b0, b76543210); + return mkexpr(mkV128from16s(a6, a4, a2, a0, b6, b4, b2, b0)); +} + +static IRExpr* mk_CatOddLanes16x8 ( IRTemp a76543210, IRTemp b76543210 ) +{ + // returns a7 a5 a3 a1 b7 b5 b3 b1 + IRTemp a7, a5, a3, a1, b7, b5, b3, b1; + breakV128to16s(&a7, NULL, &a5, NULL, &a3, NULL, &a1, NULL, a76543210); + breakV128to16s(&b7, NULL, &b5, NULL, &b3, NULL, &b1, NULL, b76543210); + return mkexpr(mkV128from16s(a7, a5, a3, a1, b7, b5, b3, b1)); +} + +static IRExpr* mk_InterleaveLO16x8 ( IRTemp a76543210, IRTemp b76543210 ) +{ + // returns a3 b3 a2 b2 a1 b1 a0 b0 + IRTemp a3, b3, a2, b2, a1, a0, b1, b0; + breakV128to16s(NULL, NULL, NULL, NULL, &a3, &a2, &a1, &a0, a76543210); + breakV128to16s(NULL, NULL, NULL, NULL, &b3, &b2, &b1, &b0, b76543210); + return mkexpr(mkV128from16s(a3, b3, a2, b2, a1, b1, a0, b0)); +} + +static IRExpr* mk_InterleaveHI16x8 ( IRTemp a76543210, IRTemp b76543210 ) +{ + // returns a7 b7 a6 b6 a5 b5 a4 b4 + IRTemp a7, b7, a6, b6, a5, b5, a4, b4; + breakV128to16s(&a7, &a6, &a5, &a4, NULL, NULL, NULL, NULL, a76543210); + breakV128to16s(&b7, &b6, &b5, &b4, NULL, NULL, NULL, NULL, b76543210); + return mkexpr(mkV128from16s(a7, b7, a6, b6, a5, b5, a4, b4)); +} + +//////////////////////////////////////////////////////////////// +// 8x16 operations +// + +static void breakV128to8s ( IRTemp* outF, IRTemp* outE, IRTemp* outD, + IRTemp* outC, IRTemp* outB, IRTemp* outA, + IRTemp* out9, IRTemp* out8, + IRTemp* out7, IRTemp* out6, IRTemp* out5, + IRTemp* out4, IRTemp* out3, IRTemp* out2, + IRTemp* out1,IRTemp* out0, IRTemp v128 ) +{ + if (outF) *outF = newTemp(Ity_I64); + if (outE) *outE = newTemp(Ity_I64); + if (outD) *outD = newTemp(Ity_I64); + if (outC) *outC = newTemp(Ity_I64); + if (outB) *outB = newTemp(Ity_I64); + if (outA) *outA = newTemp(Ity_I64); + if (out9) *out9 = newTemp(Ity_I64); + if (out8) *out8 = newTemp(Ity_I64); + if (out7) *out7 = newTemp(Ity_I64); + if (out6) *out6 = newTemp(Ity_I64); + if (out5) *out5 = newTemp(Ity_I64); + if (out4) *out4 = newTemp(Ity_I64); + if (out3) *out3 = newTemp(Ity_I64); + if (out2) *out2 = newTemp(Ity_I64); + if (out1) *out1 = newTemp(Ity_I64); + if (out0) *out0 = newTemp(Ity_I64); + IRTemp hi64 = newTemp(Ity_I64); + IRTemp lo64 = newTemp(Ity_I64); + assign(hi64, unop(Iop_V128HIto64, mkexpr(v128)) ); + assign(lo64, unop(Iop_V128to64, mkexpr(v128)) ); + if (outF) + assign(*outF, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(hi64), mkU8(56)), + mkU64(0xFF))); + if (outE) + assign(*outE, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(hi64), mkU8(48)), + mkU64(0xFF))); + if (outD) + assign(*outD, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(hi64), 
mkU8(40)), + mkU64(0xFF))); + if (outC) + assign(*outC, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(hi64), mkU8(32)), + mkU64(0xFF))); + if (outB) + assign(*outB, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(hi64), mkU8(24)), + mkU64(0xFF))); + if (outA) + assign(*outA, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(hi64), mkU8(16)), + mkU64(0xFF))); + if (out9) + assign(*out9, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(hi64), mkU8(8)), + mkU64(0xFF))); + if (out8) + assign(*out8, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(hi64), mkU8(0)), + mkU64(0xFF))); + if (out7) + assign(*out7, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(lo64), mkU8(56)), + mkU64(0xFF))); + if (out6) + assign(*out6, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(lo64), mkU8(48)), + mkU64(0xFF))); + if (out5) + assign(*out5, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(lo64), mkU8(40)), + mkU64(0xFF))); + if (out4) + assign(*out4, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(lo64), mkU8(32)), + mkU64(0xFF))); + if (out3) + assign(*out3, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(lo64), mkU8(24)), + mkU64(0xFF))); + if (out2) + assign(*out2, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(lo64), mkU8(16)), + mkU64(0xFF))); + if (out1) + assign(*out1, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(lo64), mkU8(8)), + mkU64(0xFF))); + if (out0) + assign(*out0, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(lo64), mkU8(0)), + mkU64(0xFF))); +} + +static IRTemp mkV128from8s ( IRTemp inF, IRTemp inE, IRTemp inD, IRTemp inC, + IRTemp inB, IRTemp inA, IRTemp in9, IRTemp in8, + IRTemp in7, IRTemp in6, IRTemp in5, IRTemp in4, + IRTemp in3, IRTemp in2, IRTemp in1, IRTemp in0 ) +{ + IRTemp vFE = newTemp(Ity_I64); + IRTemp vDC = newTemp(Ity_I64); + IRTemp vBA = newTemp(Ity_I64); + IRTemp v98 = newTemp(Ity_I64); + IRTemp v76 = newTemp(Ity_I64); + IRTemp v54 = newTemp(Ity_I64); + IRTemp v32 = newTemp(Ity_I64); + IRTemp v10 = newTemp(Ity_I64); + assign(vFE, binop(Iop_Or64, + binop(Iop_Shl64, + binop(Iop_And64, mkexpr(inF), mkU64(0xFF)), mkU8(8)), + binop(Iop_And64, mkexpr(inE), mkU64(0xFF)))); + assign(vDC, binop(Iop_Or64, + binop(Iop_Shl64, + binop(Iop_And64, mkexpr(inD), mkU64(0xFF)), mkU8(8)), + binop(Iop_And64, mkexpr(inC), mkU64(0xFF)))); + assign(vBA, binop(Iop_Or64, + binop(Iop_Shl64, + binop(Iop_And64, mkexpr(inB), mkU64(0xFF)), mkU8(8)), + binop(Iop_And64, mkexpr(inA), mkU64(0xFF)))); + assign(v98, binop(Iop_Or64, + binop(Iop_Shl64, + binop(Iop_And64, mkexpr(in9), mkU64(0xFF)), mkU8(8)), + binop(Iop_And64, mkexpr(in8), mkU64(0xFF)))); + assign(v76, binop(Iop_Or64, + binop(Iop_Shl64, + binop(Iop_And64, mkexpr(in7), mkU64(0xFF)), mkU8(8)), + binop(Iop_And64, mkexpr(in6), mkU64(0xFF)))); + assign(v54, binop(Iop_Or64, + binop(Iop_Shl64, + binop(Iop_And64, mkexpr(in5), mkU64(0xFF)), mkU8(8)), + binop(Iop_And64, mkexpr(in4), mkU64(0xFF)))); + assign(v32, binop(Iop_Or64, + binop(Iop_Shl64, + binop(Iop_And64, mkexpr(in3), mkU64(0xFF)), mkU8(8)), + binop(Iop_And64, mkexpr(in2), mkU64(0xFF)))); + assign(v10, binop(Iop_Or64, + binop(Iop_Shl64, + binop(Iop_And64, mkexpr(in1), mkU64(0xFF)), mkU8(8)), + binop(Iop_And64, mkexpr(in0), mkU64(0xFF)))); + return mkV128from16s(vFE, vDC, vBA, v98, v76, v54, v32, v10); +} + +static IRExpr* mk_CatEvenLanes8x16 ( IRTemp aFEDCBA9876543210, + IRTemp bFEDCBA9876543210 ) +{ + // returns aE aC aA a8 a6 a4 a2 a0 bE bC bA b8 b6 b4 b2 b0 + IRTemp aE, aC, aA, a8, a6, a4, a2, a0, bE, bC, bA, b8, b6, b4, b2, b0; + breakV128to8s(NULL, &aE, NULL, &aC, NULL, &aA, NULL, &a8, + NULL, &a6, NULL, &a4, NULL, &a2, NULL, &a0, + aFEDCBA9876543210); 
+ breakV128to8s(NULL, &bE, NULL, &bC, NULL, &bA, NULL, &b8, + NULL, &b6, NULL, &b4, NULL, &b2, NULL, &b0, + bFEDCBA9876543210); + return mkexpr(mkV128from8s(aE, aC, aA, a8, a6, a4, a2, a0, + bE, bC, bA, b8, b6, b4, b2, b0)); +} + +static IRExpr* mk_CatOddLanes8x16 ( IRTemp aFEDCBA9876543210, + IRTemp bFEDCBA9876543210 ) +{ + // returns aF aD aB a9 a7 a5 a3 a1 bF bD bB b9 b7 b5 b3 b1 + IRTemp aF, aD, aB, a9, a7, a5, a3, a1, bF, bD, bB, b9, b7, b5, b3, b1; + breakV128to8s(&aF, NULL, &aD, NULL, &aB, NULL, &a9, NULL, + &a7, NULL, &a5, NULL, &a3, NULL, &a1, NULL, + aFEDCBA9876543210); + + breakV128to8s(&bF, NULL, &bD, NULL, &bB, NULL, &b9, NULL, + &b7, NULL, &b5, NULL, &b3, NULL, &b1, NULL, + aFEDCBA9876543210); + + return mkexpr(mkV128from8s(aF, aD, aB, a9, a7, a5, a3, a1, + bF, bD, bB, b9, b7, b5, b3, b1)); +} + +static IRExpr* mk_InterleaveLO8x16 ( IRTemp aFEDCBA9876543210, + IRTemp bFEDCBA9876543210 ) +{ + // returns a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0 + IRTemp a7, b7, a6, b6, a5, b5, a4, b4, a3, b3, a2, b2, a1, b1, a0, b0; + breakV128to8s(NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + &a7, &a6, &a5, &a4, &a3, &a2, &a1, &a0, + aFEDCBA9876543210); + breakV128to8s(NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + &b7, &b6, &b5, &b4, &b3, &b2, &b1, &b0, + bFEDCBA9876543210); + return mkexpr(mkV128from8s(a7, b7, a6, b6, a5, b5, a4, b4, + a3, b3, a2, b2, a1, b1, a0, b0)); +} + +static IRExpr* mk_InterleaveHI8x16 ( IRTemp aFEDCBA9876543210, + IRTemp bFEDCBA9876543210 ) +{ + // returns aF bF aE bE aD bD aC bC aB bB aA bA a9 b9 a8 b8 + IRTemp aF, bF, aE, bE, aD, bD, aC, bC, aB, bB, aA, bA, a9, b9, a8, b8; + breakV128to8s(&aF, &aE, &aD, &aC, &aB, &aA, &a9, &a8, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + aFEDCBA9876543210); + breakV128to8s(&bF, &bE, &bD, &bC, &bB, &bA, &b9, &b8, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + bFEDCBA9876543210); + return mkexpr(mkV128from8s(aF, bF, aE, bE, aD, bD, aC, bC, + aB, bB, aA, bA, a9, b9, a8, b8)); +} + +/*--------------------------------------------------------------------*/ +/*--- end guest_arm64_toIR.c ---*/ +/*--------------------------------------------------------------------*/ Index: priv/guest_arm_helpers.c =================================================================== --- priv/guest_arm_helpers.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/guest_arm_helpers.c (.../trunk) (revision 2863) @@ -981,8 +981,8 @@ vex_state->guest_GEFLAG3 = 0; vex_state->guest_EMNOTE = EmNote_NONE; - vex_state->guest_TISTART = 0; - vex_state->guest_TILEN = 0; + vex_state->guest_CMSTART = 0; + vex_state->guest_CMLEN = 0; vex_state->guest_NRADDR = 0; vex_state->guest_IP_AT_SYSCALL = 0; @@ -1030,10 +1030,6 @@ vex_state->guest_ITSTATE = 0; vex_state->padding1 = 0; - vex_state->padding2 = 0; - vex_state->padding3 = 0; - vex_state->padding4 = 0; - vex_state->padding5 = 0; } @@ -1130,8 +1126,8 @@ /* 1 */ ALWAYSDEFD(guest_CC_OP), /* 2 */ ALWAYSDEFD(guest_CC_NDEP), /* 3 */ ALWAYSDEFD(guest_EMNOTE), - /* 4 */ ALWAYSDEFD(guest_TISTART), - /* 5 */ ALWAYSDEFD(guest_TILEN), + /* 4 */ ALWAYSDEFD(guest_CMSTART), + /* 5 */ ALWAYSDEFD(guest_CMLEN), /* 6 */ ALWAYSDEFD(guest_NRADDR), /* 7 */ ALWAYSDEFD(guest_IP_AT_SYSCALL), /* 8 */ ALWAYSDEFD(guest_TPIDRURO), Index: priv/guest_arm_toIR.c =================================================================== --- priv/guest_arm_toIR.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/guest_arm_toIR.c (.../trunk) (revision 2863) @@ -485,8 +485,8 @@ #define OFFB_GEFLAG2 offsetof(VexGuestARMState,guest_GEFLAG2) #define 
OFFB_GEFLAG3 offsetof(VexGuestARMState,guest_GEFLAG3) -#define OFFB_TISTART offsetof(VexGuestARMState,guest_TISTART) -#define OFFB_TILEN offsetof(VexGuestARMState,guest_TILEN) +#define OFFB_CMSTART offsetof(VexGuestARMState,guest_CMSTART) +#define OFFB_CMLEN offsetof(VexGuestARMState,guest_CMLEN) /* ---------------- Integer registers ---------------- */ @@ -2875,6 +2875,31 @@ return True; } +/* Generate specific vector FP binary ops, possibly with a fake + rounding mode as required by the primop. */ +static +IRExpr* binop_w_fake_RM ( IROp op, IRExpr* argL, IRExpr* argR ) +{ + switch (op) { + case Iop_Add32Fx4: + case Iop_Sub32Fx4: + case Iop_Mul32Fx4: + return triop(op, get_FAKE_roundingmode(), argL, argR ); + case Iop_Add32x4: case Iop_Add16x8: + case Iop_Sub32x4: case Iop_Sub16x8: + case Iop_Mul32x4: case Iop_Mul16x8: + case Iop_Mul32x2: case Iop_Mul16x4: + case Iop_Add32Fx2: + case Iop_Sub32Fx2: + case Iop_Mul32Fx2: + case Iop_PwAdd32Fx2: + return binop(op, argL, argR); + default: + ppIROp(op); + vassert(0); + } +} + /* VTBL, VTBX */ static Bool dis_neon_vtb ( UInt theInstr, IRTemp condT ) @@ -4601,7 +4626,8 @@ /* VABD */ if (Q) { assign(res, unop(Iop_Abs32Fx4, - binop(Iop_Sub32Fx4, + triop(Iop_Sub32Fx4, + get_FAKE_roundingmode(), mkexpr(arg_n), mkexpr(arg_m)))); } else { @@ -4616,7 +4642,7 @@ break; } } - assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m))); + assign(res, binop_w_fake_RM(op, mkexpr(arg_n), mkexpr(arg_m))); } else { if (U == 0) { /* VMLA, VMLS */ @@ -4641,9 +4667,11 @@ default: vassert(0); } } - assign(res, binop(op2, - Q ? getQReg(dreg) : getDRegI64(dreg), - binop(op, mkexpr(arg_n), mkexpr(arg_m)))); + assign(res, binop_w_fake_RM( + op2, + Q ? getQReg(dreg) : getDRegI64(dreg), + binop_w_fake_RM(op, mkexpr(arg_n), + mkexpr(arg_m)))); DIP("vml%c.f32 %c%u, %c%u, %c%u\n", P ? 's' : 'a', Q ? 'q' : 'd', @@ -4654,7 +4682,7 @@ if ((C >> 1) != 0) return False; op = Q ? Iop_Mul32Fx4 : Iop_Mul32Fx2 ; - assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m))); + assign(res, binop_w_fake_RM(op, mkexpr(arg_n), mkexpr(arg_m))); DIP("vmul.f32 %c%u, %c%u, %c%u\n", Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg); @@ -5318,10 +5346,10 @@ } } op2 = INSN(10,10) ? 
sub : add; - assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m))); + assign(res, binop_w_fake_RM(op, mkexpr(arg_n), mkexpr(arg_m))); if (Q) - putQReg(dreg, binop(op2, getQReg(dreg), mkexpr(res)), - condT); + putQReg(dreg, binop_w_fake_RM(op2, getQReg(dreg), mkexpr(res)), + condT); else putDRegI64(dreg, binop(op2, getDRegI64(dreg), mkexpr(res)), condT); @@ -5548,7 +5576,7 @@ vassert(0); } } - assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m))); + assign(res, binop_w_fake_RM(op, mkexpr(arg_n), mkexpr(arg_m))); if (Q) putQReg(dreg, mkexpr(res), condT); else @@ -13499,6 +13527,27 @@ condT); DIP("fdivd%s d%u, d%u, d%u\n", nCC(conq), dD, dN, dM); goto decode_success_vfp; + case BITS4(1,1,0,0): /* VFMA: d + n * m (fused) */ + /* XXXROUNDINGFIXME look up ARM reference for fused + multiply-add rounding */ + putDReg(dD, triop(Iop_AddF64, rm, + getDReg(dD), + triop(Iop_MulF64, rm, getDReg(dN), + getDReg(dM))), + condT); + DIP("vfmad%s d%u, d%u, d%u\n", nCC(conq), dD, dN, dM); + goto decode_success_vfp; + case BITS4(1,1,0,1): /* VFMS: d + (-n * m) (fused) */ + /* XXXROUNDINGFIXME look up ARM reference for fused + multiply-add rounding */ + putDReg(dD, triop(Iop_AddF64, rm, + getDReg(dD), + triop(Iop_MulF64, rm, + unop(Iop_NegF64, getDReg(dN)), + getDReg(dM))), + condT); + DIP("vfmsd%s d%u, d%u, d%u\n", nCC(conq), dD, dN, dM); + goto decode_success_vfp; default: break; } @@ -13963,6 +14012,27 @@ condT); DIP("fdivs%s s%u, s%u, s%u\n", nCC(conq), fD, fN, fM); goto decode_success_vfp; + case BITS4(1,1,0,0): /* VFMA: d + n * m (fused) */ + /* XXXROUNDINGFIXME look up ARM reference for fused + multiply-add rounding */ + putFReg(fD, triop(Iop_AddF32, rm, + getFReg(fD), + triop(Iop_MulF32, rm, getFReg(fN), + getFReg(fM))), + condT); + DIP("vfmas%s s%u, s%u, s%u\n", nCC(conq), fD, fN, fM); + goto decode_success_vfp; + case BITS4(1,1,0,1): /* VFMS: d + (-n * m) (fused) */ + /* XXXROUNDINGFIXME look up ARM reference for fused + multiply-add rounding */ + putFReg(fD, triop(Iop_AddF32, rm, + getFReg(fD), + triop(Iop_MulF32, rm, + unop(Iop_NegF32, getFReg(fN)), + getFReg(fM))), + condT); + DIP("vfmss%s s%u, s%u, s%u\n", nCC(conq), fD, fN, fM); + goto decode_success_vfp; default: break; } @@ -14577,11 +14647,11 @@ // injecting here can change. In which case the translation has to // be redone. For ease of handling, we simply invalidate all the // time. - stmt(IRStmt_Put(OFFB_TISTART, mkU32(guest_R15_curr_instr_notENC))); - stmt(IRStmt_Put(OFFB_TILEN, mkU32(20))); + stmt(IRStmt_Put(OFFB_CMSTART, mkU32(guest_R15_curr_instr_notENC))); + stmt(IRStmt_Put(OFFB_CMLEN, mkU32(20))); llPutIReg(15, mkU32( guest_R15_curr_instr_notENC + 20 )); dres.whatNext = Dis_StopHere; - dres.jk_StopHere = Ijk_TInval; + dres.jk_StopHere = Ijk_InvalICache; goto decode_success; } /* We don't know what it is. Set opc1/opc2 so decode_failure @@ -17422,11 +17492,11 @@ // injecting here can change. In which case the translation has to // be redone. For ease of handling, we simply invalidate all the // time. - stmt(IRStmt_Put(OFFB_TISTART, mkU32(guest_R15_curr_instr_notENC))); - stmt(IRStmt_Put(OFFB_TILEN, mkU32(20))); + stmt(IRStmt_Put(OFFB_CMSTART, mkU32(guest_R15_curr_instr_notENC))); + stmt(IRStmt_Put(OFFB_CMLEN, mkU32(20))); llPutIReg(15, mkU32( (guest_R15_curr_instr_notENC + 20) | 1 )); dres.whatNext = Dis_StopHere; - dres.jk_StopHere = Ijk_TInval; + dres.jk_StopHere = Ijk_InvalICache; goto decode_success; } /* We don't know what it is. 
Set insn0 so decode_failure @@ -18203,9 +18273,9 @@ condT = IRTemp_INVALID; // now uncond /* non-interworking branch */ - irsb->next = binop(Iop_Or32, mkexpr(res), mkU32(1)); - irsb->jumpkind = Ijk_Boring; - dres.whatNext = Dis_StopHere; + llPutIReg(15, binop(Iop_Or32, mkexpr(res), mkU32(1))); + dres.jk_StopHere = Ijk_Boring; + dres.whatNext = Dis_StopHere; } DIP("add(hi) r%u, r%u\n", rD, rM); goto decode_success; @@ -20250,7 +20320,7 @@ /* --------------- LD/ST reg+imm12 --------------- */ /* Loads and stores of the form: - op Rt, [Rn, +#imm12] + op Rt, [Rn, #+-imm12] where op is one of ldrb ldrh ldr ldrsb ldrsh strb strh str @@ -20257,27 +20327,25 @@ */ if (INSN0(15,9) == BITS7(1,1,1,1,1,0,0)) { Bool valid = True; - Bool syned = False; + Bool syned = INSN0(8,8) == 1; Bool isST = False; IRType ty = Ity_I8; + UInt bU = INSN0(7,7); // 1: +imm 0: -imm + // -imm is only supported by literal versions const HChar* nm = "???"; - switch (INSN0(8,4)) { - case BITS5(0,1,0,0,0): // strb + switch (INSN0(6,4)) { + case BITS3(0,0,0): // strb nm = "strb"; isST = True; break; - case BITS5(0,1,0,0,1): // ldrb - nm = "ldrb"; break; - case BITS5(1,1,0,0,1): // ldrsb - nm = "ldrsb"; syned = True; break; - case BITS5(0,1,0,1,0): // strh + case BITS3(0,0,1): // ldrb + nm = syned ? "ldrsb" : "ldrb"; break; + case BITS3(0,1,0): // strh nm = "strh"; ty = Ity_I16; isST = True; break; - case BITS5(0,1,0,1,1): // ldrh - nm = "ldrh"; ty = Ity_I16; break; - case BITS5(1,1,0,1,1): // ldrsh - nm = "ldrsh"; ty = Ity_I16; syned = True; break; - case BITS5(0,1,1,0,0): // str + case BITS3(0,1,1): // ldrh + nm = syned ? "ldrsh" : "ldrh"; ty = Ity_I16; break; + case BITS3(1,0,0): // str nm = "str"; ty = Ity_I32; isST = True; break; - case BITS5(0,1,1,0,1): + case BITS3(1,0,1): nm = "ldr"; ty = Ity_I32; break; // ldr default: valid = False; break; @@ -20288,25 +20356,27 @@ UInt imm12 = INSN1(11,0); Bool loadsPC = False; - if (ty == Ity_I8 || ty == Ity_I16) { - /* all 8- and 16-bit load and store cases have the - same exclusion set. */ - if (rN == 15 || isBadRegT(rT)) + if (rN != 15 && bU == 0) { + // only pc supports #-imm12 + valid = False; + } + + if (isST) { + if (syned) valid = False; + if (rN == 15 || rT == 15) valid = False; } else { - vassert(ty == Ity_I32); - if (isST) { - if (rN == 15 || rT == 15) + /* For a 32-bit load, rT == 15 is only allowable if we are not + in an IT block, or are the last in it. Need to insert + a dynamic check for that. Also, in this particular + case, rN == 15 is allowable. In this case however, the + value obtained for rN is (apparently) + "word-align(address of current insn + 4)". */ + if (rT == 15) { + if (ty == Ity_I32) + loadsPC = True; + else // Can't do it for B/H loads valid = False; - } else { - /* For a 32-bit load, rT == 15 is only allowable if we not - in an IT block, or are the last in it. Need to insert - a dynamic check for that. Also, in this particular - case, rN == 15 is allowable. In this case however, the - value obtained for rN is (apparently) - "word-align(address of current insn + 4)". */ - if (rT == 15) - loadsPC = True; } } @@ -20324,8 +20394,8 @@ IRTemp rNt = newTemp(Ity_I32); if (rN == 15) { - vassert(ty == Ity_I32 && !isST); - assign(rNt, binop(Iop_And32, getIRegT(rN), mkU32(~3))); + vassert(!isST); + assign(rNt, binop(Iop_And32, getIRegT(15), mkU32(~3))); } else { assign(rNt, getIRegT(rN)); } @@ -20332,7 +20402,8 @@ IRTemp transAddr = newTemp(Ity_I32); assign(transAddr, - binop( Iop_Add32, mkexpr(rNt), mkU32(imm12) )); + binop(bU == 1 ? 
Iop_Add32 : Iop_Sub32, + mkexpr(rNt), mkU32(imm12))); IRTemp oldRt = newTemp(Ity_I32); assign(oldRt, getIRegT(rT)); @@ -20387,9 +20458,8 @@ vassert(rT == 15); vassert(condT == IRTemp_INVALID); /* due to check above */ llPutIReg(15, mkexpr(newRt)); - irsb->next = mkexpr(newRt); - irsb->jumpkind = Ijk_Boring; /* or _Ret ? */ - dres.whatNext = Dis_StopHere; + dres.jk_StopHere = Ijk_Boring; + dres.whatNext = Dis_StopHere; } } @@ -20401,7 +20471,7 @@ /* -------------- LDRD/STRD reg+/-#imm8 -------------- */ /* Doubleword loads and stores of the form: - ldrd/strd Rt, Rt2, [Rn, #-imm8] or + ldrd/strd Rt, Rt2, [Rn, #+/-imm8] or ldrd/strd Rt, Rt2, [Rn], #+/-imm8 or ldrd/strd Rt, Rt2, [Rn, #+/-imm8]! */ @@ -20419,12 +20489,17 @@ if (bP == 0 && bW == 0) valid = False; if (bW == 1 && (rN == rT || rN == rT2)) valid = False; if (isBadRegT(rT) || isBadRegT(rT2)) valid = False; - if (rN == 15) valid = False; if (bL == 1 && rT == rT2) valid = False; + /* It's OK to use PC as the base register only in the + following case: ldrd Rt, Rt2, [PC, #+/-imm8] */ + if (rN == 15 && (bL == 0/*store*/ + || bW == 1/*wb*/)) valid = False; if (valid) { IRTemp preAddr = newTemp(Ity_I32); - assign(preAddr, getIRegT(rN)); + assign(preAddr, 15 == rN + ? binop(Iop_And32, getIRegT(15), mkU32(~3U)) + : getIRegT(rN)); IRTemp postAddr = newTemp(Ity_I32); assign(postAddr, binop(bU == 1 ? Iop_Add32 : Iop_Sub32, Index: priv/guest_generic_bb_to_IR.c =================================================================== --- priv/guest_generic_bb_to_IR.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/guest_generic_bb_to_IR.c (.../trunk) (revision 2863) @@ -131,8 +131,8 @@ not to disassemble any instructions into it; this is indicated by the callback returning True. - offB_TIADDR and offB_TILEN are the offsets of guest_TIADDR and - guest_TILEN. Since this routine has to work for any guest state, + offB_CMADDR and offB_CMLEN are the offsets of guest_CMADDR and + guest_CMLEN. Since this routine has to work for any guest state, without knowing what it is, those offsets have to passed in. callback_opaque is a caller-supplied pointer to data which the @@ -194,8 +194,8 @@ /*IN*/ IRType guest_word_type, /*IN*/ UInt (*needs_self_check)(void*,VexGuestExtents*), /*IN*/ Bool (*preamble_function)(void*,IRSB*), - /*IN*/ Int offB_GUEST_TISTART, - /*IN*/ Int offB_GUEST_TILEN, + /*IN*/ Int offB_GUEST_CMSTART, + /*IN*/ Int offB_GUEST_CMLEN, /*IN*/ Int offB_GUEST_IP, /*IN*/ Int szB_GUEST_IP ) @@ -663,7 +663,7 @@ vassert(!nm_spec); } - /* Set TISTART and TILEN. These will describe to the despatcher + /* Set CMSTART and CMLEN. These will describe to the despatcher the area of guest code to invalidate should we exit with a self-check failure. */ @@ -684,10 +684,10 @@ = IRStmt_WrTmp(tilen_tmp, IRExpr_Const(len2check_IRConst) ); irsb->stmts[selfcheck_idx + i * 5 + 2] - = IRStmt_Put( offB_GUEST_TISTART, IRExpr_RdTmp(tistart_tmp) ); + = IRStmt_Put( offB_GUEST_CMSTART, IRExpr_RdTmp(tistart_tmp) ); irsb->stmts[selfcheck_idx + i * 5 + 3] - = IRStmt_Put( offB_GUEST_TILEN, IRExpr_RdTmp(tilen_tmp) ); + = IRStmt_Put( offB_GUEST_CMLEN, IRExpr_RdTmp(tilen_tmp) ); /* Generate the entry point descriptors */ if (abiinfo_both->host_ppc_calls_use_fndescrs) { @@ -737,7 +737,7 @@ ? IRExpr_Const(IRConst_U64(expectedhW)) : IRExpr_Const(IRConst_U32(expectedhW)) ), - Ijk_TInval, + Ijk_InvalICache, /* Where we must restart if there's a failure: at the first extent, regardless of which extent the failure actually happened in. 
*/ Index: priv/guest_generic_bb_to_IR.h =================================================================== --- priv/guest_generic_bb_to_IR.h (.../tags/VEX_3_9_0) (revision 2863) +++ priv/guest_generic_bb_to_IR.h (.../trunk) (revision 2863) @@ -184,8 +184,8 @@ /*IN*/ IRType guest_word_type, /*IN*/ UInt (*needs_self_check)(void*,VexGuestExtents*), /*IN*/ Bool (*preamble_function)(void*,IRSB*), - /*IN*/ Int offB_GUEST_TISTART, - /*IN*/ Int offB_GUEST_TILEN, + /*IN*/ Int offB_GUEST_CMSTART, + /*IN*/ Int offB_GUEST_CMLEN, /*IN*/ Int offB_GUEST_IP, /*IN*/ Int szB_GUEST_IP ); Index: priv/guest_mips_defs.h =================================================================== --- priv/guest_mips_defs.h (.../tags/VEX_3_9_0) (revision 2863) +++ priv/guest_mips_defs.h (.../trunk) (revision 2863) @@ -85,7 +85,8 @@ TRUNCWS, TRUNCWD, TRUNCLS, TRUNCLD, CVTDS, CVTDW, CVTSD, CVTSW, CVTWS, CVTWD, CVTDL, CVTLS, - CVTLD, CVTSL + CVTLD, CVTSL, ADDS, ADDD, + SUBS, SUBD, DIVS } flt_op; extern UInt mips32_dirtyhelper_mfc0 ( UInt rd, UInt sel ); @@ -98,8 +99,12 @@ extern ULong mips64_dirtyhelper_rdhwr ( ULong rt, ULong rd ); #endif -extern UInt mips_dirtyhelper_calculate_FCSR ( void* guest_state, UInt fs, - flt_op op ); +/* Calculate FCSR in fp32 mode. */ +extern UInt mips_dirtyhelper_calculate_FCSR_fp32 ( void* guest_state, UInt fs, + UInt ft, flt_op op ); +/* Calculate FCSR in fp64 mode. */ +extern UInt mips_dirtyhelper_calculate_FCSR_fp64 ( void* guest_state, UInt fs, + UInt ft, flt_op op ); /*---------------------------------------------------------*/ /*--- Condition code stuff ---*/ Index: priv/guest_mips_helpers.c =================================================================== --- priv/guest_mips_helpers.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/guest_mips_helpers.c (.../trunk) (revision 2863) @@ -44,7 +44,7 @@ these functions are generated by the back end. 
*/ -#define ALWAYSDEFD32(field) \ +#define ALWAYSDEFD32(field) \ { offsetof(VexGuestMIPS32State, field), \ (sizeof ((VexGuestMIPS32State*)0)->field) } @@ -105,38 +105,38 @@ vex_state->guest_LO = 0; /* Multiply and divide register lower result */ /* FPU Registers */ - vex_state->guest_f0 = 0x7ff80000; /* Floting point general purpose registers */ - vex_state->guest_f1 = 0x7ff80000; - vex_state->guest_f2 = 0x7ff80000; - vex_state->guest_f3 = 0x7ff80000; - vex_state->guest_f4 = 0x7ff80000; - vex_state->guest_f5 = 0x7ff80000; - vex_state->guest_f6 = 0x7ff80000; - vex_state->guest_f7 = 0x7ff80000; - vex_state->guest_f8 = 0x7ff80000; - vex_state->guest_f9 = 0x7ff80000; - vex_state->guest_f10 = 0x7ff80000; - vex_state->guest_f11 = 0x7ff80000; - vex_state->guest_f12 = 0x7ff80000; - vex_state->guest_f13 = 0x7ff80000; - vex_state->guest_f14 = 0x7ff80000; - vex_state->guest_f15 = 0x7ff80000; - vex_state->guest_f16 = 0x7ff80000; - vex_state->guest_f17 = 0x7ff80000; - vex_state->guest_f18 = 0x7ff80000; - vex_state->guest_f19 = 0x7ff80000; - vex_state->guest_f20 = 0x7ff80000; - vex_state->guest_f21 = 0x7ff80000; - vex_state->guest_f22 = 0x7ff80000; - vex_state->guest_f23 = 0x7ff80000; - vex_state->guest_f24 = 0x7ff80000; - vex_state->guest_f25 = 0x7ff80000; - vex_state->guest_f26 = 0x7ff80000; - vex_state->guest_f27 = 0x7ff80000; - vex_state->guest_f28 = 0x7ff80000; - vex_state->guest_f29 = 0x7ff80000; - vex_state->guest_f30 = 0x7ff80000; - vex_state->guest_f31 = 0x7ff80000; + vex_state->guest_f0 = 0x7ff800007ff80000ULL; /* Floting point GP registers */ + vex_state->guest_f1 = 0x7ff800007ff80000ULL; + vex_state->guest_f2 = 0x7ff800007ff80000ULL; + vex_state->guest_f3 = 0x7ff800007ff80000ULL; + vex_state->guest_f4 = 0x7ff800007ff80000ULL; + vex_state->guest_f5 = 0x7ff800007ff80000ULL; + vex_state->guest_f6 = 0x7ff800007ff80000ULL; + vex_state->guest_f7 = 0x7ff800007ff80000ULL; + vex_state->guest_f8 = 0x7ff800007ff80000ULL; + vex_state->guest_f9 = 0x7ff800007ff80000ULL; + vex_state->guest_f10 = 0x7ff800007ff80000ULL; + vex_state->guest_f11 = 0x7ff800007ff80000ULL; + vex_state->guest_f12 = 0x7ff800007ff80000ULL; + vex_state->guest_f13 = 0x7ff800007ff80000ULL; + vex_state->guest_f14 = 0x7ff800007ff80000ULL; + vex_state->guest_f15 = 0x7ff800007ff80000ULL; + vex_state->guest_f16 = 0x7ff800007ff80000ULL; + vex_state->guest_f17 = 0x7ff800007ff80000ULL; + vex_state->guest_f18 = 0x7ff800007ff80000ULL; + vex_state->guest_f19 = 0x7ff800007ff80000ULL; + vex_state->guest_f20 = 0x7ff800007ff80000ULL; + vex_state->guest_f21 = 0x7ff800007ff80000ULL; + vex_state->guest_f22 = 0x7ff800007ff80000ULL; + vex_state->guest_f23 = 0x7ff800007ff80000ULL; + vex_state->guest_f24 = 0x7ff800007ff80000ULL; + vex_state->guest_f25 = 0x7ff800007ff80000ULL; + vex_state->guest_f26 = 0x7ff800007ff80000ULL; + vex_state->guest_f27 = 0x7ff800007ff80000ULL; + vex_state->guest_f28 = 0x7ff800007ff80000ULL; + vex_state->guest_f29 = 0x7ff800007ff80000ULL; + vex_state->guest_f30 = 0x7ff800007ff80000ULL; + vex_state->guest_f31 = 0x7ff800007ff80000ULL; vex_state->guest_FIR = 0; /* FP implementation and revision register */ vex_state->guest_FCCR = 0; /* FP condition codes register */ @@ -150,8 +150,8 @@ vex_state->guest_EMNOTE = 0; /* For clflush: record start and length of area to invalidate */ - vex_state->guest_TISTART = 0; - vex_state->guest_TILEN = 0; + vex_state->guest_CMSTART = 0; + vex_state->guest_CMLEN = 0; vex_state->host_EvC_COUNTER = 0; vex_state->host_EvC_FAILADDR = 0; @@ -212,38 +212,38 @@ vex_state->guest_LO = 0; /* Multiply and divide 
register lower result */ /* FPU Registers */ - vex_state->guest_f0 = 0xffffffffffffffffULL; /* Floting point registers */ - vex_state->guest_f1 = 0xffffffffffffffffULL; - vex_state->guest_f2 = 0xffffffffffffffffULL; - vex_state->guest_f3 = 0xffffffffffffffffULL; - vex_state->guest_f4 = 0xffffffffffffffffULL; - vex_state->guest_f5 = 0xffffffffffffffffULL; - vex_state->guest_f6 = 0xffffffffffffffffULL; - vex_state->guest_f7 = 0xffffffffffffffffULL; - vex_state->guest_f8 = 0xffffffffffffffffULL; - vex_state->guest_f9 = 0xffffffffffffffffULL; - vex_state->guest_f10 = 0xffffffffffffffffULL; - vex_state->guest_f11 = 0xffffffffffffffffULL; - vex_state->guest_f12 = 0xffffffffffffffffULL; - vex_state->guest_f13 = 0xffffffffffffffffULL; - vex_state->guest_f14 = 0xffffffffffffffffULL; - vex_state->guest_f15 = 0xffffffffffffffffULL; - vex_state->guest_f16 = 0xffffffffffffffffULL; - vex_state->guest_f17 = 0xffffffffffffffffULL; - vex_state->guest_f18 = 0xffffffffffffffffULL; - vex_state->guest_f19 = 0xffffffffffffffffULL; - vex_state->guest_f20 = 0xffffffffffffffffULL; - vex_state->guest_f21 = 0xffffffffffffffffULL; - vex_state->guest_f22 = 0xffffffffffffffffULL; - vex_state->guest_f23 = 0xffffffffffffffffULL; - vex_state->guest_f24 = 0xffffffffffffffffULL; - vex_state->guest_f25 = 0xffffffffffffffffULL; - vex_state->guest_f26 = 0xffffffffffffffffULL; - vex_state->guest_f27 = 0xffffffffffffffffULL; - vex_state->guest_f28 = 0xffffffffffffffffULL; - vex_state->guest_f29 = 0xffffffffffffffffULL; - vex_state->guest_f30 = 0xffffffffffffffffULL; - vex_state->guest_f31 = 0xffffffffffffffffULL; + vex_state->guest_f0 = 0x7ff800007ff80000ULL; /* Floting point registers */ + vex_state->guest_f1 = 0x7ff800007ff80000ULL; + vex_state->guest_f2 = 0x7ff800007ff80000ULL; + vex_state->guest_f3 = 0x7ff800007ff80000ULL; + vex_state->guest_f4 = 0x7ff800007ff80000ULL; + vex_state->guest_f5 = 0x7ff800007ff80000ULL; + vex_state->guest_f6 = 0x7ff800007ff80000ULL; + vex_state->guest_f7 = 0x7ff800007ff80000ULL; + vex_state->guest_f8 = 0x7ff800007ff80000ULL; + vex_state->guest_f9 = 0x7ff800007ff80000ULL; + vex_state->guest_f10 = 0x7ff800007ff80000ULL; + vex_state->guest_f11 = 0x7ff800007ff80000ULL; + vex_state->guest_f12 = 0x7ff800007ff80000ULL; + vex_state->guest_f13 = 0x7ff800007ff80000ULL; + vex_state->guest_f14 = 0x7ff800007ff80000ULL; + vex_state->guest_f15 = 0x7ff800007ff80000ULL; + vex_state->guest_f16 = 0x7ff800007ff80000ULL; + vex_state->guest_f17 = 0x7ff800007ff80000ULL; + vex_state->guest_f18 = 0x7ff800007ff80000ULL; + vex_state->guest_f19 = 0x7ff800007ff80000ULL; + vex_state->guest_f20 = 0x7ff800007ff80000ULL; + vex_state->guest_f21 = 0x7ff800007ff80000ULL; + vex_state->guest_f22 = 0x7ff800007ff80000ULL; + vex_state->guest_f23 = 0x7ff800007ff80000ULL; + vex_state->guest_f24 = 0x7ff800007ff80000ULL; + vex_state->guest_f25 = 0x7ff800007ff80000ULL; + vex_state->guest_f26 = 0x7ff800007ff80000ULL; + vex_state->guest_f27 = 0x7ff800007ff80000ULL; + vex_state->guest_f28 = 0x7ff800007ff80000ULL; + vex_state->guest_f29 = 0x7ff800007ff80000ULL; + vex_state->guest_f30 = 0x7ff800007ff80000ULL; + vex_state->guest_f31 = 0x7ff800007ff80000ULL; vex_state->guest_FIR = 0; /* FP implementation and revision register */ vex_state->guest_FCCR = 0; /* FP condition codes register */ @@ -258,8 +258,8 @@ vex_state->guest_EMNOTE = 0; /* For clflush: record start and length of area to invalidate */ - vex_state->guest_TISTART = 0; - vex_state->guest_TILEN = 0; + vex_state->guest_CMSTART = 0; + vex_state->guest_CMLEN = 0; 
vex_state->host_EvC_COUNTER = 0; vex_state->host_EvC_FAILADDR = 0; @@ -375,8 +375,8 @@ /* 0 */ ALWAYSDEFD32(guest_r0), /* 1 */ ALWAYSDEFD32(guest_r1), /* 2 */ ALWAYSDEFD32(guest_EMNOTE), - /* 3 */ ALWAYSDEFD32(guest_TISTART), - /* 4 */ ALWAYSDEFD32(guest_TILEN), + /* 3 */ ALWAYSDEFD32(guest_CMSTART), + /* 4 */ ALWAYSDEFD32(guest_CMLEN), /* 5 */ ALWAYSDEFD32(guest_r29), /* 6 */ ALWAYSDEFD32(guest_r31), /* 7 */ ALWAYSDEFD32(guest_ULR) @@ -402,8 +402,8 @@ .alwaysDefd = { /* 0 */ ALWAYSDEFD64 (guest_r0), /* 1 */ ALWAYSDEFD64 (guest_EMNOTE), - /* 2 */ ALWAYSDEFD64 (guest_TISTART), - /* 3 */ ALWAYSDEFD64 (guest_TILEN), + /* 2 */ ALWAYSDEFD64 (guest_CMSTART), + /* 3 */ ALWAYSDEFD64 (guest_CMLEN), /* 4 */ ALWAYSDEFD64 (guest_r29), /* 5 */ ALWAYSDEFD64 (guest_r31), /* 6 */ ALWAYSDEFD64 (guest_ULR) @@ -1107,145 +1107,324 @@ } #endif -#define ASM_VOLATILE_ROUND32(fs, inst) \ - __asm__ volatile("cfc1 $t0, $31" "\n\t" \ - "ctc1 %2, $31" "\n\t" \ - "mtc1 %1, $f0" "\n\t" \ - ""#inst" $f0, $f0" "\n\t" \ - "cfc1 %0, $31" "\n\t" \ - "ctc1 $t0, $31" "\n\t" \ +#define ASM_VOLATILE_UNARY32(inst) \ + __asm__ volatile("cfc1 $t0, $31" "\n\t" \ + "ctc1 %2, $31" "\n\t" \ + "mtc1 %1, $f20" "\n\t" \ + #inst" $f20, $f20" "\n\t" \ + "cfc1 %0, $31" "\n\t" \ + "ctc1 $t0, $31" "\n\t" \ : "=r" (ret) \ - : "r" (loVal), "r" (fcsr) \ - : "t0", "$f0", "$f1" \ + : "r" (loFsVal), "r" (fcsr) \ + : "t0", "$f20" \ ); -#define ASM_VOLATILE_ROUND32_DOUBLE(fs, inst) \ - __asm__ volatile("cfc1 $t0, $31" "\n\t" \ - "ctc1 %3, $31" "\n\t" \ - "mtc1 %1, $f0" "\n\t" \ - "mtc1 %2, $f1" "\n\t" \ - ""#inst" $f0, $f0" "\n\t" \ - "cfc1 %0, $31" "\n\t" \ - "ctc1 $t0, $31" "\n\t" \ +#define ASM_VOLATILE_UNARY32_DOUBLE(inst) \ + __asm__ volatile("cfc1 $t0, $31" "\n\t" \ + "ctc1 %3, $31" "\n\t" \ + "mtc1 %1, $f20" "\n\t" \ + "mtc1 %2, $f21" "\n\t" \ + #inst" $f20, $f20" "\n\t" \ + "cfc1 %0, $31" "\n\t" \ + "ctc1 $t0, $31" "\n\t" \ : "=r" (ret) \ - : "r" (loVal), "r" (hiVal), "r" (fcsr) \ - : "t0", "$f0", "$f1" \ + : "r" (loFsVal), "r" (hiFsVal), "r" (fcsr) \ + : "t0", "$f20", "$f21" \ ); -#define ASM_VOLATILE_ROUND64(fs, inst) \ - __asm__ volatile("cfc1 $t0, $31" "\n\t" \ - "ctc1 %2, $31" "\n\t" \ - "dmtc1 %1, $f0" "\n\t" \ - ""#inst" $f0, $f0" "\n\t" \ - "cfc1 %0, $31" "\n\t" \ - "ctc1 $t0, $31" "\n\t" \ +#define ASM_VOLATILE_UNARY64(inst) \ + __asm__ volatile("cfc1 $t0, $31" "\n\t" \ + "ctc1 %2, $31" "\n\t" \ + "ldc1 $f24, 0(%1)" "\n\t" \ + #inst" $f24, $f24" "\n\t" \ + "cfc1 %0, $31" "\n\t" \ + "ctc1 $t0, $31" "\n\t" \ : "=r" (ret) \ - : "r" (addr[fs]), "r" (fcsr) \ - : "t0", "$f0" \ + : "r" (&(addr[fs])), "r" (fcsr) \ + : "t0", "$f24" \ ); +#define ASM_VOLATILE_BINARY32(inst) \ + __asm__ volatile("cfc1 $t0, $31" "\n\t" \ + "ctc1 %3, $31" "\n\t" \ + "mtc1 %1, $f20" "\n\t" \ + "mtc1 %2, $f22" "\n\t" \ + #inst" $f20, $f20, $f22" "\n\t" \ + "cfc1 %0, $31" "\n\t" \ + "ctc1 $t0, $31" "\n\t" \ + : "=r" (ret) \ + : "r" (loFsVal), "r" (loFtVal), "r" (fcsr) \ + : "t0", "$f20", "$f22" \ + ); + +#define ASM_VOLATILE_BINARY32_DOUBLE(inst) \ + __asm__ volatile("cfc1 $t0, $31" "\n\t" \ + "ctc1 %5, $31" "\n\t" \ + "mtc1 %1, $f20" "\n\t" \ + "mtc1 %2, $f21" "\n\t" \ + "mtc1 %3, $f22" "\n\t" \ + "mtc1 %4, $f23" "\n\t" \ + #inst" $f20, $f20, $f22" "\n\t" \ + "cfc1 %0, $31" "\n\t" \ + "ctc1 $t0, $31" "\n\t" \ + : "=r" (ret) \ + : "r" (loFsVal), "r" (hiFsVal), "r" (loFtVal), \ + "r" (hiFtVal), "r" (fcsr) \ + : "t0", "$f20", "$f21", "$f22", "$f23" \ + ); + +#define ASM_VOLATILE_BINARY64(inst) \ + __asm__ volatile("cfc1 $t0, $31" "\n\t" \ + "ctc1 %3, $31" "\n\t" \ + 
"ldc1 $f24, 0(%1)" "\n\t" \ + "ldc1 $f26, 0(%2)" "\n\t" \ + #inst" $f24, $f24, $f26" "\n\t" \ + "cfc1 %0, $31" "\n\t" \ + "ctc1 $t0, $31" "\n\t" \ + : "=r" (ret) \ + : "r" (&(addr[fs])), "r" (&(addr[ft])), "r" (fcsr) \ + : "t0", "$f24", "$f26" \ + ); + /* TODO: Add cases for all fpu instructions because all fpu instructions are change the value of FCSR register. */ -extern UInt mips_dirtyhelper_calculate_FCSR ( void* gs, UInt fs, flt_op inst ) +extern UInt mips_dirtyhelper_calculate_FCSR_fp32 ( void* gs, UInt fs, UInt ft, + flt_op inst ) { UInt ret = 0; #if defined(__mips__) + VexGuestMIPS32State* guest_state = (VexGuestMIPS32State*)gs; + UInt loFsVal, hiFsVal, loFtVal, hiFtVal; +#if defined (_MIPSEL) + ULong *addr = (ULong *)&guest_state->guest_f0; + loFsVal = (UInt)addr[fs]; + hiFsVal = (UInt)addr[fs+1]; + loFtVal = (UInt)addr[ft]; + hiFtVal = (UInt)addr[ft+1]; +#elif defined (_MIPSEB) + UInt *addr = (UInt *)&guest_state->guest_f0; + loFsVal = (UInt)addr[fs*2]; + hiFsVal = (UInt)addr[fs*2+2]; + loFtVal = (UInt)addr[ft*2]; + hiFtVal = (UInt)addr[ft*2+2]; +#endif + UInt fcsr = guest_state->guest_FCSR; + switch (inst) { + case ROUNDWD: + ASM_VOLATILE_UNARY32_DOUBLE(round.w.d) + break; + case FLOORWS: + ASM_VOLATILE_UNARY32(floor.w.s) + break; + case FLOORWD: + ASM_VOLATILE_UNARY32_DOUBLE(floor.w.d) + break; + case TRUNCWS: + ASM_VOLATILE_UNARY32(trunc.w.s) + break; + case TRUNCWD: + ASM_VOLATILE_UNARY32_DOUBLE(trunc.w.d) + break; + case CEILWS: + ASM_VOLATILE_UNARY32(ceil.w.s) + break; + case CEILWD: + ASM_VOLATILE_UNARY32_DOUBLE(ceil.w.d) + break; + case CVTDS: + ASM_VOLATILE_UNARY32(cvt.d.s) + break; + case CVTDW: + ASM_VOLATILE_UNARY32(cvt.d.w) + break; + case CVTSW: + ASM_VOLATILE_UNARY32(cvt.s.w) + break; + case CVTSD: + ASM_VOLATILE_UNARY32_DOUBLE(cvt.s.d) + break; + case CVTWS: + ASM_VOLATILE_UNARY32(cvt.w.s) + break; + case CVTWD: + ASM_VOLATILE_UNARY32_DOUBLE(cvt.w.d) + break; + case ROUNDWS: + ASM_VOLATILE_UNARY32(round.w.s) + break; +#if ((__mips == 32) && defined(__mips_isa_rev) && (__mips_isa_rev >= 2)) \ + || (__mips == 64) + case CEILLS: + ASM_VOLATILE_UNARY32(ceil.l.s) + break; + case CEILLD: + ASM_VOLATILE_UNARY32_DOUBLE(ceil.l.d) + break; + case CVTDL: + ASM_VOLATILE_UNARY32_DOUBLE(cvt.d.l) + break; + case CVTLS: + ASM_VOLATILE_UNARY32(cvt.l.s) + break; + case CVTLD: + ASM_VOLATILE_UNARY32_DOUBLE(cvt.l.d) + break; + case CVTSL: + ASM_VOLATILE_UNARY32_DOUBLE(cvt.s.l) + break; + case FLOORLS: + ASM_VOLATILE_UNARY32(floor.l.s) + break; + case FLOORLD: + ASM_VOLATILE_UNARY32_DOUBLE(floor.l.d) + break; + case ROUNDLS: + ASM_VOLATILE_UNARY32(round.l.s) + break; + case ROUNDLD: + ASM_VOLATILE_UNARY32_DOUBLE(round.l.d) + break; + case TRUNCLS: + ASM_VOLATILE_UNARY32(trunc.l.s) + break; + case TRUNCLD: + ASM_VOLATILE_UNARY32_DOUBLE(trunc.l.d) + break; +#endif + case ADDS: + ASM_VOLATILE_BINARY32(add.s) + break; + case ADDD: + ASM_VOLATILE_BINARY32_DOUBLE(add.d) + break; + case SUBS: + ASM_VOLATILE_BINARY32(sub.s) + break; + case SUBD: + ASM_VOLATILE_BINARY32_DOUBLE(sub.d) + break; + case DIVS: + ASM_VOLATILE_BINARY32(div.s) + break; + default: + vassert(0); + break; + } +#endif + return ret; +} + +/* TODO: Add cases for all fpu instructions because all fpu instructions are + change the value of FCSR register. 
*/ +extern UInt mips_dirtyhelper_calculate_FCSR_fp64 ( void* gs, UInt fs, UInt ft, + flt_op inst ) +{ + UInt ret = 0; +#if defined(__mips__) #if defined(VGA_mips32) VexGuestMIPS32State* guest_state = (VexGuestMIPS32State*)gs; - UInt *addr = (UInt *)&guest_state->guest_f0; - UInt loVal = addr[fs]; - UInt hiVal = addr[fs+1]; -#define ASM_VOLATILE_ROUND(fs, inst) ASM_VOLATILE_ROUND32(fs, inst) -#define ASM_VOLATILE_ROUND_DOUBLE(fs, inst) ASM_VOLATILE_ROUND32_DOUBLE(fs, inst) #else VexGuestMIPS64State* guest_state = (VexGuestMIPS64State*)gs; +#endif ULong *addr = (ULong *)&guest_state->guest_f0; -#define ASM_VOLATILE_ROUND(fs, inst) ASM_VOLATILE_ROUND64(fs, inst) -#define ASM_VOLATILE_ROUND_DOUBLE(fs, inst) ASM_VOLATILE_ROUND64(fs, inst) -#endif - UInt fcsr = guest_state->guest_FCSR; + UInt fcsr = guest_state->guest_FCSR; switch (inst) { case ROUNDWD: - ASM_VOLATILE_ROUND_DOUBLE(fs, round.w.d) + ASM_VOLATILE_UNARY64(round.w.d) break; case FLOORWS: - ASM_VOLATILE_ROUND(fs, floor.w.s) + ASM_VOLATILE_UNARY64(floor.w.s) break; case FLOORWD: - ASM_VOLATILE_ROUND_DOUBLE(fs, floor.w.d) + ASM_VOLATILE_UNARY64(floor.w.d) break; case TRUNCWS: - ASM_VOLATILE_ROUND(fs, trunc.w.s) + ASM_VOLATILE_UNARY64(trunc.w.s) break; case TRUNCWD: - ASM_VOLATILE_ROUND_DOUBLE(fs, trunc.w.d) + ASM_VOLATILE_UNARY64(trunc.w.d) break; case CEILWS: - ASM_VOLATILE_ROUND(fs, ceil.w.s) + ASM_VOLATILE_UNARY64(ceil.w.s) break; case CEILWD: - ASM_VOLATILE_ROUND_DOUBLE(fs, ceil.w.d) + ASM_VOLATILE_UNARY64(ceil.w.d) break; case CVTDS: - ASM_VOLATILE_ROUND(fs, cvt.d.s) + ASM_VOLATILE_UNARY64(cvt.d.s) break; case CVTDW: - ASM_VOLATILE_ROUND(fs, cvt.d.w) + ASM_VOLATILE_UNARY64(cvt.d.w) break; case CVTSW: - ASM_VOLATILE_ROUND(fs, cvt.s.w) + ASM_VOLATILE_UNARY64(cvt.s.w) break; case CVTSD: - ASM_VOLATILE_ROUND_DOUBLE(fs, cvt.s.d) + ASM_VOLATILE_UNARY64(cvt.s.d) break; case CVTWS: - ASM_VOLATILE_ROUND(fs, cvt.w.s) + ASM_VOLATILE_UNARY64(cvt.w.s) break; case CVTWD: - ASM_VOLATILE_ROUND_DOUBLE(fs, cvt.w.d) + ASM_VOLATILE_UNARY64(cvt.w.d) break; case ROUNDWS: - ASM_VOLATILE_ROUND(fs, round.w.s) + ASM_VOLATILE_UNARY64(round.w.s) break; #if ((__mips == 32) && defined(__mips_isa_rev) && (__mips_isa_rev >= 2)) \ || (__mips == 64) case CEILLS: - ASM_VOLATILE_ROUND(fs, ceil.l.s) + ASM_VOLATILE_UNARY64(ceil.l.s) break; case CEILLD: - ASM_VOLATILE_ROUND_DOUBLE(fs, ceil.l.d) + ASM_VOLATILE_UNARY64(ceil.l.d) break; case CVTDL: - ASM_VOLATILE_ROUND_DOUBLE(fs, cvt.d.l) + ASM_VOLATILE_UNARY64(cvt.d.l) break; case CVTLS: - ASM_VOLATILE_ROUND(fs, cvt.l.s) + ASM_VOLATILE_UNARY64(cvt.l.s) break; case CVTLD: - ASM_VOLATILE_ROUND_DOUBLE(fs, cvt.l.d) + ASM_VOLATILE_UNARY64(cvt.l.d) break; case CVTSL: - ASM_VOLATILE_ROUND_DOUBLE(fs, cvt.s.l) + ASM_VOLATILE_UNARY64(cvt.s.l) break; case FLOORLS: - ASM_VOLATILE_ROUND(fs, floor.l.s) + ASM_VOLATILE_UNARY64(floor.l.s) break; case FLOORLD: - ASM_VOLATILE_ROUND_DOUBLE(fs, floor.l.d) + ASM_VOLATILE_UNARY64(floor.l.d) break; case ROUNDLS: - ASM_VOLATILE_ROUND(fs, round.l.s) + ASM_VOLATILE_UNARY64(round.l.s) break; case ROUNDLD: - ASM_VOLATILE_ROUND_DOUBLE(fs, round.l.d) + ASM_VOLATILE_UNARY64(round.l.d) break; case TRUNCLS: - ASM_VOLATILE_ROUND(fs, trunc.l.s) + ASM_VOLATILE_UNARY64(trunc.l.s) break; case TRUNCLD: - ASM_VOLATILE_ROUND_DOUBLE(fs, trunc.l.d) + ASM_VOLATILE_UNARY64(trunc.l.d) break; #endif + case ADDS: + ASM_VOLATILE_BINARY64(add.s) + break; + case ADDD: + ASM_VOLATILE_BINARY64(add.d) + break; + case SUBS: + ASM_VOLATILE_BINARY64(sub.s) + break; + case SUBD: + ASM_VOLATILE_BINARY64(sub.d) + break; + case 
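/* Editor's sketch (not part of the patch): the *_calculate_FCSR_* dirty
   helpers here work by running the floating-point instruction natively
   under the guest's FCSR and reading the resulting status word back.  A
   rough portable analogue of that idea using C99 <fenv.h> instead of the
   MIPS inline assembly above; the helper name and the use of host
   exception flags as a stand-in for the FCSR cause/flag bits are
   assumptions made purely for illustration. */
#include <fenv.h>

static int sketch_native_fp_status ( float a, float b, float* res )
{
   /* Start from a clean exception state, run the operation on the host
      FPU, then harvest whatever flags it raised. */
   feclearexcept(FE_ALL_EXCEPT);
   *res = a + b;                        /* e.g. the add.s case */
   return fetestexcept(FE_ALL_EXCEPT);
}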
DIVS: + ASM_VOLATILE_BINARY64(div.s) + break; default: vassert(0); break; Index: priv/guest_mips_toIR.c =================================================================== --- priv/guest_mips_toIR.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/guest_mips_toIR.c (.../trunk) (revision 2863) @@ -72,6 +72,9 @@ disInstr_MIPS below. */ static Bool mode64 = False; +/* CPU has FPU and 32 dbl. prec. FP registers. */ +static Bool fp_mode64 = False; + /* Define 1.0 in single and double precision. */ #define ONE_SINGLE 0x3F800000 #define ONE_DOUBLE 0x3FF0000000000000ULL @@ -466,6 +469,13 @@ assign(t1, binop(Iop_Add64, getIReg(rs), \ mkU64(extend_s_16to64(imm)))); \ +#define LOADX_STORE_PATTERN \ + t1 = newTemp(mode64 ? Ity_I64 : Ity_I32); \ + if(!mode64) \ + assign(t1, binop(Iop_Add32, getIReg(regRs), getIReg(regRt))); \ + else \ + assign(t1, binop(Iop_Add64, getIReg(regRs), getIReg(regRt))); + #define LWX_SWX_PATTERN64 \ t2 = newTemp(Ity_I64); \ assign(t2, binop(Iop_And64, mkexpr(t1), mkU64(0xFFFFFFFFFFFFFFFCULL))); \ @@ -534,6 +544,11 @@ binop(Iop_Shr32, getFCSR(), mkU8(24+cc))), \ mkU32(0x1))); +#define ILLEGAL_INSTRUCTON \ + putPC(mkU32(guest_PC_curr_instr + 4)); \ + dres.jk_StopHere = Ijk_SigILL; \ + dres.whatNext = Dis_StopHere; + /*------------------------------------------------------------*/ /*--- Field helpers ---*/ /*------------------------------------------------------------*/ @@ -1090,38 +1105,84 @@ /* fs - fpu source register number. inst - fpu instruction that needs to be executed. - sz32 - size of source register. */ -static void calculateFCSR(UInt fs, UInt inst, Bool sz32) + sz32 - size of source register. + opN - number of operads: + 1 - unary operation. + 2 - binary operation. */ +static void calculateFCSR(UInt fs, UInt ft, UInt inst, Bool sz32, UInt opN) { IRDirty *d; IRTemp fcsr = newTemp(Ity_I32); - /* IRExpr_BBPTR() => Need to pass pointer to guest - state to helper. */ - d = unsafeIRDirty_1_N(fcsr, 0, - "mips_dirtyhelper_calculate_FCSR", - &mips_dirtyhelper_calculate_FCSR, - mkIRExprVec_3(IRExpr_BBPTR(), - mkU32(fs), - mkU32(inst))); - - /* Declare we're reading guest state. */ - if (!mode64 && !sz32) - d->nFxState = 2; + /* IRExpr_BBPTR() => Need to pass pointer to guest state to helper. */ + if (fp_mode64) + d = unsafeIRDirty_1_N(fcsr, 0, + "mips_dirtyhelper_calculate_FCSR_fp64", + &mips_dirtyhelper_calculate_FCSR_fp64, + mkIRExprVec_4(IRExpr_BBPTR(), + mkU32(fs), + mkU32(ft), + mkU32(inst))); else - d->nFxState = 1; - vex_bzero(&d->fxState, sizeof(d->fxState)); + d = unsafeIRDirty_1_N(fcsr, 0, + "mips_dirtyhelper_calculate_FCSR_fp32", + &mips_dirtyhelper_calculate_FCSR_fp32, + mkIRExprVec_4(IRExpr_BBPTR(), + mkU32(fs), + mkU32(ft), + mkU32(inst))); - d->fxState[0].fx = Ifx_Read; /* read */ - d->fxState[0].offset = floatGuestRegOffset(fs); - if (mode64) - d->fxState[0].size = sizeof(ULong); - else + if (opN == 1) { /* Unary operation. */ + /* Declare we're reading guest state. 
*/ + if (sz32 || fp_mode64) + d->nFxState = 2; + else + d->nFxState = 3; + vex_bzero(&d->fxState, sizeof(d->fxState)); + + d->fxState[0].fx = Ifx_Read; /* read */ + if (mode64) + d->fxState[0].offset = offsetof(VexGuestMIPS64State, guest_FCSR); + else + d->fxState[0].offset = offsetof(VexGuestMIPS32State, guest_FCSR); d->fxState[0].size = sizeof(UInt); + d->fxState[1].fx = Ifx_Read; /* read */ + d->fxState[1].offset = floatGuestRegOffset(fs); + d->fxState[1].size = sizeof(ULong); - if (!mode64 && !sz32) { + if (!(sz32 || fp_mode64)) { + d->fxState[2].fx = Ifx_Read; /* read */ + d->fxState[2].offset = floatGuestRegOffset(fs+1); + d->fxState[2].size = sizeof(ULong); + } + } else if (opN == 2) { /* Binary operation. */ + /* Declare we're reading guest state. */ + if (sz32 || fp_mode64) + d->nFxState = 3; + else + d->nFxState = 5; + vex_bzero(&d->fxState, sizeof(d->fxState)); + + d->fxState[0].fx = Ifx_Read; /* read */ + if (mode64) + d->fxState[0].offset = offsetof(VexGuestMIPS64State, guest_FCSR); + else + d->fxState[0].offset = offsetof(VexGuestMIPS32State, guest_FCSR); + d->fxState[0].size = sizeof(UInt); d->fxState[1].fx = Ifx_Read; /* read */ - d->fxState[1].offset = floatGuestRegOffset(fs+1); - d->fxState[1].size = sizeof(UInt); + d->fxState[1].offset = floatGuestRegOffset(fs); + d->fxState[1].size = sizeof(ULong); + d->fxState[2].fx = Ifx_Read; /* read */ + d->fxState[2].offset = floatGuestRegOffset(ft); + d->fxState[2].size = sizeof(ULong); + + if (!(sz32 || fp_mode64)) { + d->fxState[3].fx = Ifx_Read; /* read */ + d->fxState[3].offset = floatGuestRegOffset(fs+1); + d->fxState[3].size = sizeof(ULong); + d->fxState[4].fx = Ifx_Read; /* read */ + d->fxState[4].offset = floatGuestRegOffset(ft+1); + d->fxState[4].size = sizeof(ULong); + } } stmt(IRStmt_Dirty(d)); @@ -1146,6 +1207,12 @@ stmt(IRStmt_Put(integerGuestRegOffset(archreg), e)); } +static IRExpr *mkNarrowTo32(IRType ty, IRExpr * src) +{ + vassert(ty == Ity_I32 || ty == Ity_I64); + return ty == Ity_I64 ? unop(Iop_64to32, src) : src; +} + static void putLO(IRExpr * e) { if (mode64) { @@ -1239,12 +1306,6 @@ return 0; } -static IRExpr *mkNarrowTo32(IRType ty, IRExpr * src) -{ - vassert(ty == Ity_I32 || ty == Ity_I64); - return ty == Ity_I64 ? unop(Iop_64to32, src) : src; -} - static IRExpr *getLoFromF64(IRType ty, IRExpr * src) { vassert(ty == Ity_F32 || ty == Ity_F64); @@ -1340,21 +1401,21 @@ (UInt) branch_offset), OFFB_PC); } -static IRExpr *getFReg(UInt dregNo) +static IRExpr *getFReg(UInt fregNo) { - vassert(dregNo < 32); - IRType ty = mode64 ? Ity_F64 : Ity_F32; - return IRExpr_Get(floatGuestRegOffset(dregNo), ty); + vassert(fregNo < 32); + IRType ty = fp_mode64 ? Ity_F64 : Ity_F32; + return IRExpr_Get(floatGuestRegOffset(fregNo), ty); } static IRExpr *getDReg(UInt dregNo) { - if (mode64) { - vassert(dregNo < 32); - IRType ty = Ity_F64; - return IRExpr_Get(floatGuestRegOffset(dregNo), ty); + vassert(dregNo < 32); + if (fp_mode64) { + return IRExpr_Get(floatGuestRegOffset(dregNo), Ity_F64); } else { - vassert(dregNo < 32); + /* Read a floating point register pair and combine their contents into a + 64-bit value */ IRTemp t0 = newTemp(Ity_F32); IRTemp t1 = newTemp(Ity_F32); IRTemp t2 = newTemp(Ity_F64); @@ -1377,7 +1438,7 @@ static void putFReg(UInt dregNo, IRExpr * e) { vassert(dregNo < 32); - IRType ty = mode64 ? Ity_F64 : Ity_F32; + IRType ty = fp_mode64 ? 
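/* Editor's sketch (not part of the patch): when fp_mode64 is false the
   guest has 32-bit FPRs, so getDReg above assembles a 64-bit double from
   an even/odd register pair.  The same recombination in plain C,
   assuming the usual FR=0 pairing where the even register holds the low
   word; the helper name is hypothetical. */
#include <stdint.h>
#include <string.h>

static double sketch_pair_to_double ( uint32_t even_fpr, uint32_t odd_fpr )
{
   uint64_t bits = ((uint64_t)odd_fpr << 32) | even_fpr;
   double   d;
   memcpy(&d, &bits, sizeof d);   /* bit-for-bit reinterpretation */
   return d;
}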
Ity_F64 : Ity_F32; vassert(typeOfIRExpr(irsb->tyenv, e) == ty); stmt(IRStmt_Put(floatGuestRegOffset(dregNo), e)); } @@ -1384,7 +1445,7 @@ static void putDReg(UInt dregNo, IRExpr * e) { - if (mode64) { + if (fp_mode64) { vassert(dregNo < 32); IRType ty = Ity_F64; vassert(typeOfIRExpr(irsb->tyenv, e) == ty); @@ -1642,22 +1703,22 @@ static const HChar* showCondCode(UInt code) { const HChar* ret; switch (code) { - case 0: ret = "F"; break; - case 1: ret = "UN"; break; - case 2: ret = "EQ"; break; - case 3: ret = "UEQ"; break; - case 4: ret = "OLT"; break; - case 5: ret = "ULT"; break; - case 6: ret = "OLE"; break; - case 7: ret = "ULE"; break; - case 8: ret = "SF"; break; - case 9: ret = "NGLE"; break; - case 10: ret = "SEQ"; break; - case 11: ret = "NGL"; break; - case 12: ret = "LT"; break; - case 13: ret = "NGE"; break; - case 14: ret = "LE"; break; - case 15: ret = "NGT"; break; + case 0: ret = "f"; break; + case 1: ret = "un"; break; + case 2: ret = "eq"; break; + case 3: ret = "ueq"; break; + case 4: ret = "olt"; break; + case 5: ret = "ult"; break; + case 6: ret = "ole"; break; + case 7: ret = "ule"; break; + case 8: ret = "sf"; break; + case 9: ret = "ngle"; break; + case 10: ret = "seq"; break; + case 11: ret = "ngl"; break; + case 12: ret = "lt"; break; + case 13: ret = "nge"; break; + case 14: ret = "le"; break; + case 15: ret = "ngt"; break; default: vpanic("showCondCode"); break; } return ret; @@ -1678,8 +1739,8 @@ UInt fpc_cc = get_fpc_cc(cins); switch (fmt) { case 0x10: { /* C.cond.S */ - DIP("C.%s.S %d, f%d, f%d", showCondCode(cond), fpc_cc, fs, ft); - if (mode64) { + DIP("c.%s.s %d, f%d, f%d", showCondCode(cond), fpc_cc, fs, ft); + if (fp_mode64) { t0 = newTemp(Ity_I32); t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32); @@ -1694,7 +1755,8 @@ getFReg(ft)))); assign(ccIR, binop(Iop_CmpF64, mkexpr(tmp5), mkexpr(tmp6))); - putHI(mkWidenFrom32(Ity_I64, mkexpr(ccIR), True)); + putHI(mkWidenFrom32(mode64 ? Ity_I64: Ity_I32, + mkexpr(ccIR), True)); /* Map compare result from IR to MIPS FP cmp result | MIPS | IR -------------------------- @@ -1711,7 +1773,8 @@ binop(Iop_And32, binop(Iop_Xor32, mkexpr(ccIR), binop(Iop_Shr32, mkexpr(ccIR), mkU8(6))), mkU32(1)))))); - putLO(mkWidenFrom32(Ity_I64, mkexpr(ccMIPS), True)); + putLO(mkWidenFrom32(mode64 ? Ity_I64: Ity_I32, + mkexpr(ccMIPS), True)); /* UN */ assign(t0, binop(Iop_And32, mkexpr(ccMIPS), mkU32(0x1))); @@ -1885,7 +1948,7 @@ break; case 0x11: { /* C.cond.D */ - DIP("C.%s.D %d, f%d, f%d", showCondCode(cond), fpc_cc, fs, ft); + DIP("c.%s.d %d, f%d, f%d", showCondCode(cond), fpc_cc, fs, ft); t0 = newTemp(Ity_I32); t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32); @@ -2110,6 +2173,7 @@ static Bool dis_instr_CVM ( UInt theInstr ) { UChar opc2 = get_function(theInstr); + UChar opc1 = get_opcode(theInstr); UChar regRs = get_rs(theInstr); UChar regRt = get_rt(theInstr); UChar regRd = get_rd(theInstr); @@ -2120,99 +2184,184 @@ IRTemp tmp = newTemp(ty); IRTemp tmpRs = newTemp(ty); IRTemp tmpRt = newTemp(ty); + IRTemp t1 = newTemp(ty); UInt size; assign(tmpRs, getIReg(regRs)); - switch(opc2) { - case 0x03: { /* DMUL rd, rs, rt */ - DIP("dmul r%d, r%d, r%d", regRd, regRs, regRt); - IRType t0 = newTemp(Ity_I128); - assign(t0, binop(Iop_MullU64, getIReg(regRs), getIReg(regRt))); - putIReg(regRd, unop(Iop_128to64, mkexpr(t0))); - break; - } - case 0x32: /* 5. 
CINS rd, rs, p, lenm1 */ - DIP("cins r%u, r%u, %d, %d\n", regRt, regRs, p, lenM1); - assign ( tmp , binop(Iop_Shl64, mkexpr(tmpRs), mkU8(64-( lenM1+1 )))); - assign ( tmpRt, binop(Iop_Shr64, mkexpr( tmp ), mkU8(64-(p+lenM1+1)))); - putIReg( regRt, mkexpr(tmpRt)); - break; + switch(opc1){ + case 0x1C: { + switch(opc2) { + case 0x03: { /* DMUL rd, rs, rt */ + DIP("dmul r%d, r%d, r%d", regRd, regRs, regRt); + IRType t0 = newTemp(Ity_I128); + assign(t0, binop(Iop_MullU64, getIReg(regRs), getIReg(regRt))); + putIReg(regRd, unop(Iop_128to64, mkexpr(t0))); + break; + } - case 0x33: /* 6. CINS32 rd, rs, p+32, lenm1 */ - DIP("cins32 r%u, r%u, %d, %d\n", regRt, regRs, p+32, lenM1); - assign ( tmp , binop(Iop_Shl64, mkexpr(tmpRs), mkU8(64-( lenM1+1 )))); - assign ( tmpRt, binop(Iop_Shr64, mkexpr( tmp ), mkU8(32-(p+lenM1+1)))); - putIReg( regRt, mkexpr(tmpRt)); - break; + case 0x32: /* 5. CINS rd, rs, p, lenm1 */ + DIP("cins r%u, r%u, %d, %d\n", regRt, regRs, p, lenM1); + assign ( tmp , binop(Iop_Shl64, mkexpr(tmpRs), + mkU8(64-( lenM1+1 )))); + assign ( tmpRt, binop(Iop_Shr64, mkexpr( tmp ), + mkU8(64-(p+lenM1+1)))); + putIReg( regRt, mkexpr(tmpRt)); + break; - case 0x3A: /* 3. EXTS rt, rs, p len */ - DIP("exts r%u, r%u, %d, %d\n", regRt, regRs, p, lenM1); - size = lenM1 + 1; /* lenm1+1 */ - UChar lsAmt = 64 - (p + size); /* p+lenm1+1 */ - UChar rsAmt = 64 - size; /* lenm1+1 */ - tmp = newTemp(Ity_I64); - assign(tmp, binop(Iop_Shl64, mkexpr(tmpRs), mkU8(lsAmt))); - putIReg(regRt, binop(Iop_Sar64, mkexpr(tmp), mkU8(rsAmt))); - break; + case 0x33: /* 6. CINS32 rd, rs, p+32, lenm1 */ + DIP("cins32 r%u, r%u, %d, %d\n", regRt, regRs, p+32, lenM1); + assign ( tmp , binop(Iop_Shl64, mkexpr(tmpRs), + mkU8(64-( lenM1+1 )))); + assign ( tmpRt, binop(Iop_Shr64, mkexpr( tmp ), + mkU8(32-(p+lenM1+1)))); + putIReg( regRt, mkexpr(tmpRt)); + break; - case 0x3B: /* 4. EXTS32 rt, rs, p len */ - DIP("exts32 r%u, r%u, %d, %d\n", regRt, regRs, p, lenM1); - assign ( tmp , binop(Iop_Shl64, mkexpr(tmpRs), mkU8(32-(p+lenM1+1)))); - assign ( tmpRt, binop(Iop_Sar64, mkexpr(tmp) , mkU8(64-(lenM1+1))) ); - putIReg( regRt, mkexpr(tmpRt)); - break; + case 0x3A: /* 3. EXTS rt, rs, p len */ + DIP("exts r%u, r%u, %d, %d\n", regRt, regRs, p, lenM1); + size = lenM1 + 1; /* lenm1+1 */ + UChar lsAmt = 64 - (p + size); /* p+lenm1+1 */ + UChar rsAmt = 64 - size; /* lenm1+1 */ + tmp = newTemp(Ity_I64); + assign(tmp, binop(Iop_Shl64, mkexpr(tmpRs), mkU8(lsAmt))); + putIReg(regRt, binop(Iop_Sar64, mkexpr(tmp), mkU8(rsAmt))); + break; - case 0x2B: /* 20. SNE rd, rs, rt */ - DIP("sne r%d, r%d, r%d", regRd,regRs, regRt); - if (mode64) - putIReg(regRd, unop(Iop_1Uto64, binop(Iop_CmpNE64, getIReg(regRs), - getIReg(regRt)))); - else - putIReg(regRd,unop(Iop_1Uto32, binop(Iop_CmpNE32, getIReg(regRs), - getIReg(regRt)))); - break; + case 0x3B: /* 4. EXTS32 rt, rs, p len */ + DIP("exts32 r%u, r%u, %d, %d\n", regRt, regRs, p, lenM1); + assign ( tmp , binop(Iop_Shl64, mkexpr(tmpRs), + mkU8(32-(p+lenM1+1)))); + assign ( tmpRt, binop(Iop_Sar64, mkexpr(tmp), + mkU8(64-(lenM1+1))) ); + putIReg( regRt, mkexpr(tmpRt)); + break; - case 0x2A: /* Set Equals - SEQ; Cavium OCTEON */ - DIP("seq r%d, r%d, %d", regRd, regRs, regRt); - if (mode64) - putIReg(regRd, unop(Iop_1Uto64, - binop(Iop_CmpEQ64, getIReg(regRs), - getIReg(regRt)))); - else - putIReg(regRd, unop(Iop_1Uto32, - binop(Iop_CmpEQ32, getIReg(regRs), - getIReg(regRt)))); - break; + case 0x2B: /* 20. 
SNE rd, rs, rt */ + DIP("sne r%d, r%d, r%d", regRd,regRs, regRt); + if (mode64) + putIReg(regRd, unop(Iop_1Uto64, binop(Iop_CmpNE64, + getIReg(regRs), + getIReg(regRt)))); + else + putIReg(regRd,unop(Iop_1Uto32, binop(Iop_CmpNE32, + getIReg(regRs), + getIReg(regRt)))); + break; - case 0x2E: /* Set Equals Immediate - SEQI; Cavium OCTEON */ - DIP("seqi r%d, r%d, %d", regRt, regRs, imm); - if (mode64) - putIReg(regRt, unop(Iop_1Uto64, - binop(Iop_CmpEQ64, getIReg(regRs), - mkU64(extend_s_10to64(imm))))); - else - putIReg(regRt, unop(Iop_1Uto32, - binop(Iop_CmpEQ32, getIReg(regRs), - mkU32(extend_s_10to32(imm))))); - break; + case 0x2A: /* Set Equals - SEQ; Cavium OCTEON */ + DIP("seq r%d, r%d, %d", regRd, regRs, regRt); + if (mode64) + putIReg(regRd, unop(Iop_1Uto64, + binop(Iop_CmpEQ64, getIReg(regRs), + getIReg(regRt)))); + else + putIReg(regRd, unop(Iop_1Uto32, + binop(Iop_CmpEQ32, getIReg(regRs), + getIReg(regRt)))); + break; - case 0x2F: /* Set Not Equals Immediate - SNEI; Cavium OCTEON */ - DIP("snei r%d, r%d, %d", regRt, regRs, imm); - if (mode64) - putIReg(regRt, unop(Iop_1Uto64, - binop(Iop_CmpNE64, - getIReg(regRs), - mkU64(extend_s_10to64(imm))))); - else - putIReg(regRt, unop(Iop_1Uto32, - binop(Iop_CmpNE32, - getIReg(regRs), - mkU32(extend_s_10to32(imm))))); + case 0x2E: /* Set Equals Immediate - SEQI; Cavium OCTEON */ + DIP("seqi r%d, r%d, %d", regRt, regRs, imm); + if (mode64) + putIReg(regRt, unop(Iop_1Uto64, + binop(Iop_CmpEQ64, getIReg(regRs), + mkU64(extend_s_10to64(imm))))); + else + putIReg(regRt, unop(Iop_1Uto32, + binop(Iop_CmpEQ32, getIReg(regRs), + mkU32(extend_s_10to32(imm))))); + break; + + case 0x2F: /* Set Not Equals Immediate - SNEI; Cavium OCTEON */ + DIP("snei r%d, r%d, %d", regRt, regRs, imm); + if (mode64) + putIReg(regRt, unop(Iop_1Uto64, + binop(Iop_CmpNE64, + getIReg(regRs), + mkU64(extend_s_10to64(imm))))); + else + putIReg(regRt, unop(Iop_1Uto32, + binop(Iop_CmpNE32, + getIReg(regRs), + mkU32(extend_s_10to32(imm))))); + break; + + default: + return False; + } break; - + } /* opc1 0x1C ends here*/ + case 0x1F:{ + switch(opc2) { + case 0x0A: { // lx - Load indexed instructions + switch (get_sa(theInstr)) { + case 0x00: { // LWX rd, index(base) + DIP("lwx r%d, r%d(r%d)", regRd, regRt, regRs); + LOADX_STORE_PATTERN; + putIReg(regRd, mkWidenFrom32(ty, load(Ity_I32, mkexpr(t1)), + True)); + break; + } + case 0x08: { // LDX rd, index(base) + DIP("ldx r%d, r%d(r%d)", regRd, regRt, regRs); + vassert(mode64); /* Currently Implemented only for n64 */ + LOADX_STORE_PATTERN; + putIReg(regRd, load(Ity_I64, mkexpr(t1))); + break; + } + case 0x06: { // LBUX rd, index(base) + DIP("lbux r%d, r%d(r%d)", regRd, regRt, regRs); + LOADX_STORE_PATTERN; + if (mode64) + putIReg(regRd, unop(Iop_8Uto64, load(Ity_I8, + mkexpr(t1)))); + else + putIReg(regRd, unop(Iop_8Uto32, load(Ity_I8, + mkexpr(t1)))); + break; + } + case 0x10: { // LWUX rd, index(base) (Cavium OCTEON) + DIP("lwux r%d, r%d(r%d)", regRd, regRt, regRs); + LOADX_STORE_PATTERN; /* same for both 32 and 64 modes*/ + putIReg(regRd, mkWidenFrom32(ty, load(Ity_I32, mkexpr(t1)), + False)); + break; + } + case 0x14: { // LHUX rd, index(base) (Cavium OCTEON) + DIP("lhux r%d, r%d(r%d)", regRd, regRt, regRs); + LOADX_STORE_PATTERN; + if (mode64) + putIReg(regRd, + unop(Iop_16Uto64, load(Ity_I16, mkexpr(t1)))); + else + putIReg(regRd, + unop(Iop_16Uto32, load(Ity_I16, mkexpr(t1)))); + break; + } + case 0x16: { // LBX rd, index(base) (Cavium OCTEON) + DIP("lbx r%d, r%d(r%d)", regRd, regRs, regRt); + LOADX_STORE_PATTERN; + if 
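/* Editor's sketch (not part of the patch): the Octeon LWX/LDX/LBUX/LHUX/LBX
   cases here all follow LOADX_STORE_PATTERN -- the effective address is
   simply GPR[base] + GPR[index], and the loaded value is then zero- or
   sign-extended to the register width.  A hypothetical C rendering,
   assuming a 64-bit guest: */
#include <stdint.h>

static int64_t sketch_lbx ( const int8_t* mem, uint64_t base, uint64_t index )
{
   uint64_t ea = base + index;    /* LOADX_STORE_PATTERN                */
   return (int64_t)mem[ea];       /* LBX: signed byte, sign-extended    */
}

static uint64_t sketch_lbux ( const uint8_t* mem, uint64_t base, uint64_t index )
{
   uint64_t ea = base + index;
   return (uint64_t)mem[ea];      /* LBUX: unsigned byte, zero-extended */
}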
(mode64) + putIReg(regRd, + unop(Iop_8Sto64, load(Ity_I8, mkexpr(t1)))); + else + putIReg(regRd, + unop(Iop_8Sto32, load(Ity_I8, mkexpr(t1)))); + break; + } + default: + vex_printf("\nUnhandled LX instruction opc3 = %x\n", + get_sa(theInstr)); + return False; + } + break; + } + } /* opc1 = 0x1F & opc2 = 0xA (LX) ends here*/ + break; + } /* opc1 = 0x1F ends here*/ default: - return False; - } + return False; + } /* main opc1 switch ends here */ return True; } @@ -2223,7 +2372,7 @@ static UInt disDSPInstr_MIPS_WRK ( UInt cins ) { IRTemp t0, t1 = 0, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, - t15, t16, t17, t18; + t15, t16, t17; UInt opcode, rs, rt, rd, sa, function, ac, ac_mfhilo, rddsp_mask, wrdsp_mask, dsp_imm, shift; @@ -2875,40 +3024,123 @@ t0 = newTemp(Ity_I64); t1 = newTemp(Ity_I64); t2 = newTemp(Ity_I32); - t3 = newTemp(Ity_I32); - t4 = newTemp(Ity_I32); + t3 = newTemp(Ity_I1); + t4 = newTemp(Ity_I1); + t5 = newTemp(Ity_I1); + t6 = newTemp(Ity_I1); + t7 = newTemp(Ity_I32); + t8 = newTemp(Ity_I64); + t9 = newTemp(Ity_I64); + t10 = newTemp(Ity_I1); + t11 = newTemp(Ity_I1); + t12 = newTemp(Ity_I1); + t13 = newTemp(Ity_I1); + t14 = newTemp(Ity_I32); assign(t0, getAcc(ac)); - assign(t1, binop(Iop_Sar64, mkexpr(t0), mkU8(rs))); - putIReg(rt, unop(Iop_64to32, mkexpr(t1))); + if (0 == rs) { + assign(t1, mkexpr(t0)); + } else { + assign(t1, binop(Iop_Sar64, mkexpr(t0), mkU8(rs))); + } + /* Check if bits 63..31 of the result in t1 aren't 0. */ + assign(t3, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t1)), + mkU32(0))); + assign(t4, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t1)), + mkU32(0x80000000)), + mkU32(0))); + /* Check if bits 63..31 of the result in t1 aren't + 0x1ffffffff. */ + assign(t5, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t1)), + mkU32(0xffffffff))); + assign(t6, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t1)), + mkU32(0x80000000)), + mkU32(0x80000000))); + /* If bits 63..31 aren't 0 nor 0x1ffffffff, set DSP + control register. */ + assign(t7, binop(Iop_And32, + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t3)), + unop(Iop_1Sto32, mkexpr(t4))), + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t5)), + unop(Iop_1Sto32, mkexpr(t6))))); + putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, + mkexpr(t7), + mkU32(0)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x00800000)), + getDSPControl())); - assign(t2, binop(Iop_Or32, - getDSPControl(), mkU32(0x00800000))); + /* If the last discarded bit is 1, there would be carry + when rounding, otherwise there wouldn't. We use that + fact and just add the value of the last discarded bit + to the least sifgnificant bit of the shifted value + from acc. */ + if (0 == rs) { + assign(t8, mkU64(0x0ULL)); + } else { + assign(t8, binop(Iop_And64, + binop(Iop_Shr64, + mkexpr(t0), + mkU8(rs-1)), + mkU64(0x1ULL))); + } + assign(t9, binop(Iop_Add64, mkexpr(t1), mkexpr(t8))); - /* Check if signOut == signIn */ - assign(t3, IRExpr_ITE(binop(Iop_CmpEQ32, - binop(Iop_And32, - unop(Iop_64HIto32, - mkexpr(t0)), - mkU32(0x80000000)), - binop(Iop_And32, - getIReg(rt), - mkU32(0x80000000))), - getDSPControl(), - mkexpr(t2))); + /* Repeat previous steps for the rounded value. 
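/* Editor's sketch (not part of the patch): the reworked EXTR/EXTRV family
   here repeats one pattern -- shift the 64-bit accumulator right, flag an
   overflow in DSPControl (bit 23, 0x00800000) unless bits 63..31 of the
   result are all zeros or all ones, and form the rounded variant by adding
   back the last bit that was shifted out.  The same logic in plain C
   (hypothetical helper; shift amount assumed already masked to 0..31 and
   arithmetic right shift assumed for signed values): */
#include <stdint.h>

static int sketch_fits_in_32 ( int64_t v )
{
   /* True iff bits 63..31 are all 0 or all 1, i.e. v is representable
      as a signed 32-bit value. */
   return (v >> 31) == 0 || (v >> 31) == -1;
}

static int32_t sketch_extr_r_w ( int64_t acc, unsigned shift, uint32_t* dspctl )
{
   int64_t shifted = shift ? (acc >> shift) : acc;
   int64_t lastbit = shift ? ((acc >> (shift - 1)) & 1) : 0;
   int64_t rounded = shifted + lastbit;
   if (!sketch_fits_in_32(shifted) || !sketch_fits_in_32(rounded))
      *dspctl |= 0x00800000;      /* overflow flag, as in the IR here   */
   /* Plain EXTR.W returns the low 32 bits of `shifted`; the _R variant
      returns the rounded value. */
   return (int32_t)rounded;
}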
*/ + assign(t10, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t9)), + mkU32(0))); + assign(t11, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t9)), + mkU32(0x80000000)), + mkU32(0))); - assign(t4, IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - mkexpr(t1)), - mkU32(0x0)), - IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - mkexpr(t1)), - mkU32(0xffffffff)), - mkexpr(t2), - mkexpr(t3)), - mkexpr(t3))); - putDSPControl(mkexpr(t4)); + assign(t12, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t9)), + mkU32(0xffffffff))); + assign(t13, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t9)), + mkU32(0x80000000)), + mkU32(0x80000000))); + + assign(t14, binop(Iop_And32, + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t10)), + unop(Iop_1Sto32, mkexpr(t11))), + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t12)), + unop(Iop_1Sto32, mkexpr(t13))))); + putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, + mkexpr(t14), + mkU32(0)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x00800000)), + getDSPControl())); + if (0 == rs) { + putIReg(rt, unop(Iop_64to32, mkexpr(t0))); + } else { + putIReg(rt, unop(Iop_64to32, mkexpr(t1))); + } break; } case 0x1: { /* EXTRV.W */ @@ -2917,43 +3149,133 @@ t0 = newTemp(Ity_I64); t1 = newTemp(Ity_I64); t2 = newTemp(Ity_I32); - t3 = newTemp(Ity_I32); + t3 = newTemp(Ity_I1); t4 = newTemp(Ity_I1); + t5 = newTemp(Ity_I1); + t6 = newTemp(Ity_I1); + t7 = newTemp(Ity_I32); + t8 = newTemp(Ity_I64); + t9 = newTemp(Ity_I64); + t10 = newTemp(Ity_I1); + t11 = newTemp(Ity_I1); + t12 = newTemp(Ity_I1); + t13 = newTemp(Ity_I1); + t14 = newTemp(Ity_I32); + t15 = newTemp(Ity_I8); + assign(t15, unop(Iop_32to8, + binop(Iop_And32, + getIReg(rs), + mkU32(0x1f)))); assign(t0, getAcc(ac)); - assign(t1, binop(Iop_Sar64, - mkexpr(t0), - unop(Iop_32to8, - binop(Iop_And32, - getIReg(rs), - mkU32(0x1f))))); - putIReg(rt, unop(Iop_64to32, mkexpr(t1))); + assign(t1, binop(Iop_Sar64, mkexpr(t0), mkexpr(t15))); + putIReg(rt, IRExpr_ITE(binop(Iop_CmpEQ32, + unop(Iop_8Uto32, + mkexpr(t15)), + mkU32(0)), + unop(Iop_64to32, mkexpr(t0)), + unop(Iop_64to32, mkexpr(t1)))); - assign(t2, binop(Iop_Or32, - getDSPControl(), mkU32(0x00800000))); - - /* Check if signOut == signIn */ - assign(t3, IRExpr_ITE(binop(Iop_CmpEQ32, - binop(Iop_And32, - unop(Iop_64HIto32, - mkexpr(t0)), - mkU32(0x80000000)), - binop(Iop_And32, - getIReg(rt), - mkU32(0x80000000))), - getDSPControl(), - mkexpr(t2))); + /* Check if bits 63..31 of the result in t1 aren't 0. */ + assign(t3, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t1)), + mkU32(0))); assign(t4, binop(Iop_CmpNE32, - unop(Iop_64HIto32, mkexpr(t1)), + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t1)), + mkU32(0x80000000)), + mkU32(0))); + /* Check if bits 63..31 of the result in t1 aren't + 0x1ffffffff. */ + assign(t5, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t1)), mkU32(0xffffffff))); + assign(t6, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t1)), + mkU32(0x80000000)), + mkU32(0x80000000))); + /* If bits 63..31 aren't 0 nor 0x1ffffffff, set DSP + control register. 
*/ + assign(t7, binop(Iop_And32, + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t3)), + unop(Iop_1Sto32, mkexpr(t4))), + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t5)), + unop(Iop_1Sto32, mkexpr(t6))))); putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - mkexpr(t1)), - mkU32(0x0)), - IRExpr_ITE(mkexpr(t4), - mkexpr(t2), - mkexpr(t3)), - mkexpr(t3))); + mkexpr(t7), + mkU32(0)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x00800000)), + getDSPControl())); + + /* If the last discarded bit is 1, there would be carry + when rounding, otherwise there wouldn't. We use that + fact and just add the value of the last discarded bit + to the least sifgnificant bit of the shifted value + from acc. */ + assign(t8, + IRExpr_ITE(binop(Iop_CmpEQ32, + unop(Iop_8Uto32, + mkexpr(t15)), + mkU32(0)), + mkU64(0x0ULL), + binop(Iop_And64, + binop(Iop_Shr64, + mkexpr(t0), + unop(Iop_32to8, + binop(Iop_Sub32, + unop(Iop_8Uto32, + mkexpr(t15)), + mkU32(1)))), + mkU64(0x1ULL)))); + + assign(t9, binop(Iop_Add64, mkexpr(t1), mkexpr(t8))); + + /* Repeat previous steps for the rounded value. */ + assign(t10, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t9)), + mkU32(0))); + assign(t11, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t9)), + mkU32(0x80000000)), + mkU32(0))); + + assign(t12, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t9)), + mkU32(0xffffffff))); + assign(t13, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t9)), + mkU32(0x80000000)), + mkU32(0x80000000))); + + assign(t14, binop(Iop_And32, + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t10)), + unop(Iop_1Sto32, mkexpr(t11))), + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t12)), + unop(Iop_1Sto32, mkexpr(t13))))); + putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, + mkexpr(t14), + mkU32(0)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x00800000)), + getDSPControl())); break; } case 0x2: { /* EXTP */ @@ -3140,61 +3462,131 @@ t0 = newTemp(Ity_I64); t1 = newTemp(Ity_I64); t2 = newTemp(Ity_I32); - t4 = newTemp(Ity_I32); - t5 = newTemp(Ity_I64); - t6 = newTemp(Ity_I64); + t3 = newTemp(Ity_I1); + t4 = newTemp(Ity_I1); + t5 = newTemp(Ity_I1); + t6 = newTemp(Ity_I1); + t7 = newTemp(Ity_I32); + t8 = newTemp(Ity_I64); + t9 = newTemp(Ity_I64); + t10 = newTemp(Ity_I1); + t11 = newTemp(Ity_I1); + t12 = newTemp(Ity_I1); + t13 = newTemp(Ity_I1); + t14 = newTemp(Ity_I32); + t15 = newTemp(Ity_I64); + t16 = newTemp(Ity_I1); assign(t0, getAcc(ac)); - if (0 == rs) { - putIReg(rt, unop(Iop_64to32, mkexpr(t0))); - } else { - assign(t1, binop(Iop_Sar64, mkexpr(t0), mkU8(rs))); + assign(t16, binop(Iop_CmpEQ32, + mkU32(rs), + mkU32(0))); + assign(t1, IRExpr_ITE(mkexpr(t16), + mkexpr(t0), + binop(Iop_Sar64, + mkexpr(t0), + mkU8(rs)))); + /* If the last discarded bit is 1, there would be carry + when rounding, otherwise there wouldn't. We use that + fact and just add the value of the last discarded bit + to the least significant bit of the shifted value + from acc. 
*/ + assign(t15, binop(Iop_Shr64, + mkexpr(t0), + unop(Iop_32to8, + binop(Iop_Sub32, + binop(Iop_And32, + mkU32(rs), + mkU32(0x1f)), + mkU32(1))))); - assign(t2, binop(Iop_Or32, - getDSPControl(), mkU32(0x800000))); + assign(t8, + IRExpr_ITE(mkexpr(t16), + mkU64(0x0ULL), + binop(Iop_And64, + mkexpr(t15), + mkU64(0x0000000000000001ULL)))); + assign(t9, binop(Iop_Add64, mkexpr(t1), mkexpr(t8))); + putIReg(rt, unop(Iop_64to32, mkexpr(t9))); - putDSPControl(IRExpr_ITE( - binop(Iop_CmpNE32, - unop(Iop_64HIto32, mkexpr(t1)), - mkU32(0x0)), - IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - mkexpr(t1)), - mkU32(0xffffffff)), - mkexpr(t2), - getDSPControl()), - getDSPControl())); + /* Check if bits 63..31 of the result in t1 aren't 0. */ + assign(t3, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t1)), + mkU32(0))); + assign(t4, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t1)), + mkU32(0x80000000)), + mkU32(0))); - assign(t4, binop(Iop_Or32, - getDSPControl(), mkU32(0x800000))); - /* If the last discarded bit is 1, there would be carry - when rounding, otherwise there wouldn't. We use that - fact and just add the value of the last discarded bit - to the least sifgnificant bit of the shifted value - from acc. */ - assign(t5, binop(Iop_Shr64, - binop(Iop_And64, - mkexpr(t0), - binop(Iop_Shl64, - mkU64(0x1ULL), - mkU8(rs-1))), - mkU8(rs-1))); + /* Check if bits 63..31 of the result in t1 aren't + 0x1ffffffff. */ + assign(t5, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t1)), + mkU32(0xffffffff))); + assign(t6, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t1)), + mkU32(0x80000000)), + mkU32(0x80000000))); + /* If bits 63..31 aren't 0 nor 0x1ffffffff, set DSP + control register. */ + assign(t7, binop(Iop_And32, + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t3)), + unop(Iop_1Sto32, mkexpr(t4))), + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t5)), + unop(Iop_1Sto32, mkexpr(t6))))); + putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, + mkexpr(t7), + mkU32(0)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x00800000)), + getDSPControl())); - assign(t6, binop(Iop_Add64, mkexpr(t1), mkexpr(t5))); + /* Repeat previous steps for the rounded value. 
*/ + assign(t10, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t9)), + mkU32(0))); + assign(t11, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t9)), + mkU32(0x80000000)), + mkU32(0))); - putDSPControl(IRExpr_ITE( - binop(Iop_CmpNE32, - unop(Iop_64HIto32, mkexpr(t6)), - mkU32(0x0)), - IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - mkexpr(t6)), - mkU32(0xffffffff)), - mkexpr(t4), - getDSPControl()), - getDSPControl())); - putIReg(rt, unop(Iop_64to32, mkexpr(t6))); - } + assign(t12, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t9)), + mkU32(0xffffffff))); + assign(t13, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t9)), + mkU32(0x80000000)), + mkU32(0x80000000))); + + assign(t14, binop(Iop_And32, + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t10)), + unop(Iop_1Sto32, mkexpr(t11))), + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t12)), + unop(Iop_1Sto32, mkexpr(t13))))); + putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, + mkexpr(t14), + mkU32(0)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x00800000)), + getDSPControl())); break; } case 0x5: { /* EXTRV_R.W */ @@ -3203,79 +3595,129 @@ t0 = newTemp(Ity_I64); t1 = newTemp(Ity_I64); t2 = newTemp(Ity_I32); - t4 = newTemp(Ity_I32); - t5 = newTemp(Ity_I64); - t6 = newTemp(Ity_I64); + t3 = newTemp(Ity_I1); + t4 = newTemp(Ity_I1); + t5 = newTemp(Ity_I1); + t6 = newTemp(Ity_I1); + t7 = newTemp(Ity_I32); + t8 = newTemp(Ity_I64); + t9 = newTemp(Ity_I64); + t10 = newTemp(Ity_I1); + t11 = newTemp(Ity_I1); + t12 = newTemp(Ity_I1); + t13 = newTemp(Ity_I1); + t14 = newTemp(Ity_I32); + t15 = newTemp(Ity_I8); + assign(t15, unop(Iop_32to8, + binop(Iop_And32, + getIReg(rs), + mkU32(0x1f)))); assign(t0, getAcc(ac)); + assign(t1, binop(Iop_Sar64, mkexpr(t0), mkexpr(t15))); - assign(t1, binop(Iop_Sar64, - mkexpr(t0), - unop(Iop_32to8, - binop(Iop_And32, - getIReg(rs), - mkU32(0x1f))))); + /* Check if bits 63..31 of the result in t1 aren't 0. */ + assign(t3, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t1)), + mkU32(0))); + assign(t4, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t1)), + mkU32(0x80000000)), + mkU32(0))); + /* Check if bits 63..31 of the result in t1 aren't + 0x1ffffffff. */ + assign(t5, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t1)), + mkU32(0xffffffff))); + assign(t6, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t1)), + mkU32(0x80000000)), + mkU32(0x80000000))); + /* If bits 63..31 aren't 0 nor 0x1ffffffff, set DSP + control register. */ + assign(t7, binop(Iop_And32, + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t3)), + unop(Iop_1Sto32, mkexpr(t4))), + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t5)), + unop(Iop_1Sto32, mkexpr(t6))))); + putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, + mkexpr(t7), + mkU32(0)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x00800000)), + getDSPControl())); - assign(t2, binop(Iop_Or32, - getDSPControl(), mkU32(0x00800000))); - - putDSPControl(IRExpr_ITE( - binop(Iop_CmpNE32, - unop(Iop_64HIto32, mkexpr(t1)), - mkU32(0x0)), - IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - mkexpr(t1)), - mkU32(0xffffffff)), - mkexpr(t2), - getDSPControl()), - getDSPControl())); - - assign(t4, binop(Iop_Or32, - getDSPControl(), mkU32(0x00800000))); /* If the last discarded bit is 1, there would be carry when rounding, otherwise there wouldn't. We use that - fact and just add the value of the last discarded bit to - the least sifgnificant bit of the shifted value from - acc. 
*/ - assign(t5, binop(Iop_Shr64, - binop(Iop_And64, - mkexpr(t0), - binop(Iop_Shl64, - mkU64(0x1ULL), - unop(Iop_32to8, - binop(Iop_Sub32, - binop(Iop_And32, - getIReg(rs), - mkU32(0x1f)), - mkU32(0x1))))), - unop(Iop_32to8, - binop(Iop_Sub32, - binop(Iop_And32, - getIReg(rs), - mkU32(0x1f)), - mkU32(0x1))))); + fact and just add the value of the last discarded bit + to the least sifgnificant bit of the shifted value + from acc. */ + assign(t8, + IRExpr_ITE(binop(Iop_CmpEQ32, + unop(Iop_8Uto32, + mkexpr(t15)), + mkU32(0)), + mkU64(0x0ULL), + binop(Iop_And64, + binop(Iop_Shr64, + mkexpr(t0), + unop(Iop_32to8, + binop(Iop_Sub32, + unop(Iop_8Uto32, + mkexpr(t15)), + mkU32(1)))), + mkU64(0x1ULL)))); - assign(t6, binop(Iop_Add64, mkexpr(t1), mkexpr(t5))); + assign(t9, binop(Iop_Add64, mkexpr(t1), mkexpr(t8))); + /* Put rounded value in destination register. */ + putIReg(rt, unop(Iop_64to32, mkexpr(t9))); - putDSPControl(IRExpr_ITE( - binop(Iop_CmpNE32, - unop(Iop_64HIto32, mkexpr(t6)), - mkU32(0x0)), - IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - mkexpr(t6)), - mkU32(0xffffffff)), - mkexpr(t4), - getDSPControl()), - getDSPControl())); - putIReg(rt, IRExpr_ITE(binop(Iop_CmpEQ32, - binop(Iop_And32, - getIReg(rs), - mkU32(0x1f)), - mkU32(0x0)), - unop(Iop_64to32, mkexpr(t0)), - unop(Iop_64to32, mkexpr(t6)))); + /* Repeat previous steps for the rounded value. */ + assign(t10, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t9)), + mkU32(0))); + assign(t11, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t9)), + mkU32(0x80000000)), + mkU32(0))); + + assign(t12, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t9)), + mkU32(0xffffffff))); + assign(t13, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t9)), + mkU32(0x80000000)), + mkU32(0x80000000))); + + assign(t14, binop(Iop_And32, + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t10)), + unop(Iop_1Sto32, mkexpr(t11))), + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t12)), + unop(Iop_1Sto32, mkexpr(t13))))); + putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, + mkexpr(t14), + mkU32(0)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x00800000)), + getDSPControl())); break; } case 0x6: { /* EXTR_RS.W */ @@ -3283,81 +3725,136 @@ vassert(!mode64); t0 = newTemp(Ity_I64); t1 = newTemp(Ity_I64); - t2 = newTemp(Ity_I64); - t3 = newTemp(Ity_I32); - t4 = newTemp(Ity_I32); - t5 = newTemp(Ity_I32); - t6 = newTemp(Ity_I32); + t2 = newTemp(Ity_I32); + t3 = newTemp(Ity_I1); + t4 = newTemp(Ity_I1); + t5 = newTemp(Ity_I1); + t6 = newTemp(Ity_I1); + t7 = newTemp(Ity_I32); + t8 = newTemp(Ity_I64); + t9 = newTemp(Ity_I64); + t10 = newTemp(Ity_I1); + t11 = newTemp(Ity_I1); + t12 = newTemp(Ity_I1); + t13 = newTemp(Ity_I1); + t14 = newTemp(Ity_I32); + t16 = newTemp(Ity_I32); - if (0 != rs) { - assign(t0, getAcc(ac)); + assign(t0, getAcc(ac)); + if (0 == rs) { + assign(t1, mkexpr(t0)); + } else { assign(t1, binop(Iop_Sar64, mkexpr(t0), mkU8(rs))); - putDSPControl(IRExpr_ITE( - binop(Iop_CmpNE32, - unop(Iop_64HIto32, mkexpr(t1)), - mkU32(0x0)), - IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - mkexpr(t1)), - mkU32(0xffffffff)), - binop(Iop_Or32, - getDSPControl(), - mkU32(0x00800000)), - getDSPControl()), - getDSPControl())); - /* If the last discarded bit is 1, there would be carry - when rounding, otherwise there wouldn't. We use that - fact and just add the value of the last discarded bit - to the least sifgnificant bit of the shifted value - from acc. 
*/ - assign(t2, binop(Iop_Add64, - mkexpr(t1), - binop(Iop_Shr64, - binop(Iop_And64, - mkexpr(t0), - binop(Iop_Shl64, - mkU64(0x1ULL), - unop(Iop_32to8, - mkU32(rs-1)))), - unop(Iop_32to8, mkU32(rs-1))))); - assign(t6, IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - mkexpr(t2)), - mkU32(0xffffffff)), + } + + /* Check if bits 63..31 of the result in t1 aren't 0. */ + assign(t3, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t1)), + mkU32(0))); + assign(t4, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t1)), + mkU32(0x80000000)), + mkU32(0))); + /* Check if bits 63..31 of the result in t1 aren't + 0x1ffffffff. */ + assign(t5, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t1)), + mkU32(0xffffffff))); + assign(t6, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t1)), + mkU32(0x80000000)), + mkU32(0x80000000))); + /* If bits 63..31 aren't 0 nor 0x1ffffffff, set DSP + control register. */ + assign(t7, binop(Iop_And32, + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t3)), + unop(Iop_1Sto32, mkexpr(t4))), + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t5)), + unop(Iop_1Sto32, mkexpr(t6))))); + putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, + mkexpr(t7), + mkU32(0)), binop(Iop_Or32, getDSPControl(), mkU32(0x00800000)), getDSPControl())); - putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - mkexpr(t2)), - mkU32(0x0)), - mkexpr(t6), - getDSPControl())); - assign(t3, IRExpr_ITE(binop(Iop_CmpEQ32, - binop(Iop_And32, - unop(Iop_64HIto32, - mkexpr(t2)), - mkU32(0x80000000)), - mkU32(0x0)), - mkU32(0x7fffffff), - mkU32(0x80000000))); - assign(t4, IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - mkexpr(t2)), - mkU32(0xffffffff)), - mkexpr(t3), - unop(Iop_64to32, mkexpr(t2)))); - assign(t5, IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - mkexpr(t2)), - mkU32(0x0)), - mkexpr(t4), - unop(Iop_64to32, mkexpr(t2)))); - putIReg(rt, mkexpr(t5)); + + /* If the last discarded bit is 1, there would be carry + when rounding, otherwise there wouldn't. We use that + fact and just add the value of the last discarded bit + to the least sifgnificant bit of the shifted value + from acc. */ + if (0 == rs) { + assign(t8, mkU64(0x0ULL)); } else { - putIReg(rt, unop(Iop_64to32, getAcc(ac))); + assign(t8, binop(Iop_And64, + binop(Iop_Shr64, + mkexpr(t0), + mkU8(rs-1)), + mkU64(0x1ULL))); } + + assign(t9, binop(Iop_Add64, mkexpr(t1), mkexpr(t8))); + + /* Repeat previous steps for the rounded value. 
*/ + assign(t10, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t9)), + mkU32(0))); + assign(t11, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t9)), + mkU32(0x80000000)), + mkU32(0))); + + assign(t12, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t9)), + mkU32(0xffffffff))); + assign(t13, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t9)), + mkU32(0x80000000)), + mkU32(0x80000000))); + + assign(t14, binop(Iop_And32, + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t10)), + unop(Iop_1Sto32, mkexpr(t11))), + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t12)), + unop(Iop_1Sto32, mkexpr(t13))))); + putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, + mkexpr(t14), + mkU32(0)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x00800000)), + getDSPControl())); + + assign(t16, binop(Iop_And32, + unop(Iop_64HIto32, + mkexpr(t9)), + mkU32(0x80000000))); + putIReg(rt, IRExpr_ITE(binop(Iop_CmpNE32, + mkexpr(t14), + mkU32(0)), + IRExpr_ITE(binop(Iop_CmpEQ32, + mkexpr(t16), + mkU32(0)), + mkU32(0x7fffffff), + mkU32(0x80000000)), + unop(Iop_64to32, mkexpr(t9)))); break; } case 0x7: { /* EXTRV_RS.W */ @@ -3366,104 +3863,146 @@ t0 = newTemp(Ity_I64); t1 = newTemp(Ity_I64); t2 = newTemp(Ity_I32); - t4 = newTemp(Ity_I32); - t5 = newTemp(Ity_I64); - t6 = newTemp(Ity_I64); + t3 = newTemp(Ity_I1); + t4 = newTemp(Ity_I1); + t5 = newTemp(Ity_I1); + t6 = newTemp(Ity_I1); t7 = newTemp(Ity_I32); - t8 = newTemp(Ity_I32); - t9 = newTemp(Ity_I32); - t10 = newTemp(Ity_I32); + t8 = newTemp(Ity_I64); + t9 = newTemp(Ity_I64); + t10 = newTemp(Ity_I1); + t11 = newTemp(Ity_I1); + t12 = newTemp(Ity_I1); + t13 = newTemp(Ity_I1); + t14 = newTemp(Ity_I32); + t15 = newTemp(Ity_I32); + t16 = newTemp(Ity_I32); + t17 = newTemp(Ity_I1); + assign(t15, binop(Iop_And32, + getIReg(rs), + mkU32(0x1f))); + assign(t17, binop(Iop_CmpEQ32, + mkexpr(t15), + mkU32(0))); assign(t0, getAcc(ac)); + assign(t1, IRExpr_ITE(mkexpr(t17), + mkexpr(t0), + binop(Iop_Sar64, + mkexpr(t0), + unop(Iop_32to8, + mkexpr(t15))))); - assign(t1, binop(Iop_Sar64, - mkexpr(t0), - unop(Iop_32to8, binop(Iop_And32, - getIReg(rs), - mkU32(0x1f))))); + /* Check if bits 63..31 of the result in t1 aren't 0. */ + assign(t3, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t1)), + mkU32(0))); + assign(t4, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t1)), + mkU32(0x80000000)), + mkU32(0))); + /* Check if bits 63..31 of the result in t1 aren't + 0x1ffffffff. */ + assign(t5, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t1)), + mkU32(0xffffffff))); + assign(t6, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t1)), + mkU32(0x80000000)), + mkU32(0x80000000))); + /* If bits 63..31 aren't 0 nor 0x1ffffffff, set DSP + control register. */ + assign(t7, binop(Iop_And32, + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t3)), + unop(Iop_1Sto32, mkexpr(t4))), + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t5)), + unop(Iop_1Sto32, mkexpr(t6))))); + putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, + mkexpr(t7), + mkU32(0)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x00800000)), + getDSPControl())); - assign(t2, binop(Iop_Or32, - getDSPControl(), mkU32(0x00800000))); + /* If the last discarded bit is 1, there would be carry + when rounding, otherwise there wouldn't. We use that + fact and just add the value of the last discarded bit + to the least sifgnificant bit of the shifted value + from acc. 
*/ + assign(t8, + IRExpr_ITE(mkexpr(t17), + mkU64(0x0ULL), + binop(Iop_And64, + binop(Iop_Shr64, + mkexpr(t0), + unop(Iop_32to8, + binop(Iop_Sub32, + mkexpr(t15), + mkU32(1)))), + mkU64(0x1ULL)))); - assign(t10, IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - mkexpr(t1)), - mkU32(0x0)), - IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - mkexpr(t1)), - mkU32(0xffffffff)), - mkexpr(t2), - getDSPControl()), - getDSPControl())); + assign(t9, binop(Iop_Add64, mkexpr(t1), mkexpr(t8))); - putDSPControl(mkexpr(t10)); + /* Repeat previous steps for the rounded value. */ + assign(t10, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t9)), + mkU32(0))); + assign(t11, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t9)), + mkU32(0x80000000)), + mkU32(0))); - assign(t4, binop(Iop_Or32, - getDSPControl(), mkU32(0x00800000))); - /* If the last discarded bit is 1, there would be carry - when rounding, otherwise there wouldn't. We use that - fact and just add the value of the last discarded bit to - the least sifgnificant bit of the shifted value from - acc. */ - assign(t5, binop(Iop_Shr64, - binop(Iop_And64, - mkexpr(t0), - binop(Iop_Shl64, - mkU64(0x1ULL), - unop(Iop_32to8, - binop(Iop_Sub32, - binop(Iop_And32, - getIReg(rs), - mkU32(0x1f)), - mkU32(0x1))))), - unop(Iop_32to8, - binop(Iop_Sub32, - binop(Iop_And32, - getIReg(rs), - mkU32(0x1f)), - mkU32(0x1))))); + assign(t12, binop(Iop_CmpNE32, + unop(Iop_64HIto32, + mkexpr(t9)), + mkU32(0xffffffff))); + assign(t13, binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t9)), + mkU32(0x80000000)), + mkU32(0x80000000))); - assign(t6, binop(Iop_Add64, mkexpr(t1), mkexpr(t5))); - - assign(t8, IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - mkexpr(t6)), - mkU32(0xffffffff)), - mkexpr(t4), - getDSPControl())); + assign(t14, binop(Iop_And32, + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t10)), + unop(Iop_1Sto32, mkexpr(t11))), + binop(Iop_Or32, + unop(Iop_1Sto32, mkexpr(t12)), + unop(Iop_1Sto32, mkexpr(t13))))); putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - mkexpr(t6)), - mkU32(0x0)), - mkexpr(t8), + mkexpr(t14), + mkU32(0)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x00800000)), getDSPControl())); - assign(t9, IRExpr_ITE(binop(Iop_CmpEQ32, - binop(Iop_And32, - unop(Iop_64HIto32, - mkexpr(t6)), + + assign(t16, binop(Iop_And32, + unop(Iop_64HIto32, + mkexpr(t9)), + mkU32(0x80000000))); + putIReg(rt, IRExpr_ITE(binop(Iop_CmpNE32, + mkexpr(t14), + mkU32(0)), + IRExpr_ITE(binop(Iop_CmpEQ32, + mkexpr(t16), + mkU32(0)), + mkU32(0x7fffffff), mkU32(0x80000000)), - mkU32(0x0)), - mkU32(0x7fffffff), - mkU32(0x80000000))); - assign(t7, IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, mkexpr(t6)), - mkU32(0x0)), - IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - mkexpr(t6)), - mkU32(0xffffffff)), - mkexpr(t9), - unop(Iop_64to32, - mkexpr(t6))), - unop(Iop_64to32, mkexpr(t6)))); - putIReg(rt, IRExpr_ITE(binop(Iop_CmpEQ32, - binop(Iop_And32, - getIReg(rs), - mkU32(0x1f)), - mkU32(0x0)), - unop(Iop_64to32, mkexpr(t0)), - mkexpr(t7))); + unop(Iop_64to32, mkexpr(t9)))); break; } case 0xA: { /* EXTPDP */ @@ -3678,9 +4217,7 @@ t5 = newTemp(Ity_I32); t6 = newTemp(Ity_I64); t7 = newTemp(Ity_I32); - t8 = newTemp(Ity_I32); t9 = newTemp(Ity_I32); - t10 = newTemp(Ity_I1); assign(t0, getAcc(ac)); @@ -3689,12 +4226,10 @@ assign(t2, binop(Iop_Or32, getDSPControl(), mkU32(0x00800000))); - assign(t9, binop(Iop_Shl32, - binop(Iop_And32, - unop(Iop_64to32, - mkexpr(t1)), - mkU32(0x00008000)), - mkU8(16))); + assign(t9, 
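/* Editor's sketch (not part of the patch): the saturating extract cases in
   this region (EXTR_S.H and its EXTRV_S.H variant) clamp the shifted
   accumulator to the signed 16-bit range and set DSPControl bit 23
   whenever clamping occurs.  The comparisons against
   0x0000000000007fff and 0xffffffffffff8000 in the IR are a 64-bit way of
   asking "is the value above 0x7fff / below -0x8000".  Plain C version
   (hypothetical helper): */
#include <stdint.h>

static int32_t sketch_extr_s_h ( int64_t shifted, uint32_t* dspctl )
{
   if (shifted > 0x7fffLL) {
      *dspctl |= 0x00800000;
      return 0x00007fff;               /* saturate upwards   */
   }
   if (shifted < -0x8000LL) {
      *dspctl |= 0x00800000;
      return (int32_t)0xffff8000;      /* saturate downwards */
   }
   return (int32_t)shifted;            /* low 32 bits, unchanged */
}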
binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t1)), + mkU32(0x80000000))); putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, mkexpr(t9), binop(Iop_And32, @@ -3711,120 +4246,79 @@ assign(t3, binop(Iop_Sub64, mkexpr(t1), mkU64(0x0000000000007fffULL))); - assign(t4, binop(Iop_Or32, - unop(Iop_1Uto32, - binop(Iop_CmpNE32, - mkU32(0), - binop(Iop_And32, - unop(Iop_64HIto32, - mkexpr(t3)), - mkU32(0x7fffffff)))), - unop(Iop_1Uto32, - binop(Iop_CmpNE32, - mkU32(0), - binop(Iop_And32, + assign(t4, binop(Iop_And32, + binop(Iop_Or32, + unop(Iop_1Sto32, + binop(Iop_CmpNE32, + mkU32(0), + binop(Iop_And32, + unop(Iop_64HIto32, + mkexpr(t3)), + mkU32(0x7fffffff)))), + unop(Iop_1Sto32, + binop(Iop_CmpNE32, + mkU32(0), unop(Iop_64to32, - mkexpr(t3)), - mkU32(0xffffffff)))))); - - assign(t5, IRExpr_ITE(unop(Iop_32to1, - binop(Iop_Shr32, - binop(Iop_And32, - unop(Iop_64HIto32, - mkexpr(t3)), - mkU32(0x80000000)), - mkU8(31))), - unop(Iop_64to32, mkexpr(t1)), - IRExpr_ITE(binop(Iop_CmpNE32, - mkexpr(t4), - mkU32(0x0)), - mkU32(0x7fff), - unop(Iop_64to32, - mkexpr(t1))))); - - assign(t10, unop(Iop_32to1, - binop(Iop_Shr32, - binop(Iop_And32, - unop(Iop_64HIto32, - mkexpr(t3)), - mkU32(0x80000000)), - mkU8(31)))); - putDSPControl(IRExpr_ITE(mkexpr(t10), - getDSPControl(), - IRExpr_ITE(binop(Iop_CmpNE32, - mkexpr(t4), - mkU32(0x0)), - binop(Iop_Or32, - getDSPControl(), - mkU32(0x00800000)), - getDSPControl()))); - + mkexpr(t3))))), + unop(Iop_1Sto32, + binop(Iop_CmpEQ32, + binop(Iop_And32, + unop(Iop_64HIto32, + mkexpr(t3)), + mkU32(0x80000000)), + mkU32(0))))); + putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, + mkU32(0), + mkexpr(t4)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x00800000)), + getDSPControl())); /* Check if t1<0xffffffffffff8000 (0xffffffffffff8000-t1)>0 - 1. subtract t1 from 0x7fff + 1. subtract t1 from 0xffffffffffff8000 2. if the resulting number is positive (sign bit = 0) and any of the other bits is 1, the value is > 0 */ assign(t6, binop(Iop_Sub64, mkU64(0xffffffffffff8000ULL), mkexpr(t1))); - - assign(t7, binop(Iop_Or32, - unop(Iop_1Uto32, - binop(Iop_CmpNE32, - mkU32(0), - binop(Iop_And32, - unop(Iop_64HIto32, - mkexpr(t6)), - mkU32(0x7fffffff)))), - unop(Iop_1Uto32, - binop(Iop_CmpNE32, - mkU32(0), - binop(Iop_And32, - unop(Iop_64to32, - mkexpr(t6)), - mkU32(0xffffffff)))))); - - assign(t8, IRExpr_ITE(unop(Iop_32to1, - binop(Iop_Shr32, - binop(Iop_And32, - unop(Iop_64HIto32, - mkexpr(t6)), - mkU32(0x80000000)), - mkU8(31))), - unop(Iop_64to32, mkexpr(t1)), - IRExpr_ITE(binop(Iop_CmpNE32, - mkexpr(t7), - mkU32(0x0)), - mkU32(0xffff8000), - unop(Iop_64to32, - mkexpr(t1))))); - putDSPControl(IRExpr_ITE(unop(Iop_32to1, - binop(Iop_Shr32, - binop(Iop_And32, - unop(Iop_64HIto32, - mkexpr(t6)), - mkU32(0x80000000)), - mkU8(31))), - getDSPControl(), - IRExpr_ITE(binop(Iop_CmpNE32, - mkexpr(t7), - mkU32(0x0)), - binop(Iop_Or32, - getDSPControl(), - mkU32(0x00800000)), - getDSPControl()))); - - /* If the shifted value is positive, it can only be >0x7fff - and the final result is the value stored in t5, - otherwise, the final result is in t8. 
*/ - putIReg(rt, IRExpr_ITE(unop(Iop_32to1, - binop(Iop_Shr32, + assign(t7, binop(Iop_And32, + binop(Iop_Or32, + unop(Iop_1Sto32, + binop(Iop_CmpNE32, + mkU32(0), binop(Iop_And32, unop(Iop_64HIto32, - mkexpr(t1)), + mkexpr(t6)), + mkU32(0x7fffffff)))), + unop(Iop_1Sto32, + binop(Iop_CmpNE32, + mkU32(0), + unop(Iop_64to32, + mkexpr(t6))))), + unop(Iop_1Sto32, + binop(Iop_CmpEQ32, + binop(Iop_And32, + unop(Iop_64HIto32, + mkexpr(t6)), mkU32(0x80000000)), - mkU8(31))), - mkexpr(t8), - mkexpr(t5))); + mkU32(0))))); + putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, + mkU32(0), + mkexpr(t7)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x00800000)), + getDSPControl())); + putIReg(rt, IRExpr_ITE(binop(Iop_CmpNE32, + mkU32(0), + mkexpr(t4)), + mkU32(0x00007fff), + IRExpr_ITE(binop(Iop_CmpNE32, + mkU32(0), + mkexpr(t7)), + mkU32(0xffff8000), + unop(Iop_64to32, + mkexpr(t1))))); break; } case 0xF: { /* EXTRV_S.H */ @@ -3838,10 +4332,7 @@ t5 = newTemp(Ity_I32); t6 = newTemp(Ity_I64); t7 = newTemp(Ity_I32); - t8 = newTemp(Ity_I32); t9 = newTemp(Ity_I32); - t10 = newTemp(Ity_I32); - t11 = newTemp(Ity_I32); assign(t0, getAcc(ac)); @@ -3855,12 +4346,10 @@ assign(t2, binop(Iop_Or32, getDSPControl(), mkU32(0x00800000))); - assign(t9, binop(Iop_Shl32, - binop(Iop_And32, - unop(Iop_64to32, - mkexpr(t1)), - mkU32(0x00008000)), - mkU8(16))); + assign(t9, binop(Iop_And32, + unop(Iop_64to32, + mkexpr(t1)), + mkU32(0x80000000))); putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, mkexpr(t9), binop(Iop_And32, @@ -3873,127 +4362,83 @@ /* Check if t1 > 0x7fff ((t1 - 0x7fff) > 0) 1. subtract 0x7fff from t1 2. if the resulting number is positive (sign bit = 0) - and any of the other bits is 1, the value is > 0 */ + and any of the other bits is 1, the value is > 0. */ assign(t3, binop(Iop_Sub64, mkexpr(t1), mkU64(0x0000000000007fffULL))); - assign(t4, binop(Iop_Or32, - unop(Iop_1Uto32, - binop(Iop_CmpNE32, - mkU32(0), - binop(Iop_And32, - unop(Iop_64HIto32, - mkexpr(t3)), - mkU32(0x7fffffff)))), - unop(Iop_1Uto32, - binop(Iop_CmpNE32, - mkU32(0), - binop(Iop_And32, + assign(t4, binop(Iop_And32, + binop(Iop_Or32, + unop(Iop_1Sto32, + binop(Iop_CmpNE32, + mkU32(0), + binop(Iop_And32, + unop(Iop_64HIto32, + mkexpr(t3)), + mkU32(0x7fffffff)))), + unop(Iop_1Sto32, + binop(Iop_CmpNE32, + mkU32(0), unop(Iop_64to32, - mkexpr(t3)), - mkU32(0xffffffff)))))); - - assign(t5, IRExpr_ITE(unop(Iop_32to1, - binop(Iop_Shr32, - binop(Iop_And32, - unop(Iop_64HIto32, - mkexpr(t3)), - mkU32(0x80000000)), - mkU8(31))), - unop(Iop_64to32, mkexpr(t1)), - IRExpr_ITE(binop(Iop_CmpNE32, - mkexpr(t4), - mkU32(0x0)), - mkU32(0x7fff), - unop(Iop_64to32, - mkexpr(t1))))); - - assign(t10, binop(Iop_Shr32, - binop(Iop_And32, - unop(Iop_64HIto32, - mkexpr(t3)), - mkU32(0x80000000)), - mkU8(31))); - assign(t11, IRExpr_ITE(binop(Iop_CmpNE32, - mkexpr(t4), - mkU32(0x0)), - binop(Iop_Or32, - getDSPControl(), - mkU32(0x00800000)), - getDSPControl())); - putDSPControl(IRExpr_ITE(unop(Iop_32to1, - mkexpr(t10)), - getDSPControl(), - mkexpr(t11))); - - /* Check if t1<0xffffffffffff8000 - 1. subtract t1 from 0x7fff - 2. if the resulting number is positive (sign bit == 0) + mkexpr(t3))))), + unop(Iop_1Sto32, + binop(Iop_CmpEQ32, + binop(Iop_And32, + unop(Iop_64HIto32, + mkexpr(t3)), + mkU32(0x80000000)), + mkU32(0))))); + putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, + mkU32(0), + mkexpr(t4)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x00800000)), + getDSPControl())); + /* Check if t1<0xffffffffffff8000 (0xffffffffffff8000-t1)>0 + 1. 
subtract t1 from 0xffffffffffff8000 + 2. if the resulting number is positive (sign bit = 0) and any of the other bits is 1, the value is > 0 */ assign(t6, binop(Iop_Sub64, - mkU64(0xffffffffffff8000ULL), - mkexpr(t1))); - - assign(t7, binop(Iop_Or32, - unop(Iop_1Uto32, - binop(Iop_CmpNE32, - mkU32(0), - binop(Iop_And32, - unop(Iop_64HIto32, - mkexpr(t6)), - mkU32(0x7fffffff)))), - unop(Iop_1Uto32, - binop(Iop_CmpNE32, - mkU32(0), - binop(Iop_And32, - unop(Iop_64to32, - mkexpr(t6)), - mkU32(0xffffffff)))))); - - assign(t8, IRExpr_ITE(unop(Iop_32to1, - binop(Iop_Shr32, - binop(Iop_And32, - unop(Iop_64HIto32, - mkexpr(t6)), - mkU32(0x80000000)), - mkU8(31))), - unop(Iop_64to32, mkexpr(t1)), - IRExpr_ITE(binop(Iop_CmpNE32, - mkexpr(t7), - mkU32(0x0)), - mkU32(0xffff8000), - unop(Iop_64to32, - mkexpr(t1))))); - putDSPControl(IRExpr_ITE(unop(Iop_32to1, - binop(Iop_Shr32, - binop(Iop_And32, - unop(Iop_64HIto32, - mkexpr(t6)), - mkU32(0x80000000) - ), - mkU8(31))), - getDSPControl(), - IRExpr_ITE(binop(Iop_CmpNE32, - mkexpr(t7), - mkU32(0x0)), - binop(Iop_Or32, - getDSPControl(), - mkU32(0x00800000) - ), - getDSPControl()))); - - /* If the shifted value is positive, it can only be >0x7fff - and the final result is the value stored in t5, - otherwise, the final result is in t8. */ - putIReg(rt, IRExpr_ITE(unop(Iop_32to1, - binop(Iop_Shr32, + mkU64(0xffffffffffff8000ULL), + mkexpr(t1))); + assign(t7, binop(Iop_And32, + binop(Iop_Or32, + unop(Iop_1Sto32, + binop(Iop_CmpNE32, + mkU32(0), binop(Iop_And32, unop(Iop_64HIto32, - mkexpr(t1)), + mkexpr(t6)), + mkU32(0x7fffffff)))), + unop(Iop_1Sto32, + binop(Iop_CmpNE32, + mkU32(0), + unop(Iop_64to32, + mkexpr(t6))))), + unop(Iop_1Sto32, + binop(Iop_CmpEQ32, + binop(Iop_And32, + unop(Iop_64HIto32, + mkexpr(t6)), mkU32(0x80000000)), - mkU8(31))), - mkexpr(t8), - mkexpr(t5))); + mkU32(0))))); + putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, + mkU32(0), + mkexpr(t7)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x00800000)), + getDSPControl())); + putIReg(rt, IRExpr_ITE(binop(Iop_CmpNE32, + mkU32(0), + mkexpr(t4)), + mkU32(0x00007fff), + IRExpr_ITE(binop(Iop_CmpNE32, + mkU32(0), + mkexpr(t7)), + mkU32(0xffff8000), + unop(Iop_64to32, + mkexpr(t1))))); break; } case 0x12: { /* RDDSP*/ @@ -4192,38 +4637,38 @@ DIP("shilov ac%d, r%d", ac, rs); vassert(!mode64); t0 = newTemp(Ity_I64); - t1 = newTemp(Ity_I64); - t2 = newTemp(Ity_I32); - t3 = newTemp(Ity_I1); + t1 = newTemp(Ity_I32); + t2 = newTemp(Ity_I1); + t3 = newTemp(Ity_I64); t4 = newTemp(Ity_I64); - t5 = newTemp(Ity_I64); assign(t0, getAcc(ac)); - assign(t2, binop(Iop_And32, getIReg(rs), mkU32(0x3f))); - assign(t3, binop(Iop_CmpEQ32, mkexpr(t2), mkU32(0x20))); - - assign(t4, binop(Iop_Shl64, + assign(t1, binop(Iop_And32, getIReg(rs), mkU32(0x3f))); + assign(t2, binop(Iop_CmpEQ32, mkexpr(t1), mkU32(0x20))); + assign(t3, binop(Iop_Shl64, mkexpr(t0), unop(Iop_32to8, binop(Iop_Add32, unop(Iop_Not32, - mkexpr(t2)), + mkexpr(t1)), mkU32(0x1))))); - assign(t5, binop(Iop_Shr64, + assign(t4, binop(Iop_Shr64, mkexpr(t0), unop(Iop_32to8, - mkexpr(t2)))); - putAcc(ac, IRExpr_ITE(mkexpr(t3), - binop(Iop_32HLto64, - unop(Iop_64to32, mkexpr(t0)), - mkU32(0x0)), - IRExpr_ITE(binop(Iop_CmpEQ32, - binop(Iop_And32, - mkexpr(t2), - mkU32(0x20)), - mkU32(0x20)), - mkexpr(t4), - mkexpr(t5)))); + mkexpr(t1)))); + + putAcc(ac, + IRExpr_ITE(mkexpr(t2), + binop(Iop_32HLto64, + unop(Iop_64to32, mkexpr(t0)), + mkU32(0x0)), + IRExpr_ITE(binop(Iop_CmpEQ32, + binop(Iop_And32, + mkexpr(t1), + mkU32(0x20)), + mkU32(0x20)), + mkexpr(t3), + 
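/* Editor's sketch (not part of the patch): SHILOV here interprets the low
   six bits of rs as a two's-complement shift count for the 64-bit
   accumulator -- a non-negative count shifts right, a negative one shifts
   left by its magnitude, so the special value 0x20 (-32) becomes a left
   shift by 32.  A hypothetical C rendering of that intent: */
#include <stdint.h>

static uint64_t sketch_shilov ( uint64_t acc, uint32_t rs )
{
   int shift = (int)(rs & 0x3f);
   if (shift >= 0x20)
      shift -= 0x40;                   /* sign-extend the 6-bit field */
   if (shift >= 0)
      return acc >> shift;             /* logical right shift         */
   return acc << (-shift);             /* left shift when negative    */
}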
mkexpr(t4)))); break; } case 0x1F: { /* MTHLIP */ @@ -7201,160 +7646,113 @@ t0 = newTemp(Ity_I32); t1 = newTemp(Ity_I1); t2 = newTemp(Ity_I1); - t3 = newTemp(Ity_I1); - t4 = newTemp(Ity_I32); - t5 = newTemp(Ity_I32); - t6 = newTemp(Ity_I1); + t3 = newTemp(Ity_I32); + t4 = newTemp(Ity_I1); + t5 = newTemp(Ity_I1); + t6 = newTemp(Ity_I32); t7 = newTemp(Ity_I1); t8 = newTemp(Ity_I1); - t9 = newTemp(Ity_I32); - t10 = newTemp(Ity_I32); - t11 = newTemp(Ity_I1); - t12 = newTemp(Ity_I1); - t13 = newTemp(Ity_I1); - t14 = newTemp(Ity_I32); - t15 = newTemp(Ity_I32); - t16 = newTemp(Ity_I1); - t17 = newTemp(Ity_I1); - t18 = newTemp(Ity_I32); + t9 = newTemp(Ity_I1); + t10 = newTemp(Ity_I1); if (0 == rs) { putIReg(rd, getIReg(rt)); } else { - /* Shift bits 7..0. */ + /* Shift bits 7..0 and 23..16. */ assign(t0, binop(Iop_Shl32, - unop(Iop_8Uto32, - unop(Iop_32to8, getIReg(rt))), - unop(Iop_32to8, - binop(Iop_And32, - mkU32(rs), - mkU32(0x7))))); - /* Check if discard isn't 0x0 and 0xffffffff. */ + binop(Iop_And32, + getIReg(rt), + mkU32(0x00ff00ff)), + mkU8(rs))); assign(t1, binop(Iop_CmpNE32, - unop(Iop_8Uto32, - unop(Iop_16HIto8, - unop(Iop_32to16, - mkexpr(t0)))), - mkU32(0x00000000))); + binop(Iop_And32, + mkexpr(t0), + mkU32(0xff000000)), + mkU32(0x00000000))); assign(t2, binop(Iop_CmpNE32, - unop(Iop_8Uto32, - unop(Iop_16HIto8, - unop(Iop_32to16, - mkexpr(t0)))), - mkU32(0x000000ff))); - assign(t4, binop(Iop_Or32, - getDSPControl(), mkU32(0x400000))); - putDSPControl(IRExpr_ITE(mkexpr(t1), - IRExpr_ITE(mkexpr(t2), - mkexpr(t4), - getDSPControl()), - getDSPControl())); - - /* Shift bits 15..8. */ - assign(t5, binop(Iop_Shl32, - unop(Iop_8Uto32, - unop(Iop_16HIto8, - unop(Iop_32to16, - getIReg(rt)))), - unop(Iop_32to8, - binop(Iop_And32, - mkU32(rs), - mkU32(0x7))))); - /* Check if discard isn't 0x0 and 0xffffffff. */ - assign(t6, binop(Iop_CmpNE32, - unop(Iop_8Uto32, - unop(Iop_16HIto8, - unop(Iop_32to16, - mkexpr(t5)))), - mkU32(0x00000000))); + binop(Iop_And32, + mkexpr(t0), + mkU32(0xff000000)), + mkU32(0xff000000))); assign(t7, binop(Iop_CmpNE32, - unop(Iop_8Uto32, - unop(Iop_16HIto8, - unop(Iop_32to16, - mkexpr(t5)))), - mkU32(0x000000ff))); - assign(t9, binop(Iop_Or32, - getDSPControl(), - mkU32(0x400000))); - putDSPControl(IRExpr_ITE(mkexpr(t6), - IRExpr_ITE(mkexpr(t7), - mkexpr(t9), - getDSPControl()), - getDSPControl())); - - /* Shift bits 23..16. */ - assign(t10, binop(Iop_Shl32, - unop(Iop_8Uto32, - unop(Iop_16to8, - unop(Iop_32HIto16, - getIReg(rt)))), - unop(Iop_32to8, + binop(Iop_And32, + mkexpr(t0), + mkU32(0x0000ff00)), + mkU32(0x00000000))); + assign(t8, binop(Iop_CmpNE32, + binop(Iop_And32, + mkexpr(t0), + mkU32(0x0000ff00)), + mkU32(0x000ff00))); + /* Shift bits 15..8 and 31..24. */ + assign(t3, binop(Iop_Shl32, + binop(Iop_Shr32, binop(Iop_And32, - mkU32(rs), - mkU32(0x7))))); - /* Check if discard isn't 0x0 and 0xffffffff. 
*/ - assign(t11, binop(Iop_CmpNE32, - unop(Iop_8Uto32, - unop(Iop_16HIto8, - unop(Iop_32to16, - mkexpr(t10)))), - mkU32(0x00000000))); - assign(t12, binop(Iop_CmpNE32, - unop(Iop_8Uto32, - unop(Iop_16HIto8, - unop(Iop_32to16, - mkexpr(t10)))), - mkU32(0x000000ff))); + getIReg(rt), + mkU32(0xff00ff00)), + mkU8(8)), + mkU8(rs))); + assign(t4, binop(Iop_CmpNE32, + binop(Iop_And32, + mkexpr(t3), + mkU32(0xff000000)), + mkU32(0x00000000))); + assign(t5, binop(Iop_CmpNE32, + binop(Iop_And32, + mkexpr(t3), + mkU32(0xff000000)), + mkU32(0xff000000))); + assign(t9, binop(Iop_CmpNE32, + binop(Iop_And32, + mkexpr(t3), + mkU32(0x0000ff00)), + mkU32(0x00000000))); + assign(t10, binop(Iop_CmpNE32, + binop(Iop_And32, + mkexpr(t3), + mkU32(0x0000ff00)), + mkU32(0x0000ff00))); - assign(t14, binop(Iop_Or32, - getDSPControl(), - mkU32(0x400000))); - putDSPControl(IRExpr_ITE(mkexpr(t11), - IRExpr_ITE(mkexpr(t12), - mkexpr(t14), - getDSPControl()), - getDSPControl())); - - /* Shift bits 31..24. */ - assign(t15, binop(Iop_Shl32, - unop(Iop_8Uto32, - unop(Iop_16HIto8, - unop(Iop_32HIto16, - getIReg(rt)))), - unop(Iop_32to8, + assign(t6, binop(Iop_Or32, + binop(Iop_Or32, binop(Iop_And32, - mkU32(rs), - mkU32(0x7))))); - /* Check if discard isn't 0x0 and 0xffffffff. */ - assign(t16, binop(Iop_CmpNE32, - unop(Iop_8Uto32, - unop(Iop_16HIto8, - unop(Iop_32to16, - mkexpr(t15)))), - mkU32(0x00000000))); - assign(t17, binop(Iop_CmpNE32, - unop(Iop_8Uto32, - unop(Iop_16HIto8, - unop(Iop_32to16, - mkexpr(t15)))), - mkU32(0x000000ff))); + unop(Iop_1Uto32, + mkexpr(t1)), + unop(Iop_1Uto32, + mkexpr(t2))), + binop(Iop_And32, + unop(Iop_1Uto32, + mkexpr(t7)), + unop(Iop_1Uto32, + mkexpr(t8)))), + binop(Iop_Or32, + binop(Iop_And32, + unop(Iop_1Uto32, + mkexpr(t4)), + unop(Iop_1Uto32, + mkexpr(t5))), + binop(Iop_And32, + unop(Iop_1Uto32, + mkexpr(t9)), + unop(Iop_1Uto32, + mkexpr(t10)))))); - assign(t18, binop(Iop_Or32, - getDSPControl(), - mkU32(0x400000))); - putDSPControl(IRExpr_ITE(mkexpr(t16), - IRExpr_ITE(mkexpr(t17), - mkexpr(t18), - getDSPControl()), + putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, + mkexpr(t6), + mkU32(0x0)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x400000)), getDSPControl())); - - putIReg(rd, binop(Iop_16HLto32, - binop(Iop_8HLto16, - unop(Iop_32to8, mkexpr(t15)), - unop(Iop_32to8, mkexpr(t10))), - binop(Iop_8HLto16, - unop(Iop_32to8, mkexpr(t5)), - unop(Iop_32to8, mkexpr(t0))))); + putIReg(rd, binop(Iop_Or32, + binop(Iop_Shl32, + binop(Iop_And32, + mkexpr(t3), + mkU32(0x00ff00ff)), + mkU8(8)), + binop(Iop_And32, + mkexpr(t0), + mkU32(0x00ff00ff)))); } break; } @@ -7422,165 +7820,119 @@ t0 = newTemp(Ity_I32); t1 = newTemp(Ity_I1); t2 = newTemp(Ity_I1); - t3 = newTemp(Ity_I1); - t4 = newTemp(Ity_I32); - t5 = newTemp(Ity_I32); - t6 = newTemp(Ity_I1); + t3 = newTemp(Ity_I32); + t4 = newTemp(Ity_I1); + t5 = newTemp(Ity_I1); + t6 = newTemp(Ity_I32); t7 = newTemp(Ity_I1); t8 = newTemp(Ity_I1); - t9 = newTemp(Ity_I32); - t10 = newTemp(Ity_I32); - t11 = newTemp(Ity_I1); - t12 = newTemp(Ity_I1); - t13 = newTemp(Ity_I1); - t14 = newTemp(Ity_I32); - t15 = newTemp(Ity_I32); - t16 = newTemp(Ity_I1); - t17 = newTemp(Ity_I1); - t18 = newTemp(Ity_I32); + t9 = newTemp(Ity_I1); + t10 = newTemp(Ity_I1); + t11 = newTemp(Ity_I8); - /* Shift bits 7..0. */ + assign(t11, unop(Iop_32to8, + binop(Iop_And32, + getIReg(rs), + mkU32(0x7)))); + /* Shift bits 7..0 and 23..16. 
*/ assign(t0, binop(Iop_Shl32, - unop(Iop_8Uto32, - unop(Iop_32to8, getIReg(rt))), - unop(Iop_32to8, - binop(Iop_And32, - getIReg(rs), - mkU32(0x7))))); - /* Check if discard isn't 0x0 and 0xffffffff. */ + binop(Iop_And32, + getIReg(rt), + mkU32(0x00ff00ff)), + mkexpr(t11))); assign(t1, binop(Iop_CmpNE32, - unop(Iop_8Uto32, - unop(Iop_16HIto8, - unop(Iop_32to16, mkexpr(t0)))), - mkU32(0x00000000))); + binop(Iop_And32, + mkexpr(t0), + mkU32(0xff000000)), + mkU32(0x00000000))); assign(t2, binop(Iop_CmpNE32, - unop(Iop_8Uto32, - unop(Iop_16HIto8, - unop(Iop_32to16, mkexpr(t0)))), - mkU32(0x000000ff))); - - assign(t4, binop(Iop_Or32, - getDSPControl(), - mkU32(0x400000))); - putDSPControl(IRExpr_ITE(mkexpr(t1), - IRExpr_ITE(mkexpr(t2), - mkexpr(t4), - getDSPControl()), - getDSPControl())); - - /* Shift bits 15..8. */ - assign(t5, binop(Iop_Shl32, - unop(Iop_8Uto32, - unop(Iop_16HIto8, - unop(Iop_32to16, getIReg(rt)))), - unop(Iop_32to8, - binop(Iop_And32, - getIReg(rs), - mkU32(0x7))))); - /* Check if discard isn't 0x0 and 0xffffffff. */ - assign(t6, binop(Iop_CmpNE32, - unop(Iop_8Uto32, - unop(Iop_16HIto8, - unop(Iop_32to16, mkexpr(t5)))), - mkU32(0x00000000))); + binop(Iop_And32, + mkexpr(t0), + mkU32(0xff000000)), + mkU32(0xff000000))); assign(t7, binop(Iop_CmpNE32, - unop(Iop_8Uto32, - unop(Iop_16HIto8, - unop(Iop_32to16, mkexpr(t5)))), - mkU32(0x000000ff))); - - assign(t9, binop(Iop_Or32, - getDSPControl(), - mkU32(0x400000))); - putDSPControl(IRExpr_ITE(mkexpr(t6), - IRExpr_ITE(mkexpr(t7), - mkexpr(t9), - getDSPControl()), - getDSPControl())); - - /* Shift bits 23..16. */ - assign(t10, binop(Iop_Shl32, - unop(Iop_8Uto32, - unop(Iop_16to8, - unop(Iop_32HIto16, - getIReg(rt)))), - unop(Iop_32to8, + binop(Iop_And32, + mkexpr(t0), + mkU32(0x0000ff00)), + mkU32(0x00000000))); + assign(t8, binop(Iop_CmpNE32, + binop(Iop_And32, + mkexpr(t0), + mkU32(0x0000ff00)), + mkU32(0x000ff00))); + /* Shift bits 15..8 and 31..24. */ + assign(t3, binop(Iop_Shl32, + binop(Iop_Shr32, binop(Iop_And32, - getIReg(rs), - mkU32(0x7))))); - /* Check if discard isn't 0x0 and 0xffffffff. */ - assign(t11, binop(Iop_CmpNE32, - unop(Iop_8Uto32, - unop(Iop_16HIto8, - unop(Iop_32to16, - mkexpr(t10)))), - mkU32(0x00000000))); - assign(t12, binop(Iop_CmpNE32, - unop(Iop_8Uto32, - unop(Iop_16HIto8, - unop(Iop_32to16, - mkexpr(t10)))), - mkU32(0x000000ff))); + getIReg(rt), + mkU32(0xff00ff00)), + mkU8(8)), + mkexpr(t11))); + assign(t4, binop(Iop_CmpNE32, + binop(Iop_And32, + mkexpr(t3), + mkU32(0xff000000)), + mkU32(0x00000000))); + assign(t5, binop(Iop_CmpNE32, + binop(Iop_And32, + mkexpr(t3), + mkU32(0xff000000)), + mkU32(0xff000000))); + assign(t9, binop(Iop_CmpNE32, + binop(Iop_And32, + mkexpr(t3), + mkU32(0x0000ff00)), + mkU32(0x00000000))); + assign(t10, binop(Iop_CmpNE32, + binop(Iop_And32, + mkexpr(t3), + mkU32(0x0000ff00)), + mkU32(0x0000ff00))); - assign(t14, binop(Iop_Or32, - getDSPControl(), - mkU32(0x400000))); - putDSPControl(IRExpr_ITE(mkexpr(t11), - IRExpr_ITE(mkexpr(t12), - mkexpr(t14), - getDSPControl()), - getDSPControl())); - - /* Shift bits 31..24. */ - assign(t15, binop(Iop_Shl32, - unop(Iop_8Uto32, - unop(Iop_16HIto8, - unop(Iop_32HIto16, - getIReg(rt)))), - unop(Iop_32to8, + assign(t6, binop(Iop_Or32, + binop(Iop_Or32, binop(Iop_And32, - getIReg(rs), - mkU32(0x7))))); - /* Check if discard isn't 0x0 and 0xffffffff. 
*/ - assign(t16, binop(Iop_CmpNE32, - unop(Iop_8Uto32, - unop(Iop_16HIto8, - unop(Iop_32to16, - mkexpr(t15)))), - mkU32(0x00000000))); - assign(t17, binop(Iop_CmpNE32, - unop(Iop_8Uto32, - unop(Iop_16HIto8, - unop(Iop_32to16, - mkexpr(t15)))), - mkU32(0x000000ff))); + unop(Iop_1Uto32, + mkexpr(t1)), + unop(Iop_1Uto32, + mkexpr(t2))), + binop(Iop_And32, + unop(Iop_1Uto32, + mkexpr(t7)), + unop(Iop_1Uto32, + mkexpr(t8)))), + binop(Iop_Or32, + binop(Iop_And32, + unop(Iop_1Uto32, + mkexpr(t4)), + unop(Iop_1Uto32, + mkexpr(t5))), + binop(Iop_And32, + unop(Iop_1Uto32, + mkexpr(t9)), + unop(Iop_1Uto32, + mkexpr(t10)))))); - assign(t18, binop(Iop_Or32, - getDSPControl(), - mkU32(0x400000))); - putDSPControl(IRExpr_ITE(mkexpr(t16), - IRExpr_ITE(mkexpr(t17), - mkexpr(t18), - getDSPControl()), + putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, + mkexpr(t6), + mkU32(0x0)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x400000)), getDSPControl())); - putIReg(rd, IRExpr_ITE(binop(Iop_CmpEQ32, + unop(Iop_8Uto32, mkexpr(t11)), + mkU32(0)), + getIReg(rt), + binop(Iop_Or32, + binop(Iop_Shl32, + binop(Iop_And32, + mkexpr(t3), + mkU32(0xff00ff)), + mkU8(8)), binop(Iop_And32, - getIReg(rs), - mkU32(0x7)), - mkU32(0x0)), - getIReg(rt), - binop(Iop_16HLto32, - binop(Iop_8HLto16, - unop(Iop_32to8, - mkexpr(t15)), - unop(Iop_32to8, - mkexpr(t10))), - binop(Iop_8HLto16, - unop(Iop_32to8, - mkexpr(t5)), - unop(Iop_32to8, - mkexpr(t0)))))); + mkexpr(t0), + mkU32(0x00ff00ff))))); break; } case 0x1: { /* SHRLV.QB */ @@ -8075,7 +8427,10 @@ t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32); t3 = newTemp(Ity_I32); - t4 = newTemp(Ity_I1); + t4 = newTemp(Ity_I32); + t5 = newTemp(Ity_I32); + t6 = newTemp(Ity_I32); + t7 = newTemp(Ity_I32); if (0 == rs) { putIReg(rd, getIReg(rt)); @@ -8086,21 +8441,27 @@ unop(Iop_32to16, getIReg(rt))), mkU8(rs))); - assign(t2, IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_16Sto32, - unop(Iop_32HIto16, - mkexpr(t0))), - mkU32(0xffffffff)), - binop(Iop_Or32, - getDSPControl(), - mkU32(0x400000)), - getDSPControl())); - putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_16Sto32, - unop(Iop_32HIto16, - mkexpr(t0))), - mkU32(0x00000000)), - mkexpr(t2), + assign(t1, unop(Iop_1Uto32, + binop(Iop_CmpNE32, + binop(Iop_Sar32, + mkexpr(t0), + mkU8(16)), + mkU32(0)))); + assign(t2, unop(Iop_1Uto32, + binop(Iop_CmpNE32, + binop(Iop_Sar32, + mkexpr(t0), + mkU8(16)), + mkU32(0xffffffff)))); + assign(t3, binop(Iop_And32, + mkexpr(t1), + mkexpr(t2))); + putDSPControl(IRExpr_ITE(binop(Iop_CmpEQ32, + mkexpr(t3), + mkU32(0x1)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x400000)), getDSPControl())); putDSPControl(IRExpr_ITE(binop(Iop_CmpEQ32, binop(Iop_And32, @@ -8115,46 +8476,56 @@ getDSPControl(), mkU32(0x400000)))); /* Shift higher 16 bits. 
*/ - assign(t1, binop(Iop_Shl32, + assign(t4, binop(Iop_Shl32, unop(Iop_16Sto32, unop(Iop_32HIto16, getIReg(rt))), mkU8(rs))); - assign(t3, IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_16Sto32, - unop(Iop_32HIto16, - mkexpr(t1))), - mkU32(0xffffffff)), - binop(Iop_Or32, - getDSPControl(), - mkU32(0x400000)), - getDSPControl())); - putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_16Sto32, - unop(Iop_32HIto16, - mkexpr(t1))), - mkU32(0x00000000)), - mkexpr(t3), - getDSPControl())); - assign(t4, binop(Iop_CmpEQ32, - binop(Iop_Shr32, - binop(Iop_And32, - getIReg(rt), - mkU32(0x80000000)), - mkU8(31)), - binop(Iop_Shr32, - binop(Iop_And32, - mkexpr(t1), - mkU32(0x00008000)), - mkU8(15)))); - putDSPControl(IRExpr_ITE(mkexpr(t4), + assign(t5, unop(Iop_1Uto32, + binop(Iop_CmpNE32, + binop(Iop_Sar32, + mkexpr(t4), + mkU8(16)), + mkU32(0)))); + assign(t6, unop(Iop_1Uto32, + binop(Iop_CmpNE32, + binop(Iop_Sar32, + mkexpr(t4), + mkU8(16)), + mkU32(0xffffffff)))); + assign(t7, binop(Iop_And32, + mkexpr(t5), + mkexpr(t6))); + putDSPControl(IRExpr_ITE(binop(Iop_CmpEQ32, + mkexpr(t7), + mkU32(0x1)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x400000)), + getDSPControl())); + putDSPControl(IRExpr_ITE(binop(Iop_CmpEQ32, + mkexpr(t7), + mkU32(0x1)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x400000)), + getDSPControl())); + putDSPControl(IRExpr_ITE(binop(Iop_CmpEQ32, + binop(Iop_And32, + getIReg(rt), + mkU32(0x80000000)), + binop(Iop_Shl32, + binop(Iop_And32, + mkexpr(t4), + mkU32(0x00008000)), + mkU8(16)) + ), getDSPControl(), binop(Iop_Or32, getDSPControl(), mkU32(0x400000)))); - putIReg(rd, binop(Iop_16HLto32, - unop(Iop_32to16, mkexpr(t1)), + unop(Iop_32to16, mkexpr(t4)), unop(Iop_32to16, mkexpr(t0)))); } break; @@ -8323,18 +8694,20 @@ DIP("shll_s.ph r%d, r%d, %d", rd, rt, rs); vassert(!mode64); t0 = newTemp(Ity_I32); - t1 = newTemp(Ity_I16); - t2 = newTemp(Ity_I16); - t3 = newTemp(Ity_I16); + t1 = newTemp(Ity_I32); + t2 = newTemp(Ity_I32); + t3 = newTemp(Ity_I32); t4 = newTemp(Ity_I32); - t5 = newTemp(Ity_I16); - t6 = newTemp(Ity_I16); - t7 = newTemp(Ity_I16); + t5 = newTemp(Ity_I32); + t6 = newTemp(Ity_I32); + t7 = newTemp(Ity_I32); t8 = newTemp(Ity_I32); t9 = newTemp(Ity_I32); - t10 = newTemp(Ity_I1); - t11 = newTemp(Ity_I16); - t12 = newTemp(Ity_I16); + t10 = newTemp(Ity_I32); + t11 = newTemp(Ity_I32); + t12 = newTemp(Ity_I32); + t13 = newTemp(Ity_I32); + t14 = newTemp(Ity_I32); if (0 == rs) { putIReg(rd, getIReg(rt)); @@ -8345,69 +8718,70 @@ unop(Iop_32to16, getIReg(rt))), mkU8(rs))); - assign(t1, IRExpr_ITE(binop(Iop_CmpEQ32, - binop(Iop_And32, - getIReg(rt), - mkU32(0x00008000)), - mkU32(0x0)), - mkU16(0x7fff), - mkU16(0x8000))); - assign(t2, - IRExpr_ITE(binop(Iop_CmpEQ32, - binop(Iop_Shr32, - binop(Iop_And32, - getIReg(rt), - mkU32(0x00008000)), - mkU8(15)), - binop(Iop_Shr32, - binop(Iop_And32, - mkexpr(t0), - mkU32(0x00008000)), - mkU8(15))), - unop(Iop_32to16, mkexpr(t0)), - mkexpr(t1))); - assign(t11, IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_16Sto32, - unop(Iop_32HIto16, - mkexpr(t0))), - mkU32(0xffffffff)), - mkexpr(t1), - mkexpr(t2))); - assign(t3, - IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_16Sto32, - unop(Iop_32HIto16, - mkexpr(t0))), - mkU32(0x00000000)), - mkexpr(t11), - mkexpr(t2))); - assign(t8, IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_16Sto32, - unop(Iop_32HIto16, - mkexpr(t0))), - mkU32(0xffffffff)), - binop(Iop_Or32, - getDSPControl(), - mkU32(0x400000)), - getDSPControl())); - putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_16Sto32, - unop(Iop_32HIto16, - 
mkexpr(t0))), - mkU32(0x00000000)), - mkexpr(t8), - getDSPControl())); + assign(t1, unop(Iop_1Uto32, + binop(Iop_CmpNE32, + binop(Iop_Sar32, + mkexpr(t0), + mkU8(16)), + mkU32(0)))); + assign(t2, unop(Iop_1Uto32, + binop(Iop_CmpNE32, + binop(Iop_Sar32, + mkexpr(t0), + mkU8(16)), + mkU32(0xffffffff)))); + assign(t3, binop(Iop_And32, + mkexpr(t1), + mkexpr(t2))); putDSPControl(IRExpr_ITE(binop(Iop_CmpEQ32, + mkexpr(t3), + mkU32(0x1)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x400000)), + getDSPControl())); + putDSPControl(IRExpr_ITE(binop(Iop_CmpEQ32, binop(Iop_And32, getIReg(rt), mkU32(0x00008000)), binop(Iop_And32, - mkexpr(t0), - mkU32(0x00008000))), + mkexpr(t0), + mkU32(0x00008000)) + ), getDSPControl(), binop(Iop_Or32, getDSPControl(), mkU32(0x400000)))); + assign(t8, + IRExpr_ITE(binop(Iop_CmpEQ32, + mkexpr(t3), + mkU32(0x1)), + IRExpr_ITE(binop(Iop_CmpEQ32, + binop(Iop_And32, + getIReg(rt), + mkU32(0x8000)), + mkU32(0)), + mkU32(0x00007fff), + mkU32(0x00008000)), + binop(Iop_And32, + mkexpr(t0), + mkU32(0x0000ffff)))); + assign(t10, + IRExpr_ITE(binop(Iop_CmpEQ32, + binop(Iop_And32, + getIReg(rt), + mkU32(0x00008000)), + binop(Iop_And32, + mkexpr(t0), + mkU32(0x00008000))), + mkexpr(t8), + IRExpr_ITE(binop(Iop_CmpEQ32, + binop(Iop_And32, + getIReg(rt), + mkU32(0x8000)), + mkU32(0)), + mkU32(0x00007fff), + mkU32(0x00008000)))); /* Shift higher 16 bits. */ assign(t4, binop(Iop_Shl32, unop(Iop_16Sto32, @@ -8414,77 +8788,88 @@ unop(Iop_32HIto16, getIReg(rt))), mkU8(rs))); - assign(t5, IRExpr_ITE(binop(Iop_CmpEQ32, - binop(Iop_And32, - getIReg(rt), - mkU32(0x80000000)), - mkU32(0x0)), - mkU16(0x7fff), - mkU16(0x8000))); - assign(t6, - IRExpr_ITE(binop(Iop_CmpEQ32, - binop(Iop_Shr32, - binop(Iop_And32, - getIReg(rt), - mkU32(0x80000000)), - mkU8(31)), - binop(Iop_Shr32, - binop(Iop_And32, - mkexpr(t4), - mkU32(0x00008000)), - mkU8(15))), - unop(Iop_32to16, mkexpr(t4)), - mkexpr(t5))); - assign(t12, IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_16Sto32, - unop(Iop_32HIto16, - mkexpr(t4))), - mkU32(0xffffffff)), - mkexpr(t5), - mkexpr(t6))); - assign(t7, - IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_16Sto32, - unop(Iop_32HIto16, - mkexpr(t4))), - mkU32(0x00000000)), - mkexpr(t12), - mkexpr(t6))); - assign(t9, IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_16Sto32, - unop(Iop_32HIto16, - mkexpr(t4))), - mkU32(0xffffffff)), - binop(Iop_Or32, - getDSPControl(), - mkU32(0x400000)), - getDSPControl())); - putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_16Sto32, - unop(Iop_32HIto16, - mkexpr(t4))), - mkU32(0x00000000)), - mkexpr(t9), + assign(t5, unop(Iop_1Uto32, + binop(Iop_CmpNE32, + binop(Iop_Sar32, + mkexpr(t4), + mkU8(16)), + mkU32(0)))); + assign(t6, unop(Iop_1Uto32, + binop(Iop_CmpNE32, + binop(Iop_Sar32, + mkexpr(t4), + mkU8(16)), + mkU32(0xffffffff)))); + assign(t7, binop(Iop_And32, + mkexpr(t5), + mkexpr(t6))); + putDSPControl(IRExpr_ITE(binop(Iop_CmpEQ32, + mkexpr(t7), + mkU32(0x1)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x400000)), getDSPControl())); - assign(t10, binop(Iop_CmpEQ32, - binop(Iop_Shr32, - binop(Iop_And32, - getIReg(rt), - mkU32(0x80000000)), - mkU8(31)), - binop(Iop_Shr32, - binop(Iop_And32, - mkexpr(t4), - mkU32(0x00008000)), - mkU8(15)))); - putDSPControl(IRExpr_ITE(mkexpr(t10), + putDSPControl(IRExpr_ITE(binop(Iop_CmpEQ32, + mkexpr(t7), + mkU32(0x1)), + binop(Iop_Or32, + getDSPControl(), + mkU32(0x400000)), + getDSPControl())); + assign(t12, binop(Iop_Shl32, + binop(Iop_And32, + mkexpr(t4), + mkU32(0x8000)), + mkU8(16))); + 
putDSPControl(IRExpr_ITE(binop(Iop_CmpEQ32, + binop(Iop_And32, + getIReg(rt), + mkU32(0x80000000)), + mkexpr(t12)), getDSPControl(), binop(Iop_Or32, getDSPControl(), mkU32(0x400000)))); - - putIReg(rd, binop(Iop_16HLto32, - mkexpr(t7), mkexpr(t3))); + assign(t13, IRExpr_ITE(binop(Iop_CmpEQ32, + binop(Iop_And32, + getIReg(rt), + mkU32(0x80000000)), + mkU32(0)), + mkU32(0x7fff0000), + mkU32(0x80000000))); + assign(t9, + IRExpr_ITE(binop(Iop_CmpEQ32, + mkexpr(t7), + mkU32(0x1)), + mkexpr(t13), + binop(Iop_Shl32, + binop(Iop_And32, + mkexpr(t4), + mkU32(0x0000ffff)), + mkU8(16)))); + assign(t14, IRExpr_ITE(binop(Iop_CmpEQ32, + binop(Iop_And32, + getIReg(rt), + mkU32(0x80000000)), + mkU32(0)), + mkU32(0x7fff0000), + mkU32(0x80000000))); + assign(t11, + IRExpr_ITE(binop(Iop_CmpEQ32, + binop(Iop_And32, + getIReg(rt), + mkU32(0x80000000)), + binop(Iop_Shl32, + binop(Iop_And32, + mkexpr(t4), + mkU32(0x00008000)), + mkU8(16))), + mkexpr(t9), + mkexpr(t14))); + putIReg(rd, binop(Iop_Or32, + mkexpr(t10), + mkexpr(t11))); } break; } @@ -10831,10 +11216,9 @@ t8 = newTemp(Ity_I64); t9 = newTemp(Ity_I64); t10 = newTemp(Ity_I32); - t11 = newTemp(Ity_I32); assign(t0, getAcc(ac)); - /* Calculate first cross dot product and saturate if + /* Calculate the first cross dot product and saturate if needed. */ assign(t1, unop(Iop_32Sto64, binop(Iop_Shl32, @@ -10859,23 +11243,28 @@ unop(Iop_32to16, getIReg(rt))), mkU32(0x00008000))); - assign(t4, - IRExpr_ITE(mkexpr(t2), - IRExpr_ITE(mkexpr(t3), - mkU64(0x000000007fffffffULL), - mkexpr(t1)), - mkexpr(t1))); + assign(t4, IRExpr_ITE(binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_1Sto32, + mkexpr(t2)), + unop(Iop_1Sto32, + mkexpr(t3))), + mkU32(0)), + mkU64(0x000000007fffffffULL), + mkexpr(t1))); - putDSPControl(IRExpr_ITE(mkexpr(t2), - IRExpr_ITE(mkexpr(t3), - binop(Iop_Or32, - getDSPControl(), - binop(Iop_Shl32, - mkU32(0x1), - mkU8(ac+16) - ) - ), - getDSPControl()), + putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_1Sto32, + mkexpr(t2)), + unop(Iop_1Sto32, + mkexpr(t3))), + mkU32(0)), + binop(Iop_Or32, + getDSPControl(), + binop(Iop_Shl32, + mkU32(0x1), + mkU8(ac+16))), getDSPControl())); /* Calculate second cross dot product and saturate if needed. */ @@ -10902,29 +11291,35 @@ unop(Iop_32HIto16, getIReg(rt))), mkU32(0x00008000))); - assign(t8, - IRExpr_ITE(mkexpr(t6), - IRExpr_ITE(mkexpr(t7), - mkU64(0x000000007fffffffULL), - mkexpr(t5)), - mkexpr(t5))); + assign(t8, IRExpr_ITE(binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_1Sto32, + mkexpr(t6)), + unop(Iop_1Sto32, + mkexpr(t7))), + mkU32(0)), + mkU64(0x000000007fffffffULL), + mkexpr(t5))); - putDSPControl(IRExpr_ITE(mkexpr(t6), - IRExpr_ITE(mkexpr(t7), - binop(Iop_Or32, - getDSPControl(), - binop(Iop_Shl32, - mkU32(0x1), - mkU8(ac+16) - ) - ), - getDSPControl()), + putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_1Sto32, + mkexpr(t6)), + unop(Iop_1Sto32, + mkexpr(t7))), + mkU32(0)), + binop(Iop_Or32, + getDSPControl(), + binop(Iop_Shl32, + mkU32(0x1), + mkU8(ac+16))), getDSPControl())); - /* Add intermediate products with value in the + /* Subtract intermediate products from value in the accumulator. 
*/ - assign(t9, binop(Iop_Add64, - mkexpr(t0), - binop(Iop_Add64, mkexpr(t8), mkexpr(t4)))); + assign(t9, + binop(Iop_Add64, + mkexpr(t0), + binop(Iop_Add64, mkexpr(t8), mkexpr(t4)))); putAcc(ac, IRExpr_ITE(binop(Iop_CmpEQ32, @@ -10949,38 +11344,28 @@ mkU32(0xffffffff)), mkU64(0xffffffff80000000ULL), mkexpr(t9)))); - assign(t10, IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - binop(Iop_Shl64, - mkexpr(t9), - mkU8(1))), - mkU32(0x0)), - binop(Iop_Or32, - getDSPControl(), - binop(Iop_Shl32, - mkU32(0x1), - mkU8(ac+16))), - getDSPControl())); - assign(t11, IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - binop(Iop_Shl64, - mkexpr(t9), - mkU8(1))), - mkU32(0xffffffff)), - binop(Iop_Or32, - getDSPControl(), - binop(Iop_Shl32, - mkU32(0x1), - mkU8(ac+16))), - getDSPControl())); + assign(t10, IRExpr_ITE(binop(Iop_CmpEQ32, + unop(Iop_64to32, + mkexpr(t9)), + unop(Iop_64to32, + getAcc(ac))), + getDSPControl(), + binop(Iop_Or32, + getDSPControl(), + binop(Iop_Shl32, + mkU32(0x1), + mkU8(ac+16))))); putDSPControl(IRExpr_ITE(binop(Iop_CmpEQ32, - binop(Iop_And32, - unop(Iop_64HIto32, - mkexpr(t9)), - mkU32(0x80000000)), - mkU32(0x0)), + unop(Iop_64HIto32, + mkexpr(t9)), + unop(Iop_64HIto32, + getAcc(ac))), mkexpr(t10), - mkexpr(t11))); + binop(Iop_Or32, + getDSPControl(), + binop(Iop_Shl32, + mkU32(0x1), + mkU8(ac+16))))); break; } case 0x1B: { /* DPSQX_SA.W.PH */ @@ -10997,10 +11382,9 @@ t8 = newTemp(Ity_I64); t9 = newTemp(Ity_I64); t10 = newTemp(Ity_I32); - t11 = newTemp(Ity_I32); assign(t0, getAcc(ac)); - /* Calculate first cross dot product and saturate if + /* Calculate the first cross dot product and saturate if needed. */ assign(t1, unop(Iop_32Sto64, binop(Iop_Shl32, @@ -11025,23 +11409,28 @@ unop(Iop_32to16, getIReg(rt))), mkU32(0x00008000))); - assign(t4, - IRExpr_ITE(mkexpr(t2), - IRExpr_ITE(mkexpr(t3), - mkU64(0x000000007fffffffULL), - mkexpr(t1)), - mkexpr(t1))); + assign(t4, IRExpr_ITE(binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_1Sto32, + mkexpr(t2)), + unop(Iop_1Sto32, + mkexpr(t3))), + mkU32(0)), + mkU64(0x000000007fffffffULL), + mkexpr(t1))); - putDSPControl(IRExpr_ITE(mkexpr(t2), - IRExpr_ITE(mkexpr(t3), - binop(Iop_Or32, - getDSPControl(), - binop(Iop_Shl32, - mkU32(0x1), - mkU8(ac+16) - ) - ), - getDSPControl()), + putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_1Sto32, + mkexpr(t2)), + unop(Iop_1Sto32, + mkexpr(t3))), + mkU32(0)), + binop(Iop_Or32, + getDSPControl(), + binop(Iop_Shl32, + mkU32(0x1), + mkU8(ac+16))), getDSPControl())); /* Calculate second cross dot product and saturate if needed. */ @@ -11060,31 +11449,36 @@ intermediate product and write to DSPControl register. 
*/ assign(t6, binop(Iop_CmpEQ32, - binop(Iop_And32, - getIReg(rs), - mkU32(0x0000ffff)), + unop(Iop_16Uto32, + unop(Iop_32to16, getIReg(rs))), mkU32(0x00008000))); assign(t7, binop(Iop_CmpEQ32, - binop(Iop_And32, - getIReg(rt), - mkU32(0xffff0000)), - mkU32(0x80000000))); + unop(Iop_16Uto32, + unop(Iop_32HIto16, getIReg(rt))), + mkU32(0x00008000))); - assign(t8, - IRExpr_ITE(mkexpr(t6), - IRExpr_ITE(mkexpr(t7), - mkU64(0x000000007fffffffULL), - mkexpr(t5)), - mkexpr(t5))); + assign(t8, IRExpr_ITE(binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_1Sto32, + mkexpr(t6)), + unop(Iop_1Sto32, + mkexpr(t7))), + mkU32(0)), + mkU64(0x000000007fffffffULL), + mkexpr(t5))); - putDSPControl(IRExpr_ITE(mkexpr(t6), - IRExpr_ITE(mkexpr(t7), - binop(Iop_Or32, - getDSPControl(), - binop(Iop_Shl32, - mkU32(0x1), - mkU8(ac+16))), - getDSPControl()), + putDSPControl(IRExpr_ITE(binop(Iop_CmpNE32, + binop(Iop_And32, + unop(Iop_1Sto32, + mkexpr(t6)), + unop(Iop_1Sto32, + mkexpr(t7))), + mkU32(0)), + binop(Iop_Or32, + getDSPControl(), + binop(Iop_Shl32, + mkU32(0x1), + mkU8(ac+16))), getDSPControl())); /* Subtract intermediate products from value in the accumulator. */ @@ -11116,38 +11510,28 @@ mkU32(0xffffffff)), mkU64(0xffffffff80000000ULL), mkexpr(t9)))); - assign(t10, IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - binop(Iop_Shl64, - mkexpr(t9), - mkU8(1))), - mkU32(0x0)), - binop(Iop_Or32, - getDSPControl(), - binop(Iop_Shl32, - mkU32(0x1), - mkU8(ac+16))), - getDSPControl())); - assign(t11, IRExpr_ITE(binop(Iop_CmpNE32, - unop(Iop_64HIto32, - binop(Iop_Shl64, - mkexpr(t9), - mkU8(1))), - mkU32(0xffffffff)), - binop(Iop_Or32, - getDSPControl(), - binop(Iop_Shl32, - mkU32(0x1), - mkU8(ac+16))), - getDSPControl())); + assign(t10, IRExpr_ITE(binop(Iop_CmpEQ32, + unop(Iop_64to32, + mkexpr(t9)), + unop(Iop_64to32, + getAcc(ac))), + getDSPControl(), + binop(Iop_Or32, + getDSPControl(), + binop(Iop_Shl32, + mkU32(0x1), + mkU8(ac+16))))); putDSPControl(IRExpr_ITE(binop(Iop_CmpEQ32, - binop(Iop_And32, - unop(Iop_64HIto32, - mkexpr(t9)), - mkU32(0x80000000)), - mkU32(0x0)), + unop(Iop_64HIto32, + mkexpr(t9)), + unop(Iop_64HIto32, + getAcc(ac))), mkexpr(t10), - mkexpr(t11))); + binop(Iop_Or32, + getDSPControl(), + binop(Iop_Shl32, + mkU32(0x1), + mkU8(ac+16))))); break; } default: @@ -11417,22 +11801,22 @@ vex_inject_ir(irsb, Iend_BE); #endif if (mode64) { - stmt(IRStmt_Put(offsetof(VexGuestMIPS64State, guest_TISTART), + stmt(IRStmt_Put(offsetof(VexGuestMIPS64State, guest_CMSTART), mkU64(guest_PC_curr_instr))); - stmt(IRStmt_Put(offsetof(VexGuestMIPS64State, guest_TILEN), + stmt(IRStmt_Put(offsetof(VexGuestMIPS64State, guest_CMLEN), mkU64(20))); putPC(mkU64(guest_PC_curr_instr + 20)); } else { - stmt(IRStmt_Put(offsetof(VexGuestMIPS32State, guest_TISTART), + stmt(IRStmt_Put(offsetof(VexGuestMIPS32State, guest_CMSTART), mkU32(guest_PC_curr_instr))); - stmt(IRStmt_Put(offsetof(VexGuestMIPS32State, guest_TILEN), + stmt(IRStmt_Put(offsetof(VexGuestMIPS32State, guest_CMLEN), mkU32(20))); putPC(mkU32(guest_PC_curr_instr + 20)); } dres.whatNext = Dis_StopHere; - dres.jk_StopHere = Ijk_TInval; + dres.jk_StopHere = Ijk_InvalICache; dres.len = 20; delta += 20; goto decode_success; @@ -11462,7 +11846,7 @@ trap_code = get_code(cins); function = get_function(cins); IRType ty = mode64 ? Ity_I64 : Ity_I32; - IRType tyF = mode64 ? Ity_F64 : Ity_F32; + IRType tyF = fp_mode64 ? 
Ity_F64 : Ity_F32; ac = get_acNo(cins); @@ -11495,102 +11879,112 @@ lastn = mkexpr(t0); break; - case 0x11: /* COP1 */ - { + case 0x11: { /* COP1 */ + if (fmt == 0x3 && fd == 0 && function == 0) { /* MFHC1 */ + DIP("mfhc1 r%d, f%d", rt, fs); + if (fp_mode64) { + t0 = newTemp(Ity_I64); + t1 = newTemp(Ity_I32); + assign(t0, unop(Iop_ReinterpF64asI64, getDReg(fs))); + assign(t1, unop(Iop_64HIto32, mkexpr(t0))); + putIReg(rt, mkWidenFrom32(ty, mkexpr(t1), True)); + } else { + ILLEGAL_INSTRUCTON; + } + break; + } else if (fmt == 0x7 && fd == 0 && function == 0) { /* MTHC1 */ + DIP("mthc1 r%d, f%d", rt, fs); + if (fp_mode64) { + t0 = newTemp(Ity_I64); + assign(t0, binop(Iop_32HLto64, getIReg(rt), + unop(Iop_ReinterpF32asI32, + getLoFromF64(Ity_F64 /* 32FPR mode. */, + getDReg(fs))))); + putDReg(fs, unop(Iop_ReinterpI64asF64, mkexpr(t0))); + } else { + ILLEGAL_INSTRUCTON; + } + break; + } else if (fmt == 0x8) { /* BC */ + /* FcConditionalCode(bc1_cc) */ UInt bc1_cc = get_bc1_cc(cins); - if (0x08 == fmt) { - switch (fmt) { - case 0x08: /* BC */ - { - DIP("tf: %d, nd: %d", tf, nd); - /* FcConditionalCode(bc1_cc) */ - t1 = newTemp(Ity_I1); - t2 = newTemp(Ity_I32); - t3 = newTemp(Ity_I1); + t1 = newTemp(Ity_I1); + t2 = newTemp(Ity_I32); + t3 = newTemp(Ity_I1); - assign(t1, binop(Iop_CmpEQ32, mkU32(0), mkU32(bc1_cc))); - assign(t2, IRExpr_ITE(mkexpr(t1), - binop(Iop_And32, - binop(Iop_Shr32, getFCSR(), - mkU8(23)), - mkU32(0x1)), - binop(Iop_And32, - binop(Iop_Shr32, getFCSR(), - mkU8(24 + bc1_cc)), - mkU32(0x1)) - )); + assign(t1, binop(Iop_CmpEQ32, mkU32(0), mkU32(bc1_cc))); + assign(t2, IRExpr_ITE(mkexpr(t1), + binop(Iop_And32, + binop(Iop_Shr32, getFCSR(), mkU8(23)), + mkU32(0x1)), + binop(Iop_And32, + binop(Iop_Shr32, getFCSR(), + mkU8(24 + bc1_cc)), + mkU32(0x1)))); - if (tf == 1 && nd == 0) { - /* branch on true */ - DIP("bc1t %d, %d", bc1_cc, imm); - assign(t3, binop(Iop_CmpEQ32, mkU32(1), mkexpr(t2))); - dis_branch(False, mkexpr(t3), imm, &bstmt); + if (tf == 1 && nd == 0) { + /* branch on true */ + DIP("bc1t %d, %d", bc1_cc, imm); + assign(t3, binop(Iop_CmpEQ32, mkU32(1), mkexpr(t2))); + dis_branch(False, mkexpr(t3), imm, &bstmt); + break; + } else if (tf == 0 && nd == 0) { + /* branch on false */ + DIP("bc1f %d, %d", bc1_cc, imm); + assign(t3, binop(Iop_CmpEQ32, mkU32(0), mkexpr(t2))); + dis_branch(False, mkexpr(t3), imm, &bstmt); + break; + } else if (nd == 1 && tf == 0) { + DIP("bc1fl %d, %d", bc1_cc, imm); + lastn = dis_branch_likely(binop(Iop_CmpNE32, mkexpr(t2), + mkU32(0x0)), imm); + break; + } else if (nd == 1 && tf == 1) { + DIP("bc1tl %d, %d", bc1_cc, imm); + lastn = dis_branch_likely(binop(Iop_CmpEQ32, mkexpr(t2), + mkU32(0x0)), imm); + break; + } else + goto decode_failure; + } else { + switch (function) { + case 0x4: { /* SQRT.fmt */ + switch (fmt) { + case 0x10: { /* S */ + IRExpr *rm = get_IR_roundingmode(); + putFReg(fd, mkWidenFromF32(tyF, binop(Iop_SqrtF32, rm, + getLoFromF64(tyF, getFReg(fs))))); break; - } else if (tf == 0 && nd == 0) { - /* branch on false */ - DIP("bc1f %d, %d", bc1_cc, imm); - assign(t3, binop(Iop_CmpEQ32, mkU32(0), mkexpr(t2))); - dis_branch(False, mkexpr(t3), imm, &bstmt); + } + case 0x11: { /* D */ + IRExpr *rm = get_IR_roundingmode(); + putDReg(fd, binop(Iop_SqrtF64, rm, getDReg(fs))); break; - } else if (nd == 1 && tf == 0) { - DIP("bc1fl %d, %d", bc1_cc, imm); - lastn = dis_branch_likely(binop(Iop_CmpNE32, mkexpr(t2), - mkU32(0x0)), imm); - break; - } else if (nd == 1 && tf == 1) { - DIP("bc1tl %d, %d", bc1_cc, imm); - lastn = 
dis_branch_likely(binop(Iop_CmpEQ32, mkexpr(t2), - mkU32(0x0)), imm); - break; - } else + } + default: goto decode_failure; - } - - default: - goto decode_failure; - } - } else { - switch (function) { - - case 0x4: /* SQRT.fmt */ - { - switch (fmt) { - case 0x10: /* S */ - { - IRExpr *rm = get_IR_roundingmode(); - putFReg(fd, mkWidenFromF32(tyF, binop(Iop_SqrtF32, rm, - getLoFromF64(tyF, getFReg(fs))))); - } - break; - case 0x11: /* D */ - { - IRExpr *rm = get_IR_roundingmode(); - putDReg(fd, binop(Iop_SqrtF64, rm, getDReg(fs))); - } - break; } } break; case 0x5: /* abs.fmt */ switch (fmt) { - case 0x10: /* S */ - DIP("abs.s f%d, f%d", fd, fs); - putFReg(fd, mkWidenFromF32(tyF, unop(Iop_AbsF32, - getLoFromF64(tyF, getFReg(fs))))); - break; - case 0x11: /* D */ - DIP("abs.d f%d, f%d", fd, fs); - putDReg(fd, unop(Iop_AbsF64, getDReg(fs))); - break; - default: - goto decode_failure; + case 0x10: /* S */ + DIP("abs.s f%d, f%d", fd, fs); + putFReg(fd, mkWidenFromF32(tyF, unop(Iop_AbsF32, + getLoFromF64(tyF, getFReg(fs))))); + break; + case 0x11: /* D */ + DIP("abs.d f%d, f%d", fd, fs); + putDReg(fd, unop(Iop_AbsF64, getDReg(fs))); + break; + default: + goto decode_failure; } break; /* case 0x5 */ case 0x02: /* MUL.fmt */ switch (fmt) { - case 0x11: /* D */ - { + case 0x11: { /* D */ DIP("mul.d f%d, f%d, f%d", fd, fs, ft); IRExpr *rm = get_IR_roundingmode(); putDReg(fd, triop(Iop_MulF64, rm, getDReg(fs), @@ -11597,8 +11991,7 @@ getDReg(ft))); break; } - case 0x10: /* S */ - { + case 0x10: { /* S */ DIP("mul.s f%d, f%d, f%d", fd, fs, ft); IRExpr *rm = get_IR_roundingmode(); putFReg(fd, mkWidenFromF32(tyF, triop(Iop_MulF32, rm, @@ -11606,15 +11999,14 @@ getLoFromF64(tyF, getFReg(ft))))); break; } - default: - goto decode_failure; + default: + goto decode_failure; } break; /* MUL.fmt */ case 0x03: /* DIV.fmt */ switch (fmt) { - case 0x11: /* D */ - { + case 0x11: { /* D */ DIP("div.d f%d, f%d, f%d", fd, fs, ft); IRExpr *rm = get_IR_roundingmode(); putDReg(fd, triop(Iop_DivF64, rm, getDReg(fs), @@ -11621,9 +12013,9 @@ getDReg(ft))); break; } - case 0x10: /* S */ - { + case 0x10: { /* S */ DIP("div.s f%d, f%d, f%d", fd, fs, ft); + calculateFCSR(fs, ft, DIVS, False, 2); IRExpr *rm = get_IR_roundingmode(); putFReg(fd, mkWidenFromF32(tyF, triop(Iop_DivF32, rm, getLoFromF64(tyF, getFReg(fs)), @@ -11630,24 +12022,24 @@ getLoFromF64(tyF, getFReg(ft))))); break; } - default: - goto decode_failure; + default: + goto decode_failure; } break; /* DIV.fmt */ case 0x01: /* SUB.fmt */ switch (fmt) { - case 0x11: /* D */ - { + case 0x11: { /* D */ DIP("sub.d f%d, f%d, f%d", fd, fs, ft); + calculateFCSR(fs, ft, SUBD, False, 2); IRExpr *rm = get_IR_roundingmode(); putDReg(fd, triop(Iop_SubF64, rm, getDReg(fs), getDReg(ft))); break; } - case 0x10: /* S */ - { + case 0x10: { /* S */ DIP("sub.s f%d, f%d, f%d", fd, fs, ft); + calculateFCSR(fs, ft, SUBS, True, 2); IRExpr *rm = get_IR_roundingmode(); putFReg(fd, mkWidenFromF32(tyF, triop(Iop_SubF32, rm, getLoFromF64(tyF, getFReg(fs)), @@ -11654,8 +12046,8 @@ getLoFromF64(tyF, getFReg(ft))))); break; } - default: - goto decode_failure; + default: + goto decode_failure; } break; /* SUB.fmt */ @@ -11663,8 +12055,8 @@ switch (fmt) { case 0x11: /* D */ DIP("mov.d f%d, f%d", fd, fs); - if (mode64) { - putFReg(fd, getFReg(fs)); + if (fp_mode64) { + putDReg(fd, getDReg(fs)); } else { putFReg(fd, getFReg(fs)); putFReg(fd + 1, getFReg(fs + 1)); @@ -11699,19 +12091,27 @@ switch (fmt) { case 0x10: /* S */ DIP("round.l.s f%d, f%d", fd, fs); - calculateFCSR(fs, ROUNDLS, True); - t0 = 
newTemp(Ity_I64); + if (fp_mode64) { + calculateFCSR(fs, 0, ROUNDLS, True, 1); + t0 = newTemp(Ity_I64); - assign(t0, binop(Iop_F32toI64S, mkU32(0x0), - getLoFromF64(Ity_F64, getFReg(fs)))); + assign(t0, binop(Iop_F32toI64S, mkU32(0x0), + getLoFromF64(Ity_F64, getFReg(fs)))); - putFReg(fd, unop(Iop_ReinterpI64asF64, mkexpr(t0))); - break; + putDReg(fd, unop(Iop_ReinterpI64asF64, mkexpr(t0))); + } else { + ILLEGAL_INSTRUCTON; + } + break; case 0x11: /* D */ DIP("round.l.d f%d, f%d", fd, fs); - calculateFCSR(fs, ROUNDLD, False); - putFReg(fd, binop(Iop_RoundF64toInt, mkU32(0x0), - getFReg(fs))); + if (fp_mode64) { + calculateFCSR(fs, 0, ROUNDLD, False, 1); + putDReg(fd, binop(Iop_RoundF64toInt, mkU32(0x0), + getDReg(fs))); + } else { + ILLEGAL_INSTRUCTON; + } break; default: goto decode_failure; @@ -11723,18 +12123,26 @@ switch (fmt) { case 0x10: /* S */ DIP("trunc.l.s f%d, f%d", fd, fs); - calculateFCSR(fs, TRUNCLS, True); - t0 = newTemp(Ity_I64); - assign(t0, binop(Iop_F32toI64S, mkU32(0x3), - getLoFromF64(Ity_F64, getFReg(fs)))); + if (fp_mode64) { + calculateFCSR(fs, 0, TRUNCLS, True, 1); + t0 = newTemp(Ity_I64); + assign(t0, binop(Iop_F32toI64S, mkU32(0x3), + getLoFromF64(Ity_F64, getFReg(fs)))); - putFReg(fd, unop(Iop_ReinterpI64asF64, mkexpr(t0))); + putDReg(fd, unop(Iop_ReinterpI64asF64, mkexpr(t0))); + } else { + ILLEGAL_INSTRUCTON; + } break; case 0x11: /* D */ DIP("trunc.l.d f%d, f%d", fd, fs); - calculateFCSR(fs, TRUNCLD, False); - putFReg(fd, binop(Iop_RoundF64toInt, mkU32(0x3), - getFReg(fs))); + if (fp_mode64) { + calculateFCSR(fs, 0, TRUNCLD, False, 1); + putDReg(fd, binop(Iop_RoundF64toInt, mkU32(0x3), + getDReg(fs))); + } else { + ILLEGAL_INSTRUCTON; + } break; default: goto decode_failure; @@ -11771,7 +12179,6 @@ switch (fmt) { case 0x10: /* S */ DIP("movn.s f%d, f%d, r%d", fd, fs, rt); - t1 = newTemp(Ity_F64); t2 = newTemp(Ity_F64); t3 = newTemp(Ity_I1); @@ -11781,13 +12188,19 @@ assign(t2, getFReg(fd)); assign(t3, binop(Iop_CmpNE64, mkU64(0), getIReg(rt))); } else { - assign(t1, unop(Iop_F32toF64, getFReg(fs))); - assign(t2, unop(Iop_F32toF64, getFReg(fd))); - assign(t3, binop(Iop_CmpNE32, mkU32(0), getIReg(rt))); + if (fp_mode64) { + assign(t1, getFReg(fs)); + assign(t2, getFReg(fd)); + assign(t3, binop(Iop_CmpNE32, mkU32(0), getIReg(rt))); + } else { + assign(t1, unop(Iop_F32toF64, getFReg(fs))); + assign(t2, unop(Iop_F32toF64, getFReg(fd))); + assign(t3, binop(Iop_CmpNE32, mkU32(0), getIReg(rt))); + } } assign(t4, IRExpr_ITE(mkexpr(t3), mkexpr(t1), mkexpr(t2))); - if (mode64) { + if (fp_mode64) { IRTemp f = newTemp(Ity_F64); IRTemp fd_hi = newTemp(Ity_I32); t5 = newTemp(Ity_I64); @@ -11795,7 +12208,7 @@ assign(fd_hi, unop(Iop_64HIto32, unop(Iop_ReinterpF64asI64, mkexpr(f)))); - assign(t5, mkWidenFrom32(ty, unop(Iop_64to32, + assign(t5, mkWidenFrom32(Ity_I64, unop(Iop_64to32, unop(Iop_ReinterpF64asI64, mkexpr(t4))), True)); putFReg(fd, unop (Iop_ReinterpI64asF64, mkexpr(t5))); @@ -11830,10 +12243,13 @@ t2 = newTemp(Ity_F64); t3 = newTemp(Ity_I1); t4 = newTemp(Ity_F64); - if (mode64) { + if (fp_mode64) { assign(t1, getFReg(fs)); assign(t2, getFReg(fd)); - assign(t3, binop(Iop_CmpEQ64, mkU64(0), getIReg(rt))); + if (mode64) + assign(t3, binop(Iop_CmpEQ64, mkU64(0), getIReg(rt))); + else + assign(t3, binop(Iop_CmpEQ32, mkU32(0), getIReg(rt))); } else { assign(t1, unop(Iop_F32toF64, getFReg(fs))); assign(t2, unop(Iop_F32toF64, getFReg(fd))); @@ -11841,7 +12257,7 @@ } assign(t4, IRExpr_ITE(mkexpr(t3), mkexpr(t1), mkexpr(t2))); - if (mode64) { + if (fp_mode64) { IRTemp f = 
newTemp(Ity_F64); IRTemp fd_hi = newTemp(Ity_I32); t7 = newTemp(Ity_I64); @@ -11848,7 +12264,7 @@ assign(f, getFReg(fd)); assign(fd_hi, unop(Iop_64HIto32, unop(Iop_ReinterpF64asI64, mkexpr(f)))); - assign(t7, mkWidenFrom32(ty, unop(Iop_64to32, + assign(t7, mkWidenFrom32(Ity_I64, unop(Iop_64to32, unop(Iop_ReinterpF64asI64, mkexpr(t4))), True)); putFReg(fd, unop(Iop_ReinterpI64asF64, mkexpr(t7))); @@ -11911,7 +12327,7 @@ t6 = newTemp(Ity_F64); t7 = newTemp(Ity_I64); - if (mode64) { + if (fp_mode64) { assign(t5, getFReg(fs)); assign(t6, getFReg(fd)); } else { @@ -11935,13 +12351,13 @@ assign(t4, IRExpr_ITE(mkexpr(t3), mkexpr(t5), mkexpr(t6))); - if (mode64) { + if (fp_mode64) { IRTemp f = newTemp(Ity_F64); IRTemp fd_hi = newTemp(Ity_I32); assign(f, getFReg(fd)); assign(fd_hi, unop(Iop_64HIto32, unop(Iop_ReinterpF64asI64, mkexpr(f)))); - assign(t7, mkWidenFrom32(ty, unop(Iop_64to32, + assign(t7, mkWidenFrom32(Ity_I64, unop(Iop_64to32, unop(Iop_ReinterpF64asI64, mkexpr(t4))), True)); @@ -11991,7 +12407,7 @@ t5 = newTemp(Ity_F64); t6 = newTemp(Ity_F64); - if (mode64) { + if (fp_mode64) { assign(t5, getFReg(fs)); assign(t6, getFReg(fd)); } else { @@ -12015,7 +12431,7 @@ assign(t4, IRExpr_ITE(mkexpr(t3), mkexpr(t5), mkexpr(t6))); - if (mode64) { + if (fp_mode64) { IRTemp f = newTemp(Ity_F64); IRTemp fd_hi = newTemp(Ity_I32); t7 = newTemp(Ity_I64); @@ -12022,7 +12438,7 @@ assign(f, getFReg(fd)); assign(fd_hi, unop(Iop_64HIto32, unop(Iop_ReinterpF64asI64, mkexpr(f)))); - assign(t7, mkWidenFrom32(ty, unop(Iop_64to32, + assign(t7, mkWidenFrom32(Ity_I64, unop(Iop_64to32, unop(Iop_ReinterpF64asI64, mkexpr(t4))), True)); @@ -12040,17 +12456,18 @@ case 0x0: /* add.fmt */ switch (fmt) { - case 0x10: /* S */ - { - DIP("add.s f%d, f%d, f%d", fd, fs, ft); - IRExpr *rm = get_IR_roundingmode(); - putFReg(fd, mkWidenFromF32(tyF, triop(Iop_AddF32, rm, - getLoFromF64(tyF, getFReg(fs)), - getLoFromF64(tyF, getFReg(ft))))); - break; - } + case 0x10: { /* S */ + DIP("add.s f%d, f%d, f%d", fd, fs, ft); + calculateFCSR(fs, ft, ADDS, True, 2); + IRExpr *rm = get_IR_roundingmode(); + putFReg(fd, mkWidenFromF32(tyF, triop(Iop_AddF32, rm, + getLoFromF64(tyF, getFReg(fs)), + getLoFromF64(tyF, getFReg(ft))))); + break; + } case 0x11: { /* D */ DIP("add.d f%d, f%d, f%d", fd, fs, ft); + calculateFCSR(fs, ft, ADDD, False, 2); IRExpr *rm = get_IR_roundingmode(); putDReg(fd, triop(Iop_AddF64, rm, getDReg(fs), getDReg(ft))); break; @@ -12058,10 +12475,10 @@ case 0x4: /* MTC1 (Move Word to Floating Point) */ DIP("mtc1 r%d, f%d", rt, fs); - if (mode64) { + if (fp_mode64) { t0 = newTemp(Ity_I32); t1 = newTemp(Ity_F32); - assign(t0, unop(Iop_64to32, getIReg(rt))); + assign(t0, mkNarrowTo32(ty, getIReg(rt))); assign(t1, unop(Iop_ReinterpI32asF32, mkexpr(t0))); putFReg(fs, mkWidenFromF32(tyF, mkexpr(t1))); @@ -12077,7 +12494,7 @@ case 0x0: /* MFC1 */ DIP("mfc1 r%d, f%d", rt, fs); - if (mode64) { + if (fp_mode64) { t0 = newTemp(Ity_I64); t1 = newTemp(Ity_I32); assign(t0, unop(Iop_ReinterpF64asI64, getFReg(fs))); @@ -12200,8 +12617,8 @@ switch (fmt) { case 0x10: /* S */ DIP("cvt.d.s f%d, f%d", fd, fs); - calculateFCSR(fs, CVTDS, True); - if (mode64) { + calculateFCSR(fs, 0, CVTDS, True, 1); + if (fp_mode64) { t0 = newTemp(Ity_I64); t1 = newTemp(Ity_I32); t3 = newTemp(Ity_F32); @@ -12220,8 +12637,8 @@ case 0x14: DIP("cvt.d.w %d, %d", fd, fs); - calculateFCSR(fs, CVTDW, True); - if (mode64) { + calculateFCSR(fs, 0, CVTDW, True, 1); + if (fp_mode64) { t0 = newTemp(Ity_I64); t1 = newTemp(Ity_I32); t3 = newTemp(Ity_F32); @@ -12240,9 +12657,9 
@@ } case 0x15: { /* L */ - if (mode64) { + if (fp_mode64) { DIP("cvt.d.l %d, %d", fd, fs); - calculateFCSR(fs, CVTDL, False); + calculateFCSR(fs, 0, CVTDL, False, 1); t0 = newTemp(Ity_I64); assign(t0, unop(Iop_ReinterpF64asI64, getFReg(fs))); @@ -12261,8 +12678,8 @@ switch (fmt) { case 0x14: /* W */ DIP("cvt.s.w %d, %d", fd, fs); - calculateFCSR(fs, CVTSW, True); - if (mode64) { + calculateFCSR(fs, 0, CVTSW, True, 1); + if (fp_mode64) { t0 = newTemp(Ity_I64); t1 = newTemp(Ity_I32); t3 = newTemp(Ity_F32); @@ -12283,20 +12700,16 @@ case 0x11: /* D */ DIP("cvt.s.d %d, %d", fd, fs); - calculateFCSR(fs, CVTSD, False); - if (mode64) { - t0 = newTemp(Ity_F32); - assign(t0, binop(Iop_F64toF32, get_IR_roundingmode(), - getFReg(fs))); - putFReg(fd, mkWidenFromF32(tyF, mkexpr(t0))); - } else - putFReg(fd, binop(Iop_F64toF32, get_IR_roundingmode(), - getDReg(fs))); + calculateFCSR(fs, 0, CVTSD, False, 1); + t0 = newTemp(Ity_F32); + assign(t0, binop(Iop_F64toF32, get_IR_roundingmode(), + getDReg(fs))); + putFReg(fd, mkWidenFromF32(tyF, mkexpr(t0))); break; case 0x15: /* L */ DIP("cvt.s.l %d, %d", fd, fs); - calculateFCSR(fs, CVTSL, False); + calculateFCSR(fs, 0, CVTSL, False, 1); t0 = newTemp(Ity_I64); assign(t0, unop(Iop_ReinterpF64asI64, getFReg(fs))); @@ -12313,34 +12726,24 @@ switch (fmt) { case 0x10: /* S */ DIP("cvt.w.s %d, %d", fd, fs); - calculateFCSR(fs, CVTWS, True); - if (mode64) { - putFReg(fd, mkWidenFromF32(tyF, binop(Iop_RoundF32toInt, - get_IR_roundingmode(), getLoFromF64(tyF, - getFReg(fs))))); - } else - putFReg(fd, binop(Iop_RoundF32toInt, get_IR_roundingmode(), - getFReg(fs))); + calculateFCSR(fs, 0, CVTWS, True, 1); + putFReg(fd, + mkWidenFromF32(tyF, + binop(Iop_RoundF32toInt, + get_IR_roundingmode(), + getLoFromF64(tyF, getFReg(fs)))) + ); break; case 0x11: DIP("cvt.w.d %d, %d", fd, fs); - calculateFCSR(fs, CVTWD, False); - if (mode64) { - t0 = newTemp(Ity_I32); - t1 = newTemp(Ity_F32); - assign(t0, binop(Iop_F64toI32S, get_IR_roundingmode(), - getFReg(fs))); - assign(t1, unop(Iop_ReinterpI32asF32, mkexpr(t0))); - putFReg(fd, mkWidenFromF32(tyF, mkexpr(t1))); - } else { - t0 = newTemp(Ity_I32); - - assign(t0, binop(Iop_F64toI32S, get_IR_roundingmode(), - getDReg(fs))); - - putFReg(fd, unop(Iop_ReinterpI32asF32, mkexpr(t0))); - } + calculateFCSR(fs, 0, CVTWD, False, 1); + t0 = newTemp(Ity_I32); + t1 = newTemp(Ity_F32); + assign(t0, binop(Iop_F64toI32S, get_IR_roundingmode(), + getDReg(fs))); + assign(t1, unop(Iop_ReinterpI32asF32, mkexpr(t0))); + putFReg(fd, mkWidenFromF32(tyF, mkexpr(t1))); break; default: @@ -12353,20 +12756,28 @@ switch (fmt) { case 0x10: /* S */ DIP("cvt.l.s %d, %d", fd, fs); - calculateFCSR(fs, CVTLS, True); - t0 = newTemp(Ity_I64); + if (fp_mode64) { + calculateFCSR(fs, 0, CVTLS, True, 1); + t0 = newTemp(Ity_I64); - assign(t0, binop(Iop_F32toI64S, get_IR_roundingmode(), - getLoFromF64(Ity_F64, getFReg(fs)))); + assign(t0, binop(Iop_F32toI64S, get_IR_roundingmode(), + getLoFromF64(tyF, getFReg(fs)))); - putFReg(fd, unop(Iop_ReinterpI64asF64, mkexpr(t0))); + putDReg(fd, unop(Iop_ReinterpI64asF64, mkexpr(t0))); + } else { + ILLEGAL_INSTRUCTON; + } break; case 0x11: { /* D */ DIP("cvt.l.d %d, %d", fd, fs); - calculateFCSR(fs, CVTLD, False); - putFReg(fd, binop(Iop_RoundF64toInt, - get_IR_roundingmode(), getFReg(fs))); + if (fp_mode64) { + calculateFCSR(fs, 0, CVTLD, False, 1); + putDReg(fd, binop(Iop_RoundF64toInt, + get_IR_roundingmode(), getDReg(fs))); + } else { + ILLEGAL_INSTRUCTON; + } break; } @@ -12379,20 +12790,28 @@ switch (fmt) { case 0x10: /* S */ 
DIP("floor.l.s %d, %d", fd, fs); - calculateFCSR(fs, FLOORLS, True); - t0 = newTemp(Ity_I64); + if (fp_mode64) { + calculateFCSR(fs, 0, FLOORLS, True, 1); + t0 = newTemp(Ity_I64); - assign(t0, binop(Iop_F32toI64S, mkU32(0x1), - getLoFromF64(Ity_F64, getFReg(fs)))); + assign(t0, binop(Iop_F32toI64S, mkU32(0x1), + getLoFromF64(tyF, getFReg(fs)))); - putFReg(fd, unop(Iop_ReinterpI64asF64, mkexpr(t0))); + putDReg(fd, unop(Iop_ReinterpI64asF64, mkexpr(t0))); + } else { + ILLEGAL_INSTRUCTON; + } break; case 0x11: /* D */ DIP("floor.l.d %d, %d", fd, fs); - calculateFCSR(fs, FLOORLD, False); - putFReg(fd, binop(Iop_RoundF64toInt, mkU32(0x1), - getFReg(fs))); + if (fp_mode64) { + calculateFCSR(fs, 0, FLOORLD, False, 1); + putDReg(fd, binop(Iop_RoundF64toInt, mkU32(0x1), + getDReg(fs))); + } else { + ILLEGAL_INSTRUCTON; + } break; default: goto decode_failure; @@ -12403,8 +12822,8 @@ switch (fmt) { case 0x10: /* S */ DIP("round.w.s f%d, f%d", fd, fs); - calculateFCSR(fs, ROUNDWS, True); - if (mode64) { + calculateFCSR(fs, 0, ROUNDWS, True, 1); + if (fp_mode64) { t0 = newTemp(Ity_I64); t1 = newTemp(Ity_I32); t3 = newTemp(Ity_F32); @@ -12427,8 +12846,8 @@ case 0x11: /* D */ DIP("round.w.d f%d, f%d", fd, fs); - calculateFCSR(fs, ROUNDWD, False); - if (mode64) { + calculateFCSR(fs, 0, ROUNDWD, False, 1); + if (fp_mode64) { t0 = newTemp(Ity_I32); assign(t0, binop(Iop_F64toI32S, mkU32(0x0), getDReg(fs))); @@ -12453,8 +12872,8 @@ switch (fmt) { case 0x10: /* S */ DIP("floor.w.s f%d, f%d", fd, fs); - calculateFCSR(fs, FLOORWS, True); - if (mode64) { + calculateFCSR(fs, 0, FLOORWS, True, 1); + if (fp_mode64) { t0 = newTemp(Ity_I64); t1 = newTemp(Ity_I32); t3 = newTemp(Ity_F32); @@ -12477,8 +12896,8 @@ case 0x11: /* D */ DIP("floor.w.d f%d, f%d", fd, fs); - calculateFCSR(fs, FLOORWD, False); - if (mode64) { + calculateFCSR(fs, 0, FLOORWD, False, 1); + if (fp_mode64) { t0 = newTemp(Ity_I32); assign(t0, binop(Iop_F64toI32S, mkU32(0x1), getDReg(fs))); @@ -12504,8 +12923,8 @@ switch (fmt) { case 0x10: /* S */ DIP("trunc.w.s %d, %d", fd, fs); - calculateFCSR(fs, TRUNCWS, True); - if (mode64) { + calculateFCSR(fs, 0, TRUNCWS, True, 1); + if (fp_mode64) { t0 = newTemp(Ity_I64); t1 = newTemp(Ity_I32); t3 = newTemp(Ity_F32); @@ -12527,8 +12946,8 @@ break; case 0x11: /* D */ DIP("trunc.w.d %d, %d", fd, fs); - calculateFCSR(fs, TRUNCWD, False); - if (mode64) { + calculateFCSR(fs, 0, TRUNCWD, False, 1); + if (fp_mode64) { t0 = newTemp(Ity_I32); assign(t0, binop(Iop_F64toI32S, mkU32(0x3), @@ -12555,8 +12974,8 @@ switch (fmt) { case 0x10: /* S */ DIP("ceil.w.s %d, %d", fd, fs); - calculateFCSR(fs, CEILWS, True); - if (mode64) { + calculateFCSR(fs, 0, CEILWS, True, 1); + if (fp_mode64) { t0 = newTemp(Ity_I64); t1 = newTemp(Ity_I32); t3 = newTemp(Ity_F32); @@ -12579,8 +12998,8 @@ case 0x11: /* D */ DIP("ceil.w.d %d, %d", fd, fs); - calculateFCSR(fs, CEILWD, False); - if (!mode64) { + calculateFCSR(fs, 0, CEILWD, False, 1); + if (!fp_mode64) { t0 = newTemp(Ity_I32); assign(t0, binop(Iop_F64toI32S, mkU32(0x2), getDReg(fs))); @@ -12603,20 +13022,28 @@ switch (fmt) { case 0x10: /* S */ DIP("ceil.l.s %d, %d", fd, fs); - calculateFCSR(fs, CEILLS, True); - t0 = newTemp(Ity_I64); + if (fp_mode64) { + calculateFCSR(fs, 0, CEILLS, True, 1); + t0 = newTemp(Ity_I64); - assign(t0, binop(Iop_F32toI64S, mkU32(0x2), - getLoFromF64(Ity_F64, getFReg(fs)))); + assign(t0, binop(Iop_F32toI64S, mkU32(0x2), + getLoFromF64(tyF, getFReg(fs)))); - putFReg(fd, unop(Iop_ReinterpI64asF64, mkexpr(t0))); + putFReg(fd, unop(Iop_ReinterpI64asF64, 
mkexpr(t0))); + } else { + ILLEGAL_INSTRUCTON; + } break; case 0x11: /* D */ DIP("ceil.l.d %d, %d", fd, fs); - calculateFCSR(fs, CEILLD, False); - putFReg(fd, binop(Iop_RoundF64toInt, mkU32(0x2), - getFReg(fs))); + if (fp_mode64) { + calculateFCSR(fs, 0, CEILLD, False, 1); + putFReg(fd, binop(Iop_RoundF64toInt, mkU32(0x2), + getFReg(fs))); + } else { + ILLEGAL_INSTRUCTON; + } break; default: @@ -12692,17 +13119,24 @@ case 0x31: /* LWC1 */ /* Load Word to Floating Point - LWC1 (MIPS32) */ DIP("lwc1 f%d, %d(r%d)", ft, imm, rs); - if (mode64) { - t0 = newTemp(Ity_I64); + if (fp_mode64) { t1 = newTemp(Ity_F32); t2 = newTemp(Ity_I64); - /* new LO */ - assign(t0, binop(Iop_Add64, getIReg(rs), - mkU64(extend_s_16to64(imm)))); + if (mode64) { + t0 = newTemp(Ity_I64); + /* new LO */ + assign(t0, binop(Iop_Add64, getIReg(rs), + mkU64(extend_s_16to64(imm)))); + } else { + t0 = newTemp(Ity_I32); + /* new LO */ + assign(t0, binop(Iop_Add32, getIReg(rs), + mkU32(extend_s_16to32(imm)))); + } assign(t1, load(Ity_F32, mkexpr(t0))); - assign(t2, mkWidenFrom32(ty, unop(Iop_ReinterpF32asI32, - mkexpr(t1)), True)); - putFReg(ft, unop(Iop_ReinterpI64asF64, mkexpr(t2))); + assign(t2, mkWidenFrom32(Ity_I64, unop(Iop_ReinterpF32asI32, + mkexpr(t1)), True)); + putDReg(ft, unop(Iop_ReinterpI64asF64, mkexpr(t2))); } else { t0 = newTemp(Ity_I32); assign(t0, binop(Iop_Add32, getIReg(rs), @@ -12713,7 +13147,7 @@ case 0x39: /* SWC1 */ DIP("swc1 f%d, %d(r%d)", ft, imm, rs); - if (mode64) { + if (fp_mode64) { t0 = newTemp(Ity_I64); t2 = newTemp(Ity_I32); LOAD_STORE_PATTERN; @@ -12732,22 +13166,16 @@ case 0x35: /* Load Doubleword to Floating Point - LDC1 (MIPS32) */ + DIP("ldc1 f%d, %d(%d)", rt, imm, rs); LOAD_STORE_PATTERN; - if (mode64) - putFReg(ft, load(Ity_F64, mkexpr(t1))); - else - putDReg(ft, load(Ity_F64, mkexpr(t1))); - DIP("ldc1 f%d, %d(%d)", rt, imm, rs); + putDReg(ft, load(Ity_F64, mkexpr(t1))); break; case 0x3D: /* Store Doubleword from Floating Point - SDC1 */ + DIP("sdc1 f%d, %d(%d)", ft, imm, rs); LOAD_STORE_PATTERN; - if (mode64) - store(mkexpr(t1), getFReg(ft)); - else - store(mkexpr(t1), getDReg(ft)); - DIP("sdc1 f%d, %d(%d)", ft, imm, rs); + store(mkexpr(t1), getDReg(ft)); break; case 0x23: /* LW */ @@ -12806,19 +13234,20 @@ case 0x0: { /* LWXC1 */ /* Load Word Indexed to Floating Point - LWXC1 (MIPS32r2) */ DIP("lwxc1 f%d, r%d(r%d)", fd, rt, rs); - if (mode64) { + if (fp_mode64) { t0 = newTemp(Ity_I64); t1 = newTemp(Ity_I32); - t2 = newTemp(Ity_I64); t3 = newTemp(Ity_F32); t4 = newTemp(Ity_I64); + t2 = newTemp(ty); /* new LO */ - assign(t2, binop(Iop_Add64, getIReg(rs), getIReg(rt))); + assign(t2, binop(mode64 ? Iop_Add64 : Iop_Add32, getIReg(rs), + getIReg(rt))); assign(t3, load(Ity_F32, mkexpr(t2))); - assign(t4, mkWidenFrom32(ty, unop(Iop_ReinterpF32asI32, - mkexpr(t3)), True)); + assign(t4, mkWidenFrom32(Ity_I64, unop(Iop_ReinterpF32asI32, + mkexpr(t3)), True)); putFReg(fd, unop(Iop_ReinterpI64asF64, mkexpr(t4))); } else { @@ -12832,10 +13261,11 @@ case 0x1: { /* LDXC1 */ /* Load Doubleword Indexed to Floating Point LDXC1 (MIPS32r2 and MIPS64) */ - if (mode64) { + if (fp_mode64) { DIP("ldxc1 f%d, r%d(r%d)", fd, rt, rs); - t0 = newTemp(Ity_I64); - assign(t0, binop(Iop_Add64, getIReg(rs), getIReg(rt))); + t0 = newTemp(ty); + assign(t0, binop(mode64 ? 
Iop_Add64 : Iop_Add32, getIReg(rs), + getIReg(rt))); putFReg(fd, load(Ity_F64, mkexpr(t0))); break; } else { @@ -12869,10 +13299,10 @@ case 0x8: { /* Store Word Indexed from Floating Point - SWXC1 */ DIP("swxc1 f%d, r%d(r%d)", ft, rt, rs); - if (mode64) { - t0 = newTemp(Ity_I64); - assign(t0, binop(Iop_Add64, getIReg(rs), getIReg(rt))); - + if (fp_mode64) { + t0 = newTemp(ty); + assign(t0, binop(mode64 ? Iop_Add64 : Iop_Add32, getIReg(rs), + getIReg(rt))); store(mkexpr(t0), getLoFromF64(tyF, getFReg(fs))); } else { @@ -12885,9 +13315,10 @@ } case 0x9: { /* Store Doubleword Indexed from Floating Point - SDXC1 */ DIP("sdc1 f%d, %d(%d)", ft, imm, rs); - if (mode64) { - t0 = newTemp(Ity_I64); - assign(t0, binop(Iop_Add64, getIReg(rs), getIReg(rt))); + if (fp_mode64) { + t0 = newTemp(ty); + assign(t0, binop(mode64 ? Iop_Add64 : Iop_Add32, getIReg(rs), + getIReg(rt))); store(mkexpr(t0), getFReg(fs)); } else { t0 = newTemp(Ity_I32); @@ -13692,6 +14123,7 @@ /* Cavium Specific instructions */ case 0x03: case 0x32: case 0x33: /* DMUL, CINS , CINS32 */ case 0x3A: case 0x3B: case 0x2B: /* EXT, EXT32, SNE */ + /* CVM Compare Instructions */ case 0x2A: case 0x2E: case 0x2F: /* SEQ, SEQI, SNEI */ if (VEX_MIPS_COMP_ID(archinfo->hwcaps) == VEX_PRID_COMP_CAVIUM) { if (dis_instr_CVM(cins)) @@ -14528,8 +14960,13 @@ } break; /* BSHFL */ - /* -------- MIPS32(r2) DSP ASE(r2) instructions -------- */ + /* --- MIPS32(r2) DSP ASE(r2) / Cavium Specfic (LX) instructions --- */ case 0xA: /* LX */ + if (VEX_MIPS_COMP_ID(archinfo->hwcaps) == VEX_PRID_COMP_CAVIUM) { + if (dis_instr_CVM(cins)) + break; + goto decode_failure; + } case 0xC: /* INSV */ case 0x38: { /* EXTR.W */ if (VEX_MIPS_PROC_DSP(archinfo->hwcaps)) { @@ -16318,8 +16755,8 @@ decode_failure_dsp: vex_printf("Error occured while trying to decode MIPS32 DSP " - "instruction.\nYour platform probably doesn't support " - "MIPS32 DSP ASE.\n"); + "instruction.\nYour platform probably doesn't support " + "MIPS32 DSP ASE.\n"); decode_failure: /* All decode failures end up here. */ if (sigill_diag) @@ -16422,7 +16859,6 @@ /* Disassemble a single instruction into IR. The instruction is located in host memory at &guest_code[delta]. 
*/ - DisResult disInstr_MIPS( IRSB* irsb_IN, Bool (*resteerOkFn) ( void *, Addr64 ), Bool resteerCisOk, @@ -16441,6 +16877,10 @@ vassert(guest_arch == VexArchMIPS32 || guest_arch == VexArchMIPS64); mode64 = guest_arch != VexArchMIPS32; +#if (__mips_fpr==64) + fp_mode64 = ((VEX_MIPS_REV(archinfo->hwcaps) == VEX_PRID_CPU_32FPR) + || guest_arch == VexArchMIPS64); +#endif guest_code = guest_code_IN; irsb = irsb_IN; Index: priv/guest_ppc_helpers.c =================================================================== --- priv/guest_ppc_helpers.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/guest_ppc_helpers.c (.../trunk) (revision 2863) @@ -498,8 +498,8 @@ vex_state->guest_EMNOTE = EmNote_NONE; - vex_state->guest_TISTART = 0; - vex_state->guest_TILEN = 0; + vex_state->guest_CMSTART = 0; + vex_state->guest_CMLEN = 0; vex_state->guest_NRADDR = 0; vex_state->guest_NRADDR_GPR2 = 0; @@ -665,8 +665,8 @@ vex_state->padding = 0; - vex_state->guest_TISTART = 0; - vex_state->guest_TILEN = 0; + vex_state->guest_CMSTART = 0; + vex_state->guest_CMLEN = 0; vex_state->guest_NRADDR = 0; vex_state->guest_NRADDR_GPR2 = 0; @@ -808,8 +808,8 @@ .alwaysDefd = { /* 0 */ ALWAYSDEFD32(guest_CIA), /* 1 */ ALWAYSDEFD32(guest_EMNOTE), - /* 2 */ ALWAYSDEFD32(guest_TISTART), - /* 3 */ ALWAYSDEFD32(guest_TILEN), + /* 2 */ ALWAYSDEFD32(guest_CMSTART), + /* 3 */ ALWAYSDEFD32(guest_CMLEN), /* 4 */ ALWAYSDEFD32(guest_VSCR), /* 5 */ ALWAYSDEFD32(guest_FPROUND), /* 6 */ ALWAYSDEFD32(guest_NRADDR), @@ -849,8 +849,8 @@ .alwaysDefd = { /* 0 */ ALWAYSDEFD64(guest_CIA), /* 1 */ ALWAYSDEFD64(guest_EMNOTE), - /* 2 */ ALWAYSDEFD64(guest_TISTART), - /* 3 */ ALWAYSDEFD64(guest_TILEN), + /* 2 */ ALWAYSDEFD64(guest_CMSTART), + /* 3 */ ALWAYSDEFD64(guest_CMLEN), /* 4 */ ALWAYSDEFD64(guest_VSCR), /* 5 */ ALWAYSDEFD64(guest_FPROUND), /* 6 */ ALWAYSDEFD64(guest_NRADDR), Index: priv/guest_ppc_toIR.c =================================================================== --- priv/guest_ppc_toIR.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/guest_ppc_toIR.c (.../trunk) (revision 2863) @@ -69,6 +69,12 @@ unconditional calls and returns (bl, blr). They should also be emitted for conditional calls and returns, but we don't have a way to express that right now. Ah well. + + - Uses of Iop_{Add,Sub,Mul}32Fx4: the backend (host_ppc_isel.c) + ignores the rounding mode, and generates code that assumes + round-to-nearest. This means V will compute incorrect results + for uses of these IROps when the rounding mode (first) arg is + not mkU32(Irrm_NEAREST). */ /* "Special" instructions. @@ -78,9 +84,9 @@ concerned) but have meaning for supporting Valgrind. 
A special instruction is flagged by a 16-byte preamble: - 32-bit mode: 54001800 54006800 5400E800 54009800 - (rlwinm 0,0,3,0,0; rlwinm 0,0,13,0,0; - rlwinm 0,0,29,0,0; rlwinm 0,0,19,0,0) + 32-bit mode: 5400183E 5400683E 5400E83E 5400983E + (rlwinm 0,0,3,0,31; rlwinm 0,0,13,0,31; + rlwinm 0,0,29,0,31; rlwinm 0,0,19,0,31) 64-bit mode: 78001800 78006800 7800E802 78009802 (rotldi 0,0,3; rotldi 0,0,13; @@ -228,8 +234,8 @@ #define OFFB_VRSAVE offsetofPPCGuestState(guest_VRSAVE) #define OFFB_VSCR offsetofPPCGuestState(guest_VSCR) #define OFFB_EMNOTE offsetofPPCGuestState(guest_EMNOTE) -#define OFFB_TISTART offsetofPPCGuestState(guest_TISTART) -#define OFFB_TILEN offsetofPPCGuestState(guest_TILEN) +#define OFFB_CMSTART offsetofPPCGuestState(guest_CMSTART) +#define OFFB_CMLEN offsetofPPCGuestState(guest_CMLEN) #define OFFB_NRADDR offsetofPPCGuestState(guest_NRADDR) #define OFFB_NRADDR_GPR2 offsetofPPCGuestState(guest_NRADDR_GPR2) #define OFFB_TFHAR offsetofPPCGuestState(guest_TFHAR) @@ -377,8 +383,8 @@ PPC_GST_VRSAVE, // Vector Save/Restore Register PPC_GST_VSCR, // Vector Status and Control Register PPC_GST_EMWARN, // Emulation warnings - PPC_GST_TISTART,// For icbi: start of area to invalidate - PPC_GST_TILEN, // For icbi: length of area to invalidate + PPC_GST_CMSTART,// For icbi: start of area to invalidate + PPC_GST_CMLEN, // For icbi: length of area to invalidate PPC_GST_IP_AT_SYSCALL, // the CIA of the most recently executed SC insn PPC_GST_SPRG3_RO, // SPRG3 PPC_GST_TFHAR, // Transactional Failure Handler Address Register @@ -2781,14 +2787,14 @@ stmt( IRStmt_Put( OFFB_EMNOTE,src) ); break; - case PPC_GST_TISTART: + case PPC_GST_CMSTART: vassert( ty_src == ty ); - stmt( IRStmt_Put( OFFB_TISTART, src) ); + stmt( IRStmt_Put( OFFB_CMSTART, src) ); break; - case PPC_GST_TILEN: + case PPC_GST_CMLEN: vassert( ty_src == ty ); - stmt( IRStmt_Put( OFFB_TILEN, src) ); + stmt( IRStmt_Put( OFFB_CMLEN, src) ); break; case PPC_GST_TEXASR: @@ -5233,6 +5239,7 @@ Int simm16 = extend_s_16to32(uimm16); IRType ty = mode64 ? Ity_I64 : Ity_I32; + IROp mkAdd = mode64 ? Iop_Add64 : Iop_Add32; IRTemp EA = newTemp(ty); UInt r = 0; UInt ea_off = 0; @@ -5248,7 +5255,7 @@ } DIP("lmw r%u,%d(r%u)\n", rD_addr, simm16, rA_addr); for (r = rD_addr; r <= 31; r++) { - irx_addr = binop(Iop_Add32, mkexpr(EA), mkU32(ea_off)); + irx_addr = binop(mkAdd, mkexpr(EA), mode64 ? mkU64(ea_off) : mkU32(ea_off)); putIReg( r, mkWidenFrom32(ty, loadBE(Ity_I32, irx_addr ), False) ); ea_off += 4; @@ -5258,7 +5265,7 @@ case 0x2F: // stmw (Store Multiple Word, PPC32 p527) DIP("stmw r%u,%d(r%u)\n", rS_addr, simm16, rA_addr); for (r = rS_addr; r <= 31; r++) { - irx_addr = binop(Iop_Add32, mkexpr(EA), mkU32(ea_off)); + irx_addr = binop(mkAdd, mkexpr(EA), mode64 ? mkU64(ea_off) : mkU32(ea_off)); storeBE( irx_addr, mkNarrowTo32(ty, getIReg(r)) ); ea_off += 4; } @@ -7256,14 +7263,14 @@ assign( addr, binop( mkSzOp(ty, Iop_And8), mkexpr(EA), mkSzImm(ty, ~(((ULong)lineszB)-1) )) ); - putGST( PPC_GST_TISTART, mkexpr(addr) ); - putGST( PPC_GST_TILEN, mkSzImm(ty, lineszB) ); + putGST( PPC_GST_CMSTART, mkexpr(addr) ); + putGST( PPC_GST_CMLEN, mkSzImm(ty, lineszB) ); /* be paranoid ... 
*/ stmt( IRStmt_MBE(Imbe_Fence) ); putGST( PPC_GST_CIA, mkSzImm(ty, nextInsnAddr())); - dres->jk_StopHere = Ijk_TInval; + dres->jk_StopHere = Ijk_InvalICache; dres->whatNext = Dis_StopHere; break; } @@ -12980,17 +12987,23 @@ switch (opc2) { case 0x100: // xvaddsp (VSX Vector Add Single-Precision) DIP("xvaddsp v%d,v%d,v%d\n", (UInt)XT, (UInt)XA, (UInt)XB); - putVSReg( XT, binop(Iop_Add32Fx4, getVSReg( XA ), getVSReg( XB )) ); + // WARNING: BOGUS! The backend ignores rm on Iop_Add32Fx4 + putVSReg( XT, triop(Iop_Add32Fx4, rm, + getVSReg( XA ), getVSReg( XB )) ); break; case 0x140: // xvmulsp (VSX Vector Multiply Single-Precision) DIP("xvmulsp v%d,v%d,v%d\n", (UInt)XT, (UInt)XA, (UInt)XB); - putVSReg( XT, binop(Iop_Mul32Fx4, getVSReg( XA ), getVSReg( XB )) ); + // WARNING: BOGUS! The backend ignores rm on Iop_Mul32Fx4 + putVSReg( XT, triop(Iop_Mul32Fx4, rm, + getVSReg( XA ), getVSReg( XB )) ); break; case 0x120: // xvsubsp (VSX Vector Subtract Single-Precision) DIP("xvsubsp v%d,v%d,v%d\n", (UInt)XT, (UInt)XA, (UInt)XB); - putVSReg( XT, binop(Iop_Sub32Fx4, getVSReg( XA ), getVSReg( XB )) ); + // WARNING: BOGUS! The backend ignores rm on Iop_Sub32Fx4 + putVSReg( XT, triop(Iop_Sub32Fx4, rm, + getVSReg( XA ), getVSReg( XB )) ); break; case 0x160: // xvdivsp (VSX Vector Divide Single-Precision) @@ -17774,6 +17787,9 @@ return False; } + IRTemp rm = newTemp(Ity_I32); + assign(rm, get_IR_roundingmode()); + opc2 = IFIELD( theInstr, 0, 6 ); switch (opc2) { case 0x2E: // vmaddfp (Multiply Add FP, AV p177) @@ -17780,8 +17796,10 @@ DIP("vmaddfp v%d,v%d,v%d,v%d\n", vD_addr, vA_addr, vC_addr, vB_addr); putVReg( vD_addr, - binop(Iop_Add32Fx4, mkexpr(vB), - binop(Iop_Mul32Fx4, mkexpr(vA), mkexpr(vC))) ); + triop(Iop_Add32Fx4, mkU32(Irrm_NEAREST), + mkexpr(vB), + triop(Iop_Mul32Fx4, mkU32(Irrm_NEAREST), + mkexpr(vA), mkexpr(vC))) ); return True; case 0x2F: { // vnmsubfp (Negative Multiply-Subtract FP, AV p215) @@ -17788,9 +17806,10 @@ DIP("vnmsubfp v%d,v%d,v%d,v%d\n", vD_addr, vA_addr, vC_addr, vB_addr); putVReg( vD_addr, - binop(Iop_Sub32Fx4, + triop(Iop_Sub32Fx4, mkU32(Irrm_NEAREST), mkexpr(vB), - binop(Iop_Mul32Fx4, mkexpr(vA), mkexpr(vC))) ); + triop(Iop_Mul32Fx4, mkU32(Irrm_NEAREST), + mkexpr(vA), mkexpr(vC))) ); return True; } @@ -17802,12 +17821,14 @@ switch (opc2) { case 0x00A: // vaddfp (Add FP, AV p137) DIP("vaddfp v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr); - putVReg( vD_addr, binop(Iop_Add32Fx4, mkexpr(vA), mkexpr(vB)) ); + putVReg( vD_addr, triop(Iop_Add32Fx4, + mkU32(Irrm_NEAREST), mkexpr(vA), mkexpr(vB)) ); return True; case 0x04A: // vsubfp (Subtract FP, AV p261) DIP("vsubfp v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr); - putVReg( vD_addr, binop(Iop_Sub32Fx4, mkexpr(vA), mkexpr(vB)) ); + putVReg( vD_addr, triop(Iop_Sub32Fx4, + mkU32(Irrm_NEAREST), mkexpr(vA), mkexpr(vB)) ); return True; case 0x40A: // vmaxfp (Maximum FP, AV p178) @@ -17924,8 +17945,9 @@ binop(Iop_CmpLE32Fx4, mkexpr(vA), mkexpr(vB))) ); assign( lt, unop(Iop_NotV128, binop(Iop_CmpGE32Fx4, mkexpr(vA), - binop(Iop_Sub32Fx4, mkexpr(zeros), - mkexpr(vB)))) ); + triop(Iop_Sub32Fx4, mkU32(Irrm_NEAREST), + mkexpr(zeros), + mkexpr(vB)))) ); // finally, just shift gt,lt to correct position assign( vD, binop(Iop_ShlN32x4, @@ -17986,7 +18008,7 @@ switch (opc2) { case 0x30A: // vcfux (Convert from Unsigned Fixed-Point W, AV p156) DIP("vcfux v%d,v%d,%d\n", vD_addr, vB_addr, UIMM_5); - putVReg( vD_addr, binop(Iop_Mul32Fx4, + putVReg( vD_addr, triop(Iop_Mul32Fx4, mkU32(Irrm_NEAREST), unop(Iop_I32UtoFx4, mkexpr(vB)), mkexpr(vInvScale)) ); return 
True; @@ -17994,7 +18016,7 @@ case 0x34A: // vcfsx (Convert from Signed Fixed-Point W, AV p155) DIP("vcfsx v%d,v%d,%d\n", vD_addr, vB_addr, UIMM_5); - putVReg( vD_addr, binop(Iop_Mul32Fx4, + putVReg( vD_addr, triop(Iop_Mul32Fx4, mkU32(Irrm_NEAREST), unop(Iop_I32StoFx4, mkexpr(vB)), mkexpr(vInvScale)) ); return True; @@ -18003,7 +18025,8 @@ DIP("vctuxs v%d,v%d,%d\n", vD_addr, vB_addr, UIMM_5); putVReg( vD_addr, unop(Iop_QFtoI32Ux4_RZ, - binop(Iop_Mul32Fx4, mkexpr(vB), mkexpr(vScale))) ); + triop(Iop_Mul32Fx4, mkU32(Irrm_NEAREST), + mkexpr(vB), mkexpr(vScale))) ); return True; case 0x3CA: // vctsxs (Convert to Signed Fixed-Point W Saturate, AV p171) @@ -18010,7 +18033,8 @@ DIP("vctsxs v%d,v%d,%d\n", vD_addr, vB_addr, UIMM_5); putVReg( vD_addr, unop(Iop_QFtoI32Sx4_RZ, - binop(Iop_Mul32Fx4, mkexpr(vB), mkexpr(vScale))) ); + triop(Iop_Mul32Fx4, mkU32(Irrm_NEAREST), + mkexpr(vB), mkexpr(vScale))) ); return True; default: @@ -18522,10 +18546,10 @@ UChar* code = (UChar*)(guest_code + delta); /* Spot the 16-byte preamble: 32-bit mode: - 54001800 rlwinm 0,0,3,0,0 - 54006800 rlwinm 0,0,13,0,0 - 5400E800 rlwinm 0,0,29,0,0 - 54009800 rlwinm 0,0,19,0,0 + 5400183E rlwinm 0,0,3,0,31 + 5400683E rlwinm 0,0,13,0,31 + 5400E83E rlwinm 0,0,29,0,31 + 5400983E rlwinm 0,0,19,0,31 64-bit mode: 78001800 rotldi 0,0,3 78006800 rotldi 0,0,13 @@ -18532,10 +18556,10 @@ 7800E802 rotldi 0,0,61 78009802 rotldi 0,0,51 */ - UInt word1 = mode64 ? 0x78001800 : 0x54001800; - UInt word2 = mode64 ? 0x78006800 : 0x54006800; - UInt word3 = mode64 ? 0x7800E802 : 0x5400E800; - UInt word4 = mode64 ? 0x78009802 : 0x54009800; + UInt word1 = mode64 ? 0x78001800 : 0x5400183E; + UInt word2 = mode64 ? 0x78006800 : 0x5400683E; + UInt word3 = mode64 ? 0x7800E802 : 0x5400E83E; + UInt word4 = mode64 ? 0x78009802 : 0x5400983E; if (getUIntBigendianly(code+ 0) == word1 && getUIntBigendianly(code+ 4) == word2 && getUIntBigendianly(code+ 8) == word3 && @@ -18593,12 +18617,12 @@ // be redone. For ease of handling, we simply invalidate all the // time. - stmt(IRStmt_Put(OFFB_TISTART, mkSzImm(ty, guest_CIA_curr_instr))); - stmt(IRStmt_Put(OFFB_TILEN, mkSzImm(ty, 20))); + stmt(IRStmt_Put(OFFB_CMSTART, mkSzImm(ty, guest_CIA_curr_instr))); + stmt(IRStmt_Put(OFFB_CMLEN, mkSzImm(ty, 20))); putGST( PPC_GST_CIA, mkSzImm( ty, guest_CIA_bbstart + delta )); dres.whatNext = Dis_StopHere; - dres.jk_StopHere = Ijk_TInval; + dres.jk_StopHere = Ijk_InvalICache; goto decode_success; } /* We don't know what it is. 
Set opc1/opc2 so decode_failure Index: priv/guest_s390_helpers.c =================================================================== --- priv/guest_s390_helpers.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/guest_s390_helpers.c (.../trunk) (revision 2863) @@ -128,8 +128,8 @@ /*------------------------------------------------------------*/ state->guest_NRADDR = 0; - state->guest_TISTART = 0; - state->guest_TILEN = 0; + state->guest_CMSTART = 0; + state->guest_CMLEN = 0; state->guest_IP_AT_SYSCALL = 0; state->guest_EMNOTE = EmNote_NONE; state->host_EvC_COUNTER = 0; @@ -225,8 +225,8 @@ /* 0 */ ALWAYSDEFD(guest_CC_OP), /* generic */ /* 1 */ ALWAYSDEFD(guest_CC_NDEP), /* generic */ /* 2 */ ALWAYSDEFD(guest_EMNOTE), /* generic */ - /* 3 */ ALWAYSDEFD(guest_TISTART), /* generic */ - /* 4 */ ALWAYSDEFD(guest_TILEN), /* generic */ + /* 3 */ ALWAYSDEFD(guest_CMSTART), /* generic */ + /* 4 */ ALWAYSDEFD(guest_CMLEN), /* generic */ /* 5 */ ALWAYSDEFD(guest_IP_AT_SYSCALL), /* generic */ /* 6 */ ALWAYSDEFD(guest_IA), /* control reg */ /* 7 */ ALWAYSDEFD(guest_fpc), /* control reg */ Index: priv/guest_s390_toIR.c =================================================================== --- priv/guest_s390_toIR.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/guest_s390_toIR.c (.../trunk) (revision 2863) @@ -417,7 +417,8 @@ { vassert(typeOfIRExpr(irsb->tyenv, condition) == Ity_I1); - stmt(IRStmt_Exit(condition, Ijk_TInval, IRConst_U64(guest_IA_curr_instr), + stmt(IRStmt_Exit(condition, Ijk_InvalICache, + IRConst_U64(guest_IA_curr_instr), S390X_GUEST_OFFSET(guest_IA))); } @@ -7606,7 +7607,7 @@ put_gpr_dw0(r1, binop(Iop_And64, mkexpr(op2), mkU64(mask))); } assign(result, get_gpr_dw0(r1)); - s390_cc_thunk_putS(S390_CC_OP_LOAD_AND_TEST, op2); + s390_cc_thunk_putS(S390_CC_OP_LOAD_AND_TEST, result); return "risbg"; } @@ -10862,9 +10863,9 @@ stmt(IRStmt_Dirty(d)); /* and restart */ - stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_TISTART), + stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_CMSTART), mkU64(guest_IA_curr_instr))); - stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_TILEN), mkU64(4))); + stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_CMLEN), mkU64(4))); restart_if(mkexpr(cond)); ss.bytes = last_execute_target; @@ -10893,15 +10894,15 @@ mkIRExprVec_1(load(Ity_I64, mkexpr(addr2)))); stmt(IRStmt_Dirty(d)); /* and restart */ - stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_TISTART), + stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_CMSTART), mkU64(guest_IA_curr_instr))); - stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_TILEN), mkU64(4))); + stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_CMLEN), mkU64(4))); restart_if(IRExpr_Const(IRConst_U1(True))); /* we know that this will be invalidated */ put_IA(mkaddr_expr(guest_IA_next_instr)); dis_res->whatNext = Dis_StopHere; - dis_res->jk_StopHere = Ijk_TInval; + dis_res->jk_StopHere = Ijk_InvalICache; break; } @@ -10967,8 +10968,8 @@ stmt(IRStmt_Dirty(d)); /* and restart */ - stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_TISTART), mkU64(guest_IA_curr_instr))); - stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_TILEN), mkU64(4))); + stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_CMSTART), mkU64(guest_IA_curr_instr))); + stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_CMLEN), mkU64(4))); restart_if(mkexpr(cond)); /* Now comes the actual translation */ @@ -16362,9 +16363,9 @@ injecting here can change. In which case the translation has to be redone. For ease of handling, we simply invalidate all the time. 
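Throughout these frontends the old TISTART/TILEN pair becomes CMSTART/CMLEN: a guest address range whose cached translations must be thrown away, signalled by exiting the block with Ijk_InvalICache. Purely as a conceptual model of what the dispatcher later does with such a range, here is a self-contained C sketch; the table layout and function names are invented for illustration and are not Valgrind's actual translation cache:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

typedef struct { uint64_t start; uint64_t len; bool valid; } TransEntry;

/* Discard every cached translation overlapping [cmstart, cmstart+cmlen). */
static void invalidate_range(TransEntry* tab, int n,
                             uint64_t cmstart, uint64_t cmlen)
{
    for (int i = 0; i < n; i++) {
        uint64_t a0 = tab[i].start, a1 = tab[i].start + tab[i].len;
        if (tab[i].valid && a0 < cmstart + cmlen && cmstart < a1)
            tab[i].valid = false;
    }
}

int main(void)
{
    TransEntry tab[2] = { { 0x1000, 0x20, true }, { 0x2000, 0x20, true } };
    invalidate_range(tab, 2, 0x1010, 4);     /* e.g. CMSTART=0x1010, CMLEN=4 */
    printf("%d %d\n", (int)tab[0].valid, (int)tab[1].valid);  /* prints: 0 1 */
    return 0;
}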
*/ - stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_TISTART), + stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_CMSTART), mkU64(guest_IA_curr_instr))); - stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_TILEN), + stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_CMLEN), mkU64(guest_IA_next_instr - guest_IA_curr_instr))); vassert(guest_IA_next_instr - guest_IA_curr_instr == S390_SPECIAL_OP_PREAMBLE_SIZE + S390_SPECIAL_OP_SIZE); @@ -16371,7 +16372,7 @@ put_IA(mkaddr_expr(guest_IA_next_instr)); dis_res->whatNext = Dis_StopHere; - dis_res->jk_StopHere = Ijk_TInval; + dis_res->jk_StopHere = Ijk_InvalICache; } else { /* We don't know what it is. */ return S390_DECODE_UNKNOWN_SPECIAL_INSN; Index: priv/guest_x86_helpers.c =================================================================== --- priv/guest_x86_helpers.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/guest_x86_helpers.c (.../trunk) (revision 2863) @@ -2757,18 +2757,14 @@ vex_state->guest_EMNOTE = EmNote_NONE; /* SSE2 has a 'clflush' cache-line-invalidator which uses these. */ - vex_state->guest_TISTART = 0; - vex_state->guest_TILEN = 0; + vex_state->guest_CMSTART = 0; + vex_state->guest_CMLEN = 0; vex_state->guest_NRADDR = 0; vex_state->guest_SC_CLASS = 0; vex_state->guest_IP_AT_SYSCALL = 0; - Int i; - for (i = 0; i < sizeof(vex_state->padding) - / sizeof(vex_state->padding[0]); i++) { - vex_state->padding[i] = 0; - } + vex_state->padding1 = 0; } @@ -2866,8 +2862,8 @@ /* 17 */ ALWAYSDEFD(guest_GDT), /* 18 */ ALWAYSDEFD(guest_EMNOTE), /* 19 */ ALWAYSDEFD(guest_SSEROUND), - /* 20 */ ALWAYSDEFD(guest_TISTART), - /* 21 */ ALWAYSDEFD(guest_TILEN), + /* 20 */ ALWAYSDEFD(guest_CMSTART), + /* 21 */ ALWAYSDEFD(guest_CMLEN), /* 22 */ ALWAYSDEFD(guest_SC_CLASS), /* 23 */ ALWAYSDEFD(guest_IP_AT_SYSCALL) } Index: priv/guest_x86_toIR.c =================================================================== --- priv/guest_x86_toIR.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/guest_x86_toIR.c (.../trunk) (revision 2863) @@ -54,10 +54,6 @@ for float-to-float rounding. For all other operations, round-to-nearest is used, regardless. - * FP sin/cos/tan/sincos: C2 flag is always cleared. IOW the - simulation claims the argument is in-range (-2^63 <= arg <= 2^63) - even when it isn't. - * some of the FCOM cases could do with testing -- not convinced that the args are the right way round. @@ -279,8 +275,8 @@ #define OFFB_EMNOTE offsetof(VexGuestX86State,guest_EMNOTE) -#define OFFB_TISTART offsetof(VexGuestX86State,guest_TISTART) -#define OFFB_TILEN offsetof(VexGuestX86State,guest_TILEN) +#define OFFB_CMSTART offsetof(VexGuestX86State,guest_CMSTART) +#define OFFB_CMLEN offsetof(VexGuestX86State,guest_CMLEN) #define OFFB_NRADDR offsetof(VexGuestX86State,guest_NRADDR) #define OFFB_IP_AT_SYSCALL offsetof(VexGuestX86State,guest_IP_AT_SYSCALL) @@ -3603,6 +3599,42 @@ } +/* Given i, and some expression e, and a condition cond, generate IR + which has the same effect as put_ST(i,e) when cond is true and has + no effect when cond is false. Given the lack of proper + if-then-else in the IR, this is pretty tricky. 
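The update rule that maybe_put_ST (below) encodes with IRExpr_ITE is easier to see written out in plain C: the tag becomes FULL only if cond holds, and the value written is either a QNaN (the slot was already full) or the intended value; otherwise the slot is untouched. This is a behavioural model only, assuming an 8-bit tag where non-zero means FULL, not the IR itself:

#include <stdint.h>
#include <math.h>
#include <stdio.h>

typedef struct { uint8_t tag; double val; } STSlot;  /* tag: 0 = EMPTY, 1 = FULL */

/* Same effect as put_ST(i, value) when cond is true, no effect otherwise. */
static void maybe_put_ST_model(STSlot* st, int cond, double value)
{
    uint8_t new_tag = cond ? 1 : st->tag;
    double  new_val = cond ? (st->tag != 0 ? nan("")  /* overwriting a full slot */
                                           : value)
                           : st->val;
    st->val = new_val;
    st->tag = new_tag;
}

int main(void)
{
    STSlot s = { 0, 0.0 };
    maybe_put_ST_model(&s, 1, 1.0);   /* empty slot, cond true: stores 1.0    */
    printf("tag=%d val=%g\n", (int)s.tag, s.val);
    maybe_put_ST_model(&s, 1, 2.0);   /* slot now full: stores a QNaN instead */
    printf("tag=%d val=%g\n", (int)s.tag, s.val);
    return 0;
}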
+*/ + +static void maybe_put_ST ( IRTemp cond, Int i, IRExpr* value ) +{ + // new_tag = if cond then FULL else old_tag + // new_val = if cond then (if old_tag==FULL then NaN else val) + // else old_val + + IRTemp old_tag = newTemp(Ity_I8); + assign(old_tag, get_ST_TAG(i)); + IRTemp new_tag = newTemp(Ity_I8); + assign(new_tag, + IRExpr_ITE(mkexpr(cond), mkU8(1)/*FULL*/, mkexpr(old_tag))); + + IRTemp old_val = newTemp(Ity_F64); + assign(old_val, get_ST_UNCHECKED(i)); + IRTemp new_val = newTemp(Ity_F64); + assign(new_val, + IRExpr_ITE(mkexpr(cond), + IRExpr_ITE(binop(Iop_CmpNE8, mkexpr(old_tag), mkU8(0)), + /* non-0 means full */ + mkQNaN64(), + /* 0 means empty */ + value), + mkexpr(old_val))); + + put_ST_UNCHECKED(i, mkexpr(new_val)); + // put_ST_UNCHECKED incorrectly sets tag(i) to always be FULL. So + // now set it to new_tag instead. + put_ST_TAG(i, mkexpr(new_tag)); +} + /* Adjust FTOP downwards by one register. */ static void fp_push ( void ) @@ -3610,6 +3642,14 @@ put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) ); } +/* Adjust FTOP downwards by one register when COND is 1:I1. Else + don't change it. */ + +static void maybe_fp_push ( IRTemp cond ) +{ + put_ftop( binop(Iop_Sub32, get_ftop(), unop(Iop_1Uto32,mkexpr(cond))) ); +} + /* Adjust FTOP upwards by one register, and mark the vacated register as empty. */ @@ -3619,12 +3659,49 @@ put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) ); } -/* Clear the C2 bit of the FPU status register, for - sin/cos/tan/sincos. */ +/* Set the C2 bit of the FPU status register to e[0]. Assumes that + e[31:1] == 0. +*/ +static void set_C2 ( IRExpr* e ) +{ + IRExpr* cleared = binop(Iop_And32, get_C3210(), mkU32(~X86G_FC_MASK_C2)); + put_C3210( binop(Iop_Or32, + cleared, + binop(Iop_Shl32, e, mkU8(X86G_FC_SHIFT_C2))) ); +} -static void clear_C2 ( void ) +/* Generate code to check that abs(d64) < 2^63 and is finite. This is + used to do the range checks for FSIN, FCOS, FSINCOS and FPTAN. The + test is simple, but the derivation of it is not so simple. + + The exponent field for an IEEE754 double is 11 bits. That means it + can take values 0 through 0x7FF. If the exponent has value 0x7FF, + the number is either a NaN or an Infinity and so is not finite. + Furthermore, a finite value of exactly 2^63 is the smallest value + that has exponent value 0x43E. Hence, what we need to do is + extract the exponent, ignoring the sign bit and mantissa, and check + it is < 0x43E, or <= 0x43D. + + To make this easily applicable to 32- and 64-bit targets, a + roundabout approach is used. First the number is converted to I64, + then the top 32 bits are taken. Shifting them right by 20 bits + places the sign bit and exponent in the bottom 12 bits. Anding + with 0x7FF gets rid of the sign bit, leaving just the exponent + available for comparison. 
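The derivation above reduces the in-range test to a comparison of the biased exponent against 0x43D. That claim is easy to check on the host with a few lines of standalone C (not part of the patch), comparing the exponent test against the direct floating-point formulation:

#include <stdint.h>
#include <string.h>
#include <math.h>
#include <stdio.h>

/* Host-side version of the same test: take the high 32 bits of the
   IEEE754 image, shift the sign and exponent into the low 12 bits,
   mask off the sign, and compare the 11-bit exponent against 0x43D. */
static int in_range_and_finite(double d)
{
    uint64_t bits;
    memcpy(&bits, &d, sizeof bits);
    uint32_t exponent = ((uint32_t)(bits >> 32) >> 20) & 0x7FF;
    return exponent <= 0x43D;
}

int main(void)
{
    const double two63 = 9223372036854775808.0;     /* 2^63 */
    double tests[] = { 0.0, -1.5, two63 - 1024.0,   /* largest double < 2^63 */
                       two63, -two63, INFINITY, NAN };
    for (unsigned i = 0; i < sizeof tests / sizeof tests[0]; i++) {
        double d = tests[i];
        int ref = isfinite(d) && fabs(d) < two63;
        printf("%g: exponent test=%d reference=%d\n",
               d, in_range_and_finite(d), ref);
    }
    return 0;
}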
+*/ +static IRTemp math_IS_TRIG_ARG_FINITE_AND_IN_RANGE ( IRTemp d64 ) { - put_C3210( binop(Iop_And32, get_C3210(), mkU32(~X86G_FC_MASK_C2)) ); + IRTemp i64 = newTemp(Ity_I64); + assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(d64)) ); + IRTemp exponent = newTemp(Ity_I32); + assign(exponent, + binop(Iop_And32, + binop(Iop_Shr32, unop(Iop_64HIto32, mkexpr(i64)), mkU8(20)), + mkU32(0x7FF))); + IRTemp in_range_and_finite = newTemp(Ity_I1); + assign(in_range_and_finite, + binop(Iop_CmpLE32U, mkexpr(exponent), mkU32(0x43D))); + return in_range_and_finite; } /* Invent a plausible-looking FPU status word value: @@ -4245,16 +4322,31 @@ fp_pop(); break; - case 0xF2: /* FPTAN */ - DIP("ftan\n"); - put_ST_UNCHECKED(0, - binop(Iop_TanF64, - get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */ - get_ST(0))); - fp_push(); - put_ST(0, IRExpr_Const(IRConst_F64(1.0))); - clear_C2(); /* HACK */ + case 0xF2: { /* FPTAN */ + DIP("fptan\n"); + IRTemp argD = newTemp(Ity_F64); + assign(argD, get_ST(0)); + IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD); + IRTemp resD = newTemp(Ity_F64); + assign(resD, + IRExpr_ITE( + mkexpr(argOK), + binop(Iop_TanF64, + get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */ + mkexpr(argD)), + mkexpr(argD)) + ); + put_ST_UNCHECKED(0, mkexpr(resD)); + /* Conditionally push 1.0 on the stack, if the arg is + in range */ + maybe_fp_push(argOK); + maybe_put_ST(argOK, 0, + IRExpr_Const(IRConst_F64(1.0))); + set_C2( binop(Iop_Xor32, + unop(Iop_1Uto32, mkexpr(argOK)), + mkU32(1)) ); break; + } case 0xF3: /* FPATAN */ DIP("fpatan\n"); @@ -4368,19 +4460,30 @@ break; case 0xFB: { /* FSINCOS */ - IRTemp a1 = newTemp(Ity_F64); - assign( a1, get_ST(0) ); DIP("fsincos\n"); - put_ST_UNCHECKED(0, - binop(Iop_SinF64, - get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */ - mkexpr(a1))); - fp_push(); - put_ST(0, + IRTemp argD = newTemp(Ity_F64); + assign(argD, get_ST(0)); + IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD); + IRTemp resD = newTemp(Ity_F64); + assign(resD, + IRExpr_ITE( + mkexpr(argOK), + binop(Iop_SinF64, + get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */ + mkexpr(argD)), + mkexpr(argD)) + ); + put_ST_UNCHECKED(0, mkexpr(resD)); + /* Conditionally push the cos value on the stack, if + the arg is in range */ + maybe_fp_push(argOK); + maybe_put_ST(argOK, 0, binop(Iop_CosF64, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */ - mkexpr(a1))); - clear_C2(); /* HACK */ + mkexpr(argD))); + set_C2( binop(Iop_Xor32, + unop(Iop_1Uto32, mkexpr(argOK)), + mkU32(1)) ); break; } @@ -4399,24 +4502,29 @@ get_ST(1))); break; - case 0xFE: /* FSIN */ - DIP("fsin\n"); - put_ST_UNCHECKED(0, - binop(Iop_SinF64, - get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */ - get_ST(0))); - clear_C2(); /* HACK */ + case 0xFE: /* FSIN */ + case 0xFF: { /* FCOS */ + Bool isSIN = modrm == 0xFE; + DIP("%s\n", isSIN ? "fsin" : "fcos"); + IRTemp argD = newTemp(Ity_F64); + assign(argD, get_ST(0)); + IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD); + IRTemp resD = newTemp(Ity_F64); + assign(resD, + IRExpr_ITE( + mkexpr(argOK), + binop(isSIN ? 
Iop_SinF64 : Iop_CosF64, + get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */ + mkexpr(argD)), + mkexpr(argD)) + ); + put_ST_UNCHECKED(0, mkexpr(resD)); + set_C2( binop(Iop_Xor32, + unop(Iop_1Uto32, mkexpr(argOK)), + mkU32(1)) ); break; + } - case 0xFF: /* FCOS */ - DIP("fcos\n"); - put_ST_UNCHECKED(0, - binop(Iop_CosF64, - get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */ - get_ST(0))); - clear_C2(); /* HACK */ - break; - default: goto decode_fail; } @@ -6856,6 +6964,27 @@ /*--- SSE/SSE2/SSE3 helpers ---*/ /*------------------------------------------------------------*/ +/* Indicates whether the op requires a rounding-mode argument. Note + that this covers only vector floating point arithmetic ops, and + omits the scalar ones that need rounding modes. Note also that + inconsistencies here will get picked up later by the IR sanity + checker, so this isn't correctness-critical. */ +static Bool requiresRMode ( IROp op ) +{ + switch (op) { + /* 128 bit ops */ + case Iop_Add32Fx4: case Iop_Sub32Fx4: + case Iop_Mul32Fx4: case Iop_Div32Fx4: + case Iop_Add64Fx2: case Iop_Sub64Fx2: + case Iop_Mul64Fx2: case Iop_Div64Fx2: + return True; + default: + break; + } + return False; +} + + /* Worker function; do not call directly. Handles full width G = G `op` E and G = (not G) `op` E. */ @@ -6874,9 +7003,15 @@ = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRM(rm))) : getXMMReg(gregOfRM(rm)); if (epartIsReg(rm)) { - putXMMReg( gregOfRM(rm), - binop(op, gpart, - getXMMReg(eregOfRM(rm))) ); + putXMMReg( + gregOfRM(rm), + requiresRMode(op) + ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */ + gpart, + getXMMReg(eregOfRM(rm))) + : binop(op, gpart, + getXMMReg(eregOfRM(rm))) + ); DIP("%s %s,%s\n", opname, nameXMMReg(eregOfRM(rm)), nameXMMReg(gregOfRM(rm)) ); @@ -6883,9 +7018,15 @@ return delta+1; } else { addr = disAMode ( &alen, sorb, delta, dis_buf ); - putXMMReg( gregOfRM(rm), - binop(op, gpart, - loadLE(Ity_V128, mkexpr(addr))) ); + putXMMReg( + gregOfRM(rm), + requiresRMode(op) + ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */ + gpart, + loadLE(Ity_V128, mkexpr(addr))) + : binop(op, gpart, + loadLE(Ity_V128, mkexpr(addr))) + ); DIP("%s %s,%s\n", opname, dis_buf, nameXMMReg(gregOfRM(rm)) ); @@ -8026,14 +8167,14 @@ // injecting here can change. In which case the translation has to // be redone. For ease of handling, we simply invalidate all the // time. - stmt(IRStmt_Put(OFFB_TISTART, mkU32(guest_EIP_curr_instr))); - stmt(IRStmt_Put(OFFB_TILEN, mkU32(14))); + stmt(IRStmt_Put(OFFB_CMSTART, mkU32(guest_EIP_curr_instr))); + stmt(IRStmt_Put(OFFB_CMLEN, mkU32(14))); delta += 14; stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_bbstart + delta) ) ); dres.whatNext = Dis_StopHere; - dres.jk_StopHere = Ijk_TInval; + dres.jk_StopHere = Ijk_InvalICache; goto decode_success; } /* We don't know what it is. */ @@ -11613,14 +11754,14 @@ /* Round addr down to the start of the containing block. 
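The clflush case that follows rounds the flush address down to its cache-line boundary with addr & ~(lineszB-1) and reports the line size as CMLEN. A tiny worked example of that mask (standalone C, line size chosen arbitrarily for illustration):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t lineszB = 64;                      /* illustrative line size */
    uint32_t addr    = 0x0040103Au;
    uint32_t start   = addr & ~(lineszB - 1);   /* CMSTART */
    printf("CMSTART=0x%08x CMLEN=%u\n", start, lineszB);   /* 0x00401000, 64 */
    return 0;
}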
*/ stmt( IRStmt_Put( - OFFB_TISTART, + OFFB_CMSTART, binop( Iop_And32, mkexpr(addr), mkU32( ~(lineszB-1) ))) ); - stmt( IRStmt_Put(OFFB_TILEN, mkU32(lineszB) ) ); + stmt( IRStmt_Put(OFFB_CMLEN, mkU32(lineszB) ) ); - jmp_lit(&dres, Ijk_TInval, (Addr32)(guest_EIP_bbstart+delta)); + jmp_lit(&dres, Ijk_InvalICache, (Addr32)(guest_EIP_bbstart+delta)); DIP("clflush %s\n", dis_buf); goto decode_success; @@ -11712,6 +11853,7 @@ IRTemp gV = newTemp(Ity_V128); IRTemp addV = newTemp(Ity_V128); IRTemp subV = newTemp(Ity_V128); + IRTemp rm = newTemp(Ity_I32); a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID; modrm = insn[3]; @@ -11730,8 +11872,9 @@ assign( gV, getXMMReg(gregOfRM(modrm)) ); - assign( addV, binop(Iop_Add32Fx4, mkexpr(gV), mkexpr(eV)) ); - assign( subV, binop(Iop_Sub32Fx4, mkexpr(gV), mkexpr(eV)) ); + assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */ + assign( addV, triop(Iop_Add32Fx4, mkexpr(rm), mkexpr(gV), mkexpr(eV)) ); + assign( subV, triop(Iop_Sub32Fx4, mkexpr(rm), mkexpr(gV), mkexpr(eV)) ); breakup128to32s( addV, &a3, &a2, &a1, &a0 ); breakup128to32s( subV, &s3, &s2, &s1, &s0 ); @@ -11748,6 +11891,7 @@ IRTemp subV = newTemp(Ity_V128); IRTemp a1 = newTemp(Ity_I64); IRTemp s0 = newTemp(Ity_I64); + IRTemp rm = newTemp(Ity_I32); modrm = insn[2]; if (epartIsReg(modrm)) { @@ -11765,8 +11909,9 @@ assign( gV, getXMMReg(gregOfRM(modrm)) ); - assign( addV, binop(Iop_Add64Fx2, mkexpr(gV), mkexpr(eV)) ); - assign( subV, binop(Iop_Sub64Fx2, mkexpr(gV), mkexpr(eV)) ); + assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */ + assign( addV, triop(Iop_Add64Fx2, mkexpr(rm), mkexpr(gV), mkexpr(eV)) ); + assign( subV, triop(Iop_Sub64Fx2, mkexpr(rm), mkexpr(gV), mkexpr(eV)) ); assign( a1, unop(Iop_V128HIto64, mkexpr(addV) )); assign( s0, unop(Iop_V128to64, mkexpr(subV) )); @@ -11785,6 +11930,7 @@ IRTemp gV = newTemp(Ity_V128); IRTemp leftV = newTemp(Ity_V128); IRTemp rightV = newTemp(Ity_V128); + IRTemp rm = newTemp(Ity_I32); Bool isAdd = insn[2] == 0x7C; const HChar* str = isAdd ? "add" : "sub"; e3 = e2 = e1 = e0 = g3 = g2 = g1 = g0 = IRTemp_INVALID; @@ -11811,9 +11957,10 @@ assign( leftV, mk128from32s( e2, e0, g2, g0 ) ); assign( rightV, mk128from32s( e3, e1, g3, g1 ) ); + assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */ putXMMReg( gregOfRM(modrm), - binop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4, - mkexpr(leftV), mkexpr(rightV) ) ); + triop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4, + mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) ); goto decode_success; } @@ -11828,6 +11975,7 @@ IRTemp gV = newTemp(Ity_V128); IRTemp leftV = newTemp(Ity_V128); IRTemp rightV = newTemp(Ity_V128); + IRTemp rm = newTemp(Ity_I32); Bool isAdd = insn[1] == 0x7C; const HChar* str = isAdd ? "add" : "sub"; @@ -11855,9 +12003,10 @@ assign( leftV, binop(Iop_64HLtoV128, mkexpr(e0),mkexpr(g0)) ); assign( rightV, binop(Iop_64HLtoV128, mkexpr(e1),mkexpr(g1)) ); + assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */ putXMMReg( gregOfRM(modrm), - binop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2, - mkexpr(leftV), mkexpr(rightV) ) ); + triop(isAdd ? 
Iop_Add64Fx2 : Iop_Sub64Fx2, + mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) ); goto decode_success; } @@ -15181,6 +15330,14 @@ break; } + case 0x05: /* AMD's syscall */ + stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL, + mkU32(guest_EIP_curr_instr) ) ); + jmp_lit(&dres, Ijk_Sys_syscall, ((Addr32)guest_EIP_bbstart)+delta); + vassert(dres.whatNext == Dis_StopHere); + DIP("syscall\n"); + break; + /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */ default: Index: priv/host_amd64_defs.c =================================================================== --- priv/host_amd64_defs.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/host_amd64_defs.c (.../trunk) (revision 2863) @@ -2865,7 +2865,7 @@ case Ijk_EmWarn: trcval = VEX_TRC_JMP_EMWARN; break; case Ijk_MapFail: trcval = VEX_TRC_JMP_MAPFAIL; break; case Ijk_NoDecode: trcval = VEX_TRC_JMP_NODECODE; break; - case Ijk_TInval: trcval = VEX_TRC_JMP_TINVAL; break; + case Ijk_InvalICache: trcval = VEX_TRC_JMP_INVALICACHE; break; case Ijk_NoRedir: trcval = VEX_TRC_JMP_NOREDIR; break; case Ijk_SigTRAP: trcval = VEX_TRC_JMP_SIGTRAP; break; case Ijk_SigSEGV: trcval = VEX_TRC_JMP_SIGSEGV; break; @@ -3069,7 +3069,6 @@ case Afp_SQRT: *p++ = 0xD9; *p++ = 0xFA; break; case Afp_SIN: *p++ = 0xD9; *p++ = 0xFE; break; case Afp_COS: *p++ = 0xD9; *p++ = 0xFF; break; - case Afp_TAN: *p++ = 0xD9; *p++ = 0xF2; break; case Afp_ROUND: *p++ = 0xD9; *p++ = 0xFC; break; case Afp_2XM1: *p++ = 0xD9; *p++ = 0xF0; break; case Afp_SCALE: *p++ = 0xD9; *p++ = 0xFD; break; @@ -3078,7 +3077,24 @@ case Afp_YL2XP1: *p++ = 0xD9; *p++ = 0xF9; break; case Afp_PREM: *p++ = 0xD9; *p++ = 0xF8; break; case Afp_PREM1: *p++ = 0xD9; *p++ = 0xF5; break; - default: goto bad; + case Afp_TAN: + /* fptan pushes 1.0 on the FP stack, except when the + argument is out of range. Hence we have to do the + instruction, then inspect C2 to see if there is an out + of range condition. If there is, we skip the fincstp + that is used by the in-range case to get rid of this + extra 1.0 value. */ + *p++ = 0xD9; *p++ = 0xF2; // fptan + *p++ = 0x50; // pushq %rax + *p++ = 0xDF; *p++ = 0xE0; // fnstsw %ax + *p++ = 0x66; *p++ = 0xA9; + *p++ = 0x00; *p++ = 0x04; // testw $0x400,%ax + *p++ = 0x75; *p++ = 0x02; // jnz after_fincstp + *p++ = 0xD9; *p++ = 0xF7; // fincstp + *p++ = 0x58; // after_fincstp: popq %rax + break; + default: + goto bad; } goto done; Index: priv/host_amd64_isel.c =================================================================== --- priv/host_amd64_isel.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/host_amd64_isel.c (.../trunk) (revision 2863) @@ -3031,11 +3031,12 @@ addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8)); /* XXXROUNDINGFIXME */ /* set roundingmode here */ + /* Note that AMD64Instr_A87FpOp(Afp_TAN) sets the condition + codes. I don't think that matters, since this insn + selector never generates such an instruction intervening + between an flag-setting instruction and a flag-using + instruction. 
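The Afp_TAN emission above keys off bit 10 of the x87 status word, which is condition flag C2: fptan sets it, and pushes nothing, when the operand is out of range, and that is exactly what the emitted testw $0x400,%ax checks before deciding whether a compensating fincstp is needed to get rid of the extra 1.0. A small standalone C model of that status-word test; the sample status-word values are arbitrary:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#define FSW_C2 0x0400u  /* bit 10 of the x87 status word (cf. testw $0x400,%ax) */

/* After fptan: if C2 is clear the instruction pushed an extra 1.0 which the
   in-range path drops with fincstp; if C2 is set nothing was pushed. */
static bool fptan_pushed_one(uint16_t fsw)
{
    return (fsw & FSW_C2) == 0;
}

int main(void)
{
    printf("in range:     pushed=%d\n", fptan_pushed_one(0x3800));  /* C2 clear */
    printf("out of range: pushed=%d\n", fptan_pushed_one(0x3C00));  /* C2 set   */
    return 0;
}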
*/ addInstr(env, AMD64Instr_A87FpOp(fpop)); - if (e->Iex.Binop.op==Iop_TanF64) { - /* get rid of the extra 1.0 that fptan pushes */ - addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8)); - } addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8)); addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp)); return dst; @@ -3355,12 +3356,8 @@ case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4; case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4; case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4; - case Iop_Add32Fx4: op = Asse_ADDF; goto do_32Fx4; - case Iop_Div32Fx4: op = Asse_DIVF; goto do_32Fx4; case Iop_Max32Fx4: op = Asse_MAXF; goto do_32Fx4; case Iop_Min32Fx4: op = Asse_MINF; goto do_32Fx4; - case Iop_Mul32Fx4: op = Asse_MULF; goto do_32Fx4; - case Iop_Sub32Fx4: op = Asse_SUBF; goto do_32Fx4; do_32Fx4: { HReg argL = iselVecExpr(env, e->Iex.Binop.arg1); @@ -3375,12 +3372,8 @@ case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2; case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2; case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2; - case Iop_Add64Fx2: op = Asse_ADDF; goto do_64Fx2; - case Iop_Div64Fx2: op = Asse_DIVF; goto do_64Fx2; case Iop_Max64Fx2: op = Asse_MAXF; goto do_64Fx2; case Iop_Min64Fx2: op = Asse_MINF; goto do_64Fx2; - case Iop_Mul64Fx2: op = Asse_MULF; goto do_64Fx2; - case Iop_Sub64Fx2: op = Asse_SUBF; goto do_64Fx2; do_64Fx2: { HReg argL = iselVecExpr(env, e->Iex.Binop.arg1); @@ -3660,6 +3653,47 @@ } /* switch (e->Iex.Binop.op) */ } /* if (e->tag == Iex_Binop) */ + if (e->tag == Iex_Triop) { + IRTriop *triop = e->Iex.Triop.details; + switch (triop->op) { + + case Iop_Add64Fx2: op = Asse_ADDF; goto do_64Fx2_w_rm; + case Iop_Sub64Fx2: op = Asse_SUBF; goto do_64Fx2_w_rm; + case Iop_Mul64Fx2: op = Asse_MULF; goto do_64Fx2_w_rm; + case Iop_Div64Fx2: op = Asse_DIVF; goto do_64Fx2_w_rm; + do_64Fx2_w_rm: + { + HReg argL = iselVecExpr(env, triop->arg2); + HReg argR = iselVecExpr(env, triop->arg3); + HReg dst = newVRegV(env); + addInstr(env, mk_vMOVsd_RR(argL, dst)); + /* XXXROUNDINGFIXME */ + /* set roundingmode here */ + addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst)); + return dst; + } + + case Iop_Add32Fx4: op = Asse_ADDF; goto do_32Fx4_w_rm; + case Iop_Sub32Fx4: op = Asse_SUBF; goto do_32Fx4_w_rm; + case Iop_Mul32Fx4: op = Asse_MULF; goto do_32Fx4_w_rm; + case Iop_Div32Fx4: op = Asse_DIVF; goto do_32Fx4_w_rm; + do_32Fx4_w_rm: + { + HReg argL = iselVecExpr(env, triop->arg2); + HReg argR = iselVecExpr(env, triop->arg3); + HReg dst = newVRegV(env); + addInstr(env, mk_vMOVsd_RR(argL, dst)); + /* XXXROUNDINGFIXME */ + /* set roundingmode here */ + addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst)); + return dst; + } + + default: + break; + } /* switch (triop->op) */ + } /* if (e->tag == Iex_Triop) */ + if (e->tag == Iex_ITE) { // VFD HReg r1 = iselVecExpr(env, e->Iex.ITE.iftrue); HReg r0 = iselVecExpr(env, e->Iex.ITE.iffalse); @@ -3851,10 +3885,6 @@ if (e->tag == Iex_Binop) { switch (e->Iex.Binop.op) { - case Iop_Add64Fx4: op = Asse_ADDF; goto do_64Fx4; - case Iop_Sub64Fx4: op = Asse_SUBF; goto do_64Fx4; - case Iop_Mul64Fx4: op = Asse_MULF; goto do_64Fx4; - case Iop_Div64Fx4: op = Asse_DIVF; goto do_64Fx4; case Iop_Max64Fx4: op = Asse_MAXF; goto do_64Fx4; case Iop_Min64Fx4: op = Asse_MINF; goto do_64Fx4; do_64Fx4: @@ -3873,10 +3903,6 @@ return; } - case Iop_Add32Fx8: op = Asse_ADDF; goto do_32Fx8; - case Iop_Sub32Fx8: op = Asse_SUBF; goto do_32Fx8; - case Iop_Mul32Fx8: op = Asse_MULF; goto do_32Fx8; - case Iop_Div32Fx8: op = Asse_DIVF; goto 
do_32Fx8; case Iop_Max32Fx8: op = Asse_MAXF; goto do_32Fx8; case Iop_Min32Fx8: op = Asse_MINF; goto do_32Fx8; do_32Fx8: @@ -4145,6 +4171,60 @@ } /* switch (e->Iex.Binop.op) */ } /* if (e->tag == Iex_Binop) */ + if (e->tag == Iex_Triop) { + IRTriop *triop = e->Iex.Triop.details; + switch (triop->op) { + + case Iop_Add64Fx4: op = Asse_ADDF; goto do_64Fx4_w_rm; + case Iop_Sub64Fx4: op = Asse_SUBF; goto do_64Fx4_w_rm; + case Iop_Mul64Fx4: op = Asse_MULF; goto do_64Fx4_w_rm; + case Iop_Div64Fx4: op = Asse_DIVF; goto do_64Fx4_w_rm; + do_64Fx4_w_rm: + { + HReg argLhi, argLlo, argRhi, argRlo; + iselDVecExpr(&argLhi, &argLlo, env, triop->arg2); + iselDVecExpr(&argRhi, &argRlo, env, triop->arg3); + HReg dstHi = newVRegV(env); + HReg dstLo = newVRegV(env); + addInstr(env, mk_vMOVsd_RR(argLhi, dstHi)); + addInstr(env, mk_vMOVsd_RR(argLlo, dstLo)); + /* XXXROUNDINGFIXME */ + /* set roundingmode here */ + addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi)); + addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo)); + *rHi = dstHi; + *rLo = dstLo; + return; + } + + case Iop_Add32Fx8: op = Asse_ADDF; goto do_32Fx8_w_rm; + case Iop_Sub32Fx8: op = Asse_SUBF; goto do_32Fx8_w_rm; + case Iop_Mul32Fx8: op = Asse_MULF; goto do_32Fx8_w_rm; + case Iop_Div32Fx8: op = Asse_DIVF; goto do_32Fx8_w_rm; + do_32Fx8_w_rm: + { + HReg argLhi, argLlo, argRhi, argRlo; + iselDVecExpr(&argLhi, &argLlo, env, triop->arg2); + iselDVecExpr(&argRhi, &argRlo, env, triop->arg3); + HReg dstHi = newVRegV(env); + HReg dstLo = newVRegV(env); + addInstr(env, mk_vMOVsd_RR(argLhi, dstHi)); + addInstr(env, mk_vMOVsd_RR(argLlo, dstLo)); + /* XXXROUNDINGFIXME */ + /* set roundingmode here */ + addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi)); + addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo)); + *rHi = dstHi; + *rLo = dstLo; + return; + } + + default: + break; + } /* switch (triop->op) */ + } /* if (e->tag == Iex_Triop) */ + + if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) { HReg rsp = hregAMD64_RSP(); HReg vHi = newVRegV(env); @@ -4649,7 +4729,7 @@ case Ijk_SigSEGV: case Ijk_SigTRAP: case Ijk_Sys_syscall: - case Ijk_TInval: + case Ijk_InvalICache: case Ijk_Yield: { HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst)); @@ -4744,7 +4824,7 @@ case Ijk_SigSEGV: case Ijk_SigTRAP: case Ijk_Sys_syscall: - case Ijk_TInval: + case Ijk_InvalICache: case Ijk_Yield: { HReg r = iselIntExpr_R(env, next); AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP()); Index: priv/host_arm64_defs.c =================================================================== --- priv/host_arm64_defs.c (.../tags/VEX_3_9_0) (revision 0) +++ priv/host_arm64_defs.c (.../trunk) (revision 2863) @@ -0,0 +1,6609 @@ + +/*---------------------------------------------------------------*/ +/*--- begin host_arm64_defs.c ---*/ +/*---------------------------------------------------------------*/ + +/* + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2013-2013 OpenWorks + info@open-works.net + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "libvex_basictypes.h" +#include "libvex.h" +#include "libvex_trc_values.h" + +#include "main_util.h" +#include "host_generic_regs.h" +#include "host_arm64_defs.h" + +//ZZ UInt arm_hwcaps = 0; + + +/* --------- Registers. --------- */ + +/* The usual HReg abstraction. We use the following classes only: + X regs (64 bit int) + D regs (64 bit float, also used for 32 bit float) + Q regs (128 bit vector) +*/ + +void ppHRegARM64 ( HReg reg ) { + Int r; + /* Be generic for all virtual regs. */ + if (hregIsVirtual(reg)) { + ppHReg(reg); + return; + } + /* But specific for real regs. */ + switch (hregClass(reg)) { + case HRcInt64: + r = hregNumber(reg); + vassert(r >= 0 && r < 31); + vex_printf("x%d", r); + return; + case HRcFlt64: + r = hregNumber(reg); + vassert(r >= 0 && r < 32); + vex_printf("d%d", r); + return; + case HRcVec128: + r = hregNumber(reg); + vassert(r >= 0 && r < 32); + vex_printf("q%d", r); + return; + default: + vpanic("ppHRegARM64"); + } +} + +static void ppHRegARM64asSreg ( HReg reg ) { + ppHRegARM64(reg); + vex_printf("(S-reg)"); +} + +HReg hregARM64_X0 ( void ) { return mkHReg(0, HRcInt64, False); } +HReg hregARM64_X1 ( void ) { return mkHReg(1, HRcInt64, False); } +HReg hregARM64_X2 ( void ) { return mkHReg(2, HRcInt64, False); } +HReg hregARM64_X3 ( void ) { return mkHReg(3, HRcInt64, False); } +HReg hregARM64_X4 ( void ) { return mkHReg(4, HRcInt64, False); } +HReg hregARM64_X5 ( void ) { return mkHReg(5, HRcInt64, False); } +HReg hregARM64_X6 ( void ) { return mkHReg(6, HRcInt64, False); } +HReg hregARM64_X7 ( void ) { return mkHReg(7, HRcInt64, False); } +//ZZ HReg hregARM_R8 ( void ) { return mkHReg(8, HRcInt32, False); } +HReg hregARM64_X9 ( void ) { return mkHReg(9, HRcInt64, False); } +HReg hregARM64_X10 ( void ) { return mkHReg(10, HRcInt64, False); } +HReg hregARM64_X11 ( void ) { return mkHReg(11, HRcInt64, False); } +HReg hregARM64_X12 ( void ) { return mkHReg(12, HRcInt64, False); } +HReg hregARM64_X13 ( void ) { return mkHReg(13, HRcInt64, False); } +HReg hregARM64_X14 ( void ) { return mkHReg(14, HRcInt64, False); } +HReg hregARM64_X15 ( void ) { return mkHReg(15, HRcInt64, False); } +HReg hregARM64_X21 ( void ) { return mkHReg(21, HRcInt64, False); } +HReg hregARM64_X22 ( void ) { return mkHReg(22, HRcInt64, False); } +HReg hregARM64_X23 ( void ) { return mkHReg(23, HRcInt64, False); } +HReg hregARM64_X24 ( void ) { return mkHReg(24, HRcInt64, False); } +HReg hregARM64_X25 ( void ) { return mkHReg(25, HRcInt64, False); } +HReg hregARM64_X26 ( void ) { return mkHReg(26, HRcInt64, False); } +HReg hregARM64_X27 ( void ) { return mkHReg(27, HRcInt64, False); } +HReg hregARM64_X28 ( void ) { return mkHReg(28, HRcInt64, False); } + +// Should really use D8 .. 
D15 for class F64, since they are callee +// save +HReg hregARM64_D8 ( void ) { return mkHReg(8, HRcFlt64, False); } +HReg hregARM64_D9 ( void ) { return mkHReg(9, HRcFlt64, False); } +HReg hregARM64_D10 ( void ) { return mkHReg(10, HRcFlt64, False); } +HReg hregARM64_D11 ( void ) { return mkHReg(11, HRcFlt64, False); } +HReg hregARM64_D12 ( void ) { return mkHReg(12, HRcFlt64, False); } +HReg hregARM64_D13 ( void ) { return mkHReg(13, HRcFlt64, False); } +//ZZ HReg hregARM_S26 ( void ) { return mkHReg(26, HRcFlt32, False); } +//ZZ HReg hregARM_S27 ( void ) { return mkHReg(27, HRcFlt32, False); } +//ZZ HReg hregARM_S28 ( void ) { return mkHReg(28, HRcFlt32, False); } +//ZZ HReg hregARM_S29 ( void ) { return mkHReg(29, HRcFlt32, False); } +//ZZ HReg hregARM_S30 ( void ) { return mkHReg(30, HRcFlt32, False); } +HReg hregARM64_Q16 ( void ) { return mkHReg(16, HRcVec128, False); } +HReg hregARM64_Q17 ( void ) { return mkHReg(17, HRcVec128, False); } +HReg hregARM64_Q18 ( void ) { return mkHReg(18, HRcVec128, False); } +//ZZ HReg hregARM_Q11 ( void ) { return mkHReg(11, HRcVec128, False); } +//ZZ HReg hregARM_Q12 ( void ) { return mkHReg(12, HRcVec128, False); } +//ZZ HReg hregARM_Q13 ( void ) { return mkHReg(13, HRcVec128, False); } +//ZZ HReg hregARM_Q14 ( void ) { return mkHReg(14, HRcVec128, False); } +//ZZ HReg hregARM_Q15 ( void ) { return mkHReg(15, HRcVec128, False); } + +void getAllocableRegs_ARM64 ( Int* nregs, HReg** arr ) +{ + Int i = 0; + *nregs = 24; + *arr = LibVEX_Alloc(*nregs * sizeof(HReg)); + + // callee saves ones (22 to 28) are listed first, since we prefer + // them if they're available + (*arr)[i++] = hregARM64_X22(); + (*arr)[i++] = hregARM64_X23(); + (*arr)[i++] = hregARM64_X24(); + (*arr)[i++] = hregARM64_X25(); + (*arr)[i++] = hregARM64_X26(); + (*arr)[i++] = hregARM64_X27(); + (*arr)[i++] = hregARM64_X28(); + + (*arr)[i++] = hregARM64_X0(); + (*arr)[i++] = hregARM64_X1(); + (*arr)[i++] = hregARM64_X2(); + (*arr)[i++] = hregARM64_X3(); + (*arr)[i++] = hregARM64_X4(); + (*arr)[i++] = hregARM64_X5(); + (*arr)[i++] = hregARM64_X6(); + (*arr)[i++] = hregARM64_X7(); + // X8 .. who knows. + // X9 is a chaining/spill temporary, not available to regalloc. + + // Do we really need all these? + //(*arr)[i++] = hregARM64_X10(); + //(*arr)[i++] = hregARM64_X11(); + //(*arr)[i++] = hregARM64_X12(); + //(*arr)[i++] = hregARM64_X13(); + //(*arr)[i++] = hregARM64_X14(); + //(*arr)[i++] = hregARM64_X15(); + // X21 is the guest state pointer, not available to regalloc. + + // vector regs. Unfortunately not callee-saved. 
+ (*arr)[i++] = hregARM64_Q16(); + (*arr)[i++] = hregARM64_Q17(); + (*arr)[i++] = hregARM64_Q18(); + + // F64 regs, all of which are callee-saved + (*arr)[i++] = hregARM64_D8(); + (*arr)[i++] = hregARM64_D9(); + (*arr)[i++] = hregARM64_D10(); + (*arr)[i++] = hregARM64_D11(); + (*arr)[i++] = hregARM64_D12(); + (*arr)[i++] = hregARM64_D13(); + + // unavail: x21 as GSP + // x9 is used as a spill/reload/chaining/call temporary + // x8 is unassigned + // x30 as LR + // x31 because dealing with the SP-vs-ZR overloading is too + // confusing, and we don't need to do so, so let's just avoid + // the problem + // + // Currently, we have 15 allocatable integer registers: + // 0 1 2 3 4 5 6 7 22 23 24 25 26 27 28 + // + // Hence for the allocatable integer registers we have: + // + // callee-saved: 22 23 24 25 26 27 28 + // caller-saved: 0 1 2 3 4 5 6 7 + // + // If the set of available registers changes or if the e/r status + // changes, be sure to re-check/sync the definition of + // getHRegUsage for ARMInstr_Call too. + vassert(i == *nregs); +} + + +/* --------- Condition codes, ARM64 encoding. --------- */ + +static const HChar* showARM64CondCode ( ARM64CondCode cond ) { + switch (cond) { + case ARM64cc_EQ: return "eq"; + case ARM64cc_NE: return "ne"; + case ARM64cc_CS: return "cs"; + case ARM64cc_CC: return "cc"; + case ARM64cc_MI: return "mi"; + case ARM64cc_PL: return "pl"; + case ARM64cc_VS: return "vs"; + case ARM64cc_VC: return "vc"; + case ARM64cc_HI: return "hi"; + case ARM64cc_LS: return "ls"; + case ARM64cc_GE: return "ge"; + case ARM64cc_LT: return "lt"; + case ARM64cc_GT: return "gt"; + case ARM64cc_LE: return "le"; + case ARM64cc_AL: return "al"; // default + case ARM64cc_NV: return "nv"; + default: vpanic("showARM64CondCode"); + } +} + + +/* --------- Memory address expressions (amodes). 
--------- */ + +ARM64AMode* ARM64AMode_RI9 ( HReg reg, Int simm9 ) { + ARM64AMode* am = LibVEX_Alloc(sizeof(ARM64AMode)); + am->tag = ARM64am_RI9; + am->ARM64am.RI9.reg = reg; + am->ARM64am.RI9.simm9 = simm9; + vassert(-256 <= simm9 && simm9 <= 255); + return am; +} + +ARM64AMode* ARM64AMode_RI12 ( HReg reg, Int uimm12, UChar szB ) { + ARM64AMode* am = LibVEX_Alloc(sizeof(ARM64AMode)); + am->tag = ARM64am_RI12; + am->ARM64am.RI12.reg = reg; + am->ARM64am.RI12.uimm12 = uimm12; + am->ARM64am.RI12.szB = szB; + vassert(uimm12 >= 0 && uimm12 <= 4095); + switch (szB) { + case 1: case 2: case 4: case 8: break; + default: vassert(0); + } + return am; +} + +ARM64AMode* ARM64AMode_RR ( HReg base, HReg index ) { + ARM64AMode* am = LibVEX_Alloc(sizeof(ARM64AMode)); + am->tag = ARM64am_RR; + am->ARM64am.RR.base = base; + am->ARM64am.RR.index = index; + return am; +} + +static void ppARM64AMode ( ARM64AMode* am ) { + switch (am->tag) { + case ARM64am_RI9: + vex_printf("%d(", am->ARM64am.RI9.simm9); + ppHRegARM64(am->ARM64am.RI9.reg); + vex_printf(")"); + break; + case ARM64am_RI12: + vex_printf("%u(", (UInt)am->ARM64am.RI12.szB + * (UInt)am->ARM64am.RI12.uimm12); + ppHRegARM64(am->ARM64am.RI12.reg); + vex_printf(")"); + break; + case ARM64am_RR: + vex_printf("("); + ppHRegARM64(am->ARM64am.RR.base); + vex_printf(","); + ppHRegARM64(am->ARM64am.RR.index); + vex_printf(")"); + break; + default: + vassert(0); + } +} + +static void addRegUsage_ARM64AMode ( HRegUsage* u, ARM64AMode* am ) { + switch (am->tag) { + case ARM64am_RI9: + addHRegUse(u, HRmRead, am->ARM64am.RI9.reg); + return; + case ARM64am_RI12: + addHRegUse(u, HRmRead, am->ARM64am.RI12.reg); + return; + case ARM64am_RR: + addHRegUse(u, HRmRead, am->ARM64am.RR.base); + addHRegUse(u, HRmRead, am->ARM64am.RR.index); + return; + default: + vpanic("addRegUsage_ARM64Amode"); + } +} + +static void mapRegs_ARM64AMode ( HRegRemap* m, ARM64AMode* am ) { + switch (am->tag) { + case ARM64am_RI9: + am->ARM64am.RI9.reg = lookupHRegRemap(m, am->ARM64am.RI9.reg); + return; + case ARM64am_RI12: + am->ARM64am.RI12.reg = lookupHRegRemap(m, am->ARM64am.RI12.reg); + return; + case ARM64am_RR: + am->ARM64am.RR.base = lookupHRegRemap(m, am->ARM64am.RR.base); + am->ARM64am.RR.index = lookupHRegRemap(m, am->ARM64am.RR.index); + return; + default: + vpanic("mapRegs_ARM64Amode"); + } +} + + +//ZZ /* --------- Mem AModes: Addressing Mode 2 --------- */ +//ZZ +//ZZ ARMAMode2* ARMAMode2_RI ( HReg reg, Int simm9 ) { +//ZZ ARMAMode2* am = LibVEX_Alloc(sizeof(ARMAMode2)); +//ZZ am->tag = ARMam2_RI; +//ZZ am->ARMam2.RI.reg = reg; +//ZZ am->ARMam2.RI.simm9 = simm9; +//ZZ vassert(-255 <= simm9 && simm9 <= 255); +//ZZ return am; +//ZZ } +//ZZ ARMAMode2* ARMAMode2_RR ( HReg base, HReg index ) { +//ZZ ARMAMode2* am = LibVEX_Alloc(sizeof(ARMAMode2)); +//ZZ am->tag = ARMam2_RR; +//ZZ am->ARMam2.RR.base = base; +//ZZ am->ARMam2.RR.index = index; +//ZZ return am; +//ZZ } +//ZZ +//ZZ void ppARMAMode2 ( ARMAMode2* am ) { +//ZZ switch (am->tag) { +//ZZ case ARMam2_RI: +//ZZ vex_printf("%d(", am->ARMam2.RI.simm9); +//ZZ ppHRegARM(am->ARMam2.RI.reg); +//ZZ vex_printf(")"); +//ZZ break; +//ZZ case ARMam2_RR: +//ZZ vex_printf("("); +//ZZ ppHRegARM(am->ARMam2.RR.base); +//ZZ vex_printf(","); +//ZZ ppHRegARM(am->ARMam2.RR.index); +//ZZ vex_printf(")"); +//ZZ break; +//ZZ default: +//ZZ vassert(0); +//ZZ } +//ZZ } +//ZZ +//ZZ static void addRegUsage_ARMAMode2 ( HRegUsage* u, ARMAMode2* am ) { +//ZZ switch (am->tag) { +//ZZ case ARMam2_RI: +//ZZ addHRegUse(u, HRmRead, am->ARMam2.RI.reg); +//ZZ 
return; +//ZZ case ARMam2_RR: +//ZZ // addHRegUse(u, HRmRead, am->ARMam2.RR.base); +//ZZ // addHRegUse(u, HRmRead, am->ARMam2.RR.index); +//ZZ // return; +//ZZ default: +//ZZ vpanic("addRegUsage_ARMAmode2"); +//ZZ } +//ZZ } +//ZZ +//ZZ static void mapRegs_ARMAMode2 ( HRegRemap* m, ARMAMode2* am ) { +//ZZ switch (am->tag) { +//ZZ case ARMam2_RI: +//ZZ am->ARMam2.RI.reg = lookupHRegRemap(m, am->ARMam2.RI.reg); +//ZZ return; +//ZZ case ARMam2_RR: +//ZZ //am->ARMam2.RR.base =lookupHRegRemap(m, am->ARMam2.RR.base); +//ZZ //am->ARMam2.RR.index = lookupHRegRemap(m, am->ARMam2.RR.index); +//ZZ //return; +//ZZ default: +//ZZ vpanic("mapRegs_ARMAmode2"); +//ZZ } +//ZZ } +//ZZ +//ZZ +//ZZ /* --------- Mem AModes: Addressing Mode VFP --------- */ +//ZZ +//ZZ ARMAModeV* mkARMAModeV ( HReg reg, Int simm11 ) { +//ZZ ARMAModeV* am = LibVEX_Alloc(sizeof(ARMAModeV)); +//ZZ vassert(simm11 >= -1020 && simm11 <= 1020); +//ZZ vassert(0 == (simm11 & 3)); +//ZZ am->reg = reg; +//ZZ am->simm11 = simm11; +//ZZ return am; +//ZZ } +//ZZ +//ZZ void ppARMAModeV ( ARMAModeV* am ) { +//ZZ vex_printf("%d(", am->simm11); +//ZZ ppHRegARM(am->reg); +//ZZ vex_printf(")"); +//ZZ } +//ZZ +//ZZ static void addRegUsage_ARMAModeV ( HRegUsage* u, ARMAModeV* am ) { +//ZZ addHRegUse(u, HRmRead, am->reg); +//ZZ } +//ZZ +//ZZ static void mapRegs_ARMAModeV ( HRegRemap* m, ARMAModeV* am ) { +//ZZ am->reg = lookupHRegRemap(m, am->reg); +//ZZ } +//ZZ +//ZZ +//ZZ /* --------- Mem AModes: Addressing Mode Neon ------- */ +//ZZ +//ZZ ARMAModeN *mkARMAModeN_RR ( HReg rN, HReg rM ) { +//ZZ ARMAModeN* am = LibVEX_Alloc(sizeof(ARMAModeN)); +//ZZ am->tag = ARMamN_RR; +//ZZ am->ARMamN.RR.rN = rN; +//ZZ am->ARMamN.RR.rM = rM; +//ZZ return am; +//ZZ } +//ZZ +//ZZ ARMAModeN *mkARMAModeN_R ( HReg rN ) { +//ZZ ARMAModeN* am = LibVEX_Alloc(sizeof(ARMAModeN)); +//ZZ am->tag = ARMamN_R; +//ZZ am->ARMamN.R.rN = rN; +//ZZ return am; +//ZZ } +//ZZ +//ZZ static void addRegUsage_ARMAModeN ( HRegUsage* u, ARMAModeN* am ) { +//ZZ if (am->tag == ARMamN_R) { +//ZZ addHRegUse(u, HRmRead, am->ARMamN.R.rN); +//ZZ } else { +//ZZ addHRegUse(u, HRmRead, am->ARMamN.RR.rN); +//ZZ addHRegUse(u, HRmRead, am->ARMamN.RR.rM); +//ZZ } +//ZZ } +//ZZ +//ZZ static void mapRegs_ARMAModeN ( HRegRemap* m, ARMAModeN* am ) { +//ZZ if (am->tag == ARMamN_R) { +//ZZ am->ARMamN.R.rN = lookupHRegRemap(m, am->ARMamN.R.rN); +//ZZ } else { +//ZZ am->ARMamN.RR.rN = lookupHRegRemap(m, am->ARMamN.RR.rN); +//ZZ am->ARMamN.RR.rM = lookupHRegRemap(m, am->ARMamN.RR.rM); +//ZZ } +//ZZ } +//ZZ +//ZZ void ppARMAModeN ( ARMAModeN* am ) { +//ZZ vex_printf("["); +//ZZ if (am->tag == ARMamN_R) { +//ZZ ppHRegARM(am->ARMamN.R.rN); +//ZZ } else { +//ZZ ppHRegARM(am->ARMamN.RR.rN); +//ZZ } +//ZZ vex_printf("]"); +//ZZ if (am->tag == ARMamN_RR) { +//ZZ vex_printf(", "); +//ZZ ppHRegARM(am->ARMamN.RR.rM); +//ZZ } +//ZZ } + + +/* --------- Reg or uimm12<<{0,12} operands --------- */ + +ARM64RIA* ARM64RIA_I12 ( UShort imm12, UChar shift ) { + ARM64RIA* riA = LibVEX_Alloc(sizeof(ARM64RIA)); + riA->tag = ARM64riA_I12; + riA->ARM64riA.I12.imm12 = imm12; + riA->ARM64riA.I12.shift = shift; + vassert(imm12 < 4096); + vassert(shift == 0 || shift == 12); + return riA; +} +ARM64RIA* ARM64RIA_R ( HReg reg ) { + ARM64RIA* riA = LibVEX_Alloc(sizeof(ARM64RIA)); + riA->tag = ARM64riA_R; + riA->ARM64riA.R.reg = reg; + return riA; +} + +static void ppARM64RIA ( ARM64RIA* riA ) { + switch (riA->tag) { + case ARM64riA_I12: + vex_printf("#%u",(UInt)(riA->ARM64riA.I12.imm12 + << riA->ARM64riA.I12.shift)); + break; + case ARM64riA_R: + 
ppHRegARM64(riA->ARM64riA.R.reg); + break; + default: + vassert(0); + } +} + +static void addRegUsage_ARM64RIA ( HRegUsage* u, ARM64RIA* riA ) { + switch (riA->tag) { + case ARM64riA_I12: + return; + case ARM64riA_R: + addHRegUse(u, HRmRead, riA->ARM64riA.R.reg); + return; + default: + vpanic("addRegUsage_ARM64RIA"); + } +} + +static void mapRegs_ARM64RIA ( HRegRemap* m, ARM64RIA* riA ) { + switch (riA->tag) { + case ARM64riA_I12: + return; + case ARM64riA_R: + riA->ARM64riA.R.reg = lookupHRegRemap(m, riA->ARM64riA.R.reg); + return; + default: + vpanic("mapRegs_ARM64RIA"); + } +} + + +/* --------- Reg or "bitfield" (logic immediate) operands --------- */ + +ARM64RIL* ARM64RIL_I13 ( UChar bitN, UChar immR, UChar immS ) { + ARM64RIL* riL = LibVEX_Alloc(sizeof(ARM64RIL)); + riL->tag = ARM64riL_I13; + riL->ARM64riL.I13.bitN = bitN; + riL->ARM64riL.I13.immR = immR; + riL->ARM64riL.I13.immS = immS; + vassert(bitN < 2); + vassert(immR < 64); + vassert(immS < 64); + return riL; +} +ARM64RIL* ARM64RIL_R ( HReg reg ) { + ARM64RIL* riL = LibVEX_Alloc(sizeof(ARM64RIL)); + riL->tag = ARM64riL_R; + riL->ARM64riL.R.reg = reg; + return riL; +} + +static void ppARM64RIL ( ARM64RIL* riL ) { + switch (riL->tag) { + case ARM64riL_I13: + vex_printf("#nrs(%u,%u,%u)", + (UInt)riL->ARM64riL.I13.bitN, + (UInt)riL->ARM64riL.I13.immR, + (UInt)riL->ARM64riL.I13.immS); + break; + case ARM64riL_R: + ppHRegARM64(riL->ARM64riL.R.reg); + break; + default: + vassert(0); + } +} + +static void addRegUsage_ARM64RIL ( HRegUsage* u, ARM64RIL* riL ) { + switch (riL->tag) { + case ARM64riL_I13: + return; + case ARM64riL_R: + addHRegUse(u, HRmRead, riL->ARM64riL.R.reg); + return; + default: + vpanic("addRegUsage_ARM64RIL"); + } +} + +static void mapRegs_ARM64RIL ( HRegRemap* m, ARM64RIL* riL ) { + switch (riL->tag) { + case ARM64riL_I13: + return; + case ARM64riL_R: + riL->ARM64riL.R.reg = lookupHRegRemap(m, riL->ARM64riL.R.reg); + return; + default: + vpanic("mapRegs_ARM64RIL"); + } +} + + +/* --------------- Reg or uimm6 operands --------------- */ + +ARM64RI6* ARM64RI6_I6 ( UInt imm6 ) { + ARM64RI6* ri6 = LibVEX_Alloc(sizeof(ARM64RI6)); + ri6->tag = ARM64ri6_I6; + ri6->ARM64ri6.I6.imm6 = imm6; + vassert(imm6 > 0 && imm6 < 64); + return ri6; +} +ARM64RI6* ARM64RI6_R ( HReg reg ) { + ARM64RI6* ri6 = LibVEX_Alloc(sizeof(ARM64RI6)); + ri6->tag = ARM64ri6_R; + ri6->ARM64ri6.R.reg = reg; + return ri6; +} + +static void ppARM64RI6 ( ARM64RI6* ri6 ) { + switch (ri6->tag) { + case ARM64ri6_I6: + vex_printf("#%u", ri6->ARM64ri6.I6.imm6); + break; + case ARM64ri6_R: + ppHRegARM64(ri6->ARM64ri6.R.reg); + break; + default: + vassert(0); + } +} + +static void addRegUsage_ARM64RI6 ( HRegUsage* u, ARM64RI6* ri6 ) { + switch (ri6->tag) { + case ARM64ri6_I6: + return; + case ARM64ri6_R: + addHRegUse(u, HRmRead, ri6->ARM64ri6.R.reg); + return; + default: + vpanic("addRegUsage_ARM64RI6"); + } +} + +static void mapRegs_ARM64RI6 ( HRegRemap* m, ARM64RI6* ri6 ) { + switch (ri6->tag) { + case ARM64ri6_I6: + return; + case ARM64ri6_R: + ri6->ARM64ri6.R.reg = lookupHRegRemap(m, ri6->ARM64ri6.R.reg); + return; + default: + vpanic("mapRegs_ARM64RI6"); + } +} + + +//ZZ /* -------- Neon Immediate operatnd --------- */ +//ZZ +//ZZ ARMNImm* ARMNImm_TI ( UInt type, UInt imm8 ) { +//ZZ ARMNImm* i = LibVEX_Alloc(sizeof(ARMNImm)); +//ZZ i->type = type; +//ZZ i->imm8 = imm8; +//ZZ return i; +//ZZ } +//ZZ +//ZZ ULong ARMNImm_to_Imm64 ( ARMNImm* imm ) { +//ZZ int i, j; +//ZZ ULong y, x = imm->imm8; +//ZZ switch (imm->type) { +//ZZ case 3: +//ZZ x = x << 8; /* 
fallthrough */ +//ZZ case 2: +//ZZ x = x << 8; /* fallthrough */ +//ZZ case 1: +//ZZ x = x << 8; /* fallthrough */ +//ZZ case 0: +//ZZ return (x << 32) | x; +//ZZ case 5: +//ZZ case 6: +//ZZ if (imm->type == 5) +//ZZ x = x << 8; +//ZZ else +//ZZ x = (x << 8) | x; +//ZZ /* fallthrough */ +//ZZ case 4: +//ZZ x = (x << 16) | x; +//ZZ return (x << 32) | x; +//ZZ case 8: +//ZZ x = (x << 8) | 0xFF; +//ZZ /* fallthrough */ +//ZZ case 7: +//ZZ x = (x << 8) | 0xFF; +//ZZ return (x << 32) | x; +//ZZ case 9: +//ZZ x = 0; +//ZZ for (i = 7; i >= 0; i--) { +//ZZ y = ((ULong)imm->imm8 >> i) & 1; +//ZZ for (j = 0; j < 8; j++) { +//ZZ x = (x << 1) | y; +//ZZ } +//ZZ } +//ZZ return x; +//ZZ case 10: +//ZZ x |= (x & 0x80) << 5; +//ZZ x |= (~x & 0x40) << 5; +//ZZ x &= 0x187F; /* 0001 1000 0111 1111 */ +//ZZ x |= (x & 0x40) << 4; +//ZZ x |= (x & 0x40) << 3; +//ZZ x |= (x & 0x40) << 2; +//ZZ x |= (x & 0x40) << 1; +//ZZ x = x << 19; +//ZZ x = (x << 32) | x; +//ZZ return x; +//ZZ default: +//ZZ vpanic("ARMNImm_to_Imm64"); +//ZZ } +//ZZ } +//ZZ +//ZZ ARMNImm* Imm64_to_ARMNImm ( ULong x ) { +//ZZ ARMNImm tmp; +//ZZ if ((x & 0xFFFFFFFF) == (x >> 32)) { +//ZZ if ((x & 0xFFFFFF00) == 0) +//ZZ return ARMNImm_TI(0, x & 0xFF); +//ZZ if ((x & 0xFFFF00FF) == 0) +//ZZ return ARMNImm_TI(1, (x >> 8) & 0xFF); +//ZZ if ((x & 0xFF00FFFF) == 0) +//ZZ return ARMNImm_TI(2, (x >> 16) & 0xFF); +//ZZ if ((x & 0x00FFFFFF) == 0) +//ZZ return ARMNImm_TI(3, (x >> 24) & 0xFF); +//ZZ if ((x & 0xFFFF00FF) == 0xFF) +//ZZ return ARMNImm_TI(7, (x >> 8) & 0xFF); +//ZZ if ((x & 0xFF00FFFF) == 0xFFFF) +//ZZ return ARMNImm_TI(8, (x >> 16) & 0xFF); +//ZZ if ((x & 0xFFFF) == ((x >> 16) & 0xFFFF)) { +//ZZ if ((x & 0xFF00) == 0) +//ZZ return ARMNImm_TI(4, x & 0xFF); +//ZZ if ((x & 0x00FF) == 0) +//ZZ return ARMNImm_TI(5, (x >> 8) & 0xFF); +//ZZ if ((x & 0xFF) == ((x >> 8) & 0xFF)) +//ZZ return ARMNImm_TI(6, x & 0xFF); +//ZZ } +//ZZ if ((x & 0x7FFFF) == 0) { +//ZZ tmp.type = 10; +//ZZ tmp.imm8 = ((x >> 19) & 0x7F) | ((x >> 24) & 0x80); +//ZZ if (ARMNImm_to_Imm64(&tmp) == x) +//ZZ return ARMNImm_TI(tmp.type, tmp.imm8); +//ZZ } +//ZZ } else { +//ZZ /* This can only be type 9. */ +//ZZ tmp.imm8 = (((x >> 56) & 1) << 7) +//ZZ | (((x >> 48) & 1) << 6) +//ZZ | (((x >> 40) & 1) << 5) +//ZZ | (((x >> 32) & 1) << 4) +//ZZ | (((x >> 24) & 1) << 3) +//ZZ | (((x >> 16) & 1) << 2) +//ZZ | (((x >> 8) & 1) << 1) +//ZZ | (((x >> 0) & 1) << 0); +//ZZ tmp.type = 9; +//ZZ if (ARMNImm_to_Imm64 (&tmp) == x) +//ZZ return ARMNImm_TI(tmp.type, tmp.imm8); +//ZZ } +//ZZ return NULL; +//ZZ } +//ZZ +//ZZ void ppARMNImm (ARMNImm* i) { +//ZZ ULong x = ARMNImm_to_Imm64(i); +//ZZ vex_printf("0x%llX%llX", x, x); +//ZZ } +//ZZ +//ZZ /* -- Register or scalar operand --- */ +//ZZ +//ZZ ARMNRS* mkARMNRS(ARMNRS_tag tag, HReg reg, UInt index) +//ZZ { +//ZZ ARMNRS *p = LibVEX_Alloc(sizeof(ARMNRS)); +//ZZ p->tag = tag; +//ZZ p->reg = reg; +//ZZ p->index = index; +//ZZ return p; +//ZZ } +//ZZ +//ZZ void ppARMNRS(ARMNRS *p) +//ZZ { +//ZZ ppHRegARM(p->reg); +//ZZ if (p->tag == ARMNRS_Scalar) { +//ZZ vex_printf("[%d]", p->index); +//ZZ } +//ZZ } + +/* --------- Instructions. 
--------- */ + +static const HChar* showARM64LogicOp ( ARM64LogicOp op ) { + switch (op) { + case ARM64lo_AND: return "and"; + case ARM64lo_OR: return "orr"; + case ARM64lo_XOR: return "eor"; + default: vpanic("showARM64LogicOp"); + } +} + +static const HChar* showARM64ShiftOp ( ARM64ShiftOp op ) { + switch (op) { + case ARM64sh_SHL: return "lsl"; + case ARM64sh_SHR: return "lsr"; + case ARM64sh_SAR: return "asr"; + default: vpanic("showARM64ShiftOp"); + } +} + +static const HChar* showARM64UnaryOp ( ARM64UnaryOp op ) { + switch (op) { + case ARM64un_NEG: return "neg"; + case ARM64un_NOT: return "not"; + case ARM64un_CLZ: return "clz"; + default: vpanic("showARM64UnaryOp"); + } +} + +static const HChar* showARM64MulOp ( ARM64MulOp op ) { + switch (op) { + case ARM64mul_PLAIN: return "mul "; + case ARM64mul_ZX: return "umulh"; + case ARM64mul_SX: return "smulh"; + default: vpanic("showARM64MulOp"); + } +} + +static void characteriseARM64CvtOp ( /*OUT*/HChar* syn, + /*OUT*/UInt* fszB, /*OUT*/UInt* iszB, + ARM64CvtOp op ) { + switch (op) { + case ARM64cvt_F32_I32S: + *syn = 's'; *fszB = 4; *iszB = 4; break; + case ARM64cvt_F64_I32S: + *syn = 's'; *fszB = 8; *iszB = 4; break; + case ARM64cvt_F32_I64S: + *syn = 's'; *fszB = 4; *iszB = 8; break; + case ARM64cvt_F64_I64S: + *syn = 's'; *fszB = 8; *iszB = 8; break; + case ARM64cvt_F32_I32U: + *syn = 'u'; *fszB = 4; *iszB = 4; break; + case ARM64cvt_F64_I32U: + *syn = 'u'; *fszB = 8; *iszB = 4; break; + case ARM64cvt_F32_I64U: + *syn = 'u'; *fszB = 4; *iszB = 8; break; + case ARM64cvt_F64_I64U: + *syn = 'u'; *fszB = 8; *iszB = 8; break; + default: + vpanic("characteriseARM64CvtOp"); + } +} + +static const HChar* showARM64FpBinOp ( ARM64FpBinOp op ) { + switch (op) { + case ARM64fpb_ADD: return "add"; + case ARM64fpb_SUB: return "sub"; + case ARM64fpb_MUL: return "mul"; + case ARM64fpb_DIV: return "div"; + default: vpanic("showARM64FpBinOp"); + } +} + +static const HChar* showARM64FpUnaryOp ( ARM64FpUnaryOp op ) { + switch (op) { + case ARM64fpu_NEG: return "neg "; + case ARM64fpu_ABS: return "abs "; + case ARM64fpu_SQRT: return "sqrt "; + case ARM64fpu_RINT: return "rinti"; + default: vpanic("showARM64FpUnaryOp"); + } +} + +static void showARM64VecBinOp(/*OUT*/const HChar** nm, + /*OUT*/const HChar** ar, ARM64VecBinOp op ) { + switch (op) { + case ARM64vecb_ADD64x2: *nm = "add "; *ar = "2d"; return; + case ARM64vecb_ADD32x4: *nm = "add "; *ar = "4s"; return; + case ARM64vecb_ADD16x8: *nm = "add "; *ar = "8h"; return; + case ARM64vecb_ADD8x16: *nm = "add "; *ar = "16b"; return; + case ARM64vecb_SUB64x2: *nm = "sub "; *ar = "2d"; return; + case ARM64vecb_SUB32x4: *nm = "sub "; *ar = "4s"; return; + case ARM64vecb_SUB16x8: *nm = "sub "; *ar = "8h"; return; + case ARM64vecb_SUB8x16: *nm = "sub "; *ar = "16b"; return; + case ARM64vecb_MUL32x4: *nm = "mul "; *ar = "4s"; return; + case ARM64vecb_MUL16x8: *nm = "mul "; *ar = "8h"; return; + case ARM64vecb_MUL8x16: *nm = "mul "; *ar = "16b"; return; + case ARM64vecb_FADD64x2: *nm = "fadd"; *ar = "2d"; return; + case ARM64vecb_FSUB64x2: *nm = "fsub"; *ar = "2d"; return; + case ARM64vecb_FMUL64x2: *nm = "fmul"; *ar = "2d"; return; + case ARM64vecb_FDIV64x2: *nm = "fdiv"; *ar = "2d"; return; + case ARM64vecb_FADD32x4: *nm = "fadd"; *ar = "4s"; return; + case ARM64vecb_FSUB32x4: *nm = "fsub"; *ar = "4s"; return; + case ARM64vecb_FMUL32x4: *nm = "fmul"; *ar = "4s"; return; + case ARM64vecb_FDIV32x4: *nm = "fdiv"; *ar = "4s"; return; + case ARM64vecb_UMAX32x4: *nm = "umax"; *ar = "4s"; return; + case 
ARM64vecb_UMAX16x8: *nm = "umax"; *ar = "8h"; return; + case ARM64vecb_UMAX8x16: *nm = "umax"; *ar = "16b"; return; + case ARM64vecb_UMIN32x4: *nm = "umin"; *ar = "4s"; return; + case ARM64vecb_UMIN16x8: *nm = "umin"; *ar = "8h"; return; + case ARM64vecb_UMIN8x16: *nm = "umin"; *ar = "16b"; return; + case ARM64vecb_SMAX32x4: *nm = "smax"; *ar = "4s"; return; + case ARM64vecb_SMAX16x8: *nm = "smax"; *ar = "8h"; return; + case ARM64vecb_SMAX8x16: *nm = "smax"; *ar = "16b"; return; + case ARM64vecb_SMIN32x4: *nm = "smin"; *ar = "4s"; return; + case ARM64vecb_SMIN16x8: *nm = "smin"; *ar = "8h"; return; + case ARM64vecb_SMIN8x16: *nm = "smin"; *ar = "16b"; return; + case ARM64vecb_AND: *nm = "and "; *ar = "all"; return; + case ARM64vecb_ORR: *nm = "orr "; *ar = "all"; return; + case ARM64vecb_XOR: *nm = "eor "; *ar = "all"; return; + case ARM64vecb_CMEQ64x2: *nm = "cmeq"; *ar = "2d"; return; + case ARM64vecb_CMEQ32x4: *nm = "cmeq"; *ar = "4s"; return; + case ARM64vecb_CMEQ16x8: *nm = "cmeq"; *ar = "8h"; return; + case ARM64vecb_CMEQ8x16: *nm = "cmeq"; *ar = "16b"; return; + case ARM64vecb_CMHI64x2: *nm = "cmhi"; *ar = "2d"; return; + case ARM64vecb_CMHI32x4: *nm = "cmhi"; *ar = "4s"; return; + case ARM64vecb_CMHI16x8: *nm = "cmhi"; *ar = "8h"; return; + case ARM64vecb_CMHI8x16: *nm = "cmhi"; *ar = "16b"; return; + case ARM64vecb_CMGT64x2: *nm = "cmgt"; *ar = "2d"; return; + case ARM64vecb_CMGT32x4: *nm = "cmgt"; *ar = "4s"; return; + case ARM64vecb_CMGT16x8: *nm = "cmgt"; *ar = "8h"; return; + case ARM64vecb_CMGT8x16: *nm = "cmgt"; *ar = "16b"; return; + case ARM64vecb_FCMEQ64x2: *nm = "fcmeq"; *ar = "2d"; return; + case ARM64vecb_FCMEQ32x4: *nm = "fcmeq"; *ar = "4s"; return; + case ARM64vecb_FCMGE64x2: *nm = "fcmge"; *ar = "2d"; return; + case ARM64vecb_FCMGE32x4: *nm = "fcmge"; *ar = "4s"; return; + case ARM64vecb_FCMGT64x2: *nm = "fcmgt"; *ar = "2d"; return; + case ARM64vecb_FCMGT32x4: *nm = "fcmgt"; *ar = "4s"; return; + case ARM64vecb_TBL1: *nm = "tbl "; *ar = "16b"; return; + default: vpanic("showARM64VecBinOp"); + } +} + +static void showARM64VecUnaryOp(/*OUT*/const HChar** nm, + /*OUT*/const HChar** ar, ARM64VecUnaryOp op ) +{ + switch (op) { + case ARM64vecu_FNEG64x2: *nm = "fneg "; *ar = "2d"; return; + case ARM64vecu_FNEG32x4: *nm = "fneg "; *ar = "4s"; return; + case ARM64vecu_FABS64x2: *nm = "fabs "; *ar = "2d"; return; + case ARM64vecu_FABS32x4: *nm = "fabs "; *ar = "4s"; return; + case ARM64vecu_NOT: *nm = "not "; *ar = "all"; return; + default: vpanic("showARM64VecUnaryOp"); + } +} + +static void showARM64VecShiftOp(/*OUT*/const HChar** nm, + /*OUT*/const HChar** ar, + ARM64VecShiftOp op ) +{ + switch (op) { + case ARM64vecsh_USHR64x2: *nm = "ushr "; *ar = "2d"; return; + case ARM64vecsh_USHR32x4: *nm = "ushr "; *ar = "4s"; return; + case ARM64vecsh_USHR16x8: *nm = "ushr "; *ar = "8h"; return; + case ARM64vecsh_USHR8x16: *nm = "ushr "; *ar = "16b"; return; + case ARM64vecsh_SSHR64x2: *nm = "sshr "; *ar = "2d"; return; + case ARM64vecsh_SSHR32x4: *nm = "sshr "; *ar = "4s"; return; + case ARM64vecsh_SSHR16x8: *nm = "sshr "; *ar = "8h"; return; + case ARM64vecsh_SSHR8x16: *nm = "sshr "; *ar = "16b"; return; + case ARM64vecsh_SHL64x2: *nm = "shl "; *ar = "2d"; return; + case ARM64vecsh_SHL32x4: *nm = "shl "; *ar = "4s"; return; + case ARM64vecsh_SHL16x8: *nm = "shl "; *ar = "8h"; return; + case ARM64vecsh_SHL8x16: *nm = "shl "; *ar = "16b"; return; + default: vpanic("showARM64VecShiftImmOp"); + } +} + +//ZZ const HChar* showARMNeonBinOp ( ARMNeonBinOp op ) { +//ZZ switch (op) { 
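As an aside on how the (mnemonic, arrangement) pair returned by showARM64VecBinOp above is meant to be consumed: the printer emits the mnemonic once and appends the arrangement suffix to every vector operand. Below is a minimal sketch, assuming it sits in this same file so the static helper and vex_printf are in scope; the chosen opcode and register numbers are arbitrary illustration and not part of the patch.

   /* Illustrative sketch only, not part of this change.  Assumes it lives in
      host_arm64_defs.c so showARM64VecBinOp (static, above) and vex_printf
      are visible; the opcode and operand registers are arbitrary, and the
      function is never referenced. */
   static void ppDemo_VecBinOp ( void )
   {
      const HChar* nm = "??";
      const HChar* ar = "??";
      showARM64VecBinOp(&nm, &ar, ARM64vecb_ADD32x4);
      /* nm is now "add " and ar is "4s"; formatted the same way the
         ARM64in_VBinV case of ppARM64Instr does it, giving
         "add v0.4s, v1.4s, v2.4s". */
      vex_printf("%sv0.%s, v1.%s, v2.%s\n", nm, ar, ar, ar);
   }

The same two-string scheme is used by showARM64VecUnaryOp and showARM64VecShiftOp, so the VBinV, VUnaryV and VShiftImmV cases of the pretty-printer can all share this one formatting pattern.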
+//ZZ case ARMneon_VAND: return "vand"; +//ZZ case ARMneon_VORR: return "vorr"; +//ZZ case ARMneon_VXOR: return "veor"; +//ZZ case ARMneon_VADD: return "vadd"; +//ZZ case ARMneon_VRHADDS: return "vrhadd"; +//ZZ case ARMneon_VRHADDU: return "vrhadd"; +//ZZ case ARMneon_VADDFP: return "vadd"; +//ZZ case ARMneon_VPADDFP: return "vpadd"; +//ZZ case ARMneon_VABDFP: return "vabd"; +//ZZ case ARMneon_VSUB: return "vsub"; +//ZZ case ARMneon_VSUBFP: return "vsub"; +//ZZ case ARMneon_VMINU: return "vmin"; +//ZZ case ARMneon_VMINS: return "vmin"; +//ZZ case ARMneon_VMINF: return "vmin"; +//ZZ case ARMneon_VMAXU: return "vmax"; +//ZZ case ARMneon_VMAXS: return "vmax"; +//ZZ case ARMneon_VMAXF: return "vmax"; +//ZZ case ARMneon_VQADDU: return "vqadd"; +//ZZ case ARMneon_VQADDS: return "vqadd"; +//ZZ case ARMneon_VQSUBU: return "vqsub"; +//ZZ case ARMneon_VQSUBS: return "vqsub"; +//ZZ case ARMneon_VCGTU: return "vcgt"; +//ZZ case ARMneon_VCGTS: return "vcgt"; +//ZZ case ARMneon_VCGTF: return "vcgt"; +//ZZ case ARMneon_VCGEF: return "vcgt"; +//ZZ case ARMneon_VCGEU: return "vcge"; +//ZZ case ARMneon_VCGES: return "vcge"; +//ZZ case ARMneon_VCEQ: return "vceq"; +//ZZ case ARMneon_VCEQF: return "vceq"; +//ZZ case ARMneon_VPADD: return "vpadd"; +//ZZ case ARMneon_VPMINU: return "vpmin"; +//ZZ case ARMneon_VPMINS: return "vpmin"; +//ZZ case ARMneon_VPMINF: return "vpmin"; +//ZZ case ARMneon_VPMAXU: return "vpmax"; +//ZZ case ARMneon_VPMAXS: return "vpmax"; +//ZZ case ARMneon_VPMAXF: return "vpmax"; +//ZZ case ARMneon_VEXT: return "vext"; +//ZZ case ARMneon_VMUL: return "vmuli"; +//ZZ case ARMneon_VMULLU: return "vmull"; +//ZZ case ARMneon_VMULLS: return "vmull"; +//ZZ case ARMneon_VMULP: return "vmul"; +//ZZ case ARMneon_VMULFP: return "vmul"; +//ZZ case ARMneon_VMULLP: return "vmul"; +//ZZ case ARMneon_VQDMULH: return "vqdmulh"; +//ZZ case ARMneon_VQRDMULH: return "vqrdmulh"; +//ZZ case ARMneon_VQDMULL: return "vqdmull"; +//ZZ case ARMneon_VTBL: return "vtbl"; +//ZZ case ARMneon_VRECPS: return "vrecps"; +//ZZ case ARMneon_VRSQRTS: return "vrecps"; +//ZZ /* ... 
*/ +//ZZ default: vpanic("showARMNeonBinOp"); +//ZZ } +//ZZ } +//ZZ +//ZZ const HChar* showARMNeonBinOpDataType ( ARMNeonBinOp op ) { +//ZZ switch (op) { +//ZZ case ARMneon_VAND: +//ZZ case ARMneon_VORR: +//ZZ case ARMneon_VXOR: +//ZZ return ""; +//ZZ case ARMneon_VADD: +//ZZ case ARMneon_VSUB: +//ZZ case ARMneon_VEXT: +//ZZ case ARMneon_VMUL: +//ZZ case ARMneon_VPADD: +//ZZ case ARMneon_VTBL: +//ZZ case ARMneon_VCEQ: +//ZZ return ".i"; +//ZZ case ARMneon_VRHADDU: +//ZZ case ARMneon_VMINU: +//ZZ case ARMneon_VMAXU: +//ZZ case ARMneon_VQADDU: +//ZZ case ARMneon_VQSUBU: +//ZZ case ARMneon_VCGTU: +//ZZ case ARMneon_VCGEU: +//ZZ case ARMneon_VMULLU: +//ZZ case ARMneon_VPMINU: +//ZZ case ARMneon_VPMAXU: +//ZZ return ".u"; +//ZZ case ARMneon_VRHADDS: +//ZZ case ARMneon_VMINS: +//ZZ case ARMneon_VMAXS: +//ZZ case ARMneon_VQADDS: +//ZZ case ARMneon_VQSUBS: +//ZZ case ARMneon_VCGTS: +//ZZ case ARMneon_VCGES: +//ZZ case ARMneon_VQDMULL: +//ZZ case ARMneon_VMULLS: +//ZZ case ARMneon_VPMINS: +//ZZ case ARMneon_VPMAXS: +//ZZ case ARMneon_VQDMULH: +//ZZ case ARMneon_VQRDMULH: +//ZZ return ".s"; +//ZZ case ARMneon_VMULP: +//ZZ case ARMneon_VMULLP: +//ZZ return ".p"; +//ZZ case ARMneon_VADDFP: +//ZZ case ARMneon_VABDFP: +//ZZ case ARMneon_VPADDFP: +//ZZ case ARMneon_VSUBFP: +//ZZ case ARMneon_VMULFP: +//ZZ case ARMneon_VMINF: +//ZZ case ARMneon_VMAXF: +//ZZ case ARMneon_VPMINF: +//ZZ case ARMneon_VPMAXF: +//ZZ case ARMneon_VCGTF: +//ZZ case ARMneon_VCGEF: +//ZZ case ARMneon_VCEQF: +//ZZ case ARMneon_VRECPS: +//ZZ case ARMneon_VRSQRTS: +//ZZ return ".f"; +//ZZ /* ... */ +//ZZ default: vpanic("showARMNeonBinOpDataType"); +//ZZ } +//ZZ } +//ZZ +//ZZ const HChar* showARMNeonUnOp ( ARMNeonUnOp op ) { +//ZZ switch (op) { +//ZZ case ARMneon_COPY: return "vmov"; +//ZZ case ARMneon_COPYLS: return "vmov"; +//ZZ case ARMneon_COPYLU: return "vmov"; +//ZZ case ARMneon_COPYN: return "vmov"; +//ZZ case ARMneon_COPYQNSS: return "vqmovn"; +//ZZ case ARMneon_COPYQNUS: return "vqmovun"; +//ZZ case ARMneon_COPYQNUU: return "vqmovn"; +//ZZ case ARMneon_NOT: return "vmvn"; +//ZZ case ARMneon_EQZ: return "vceq"; +//ZZ case ARMneon_CNT: return "vcnt"; +//ZZ case ARMneon_CLS: return "vcls"; +//ZZ case ARMneon_CLZ: return "vclz"; +//ZZ case ARMneon_DUP: return "vdup"; +//ZZ case ARMneon_PADDLS: return "vpaddl"; +//ZZ case ARMneon_PADDLU: return "vpaddl"; +//ZZ case ARMneon_VQSHLNSS: return "vqshl"; +//ZZ case ARMneon_VQSHLNUU: return "vqshl"; +//ZZ case ARMneon_VQSHLNUS: return "vqshlu"; +//ZZ case ARMneon_REV16: return "vrev16"; +//ZZ case ARMneon_REV32: return "vrev32"; +//ZZ case ARMneon_REV64: return "vrev64"; +//ZZ case ARMneon_VCVTFtoU: return "vcvt"; +//ZZ case ARMneon_VCVTFtoS: return "vcvt"; +//ZZ case ARMneon_VCVTUtoF: return "vcvt"; +//ZZ case ARMneon_VCVTStoF: return "vcvt"; +//ZZ case ARMneon_VCVTFtoFixedU: return "vcvt"; +//ZZ case ARMneon_VCVTFtoFixedS: return "vcvt"; +//ZZ case ARMneon_VCVTFixedUtoF: return "vcvt"; +//ZZ case ARMneon_VCVTFixedStoF: return "vcvt"; +//ZZ case ARMneon_VCVTF32toF16: return "vcvt"; +//ZZ case ARMneon_VCVTF16toF32: return "vcvt"; +//ZZ case ARMneon_VRECIP: return "vrecip"; +//ZZ case ARMneon_VRECIPF: return "vrecipf"; +//ZZ case ARMneon_VNEGF: return "vneg"; +//ZZ case ARMneon_ABS: return "vabs"; +//ZZ case ARMneon_VABSFP: return "vabsfp"; +//ZZ case ARMneon_VRSQRTEFP: return "vrsqrtefp"; +//ZZ case ARMneon_VRSQRTE: return "vrsqrte"; +//ZZ /* ... 
*/ +//ZZ default: vpanic("showARMNeonUnOp"); +//ZZ } +//ZZ } +//ZZ +//ZZ const HChar* showARMNeonUnOpDataType ( ARMNeonUnOp op ) { +//ZZ switch (op) { +//ZZ case ARMneon_COPY: +//ZZ case ARMneon_NOT: +//ZZ return ""; +//ZZ case ARMneon_COPYN: +//ZZ case ARMneon_EQZ: +//ZZ case ARMneon_CNT: +//ZZ case ARMneon_DUP: +//ZZ case ARMneon_REV16: +//ZZ case ARMneon_REV32: +//ZZ case ARMneon_REV64: +//ZZ return ".i"; +//ZZ case ARMneon_COPYLU: +//ZZ case ARMneon_PADDLU: +//ZZ case ARMneon_COPYQNUU: +//ZZ case ARMneon_VQSHLNUU: +//ZZ case ARMneon_VRECIP: +//ZZ case ARMneon_VRSQRTE: +//ZZ return ".u"; +//ZZ case ARMneon_CLS: +//ZZ case ARMneon_CLZ: +//ZZ case ARMneon_COPYLS: +//ZZ case ARMneon_PADDLS: +//ZZ case ARMneon_COPYQNSS: +//ZZ case ARMneon_COPYQNUS: +//ZZ case ARMneon_VQSHLNSS: +//ZZ case ARMneon_VQSHLNUS: +//ZZ case ARMneon_ABS: +//ZZ return ".s"; +//ZZ case ARMneon_VRECIPF: +//ZZ case ARMneon_VNEGF: +//ZZ case ARMneon_VABSFP: +//ZZ case ARMneon_VRSQRTEFP: +//ZZ return ".f"; +//ZZ case ARMneon_VCVTFtoU: return ".u32.f32"; +//ZZ case ARMneon_VCVTFtoS: return ".s32.f32"; +//ZZ case ARMneon_VCVTUtoF: return ".f32.u32"; +//ZZ case ARMneon_VCVTStoF: return ".f32.s32"; +//ZZ case ARMneon_VCVTF16toF32: return ".f32.f16"; +//ZZ case ARMneon_VCVTF32toF16: return ".f16.f32"; +//ZZ case ARMneon_VCVTFtoFixedU: return ".u32.f32"; +//ZZ case ARMneon_VCVTFtoFixedS: return ".s32.f32"; +//ZZ case ARMneon_VCVTFixedUtoF: return ".f32.u32"; +//ZZ case ARMneon_VCVTFixedStoF: return ".f32.s32"; +//ZZ /* ... */ +//ZZ default: vpanic("showARMNeonUnOpDataType"); +//ZZ } +//ZZ } +//ZZ +//ZZ const HChar* showARMNeonUnOpS ( ARMNeonUnOpS op ) { +//ZZ switch (op) { +//ZZ case ARMneon_SETELEM: return "vmov"; +//ZZ case ARMneon_GETELEMU: return "vmov"; +//ZZ case ARMneon_GETELEMS: return "vmov"; +//ZZ case ARMneon_VDUP: return "vdup"; +//ZZ /* ... */ +//ZZ default: vpanic("showARMNeonUnarySOp"); +//ZZ } +//ZZ } +//ZZ +//ZZ const HChar* showARMNeonUnOpSDataType ( ARMNeonUnOpS op ) { +//ZZ switch (op) { +//ZZ case ARMneon_SETELEM: +//ZZ case ARMneon_VDUP: +//ZZ return ".i"; +//ZZ case ARMneon_GETELEMS: +//ZZ return ".s"; +//ZZ case ARMneon_GETELEMU: +//ZZ return ".u"; +//ZZ /* ... */ +//ZZ default: vpanic("showARMNeonUnarySOp"); +//ZZ } +//ZZ } +//ZZ +//ZZ const HChar* showARMNeonShiftOp ( ARMNeonShiftOp op ) { +//ZZ switch (op) { +//ZZ case ARMneon_VSHL: return "vshl"; +//ZZ case ARMneon_VSAL: return "vshl"; +//ZZ case ARMneon_VQSHL: return "vqshl"; +//ZZ case ARMneon_VQSAL: return "vqshl"; +//ZZ /* ... */ +//ZZ default: vpanic("showARMNeonShiftOp"); +//ZZ } +//ZZ } +//ZZ +//ZZ const HChar* showARMNeonShiftOpDataType ( ARMNeonShiftOp op ) { +//ZZ switch (op) { +//ZZ case ARMneon_VSHL: +//ZZ case ARMneon_VQSHL: +//ZZ return ".u"; +//ZZ case ARMneon_VSAL: +//ZZ case ARMneon_VQSAL: +//ZZ return ".s"; +//ZZ /* ... */ +//ZZ default: vpanic("showARMNeonShiftOpDataType"); +//ZZ } +//ZZ } +//ZZ +//ZZ const HChar* showARMNeonDualOp ( ARMNeonDualOp op ) { +//ZZ switch (op) { +//ZZ case ARMneon_TRN: return "vtrn"; +//ZZ case ARMneon_ZIP: return "vzip"; +//ZZ case ARMneon_UZP: return "vuzp"; +//ZZ /* ... */ +//ZZ default: vpanic("showARMNeonDualOp"); +//ZZ } +//ZZ } +//ZZ +//ZZ const HChar* showARMNeonDualOpDataType ( ARMNeonDualOp op ) { +//ZZ switch (op) { +//ZZ case ARMneon_TRN: +//ZZ case ARMneon_ZIP: +//ZZ case ARMneon_UZP: +//ZZ return "i"; +//ZZ /* ... 
*/ +//ZZ default: vpanic("showARMNeonDualOp"); +//ZZ } +//ZZ } +//ZZ +//ZZ static const HChar* showARMNeonDataSize_wrk ( UInt size ) +//ZZ { +//ZZ switch (size) { +//ZZ case 0: return "8"; +//ZZ case 1: return "16"; +//ZZ case 2: return "32"; +//ZZ case 3: return "64"; +//ZZ default: vpanic("showARMNeonDataSize"); +//ZZ } +//ZZ } +//ZZ +//ZZ static const HChar* showARMNeonDataSize ( ARMInstr* i ) +//ZZ { +//ZZ switch (i->tag) { +//ZZ case ARMin_NBinary: +//ZZ if (i->ARMin.NBinary.op == ARMneon_VEXT) +//ZZ return "8"; +//ZZ if (i->ARMin.NBinary.op == ARMneon_VAND || +//ZZ i->ARMin.NBinary.op == ARMneon_VORR || +//ZZ i->ARMin.NBinary.op == ARMneon_VXOR) +//ZZ return ""; +//ZZ return showARMNeonDataSize_wrk(i->ARMin.NBinary.size); +//ZZ case ARMin_NUnary: +//ZZ if (i->ARMin.NUnary.op == ARMneon_COPY || +//ZZ i->ARMin.NUnary.op == ARMneon_NOT || +//ZZ i->ARMin.NUnary.op == ARMneon_VCVTF32toF16|| +//ZZ i->ARMin.NUnary.op == ARMneon_VCVTF16toF32|| +//ZZ i->ARMin.NUnary.op == ARMneon_VCVTFtoFixedS || +//ZZ i->ARMin.NUnary.op == ARMneon_VCVTFtoFixedU || +//ZZ i->ARMin.NUnary.op == ARMneon_VCVTFixedStoF || +//ZZ i->ARMin.NUnary.op == ARMneon_VCVTFixedUtoF || +//ZZ i->ARMin.NUnary.op == ARMneon_VCVTFtoS || +//ZZ i->ARMin.NUnary.op == ARMneon_VCVTFtoU || +//ZZ i->ARMin.NUnary.op == ARMneon_VCVTStoF || +//ZZ i->ARMin.NUnary.op == ARMneon_VCVTUtoF) +//ZZ return ""; +//ZZ if (i->ARMin.NUnary.op == ARMneon_VQSHLNSS || +//ZZ i->ARMin.NUnary.op == ARMneon_VQSHLNUU || +//ZZ i->ARMin.NUnary.op == ARMneon_VQSHLNUS) { +//ZZ UInt size; +//ZZ size = i->ARMin.NUnary.size; +//ZZ if (size & 0x40) +//ZZ return "64"; +//ZZ if (size & 0x20) +//ZZ return "32"; +//ZZ if (size & 0x10) +//ZZ return "16"; +//ZZ if (size & 0x08) +//ZZ return "8"; +//ZZ vpanic("showARMNeonDataSize"); +//ZZ } +//ZZ return showARMNeonDataSize_wrk(i->ARMin.NUnary.size); +//ZZ case ARMin_NUnaryS: +//ZZ if (i->ARMin.NUnaryS.op == ARMneon_VDUP) { +//ZZ int size; +//ZZ size = i->ARMin.NUnaryS.size; +//ZZ if ((size & 1) == 1) +//ZZ return "8"; +//ZZ if ((size & 3) == 2) +//ZZ return "16"; +//ZZ if ((size & 7) == 4) +//ZZ return "32"; +//ZZ vpanic("showARMNeonDataSize"); +//ZZ } +//ZZ return showARMNeonDataSize_wrk(i->ARMin.NUnaryS.size); +//ZZ case ARMin_NShift: +//ZZ return showARMNeonDataSize_wrk(i->ARMin.NShift.size); +//ZZ case ARMin_NDual: +//ZZ return showARMNeonDataSize_wrk(i->ARMin.NDual.size); +//ZZ default: +//ZZ vpanic("showARMNeonDataSize"); +//ZZ } +//ZZ } + +ARM64Instr* ARM64Instr_Arith ( HReg dst, + HReg argL, ARM64RIA* argR, Bool isAdd ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_Arith; + i->ARM64in.Arith.dst = dst; + i->ARM64in.Arith.argL = argL; + i->ARM64in.Arith.argR = argR; + i->ARM64in.Arith.isAdd = isAdd; + return i; +} +ARM64Instr* ARM64Instr_Cmp ( HReg argL, ARM64RIA* argR, Bool is64 ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_Cmp; + i->ARM64in.Cmp.argL = argL; + i->ARM64in.Cmp.argR = argR; + i->ARM64in.Cmp.is64 = is64; + return i; +} +ARM64Instr* ARM64Instr_Logic ( HReg dst, + HReg argL, ARM64RIL* argR, ARM64LogicOp op ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_Logic; + i->ARM64in.Logic.dst = dst; + i->ARM64in.Logic.argL = argL; + i->ARM64in.Logic.argR = argR; + i->ARM64in.Logic.op = op; + return i; +} +ARM64Instr* ARM64Instr_Test ( HReg argL, ARM64RIL* argR ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_Test; + i->ARM64in.Test.argL = argL; + i->ARM64in.Test.argR = argR; + return i; +} +ARM64Instr* 
ARM64Instr_Shift ( HReg dst, + HReg argL, ARM64RI6* argR, ARM64ShiftOp op ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_Shift; + i->ARM64in.Shift.dst = dst; + i->ARM64in.Shift.argL = argL; + i->ARM64in.Shift.argR = argR; + i->ARM64in.Shift.op = op; + return i; +} +ARM64Instr* ARM64Instr_Unary ( HReg dst, HReg src, ARM64UnaryOp op ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_Unary; + i->ARM64in.Unary.dst = dst; + i->ARM64in.Unary.src = src; + i->ARM64in.Unary.op = op; + return i; +} +ARM64Instr* ARM64Instr_MovI ( HReg dst, HReg src ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_MovI; + i->ARM64in.MovI.dst = dst; + i->ARM64in.MovI.src = src; + vassert(hregClass(src) == HRcInt64); + vassert(hregClass(dst) == HRcInt64); + return i; +} +ARM64Instr* ARM64Instr_Imm64 ( HReg dst, ULong imm64 ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_Imm64; + i->ARM64in.Imm64.dst = dst; + i->ARM64in.Imm64.imm64 = imm64; + return i; +} +ARM64Instr* ARM64Instr_LdSt64 ( Bool isLoad, HReg rD, ARM64AMode* amode ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_LdSt64; + i->ARM64in.LdSt64.isLoad = isLoad; + i->ARM64in.LdSt64.rD = rD; + i->ARM64in.LdSt64.amode = amode; + return i; +} +ARM64Instr* ARM64Instr_LdSt32 ( Bool isLoad, HReg rD, ARM64AMode* amode ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_LdSt32; + i->ARM64in.LdSt32.isLoad = isLoad; + i->ARM64in.LdSt32.rD = rD; + i->ARM64in.LdSt32.amode = amode; + return i; +} +ARM64Instr* ARM64Instr_LdSt16 ( Bool isLoad, HReg rD, ARM64AMode* amode ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_LdSt16; + i->ARM64in.LdSt16.isLoad = isLoad; + i->ARM64in.LdSt16.rD = rD; + i->ARM64in.LdSt16.amode = amode; + return i; +} +ARM64Instr* ARM64Instr_LdSt8 ( Bool isLoad, HReg rD, ARM64AMode* amode ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_LdSt8; + i->ARM64in.LdSt8.isLoad = isLoad; + i->ARM64in.LdSt8.rD = rD; + i->ARM64in.LdSt8.amode = amode; + return i; +} +ARM64Instr* ARM64Instr_XDirect ( Addr64 dstGA, ARM64AMode* amPC, + ARM64CondCode cond, Bool toFastEP ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_XDirect; + i->ARM64in.XDirect.dstGA = dstGA; + i->ARM64in.XDirect.amPC = amPC; + i->ARM64in.XDirect.cond = cond; + i->ARM64in.XDirect.toFastEP = toFastEP; + return i; +} +ARM64Instr* ARM64Instr_XIndir ( HReg dstGA, ARM64AMode* amPC, + ARM64CondCode cond ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_XIndir; + i->ARM64in.XIndir.dstGA = dstGA; + i->ARM64in.XIndir.amPC = amPC; + i->ARM64in.XIndir.cond = cond; + return i; +} +ARM64Instr* ARM64Instr_XAssisted ( HReg dstGA, ARM64AMode* amPC, + ARM64CondCode cond, IRJumpKind jk ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_XAssisted; + i->ARM64in.XAssisted.dstGA = dstGA; + i->ARM64in.XAssisted.amPC = amPC; + i->ARM64in.XAssisted.cond = cond; + i->ARM64in.XAssisted.jk = jk; + return i; +} +ARM64Instr* ARM64Instr_CSel ( HReg dst, HReg argL, HReg argR, + ARM64CondCode cond ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_CSel; + i->ARM64in.CSel.dst = dst; + i->ARM64in.CSel.argL = argL; + i->ARM64in.CSel.argR = argR; + i->ARM64in.CSel.cond = cond; + return i; +} +ARM64Instr* ARM64Instr_Call ( ARM64CondCode cond, HWord target, Int nArgRegs, + RetLoc rloc ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + 
i->tag = ARM64in_Call; + i->ARM64in.Call.cond = cond; + i->ARM64in.Call.target = target; + i->ARM64in.Call.nArgRegs = nArgRegs; + i->ARM64in.Call.rloc = rloc; + vassert(is_sane_RetLoc(rloc)); + return i; +} +extern ARM64Instr* ARM64Instr_AddToSP ( Int simm ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_AddToSP; + i->ARM64in.AddToSP.simm = simm; + vassert(-4096 < simm && simm < 4096); + vassert(0 == (simm & 0xF)); + return i; +} +extern ARM64Instr* ARM64Instr_FromSP ( HReg dst ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_FromSP; + i->ARM64in.FromSP.dst = dst; + return i; +} +ARM64Instr* ARM64Instr_Mul ( HReg dst, HReg argL, HReg argR, + ARM64MulOp op ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_Mul; + i->ARM64in.Mul.dst = dst; + i->ARM64in.Mul.argL = argL; + i->ARM64in.Mul.argR = argR; + i->ARM64in.Mul.op = op; + return i; +} +ARM64Instr* ARM64Instr_LdrEX ( Int szB ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_LdrEX; + i->ARM64in.LdrEX.szB = szB; + vassert(szB == 8 || szB == 4 || szB == 2 || szB == 1); + return i; +} +ARM64Instr* ARM64Instr_StrEX ( Int szB ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_StrEX; + i->ARM64in.StrEX.szB = szB; + vassert(szB == 8 || szB == 4 || szB == 2 || szB == 1); + return i; +} +ARM64Instr* ARM64Instr_MFence ( void ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_MFence; + return i; +} +//ZZ ARM64Instr* ARM64Instr_CLREX( void ) { +//ZZ ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); +//ZZ i->tag = ARM64in_CLREX; +//ZZ return i; +//ZZ } +ARM64Instr* ARM64Instr_VLdStS ( Bool isLoad, HReg sD, HReg rN, UInt uimm12 ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VLdStS; + i->ARM64in.VLdStS.isLoad = isLoad; + i->ARM64in.VLdStS.sD = sD; + i->ARM64in.VLdStS.rN = rN; + i->ARM64in.VLdStS.uimm12 = uimm12; + vassert(uimm12 < 16384 && 0 == (uimm12 & 3)); + return i; +} +ARM64Instr* ARM64Instr_VLdStD ( Bool isLoad, HReg dD, HReg rN, UInt uimm12 ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VLdStD; + i->ARM64in.VLdStD.isLoad = isLoad; + i->ARM64in.VLdStD.dD = dD; + i->ARM64in.VLdStD.rN = rN; + i->ARM64in.VLdStD.uimm12 = uimm12; + vassert(uimm12 < 32768 && 0 == (uimm12 & 7)); + return i; +} +ARM64Instr* ARM64Instr_VLdStQ ( Bool isLoad, HReg rQ, HReg rN ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VLdStQ; + i->ARM64in.VLdStQ.isLoad = isLoad; + i->ARM64in.VLdStQ.rQ = rQ; + i->ARM64in.VLdStQ.rN = rN; + return i; +} +ARM64Instr* ARM64Instr_VCvtI2F ( ARM64CvtOp how, HReg rD, HReg rS ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VCvtI2F; + i->ARM64in.VCvtI2F.how = how; + i->ARM64in.VCvtI2F.rD = rD; + i->ARM64in.VCvtI2F.rS = rS; + return i; +} +ARM64Instr* ARM64Instr_VCvtF2I ( ARM64CvtOp how, HReg rD, HReg rS, + UChar armRM ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VCvtF2I; + i->ARM64in.VCvtF2I.how = how; + i->ARM64in.VCvtF2I.rD = rD; + i->ARM64in.VCvtF2I.rS = rS; + i->ARM64in.VCvtF2I.armRM = armRM; + vassert(armRM <= 3); + return i; +} +ARM64Instr* ARM64Instr_VCvtSD ( Bool sToD, HReg dst, HReg src ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VCvtSD; + i->ARM64in.VCvtSD.sToD = sToD; + i->ARM64in.VCvtSD.dst = dst; + i->ARM64in.VCvtSD.src = src; + return i; +} +ARM64Instr* ARM64Instr_VUnaryD ( ARM64FpUnaryOp op, HReg dst, HReg src ) { + 
ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VUnaryD; + i->ARM64in.VUnaryD.op = op; + i->ARM64in.VUnaryD.dst = dst; + i->ARM64in.VUnaryD.src = src; + return i; +} +ARM64Instr* ARM64Instr_VUnaryS ( ARM64FpUnaryOp op, HReg dst, HReg src ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VUnaryS; + i->ARM64in.VUnaryS.op = op; + i->ARM64in.VUnaryS.dst = dst; + i->ARM64in.VUnaryS.src = src; + return i; +} +ARM64Instr* ARM64Instr_VBinD ( ARM64FpBinOp op, + HReg dst, HReg argL, HReg argR ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VBinD; + i->ARM64in.VBinD.op = op; + i->ARM64in.VBinD.dst = dst; + i->ARM64in.VBinD.argL = argL; + i->ARM64in.VBinD.argR = argR; + return i; +} +ARM64Instr* ARM64Instr_VBinS ( ARM64FpBinOp op, + HReg dst, HReg argL, HReg argR ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VBinS; + i->ARM64in.VBinS.op = op; + i->ARM64in.VBinS.dst = dst; + i->ARM64in.VBinS.argL = argL; + i->ARM64in.VBinS.argR = argR; + return i; +} +ARM64Instr* ARM64Instr_VCmpD ( HReg argL, HReg argR ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VCmpD; + i->ARM64in.VCmpD.argL = argL; + i->ARM64in.VCmpD.argR = argR; + return i; +} +ARM64Instr* ARM64Instr_VCmpS ( HReg argL, HReg argR ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VCmpS; + i->ARM64in.VCmpS.argL = argL; + i->ARM64in.VCmpS.argR = argR; + return i; +} +ARM64Instr* ARM64Instr_FPCR ( Bool toFPCR, HReg iReg ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_FPCR; + i->ARM64in.FPCR.toFPCR = toFPCR; + i->ARM64in.FPCR.iReg = iReg; + return i; +} +ARM64Instr* ARM64Instr_VBinV ( ARM64VecBinOp op, + HReg dst, HReg argL, HReg argR ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VBinV; + i->ARM64in.VBinV.op = op; + i->ARM64in.VBinV.dst = dst; + i->ARM64in.VBinV.argL = argL; + i->ARM64in.VBinV.argR = argR; + return i; +} +ARM64Instr* ARM64Instr_VUnaryV ( ARM64VecUnaryOp op, HReg dst, HReg arg ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VUnaryV; + i->ARM64in.VUnaryV.op = op; + i->ARM64in.VUnaryV.dst = dst; + i->ARM64in.VUnaryV.arg = arg; + return i; +} +ARM64Instr* ARM64Instr_VNarrowV ( UInt dszBlg2, HReg dst, HReg src ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VNarrowV; + i->ARM64in.VNarrowV.dszBlg2 = dszBlg2; + i->ARM64in.VNarrowV.dst = dst; + i->ARM64in.VNarrowV.src = src; + vassert(dszBlg2 == 0 || dszBlg2 == 1 || dszBlg2 == 2); + return i; +} +ARM64Instr* ARM64Instr_VShiftImmV ( ARM64VecShiftOp op, + HReg dst, HReg src, UInt amt ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VShiftImmV; + i->ARM64in.VShiftImmV.op = op; + i->ARM64in.VShiftImmV.dst = dst; + i->ARM64in.VShiftImmV.src = src; + i->ARM64in.VShiftImmV.amt = amt; + UInt maxSh = 0; + switch (op) { + case ARM64vecsh_USHR64x2: case ARM64vecsh_SSHR64x2: + case ARM64vecsh_SHL64x2: + maxSh = 63; break; + case ARM64vecsh_USHR32x4: case ARM64vecsh_SSHR32x4: + case ARM64vecsh_SHL32x4: + maxSh = 31; break; + case ARM64vecsh_USHR16x8: case ARM64vecsh_SSHR16x8: + case ARM64vecsh_SHL16x8: + maxSh = 15; break; + case ARM64vecsh_USHR8x16: case ARM64vecsh_SSHR8x16: + case ARM64vecsh_SHL8x16: + maxSh = 7; break; + default: + vassert(0); + } + vassert(maxSh > 0); + vassert(amt > 0 && amt <= maxSh); + return i; +} +//ZZ ARMInstr* ARMInstr_VAluS ( ARMVfpOp op, HReg dst, HReg argL, HReg argR ) { +//ZZ ARMInstr* i = 
LibVEX_Alloc(sizeof(ARMInstr)); +//ZZ i->tag = ARMin_VAluS; +//ZZ i->ARMin.VAluS.op = op; +//ZZ i->ARMin.VAluS.dst = dst; +//ZZ i->ARMin.VAluS.argL = argL; +//ZZ i->ARMin.VAluS.argR = argR; +//ZZ return i; +//ZZ } +//ZZ ARMInstr* ARMInstr_VCMovD ( ARMCondCode cond, HReg dst, HReg src ) { +//ZZ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr)); +//ZZ i->tag = ARMin_VCMovD; +//ZZ i->ARMin.VCMovD.cond = cond; +//ZZ i->ARMin.VCMovD.dst = dst; +//ZZ i->ARMin.VCMovD.src = src; +//ZZ vassert(cond != ARMcc_AL); +//ZZ return i; +//ZZ } +//ZZ ARMInstr* ARMInstr_VCMovS ( ARMCondCode cond, HReg dst, HReg src ) { +//ZZ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr)); +//ZZ i->tag = ARMin_VCMovS; +//ZZ i->ARMin.VCMovS.cond = cond; +//ZZ i->ARMin.VCMovS.dst = dst; +//ZZ i->ARMin.VCMovS.src = src; +//ZZ vassert(cond != ARMcc_AL); +//ZZ return i; +//ZZ } +//ZZ ARMInstr* ARMInstr_VXferD ( Bool toD, HReg dD, HReg rHi, HReg rLo ) { +//ZZ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr)); +//ZZ i->tag = ARMin_VXferD; +//ZZ i->ARMin.VXferD.toD = toD; +//ZZ i->ARMin.VXferD.dD = dD; +//ZZ i->ARMin.VXferD.rHi = rHi; +//ZZ i->ARMin.VXferD.rLo = rLo; +//ZZ return i; +//ZZ } +//ZZ ARMInstr* ARMInstr_VXferS ( Bool toS, HReg fD, HReg rLo ) { +//ZZ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr)); +//ZZ i->tag = ARMin_VXferS; +//ZZ i->ARMin.VXferS.toS = toS; +//ZZ i->ARMin.VXferS.fD = fD; +//ZZ i->ARMin.VXferS.rLo = rLo; +//ZZ return i; +//ZZ } +//ZZ ARMInstr* ARMInstr_VCvtID ( Bool iToD, Bool syned, +//ZZ HReg dst, HReg src ) { +//ZZ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr)); +//ZZ i->tag = ARMin_VCvtID; +//ZZ i->ARMin.VCvtID.iToD = iToD; +//ZZ i->ARMin.VCvtID.syned = syned; +//ZZ i->ARMin.VCvtID.dst = dst; +//ZZ i->ARMin.VCvtID.src = src; +//ZZ return i; +//ZZ } +//ZZ ARMInstr* ARMInstr_NLdStD ( Bool isLoad, HReg dD, ARMAModeN *amode ) { +//ZZ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr)); +//ZZ i->tag = ARMin_NLdStD; +//ZZ i->ARMin.NLdStD.isLoad = isLoad; +//ZZ i->ARMin.NLdStD.dD = dD; +//ZZ i->ARMin.NLdStD.amode = amode; +//ZZ return i; +//ZZ } +//ZZ +//ZZ ARMInstr* ARMInstr_NUnary ( ARMNeonUnOp op, HReg dQ, HReg nQ, +//ZZ UInt size, Bool Q ) { +//ZZ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr)); +//ZZ i->tag = ARMin_NUnary; +//ZZ i->ARMin.NUnary.op = op; +//ZZ i->ARMin.NUnary.src = nQ; +//ZZ i->ARMin.NUnary.dst = dQ; +//ZZ i->ARMin.NUnary.size = size; +//ZZ i->ARMin.NUnary.Q = Q; +//ZZ return i; +//ZZ } +//ZZ +//ZZ ARMInstr* ARMInstr_NUnaryS ( ARMNeonUnOpS op, ARMNRS* dst, ARMNRS* src, +//ZZ UInt size, Bool Q ) { +//ZZ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr)); +//ZZ i->tag = ARMin_NUnaryS; +//ZZ i->ARMin.NUnaryS.op = op; +//ZZ i->ARMin.NUnaryS.src = src; +//ZZ i->ARMin.NUnaryS.dst = dst; +//ZZ i->ARMin.NUnaryS.size = size; +//ZZ i->ARMin.NUnaryS.Q = Q; +//ZZ return i; +//ZZ } +//ZZ +//ZZ ARMInstr* ARMInstr_NDual ( ARMNeonDualOp op, HReg nQ, HReg mQ, +//ZZ UInt size, Bool Q ) { +//ZZ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr)); +//ZZ i->tag = ARMin_NDual; +//ZZ i->ARMin.NDual.op = op; +//ZZ i->ARMin.NDual.arg1 = nQ; +//ZZ i->ARMin.NDual.arg2 = mQ; +//ZZ i->ARMin.NDual.size = size; +//ZZ i->ARMin.NDual.Q = Q; +//ZZ return i; +//ZZ } +//ZZ +//ZZ ARMInstr* ARMInstr_NBinary ( ARMNeonBinOp op, +//ZZ HReg dst, HReg argL, HReg argR, +//ZZ UInt size, Bool Q ) { +//ZZ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr)); +//ZZ i->tag = ARMin_NBinary; +//ZZ i->ARMin.NBinary.op = op; +//ZZ i->ARMin.NBinary.argL = argL; +//ZZ i->ARMin.NBinary.argR = argR; +//ZZ i->ARMin.NBinary.dst = dst; +//ZZ i->ARMin.NBinary.size = size; +//ZZ 
i->ARMin.NBinary.Q = Q; +//ZZ return i; +//ZZ } + +ARM64Instr* ARM64Instr_VImmQ (HReg rQ, UShort imm) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VImmQ; + i->ARM64in.VImmQ.rQ = rQ; + i->ARM64in.VImmQ.imm = imm; + return i; +} +ARM64Instr* ARM64Instr_VDfromX ( HReg rD, HReg rX ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VDfromX; + i->ARM64in.VDfromX.rD = rD; + i->ARM64in.VDfromX.rX = rX; + return i; +} +ARM64Instr* ARM64Instr_VQfromXX ( HReg rQ, HReg rXhi, HReg rXlo ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VQfromXX; + i->ARM64in.VQfromXX.rQ = rQ; + i->ARM64in.VQfromXX.rXhi = rXhi; + i->ARM64in.VQfromXX.rXlo = rXlo; + return i; +} +ARM64Instr* ARM64Instr_VXfromQ ( HReg rX, HReg rQ, UInt laneNo ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VXfromQ; + i->ARM64in.VXfromQ.rX = rX; + i->ARM64in.VXfromQ.rQ = rQ; + i->ARM64in.VXfromQ.laneNo = laneNo; + vassert(laneNo <= 1); + return i; +} +ARM64Instr* ARM64Instr_VMov ( UInt szB, HReg dst, HReg src ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VMov; + i->ARM64in.VMov.szB = szB; + i->ARM64in.VMov.dst = dst; + i->ARM64in.VMov.src = src; + switch (szB) { + case 16: + vassert(hregClass(src) == HRcVec128); + vassert(hregClass(dst) == HRcVec128); + break; + case 8: + vassert(hregClass(src) == HRcFlt64); + vassert(hregClass(dst) == HRcFlt64); + break; + default: + vpanic("ARM64Instr_VMov"); + } + return i; +} + +//ZZ ARMInstr* ARMInstr_NCMovQ ( ARMCondCode cond, HReg dst, HReg src ) { +//ZZ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr)); +//ZZ i->tag = ARMin_NCMovQ; +//ZZ i->ARMin.NCMovQ.cond = cond; +//ZZ i->ARMin.NCMovQ.dst = dst; +//ZZ i->ARMin.NCMovQ.src = src; +//ZZ vassert(cond != ARMcc_AL); +//ZZ return i; +//ZZ } +//ZZ +//ZZ ARMInstr* ARMInstr_NShift ( ARMNeonShiftOp op, +//ZZ HReg dst, HReg argL, HReg argR, +//ZZ UInt size, Bool Q ) { +//ZZ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr)); +//ZZ i->tag = ARMin_NShift; +//ZZ i->ARMin.NShift.op = op; +//ZZ i->ARMin.NShift.argL = argL; +//ZZ i->ARMin.NShift.argR = argR; +//ZZ i->ARMin.NShift.dst = dst; +//ZZ i->ARMin.NShift.size = size; +//ZZ i->ARMin.NShift.Q = Q; +//ZZ return i; +//ZZ } +//ZZ +//ZZ ARMInstr* ARMInstr_NShl64 ( HReg dst, HReg src, UInt amt ) +//ZZ { +//ZZ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr)); +//ZZ i->tag = ARMin_NShl64; +//ZZ i->ARMin.NShl64.dst = dst; +//ZZ i->ARMin.NShl64.src = src; +//ZZ i->ARMin.NShl64.amt = amt; +//ZZ vassert(amt >= 1 && amt <= 63); +//ZZ return i; +//ZZ } +//ZZ +//ZZ /* Helper copy-pasted from isel.c */ +//ZZ static Bool fitsIn8x4 ( UInt* u8, UInt* u4, UInt u ) +//ZZ { +//ZZ UInt i; +//ZZ for (i = 0; i < 16; i++) { +//ZZ if (0 == (u & 0xFFFFFF00)) { +//ZZ *u8 = u; +//ZZ *u4 = i; +//ZZ return True; +//ZZ } +//ZZ u = ROR32(u, 30); +//ZZ } +//ZZ vassert(i == 16); +//ZZ return False; +//ZZ } +//ZZ +//ZZ ARMInstr* ARMInstr_Add32 ( HReg rD, HReg rN, UInt imm32 ) { +//ZZ UInt u8, u4; +//ZZ ARMInstr *i = LibVEX_Alloc(sizeof(ARMInstr)); +//ZZ /* Try to generate single ADD if possible */ +//ZZ if (fitsIn8x4(&u8, &u4, imm32)) { +//ZZ i->tag = ARMin_Alu; +//ZZ i->ARMin.Alu.op = ARMalu_ADD; +//ZZ i->ARMin.Alu.dst = rD; +//ZZ i->ARMin.Alu.argL = rN; +//ZZ i->ARMin.Alu.argR = ARMRI84_I84(u8, u4); +//ZZ } else { +//ZZ i->tag = ARMin_Add32; +//ZZ i->ARMin.Add32.rD = rD; +//ZZ i->ARMin.Add32.rN = rN; +//ZZ i->ARMin.Add32.imm32 = imm32; +//ZZ } +//ZZ return i; +//ZZ } + +ARM64Instr* ARM64Instr_EvCheck ( ARM64AMode* amCounter, + 
ARM64AMode* amFailAddr ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_EvCheck; + i->ARM64in.EvCheck.amCounter = amCounter; + i->ARM64in.EvCheck.amFailAddr = amFailAddr; + return i; +} + +//ZZ ARMInstr* ARMInstr_ProfInc ( void ) { +//ZZ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr)); +//ZZ i->tag = ARMin_ProfInc; +//ZZ return i; +//ZZ } + +/* ... */ + +void ppARM64Instr ( ARM64Instr* i ) { + switch (i->tag) { + case ARM64in_Arith: + vex_printf("%s ", i->ARM64in.Arith.isAdd ? "add" : "sub"); + ppHRegARM64(i->ARM64in.Arith.dst); + vex_printf(", "); + ppHRegARM64(i->ARM64in.Arith.argL); + vex_printf(", "); + ppARM64RIA(i->ARM64in.Arith.argR); + return; + case ARM64in_Cmp: + vex_printf("cmp%s ", i->ARM64in.Cmp.is64 ? " " : "(w)" ); + ppHRegARM64(i->ARM64in.Cmp.argL); + vex_printf(", "); + ppARM64RIA(i->ARM64in.Cmp.argR); + return; + case ARM64in_Logic: + vex_printf("%s ", showARM64LogicOp(i->ARM64in.Logic.op)); + ppHRegARM64(i->ARM64in.Logic.dst); + vex_printf(", "); + ppHRegARM64(i->ARM64in.Logic.argL); + vex_printf(", "); + ppARM64RIL(i->ARM64in.Logic.argR); + return; + case ARM64in_Test: + vex_printf("tst "); + ppHRegARM64(i->ARM64in.Test.argL); + vex_printf(", "); + ppARM64RIL(i->ARM64in.Test.argR); + return; + case ARM64in_Shift: + vex_printf("%s ", showARM64ShiftOp(i->ARM64in.Shift.op)); + ppHRegARM64(i->ARM64in.Shift.dst); + vex_printf(", "); + ppHRegARM64(i->ARM64in.Shift.argL); + vex_printf(", "); + ppARM64RI6(i->ARM64in.Shift.argR); + return; + case ARM64in_Unary: + vex_printf("%s ", showARM64UnaryOp(i->ARM64in.Unary.op)); + ppHRegARM64(i->ARM64in.Unary.dst); + vex_printf(", "); + ppHRegARM64(i->ARM64in.Unary.src); + return; + case ARM64in_MovI: + vex_printf("mov "); + ppHRegARM64(i->ARM64in.MovI.dst); + vex_printf(", "); + ppHRegARM64(i->ARM64in.MovI.src); + return; + case ARM64in_Imm64: + vex_printf("imm64 "); + ppHRegARM64(i->ARM64in.Imm64.dst); + vex_printf(", 0x%llx", i->ARM64in.Imm64.imm64); + return; + case ARM64in_LdSt64: + if (i->ARM64in.LdSt64.isLoad) { + vex_printf("ldr "); + ppHRegARM64(i->ARM64in.LdSt64.rD); + vex_printf(", "); + ppARM64AMode(i->ARM64in.LdSt64.amode); + } else { + vex_printf("str "); + ppARM64AMode(i->ARM64in.LdSt64.amode); + vex_printf(", "); + ppHRegARM64(i->ARM64in.LdSt64.rD); + } + return; + case ARM64in_LdSt32: + if (i->ARM64in.LdSt32.isLoad) { + vex_printf("ldruw "); + ppHRegARM64(i->ARM64in.LdSt32.rD); + vex_printf(", "); + ppARM64AMode(i->ARM64in.LdSt32.amode); + } else { + vex_printf("strw "); + ppARM64AMode(i->ARM64in.LdSt32.amode); + vex_printf(", "); + ppHRegARM64(i->ARM64in.LdSt32.rD); + } + return; + case ARM64in_LdSt16: + if (i->ARM64in.LdSt16.isLoad) { + vex_printf("ldruh "); + ppHRegARM64(i->ARM64in.LdSt16.rD); + vex_printf(", "); + ppARM64AMode(i->ARM64in.LdSt16.amode); + } else { + vex_printf("strh "); + ppARM64AMode(i->ARM64in.LdSt16.amode); + vex_printf(", "); + ppHRegARM64(i->ARM64in.LdSt16.rD); + } + return; + case ARM64in_LdSt8: + if (i->ARM64in.LdSt8.isLoad) { + vex_printf("ldrub "); + ppHRegARM64(i->ARM64in.LdSt8.rD); + vex_printf(", "); + ppARM64AMode(i->ARM64in.LdSt8.amode); + } else { + vex_printf("strb "); + ppARM64AMode(i->ARM64in.LdSt8.amode); + vex_printf(", "); + ppHRegARM64(i->ARM64in.LdSt8.rD); + } + return; + case ARM64in_XDirect: + vex_printf("(xDirect) "); + vex_printf("if (%%pstate.%s) { ", + showARM64CondCode(i->ARM64in.XDirect.cond)); + vex_printf("imm64 x9,0x%llx; ", i->ARM64in.XDirect.dstGA); + vex_printf("str x9,"); + ppARM64AMode(i->ARM64in.XDirect.amPC); + vex_printf("; 
imm64-exactly4 x9,$disp_cp_chain_me_to_%sEP; ", + i->ARM64in.XDirect.toFastEP ? "fast" : "slow"); + vex_printf("blr x9 }"); + return; + case ARM64in_XIndir: + vex_printf("(xIndir) "); + vex_printf("if (%%pstate.%s) { ", + showARM64CondCode(i->ARM64in.XIndir.cond)); + vex_printf("str "); + ppHRegARM64(i->ARM64in.XIndir.dstGA); + vex_printf(","); + ppARM64AMode(i->ARM64in.XIndir.amPC); + vex_printf("; imm64 x9,$disp_cp_xindir; "); + vex_printf("br x9 }"); + return; + case ARM64in_XAssisted: + vex_printf("(xAssisted) "); + vex_printf("if (%%pstate.%s) { ", + showARM64CondCode(i->ARM64in.XAssisted.cond)); + vex_printf("str "); + ppHRegARM64(i->ARM64in.XAssisted.dstGA); + vex_printf(","); + ppARM64AMode(i->ARM64in.XAssisted.amPC); + vex_printf("; movw x21,$IRJumpKind_to_TRCVAL(%d); ", + (Int)i->ARM64in.XAssisted.jk); + vex_printf("imm64 x9,$disp_cp_xassisted; "); + vex_printf("br x9 }"); + return; + case ARM64in_CSel: + vex_printf("csel "); + ppHRegARM64(i->ARM64in.CSel.dst); + vex_printf(", "); + ppHRegARM64(i->ARM64in.CSel.argL); + vex_printf(", "); + ppHRegARM64(i->ARM64in.CSel.argR); + vex_printf(", %s", showARM64CondCode(i->ARM64in.CSel.cond)); + return; + case ARM64in_Call: + vex_printf("call%s ", + i->ARM64in.Call.cond==ARM64cc_AL + ? " " : showARM64CondCode(i->ARM64in.Call.cond)); + vex_printf("0x%lx [nArgRegs=%d, ", + i->ARM64in.Call.target, i->ARM64in.Call.nArgRegs); + ppRetLoc(i->ARM64in.Call.rloc); + vex_printf("]"); + return; + case ARM64in_AddToSP: { + Int simm = i->ARM64in.AddToSP.simm; + vex_printf("%s xsp, xsp, #%d", simm < 0 ? "sub" : "add", + simm < 0 ? -simm : simm); + return; + } + case ARM64in_FromSP: + vex_printf("mov "); + ppHRegARM64(i->ARM64in.FromSP.dst); + vex_printf(", xsp"); + return; + case ARM64in_Mul: + vex_printf("%s ", showARM64MulOp(i->ARM64in.Mul.op)); + ppHRegARM64(i->ARM64in.Mul.dst); + vex_printf(", "); + ppHRegARM64(i->ARM64in.Mul.argL); + vex_printf(", "); + ppHRegARM64(i->ARM64in.Mul.argR); + return; + + case ARM64in_LdrEX: { + const HChar* sz = " "; + switch (i->ARM64in.LdrEX.szB) { + case 1: sz = "b"; break; + case 2: sz = "h"; break; + case 4: case 8: break; + default: vassert(0); + } + vex_printf("ldxr%s %c2, [x4]", + sz, i->ARM64in.LdrEX.szB == 8 ? 'x' : 'w'); + return; + } + case ARM64in_StrEX: { + const HChar* sz = " "; + switch (i->ARM64in.StrEX.szB) { + case 1: sz = "b"; break; + case 2: sz = "h"; break; + case 4: case 8: break; + default: vassert(0); + } + vex_printf("stxr%s w0, %c2, [x4]", + sz, i->ARM64in.StrEX.szB == 8 ? 
'x' : 'w'); + return; + } + case ARM64in_MFence: + vex_printf("(mfence) dsb sy; dmb sy; isb"); + return; +//ZZ case ARM64in_CLREX: +//ZZ vex_printf("clrex"); +//ZZ return; + case ARM64in_VLdStS: + if (i->ARM64in.VLdStS.isLoad) { + vex_printf("ldr "); + ppHRegARM64asSreg(i->ARM64in.VLdStS.sD); + vex_printf(", %u(", i->ARM64in.VLdStS.uimm12); + ppHRegARM64(i->ARM64in.VLdStS.rN); + vex_printf(")"); + } else { + vex_printf("str "); + vex_printf("%u(", i->ARM64in.VLdStS.uimm12); + ppHRegARM64(i->ARM64in.VLdStS.rN); + vex_printf("), "); + ppHRegARM64asSreg(i->ARM64in.VLdStS.sD); + } + return; + case ARM64in_VLdStD: + if (i->ARM64in.VLdStD.isLoad) { + vex_printf("ldr "); + ppHRegARM64(i->ARM64in.VLdStD.dD); + vex_printf(", %u(", i->ARM64in.VLdStD.uimm12); + ppHRegARM64(i->ARM64in.VLdStD.rN); + vex_printf(")"); + } else { + vex_printf("str "); + vex_printf("%u(", i->ARM64in.VLdStD.uimm12); + ppHRegARM64(i->ARM64in.VLdStD.rN); + vex_printf("), "); + ppHRegARM64(i->ARM64in.VLdStD.dD); + } + return; + case ARM64in_VLdStQ: + if (i->ARM64in.VLdStQ.isLoad) + vex_printf("ld1.2d {"); + else + vex_printf("st1.2d {"); + ppHRegARM64(i->ARM64in.VLdStQ.rQ); + vex_printf("}, ["); + ppHRegARM64(i->ARM64in.VLdStQ.rN); + vex_printf("]"); + return; + case ARM64in_VCvtI2F: { + HChar syn = '?'; + UInt fszB = 0; + UInt iszB = 0; + characteriseARM64CvtOp(&syn, &fszB, &iszB, i->ARM64in.VCvtI2F.how); + vex_printf("%ccvtf ", syn); + ppHRegARM64(i->ARM64in.VCvtI2F.rD); + vex_printf("(%c-reg), ", fszB == 4 ? 'S' : 'D'); + ppHRegARM64(i->ARM64in.VCvtI2F.rS); + vex_printf("(%c-reg)", iszB == 4 ? 'W' : 'X'); + return; + } + case ARM64in_VCvtF2I: { + HChar syn = '?'; + UInt fszB = 0; + UInt iszB = 0; + HChar rmo = '?'; + characteriseARM64CvtOp(&syn, &fszB, &iszB, i->ARM64in.VCvtF2I.how); + UChar armRM = i->ARM64in.VCvtF2I.armRM; + if (armRM < 4) rmo = "npmz"[armRM]; + vex_printf("fcvt%c%c ", rmo, syn); + ppHRegARM64(i->ARM64in.VCvtF2I.rD); + vex_printf("(%c-reg), ", iszB == 4 ? 'W' : 'X'); + ppHRegARM64(i->ARM64in.VCvtF2I.rS); + vex_printf("(%c-reg)", fszB == 4 ? 'S' : 'D'); + return; + } + case ARM64in_VCvtSD: + vex_printf("fcvt%s ", i->ARM64in.VCvtSD.sToD ? 
"s2d" : "d2s"); + if (i->ARM64in.VCvtSD.sToD) { + ppHRegARM64(i->ARM64in.VCvtSD.dst); + vex_printf(", "); + ppHRegARM64asSreg(i->ARM64in.VCvtSD.src); + } else { + ppHRegARM64asSreg(i->ARM64in.VCvtSD.dst); + vex_printf(", "); + ppHRegARM64(i->ARM64in.VCvtSD.src); + } + return; + case ARM64in_VUnaryD: + vex_printf("f%s ", showARM64FpUnaryOp(i->ARM64in.VUnaryD.op)); + ppHRegARM64(i->ARM64in.VUnaryD.dst); + vex_printf(", "); + ppHRegARM64(i->ARM64in.VUnaryD.src); + return; + case ARM64in_VUnaryS: + vex_printf("f%s ", showARM64FpUnaryOp(i->ARM64in.VUnaryS.op)); + ppHRegARM64asSreg(i->ARM64in.VUnaryS.dst); + vex_printf(", "); + ppHRegARM64asSreg(i->ARM64in.VUnaryS.src); + return; + case ARM64in_VBinD: + vex_printf("f%s ", showARM64FpBinOp(i->ARM64in.VBinD.op)); + ppHRegARM64(i->ARM64in.VBinD.dst); + vex_printf(", "); + ppHRegARM64(i->ARM64in.VBinD.argL); + vex_printf(", "); + ppHRegARM64(i->ARM64in.VBinD.argR); + return; + case ARM64in_VBinS: + vex_printf("f%s ", showARM64FpBinOp(i->ARM64in.VBinS.op)); + ppHRegARM64asSreg(i->ARM64in.VBinS.dst); + vex_printf(", "); + ppHRegARM64asSreg(i->ARM64in.VBinS.argL); + vex_printf(", "); + ppHRegARM64asSreg(i->ARM64in.VBinS.argR); + return; + case ARM64in_VCmpD: + vex_printf("fcmp "); + ppHRegARM64(i->ARM64in.VCmpD.argL); + vex_printf(", "); + ppHRegARM64(i->ARM64in.VCmpD.argR); + return; + case ARM64in_VCmpS: + vex_printf("fcmp "); + ppHRegARM64asSreg(i->ARM64in.VCmpS.argL); + vex_printf(", "); + ppHRegARM64asSreg(i->ARM64in.VCmpS.argR); + return; + case ARM64in_FPCR: + if (i->ARM64in.FPCR.toFPCR) { + vex_printf("msr fpcr, "); + ppHRegARM64(i->ARM64in.FPCR.iReg); + } else { + vex_printf("mrs "); + ppHRegARM64(i->ARM64in.FPCR.iReg); + vex_printf(", fpcr"); + } + return; + case ARM64in_VBinV: { + const HChar* nm = "??"; + const HChar* ar = "??"; + showARM64VecBinOp(&nm, &ar, i->ARM64in.VBinV.op); + vex_printf("%s ", nm); + ppHRegARM64(i->ARM64in.VBinV.dst); + vex_printf(".%s, ", ar); + ppHRegARM64(i->ARM64in.VBinV.argL); + vex_printf(".%s, ", ar); + ppHRegARM64(i->ARM64in.VBinV.argR); + vex_printf(".%s", ar); + return; + } + case ARM64in_VUnaryV: { + const HChar* nm = "??"; + const HChar* ar = "??"; + showARM64VecUnaryOp(&nm, &ar, i->ARM64in.VUnaryV.op); + vex_printf("%s ", nm); + ppHRegARM64(i->ARM64in.VUnaryV.dst); + vex_printf(".%s, ", ar); + ppHRegARM64(i->ARM64in.VUnaryV.arg); + vex_printf(".%s", ar); + return; + } + case ARM64in_VNarrowV: { + UInt dszBlg2 = i->ARM64in.VNarrowV.dszBlg2; + const HChar* darr[3] = { "8b", "4h", "2s" }; + const HChar* sarr[3] = { "8h", "4s", "2d" }; + vex_printf("xtn "); + ppHRegARM64(i->ARM64in.VNarrowV.dst); + vex_printf(".%s, ", dszBlg2 < 3 ? darr[dszBlg2] : "??"); + ppHRegARM64(i->ARM64in.VNarrowV.src); + vex_printf(".%s", dszBlg2 < 3 ? 
sarr[dszBlg2] : "??"); + return; + } + case ARM64in_VShiftImmV: { + const HChar* nm = "??"; + const HChar* ar = "??"; + showARM64VecShiftOp(&nm, &ar, i->ARM64in.VShiftImmV.op); + vex_printf("%s ", nm); + ppHRegARM64(i->ARM64in.VShiftImmV.dst); + vex_printf(".%s, ", ar); + ppHRegARM64(i->ARM64in.VShiftImmV.src); + vex_printf(".%s, #%u", ar, i->ARM64in.VShiftImmV.amt); + return; + } +//ZZ case ARMin_VAluS: +//ZZ vex_printf("f%-3ss ", showARMVfpOp(i->ARMin.VAluS.op)); +//ZZ ppHRegARM(i->ARMin.VAluS.dst); +//ZZ vex_printf(", "); +//ZZ ppHRegARM(i->ARMin.VAluS.argL); +//ZZ vex_printf(", "); +//ZZ ppHRegARM(i->ARMin.VAluS.argR); +//ZZ return; +//ZZ case ARMin_VCMovD: +//ZZ vex_printf("fcpyd%s ", showARMCondCode(i->ARMin.VCMovD.cond)); +//ZZ ppHRegARM(i->ARMin.VCMovD.dst); +//ZZ vex_printf(", "); +//ZZ ppHRegARM(i->ARMin.VCMovD.src); +//ZZ return; +//ZZ case ARMin_VCMovS: +//ZZ vex_printf("fcpys%s ", showARMCondCode(i->ARMin.VCMovS.cond)); +//ZZ ppHRegARM(i->ARMin.VCMovS.dst); +//ZZ vex_printf(", "); +//ZZ ppHRegARM(i->ARMin.VCMovS.src); +//ZZ return; +//ZZ case ARMin_VXferD: +//ZZ vex_printf("vmov "); +//ZZ if (i->ARMin.VXferD.toD) { +//ZZ ppHRegARM(i->ARMin.VXferD.dD); +//ZZ vex_printf(", "); +//ZZ ppHRegARM(i->ARMin.VXferD.rLo); +//ZZ vex_printf(", "); +//ZZ ppHRegARM(i->ARMin.VXferD.rHi); +//ZZ } else { +//ZZ ppHRegARM(i->ARMin.VXferD.rLo); +//ZZ vex_printf(", "); +//ZZ ppHRegARM(i->ARMin.VXferD.rHi); +//ZZ vex_printf(", "); +//ZZ ppHRegARM(i->ARMin.VXferD.dD); +//ZZ } +//ZZ return; +//ZZ case ARMin_VXferS: +//ZZ vex_printf("vmov "); +//ZZ if (i->ARMin.VXferS.toS) { +//ZZ ppHRegARM(i->ARMin.VXferS.fD); +//ZZ vex_printf(", "); +//ZZ ppHRegARM(i->ARMin.VXferS.rLo); +//ZZ } else { +//ZZ ppHRegARM(i->ARMin.VXferS.rLo); +//ZZ vex_printf(", "); +//ZZ ppHRegARM(i->ARMin.VXferS.fD); +//ZZ } +//ZZ return; +//ZZ case ARMin_VCvtID: { +//ZZ const HChar* nm = "?"; +//ZZ if (i->ARMin.VCvtID.iToD) { +//ZZ nm = i->ARMin.VCvtID.syned ? "fsitod" : "fuitod"; +//ZZ } else { +//ZZ nm = i->ARMin.VCvtID.syned ? 
"ftosid" : "ftouid"; +//ZZ } +//ZZ vex_printf("%s ", nm); +//ZZ ppHRegARM(i->ARMin.VCvtID.dst); +//ZZ vex_printf(", "); +//ZZ ppHRegARM(i->ARMin.VCvtID.src); +//ZZ return; +//ZZ } +//ZZ case ARMin_NLdStD: +//ZZ if (i->ARMin.NLdStD.isLoad) +//ZZ vex_printf("vld1.32 {"); +//ZZ else +//ZZ vex_printf("vst1.32 {"); +//ZZ ppHRegARM(i->ARMin.NLdStD.dD); +//ZZ vex_printf("} "); +//ZZ ppARMAModeN(i->ARMin.NLdStD.amode); +//ZZ return; +//ZZ case ARMin_NUnary: +//ZZ vex_printf("%s%s%s ", +//ZZ showARMNeonUnOp(i->ARMin.NUnary.op), +//ZZ showARMNeonUnOpDataType(i->ARMin.NUnary.op), +//ZZ showARMNeonDataSize(i)); +//ZZ ppHRegARM(i->ARMin.NUnary.dst); +//ZZ vex_printf(", "); +//ZZ ppHRegARM(i->ARMin.NUnary.src); +//ZZ if (i->ARMin.NUnary.op == ARMneon_EQZ) +//ZZ vex_printf(", #0"); +//ZZ if (i->ARMin.NUnary.op == ARMneon_VCVTFtoFixedS || +//ZZ i->ARMin.NUnary.op == ARMneon_VCVTFtoFixedU || +//ZZ i->ARMin.NUnary.op == ARMneon_VCVTFixedStoF || +//ZZ i->ARMin.NUnary.op == ARMneon_VCVTFixedUtoF) { +//ZZ vex_printf(", #%d", i->ARMin.NUnary.size); +//ZZ } +//ZZ if (i->ARMin.NUnary.op == ARMneon_VQSHLNSS || +//ZZ i->ARMin.NUnary.op == ARMneon_VQSHLNUU || +//ZZ i->ARMin.NUnary.op == ARMneon_VQSHLNUS) { +//ZZ UInt size; +//ZZ size = i->ARMin.NUnary.size; +//ZZ if (size & 0x40) { +//ZZ vex_printf(", #%d", size - 64); +//ZZ } else if (size & 0x20) { +//ZZ vex_printf(", #%d", size - 32); +//ZZ } else if (size & 0x10) { +//ZZ vex_printf(", #%d", size - 16); +//ZZ } else if (size & 0x08) { +//ZZ vex_printf(", #%d", size - 8); +//ZZ } +//ZZ } +//ZZ return; +//ZZ case ARMin_NUnaryS: +//ZZ vex_printf("%s%s%s ", +//ZZ showARMNeonUnOpS(i->ARMin.NUnaryS.op), +//ZZ showARMNeonUnOpSDataType(i->ARMin.NUnaryS.op), +//ZZ showARMNeonDataSize(i)); +//ZZ ppARMNRS(i->ARMin.NUnaryS.dst); +//ZZ vex_printf(", "); +//ZZ ppARMNRS(i->ARMin.NUnaryS.src); +//ZZ return; +//ZZ case ARMin_NShift: +//ZZ vex_printf("%s%s%s ", +//ZZ showARMNeonShiftOp(i->ARMin.NShift.op), +//ZZ showARMNeonShiftOpDataType(i->ARMin.NShift.op), +//ZZ showARMNeonDataSize(i)); +//ZZ ppHRegARM(i->ARMin.NShift.dst); +//ZZ vex_printf(", "); +//ZZ ppHRegARM(i->ARMin.NShift.argL); +//ZZ vex_printf(", "); +//ZZ ppHRegARM(i->ARMin.NShift.argR); +//ZZ return; +//ZZ case ARMin_NShl64: +//ZZ vex_printf("vshl.i64 "); +//ZZ ppHRegARM(i->ARMin.NShl64.dst); +//ZZ vex_printf(", "); +//ZZ ppHRegARM(i->ARMin.NShl64.src); +//ZZ vex_printf(", #%u", i->ARMin.NShl64.amt); +//ZZ return; +//ZZ case ARMin_NDual: +//ZZ vex_printf("%s%s%s ", +//ZZ showARMNeonDualOp(i->ARMin.NDual.op), +//ZZ showARMNeonDualOpDataType(i->ARMin.NDual.op), +//ZZ showARMNeonDataSize(i)); +//ZZ ppHRegARM(i->ARMin.NDual.arg1); +//ZZ vex_printf(", "); +//ZZ ppHRegARM(i->ARMin.NDual.arg2); +//ZZ return; +//ZZ case ARMin_NBinary: +//ZZ vex_printf("%s%s%s", +//ZZ showARMNeonBinOp(i->ARMin.NBinary.op), +//ZZ showARMNeonBinOpDataType(i->ARMin.NBinary.op), +//ZZ showARMNeonDataSize(i)); +//ZZ vex_printf(" "); +//ZZ ppHRegARM(i->ARMin.NBinary.dst); +//ZZ vex_printf(", "); +//ZZ ppHRegARM(i->ARMin.NBinary.argL); +//ZZ vex_printf(", "); +//ZZ ppHRegARM(i->ARMin.NBinary.argR); +//ZZ return; + case ARM64in_VImmQ: + vex_printf("qimm "); + ppHRegARM64(i->ARM64in.VImmQ.rQ); + vex_printf(", Bits16toBytes16(0x%x)", (UInt)i->ARM64in.VImmQ.imm); + return; + case ARM64in_VDfromX: + vex_printf("fmov "); + ppHRegARM64(i->ARM64in.VDfromX.rD); + vex_printf(", "); + ppHRegARM64(i->ARM64in.VDfromX.rX); + return; + case ARM64in_VQfromXX: + vex_printf("qFromXX "); + ppHRegARM64(i->ARM64in.VQfromXX.rQ); + vex_printf(", "); + 
ppHRegARM64(i->ARM64in.VQfromXX.rXhi); + vex_printf(", "); + ppHRegARM64(i->ARM64in.VQfromXX.rXlo); + return; + case ARM64in_VXfromQ: + vex_printf("mov "); + ppHRegARM64(i->ARM64in.VXfromQ.rX); + vex_printf(", "); + ppHRegARM64(i->ARM64in.VXfromQ.rQ); + vex_printf(".d[%u]", i->ARM64in.VXfromQ.laneNo); + return; + case ARM64in_VMov: { + UChar aux = '?'; + switch (i->ARM64in.VMov.szB) { + case 16: aux = 'q'; break; + case 8: aux = 'd'; break; + case 4: aux = 's'; break; + default: break; + } + vex_printf("mov(%c) ", aux); + ppHRegARM64(i->ARM64in.VMov.dst); + vex_printf(", "); + ppHRegARM64(i->ARM64in.VMov.src); + return; + } +//ZZ case ARMin_NCMovQ: +//ZZ vex_printf("vmov%s ", showARMCondCode(i->ARMin.NCMovQ.cond)); +//ZZ ppHRegARM(i->ARMin.NCMovQ.dst); +//ZZ vex_printf(", "); +//ZZ ppHRegARM(i->ARMin.NCMovQ.src); +//ZZ return; +//ZZ case ARMin_Add32: +//ZZ vex_printf("add32 "); +//ZZ ppHRegARM(i->ARMin.Add32.rD); +//ZZ vex_printf(", "); +//ZZ ppHRegARM(i->ARMin.Add32.rN); +//ZZ vex_printf(", "); +//ZZ vex_printf("%d", i->ARMin.Add32.imm32); +//ZZ return; + case ARM64in_EvCheck: + vex_printf("(evCheck) ldr w9,"); + ppARM64AMode(i->ARM64in.EvCheck.amCounter); + vex_printf("; subs w9,w9,$1; str w9,"); + ppARM64AMode(i->ARM64in.EvCheck.amCounter); + vex_printf("; bpl nofail; ldr x9,"); + ppARM64AMode(i->ARM64in.EvCheck.amFailAddr); + vex_printf("; br x9; nofail:"); + return; +//ZZ case ARMin_ProfInc: +//ZZ vex_printf("(profInc) movw r12,LO16($NotKnownYet); " +//ZZ "movw r12,HI16($NotKnownYet); " +//ZZ "ldr r11,[r12]; " +//ZZ "adds r11,r11,$1; " +//ZZ "str r11,[r12]; " +//ZZ "ldr r11,[r12+4]; " +//ZZ "adc r11,r11,$0; " +//ZZ "str r11,[r12+4]"); +//ZZ return; + default: + vex_printf("ppARM64Instr: unhandled case (tag %d)", (Int)i->tag); + vpanic("ppARM64Instr(1)"); + return; + } +} + + +/* --------- Helpers for register allocation. 
--------- */ + +void getRegUsage_ARM64Instr ( HRegUsage* u, ARM64Instr* i, Bool mode64 ) +{ + vassert(mode64 == True); + initHRegUsage(u); + switch (i->tag) { + case ARM64in_Arith: + addHRegUse(u, HRmWrite, i->ARM64in.Arith.dst); + addHRegUse(u, HRmRead, i->ARM64in.Arith.argL); + addRegUsage_ARM64RIA(u, i->ARM64in.Arith.argR); + return; + case ARM64in_Cmp: + addHRegUse(u, HRmRead, i->ARM64in.Cmp.argL); + addRegUsage_ARM64RIA(u, i->ARM64in.Cmp.argR); + return; + case ARM64in_Logic: + addHRegUse(u, HRmWrite, i->ARM64in.Logic.dst); + addHRegUse(u, HRmRead, i->ARM64in.Logic.argL); + addRegUsage_ARM64RIL(u, i->ARM64in.Logic.argR); + return; + case ARM64in_Test: + addHRegUse(u, HRmRead, i->ARM64in.Test.argL); + addRegUsage_ARM64RIL(u, i->ARM64in.Test.argR); + return; + case ARM64in_Shift: + addHRegUse(u, HRmWrite, i->ARM64in.Shift.dst); + addHRegUse(u, HRmRead, i->ARM64in.Shift.argL); + addRegUsage_ARM64RI6(u, i->ARM64in.Shift.argR); + return; + case ARM64in_Unary: + addHRegUse(u, HRmWrite, i->ARM64in.Unary.dst); + addHRegUse(u, HRmRead, i->ARM64in.Unary.src); + return; + case ARM64in_MovI: + addHRegUse(u, HRmWrite, i->ARM64in.MovI.dst); + addHRegUse(u, HRmRead, i->ARM64in.MovI.src); + return; + case ARM64in_Imm64: + addHRegUse(u, HRmWrite, i->ARM64in.Imm64.dst); + return; + case ARM64in_LdSt64: + addRegUsage_ARM64AMode(u, i->ARM64in.LdSt64.amode); + if (i->ARM64in.LdSt64.isLoad) { + addHRegUse(u, HRmWrite, i->ARM64in.LdSt64.rD); + } else { + addHRegUse(u, HRmRead, i->ARM64in.LdSt64.rD); + } + return; + case ARM64in_LdSt32: + addRegUsage_ARM64AMode(u, i->ARM64in.LdSt32.amode); + if (i->ARM64in.LdSt32.isLoad) { + addHRegUse(u, HRmWrite, i->ARM64in.LdSt32.rD); + } else { + addHRegUse(u, HRmRead, i->ARM64in.LdSt32.rD); + } + return; + case ARM64in_LdSt16: + addRegUsage_ARM64AMode(u, i->ARM64in.LdSt16.amode); + if (i->ARM64in.LdSt16.isLoad) { + addHRegUse(u, HRmWrite, i->ARM64in.LdSt16.rD); + } else { + addHRegUse(u, HRmRead, i->ARM64in.LdSt16.rD); + } + return; + case ARM64in_LdSt8: + addRegUsage_ARM64AMode(u, i->ARM64in.LdSt8.amode); + if (i->ARM64in.LdSt8.isLoad) { + addHRegUse(u, HRmWrite, i->ARM64in.LdSt8.rD); + } else { + addHRegUse(u, HRmRead, i->ARM64in.LdSt8.rD); + } + return; + /* XDirect/XIndir/XAssisted are also a bit subtle. They + conditionally exit the block. Hence we only need to list (1) + the registers that they read, and (2) the registers that they + write in the case where the block is not exited. (2) is + empty, hence only (1) is relevant here. */ + case ARM64in_XDirect: + addRegUsage_ARM64AMode(u, i->ARM64in.XDirect.amPC); + return; + case ARM64in_XIndir: + addHRegUse(u, HRmRead, i->ARM64in.XIndir.dstGA); + addRegUsage_ARM64AMode(u, i->ARM64in.XIndir.amPC); + return; + case ARM64in_XAssisted: + addHRegUse(u, HRmRead, i->ARM64in.XAssisted.dstGA); + addRegUsage_ARM64AMode(u, i->ARM64in.XAssisted.amPC); + return; + case ARM64in_CSel: + addHRegUse(u, HRmWrite, i->ARM64in.CSel.dst); + addHRegUse(u, HRmRead, i->ARM64in.CSel.argL); + addHRegUse(u, HRmRead, i->ARM64in.CSel.argR); + return; + case ARM64in_Call: + /* logic and comments copied/modified from x86 back end */ + /* This is a bit subtle. */ + /* First off, claim it trashes all the caller-saved regs + which fall within the register allocator's jurisdiction. + These I believe to be x0 to x7. Also need to be + careful about vector regs. 
*/ + addHRegUse(u, HRmWrite, hregARM64_X0()); + addHRegUse(u, HRmWrite, hregARM64_X1()); + addHRegUse(u, HRmWrite, hregARM64_X2()); + addHRegUse(u, HRmWrite, hregARM64_X3()); + addHRegUse(u, HRmWrite, hregARM64_X4()); + addHRegUse(u, HRmWrite, hregARM64_X5()); + addHRegUse(u, HRmWrite, hregARM64_X6()); + addHRegUse(u, HRmWrite, hregARM64_X7()); + addHRegUse(u, HRmWrite, hregARM64_Q16()); + addHRegUse(u, HRmWrite, hregARM64_Q17()); + addHRegUse(u, HRmWrite, hregARM64_Q18()); + /* Now we have to state any parameter-carrying registers + which might be read. This depends on nArgRegs. */ + switch (i->ARM64in.Call.nArgRegs) { + case 8: addHRegUse(u, HRmRead, hregARM64_X7()); /*fallthru*/ + case 7: addHRegUse(u, HRmRead, hregARM64_X6()); /*fallthru*/ + case 6: addHRegUse(u, HRmRead, hregARM64_X5()); /*fallthru*/ + case 5: addHRegUse(u, HRmRead, hregARM64_X4()); /*fallthru*/ + case 4: addHRegUse(u, HRmRead, hregARM64_X3()); /*fallthru*/ + case 3: addHRegUse(u, HRmRead, hregARM64_X2()); /*fallthru*/ + case 2: addHRegUse(u, HRmRead, hregARM64_X1()); /*fallthru*/ + case 1: addHRegUse(u, HRmRead, hregARM64_X0()); break; + case 0: break; + default: vpanic("getRegUsage_ARM64:Call:regparms"); + } + /* Finally, there is the issue that the insn trashes a + register because the literal target address has to be + loaded into a register. However, we reserve x9 for that + purpose so there's no further complexity here. Stating x9 + as trashed is pointless since it's not under the control + of the allocator, but what the hell. */ + addHRegUse(u, HRmWrite, hregARM64_X9()); + return; + case ARM64in_AddToSP: + /* Only changes SP, but regalloc doesn't control that, hence + we don't care. */ + return; + case ARM64in_FromSP: + addHRegUse(u, HRmWrite, i->ARM64in.FromSP.dst); + return; + case ARM64in_Mul: + addHRegUse(u, HRmWrite, i->ARM64in.Mul.dst); + addHRegUse(u, HRmRead, i->ARM64in.Mul.argL); + addHRegUse(u, HRmRead, i->ARM64in.Mul.argR); + return; + case ARM64in_LdrEX: + addHRegUse(u, HRmRead, hregARM64_X4()); + addHRegUse(u, HRmWrite, hregARM64_X2()); + return; + case ARM64in_StrEX: + addHRegUse(u, HRmRead, hregARM64_X4()); + addHRegUse(u, HRmWrite, hregARM64_X0()); + addHRegUse(u, HRmRead, hregARM64_X2()); + return; + case ARM64in_MFence: + return; +//ZZ case ARMin_CLREX: +//ZZ return; + case ARM64in_VLdStS: + addHRegUse(u, HRmRead, i->ARM64in.VLdStS.rN); + if (i->ARM64in.VLdStS.isLoad) { + addHRegUse(u, HRmWrite, i->ARM64in.VLdStS.sD); + } else { + addHRegUse(u, HRmRead, i->ARM64in.VLdStS.sD); + } + return; + case ARM64in_VLdStD: + addHRegUse(u, HRmRead, i->ARM64in.VLdStD.rN); + if (i->ARM64in.VLdStD.isLoad) { + addHRegUse(u, HRmWrite, i->ARM64in.VLdStD.dD); + } else { + addHRegUse(u, HRmRead, i->ARM64in.VLdStD.dD); + } + return; + case ARM64in_VLdStQ: + addHRegUse(u, HRmRead, i->ARM64in.VLdStQ.rN); + if (i->ARM64in.VLdStQ.isLoad) + addHRegUse(u, HRmWrite, i->ARM64in.VLdStQ.rQ); + else + addHRegUse(u, HRmRead, i->ARM64in.VLdStQ.rQ); + return; + case ARM64in_VCvtI2F: + addHRegUse(u, HRmRead, i->ARM64in.VCvtI2F.rS); + addHRegUse(u, HRmWrite, i->ARM64in.VCvtI2F.rD); + return; + case ARM64in_VCvtF2I: + addHRegUse(u, HRmRead, i->ARM64in.VCvtF2I.rS); + addHRegUse(u, HRmWrite, i->ARM64in.VCvtF2I.rD); + return; + case ARM64in_VCvtSD: + addHRegUse(u, HRmWrite, i->ARM64in.VCvtSD.dst); + addHRegUse(u, HRmRead, i->ARM64in.VCvtSD.src); + return; + case ARM64in_VUnaryD: + addHRegUse(u, HRmWrite, i->ARM64in.VUnaryD.dst); + addHRegUse(u, HRmRead, i->ARM64in.VUnaryD.src); + return; + case ARM64in_VUnaryS: + addHRegUse(u, 
HRmWrite, i->ARM64in.VUnaryS.dst); + addHRegUse(u, HRmRead, i->ARM64in.VUnaryS.src); + return; + case ARM64in_VBinD: + addHRegUse(u, HRmWrite, i->ARM64in.VBinD.dst); + addHRegUse(u, HRmRead, i->ARM64in.VBinD.argL); + addHRegUse(u, HRmRead, i->ARM64in.VBinD.argR); + return; + case ARM64in_VBinS: + addHRegUse(u, HRmWrite, i->ARM64in.VBinS.dst); + addHRegUse(u, HRmRead, i->ARM64in.VBinS.argL); + addHRegUse(u, HRmRead, i->ARM64in.VBinS.argR); + return; + case ARM64in_VCmpD: + addHRegUse(u, HRmRead, i->ARM64in.VCmpD.argL); + addHRegUse(u, HRmRead, i->ARM64in.VCmpD.argR); + return; + case ARM64in_VCmpS: + addHRegUse(u, HRmRead, i->ARM64in.VCmpS.argL); + addHRegUse(u, HRmRead, i->ARM64in.VCmpS.argR); + return; + case ARM64in_FPCR: + if (i->ARM64in.FPCR.toFPCR) + addHRegUse(u, HRmRead, i->ARM64in.FPCR.iReg); + else + addHRegUse(u, HRmWrite, i->ARM64in.FPCR.iReg); + return; + case ARM64in_VBinV: + addHRegUse(u, HRmWrite, i->ARM64in.VBinV.dst); + addHRegUse(u, HRmRead, i->ARM64in.VBinV.argL); + addHRegUse(u, HRmRead, i->ARM64in.VBinV.argR); + return; + case ARM64in_VUnaryV: + addHRegUse(u, HRmWrite, i->ARM64in.VUnaryV.dst); + addHRegUse(u, HRmRead, i->ARM64in.VUnaryV.arg); + return; + case ARM64in_VNarrowV: + addHRegUse(u, HRmWrite, i->ARM64in.VNarrowV.dst); + addHRegUse(u, HRmRead, i->ARM64in.VNarrowV.src); + return; + case ARM64in_VShiftImmV: + addHRegUse(u, HRmWrite, i->ARM64in.VShiftImmV.dst); + addHRegUse(u, HRmRead, i->ARM64in.VShiftImmV.src); + return; +//ZZ case ARMin_VAluS: +//ZZ addHRegUse(u, HRmWrite, i->ARMin.VAluS.dst); +//ZZ addHRegUse(u, HRmRead, i->ARMin.VAluS.argL); +//ZZ addHRegUse(u, HRmRead, i->ARMin.VAluS.argR); +//ZZ return; +//ZZ case ARMin_VUnaryS: +//ZZ addHRegUse(u, HRmWrite, i->ARMin.VUnaryS.dst); +//ZZ addHRegUse(u, HRmRead, i->ARMin.VUnaryS.src); +//ZZ return; +//ZZ case ARMin_VCMovD: +//ZZ addHRegUse(u, HRmWrite, i->ARMin.VCMovD.dst); +//ZZ addHRegUse(u, HRmRead, i->ARMin.VCMovD.dst); +//ZZ addHRegUse(u, HRmRead, i->ARMin.VCMovD.src); +//ZZ return; +//ZZ case ARMin_VCMovS: +//ZZ addHRegUse(u, HRmWrite, i->ARMin.VCMovS.dst); +//ZZ addHRegUse(u, HRmRead, i->ARMin.VCMovS.dst); +//ZZ addHRegUse(u, HRmRead, i->ARMin.VCMovS.src); +//ZZ return; +//ZZ case ARMin_VXferD: +//ZZ if (i->ARMin.VXferD.toD) { +//ZZ addHRegUse(u, HRmWrite, i->ARMin.VXferD.dD); +//ZZ addHRegUse(u, HRmRead, i->ARMin.VXferD.rHi); +//ZZ addHRegUse(u, HRmRead, i->ARMin.VXferD.rLo); +//ZZ } else { +//ZZ addHRegUse(u, HRmRead, i->ARMin.VXferD.dD); +//ZZ addHRegUse(u, HRmWrite, i->ARMin.VXferD.rHi); +//ZZ addHRegUse(u, HRmWrite, i->ARMin.VXferD.rLo); +//ZZ } +//ZZ return; +//ZZ case ARMin_VXferS: +//ZZ if (i->ARMin.VXferS.toS) { +//ZZ addHRegUse(u, HRmWrite, i->ARMin.VXferS.fD); +//ZZ addHRegUse(u, HRmRead, i->ARMin.VXferS.rLo); +//ZZ } else { +//ZZ addHRegUse(u, HRmRead, i->ARMin.VXferS.fD); +//ZZ addHRegUse(u, HRmWrite, i->ARMin.VXferS.rLo); +//ZZ } +//ZZ return; +//ZZ case ARMin_VCvtID: +//ZZ addHRegUse(u, HRmWrite, i->ARMin.VCvtID.dst); +//ZZ addHRegUse(u, HRmRead, i->ARMin.VCvtID.src); +//ZZ return; +//ZZ case ARMin_NLdStD: +//ZZ if (i->ARMin.NLdStD.isLoad) +//ZZ addHRegUse(u, HRmWrite, i->ARMin.NLdStD.dD); +//ZZ else +//ZZ addHRegUse(u, HRmRead, i->ARMin.NLdStD.dD); +//ZZ addRegUsage_ARMAModeN(u, i->ARMin.NLdStD.amode); +//ZZ return; +//ZZ case ARMin_NUnary: +//ZZ addHRegUse(u, HRmWrite, i->ARMin.NUnary.dst); +//ZZ addHRegUse(u, HRmRead, i->ARMin.NUnary.src); +//ZZ return; +//ZZ case ARMin_NUnaryS: +//ZZ addHRegUse(u, HRmWrite, i->ARMin.NUnaryS.dst->reg); +//ZZ addHRegUse(u, HRmRead, 
i->ARMin.NUnaryS.src->reg); +//ZZ return; +//ZZ case ARMin_NShift: +//ZZ addHRegUse(u, HRmWrite, i->ARMin.NShift.dst); +//ZZ addHRegUse(u, HRmRead, i->ARMin.NShift.argL); +//ZZ addHRegUse(u, HRmRead, i->ARMin.NShift.argR); +//ZZ return; +//ZZ case ARMin_NShl64: +//ZZ addHRegUse(u, HRmWrite, i->ARMin.NShl64.dst); +//ZZ addHRegUse(u, HRmRead, i->ARMin.NShl64.src); +//ZZ return; +//ZZ case ARMin_NDual: +//ZZ addHRegUse(u, HRmWrite, i->ARMin.NDual.arg1); +//ZZ addHRegUse(u, HRmWrite, i->ARMin.NDual.arg2); +//ZZ addHRegUse(u, HRmRead, i->ARMin.NDual.arg1); +//ZZ addHRegUse(u, HRmRead, i->ARMin.NDual.arg2); +//ZZ return; + case ARM64in_VImmQ: + addHRegUse(u, HRmWrite, i->ARM64in.VImmQ.rQ); + return; + case ARM64in_VDfromX: + addHRegUse(u, HRmWrite, i->ARM64in.VDfromX.rD); + addHRegUse(u, HRmRead, i->ARM64in.VDfromX.rX); + return; + case ARM64in_VQfromXX: + addHRegUse(u, HRmWrite, i->ARM64in.VQfromXX.rQ); + addHRegUse(u, HRmRead, i->ARM64in.VQfromXX.rXhi); + addHRegUse(u, HRmRead, i->ARM64in.VQfromXX.rXlo); + return; + case ARM64in_VXfromQ: + addHRegUse(u, HRmWrite, i->ARM64in.VXfromQ.rX); + addHRegUse(u, HRmRead, i->ARM64in.VXfromQ.rQ); + return; + case ARM64in_VMov: + addHRegUse(u, HRmWrite, i->ARM64in.VMov.dst); + addHRegUse(u, HRmRead, i->ARM64in.VMov.src); + return; +//ZZ case ARMin_NBinary: +//ZZ addHRegUse(u, HRmWrite, i->ARMin.NBinary.dst); +//ZZ /* TODO: sometimes dst is also being read! */ +//ZZ // XXX fix this +//ZZ addHRegUse(u, HRmRead, i->ARMin.NBinary.argL); +//ZZ addHRegUse(u, HRmRead, i->ARMin.NBinary.argR); +//ZZ return; +//ZZ case ARMin_NCMovQ: +//ZZ addHRegUse(u, HRmWrite, i->ARMin.NCMovQ.dst); +//ZZ addHRegUse(u, HRmRead, i->ARMin.NCMovQ.dst); +//ZZ addHRegUse(u, HRmRead, i->ARMin.NCMovQ.src); +//ZZ return; +//ZZ case ARMin_Add32: +//ZZ addHRegUse(u, HRmWrite, i->ARMin.Add32.rD); +//ZZ addHRegUse(u, HRmRead, i->ARMin.Add32.rN); +//ZZ return; + case ARM64in_EvCheck: + /* We expect both amodes only to mention x21, so this is in + fact pointless, since x21 isn't allocatable, but + anyway.. 
*/ + addRegUsage_ARM64AMode(u, i->ARM64in.EvCheck.amCounter); + addRegUsage_ARM64AMode(u, i->ARM64in.EvCheck.amFailAddr); + addHRegUse(u, HRmWrite, hregARM64_X9()); /* also unavail to RA */ + return; +//ZZ case ARMin_ProfInc: +//ZZ addHRegUse(u, HRmWrite, hregARM_R12()); +//ZZ addHRegUse(u, HRmWrite, hregARM_R11()); +//ZZ return; + default: + ppARM64Instr(i); + vpanic("getRegUsage_ARM64Instr"); + } +} + + +void mapRegs_ARM64Instr ( HRegRemap* m, ARM64Instr* i, Bool mode64 ) +{ + vassert(mode64 == True); + switch (i->tag) { + case ARM64in_Arith: + i->ARM64in.Arith.dst = lookupHRegRemap(m, i->ARM64in.Arith.dst); + i->ARM64in.Arith.argL = lookupHRegRemap(m, i->ARM64in.Arith.argL); + mapRegs_ARM64RIA(m, i->ARM64in.Arith.argR); + return; + case ARM64in_Cmp: + i->ARM64in.Cmp.argL = lookupHRegRemap(m, i->ARM64in.Cmp.argL); + mapRegs_ARM64RIA(m, i->ARM64in.Cmp.argR); + return; + case ARM64in_Logic: + i->ARM64in.Logic.dst = lookupHRegRemap(m, i->ARM64in.Logic.dst); + i->ARM64in.Logic.argL = lookupHRegRemap(m, i->ARM64in.Logic.argL); + mapRegs_ARM64RIL(m, i->ARM64in.Logic.argR); + return; + case ARM64in_Test: + i->ARM64in.Test.argL = lookupHRegRemap(m, i->ARM64in.Test.argL); + mapRegs_ARM64RIL(m, i->ARM64in.Logic.argR); + return; + case ARM64in_Shift: + i->ARM64in.Shift.dst = lookupHRegRemap(m, i->ARM64in.Shift.dst); + i->ARM64in.Shift.argL = lookupHRegRemap(m, i->ARM64in.Shift.argL); + mapRegs_ARM64RI6(m, i->ARM64in.Shift.argR); + return; + case ARM64in_Unary: + i->ARM64in.Unary.dst = lookupHRegRemap(m, i->ARM64in.Unary.dst); + i->ARM64in.Unary.src = lookupHRegRemap(m, i->ARM64in.Unary.src); + return; + case ARM64in_MovI: + i->ARM64in.MovI.dst = lookupHRegRemap(m, i->ARM64in.MovI.dst); + i->ARM64in.MovI.src = lookupHRegRemap(m, i->ARM64in.MovI.src); + return; + case ARM64in_Imm64: + i->ARM64in.Imm64.dst = lookupHRegRemap(m, i->ARM64in.Imm64.dst); + return; + case ARM64in_LdSt64: + i->ARM64in.LdSt64.rD = lookupHRegRemap(m, i->ARM64in.LdSt64.rD); + mapRegs_ARM64AMode(m, i->ARM64in.LdSt64.amode); + return; + case ARM64in_LdSt32: + i->ARM64in.LdSt32.rD = lookupHRegRemap(m, i->ARM64in.LdSt32.rD); + mapRegs_ARM64AMode(m, i->ARM64in.LdSt32.amode); + return; + case ARM64in_LdSt16: + i->ARM64in.LdSt16.rD = lookupHRegRemap(m, i->ARM64in.LdSt16.rD); + mapRegs_ARM64AMode(m, i->ARM64in.LdSt16.amode); + return; + case ARM64in_LdSt8: + i->ARM64in.LdSt8.rD = lookupHRegRemap(m, i->ARM64in.LdSt8.rD); + mapRegs_ARM64AMode(m, i->ARM64in.LdSt8.amode); + return; + case ARM64in_XDirect: + mapRegs_ARM64AMode(m, i->ARM64in.XDirect.amPC); + return; + case ARM64in_XIndir: + i->ARM64in.XIndir.dstGA + = lookupHRegRemap(m, i->ARM64in.XIndir.dstGA); + mapRegs_ARM64AMode(m, i->ARM64in.XIndir.amPC); + return; + case ARM64in_XAssisted: + i->ARM64in.XAssisted.dstGA + = lookupHRegRemap(m, i->ARM64in.XAssisted.dstGA); + mapRegs_ARM64AMode(m, i->ARM64in.XAssisted.amPC); + return; + case ARM64in_CSel: + i->ARM64in.CSel.dst = lookupHRegRemap(m, i->ARM64in.CSel.dst); + i->ARM64in.CSel.argL = lookupHRegRemap(m, i->ARM64in.CSel.argL); + i->ARM64in.CSel.argR = lookupHRegRemap(m, i->ARM64in.CSel.argR); + return; + case ARM64in_Call: + return; + case ARM64in_AddToSP: + return; + case ARM64in_FromSP: + i->ARM64in.FromSP.dst = lookupHRegRemap(m, i->ARM64in.FromSP.dst); + return; + case ARM64in_Mul: + i->ARM64in.Mul.dst = lookupHRegRemap(m, i->ARM64in.Mul.dst); + i->ARM64in.Mul.argL = lookupHRegRemap(m, i->ARM64in.Mul.argL); + i->ARM64in.Mul.argR = lookupHRegRemap(m, i->ARM64in.Mul.argR); + break; + case ARM64in_LdrEX: + return; + case 
ARM64in_StrEX: + return; + case ARM64in_MFence: + return; +//ZZ case ARMin_CLREX: +//ZZ return; + case ARM64in_VLdStS: + i->ARM64in.VLdStS.sD = lookupHRegRemap(m, i->ARM64in.VLdStS.sD); + i->ARM64in.VLdStS.rN = lookupHRegRemap(m, i->ARM64in.VLdStS.rN); + return; + case ARM64in_VLdStD: + i->ARM64in.VLdStD.dD = lookupHRegRemap(m, i->ARM64in.VLdStD.dD); + i->ARM64in.VLdStD.rN = lookupHRegRemap(m, i->ARM64in.VLdStD.rN); + return; + case ARM64in_VLdStQ: + i->ARM64in.VLdStQ.rQ = lookupHRegRemap(m, i->ARM64in.VLdStQ.rQ); + i->ARM64in.VLdStQ.rN = lookupHRegRemap(m, i->ARM64in.VLdStQ.rN); + return; + case ARM64in_VCvtI2F: + i->ARM64in.VCvtI2F.rS = lookupHRegRemap(m, i->ARM64in.VCvtI2F.rS); + i->ARM64in.VCvtI2F.rD = lookupHRegRemap(m, i->ARM64in.VCvtI2F.rD); + return; + case ARM64in_VCvtF2I: + i->ARM64in.VCvtF2I.rS = lookupHRegRemap(m, i->ARM64in.VCvtF2I.rS); + i->ARM64in.VCvtF2I.rD = lookupHRegRemap(m, i->ARM64in.VCvtF2I.rD); + return; + case ARM64in_VCvtSD: + i->ARM64in.VCvtSD.dst = lookupHRegRemap(m, i->ARM64in.VCvtSD.dst); + i->ARM64in.VCvtSD.src = lookupHRegRemap(m, i->ARM64in.VCvtSD.src); + return; + case ARM64in_VUnaryD: + i->ARM64in.VUnaryD.dst = lookupHRegRemap(m, i->ARM64in.VUnaryD.dst); + i->ARM64in.VUnaryD.src = lookupHRegRemap(m, i->ARM64in.VUnaryD.src); + return; + case ARM64in_VUnaryS: + i->ARM64in.VUnaryS.dst = lookupHRegRemap(m, i->ARM64in.VUnaryS.dst); + i->ARM64in.VUnaryS.src = lookupHRegRemap(m, i->ARM64in.VUnaryS.src); + return; + case ARM64in_VBinD: + i->ARM64in.VBinD.dst = lookupHRegRemap(m, i->ARM64in.VBinD.dst); + i->ARM64in.VBinD.argL = lookupHRegRemap(m, i->ARM64in.VBinD.argL); + i->ARM64in.VBinD.argR = lookupHRegRemap(m, i->ARM64in.VBinD.argR); + return; + case ARM64in_VBinS: + i->ARM64in.VBinS.dst = lookupHRegRemap(m, i->ARM64in.VBinS.dst); + i->ARM64in.VBinS.argL = lookupHRegRemap(m, i->ARM64in.VBinS.argL); + i->ARM64in.VBinS.argR = lookupHRegRemap(m, i->ARM64in.VBinS.argR); + return; + case ARM64in_VCmpD: + i->ARM64in.VCmpD.argL = lookupHRegRemap(m, i->ARM64in.VCmpD.argL); + i->ARM64in.VCmpD.argR = lookupHRegRemap(m, i->ARM64in.VCmpD.argR); + return; + case ARM64in_VCmpS: + i->ARM64in.VCmpS.argL = lookupHRegRemap(m, i->ARM64in.VCmpS.argL); + i->ARM64in.VCmpS.argR = lookupHRegRemap(m, i->ARM64in.VCmpS.argR); + return; + case ARM64in_FPCR: + i->ARM64in.FPCR.iReg = lookupHRegRemap(m, i->ARM64in.FPCR.iReg); + return; + case ARM64in_VBinV: + i->ARM64in.VBinV.dst = lookupHRegRemap(m, i->ARM64in.VBinV.dst); + i->ARM64in.VBinV.argL = lookupHRegRemap(m, i->ARM64in.VBinV.argL); + i->ARM64in.VBinV.argR = lookupHRegRemap(m, i->ARM64in.VBinV.argR); + return; + case ARM64in_VUnaryV: + i->ARM64in.VUnaryV.dst = lookupHRegRemap(m, i->ARM64in.VUnaryV.dst); + i->ARM64in.VUnaryV.arg = lookupHRegRemap(m, i->ARM64in.VUnaryV.arg); + return; + case ARM64in_VNarrowV: + i->ARM64in.VNarrowV.dst = lookupHRegRemap(m, i->ARM64in.VNarrowV.dst); + i->ARM64in.VNarrowV.src = lookupHRegRemap(m, i->ARM64in.VNarrowV.src); + return; + case ARM64in_VShiftImmV: + i->ARM64in.VShiftImmV.dst + = lookupHRegRemap(m, i->ARM64in.VShiftImmV.dst); + i->ARM64in.VShiftImmV.src + = lookupHRegRemap(m, i->ARM64in.VShiftImmV.src); + return; +//ZZ case ARMin_VAluS: +//ZZ i->ARMin.VAluS.dst = lookupHRegRemap(m, i->ARMin.VAluS.dst); +//ZZ i->ARMin.VAluS.argL = lookupHRegRemap(m, i->ARMin.VAluS.argL); +//ZZ i->ARMin.VAluS.argR = lookupHRegRemap(m, i->ARMin.VAluS.argR); +//ZZ return; +//ZZ case ARMin_VCMovD: +//ZZ i->ARMin.VCMovD.dst = lookupHRegRemap(m, i->ARMin.VCMovD.dst); +//ZZ i->ARMin.VCMovD.src = lookupHRegRemap(m, 
i->ARMin.VCMovD.src); +//ZZ return; +//ZZ case ARMin_VCMovS: +//ZZ i->ARMin.VCMovS.dst = lookupHRegRemap(m, i->ARMin.VCMovS.dst); +//ZZ i->ARMin.VCMovS.src = lookupHRegRemap(m, i->ARMin.VCMovS.src); +//ZZ return; +//ZZ case ARMin_VXferD: +//ZZ i->ARMin.VXferD.dD = lookupHRegRemap(m, i->ARMin.VXferD.dD); +//ZZ i->ARMin.VXferD.rHi = lookupHRegRemap(m, i->ARMin.VXferD.rHi); +//ZZ i->ARMin.VXferD.rLo = lookupHRegRemap(m, i->ARMin.VXferD.rLo); +//ZZ return; +//ZZ case ARMin_VXferS: +//ZZ i->ARMin.VXferS.fD = lookupHRegRemap(m, i->ARMin.VXferS.fD); +//ZZ i->ARMin.VXferS.rLo = lookupHRegRemap(m, i->ARMin.VXferS.rLo); +//ZZ return; +//ZZ case ARMin_VCvtID: +//ZZ i->ARMin.VCvtID.dst = lookupHRegRemap(m, i->ARMin.VCvtID.dst); +//ZZ i->ARMin.VCvtID.src = lookupHRegRemap(m, i->ARMin.VCvtID.src); +//ZZ return; +//ZZ case ARMin_NLdStD: +//ZZ i->ARMin.NLdStD.dD = lookupHRegRemap(m, i->ARMin.NLdStD.dD); +//ZZ mapRegs_ARMAModeN(m, i->ARMin.NLdStD.amode); +//ZZ return; +//ZZ case ARMin_NUnary: +//ZZ i->ARMin.NUnary.src = lookupHRegRemap(m, i->ARMin.NUnary.src); +//ZZ i->ARMin.NUnary.dst = lookupHRegRemap(m, i->ARMin.NUnary.dst); +//ZZ return; +//ZZ case ARMin_NUnaryS: +//ZZ i->ARMin.NUnaryS.src->reg +//ZZ = lookupHRegRemap(m, i->ARMin.NUnaryS.src->reg); +//ZZ i->ARMin.NUnaryS.dst->reg +//ZZ = lookupHRegRemap(m, i->ARMin.NUnaryS.dst->reg); +//ZZ return; +//ZZ case ARMin_NShift: +//ZZ i->ARMin.NShift.dst = lookupHRegRemap(m, i->ARMin.NShift.dst); +//ZZ i->ARMin.NShift.argL = lookupHRegRemap(m, i->ARMin.NShift.argL); +//ZZ i->ARMin.NShift.argR = lookupHRegRemap(m, i->ARMin.NShift.argR); +//ZZ return; +//ZZ case ARMin_NShl64: +//ZZ i->ARMin.NShl64.dst = lookupHRegRemap(m, i->ARMin.NShl64.dst); +//ZZ i->ARMin.NShl64.src = lookupHRegRemap(m, i->ARMin.NShl64.src); +//ZZ return; +//ZZ case ARMin_NDual: +//ZZ i->ARMin.NDual.arg1 = lookupHRegRemap(m, i->ARMin.NDual.arg1); +//ZZ i->ARMin.NDual.arg2 = lookupHRegRemap(m, i->ARMin.NDual.arg2); +//ZZ return; + case ARM64in_VImmQ: + i->ARM64in.VImmQ.rQ = lookupHRegRemap(m, i->ARM64in.VImmQ.rQ); + return; + case ARM64in_VDfromX: + i->ARM64in.VDfromX.rD + = lookupHRegRemap(m, i->ARM64in.VDfromX.rD); + i->ARM64in.VDfromX.rX + = lookupHRegRemap(m, i->ARM64in.VDfromX.rX); + return; + case ARM64in_VQfromXX: + i->ARM64in.VQfromXX.rQ + = lookupHRegRemap(m, i->ARM64in.VQfromXX.rQ); + i->ARM64in.VQfromXX.rXhi + = lookupHRegRemap(m, i->ARM64in.VQfromXX.rXhi); + i->ARM64in.VQfromXX.rXlo + = lookupHRegRemap(m, i->ARM64in.VQfromXX.rXlo); + return; + case ARM64in_VXfromQ: + i->ARM64in.VXfromQ.rX + = lookupHRegRemap(m, i->ARM64in.VXfromQ.rX); + i->ARM64in.VXfromQ.rQ + = lookupHRegRemap(m, i->ARM64in.VXfromQ.rQ); + return; + case ARM64in_VMov: + i->ARM64in.VMov.dst = lookupHRegRemap(m, i->ARM64in.VMov.dst); + i->ARM64in.VMov.src = lookupHRegRemap(m, i->ARM64in.VMov.src); + return; + +//ZZ case ARMin_NBinary: +//ZZ i->ARMin.NBinary.argL = lookupHRegRemap(m, i->ARMin.NBinary.argL); +//ZZ i->ARMin.NBinary.argR = lookupHRegRemap(m, i->ARMin.NBinary.argR); +//ZZ i->ARMin.NBinary.dst = lookupHRegRemap(m, i->ARMin.NBinary.dst); +//ZZ return; +//ZZ case ARMin_NCMovQ: +//ZZ i->ARMin.NCMovQ.dst = lookupHRegRemap(m, i->ARMin.NCMovQ.dst); +//ZZ i->ARMin.NCMovQ.src = lookupHRegRemap(m, i->ARMin.NCMovQ.src); +//ZZ return; +//ZZ case ARMin_Add32: +//ZZ i->ARMin.Add32.rD = lookupHRegRemap(m, i->ARMin.Add32.rD); +//ZZ i->ARMin.Add32.rN = lookupHRegRemap(m, i->ARMin.Add32.rN); +//ZZ return; + case ARM64in_EvCheck: + /* We expect both amodes only to mention x21, so this is in + fact pointless, since x21 
isn't allocatable, but + anyway.. */ + mapRegs_ARM64AMode(m, i->ARM64in.EvCheck.amCounter); + mapRegs_ARM64AMode(m, i->ARM64in.EvCheck.amFailAddr); + return; +//ZZ case ARMin_ProfInc: +//ZZ /* hardwires r11 and r12 -- nothing to modify. */ +//ZZ return; + default: + ppARM64Instr(i); + vpanic("mapRegs_ARM64Instr"); + } +} + +/* Figure out if i represents a reg-reg move, and if so assign the + source and destination to *src and *dst. If in doubt say No. Used + by the register allocator to do move coalescing. +*/ +Bool isMove_ARM64Instr ( ARM64Instr* i, HReg* src, HReg* dst ) +{ + switch (i->tag) { + case ARM64in_MovI: + *src = i->ARM64in.MovI.src; + *dst = i->ARM64in.MovI.dst; + return True; + case ARM64in_VMov: + *src = i->ARM64in.VMov.src; + *dst = i->ARM64in.VMov.dst; + return True; + default: + break; + } + + return False; +} + + +/* Generate arm spill/reload instructions under the direction of the + register allocator. Note it's critical these don't write the + condition codes. */ + +void genSpill_ARM64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, + HReg rreg, Int offsetB, Bool mode64 ) +{ + HRegClass rclass; + vassert(offsetB >= 0); + vassert(!hregIsVirtual(rreg)); + vassert(mode64 == True); + *i1 = *i2 = NULL; + rclass = hregClass(rreg); + switch (rclass) { + case HRcInt64: + vassert(0 == (offsetB & 7)); + offsetB >>= 3; + vassert(offsetB < 4096); + *i1 = ARM64Instr_LdSt64( + False/*!isLoad*/, + rreg, + ARM64AMode_RI12(hregARM64_X21(), offsetB, 8) + ); + return; + case HRcFlt64: + vassert(0 == (offsetB & 7)); + vassert(offsetB >= 0 && offsetB < 32768); + *i1 = ARM64Instr_VLdStD(False/*!isLoad*/, + rreg, hregARM64_X21(), offsetB); + return; + case HRcVec128: { + HReg x21 = hregARM64_X21(); // baseblock + HReg x9 = hregARM64_X9(); // spill temporary + vassert(0 == (offsetB & 15)); // check sane alignment + vassert(offsetB < 4096); + *i1 = ARM64Instr_Arith(x9, x21, ARM64RIA_I12(offsetB, 0), True); + *i2 = ARM64Instr_VLdStQ(False/*!isLoad*/, rreg, x9); + return; + } + default: + ppHRegClass(rclass); + vpanic("genSpill_ARM: unimplemented regclass"); + } +} + +void genReload_ARM64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, + HReg rreg, Int offsetB, Bool mode64 ) +{ + HRegClass rclass; + vassert(offsetB >= 0); + vassert(!hregIsVirtual(rreg)); + vassert(mode64 == True); + *i1 = *i2 = NULL; + rclass = hregClass(rreg); + switch (rclass) { + case HRcInt64: + vassert(0 == (offsetB & 7)); + offsetB >>= 3; + vassert(offsetB < 4096); + *i1 = ARM64Instr_LdSt64( + True/*isLoad*/, + rreg, + ARM64AMode_RI12(hregARM64_X21(), offsetB, 8) + ); + return; + case HRcFlt64: + vassert(0 == (offsetB & 7)); + vassert(offsetB >= 0 && offsetB < 32768); + *i1 = ARM64Instr_VLdStD(True/*isLoad*/, + rreg, hregARM64_X21(), offsetB); + return; + case HRcVec128: { + HReg x21 = hregARM64_X21(); // baseblock + HReg x9 = hregARM64_X9(); // spill temporary + vassert(0 == (offsetB & 15)); // check sane alignment + vassert(offsetB < 4096); + *i1 = ARM64Instr_Arith(x9, x21, ARM64RIA_I12(offsetB, 0), True); + *i2 = ARM64Instr_VLdStQ(True/*isLoad*/, rreg, x9); + return; + } + default: + ppHRegClass(rclass); + vpanic("genReload_ARM: unimplemented regclass"); + } +} + + +//ZZ /* Emit an instruction into buf and return the number of bytes used. +//ZZ Note that buf is not the insn's final place, and therefore it is +//ZZ imperative to emit position-independent code. 
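   (Summary of the spill scheme above: Int64 and Flt64 registers spill and
   reload with a single scaled str/ldr at [x21 + offsetB], whereas a Vec128
   register needs a two-insn sequence, since the VLdStQ form used here only
   takes a plain register address:

      add x9, x21, #offsetB
      st1 {vQ.2d}, [x9]        // or ld1 {vQ.2d}, [x9] for a reload

   x9 is usable as the scratch register because it is never given out by
   the allocator.)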
*/ + +static inline UChar iregNo ( HReg r ) +{ + UInt n; + vassert(hregClass(r) == HRcInt64); + vassert(!hregIsVirtual(r)); + n = hregNumber(r); + vassert(n <= 30); + return toUChar(n); +} + +static inline UChar dregNo ( HReg r ) +{ + UInt n; + vassert(hregClass(r) == HRcFlt64); + vassert(!hregIsVirtual(r)); + n = hregNumber(r); + vassert(n <= 31); + return toUChar(n); +} + +static inline UChar qregNo ( HReg r ) +{ + UInt n; + vassert(hregClass(r) == HRcVec128); + vassert(!hregIsVirtual(r)); + n = hregNumber(r); + vassert(n <= 31); + return toUChar(n); +} + +#define BITS4(zzb3,zzb2,zzb1,zzb0) \ + (((zzb3) << 3) | ((zzb2) << 2) | ((zzb1) << 1) | (zzb0)) + +#define X00 BITS4(0,0, 0,0) +#define X01 BITS4(0,0, 0,1) +#define X10 BITS4(0,0, 1,0) +#define X11 BITS4(0,0, 1,1) + +#define X000 BITS4(0, 0,0,0) +#define X001 BITS4(0, 0,0,1) +#define X010 BITS4(0, 0,1,0) +#define X011 BITS4(0, 0,1,1) +#define X100 BITS4(0, 1,0,0) +#define X101 BITS4(0, 1,0,1) +#define X110 BITS4(0, 1,1,0) +#define X111 BITS4(0, 1,1,1) + +#define X0000 BITS4(0,0,0,0) +#define X0001 BITS4(0,0,0,1) +#define X0010 BITS4(0,0,1,0) +#define X0011 BITS4(0,0,1,1) + +#define BITS8(zzb7,zzb6,zzb5,zzb4,zzb3,zzb2,zzb1,zzb0) \ + ((BITS4(zzb7,zzb6,zzb5,zzb4) << 4) | BITS4(zzb3,zzb2,zzb1,zzb0)) + +#define X00000 BITS8(0,0,0, 0,0,0,0,0) +#define X00001 BITS8(0,0,0, 0,0,0,0,1) +#define X00111 BITS8(0,0,0, 0,0,1,1,1) +#define X01000 BITS8(0,0,0, 0,1,0,0,0) +#define X10000 BITS8(0,0,0, 1,0,0,0,0) +#define X11000 BITS8(0,0,0, 1,1,0,0,0) +#define X11110 BITS8(0,0,0, 1,1,1,1,0) +#define X11111 BITS8(0,0,0, 1,1,1,1,1) + +#define X000000 BITS8(0,0, 0,0,0,0,0,0) +#define X000001 BITS8(0,0, 0,0,0,0,0,1) +#define X000100 BITS8(0,0, 0,0,0,1,0,0) +#define X000111 BITS8(0,0, 0,0,0,1,1,1) +#define X001000 BITS8(0,0, 0,0,1,0,0,0) +#define X001001 BITS8(0,0, 0,0,1,0,0,1) +#define X001010 BITS8(0,0, 0,0,1,0,1,0) +#define X001101 BITS8(0,0, 0,0,1,1,0,1) +#define X001111 BITS8(0,0, 0,0,1,1,1,1) +#define X010000 BITS8(0,0, 0,1,0,0,0,0) +#define X010001 BITS8(0,0, 0,1,0,0,0,1) +#define X010101 BITS8(0,0, 0,1,0,1,0,1) +#define X010110 BITS8(0,0, 0,1,0,1,1,0) +#define X011001 BITS8(0,0, 0,1,1,0,0,1) +#define X011010 BITS8(0,0, 0,1,1,0,1,0) +#define X011011 BITS8(0,0, 0,1,1,0,1,1) +#define X011110 BITS8(0,0, 0,1,1,1,1,0) +#define X011111 BITS8(0,0, 0,1,1,1,1,1) +#define X100001 BITS8(0,0, 1,0,0,0,0,1) +#define X100011 BITS8(0,0, 1,0,0,0,1,1) +#define X100100 BITS8(0,0, 1,0,0,1,0,0) +#define X100101 BITS8(0,0, 1,0,0,1,0,1) +#define X100110 BITS8(0,0, 1,0,0,1,1,0) +#define X100111 BITS8(0,0, 1,0,0,1,1,1) +#define X110000 BITS8(0,0, 1,1,0,0,0,0) +#define X110001 BITS8(0,0, 1,1,0,0,0,1) +#define X110101 BITS8(0,0, 1,1,0,1,0,1) +#define X110111 BITS8(0,0, 1,1,0,1,1,1) +#define X111000 BITS8(0,0, 1,1,1,0,0,0) +#define X111001 BITS8(0,0, 1,1,1,0,0,1) +#define X111101 BITS8(0,0, 1,1,1,1,0,1) +#define X111110 BITS8(0,0, 1,1,1,1,1,0) +#define X111111 BITS8(0,0, 1,1,1,1,1,1) + +#define X0001000 BITS8(0, 0,0,0,1,0,0,0) +#define X0010000 BITS8(0, 0,0,1,0,0,0,0) +#define X0100000 BITS8(0, 0,1,0,0,0,0,0) +#define X1000000 BITS8(0, 1,0,0,0,0,0,0) + +#define X00100000 BITS8(0,0,1,0,0,0,0,0) +#define X00100001 BITS8(0,0,1,0,0,0,0,1) +#define X00100010 BITS8(0,0,1,0,0,0,1,0) +#define X00100011 BITS8(0,0,1,0,0,0,1,1) +#define X01010000 BITS8(0,1,0,1,0,0,0,0) +#define X01010001 BITS8(0,1,0,1,0,0,0,1) +#define X01010100 BITS8(0,1,0,1,0,1,0,0) +#define X01011000 BITS8(0,1,0,1,1,0,0,0) +#define X01100000 BITS8(0,1,1,0,0,0,0,0) +#define X01100001 BITS8(0,1,1,0,0,0,0,1) +#define 
X01100010 BITS8(0,1,1,0,0,0,1,0) +#define X01100011 BITS8(0,1,1,0,0,0,1,1) +#define X01110000 BITS8(0,1,1,1,0,0,0,0) +#define X01110001 BITS8(0,1,1,1,0,0,0,1) +#define X01110011 BITS8(0,1,1,1,0,0,1,1) +#define X01110101 BITS8(0,1,1,1,0,1,0,1) +#define X01110111 BITS8(0,1,1,1,0,1,1,1) +#define X11000001 BITS8(1,1,0,0,0,0,0,1) +#define X11000011 BITS8(1,1,0,0,0,0,1,1) +#define X11010100 BITS8(1,1,0,1,0,1,0,0) +#define X11010110 BITS8(1,1,0,1,0,1,1,0) +#define X11011000 BITS8(1,1,0,1,1,0,0,0) +#define X11011010 BITS8(1,1,0,1,1,0,1,0) +#define X11011110 BITS8(1,1,0,1,1,1,1,0) +#define X11110001 BITS8(1,1,1,1,0,0,0,1) +#define X11110011 BITS8(1,1,1,1,0,0,1,1) + + +/* --- 4 fields --- */ + +static inline UInt X_8_19_1_4 ( UInt f1, UInt f2, UInt f3, UInt f4 ) { + vassert(8+19+1+4 == 32); + vassert(f1 < (1<<8)); + vassert(f2 < (1<<19)); + vassert(f3 < (1<<1)); + vassert(f4 < (1<<4)); + UInt w = 0; + w = (w << 8) | f1; + w = (w << 19) | f2; + w = (w << 1) | f3; + w = (w << 4) | f4; + return w; +} + +/* --- 5 fields --- */ + +static inline UInt X_3_6_2_16_5 ( UInt f1, UInt f2, + UInt f3, UInt f4, UInt f5 ) { + vassert(3+6+2+16+5 == 32); + vassert(f1 < (1<<3)); + vassert(f2 < (1<<6)); + vassert(f3 < (1<<2)); + vassert(f4 < (1<<16)); + vassert(f5 < (1<<5)); + UInt w = 0; + w = (w << 3) | f1; + w = (w << 6) | f2; + w = (w << 2) | f3; + w = (w << 16) | f4; + w = (w << 5) | f5; + return w; +} + +/* --- 6 fields --- */ + +static inline UInt X_2_6_2_12_5_5 ( UInt f1, UInt f2, UInt f3, + UInt f4, UInt f5, UInt f6 ) { + vassert(2+6+2+12+5+5 == 32); + vassert(f1 < (1<<2)); + vassert(f2 < (1<<6)); + vassert(f3 < (1<<2)); + vassert(f4 < (1<<12)); + vassert(f5 < (1<<5)); + vassert(f6 < (1<<5)); + UInt w = 0; + w = (w << 2) | f1; + w = (w << 6) | f2; + w = (w << 2) | f3; + w = (w << 12) | f4; + w = (w << 5) | f5; + w = (w << 5) | f6; + return w; +} + +static inline UInt X_3_8_5_6_5_5 ( UInt f1, UInt f2, UInt f3, + UInt f4, UInt f5, UInt f6 ) { + vassert(3+8+5+6+5+5 == 32); + vassert(f1 < (1<<3)); + vassert(f2 < (1<<8)); + vassert(f3 < (1<<5)); + vassert(f4 < (1<<6)); + vassert(f5 < (1<<5)); + vassert(f6 < (1<<5)); + UInt w = 0; + w = (w << 3) | f1; + w = (w << 8) | f2; + w = (w << 5) | f3; + w = (w << 6) | f4; + w = (w << 5) | f5; + w = (w << 5) | f6; + return w; +} + +static inline UInt X_3_5_8_6_5_5 ( UInt f1, UInt f2, UInt f3, + UInt f4, UInt f5, UInt f6 ) { + vassert(3+8+5+6+5+5 == 32); + vassert(f1 < (1<<3)); + vassert(f2 < (1<<5)); + vassert(f3 < (1<<8)); + vassert(f4 < (1<<6)); + vassert(f5 < (1<<5)); + vassert(f6 < (1<<5)); + UInt w = 0; + w = (w << 3) | f1; + w = (w << 5) | f2; + w = (w << 8) | f3; + w = (w << 6) | f4; + w = (w << 5) | f5; + w = (w << 5) | f6; + return w; +} + +static inline UInt X_3_6_7_6_5_5 ( UInt f1, UInt f2, UInt f3, + UInt f4, UInt f5, UInt f6 ) { + vassert(3+6+7+6+5+5 == 32); + vassert(f1 < (1<<3)); + vassert(f2 < (1<<6)); + vassert(f3 < (1<<7)); + vassert(f4 < (1<<6)); + vassert(f5 < (1<<5)); + vassert(f6 < (1<<5)); + UInt w = 0; + w = (w << 3) | f1; + w = (w << 6) | f2; + w = (w << 7) | f3; + w = (w << 6) | f4; + w = (w << 5) | f5; + w = (w << 5) | f6; + return w; +} + +/* --- 7 fields --- */ + +static inline UInt X_2_6_3_9_2_5_5 ( UInt f1, UInt f2, UInt f3, + UInt f4, UInt f5, UInt f6, UInt f7 ) { + vassert(2+6+3+9+2+5+5 == 32); + vassert(f1 < (1<<2)); + vassert(f2 < (1<<6)); + vassert(f3 < (1<<3)); + vassert(f4 < (1<<9)); + vassert(f5 < (1<<2)); + vassert(f6 < (1<<5)); + vassert(f7 < (1<<5)); + UInt w = 0; + w = (w << 2) | f1; + w = (w << 6) | f2; + w = (w << 3) | f3; + w = (w 
<< 9) | f4; + w = (w << 2) | f5; + w = (w << 5) | f6; + w = (w << 5) | f7; + return w; +} + +static inline UInt X_3_6_1_6_6_5_5 ( UInt f1, UInt f2, UInt f3, + UInt f4, UInt f5, UInt f6, UInt f7 ) { + vassert(3+6+1+6+6+5+5 == 32); + vassert(f1 < (1<<3)); + vassert(f2 < (1<<6)); + vassert(f3 < (1<<1)); + vassert(f4 < (1<<6)); + vassert(f5 < (1<<6)); + vassert(f6 < (1<<5)); + vassert(f7 < (1<<5)); + UInt w = 0; + w = (w << 3) | f1; + w = (w << 6) | f2; + w = (w << 1) | f3; + w = (w << 6) | f4; + w = (w << 6) | f5; + w = (w << 5) | f6; + w = (w << 5) | f7; + return w; +} + + +//ZZ #define X0000 BITS4(0,0,0,0) +//ZZ #define X0001 BITS4(0,0,0,1) +//ZZ #define X0010 BITS4(0,0,1,0) +//ZZ #define X0011 BITS4(0,0,1,1) +//ZZ #define X0100 BITS4(0,1,0,0) +//ZZ #define X0101 BITS4(0,1,0,1) +//ZZ #define X0110 BITS4(0,1,1,0) +//ZZ #define X0111 BITS4(0,1,1,1) +//ZZ #define X1000 BITS4(1,0,0,0) +//ZZ #define X1001 BITS4(1,0,0,1) +//ZZ #define X1010 BITS4(1,0,1,0) +//ZZ #define X1011 BITS4(1,0,1,1) +//ZZ #define X1100 BITS4(1,1,0,0) +//ZZ #define X1101 BITS4(1,1,0,1) +//ZZ #define X1110 BITS4(1,1,1,0) +//ZZ #define X1111 BITS4(1,1,1,1) +/* +#define XXXXX___(zzx7,zzx6,zzx5,zzx4,zzx3) \ + ((((zzx7) & 0xF) << 28) | (((zzx6) & 0xF) << 24) | \ + (((zzx5) & 0xF) << 20) | (((zzx4) & 0xF) << 16) | \ + (((zzx3) & 0xF) << 12)) + +#define XXXXXX__(zzx7,zzx6,zzx5,zzx4,zzx3,zzx2) \ + ((((zzx7) & 0xF) << 28) | (((zzx6) & 0xF) << 24) | \ + (((zzx5) & 0xF) << 20) | (((zzx4) & 0xF) << 16) | \ + (((zzx3) & 0xF) << 12) | (((zzx2) & 0xF) << 8)) + +#define XXXXX__X(zzx7,zzx6,zzx5,zzx4,zzx3,zzx0) \ + ((((zzx7) & 0xF) << 28) | (((zzx6) & 0xF) << 24) | \ + (((zzx5) & 0xF) << 20) | (((zzx4) & 0xF) << 16) | \ + (((zzx3) & 0xF) << 12) | (((zzx0) & 0xF) << 0)) + +#define XXX___XX(zzx7,zzx6,zzx5,zzx1,zzx0) \ + ((((zzx7) & 0xF) << 28) | (((zzx6) & 0xF) << 24) | \ + (((zzx5) & 0xF) << 20) | (((zzx1) & 0xF) << 4) | \ + (((zzx0) & 0xF) << 0)) + +#define XXXXXXXX(zzx7,zzx6,zzx5,zzx4,zzx3,zzx2,zzx1,zzx0) \ + ((((zzx7) & 0xF) << 28) | (((zzx6) & 0xF) << 24) | \ + (((zzx5) & 0xF) << 20) | (((zzx4) & 0xF) << 16) | \ + (((zzx3) & 0xF) << 12) | (((zzx2) & 0xF) << 8) | \ + (((zzx1) & 0xF) << 4) | (((zzx0) & 0xF) << 0)) + +#define XX______(zzx7,zzx6) \ + ((((zzx7) & 0xF) << 28) | (((zzx6) & 0xF) << 24)) +*/ +//ZZ /* Generate a skeletal insn that involves an a RI84 shifter operand. +//ZZ Returns a word which is all zeroes apart from bits 25 and 11..0, +//ZZ since it is those that encode the shifter operand (at least to the +//ZZ extent that we care about it.) */ +//ZZ static UInt skeletal_RI84 ( ARMRI84* ri ) +//ZZ { +//ZZ UInt instr; +//ZZ if (ri->tag == ARMri84_I84) { +//ZZ vassert(0 == (ri->ARMri84.I84.imm4 & ~0x0F)); +//ZZ vassert(0 == (ri->ARMri84.I84.imm8 & ~0xFF)); +//ZZ instr = 1 << 25; +//ZZ instr |= (ri->ARMri84.I84.imm4 << 8); +//ZZ instr |= ri->ARMri84.I84.imm8; +//ZZ } else { +//ZZ instr = 0 << 25; +//ZZ instr |= iregNo(ri->ARMri84.R.reg); +//ZZ } +//ZZ return instr; +//ZZ } +//ZZ +//ZZ /* Ditto for RI5. Resulting word is zeroes apart from bit 4 and bits +//ZZ 11..7. */ +//ZZ static UInt skeletal_RI5 ( ARMRI5* ri ) +//ZZ { +//ZZ UInt instr; +//ZZ if (ri->tag == ARMri5_I5) { +//ZZ UInt imm5 = ri->ARMri5.I5.imm5; +//ZZ vassert(imm5 >= 1 && imm5 <= 31); +//ZZ instr = 0 << 4; +//ZZ instr |= imm5 << 7; +//ZZ } else { +//ZZ instr = 1 << 4; +//ZZ instr |= iregNo(ri->ARMri5.R.reg) << 8; +//ZZ } +//ZZ return instr; +//ZZ } + + +/* Get an immediate into a register, using only that register. 
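   As a worked example of the MOVZ/MOVK scheme below: imm64 = 0x0000444400002222
   has nonzero halfwords only at positions 0 and 2, so the emitted sequence is

      movz xD, #0x2222, lsl #0
      movk xD, #0x4444, lsl #32

   -- two instructions in total; halfwords that are zero are simply skipped,
   and imm64 == 0 is special-cased to a single "movz xD, #0".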
*/ +static UInt* imm64_to_iregNo ( UInt* p, Int xD, ULong imm64 ) +{ + if (imm64 == 0) { + // This has to be special-cased, since the logic below + // will leave the register unchanged in this case. + // MOVZ xD, #0, LSL #0 + *p++ = X_3_6_2_16_5(X110, X100101, X00, 0/*imm16*/, xD); + return p; + } + + // There must be at least one non-zero halfword. Find the + // lowest nonzero such, and use MOVZ to install it and zero + // out the rest of the register. + UShort h[4]; + h[3] = (UShort)((imm64 >> 48) & 0xFFFF); + h[2] = (UShort)((imm64 >> 32) & 0xFFFF); + h[1] = (UShort)((imm64 >> 16) & 0xFFFF); + h[0] = (UShort)((imm64 >> 0) & 0xFFFF); + + UInt i; + for (i = 0; i < 4; i++) { + if (h[i] != 0) + break; + } + vassert(i < 4); + + // MOVZ xD, h[i], LSL (16*i) + *p++ = X_3_6_2_16_5(X110, X100101, i, h[i], xD); + + // Work on upwards through h[i], using MOVK to stuff in any + // remaining nonzero elements. + i++; + for (; i < 4; i++) { + if (h[i] == 0) + continue; + // MOVK xD, h[i], LSL (16*i) + *p++ = X_3_6_2_16_5(X111, X100101, i, h[i], xD); + } + + return p; +} + +/* Get an immediate into a register, using only that register, and + generating exactly 4 instructions, regardless of the value of the + immediate. This is used when generating sections of code that need + to be patched later, so as to guarantee a specific size. */ +static UInt* imm64_to_iregNo_EXACTLY4 ( UInt* p, Int xD, ULong imm64 ) +{ + UShort h[4]; + h[3] = (UShort)((imm64 >> 48) & 0xFFFF); + h[2] = (UShort)((imm64 >> 32) & 0xFFFF); + h[1] = (UShort)((imm64 >> 16) & 0xFFFF); + h[0] = (UShort)((imm64 >> 0) & 0xFFFF); + // Work on upwards through h[i], using MOVK to stuff in the + // remaining elements. + UInt i; + for (i = 0; i < 4; i++) { + if (i == 0) { + // MOVZ xD, h[0], LSL (16*0) + *p++ = X_3_6_2_16_5(X110, X100101, i, h[i], xD); + } else { + // MOVK xD, h[i], LSL (16*i) + *p++ = X_3_6_2_16_5(X111, X100101, i, h[i], xD); + } + } + return p; +} + +/* Check whether p points at a 4-insn sequence cooked up by + imm64_to_iregNo_EXACTLY4(). */ +static Bool is_imm64_to_iregNo_EXACTLY4 ( UInt* p, Int xD, ULong imm64 ) +{ + UShort h[4]; + h[3] = (UShort)((imm64 >> 48) & 0xFFFF); + h[2] = (UShort)((imm64 >> 32) & 0xFFFF); + h[1] = (UShort)((imm64 >> 16) & 0xFFFF); + h[0] = (UShort)((imm64 >> 0) & 0xFFFF); + // Work on upwards through h[i], using MOVK to stuff in the + // remaining elements. + UInt i; + for (i = 0; i < 4; i++) { + UInt expected; + if (i == 0) { + // MOVZ xD, h[0], LSL (16*0) + expected = X_3_6_2_16_5(X110, X100101, i, h[i], xD); + } else { + // MOVK xD, h[i], LSL (16*i) + expected = X_3_6_2_16_5(X111, X100101, i, h[i], xD); + } + if (p[i] != expected) + return False; + } + return True; +} + + +/* Generate a 8 bit store or 8-to-64 unsigned widening load from/to + rD, using the given amode for the address. */ +static UInt* do_load_or_store8 ( UInt* p, + Bool isLoad, UInt wD, ARM64AMode* am ) +{ + vassert(wD <= 30); + if (am->tag == ARM64am_RI9) { + /* STURB Wd, [Xn|SP + simm9]: 00 111000 000 simm9 00 n d + LDURB Wd, [Xn|SP + simm9]: 00 111000 010 simm9 00 n d + */ + Int simm9 = am->ARM64am.RI9.simm9; + vassert(-256 <= simm9 && simm9 <= 255); + UInt instr = X_2_6_3_9_2_5_5(X00, X111000, isLoad ? 
X010 : X000, + simm9 & 0x1FF, X00, + iregNo(am->ARM64am.RI9.reg), wD); + *p++ = instr; + return p; + } + if (am->tag == ARM64am_RI12) { + /* STRB Wd, [Xn|SP + uimm12 * 1]: 00 111 001 00 imm12 n d + LDRB Wd, [Xn|SP + uimm12 * 1]: 00 111 001 01 imm12 n d + */ + UInt uimm12 = am->ARM64am.RI12.uimm12; + UInt scale = am->ARM64am.RI12.szB; + vassert(scale == 1); /* failure of this is serious. Do not ignore. */ + UInt xN = iregNo(am->ARM64am.RI12.reg); + vassert(xN <= 30); + UInt instr = X_2_6_2_12_5_5(X00, X111001, isLoad ? X01 : X00, + uimm12, xN, wD); + *p++ = instr; + return p; + } + if (am->tag == ARM64am_RR) { + /* STRB Xd, [Xn|SP, Xm]: 00 111 000 001 m 011 0 10 n d + LDRB Xd, [Xn|SP, Xm]: 00 111 000 011 m 011 0 10 n d + */ + UInt xN = iregNo(am->ARM64am.RR.base); + UInt xM = iregNo(am->ARM64am.RR.index); + vassert(xN <= 30); + UInt instr = X_3_8_5_6_5_5(X001, isLoad ? X11000011 : X11000001, + xM, X011010, xN, wD); + *p++ = instr; + return p; + } + vpanic("do_load_or_store8"); + vassert(0); +} + + +/* Generate a 16 bit store or 16-to-64 unsigned widening load from/to + rD, using the given amode for the address. */ +static UInt* do_load_or_store16 ( UInt* p, + Bool isLoad, UInt wD, ARM64AMode* am ) +{ + vassert(wD <= 30); + if (am->tag == ARM64am_RI9) { + /* STURH Wd, [Xn|SP + simm9]: 01 111000 000 simm9 00 n d + LDURH Wd, [Xn|SP + simm9]: 01 111000 010 simm9 00 n d + */ + Int simm9 = am->ARM64am.RI9.simm9; + vassert(-256 <= simm9 && simm9 <= 255); + UInt instr = X_2_6_3_9_2_5_5(X01, X111000, isLoad ? X010 : X000, + simm9 & 0x1FF, X00, + iregNo(am->ARM64am.RI9.reg), wD); + *p++ = instr; + return p; + } + if (am->tag == ARM64am_RI12) { + /* STRH Wd, [Xn|SP + uimm12 * 2]: 01 111 001 00 imm12 n d + LDRH Wd, [Xn|SP + uimm12 * 2]: 01 111 001 01 imm12 n d + */ + UInt uimm12 = am->ARM64am.RI12.uimm12; + UInt scale = am->ARM64am.RI12.szB; + vassert(scale == 2); /* failure of this is serious. Do not ignore. */ + UInt xN = iregNo(am->ARM64am.RI12.reg); + vassert(xN <= 30); + UInt instr = X_2_6_2_12_5_5(X01, X111001, isLoad ? X01 : X00, + uimm12, xN, wD); + *p++ = instr; + return p; + } + if (am->tag == ARM64am_RR) { + /* STRH Xd, [Xn|SP, Xm]: 01 111 000 001 m 011 0 10 n d + LDRH Xd, [Xn|SP, Xm]: 01 111 000 011 m 011 0 10 n d + */ + UInt xN = iregNo(am->ARM64am.RR.base); + UInt xM = iregNo(am->ARM64am.RR.index); + vassert(xN <= 30); + UInt instr = X_3_8_5_6_5_5(X011, isLoad ? X11000011 : X11000001, + xM, X011010, xN, wD); + *p++ = instr; + return p; + } + vpanic("do_load_or_store16"); + vassert(0); +} + + +/* Generate a 32 bit store or 32-to-64 unsigned widening load from/to + rD, using the given amode for the address. */ +static UInt* do_load_or_store32 ( UInt* p, + Bool isLoad, UInt wD, ARM64AMode* am ) +{ + vassert(wD <= 30); + if (am->tag == ARM64am_RI9) { + /* STUR Wd, [Xn|SP + simm9]: 10 111000 000 simm9 00 n d + LDUR Wd, [Xn|SP + simm9]: 10 111000 010 simm9 00 n d + */ + Int simm9 = am->ARM64am.RI9.simm9; + vassert(-256 <= simm9 && simm9 <= 255); + UInt instr = X_2_6_3_9_2_5_5(X10, X111000, isLoad ? X010 : X000, + simm9 & 0x1FF, X00, + iregNo(am->ARM64am.RI9.reg), wD); + *p++ = instr; + return p; + } + if (am->tag == ARM64am_RI12) { + /* STR Wd, [Xn|SP + uimm12 * 4]: 10 111 001 00 imm12 n d + LDR Wd, [Xn|SP + uimm12 * 4]: 10 111 001 01 imm12 n d + */ + UInt uimm12 = am->ARM64am.RI12.uimm12; + UInt scale = am->ARM64am.RI12.szB; + vassert(scale == 4); /* failure of this is serious. Do not ignore. 
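      (The RI12 form encodes an unsigned 12-bit offset implicitly scaled by
      the access size, so the amode's szB must match the width of the access:
      1, 2, 4 and 8 for the 8/16/32/64-bit variants respectively.)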
*/ + UInt xN = iregNo(am->ARM64am.RI12.reg); + vassert(xN <= 30); + UInt instr = X_2_6_2_12_5_5(X10, X111001, isLoad ? X01 : X00, + uimm12, xN, wD); + *p++ = instr; + return p; + } + if (am->tag == ARM64am_RR) { + /* STR Wd, [Xn|SP, Xm]: 10 111 000 001 m 011 0 10 n d + LDR Wd, [Xn|SP, Xm]: 10 111 000 011 m 011 0 10 n d + */ + UInt xN = iregNo(am->ARM64am.RR.base); + UInt xM = iregNo(am->ARM64am.RR.index); + vassert(xN <= 30); + UInt instr = X_3_8_5_6_5_5(X101, isLoad ? X11000011 : X11000001, + xM, X011010, xN, wD); + *p++ = instr; + return p; + } + vpanic("do_load_or_store32"); + vassert(0); +} + + +/* Generate a 64 bit load or store to/from xD, using the given amode + for the address. */ +static UInt* do_load_or_store64 ( UInt* p, + Bool isLoad, UInt xD, ARM64AMode* am ) +{ + /* In all these cases, Rn can't be 31 since that means SP. */ + vassert(xD <= 30); + if (am->tag == ARM64am_RI9) { + /* STUR Xd, [Xn|SP + simm9]: 11 111000 000 simm9 00 n d + LDUR Xd, [Xn|SP + simm9]: 11 111000 010 simm9 00 n d + */ + Int simm9 = am->ARM64am.RI9.simm9; + vassert(-256 <= simm9 && simm9 <= 255); + UInt xN = iregNo(am->ARM64am.RI9.reg); + vassert(xN <= 30); + UInt instr = X_2_6_3_9_2_5_5(X11, X111000, isLoad ? X010 : X000, + simm9 & 0x1FF, X00, xN, xD); + *p++ = instr; + return p; + } + if (am->tag == ARM64am_RI12) { + /* STR Xd, [Xn|SP + uimm12 * 8]: 11 111 001 00 imm12 n d + LDR Xd, [Xn|SP + uimm12 * 8]: 11 111 001 01 imm12 n d + */ + UInt uimm12 = am->ARM64am.RI12.uimm12; + UInt scale = am->ARM64am.RI12.szB; + vassert(scale == 8); /* failure of this is serious. Do not ignore. */ + UInt xN = iregNo(am->ARM64am.RI12.reg); + vassert(xN <= 30); + UInt instr = X_2_6_2_12_5_5(X11, X111001, isLoad ? X01 : X00, + uimm12, xN, xD); + *p++ = instr; + return p; + } + if (am->tag == ARM64am_RR) { + /* STR Xd, [Xn|SP, Xm]: 11 111 000 001 m 011 0 10 n d + LDR Xd, [Xn|SP, Xm]: 11 111 000 011 m 011 0 10 n d + */ + UInt xN = iregNo(am->ARM64am.RR.base); + UInt xM = iregNo(am->ARM64am.RR.index); + vassert(xN <= 30); + UInt instr = X_3_8_5_6_5_5(X111, isLoad ? X11000011 : X11000001, + xM, X011010, xN, xD); + *p++ = instr; + return p; + } + vpanic("do_load_or_store64"); + vassert(0); +} + + +/* Emit an instruction into buf and return the number of bytes used. + Note that buf is not the insn's final place, and therefore it is + imperative to emit position-independent code. If the emitted + instruction was a profiler inc, set *is_profInc to True, else + leave it unchanged. */ + +Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, + UChar* buf, Int nbuf, ARM64Instr* i, + Bool mode64, + void* disp_cp_chain_me_to_slowEP, + void* disp_cp_chain_me_to_fastEP, + void* disp_cp_xindir, + void* disp_cp_xassisted ) +{ + UInt* p = (UInt*)buf; + vassert(nbuf >= 32); + vassert(mode64 == True); + vassert(0 == (((HWord)buf) & 3)); + + switch (i->tag) { + case ARM64in_Arith: { + UInt rD = iregNo(i->ARM64in.Arith.dst); + UInt rN = iregNo(i->ARM64in.Arith.argL); + ARM64RIA* argR = i->ARM64in.Arith.argR; + switch (argR->tag) { + case ARM64riA_I12: + *p++ = X_2_6_2_12_5_5( + i->ARM64in.Arith.isAdd ? X10 : X11, + X010001, + argR->ARM64riA.I12.shift == 12 ? X01 : X00, + argR->ARM64riA.I12.imm12, rN, rD + ); + break; + case ARM64riA_R: { + UInt rM = iregNo(i->ARM64in.Arith.argR->ARM64riA.R.reg); + *p++ = X_3_8_5_6_5_5( + i->ARM64in.Arith.isAdd ? 
X100 : X110, + X01011000, rM, X000000, rN, rD + ); + break; + } + default: + goto bad; + } + goto done; + } + case ARM64in_Cmp: { + UInt rD = 31; /* XZR, we are going to dump the result */ + UInt rN = iregNo(i->ARM64in.Cmp.argL); + ARM64RIA* argR = i->ARM64in.Cmp.argR; + Bool is64 = i->ARM64in.Cmp.is64; + switch (argR->tag) { + case ARM64riA_I12: + /* 1 11 10001 sh imm12 Rn Rd = SUBS Xd, Xn, #imm */ + /* 0 11 10001 sh imm12 Rn Rd = SUBS Wd, Wn, #imm */ + *p++ = X_2_6_2_12_5_5( + is64 ? X11 : X01, X110001, + argR->ARM64riA.I12.shift == 12 ? X01 : X00, + argR->ARM64riA.I12.imm12, rN, rD); + break; + case ARM64riA_R: { + /* 1 11 01011 00 0 Rm 000000 Rn Rd = SUBS Xd, Xn, Xm */ + /* 0 11 01011 00 0 Rm 000000 Rn Rd = SUBS Wd, Wn, Wm */ + UInt rM = iregNo(i->ARM64in.Cmp.argR->ARM64riA.R.reg); + *p++ = X_3_8_5_6_5_5(is64 ? X111 : X011, + X01011000, rM, X000000, rN, rD); + break; + } + default: + goto bad; + } + goto done; + } + case ARM64in_Logic: { + UInt rD = iregNo(i->ARM64in.Logic.dst); + UInt rN = iregNo(i->ARM64in.Logic.argL); + ARM64RIL* argR = i->ARM64in.Logic.argR; + UInt opc = 0; /* invalid */ + vassert(rD < 31); + vassert(rN < 31); + switch (i->ARM64in.Logic.op) { + case ARM64lo_OR: opc = X101; break; + case ARM64lo_AND: opc = X100; break; + case ARM64lo_XOR: opc = X110; break; + default: break; + } + vassert(opc != 0); + switch (argR->tag) { + case ARM64riL_I13: { + /* 1 01 100100 N immR immS Rn Rd = ORR , Xn, #imm */ + /* 1 00 100100 N immR immS Rn Rd = AND , Xn, #imm */ + /* 1 10 100100 N immR immS Rn Rd = EOR , Xn, #imm */ + *p++ = X_3_6_1_6_6_5_5( + opc, X100100, argR->ARM64riL.I13.bitN, + argR->ARM64riL.I13.immR, argR->ARM64riL.I13.immS, + rN, rD + ); + break; + } + case ARM64riL_R: { + /* 1 01 01010 00 0 m 000000 n d = ORR Xd, Xn, Xm */ + /* 1 00 01010 00 0 m 000000 n d = AND Xd, Xn, Xm */ + /* 1 10 01010 00 0 m 000000 n d = EOR Xd, Xn, Xm */ + UInt rM = iregNo(argR->ARM64riL.R.reg); + vassert(rM < 31); + *p++ = X_3_8_5_6_5_5(opc, X01010000, rM, X000000, rN, rD); + break; + } + default: + goto bad; + } + goto done; + } + case ARM64in_Test: { + UInt rD = 31; /* XZR, we are going to dump the result */ + UInt rN = iregNo(i->ARM64in.Test.argL); + ARM64RIL* argR = i->ARM64in.Test.argR; + switch (argR->tag) { + case ARM64riL_I13: { + /* 1 11 100100 N immR immS Rn Rd = ANDS Xd, Xn, #imm */ + *p++ = X_3_6_1_6_6_5_5( + X111, X100100, argR->ARM64riL.I13.bitN, + argR->ARM64riL.I13.immR, argR->ARM64riL.I13.immS, + rN, rD + ); + break; + } + default: + goto bad; + } + goto done; + } + case ARM64in_Shift: { + UInt rD = iregNo(i->ARM64in.Shift.dst); + UInt rN = iregNo(i->ARM64in.Shift.argL); + ARM64RI6* argR = i->ARM64in.Shift.argR; + vassert(rD < 31); + vassert(rN < 31); + switch (argR->tag) { + case ARM64ri6_I6: { + /* 110 1001101 (63-sh) (64-sh) nn dd LSL Xd, Xn, sh */ + /* 110 1001101 sh 63 nn dd LSR Xd, Xn, sh */ + /* 100 1001101 sh 63 nn dd ASR Xd, Xn, sh */ + UInt sh = argR->ARM64ri6.I6.imm6; + vassert(sh > 0 && sh < 64); + switch (i->ARM64in.Shift.op) { + case ARM64sh_SHL: + *p++ = X_3_6_1_6_6_5_5(X110, X100110, + 1, 64-sh, 63-sh, rN, rD); + break; + case ARM64sh_SHR: + *p++ = X_3_6_1_6_6_5_5(X110, X100110, 1, sh, 63, rN, rD); + break; + case ARM64sh_SAR: + *p++ = X_3_6_1_6_6_5_5(X100, X100110, 1, sh, 63, rN, rD); + break; + default: + vassert(0); + } + break; + } + case ARM64ri6_R: { + /* 100 1101 0110 mm 001000 nn dd LSL Xd, Xn, Xm */ + /* 100 1101 0110 mm 001001 nn dd LSR Xd, Xn, Xm */ + /* 100 1101 0110 mm 001010 nn dd ASR Xd, Xn, Xm */ + UInt rM = iregNo(argR->ARM64ri6.R.reg); + 
vassert(rM < 31); + UInt subOpc = 0; + switch (i->ARM64in.Shift.op) { + case ARM64sh_SHL: subOpc = X001000; break; + case ARM64sh_SHR: subOpc = X001001; break; + case ARM64sh_SAR: subOpc = X001010; break; + default: vassert(0); + } + *p++ = X_3_8_5_6_5_5(X100, X11010110, rM, subOpc, rN, rD); + break; + } + default: + vassert(0); + } + goto done; + } + case ARM64in_Unary: { + UInt rDst = iregNo(i->ARM64in.Unary.dst); + UInt rSrc = iregNo(i->ARM64in.Unary.src); + switch (i->ARM64in.Unary.op) { + case ARM64un_CLZ: + /* 1 10 1101 0110 00000 00010 0 nn dd CLZ Xd, Xn */ + /* 1 10 1101 0110 00000 00010 1 nn dd CLS Xd, Xn (unimp) */ + *p++ = X_3_8_5_6_5_5(X110, + X11010110, X00000, X000100, rSrc, rDst); + goto done; + case ARM64un_NEG: + /* 1 10 01011 000 m 000000 11111 d NEG Xd,Xm */ + /* 0 10 01011 000 m 000000 11111 d NEG Wd,Wm (unimp) */ + *p++ = X_3_8_5_6_5_5(X110, + X01011000, rSrc, X000000, X11111, rDst); + goto done; + case ARM64un_NOT: { + /* 1 01 01010 00 1 m 000000 11111 d MVN Xd,Xm */ + *p++ = X_3_8_5_6_5_5(X101, + X01010001, rSrc, X000000, X11111, rDst); + goto done; + } + default: + break; + } + goto bad; + } + case ARM64in_MovI: { + /* We generate the "preferred form", ORR Xd, XZR, Xm + 101 01010 00 0 m 000000 11111 d + */ + UInt instr = 0xAA0003E0; + UInt d = iregNo(i->ARM64in.MovI.dst); + UInt m = iregNo(i->ARM64in.MovI.src); + *p++ = instr | ((m & 31) << 16) | ((d & 31) << 0); + goto done; + } + case ARM64in_Imm64: { + p = imm64_to_iregNo( p, iregNo(i->ARM64in.Imm64.dst), + i->ARM64in.Imm64.imm64 ); + goto done; + } + case ARM64in_LdSt64: { + p = do_load_or_store64( p, i->ARM64in.LdSt64.isLoad, + iregNo(i->ARM64in.LdSt64.rD), + i->ARM64in.LdSt64.amode ); + goto done; + } + case ARM64in_LdSt32: { + p = do_load_or_store32( p, i->ARM64in.LdSt32.isLoad, + iregNo(i->ARM64in.LdSt32.rD), + i->ARM64in.LdSt32.amode ); + goto done; + } + case ARM64in_LdSt16: { + p = do_load_or_store16( p, i->ARM64in.LdSt16.isLoad, + iregNo(i->ARM64in.LdSt16.rD), + i->ARM64in.LdSt16.amode ); + goto done; + } + case ARM64in_LdSt8: { + p = do_load_or_store8( p, i->ARM64in.LdSt8.isLoad, + iregNo(i->ARM64in.LdSt8.rD), + i->ARM64in.LdSt8.amode ); + goto done; + } +//ZZ case ARMin_LdSt32: +//ZZ case ARMin_LdSt8U: { +//ZZ UInt bL, bB; +//ZZ HReg rD; +//ZZ ARMAMode1* am; +//ZZ ARMCondCode cc; +//ZZ if (i->tag == ARMin_LdSt32) { +//ZZ bB = 0; +//ZZ bL = i->ARMin.LdSt32.isLoad ? 1 : 0; +//ZZ am = i->ARMin.LdSt32.amode; +//ZZ rD = i->ARMin.LdSt32.rD; +//ZZ cc = i->ARMin.LdSt32.cc; +//ZZ } else { +//ZZ bB = 1; +//ZZ bL = i->ARMin.LdSt8U.isLoad ? 1 : 0; +//ZZ am = i->ARMin.LdSt8U.amode; +//ZZ rD = i->ARMin.LdSt8U.rD; +//ZZ cc = i->ARMin.LdSt8U.cc; +//ZZ } +//ZZ vassert(cc != ARMcc_NV); +//ZZ if (am->tag == ARMam1_RI) { +//ZZ Int simm12; +//ZZ UInt instr, bP; +//ZZ if (am->ARMam1.RI.simm13 < 0) { +//ZZ bP = 0; +//ZZ simm12 = -am->ARMam1.RI.simm13; +//ZZ } else { +//ZZ bP = 1; +//ZZ simm12 = am->ARMam1.RI.simm13; +//ZZ } +//ZZ vassert(simm12 >= 0 && simm12 <= 4095); +//ZZ instr = XXXXX___(cc,X0101,BITS4(bP,bB,0,bL), +//ZZ iregNo(am->ARMam1.RI.reg), +//ZZ iregNo(rD)); +//ZZ instr |= simm12; +//ZZ *p++ = instr; +//ZZ goto done; +//ZZ } else { +//ZZ // RR case +//ZZ goto bad; +//ZZ } +//ZZ } +//ZZ case ARMin_LdSt16: { +//ZZ HReg rD = i->ARMin.LdSt16.rD; +//ZZ UInt bS = i->ARMin.LdSt16.signedLoad ? 1 : 0; +//ZZ UInt bL = i->ARMin.LdSt16.isLoad ? 
1 : 0; +//ZZ ARMAMode2* am = i->ARMin.LdSt16.amode; +//ZZ ARMCondCode cc = i->ARMin.LdSt16.cc; +//ZZ vassert(cc != ARMcc_NV); +//ZZ if (am->tag == ARMam2_RI) { +//ZZ HReg rN = am->ARMam2.RI.reg; +//ZZ Int simm8; +//ZZ UInt bP, imm8hi, imm8lo, instr; +//ZZ if (am->ARMam2.RI.simm9 < 0) { +//ZZ bP = 0; +//ZZ simm8 = -am->ARMam2.RI.simm9; +//ZZ } else { +//ZZ bP = 1; +//ZZ simm8 = am->ARMam2.RI.simm9; +//ZZ } +//ZZ vassert(simm8 >= 0 && simm8 <= 255); +//ZZ imm8hi = (simm8 >> 4) & 0xF; +//ZZ imm8lo = simm8 & 0xF; +//ZZ vassert(!(bL == 0 && bS == 1)); // "! signed store" +//ZZ /**/ if (bL == 0 && bS == 0) { +//ZZ // strh +//ZZ instr = XXXXXXXX(cc,X0001, BITS4(bP,1,0,0), iregNo(rN), +//ZZ iregNo(rD), imm8hi, X1011, imm8lo); +//ZZ *p++ = instr; +//ZZ goto done; +//ZZ } +//ZZ else if (bL == 1 && bS == 0) { +//ZZ // ldrh +//ZZ instr = XXXXXXXX(cc,X0001, BITS4(bP,1,0,1), iregNo(rN), +//ZZ iregNo(rD), imm8hi, X1011, imm8lo); +//ZZ *p++ = instr; +//ZZ goto done; +//ZZ } +//ZZ else if (bL == 1 && bS == 1) { +//ZZ // ldrsh +//ZZ instr = XXXXXXXX(cc,X0001, BITS4(bP,1,0,1), iregNo(rN), +//ZZ iregNo(rD), imm8hi, X1111, imm8lo); +//ZZ *p++ = instr; +//ZZ goto done; +//ZZ } +//ZZ else vassert(0); // ill-constructed insn +//ZZ } else { +//ZZ // RR case +//ZZ goto bad; +//ZZ } +//ZZ } +//ZZ case ARMin_Ld8S: { +//ZZ HReg rD = i->ARMin.Ld8S.rD; +//ZZ ARMAMode2* am = i->ARMin.Ld8S.amode; +//ZZ ARMCondCode cc = i->ARMin.Ld8S.cc; +//ZZ vassert(cc != ARMcc_NV); +//ZZ if (am->tag == ARMam2_RI) { +//ZZ HReg rN = am->ARMam2.RI.reg; +//ZZ Int simm8; +//ZZ UInt bP, imm8hi, imm8lo, instr; +//ZZ if (am->ARMam2.RI.simm9 < 0) { +//ZZ bP = 0; +//ZZ simm8 = -am->ARMam2.RI.simm9; +//ZZ } else { +//ZZ bP = 1; +//ZZ simm8 = am->ARMam2.RI.simm9; +//ZZ } +//ZZ vassert(simm8 >= 0 && simm8 <= 255); +//ZZ imm8hi = (simm8 >> 4) & 0xF; +//ZZ imm8lo = simm8 & 0xF; +//ZZ // ldrsb +//ZZ instr = XXXXXXXX(cc,X0001, BITS4(bP,1,0,1), iregNo(rN), +//ZZ iregNo(rD), imm8hi, X1101, imm8lo); +//ZZ *p++ = instr; +//ZZ goto done; +//ZZ } else { +//ZZ // RR case +//ZZ goto bad; +//ZZ } +//ZZ } + + case ARM64in_XDirect: { + /* NB: what goes on here has to be very closely coordinated + with chainXDirect_ARM64 and unchainXDirect_ARM64 below. */ + /* We're generating chain-me requests here, so we need to be + sure this is actually allowed -- no-redir translations + can't use chain-me's. Hence: */ + vassert(disp_cp_chain_me_to_slowEP != NULL); + vassert(disp_cp_chain_me_to_fastEP != NULL); + + /* Use ptmp for backpatching conditional jumps. */ + UInt* ptmp = NULL; + + /* First off, if this is conditional, create a conditional + jump over the rest of it. Or at least, leave a space for + it that we will shortly fill in. */ + if (i->ARM64in.XDirect.cond != ARM64cc_AL) { + vassert(i->ARM64in.XDirect.cond != ARM64cc_NV); + ptmp = p; + *p++ = 0; + } + + /* Update the guest PC. */ + /* imm64 x9, dstGA */ + /* str x9, amPC */ + p = imm64_to_iregNo(p, /*x*/9, i->ARM64in.XDirect.dstGA); + p = do_load_or_store64(p, False/*!isLoad*/, + /*x*/9, i->ARM64in.XDirect.amPC); + + /* --- FIRST PATCHABLE BYTE follows --- */ + /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're + calling to) backs up the return address, so as to find the + address of the first patchable byte. So: don't change the + number of instructions (5) below. 
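      (Those 5 instructions are the fixed 4-insn sequence produced by
      imm64_to_iregNo_EXACTLY4 plus the BLR, i.e. a 20-byte patch area which
      chainXDirect_ARM64/unchainXDirect_ARM64 can later rewrite in place.)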
*/ + /* movw x9, VG_(disp_cp_chain_me_to_{slowEP,fastEP})[15:0] */ + /* movk x9, VG_(disp_cp_chain_me_to_{slowEP,fastEP})[31:15], lsl 16 */ + /* movk x9, VG_(disp_cp_chain_me_to_{slowEP,fastEP})[47:32], lsl 32 */ + /* movk x9, VG_(disp_cp_chain_me_to_{slowEP,fastEP})[63:48], lsl 48 */ + /* blr x9 */ + void* disp_cp_chain_me + = i->ARM64in.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP + : disp_cp_chain_me_to_slowEP; + p = imm64_to_iregNo_EXACTLY4(p, /*x*/9, + Ptr_to_ULong(disp_cp_chain_me)); + *p++ = 0xD63F0120; + /* --- END of PATCHABLE BYTES --- */ + + /* Fix up the conditional jump, if there was one. */ + if (i->ARM64in.XDirect.cond != ARM64cc_AL) { + Int delta = (UChar*)p - (UChar*)ptmp; /* must be signed */ + vassert(delta > 0 && delta < 40); + vassert((delta & 3) == 0); + UInt notCond = 1 ^ (UInt)i->ARM64in.XDirect.cond; + vassert(notCond <= 13); /* Neither AL nor NV */ + vassert(ptmp != NULL); + delta = delta >> 2; + *ptmp = X_8_19_1_4(X01010100, delta & ((1<<19)-1), 0, notCond); + } + goto done; + } + + case ARM64in_XIndir: { + // XIndir is more or less the same as XAssisted, except + // we don't have a trc value to hand back, so there's no + // write to r21 + /* Use ptmp for backpatching conditional jumps. */ + //UInt* ptmp = NULL; + + /* First off, if this is conditional, create a conditional + jump over the rest of it. Or at least, leave a space for + it that we will shortly fill in. */ + if (i->ARM64in.XIndir.cond != ARM64cc_AL) { + vassert(0); //ATC +//ZZ vassert(i->ARMin.XIndir.cond != ARMcc_NV); +//ZZ ptmp = p; +//ZZ *p++ = 0; + } + + /* Update the guest PC. */ + /* str r-dstGA, amPC */ + p = do_load_or_store64(p, False/*!isLoad*/, + iregNo(i->ARM64in.XIndir.dstGA), + i->ARM64in.XIndir.amPC); + + /* imm64 x9, VG_(disp_cp_xindir) */ + /* br x9 */ + p = imm64_to_iregNo(p, /*x*/9, Ptr_to_ULong(disp_cp_xindir)); + *p++ = 0xD61F0120; /* br x9 */ + + /* Fix up the conditional jump, if there was one. */ + if (i->ARM64in.XIndir.cond != ARM64cc_AL) { + vassert(0); //ATC +//ZZ Int delta = (UChar*)p - (UChar*)ptmp; /* must be signed */ +//ZZ vassert(delta > 0 && delta < 40); +//ZZ vassert((delta & 3) == 0); +//ZZ UInt notCond = 1 ^ (UInt)i->ARMin.XIndir.cond; +//ZZ vassert(notCond <= 13); /* Neither AL nor NV */ +//ZZ delta = (delta >> 2) - 2; +//ZZ *ptmp = XX______(notCond, X1010) | (delta & 0xFFFFFF); + } + goto done; + } + + case ARM64in_XAssisted: { + /* Use ptmp for backpatching conditional jumps. */ + UInt* ptmp = NULL; + + /* First off, if this is conditional, create a conditional + jump over the rest of it. Or at least, leave a space for + it that we will shortly fill in. I think this can only + ever happen when VEX is driven by the switchbacker. */ + if (i->ARM64in.XAssisted.cond != ARM64cc_AL) { + vassert(i->ARM64in.XDirect.cond != ARM64cc_NV); + ptmp = p; + *p++ = 0; + } + + /* Update the guest PC. 
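      (For XAssisted the overall sequence is: store dstGA to the guest PC
      slot, load the VEX_TRC_JMP_* value into x21 so the dispatcher can see
      why the block exited, then jump to disp_cp_xassisted via x9.)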
*/ + /* str r-dstGA, amPC */ + p = do_load_or_store64(p, False/*!isLoad*/, + iregNo(i->ARM64in.XAssisted.dstGA), + i->ARM64in.XAssisted.amPC); + + /* movw r21, $magic_number */ + UInt trcval = 0; + switch (i->ARM64in.XAssisted.jk) { + case Ijk_ClientReq: trcval = VEX_TRC_JMP_CLIENTREQ; break; + case Ijk_Sys_syscall: trcval = VEX_TRC_JMP_SYS_SYSCALL; break; + //case Ijk_Sys_int128: trcval = VEX_TRC_JMP_SYS_INT128; break; + //case Ijk_Yield: trcval = VEX_TRC_JMP_YIELD; break; + //case Ijk_EmWarn: trcval = VEX_TRC_JMP_EMWARN; break; + //case Ijk_MapFail: trcval = VEX_TRC_JMP_MAPFAIL; break; + case Ijk_NoDecode: trcval = VEX_TRC_JMP_NODECODE; break; + case Ijk_InvalICache: trcval = VEX_TRC_JMP_INVALICACHE; break; + case Ijk_FlushDCache: trcval = VEX_TRC_JMP_FLUSHDCACHE; break; + case Ijk_NoRedir: trcval = VEX_TRC_JMP_NOREDIR; break; + //case Ijk_SigTRAP: trcval = VEX_TRC_JMP_SIGTRAP; break; + //case Ijk_SigSEGV: trcval = VEX_TRC_JMP_SIGSEGV; break; + case Ijk_Boring: trcval = VEX_TRC_JMP_BORING; break; + /* We don't expect to see the following being assisted. */ + //case Ijk_Ret: + //case Ijk_Call: + /* fallthrough */ + default: + ppIRJumpKind(i->ARM64in.XAssisted.jk); + vpanic("emit_ARM64Instr.ARM64in_XAssisted: " + "unexpected jump kind"); + } + vassert(trcval != 0); + p = imm64_to_iregNo(p, /*x*/21, (ULong)trcval); + + /* imm64 x9, VG_(disp_cp_xassisted) */ + /* br x9 */ + p = imm64_to_iregNo(p, /*x*/9, Ptr_to_ULong(disp_cp_xassisted)); + *p++ = 0xD61F0120; /* br x9 */ + + /* Fix up the conditional jump, if there was one. */ + if (i->ARM64in.XAssisted.cond != ARM64cc_AL) { + Int delta = (UChar*)p - (UChar*)ptmp; /* must be signed */ + vassert(delta > 0 && delta < 40); + vassert((delta & 3) == 0); + UInt notCond = 1 ^ (UInt)i->ARM64in.XDirect.cond; + vassert(notCond <= 13); /* Neither AL nor NV */ + vassert(ptmp != NULL); + delta = delta >> 2; + *ptmp = X_8_19_1_4(X01010100, delta & ((1<<19)-1), 0, notCond); + } + goto done; + } + + case ARM64in_CSel: { + /* 100 1101 0100 mm cond 00 nn dd = CSEL Xd, Xn, Xm, cond */ + UInt dd = iregNo(i->ARM64in.CSel.dst); + UInt nn = iregNo(i->ARM64in.CSel.argL); + UInt mm = iregNo(i->ARM64in.CSel.argR); + UInt cond = (UInt)i->ARM64in.CSel.cond; + vassert(dd < 31 && nn < 31 && mm < 31 && cond < 16); + *p++ = X_3_8_5_6_5_5(X100, X11010100, mm, cond << 2, nn, dd); + goto done; + } + + case ARM64in_Call: { + /* We'll use x9 as a scratch register to put the target + address in. */ + if (i->ARM64in.Call.cond != ARM64cc_AL + && i->ARM64in.Call.rloc.pri != RLPri_None) { + /* The call might not happen (it isn't unconditional) and + it returns a result. In this case we will need to + generate a control flow diamond to put 0x555..555 in + the return register(s) in the case where the call + doesn't happen. If this ever becomes necessary, maybe + copy code from the 32-bit ARM equivalent. Until that + day, just give up. */ + goto bad; + } + + UInt* ptmp = NULL; + if (i->ARM64in.Call.cond != ARM64cc_AL) { + /* Create a hole to put a conditional branch in. We'll + patch it once we know the branch length. */ + ptmp = p; + *p++ = 0; + } + + // x9 = &target + p = imm64_to_iregNo( (UInt*)p, + /*x*/9, (ULong)i->ARM64in.Call.target ); + // blr x9 + *p++ = 0xD63F0120; + + // Patch the hole if necessary + if (i->ARM64in.Call.cond != ARM64cc_AL) { + ULong dist = (ULong)(p - ptmp); + /* imm64_to_iregNo produces between 1 and 4 insns, and + then there's the BLR itself. 
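      (dist is measured in 32-bit insn words starting at the hole, and the
      condition is inverted -- 1 ^ cond -- so that the imm64+BLR sequence is
      skipped when the guard is false.)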
Hence: */ + vassert(dist >= 2 && dist <= 5); + vassert(ptmp != NULL); + // 01010100 simm19 0 cond = B.cond (here + simm19 << 2) + *ptmp = X_8_19_1_4(X01010100, dist, 0, + 1 ^ (UInt)i->ARM64in.Call.cond); + } else { + vassert(ptmp == NULL); + } + + goto done; + } + + case ARM64in_AddToSP: { + /* 10,0 10001 00 imm12 11111 11111 ADD xsp, xsp, #imm12 + 11,0 10001 00 imm12 11111 11111 SUB xsp, xsp, #imm12 + */ + Int simm12 = i->ARM64in.AddToSP.simm; + vassert(-4096 < simm12 && simm12 < 4096); + vassert(0 == (simm12 & 0xF)); + if (simm12 >= 0) { + *p++ = X_2_6_2_12_5_5(X10, X010001, X00, simm12, X11111, X11111); + } else { + *p++ = X_2_6_2_12_5_5(X11, X010001, X00, -simm12, X11111, X11111); + } + goto done; + } + + case ARM64in_FromSP: { + /* 10,0 10001 00 0..(12)..0 11111 dd MOV Xd, xsp */ + UInt dd = iregNo(i->ARM64in.FromSP.dst); + vassert(dd < 31); + *p++ = X_2_6_2_12_5_5(X10, X010001, X00, 0, X11111, dd); + goto done; + } + + case ARM64in_Mul: { + /* 100 11011 110 mm 011111 nn dd UMULH Xd, Xn,Xm + 100 11011 010 mm 011111 nn dd SMULH Xd, Xn,Xm + 100 11011 000 mm 011111 nn dd MUL Xd, Xn,Xm + */ + UInt dd = iregNo(i->ARM64in.Mul.dst); + UInt nn = iregNo(i->ARM64in.Mul.argL); + UInt mm = iregNo(i->ARM64in.Mul.argR); + vassert(dd < 31 && nn < 31 && mm < 31); + switch (i->ARM64in.Mul.op) { + case ARM64mul_ZX: + *p++ = X_3_8_5_6_5_5(X100, X11011110, mm, X011111, nn, dd); + goto done; + case ARM64mul_SX: + *p++ = X_3_8_5_6_5_5(X100, X11011010, mm, X011111, nn, dd); + goto done; + case ARM64mul_PLAIN: + *p++ = X_3_8_5_6_5_5(X100, X11011000, mm, X011111, nn, dd); + goto done; + default: + vassert(0); + } + goto bad; + } + case ARM64in_LdrEX: { + /* 085F7C82 ldxrb w2, [x4] + 485F7C82 ldxrh w2, [x4] + 885F7C82 ldxr w2, [x4] + C85F7C82 ldxr x2, [x4] + */ + switch (i->ARM64in.LdrEX.szB) { + case 1: *p++ = 0x085F7C82; goto done; + case 2: *p++ = 0x485F7C82; goto done; + case 4: *p++ = 0x885F7C82; goto done; + case 8: *p++ = 0xC85F7C82; goto done; + default: break; + } + goto bad; + } + case ARM64in_StrEX: { + /* 08007C82 stxrb w0, w2, [x4] + 48007C82 stxrh w0, w2, [x4] + 88007C82 stxr w0, w2, [x4] + C8007C82 stxr w0, x2, [x4] + */ + switch (i->ARM64in.StrEX.szB) { + case 1: *p++ = 0x08007C82; goto done; + case 2: *p++ = 0x48007C82; goto done; + case 4: *p++ = 0x88007C82; goto done; + case 8: *p++ = 0xC8007C82; goto done; + default: break; + } + goto bad; + } + case ARM64in_MFence: { + *p++ = 0xD5033F9F; /* DSB sy */ + *p++ = 0xD5033FBF; /* DMB sy */ + *p++ = 0xD5033FDF; /* ISB */ + goto done; + } + //case ARM64in_CLREX: { + // //ATC, but believed to be correct + // goto bad; + // *p++ = 0xD5033F5F; /* clrex */ + // goto done; + //} + case ARM64in_VLdStS: { + /* 10 111101 01 imm12 n t LDR St, [Xn|SP, #imm12 * 4] + 10 111101 00 imm12 n t STR St, [Xn|SP, #imm12 * 4] + */ + UInt sD = dregNo(i->ARM64in.VLdStS.sD); + UInt rN = iregNo(i->ARM64in.VLdStS.rN); + UInt uimm12 = i->ARM64in.VLdStS.uimm12; + Bool isLD = i->ARM64in.VLdStS.isLoad; + vassert(uimm12 < 16384 && 0 == (uimm12 & 3)); + uimm12 >>= 2; + vassert(uimm12 < (1<<12)); + vassert(sD < 32); + vassert(rN < 31); + *p++ = X_2_6_2_12_5_5(X10, X111101, isLD ? 
X01 : X00, + uimm12, rN, sD); + goto done; + } + case ARM64in_VLdStD: { + /* 11 111101 01 imm12 n t LDR Dt, [Xn|SP, #imm12 * 8] + 11 111101 00 imm12 n t STR Dt, [Xn|SP, #imm12 * 8] + */ + UInt dD = dregNo(i->ARM64in.VLdStD.dD); + UInt rN = iregNo(i->ARM64in.VLdStD.rN); + UInt uimm12 = i->ARM64in.VLdStD.uimm12; + Bool isLD = i->ARM64in.VLdStD.isLoad; + vassert(uimm12 < 32768 && 0 == (uimm12 & 7)); + uimm12 >>= 3; + vassert(uimm12 < (1<<12)); + vassert(dD < 32); + vassert(rN < 31); + *p++ = X_2_6_2_12_5_5(X11, X111101, isLD ? X01 : X00, + uimm12, rN, dD); + goto done; + } + case ARM64in_VLdStQ: { + /* 0100 1100 0000 0000 0111 11 rN rQ st1 {vQ.2d}, [] + 0100 1100 0100 0000 0111 11 rN rQ ld1 {vQ.2d}, [] + */ + UInt rQ = qregNo(i->ARM64in.VLdStQ.rQ); + UInt rN = iregNo(i->ARM64in.VLdStQ.rN); + vassert(rQ < 32); + vassert(rN < 31); + if (i->ARM64in.VLdStQ.isLoad) { + *p++ = 0x4C407C00 | (rN << 5) | rQ; + } else { + *p++ = 0x4C007C00 | (rN << 5) | rQ; + } + goto done; + } + case ARM64in_VCvtI2F: { + /* 31 28 23 21 20 18 15 9 4 + 000 11110 00 1 00 010 000000 n d SCVTF Sd, Wn + 000 11110 01 1 00 010 000000 n d SCVTF Dd, Wn + 100 11110 00 1 00 010 000000 n d SCVTF Sd, Xn + 100 11110 01 1 00 010 000000 n d SCVTF Dd, Xn + 000 11110 00 1 00 011 000000 n d UCVTF Sd, Wn + 000 11110 01 1 00 011 000000 n d UCVTF Dd, Wn + 100 11110 00 1 00 011 000000 n d UCVTF Sd, Xn + 100 11110 01 1 00 011 000000 n d UCVTF Dd, Xn + */ + UInt rN = iregNo(i->ARM64in.VCvtI2F.rS); + UInt rD = dregNo(i->ARM64in.VCvtI2F.rD); + ARM64CvtOp how = i->ARM64in.VCvtI2F.how; + /* Just handle cases as they show up. */ + switch (how) { + case ARM64cvt_F32_I32S: /* SCVTF Sd, Wn */ + *p++ = X_3_5_8_6_5_5(X000, X11110, X00100010, X000000, rN, rD); + break; + case ARM64cvt_F64_I32S: /* SCVTF Dd, Wn */ + *p++ = X_3_5_8_6_5_5(X000, X11110, X01100010, X000000, rN, rD); + break; + case ARM64cvt_F32_I64S: /* SCVTF Sd, Xn */ + *p++ = X_3_5_8_6_5_5(X100, X11110, X00100010, X000000, rN, rD); + break; + case ARM64cvt_F64_I64S: /* SCVTF Dd, Xn */ + *p++ = X_3_5_8_6_5_5(X100, X11110, X01100010, X000000, rN, rD); + break; + case ARM64cvt_F32_I32U: /* UCVTF Sd, Wn */ + *p++ = X_3_5_8_6_5_5(X000, X11110, X00100011, X000000, rN, rD); + break; + case ARM64cvt_F64_I32U: /* UCVTF Dd, Wn */ + *p++ = X_3_5_8_6_5_5(X000, X11110, X01100011, X000000, rN, rD); + break; + case ARM64cvt_F32_I64U: /* UCVTF Sd, Xn */ + *p++ = X_3_5_8_6_5_5(X100, X11110, X00100011, X000000, rN, rD); + break; + case ARM64cvt_F64_I64U: /* UCVTF Dd, Xn */ + *p++ = X_3_5_8_6_5_5(X100, X11110, X01100011, X000000, rN, rD); + break; + default: + goto bad; //ATC + } + goto done; + } + case ARM64in_VCvtF2I: { + /* 30 23 20 18 15 9 4 + sf 00,11110,0x 1 00 000,000000 n d FCVTNS Rd, Fn (round to + sf 00,11110,0x 1 00 001,000000 n d FCVTNU Rd, Fn nearest) + ---------------- 01 -------------- FCVTP-------- (round to +inf) + ---------------- 10 -------------- FCVTM-------- (round to -inf) + ---------------- 11 -------------- FCVTZ-------- (round to zero) + + Rd is Xd when sf==1, Wd when sf==0 + Fn is Dn when x==1, Sn when x==0 + 20:19 carry the rounding mode, using the same encoding as FPCR + */ + UInt rD = iregNo(i->ARM64in.VCvtF2I.rD); + UInt rN = dregNo(i->ARM64in.VCvtF2I.rS); + ARM64CvtOp how = i->ARM64in.VCvtF2I.how; + UChar armRM = i->ARM64in.VCvtF2I.armRM; + /* Just handle cases as they show up. 
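+         A note on the encodings below: the 8-bit field handed to
+         X_3_5_8_6_5_5 lands at insn bits 23..16, so OR-ing in
+         (armRM << 3) drops the 2-bit rounding mode into bits 20:19,
+         matching the table above (00 nearest, 01 +inf, 10 -inf,
+         11 zero, i.e. the FCVTN / FCVTP / FCVTM / FCVTZ variants).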
*/ + switch (how) { + case ARM64cvt_F64_I32S: /* FCVTxS Wd, Dn */ + *p++ = X_3_5_8_6_5_5(X000, X11110, X01100000 | (armRM << 3), + X000000, rN, rD); + break; + case ARM64cvt_F64_I32U: /* FCVTxU Wd, Dn */ + *p++ = X_3_5_8_6_5_5(X000, X11110, X01100001 | (armRM << 3), + X000000, rN, rD); + break; + case ARM64cvt_F64_I64S: /* FCVTxS Xd, Dn */ + *p++ = X_3_5_8_6_5_5(X100, X11110, X01100000 | (armRM << 3), + X000000, rN, rD); + break; + case ARM64cvt_F64_I64U: /* FCVTxU Xd, Dn */ + *p++ = X_3_5_8_6_5_5(X100, X11110, X01100001 | (armRM << 3), + X000000, rN, rD); + break; + case ARM64cvt_F32_I32S: /* FCVTxS Wd, Sn */ + *p++ = X_3_5_8_6_5_5(X000, X11110, X00100000 | (armRM << 3), + X000000, rN, rD); + break; + case ARM64cvt_F32_I32U: /* FCVTxU Wd, Sn */ + *p++ = X_3_5_8_6_5_5(X000, X11110, X00100001 | (armRM << 3), + X000000, rN, rD); + break; + case ARM64cvt_F32_I64S: /* FCVTxS Xd, Sn */ + *p++ = X_3_5_8_6_5_5(X100, X11110, X00100000 | (armRM << 3), + X000000, rN, rD); + break; + case ARM64cvt_F32_I64U: /* FCVTxU Xd, Sn */ + *p++ = X_3_5_8_6_5_5(X100, X11110, X00100001 | (armRM << 3), + X000000, rN, rD); + break; + default: + goto bad; //ATC + } + goto done; + } + case ARM64in_VCvtSD: { + /* 31 23 21 16 14 9 4 + 000,11110, 00 10001 0,1 10000 n d FCVT Dd, Sn (S->D) + ---------- 01 ----- 0,0 --------- FCVT Sd, Dn (D->S) + Rounding, when dst is smaller than src, is per the FPCR. + */ + UInt dd = dregNo(i->ARM64in.VCvtSD.dst); + UInt nn = dregNo(i->ARM64in.VCvtSD.src); + if (i->ARM64in.VCvtSD.sToD) { + *p++ = X_3_5_8_6_5_5(X000, X11110, X00100010, X110000, nn, dd); + } else { + *p++ = X_3_5_8_6_5_5(X000, X11110, X01100010, X010000, nn, dd); + } + goto done; + } + case ARM64in_VUnaryD: { + /* 31 23 21 16 14 9 4 + 000,11110 01 1,0000 0,0 10000 n d FMOV Dd, Dn (not handled) + ------------------- 0,1 --------- FABS ------ + ------------------- 1,0 --------- FNEG ------ + ------------------- 1,1 --------- FQSRT ----- + */ + UInt dD = dregNo(i->ARM64in.VUnaryD.dst); + UInt dN = dregNo(i->ARM64in.VUnaryD.src); + UInt b16 = 2; /* impossible */ + UInt b15 = 2; /* impossible */ + switch (i->ARM64in.VUnaryD.op) { + case ARM64fpu_NEG: b16 = 1; b15 = 0; break; + case ARM64fpu_SQRT: b16 = 1; b15 = 1; break; + case ARM64fpu_ABS: b16 = 0; b15 = 1; break; + default: break; + } + if (b16 < 2 && b15 < 2) { + *p++ = X_3_8_5_6_5_5(X000, X11110011, (X0000 << 1) | b16, + (b15 << 5) | X10000, dN, dD); + goto done; + } + /* + 000, 11110 01 1,001 11,1 10000 n d FRINTI Dd, Dm (round per FPCR) + */ + if (i->ARM64in.VUnaryD.op == ARM64fpu_RINT) { + *p++ = X_3_8_5_6_5_5(X000, X11110011, X00111, X110000, dN, dD); + goto done; + } + goto bad; + } + case ARM64in_VUnaryS: { + /* 31 23 21 16 14 9 4 + 000,11110 00 1,0000 0,0 10000 n d FMOV Sd, Sn (not handled) + ------------------- 0,1 --------- FABS ------ + ------------------- 1,0 --------- FNEG ------ + ------------------- 1,1 --------- FQSRT ----- + */ + UInt sD = dregNo(i->ARM64in.VUnaryS.dst); + UInt sN = dregNo(i->ARM64in.VUnaryS.src); + UInt b16 = 2; /* impossible */ + UInt b15 = 2; /* impossible */ + switch (i->ARM64in.VUnaryS.op) { + case ARM64fpu_NEG: b16 = 1; b15 = 0; break; + case ARM64fpu_SQRT: b16 = 1; b15 = 1; break; + case ARM64fpu_ABS: b16 = 0; b15 = 1; break; + default: break; + } + if (b16 < 2 && b15 < 2) { + *p++ = X_3_8_5_6_5_5(X000, X11110001, (X0000 << 1) | b16, + (b15 << 5) | X10000, sN, sD); + goto done; + } + /* + 000, 11110 00 1,001 11,1 10000 n d FRINTI Sd, Sm (round per FPCR) + */ + if (i->ARM64in.VUnaryS.op == ARM64fpu_RINT) { + *p++ = X_3_8_5_6_5_5(X000, 
X11110001, X00111, X110000, sN, sD); + goto done; + } + goto bad; + } + case ARM64in_VBinD: { + /* 31 23 20 15 11 9 4 + ---------------- 0000 ------ FMUL -------- + 000 11110 011 m 0001 10 n d FDIV Dd,Dn,Dm + ---------------- 0010 ------ FADD -------- + ---------------- 0011 ------ FSUB -------- + */ + UInt dD = dregNo(i->ARM64in.VBinD.dst); + UInt dN = dregNo(i->ARM64in.VBinD.argL); + UInt dM = dregNo(i->ARM64in.VBinD.argR); + UInt b1512 = 16; /* impossible */ + switch (i->ARM64in.VBinD.op) { + case ARM64fpb_DIV: b1512 = X0001; break; + case ARM64fpb_MUL: b1512 = X0000; break; + case ARM64fpb_SUB: b1512 = X0011; break; + case ARM64fpb_ADD: b1512 = X0010; break; + default: goto bad; + } + vassert(b1512 < 16); + *p++ + = X_3_8_5_6_5_5(X000, X11110011, dM, (b1512 << 2) | X10, dN, dD); + goto done; + } + case ARM64in_VBinS: { + /* 31 23 20 15 11 9 4 + ---------------- 0000 ------ FMUL -------- + 000 11110 001 m 0001 10 n d FDIV Dd,Dn,Dm + ---------------- 0010 ------ FADD -------- + ---------------- 0011 ------ FSUB -------- + */ + UInt sD = dregNo(i->ARM64in.VBinS.dst); + UInt sN = dregNo(i->ARM64in.VBinS.argL); + UInt sM = dregNo(i->ARM64in.VBinS.argR); + UInt b1512 = 16; /* impossible */ + switch (i->ARM64in.VBinS.op) { + case ARM64fpb_DIV: b1512 = X0001; break; + case ARM64fpb_MUL: b1512 = X0000; break; + case ARM64fpb_SUB: b1512 = X0011; break; + case ARM64fpb_ADD: b1512 = X0010; break; + default: goto bad; + } + vassert(b1512 < 16); + *p++ + = X_3_8_5_6_5_5(X000, X11110001, sM, (b1512 << 2) | X10, sN, sD); + goto done; + } + case ARM64in_VCmpD: { + /* 000 11110 01 1 m 00 1000 n 00 000 FCMP Dn, Dm */ + UInt dN = dregNo(i->ARM64in.VCmpD.argL); + UInt dM = dregNo(i->ARM64in.VCmpD.argR); + *p++ = X_3_8_5_6_5_5(X000, X11110011, dM, X001000, dN, X00000); + goto done; + } + case ARM64in_VCmpS: { + /* 000 11110 00 1 m 00 1000 n 00 000 FCMP Sn, Sm */ + UInt sN = dregNo(i->ARM64in.VCmpS.argL); + UInt sM = dregNo(i->ARM64in.VCmpS.argR); + *p++ = X_3_8_5_6_5_5(X000, X11110001, sM, X001000, sN, X00000); + goto done; + } + case ARM64in_FPCR: { + Bool toFPCR = i->ARM64in.FPCR.toFPCR; + UInt iReg = iregNo(i->ARM64in.FPCR.iReg); + if (toFPCR) { + /* 0xD51B44 000 Rt MSR fpcr, rT */ + *p++ = 0xD51B4400 | (iReg & 0x1F); + goto done; + } + goto bad; // FPCR -> iReg case currently ATC + } + case ARM64in_VBinV: { + /* 31 23 20 15 9 4 + 010 01110 11 1 m 100001 n d ADD Vd.2d, Vn.2d, Vm.2d + 010 01110 10 1 m 100001 n d ADD Vd.4s, Vn.4s, Vm.4s + 010 01110 01 1 m 100001 n d ADD Vd.8h, Vn.8h, Vm.8h + 010 01110 00 1 m 100001 n d ADD Vd.16b, Vn.16b, Vm.16b + + 011 01110 11 1 m 100001 n d SUB Vd.2d, Vn.2d, Vm.2d + 011 01110 10 1 m 100001 n d SUB Vd.4s, Vn.4s, Vm.4s + 011 01110 01 1 m 100001 n d SUB Vd.8h, Vn.8h, Vm.8h + 011 01110 00 1 m 100001 n d SUB Vd.16b, Vn.16b, Vm.16b + + 010 01110 10 1 m 100111 n d MUL Vd.4s, Vn.4s, Vm.4s + 010 01110 01 1 m 100111 n d MUL Vd.8h, Vn.8h, Vm.8h + 010 01110 00 1 m 100111 n d MUL Vd.16b, Vn.16b, Vm.16b + + 010 01110 01 1 m 110101 n d FADD Vd.2d, Vn.2d, Vm.2d + 010 01110 00 1 m 110101 n d FADD Vd.4s, Vn.4s, Vm.4s + 010 01110 11 1 m 110101 n d FSUB Vd.2d, Vn.2d, Vm.2d + 010 01110 10 1 m 110101 n d FSUB Vd.4s, Vn.4s, Vm.4s + + 011 01110 01 1 m 110111 n d FMUL Vd.2d, Vn.2d, Vm.2d + 011 01110 00 1 m 110111 n d FMUL Vd.4s, Vn.4s, Vm.4s + 011 01110 01 1 m 111111 n d FDIV Vd.2d, Vn.2d, Vm.2d + 011 01110 00 1 m 111111 n d FDIV Vd.4s, Vn.4s, Vm.4s + + 011 01110 10 1 m 011001 n d UMAX Vd.4s, Vn.4s, Vm.4s + 011 01110 01 1 m 011001 n d UMAX Vd.8h, Vn.8h, Vm.8h + 011 01110 00 1 m 011001 n d UMAX 
Vd.16b, Vn.16b, Vm.16b + + 011 01110 10 1 m 011011 n d UMIN Vd.4s, Vn.4s, Vm.4s + 011 01110 01 1 m 011011 n d UMIN Vd.8h, Vn.8h, Vm.8h + 011 01110 00 1 m 011011 n d UMIN Vd.16b, Vn.16b, Vm.16b + + 010 01110 10 1 m 011001 n d SMAX Vd.4s, Vn.4s, Vm.4s + 010 01110 01 1 m 011001 n d SMAX Vd.8h, Vn.8h, Vm.8h + 010 01110 00 1 m 011001 n d SMAX Vd.16b, Vn.16b, Vm.16b + + 010 01110 10 1 m 011011 n d SMIN Vd.4s, Vn.4s, Vm.4s + 010 01110 01 1 m 011011 n d SMIN Vd.8h, Vn.8h, Vm.8h + 010 01110 00 1 m 011011 n d SMIN Vd.16b, Vn.16b, Vm.16b + + 010 01110 00 1 m 000111 n d AND Vd, Vn, Vm + 010 01110 10 1 m 000111 n d ORR Vd, Vn, Vm + 011 01110 00 1 m 000111 n d EOR Vd, Vn, Vm + + 011 01110 11 1 m 100011 n d CMEQ Vd.2d, Vn.2d, Vm.2d + 011 01110 10 1 m 100011 n d CMEQ Vd.4s, Vn.4s, Vm.4s + 011 01110 01 1 m 100011 n d CMEQ Vd.8h, Vn.8h, Vm.8h + 011 01110 00 1 m 100011 n d CMEQ Vd.16b, Vn.16b, Vm.16b + + 011 01110 11 1 m 001101 n d CMHI Vd.2d, Vn.2d, Vm.2d + 011 01110 10 1 m 001101 n d CMHI Vd.4s, Vn.4s, Vm.4s + 011 01110 01 1 m 001101 n d CMHI Vd.8h, Vn.8h, Vm.8h + 011 01110 00 1 m 001101 n d CMHI Vd.16b, Vn.16b, Vm.16b + + 010 01110 11 1 m 001101 n d CMGT Vd.2d, Vn.2d, Vm.2d + 010 01110 10 1 m 001101 n d CMGT Vd.4s, Vn.4s, Vm.4s + 010 01110 01 1 m 001101 n d CMGT Vd.8h, Vn.8h, Vm.8h + 010 01110 00 1 m 001101 n d CMGT Vd.16b, Vn.16b, Vm.16b + + 010 01110 01 1 m 111001 n d FCMEQ Vd.2d, Vn.2d, Vm.2d + 010 01110 00 1 m 111001 n d FCMEQ Vd.4s, Vn.4s, Vm.4s + + 011 01110 01 1 m 111001 n d FCMGE Vd.2d, Vn.2d, Vm.2d + 011 01110 00 1 m 111001 n d FCMGE Vd.4s, Vn.4s, Vm.4s + + 011 01110 11 1 m 111001 n d FCMGT Vd.2d, Vn.2d, Vm.2d + 011 01110 10 1 m 111001 n d FCMGT Vd.4s, Vn.4s, Vm.4s + + 010 01110 00 0 m 000000 n d TBL Vd.16b, {Vn.16b}, Vm.16b + + */ + UInt vD = qregNo(i->ARM64in.VBinV.dst); + UInt vN = qregNo(i->ARM64in.VBinV.argL); + UInt vM = qregNo(i->ARM64in.VBinV.argR); + switch (i->ARM64in.VBinV.op) { + case ARM64vecb_ADD64x2: + *p++ = X_3_8_5_6_5_5(X010, X01110111, vM, X100001, vN, vD); + break; + case ARM64vecb_ADD32x4: + *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X100001, vN, vD); + break; + case ARM64vecb_ADD16x8: + *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X100001, vN, vD); + break; + case ARM64vecb_ADD8x16: + *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X100001, vN, vD); + break; + case ARM64vecb_SUB64x2: + *p++ = X_3_8_5_6_5_5(X011, X01110111, vM, X100001, vN, vD); + break; + case ARM64vecb_SUB32x4: + *p++ = X_3_8_5_6_5_5(X011, X01110101, vM, X100001, vN, vD); + break; + case ARM64vecb_SUB16x8: + *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X100001, vN, vD); + break; + case ARM64vecb_SUB8x16: + *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X100001, vN, vD); + break; + case ARM64vecb_MUL32x4: + *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X100111, vN, vD); + break; + case ARM64vecb_MUL16x8: + *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X100111, vN, vD); + break; + case ARM64vecb_MUL8x16: + *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X100111, vN, vD); + break; + case ARM64vecb_FADD64x2: + *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X110101, vN, vD); + break; + case ARM64vecb_FADD32x4: + *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X110101, vN, vD); + break; + case ARM64vecb_FSUB64x2: + *p++ = X_3_8_5_6_5_5(X010, X01110111, vM, X110101, vN, vD); + break; + case ARM64vecb_FSUB32x4: + *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X110101, vN, vD); + break; + case ARM64vecb_FMUL64x2: + *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X110111, vN, vD); + break; + case ARM64vecb_FMUL32x4: + *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X110111, 
vN, vD); + break; + case ARM64vecb_FDIV64x2: + *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X111111, vN, vD); + break; + case ARM64vecb_FDIV32x4: + *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X111111, vN, vD); + break; + + case ARM64vecb_UMAX32x4: + *p++ = X_3_8_5_6_5_5(X011, X01110101, vM, X011001, vN, vD); + break; + case ARM64vecb_UMAX16x8: + *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X011001, vN, vD); + break; + case ARM64vecb_UMAX8x16: + *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X011001, vN, vD); + break; + + case ARM64vecb_UMIN32x4: + *p++ = X_3_8_5_6_5_5(X011, X01110101, vM, X011011, vN, vD); + break; + case ARM64vecb_UMIN16x8: + *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X011011, vN, vD); + break; + case ARM64vecb_UMIN8x16: + *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X011011, vN, vD); + break; + + case ARM64vecb_SMAX32x4: + *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X011001, vN, vD); + break; + case ARM64vecb_SMAX16x8: + *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X011001, vN, vD); + break; + case ARM64vecb_SMAX8x16: + *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X011001, vN, vD); + break; + + case ARM64vecb_SMIN32x4: + *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X011011, vN, vD); + break; + case ARM64vecb_SMIN16x8: + *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X011011, vN, vD); + break; + case ARM64vecb_SMIN8x16: + *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X011011, vN, vD); + break; + + case ARM64vecb_AND: + *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X000111, vN, vD); + break; + case ARM64vecb_ORR: + *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X000111, vN, vD); + break; + case ARM64vecb_XOR: + *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X000111, vN, vD); + break; + + case ARM64vecb_CMEQ64x2: + *p++ = X_3_8_5_6_5_5(X011, X01110111, vM, X100011, vN, vD); + break; + case ARM64vecb_CMEQ32x4: + *p++ = X_3_8_5_6_5_5(X011, X01110101, vM, X100011, vN, vD); + break; + case ARM64vecb_CMEQ16x8: + *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X100011, vN, vD); + break; + case ARM64vecb_CMEQ8x16: + *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X100011, vN, vD); + break; + + case ARM64vecb_CMHI64x2: + *p++ = X_3_8_5_6_5_5(X011, X01110111, vM, X001101, vN, vD); + break; + case ARM64vecb_CMHI32x4: + *p++ = X_3_8_5_6_5_5(X011, X01110101, vM, X001101, vN, vD); + break; + case ARM64vecb_CMHI16x8: + *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X001101, vN, vD); + break; + case ARM64vecb_CMHI8x16: + *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X001101, vN, vD); + break; + + case ARM64vecb_CMGT64x2: + *p++ = X_3_8_5_6_5_5(X010, X01110111, vM, X001101, vN, vD); + break; + case ARM64vecb_CMGT32x4: + *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X001101, vN, vD); + break; + case ARM64vecb_CMGT16x8: + *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X001101, vN, vD); + break; + case ARM64vecb_CMGT8x16: + *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X001101, vN, vD); + break; + + case ARM64vecb_FCMEQ64x2: + *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X111001, vN, vD); + break; + case ARM64vecb_FCMEQ32x4: + *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X111001, vN, vD); + break; + + case ARM64vecb_FCMGE64x2: + *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X111001, vN, vD); + break; + case ARM64vecb_FCMGE32x4: + *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X111001, vN, vD); + break; + + case ARM64vecb_FCMGT64x2: + *p++ = X_3_8_5_6_5_5(X011, X01110111, vM, X111001, vN, vD); + break; + case ARM64vecb_FCMGT32x4: + *p++ = X_3_8_5_6_5_5(X011, X01110101, vM, X111001, vN, vD); + break; + + case ARM64vecb_TBL1: + *p++ = X_3_8_5_6_5_5(X010, X01110000, vM, X000000, vN, vD); + 
break; + + default: + goto bad; + } + goto done; + } + case ARM64in_VUnaryV: { + /* 31 23 20 15 9 4 + 010 01110 11 1 00000 111110 n d FABS Vd.2d, Vn.2d + 010 01110 10 1 00000 111110 n d FABS Vd.4s, Vn.4s + 011 01110 11 1 00000 111110 n d FNEG Vd.2d, Vn.2d + 011 01110 10 1 00000 111110 n d FNEG Vd.4s, Vn.4s + 011 01110 00 1 00000 010110 n d NOT Vd.16b, Vn.16b + */ + UInt vD = qregNo(i->ARM64in.VUnaryV.dst); + UInt vN = qregNo(i->ARM64in.VUnaryV.arg); + switch (i->ARM64in.VUnaryV.op) { + case ARM64vecu_FABS64x2: + *p++ = X_3_8_5_6_5_5(X010, X01110111, X00000, X111110, vN, vD); + break; + case ARM64vecu_FABS32x4: + *p++ = X_3_8_5_6_5_5(X010, X01110101, X00000, X111110, vN, vD); + break; + case ARM64vecu_FNEG64x2: + *p++ = X_3_8_5_6_5_5(X011, X01110111, X00000, X111110, vN, vD); + break; + case ARM64vecu_FNEG32x4: + *p++ = X_3_8_5_6_5_5(X011, X01110101, X00000, X111110, vN, vD); + break; + case ARM64vecu_NOT: + *p++ = X_3_8_5_6_5_5(X011, X01110001, X00000, X010110, vN, vD); + break; + default: + goto bad; + } + goto done; + } + case ARM64in_VNarrowV: { + /* 31 23 21 15 9 4 + 000 01110 00 1,00001 001010 n d XTN Vd.8b, Vn.8h + 000 01110 01 1,00001 001010 n d XTN Vd.4h, Vn.4s + 000 01110 10 1,00001 001010 n d XTN Vd.2s, Vn.2d + */ + UInt vD = qregNo(i->ARM64in.VNarrowV.dst); + UInt vN = qregNo(i->ARM64in.VNarrowV.src); + UInt dszBlg2 = i->ARM64in.VNarrowV.dszBlg2; + vassert(dszBlg2 >= 0 && dszBlg2 <= 2); + *p++ = X_3_8_5_6_5_5(X000, X01110001 | (dszBlg2 << 1), + X00001, X001010, vN, vD); + goto done; + } + case ARM64in_VShiftImmV: { + /* + 0q1 011110 immh immb 000001 n d USHR Vd.T, Vn.T, #sh + 0q0 011110 immh immb 000001 n d SSHR Vd.T, Vn.T, #sh + where immh:immb + = case T of + 2d | sh in 1..63 -> let xxxxxx = 64-sh in 1xxx:xxx + 4s | sh in 1..31 -> let xxxxx = 32-sh in 01xx:xxx + 8h | sh in 1..15 -> let xxxx = 16-sh in 001x:xxx + 16b | sh in 1..7 -> let xxx = 8-sh in 0001:xxx + + 0q0 011110 immh immb 010101 n d SHL Vd.T, Vn.T, #sh + where immh:immb + = case T of + 2d | sh in 1..63 -> let xxxxxx = sh in 1xxx:xxx + 4s | sh in 1..31 -> let xxxxx = sh in 01xx:xxx + 8h | sh in 1..15 -> let xxxx = sh in 001x:xxx + 16b | sh in 1..7 -> let xxx = sh in 0001:xxx + */ + UInt vD = qregNo(i->ARM64in.VShiftImmV.dst); + UInt vN = qregNo(i->ARM64in.VShiftImmV.src); + UInt sh = i->ARM64in.VShiftImmV.amt; + ARM64VecShiftOp op = i->ARM64in.VShiftImmV.op; + Bool syned = False; + switch (op) { + /* 64x2 cases */ + case ARM64vecsh_SSHR64x2: syned = True; + case ARM64vecsh_USHR64x2: /* fallthrough */ + if (sh >= 1 && sh <= 63) { + UInt xxxxxx = 64-sh; + *p++ = X_3_6_7_6_5_5(syned ? X010 : X011, X011110, + X1000000 | xxxxxx, X000001, vN, vD); + goto done; + } + break; + case ARM64vecsh_SHL64x2: + if (sh >= 1 && sh <= 63) { + UInt xxxxxx = sh; + *p++ = X_3_6_7_6_5_5(X010, X011110, + X1000000 | xxxxxx, X010101, vN, vD); + goto done; + } + break; + /* 32x4 cases */ + case ARM64vecsh_SSHR32x4: syned = True; + case ARM64vecsh_USHR32x4: /* fallthrough */ + if (sh >= 1 && sh <= 31) { + UInt xxxxx = 32-sh; + *p++ = X_3_6_7_6_5_5(syned ? X010 : X011, X011110, + X0100000 | xxxxx, X000001, vN, vD); + goto done; + } + break; + case ARM64vecsh_SHL32x4: + if (sh >= 1 && sh <= 31) { + UInt xxxxx = sh; + *p++ = X_3_6_7_6_5_5(X010, X011110, + X0100000 | xxxxx, X010101, vN, vD); + goto done; + } + break; + /* 16x8 cases */ + case ARM64vecsh_SSHR16x8: syned = True; + case ARM64vecsh_USHR16x8: /* fallthrough */ + if (sh >= 1 && sh <= 15) { + UInt xxxx = 16-sh; + *p++ = X_3_6_7_6_5_5(syned ? 
X010 : X011, X011110, + X0010000 | xxxx, X000001, vN, vD); + goto done; + } + break; + case ARM64vecsh_SHL16x8: + if (sh >= 1 && sh <= 15) { + UInt xxxx = sh; + *p++ = X_3_6_7_6_5_5(X010, X011110, + X0010000 | xxxx, X010101, vN, vD); + goto done; + } + break; + + + /* 8x16 cases */ + case ARM64vecsh_SSHR8x16: syned = True; + case ARM64vecsh_USHR8x16: /* fallthrough */ + if (sh >= 1 && sh <= 7) { + UInt xxx = 8-sh; + *p++ = X_3_6_7_6_5_5(syned ? X010 : X011, X011110, + X0001000 | xxx, X000001, vN, vD); + goto done; + } + break; + case ARM64vecsh_SHL8x16: + if (sh >= 1 && sh <= 7) { + UInt xxx = sh; + *p++ = X_3_6_7_6_5_5(X010, X011110, + X0001000 | xxx, X010101, vN, vD); + goto done; + } + break; + + default: + break; + } + goto bad; + } +//ZZ case ARMin_VAluS: { +//ZZ UInt dN = fregNo(i->ARMin.VAluS.argL); +//ZZ UInt dD = fregNo(i->ARMin.VAluS.dst); +//ZZ UInt dM = fregNo(i->ARMin.VAluS.argR); +//ZZ UInt bN = dN & 1; +//ZZ UInt bD = dD & 1; +//ZZ UInt bM = dM & 1; +//ZZ UInt pqrs = X1111; /* undefined */ +//ZZ switch (i->ARMin.VAluS.op) { +//ZZ case ARMvfp_ADD: pqrs = X0110; break; +//ZZ case ARMvfp_SUB: pqrs = X0111; break; +//ZZ case ARMvfp_MUL: pqrs = X0100; break; +//ZZ case ARMvfp_DIV: pqrs = X1000; break; +//ZZ default: goto bad; +//ZZ } +//ZZ vassert(pqrs != X1111); +//ZZ UInt bP = (pqrs >> 3) & 1; +//ZZ UInt bQ = (pqrs >> 2) & 1; +//ZZ UInt bR = (pqrs >> 1) & 1; +//ZZ UInt bS = (pqrs >> 0) & 1; +//ZZ UInt insn = XXXXXXXX(0xE, X1110, BITS4(bP,bD,bQ,bR), +//ZZ (dN >> 1), (dD >> 1), +//ZZ X1010, BITS4(bN,bS,bM,0), (dM >> 1)); +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ } +//ZZ case ARMin_VUnaryS: { +//ZZ UInt fD = fregNo(i->ARMin.VUnaryS.dst); +//ZZ UInt fM = fregNo(i->ARMin.VUnaryS.src); +//ZZ UInt insn = 0; +//ZZ switch (i->ARMin.VUnaryS.op) { +//ZZ case ARMvfpu_COPY: +//ZZ insn = XXXXXXXX(0xE, X1110, BITS4(1,(fD & 1),1,1), X0000, +//ZZ (fD >> 1), X1010, BITS4(0,1,(fM & 1),0), +//ZZ (fM >> 1)); +//ZZ break; +//ZZ case ARMvfpu_ABS: +//ZZ insn = XXXXXXXX(0xE, X1110, BITS4(1,(fD & 1),1,1), X0000, +//ZZ (fD >> 1), X1010, BITS4(1,1,(fM & 1),0), +//ZZ (fM >> 1)); +//ZZ break; +//ZZ case ARMvfpu_NEG: +//ZZ insn = XXXXXXXX(0xE, X1110, BITS4(1,(fD & 1),1,1), X0001, +//ZZ (fD >> 1), X1010, BITS4(0,1,(fM & 1),0), +//ZZ (fM >> 1)); +//ZZ break; +//ZZ case ARMvfpu_SQRT: +//ZZ insn = XXXXXXXX(0xE, X1110, BITS4(1,(fD & 1),1,1), X0001, +//ZZ (fD >> 1), X1010, BITS4(1,1,(fM & 1),0), +//ZZ (fM >> 1)); +//ZZ break; +//ZZ default: +//ZZ goto bad; +//ZZ } +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ } +//ZZ case ARMin_VCMovD: { +//ZZ UInt cc = (UInt)i->ARMin.VCMovD.cond; +//ZZ UInt dD = dregNo(i->ARMin.VCMovD.dst); +//ZZ UInt dM = dregNo(i->ARMin.VCMovD.src); +//ZZ vassert(cc < 16 && cc != ARMcc_AL); +//ZZ UInt insn = XXXXXXXX(cc, X1110,X1011,X0000,dD,X1011,X0100,dM); +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ } +//ZZ case ARMin_VCMovS: { +//ZZ UInt cc = (UInt)i->ARMin.VCMovS.cond; +//ZZ UInt fD = fregNo(i->ARMin.VCMovS.dst); +//ZZ UInt fM = fregNo(i->ARMin.VCMovS.src); +//ZZ vassert(cc < 16 && cc != ARMcc_AL); +//ZZ UInt insn = XXXXXXXX(cc, X1110, BITS4(1,(fD & 1),1,1), +//ZZ X0000,(fD >> 1),X1010, +//ZZ BITS4(0,1,(fM & 1),0), (fM >> 1)); +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ } +//ZZ case ARMin_VXferD: { +//ZZ UInt dD = dregNo(i->ARMin.VXferD.dD); +//ZZ UInt rHi = iregNo(i->ARMin.VXferD.rHi); +//ZZ UInt rLo = iregNo(i->ARMin.VXferD.rLo); +//ZZ /* vmov dD, rLo, rHi is +//ZZ E C 4 rHi rLo B (0,0,dD[4],1) dD[3:0] +//ZZ vmov rLo, rHi, dD is +//ZZ E C 5 rHi rLo B (0,0,dD[4],1) dD[3:0] +//ZZ */ +//ZZ UInt insn 
+//ZZ = XXXXXXXX(0xE, 0xC, i->ARMin.VXferD.toD ? 4 : 5, +//ZZ rHi, rLo, 0xB, +//ZZ BITS4(0,0, ((dD >> 4) & 1), 1), (dD & 0xF)); +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ } +//ZZ case ARMin_VXferS: { +//ZZ UInt fD = fregNo(i->ARMin.VXferS.fD); +//ZZ UInt rLo = iregNo(i->ARMin.VXferS.rLo); +//ZZ /* vmov fD, rLo is +//ZZ E E 0 fD[4:1] rLo A (fD[0],0,0,1) 0 +//ZZ vmov rLo, fD is +//ZZ E E 1 fD[4:1] rLo A (fD[0],0,0,1) 0 +//ZZ */ +//ZZ UInt insn +//ZZ = XXXXXXXX(0xE, 0xE, i->ARMin.VXferS.toS ? 0 : 1, +//ZZ (fD >> 1) & 0xF, rLo, 0xA, +//ZZ BITS4((fD & 1),0,0,1), 0); +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ } +//ZZ case ARMin_VCvtID: { +//ZZ Bool iToD = i->ARMin.VCvtID.iToD; +//ZZ Bool syned = i->ARMin.VCvtID.syned; +//ZZ if (iToD && syned) { +//ZZ // FSITOD: I32S-in-freg to F64-in-dreg +//ZZ UInt regF = fregNo(i->ARMin.VCvtID.src); +//ZZ UInt regD = dregNo(i->ARMin.VCvtID.dst); +//ZZ UInt insn = XXXXXXXX(0xE, X1110, X1011, X1000, regD, +//ZZ X1011, BITS4(1,1,(regF & 1),0), +//ZZ (regF >> 1) & 0xF); +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ } +//ZZ if (iToD && (!syned)) { +//ZZ // FUITOD: I32U-in-freg to F64-in-dreg +//ZZ UInt regF = fregNo(i->ARMin.VCvtID.src); +//ZZ UInt regD = dregNo(i->ARMin.VCvtID.dst); +//ZZ UInt insn = XXXXXXXX(0xE, X1110, X1011, X1000, regD, +//ZZ X1011, BITS4(0,1,(regF & 1),0), +//ZZ (regF >> 1) & 0xF); +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ } +//ZZ if ((!iToD) && syned) { +//ZZ // FTOSID: F64-in-dreg to I32S-in-freg +//ZZ UInt regD = dregNo(i->ARMin.VCvtID.src); +//ZZ UInt regF = fregNo(i->ARMin.VCvtID.dst); +//ZZ UInt insn = XXXXXXXX(0xE, X1110, BITS4(1,(regF & 1),1,1), +//ZZ X1101, (regF >> 1) & 0xF, +//ZZ X1011, X0100, regD); +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ } +//ZZ if ((!iToD) && (!syned)) { +//ZZ // FTOUID: F64-in-dreg to I32U-in-freg +//ZZ UInt regD = dregNo(i->ARMin.VCvtID.src); +//ZZ UInt regF = fregNo(i->ARMin.VCvtID.dst); +//ZZ UInt insn = XXXXXXXX(0xE, X1110, BITS4(1,(regF & 1),1,1), +//ZZ X1100, (regF >> 1) & 0xF, +//ZZ X1011, X0100, regD); +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ } +//ZZ /*UNREACHED*/ +//ZZ vassert(0); +//ZZ } +//ZZ case ARMin_NLdStD: { +//ZZ UInt regD = dregNo(i->ARMin.NLdStD.dD); +//ZZ UInt regN, regM; +//ZZ UInt D = regD >> 4; +//ZZ UInt bL = i->ARMin.NLdStD.isLoad ? 1 : 0; +//ZZ UInt insn; +//ZZ vassert(hregClass(i->ARMin.NLdStD.dD) == HRcFlt64); +//ZZ regD &= 0xF; +//ZZ if (i->ARMin.NLdStD.amode->tag == ARMamN_RR) { +//ZZ regN = iregNo(i->ARMin.NLdStD.amode->ARMamN.RR.rN); +//ZZ regM = iregNo(i->ARMin.NLdStD.amode->ARMamN.RR.rM); +//ZZ } else { +//ZZ regN = iregNo(i->ARMin.NLdStD.amode->ARMamN.R.rN); +//ZZ regM = 15; +//ZZ } +//ZZ insn = XXXXXXXX(0xF, X0100, BITS4(0, D, bL, 0), +//ZZ regN, regD, X0111, X1000, regM); +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ } +//ZZ case ARMin_NUnaryS: { +//ZZ UInt Q = i->ARMin.NUnaryS.Q ? 1 : 0; +//ZZ UInt regD, D; +//ZZ UInt regM, M; +//ZZ UInt size = i->ARMin.NUnaryS.size; +//ZZ UInt insn; +//ZZ UInt opc, opc1, opc2; +//ZZ switch (i->ARMin.NUnaryS.op) { +//ZZ case ARMneon_VDUP: +//ZZ if (i->ARMin.NUnaryS.size >= 16) +//ZZ goto bad; +//ZZ if (i->ARMin.NUnaryS.dst->tag != ARMNRS_Reg) +//ZZ goto bad; +//ZZ if (i->ARMin.NUnaryS.src->tag != ARMNRS_Scalar) +//ZZ goto bad; +//ZZ regD = (hregClass(i->ARMin.NUnaryS.dst->reg) == HRcVec128) +//ZZ ? (qregNo(i->ARMin.NUnaryS.dst->reg) << 1) +//ZZ : dregNo(i->ARMin.NUnaryS.dst->reg); +//ZZ regM = (hregClass(i->ARMin.NUnaryS.src->reg) == HRcVec128) +//ZZ ? 
(qregNo(i->ARMin.NUnaryS.src->reg) << 1) +//ZZ : dregNo(i->ARMin.NUnaryS.src->reg); +//ZZ D = regD >> 4; +//ZZ M = regM >> 4; +//ZZ regD &= 0xf; +//ZZ regM &= 0xf; +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), +//ZZ (i->ARMin.NUnaryS.size & 0xf), regD, +//ZZ X1100, BITS4(0,Q,M,0), regM); +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ case ARMneon_SETELEM: +//ZZ regD = Q ? (qregNo(i->ARMin.NUnaryS.dst->reg) << 1) : +//ZZ dregNo(i->ARMin.NUnaryS.dst->reg); +//ZZ regM = iregNo(i->ARMin.NUnaryS.src->reg); +//ZZ M = regM >> 4; +//ZZ D = regD >> 4; +//ZZ regM &= 0xF; +//ZZ regD &= 0xF; +//ZZ if (i->ARMin.NUnaryS.dst->tag != ARMNRS_Scalar) +//ZZ goto bad; +//ZZ switch (size) { +//ZZ case 0: +//ZZ if (i->ARMin.NUnaryS.dst->index > 7) +//ZZ goto bad; +//ZZ opc = X1000 | i->ARMin.NUnaryS.dst->index; +//ZZ break; +//ZZ case 1: +//ZZ if (i->ARMin.NUnaryS.dst->index > 3) +//ZZ goto bad; +//ZZ opc = X0001 | (i->ARMin.NUnaryS.dst->index << 1); +//ZZ break; +//ZZ case 2: +//ZZ if (i->ARMin.NUnaryS.dst->index > 1) +//ZZ goto bad; +//ZZ opc = X0000 | (i->ARMin.NUnaryS.dst->index << 2); +//ZZ break; +//ZZ default: +//ZZ goto bad; +//ZZ } +//ZZ opc1 = (opc >> 2) & 3; +//ZZ opc2 = opc & 3; +//ZZ insn = XXXXXXXX(0xE, X1110, BITS4(0,(opc1 >> 1),(opc1 & 1),0), +//ZZ regD, regM, X1011, +//ZZ BITS4(D,(opc2 >> 1),(opc2 & 1),1), X0000); +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ case ARMneon_GETELEMU: +//ZZ regM = Q ? (qregNo(i->ARMin.NUnaryS.src->reg) << 1) : +//ZZ dregNo(i->ARMin.NUnaryS.src->reg); +//ZZ regD = iregNo(i->ARMin.NUnaryS.dst->reg); +//ZZ M = regM >> 4; +//ZZ D = regD >> 4; +//ZZ regM &= 0xF; +//ZZ regD &= 0xF; +//ZZ if (i->ARMin.NUnaryS.src->tag != ARMNRS_Scalar) +//ZZ goto bad; +//ZZ switch (size) { +//ZZ case 0: +//ZZ if (Q && i->ARMin.NUnaryS.src->index > 7) { +//ZZ regM++; +//ZZ i->ARMin.NUnaryS.src->index -= 8; +//ZZ } +//ZZ if (i->ARMin.NUnaryS.src->index > 7) +//ZZ goto bad; +//ZZ opc = X1000 | i->ARMin.NUnaryS.src->index; +//ZZ break; +//ZZ case 1: +//ZZ if (Q && i->ARMin.NUnaryS.src->index > 3) { +//ZZ regM++; +//ZZ i->ARMin.NUnaryS.src->index -= 4; +//ZZ } +//ZZ if (i->ARMin.NUnaryS.src->index > 3) +//ZZ goto bad; +//ZZ opc = X0001 | (i->ARMin.NUnaryS.src->index << 1); +//ZZ break; +//ZZ case 2: +//ZZ goto bad; +//ZZ default: +//ZZ goto bad; +//ZZ } +//ZZ opc1 = (opc >> 2) & 3; +//ZZ opc2 = opc & 3; +//ZZ insn = XXXXXXXX(0xE, X1110, BITS4(1,(opc1 >> 1),(opc1 & 1),1), +//ZZ regM, regD, X1011, +//ZZ BITS4(M,(opc2 >> 1),(opc2 & 1),1), X0000); +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ case ARMneon_GETELEMS: +//ZZ regM = Q ? 
(qregNo(i->ARMin.NUnaryS.src->reg) << 1) : +//ZZ dregNo(i->ARMin.NUnaryS.src->reg); +//ZZ regD = iregNo(i->ARMin.NUnaryS.dst->reg); +//ZZ M = regM >> 4; +//ZZ D = regD >> 4; +//ZZ regM &= 0xF; +//ZZ regD &= 0xF; +//ZZ if (i->ARMin.NUnaryS.src->tag != ARMNRS_Scalar) +//ZZ goto bad; +//ZZ switch (size) { +//ZZ case 0: +//ZZ if (Q && i->ARMin.NUnaryS.src->index > 7) { +//ZZ regM++; +//ZZ i->ARMin.NUnaryS.src->index -= 8; +//ZZ } +//ZZ if (i->ARMin.NUnaryS.src->index > 7) +//ZZ goto bad; +//ZZ opc = X1000 | i->ARMin.NUnaryS.src->index; +//ZZ break; +//ZZ case 1: +//ZZ if (Q && i->ARMin.NUnaryS.src->index > 3) { +//ZZ regM++; +//ZZ i->ARMin.NUnaryS.src->index -= 4; +//ZZ } +//ZZ if (i->ARMin.NUnaryS.src->index > 3) +//ZZ goto bad; +//ZZ opc = X0001 | (i->ARMin.NUnaryS.src->index << 1); +//ZZ break; +//ZZ case 2: +//ZZ if (Q && i->ARMin.NUnaryS.src->index > 1) { +//ZZ regM++; +//ZZ i->ARMin.NUnaryS.src->index -= 2; +//ZZ } +//ZZ if (i->ARMin.NUnaryS.src->index > 1) +//ZZ goto bad; +//ZZ opc = X0000 | (i->ARMin.NUnaryS.src->index << 2); +//ZZ break; +//ZZ default: +//ZZ goto bad; +//ZZ } +//ZZ opc1 = (opc >> 2) & 3; +//ZZ opc2 = opc & 3; +//ZZ insn = XXXXXXXX(0xE, X1110, BITS4(0,(opc1 >> 1),(opc1 & 1),1), +//ZZ regM, regD, X1011, +//ZZ BITS4(M,(opc2 >> 1),(opc2 & 1),1), X0000); +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ default: +//ZZ goto bad; +//ZZ } +//ZZ } +//ZZ case ARMin_NUnary: { +//ZZ UInt Q = i->ARMin.NUnary.Q ? 1 : 0; +//ZZ UInt regD = (hregClass(i->ARMin.NUnary.dst) == HRcVec128) +//ZZ ? (qregNo(i->ARMin.NUnary.dst) << 1) +//ZZ : dregNo(i->ARMin.NUnary.dst); +//ZZ UInt regM, M; +//ZZ UInt D = regD >> 4; +//ZZ UInt sz1 = i->ARMin.NUnary.size >> 1; +//ZZ UInt sz2 = i->ARMin.NUnary.size & 1; +//ZZ UInt sz = i->ARMin.NUnary.size; +//ZZ UInt insn; +//ZZ UInt F = 0; /* TODO: floating point EQZ ??? */ +//ZZ if (i->ARMin.NUnary.op != ARMneon_DUP) { +//ZZ regM = (hregClass(i->ARMin.NUnary.src) == HRcVec128) +//ZZ ? (qregNo(i->ARMin.NUnary.src) << 1) +//ZZ : dregNo(i->ARMin.NUnary.src); +//ZZ M = regM >> 4; +//ZZ } else { +//ZZ regM = iregNo(i->ARMin.NUnary.src); +//ZZ M = regM >> 4; +//ZZ } +//ZZ regD &= 0xF; +//ZZ regM &= 0xF; +//ZZ switch (i->ARMin.NUnary.op) { +//ZZ case ARMneon_COPY: /* VMOV reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,1,0), regM, regD, X0001, +//ZZ BITS4(M,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_COPYN: /* VMOVN regD, regQ */ +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,1,0), +//ZZ regD, X0010, BITS4(0,0,M,0), regM); +//ZZ break; +//ZZ case ARMneon_COPYQNSS: /* VQMOVN regD, regQ */ +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,1,0), +//ZZ regD, X0010, BITS4(1,0,M,0), regM); +//ZZ break; +//ZZ case ARMneon_COPYQNUS: /* VQMOVUN regD, regQ */ +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,1,0), +//ZZ regD, X0010, BITS4(0,1,M,0), regM); +//ZZ break; +//ZZ case ARMneon_COPYQNUU: /* VQMOVN regD, regQ */ +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,1,0), +//ZZ regD, X0010, BITS4(1,1,M,0), regM); +//ZZ break; +//ZZ case ARMneon_COPYLS: /* VMOVL regQ, regD */ +//ZZ if (sz >= 3) +//ZZ goto bad; +//ZZ insn = XXXXXXXX(0xF, X0010, +//ZZ BITS4(1,D,(sz == 2) ? 1 : 0,(sz == 1) ? 1 : 0), +//ZZ BITS4((sz == 0) ? 1 : 0,0,0,0), +//ZZ regD, X1010, BITS4(0,0,M,1), regM); +//ZZ break; +//ZZ case ARMneon_COPYLU: /* VMOVL regQ, regD */ +//ZZ if (sz >= 3) +//ZZ goto bad; +//ZZ insn = XXXXXXXX(0xF, X0011, +//ZZ BITS4(1,D,(sz == 2) ? 1 : 0,(sz == 1) ? 1 : 0), +//ZZ BITS4((sz == 0) ? 
1 : 0,0,0,0), +//ZZ regD, X1010, BITS4(0,0,M,1), regM); +//ZZ break; +//ZZ case ARMneon_NOT: /* VMVN reg, reg*/ +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X0000, regD, X0101, +//ZZ BITS4(1,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_EQZ: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,0,1), +//ZZ regD, BITS4(0,F,0,1), BITS4(0,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_CNT: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X0000, regD, X0101, +//ZZ BITS4(0,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_CLZ: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,0,0), +//ZZ regD, X0100, BITS4(1,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_CLS: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,0,0), +//ZZ regD, X0100, BITS4(0,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_ABS: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,0,1), +//ZZ regD, X0011, BITS4(0,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_DUP: +//ZZ sz1 = i->ARMin.NUnary.size == 0 ? 1 : 0; +//ZZ sz2 = i->ARMin.NUnary.size == 1 ? 1 : 0; +//ZZ vassert(sz1 + sz2 < 2); +//ZZ insn = XXXXXXXX(0xE, X1110, BITS4(1, sz1, Q, 0), regD, regM, +//ZZ X1011, BITS4(D,0,sz2,1), X0000); +//ZZ break; +//ZZ case ARMneon_REV16: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,0,0), +//ZZ regD, BITS4(0,0,0,1), BITS4(0,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_REV32: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,0,0), +//ZZ regD, BITS4(0,0,0,0), BITS4(1,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_REV64: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,0,0), +//ZZ regD, BITS4(0,0,0,0), BITS4(0,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_PADDLU: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,0,0), +//ZZ regD, X0010, BITS4(1,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_PADDLS: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,0,0), +//ZZ regD, X0010, BITS4(0,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VQSHLNUU: +//ZZ insn = XXXXXXXX(0xF, X0011, +//ZZ (1 << 3) | (D << 2) | ((sz >> 4) & 3), +//ZZ sz & 0xf, regD, X0111, +//ZZ BITS4(sz >> 6,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VQSHLNSS: +//ZZ insn = XXXXXXXX(0xF, X0010, +//ZZ (1 << 3) | (D << 2) | ((sz >> 4) & 3), +//ZZ sz & 0xf, regD, X0111, +//ZZ BITS4(sz >> 6,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VQSHLNUS: +//ZZ insn = XXXXXXXX(0xF, X0011, +//ZZ (1 << 3) | (D << 2) | ((sz >> 4) & 3), +//ZZ sz & 0xf, regD, X0110, +//ZZ BITS4(sz >> 6,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VCVTFtoS: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X1011, regD, X0111, +//ZZ BITS4(0,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VCVTFtoU: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X1011, regD, X0111, +//ZZ BITS4(1,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VCVTStoF: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X1011, regD, X0110, +//ZZ BITS4(0,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VCVTUtoF: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X1011, regD, X0110, +//ZZ BITS4(1,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VCVTFtoFixedU: +//ZZ sz1 = (sz >> 5) & 1; +//ZZ sz2 = (sz >> 4) & 1; +//ZZ sz &= 0xf; +//ZZ insn = XXXXXXXX(0xF, X0011, +//ZZ BITS4(1,D,sz1,sz2), sz, regD, X1111, +//ZZ BITS4(0,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VCVTFtoFixedS: +//ZZ sz1 = (sz >> 5) & 1; +//ZZ sz2 = (sz >> 4) & 1; +//ZZ sz &= 0xf; +//ZZ insn = XXXXXXXX(0xF, X0010, +//ZZ BITS4(1,D,sz1,sz2), sz, regD, X1111, +//ZZ 
BITS4(0,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VCVTFixedUtoF: +//ZZ sz1 = (sz >> 5) & 1; +//ZZ sz2 = (sz >> 4) & 1; +//ZZ sz &= 0xf; +//ZZ insn = XXXXXXXX(0xF, X0011, +//ZZ BITS4(1,D,sz1,sz2), sz, regD, X1110, +//ZZ BITS4(0,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VCVTFixedStoF: +//ZZ sz1 = (sz >> 5) & 1; +//ZZ sz2 = (sz >> 4) & 1; +//ZZ sz &= 0xf; +//ZZ insn = XXXXXXXX(0xF, X0010, +//ZZ BITS4(1,D,sz1,sz2), sz, regD, X1110, +//ZZ BITS4(0,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VCVTF32toF16: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X0110, regD, X0110, +//ZZ BITS4(0,0,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VCVTF16toF32: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X0110, regD, X0111, +//ZZ BITS4(0,0,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VRECIP: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X1011, regD, X0100, +//ZZ BITS4(0,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VRECIPF: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X1011, regD, X0101, +//ZZ BITS4(0,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VABSFP: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X1001, regD, X0111, +//ZZ BITS4(0,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VRSQRTEFP: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X1011, regD, X0101, +//ZZ BITS4(1,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VRSQRTE: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X1011, regD, X0100, +//ZZ BITS4(1,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VNEGF: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X1001, regD, X0111, +//ZZ BITS4(1,Q,M,0), regM); +//ZZ break; +//ZZ +//ZZ default: +//ZZ goto bad; +//ZZ } +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ } +//ZZ case ARMin_NDual: { +//ZZ UInt Q = i->ARMin.NDual.Q ? 1 : 0; +//ZZ UInt regD = (hregClass(i->ARMin.NDual.arg1) == HRcVec128) +//ZZ ? (qregNo(i->ARMin.NDual.arg1) << 1) +//ZZ : dregNo(i->ARMin.NDual.arg1); +//ZZ UInt regM = (hregClass(i->ARMin.NDual.arg2) == HRcVec128) +//ZZ ? (qregNo(i->ARMin.NDual.arg2) << 1) +//ZZ : dregNo(i->ARMin.NDual.arg2); +//ZZ UInt D = regD >> 4; +//ZZ UInt M = regM >> 4; +//ZZ UInt sz1 = i->ARMin.NDual.size >> 1; +//ZZ UInt sz2 = i->ARMin.NDual.size & 1; +//ZZ UInt insn; +//ZZ regD &= 0xF; +//ZZ regM &= 0xF; +//ZZ switch (i->ARMin.NDual.op) { +//ZZ case ARMneon_TRN: /* VTRN reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,1,0), +//ZZ regD, X0000, BITS4(1,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_ZIP: /* VZIP reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,1,0), +//ZZ regD, X0001, BITS4(1,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_UZP: /* VUZP reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,1,0), +//ZZ regD, X0001, BITS4(0,Q,M,0), regM); +//ZZ break; +//ZZ default: +//ZZ goto bad; +//ZZ } +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ } +//ZZ case ARMin_NBinary: { +//ZZ UInt Q = i->ARMin.NBinary.Q ? 1 : 0; +//ZZ UInt regD = (hregClass(i->ARMin.NBinary.dst) == HRcVec128) +//ZZ ? (qregNo(i->ARMin.NBinary.dst) << 1) +//ZZ : dregNo(i->ARMin.NBinary.dst); +//ZZ UInt regN = (hregClass(i->ARMin.NBinary.argL) == HRcVec128) +//ZZ ? (qregNo(i->ARMin.NBinary.argL) << 1) +//ZZ : dregNo(i->ARMin.NBinary.argL); +//ZZ UInt regM = (hregClass(i->ARMin.NBinary.argR) == HRcVec128) +//ZZ ? 
(qregNo(i->ARMin.NBinary.argR) << 1) +//ZZ : dregNo(i->ARMin.NBinary.argR); +//ZZ UInt sz1 = i->ARMin.NBinary.size >> 1; +//ZZ UInt sz2 = i->ARMin.NBinary.size & 1; +//ZZ UInt D = regD >> 4; +//ZZ UInt N = regN >> 4; +//ZZ UInt M = regM >> 4; +//ZZ UInt insn; +//ZZ regD &= 0xF; +//ZZ regM &= 0xF; +//ZZ regN &= 0xF; +//ZZ switch (i->ARMin.NBinary.op) { +//ZZ case ARMneon_VAND: /* VAND reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,0,0), regN, regD, X0001, +//ZZ BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VORR: /* VORR reg, reg, reg*/ +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,1,0), regN, regD, X0001, +//ZZ BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VXOR: /* VEOR reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,0,0), regN, regD, X0001, +//ZZ BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VADD: /* VADD reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X1000, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VSUB: /* VSUB reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X1000, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VMINU: /* VMIN.Uxx reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X0110, BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VMINS: /* VMIN.Sxx reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X0110, BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VMAXU: /* VMAX.Uxx reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X0110, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VMAXS: /* VMAX.Sxx reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X0110, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VRHADDS: /* VRHADD.Sxx reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X0001, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VRHADDU: /* VRHADD.Uxx reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X0001, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VQADDU: /* VQADD unsigned reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X0000, BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VQADDS: /* VQADD signed reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X0000, BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VQSUBU: /* VQSUB unsigned reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X0010, BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VQSUBS: /* VQSUB signed reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X0010, BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VCGTU: /* VCGT unsigned reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X0011, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VCGTS: /* VCGT signed reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X0011, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VCGEU: /* VCGE unsigned reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X0011, BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VCGES: /* VCGE signed reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ 
X0011, BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VCEQ: /* VCEQ reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X1000, BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VEXT: /* VEXT.8 reg, reg, #imm4*/ +//ZZ if (i->ARMin.NBinary.size >= 16) +//ZZ goto bad; +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(1,D,1,1), regN, regD, +//ZZ i->ARMin.NBinary.size & 0xf, BITS4(N,Q,M,0), +//ZZ regM); +//ZZ break; +//ZZ case ARMneon_VMUL: +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X1001, BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VMULLU: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,sz1,sz2), regN, regD, +//ZZ X1100, BITS4(N,0,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VMULLS: +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(1,D,sz1,sz2), regN, regD, +//ZZ X1100, BITS4(N,0,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VMULP: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X1001, BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VMULFP: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,0,0), regN, regD, +//ZZ X1101, BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VMULLP: +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(1,D,sz1,sz2), regN, regD, +//ZZ X1110, BITS4(N,0,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VQDMULH: +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X1011, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VQRDMULH: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X1011, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VQDMULL: +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(1,D,sz1,sz2), regN, regD, +//ZZ X1101, BITS4(N,0,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VTBL: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), regN, regD, +//ZZ X1000, BITS4(N,0,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VPADD: +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X1011, BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VPADDFP: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,0,0), regN, regD, +//ZZ X1101, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VPMINU: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X1010, BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VPMINS: +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X1010, BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VPMAXU: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X1010, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VPMAXS: +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X1010, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VADDFP: /* VADD reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,0,0), regN, regD, +//ZZ X1101, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VSUBFP: /* VADD reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,1,0), regN, regD, +//ZZ X1101, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VABDFP: /* VABD reg, reg, reg */ +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,1,0), regN, regD, +//ZZ X1101, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VMINF: +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,1,0), regN, regD, +//ZZ X1111, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VMAXF: +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,0,0), regN, regD, +//ZZ X1111, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VPMINF: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,1,0), 
regN, regD, +//ZZ X1111, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VPMAXF: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,0,0), regN, regD, +//ZZ X1111, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VRECPS: +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,0,0), regN, regD, X1111, +//ZZ BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VCGTF: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,1,0), regN, regD, X1110, +//ZZ BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VCGEF: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,0,0), regN, regD, X1110, +//ZZ BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VCEQF: +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,0,0), regN, regD, X1110, +//ZZ BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VRSQRTS: +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,1,0), regN, regD, X1111, +//ZZ BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ default: +//ZZ goto bad; +//ZZ } +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ } +//ZZ case ARMin_NShift: { +//ZZ UInt Q = i->ARMin.NShift.Q ? 1 : 0; +//ZZ UInt regD = (hregClass(i->ARMin.NShift.dst) == HRcVec128) +//ZZ ? (qregNo(i->ARMin.NShift.dst) << 1) +//ZZ : dregNo(i->ARMin.NShift.dst); +//ZZ UInt regM = (hregClass(i->ARMin.NShift.argL) == HRcVec128) +//ZZ ? (qregNo(i->ARMin.NShift.argL) << 1) +//ZZ : dregNo(i->ARMin.NShift.argL); +//ZZ UInt regN = (hregClass(i->ARMin.NShift.argR) == HRcVec128) +//ZZ ? (qregNo(i->ARMin.NShift.argR) << 1) +//ZZ : dregNo(i->ARMin.NShift.argR); +//ZZ UInt sz1 = i->ARMin.NShift.size >> 1; +//ZZ UInt sz2 = i->ARMin.NShift.size & 1; +//ZZ UInt D = regD >> 4; +//ZZ UInt N = regN >> 4; +//ZZ UInt M = regM >> 4; +//ZZ UInt insn; +//ZZ regD &= 0xF; +//ZZ regM &= 0xF; +//ZZ regN &= 0xF; +//ZZ switch (i->ARMin.NShift.op) { +//ZZ case ARMneon_VSHL: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X0100, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VSAL: +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X0100, BITS4(N,Q,M,0), regM); +//ZZ break; +//ZZ case ARMneon_VQSHL: +//ZZ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X0100, BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ case ARMneon_VQSAL: +//ZZ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD, +//ZZ X0100, BITS4(N,Q,M,1), regM); +//ZZ break; +//ZZ default: +//ZZ goto bad; +//ZZ } +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ } +//ZZ case ARMin_NShl64: { +//ZZ HReg regDreg = i->ARMin.NShl64.dst; +//ZZ HReg regMreg = i->ARMin.NShl64.src; +//ZZ UInt amt = i->ARMin.NShl64.amt; +//ZZ vassert(amt >= 1 && amt <= 63); +//ZZ vassert(hregClass(regDreg) == HRcFlt64); +//ZZ vassert(hregClass(regMreg) == HRcFlt64); +//ZZ UInt regD = dregNo(regDreg); +//ZZ UInt regM = dregNo(regMreg); +//ZZ UInt D = (regD >> 4) & 1; +//ZZ UInt Vd = regD & 0xF; +//ZZ UInt L = 1; +//ZZ UInt Q = 0; /* always 64-bit */ +//ZZ UInt M = (regM >> 4) & 1; +//ZZ UInt Vm = regM & 0xF; +//ZZ UInt insn = XXXXXXXX(X1111,X0010, BITS4(1,D,(amt>>5)&1,(amt>>4)&1), +//ZZ amt & 0xF, Vd, X0101, BITS4(L,Q,M,1), Vm); +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ } + case ARM64in_VImmQ: { + UInt rQ = qregNo(i->ARM64in.VImmQ.rQ); + UShort imm = i->ARM64in.VImmQ.imm; + if (imm == 0x0000) { + /* movi rQ.4s, #0x0 == 0x4F 0x00 0x04 000 rQ */ + vassert(rQ < 32); + *p++ = 0x4F000400 | rQ; + goto done; + } + if (imm == 0x0001) { + /* movi rD, #0xFF == 0x2F 0x00 0xE4 001 rD */ + vassert(rQ < 32); + *p++ = 0x2F00E420 | rQ; + goto done; + } + if (imm == 0x0003) { + /* movi rD, #0xFFFF == 0x2F 0x00 0xE4 011 rD */ + vassert(rQ < 
32); + *p++ = 0x2F00E460 | rQ; + goto done; + } + if (imm == 0x000F) { + /* movi rD, #0xFFFFFFFF == 0x2F 0x00 0xE5 111 rD */ + vassert(rQ < 32); + *p++ = 0x2F00E5E0 | rQ; + goto done; + } + if (imm == 0x00FF) { + /* movi rD, #0xFFFFFFFFFFFFFFFF == 0x2F 0x07 0xE7 111 rD */ + vassert(rQ < 32); + *p++ = 0x2F07E7E0 | rQ; + goto done; + } + goto bad; /* no other handled cases right now */ + } + + case ARM64in_VDfromX: { + /* INS Vd.D[0], rX + 0100 1110 0000 1000 0001 11 nn dd INS Vd.D[0], Xn + This isn't wonderful, in the sense that the upper half of + the vector register stays unchanged and thus the insn is + data dependent on its output register. */ + UInt dd = dregNo(i->ARM64in.VDfromX.rD); + UInt xx = iregNo(i->ARM64in.VDfromX.rX); + vassert(xx < 31); + *p++ = 0x4E081C00 | X_2_6_2_12_5_5(0,0,0,0,xx,dd); + goto done; + } + + case ARM64in_VQfromXX: { + /* What we really generate is a two insn sequence: + INS Vd.D[0], Xlo; INS Vd.D[1], Xhi + 0100 1110 0000 1000 0001 11 nn dd INS Vd.D[0], Xn + 0100 1110 0001 1000 0001 11 nn dd INS Vd.D[1], Xn + */ + UInt qq = qregNo(i->ARM64in.VQfromXX.rQ); + UInt xhi = iregNo(i->ARM64in.VQfromXX.rXhi); + UInt xlo = iregNo(i->ARM64in.VQfromXX.rXlo); + vassert(xhi < 31 && xlo < 31); + *p++ = 0x4E081C00 | X_2_6_2_12_5_5(0,0,0,0,xlo,qq); + *p++ = 0x4E181C00 | X_2_6_2_12_5_5(0,0,0,0,xhi,qq); + goto done; + } + + case ARM64in_VXfromQ: { + /* 010 0111 0000 01000 001111 nn dd UMOV Xd, Vn.D[0] + 010 0111 0000 11000 001111 nn dd UMOV Xd, Vn.D[1] + */ + UInt dd = iregNo(i->ARM64in.VXfromQ.rX); + UInt nn = qregNo(i->ARM64in.VXfromQ.rQ); + UInt laneNo = i->ARM64in.VXfromQ.laneNo; + vassert(dd < 31); + vassert(laneNo < 2); + *p++ = X_3_8_5_6_5_5(X010, X01110000, + laneNo == 1 ? X11000 : X01000, X001111, nn, dd); + goto done; + } + + case ARM64in_VMov: { + /* 000 11110 00 10000 00 10000 n d FMOV Sd, Sn + 000 11110 01 10000 00 10000 n d FMOV Dd, Dn + 010 01110 10 1 n 0 00111 n d MOV Vd.16b, Vn.16b + */ + HReg rD = i->ARM64in.VMov.dst; + HReg rN = i->ARM64in.VMov.src; + switch (i->ARM64in.VMov.szB) { + case 8: { + UInt dd = dregNo(rD); + UInt nn = dregNo(rN); + *p++ = X_3_8_5_6_5_5(X000, X11110011, X00000, X010000, nn, dd); + goto done; + } + default: + break; + } + goto bad; + } +//ZZ case ARMin_NeonImm: { +//ZZ UInt Q = (hregClass(i->ARMin.NeonImm.dst) == HRcVec128) ? 1 : 0; +//ZZ UInt regD = Q ? 
(qregNo(i->ARMin.NeonImm.dst) << 1) : +//ZZ dregNo(i->ARMin.NeonImm.dst); +//ZZ UInt D = regD >> 4; +//ZZ UInt imm = i->ARMin.NeonImm.imm->imm8; +//ZZ UInt tp = i->ARMin.NeonImm.imm->type; +//ZZ UInt j = imm >> 7; +//ZZ UInt imm3 = (imm >> 4) & 0x7; +//ZZ UInt imm4 = imm & 0xF; +//ZZ UInt cmode, op; +//ZZ UInt insn; +//ZZ regD &= 0xF; +//ZZ if (tp == 9) +//ZZ op = 1; +//ZZ else +//ZZ op = 0; +//ZZ switch (tp) { +//ZZ case 0: +//ZZ case 1: +//ZZ case 2: +//ZZ case 3: +//ZZ case 4: +//ZZ case 5: +//ZZ cmode = tp << 1; +//ZZ break; +//ZZ case 9: +//ZZ case 6: +//ZZ cmode = 14; +//ZZ break; +//ZZ case 7: +//ZZ cmode = 12; +//ZZ break; +//ZZ case 8: +//ZZ cmode = 13; +//ZZ break; +//ZZ case 10: +//ZZ cmode = 15; +//ZZ break; +//ZZ default: +//ZZ vpanic("ARMin_NeonImm"); +//ZZ +//ZZ } +//ZZ insn = XXXXXXXX(0xF, BITS4(0,0,1,j), BITS4(1,D,0,0), imm3, regD, +//ZZ cmode, BITS4(0,Q,op,1), imm4); +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ } +//ZZ case ARMin_NCMovQ: { +//ZZ UInt cc = (UInt)i->ARMin.NCMovQ.cond; +//ZZ UInt qM = qregNo(i->ARMin.NCMovQ.src) << 1; +//ZZ UInt qD = qregNo(i->ARMin.NCMovQ.dst) << 1; +//ZZ UInt vM = qM & 0xF; +//ZZ UInt vD = qD & 0xF; +//ZZ UInt M = (qM >> 4) & 1; +//ZZ UInt D = (qD >> 4) & 1; +//ZZ vassert(cc < 16 && cc != ARMcc_AL && cc != ARMcc_NV); +//ZZ /* b!cc here+8: !cc A00 0000 */ +//ZZ UInt insn = XXXXXXXX(cc ^ 1, 0xA, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0); +//ZZ *p++ = insn; +//ZZ /* vmov qD, qM */ +//ZZ insn = XXXXXXXX(0xF, 0x2, BITS4(0,D,1,0), +//ZZ vM, vD, BITS4(0,0,0,1), BITS4(M,1,M,1), vM); +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ } +//ZZ case ARMin_Add32: { +//ZZ UInt regD = iregNo(i->ARMin.Add32.rD); +//ZZ UInt regN = iregNo(i->ARMin.Add32.rN); +//ZZ UInt imm32 = i->ARMin.Add32.imm32; +//ZZ vassert(regD != regN); +//ZZ /* MOV regD, imm32 */ +//ZZ p = imm32_to_iregNo((UInt *)p, regD, imm32); +//ZZ /* ADD regD, regN, regD */ +//ZZ UInt insn = XXXXXXXX(0xE, 0, X1000, regN, regD, 0, 0, regD); +//ZZ *p++ = insn; +//ZZ goto done; +//ZZ } + + case ARM64in_EvCheck: { + /* The sequence is fixed (canned) except for the two amodes + supplied by the insn. These don't change the length, though. + We generate: + ldr w9, [x21 + #8] 8 == offsetof(host_EvC_COUNTER) + subs w9, w9, #1 + str w9, [x21 + #8] 8 == offsetof(host_EvC_COUNTER) + bpl nofail + ldr x9, [x21 + #0] 0 == offsetof(host_EvC_FAILADDR) + br x9 + nofail: + */ + UInt* p0 = p; + p = do_load_or_store32(p, True/*isLoad*/, /*w*/9, + i->ARM64in.EvCheck.amCounter); + *p++ = 0x71000529; /* subs w9, w9, #1 */ + p = do_load_or_store32(p, False/*!isLoad*/, /*w*/9, + i->ARM64in.EvCheck.amCounter); + *p++ = 0x54000065; /* bpl nofail */ + p = do_load_or_store64(p, True/*isLoad*/, /*x*/9, + i->ARM64in.EvCheck.amFailAddr); + *p++ = 0xD61F0120; /* br x9 */ + /* nofail: */ + + /* Crosscheck */ + vassert(evCheckSzB_ARM64() == (UChar*)p - (UChar*)p0); + goto done; + } + +//ZZ case ARMin_ProfInc: { +//ZZ /* We generate: +//ZZ (ctrP is unknown now, so use 0x65556555 in the +//ZZ expectation that a later call to LibVEX_patchProfCtr +//ZZ will be used to fill in the immediate fields once the +//ZZ right value is known.) 
+//ZZ movw r12, lo16(0x65556555) +//ZZ movt r12, lo16(0x65556555) +//ZZ ldr r11, [r12] +//ZZ adds r11, r11, #1 +//ZZ str r11, [r12] +//ZZ ldr r11, [r12+4] +//ZZ adc r11, r11, #0 +//ZZ str r11, [r12+4] +//ZZ */ +//ZZ p = imm32_to_iregNo_EXACTLY2(p, /*r*/12, 0x65556555); +//ZZ *p++ = 0xE59CB000; +//ZZ *p++ = 0xE29BB001; +//ZZ *p++ = 0xE58CB000; +//ZZ *p++ = 0xE59CB004; +//ZZ *p++ = 0xE2ABB000; +//ZZ *p++ = 0xE58CB004; +//ZZ /* Tell the caller .. */ +//ZZ vassert(!(*is_profInc)); +//ZZ *is_profInc = True; +//ZZ goto done; +//ZZ } + + /* ... */ + default: + goto bad; + } + + bad: + ppARM64Instr(i); + vpanic("emit_ARM64Instr"); + /*NOTREACHED*/ + + done: + vassert(((UChar*)p) - &buf[0] <= 36); + return ((UChar*)p) - &buf[0]; +} + + +/* How big is an event check? See case for ARM64in_EvCheck in + emit_ARM64Instr just above. That crosschecks what this returns, so + we can tell if we're inconsistent. */ +Int evCheckSzB_ARM64 ( void ) +{ + return 24; +} + + +/* NB: what goes on here has to be very closely coordinated with the + emitInstr case for XDirect, above. */ +VexInvalRange chainXDirect_ARM64 ( void* place_to_chain, + void* disp_cp_chain_me_EXPECTED, + void* place_to_jump_to ) +{ + /* What we're expecting to see is: + movw x9, disp_cp_chain_me_to_EXPECTED[15:0] + movk x9, disp_cp_chain_me_to_EXPECTED[31:15], lsl 16 + movk x9, disp_cp_chain_me_to_EXPECTED[47:32], lsl 32 + movk x9, disp_cp_chain_me_to_EXPECTED[63:48], lsl 48 + blr x9 + viz + <16 bytes generated by imm64_to_iregNo_EXACTLY4> + D6 3F 01 20 + */ + UInt* p = (UInt*)place_to_chain; + vassert(0 == (3 & (HWord)p)); + vassert(is_imm64_to_iregNo_EXACTLY4( + p, /*x*/9, Ptr_to_ULong(disp_cp_chain_me_EXPECTED))); + vassert(p[4] == 0xD63F0120); + + /* And what we want to change it to is: + movw x9, place_to_jump_to[15:0] + movk x9, place_to_jump_to[31:15], lsl 16 + movk x9, place_to_jump_to[47:32], lsl 32 + movk x9, place_to_jump_to[63:48], lsl 48 + br x9 + viz + <16 bytes generated by imm64_to_iregNo_EXACTLY4> + D6 1F 01 20 + + The replacement has the same length as the original. + */ + (void)imm64_to_iregNo_EXACTLY4( + p, /*x*/9, Ptr_to_ULong(place_to_jump_to)); + p[4] = 0xD61F0120; + + VexInvalRange vir = {(HWord)p, 20}; + return vir; +} + + +/* NB: what goes on here has to be very closely coordinated with the + emitInstr case for XDirect, above. 
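For reference, the fixed-length immediate sequence that both the chain and unchain cases rely on can be sketched as below. This is an illustrative reconstruction only: the name sketch_imm64_to_x9_EXACTLY4 is hypothetical and the real imm64_to_iregNo_EXACTLY4 in this patch may differ in detail. 0xD2800000 and 0xF2800000 are the standard A64 MOVZ/MOVK encoding bases, with the hw (shift selector) field at bit 21, imm16 at bit 5 and Rd at bit 0.

   static UInt* sketch_imm64_to_x9_EXACTLY4 ( UInt* p, ULong imm64 )
   {
      // MOVZ x9, #imm64[15:0], lsl 0   (zeroes the other 48 bits of x9)
      *p++ = 0xD2800000 | ((UInt)((imm64 >>  0) & 0xFFFF) << 5) | 9;
      // MOVK x9, #imm64[31:16], lsl 16
      *p++ = 0xF2800000 | (1u << 21) | ((UInt)((imm64 >> 16) & 0xFFFF) << 5) | 9;
      // MOVK x9, #imm64[47:32], lsl 32
      *p++ = 0xF2800000 | (2u << 21) | ((UInt)((imm64 >> 32) & 0xFFFF) << 5) | 9;
      // MOVK x9, #imm64[63:48], lsl 48
      *p++ = 0xF2800000 | (3u << 21) | ((UInt)((imm64 >> 48) & 0xFFFF) << 5) | 9;
      // Always exactly 4 insns == 16 bytes, so the chain/unchain patch site
      // has a fixed layout and a fixed 20-byte invalidation range.
      return p;
   }
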
*/ +VexInvalRange unchainXDirect_ARM64 ( void* place_to_unchain, + void* place_to_jump_to_EXPECTED, + void* disp_cp_chain_me ) +{ + /* What we're expecting to see is: + movw x9, place_to_jump_to_EXPECTED[15:0] + movk x9, place_to_jump_to_EXPECTED[31:15], lsl 16 + movk x9, place_to_jump_to_EXPECTED[47:32], lsl 32 + movk x9, place_to_jump_to_EXPECTED[63:48], lsl 48 + br x9 + viz + <16 bytes generated by imm64_to_iregNo_EXACTLY4> + D6 1F 01 20 + */ + UInt* p = (UInt*)place_to_unchain; + vassert(0 == (3 & (HWord)p)); + vassert(is_imm64_to_iregNo_EXACTLY4( + p, /*x*/9, Ptr_to_ULong(place_to_jump_to_EXPECTED))); + vassert(p[4] == 0xD61F0120); + + /* And what we want to change it to is: + movw x9, disp_cp_chain_me_to[15:0] + movk x9, disp_cp_chain_me_to[31:15], lsl 16 + movk x9, disp_cp_chain_me_to[47:32], lsl 32 + movk x9, disp_cp_chain_me_to[63:48], lsl 48 + blr x9 + viz + <16 bytes generated by imm64_to_iregNo_EXACTLY4> + D6 3F 01 20 + */ + (void)imm64_to_iregNo_EXACTLY4( + p, /*x*/9, Ptr_to_ULong(disp_cp_chain_me)); + p[4] = 0xD63F0120; + + VexInvalRange vir = {(HWord)p, 20}; + return vir; +} + + +//ZZ /* Patch the counter address into a profile inc point, as previously +//ZZ created by the ARMin_ProfInc case for emit_ARMInstr. */ +//ZZ VexInvalRange patchProfInc_ARM ( void* place_to_patch, +//ZZ ULong* location_of_counter ) +//ZZ { +//ZZ vassert(sizeof(ULong*) == 4); +//ZZ UInt* p = (UInt*)place_to_patch; +//ZZ vassert(0 == (3 & (HWord)p)); +//ZZ vassert(is_imm32_to_iregNo_EXACTLY2(p, /*r*/12, 0x65556555)); +//ZZ vassert(p[2] == 0xE59CB000); +//ZZ vassert(p[3] == 0xE29BB001); +//ZZ vassert(p[4] == 0xE58CB000); +//ZZ vassert(p[5] == 0xE59CB004); +//ZZ vassert(p[6] == 0xE2ABB000); +//ZZ vassert(p[7] == 0xE58CB004); +//ZZ imm32_to_iregNo_EXACTLY2(p, /*r*/12, +//ZZ (UInt)Ptr_to_ULong(location_of_counter)); +//ZZ VexInvalRange vir = {(HWord)p, 8}; +//ZZ return vir; +//ZZ } +//ZZ +//ZZ +//ZZ #undef BITS4 +//ZZ #undef X0000 +//ZZ #undef X0001 +//ZZ #undef X0010 +//ZZ #undef X0011 +//ZZ #undef X0100 +//ZZ #undef X0101 +//ZZ #undef X0110 +//ZZ #undef X0111 +//ZZ #undef X1000 +//ZZ #undef X1001 +//ZZ #undef X1010 +//ZZ #undef X1011 +//ZZ #undef X1100 +//ZZ #undef X1101 +//ZZ #undef X1110 +//ZZ #undef X1111 +//ZZ #undef XXXXX___ +//ZZ #undef XXXXXX__ +//ZZ #undef XXX___XX +//ZZ #undef XXXXX__X +//ZZ #undef XXXXXXXX +//ZZ #undef XX______ + +/*---------------------------------------------------------------*/ +/*--- end host_arm64_defs.c ---*/ +/*---------------------------------------------------------------*/ Index: priv/host_arm64_defs.h =================================================================== --- priv/host_arm64_defs.h (.../tags/VEX_3_9_0) (revision 0) +++ priv/host_arm64_defs.h (.../trunk) (revision 2863) @@ -0,0 +1,1148 @@ + +/*---------------------------------------------------------------*/ +/*--- begin host_arm64_defs.h ---*/ +/*---------------------------------------------------------------*/ + +/* + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2013-2013 OpenWorks + info@open-works.net + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + + The GNU General Public License is contained in the file COPYING. +*/ + +#ifndef __VEX_HOST_ARM64_DEFS_H +#define __VEX_HOST_ARM64_DEFS_H + +#include "libvex_basictypes.h" +#include "libvex.h" // VexArch +#include "host_generic_regs.h" // HReg + +//ZZ extern UInt arm_hwcaps; + + +/* --------- Registers. --------- */ + +//ZZ /* The usual HReg abstraction. +//ZZ There are 16 general purpose regs. +//ZZ */ + +extern void ppHRegARM64 ( HReg ); + +extern HReg hregARM64_X0 ( void ); +extern HReg hregARM64_X1 ( void ); +extern HReg hregARM64_X2 ( void ); +extern HReg hregARM64_X3 ( void ); +extern HReg hregARM64_X4 ( void ); +extern HReg hregARM64_X5 ( void ); +extern HReg hregARM64_X6 ( void ); +extern HReg hregARM64_X7 ( void ); +//ZZ extern HReg hregARM_R8 ( void ); +extern HReg hregARM64_X9 ( void ); +extern HReg hregARM64_X10 ( void ); +extern HReg hregARM64_X11 ( void ); +extern HReg hregARM64_X12 ( void ); +extern HReg hregARM64_X13 ( void ); +extern HReg hregARM64_X14 ( void ); +extern HReg hregARM64_X15 ( void ); +extern HReg hregARM64_X21 ( void ); +extern HReg hregARM64_X22 ( void ); +extern HReg hregARM64_X23 ( void ); +extern HReg hregARM64_X24 ( void ); +extern HReg hregARM64_X25 ( void ); +extern HReg hregARM64_X26 ( void ); +extern HReg hregARM64_X27 ( void ); +extern HReg hregARM64_X28 ( void ); +extern HReg hregARM64_D8 ( void ); +extern HReg hregARM64_D9 ( void ); +extern HReg hregARM64_D10 ( void ); +extern HReg hregARM64_D11 ( void ); +extern HReg hregARM64_D12 ( void ); +extern HReg hregARM64_D13 ( void ); +extern HReg hregARM64_Q16 ( void ); +extern HReg hregARM64_Q17 ( void ); +extern HReg hregARM64_Q18 ( void ); + +/* Number of registers used for arg passing in function calls */ +#define ARM64_N_ARGREGS 8 /* x0 .. x7 */ + + +/* --------- Condition codes. --------- */ + +typedef + enum { + ARM64cc_EQ = 0, /* equal : Z=1 */ + ARM64cc_NE = 1, /* not equal : Z=0 */ + + ARM64cc_CS = 2, /* >=u (higher or same) : C=1 */ + ARM64cc_CC = 3, /* <u (lower) : C=0 */ + ARM64cc_MI = 4, /* minus (negative) : N=1 */ + ARM64cc_PL = 5, /* plus (zero or +ve) : N=0 */ + ARM64cc_VS = 6, /* overflow : V=1 */ + ARM64cc_VC = 7, /* no overflow : V=0 */ + ARM64cc_HI = 8, /* >u (higher) : C=1 && Z=0 */ + ARM64cc_LS = 9, /* <=u (lower or same) : !(C=1 && Z=0) */ + + ARM64cc_GE = 10, /* >=s (signed greater or equal) : N=V */ + ARM64cc_LT = 11, /* <s (signed less than) : N!=V */ + ARM64cc_GT = 12, /* >s (signed greater) : Z=0 && N=V */ + ARM64cc_LE = 13, /* <=s (signed less or equal) : !(Z=0 && N=V) */ + + ARM64cc_AL = 14, /* always (unconditional) */ + ARM64cc_NV = 15 /* in 64-bit mode also means "always" */ + } + ARM64CondCode; + + +/* --------- Memory address expressions (amodes). --------- */ + +typedef + enum { + ARM64am_RI9=10, /* reg + simm9 */ + ARM64am_RI12, /* reg + uimm12 * szB (iow, scaled by access size) */ + ARM64am_RR /* reg1 + reg2 */ + } + ARM64AModeTag; + +typedef + struct { + ARM64AModeTag tag; + union { + struct { + HReg reg; + Int simm9; /* -256 .. +255 */ + } RI9; + struct { + HReg reg; + UInt uimm12; /* 0 .. 4095 */ + UChar szB; /* 1, 2, 4, 8 (16 ?)
*/ + } RI12; + struct { + HReg base; + HReg index; + } RR; + } ARM64am; + } + ARM64AMode; + +extern ARM64AMode* ARM64AMode_RI9 ( HReg reg, Int simm9 ); +extern ARM64AMode* ARM64AMode_RI12 ( HReg reg, Int uimm12, UChar szB ); +extern ARM64AMode* ARM64AMode_RR ( HReg base, HReg index ); + + +/* --------- Reg or uimm12 or (uimm12 << 12) operands --------- */ + +typedef + enum { + ARM64riA_I12=20, /* uimm12 << 0 or 12 only */ + ARM64riA_R /* reg */ + } + ARM64RIATag; + +typedef + struct { + ARM64RIATag tag; + union { + struct { + UShort imm12; /* 0 .. 4095 */ + UChar shift; /* 0 or 12 only */ + } I12; + struct { + HReg reg; + } R; + } ARM64riA; + } + ARM64RIA; + +extern ARM64RIA* ARM64RIA_I12 ( UShort imm12, UChar shift ); +extern ARM64RIA* ARM64RIA_R ( HReg ); + + +/* --------- Reg or "bitfield" (logic immediate) operands --------- */ + +typedef + enum { + ARM64riL_I13=6, /* wierd-o bitfield immediate, 13 bits in total */ + ARM64riL_R /* reg */ + } + ARM64RILTag; + +typedef + struct { + ARM64RILTag tag; + union { + struct { + UChar bitN; /* 0 .. 1 */ + UChar immR; /* 0 .. 63 */ + UChar immS; /* 0 .. 63 */ + } I13; + struct { + HReg reg; + } R; + } ARM64riL; + } + ARM64RIL; + +extern ARM64RIL* ARM64RIL_I13 ( UChar bitN, UChar immR, UChar immS ); +extern ARM64RIL* ARM64RIL_R ( HReg ); + + +/* --------------- Reg or uimm6 operands --------------- */ + +typedef + enum { + ARM64ri6_I6=30, /* uimm6, 1 .. 63 only */ + ARM64ri6_R /* reg */ + } + ARM64RI6Tag; + +typedef + struct { + ARM64RI6Tag tag; + union { + struct { + UInt imm6; /* 1 .. 63 */ + } I6; + struct { + HReg reg; + } R; + } ARM64ri6; + } + ARM64RI6; + +extern ARM64RI6* ARM64RI6_I6 ( UInt imm6 ); +extern ARM64RI6* ARM64RI6_R ( HReg ); + + +/* --------------------- Instructions --------------------- */ + +typedef + enum { + ARM64lo_AND=40, + ARM64lo_OR, + ARM64lo_XOR + } + ARM64LogicOp; + +typedef + enum { + ARM64sh_SHL=50, + ARM64sh_SHR, + ARM64sh_SAR + } + ARM64ShiftOp; + +typedef + enum { + ARM64un_NEG=60, + ARM64un_NOT, + ARM64un_CLZ, + } + ARM64UnaryOp; + +typedef + enum { + ARM64mul_PLAIN=70, /* lo64(64 * 64) */ + ARM64mul_ZX, /* hi64(64 *u 64) */ + ARM64mul_SX /* hi64(64 *s 64) */ + } + ARM64MulOp; + +typedef + /* These characterise an integer-FP conversion, but don't imply any + particular direction. 
*/ + enum { + ARM64cvt_F32_I32S=80, + ARM64cvt_F64_I32S, + ARM64cvt_F32_I64S, + ARM64cvt_F64_I64S, + ARM64cvt_F32_I32U, + ARM64cvt_F64_I32U, + ARM64cvt_F32_I64U, + ARM64cvt_F64_I64U, + ARM64cvt_INVALID + } + ARM64CvtOp; + +typedef + enum { + ARM64fpb_ADD=100, + ARM64fpb_SUB, + ARM64fpb_MUL, + ARM64fpb_DIV, + ARM64fpb_INVALID + } + ARM64FpBinOp; + +typedef + enum { + ARM64fpu_NEG=110, + ARM64fpu_ABS, + ARM64fpu_SQRT, + ARM64fpu_RINT, + ARM64fpu_INVALID + } + ARM64FpUnaryOp; + +typedef + enum { + ARM64vecb_ADD64x2=120, + ARM64vecb_ADD32x4, + ARM64vecb_ADD16x8, + ARM64vecb_ADD8x16, + ARM64vecb_SUB64x2, + ARM64vecb_SUB32x4, + ARM64vecb_SUB16x8, + ARM64vecb_SUB8x16, + ARM64vecb_MUL32x4, + ARM64vecb_MUL16x8, + ARM64vecb_MUL8x16, + ARM64vecb_FADD64x2, + ARM64vecb_FSUB64x2, + ARM64vecb_FMUL64x2, + ARM64vecb_FDIV64x2, + ARM64vecb_FADD32x4, + ARM64vecb_FSUB32x4, + ARM64vecb_FMUL32x4, + ARM64vecb_FDIV32x4, + ARM64vecb_UMAX32x4, + ARM64vecb_UMAX16x8, + ARM64vecb_UMAX8x16, + ARM64vecb_UMIN32x4, + ARM64vecb_UMIN16x8, + ARM64vecb_UMIN8x16, + ARM64vecb_SMAX32x4, + ARM64vecb_SMAX16x8, + ARM64vecb_SMAX8x16, + ARM64vecb_SMIN32x4, + ARM64vecb_SMIN16x8, + ARM64vecb_SMIN8x16, + ARM64vecb_AND, + ARM64vecb_ORR, + ARM64vecb_XOR, + ARM64vecb_CMEQ64x2, + ARM64vecb_CMEQ32x4, + ARM64vecb_CMEQ16x8, + ARM64vecb_CMEQ8x16, + ARM64vecb_CMHI64x2, /* >u */ + ARM64vecb_CMHI32x4, + ARM64vecb_CMHI16x8, + ARM64vecb_CMHI8x16, + ARM64vecb_CMGT64x2, /* >s */ + ARM64vecb_CMGT32x4, + ARM64vecb_CMGT16x8, + ARM64vecb_CMGT8x16, + ARM64vecb_FCMEQ64x2, + ARM64vecb_FCMEQ32x4, + ARM64vecb_FCMGE64x2, + ARM64vecb_FCMGE32x4, + ARM64vecb_FCMGT64x2, + ARM64vecb_FCMGT32x4, + ARM64vecb_TBL1, + ARM64vecb_INVALID + } + ARM64VecBinOp; + +typedef + enum { + ARM64vecu_FNEG64x2=300, + ARM64vecu_FNEG32x4, + ARM64vecu_FABS64x2, + ARM64vecu_FABS32x4, + ARM64vecu_NOT, + ARM64vecu_INVALID + } + ARM64VecUnaryOp; + +typedef + enum { + ARM64vecsh_USHR64x2=350, + ARM64vecsh_USHR32x4, + ARM64vecsh_USHR16x8, + ARM64vecsh_USHR8x16, + ARM64vecsh_SSHR64x2, + ARM64vecsh_SSHR32x4, + ARM64vecsh_SSHR16x8, + ARM64vecsh_SSHR8x16, + ARM64vecsh_SHL64x2, + ARM64vecsh_SHL32x4, + ARM64vecsh_SHL16x8, + ARM64vecsh_SHL8x16, + ARM64vecsh_INVALID + } + ARM64VecShiftOp; + +//ZZ extern const HChar* showARMVfpUnaryOp ( ARMVfpUnaryOp op ); +//ZZ +//ZZ typedef +//ZZ enum { +//ZZ ARMneon_VAND=90, +//ZZ ARMneon_VORR, +//ZZ ARMneon_VXOR, +//ZZ ARMneon_VADD, +//ZZ ARMneon_VADDFP, +//ZZ ARMneon_VRHADDS, +//ZZ ARMneon_VRHADDU, +//ZZ ARMneon_VPADDFP, +//ZZ ARMneon_VABDFP, +//ZZ ARMneon_VSUB, +//ZZ ARMneon_VSUBFP, +//ZZ ARMneon_VMAXU, +//ZZ ARMneon_VMAXS, +//ZZ ARMneon_VMAXF, +//ZZ ARMneon_VMINU, +//ZZ ARMneon_VMINS, +//ZZ ARMneon_VMINF, +//ZZ ARMneon_VQADDU, +//ZZ ARMneon_VQADDS, +//ZZ ARMneon_VQSUBU, +//ZZ ARMneon_VQSUBS, +//ZZ ARMneon_VCGTU, +//ZZ ARMneon_VCGTS, +//ZZ ARMneon_VCGEU, +//ZZ ARMneon_VCGES, +//ZZ ARMneon_VCGTF, +//ZZ ARMneon_VCGEF, +//ZZ ARMneon_VCEQ, +//ZZ ARMneon_VCEQF, +//ZZ ARMneon_VEXT, +//ZZ ARMneon_VMUL, +//ZZ ARMneon_VMULFP, +//ZZ ARMneon_VMULLU, +//ZZ ARMneon_VMULLS, +//ZZ ARMneon_VMULP, +//ZZ ARMneon_VMULLP, +//ZZ ARMneon_VQDMULH, +//ZZ ARMneon_VQRDMULH, +//ZZ ARMneon_VPADD, +//ZZ ARMneon_VPMINU, +//ZZ ARMneon_VPMINS, +//ZZ ARMneon_VPMINF, +//ZZ ARMneon_VPMAXU, +//ZZ ARMneon_VPMAXS, +//ZZ ARMneon_VPMAXF, +//ZZ ARMneon_VTBL, +//ZZ ARMneon_VQDMULL, +//ZZ ARMneon_VRECPS, +//ZZ ARMneon_VRSQRTS, +//ZZ /* ... 
*/ +//ZZ } +//ZZ ARMNeonBinOp; +//ZZ +//ZZ typedef +//ZZ enum { +//ZZ ARMneon_VSHL=150, +//ZZ ARMneon_VSAL, /* Yah, not SAR but SAL */ +//ZZ ARMneon_VQSHL, +//ZZ ARMneon_VQSAL +//ZZ } +//ZZ ARMNeonShiftOp; +//ZZ +//ZZ typedef +//ZZ enum { +//ZZ ARMneon_COPY=160, +//ZZ ARMneon_COPYLU, +//ZZ ARMneon_COPYLS, +//ZZ ARMneon_COPYN, +//ZZ ARMneon_COPYQNSS, +//ZZ ARMneon_COPYQNUS, +//ZZ ARMneon_COPYQNUU, +//ZZ ARMneon_NOT, +//ZZ ARMneon_EQZ, +//ZZ ARMneon_DUP, +//ZZ ARMneon_PADDLS, +//ZZ ARMneon_PADDLU, +//ZZ ARMneon_CNT, +//ZZ ARMneon_CLZ, +//ZZ ARMneon_CLS, +//ZZ ARMneon_VCVTxFPxINT, +//ZZ ARMneon_VQSHLNSS, +//ZZ ARMneon_VQSHLNUU, +//ZZ ARMneon_VQSHLNUS, +//ZZ ARMneon_VCVTFtoU, +//ZZ ARMneon_VCVTFtoS, +//ZZ ARMneon_VCVTUtoF, +//ZZ ARMneon_VCVTStoF, +//ZZ ARMneon_VCVTFtoFixedU, +//ZZ ARMneon_VCVTFtoFixedS, +//ZZ ARMneon_VCVTFixedUtoF, +//ZZ ARMneon_VCVTFixedStoF, +//ZZ ARMneon_VCVTF16toF32, +//ZZ ARMneon_VCVTF32toF16, +//ZZ ARMneon_REV16, +//ZZ ARMneon_REV32, +//ZZ ARMneon_REV64, +//ZZ ARMneon_ABS, +//ZZ ARMneon_VNEGF, +//ZZ ARMneon_VRECIP, +//ZZ ARMneon_VRECIPF, +//ZZ ARMneon_VABSFP, +//ZZ ARMneon_VRSQRTEFP, +//ZZ ARMneon_VRSQRTE +//ZZ /* ... */ +//ZZ } +//ZZ ARMNeonUnOp; +//ZZ +//ZZ typedef +//ZZ enum { +//ZZ ARMneon_SETELEM=200, +//ZZ ARMneon_GETELEMU, +//ZZ ARMneon_GETELEMS, +//ZZ ARMneon_VDUP, +//ZZ } +//ZZ ARMNeonUnOpS; +//ZZ +//ZZ typedef +//ZZ enum { +//ZZ ARMneon_TRN=210, +//ZZ ARMneon_ZIP, +//ZZ ARMneon_UZP +//ZZ /* ... */ +//ZZ } +//ZZ ARMNeonDualOp; +//ZZ +//ZZ extern const HChar* showARMNeonBinOp ( ARMNeonBinOp op ); +//ZZ extern const HChar* showARMNeonUnOp ( ARMNeonUnOp op ); +//ZZ extern const HChar* showARMNeonUnOpS ( ARMNeonUnOpS op ); +//ZZ extern const HChar* showARMNeonShiftOp ( ARMNeonShiftOp op ); +//ZZ extern const HChar* showARMNeonDualOp ( ARMNeonDualOp op ); +//ZZ extern const HChar* showARMNeonBinOpDataType ( ARMNeonBinOp op ); +//ZZ extern const HChar* showARMNeonUnOpDataType ( ARMNeonUnOp op ); +//ZZ extern const HChar* showARMNeonUnOpSDataType ( ARMNeonUnOpS op ); +//ZZ extern const HChar* showARMNeonShiftOpDataType ( ARMNeonShiftOp op ); +//ZZ extern const HChar* showARMNeonDualOpDataType ( ARMNeonDualOp op ); + +typedef + enum { + /* baseline */ + ARM64in_Arith=1220, + ARM64in_Cmp, + ARM64in_Logic, + ARM64in_Test, + ARM64in_Shift, + ARM64in_Unary, + ARM64in_MovI, /* int reg-reg move */ + ARM64in_Imm64, + ARM64in_LdSt64, + ARM64in_LdSt32, /* w/ ZX loads */ + ARM64in_LdSt16, /* w/ ZX loads */ + ARM64in_LdSt8, /* w/ ZX loads */ + ARM64in_XDirect, /* direct transfer to GA */ + ARM64in_XIndir, /* indirect transfer to GA */ + ARM64in_XAssisted, /* assisted transfer to GA */ + ARM64in_CSel, + ARM64in_Call, + ARM64in_AddToSP, /* move SP by small, signed constant */ + ARM64in_FromSP, /* move SP to integer register */ + ARM64in_Mul, + ARM64in_LdrEX, + ARM64in_StrEX, + ARM64in_MFence, +//ZZ ARMin_CLREX, + /* ARM64in_V*: scalar ops involving vector registers */ + ARM64in_VLdStS, /* 32-bit FP load/store, with imm offset */ + ARM64in_VLdStD, /* 64-bit FP load/store, with imm offset */ + ARM64in_VLdStQ, + ARM64in_VCvtI2F, + ARM64in_VCvtF2I, + ARM64in_VCvtSD, + ARM64in_VUnaryD, + ARM64in_VUnaryS, + ARM64in_VBinD, + ARM64in_VBinS, + ARM64in_VCmpD, + ARM64in_VCmpS, + ARM64in_FPCR, + /* ARM64in_V*V: vector ops on vector registers */ + ARM64in_VBinV, + ARM64in_VUnaryV, + ARM64in_VNarrowV, + ARM64in_VShiftImmV, +//ZZ ARMin_VAluS, +//ZZ ARMin_VCMovD, +//ZZ ARMin_VCMovS, +//ZZ ARMin_VXferD, +//ZZ ARMin_VXferS, +//ZZ ARMin_VCvtID, +//ZZ /* Neon */ +//ZZ ARMin_NLdStD, +//ZZ ARMin_NUnary, 
+//ZZ ARMin_NUnaryS, +//ZZ ARMin_NDual, +//ZZ ARMin_NBinary, +//ZZ ARMin_NBinaryS, +//ZZ ARMin_NShift, +//ZZ ARMin_NShl64, // special case 64-bit shift of Dreg by immediate + ARM64in_VImmQ, + ARM64in_VDfromX, /* Move an Xreg to a Dreg */ + ARM64in_VQfromXX, /* Move 2 Xregs to a Qreg */ + ARM64in_VXfromQ, /* Move half a Qreg to an Xreg */ + ARM64in_VMov, /* vector reg-reg move, 16, 8 or 4 bytes */ + /* infrastructure */ + ARM64in_EvCheck, /* Event check */ +//ZZ ARMin_ProfInc /* 64-bit profile counter increment */ + } + ARM64InstrTag; + +/* Destinations are on the LEFT (first operand) */ + +typedef + struct { + ARM64InstrTag tag; + union { + /* --- INTEGER INSTRUCTIONS --- */ + /* 64 bit ADD/SUB reg, reg or uimm12<<{0,12} */ + struct { + HReg dst; + HReg argL; + ARM64RIA* argR; + Bool isAdd; + } Arith; + /* 64 or 32 bit CMP reg, reg or aimm (SUB and set flags) */ + struct { + HReg argL; + ARM64RIA* argR; + Bool is64; + } Cmp; + /* 64 bit AND/OR/XOR reg, reg or bitfield-immediate */ + struct { + HReg dst; + HReg argL; + ARM64RIL* argR; + ARM64LogicOp op; + } Logic; + /* 64 bit TST reg, reg or bimm (AND and set flags) */ + struct { + HReg argL; + ARM64RIL* argR; + } Test; + /* 64 bit SHL/SHR/SAR, 2nd arg is reg or imm */ + struct { + HReg dst; + HReg argL; + ARM64RI6* argR; + ARM64ShiftOp op; + } Shift; + /* NOT/NEG/CLZ, 64 bit only */ + struct { + HReg dst; + HReg src; + ARM64UnaryOp op; + } Unary; + /* MOV dst, src -- reg-reg move for integer registers */ + struct { + HReg dst; + HReg src; + } MovI; + /* Pseudo-insn; make a 64-bit immediate */ + struct { + HReg dst; + ULong imm64; + } Imm64; + /* 64-bit load or store */ + struct { + Bool isLoad; + HReg rD; + ARM64AMode* amode; + } LdSt64; + /* zx-32-to-64-bit load, or 32-bit store */ + struct { + Bool isLoad; + HReg rD; + ARM64AMode* amode; + } LdSt32; + /* zx-16-to-64-bit load, or 16-bit store */ + struct { + Bool isLoad; + HReg rD; + ARM64AMode* amode; + } LdSt16; + /* zx-8-to-64-bit load, or 8-bit store */ + struct { + Bool isLoad; + HReg rD; + ARM64AMode* amode; + } LdSt8; + /* Update the guest PC value, then exit requesting to chain + to it. May be conditional. Urr, use of Addr64 implicitly + assumes that wordsize(guest) == wordsize(host). */ + struct { + Addr64 dstGA; /* next guest address */ + ARM64AMode* amPC; /* amode in guest state for PC */ + ARM64CondCode cond; /* can be ARM64cc_AL */ + Bool toFastEP; /* chain to the slow or fast point? */ + } XDirect; + /* Boring transfer to a guest address not known at JIT time. + Not chainable. May be conditional. */ + struct { + HReg dstGA; + ARM64AMode* amPC; + ARM64CondCode cond; /* can be ARM64cc_AL */ + } XIndir; + /* Assisted transfer to a guest address, most general case. + Not chainable. May be conditional. */ + struct { + HReg dstGA; + ARM64AMode* amPC; + ARM64CondCode cond; /* can be ARM64cc_AL */ + IRJumpKind jk; + } XAssisted; + /* CSEL: dst = if cond then argL else argR. cond may be anything. */ + struct { + HReg dst; + HReg argL; + HReg argR; + ARM64CondCode cond; + } CSel; + /* Pseudo-insn. Call target (an absolute address), on given + condition (which could be ARM64cc_AL). */ + struct { + RetLoc rloc; /* where the return value will be */ + HWord target; + ARM64CondCode cond; + Int nArgRegs; /* # regs carrying args: 0 .. 8 */ + } Call; + /* move SP by small, signed constant */ + struct { + Int simm; /* needs to be 0 % 16 and in the range -4095 + .. 
4095 inclusive */ + } AddToSP; + /* move SP to integer register */ + struct { + HReg dst; + } FromSP; + /* Integer multiply, with 3 variants: + (PLAIN) lo64(64 * 64) + (ZX) hi64(64 *u 64) + (SX) hi64(64 *s 64) + */ + struct { + HReg dst; + HReg argL; + HReg argR; + ARM64MulOp op; + } Mul; + /* LDXR{,H,B} x2, [x4] */ + struct { + Int szB; /* 1, 2, 4 or 8 */ + } LdrEX; + /* STXR{,H,B} w0, x2, [x4] */ + struct { + Int szB; /* 1, 2, 4 or 8 */ + } StrEX; + /* Mem fence. An insn which fences all loads and stores as + much as possible before continuing. On ARM64 we emit the + sequence "dsb sy ; dmb sy ; isb sy", which is probably + total nuclear overkill, but better safe than sorry. */ + struct { + } MFence; +//ZZ /* A CLREX instruction. */ +//ZZ struct { +//ZZ } CLREX; + /* --- INSTRUCTIONS INVOLVING VECTOR REGISTERS --- */ + /* 32-bit Fp load/store */ + struct { + Bool isLoad; + HReg sD; + HReg rN; + UInt uimm12; /* 0 .. 16380 inclusive, 0 % 4 */ + } VLdStS; + /* 64-bit Fp load/store */ + struct { + Bool isLoad; + HReg dD; + HReg rN; + UInt uimm12; /* 0 .. 32760 inclusive, 0 % 8 */ + } VLdStD; + /* 128-bit Vector load/store. */ + struct { + Bool isLoad; + HReg rQ; // data + HReg rN; // address + } VLdStQ; + /* Scalar conversion of int to float. */ + struct { + ARM64CvtOp how; + HReg rD; // dst, a D or S register + HReg rS; // src, a W or X register + } VCvtI2F; + /* Scalar conversion of float to int, w/ specified RM. */ + struct { + ARM64CvtOp how; + HReg rD; // dst, a W or X register + HReg rS; // src, a D or S register + UChar armRM; // ARM encoded RM: + // 00=nearest, 01=+inf, 10=-inf, 11=zero + } VCvtF2I; + /* Convert between 32-bit and 64-bit FP values (both + ways). (FCVT) */ + struct { + Bool sToD; /* True: F32->F64. False: F64->F32 */ + HReg dst; + HReg src; + } VCvtSD; + /* 64-bit FP unary */ + struct { + ARM64FpUnaryOp op; + HReg dst; + HReg src; + } VUnaryD; + /* 32-bit FP unary */ + struct { + ARM64FpUnaryOp op; + HReg dst; + HReg src; + } VUnaryS; + /* 64-bit FP binary arithmetic */ + struct { + ARM64FpBinOp op; + HReg dst; + HReg argL; + HReg argR; + } VBinD; + /* 32-bit FP binary arithmetic */ + struct { + ARM64FpBinOp op; + HReg dst; + HReg argL; + HReg argR; + } VBinS; + /* 64-bit FP compare */ + struct { + HReg argL; + HReg argR; + } VCmpD; + /* 32-bit FP compare */ + struct { + HReg argL; + HReg argR; + } VCmpS; + /* Move a 32-bit value to/from the FPCR */ + struct { + Bool toFPCR; + HReg iReg; + } FPCR; + /* binary vector operation on vector registers */ + struct { + ARM64VecBinOp op; + HReg dst; + HReg argL; + HReg argR; + } VBinV; + /* unary vector operation on vector registers */ + struct { + ARM64VecUnaryOp op; + HReg dst; + HReg arg; + } VUnaryV; + /* vector narrowing, Q -> Q. Result goes in the bottom half + of dst and the top half is zeroed out. Iow is XTN. */ + struct { + UInt dszBlg2; // 0: 16to8_x8 1: 32to16_x4 2: 64to32_x2 + HReg dst; // Q reg + HReg src; // Q reg + } VNarrowV; + /* Vector shift by immediate. |amt| needs to be > 0 and < + implied lane size of |op|. Zero shifts and out of range + shifts are not allowed. */ + struct { + ARM64VecShiftOp op; + HReg dst; + HReg src; + UInt amt; + } VShiftImmV; +//ZZ /* 32-bit FP binary arithmetic */ +//ZZ struct { +//ZZ ARMVfpOp op; +//ZZ HReg dst; +//ZZ HReg argL; +//ZZ HReg argR; +//ZZ } VAluS; +//ZZ /* 64-bit FP mov src to dst on the given condition, which may +//ZZ not be ARMcc_AL. 
*/ +//ZZ struct { +//ZZ ARMCondCode cond; +//ZZ HReg dst; +//ZZ HReg src; +//ZZ } VCMovD; +//ZZ /* 32-bit FP mov src to dst on the given condition, which may +//ZZ not be ARMcc_AL. */ +//ZZ struct { +//ZZ ARMCondCode cond; +//ZZ HReg dst; +//ZZ HReg src; +//ZZ } VCMovS; +//ZZ /* Transfer a VFP D reg to/from two integer registers (VMOV) */ +//ZZ struct { +//ZZ Bool toD; +//ZZ HReg dD; +//ZZ HReg rHi; +//ZZ HReg rLo; +//ZZ } VXferD; +//ZZ /* Transfer a VFP S reg to/from an integer register (VMOV) */ +//ZZ struct { +//ZZ Bool toS; +//ZZ HReg fD; +//ZZ HReg rLo; +//ZZ } VXferS; +//ZZ /* Convert between 32-bit ints and 64-bit FP values (both ways +//ZZ and both signednesses). (FSITOD, FUITOD, FTOSID, FTOUID) */ +//ZZ struct { +//ZZ Bool iToD; /* True: I32->F64. False: F64->I32 */ +//ZZ Bool syned; /* True: I32 is signed. False: I32 is unsigned */ +//ZZ HReg dst; +//ZZ HReg src; +//ZZ } VCvtID; +//ZZ /* Neon data processing instruction: 3 registers of the same +//ZZ length */ +//ZZ struct { +//ZZ ARMNeonBinOp op; +//ZZ HReg dst; +//ZZ HReg argL; +//ZZ HReg argR; +//ZZ UInt size; +//ZZ Bool Q; +//ZZ } NBinary; +//ZZ struct { +//ZZ ARMNeonBinOp op; +//ZZ ARMNRS* dst; +//ZZ ARMNRS* argL; +//ZZ ARMNRS* argR; +//ZZ UInt size; +//ZZ Bool Q; +//ZZ } NBinaryS; +//ZZ struct { +//ZZ ARMNeonShiftOp op; +//ZZ HReg dst; +//ZZ HReg argL; +//ZZ HReg argR; +//ZZ UInt size; +//ZZ Bool Q; +//ZZ } NShift; +//ZZ struct { +//ZZ HReg dst; +//ZZ HReg src; +//ZZ UInt amt; /* 1..63 only */ +//ZZ } NShl64; +//ZZ struct { +//ZZ Bool isLoad; +//ZZ HReg dD; +//ZZ ARMAModeN *amode; +//ZZ } NLdStD +//ZZ struct { +//ZZ ARMNeonUnOpS op; +//ZZ ARMNRS* dst; +//ZZ ARMNRS* src; +//ZZ UInt size; +//ZZ Bool Q; +//ZZ } NUnaryS; +//ZZ struct { +//ZZ ARMNeonUnOp op; +//ZZ HReg dst; +//ZZ HReg src; +//ZZ UInt size; +//ZZ Bool Q; +//ZZ } NUnary; +//ZZ /* Takes two arguments and modifies them both. */ +//ZZ struct { +//ZZ ARMNeonDualOp op; +//ZZ HReg arg1; +//ZZ HReg arg2; +//ZZ UInt size; +//ZZ Bool Q; +//ZZ } NDual; + struct { + HReg rQ; + UShort imm; /* Same 1-bit-per-byte encoding as IR */ + } VImmQ; + struct { + HReg rD; + HReg rX; + } VDfromX; + struct { + HReg rQ; + HReg rXhi; + HReg rXlo; + } VQfromXX; + struct { + HReg rX; + HReg rQ; + UInt laneNo; /* either 0 or 1 */ + } VXfromQ; + /* MOV dst, src -- reg-reg move for vector registers */ + struct { + UInt szB; // 16=mov qD,qS; 8=mov dD,dS; 4=mov sD,sS + HReg dst; + HReg src; + } VMov; + struct { + ARM64AMode* amCounter; + ARM64AMode* amFailAddr; + } EvCheck; +//ZZ struct { +//ZZ /* No fields. The address of the counter to inc is +//ZZ installed later, post-translation, by patching it in, +//ZZ as it is not known at translation time. 
*/ +//ZZ } ProfInc; + } ARM64in; + } + ARM64Instr; + +//ZZ +extern ARM64Instr* ARM64Instr_Arith ( HReg, HReg, ARM64RIA*, Bool isAdd ); +extern ARM64Instr* ARM64Instr_Cmp ( HReg, ARM64RIA*, Bool is64 ); +extern ARM64Instr* ARM64Instr_Logic ( HReg, HReg, ARM64RIL*, ARM64LogicOp ); +extern ARM64Instr* ARM64Instr_Test ( HReg, ARM64RIL* ); +extern ARM64Instr* ARM64Instr_Shift ( HReg, HReg, ARM64RI6*, ARM64ShiftOp ); +extern ARM64Instr* ARM64Instr_Unary ( HReg, HReg, ARM64UnaryOp ); +//ZZ extern ARMInstr* ARMInstr_CmpOrTst ( Bool isCmp, HReg, ARMRI84* ); +extern ARM64Instr* ARM64Instr_MovI ( HReg, HReg ); +extern ARM64Instr* ARM64Instr_Imm64 ( HReg, ULong ); +extern ARM64Instr* ARM64Instr_LdSt64 ( Bool isLoad, HReg, ARM64AMode* ); +extern ARM64Instr* ARM64Instr_LdSt32 ( Bool isLoad, HReg, ARM64AMode* ); +extern ARM64Instr* ARM64Instr_LdSt16 ( Bool isLoad, HReg, ARM64AMode* ); +extern ARM64Instr* ARM64Instr_LdSt8 ( Bool isLoad, HReg, ARM64AMode* ); +//ZZ extern ARMInstr* ARMInstr_Ld8S ( ARMCondCode, HReg, ARMAMode2* ); +extern ARM64Instr* ARM64Instr_XDirect ( Addr64 dstGA, ARM64AMode* amPC, + ARM64CondCode cond, Bool toFastEP ); +extern ARM64Instr* ARM64Instr_XIndir ( HReg dstGA, ARM64AMode* amPC, + ARM64CondCode cond ); +extern ARM64Instr* ARM64Instr_XAssisted ( HReg dstGA, ARM64AMode* amPC, + ARM64CondCode cond, IRJumpKind jk ); +extern ARM64Instr* ARM64Instr_CSel ( HReg dst, HReg argL, HReg argR, + ARM64CondCode cond ); +extern ARM64Instr* ARM64Instr_Call ( ARM64CondCode, HWord, Int nArgRegs, + RetLoc rloc ); +extern ARM64Instr* ARM64Instr_AddToSP ( Int simm ); +extern ARM64Instr* ARM64Instr_FromSP ( HReg dst ); +extern ARM64Instr* ARM64Instr_Mul ( HReg dst, HReg argL, HReg argR, + ARM64MulOp op ); +extern ARM64Instr* ARM64Instr_LdrEX ( Int szB ); +extern ARM64Instr* ARM64Instr_StrEX ( Int szB ); +extern ARM64Instr* ARM64Instr_MFence ( void ); +//ZZ extern ARMInstr* ARMInstr_CLREX ( void ); +extern ARM64Instr* ARM64Instr_VLdStS ( Bool isLoad, HReg sD, HReg rN, + UInt uimm12 /* 0 .. 16380, 0 % 4 */ ); +extern ARM64Instr* ARM64Instr_VLdStD ( Bool isLoad, HReg dD, HReg rN, + UInt uimm12 /* 0 .. 
32760, 0 % 8 */ ); +extern ARM64Instr* ARM64Instr_VLdStQ ( Bool isLoad, HReg rQ, HReg rN ); +extern ARM64Instr* ARM64Instr_VCvtI2F ( ARM64CvtOp how, HReg rD, HReg rS ); +extern ARM64Instr* ARM64Instr_VCvtF2I ( ARM64CvtOp how, HReg rD, HReg rS, + UChar armRM ); +extern ARM64Instr* ARM64Instr_VCvtSD ( Bool sToD, HReg dst, HReg src ); +extern ARM64Instr* ARM64Instr_VUnaryD ( ARM64FpUnaryOp op, HReg dst, HReg src ); +extern ARM64Instr* ARM64Instr_VUnaryS ( ARM64FpUnaryOp op, HReg dst, HReg src ); +extern ARM64Instr* ARM64Instr_VBinD ( ARM64FpBinOp op, HReg, HReg, HReg ); +extern ARM64Instr* ARM64Instr_VBinS ( ARM64FpBinOp op, HReg, HReg, HReg ); +extern ARM64Instr* ARM64Instr_VCmpD ( HReg argL, HReg argR ); +extern ARM64Instr* ARM64Instr_VCmpS ( HReg argL, HReg argR ); +extern ARM64Instr* ARM64Instr_FPCR ( Bool toFPCR, HReg iReg ); +extern ARM64Instr* ARM64Instr_VBinV ( ARM64VecBinOp op, HReg, HReg, HReg ); +extern ARM64Instr* ARM64Instr_VUnaryV ( ARM64VecUnaryOp op, HReg, HReg ); +extern ARM64Instr* ARM64Instr_VNarrowV ( UInt dszBlg2, HReg dst, HReg src ); +extern ARM64Instr* ARM64Instr_VShiftImmV ( ARM64VecShiftOp op, + HReg dst, HReg src, UInt amt ); +//ZZ extern ARMInstr* ARMInstr_VAluS ( ARMVfpOp op, HReg, HReg, HReg ); +//ZZ extern ARMInstr* ARMInstr_VCMovD ( ARMCondCode, HReg dst, HReg src ); +//ZZ extern ARMInstr* ARMInstr_VCMovS ( ARMCondCode, HReg dst, HReg src ); +//ZZ extern ARMInstr* ARMInstr_VXferD ( Bool toD, HReg dD, HReg rHi, HReg rLo ); +//ZZ extern ARMInstr* ARMInstr_VXferS ( Bool toS, HReg fD, HReg rLo ); +//ZZ extern ARMInstr* ARMInstr_VCvtID ( Bool iToD, Bool syned, +//ZZ HReg dst, HReg src ); +//ZZ extern ARMInstr* ARMInstr_NLdStD ( Bool isLoad, HReg, ARMAModeN* ); +//ZZ extern ARMInstr* ARMInstr_NUnary ( ARMNeonUnOp, HReg, HReg, UInt, Bool ); +//ZZ extern ARMInstr* ARMInstr_NUnaryS ( ARMNeonUnOpS, ARMNRS*, ARMNRS*, +//ZZ UInt, Bool ); +//ZZ extern ARMInstr* ARMInstr_NDual ( ARMNeonDualOp, HReg, HReg, UInt, Bool ); +//ZZ extern ARMInstr* ARMInstr_NBinary ( ARMNeonBinOp, HReg, HReg, HReg, +//ZZ UInt, Bool ); +//ZZ extern ARMInstr* ARMInstr_NShift ( ARMNeonShiftOp, HReg, HReg, HReg, +//ZZ UInt, Bool ); +//ZZ extern ARMInstr* ARMInstr_NShl64 ( HReg, HReg, UInt ); +extern ARM64Instr* ARM64Instr_VImmQ ( HReg, UShort ); +extern ARM64Instr* ARM64Instr_VDfromX ( HReg rD, HReg rX ); +extern ARM64Instr* ARM64Instr_VQfromXX( HReg rQ, HReg rXhi, HReg rXlo ); +extern ARM64Instr* ARM64Instr_VXfromQ ( HReg rX, HReg rQ, UInt laneNo ); +extern ARM64Instr* ARM64Instr_VMov ( UInt szB, HReg dst, HReg src ); + +extern ARM64Instr* ARM64Instr_EvCheck ( ARM64AMode* amCounter, + ARM64AMode* amFailAddr ); +//ZZ extern ARMInstr* ARMInstr_ProfInc ( void ); + +extern void ppARM64Instr ( ARM64Instr* ); + + +/* Some functions that insulate the register allocator from details + of the underlying instruction set. 
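As one concrete illustration of that insulation, the move-coalescing query typically just pattern-matches the plain reg-reg copy instruction and reports its operands. The following is a minimal sketch, assuming only ARM64in_MovI is reported; the _sketch suffix marks it as hypothetical, since the real isMove_ARM64Instr in this patch may also handle other cases (for example vector moves).

   static Bool isMove_ARM64Instr_sketch ( ARM64Instr* i, HReg* src, HReg* dst )
   {
      // Only integer reg-reg copies are reported as coalescable moves;
      // anything else is treated as a real instruction.
      if (i->tag == ARM64in_MovI) {
         *src = i->ARM64in.MovI.src;
         *dst = i->ARM64in.MovI.dst;
         return True;
      }
      return False;
   }
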
*/ +extern void getRegUsage_ARM64Instr ( HRegUsage*, ARM64Instr*, Bool ); +extern void mapRegs_ARM64Instr ( HRegRemap*, ARM64Instr*, Bool ); +extern Bool isMove_ARM64Instr ( ARM64Instr*, HReg*, HReg* ); +extern Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, + UChar* buf, Int nbuf, ARM64Instr* i, + Bool mode64, + void* disp_cp_chain_me_to_slowEP, + void* disp_cp_chain_me_to_fastEP, + void* disp_cp_xindir, + void* disp_cp_xassisted ); + +extern void genSpill_ARM64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, + HReg rreg, Int offset, Bool ); +extern void genReload_ARM64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, + HReg rreg, Int offset, Bool ); + +extern void getAllocableRegs_ARM64 ( Int*, HReg** ); +extern HInstrArray* iselSB_ARM64 ( IRSB*, + VexArch, + VexArchInfo*, + VexAbiInfo*, + Int offs_Host_EvC_Counter, + Int offs_Host_EvC_FailAddr, + Bool chainingAllowed, + Bool addProfInc, + Addr64 max_ga ); + +/* How big is an event check? This is kind of a kludge because it + depends on the offsets of host_EvC_FAILADDR and + host_EvC_COUNTER. */ +extern Int evCheckSzB_ARM64 ( void ); + +/* Perform a chaining and unchaining of an XDirect jump. */ +extern VexInvalRange chainXDirect_ARM64 ( void* place_to_chain, + void* disp_cp_chain_me_EXPECTED, + void* place_to_jump_to ); + +extern VexInvalRange unchainXDirect_ARM64 ( void* place_to_unchain, + void* place_to_jump_to_EXPECTED, + void* disp_cp_chain_me ); + +//ZZ /* Patch the counter location into an existing ProfInc point. */ +//ZZ extern VexInvalRange patchProfInc_ARM ( void* place_to_patch, +//ZZ ULong* location_of_counter ); + + +#endif /* ndef __VEX_HOST_ARM64_DEFS_H */ + +/*---------------------------------------------------------------*/ +/*--- end host_arm64_defs.h ---*/ +/*---------------------------------------------------------------*/ Index: priv/host_arm64_isel.c =================================================================== --- priv/host_arm64_isel.c (.../tags/VEX_3_9_0) (revision 0) +++ priv/host_arm64_isel.c (.../trunk) (revision 2863) @@ -0,0 +1,7058 @@ + +/*---------------------------------------------------------------*/ +/*--- begin host_arm64_isel.c ---*/ +/*---------------------------------------------------------------*/ + +/* + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2013-2013 OpenWorks + info@open-works.net + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#include "libvex_basictypes.h" +#include "libvex_ir.h" +#include "libvex.h" +#include "ir_match.h" + +#include "main_util.h" +#include "main_globals.h" +#include "host_generic_regs.h" +#include "host_generic_simd64.h" // for 32-bit SIMD helpers +#include "host_arm64_defs.h" + + +//ZZ /*---------------------------------------------------------*/ +//ZZ /*--- ARMvfp control word stuff ---*/ +//ZZ /*---------------------------------------------------------*/ +//ZZ +//ZZ /* Vex-generated code expects to run with the FPU set as follows: all +//ZZ exceptions masked, round-to-nearest, non-vector mode, with the NZCV +//ZZ flags cleared, and FZ (flush to zero) disabled. Curiously enough, +//ZZ this corresponds to a FPSCR value of zero. +//ZZ +//ZZ fpscr should therefore be zero on entry to Vex-generated code, and +//ZZ should be unchanged at exit. (Or at least the bottom 28 bits +//ZZ should be zero). +//ZZ */ +//ZZ +//ZZ #define DEFAULT_FPSCR 0 + + +/*---------------------------------------------------------*/ +/*--- ISelEnv ---*/ +/*---------------------------------------------------------*/ + +/* This carries around: + + - A mapping from IRTemp to IRType, giving the type of any IRTemp we + might encounter. This is computed before insn selection starts, + and does not change. + + - A mapping from IRTemp to HReg. This tells the insn selector + which virtual register is associated with each IRTemp temporary. + This is computed before insn selection starts, and does not + change. We expect this mapping to map precisely the same set of + IRTemps as the type mapping does. + + |vregmap| holds the primary register for the IRTemp. + |vregmapHI| is only used for 128-bit integer-typed + IRTemps. It holds the identity of a second + 64-bit virtual HReg, which holds the high half + of the value. + + - The code array, that is, the insns selected so far. + + - A counter, for generating new virtual registers. + + - The host hardware capabilities word. This is set at the start + and does not change. + + - A Bool for indicating whether we may generate chain-me + instructions for control flow transfers, or whether we must use + XAssisted. + + - The maximum guest address of any guest insn in this block. + Actually, the address of the highest-addressed byte from any insn + in this block. Is set at the start and does not change. This is + used for detecting jumps which are definitely forward-edges from + this block, and therefore can be made (chained) to the fast entry + point of the destination, thereby avoiding the destination's + event check. + + - An IRExpr*, which may be NULL, holding the IR expression (an + IRRoundingMode-encoded value) to which the FPU's rounding mode + was most recently set. Setting to NULL is always safe. Used to + avoid redundant settings of the FPU's rounding mode, as + described in set_FPCR_rounding_mode below. + + Note, this is all (well, mostly) host-independent. +*/ + +typedef + struct { + /* Constant -- are set at the start and do not change. */ + IRTypeEnv* type_env; + + HReg* vregmap; + HReg* vregmapHI; + Int n_vregmap; + + UInt hwcaps; + + Bool chainingAllowed; + Addr64 max_ga; + + /* These are modified as we go along. 
*/ + HInstrArray* code; + Int vreg_ctr; + + IRExpr* previous_rm; + } + ISelEnv; + +static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp ) +{ + vassert(tmp >= 0); + vassert(tmp < env->n_vregmap); + return env->vregmap[tmp]; +} + +static void addInstr ( ISelEnv* env, ARM64Instr* instr ) +{ + addHInstr(env->code, instr); + if (vex_traceflags & VEX_TRACE_VCODE) { + ppARM64Instr(instr); + vex_printf("\n"); + } +} + +static HReg newVRegI ( ISelEnv* env ) +{ + HReg reg = mkHReg(env->vreg_ctr, HRcInt64, True/*virtual reg*/); + env->vreg_ctr++; + return reg; +} + +static HReg newVRegD ( ISelEnv* env ) +{ + HReg reg = mkHReg(env->vreg_ctr, HRcFlt64, True/*virtual reg*/); + env->vreg_ctr++; + return reg; +} + +//ZZ static HReg newVRegF ( ISelEnv* env ) +//ZZ { +//ZZ HReg reg = mkHReg(env->vreg_ctr, HRcFlt32, True/*virtual reg*/); +//ZZ env->vreg_ctr++; +//ZZ return reg; +//ZZ } + +static HReg newVRegV ( ISelEnv* env ) +{ + HReg reg = mkHReg(env->vreg_ctr, HRcVec128, True/*virtual reg*/); + env->vreg_ctr++; + return reg; +} + +//ZZ /* These are duplicated in guest_arm_toIR.c */ +//ZZ static IRExpr* unop ( IROp op, IRExpr* a ) +//ZZ { +//ZZ return IRExpr_Unop(op, a); +//ZZ } +//ZZ +//ZZ static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 ) +//ZZ { +//ZZ return IRExpr_Binop(op, a1, a2); +//ZZ } +//ZZ +//ZZ static IRExpr* bind ( Int binder ) +//ZZ { +//ZZ return IRExpr_Binder(binder); +//ZZ } + + +/*---------------------------------------------------------*/ +/*--- ISEL: Forward declarations ---*/ +/*---------------------------------------------------------*/ + +/* These are organised as iselXXX and iselXXX_wrk pairs. The + iselXXX_wrk do the real work, but are not to be called directly. + For each XXX, iselXXX calls its iselXXX_wrk counterpart, then + checks that all returned registers are virtual. You should not + call the _wrk version directly. + + Because some forms of ARM64 memory amodes are implicitly scaled by + the access size, iselIntExpr_AMode takes an IRType which tells it + the type of the access for which the amode is to be used. This + type needs to be correct, else you'll get incorrect code. 
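To make the wrapper convention above concrete, the public entry point for the 64-bit integer case is normally nothing more than a sanity-checking shim around its _wrk partner. This is a minimal sketch (named with a _sketch suffix, since the real iselIntExpr_R may carry further checks), assuming the standard VEX helpers hregClass and hregIsVirtual:

   static HReg iselIntExpr_R_sketch ( ISelEnv* env, IRExpr* e )
   {
      HReg r = iselIntExpr_R_wrk(env, e);
      // Sanity-check what the _wrk routine handed back: it must be a
      // 64-bit integer class register, and it must be virtual.
      vassert(hregClass(r) == HRcInt64);
      vassert(hregIsVirtual(r));
      return r;
   }
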
+*/ +static ARM64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, + IRExpr* e, IRType dty ); +static ARM64AMode* iselIntExpr_AMode ( ISelEnv* env, + IRExpr* e, IRType dty ); + +static ARM64RIA* iselIntExpr_RIA_wrk ( ISelEnv* env, IRExpr* e ); +static ARM64RIA* iselIntExpr_RIA ( ISelEnv* env, IRExpr* e ); + +static ARM64RIL* iselIntExpr_RIL_wrk ( ISelEnv* env, IRExpr* e ); +static ARM64RIL* iselIntExpr_RIL ( ISelEnv* env, IRExpr* e ); + +static ARM64RI6* iselIntExpr_RI6_wrk ( ISelEnv* env, IRExpr* e ); +static ARM64RI6* iselIntExpr_RI6 ( ISelEnv* env, IRExpr* e ); + +static ARM64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e ); +static ARM64CondCode iselCondCode ( ISelEnv* env, IRExpr* e ); + +static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ); +static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e ); + +static void iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo, + ISelEnv* env, IRExpr* e ); +static void iselInt128Expr ( /*OUT*/HReg* rHi, HReg* rLo, + ISelEnv* env, IRExpr* e ); + + +//ZZ static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo, +//ZZ ISelEnv* env, IRExpr* e ); +//ZZ static void iselInt64Expr ( HReg* rHi, HReg* rLo, +//ZZ ISelEnv* env, IRExpr* e ); + +static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e ); +static HReg iselDblExpr ( ISelEnv* env, IRExpr* e ); + +static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e ); +static HReg iselFltExpr ( ISelEnv* env, IRExpr* e ); + +//ZZ static HReg iselNeon64Expr_wrk ( ISelEnv* env, IRExpr* e ); +//ZZ static HReg iselNeon64Expr ( ISelEnv* env, IRExpr* e ); + +static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ); +static HReg iselV128Expr ( ISelEnv* env, IRExpr* e ); + +static ARM64RIL* mb_mkARM64RIL_I ( ULong imm64 ); + + +/*---------------------------------------------------------*/ +/*--- ISEL: Misc helpers ---*/ +/*---------------------------------------------------------*/ + +/* Generate an amode suitable for a 64-bit sized access relative to + the baseblock register (X21). This generates an RI12 amode, which + means its scaled by the access size, which is why the access size + -- 64 bit -- is stated explicitly here. Consequently |off| needs + to be divisible by 8. */ +static ARM64AMode* mk_baseblock_64bit_access_amode ( UInt off ) +{ + vassert(off < (8 << 12)); /* otherwise it's unrepresentable */ + vassert((off & 7) == 0); /* ditto */ + return ARM64AMode_RI12(hregARM64_X21(), off >> 3, 8/*scale*/); +} + +/* Ditto, for 32 bit accesses. */ +static ARM64AMode* mk_baseblock_32bit_access_amode ( UInt off ) +{ + vassert(off < (4 << 12)); /* otherwise it's unrepresentable */ + vassert((off & 3) == 0); /* ditto */ + return ARM64AMode_RI12(hregARM64_X21(), off >> 2, 4/*scale*/); +} + +/* Ditto, for 16 bit accesses. */ +static ARM64AMode* mk_baseblock_16bit_access_amode ( UInt off ) +{ + vassert(off < (2 << 12)); /* otherwise it's unrepresentable */ + vassert((off & 1) == 0); /* ditto */ + return ARM64AMode_RI12(hregARM64_X21(), off >> 1, 2/*scale*/); +} + +/* Ditto, for 8 bit accesses. 
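A small usage example of the scaling convention above; the offset 24 is made up purely for illustration and is not taken from the real guest state layout.

   // A 64-bit guest-state slot at byte offset 24 (hypothetical offset).
   // RI12 stores the scaled immediate, so uimm12 = 24 >> 3 = 3 with szB = 8;
   // the eventual LDR/STR encoding multiplies it back up: 3 * 8 = 24.
   ARM64AMode* am = mk_baseblock_64bit_access_amode(24);
   // ...which is equivalent to ARM64AMode_RI12(hregARM64_X21(), 3, 8).
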
*/ +static ARM64AMode* mk_baseblock_8bit_access_amode ( UInt off ) +{ + vassert(off < (1 << 12)); /* otherwise it's unrepresentable */ + return ARM64AMode_RI12(hregARM64_X21(), off >> 0, 1/*scale*/); +} + +static HReg mk_baseblock_128bit_access_addr ( ISelEnv* env, UInt off ) +{ + vassert(off < (1<<12)); + HReg r = newVRegI(env); + addInstr(env, ARM64Instr_Arith(r, hregARM64_X21(), + ARM64RIA_I12(off,0), True/*isAdd*/)); + return r; +} + +static HReg get_baseblock_register ( void ) +{ + return hregARM64_X21(); +} + +/* Generate code to zero extend a 32 bit value in 'src' to 64 bits, in + a new register, and return the new register. */ +static HReg widen_z_32_to_64 ( ISelEnv* env, HReg src ) +{ + HReg dst = newVRegI(env); + ARM64RIL* mask = ARM64RIL_I13(1, 0, 31); /* encodes 0xFFFFFFFF */ + addInstr(env, ARM64Instr_Logic(dst, src, mask, ARM64lo_AND)); + return dst; +} + +/* Generate code to sign extend a 16 bit value in 'src' to 64 bits, in + a new register, and return the new register. */ +static HReg widen_s_16_to_64 ( ISelEnv* env, HReg src ) +{ + HReg dst = newVRegI(env); + ARM64RI6* n48 = ARM64RI6_I6(48); + addInstr(env, ARM64Instr_Shift(dst, src, n48, ARM64sh_SHL)); + addInstr(env, ARM64Instr_Shift(dst, dst, n48, ARM64sh_SAR)); + return dst; +} + +/* Generate code to zero extend a 16 bit value in 'src' to 64 bits, in + a new register, and return the new register. */ +static HReg widen_z_16_to_64 ( ISelEnv* env, HReg src ) +{ + HReg dst = newVRegI(env); + ARM64RI6* n48 = ARM64RI6_I6(48); + addInstr(env, ARM64Instr_Shift(dst, src, n48, ARM64sh_SHL)); + addInstr(env, ARM64Instr_Shift(dst, dst, n48, ARM64sh_SHR)); + return dst; +} + +/* Generate code to sign extend a 32 bit value in 'src' to 64 bits, in + a new register, and return the new register. */ +static HReg widen_s_32_to_64 ( ISelEnv* env, HReg src ) +{ + HReg dst = newVRegI(env); + ARM64RI6* n32 = ARM64RI6_I6(32); + addInstr(env, ARM64Instr_Shift(dst, src, n32, ARM64sh_SHL)); + addInstr(env, ARM64Instr_Shift(dst, dst, n32, ARM64sh_SAR)); + return dst; +} + +/* Generate code to sign extend a 8 bit value in 'src' to 64 bits, in + a new register, and return the new register. */ +static HReg widen_s_8_to_64 ( ISelEnv* env, HReg src ) +{ + HReg dst = newVRegI(env); + ARM64RI6* n56 = ARM64RI6_I6(56); + addInstr(env, ARM64Instr_Shift(dst, src, n56, ARM64sh_SHL)); + addInstr(env, ARM64Instr_Shift(dst, dst, n56, ARM64sh_SAR)); + return dst; +} + +static HReg widen_z_8_to_64 ( ISelEnv* env, HReg src ) +{ + HReg dst = newVRegI(env); + ARM64RI6* n56 = ARM64RI6_I6(56); + addInstr(env, ARM64Instr_Shift(dst, src, n56, ARM64sh_SHL)); + addInstr(env, ARM64Instr_Shift(dst, dst, n56, ARM64sh_SHR)); + return dst; +} + +/* Is this IRExpr_Const(IRConst_U64(0)) ? */ +static Bool isZeroU64 ( IRExpr* e ) { + if (e->tag != Iex_Const) return False; + IRConst* con = e->Iex.Const.con; + vassert(con->tag == Ico_U64); + return con->Ico.U64 == 0; +} + + +/*---------------------------------------------------------*/ +/*--- ISEL: FP rounding mode helpers ---*/ +/*---------------------------------------------------------*/ + +/* Set the FP rounding mode: 'mode' is an I32-typed expression + denoting a value in the range 0 .. 3, indicating a round mode + encoded as per type IRRoundingMode -- the first four values only + (Irrm_NEAREST, Irrm_NegINF, Irrm_PosINF, Irrm_ZERO). Set the PPC + FSCR to have the same rounding. + + For speed & simplicity, we're setting the *entire* FPCR here. + + Setting the rounding mode is expensive. 
So this function tries to + avoid repeatedly setting the rounding mode to the same thing by + first comparing 'mode' to the 'mode' tree supplied in the previous + call to this function, if any. (The previous value is stored in + env->previous_rm.) If 'mode' is a single IR temporary 't' and + env->previous_rm is also just 't', then the setting is skipped. + + This is safe because of the SSA property of IR: an IR temporary can + only be defined once and so will have the same value regardless of + where it appears in the block. Cool stuff, SSA. + + A safety condition: all attempts to set the RM must be aware of + this mechanism - by being routed through the functions here. + + Of course this only helps if blocks where the RM is set more than + once and it is set to the same value each time, *and* that value is + held in the same IR temporary each time. In order to assure the + latter as much as possible, the IR optimiser takes care to do CSE + on any block with any sign of floating point activity. +*/ +static +void set_FPCR_rounding_mode ( ISelEnv* env, IRExpr* mode ) +{ + vassert(typeOfIRExpr(env->type_env,mode) == Ity_I32); + + /* Do we need to do anything? */ + if (env->previous_rm + && env->previous_rm->tag == Iex_RdTmp + && mode->tag == Iex_RdTmp + && env->previous_rm->Iex.RdTmp.tmp == mode->Iex.RdTmp.tmp) { + /* no - setting it to what it was before. */ + vassert(typeOfIRExpr(env->type_env, env->previous_rm) == Ity_I32); + return; + } + + /* No luck - we better set it, and remember what we set it to. */ + env->previous_rm = mode; + + /* Only supporting the rounding-mode bits - the rest of FPCR is set + to zero - so we can set the whole register at once (faster). */ + + /* This isn't simple, because 'mode' carries an IR rounding + encoding, and we need to translate that to an ARM64 FP one: + The IR encoding: + 00 to nearest (the default) + 10 to +infinity + 01 to -infinity + 11 to zero + The ARM64 FP encoding: + 00 to nearest + 01 to +infinity + 10 to -infinity + 11 to zero + Easy enough to do; just swap the two bits. + */ + HReg irrm = iselIntExpr_R(env, mode); + HReg tL = newVRegI(env); + HReg tR = newVRegI(env); + HReg t3 = newVRegI(env); + /* tL = irrm << 1; + tR = irrm >> 1; if we're lucky, these will issue together + tL &= 2; + tR &= 1; ditto + t3 = tL | tR; + t3 <<= 22; + fmxr fpscr, t3 + */ + ARM64RIL* ril_one = mb_mkARM64RIL_I(1); + ARM64RIL* ril_two = mb_mkARM64RIL_I(2); + vassert(ril_one && ril_two); + addInstr(env, ARM64Instr_Shift(tL, irrm, ARM64RI6_I6(1), ARM64sh_SHL)); + addInstr(env, ARM64Instr_Shift(tR, irrm, ARM64RI6_I6(1), ARM64sh_SHR)); + addInstr(env, ARM64Instr_Logic(tL, tL, ril_two, ARM64lo_AND)); + addInstr(env, ARM64Instr_Logic(tR, tR, ril_one, ARM64lo_AND)); + addInstr(env, ARM64Instr_Logic(t3, tL, ARM64RIL_R(tR), ARM64lo_OR)); + addInstr(env, ARM64Instr_Shift(t3, t3, ARM64RI6_I6(22), ARM64sh_SHL)); + addInstr(env, ARM64Instr_FPCR(True/*toFPCR*/, t3)); +} + + +/*---------------------------------------------------------*/ +/*--- ISEL: Function call helpers ---*/ +/*---------------------------------------------------------*/ + +/* Used only in doHelperCall. See big comment in doHelperCall re + handling of register-parameter args. This function figures out + whether evaluation of an expression might require use of a fixed + register. If in doubt return True (safe but suboptimal). 
+*/ +static +Bool mightRequireFixedRegs ( IRExpr* e ) +{ + if (UNLIKELY(is_IRExpr_VECRET_or_BBPTR(e))) { + // These are always "safe" -- either a copy of SP in some + // arbitrary vreg, or a copy of x21, respectively. + return False; + } + /* Else it's a "normal" expression. */ + switch (e->tag) { + case Iex_RdTmp: case Iex_Const: case Iex_Get: + return False; + default: + return True; + } +} + + +/* Do a complete function call. |guard| is a Ity_Bit expression + indicating whether or not the call happens. If guard==NULL, the + call is unconditional. |retloc| is set to indicate where the + return value is after the call. The caller (of this fn) must + generate code to add |stackAdjustAfterCall| to the stack pointer + after the call is done. Returns True iff it managed to handle this + combination of arg/return types, else returns False. */ + +static +Bool doHelperCall ( /*OUT*/UInt* stackAdjustAfterCall, + /*OUT*/RetLoc* retloc, + ISelEnv* env, + IRExpr* guard, + IRCallee* cee, IRType retTy, IRExpr** args ) +{ + ARM64CondCode cc; + HReg argregs[ARM64_N_ARGREGS]; + HReg tmpregs[ARM64_N_ARGREGS]; + Bool go_fast; + Int n_args, i, nextArgReg; + ULong target; + + vassert(ARM64_N_ARGREGS == 8); + + /* Set default returns. We'll update them later if needed. */ + *stackAdjustAfterCall = 0; + *retloc = mk_RetLoc_INVALID(); + + /* These are used for cross-checking that IR-level constraints on + the use of IRExpr_VECRET() and IRExpr_BBPTR() are observed. */ + UInt nVECRETs = 0; + UInt nBBPTRs = 0; + + /* Marshal args for a call and do the call. + + This function only deals with a tiny set of possibilities, which + cover all helpers in practice. The restrictions are that only + arguments in registers are supported, hence only + ARM64_N_REGPARMS x 64 integer bits in total can be passed. In + fact the only supported arg type is I64. + + The return type can be I{64,32} or V128. In the V128 case, it + is expected that |args| will contain the special node + IRExpr_VECRET(), in which case this routine generates code to + allocate space on the stack for the vector return value. Since + we are not passing any scalars on the stack, it is enough to + preallocate the return space before marshalling any arguments, + in this case. + + |args| may also contain IRExpr_BBPTR(), in which case the + value in x21 is passed as the corresponding argument. + + Generating code which is both efficient and correct when + parameters are to be passed in registers is difficult, for the + reasons elaborated in detail in comments attached to + doHelperCall() in priv/host-x86/isel.c. Here, we use a variant + of the method described in those comments. + + The problem is split into two cases: the fast scheme and the + slow scheme. In the fast scheme, arguments are computed + directly into the target (real) registers. This is only safe + when we can be sure that computation of each argument will not + trash any real registers set by computation of any other + argument. + + In the slow scheme, all args are first computed into vregs, and + once they are all done, they are moved to the relevant real + regs. This always gives correct code, but it also gives a bunch + of vreg-to-rreg moves which are usually redundant but are hard + for the register allocator to get rid of. + + To decide which scheme to use, all argument expressions are + first examined. If they are all so simple that it is clear they + will be evaluated without use of any fixed registers, use the + fast scheme, else use the slow scheme. 
Note also that only + unconditional calls may use the fast scheme, since having to + compute a condition expression could itself trash real + registers. + + Note this requires being able to examine an expression and + determine whether or not evaluation of it might use a fixed + register. That requires knowledge of how the rest of this insn + selector works. Currently just the following 3 are regarded as + safe -- hopefully they cover the majority of arguments in + practice: IRExpr_Tmp IRExpr_Const IRExpr_Get. + */ + + /* Note that the cee->regparms field is meaningless on ARM64 hosts + (since there is only one calling convention) and so we always + ignore it. */ + + n_args = 0; + for (i = 0; args[i]; i++) { + IRExpr* arg = args[i]; + if (UNLIKELY(arg->tag == Iex_VECRET)) { + nVECRETs++; + } else if (UNLIKELY(arg->tag == Iex_BBPTR)) { + nBBPTRs++; + } + n_args++; + } + + /* If this fails, the IR is ill-formed */ + vassert(nBBPTRs == 0 || nBBPTRs == 1); + + /* If we have a VECRET, allocate space on the stack for the return + value, and record the stack pointer after that. */ + HReg r_vecRetAddr = INVALID_HREG; + if (nVECRETs == 1) { + vassert(retTy == Ity_V128 || retTy == Ity_V256); + vassert(retTy != Ity_V256); // we don't handle that yet (if ever) + r_vecRetAddr = newVRegI(env); + addInstr(env, ARM64Instr_AddToSP(-16)); + addInstr(env, ARM64Instr_FromSP(r_vecRetAddr)); + } else { + // If either of these fail, the IR is ill-formed + vassert(retTy != Ity_V128 && retTy != Ity_V256); + vassert(nVECRETs == 0); + } + + argregs[0] = hregARM64_X0(); + argregs[1] = hregARM64_X1(); + argregs[2] = hregARM64_X2(); + argregs[3] = hregARM64_X3(); + argregs[4] = hregARM64_X4(); + argregs[5] = hregARM64_X5(); + argregs[6] = hregARM64_X6(); + argregs[7] = hregARM64_X7(); + + tmpregs[0] = tmpregs[1] = tmpregs[2] = tmpregs[3] = INVALID_HREG; + tmpregs[4] = tmpregs[5] = tmpregs[6] = tmpregs[7] = INVALID_HREG; + + /* First decide which scheme (slow or fast) is to be used. First + assume the fast scheme, and select slow if any contraindications + (wow) appear. */ + + go_fast = True; + + if (guard) { + if (guard->tag == Iex_Const + && guard->Iex.Const.con->tag == Ico_U1 + && guard->Iex.Const.con->Ico.U1 == True) { + /* unconditional */ + } else { + /* Not manifestly unconditional -- be conservative. */ + go_fast = False; + } + } + + if (go_fast) { + for (i = 0; i < n_args; i++) { + if (mightRequireFixedRegs(args[i])) { + go_fast = False; + break; + } + } + } + + if (go_fast) { + if (retTy == Ity_V128 || retTy == Ity_V256) + go_fast = False; + } + + /* At this point the scheme to use has been established. Generate + code to get the arg values into the argument rregs. If we run + out of arg regs, give up. */ + + if (go_fast) { + + /* FAST SCHEME */ + nextArgReg = 0; + + for (i = 0; i < n_args; i++) { + IRExpr* arg = args[i]; + + IRType aTy = Ity_INVALID; + if (LIKELY(!is_IRExpr_VECRET_or_BBPTR(arg))) + aTy = typeOfIRExpr(env->type_env, args[i]); + + if (nextArgReg >= ARM64_N_ARGREGS) + return False; /* out of argregs */ + + if (aTy == Ity_I64) { + addInstr(env, ARM64Instr_MovI( argregs[nextArgReg], + iselIntExpr_R(env, args[i]) )); + nextArgReg++; + } + else if (arg->tag == Iex_BBPTR) { + vassert(0); //ATC + addInstr(env, ARM64Instr_MovI( argregs[nextArgReg], + hregARM64_X21() )); + nextArgReg++; + } + else if (arg->tag == Iex_VECRET) { + // because of the go_fast logic above, we can't get here, + // since vector return values makes us use the slow path + // instead. 
+ vassert(0); + } + else + return False; /* unhandled arg type */ + } + + /* Fast scheme only applies for unconditional calls. Hence: */ + cc = ARM64cc_AL; + + } else { + + /* SLOW SCHEME; move via temporaries */ + nextArgReg = 0; + + for (i = 0; i < n_args; i++) { + IRExpr* arg = args[i]; + + IRType aTy = Ity_INVALID; + if (LIKELY(!is_IRExpr_VECRET_or_BBPTR(arg))) + aTy = typeOfIRExpr(env->type_env, args[i]); + + if (nextArgReg >= ARM64_N_ARGREGS) + return False; /* out of argregs */ + + if (aTy == Ity_I64) { + tmpregs[nextArgReg] = iselIntExpr_R(env, args[i]); + nextArgReg++; + } + else if (arg->tag == Iex_BBPTR) { + vassert(0); //ATC + tmpregs[nextArgReg] = hregARM64_X21(); + nextArgReg++; + } + else if (arg->tag == Iex_VECRET) { + vassert(!hregIsInvalid(r_vecRetAddr)); + tmpregs[nextArgReg] = r_vecRetAddr; + nextArgReg++; + } + else + return False; /* unhandled arg type */ + } + + /* Now we can compute the condition. We can't do it earlier + because the argument computations could trash the condition + codes. Be a bit clever to handle the common case where the + guard is 1:Bit. */ + cc = ARM64cc_AL; + if (guard) { + if (guard->tag == Iex_Const + && guard->Iex.Const.con->tag == Ico_U1 + && guard->Iex.Const.con->Ico.U1 == True) { + /* unconditional -- do nothing */ + } else { + cc = iselCondCode( env, guard ); + } + } + + /* Move the args to their final destinations. */ + for (i = 0; i < nextArgReg; i++) { + vassert(!(hregIsInvalid(tmpregs[i]))); + /* None of these insns, including any spill code that might + be generated, may alter the condition codes. */ + addInstr( env, ARM64Instr_MovI( argregs[i], tmpregs[i] ) ); + } + + } + + /* Should be assured by checks above */ + vassert(nextArgReg <= ARM64_N_ARGREGS); + + /* Do final checks, set the return values, and generate the call + instruction proper. */ + vassert(nBBPTRs == 0 || nBBPTRs == 1); + vassert(nVECRETs == (retTy == Ity_V128 || retTy == Ity_V256) ? 1 : 0); + vassert(*stackAdjustAfterCall == 0); + vassert(is_RetLoc_INVALID(*retloc)); + switch (retTy) { + case Ity_INVALID: + /* Function doesn't return a value. */ + *retloc = mk_RetLoc_simple(RLPri_None); + break; + case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: + *retloc = mk_RetLoc_simple(RLPri_Int); + break; + case Ity_V128: + *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0); + *stackAdjustAfterCall = 16; + break; + case Ity_V256: + vassert(0); // ATC + *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0); + *stackAdjustAfterCall = 32; + break; + default: + /* IR can denote other possible return types, but we don't + handle those here. */ + vassert(0); + } + + /* Finally, generate the call itself. This needs the *retloc value + set in the switch above, which is why it's at the end. */ + + /* nextArgReg doles out argument registers. Since these are + assigned in the order x0 .. x7, its numeric value at this point, + which must be between 0 and 8 inclusive, is going to be equal to + the number of arg regs in use for the call. Hence bake that + number into the call (we'll need to know it when doing register + allocation, to know what regs the call reads.) */ + + target = (HWord)Ptr_to_ULong(cee->addr); + addInstr(env, ARM64Instr_Call( cc, target, nextArgReg, *retloc )); + + return True; /* success */ +} + + +/*---------------------------------------------------------*/ +/*--- ISEL: Integer expressions (64/32 bit) ---*/ +/*---------------------------------------------------------*/ + +/* Select insns for an integer-typed expression, and add them to the + code list. 
Return a reg holding the result. This reg will be a + virtual register. THE RETURNED REG MUST NOT BE MODIFIED. If you + want to modify it, ask for a new vreg, copy it in there, and modify + the copy. The register allocator will do its best to map both + vregs to the same real register, so the copies will often disappear + later in the game. + + This should handle expressions of 64- and 32-bit type. All results + are returned in a 64-bit register. For 32-bit expressions, the + upper 32 bits are arbitrary, so you should mask or sign extend + partial values if necessary. +*/ + +/* --------------------- AMode --------------------- */ + +/* Return an AMode which computes the value of the specified + expression, possibly also adding insns to the code list as a + result. The expression may only be a 64-bit one. +*/ + +static Bool isValidScale ( UChar scale ) +{ + switch (scale) { + case 1: case 2: case 4: case 8: /* case 16: ??*/ return True; + default: return False; + } +} + +static Bool sane_AMode ( ARM64AMode* am ) +{ + switch (am->tag) { + case ARM64am_RI9: + return + toBool( hregClass(am->ARM64am.RI9.reg) == HRcInt64 + && (hregIsVirtual(am->ARM64am.RI9.reg) + /* || sameHReg(am->ARM64am.RI9.reg, + hregARM64_X21()) */ ) + && am->ARM64am.RI9.simm9 >= -256 + && am->ARM64am.RI9.simm9 <= 255 ); + case ARM64am_RI12: + return + toBool( hregClass(am->ARM64am.RI12.reg) == HRcInt64 + && (hregIsVirtual(am->ARM64am.RI12.reg) + /* || sameHReg(am->ARM64am.RI12.reg, + hregARM64_X21()) */ ) + && am->ARM64am.RI12.uimm12 < 4096 + && isValidScale(am->ARM64am.RI12.szB) ); + case ARM64am_RR: + return + toBool( hregClass(am->ARM64am.RR.base) == HRcInt64 + && hregIsVirtual(am->ARM64am.RR.base) + && hregClass(am->ARM64am.RR.index) == HRcInt64 + && hregIsVirtual(am->ARM64am.RR.index) ); + default: + vpanic("sane_AMode: unknown ARM64 AMode1 tag"); + } +} + +static +ARM64AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e, IRType dty ) +{ + ARM64AMode* am = iselIntExpr_AMode_wrk(env, e, dty); + vassert(sane_AMode(am)); + return am; +} + +static +ARM64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e, IRType dty ) +{ + IRType ty = typeOfIRExpr(env->type_env,e); + vassert(ty == Ity_I64); + + ULong szBbits = 0; + switch (dty) { + case Ity_I64: szBbits = 3; break; + case Ity_I32: szBbits = 2; break; + case Ity_I16: szBbits = 1; break; + case Ity_I8: szBbits = 0; break; + default: vassert(0); + } + + /* {Add64,Sub64}(expr,simm9). We don't care about |dty| here since + we're going to create an amode suitable for LDU* or STU* + instructions, which use unscaled immediate offsets. */ + if (e->tag == Iex_Binop + && (e->Iex.Binop.op == Iop_Add64 || e->Iex.Binop.op == Iop_Sub64) + && e->Iex.Binop.arg2->tag == Iex_Const + && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64) { + Long simm = (Long)e->Iex.Binop.arg2->Iex.Const.con->Ico.U64; + if (simm >= -255 && simm <= 255) { + /* Although the gating condition might seem to be + simm >= -256 && simm <= 255 + we will need to negate simm in the case where the op is Sub64. + Hence limit the lower value to -255 in order that its negation + is representable. 
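+            For example, Sub64(e, 255) is accepted here and becomes an
+            RI9 offset of -255, whereas Sub64(e, 256) fails the test and
+            falls through to the later cases, ending up computed into a
+            register.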
*/ + HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg1); + if (e->Iex.Binop.op == Iop_Sub64) simm = -simm; + return ARM64AMode_RI9(reg, (Int)simm); + } + } + + /* Add64(expr, uimm12 * transfer-size) */ + if (e->tag == Iex_Binop + && e->Iex.Binop.op == Iop_Add64 + && e->Iex.Binop.arg2->tag == Iex_Const + && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64) { + ULong uimm = e->Iex.Binop.arg2->Iex.Const.con->Ico.U64; + ULong szB = 1 << szBbits; + if (0 == (uimm & (szB-1)) /* "uimm is szB-aligned" */ + && (uimm >> szBbits) < 4096) { + HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg1); + return ARM64AMode_RI12(reg, (UInt)(uimm >> szBbits), (UChar)szB); + } + } + + /* Add64(expr1, expr2) */ + if (e->tag == Iex_Binop + && e->Iex.Binop.op == Iop_Add64) { + HReg reg1 = iselIntExpr_R(env, e->Iex.Binop.arg1); + HReg reg2 = iselIntExpr_R(env, e->Iex.Binop.arg2); + return ARM64AMode_RR(reg1, reg2); + } + + /* Doesn't match anything in particular. Generate it into + a register and use that. */ + HReg reg = iselIntExpr_R(env, e); + return ARM64AMode_RI9(reg, 0); +} + +//ZZ /* --------------------- AModeV --------------------- */ +//ZZ +//ZZ /* Return an AModeV which computes the value of the specified +//ZZ expression, possibly also adding insns to the code list as a +//ZZ result. The expression may only be a 32-bit one. +//ZZ */ +//ZZ +//ZZ static Bool sane_AModeV ( ARMAModeV* am ) +//ZZ { +//ZZ return toBool( hregClass(am->reg) == HRcInt32 +//ZZ && hregIsVirtual(am->reg) +//ZZ && am->simm11 >= -1020 && am->simm11 <= 1020 +//ZZ && 0 == (am->simm11 & 3) ); +//ZZ } +//ZZ +//ZZ static ARMAModeV* iselIntExpr_AModeV ( ISelEnv* env, IRExpr* e ) +//ZZ { +//ZZ ARMAModeV* am = iselIntExpr_AModeV_wrk(env, e); +//ZZ vassert(sane_AModeV(am)); +//ZZ return am; +//ZZ } +//ZZ +//ZZ static ARMAModeV* iselIntExpr_AModeV_wrk ( ISelEnv* env, IRExpr* e ) +//ZZ { +//ZZ IRType ty = typeOfIRExpr(env->type_env,e); +//ZZ vassert(ty == Ity_I32); +//ZZ +//ZZ /* {Add32,Sub32}(expr, simm8 << 2) */ +//ZZ if (e->tag == Iex_Binop +//ZZ && (e->Iex.Binop.op == Iop_Add32 || e->Iex.Binop.op == Iop_Sub32) +//ZZ && e->Iex.Binop.arg2->tag == Iex_Const +//ZZ && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32) { +//ZZ Int simm = (Int)e->Iex.Binop.arg2->Iex.Const.con->Ico.U32; +//ZZ if (simm >= -1020 && simm <= 1020 && 0 == (simm & 3)) { +//ZZ HReg reg; +//ZZ if (e->Iex.Binop.op == Iop_Sub32) +//ZZ simm = -simm; +//ZZ reg = iselIntExpr_R(env, e->Iex.Binop.arg1); +//ZZ return mkARMAModeV(reg, simm); +//ZZ } +//ZZ } +//ZZ +//ZZ /* Doesn't match anything in particular. Generate it into +//ZZ a register and use that. */ +//ZZ { +//ZZ HReg reg = iselIntExpr_R(env, e); +//ZZ return mkARMAModeV(reg, 0); +//ZZ } +//ZZ +//ZZ } +//ZZ +//ZZ /* -------------------- AModeN -------------------- */ +//ZZ +//ZZ static ARMAModeN* iselIntExpr_AModeN ( ISelEnv* env, IRExpr* e ) +//ZZ { +//ZZ return iselIntExpr_AModeN_wrk(env, e); +//ZZ } +//ZZ +//ZZ static ARMAModeN* iselIntExpr_AModeN_wrk ( ISelEnv* env, IRExpr* e ) +//ZZ { +//ZZ HReg reg = iselIntExpr_R(env, e); +//ZZ return mkARMAModeN_R(reg); +//ZZ } +//ZZ +//ZZ +//ZZ /* --------------------- RI84 --------------------- */ +//ZZ +//ZZ /* Select instructions to generate 'e' into a RI84. If mayInv is +//ZZ true, then the caller will also accept an I84 form that denotes +//ZZ 'not e'. In this case didInv may not be NULL, and *didInv is set +//ZZ to True. 
This complication is so as to allow generation of an RI84 +//ZZ which is suitable for use in either an AND or BIC instruction, +//ZZ without knowing (before this call) which one. +//ZZ */ +//ZZ static ARMRI84* iselIntExpr_RI84 ( /*OUT*/Bool* didInv, Bool mayInv, +//ZZ ISelEnv* env, IRExpr* e ) +//ZZ { +//ZZ ARMRI84* ri; +//ZZ if (mayInv) +//ZZ vassert(didInv != NULL); +//ZZ ri = iselIntExpr_RI84_wrk(didInv, mayInv, env, e); +//ZZ /* sanity checks ... */ +//ZZ switch (ri->tag) { +//ZZ case ARMri84_I84: +//ZZ return ri; +//ZZ case ARMri84_R: +//ZZ vassert(hregClass(ri->ARMri84.R.reg) == HRcInt32); +//ZZ vassert(hregIsVirtual(ri->ARMri84.R.reg)); +//ZZ return ri; +//ZZ default: +//ZZ vpanic("iselIntExpr_RI84: unknown arm RI84 tag"); +//ZZ } +//ZZ } +//ZZ +//ZZ /* DO NOT CALL THIS DIRECTLY ! */ +//ZZ static ARMRI84* iselIntExpr_RI84_wrk ( /*OUT*/Bool* didInv, Bool mayInv, +//ZZ ISelEnv* env, IRExpr* e ) +//ZZ { +//ZZ IRType ty = typeOfIRExpr(env->type_env,e); +//ZZ vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8); +//ZZ +//ZZ if (didInv) *didInv = False; +//ZZ +//ZZ /* special case: immediate */ +//ZZ if (e->tag == Iex_Const) { +//ZZ UInt u, u8 = 0x100, u4 = 0x10; /* both invalid */ +//ZZ switch (e->Iex.Const.con->tag) { +//ZZ case Ico_U32: u = e->Iex.Const.con->Ico.U32; break; +//ZZ case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break; +//ZZ case Ico_U8: u = 0xFF & (e->Iex.Const.con->Ico.U8); break; +//ZZ default: vpanic("iselIntExpr_RI84.Iex_Const(armh)"); +//ZZ } +//ZZ if (fitsIn8x4(&u8, &u4, u)) { +//ZZ return ARMRI84_I84( (UShort)u8, (UShort)u4 ); +//ZZ } +//ZZ if (mayInv && fitsIn8x4(&u8, &u4, ~u)) { +//ZZ vassert(didInv); +//ZZ *didInv = True; +//ZZ return ARMRI84_I84( (UShort)u8, (UShort)u4 ); +//ZZ } +//ZZ /* else fail, fall through to default case */ +//ZZ } +//ZZ +//ZZ /* default case: calculate into a register and return that */ +//ZZ { +//ZZ HReg r = iselIntExpr_R ( env, e ); +//ZZ return ARMRI84_R(r); +//ZZ } +//ZZ } + + +/* --------------------- RIA --------------------- */ + +/* Select instructions to generate 'e' into a RIA. */ + +static ARM64RIA* iselIntExpr_RIA ( ISelEnv* env, IRExpr* e ) +{ + ARM64RIA* ri = iselIntExpr_RIA_wrk(env, e); + /* sanity checks ... */ + switch (ri->tag) { + case ARM64riA_I12: + vassert(ri->ARM64riA.I12.imm12 < 4096); + vassert(ri->ARM64riA.I12.shift == 0 || ri->ARM64riA.I12.shift == 12); + return ri; + case ARM64riA_R: + vassert(hregClass(ri->ARM64riA.R.reg) == HRcInt64); + vassert(hregIsVirtual(ri->ARM64riA.R.reg)); + return ri; + default: + vpanic("iselIntExpr_RIA: unknown arm RIA tag"); + } +} + +/* DO NOT CALL THIS DIRECTLY ! 
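+   (An arith immediate is a 12-bit value optionally shifted left by 12,
+   which is what the checks below test for: 0x3 encodes as imm12=3/shift=0
+   and 0x5000 as imm12=5/shift=12, whereas 0x1001 is not representable and
+   is computed into a register instead.)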
*/ +static ARM64RIA* iselIntExpr_RIA_wrk ( ISelEnv* env, IRExpr* e ) +{ + IRType ty = typeOfIRExpr(env->type_env,e); + vassert(ty == Ity_I64 || ty == Ity_I32); + + /* special case: immediate */ + if (e->tag == Iex_Const) { + ULong u = 0xF000000ULL; /* invalid */ + switch (e->Iex.Const.con->tag) { + case Ico_U64: u = e->Iex.Const.con->Ico.U64; break; + case Ico_U32: u = e->Iex.Const.con->Ico.U32; break; + default: vpanic("iselIntExpr_RIA.Iex_Const(arm64)"); + } + if (0 == (u & ~(0xFFFULL << 0))) + return ARM64RIA_I12((UShort)((u >> 0) & 0xFFFULL), 0); + if (0 == (u & ~(0xFFFULL << 12))) + return ARM64RIA_I12((UShort)((u >> 12) & 0xFFFULL), 12); + /* else fail, fall through to default case */ + } + + /* default case: calculate into a register and return that */ + { + HReg r = iselIntExpr_R ( env, e ); + return ARM64RIA_R(r); + } +} + + +/* --------------------- RIL --------------------- */ + +/* Select instructions to generate 'e' into a RIL. At this point we + have to deal with the strange bitfield-immediate encoding for logic + instructions. */ + + +// The following four functions +// CountLeadingZeros CountTrailingZeros CountSetBits isImmLogical +// are copied, with modifications, from +// https://github.com/armvixl/vixl/blob/master/src/a64/assembler-a64.cc +// which has the following copyright notice: +/* + Copyright 2013, ARM Limited + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of ARM Limited nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +static Int CountLeadingZeros(ULong value, Int width) +{ + vassert(width == 32 || width == 64); + Int count = 0; + ULong bit_test = 1ULL << (width - 1); + while ((count < width) && ((bit_test & value) == 0)) { + count++; + bit_test >>= 1; + } + return count; +} + +static Int CountTrailingZeros(ULong value, Int width) +{ + vassert(width == 32 || width == 64); + Int count = 0; + while ((count < width) && (((value >> count) & 1) == 0)) { + count++; + } + return count; +} + +static Int CountSetBits(ULong value, Int width) +{ + // TODO: Other widths could be added here, as the implementation already + // supports them. 
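+   // Note: the count is at most 'width' (<= 64), so accumulating it in
+   // 'value' and returning it as an Int is fine.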
+ vassert(width == 32 || width == 64); + + // Mask out unused bits to ensure that they are not counted. + value &= (0xffffffffffffffffULL >> (64-width)); + + // Add up the set bits. + // The algorithm works by adding pairs of bit fields together iteratively, + // where the size of each bit field doubles each time. + // An example for an 8-bit value: + // Bits: h g f e d c b a + // \ | \ | \ | \ | + // value = h+g f+e d+c b+a + // \ | \ | + // value = h+g+f+e d+c+b+a + // \ | + // value = h+g+f+e+d+c+b+a + value = ((value >> 1) & 0x5555555555555555ULL) + + (value & 0x5555555555555555ULL); + value = ((value >> 2) & 0x3333333333333333ULL) + + (value & 0x3333333333333333ULL); + value = ((value >> 4) & 0x0f0f0f0f0f0f0f0fULL) + + (value & 0x0f0f0f0f0f0f0f0fULL); + value = ((value >> 8) & 0x00ff00ff00ff00ffULL) + + (value & 0x00ff00ff00ff00ffULL); + value = ((value >> 16) & 0x0000ffff0000ffffULL) + + (value & 0x0000ffff0000ffffULL); + value = ((value >> 32) & 0x00000000ffffffffULL) + + (value & 0x00000000ffffffffULL); + + return value; +} + +static Bool isImmLogical ( /*OUT*/UInt* n, + /*OUT*/UInt* imm_s, /*OUT*/UInt* imm_r, + ULong value, UInt width ) +{ + // Test if a given value can be encoded in the immediate field of a + // logical instruction. + + // If it can be encoded, the function returns true, and values + // pointed to by n, imm_s and imm_r are updated with immediates + // encoded in the format required by the corresponding fields in the + // logical instruction. If it can not be encoded, the function + // returns false, and the values pointed to by n, imm_s and imm_r + // are undefined. + vassert(n != NULL && imm_s != NULL && imm_r != NULL); + vassert(width == 32 || width == 64); + + // Logical immediates are encoded using parameters n, imm_s and imm_r using + // the following table: + // + // N imms immr size S R + // 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr) + // 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr) + // 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr) + // 0 110sss xxxrrr 8 UInt(sss) UInt(rrr) + // 0 1110ss xxxxrr 4 UInt(ss) UInt(rr) + // 0 11110s xxxxxr 2 UInt(s) UInt(r) + // (s bits must not be all set) + // + // A pattern is constructed of size bits, where the least significant S+1 + // bits are set. The pattern is rotated right by R, and repeated across a + // 32 or 64-bit value, depending on destination register width. + // + // To test if an arbitrary immediate can be encoded using this scheme, an + // iterative algorithm is used. + // + // TODO: This code does not consider using X/W register overlap to support + // 64-bit immediates where the top 32-bits are zero, and the bottom 32-bits + // are an encodable logical immediate. + + // 1. If the value has all set or all clear bits, it can't be encoded. + if ((value == 0) || (value == 0xffffffffffffffffULL) || + ((width == 32) && (value == 0xffffffff))) { + return False; + } + + UInt lead_zero = CountLeadingZeros(value, width); + UInt lead_one = CountLeadingZeros(~value, width); + UInt trail_zero = CountTrailingZeros(value, width); + UInt trail_one = CountTrailingZeros(~value, width); + UInt set_bits = CountSetBits(value, width); + + // The fixed bits in the immediate s field. + // If width == 64 (X reg), start at 0xFFFFFF80. + // If width == 32 (W reg), start at 0xFFFFFFC0, as the iteration for 64-bit + // widths won't be executed. + Int imm_s_fixed = (width == 64) ? -128 : -64; + Int imm_s_mask = 0x3F; + + for (;;) { + // 2. If the value is two bits wide, it can be encoded. 
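+      // (For instance, a residual 2-bit value of 0b01 yields n=0,
+      // imm_s=0x3C, imm_r=0, and 0b10 yields the same but with imm_r=1,
+      // i.e. the "0 11110s" row of the table above.)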
+ if (width == 2) { + *n = 0; + *imm_s = 0x3C; + *imm_r = (value & 3) - 1; + return True; + } + + *n = (width == 64) ? 1 : 0; + *imm_s = ((imm_s_fixed | (set_bits - 1)) & imm_s_mask); + if ((lead_zero + set_bits) == width) { + *imm_r = 0; + } else { + *imm_r = (lead_zero > 0) ? (width - trail_zero) : lead_one; + } + + // 3. If the sum of leading zeros, trailing zeros and set bits is equal to + // the bit width of the value, it can be encoded. + if (lead_zero + trail_zero + set_bits == width) { + return True; + } + + // 4. If the sum of leading ones, trailing ones and unset bits in the + // value is equal to the bit width of the value, it can be encoded. + if (lead_one + trail_one + (width - set_bits) == width) { + return True; + } + + // 5. If the most-significant half of the bitwise value is equal to the + // least-significant half, return to step 2 using the least-significant + // half of the value. + ULong mask = (1ULL << (width >> 1)) - 1; + if ((value & mask) == ((value >> (width >> 1)) & mask)) { + width >>= 1; + set_bits >>= 1; + imm_s_fixed >>= 1; + continue; + } + + // 6. Otherwise, the value can't be encoded. + return False; + } +} + + +/* Create a RIL for the given immediate, if it is representable, or + return NULL if not. */ + +static ARM64RIL* mb_mkARM64RIL_I ( ULong imm64 ) +{ + UInt n = 0, imm_s = 0, imm_r = 0; + Bool ok = isImmLogical(&n, &imm_s, &imm_r, imm64, 64); + if (!ok) return NULL; + vassert(n < 2 && imm_s < 64 && imm_r < 64); + return ARM64RIL_I13(n, imm_r, imm_s); +} + +/* So, finally .. */ + +static ARM64RIL* iselIntExpr_RIL ( ISelEnv* env, IRExpr* e ) +{ + ARM64RIL* ri = iselIntExpr_RIL_wrk(env, e); + /* sanity checks ... */ + switch (ri->tag) { + case ARM64riL_I13: + vassert(ri->ARM64riL.I13.bitN < 2); + vassert(ri->ARM64riL.I13.immR < 64); + vassert(ri->ARM64riL.I13.immS < 64); + return ri; + case ARM64riL_R: + vassert(hregClass(ri->ARM64riL.R.reg) == HRcInt64); + vassert(hregIsVirtual(ri->ARM64riL.R.reg)); + return ri; + default: + vpanic("iselIntExpr_RIL: unknown arm RIL tag"); + } +} + +/* DO NOT CALL THIS DIRECTLY ! */ +static ARM64RIL* iselIntExpr_RIL_wrk ( ISelEnv* env, IRExpr* e ) +{ + IRType ty = typeOfIRExpr(env->type_env,e); + vassert(ty == Ity_I64 || ty == Ity_I32); + + /* special case: immediate */ + if (e->tag == Iex_Const) { + ARM64RIL* maybe = NULL; + if (ty == Ity_I64) { + vassert(e->Iex.Const.con->tag == Ico_U64); + maybe = mb_mkARM64RIL_I(e->Iex.Const.con->Ico.U64); + } else { + vassert(ty == Ity_I32); + vassert(e->Iex.Const.con->tag == Ico_U32); + UInt u32 = e->Iex.Const.con->Ico.U32; + ULong u64 = (ULong)u32; + /* First try with 32 leading zeroes. */ + maybe = mb_mkARM64RIL_I(u64); + /* If that doesn't work, try with 2 copies, since it doesn't + matter what winds up in the upper 32 bits. */ + if (!maybe) { + maybe = mb_mkARM64RIL_I((u64 << 32) | u64); + } + } + if (maybe) return maybe; + /* else fail, fall through to default case */ + } + + /* default case: calculate into a register and return that */ + { + HReg r = iselIntExpr_R ( env, e ); + return ARM64RIL_R(r); + } +} + + +/* --------------------- RI6 --------------------- */ + +/* Select instructions to generate 'e' into a RI6. */ + +static ARM64RI6* iselIntExpr_RI6 ( ISelEnv* env, IRExpr* e ) +{ + ARM64RI6* ri = iselIntExpr_RI6_wrk(env, e); + /* sanity checks ... 
*/ + switch (ri->tag) { + case ARM64ri6_I6: + vassert(ri->ARM64ri6.I6.imm6 < 64); + vassert(ri->ARM64ri6.I6.imm6 > 0); + return ri; + case ARM64ri6_R: + vassert(hregClass(ri->ARM64ri6.R.reg) == HRcInt64); + vassert(hregIsVirtual(ri->ARM64ri6.R.reg)); + return ri; + default: + vpanic("iselIntExpr_RI6: unknown arm RI6 tag"); + } +} + +/* DO NOT CALL THIS DIRECTLY ! */ +static ARM64RI6* iselIntExpr_RI6_wrk ( ISelEnv* env, IRExpr* e ) +{ + IRType ty = typeOfIRExpr(env->type_env,e); + vassert(ty == Ity_I64 || ty == Ity_I8); + + /* special case: immediate */ + if (e->tag == Iex_Const) { + switch (e->Iex.Const.con->tag) { + case Ico_U8: { + UInt u = e->Iex.Const.con->Ico.U8; + if (u > 0 && u < 64) + return ARM64RI6_I6(u); + break; + default: + break; + } + } + /* else fail, fall through to default case */ + } + + /* default case: calculate into a register and return that */ + { + HReg r = iselIntExpr_R ( env, e ); + return ARM64RI6_R(r); + } +} + + +/* ------------------- CondCode ------------------- */ + +/* Generate code to evaluated a bit-typed expression, returning the + condition code which would correspond when the expression would + notionally have returned 1. */ + +static ARM64CondCode iselCondCode ( ISelEnv* env, IRExpr* e ) +{ + ARM64CondCode cc = iselCondCode_wrk(env,e); + vassert(cc != ARM64cc_NV); + return cc; +} + +static ARM64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e ) +{ + vassert(e); + vassert(typeOfIRExpr(env->type_env,e) == Ity_I1); + + /* var */ + if (e->tag == Iex_RdTmp) { + HReg rTmp = lookupIRTemp(env, e->Iex.RdTmp.tmp); + /* Cmp doesn't modify rTmp; so this is OK. */ + ARM64RIL* one = mb_mkARM64RIL_I(1); + vassert(one); + addInstr(env, ARM64Instr_Test(rTmp, one)); + return ARM64cc_NE; + } + + /* Not1(e) */ + if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) { + /* Generate code for the arg, and negate the test condition */ + ARM64CondCode cc = iselCondCode(env, e->Iex.Unop.arg); + if (cc == ARM64cc_AL || cc == ARM64cc_NV) { + return ARM64cc_AL; + } else { + return 1 ^ cc; + } + } + + /* --- patterns rooted at: 64to1 --- */ + + if (e->tag == Iex_Unop + && e->Iex.Unop.op == Iop_64to1) { + HReg rTmp = iselIntExpr_R(env, e->Iex.Unop.arg); + ARM64RIL* one = mb_mkARM64RIL_I(1); + vassert(one); /* '1' must be representable */ + addInstr(env, ARM64Instr_Test(rTmp, one)); + return ARM64cc_NE; + } + + /* --- patterns rooted at: CmpNEZ8 --- */ + + if (e->tag == Iex_Unop + && e->Iex.Unop.op == Iop_CmpNEZ8) { + HReg r1 = iselIntExpr_R(env, e->Iex.Unop.arg); + ARM64RIL* xFF = mb_mkARM64RIL_I(0xFF); + addInstr(env, ARM64Instr_Test(r1, xFF)); + return ARM64cc_NE; + } + + /* --- patterns rooted at: CmpNEZ64 --- */ + + if (e->tag == Iex_Unop + && e->Iex.Unop.op == Iop_CmpNEZ64) { + HReg r1 = iselIntExpr_R(env, e->Iex.Unop.arg); + ARM64RIA* zero = ARM64RIA_I12(0,0); + addInstr(env, ARM64Instr_Cmp(r1, zero, True/*is64*/)); + return ARM64cc_NE; + } + + /* --- patterns rooted at: CmpNEZ32 --- */ + + if (e->tag == Iex_Unop + && e->Iex.Unop.op == Iop_CmpNEZ32) { + HReg r1 = iselIntExpr_R(env, e->Iex.Unop.arg); + ARM64RIA* zero = ARM64RIA_I12(0,0); + addInstr(env, ARM64Instr_Cmp(r1, zero, False/*!is64*/)); + return ARM64cc_NE; + } + + /* --- Cmp*64*(x,y) --- */ + if (e->tag == Iex_Binop + && (e->Iex.Binop.op == Iop_CmpEQ64 + || e->Iex.Binop.op == Iop_CmpNE64 + || e->Iex.Binop.op == Iop_CmpLT64S + || e->Iex.Binop.op == Iop_CmpLT64U + || e->Iex.Binop.op == Iop_CmpLE64S + || e->Iex.Binop.op == Iop_CmpLE64U)) { + HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1); + ARM64RIA* argR = 
iselIntExpr_RIA(env, e->Iex.Binop.arg2); + addInstr(env, ARM64Instr_Cmp(argL, argR, True/*is64*/)); + switch (e->Iex.Binop.op) { + case Iop_CmpEQ64: return ARM64cc_EQ; + case Iop_CmpNE64: return ARM64cc_NE; + case Iop_CmpLT64S: return ARM64cc_LT; + case Iop_CmpLT64U: return ARM64cc_CC; + case Iop_CmpLE64S: return ARM64cc_LE; + case Iop_CmpLE64U: return ARM64cc_LS; + default: vpanic("iselCondCode(arm64): CmpXX64"); + } + } + + /* --- Cmp*32*(x,y) --- */ + if (e->tag == Iex_Binop + && (e->Iex.Binop.op == Iop_CmpEQ32 + || e->Iex.Binop.op == Iop_CmpNE32 + || e->Iex.Binop.op == Iop_CmpLT32S + || e->Iex.Binop.op == Iop_CmpLT32U + || e->Iex.Binop.op == Iop_CmpLE32S + || e->Iex.Binop.op == Iop_CmpLE32U)) { + HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1); + ARM64RIA* argR = iselIntExpr_RIA(env, e->Iex.Binop.arg2); + addInstr(env, ARM64Instr_Cmp(argL, argR, False/*!is64*/)); + switch (e->Iex.Binop.op) { + case Iop_CmpEQ32: return ARM64cc_EQ; + case Iop_CmpNE32: return ARM64cc_NE; + case Iop_CmpLT32S: return ARM64cc_LT; + case Iop_CmpLT32U: return ARM64cc_CC; + case Iop_CmpLE32S: return ARM64cc_LE; + case Iop_CmpLE32U: return ARM64cc_LS; + default: vpanic("iselCondCode(arm64): CmpXX32"); + } + } + +//ZZ /* const */ +//ZZ /* Constant 1:Bit */ +//ZZ if (e->tag == Iex_Const) { +//ZZ HReg r; +//ZZ vassert(e->Iex.Const.con->tag == Ico_U1); +//ZZ vassert(e->Iex.Const.con->Ico.U1 == True +//ZZ || e->Iex.Const.con->Ico.U1 == False); +//ZZ r = newVRegI(env); +//ZZ addInstr(env, ARMInstr_Imm32(r, 0)); +//ZZ addInstr(env, ARMInstr_CmpOrTst(True/*isCmp*/, r, ARMRI84_R(r))); +//ZZ return e->Iex.Const.con->Ico.U1 ? ARMcc_EQ : ARMcc_NE; +//ZZ } +//ZZ +//ZZ // JRS 2013-Jan-03: this seems completely nonsensical +//ZZ /* --- CasCmpEQ* --- */ +//ZZ /* Ist_Cas has a dummy argument to compare with, so comparison is +//ZZ always true. */ +//ZZ //if (e->tag == Iex_Binop +//ZZ // && (e->Iex.Binop.op == Iop_CasCmpEQ32 +//ZZ // || e->Iex.Binop.op == Iop_CasCmpEQ16 +//ZZ // || e->Iex.Binop.op == Iop_CasCmpEQ8)) { +//ZZ // return ARMcc_AL; +//ZZ //} + + ppIRExpr(e); + vpanic("iselCondCode"); +} + + +/* --------------------- Reg --------------------- */ + +static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e ) +{ + HReg r = iselIntExpr_R_wrk(env, e); + /* sanity checks ... */ +# if 0 + vex_printf("\n"); ppIRExpr(e); vex_printf("\n"); +# endif + vassert(hregClass(r) == HRcInt64); + vassert(hregIsVirtual(r)); + return r; +} + +/* DO NOT CALL THIS DIRECTLY ! 
*/ +static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) +{ + IRType ty = typeOfIRExpr(env->type_env,e); + vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8); + + switch (e->tag) { + + /* --------- TEMP --------- */ + case Iex_RdTmp: { + return lookupIRTemp(env, e->Iex.RdTmp.tmp); + } + + /* --------- LOAD --------- */ + case Iex_Load: { + HReg dst = newVRegI(env); + + if (e->Iex.Load.end != Iend_LE) + goto irreducible; + + if (ty == Ity_I64) { + ARM64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr, ty ); + addInstr(env, ARM64Instr_LdSt64(True/*isLoad*/, dst, amode)); + return dst; + } + if (ty == Ity_I32) { + ARM64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr, ty ); + addInstr(env, ARM64Instr_LdSt32(True/*isLoad*/, dst, amode)); + return dst; + } + if (ty == Ity_I16) { + ARM64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr, ty ); + addInstr(env, ARM64Instr_LdSt16(True/*isLoad*/, dst, amode)); + return dst; + } + if (ty == Ity_I8) { + ARM64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr, ty ); + addInstr(env, ARM64Instr_LdSt8(True/*isLoad*/, dst, amode)); + return dst; + } + break; + } + + /* --------- BINARY OP --------- */ + case Iex_Binop: { + + ARM64LogicOp lop = 0; /* invalid */ + ARM64ShiftOp sop = 0; /* invalid */ + + /* Special-case 0-x into a Neg instruction. Not because it's + particularly useful but more so as to give value flow using + this instruction, so as to check its assembly correctness for + implementation of Left32/Left64. */ + switch (e->Iex.Binop.op) { + case Iop_Sub64: + if (isZeroU64(e->Iex.Binop.arg1)) { + HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2); + HReg dst = newVRegI(env); + addInstr(env, ARM64Instr_Unary(dst, argR, ARM64un_NEG)); + return dst; + } + break; + default: + break; + } + + /* ADD/SUB */ + switch (e->Iex.Binop.op) { + case Iop_Add64: case Iop_Add32: + case Iop_Sub64: case Iop_Sub32: { + Bool isAdd = e->Iex.Binop.op == Iop_Add64 + || e->Iex.Binop.op == Iop_Add32; + HReg dst = newVRegI(env); + HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1); + ARM64RIA* argR = iselIntExpr_RIA(env, e->Iex.Binop.arg2); + addInstr(env, ARM64Instr_Arith(dst, argL, argR, isAdd)); + return dst; + } + default: + break; + } + + /* AND/OR/XOR */ + switch (e->Iex.Binop.op) { + case Iop_And64: case Iop_And32: lop = ARM64lo_AND; goto log_binop; + case Iop_Or64: case Iop_Or32: lop = ARM64lo_OR; goto log_binop; + case Iop_Xor64: case Iop_Xor32: lop = ARM64lo_XOR; goto log_binop; + log_binop: { + HReg dst = newVRegI(env); + HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1); + ARM64RIL* argR = iselIntExpr_RIL(env, e->Iex.Binop.arg2); + addInstr(env, ARM64Instr_Logic(dst, argL, argR, lop)); + return dst; + } + default: + break; + } + + /* SHL/SHR/SAR */ + switch (e->Iex.Binop.op) { + case Iop_Shr64: sop = ARM64sh_SHR; goto sh_binop; + case Iop_Sar64: sop = ARM64sh_SAR; goto sh_binop; + case Iop_Shl64: case Iop_Shl32: sop = ARM64sh_SHL; goto sh_binop; + sh_binop: { + HReg dst = newVRegI(env); + HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1); + ARM64RI6* argR = iselIntExpr_RI6(env, e->Iex.Binop.arg2); + addInstr(env, ARM64Instr_Shift(dst, argL, argR, sop)); + return dst; + } + case Iop_Shr32: + case Iop_Sar32: { + Bool zx = e->Iex.Binop.op == Iop_Shr32; + HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1); + ARM64RI6* argR = iselIntExpr_RI6(env, e->Iex.Binop.arg2); + HReg dst = zx ? 
widen_z_32_to_64(env, argL) + : widen_s_32_to_64(env, argL); + addInstr(env, ARM64Instr_Shift(dst, dst, argR, ARM64sh_SHR)); + return dst; + } + default: break; + } + + /* MUL */ + if (e->Iex.Binop.op == Iop_Mul64 || e->Iex.Binop.op == Iop_Mul32) { + HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1); + HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2); + HReg dst = newVRegI(env); + addInstr(env, ARM64Instr_Mul(dst, argL, argR, ARM64mul_PLAIN)); + return dst; + } + + /* MULL */ + if (e->Iex.Binop.op == Iop_MullU32 || e->Iex.Binop.op == Iop_MullS32) { + Bool isS = e->Iex.Binop.op == Iop_MullS32; + HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1); + HReg extL = (isS ? widen_s_32_to_64 : widen_z_32_to_64)(env, argL); + HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2); + HReg extR = (isS ? widen_s_32_to_64 : widen_z_32_to_64)(env, argR); + HReg dst = newVRegI(env); + addInstr(env, ARM64Instr_Mul(dst, extL, extR, ARM64mul_PLAIN)); + return dst; + } + + /* Handle misc other ops. */ + + if (e->Iex.Binop.op == Iop_Max32U) { + HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1); + HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2); + HReg dst = newVRegI(env); + addInstr(env, ARM64Instr_Cmp(argL, ARM64RIA_R(argR), False/*!is64*/)); + addInstr(env, ARM64Instr_CSel(dst, argL, argR, ARM64cc_CS)); + return dst; + } + + if (e->Iex.Binop.op == Iop_32HLto64) { + HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1); + HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2); + HReg lo32 = widen_z_32_to_64(env, lo32s); + HReg hi32 = newVRegI(env); + addInstr(env, ARM64Instr_Shift(hi32, hi32s, ARM64RI6_I6(32), + ARM64sh_SHL)); + addInstr(env, ARM64Instr_Logic(hi32, hi32, ARM64RIL_R(lo32), + ARM64lo_OR)); + return hi32; + } + + if (e->Iex.Binop.op == Iop_CmpF64 || e->Iex.Binop.op == Iop_CmpF32) { + Bool isD = e->Iex.Binop.op == Iop_CmpF64; + HReg dL = (isD ? iselDblExpr : iselFltExpr)(env, e->Iex.Binop.arg1); + HReg dR = (isD ? iselDblExpr : iselFltExpr)(env, e->Iex.Binop.arg2); + HReg dst = newVRegI(env); + HReg imm = newVRegI(env); + /* Do the compare (FCMP), which sets NZCV in PSTATE. Then + create in dst, the IRCmpF64Result encoded result. */ + addInstr(env, (isD ? 
ARM64Instr_VCmpD : ARM64Instr_VCmpS)(dL, dR)); + addInstr(env, ARM64Instr_Imm64(dst, 0)); + addInstr(env, ARM64Instr_Imm64(imm, 0x40)); // 0x40 = Ircr_EQ + addInstr(env, ARM64Instr_CSel(dst, imm, dst, ARM64cc_EQ)); + addInstr(env, ARM64Instr_Imm64(imm, 0x01)); // 0x01 = Ircr_LT + addInstr(env, ARM64Instr_CSel(dst, imm, dst, ARM64cc_MI)); + addInstr(env, ARM64Instr_Imm64(imm, 0x00)); // 0x00 = Ircr_GT + addInstr(env, ARM64Instr_CSel(dst, imm, dst, ARM64cc_GT)); + addInstr(env, ARM64Instr_Imm64(imm, 0x45)); // 0x45 = Ircr_UN + addInstr(env, ARM64Instr_CSel(dst, imm, dst, ARM64cc_VS)); + return dst; + } + + { /* local scope */ + ARM64CvtOp cvt_op = ARM64cvt_INVALID; + Bool srcIsD = False; + switch (e->Iex.Binop.op) { + case Iop_F64toI64S: + cvt_op = ARM64cvt_F64_I64S; srcIsD = True; break; + case Iop_F64toI64U: + cvt_op = ARM64cvt_F64_I64U; srcIsD = True; break; + case Iop_F64toI32S: + cvt_op = ARM64cvt_F64_I32S; srcIsD = True; break; + case Iop_F64toI32U: + cvt_op = ARM64cvt_F64_I32U; srcIsD = True; break; + case Iop_F32toI32S: + cvt_op = ARM64cvt_F32_I32S; srcIsD = False; break; + case Iop_F32toI32U: + cvt_op = ARM64cvt_F32_I32U; srcIsD = False; break; + case Iop_F32toI64S: + cvt_op = ARM64cvt_F32_I64S; srcIsD = False; break; + case Iop_F32toI64U: + cvt_op = ARM64cvt_F32_I64U; srcIsD = False; break; + default: + break; + } + if (cvt_op != ARM64cvt_INVALID) { + /* This is all a bit dodgy, because we can't handle a + non-constant (not-known-at-JIT-time) rounding mode + indication. That's because there's no instruction + AFAICS that does this conversion but rounds according to + FPCR.RM, so we have to bake the rounding mode into the + instruction right now. But that should be OK because + (1) the front end attaches a literal Irrm_ value to the + conversion binop, and (2) iropt will never float that + off via CSE, into a literal. Hence we should always + have an Irrm_ value as the first arg. */ + IRExpr* arg1 = e->Iex.Binop.arg1; + if (arg1->tag != Iex_Const) goto irreducible; + IRConst* arg1con = arg1->Iex.Const.con; + vassert(arg1con->tag == Ico_U32); // else ill-typed IR + UInt irrm = arg1con->Ico.U32; + /* Find the ARM-encoded equivalent for |irrm|. */ + UInt armrm = 4; /* impossible */ + switch (irrm) { + case Irrm_NEAREST: armrm = 0; break; + case Irrm_NegINF: armrm = 2; break; + case Irrm_PosINF: armrm = 1; break; + case Irrm_ZERO: armrm = 3; break; + default: goto irreducible; + } + HReg src = (srcIsD ? 
iselDblExpr : iselFltExpr) + (env, e->Iex.Binop.arg2); + HReg dst = newVRegI(env); + addInstr(env, ARM64Instr_VCvtF2I(cvt_op, dst, src, armrm)); + return dst; + } + } /* local scope */ + +//ZZ if (e->Iex.Binop.op == Iop_GetElem8x8 +//ZZ || e->Iex.Binop.op == Iop_GetElem16x4 +//ZZ || e->Iex.Binop.op == Iop_GetElem32x2) { +//ZZ HReg res = newVRegI(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ UInt index, size; +//ZZ if (e->Iex.Binop.arg2->tag != Iex_Const || +//ZZ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) { +//ZZ vpanic("ARM target supports GetElem with constant " +//ZZ "second argument only\n"); +//ZZ } +//ZZ index = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_GetElem8x8: vassert(index < 8); size = 0; break; +//ZZ case Iop_GetElem16x4: vassert(index < 4); size = 1; break; +//ZZ case Iop_GetElem32x2: vassert(index < 2); size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnaryS(ARMneon_GETELEMS, +//ZZ mkARMNRS(ARMNRS_Reg, res, 0), +//ZZ mkARMNRS(ARMNRS_Scalar, arg, index), +//ZZ size, False)); +//ZZ return res; +//ZZ } +//ZZ +//ZZ if (e->Iex.Binop.op == Iop_GetElem8x16 +//ZZ || e->Iex.Binop.op == Iop_GetElem16x8 +//ZZ || e->Iex.Binop.op == Iop_GetElem32x4) { +//ZZ HReg res = newVRegI(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ UInt index, size; +//ZZ if (e->Iex.Binop.arg2->tag != Iex_Const || +//ZZ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) { +//ZZ vpanic("ARM target supports GetElem with constant " +//ZZ "second argument only\n"); +//ZZ } +//ZZ index = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_GetElem8x16: vassert(index < 16); size = 0; break; +//ZZ case Iop_GetElem16x8: vassert(index < 8); size = 1; break; +//ZZ case Iop_GetElem32x4: vassert(index < 4); size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnaryS(ARMneon_GETELEMS, +//ZZ mkARMNRS(ARMNRS_Reg, res, 0), +//ZZ mkARMNRS(ARMNRS_Scalar, arg, index), +//ZZ size, True)); +//ZZ return res; +//ZZ } + + /* All cases involving host-side helper calls. 
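+         The Iop_Div* cases are passed to helpers which, as their
+         _w_arm_semantics names indicate, implement the ARM division
+         semantics (in particular, division by zero yields zero rather
+         than trapping).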
*/ + void* fn = NULL; + switch (e->Iex.Binop.op) { +//ZZ case Iop_Add16x2: +//ZZ fn = &h_generic_calc_Add16x2; break; +//ZZ case Iop_Sub16x2: +//ZZ fn = &h_generic_calc_Sub16x2; break; +//ZZ case Iop_HAdd16Ux2: +//ZZ fn = &h_generic_calc_HAdd16Ux2; break; +//ZZ case Iop_HAdd16Sx2: +//ZZ fn = &h_generic_calc_HAdd16Sx2; break; +//ZZ case Iop_HSub16Ux2: +//ZZ fn = &h_generic_calc_HSub16Ux2; break; +//ZZ case Iop_HSub16Sx2: +//ZZ fn = &h_generic_calc_HSub16Sx2; break; +//ZZ case Iop_QAdd16Sx2: +//ZZ fn = &h_generic_calc_QAdd16Sx2; break; +//ZZ case Iop_QAdd16Ux2: +//ZZ fn = &h_generic_calc_QAdd16Ux2; break; +//ZZ case Iop_QSub16Sx2: +//ZZ fn = &h_generic_calc_QSub16Sx2; break; +//ZZ case Iop_Add8x4: +//ZZ fn = &h_generic_calc_Add8x4; break; +//ZZ case Iop_Sub8x4: +//ZZ fn = &h_generic_calc_Sub8x4; break; +//ZZ case Iop_HAdd8Ux4: +//ZZ fn = &h_generic_calc_HAdd8Ux4; break; +//ZZ case Iop_HAdd8Sx4: +//ZZ fn = &h_generic_calc_HAdd8Sx4; break; +//ZZ case Iop_HSub8Ux4: +//ZZ fn = &h_generic_calc_HSub8Ux4; break; +//ZZ case Iop_HSub8Sx4: +//ZZ fn = &h_generic_calc_HSub8Sx4; break; +//ZZ case Iop_QAdd8Sx4: +//ZZ fn = &h_generic_calc_QAdd8Sx4; break; +//ZZ case Iop_QAdd8Ux4: +//ZZ fn = &h_generic_calc_QAdd8Ux4; break; +//ZZ case Iop_QSub8Sx4: +//ZZ fn = &h_generic_calc_QSub8Sx4; break; +//ZZ case Iop_QSub8Ux4: +//ZZ fn = &h_generic_calc_QSub8Ux4; break; +//ZZ case Iop_Sad8Ux4: +//ZZ fn = &h_generic_calc_Sad8Ux4; break; +//ZZ case Iop_QAdd32S: +//ZZ fn = &h_generic_calc_QAdd32S; break; +//ZZ case Iop_QSub32S: +//ZZ fn = &h_generic_calc_QSub32S; break; +//ZZ case Iop_QSub16Ux2: +//ZZ fn = &h_generic_calc_QSub16Ux2; break; + case Iop_DivU32: + fn = &h_calc_udiv32_w_arm_semantics; break; + case Iop_DivS32: + fn = &h_calc_sdiv32_w_arm_semantics; break; + case Iop_DivU64: + fn = &h_calc_udiv64_w_arm_semantics; break; + case Iop_DivS64: + fn = &h_calc_sdiv64_w_arm_semantics; break; + default: + break; + } + + if (fn) { + HReg regL = iselIntExpr_R(env, e->Iex.Binop.arg1); + HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2); + HReg res = newVRegI(env); + addInstr(env, ARM64Instr_MovI(hregARM64_X0(), regL)); + addInstr(env, ARM64Instr_MovI(hregARM64_X1(), regR)); + addInstr(env, ARM64Instr_Call( ARM64cc_AL, (HWord)Ptr_to_ULong(fn), + 2, mk_RetLoc_simple(RLPri_Int) )); + addInstr(env, ARM64Instr_MovI(res, hregARM64_X0())); + return res; + } + + break; + } + + /* --------- UNARY OP --------- */ + case Iex_Unop: { + + switch (e->Iex.Unop.op) { + case Iop_16Uto64: { + /* This probably doesn't occur often enough to be worth + rolling the extension into the load. */ + IRExpr* arg = e->Iex.Unop.arg; + HReg src = iselIntExpr_R(env, arg); + HReg dst = widen_z_16_to_64(env, src); + return dst; + } + case Iop_32Uto64: { + IRExpr* arg = e->Iex.Unop.arg; + if (arg->tag == Iex_Load) { + /* This correctly zero extends because _LdSt32 is + defined to do a zero extending load. */ + HReg dst = newVRegI(env); + ARM64AMode* am + = iselIntExpr_AMode(env, arg->Iex.Load.addr, Ity_I32); + addInstr(env, ARM64Instr_LdSt32(True/*isLoad*/, dst, am)); + return dst; + } + /* else be lame and mask it */ + HReg src = iselIntExpr_R(env, arg); + HReg dst = widen_z_32_to_64(env, src); + return dst; + } + case Iop_8Uto32: /* Just freeload on the 8Uto64 case */ + case Iop_8Uto64: { + IRExpr* arg = e->Iex.Unop.arg; + if (arg->tag == Iex_Load) { + /* This correctly zero extends because _LdSt8 is + defined to do a zero extending load. 
*/ + HReg dst = newVRegI(env); + ARM64AMode* am + = iselIntExpr_AMode(env, arg->Iex.Load.addr, Ity_I8); + addInstr(env, ARM64Instr_LdSt8(True/*isLoad*/, dst, am)); + return dst; + } + /* else be lame and mask it */ + HReg src = iselIntExpr_R(env, arg); + HReg dst = widen_z_8_to_64(env, src); + return dst; + } + case Iop_128HIto64: { + HReg rHi, rLo; + iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg); + return rHi; /* and abandon rLo */ + } + case Iop_8Sto32: case Iop_8Sto64: { + IRExpr* arg = e->Iex.Unop.arg; + HReg src = iselIntExpr_R(env, arg); + HReg dst = widen_s_8_to_64(env, src); + return dst; + } + case Iop_16Sto32: case Iop_16Sto64: { + IRExpr* arg = e->Iex.Unop.arg; + HReg src = iselIntExpr_R(env, arg); + HReg dst = widen_s_16_to_64(env, src); + return dst; + } + case Iop_32Sto64: { + IRExpr* arg = e->Iex.Unop.arg; + HReg src = iselIntExpr_R(env, arg); + HReg dst = widen_s_32_to_64(env, src); + return dst; + } + case Iop_Not32: + case Iop_Not64: { + HReg dst = newVRegI(env); + HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); + addInstr(env, ARM64Instr_Unary(dst, src, ARM64un_NOT)); + return dst; + } + case Iop_Clz64: { + HReg dst = newVRegI(env); + HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); + addInstr(env, ARM64Instr_Unary(dst, src, ARM64un_CLZ)); + return dst; + } + case Iop_Left32: + case Iop_Left64: { + /* Left64(src) = src | -src. Left32 can use the same + implementation since in that case we don't care what + the upper 32 bits become. */ + HReg dst = newVRegI(env); + HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); + addInstr(env, ARM64Instr_Unary(dst, src, ARM64un_NEG)); + addInstr(env, ARM64Instr_Logic(dst, dst, ARM64RIL_R(src), + ARM64lo_OR)); + return dst; + } + case Iop_CmpwNEZ64: { + /* CmpwNEZ64(src) = (src == 0) ? 0...0 : 1...1 + = Left64(src) >>s 63 */ + HReg dst = newVRegI(env); + HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); + addInstr(env, ARM64Instr_Unary(dst, src, ARM64un_NEG)); + addInstr(env, ARM64Instr_Logic(dst, dst, ARM64RIL_R(src), + ARM64lo_OR)); + addInstr(env, ARM64Instr_Shift(dst, dst, ARM64RI6_I6(63), + ARM64sh_SAR)); + return dst; + } + case Iop_CmpwNEZ32: { + /* CmpwNEZ32(src) = CmpwNEZ64(src & 0xFFFFFFFF) + = Left64(src & 0xFFFFFFFF) >>s 63 */ + HReg dst = newVRegI(env); + HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg); + HReg src = widen_z_32_to_64(env, pre); + addInstr(env, ARM64Instr_Unary(dst, src, ARM64un_NEG)); + addInstr(env, ARM64Instr_Logic(dst, dst, ARM64RIL_R(src), + ARM64lo_OR)); + addInstr(env, ARM64Instr_Shift(dst, dst, ARM64RI6_I6(63), + ARM64sh_SAR)); + return dst; + } + case Iop_V128to64: case Iop_V128HIto64: { + HReg dst = newVRegI(env); + HReg src = iselV128Expr(env, e->Iex.Unop.arg); + UInt laneNo = (e->Iex.Unop.op == Iop_V128HIto64) ? 1 : 0; + addInstr(env, ARM64Instr_VXfromQ(dst, src, laneNo)); + return dst; + } + case Iop_1Sto32: + case Iop_1Sto64: { + /* As with the iselStmt case for 'tmp:I1 = expr', we could + do a lot better here if it ever became necessary. 
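+               (One obvious improvement would be to CSel directly between
+               an all-zeroes and an all-ones constant, which would make
+               the two shifts below unnecessary.)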
*/ + HReg zero = newVRegI(env); + HReg one = newVRegI(env); + HReg dst = newVRegI(env); + addInstr(env, ARM64Instr_Imm64(zero, 0)); + addInstr(env, ARM64Instr_Imm64(one, 1)); + ARM64CondCode cc = iselCondCode(env, e->Iex.Unop.arg); + addInstr(env, ARM64Instr_CSel(dst, one, zero, cc)); + addInstr(env, ARM64Instr_Shift(dst, dst, ARM64RI6_I6(63), + ARM64sh_SHL)); + addInstr(env, ARM64Instr_Shift(dst, dst, ARM64RI6_I6(63), + ARM64sh_SAR)); + return dst; + } + case Iop_NarrowUn16to8x8: + case Iop_NarrowUn32to16x4: + case Iop_NarrowUn64to32x2: { + HReg src = iselV128Expr(env, e->Iex.Unop.arg); + HReg tmp = newVRegV(env); + HReg dst = newVRegI(env); + UInt dszBlg2 = 3; /* illegal */ + switch (e->Iex.Unop.op) { + case Iop_NarrowUn16to8x8: dszBlg2 = 0; break; // 16to8_x8 + case Iop_NarrowUn32to16x4: dszBlg2 = 1; break; // 32to16_x4 + case Iop_NarrowUn64to32x2: dszBlg2 = 2; break; // 64to32_x2 + default: vassert(0); + } + addInstr(env, ARM64Instr_VNarrowV(dszBlg2, tmp, src)); + addInstr(env, ARM64Instr_VXfromQ(dst, tmp, 0/*laneNo*/)); + return dst; + } +//ZZ case Iop_64HIto32: { +//ZZ HReg rHi, rLo; +//ZZ iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg); +//ZZ return rHi; /* and abandon rLo .. poor wee thing :-) */ +//ZZ } +//ZZ case Iop_64to32: { +//ZZ HReg rHi, rLo; +//ZZ iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg); +//ZZ return rLo; /* similar stupid comment to the above ... */ +//ZZ } +//ZZ case Iop_64to8: { +//ZZ HReg rHi, rLo; +//ZZ if (env->hwcaps & VEX_HWCAPS_ARM_NEON) { +//ZZ HReg tHi = newVRegI(env); +//ZZ HReg tLo = newVRegI(env); +//ZZ HReg tmp = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_VXferD(False, tmp, tHi, tLo)); +//ZZ rHi = tHi; +//ZZ rLo = tLo; +//ZZ } else { +//ZZ iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg); +//ZZ } +//ZZ return rLo; +//ZZ } + + case Iop_1Uto64: { + /* 1Uto64(tmp). */ + HReg dst = newVRegI(env); + if (e->Iex.Unop.arg->tag == Iex_RdTmp) { + ARM64RIL* one = mb_mkARM64RIL_I(1); + HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp); + vassert(one); + addInstr(env, ARM64Instr_Logic(dst, src, one, ARM64lo_AND)); + } else { + /* CLONE-01 */ + HReg zero = newVRegI(env); + HReg one = newVRegI(env); + addInstr(env, ARM64Instr_Imm64(zero, 0)); + addInstr(env, ARM64Instr_Imm64(one, 1)); + ARM64CondCode cc = iselCondCode(env, e->Iex.Unop.arg); + addInstr(env, ARM64Instr_CSel(dst, one, zero, cc)); + } + return dst; + } +//ZZ case Iop_1Uto8: { +//ZZ HReg dst = newVRegI(env); +//ZZ ARMCondCode cond = iselCondCode(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_Mov(dst, ARMRI84_I84(0,0))); +//ZZ addInstr(env, ARMInstr_CMov(cond, dst, ARMRI84_I84(1,0))); +//ZZ return dst; +//ZZ } +//ZZ +//ZZ case Iop_1Sto32: { +//ZZ HReg dst = newVRegI(env); +//ZZ ARMCondCode cond = iselCondCode(env, e->Iex.Unop.arg); +//ZZ ARMRI5* amt = ARMRI5_I5(31); +//ZZ /* This is really rough. We could do much better here; +//ZZ perhaps mvn{cond} dst, #0 as the second insn? +//ZZ (same applies to 1Sto64) */ +//ZZ addInstr(env, ARMInstr_Mov(dst, ARMRI84_I84(0,0))); +//ZZ addInstr(env, ARMInstr_CMov(cond, dst, ARMRI84_I84(1,0))); +//ZZ addInstr(env, ARMInstr_Shift(ARMsh_SHL, dst, dst, amt)); +//ZZ addInstr(env, ARMInstr_Shift(ARMsh_SAR, dst, dst, amt)); +//ZZ return dst; +//ZZ } +//ZZ +//ZZ case Iop_Clz32: { +//ZZ /* Count leading zeroes; easy on ARM. 
*/ +//ZZ HReg dst = newVRegI(env); +//ZZ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_Unary(ARMun_CLZ, dst, src)); +//ZZ return dst; +//ZZ } +//ZZ +//ZZ case Iop_CmpwNEZ32: { +//ZZ HReg dst = newVRegI(env); +//ZZ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_Unary(ARMun_NEG, dst, src)); +//ZZ addInstr(env, ARMInstr_Alu(ARMalu_OR, dst, dst, ARMRI84_R(src))); +//ZZ addInstr(env, ARMInstr_Shift(ARMsh_SAR, dst, dst, ARMRI5_I5(31))); +//ZZ return dst; +//ZZ } +//ZZ +//ZZ case Iop_ReinterpF32asI32: { +//ZZ HReg dst = newVRegI(env); +//ZZ HReg src = iselFltExpr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_VXferS(False/*!toS*/, src, dst)); +//ZZ return dst; +//ZZ } + + case Iop_64to32: + case Iop_64to16: + case Iop_64to8: + /* These are no-ops. */ + return iselIntExpr_R(env, e->Iex.Unop.arg); + + default: + break; + } + +//ZZ /* All Unop cases involving host-side helper calls. */ +//ZZ void* fn = NULL; +//ZZ switch (e->Iex.Unop.op) { +//ZZ case Iop_CmpNEZ16x2: +//ZZ fn = &h_generic_calc_CmpNEZ16x2; break; +//ZZ case Iop_CmpNEZ8x4: +//ZZ fn = &h_generic_calc_CmpNEZ8x4; break; +//ZZ default: +//ZZ break; +//ZZ } +//ZZ +//ZZ if (fn) { +//ZZ HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg); +//ZZ HReg res = newVRegI(env); +//ZZ addInstr(env, mk_iMOVds_RR(hregARM_R0(), arg)); +//ZZ addInstr(env, ARMInstr_Call( ARMcc_AL, (HWord)Ptr_to_ULong(fn), +//ZZ 1, RetLocInt )); +//ZZ addInstr(env, mk_iMOVds_RR(res, hregARM_R0())); +//ZZ return res; +//ZZ } + + break; + } + + /* --------- GET --------- */ + case Iex_Get: { + if (ty == Ity_I64 + && 0 == (e->Iex.Get.offset & 7) && e->Iex.Get.offset < (8<<12)-8) { + HReg dst = newVRegI(env); + ARM64AMode* am + = mk_baseblock_64bit_access_amode(e->Iex.Get.offset); + addInstr(env, ARM64Instr_LdSt64(True/*isLoad*/, dst, am)); + return dst; + } + if (ty == Ity_I32 + && 0 == (e->Iex.Get.offset & 3) && e->Iex.Get.offset < (4<<12)-4) { + HReg dst = newVRegI(env); + ARM64AMode* am + = mk_baseblock_32bit_access_amode(e->Iex.Get.offset); + addInstr(env, ARM64Instr_LdSt32(True/*isLoad*/, dst, am)); + return dst; + } + if (ty == Ity_I16 + && 0 == (e->Iex.Get.offset & 1) && e->Iex.Get.offset < (2<<12)-2) { + HReg dst = newVRegI(env); + ARM64AMode* am + = mk_baseblock_16bit_access_amode(e->Iex.Get.offset); + addInstr(env, ARM64Instr_LdSt16(True/*isLoad*/, dst, am)); + return dst; + } + if (ty == Ity_I8 + /* && no alignment check */ && e->Iex.Get.offset < (1<<12)-1) { + HReg dst = newVRegI(env); + ARM64AMode* am + = mk_baseblock_8bit_access_amode(e->Iex.Get.offset); + addInstr(env, ARM64Instr_LdSt8(True/*isLoad*/, dst, am)); + return dst; + } + break; + } + + /* --------- CCALL --------- */ + case Iex_CCall: { + HReg dst = newVRegI(env); + vassert(ty == e->Iex.CCall.retty); + + /* be very restrictive for now. Only 64-bit ints allowed for + args, and 64 bits for return type. Don't forget to change + the RetLoc if more types are allowed in future. */ + if (e->Iex.CCall.retty != Ity_I64) + goto irreducible; + + /* Marshal args, do the call, clear stack. 
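+            Only an RLPri_Int return location with no stack adjustment is
+            expected back, as the assertions below check.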
*/ + UInt addToSp = 0; + RetLoc rloc = mk_RetLoc_INVALID(); + Bool ok = doHelperCall( &addToSp, &rloc, env, NULL/*guard*/, + e->Iex.CCall.cee, e->Iex.CCall.retty, + e->Iex.CCall.args ); + /* */ + if (ok) { + vassert(is_sane_RetLoc(rloc)); + vassert(rloc.pri == RLPri_Int); + vassert(addToSp == 0); + addInstr(env, ARM64Instr_MovI(dst, hregARM64_X0())); + return dst; + } + /* else fall through; will hit the irreducible: label */ + } + + /* --------- LITERAL --------- */ + /* 64-bit literals */ + case Iex_Const: { + ULong u = 0; + HReg dst = newVRegI(env); + switch (e->Iex.Const.con->tag) { + case Ico_U64: u = e->Iex.Const.con->Ico.U64; break; + case Ico_U32: u = e->Iex.Const.con->Ico.U32; break; + case Ico_U16: u = e->Iex.Const.con->Ico.U16; break; + case Ico_U8: u = e->Iex.Const.con->Ico.U8; break; + default: ppIRExpr(e); vpanic("iselIntExpr_R.Iex_Const(arm64)"); + } + addInstr(env, ARM64Instr_Imm64(dst, u)); + return dst; + } + + /* --------- MULTIPLEX --------- */ + case Iex_ITE: { + /* ITE(ccexpr, iftrue, iffalse) */ + if (ty == Ity_I64 || ty == Ity_I32) { + ARM64CondCode cc; + HReg r1 = iselIntExpr_R(env, e->Iex.ITE.iftrue); + HReg r0 = iselIntExpr_R(env, e->Iex.ITE.iffalse); + HReg dst = newVRegI(env); + cc = iselCondCode(env, e->Iex.ITE.cond); + addInstr(env, ARM64Instr_CSel(dst, r1, r0, cc)); + return dst; + } + break; + } + + default: + break; + } /* switch (e->tag) */ + + /* We get here if no pattern matched. */ + irreducible: + ppIRExpr(e); + vpanic("iselIntExpr_R: cannot reduce tree"); +} + + +/*---------------------------------------------------------*/ +/*--- ISEL: Integer expressions (128 bit) ---*/ +/*---------------------------------------------------------*/ + +/* Compute a 128-bit value into a register pair, which is returned as + the first two parameters. As with iselIntExpr_R, these may be + either real or virtual regs; in any case they must not be changed + by subsequent code emitted by the caller. */ + +static void iselInt128Expr ( HReg* rHi, HReg* rLo, + ISelEnv* env, IRExpr* e ) +{ + iselInt128Expr_wrk(rHi, rLo, env, e); +# if 0 + vex_printf("\n"); ppIRExpr(e); vex_printf("\n"); +# endif + vassert(hregClass(*rHi) == HRcInt64); + vassert(hregIsVirtual(*rHi)); + vassert(hregClass(*rLo) == HRcInt64); + vassert(hregIsVirtual(*rLo)); +} + +/* DO NOT CALL THIS DIRECTLY ! */ +static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo, + ISelEnv* env, IRExpr* e ) +{ + vassert(e); + vassert(typeOfIRExpr(env->type_env,e) == Ity_I128); + + /* --------- BINARY ops --------- */ + if (e->tag == Iex_Binop) { + switch (e->Iex.Binop.op) { + /* 64 x 64 -> 128 multiply */ + case Iop_MullU64: + case Iop_MullS64: { + Bool syned = toBool(e->Iex.Binop.op == Iop_MullS64); + HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1); + HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2); + HReg dstLo = newVRegI(env); + HReg dstHi = newVRegI(env); + addInstr(env, ARM64Instr_Mul(dstLo, argL, argR, + ARM64mul_PLAIN)); + addInstr(env, ARM64Instr_Mul(dstHi, argL, argR, + syned ? ARM64mul_SX : ARM64mul_ZX)); + *rHi = dstHi; + *rLo = dstLo; + return; + } + /* 64HLto128(e1,e2) */ + case Iop_64HLto128: + *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1); + *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2); + return; + default: + break; + } + } /* if (e->tag == Iex_Binop) */ + + ppIRExpr(e); + vpanic("iselInt128Expr(arm64)"); +} + + +//ZZ /* -------------------- 64-bit -------------------- */ +//ZZ +//ZZ /* Compute a 64-bit value into a register pair, which is returned as +//ZZ the first two parameters. 
As with iselIntExpr_R, these may be +//ZZ either real or virtual regs; in any case they must not be changed +//ZZ by subsequent code emitted by the caller. */ +//ZZ +//ZZ static void iselInt64Expr ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e ) +//ZZ { +//ZZ iselInt64Expr_wrk(rHi, rLo, env, e); +//ZZ # if 0 +//ZZ vex_printf("\n"); ppIRExpr(e); vex_printf("\n"); +//ZZ # endif +//ZZ vassert(hregClass(*rHi) == HRcInt32); +//ZZ vassert(hregIsVirtual(*rHi)); +//ZZ vassert(hregClass(*rLo) == HRcInt32); +//ZZ vassert(hregIsVirtual(*rLo)); +//ZZ } +//ZZ +//ZZ /* DO NOT CALL THIS DIRECTLY ! */ +//ZZ static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e ) +//ZZ { +//ZZ vassert(e); +//ZZ vassert(typeOfIRExpr(env->type_env,e) == Ity_I64); +//ZZ +//ZZ /* 64-bit literal */ +//ZZ if (e->tag == Iex_Const) { +//ZZ ULong w64 = e->Iex.Const.con->Ico.U64; +//ZZ UInt wHi = toUInt(w64 >> 32); +//ZZ UInt wLo = toUInt(w64); +//ZZ HReg tHi = newVRegI(env); +//ZZ HReg tLo = newVRegI(env); +//ZZ vassert(e->Iex.Const.con->tag == Ico_U64); +//ZZ addInstr(env, ARMInstr_Imm32(tHi, wHi)); +//ZZ addInstr(env, ARMInstr_Imm32(tLo, wLo)); +//ZZ *rHi = tHi; +//ZZ *rLo = tLo; +//ZZ return; +//ZZ } +//ZZ +//ZZ /* read 64-bit IRTemp */ +//ZZ if (e->tag == Iex_RdTmp) { +//ZZ if (env->hwcaps & VEX_HWCAPS_ARM_NEON) { +//ZZ HReg tHi = newVRegI(env); +//ZZ HReg tLo = newVRegI(env); +//ZZ HReg tmp = iselNeon64Expr(env, e); +//ZZ addInstr(env, ARMInstr_VXferD(False, tmp, tHi, tLo)); +//ZZ *rHi = tHi; +//ZZ *rLo = tLo; +//ZZ } else { +//ZZ lookupIRTemp64( rHi, rLo, env, e->Iex.RdTmp.tmp); +//ZZ } +//ZZ return; +//ZZ } +//ZZ +//ZZ /* 64-bit load */ +//ZZ if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) { +//ZZ HReg tLo, tHi, rA; +//ZZ vassert(e->Iex.Load.ty == Ity_I64); +//ZZ rA = iselIntExpr_R(env, e->Iex.Load.addr); +//ZZ tHi = newVRegI(env); +//ZZ tLo = newVRegI(env); +//ZZ addInstr(env, ARMInstr_LdSt32(ARMcc_AL, True/*isLoad*/, +//ZZ tHi, ARMAMode1_RI(rA, 4))); +//ZZ addInstr(env, ARMInstr_LdSt32(ARMcc_AL, True/*isLoad*/, +//ZZ tLo, ARMAMode1_RI(rA, 0))); +//ZZ *rHi = tHi; +//ZZ *rLo = tLo; +//ZZ return; +//ZZ } +//ZZ +//ZZ /* 64-bit GET */ +//ZZ if (e->tag == Iex_Get) { +//ZZ ARMAMode1* am0 = ARMAMode1_RI(hregARM_R8(), e->Iex.Get.offset + 0); +//ZZ ARMAMode1* am4 = ARMAMode1_RI(hregARM_R8(), e->Iex.Get.offset + 4); +//ZZ HReg tHi = newVRegI(env); +//ZZ HReg tLo = newVRegI(env); +//ZZ addInstr(env, ARMInstr_LdSt32(ARMcc_AL, True/*isLoad*/, tHi, am4)); +//ZZ addInstr(env, ARMInstr_LdSt32(ARMcc_AL, True/*isLoad*/, tLo, am0)); +//ZZ *rHi = tHi; +//ZZ *rLo = tLo; +//ZZ return; +//ZZ } +//ZZ +//ZZ /* --------- BINARY ops --------- */ +//ZZ if (e->tag == Iex_Binop) { +//ZZ switch (e->Iex.Binop.op) { +//ZZ +//ZZ /* 32 x 32 -> 64 multiply */ +//ZZ case Iop_MullS32: +//ZZ case Iop_MullU32: { +//ZZ HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2); +//ZZ HReg tHi = newVRegI(env); +//ZZ HReg tLo = newVRegI(env); +//ZZ ARMMulOp mop = e->Iex.Binop.op == Iop_MullS32 +//ZZ ? 
ARMmul_SX : ARMmul_ZX; +//ZZ addInstr(env, mk_iMOVds_RR(hregARM_R2(), argL)); +//ZZ addInstr(env, mk_iMOVds_RR(hregARM_R3(), argR)); +//ZZ addInstr(env, ARMInstr_Mul(mop)); +//ZZ addInstr(env, mk_iMOVds_RR(tHi, hregARM_R1())); +//ZZ addInstr(env, mk_iMOVds_RR(tLo, hregARM_R0())); +//ZZ *rHi = tHi; +//ZZ *rLo = tLo; +//ZZ return; +//ZZ } +//ZZ +//ZZ case Iop_Or64: { +//ZZ HReg xLo, xHi, yLo, yHi; +//ZZ HReg tHi = newVRegI(env); +//ZZ HReg tLo = newVRegI(env); +//ZZ iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1); +//ZZ iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_Alu(ARMalu_OR, tHi, xHi, ARMRI84_R(yHi))); +//ZZ addInstr(env, ARMInstr_Alu(ARMalu_OR, tLo, xLo, ARMRI84_R(yLo))); +//ZZ *rHi = tHi; +//ZZ *rLo = tLo; +//ZZ return; +//ZZ } +//ZZ +//ZZ case Iop_Add64: { +//ZZ HReg xLo, xHi, yLo, yHi; +//ZZ HReg tHi = newVRegI(env); +//ZZ HReg tLo = newVRegI(env); +//ZZ iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1); +//ZZ iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_Alu(ARMalu_ADDS, tLo, xLo, ARMRI84_R(yLo))); +//ZZ addInstr(env, ARMInstr_Alu(ARMalu_ADC, tHi, xHi, ARMRI84_R(yHi))); +//ZZ *rHi = tHi; +//ZZ *rLo = tLo; +//ZZ return; +//ZZ } +//ZZ +//ZZ /* 32HLto64(e1,e2) */ +//ZZ case Iop_32HLto64: { +//ZZ *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1); +//ZZ *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2); +//ZZ return; +//ZZ } +//ZZ +//ZZ default: +//ZZ break; +//ZZ } +//ZZ } +//ZZ +//ZZ /* --------- UNARY ops --------- */ +//ZZ if (e->tag == Iex_Unop) { +//ZZ switch (e->Iex.Unop.op) { +//ZZ +//ZZ /* ReinterpF64asI64 */ +//ZZ case Iop_ReinterpF64asI64: { +//ZZ HReg dstHi = newVRegI(env); +//ZZ HReg dstLo = newVRegI(env); +//ZZ HReg src = iselDblExpr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_VXferD(False/*!toD*/, src, dstHi, dstLo)); +//ZZ *rHi = dstHi; +//ZZ *rLo = dstLo; +//ZZ return; +//ZZ } +//ZZ +//ZZ /* Left64(e) */ +//ZZ case Iop_Left64: { +//ZZ HReg yLo, yHi; +//ZZ HReg tHi = newVRegI(env); +//ZZ HReg tLo = newVRegI(env); +//ZZ HReg zero = newVRegI(env); +//ZZ /* yHi:yLo = arg */ +//ZZ iselInt64Expr(&yHi, &yLo, env, e->Iex.Unop.arg); +//ZZ /* zero = 0 */ +//ZZ addInstr(env, ARMInstr_Imm32(zero, 0)); +//ZZ /* tLo = 0 - yLo, and set carry */ +//ZZ addInstr(env, ARMInstr_Alu(ARMalu_SUBS, +//ZZ tLo, zero, ARMRI84_R(yLo))); +//ZZ /* tHi = 0 - yHi - carry */ +//ZZ addInstr(env, ARMInstr_Alu(ARMalu_SBC, +//ZZ tHi, zero, ARMRI84_R(yHi))); +//ZZ /* So now we have tHi:tLo = -arg. To finish off, or 'arg' +//ZZ back in, so as to give the final result +//ZZ tHi:tLo = arg | -arg. 
*/ +//ZZ addInstr(env, ARMInstr_Alu(ARMalu_OR, tHi, tHi, ARMRI84_R(yHi))); +//ZZ addInstr(env, ARMInstr_Alu(ARMalu_OR, tLo, tLo, ARMRI84_R(yLo))); +//ZZ *rHi = tHi; +//ZZ *rLo = tLo; +//ZZ return; +//ZZ } +//ZZ +//ZZ /* CmpwNEZ64(e) */ +//ZZ case Iop_CmpwNEZ64: { +//ZZ HReg srcLo, srcHi; +//ZZ HReg tmp1 = newVRegI(env); +//ZZ HReg tmp2 = newVRegI(env); +//ZZ /* srcHi:srcLo = arg */ +//ZZ iselInt64Expr(&srcHi, &srcLo, env, e->Iex.Unop.arg); +//ZZ /* tmp1 = srcHi | srcLo */ +//ZZ addInstr(env, ARMInstr_Alu(ARMalu_OR, +//ZZ tmp1, srcHi, ARMRI84_R(srcLo))); +//ZZ /* tmp2 = (tmp1 | -tmp1) >>s 31 */ +//ZZ addInstr(env, ARMInstr_Unary(ARMun_NEG, tmp2, tmp1)); +//ZZ addInstr(env, ARMInstr_Alu(ARMalu_OR, +//ZZ tmp2, tmp2, ARMRI84_R(tmp1))); +//ZZ addInstr(env, ARMInstr_Shift(ARMsh_SAR, +//ZZ tmp2, tmp2, ARMRI5_I5(31))); +//ZZ *rHi = tmp2; +//ZZ *rLo = tmp2; +//ZZ return; +//ZZ } +//ZZ +//ZZ case Iop_1Sto64: { +//ZZ HReg dst = newVRegI(env); +//ZZ ARMCondCode cond = iselCondCode(env, e->Iex.Unop.arg); +//ZZ ARMRI5* amt = ARMRI5_I5(31); +//ZZ /* This is really rough. We could do much better here; +//ZZ perhaps mvn{cond} dst, #0 as the second insn? +//ZZ (same applies to 1Sto32) */ +//ZZ addInstr(env, ARMInstr_Mov(dst, ARMRI84_I84(0,0))); +//ZZ addInstr(env, ARMInstr_CMov(cond, dst, ARMRI84_I84(1,0))); +//ZZ addInstr(env, ARMInstr_Shift(ARMsh_SHL, dst, dst, amt)); +//ZZ addInstr(env, ARMInstr_Shift(ARMsh_SAR, dst, dst, amt)); +//ZZ *rHi = dst; +//ZZ *rLo = dst; +//ZZ return; +//ZZ } +//ZZ +//ZZ default: +//ZZ break; +//ZZ } +//ZZ } /* if (e->tag == Iex_Unop) */ +//ZZ +//ZZ /* --------- MULTIPLEX --------- */ +//ZZ if (e->tag == Iex_ITE) { // VFD +//ZZ IRType tyC; +//ZZ HReg r1hi, r1lo, r0hi, r0lo, dstHi, dstLo; +//ZZ ARMCondCode cc; +//ZZ tyC = typeOfIRExpr(env->type_env,e->Iex.ITE.cond); +//ZZ vassert(tyC == Ity_I1); +//ZZ iselInt64Expr(&r1hi, &r1lo, env, e->Iex.ITE.iftrue); +//ZZ iselInt64Expr(&r0hi, &r0lo, env, e->Iex.ITE.iffalse); +//ZZ dstHi = newVRegI(env); +//ZZ dstLo = newVRegI(env); +//ZZ addInstr(env, mk_iMOVds_RR(dstHi, r1hi)); +//ZZ addInstr(env, mk_iMOVds_RR(dstLo, r1lo)); +//ZZ cc = iselCondCode(env, e->Iex.ITE.cond); +//ZZ addInstr(env, ARMInstr_CMov(cc ^ 1, dstHi, ARMRI84_R(r0hi))); +//ZZ addInstr(env, ARMInstr_CMov(cc ^ 1, dstLo, ARMRI84_R(r0lo))); +//ZZ *rHi = dstHi; +//ZZ *rLo = dstLo; +//ZZ return; +//ZZ } +//ZZ +//ZZ /* It is convenient sometimes to call iselInt64Expr even when we +//ZZ have NEON support (e.g. in do_helper_call we need 64-bit +//ZZ arguments as 2 x 32 regs). 
*/ +//ZZ if (env->hwcaps & VEX_HWCAPS_ARM_NEON) { +//ZZ HReg tHi = newVRegI(env); +//ZZ HReg tLo = newVRegI(env); +//ZZ HReg tmp = iselNeon64Expr(env, e); +//ZZ addInstr(env, ARMInstr_VXferD(False, tmp, tHi, tLo)); +//ZZ *rHi = tHi; +//ZZ *rLo = tLo; +//ZZ return ; +//ZZ } +//ZZ +//ZZ ppIRExpr(e); +//ZZ vpanic("iselInt64Expr"); +//ZZ } +//ZZ +//ZZ +//ZZ /*---------------------------------------------------------*/ +//ZZ /*--- ISEL: Vector (NEON) expressions (64 bit) ---*/ +//ZZ /*---------------------------------------------------------*/ +//ZZ +//ZZ static HReg iselNeon64Expr ( ISelEnv* env, IRExpr* e ) +//ZZ { +//ZZ HReg r = iselNeon64Expr_wrk( env, e ); +//ZZ vassert(hregClass(r) == HRcFlt64); +//ZZ vassert(hregIsVirtual(r)); +//ZZ return r; +//ZZ } +//ZZ +//ZZ /* DO NOT CALL THIS DIRECTLY */ +//ZZ static HReg iselNeon64Expr_wrk ( ISelEnv* env, IRExpr* e ) +//ZZ { +//ZZ IRType ty = typeOfIRExpr(env->type_env, e); +//ZZ MatchInfo mi; +//ZZ vassert(e); +//ZZ vassert(ty == Ity_I64); +//ZZ +//ZZ if (e->tag == Iex_RdTmp) { +//ZZ return lookupIRTemp(env, e->Iex.RdTmp.tmp); +//ZZ } +//ZZ +//ZZ if (e->tag == Iex_Const) { +//ZZ HReg rLo, rHi; +//ZZ HReg res = newVRegD(env); +//ZZ iselInt64Expr(&rHi, &rLo, env, e); +//ZZ addInstr(env, ARMInstr_VXferD(True/*toD*/, res, rHi, rLo)); +//ZZ return res; +//ZZ } +//ZZ +//ZZ /* 64-bit load */ +//ZZ if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) { +//ZZ HReg res = newVRegD(env); +//ZZ ARMAModeN* am = iselIntExpr_AModeN(env, e->Iex.Load.addr); +//ZZ vassert(ty == Ity_I64); +//ZZ addInstr(env, ARMInstr_NLdStD(True, res, am)); +//ZZ return res; +//ZZ } +//ZZ +//ZZ /* 64-bit GET */ +//ZZ if (e->tag == Iex_Get) { +//ZZ HReg addr = newVRegI(env); +//ZZ HReg res = newVRegD(env); +//ZZ vassert(ty == Ity_I64); +//ZZ addInstr(env, ARMInstr_Add32(addr, hregARM_R8(), e->Iex.Get.offset)); +//ZZ addInstr(env, ARMInstr_NLdStD(True, res, mkARMAModeN_R(addr))); +//ZZ return res; +//ZZ } +//ZZ +//ZZ /* --------- BINARY ops --------- */ +//ZZ if (e->tag == Iex_Binop) { +//ZZ switch (e->Iex.Binop.op) { +//ZZ +//ZZ /* 32 x 32 -> 64 multiply */ +//ZZ case Iop_MullS32: +//ZZ case Iop_MullU32: { +//ZZ HReg rLo, rHi; +//ZZ HReg res = newVRegD(env); +//ZZ iselInt64Expr(&rHi, &rLo, env, e); +//ZZ addInstr(env, ARMInstr_VXferD(True/*toD*/, res, rHi, rLo)); +//ZZ return res; +//ZZ } +//ZZ +//ZZ case Iop_And64: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VAND, +//ZZ res, argL, argR, 4, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Or64: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VORR, +//ZZ res, argL, argR, 4, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Xor64: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VXOR, +//ZZ res, argL, argR, 4, False)); +//ZZ return res; +//ZZ } +//ZZ +//ZZ /* 32HLto64(e1,e2) */ +//ZZ case Iop_32HLto64: { +//ZZ HReg rHi = iselIntExpr_R(env, e->Iex.Binop.arg1); +//ZZ HReg rLo = iselIntExpr_R(env, e->Iex.Binop.arg2); +//ZZ HReg res = newVRegD(env); +//ZZ addInstr(env, ARMInstr_VXferD(True/*toD*/, res, rHi, rLo)); +//ZZ return res; +//ZZ } +//ZZ +//ZZ case Iop_Add8x8: +//ZZ case Iop_Add16x4: +//ZZ case 
Iop_Add32x2: +//ZZ case Iop_Add64: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_Add8x8: size = 0; break; +//ZZ case Iop_Add16x4: size = 1; break; +//ZZ case Iop_Add32x2: size = 2; break; +//ZZ case Iop_Add64: size = 3; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VADD, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Add32Fx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VADDFP, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Recps32Fx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VRECPS, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Rsqrts32Fx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VRSQRTS, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ +//ZZ // These 6 verified 18 Apr 2013 +//ZZ case Iop_InterleaveHI32x2: +//ZZ case Iop_InterleaveLO32x2: +//ZZ case Iop_InterleaveOddLanes8x8: +//ZZ case Iop_InterleaveEvenLanes8x8: +//ZZ case Iop_InterleaveOddLanes16x4: +//ZZ case Iop_InterleaveEvenLanes16x4: { +//ZZ HReg rD = newVRegD(env); +//ZZ HReg rM = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ Bool resRd; // is the result in rD or rM ? +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_InterleaveOddLanes8x8: resRd = False; size = 0; break; +//ZZ case Iop_InterleaveEvenLanes8x8: resRd = True; size = 0; break; +//ZZ case Iop_InterleaveOddLanes16x4: resRd = False; size = 1; break; +//ZZ case Iop_InterleaveEvenLanes16x4: resRd = True; size = 1; break; +//ZZ case Iop_InterleaveHI32x2: resRd = False; size = 2; break; +//ZZ case Iop_InterleaveLO32x2: resRd = True; size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rM, argL, 4, False)); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rD, argR, 4, False)); +//ZZ addInstr(env, ARMInstr_NDual(ARMneon_TRN, rD, rM, size, False)); +//ZZ return resRd ? rD : rM; +//ZZ } +//ZZ +//ZZ // These 4 verified 18 Apr 2013 +//ZZ case Iop_InterleaveHI8x8: +//ZZ case Iop_InterleaveLO8x8: +//ZZ case Iop_InterleaveHI16x4: +//ZZ case Iop_InterleaveLO16x4: { +//ZZ HReg rD = newVRegD(env); +//ZZ HReg rM = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ Bool resRd; // is the result in rD or rM ? 
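/* ---- Editor's illustrative sketch (not part of the patch) ---- */
/* Every disabled ARM32 case in this region maps the IR lane width onto
   the backend's "size" field with the same pattern: 8-bit lanes -> 0,
   16 -> 1, 32 -> 2, 64 -> 3 (i.e. log2(laneBits) - 3).  A standalone
   helper showing just that mapping, for reference while reading the
   switches below; 'laneBitsToSize' is hypothetical and does not exist
   in this file. */
#include <assert.h>

static unsigned laneBitsToSize ( unsigned laneBits )
{
   unsigned size = 0;
   while ((8u << size) < laneBits && size < 3)
      size++;
   assert((8u << size) == laneBits);   /* only 8, 16, 32, 64 are valid */
   return size;                        /* 8->0, 16->1, 32->2, 64->3 */
}
/* ---- end sketch ---- */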
+//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_InterleaveHI8x8: resRd = False; size = 0; break; +//ZZ case Iop_InterleaveLO8x8: resRd = True; size = 0; break; +//ZZ case Iop_InterleaveHI16x4: resRd = False; size = 1; break; +//ZZ case Iop_InterleaveLO16x4: resRd = True; size = 1; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rM, argL, 4, False)); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rD, argR, 4, False)); +//ZZ addInstr(env, ARMInstr_NDual(ARMneon_ZIP, rD, rM, size, False)); +//ZZ return resRd ? rD : rM; +//ZZ } +//ZZ +//ZZ // These 4 verified 18 Apr 2013 +//ZZ case Iop_CatOddLanes8x8: +//ZZ case Iop_CatEvenLanes8x8: +//ZZ case Iop_CatOddLanes16x4: +//ZZ case Iop_CatEvenLanes16x4: { +//ZZ HReg rD = newVRegD(env); +//ZZ HReg rM = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ Bool resRd; // is the result in rD or rM ? +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_CatOddLanes8x8: resRd = False; size = 0; break; +//ZZ case Iop_CatEvenLanes8x8: resRd = True; size = 0; break; +//ZZ case Iop_CatOddLanes16x4: resRd = False; size = 1; break; +//ZZ case Iop_CatEvenLanes16x4: resRd = True; size = 1; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rM, argL, 4, False)); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rD, argR, 4, False)); +//ZZ addInstr(env, ARMInstr_NDual(ARMneon_UZP, rD, rM, size, False)); +//ZZ return resRd ? rD : rM; +//ZZ } +//ZZ +//ZZ case Iop_QAdd8Ux8: +//ZZ case Iop_QAdd16Ux4: +//ZZ case Iop_QAdd32Ux2: +//ZZ case Iop_QAdd64Ux1: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_QAdd8Ux8: size = 0; break; +//ZZ case Iop_QAdd16Ux4: size = 1; break; +//ZZ case Iop_QAdd32Ux2: size = 2; break; +//ZZ case Iop_QAdd64Ux1: size = 3; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VQADDU, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_QAdd8Sx8: +//ZZ case Iop_QAdd16Sx4: +//ZZ case Iop_QAdd32Sx2: +//ZZ case Iop_QAdd64Sx1: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_QAdd8Sx8: size = 0; break; +//ZZ case Iop_QAdd16Sx4: size = 1; break; +//ZZ case Iop_QAdd32Sx2: size = 2; break; +//ZZ case Iop_QAdd64Sx1: size = 3; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VQADDS, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Sub8x8: +//ZZ case Iop_Sub16x4: +//ZZ case Iop_Sub32x2: +//ZZ case Iop_Sub64: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_Sub8x8: size = 0; break; +//ZZ case Iop_Sub16x4: size = 1; break; +//ZZ case Iop_Sub32x2: size = 2; break; +//ZZ case Iop_Sub64: size = 3; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VSUB, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Sub32Fx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ 
HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VSUBFP, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_QSub8Ux8: +//ZZ case Iop_QSub16Ux4: +//ZZ case Iop_QSub32Ux2: +//ZZ case Iop_QSub64Ux1: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_QSub8Ux8: size = 0; break; +//ZZ case Iop_QSub16Ux4: size = 1; break; +//ZZ case Iop_QSub32Ux2: size = 2; break; +//ZZ case Iop_QSub64Ux1: size = 3; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VQSUBU, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_QSub8Sx8: +//ZZ case Iop_QSub16Sx4: +//ZZ case Iop_QSub32Sx2: +//ZZ case Iop_QSub64Sx1: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_QSub8Sx8: size = 0; break; +//ZZ case Iop_QSub16Sx4: size = 1; break; +//ZZ case Iop_QSub32Sx2: size = 2; break; +//ZZ case Iop_QSub64Sx1: size = 3; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VQSUBS, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Max8Ux8: +//ZZ case Iop_Max16Ux4: +//ZZ case Iop_Max32Ux2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_Max8Ux8: size = 0; break; +//ZZ case Iop_Max16Ux4: size = 1; break; +//ZZ case Iop_Max32Ux2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VMAXU, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Max8Sx8: +//ZZ case Iop_Max16Sx4: +//ZZ case Iop_Max32Sx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_Max8Sx8: size = 0; break; +//ZZ case Iop_Max16Sx4: size = 1; break; +//ZZ case Iop_Max32Sx2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VMAXS, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Min8Ux8: +//ZZ case Iop_Min16Ux4: +//ZZ case Iop_Min32Ux2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_Min8Ux8: size = 0; break; +//ZZ case Iop_Min16Ux4: size = 1; break; +//ZZ case Iop_Min32Ux2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VMINU, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Min8Sx8: +//ZZ case Iop_Min16Sx4: +//ZZ case Iop_Min32Sx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_Min8Sx8: size = 0; break; +//ZZ case Iop_Min16Sx4: size = 1; break; +//ZZ case Iop_Min32Sx2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, 
ARMInstr_NBinary(ARMneon_VMINS, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Sar8x8: +//ZZ case Iop_Sar16x4: +//ZZ case Iop_Sar32x2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ HReg argR2 = newVRegD(env); +//ZZ HReg zero = newVRegD(env); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_Sar8x8: size = 0; break; +//ZZ case Iop_Sar16x4: size = 1; break; +//ZZ case Iop_Sar32x2: size = 2; break; +//ZZ case Iop_Sar64: size = 3; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NeonImm(zero, ARMNImm_TI(0,0))); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VSUB, +//ZZ argR2, zero, argR, size, False)); +//ZZ addInstr(env, ARMInstr_NShift(ARMneon_VSAL, +//ZZ res, argL, argR2, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Sal8x8: +//ZZ case Iop_Sal16x4: +//ZZ case Iop_Sal32x2: +//ZZ case Iop_Sal64x1: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_Sal8x8: size = 0; break; +//ZZ case Iop_Sal16x4: size = 1; break; +//ZZ case Iop_Sal32x2: size = 2; break; +//ZZ case Iop_Sal64x1: size = 3; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NShift(ARMneon_VSAL, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Shr8x8: +//ZZ case Iop_Shr16x4: +//ZZ case Iop_Shr32x2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ HReg argR2 = newVRegD(env); +//ZZ HReg zero = newVRegD(env); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_Shr8x8: size = 0; break; +//ZZ case Iop_Shr16x4: size = 1; break; +//ZZ case Iop_Shr32x2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NeonImm(zero, ARMNImm_TI(0,0))); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VSUB, +//ZZ argR2, zero, argR, size, False)); +//ZZ addInstr(env, ARMInstr_NShift(ARMneon_VSHL, +//ZZ res, argL, argR2, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Shl8x8: +//ZZ case Iop_Shl16x4: +//ZZ case Iop_Shl32x2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_Shl8x8: size = 0; break; +//ZZ case Iop_Shl16x4: size = 1; break; +//ZZ case Iop_Shl32x2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NShift(ARMneon_VSHL, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_QShl8x8: +//ZZ case Iop_QShl16x4: +//ZZ case Iop_QShl32x2: +//ZZ case Iop_QShl64x1: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_QShl8x8: size = 0; break; +//ZZ case Iop_QShl16x4: size = 1; break; +//ZZ case Iop_QShl32x2: size = 2; break; +//ZZ case Iop_QShl64x1: size = 3; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NShift(ARMneon_VQSHL, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_QSal8x8: +//ZZ case Iop_QSal16x4: +//ZZ case Iop_QSal32x2: +//ZZ case Iop_QSal64x1: { +//ZZ HReg res = 
newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_QSal8x8: size = 0; break; +//ZZ case Iop_QSal16x4: size = 1; break; +//ZZ case Iop_QSal32x2: size = 2; break; +//ZZ case Iop_QSal64x1: size = 3; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NShift(ARMneon_VQSAL, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_QShlN8x8: +//ZZ case Iop_QShlN16x4: +//ZZ case Iop_QShlN32x2: +//ZZ case Iop_QShlN64x1: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ UInt size, imm; +//ZZ if (e->Iex.Binop.arg2->tag != Iex_Const || +//ZZ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) { +//ZZ vpanic("ARM taget supports Iop_QShlNAxB with constant " +//ZZ "second argument only\n"); +//ZZ } +//ZZ imm = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_QShlN8x8: size = 8 | imm; break; +//ZZ case Iop_QShlN16x4: size = 16 | imm; break; +//ZZ case Iop_QShlN32x2: size = 32 | imm; break; +//ZZ case Iop_QShlN64x1: size = 64 | imm; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VQSHLNUU, +//ZZ res, argL, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_QShlN8Sx8: +//ZZ case Iop_QShlN16Sx4: +//ZZ case Iop_QShlN32Sx2: +//ZZ case Iop_QShlN64Sx1: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ UInt size, imm; +//ZZ if (e->Iex.Binop.arg2->tag != Iex_Const || +//ZZ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) { +//ZZ vpanic("ARM taget supports Iop_QShlNAxB with constant " +//ZZ "second argument only\n"); +//ZZ } +//ZZ imm = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_QShlN8Sx8: size = 8 | imm; break; +//ZZ case Iop_QShlN16Sx4: size = 16 | imm; break; +//ZZ case Iop_QShlN32Sx2: size = 32 | imm; break; +//ZZ case Iop_QShlN64Sx1: size = 64 | imm; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VQSHLNUS, +//ZZ res, argL, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_QSalN8x8: +//ZZ case Iop_QSalN16x4: +//ZZ case Iop_QSalN32x2: +//ZZ case Iop_QSalN64x1: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ UInt size, imm; +//ZZ if (e->Iex.Binop.arg2->tag != Iex_Const || +//ZZ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) { +//ZZ vpanic("ARM taget supports Iop_QShlNAxB with constant " +//ZZ "second argument only\n"); +//ZZ } +//ZZ imm = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_QSalN8x8: size = 8 | imm; break; +//ZZ case Iop_QSalN16x4: size = 16 | imm; break; +//ZZ case Iop_QSalN32x2: size = 32 | imm; break; +//ZZ case Iop_QSalN64x1: size = 64 | imm; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VQSHLNSS, +//ZZ res, argL, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_ShrN8x8: +//ZZ case Iop_ShrN16x4: +//ZZ case Iop_ShrN32x2: +//ZZ case Iop_Shr64: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg tmp = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2); +//ZZ HReg argR2 = newVRegI(env); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_ShrN8x8: size = 0; break; +//ZZ case Iop_ShrN16x4: size = 1; break; 
+//ZZ case Iop_ShrN32x2: size = 2; break; +//ZZ case Iop_Shr64: size = 3; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_Unary(ARMun_NEG, argR2, argR)); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_DUP, tmp, argR2, 0, False)); +//ZZ addInstr(env, ARMInstr_NShift(ARMneon_VSHL, +//ZZ res, argL, tmp, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_ShlN8x8: +//ZZ case Iop_ShlN16x4: +//ZZ case Iop_ShlN32x2: +//ZZ case Iop_Shl64: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg tmp = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ /* special-case Shl64(x, imm8) since the Neon front +//ZZ end produces a lot of those for V{LD,ST}{1,2,3,4}. */ +//ZZ if (e->Iex.Binop.op == Iop_Shl64 +//ZZ && e->Iex.Binop.arg2->tag == Iex_Const) { +//ZZ vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8); +//ZZ Int nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8; +//ZZ if (nshift >= 1 && nshift <= 63) { +//ZZ addInstr(env, ARMInstr_NShl64(res, argL, nshift)); +//ZZ return res; +//ZZ } +//ZZ /* else fall through to general case */ +//ZZ } +//ZZ HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_ShlN8x8: size = 0; break; +//ZZ case Iop_ShlN16x4: size = 1; break; +//ZZ case Iop_ShlN32x2: size = 2; break; +//ZZ case Iop_Shl64: size = 3; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_DUP, +//ZZ tmp, argR, 0, False)); +//ZZ addInstr(env, ARMInstr_NShift(ARMneon_VSHL, +//ZZ res, argL, tmp, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_SarN8x8: +//ZZ case Iop_SarN16x4: +//ZZ case Iop_SarN32x2: +//ZZ case Iop_Sar64: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg tmp = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2); +//ZZ HReg argR2 = newVRegI(env); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_SarN8x8: size = 0; break; +//ZZ case Iop_SarN16x4: size = 1; break; +//ZZ case Iop_SarN32x2: size = 2; break; +//ZZ case Iop_Sar64: size = 3; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_Unary(ARMun_NEG, argR2, argR)); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_DUP, tmp, argR2, 0, False)); +//ZZ addInstr(env, ARMInstr_NShift(ARMneon_VSAL, +//ZZ res, argL, tmp, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_CmpGT8Ux8: +//ZZ case Iop_CmpGT16Ux4: +//ZZ case Iop_CmpGT32Ux2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_CmpGT8Ux8: size = 0; break; +//ZZ case Iop_CmpGT16Ux4: size = 1; break; +//ZZ case Iop_CmpGT32Ux2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCGTU, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_CmpGT8Sx8: +//ZZ case Iop_CmpGT16Sx4: +//ZZ case Iop_CmpGT32Sx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_CmpGT8Sx8: size = 0; break; +//ZZ case Iop_CmpGT16Sx4: size = 1; break; +//ZZ case Iop_CmpGT32Sx2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCGTS, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_CmpEQ8x8: +//ZZ case 
Iop_CmpEQ16x4: +//ZZ case Iop_CmpEQ32x2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_CmpEQ8x8: size = 0; break; +//ZZ case Iop_CmpEQ16x4: size = 1; break; +//ZZ case Iop_CmpEQ32x2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCEQ, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Mul8x8: +//ZZ case Iop_Mul16x4: +//ZZ case Iop_Mul32x2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_Mul8x8: size = 0; break; +//ZZ case Iop_Mul16x4: size = 1; break; +//ZZ case Iop_Mul32x2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VMUL, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Mul32Fx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VMULFP, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_QDMulHi16Sx4: +//ZZ case Iop_QDMulHi32Sx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_QDMulHi16Sx4: size = 1; break; +//ZZ case Iop_QDMulHi32Sx2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VQDMULH, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ +//ZZ case Iop_QRDMulHi16Sx4: +//ZZ case Iop_QRDMulHi32Sx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_QRDMulHi16Sx4: size = 1; break; +//ZZ case Iop_QRDMulHi32Sx2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VQRDMULH, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ +//ZZ case Iop_PwAdd8x8: +//ZZ case Iop_PwAdd16x4: +//ZZ case Iop_PwAdd32x2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_PwAdd8x8: size = 0; break; +//ZZ case Iop_PwAdd16x4: size = 1; break; +//ZZ case Iop_PwAdd32x2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VPADD, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_PwAdd32Fx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VPADDFP, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_PwMin8Ux8: +//ZZ case Iop_PwMin16Ux4: +//ZZ case Iop_PwMin32Ux2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ 
switch(e->Iex.Binop.op) { +//ZZ case Iop_PwMin8Ux8: size = 0; break; +//ZZ case Iop_PwMin16Ux4: size = 1; break; +//ZZ case Iop_PwMin32Ux2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VPMINU, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_PwMin8Sx8: +//ZZ case Iop_PwMin16Sx4: +//ZZ case Iop_PwMin32Sx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_PwMin8Sx8: size = 0; break; +//ZZ case Iop_PwMin16Sx4: size = 1; break; +//ZZ case Iop_PwMin32Sx2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VPMINS, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_PwMax8Ux8: +//ZZ case Iop_PwMax16Ux4: +//ZZ case Iop_PwMax32Ux2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_PwMax8Ux8: size = 0; break; +//ZZ case Iop_PwMax16Ux4: size = 1; break; +//ZZ case Iop_PwMax32Ux2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VPMAXU, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_PwMax8Sx8: +//ZZ case Iop_PwMax16Sx4: +//ZZ case Iop_PwMax32Sx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_PwMax8Sx8: size = 0; break; +//ZZ case Iop_PwMax16Sx4: size = 1; break; +//ZZ case Iop_PwMax32Sx2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VPMAXS, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Perm8x8: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VTBL, +//ZZ res, argL, argR, 0, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_PolynomialMul8x8: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VMULP, +//ZZ res, argL, argR, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Max32Fx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VMAXF, +//ZZ res, argL, argR, 2, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Min32Fx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VMINF, +//ZZ res, argL, argR, 2, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_PwMax32Fx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VPMAXF, +//ZZ res, argL, argR, 2, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_PwMin32Fx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg 
argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VPMINF, +//ZZ res, argL, argR, 2, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_CmpGT32Fx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCGTF, +//ZZ res, argL, argR, 2, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_CmpGE32Fx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCGEF, +//ZZ res, argL, argR, 2, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_CmpEQ32Fx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCEQF, +//ZZ res, argL, argR, 2, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_F32ToFixed32Ux2_RZ: +//ZZ case Iop_F32ToFixed32Sx2_RZ: +//ZZ case Iop_Fixed32UToF32x2_RN: +//ZZ case Iop_Fixed32SToF32x2_RN: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ ARMNeonUnOp op; +//ZZ UInt imm6; +//ZZ if (e->Iex.Binop.arg2->tag != Iex_Const || +//ZZ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) { +//ZZ vpanic("ARM supports FP <-> Fixed conversion with constant " +//ZZ "second argument less than 33 only\n"); +//ZZ } +//ZZ imm6 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8; +//ZZ vassert(imm6 <= 32 && imm6 > 0); +//ZZ imm6 = 64 - imm6; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_F32ToFixed32Ux2_RZ: op = ARMneon_VCVTFtoFixedU; break; +//ZZ case Iop_F32ToFixed32Sx2_RZ: op = ARMneon_VCVTFtoFixedS; break; +//ZZ case Iop_Fixed32UToF32x2_RN: op = ARMneon_VCVTFixedUtoF; break; +//ZZ case Iop_Fixed32SToF32x2_RN: op = ARMneon_VCVTFixedStoF; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(op, res, arg, imm6, False)); +//ZZ return res; +//ZZ } +//ZZ /* +//ZZ FIXME: is this here or not? 
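/* ---- Editor's illustrative sketch (not part of the patch) ---- */
/* A scalar model of the F32 <-> Fixed32 lane conversions selected just
   above, under the usual reading of the IR names: "Fixed32 ... with n
   fractional bits", _RZ = truncate toward zero, _RN = round to nearest.
   'fbits' plays the role of the constant second argument, which the
   disabled ARM32 code checks is in 1..32 and encodes as 64 - fbits.
   Real hardware saturates on overflow; this sketch ignores out-of-range
   inputs, and both helper names are hypothetical. */
#include <math.h>
#include <stdint.h>

static int32_t f32ToFixed32S_RZ ( float x, int fbits )
{
   /* scale by 2^fbits, then truncate toward zero */
   return (int32_t)truncf(ldexpf(x, fbits));
}

static float fixed32SToF32_RN ( int32_t x, int fbits )
{
   /* int->float conversion rounds to nearest; the 2^-fbits scaling is exact */
   return ldexpf((float)x, -fbits);
}
/* ---- end sketch ---- */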
+//ZZ case Iop_VDup8x8: +//ZZ case Iop_VDup16x4: +//ZZ case Iop_VDup32x2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ UInt index; +//ZZ UInt imm4; +//ZZ UInt size = 0; +//ZZ if (e->Iex.Binop.arg2->tag != Iex_Const || +//ZZ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) { +//ZZ vpanic("ARM supports Iop_VDup with constant " +//ZZ "second argument less than 16 only\n"); +//ZZ } +//ZZ index = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_VDup8x8: imm4 = (index << 1) + 1; break; +//ZZ case Iop_VDup16x4: imm4 = (index << 2) + 2; break; +//ZZ case Iop_VDup32x2: imm4 = (index << 3) + 4; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ if (imm4 >= 16) { +//ZZ vpanic("ARM supports Iop_VDup with constant " +//ZZ "second argument less than 16 only\n"); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VDUP, +//ZZ res, argL, imm4, False)); +//ZZ return res; +//ZZ } +//ZZ */ +//ZZ default: +//ZZ break; +//ZZ } +//ZZ } +//ZZ +//ZZ /* --------- UNARY ops --------- */ +//ZZ if (e->tag == Iex_Unop) { +//ZZ switch (e->Iex.Unop.op) { +//ZZ +//ZZ /* 32Uto64 */ +//ZZ case Iop_32Uto64: { +//ZZ HReg rLo = iselIntExpr_R(env, e->Iex.Unop.arg); +//ZZ HReg rHi = newVRegI(env); +//ZZ HReg res = newVRegD(env); +//ZZ addInstr(env, ARMInstr_Imm32(rHi, 0)); +//ZZ addInstr(env, ARMInstr_VXferD(True/*toD*/, res, rHi, rLo)); +//ZZ return res; +//ZZ } +//ZZ +//ZZ /* 32Sto64 */ +//ZZ case Iop_32Sto64: { +//ZZ HReg rLo = iselIntExpr_R(env, e->Iex.Unop.arg); +//ZZ HReg rHi = newVRegI(env); +//ZZ addInstr(env, mk_iMOVds_RR(rHi, rLo)); +//ZZ addInstr(env, ARMInstr_Shift(ARMsh_SAR, rHi, rHi, ARMRI5_I5(31))); +//ZZ HReg res = newVRegD(env); +//ZZ addInstr(env, ARMInstr_VXferD(True/*toD*/, res, rHi, rLo)); +//ZZ return res; +//ZZ } +//ZZ +//ZZ /* The next 3 are pass-throughs */ +//ZZ /* ReinterpF64asI64 */ +//ZZ case Iop_ReinterpF64asI64: +//ZZ /* Left64(e) */ +//ZZ case Iop_Left64: +//ZZ /* CmpwNEZ64(e) */ +//ZZ case Iop_1Sto64: { +//ZZ HReg rLo, rHi; +//ZZ HReg res = newVRegD(env); +//ZZ iselInt64Expr(&rHi, &rLo, env, e); +//ZZ addInstr(env, ARMInstr_VXferD(True/*toD*/, res, rHi, rLo)); +//ZZ return res; +//ZZ } +//ZZ +//ZZ case Iop_Not64: { +//ZZ DECLARE_PATTERN(p_veqz_8x8); +//ZZ DECLARE_PATTERN(p_veqz_16x4); +//ZZ DECLARE_PATTERN(p_veqz_32x2); +//ZZ DECLARE_PATTERN(p_vcge_8sx8); +//ZZ DECLARE_PATTERN(p_vcge_16sx4); +//ZZ DECLARE_PATTERN(p_vcge_32sx2); +//ZZ DECLARE_PATTERN(p_vcge_8ux8); +//ZZ DECLARE_PATTERN(p_vcge_16ux4); +//ZZ DECLARE_PATTERN(p_vcge_32ux2); +//ZZ DEFINE_PATTERN(p_veqz_8x8, +//ZZ unop(Iop_Not64, unop(Iop_CmpNEZ8x8, bind(0)))); +//ZZ DEFINE_PATTERN(p_veqz_16x4, +//ZZ unop(Iop_Not64, unop(Iop_CmpNEZ16x4, bind(0)))); +//ZZ DEFINE_PATTERN(p_veqz_32x2, +//ZZ unop(Iop_Not64, unop(Iop_CmpNEZ32x2, bind(0)))); +//ZZ DEFINE_PATTERN(p_vcge_8sx8, +//ZZ unop(Iop_Not64, binop(Iop_CmpGT8Sx8, bind(1), bind(0)))); +//ZZ DEFINE_PATTERN(p_vcge_16sx4, +//ZZ unop(Iop_Not64, binop(Iop_CmpGT16Sx4, bind(1), bind(0)))); +//ZZ DEFINE_PATTERN(p_vcge_32sx2, +//ZZ unop(Iop_Not64, binop(Iop_CmpGT32Sx2, bind(1), bind(0)))); +//ZZ DEFINE_PATTERN(p_vcge_8ux8, +//ZZ unop(Iop_Not64, binop(Iop_CmpGT8Ux8, bind(1), bind(0)))); +//ZZ DEFINE_PATTERN(p_vcge_16ux4, +//ZZ unop(Iop_Not64, binop(Iop_CmpGT16Ux4, bind(1), bind(0)))); +//ZZ DEFINE_PATTERN(p_vcge_32ux2, +//ZZ unop(Iop_Not64, binop(Iop_CmpGT32Ux2, bind(1), bind(0)))); +//ZZ if (matchIRExpr(&mi, p_veqz_8x8, e)) { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, 
mi.bindee[0]); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_EQZ, res, arg, 0, False)); +//ZZ return res; +//ZZ } else if (matchIRExpr(&mi, p_veqz_16x4, e)) { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, mi.bindee[0]); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_EQZ, res, arg, 1, False)); +//ZZ return res; +//ZZ } else if (matchIRExpr(&mi, p_veqz_32x2, e)) { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, mi.bindee[0]); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_EQZ, res, arg, 2, False)); +//ZZ return res; +//ZZ } else if (matchIRExpr(&mi, p_vcge_8sx8, e)) { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, mi.bindee[0]); +//ZZ HReg argR = iselNeon64Expr(env, mi.bindee[1]); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCGES, +//ZZ res, argL, argR, 0, False)); +//ZZ return res; +//ZZ } else if (matchIRExpr(&mi, p_vcge_16sx4, e)) { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, mi.bindee[0]); +//ZZ HReg argR = iselNeon64Expr(env, mi.bindee[1]); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCGES, +//ZZ res, argL, argR, 1, False)); +//ZZ return res; +//ZZ } else if (matchIRExpr(&mi, p_vcge_32sx2, e)) { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, mi.bindee[0]); +//ZZ HReg argR = iselNeon64Expr(env, mi.bindee[1]); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCGES, +//ZZ res, argL, argR, 2, False)); +//ZZ return res; +//ZZ } else if (matchIRExpr(&mi, p_vcge_8ux8, e)) { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, mi.bindee[0]); +//ZZ HReg argR = iselNeon64Expr(env, mi.bindee[1]); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCGEU, +//ZZ res, argL, argR, 0, False)); +//ZZ return res; +//ZZ } else if (matchIRExpr(&mi, p_vcge_16ux4, e)) { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, mi.bindee[0]); +//ZZ HReg argR = iselNeon64Expr(env, mi.bindee[1]); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCGEU, +//ZZ res, argL, argR, 1, False)); +//ZZ return res; +//ZZ } else if (matchIRExpr(&mi, p_vcge_32ux2, e)) { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, mi.bindee[0]); +//ZZ HReg argR = iselNeon64Expr(env, mi.bindee[1]); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCGEU, +//ZZ res, argL, argR, 2, False)); +//ZZ return res; +//ZZ } else { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_NOT, res, arg, 4, False)); +//ZZ return res; +//ZZ } +//ZZ } +//ZZ case Iop_Dup8x8: +//ZZ case Iop_Dup16x4: +//ZZ case Iop_Dup32x2: { +//ZZ HReg res, arg; +//ZZ UInt size; +//ZZ DECLARE_PATTERN(p_vdup_8x8); +//ZZ DECLARE_PATTERN(p_vdup_16x4); +//ZZ DECLARE_PATTERN(p_vdup_32x2); +//ZZ DEFINE_PATTERN(p_vdup_8x8, +//ZZ unop(Iop_Dup8x8, binop(Iop_GetElem8x8, bind(0), bind(1)))); +//ZZ DEFINE_PATTERN(p_vdup_16x4, +//ZZ unop(Iop_Dup16x4, binop(Iop_GetElem16x4, bind(0), bind(1)))); +//ZZ DEFINE_PATTERN(p_vdup_32x2, +//ZZ unop(Iop_Dup32x2, binop(Iop_GetElem32x2, bind(0), bind(1)))); +//ZZ if (matchIRExpr(&mi, p_vdup_8x8, e)) { +//ZZ UInt index; +//ZZ UInt imm4; +//ZZ if (mi.bindee[1]->tag == Iex_Const && +//ZZ typeOfIRExpr(env->type_env, mi.bindee[1]) == Ity_I8) { +//ZZ index = mi.bindee[1]->Iex.Const.con->Ico.U8; +//ZZ imm4 = (index << 1) + 1; +//ZZ if (index < 8) { +//ZZ res = newVRegD(env); +//ZZ arg = iselNeon64Expr(env, mi.bindee[0]); +//ZZ addInstr(env, ARMInstr_NUnaryS( +//ZZ ARMneon_VDUP, +//ZZ mkARMNRS(ARMNRS_Reg, res, 0), +//ZZ mkARMNRS(ARMNRS_Scalar, 
arg, index), +//ZZ imm4, False +//ZZ )); +//ZZ return res; +//ZZ } +//ZZ } +//ZZ } else if (matchIRExpr(&mi, p_vdup_16x4, e)) { +//ZZ UInt index; +//ZZ UInt imm4; +//ZZ if (mi.bindee[1]->tag == Iex_Const && +//ZZ typeOfIRExpr(env->type_env, mi.bindee[1]) == Ity_I8) { +//ZZ index = mi.bindee[1]->Iex.Const.con->Ico.U8; +//ZZ imm4 = (index << 2) + 2; +//ZZ if (index < 4) { +//ZZ res = newVRegD(env); +//ZZ arg = iselNeon64Expr(env, mi.bindee[0]); +//ZZ addInstr(env, ARMInstr_NUnaryS( +//ZZ ARMneon_VDUP, +//ZZ mkARMNRS(ARMNRS_Reg, res, 0), +//ZZ mkARMNRS(ARMNRS_Scalar, arg, index), +//ZZ imm4, False +//ZZ )); +//ZZ return res; +//ZZ } +//ZZ } +//ZZ } else if (matchIRExpr(&mi, p_vdup_32x2, e)) { +//ZZ UInt index; +//ZZ UInt imm4; +//ZZ if (mi.bindee[1]->tag == Iex_Const && +//ZZ typeOfIRExpr(env->type_env, mi.bindee[1]) == Ity_I8) { +//ZZ index = mi.bindee[1]->Iex.Const.con->Ico.U8; +//ZZ imm4 = (index << 3) + 4; +//ZZ if (index < 2) { +//ZZ res = newVRegD(env); +//ZZ arg = iselNeon64Expr(env, mi.bindee[0]); +//ZZ addInstr(env, ARMInstr_NUnaryS( +//ZZ ARMneon_VDUP, +//ZZ mkARMNRS(ARMNRS_Reg, res, 0), +//ZZ mkARMNRS(ARMNRS_Scalar, arg, index), +//ZZ imm4, False +//ZZ )); +//ZZ return res; +//ZZ } +//ZZ } +//ZZ } +//ZZ arg = iselIntExpr_R(env, e->Iex.Unop.arg); +//ZZ res = newVRegD(env); +//ZZ switch (e->Iex.Unop.op) { +//ZZ case Iop_Dup8x8: size = 0; break; +//ZZ case Iop_Dup16x4: size = 1; break; +//ZZ case Iop_Dup32x2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_DUP, res, arg, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Abs8x8: +//ZZ case Iop_Abs16x4: +//ZZ case Iop_Abs32x2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_Abs8x8: size = 0; break; +//ZZ case Iop_Abs16x4: size = 1; break; +//ZZ case Iop_Abs32x2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_ABS, res, arg, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Reverse64_8x8: +//ZZ case Iop_Reverse64_16x4: +//ZZ case Iop_Reverse64_32x2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_Reverse64_8x8: size = 0; break; +//ZZ case Iop_Reverse64_16x4: size = 1; break; +//ZZ case Iop_Reverse64_32x2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_REV64, +//ZZ res, arg, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Reverse32_8x8: +//ZZ case Iop_Reverse32_16x4: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_Reverse32_8x8: size = 0; break; +//ZZ case Iop_Reverse32_16x4: size = 1; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_REV32, +//ZZ res, arg, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Reverse16_8x8: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_REV16, +//ZZ res, arg, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_CmpwNEZ64: { +//ZZ HReg x_lsh = newVRegD(env); +//ZZ HReg x_rsh = newVRegD(env); +//ZZ HReg lsh_amt = newVRegD(env); +//ZZ HReg rsh_amt = newVRegD(env); +//ZZ HReg zero = newVRegD(env); +//ZZ HReg tmp = newVRegD(env); +//ZZ HReg tmp2 = newVRegD(env); +//ZZ HReg res = 
newVRegD(env); +//ZZ HReg x = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_EQZ, tmp2, arg, 2, False)); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_NOT, x, tmp2, 4, False)); +//ZZ addInstr(env, ARMInstr_NeonImm(lsh_amt, ARMNImm_TI(0, 32))); +//ZZ addInstr(env, ARMInstr_NeonImm(zero, ARMNImm_TI(0, 0))); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VSUB, +//ZZ rsh_amt, zero, lsh_amt, 2, False)); +//ZZ addInstr(env, ARMInstr_NShift(ARMneon_VSHL, +//ZZ x_lsh, x, lsh_amt, 3, False)); +//ZZ addInstr(env, ARMInstr_NShift(ARMneon_VSHL, +//ZZ x_rsh, x, rsh_amt, 3, False)); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VORR, +//ZZ tmp, x_lsh, x_rsh, 0, False)); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VORR, +//ZZ res, tmp, x, 0, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_CmpNEZ8x8: +//ZZ case Iop_CmpNEZ16x4: +//ZZ case Iop_CmpNEZ32x2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg tmp = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ UInt size; +//ZZ switch (e->Iex.Unop.op) { +//ZZ case Iop_CmpNEZ8x8: size = 0; break; +//ZZ case Iop_CmpNEZ16x4: size = 1; break; +//ZZ case Iop_CmpNEZ32x2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_EQZ, tmp, arg, size, False)); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_NOT, res, tmp, 4, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_NarrowUn16to8x8: +//ZZ case Iop_NarrowUn32to16x4: +//ZZ case Iop_NarrowUn64to32x2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_NarrowUn16to8x8: size = 0; break; +//ZZ case Iop_NarrowUn32to16x4: size = 1; break; +//ZZ case Iop_NarrowUn64to32x2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_COPYN, +//ZZ res, arg, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_QNarrowUn16Sto8Sx8: +//ZZ case Iop_QNarrowUn32Sto16Sx4: +//ZZ case Iop_QNarrowUn64Sto32Sx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_QNarrowUn16Sto8Sx8: size = 0; break; +//ZZ case Iop_QNarrowUn32Sto16Sx4: size = 1; break; +//ZZ case Iop_QNarrowUn64Sto32Sx2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_COPYQNSS, +//ZZ res, arg, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_QNarrowUn16Sto8Ux8: +//ZZ case Iop_QNarrowUn32Sto16Ux4: +//ZZ case Iop_QNarrowUn64Sto32Ux2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_QNarrowUn16Sto8Ux8: size = 0; break; +//ZZ case Iop_QNarrowUn32Sto16Ux4: size = 1; break; +//ZZ case Iop_QNarrowUn64Sto32Ux2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_COPYQNUS, +//ZZ res, arg, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_QNarrowUn16Uto8Ux8: +//ZZ case Iop_QNarrowUn32Uto16Ux4: +//ZZ case Iop_QNarrowUn64Uto32Ux2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_QNarrowUn16Uto8Ux8: size = 0; break; +//ZZ case Iop_QNarrowUn32Uto16Ux4: size = 1; break; +//ZZ case Iop_QNarrowUn64Uto32Ux2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_COPYQNUU, +//ZZ 
res, arg, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_PwAddL8Sx8: +//ZZ case Iop_PwAddL16Sx4: +//ZZ case Iop_PwAddL32Sx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_PwAddL8Sx8: size = 0; break; +//ZZ case Iop_PwAddL16Sx4: size = 1; break; +//ZZ case Iop_PwAddL32Sx2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_PADDLS, +//ZZ res, arg, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_PwAddL8Ux8: +//ZZ case Iop_PwAddL16Ux4: +//ZZ case Iop_PwAddL32Ux2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_PwAddL8Ux8: size = 0; break; +//ZZ case Iop_PwAddL16Ux4: size = 1; break; +//ZZ case Iop_PwAddL32Ux2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_PADDLU, +//ZZ res, arg, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Cnt8x8: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_CNT, +//ZZ res, arg, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Clz8Sx8: +//ZZ case Iop_Clz16Sx4: +//ZZ case Iop_Clz32Sx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_Clz8Sx8: size = 0; break; +//ZZ case Iop_Clz16Sx4: size = 1; break; +//ZZ case Iop_Clz32Sx2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_CLZ, +//ZZ res, arg, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Cls8Sx8: +//ZZ case Iop_Cls16Sx4: +//ZZ case Iop_Cls32Sx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_Cls8Sx8: size = 0; break; +//ZZ case Iop_Cls16Sx4: size = 1; break; +//ZZ case Iop_Cls32Sx2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_CLS, +//ZZ res, arg, size, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_FtoI32Sx2_RZ: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VCVTFtoS, +//ZZ res, arg, 2, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_FtoI32Ux2_RZ: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VCVTFtoU, +//ZZ res, arg, 2, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_I32StoFx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VCVTStoF, +//ZZ res, arg, 2, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_I32UtoFx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VCVTUtoF, +//ZZ res, arg, 2, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_F32toF16x4: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VCVTF32toF16, +//ZZ res, arg, 2, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Recip32Fx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ addInstr(env, 
ARMInstr_NUnary(ARMneon_VRECIPF, +//ZZ res, argL, 0, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Recip32x2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VRECIP, +//ZZ res, argL, 0, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Abs32Fx2: { +//ZZ DECLARE_PATTERN(p_vabd_32fx2); +//ZZ DEFINE_PATTERN(p_vabd_32fx2, +//ZZ unop(Iop_Abs32Fx2, +//ZZ binop(Iop_Sub32Fx2, +//ZZ bind(0), +//ZZ bind(1)))); +//ZZ if (matchIRExpr(&mi, p_vabd_32fx2, e)) { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, mi.bindee[0]); +//ZZ HReg argR = iselNeon64Expr(env, mi.bindee[1]); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VABDFP, +//ZZ res, argL, argR, 0, False)); +//ZZ return res; +//ZZ } else { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VABSFP, +//ZZ res, arg, 0, False)); +//ZZ return res; +//ZZ } +//ZZ } +//ZZ case Iop_Rsqrte32Fx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VRSQRTEFP, +//ZZ res, arg, 0, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Rsqrte32x2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VRSQRTE, +//ZZ res, arg, 0, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Neg32Fx2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VNEGF, +//ZZ res, arg, 0, False)); +//ZZ return res; +//ZZ } +//ZZ default: +//ZZ break; +//ZZ } +//ZZ } /* if (e->tag == Iex_Unop) */ +//ZZ +//ZZ if (e->tag == Iex_Triop) { +//ZZ IRTriop *triop = e->Iex.Triop.details; +//ZZ +//ZZ switch (triop->op) { +//ZZ case Iop_Extract64: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg argL = iselNeon64Expr(env, triop->arg1); +//ZZ HReg argR = iselNeon64Expr(env, triop->arg2); +//ZZ UInt imm4; +//ZZ if (triop->arg3->tag != Iex_Const || +//ZZ typeOfIRExpr(env->type_env, triop->arg3) != Ity_I8) { +//ZZ vpanic("ARM target supports Iop_Extract64 with constant " +//ZZ "third argument less than 16 only\n"); +//ZZ } +//ZZ imm4 = triop->arg3->Iex.Const.con->Ico.U8; +//ZZ if (imm4 >= 8) { +//ZZ vpanic("ARM target supports Iop_Extract64 with constant " +//ZZ "third argument less than 16 only\n"); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VEXT, +//ZZ res, argL, argR, imm4, False)); +//ZZ return res; +//ZZ } +//ZZ case Iop_SetElem8x8: +//ZZ case Iop_SetElem16x4: +//ZZ case Iop_SetElem32x2: { +//ZZ HReg res = newVRegD(env); +//ZZ HReg dreg = iselNeon64Expr(env, triop->arg1); +//ZZ HReg arg = iselIntExpr_R(env, triop->arg3); +//ZZ UInt index, size; +//ZZ if (triop->arg2->tag != Iex_Const || +//ZZ typeOfIRExpr(env->type_env, triop->arg2) != Ity_I8) { +//ZZ vpanic("ARM target supports SetElem with constant " +//ZZ "second argument only\n"); +//ZZ } +//ZZ index = triop->arg2->Iex.Const.con->Ico.U8; +//ZZ switch (triop->op) { +//ZZ case Iop_SetElem8x8: vassert(index < 8); size = 0; break; +//ZZ case Iop_SetElem16x4: vassert(index < 4); size = 1; break; +//ZZ case Iop_SetElem32x2: vassert(index < 2); size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_COPY, res, dreg, 4, False)); +//ZZ addInstr(env, ARMInstr_NUnaryS(ARMneon_SETELEM, +//ZZ mkARMNRS(ARMNRS_Scalar, res, index), +//ZZ mkARMNRS(ARMNRS_Reg, arg, 0), +//ZZ size, False)); +//ZZ 
return res; +//ZZ } +//ZZ default: +//ZZ break; +//ZZ } +//ZZ } +//ZZ +//ZZ /* --------- MULTIPLEX --------- */ +//ZZ if (e->tag == Iex_ITE) { // VFD +//ZZ HReg rLo, rHi; +//ZZ HReg res = newVRegD(env); +//ZZ iselInt64Expr(&rHi, &rLo, env, e); +//ZZ addInstr(env, ARMInstr_VXferD(True/*toD*/, res, rHi, rLo)); +//ZZ return res; +//ZZ } +//ZZ +//ZZ ppIRExpr(e); +//ZZ vpanic("iselNeon64Expr"); +//ZZ } + + +/*---------------------------------------------------------*/ +/*--- ISEL: Vector (NEON) expressions (128 bit) ---*/ +/*---------------------------------------------------------*/ + +static HReg iselV128Expr ( ISelEnv* env, IRExpr* e ) +{ + HReg r = iselV128Expr_wrk( env, e ); + vassert(hregClass(r) == HRcVec128); + vassert(hregIsVirtual(r)); + return r; +} + +/* DO NOT CALL THIS DIRECTLY */ +static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) +{ + IRType ty = typeOfIRExpr(env->type_env, e); + vassert(e); + vassert(ty == Ity_V128); + + if (e->tag == Iex_RdTmp) { + return lookupIRTemp(env, e->Iex.RdTmp.tmp); + } + + if (e->tag == Iex_Const) { + /* Only a very limited range of constants is handled. */ + vassert(e->Iex.Const.con->tag == Ico_V128); + UShort con = e->Iex.Const.con->Ico.V128; + if (con == 0x0000) { + HReg res = newVRegV(env); + addInstr(env, ARM64Instr_VImmQ(res, con)); + return res; + } + /* Unhandled */ + goto v128_expr_bad; + } + + if (e->tag == Iex_Load) { + HReg res = newVRegV(env); + HReg rN = iselIntExpr_R(env, e->Iex.Load.addr); + vassert(ty == Ity_V128); + addInstr(env, ARM64Instr_VLdStQ(True/*isLoad*/, res, rN)); + return res; + } + + if (e->tag == Iex_Get) { + UInt offs = (UInt)e->Iex.Get.offset; + if (offs < (1<<12)) { + HReg addr = mk_baseblock_128bit_access_addr(env, offs); + HReg res = newVRegV(env); + vassert(ty == Ity_V128); + addInstr(env, ARM64Instr_VLdStQ(True/*isLoad*/, res, addr)); + return res; + } + goto v128_expr_bad; + } + + if (e->tag == Iex_Unop) { + + /* Iop_ZeroHIXXofV128 cases */ + UShort imm16 = 0; + switch (e->Iex.Unop.op) { + case Iop_ZeroHI64ofV128: imm16 = 0x00FF; break; + case Iop_ZeroHI96ofV128: imm16 = 0x000F; break; + case Iop_ZeroHI112ofV128: imm16 = 0x0003; break; + case Iop_ZeroHI120ofV128: imm16 = 0x0001; break; + default: break; + } + if (imm16 != 0) { + HReg src = iselV128Expr(env, e->Iex.Unop.arg); + HReg imm = newVRegV(env); + HReg res = newVRegV(env); + addInstr(env, ARM64Instr_VImmQ(imm, imm16)); + addInstr(env, ARM64Instr_VBinV(ARM64vecb_AND, res, src, imm)); + return res; + } + + /* Other cases */ + switch (e->Iex.Unop.op) { + case Iop_NotV128: + case Iop_Abs64Fx2: + case Iop_Abs32Fx4: + case Iop_Neg64Fx2: + case Iop_Neg32Fx4: { + HReg res = newVRegV(env); + HReg arg = iselV128Expr(env, e->Iex.Unop.arg); + ARM64VecUnaryOp op = ARM64vecu_INVALID; + switch (e->Iex.Unop.op) { + case Iop_NotV128: op = ARM64vecu_NOT; break; + case Iop_Abs64Fx2: op = ARM64vecu_FABS64x2; break; + case Iop_Abs32Fx4: op = ARM64vecu_FABS32x4; break; + case Iop_Neg64Fx2: op = ARM64vecu_FNEG64x2; break; + case Iop_Neg32Fx4: op = ARM64vecu_FNEG32x4; break; + default: vassert(0); + } + addInstr(env, ARM64Instr_VUnaryV(op, res, arg)); + return res; + } + case Iop_CmpNEZ8x16: + case Iop_CmpNEZ16x8: + case Iop_CmpNEZ32x4: + case Iop_CmpNEZ64x2: { + HReg arg = iselV128Expr(env, e->Iex.Unop.arg); + HReg zero = newVRegV(env); + HReg res = newVRegV(env); + ARM64VecBinOp cmp = ARM64vecb_INVALID; + switch (e->Iex.Unop.op) { + case Iop_CmpNEZ64x2: cmp = ARM64vecb_CMEQ64x2; break; + case Iop_CmpNEZ32x4: cmp = ARM64vecb_CMEQ32x4; break; + case Iop_CmpNEZ16x8: cmp = 
ARM64vecb_CMEQ16x8; break; + case Iop_CmpNEZ8x16: cmp = ARM64vecb_CMEQ8x16; break; + default: vassert(0); + } + // This is pretty feeble. Better: use CMP against zero + // and avoid the extra instruction and extra register. + addInstr(env, ARM64Instr_VImmQ(zero, 0x0000)); + addInstr(env, ARM64Instr_VBinV(cmp, res, arg, zero)); + addInstr(env, ARM64Instr_VUnaryV(ARM64vecu_NOT, res, res)); + return res; + } + +//ZZ case Iop_NotV128: { +//ZZ DECLARE_PATTERN(p_veqz_8x16); +//ZZ DECLARE_PATTERN(p_veqz_16x8); +//ZZ DECLARE_PATTERN(p_veqz_32x4); +//ZZ DECLARE_PATTERN(p_vcge_8sx16); +//ZZ DECLARE_PATTERN(p_vcge_16sx8); +//ZZ DECLARE_PATTERN(p_vcge_32sx4); +//ZZ DECLARE_PATTERN(p_vcge_8ux16); +//ZZ DECLARE_PATTERN(p_vcge_16ux8); +//ZZ DECLARE_PATTERN(p_vcge_32ux4); +//ZZ DEFINE_PATTERN(p_veqz_8x16, +//ZZ unop(Iop_NotV128, unop(Iop_CmpNEZ8x16, bind(0)))); +//ZZ DEFINE_PATTERN(p_veqz_16x8, +//ZZ unop(Iop_NotV128, unop(Iop_CmpNEZ16x8, bind(0)))); +//ZZ DEFINE_PATTERN(p_veqz_32x4, +//ZZ unop(Iop_NotV128, unop(Iop_CmpNEZ32x4, bind(0)))); +//ZZ DEFINE_PATTERN(p_vcge_8sx16, +//ZZ unop(Iop_NotV128, binop(Iop_CmpGT8Sx16, bind(1), bind(0)))); +//ZZ DEFINE_PATTERN(p_vcge_16sx8, +//ZZ unop(Iop_NotV128, binop(Iop_CmpGT16Sx8, bind(1), bind(0)))); +//ZZ DEFINE_PATTERN(p_vcge_32sx4, +//ZZ unop(Iop_NotV128, binop(Iop_CmpGT32Sx4, bind(1), bind(0)))); +//ZZ DEFINE_PATTERN(p_vcge_8ux16, +//ZZ unop(Iop_NotV128, binop(Iop_CmpGT8Ux16, bind(1), bind(0)))); +//ZZ DEFINE_PATTERN(p_vcge_16ux8, +//ZZ unop(Iop_NotV128, binop(Iop_CmpGT16Ux8, bind(1), bind(0)))); +//ZZ DEFINE_PATTERN(p_vcge_32ux4, +//ZZ unop(Iop_NotV128, binop(Iop_CmpGT32Ux4, bind(1), bind(0)))); +//ZZ if (matchIRExpr(&mi, p_veqz_8x16, e)) { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeonExpr(env, mi.bindee[0]); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_EQZ, res, arg, 0, True)); +//ZZ return res; +//ZZ } else if (matchIRExpr(&mi, p_veqz_16x8, e)) { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeonExpr(env, mi.bindee[0]); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_EQZ, res, arg, 1, True)); +//ZZ return res; +//ZZ } else if (matchIRExpr(&mi, p_veqz_32x4, e)) { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeonExpr(env, mi.bindee[0]); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_EQZ, res, arg, 2, True)); +//ZZ return res; +//ZZ } else if (matchIRExpr(&mi, p_vcge_8sx16, e)) { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, mi.bindee[0]); +//ZZ HReg argR = iselNeonExpr(env, mi.bindee[1]); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCGES, +//ZZ res, argL, argR, 0, True)); +//ZZ return res; +//ZZ } else if (matchIRExpr(&mi, p_vcge_16sx8, e)) { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, mi.bindee[0]); +//ZZ HReg argR = iselNeonExpr(env, mi.bindee[1]); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCGES, +//ZZ res, argL, argR, 1, True)); +//ZZ return res; +//ZZ } else if (matchIRExpr(&mi, p_vcge_32sx4, e)) { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, mi.bindee[0]); +//ZZ HReg argR = iselNeonExpr(env, mi.bindee[1]); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCGES, +//ZZ res, argL, argR, 2, True)); +//ZZ return res; +//ZZ } else if (matchIRExpr(&mi, p_vcge_8ux16, e)) { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, mi.bindee[0]); +//ZZ HReg argR = iselNeonExpr(env, mi.bindee[1]); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCGEU, +//ZZ res, argL, argR, 0, True)); +//ZZ return res; +//ZZ } else if (matchIRExpr(&mi, p_vcge_16ux8, e)) { +//ZZ HReg res = 
newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, mi.bindee[0]); +//ZZ HReg argR = iselNeonExpr(env, mi.bindee[1]); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCGEU, +//ZZ res, argL, argR, 1, True)); +//ZZ return res; +//ZZ } else if (matchIRExpr(&mi, p_vcge_32ux4, e)) { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, mi.bindee[0]); +//ZZ HReg argR = iselNeonExpr(env, mi.bindee[1]); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCGEU, +//ZZ res, argL, argR, 2, True)); +//ZZ return res; +//ZZ } else { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_NOT, res, arg, 4, True)); +//ZZ return res; +//ZZ } +//ZZ } +//ZZ case Iop_Dup8x16: +//ZZ case Iop_Dup16x8: +//ZZ case Iop_Dup32x4: { +//ZZ HReg res, arg; +//ZZ UInt size; +//ZZ DECLARE_PATTERN(p_vdup_8x16); +//ZZ DECLARE_PATTERN(p_vdup_16x8); +//ZZ DECLARE_PATTERN(p_vdup_32x4); +//ZZ DEFINE_PATTERN(p_vdup_8x16, +//ZZ unop(Iop_Dup8x16, binop(Iop_GetElem8x8, bind(0), bind(1)))); +//ZZ DEFINE_PATTERN(p_vdup_16x8, +//ZZ unop(Iop_Dup16x8, binop(Iop_GetElem16x4, bind(0), bind(1)))); +//ZZ DEFINE_PATTERN(p_vdup_32x4, +//ZZ unop(Iop_Dup32x4, binop(Iop_GetElem32x2, bind(0), bind(1)))); +//ZZ if (matchIRExpr(&mi, p_vdup_8x16, e)) { +//ZZ UInt index; +//ZZ UInt imm4; +//ZZ if (mi.bindee[1]->tag == Iex_Const && +//ZZ typeOfIRExpr(env->type_env, mi.bindee[1]) == Ity_I8) { +//ZZ index = mi.bindee[1]->Iex.Const.con->Ico.U8; +//ZZ imm4 = (index << 1) + 1; +//ZZ if (index < 8) { +//ZZ res = newVRegV(env); +//ZZ arg = iselNeon64Expr(env, mi.bindee[0]); +//ZZ addInstr(env, ARMInstr_NUnaryS( +//ZZ ARMneon_VDUP, +//ZZ mkARMNRS(ARMNRS_Reg, res, 0), +//ZZ mkARMNRS(ARMNRS_Scalar, arg, index), +//ZZ imm4, True +//ZZ )); +//ZZ return res; +//ZZ } +//ZZ } +//ZZ } else if (matchIRExpr(&mi, p_vdup_16x8, e)) { +//ZZ UInt index; +//ZZ UInt imm4; +//ZZ if (mi.bindee[1]->tag == Iex_Const && +//ZZ typeOfIRExpr(env->type_env, mi.bindee[1]) == Ity_I8) { +//ZZ index = mi.bindee[1]->Iex.Const.con->Ico.U8; +//ZZ imm4 = (index << 2) + 2; +//ZZ if (index < 4) { +//ZZ res = newVRegV(env); +//ZZ arg = iselNeon64Expr(env, mi.bindee[0]); +//ZZ addInstr(env, ARMInstr_NUnaryS( +//ZZ ARMneon_VDUP, +//ZZ mkARMNRS(ARMNRS_Reg, res, 0), +//ZZ mkARMNRS(ARMNRS_Scalar, arg, index), +//ZZ imm4, True +//ZZ )); +//ZZ return res; +//ZZ } +//ZZ } +//ZZ } else if (matchIRExpr(&mi, p_vdup_32x4, e)) { +//ZZ UInt index; +//ZZ UInt imm4; +//ZZ if (mi.bindee[1]->tag == Iex_Const && +//ZZ typeOfIRExpr(env->type_env, mi.bindee[1]) == Ity_I8) { +//ZZ index = mi.bindee[1]->Iex.Const.con->Ico.U8; +//ZZ imm4 = (index << 3) + 4; +//ZZ if (index < 2) { +//ZZ res = newVRegV(env); +//ZZ arg = iselNeon64Expr(env, mi.bindee[0]); +//ZZ addInstr(env, ARMInstr_NUnaryS( +//ZZ ARMneon_VDUP, +//ZZ mkARMNRS(ARMNRS_Reg, res, 0), +//ZZ mkARMNRS(ARMNRS_Scalar, arg, index), +//ZZ imm4, True +//ZZ )); +//ZZ return res; +//ZZ } +//ZZ } +//ZZ } +//ZZ arg = iselIntExpr_R(env, e->Iex.Unop.arg); +//ZZ res = newVRegV(env); +//ZZ switch (e->Iex.Unop.op) { +//ZZ case Iop_Dup8x16: size = 0; break; +//ZZ case Iop_Dup16x8: size = 1; break; +//ZZ case Iop_Dup32x4: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_DUP, res, arg, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Abs8x16: +//ZZ case Iop_Abs16x8: +//ZZ case Iop_Abs32x4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_Abs8x16: 
size = 0; break; +//ZZ case Iop_Abs16x8: size = 1; break; +//ZZ case Iop_Abs32x4: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_ABS, res, arg, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Reverse64_8x16: +//ZZ case Iop_Reverse64_16x8: +//ZZ case Iop_Reverse64_32x4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_Reverse64_8x16: size = 0; break; +//ZZ case Iop_Reverse64_16x8: size = 1; break; +//ZZ case Iop_Reverse64_32x4: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_REV64, +//ZZ res, arg, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Reverse32_8x16: +//ZZ case Iop_Reverse32_16x8: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_Reverse32_8x16: size = 0; break; +//ZZ case Iop_Reverse32_16x8: size = 1; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_REV32, +//ZZ res, arg, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Reverse16_8x16: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_REV16, +//ZZ res, arg, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_CmpNEZ64x2: { +//ZZ HReg x_lsh = newVRegV(env); +//ZZ HReg x_rsh = newVRegV(env); +//ZZ HReg lsh_amt = newVRegV(env); +//ZZ HReg rsh_amt = newVRegV(env); +//ZZ HReg zero = newVRegV(env); +//ZZ HReg tmp = newVRegV(env); +//ZZ HReg tmp2 = newVRegV(env); +//ZZ HReg res = newVRegV(env); +//ZZ HReg x = newVRegV(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_EQZ, tmp2, arg, 2, True)); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_NOT, x, tmp2, 4, True)); +//ZZ addInstr(env, ARMInstr_NeonImm(lsh_amt, ARMNImm_TI(0, 32))); +//ZZ addInstr(env, ARMInstr_NeonImm(zero, ARMNImm_TI(0, 0))); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VSUB, +//ZZ rsh_amt, zero, lsh_amt, 2, True)); +//ZZ addInstr(env, ARMInstr_NShift(ARMneon_VSHL, +//ZZ x_lsh, x, lsh_amt, 3, True)); +//ZZ addInstr(env, ARMInstr_NShift(ARMneon_VSHL, +//ZZ x_rsh, x, rsh_amt, 3, True)); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VORR, +//ZZ tmp, x_lsh, x_rsh, 0, True)); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VORR, +//ZZ res, tmp, x, 0, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Widen8Uto16x8: +//ZZ case Iop_Widen16Uto32x4: +//ZZ case Iop_Widen32Uto64x2: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ UInt size; +//ZZ switch (e->Iex.Unop.op) { +//ZZ case Iop_Widen8Uto16x8: size = 0; break; +//ZZ case Iop_Widen16Uto32x4: size = 1; break; +//ZZ case Iop_Widen32Uto64x2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_COPYLU, +//ZZ res, arg, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Widen8Sto16x8: +//ZZ case Iop_Widen16Sto32x4: +//ZZ case Iop_Widen32Sto64x2: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ UInt size; +//ZZ switch (e->Iex.Unop.op) { +//ZZ case Iop_Widen8Sto16x8: size = 0; break; +//ZZ case Iop_Widen16Sto32x4: size = 1; break; +//ZZ case Iop_Widen32Sto64x2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_COPYLS, +//ZZ res, arg, size, True)); 
+//ZZ return res; +//ZZ } +//ZZ case Iop_PwAddL8Sx16: +//ZZ case Iop_PwAddL16Sx8: +//ZZ case Iop_PwAddL32Sx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_PwAddL8Sx16: size = 0; break; +//ZZ case Iop_PwAddL16Sx8: size = 1; break; +//ZZ case Iop_PwAddL32Sx4: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_PADDLS, +//ZZ res, arg, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_PwAddL8Ux16: +//ZZ case Iop_PwAddL16Ux8: +//ZZ case Iop_PwAddL32Ux4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_PwAddL8Ux16: size = 0; break; +//ZZ case Iop_PwAddL16Ux8: size = 1; break; +//ZZ case Iop_PwAddL32Ux4: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_PADDLU, +//ZZ res, arg, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Cnt8x16: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_CNT, res, arg, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Clz8Sx16: +//ZZ case Iop_Clz16Sx8: +//ZZ case Iop_Clz32Sx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_Clz8Sx16: size = 0; break; +//ZZ case Iop_Clz16Sx8: size = 1; break; +//ZZ case Iop_Clz32Sx4: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_CLZ, res, arg, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Cls8Sx16: +//ZZ case Iop_Cls16Sx8: +//ZZ case Iop_Cls32Sx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_Cls8Sx16: size = 0; break; +//ZZ case Iop_Cls16Sx8: size = 1; break; +//ZZ case Iop_Cls32Sx4: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_CLS, res, arg, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_FtoI32Sx4_RZ: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VCVTFtoS, +//ZZ res, arg, 2, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_FtoI32Ux4_RZ: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VCVTFtoU, +//ZZ res, arg, 2, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_I32StoFx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VCVTStoF, +//ZZ res, arg, 2, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_I32UtoFx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VCVTUtoF, +//ZZ res, arg, 2, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_F16toF32x4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VCVTF16toF32, +//ZZ res, arg, 2, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Recip32Fx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VRECIPF, +//ZZ res, argL, 0, True)); +//ZZ return res; +//ZZ } +//ZZ 
case Iop_Recip32x4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VRECIP, +//ZZ res, argL, 0, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Rsqrte32Fx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VRSQRTEFP, +//ZZ res, argL, 0, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Rsqrte32x4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VRSQRTE, +//ZZ res, argL, 0, True)); +//ZZ return res; +//ZZ } + /* ... */ + default: + break; + } /* switch on the unop */ + } /* if (e->tag == Iex_Unop) */ + + if (e->tag == Iex_Binop) { + switch (e->Iex.Binop.op) { + case Iop_64HLtoV128: { + HReg res = newVRegV(env); + HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1); + HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2); + addInstr(env, ARM64Instr_VQfromXX(res, argL, argR)); + return res; + } +//ZZ case Iop_AndV128: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VAND, +//ZZ res, argL, argR, 4, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_OrV128: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VORR, +//ZZ res, argL, argR, 4, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_XorV128: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VXOR, +//ZZ res, argL, argR, 4, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Add8x16: +//ZZ case Iop_Add16x8: +//ZZ case Iop_Add32x4: + case Iop_AndV128: + case Iop_OrV128: + case Iop_XorV128: + case Iop_Max32Ux4: + case Iop_Max16Ux8: + case Iop_Max8Ux16: + case Iop_Min32Ux4: + case Iop_Min16Ux8: + case Iop_Min8Ux16: + case Iop_Max32Sx4: + case Iop_Max16Sx8: + case Iop_Max8Sx16: + case Iop_Min32Sx4: + case Iop_Min16Sx8: + case Iop_Min8Sx16: + case Iop_Add64x2: + case Iop_Add32x4: + case Iop_Add16x8: + case Iop_Add8x16: + case Iop_Sub64x2: + case Iop_Sub32x4: + case Iop_Sub16x8: + case Iop_Sub8x16: + case Iop_Mul32x4: + case Iop_Mul16x8: + case Iop_Mul8x16: + case Iop_CmpEQ64x2: + case Iop_CmpEQ32x4: + case Iop_CmpEQ16x8: + case Iop_CmpEQ8x16: + case Iop_CmpGT64Ux2: + case Iop_CmpGT32Ux4: + case Iop_CmpGT16Ux8: + case Iop_CmpGT8Ux16: + case Iop_CmpGT64Sx2: + case Iop_CmpGT32Sx4: + case Iop_CmpGT16Sx8: + case Iop_CmpGT8Sx16: + case Iop_CmpEQ64Fx2: + case Iop_CmpEQ32Fx4: + case Iop_CmpLE64Fx2: + case Iop_CmpLE32Fx4: + case Iop_CmpLT64Fx2: + case Iop_CmpLT32Fx4: + case Iop_Perm8x16: + { + HReg res = newVRegV(env); + HReg argL = iselV128Expr(env, e->Iex.Binop.arg1); + HReg argR = iselV128Expr(env, e->Iex.Binop.arg2); + Bool sw = False; + ARM64VecBinOp op = ARM64vecb_INVALID; + switch (e->Iex.Binop.op) { + case Iop_AndV128: op = ARM64vecb_AND; break; + case Iop_OrV128: op = ARM64vecb_ORR; break; + case Iop_XorV128: op = ARM64vecb_XOR; break; + case Iop_Max32Ux4: op = ARM64vecb_UMAX32x4; break; + case Iop_Max16Ux8: op = ARM64vecb_UMAX16x8; break; + case Iop_Max8Ux16: op = ARM64vecb_UMAX8x16; break; + case Iop_Min32Ux4: op = ARM64vecb_UMIN32x4; break; + case Iop_Min16Ux8: op = ARM64vecb_UMIN16x8; break; + 
case Iop_Min8Ux16: op = ARM64vecb_UMIN8x16; break; + case Iop_Max32Sx4: op = ARM64vecb_SMAX32x4; break; + case Iop_Max16Sx8: op = ARM64vecb_SMAX16x8; break; + case Iop_Max8Sx16: op = ARM64vecb_SMAX8x16; break; + case Iop_Min32Sx4: op = ARM64vecb_SMIN32x4; break; + case Iop_Min16Sx8: op = ARM64vecb_SMIN16x8; break; + case Iop_Min8Sx16: op = ARM64vecb_SMIN8x16; break; + case Iop_Add64x2: op = ARM64vecb_ADD64x2; break; + case Iop_Add32x4: op = ARM64vecb_ADD32x4; break; + case Iop_Add16x8: op = ARM64vecb_ADD16x8; break; + case Iop_Add8x16: op = ARM64vecb_ADD8x16; break; + case Iop_Sub64x2: op = ARM64vecb_SUB64x2; break; + case Iop_Sub32x4: op = ARM64vecb_SUB32x4; break; + case Iop_Sub16x8: op = ARM64vecb_SUB16x8; break; + case Iop_Sub8x16: op = ARM64vecb_SUB8x16; break; + case Iop_Mul32x4: op = ARM64vecb_MUL32x4; break; + case Iop_Mul16x8: op = ARM64vecb_MUL16x8; break; + case Iop_Mul8x16: op = ARM64vecb_MUL8x16; break; + case Iop_CmpEQ64x2: op = ARM64vecb_CMEQ64x2; break; + case Iop_CmpEQ32x4: op = ARM64vecb_CMEQ32x4; break; + case Iop_CmpEQ16x8: op = ARM64vecb_CMEQ16x8; break; + case Iop_CmpEQ8x16: op = ARM64vecb_CMEQ8x16; break; + case Iop_CmpGT64Ux2: op = ARM64vecb_CMHI64x2; break; + case Iop_CmpGT32Ux4: op = ARM64vecb_CMHI32x4; break; + case Iop_CmpGT16Ux8: op = ARM64vecb_CMHI16x8; break; + case Iop_CmpGT8Ux16: op = ARM64vecb_CMHI8x16; break; + case Iop_CmpGT64Sx2: op = ARM64vecb_CMGT64x2; break; + case Iop_CmpGT32Sx4: op = ARM64vecb_CMGT32x4; break; + case Iop_CmpGT16Sx8: op = ARM64vecb_CMGT16x8; break; + case Iop_CmpGT8Sx16: op = ARM64vecb_CMGT8x16; break; + case Iop_CmpEQ64Fx2: op = ARM64vecb_FCMEQ64x2; break; + case Iop_CmpEQ32Fx4: op = ARM64vecb_FCMEQ32x4; break; + case Iop_CmpLE64Fx2: op = ARM64vecb_FCMGE64x2; sw = True; break; + case Iop_CmpLE32Fx4: op = ARM64vecb_FCMGE32x4; sw = True; break; + case Iop_CmpLT64Fx2: op = ARM64vecb_FCMGT64x2; sw = True; break; + case Iop_CmpLT32Fx4: op = ARM64vecb_FCMGT32x4; sw = True; break; + case Iop_Perm8x16: op = ARM64vecb_TBL1; break; + default: vassert(0); + } + if (sw) { + addInstr(env, ARM64Instr_VBinV(op, res, argR, argL)); + } else { + addInstr(env, ARM64Instr_VBinV(op, res, argL, argR)); + } + return res; + } +//ZZ case Iop_Add32Fx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VADDFP, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Recps32Fx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VRECPS, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Rsqrts32Fx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VRSQRTS, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ +//ZZ // These 6 verified 18 Apr 2013 +//ZZ case Iop_InterleaveEvenLanes8x16: +//ZZ case Iop_InterleaveOddLanes8x16: +//ZZ case Iop_InterleaveEvenLanes16x8: +//ZZ case Iop_InterleaveOddLanes16x8: +//ZZ case Iop_InterleaveEvenLanes32x4: +//ZZ case Iop_InterleaveOddLanes32x4: { +//ZZ HReg rD = newVRegV(env); +//ZZ HReg rM = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = 
iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ Bool resRd; // is the result in rD or rM ? +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_InterleaveOddLanes8x16: resRd = False; size = 0; break; +//ZZ case Iop_InterleaveEvenLanes8x16: resRd = True; size = 0; break; +//ZZ case Iop_InterleaveOddLanes16x8: resRd = False; size = 1; break; +//ZZ case Iop_InterleaveEvenLanes16x8: resRd = True; size = 1; break; +//ZZ case Iop_InterleaveOddLanes32x4: resRd = False; size = 2; break; +//ZZ case Iop_InterleaveEvenLanes32x4: resRd = True; size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rM, argL, 4, True)); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rD, argR, 4, True)); +//ZZ addInstr(env, ARMInstr_NDual(ARMneon_TRN, rD, rM, size, True)); +//ZZ return resRd ? rD : rM; +//ZZ } +//ZZ +//ZZ // These 6 verified 18 Apr 2013 +//ZZ case Iop_InterleaveHI8x16: +//ZZ case Iop_InterleaveLO8x16: +//ZZ case Iop_InterleaveHI16x8: +//ZZ case Iop_InterleaveLO16x8: +//ZZ case Iop_InterleaveHI32x4: +//ZZ case Iop_InterleaveLO32x4: { +//ZZ HReg rD = newVRegV(env); +//ZZ HReg rM = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ Bool resRd; // is the result in rD or rM ? +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_InterleaveHI8x16: resRd = False; size = 0; break; +//ZZ case Iop_InterleaveLO8x16: resRd = True; size = 0; break; +//ZZ case Iop_InterleaveHI16x8: resRd = False; size = 1; break; +//ZZ case Iop_InterleaveLO16x8: resRd = True; size = 1; break; +//ZZ case Iop_InterleaveHI32x4: resRd = False; size = 2; break; +//ZZ case Iop_InterleaveLO32x4: resRd = True; size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rM, argL, 4, True)); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rD, argR, 4, True)); +//ZZ addInstr(env, ARMInstr_NDual(ARMneon_ZIP, rD, rM, size, True)); +//ZZ return resRd ? rD : rM; +//ZZ } +//ZZ +//ZZ // These 6 verified 18 Apr 2013 +//ZZ case Iop_CatOddLanes8x16: +//ZZ case Iop_CatEvenLanes8x16: +//ZZ case Iop_CatOddLanes16x8: +//ZZ case Iop_CatEvenLanes16x8: +//ZZ case Iop_CatOddLanes32x4: +//ZZ case Iop_CatEvenLanes32x4: { +//ZZ HReg rD = newVRegV(env); +//ZZ HReg rM = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ Bool resRd; // is the result in rD or rM ? +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_CatOddLanes8x16: resRd = False; size = 0; break; +//ZZ case Iop_CatEvenLanes8x16: resRd = True; size = 0; break; +//ZZ case Iop_CatOddLanes16x8: resRd = False; size = 1; break; +//ZZ case Iop_CatEvenLanes16x8: resRd = True; size = 1; break; +//ZZ case Iop_CatOddLanes32x4: resRd = False; size = 2; break; +//ZZ case Iop_CatEvenLanes32x4: resRd = True; size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rM, argL, 4, True)); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rD, argR, 4, True)); +//ZZ addInstr(env, ARMInstr_NDual(ARMneon_UZP, rD, rM, size, True)); +//ZZ return resRd ? 
rD : rM; +//ZZ } +//ZZ +//ZZ case Iop_QAdd8Ux16: +//ZZ case Iop_QAdd16Ux8: +//ZZ case Iop_QAdd32Ux4: +//ZZ case Iop_QAdd64Ux2: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_QAdd8Ux16: size = 0; break; +//ZZ case Iop_QAdd16Ux8: size = 1; break; +//ZZ case Iop_QAdd32Ux4: size = 2; break; +//ZZ case Iop_QAdd64Ux2: size = 3; break; +//ZZ default: +//ZZ ppIROp(e->Iex.Binop.op); +//ZZ vpanic("Illegal element size in VQADDU"); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VQADDU, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_QAdd8Sx16: +//ZZ case Iop_QAdd16Sx8: +//ZZ case Iop_QAdd32Sx4: +//ZZ case Iop_QAdd64Sx2: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_QAdd8Sx16: size = 0; break; +//ZZ case Iop_QAdd16Sx8: size = 1; break; +//ZZ case Iop_QAdd32Sx4: size = 2; break; +//ZZ case Iop_QAdd64Sx2: size = 3; break; +//ZZ default: +//ZZ ppIROp(e->Iex.Binop.op); +//ZZ vpanic("Illegal element size in VQADDS"); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VQADDS, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Sub8x16: +//ZZ case Iop_Sub16x8: +//ZZ case Iop_Sub32x4: +//ZZ case Iop_Sub64x2: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_Sub8x16: size = 0; break; +//ZZ case Iop_Sub16x8: size = 1; break; +//ZZ case Iop_Sub32x4: size = 2; break; +//ZZ case Iop_Sub64x2: size = 3; break; +//ZZ default: +//ZZ ppIROp(e->Iex.Binop.op); +//ZZ vpanic("Illegal element size in VSUB"); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VSUB, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Sub32Fx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VSUBFP, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_QSub8Ux16: +//ZZ case Iop_QSub16Ux8: +//ZZ case Iop_QSub32Ux4: +//ZZ case Iop_QSub64Ux2: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_QSub8Ux16: size = 0; break; +//ZZ case Iop_QSub16Ux8: size = 1; break; +//ZZ case Iop_QSub32Ux4: size = 2; break; +//ZZ case Iop_QSub64Ux2: size = 3; break; +//ZZ default: +//ZZ ppIROp(e->Iex.Binop.op); +//ZZ vpanic("Illegal element size in VQSUBU"); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VQSUBU, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_QSub8Sx16: +//ZZ case Iop_QSub16Sx8: +//ZZ case Iop_QSub32Sx4: +//ZZ case Iop_QSub64Sx2: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_QSub8Sx16: size = 0; break; +//ZZ case Iop_QSub16Sx8: size = 1; break; +//ZZ case Iop_QSub32Sx4: size = 2; break; +//ZZ case Iop_QSub64Sx2: size = 3; break; +//ZZ 
default: +//ZZ ppIROp(e->Iex.Binop.op); +//ZZ vpanic("Illegal element size in VQSUBS"); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VQSUBS, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Max8Ux16: +//ZZ case Iop_Max16Ux8: +//ZZ case Iop_Max32Ux4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_Max8Ux16: size = 0; break; +//ZZ case Iop_Max16Ux8: size = 1; break; +//ZZ case Iop_Max32Ux4: size = 2; break; +//ZZ default: vpanic("Illegal element size in VMAXU"); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VMAXU, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Max8Sx16: +//ZZ case Iop_Max16Sx8: +//ZZ case Iop_Max32Sx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_Max8Sx16: size = 0; break; +//ZZ case Iop_Max16Sx8: size = 1; break; +//ZZ case Iop_Max32Sx4: size = 2; break; +//ZZ default: vpanic("Illegal element size in VMAXU"); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VMAXS, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Min8Ux16: +//ZZ case Iop_Min16Ux8: +//ZZ case Iop_Min32Ux4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_Min8Ux16: size = 0; break; +//ZZ case Iop_Min16Ux8: size = 1; break; +//ZZ case Iop_Min32Ux4: size = 2; break; +//ZZ default: vpanic("Illegal element size in VMAXU"); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VMINU, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Min8Sx16: +//ZZ case Iop_Min16Sx8: +//ZZ case Iop_Min32Sx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_Min8Sx16: size = 0; break; +//ZZ case Iop_Min16Sx8: size = 1; break; +//ZZ case Iop_Min32Sx4: size = 2; break; +//ZZ default: vpanic("Illegal element size in VMAXU"); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VMINS, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Sar8x16: +//ZZ case Iop_Sar16x8: +//ZZ case Iop_Sar32x4: +//ZZ case Iop_Sar64x2: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ HReg argR2 = newVRegV(env); +//ZZ HReg zero = newVRegV(env); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_Sar8x16: size = 0; break; +//ZZ case Iop_Sar16x8: size = 1; break; +//ZZ case Iop_Sar32x4: size = 2; break; +//ZZ case Iop_Sar64x2: size = 3; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NeonImm(zero, ARMNImm_TI(0,0))); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VSUB, +//ZZ argR2, zero, argR, size, True)); +//ZZ addInstr(env, ARMInstr_NShift(ARMneon_VSAL, +//ZZ res, argL, argR2, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Sal8x16: +//ZZ case Iop_Sal16x8: +//ZZ case Iop_Sal32x4: +//ZZ case Iop_Sal64x2: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = 
iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_Sal8x16: size = 0; break; +//ZZ case Iop_Sal16x8: size = 1; break; +//ZZ case Iop_Sal32x4: size = 2; break; +//ZZ case Iop_Sal64x2: size = 3; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NShift(ARMneon_VSAL, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Shr8x16: +//ZZ case Iop_Shr16x8: +//ZZ case Iop_Shr32x4: +//ZZ case Iop_Shr64x2: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ HReg argR2 = newVRegV(env); +//ZZ HReg zero = newVRegV(env); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_Shr8x16: size = 0; break; +//ZZ case Iop_Shr16x8: size = 1; break; +//ZZ case Iop_Shr32x4: size = 2; break; +//ZZ case Iop_Shr64x2: size = 3; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NeonImm(zero, ARMNImm_TI(0,0))); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VSUB, +//ZZ argR2, zero, argR, size, True)); +//ZZ addInstr(env, ARMInstr_NShift(ARMneon_VSHL, +//ZZ res, argL, argR2, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Shl8x16: +//ZZ case Iop_Shl16x8: +//ZZ case Iop_Shl32x4: +//ZZ case Iop_Shl64x2: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_Shl8x16: size = 0; break; +//ZZ case Iop_Shl16x8: size = 1; break; +//ZZ case Iop_Shl32x4: size = 2; break; +//ZZ case Iop_Shl64x2: size = 3; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NShift(ARMneon_VSHL, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_QShl8x16: +//ZZ case Iop_QShl16x8: +//ZZ case Iop_QShl32x4: +//ZZ case Iop_QShl64x2: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_QShl8x16: size = 0; break; +//ZZ case Iop_QShl16x8: size = 1; break; +//ZZ case Iop_QShl32x4: size = 2; break; +//ZZ case Iop_QShl64x2: size = 3; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NShift(ARMneon_VQSHL, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_QSal8x16: +//ZZ case Iop_QSal16x8: +//ZZ case Iop_QSal32x4: +//ZZ case Iop_QSal64x2: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_QSal8x16: size = 0; break; +//ZZ case Iop_QSal16x8: size = 1; break; +//ZZ case Iop_QSal32x4: size = 2; break; +//ZZ case Iop_QSal64x2: size = 3; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NShift(ARMneon_VQSAL, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_QShlN8x16: +//ZZ case Iop_QShlN16x8: +//ZZ case Iop_QShlN32x4: +//ZZ case Iop_QShlN64x2: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ UInt size, imm; +//ZZ if (e->Iex.Binop.arg2->tag != Iex_Const || +//ZZ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) { +//ZZ vpanic("ARM taget supports Iop_QShlNAxB with constant " +//ZZ "second argument only\n"); +//ZZ } +//ZZ imm = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8; +//ZZ switch 
(e->Iex.Binop.op) { +//ZZ case Iop_QShlN8x16: size = 8 | imm; break; +//ZZ case Iop_QShlN16x8: size = 16 | imm; break; +//ZZ case Iop_QShlN32x4: size = 32 | imm; break; +//ZZ case Iop_QShlN64x2: size = 64 | imm; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VQSHLNUU, +//ZZ res, argL, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_QShlN8Sx16: +//ZZ case Iop_QShlN16Sx8: +//ZZ case Iop_QShlN32Sx4: +//ZZ case Iop_QShlN64Sx2: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ UInt size, imm; +//ZZ if (e->Iex.Binop.arg2->tag != Iex_Const || +//ZZ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) { +//ZZ vpanic("ARM taget supports Iop_QShlNASxB with constant " +//ZZ "second argument only\n"); +//ZZ } +//ZZ imm = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_QShlN8Sx16: size = 8 | imm; break; +//ZZ case Iop_QShlN16Sx8: size = 16 | imm; break; +//ZZ case Iop_QShlN32Sx4: size = 32 | imm; break; +//ZZ case Iop_QShlN64Sx2: size = 64 | imm; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VQSHLNUS, +//ZZ res, argL, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_QSalN8x16: +//ZZ case Iop_QSalN16x8: +//ZZ case Iop_QSalN32x4: +//ZZ case Iop_QSalN64x2: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ UInt size, imm; +//ZZ if (e->Iex.Binop.arg2->tag != Iex_Const || +//ZZ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) { +//ZZ vpanic("ARM taget supports Iop_QShlNAxB with constant " +//ZZ "second argument only\n"); +//ZZ } +//ZZ imm = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_QSalN8x16: size = 8 | imm; break; +//ZZ case Iop_QSalN16x8: size = 16 | imm; break; +//ZZ case Iop_QSalN32x4: size = 32 | imm; break; +//ZZ case Iop_QSalN64x2: size = 64 | imm; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VQSHLNSS, +//ZZ res, argL, size, True)); +//ZZ return res; +//ZZ } + case Iop_ShrN64x2: + case Iop_ShrN32x4: + case Iop_ShrN16x8: + case Iop_ShrN8x16: + case Iop_SarN64x2: + case Iop_SarN32x4: + case Iop_SarN16x8: + case Iop_SarN8x16: + case Iop_ShlN64x2: + case Iop_ShlN32x4: + case Iop_ShlN16x8: + case Iop_ShlN8x16: + { + IRExpr* argL = e->Iex.Binop.arg1; + IRExpr* argR = e->Iex.Binop.arg2; + if (argR->tag == Iex_Const && argR->Iex.Const.con->tag == Ico_U8) { + UInt amt = argR->Iex.Const.con->Ico.U8; + UInt limit = 0; + ARM64VecShiftOp op = ARM64vecsh_INVALID; + switch (e->Iex.Binop.op) { + case Iop_ShrN64x2: + op = ARM64vecsh_USHR64x2; limit = 63; break; + case Iop_ShrN32x4: + op = ARM64vecsh_USHR32x4; limit = 31; break; + case Iop_ShrN16x8: + op = ARM64vecsh_USHR16x8; limit = 15; break; + case Iop_ShrN8x16: + op = ARM64vecsh_USHR8x16; limit = 7; break; + case Iop_SarN64x2: + op = ARM64vecsh_SSHR64x2; limit = 63; break; + case Iop_SarN32x4: + op = ARM64vecsh_SSHR32x4; limit = 31; break; + case Iop_SarN16x8: + op = ARM64vecsh_SSHR16x8; limit = 15; break; + case Iop_SarN8x16: + op = ARM64vecsh_SSHR8x16; limit = 7; break; + case Iop_ShlN64x2: + op = ARM64vecsh_SHL64x2; limit = 63; break; + case Iop_ShlN32x4: + op = ARM64vecsh_SHL32x4; limit = 31; break; + case Iop_ShlN16x8: + op = ARM64vecsh_SHL16x8; limit = 15; break; + case Iop_ShlN8x16: + op = ARM64vecsh_SHL8x16; limit = 7; break; + default: + vassert(0); + } + if (op != ARM64vecsh_INVALID && amt >= 0 && amt <= limit) { + HReg src = 
iselV128Expr(env, argL); + HReg dst = newVRegV(env); + if (amt > 0) { + addInstr(env, ARM64Instr_VShiftImmV(op, dst, src, amt)); + } else { + dst = src; + } + return dst; + } + } + /* else fall out; this is unhandled */ + break; + } +//ZZ case Iop_CmpGT8Ux16: +//ZZ case Iop_CmpGT16Ux8: +//ZZ case Iop_CmpGT32Ux4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_CmpGT8Ux16: size = 0; break; +//ZZ case Iop_CmpGT16Ux8: size = 1; break; +//ZZ case Iop_CmpGT32Ux4: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCGTU, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_CmpGT8Sx16: +//ZZ case Iop_CmpGT16Sx8: +//ZZ case Iop_CmpGT32Sx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_CmpGT8Sx16: size = 0; break; +//ZZ case Iop_CmpGT16Sx8: size = 1; break; +//ZZ case Iop_CmpGT32Sx4: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCGTS, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_CmpEQ8x16: +//ZZ case Iop_CmpEQ16x8: +//ZZ case Iop_CmpEQ32x4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size; +//ZZ switch (e->Iex.Binop.op) { +//ZZ case Iop_CmpEQ8x16: size = 0; break; +//ZZ case Iop_CmpEQ16x8: size = 1; break; +//ZZ case Iop_CmpEQ32x4: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCEQ, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Mul8x16: +//ZZ case Iop_Mul16x8: +//ZZ case Iop_Mul32x4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_Mul8x16: size = 0; break; +//ZZ case Iop_Mul16x8: size = 1; break; +//ZZ case Iop_Mul32x4: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VMUL, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Mul32Fx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VMULFP, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Mull8Ux8: +//ZZ case Iop_Mull16Ux4: +//ZZ case Iop_Mull32Ux2: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_Mull8Ux8: size = 0; break; +//ZZ case Iop_Mull16Ux4: size = 1; break; +//ZZ case Iop_Mull32Ux2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VMULLU, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ +//ZZ case Iop_Mull8Sx8: +//ZZ case Iop_Mull16Sx4: +//ZZ case Iop_Mull32Sx2: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ 
UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_Mull8Sx8: size = 0; break; +//ZZ case Iop_Mull16Sx4: size = 1; break; +//ZZ case Iop_Mull32Sx2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VMULLS, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ +//ZZ case Iop_QDMulHi16Sx8: +//ZZ case Iop_QDMulHi32Sx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_QDMulHi16Sx8: size = 1; break; +//ZZ case Iop_QDMulHi32Sx4: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VQDMULH, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ +//ZZ case Iop_QRDMulHi16Sx8: +//ZZ case Iop_QRDMulHi32Sx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_QRDMulHi16Sx8: size = 1; break; +//ZZ case Iop_QRDMulHi32Sx4: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VQRDMULH, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ +//ZZ case Iop_QDMulLong16Sx4: +//ZZ case Iop_QDMulLong32Sx2: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_QDMulLong16Sx4: size = 1; break; +//ZZ case Iop_QDMulLong32Sx2: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VQDMULL, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_PolynomialMul8x16: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VMULP, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Max32Fx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VMAXF, +//ZZ res, argL, argR, 2, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_Min32Fx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VMINF, +//ZZ res, argL, argR, 2, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_PwMax32Fx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VPMAXF, +//ZZ res, argL, argR, 2, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_PwMin32Fx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VPMINF, +//ZZ res, argL, argR, 2, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_CmpGT32Fx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCGTF, +//ZZ res, 
argL, argR, 2, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_CmpGE32Fx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCGEF, +//ZZ res, argL, argR, 2, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_CmpEQ32Fx4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VCEQF, +//ZZ res, argL, argR, 2, True)); +//ZZ return res; +//ZZ } +//ZZ +//ZZ case Iop_PolynomialMull8x8: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VMULLP, +//ZZ res, argL, argR, size, True)); +//ZZ return res; +//ZZ } +//ZZ case Iop_F32ToFixed32Ux4_RZ: +//ZZ case Iop_F32ToFixed32Sx4_RZ: +//ZZ case Iop_Fixed32UToF32x4_RN: +//ZZ case Iop_Fixed32SToF32x4_RN: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg arg = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ ARMNeonUnOp op; +//ZZ UInt imm6; +//ZZ if (e->Iex.Binop.arg2->tag != Iex_Const || +//ZZ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) { +//ZZ vpanic("ARM supports FP <-> Fixed conversion with constant " +//ZZ "second argument less than 33 only\n"); +//ZZ } +//ZZ imm6 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8; +//ZZ vassert(imm6 <= 32 && imm6 > 0); +//ZZ imm6 = 64 - imm6; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_F32ToFixed32Ux4_RZ: op = ARMneon_VCVTFtoFixedU; break; +//ZZ case Iop_F32ToFixed32Sx4_RZ: op = ARMneon_VCVTFtoFixedS; break; +//ZZ case Iop_Fixed32UToF32x4_RN: op = ARMneon_VCVTFixedUtoF; break; +//ZZ case Iop_Fixed32SToF32x4_RN: op = ARMneon_VCVTFixedStoF; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(op, res, arg, imm6, True)); +//ZZ return res; +//ZZ } +//ZZ /* +//ZZ FIXME remove if not used +//ZZ case Iop_VDup8x16: +//ZZ case Iop_VDup16x8: +//ZZ case Iop_VDup32x4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); +//ZZ UInt imm4; +//ZZ UInt index; +//ZZ if (e->Iex.Binop.arg2->tag != Iex_Const || +//ZZ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) { +//ZZ vpanic("ARM supports Iop_VDup with constant " +//ZZ "second argument less than 16 only\n"); +//ZZ } +//ZZ index = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_VDup8x16: imm4 = (index << 1) + 1; break; +//ZZ case Iop_VDup16x8: imm4 = (index << 2) + 2; break; +//ZZ case Iop_VDup32x4: imm4 = (index << 3) + 4; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ if (imm4 >= 16) { +//ZZ vpanic("ARM supports Iop_VDup with constant " +//ZZ "second argument less than 16 only\n"); +//ZZ } +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_VDUP, +//ZZ res, argL, imm4, True)); +//ZZ return res; +//ZZ } +//ZZ */ +//ZZ case Iop_PwAdd8x16: +//ZZ case Iop_PwAdd16x8: +//ZZ case Iop_PwAdd32x4: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); +//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); +//ZZ UInt size = 0; +//ZZ switch(e->Iex.Binop.op) { +//ZZ case Iop_PwAdd8x16: size = 0; break; +//ZZ case Iop_PwAdd16x8: size = 1; break; +//ZZ case Iop_PwAdd32x4: size = 2; break; +//ZZ default: vassert(0); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VPADD, +//ZZ res, argL, argR, size, True)); +//ZZ return 
res; +//ZZ } + /* ... */ + default: + break; + } /* switch on the binop */ + } /* if (e->tag == Iex_Binop) */ + + if (e->tag == Iex_Triop) { + IRTriop* triop = e->Iex.Triop.details; + ARM64VecBinOp vecbop = ARM64vecb_INVALID; + switch (triop->op) { + case Iop_Add64Fx2: vecbop = ARM64vecb_FADD64x2; break; + case Iop_Sub64Fx2: vecbop = ARM64vecb_FSUB64x2; break; + case Iop_Mul64Fx2: vecbop = ARM64vecb_FMUL64x2; break; + case Iop_Div64Fx2: vecbop = ARM64vecb_FDIV64x2; break; + case Iop_Add32Fx4: vecbop = ARM64vecb_FADD32x4; break; + case Iop_Sub32Fx4: vecbop = ARM64vecb_FSUB32x4; break; + case Iop_Mul32Fx4: vecbop = ARM64vecb_FMUL32x4; break; + case Iop_Div32Fx4: vecbop = ARM64vecb_FDIV32x4; break; + default: break; + } + if (vecbop != ARM64vecb_INVALID) { + HReg argL = iselV128Expr(env, triop->arg2); + HReg argR = iselV128Expr(env, triop->arg3); + HReg dst = newVRegV(env); + set_FPCR_rounding_mode(env, triop->arg1); + addInstr(env, ARM64Instr_VBinV(vecbop, dst, argL, argR)); + return dst; + } + +//ZZ switch (triop->op) { +//ZZ case Iop_ExtractV128: { +//ZZ HReg res = newVRegV(env); +//ZZ HReg argL = iselNeonExpr(env, triop->arg1); +//ZZ HReg argR = iselNeonExpr(env, triop->arg2); +//ZZ UInt imm4; +//ZZ if (triop->arg3->tag != Iex_Const || +//ZZ typeOfIRExpr(env->type_env, triop->arg3) != Ity_I8) { +//ZZ vpanic("ARM target supports Iop_ExtractV128 with constant " +//ZZ "third argument less than 16 only\n"); +//ZZ } +//ZZ imm4 = triop->arg3->Iex.Const.con->Ico.U8; +//ZZ if (imm4 >= 16) { +//ZZ vpanic("ARM target supports Iop_ExtractV128 with constant " +//ZZ "third argument less than 16 only\n"); +//ZZ } +//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VEXT, +//ZZ res, argL, argR, imm4, True)); +//ZZ return res; +//ZZ } +//ZZ default: +//ZZ break; +//ZZ } + } + +//ZZ if (e->tag == Iex_ITE) { // VFD +//ZZ ARMCondCode cc; +//ZZ HReg r1 = iselNeonExpr(env, e->Iex.ITE.iftrue); +//ZZ HReg r0 = iselNeonExpr(env, e->Iex.ITE.iffalse); +//ZZ HReg dst = newVRegV(env); +//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_COPY, dst, r1, 4, True)); +//ZZ cc = iselCondCode(env, e->Iex.ITE.cond); +//ZZ addInstr(env, ARMInstr_NCMovQ(cc ^ 1, dst, r0)); +//ZZ return dst; +//ZZ } + + v128_expr_bad: + ppIRExpr(e); + vpanic("iselV128Expr_wrk"); +} + + +/*---------------------------------------------------------*/ +/*--- ISEL: Floating point expressions (64 bit) ---*/ +/*---------------------------------------------------------*/ + +/* Compute a 64-bit floating point value into a register, the identity + of which is returned. As with iselIntExpr_R, the reg may be either + real or virtual; in any case it must not be changed by subsequent + code emitted by the caller. 
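+   (Editorial sketch, not part of the original patch; it only illustrates
+   the contract stated above, using constructors that already appear in
+   this file, and "someF64Expr" is just a placeholder name.  A caller that
+   needs an F64 value does roughly
+
+      HReg rD = iselDblExpr(env, someF64Expr);
+      // rD may then be used as a read-only source operand, e.g. fed to
+      // ARM64Instr_VUnaryD or ARM64Instr_VBinD, but must never be written
+      // afterwards: for an Iex_RdTmp the very same virtual register is
+      // handed out to every user of that temp.
+   )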
*/ + +static HReg iselDblExpr ( ISelEnv* env, IRExpr* e ) +{ + HReg r = iselDblExpr_wrk( env, e ); +# if 0 + vex_printf("\n"); ppIRExpr(e); vex_printf("\n"); +# endif + vassert(hregClass(r) == HRcFlt64); + vassert(hregIsVirtual(r)); + return r; +} + +/* DO NOT CALL THIS DIRECTLY */ +static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e ) +{ + IRType ty = typeOfIRExpr(env->type_env,e); + vassert(e); + vassert(ty == Ity_F64); + + if (e->tag == Iex_RdTmp) { + return lookupIRTemp(env, e->Iex.RdTmp.tmp); + } + + if (e->tag == Iex_Const) { + IRConst* con = e->Iex.Const.con; + if (con->tag == Ico_F64i) { + HReg src = newVRegI(env); + HReg dst = newVRegD(env); + addInstr(env, ARM64Instr_Imm64(src, con->Ico.F64i)); + addInstr(env, ARM64Instr_VDfromX(dst, src)); + return dst; + } + } + + if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) { + vassert(e->Iex.Load.ty == Ity_F64); + HReg addr = iselIntExpr_R(env, e->Iex.Load.addr); + HReg res = newVRegD(env); + addInstr(env, ARM64Instr_VLdStD(True/*isLoad*/, res, addr, 0)); + return res; + } + + if (e->tag == Iex_Get) { + Int offs = e->Iex.Get.offset; + if (offs >= 0 && offs < 32768 && 0 == (offs & 7)) { + HReg rD = newVRegD(env); + HReg rN = get_baseblock_register(); + addInstr(env, ARM64Instr_VLdStD(True/*isLoad*/, rD, rN, offs)); + return rD; + } + } + + if (e->tag == Iex_Unop) { + switch (e->Iex.Unop.op) { +//ZZ case Iop_ReinterpI64asF64: { +//ZZ if (env->hwcaps & VEX_HWCAPS_ARM_NEON) { +//ZZ return iselNeon64Expr(env, e->Iex.Unop.arg); +//ZZ } else { +//ZZ HReg srcHi, srcLo; +//ZZ HReg dst = newVRegD(env); +//ZZ iselInt64Expr(&srcHi, &srcLo, env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_VXferD(True/*toD*/, dst, srcHi, srcLo)); +//ZZ return dst; +//ZZ } +//ZZ } + case Iop_NegF64: { + HReg src = iselDblExpr(env, e->Iex.Unop.arg); + HReg dst = newVRegD(env); + addInstr(env, ARM64Instr_VUnaryD(ARM64fpu_NEG, dst, src)); + return dst; + } + case Iop_AbsF64: { + HReg src = iselDblExpr(env, e->Iex.Unop.arg); + HReg dst = newVRegD(env); + addInstr(env, ARM64Instr_VUnaryD(ARM64fpu_ABS, dst, src)); + return dst; + } + case Iop_F32toF64: { + HReg src = iselFltExpr(env, e->Iex.Unop.arg); + HReg dst = newVRegD(env); + addInstr(env, ARM64Instr_VCvtSD(True/*sToD*/, dst, src)); + return dst; + } + case Iop_I32UtoF64: + case Iop_I32StoF64: { + /* Rounding mode is not involved here, since the + conversion can always be done without loss of + precision. */ + HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); + HReg dst = newVRegD(env); + Bool syned = e->Iex.Unop.op == Iop_I32StoF64; + ARM64CvtOp cvt_op = syned ? ARM64cvt_F64_I32S : ARM64cvt_F64_I32U; + addInstr(env, ARM64Instr_VCvtI2F(cvt_op, dst, src)); + return dst; + } + default: + break; + } + } + + if (e->tag == Iex_Binop) { + switch (e->Iex.Binop.op) { + case Iop_RoundF64toInt: { + HReg src = iselDblExpr(env, e->Iex.Binop.arg2); + HReg dst = newVRegD(env); + set_FPCR_rounding_mode(env, e->Iex.Binop.arg1); + addInstr(env, ARM64Instr_VUnaryD(ARM64fpu_RINT, dst, src)); + return dst; + } + case Iop_SqrtF64: { + HReg src = iselDblExpr(env, e->Iex.Binop.arg2); + HReg dst = newVRegD(env); + set_FPCR_rounding_mode(env, e->Iex.Binop.arg1); + addInstr(env, ARM64Instr_VUnaryD(ARM64fpu_SQRT, dst, src)); + return dst; + } + case Iop_I64StoF64: + case Iop_I64UtoF64: { + ARM64CvtOp cvt_op = e->Iex.Binop.op == Iop_I64StoF64 + ? 
ARM64cvt_F64_I64S : ARM64cvt_F64_I64U; + HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2); + set_FPCR_rounding_mode(env, e->Iex.Binop.arg1); + HReg dstS = newVRegD(env); + addInstr(env, ARM64Instr_VCvtI2F(cvt_op, dstS, srcI)); + return dstS; + } + default: + break; + } + } + + if (e->tag == Iex_Triop) { + IRTriop* triop = e->Iex.Triop.details; + ARM64FpBinOp dblop = ARM64fpb_INVALID; + switch (triop->op) { + case Iop_DivF64: dblop = ARM64fpb_DIV; break; + case Iop_MulF64: dblop = ARM64fpb_MUL; break; + case Iop_SubF64: dblop = ARM64fpb_SUB; break; + case Iop_AddF64: dblop = ARM64fpb_ADD; break; + default: break; + } + if (dblop != ARM64fpb_INVALID) { + HReg argL = iselDblExpr(env, triop->arg2); + HReg argR = iselDblExpr(env, triop->arg3); + HReg dst = newVRegD(env); + set_FPCR_rounding_mode(env, triop->arg1); + addInstr(env, ARM64Instr_VBinD(dblop, dst, argL, argR)); + return dst; + } + } + +//ZZ if (e->tag == Iex_ITE) { // VFD +//ZZ if (ty == Ity_F64 +//ZZ && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) { +//ZZ HReg r1 = iselDblExpr(env, e->Iex.ITE.iftrue); +//ZZ HReg r0 = iselDblExpr(env, e->Iex.ITE.iffalse); +//ZZ HReg dst = newVRegD(env); +//ZZ addInstr(env, ARMInstr_VUnaryD(ARMvfpu_COPY, dst, r1)); +//ZZ ARMCondCode cc = iselCondCode(env, e->Iex.ITE.cond); +//ZZ addInstr(env, ARMInstr_VCMovD(cc ^ 1, dst, r0)); +//ZZ return dst; +//ZZ } +//ZZ } + + ppIRExpr(e); + vpanic("iselDblExpr_wrk"); +} + + +/*---------------------------------------------------------*/ +/*--- ISEL: Floating point expressions (32 bit) ---*/ +/*---------------------------------------------------------*/ + +/* Compute a 32-bit floating point value into a register, the identity + of which is returned. As with iselIntExpr_R, the reg may be either + real or virtual; in any case it must not be changed by subsequent + code emitted by the caller. Values are generated into HRcFlt64 + registers despite the values themselves being Ity_F32s. */ + +static HReg iselFltExpr ( ISelEnv* env, IRExpr* e ) +{ + HReg r = iselFltExpr_wrk( env, e ); +# if 0 + vex_printf("\n"); ppIRExpr(e); vex_printf("\n"); +# endif + vassert(hregClass(r) == HRcFlt64); + vassert(hregIsVirtual(r)); + return r; +} + +/* DO NOT CALL THIS DIRECTLY */ +static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e ) +{ + IRType ty = typeOfIRExpr(env->type_env,e); + vassert(e); + vassert(ty == Ity_F32); + + if (e->tag == Iex_RdTmp) { + return lookupIRTemp(env, e->Iex.RdTmp.tmp); + } + + if (e->tag == Iex_Const) { + /* This is something of a kludge. Since a 32 bit floating point + zero is just .. all zeroes, just create a 64 bit zero word + and transfer it. This avoids having to create a SfromW + instruction for this specific case. 
*/ + IRConst* con = e->Iex.Const.con; + if (con->tag == Ico_F32i && con->Ico.F32i == 0) { + HReg src = newVRegI(env); + HReg dst = newVRegD(env); + addInstr(env, ARM64Instr_Imm64(src, 0)); + addInstr(env, ARM64Instr_VDfromX(dst, src)); + return dst; + } + } + +//ZZ if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) { +//ZZ ARMAModeV* am; +//ZZ HReg res = newVRegF(env); +//ZZ vassert(e->Iex.Load.ty == Ity_F32); +//ZZ am = iselIntExpr_AModeV(env, e->Iex.Load.addr); +//ZZ addInstr(env, ARMInstr_VLdStS(True/*isLoad*/, res, am)); +//ZZ return res; +//ZZ } + + if (e->tag == Iex_Get) { + Int offs = e->Iex.Get.offset; + if (offs >= 0 && offs < 16384 && 0 == (offs & 3)) { + HReg rD = newVRegD(env); + HReg rN = get_baseblock_register(); + addInstr(env, ARM64Instr_VLdStS(True/*isLoad*/, rD, rN, offs)); + return rD; + } + } + + if (e->tag == Iex_Unop) { + switch (e->Iex.Unop.op) { +//ZZ case Iop_ReinterpI32asF32: { +//ZZ HReg dst = newVRegF(env); +//ZZ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); +//ZZ addInstr(env, ARMInstr_VXferS(True/*toS*/, dst, src)); +//ZZ return dst; +//ZZ } + case Iop_NegF32: { + HReg src = iselFltExpr(env, e->Iex.Unop.arg); + HReg dst = newVRegD(env); + addInstr(env, ARM64Instr_VUnaryS(ARM64fpu_NEG, dst, src)); + return dst; + } + case Iop_AbsF32: { + HReg src = iselFltExpr(env, e->Iex.Unop.arg); + HReg dst = newVRegD(env); + addInstr(env, ARM64Instr_VUnaryS(ARM64fpu_ABS, dst, src)); + return dst; + } + default: + break; + } + } + + if (e->tag == Iex_Binop) { + switch (e->Iex.Binop.op) { + case Iop_RoundF32toInt: { + HReg src = iselFltExpr(env, e->Iex.Binop.arg2); + HReg dst = newVRegD(env); + set_FPCR_rounding_mode(env, e->Iex.Binop.arg1); + addInstr(env, ARM64Instr_VUnaryS(ARM64fpu_RINT, dst, src)); + return dst; + } + case Iop_SqrtF32: { + HReg src = iselFltExpr(env, e->Iex.Binop.arg2); + HReg dst = newVRegD(env); + set_FPCR_rounding_mode(env, e->Iex.Binop.arg1); + addInstr(env, ARM64Instr_VUnaryS(ARM64fpu_SQRT, dst, src)); + return dst; + } + case Iop_F64toF32: { + HReg srcD = iselDblExpr(env, e->Iex.Binop.arg2); + set_FPCR_rounding_mode(env, e->Iex.Binop.arg1); + HReg dstS = newVRegD(env); + addInstr(env, ARM64Instr_VCvtSD(False/*dToS*/, dstS, srcD)); + return dstS; + } + case Iop_I32UtoF32: + case Iop_I32StoF32: + case Iop_I64UtoF32: + case Iop_I64StoF32: { + ARM64CvtOp cvt_op = ARM64cvt_INVALID; + switch (e->Iex.Binop.op) { + case Iop_I32UtoF32: cvt_op = ARM64cvt_F32_I32U; break; + case Iop_I32StoF32: cvt_op = ARM64cvt_F32_I32S; break; + case Iop_I64UtoF32: cvt_op = ARM64cvt_F32_I64U; break; + case Iop_I64StoF32: cvt_op = ARM64cvt_F32_I64S; break; + default: vassert(0); + } + HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2); + set_FPCR_rounding_mode(env, e->Iex.Binop.arg1); + HReg dstS = newVRegD(env); + addInstr(env, ARM64Instr_VCvtI2F(cvt_op, dstS, srcI)); + return dstS; + } + default: + break; + } + } + + if (e->tag == Iex_Triop) { + IRTriop* triop = e->Iex.Triop.details; + ARM64FpBinOp sglop = ARM64fpb_INVALID; + switch (triop->op) { + case Iop_DivF32: sglop = ARM64fpb_DIV; break; + case Iop_MulF32: sglop = ARM64fpb_MUL; break; + case Iop_SubF32: sglop = ARM64fpb_SUB; break; + case Iop_AddF32: sglop = ARM64fpb_ADD; break; + default: break; + } + if (sglop != ARM64fpb_INVALID) { + HReg argL = iselFltExpr(env, triop->arg2); + HReg argR = iselFltExpr(env, triop->arg3); + HReg dst = newVRegD(env); + set_FPCR_rounding_mode(env, triop->arg1); + addInstr(env, ARM64Instr_VBinS(sglop, dst, argL, argR)); + return dst; + } + } + +//ZZ +//ZZ if (e->tag == Iex_ITE) { // 
VFD +//ZZ if (ty == Ity_F32 +//ZZ && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) { +//ZZ ARMCondCode cc; +//ZZ HReg r1 = iselFltExpr(env, e->Iex.ITE.iftrue); +//ZZ HReg r0 = iselFltExpr(env, e->Iex.ITE.iffalse); +//ZZ HReg dst = newVRegF(env); +//ZZ addInstr(env, ARMInstr_VUnaryS(ARMvfpu_COPY, dst, r1)); +//ZZ cc = iselCondCode(env, e->Iex.ITE.cond); +//ZZ addInstr(env, ARMInstr_VCMovS(cc ^ 1, dst, r0)); +//ZZ return dst; +//ZZ } +//ZZ } + + ppIRExpr(e); + vpanic("iselFltExpr_wrk"); +} + + +/*---------------------------------------------------------*/ +/*--- ISEL: Statements ---*/ +/*---------------------------------------------------------*/ + +static void iselStmt ( ISelEnv* env, IRStmt* stmt ) +{ + if (vex_traceflags & VEX_TRACE_VCODE) { + vex_printf("\n-- "); + ppIRStmt(stmt); + vex_printf("\n"); + } + switch (stmt->tag) { + + /* --------- STORE --------- */ + /* little-endian write to memory */ + case Ist_Store: { + IRType tya = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr); + IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.Store.data); + IREndness end = stmt->Ist.Store.end; + + if (tya != Ity_I64 || end != Iend_LE) + goto stmt_fail; + + if (tyd == Ity_I64) { + HReg rD = iselIntExpr_R(env, stmt->Ist.Store.data); + ARM64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr, tyd); + addInstr(env, ARM64Instr_LdSt64(False/*!isLoad*/, rD, am)); + return; + } + if (tyd == Ity_I32) { + HReg rD = iselIntExpr_R(env, stmt->Ist.Store.data); + ARM64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr, tyd); + addInstr(env, ARM64Instr_LdSt32(False/*!isLoad*/, rD, am)); + return; + } + if (tyd == Ity_I16) { + HReg rD = iselIntExpr_R(env, stmt->Ist.Store.data); + ARM64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr, tyd); + addInstr(env, ARM64Instr_LdSt16(False/*!isLoad*/, rD, am)); + return; + } + if (tyd == Ity_I8) { + HReg rD = iselIntExpr_R(env, stmt->Ist.Store.data); + ARM64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr, tyd); + addInstr(env, ARM64Instr_LdSt8(False/*!isLoad*/, rD, am)); + return; + } + if (tyd == Ity_V128) { + HReg qD = iselV128Expr(env, stmt->Ist.Store.data); + HReg addr = iselIntExpr_R(env, stmt->Ist.Store.addr); + addInstr(env, ARM64Instr_VLdStQ(False/*!isLoad*/, qD, addr)); + return; + } + if (tyd == Ity_F64) { + HReg dD = iselDblExpr(env, stmt->Ist.Store.data); + HReg addr = iselIntExpr_R(env, stmt->Ist.Store.addr); + addInstr(env, ARM64Instr_VLdStD(False/*!isLoad*/, dD, addr, 0)); + return; + } + if (tyd == Ity_F32) { + HReg sD = iselFltExpr(env, stmt->Ist.Store.data); + HReg addr = iselIntExpr_R(env, stmt->Ist.Store.addr); + addInstr(env, ARM64Instr_VLdStS(False/*!isLoad*/, sD, addr, 0)); + return; + } + +//ZZ if (tyd == Ity_I16) { +//ZZ HReg rD = iselIntExpr_R(env, stmt->Ist.Store.data); +//ZZ ARMAMode2* am = iselIntExpr_AMode2(env, stmt->Ist.Store.addr); +//ZZ addInstr(env, ARMInstr_LdSt16(ARMcc_AL, +//ZZ False/*!isLoad*/, +//ZZ False/*!isSignedLoad*/, rD, am)); +//ZZ return; +//ZZ } +//ZZ if (tyd == Ity_I8) { +//ZZ HReg rD = iselIntExpr_R(env, stmt->Ist.Store.data); +//ZZ ARMAMode1* am = iselIntExpr_AMode1(env, stmt->Ist.Store.addr); +//ZZ addInstr(env, ARMInstr_LdSt8U(ARMcc_AL, False/*!isLoad*/, rD, am)); +//ZZ return; +//ZZ } +//ZZ if (tyd == Ity_I64) { +//ZZ if (env->hwcaps & VEX_HWCAPS_ARM_NEON) { +//ZZ HReg dD = iselNeon64Expr(env, stmt->Ist.Store.data); +//ZZ ARMAModeN* am = iselIntExpr_AModeN(env, stmt->Ist.Store.addr); +//ZZ addInstr(env, ARMInstr_NLdStD(False, dD, am)); +//ZZ } else { +//ZZ HReg rDhi, rDlo, rA; +//ZZ 
iselInt64Expr(&rDhi, &rDlo, env, stmt->Ist.Store.data); +//ZZ rA = iselIntExpr_R(env, stmt->Ist.Store.addr); +//ZZ addInstr(env, ARMInstr_LdSt32(ARMcc_AL, False/*!load*/, rDhi, +//ZZ ARMAMode1_RI(rA,4))); +//ZZ addInstr(env, ARMInstr_LdSt32(ARMcc_AL, False/*!load*/, rDlo, +//ZZ ARMAMode1_RI(rA,0))); +//ZZ } +//ZZ return; +//ZZ } +//ZZ if (tyd == Ity_F64) { +//ZZ HReg dD = iselDblExpr(env, stmt->Ist.Store.data); +//ZZ ARMAModeV* am = iselIntExpr_AModeV(env, stmt->Ist.Store.addr); +//ZZ addInstr(env, ARMInstr_VLdStD(False/*!isLoad*/, dD, am)); +//ZZ return; +//ZZ } +//ZZ if (tyd == Ity_F32) { +//ZZ HReg fD = iselFltExpr(env, stmt->Ist.Store.data); +//ZZ ARMAModeV* am = iselIntExpr_AModeV(env, stmt->Ist.Store.addr); +//ZZ addInstr(env, ARMInstr_VLdStS(False/*!isLoad*/, fD, am)); +//ZZ return; +//ZZ } +//ZZ if (tyd == Ity_V128) { +//ZZ HReg qD = iselNeonExpr(env, stmt->Ist.Store.data); +//ZZ ARMAModeN* am = iselIntExpr_AModeN(env, stmt->Ist.Store.addr); +//ZZ addInstr(env, ARMInstr_NLdStQ(False, qD, am)); +//ZZ return; +//ZZ } + + break; + } + +//ZZ /* --------- CONDITIONAL STORE --------- */ +//ZZ /* conditional little-endian write to memory */ +//ZZ case Ist_StoreG: { +//ZZ IRStoreG* sg = stmt->Ist.StoreG.details; +//ZZ IRType tya = typeOfIRExpr(env->type_env, sg->addr); +//ZZ IRType tyd = typeOfIRExpr(env->type_env, sg->data); +//ZZ IREndness end = sg->end; +//ZZ +//ZZ if (tya != Ity_I32 || end != Iend_LE) +//ZZ goto stmt_fail; +//ZZ +//ZZ switch (tyd) { +//ZZ case Ity_I8: +//ZZ case Ity_I32: { +//ZZ HReg rD = iselIntExpr_R(env, sg->data); +//ZZ ARMAMode1* am = iselIntExpr_AMode1(env, sg->addr); +//ZZ ARMCondCode cc = iselCondCode(env, sg->guard); +//ZZ addInstr(env, (tyd == Ity_I32 ? ARMInstr_LdSt32 : ARMInstr_LdSt8U) +//ZZ (cc, False/*!isLoad*/, rD, am)); +//ZZ return; +//ZZ } +//ZZ case Ity_I16: { +//ZZ HReg rD = iselIntExpr_R(env, sg->data); +//ZZ ARMAMode2* am = iselIntExpr_AMode2(env, sg->addr); +//ZZ ARMCondCode cc = iselCondCode(env, sg->guard); +//ZZ addInstr(env, ARMInstr_LdSt16(cc, +//ZZ False/*!isLoad*/, +//ZZ False/*!isSignedLoad*/, rD, am)); +//ZZ return; +//ZZ } +//ZZ default: +//ZZ break; +//ZZ } +//ZZ break; +//ZZ } +//ZZ +//ZZ /* --------- CONDITIONAL LOAD --------- */ +//ZZ /* conditional little-endian load from memory */ +//ZZ case Ist_LoadG: { +//ZZ IRLoadG* lg = stmt->Ist.LoadG.details; +//ZZ IRType tya = typeOfIRExpr(env->type_env, lg->addr); +//ZZ IREndness end = lg->end; +//ZZ +//ZZ if (tya != Ity_I32 || end != Iend_LE) +//ZZ goto stmt_fail; +//ZZ +//ZZ switch (lg->cvt) { +//ZZ case ILGop_8Uto32: +//ZZ case ILGop_Ident32: { +//ZZ HReg rAlt = iselIntExpr_R(env, lg->alt); +//ZZ ARMAMode1* am = iselIntExpr_AMode1(env, lg->addr); +//ZZ HReg rD = lookupIRTemp(env, lg->dst); +//ZZ addInstr(env, mk_iMOVds_RR(rD, rAlt)); +//ZZ ARMCondCode cc = iselCondCode(env, lg->guard); +//ZZ addInstr(env, (lg->cvt == ILGop_Ident32 ? 
ARMInstr_LdSt32 +//ZZ : ARMInstr_LdSt8U) +//ZZ (cc, True/*isLoad*/, rD, am)); +//ZZ return; +//ZZ } +//ZZ case ILGop_16Sto32: +//ZZ case ILGop_16Uto32: +//ZZ case ILGop_8Sto32: { +//ZZ HReg rAlt = iselIntExpr_R(env, lg->alt); +//ZZ ARMAMode2* am = iselIntExpr_AMode2(env, lg->addr); +//ZZ HReg rD = lookupIRTemp(env, lg->dst); +//ZZ addInstr(env, mk_iMOVds_RR(rD, rAlt)); +//ZZ ARMCondCode cc = iselCondCode(env, lg->guard); +//ZZ if (lg->cvt == ILGop_8Sto32) { +//ZZ addInstr(env, ARMInstr_Ld8S(cc, rD, am)); +//ZZ } else { +//ZZ vassert(lg->cvt == ILGop_16Sto32 || lg->cvt == ILGop_16Uto32); +//ZZ Bool sx = lg->cvt == ILGop_16Sto32; +//ZZ addInstr(env, ARMInstr_LdSt16(cc, True/*isLoad*/, sx, rD, am)); +//ZZ } +//ZZ return; +//ZZ } +//ZZ default: +//ZZ break; +//ZZ } +//ZZ break; +//ZZ } + + /* --------- PUT --------- */ + /* write guest state, fixed offset */ + case Ist_Put: { + IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.Put.data); + UInt offs = (UInt)stmt->Ist.Put.offset; + if (tyd == Ity_I64 && 0 == (offs & 7) && offs < (8<<12)) { + HReg rD = iselIntExpr_R(env, stmt->Ist.Put.data); + ARM64AMode* am = mk_baseblock_64bit_access_amode(offs); + addInstr(env, ARM64Instr_LdSt64(False/*!isLoad*/, rD, am)); + return; + } + if (tyd == Ity_I32 && 0 == (offs & 3) && offs < (4<<12)) { + HReg rD = iselIntExpr_R(env, stmt->Ist.Put.data); + ARM64AMode* am = mk_baseblock_32bit_access_amode(offs); + addInstr(env, ARM64Instr_LdSt32(False/*!isLoad*/, rD, am)); + return; + } + if (tyd == Ity_I16 && 0 == (offs & 1) && offs < (2<<12)) { + HReg rD = iselIntExpr_R(env, stmt->Ist.Put.data); + ARM64AMode* am = mk_baseblock_16bit_access_amode(offs); + addInstr(env, ARM64Instr_LdSt16(False/*!isLoad*/, rD, am)); + return; + } + if (tyd == Ity_I8 && offs < (1<<12)) { + HReg rD = iselIntExpr_R(env, stmt->Ist.Put.data); + ARM64AMode* am = mk_baseblock_8bit_access_amode(offs); + addInstr(env, ARM64Instr_LdSt8(False/*!isLoad*/, rD, am)); + return; + } + if (tyd == Ity_V128 && offs < (1<<12)) { + HReg qD = iselV128Expr(env, stmt->Ist.Put.data); + HReg addr = mk_baseblock_128bit_access_addr(env, offs); + addInstr(env, ARM64Instr_VLdStQ(False/*!isLoad*/, qD, addr)); + return; + } + if (tyd == Ity_F64 && 0 == (offs & 7) && offs < (8<<12)) { + HReg dD = iselDblExpr(env, stmt->Ist.Put.data); + HReg bbp = get_baseblock_register(); + addInstr(env, ARM64Instr_VLdStD(False/*!isLoad*/, dD, bbp, offs)); + return; + } + if (tyd == Ity_F32 && 0 == (offs & 3) && offs < (4<<12)) { + HReg dD = iselFltExpr(env, stmt->Ist.Put.data); + HReg bbp = get_baseblock_register(); + addInstr(env, ARM64Instr_VLdStS(False/*!isLoad*/, dD, bbp, offs)); + return; + } + +//ZZ if (tyd == Ity_I64) { +//ZZ if (env->hwcaps & VEX_HWCAPS_ARM_NEON) { +//ZZ HReg addr = newVRegI(env); +//ZZ HReg qD = iselNeon64Expr(env, stmt->Ist.Put.data); +//ZZ addInstr(env, ARMInstr_Add32(addr, hregARM_R8(), +//ZZ stmt->Ist.Put.offset)); +//ZZ addInstr(env, ARMInstr_NLdStD(False, qD, mkARMAModeN_R(addr))); +//ZZ } else { +//ZZ HReg rDhi, rDlo; +//ZZ ARMAMode1* am0 = ARMAMode1_RI(hregARM_R8(), +//ZZ stmt->Ist.Put.offset + 0); +//ZZ ARMAMode1* am4 = ARMAMode1_RI(hregARM_R8(), +//ZZ stmt->Ist.Put.offset + 4); +//ZZ iselInt64Expr(&rDhi, &rDlo, env, stmt->Ist.Put.data); +//ZZ addInstr(env, ARMInstr_LdSt32(ARMcc_AL, False/*!isLoad*/, +//ZZ rDhi, am4)); +//ZZ addInstr(env, ARMInstr_LdSt32(ARMcc_AL, False/*!isLoad*/, +//ZZ rDlo, am0)); +//ZZ } +//ZZ return; +//ZZ } +//ZZ if (tyd == Ity_F64) { +//ZZ // XXX This won't work if offset > 1020 or is not 0 % 4. 
+//ZZ // In which case we'll have to generate more longwinded code. +//ZZ ARMAModeV* am = mkARMAModeV(hregARM_R8(), stmt->Ist.Put.offset); +//ZZ HReg rD = iselDblExpr(env, stmt->Ist.Put.data); +//ZZ addInstr(env, ARMInstr_VLdStD(False/*!isLoad*/, rD, am)); +//ZZ return; +//ZZ } +//ZZ if (tyd == Ity_F32) { +//ZZ // XXX This won't work if offset > 1020 or is not 0 % 4. +//ZZ // In which case we'll have to generate more longwinded code. +//ZZ ARMAModeV* am = mkARMAModeV(hregARM_R8(), stmt->Ist.Put.offset); +//ZZ HReg rD = iselFltExpr(env, stmt->Ist.Put.data); +//ZZ addInstr(env, ARMInstr_VLdStS(False/*!isLoad*/, rD, am)); +//ZZ return; +//ZZ } + break; + } + + /* --------- TMP --------- */ + /* assign value to temporary */ + case Ist_WrTmp: { + IRTemp tmp = stmt->Ist.WrTmp.tmp; + IRType ty = typeOfIRTemp(env->type_env, tmp); + + if (ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8) { + /* We could do a lot better here. But for the time being: */ + HReg dst = lookupIRTemp(env, tmp); + HReg rD = iselIntExpr_R(env, stmt->Ist.WrTmp.data); + addInstr(env, ARM64Instr_MovI(dst, rD)); + return; + } + if (ty == Ity_I1) { + /* Here, we are generating a I1 value into a 64 bit register. + Make sure the value in the register is only zero or one, + but no other. This allows optimisation of the + 1Uto64(tmp:I1) case, by making it simply a copy of the + register holding 'tmp'. The point being that the value in + the register holding 'tmp' can only have been created + here. LATER: that seems dangerous; safer to do 'tmp & 1' + in that case. Also, could do this just with a single CINC + insn. */ + /* CLONE-01 */ + HReg zero = newVRegI(env); + HReg one = newVRegI(env); + HReg dst = lookupIRTemp(env, tmp); + addInstr(env, ARM64Instr_Imm64(zero, 0)); + addInstr(env, ARM64Instr_Imm64(one, 1)); + ARM64CondCode cc = iselCondCode(env, stmt->Ist.WrTmp.data); + addInstr(env, ARM64Instr_CSel(dst, one, zero, cc)); + return; + } + if (ty == Ity_F64) { + HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data); + HReg dst = lookupIRTemp(env, tmp); + addInstr(env, ARM64Instr_VMov(8, dst, src)); + return; + } + if (ty == Ity_F32) { + HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data); + HReg dst = lookupIRTemp(env, tmp); + addInstr(env, ARM64Instr_VMov(8/*yes, really*/, dst, src)); + return; + } + if (ty == Ity_V128) { + HReg src = iselV128Expr(env, stmt->Ist.WrTmp.data); + HReg dst = lookupIRTemp(env, tmp); + addInstr(env, ARM64Instr_VMov(16, dst, src)); + return; + } + break; + } + + /* --------- Call to DIRTY helper --------- */ + /* call complex ("dirty") helper function */ + case Ist_Dirty: { + IRDirty* d = stmt->Ist.Dirty.details; + + /* Figure out the return type, if any. */ + IRType retty = Ity_INVALID; + if (d->tmp != IRTemp_INVALID) + retty = typeOfIRTemp(env->type_env, d->tmp); + + Bool retty_ok = False; + switch (retty) { + case Ity_INVALID: /* function doesn't return anything */ + case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: + case Ity_V128: + retty_ok = True; break; + default: + break; + } + if (!retty_ok) + break; /* will go to stmt_fail: */ + + /* Marshal args, do the call, and set the return value to 0x555..555 + if this is a conditional call that returns a value and the + call is skipped. */ + UInt addToSp = 0; + RetLoc rloc = mk_RetLoc_INVALID(); + doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args ); + vassert(is_sane_RetLoc(rloc)); + + /* Now figure out what to do with the returned value, if any. */ + switch (retty) { + case Ity_INVALID: { + /* No return value. 
Nothing to do. */ + vassert(d->tmp == IRTemp_INVALID); + vassert(rloc.pri == RLPri_None); + vassert(addToSp == 0); + return; + } + case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: { + vassert(rloc.pri == RLPri_Int); + vassert(addToSp == 0); + /* The returned value is in x0. Park it in the register + associated with tmp. */ + HReg dst = lookupIRTemp(env, d->tmp); + addInstr(env, ARM64Instr_MovI(dst, hregARM64_X0()) ); + return; + } + case Ity_V128: { + /* The returned value is on the stack, and *retloc tells + us where. Fish it off the stack and then move the + stack pointer upwards to clear it, as directed by + doHelperCall. */ + vassert(rloc.pri == RLPri_V128SpRel); + vassert(rloc.spOff < 256); // stay sane + vassert(addToSp >= 16); // ditto + vassert(addToSp < 256); // ditto + HReg dst = lookupIRTemp(env, d->tmp); + HReg tmp = newVRegI(env); // the address of the returned value + addInstr(env, ARM64Instr_FromSP(tmp)); // tmp = SP + addInstr(env, ARM64Instr_Arith(tmp, tmp, + ARM64RIA_I12((UShort)rloc.spOff, 0), + True/*isAdd*/ )); + addInstr(env, ARM64Instr_VLdStQ(True/*isLoad*/, dst, tmp)); + addInstr(env, ARM64Instr_AddToSP(addToSp)); + return; + } + default: + /*NOTREACHED*/ + vassert(0); + } + break; + } + + /* --------- Load Linked and Store Conditional --------- */ + case Ist_LLSC: { + if (stmt->Ist.LLSC.storedata == NULL) { + /* LL */ + IRTemp res = stmt->Ist.LLSC.result; + IRType ty = typeOfIRTemp(env->type_env, res); + if (ty == Ity_I64 || ty == Ity_I32 + || ty == Ity_I16 || ty == Ity_I8) { + Int szB = 0; + HReg r_dst = lookupIRTemp(env, res); + HReg raddr = iselIntExpr_R(env, stmt->Ist.LLSC.addr); + switch (ty) { + case Ity_I8: szB = 1; break; + case Ity_I16: szB = 2; break; + case Ity_I32: szB = 4; break; + case Ity_I64: szB = 8; break; + default: vassert(0); + } + addInstr(env, ARM64Instr_MovI(hregARM64_X4(), raddr)); + addInstr(env, ARM64Instr_LdrEX(szB)); + addInstr(env, ARM64Instr_MovI(r_dst, hregARM64_X2())); + return; + } + goto stmt_fail; + } else { + /* SC */ + IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.LLSC.storedata); + if (tyd == Ity_I64 || tyd == Ity_I32 + || tyd == Ity_I16 || tyd == Ity_I8) { + Int szB = 0; + HReg rD = iselIntExpr_R(env, stmt->Ist.LLSC.storedata); + HReg rA = iselIntExpr_R(env, stmt->Ist.LLSC.addr); + switch (tyd) { + case Ity_I8: szB = 1; break; + case Ity_I16: szB = 2; break; + case Ity_I32: szB = 4; break; + case Ity_I64: szB = 8; break; + default: vassert(0); + } + addInstr(env, ARM64Instr_MovI(hregARM64_X2(), rD)); + addInstr(env, ARM64Instr_MovI(hregARM64_X4(), rA)); + addInstr(env, ARM64Instr_StrEX(szB)); + } else { + goto stmt_fail; + } + /* now r0 is 1 if failed, 0 if success. Change to IR + conventions (0 is fail, 1 is success). Also transfer + result to r_res. */ + IRTemp res = stmt->Ist.LLSC.result; + IRType ty = typeOfIRTemp(env->type_env, res); + HReg r_res = lookupIRTemp(env, res); + ARM64RIL* one = mb_mkARM64RIL_I(1); + vassert(ty == Ity_I1); + vassert(one); + addInstr(env, ARM64Instr_Logic(r_res, hregARM64_X0(), one, + ARM64lo_XOR)); + /* And be conservative -- mask off all but the lowest bit. 
*/ + addInstr(env, ARM64Instr_Logic(r_res, r_res, one, + ARM64lo_AND)); + return; + } + break; + } + + /* --------- MEM FENCE --------- */ + case Ist_MBE: + switch (stmt->Ist.MBE.event) { + case Imbe_Fence: + addInstr(env, ARM64Instr_MFence()); + return; +//ZZ case Imbe_CancelReservation: +//ZZ addInstr(env, ARMInstr_CLREX()); +//ZZ return; + default: + break; + } + break; + + /* --------- INSTR MARK --------- */ + /* Doesn't generate any executable code ... */ + case Ist_IMark: + return; + + /* --------- NO-OP --------- */ + case Ist_NoOp: + return; + + /* --------- EXIT --------- */ + case Ist_Exit: { + if (stmt->Ist.Exit.dst->tag != Ico_U64) + vpanic("isel_arm: Ist_Exit: dst is not a 64-bit value"); + + ARM64CondCode cc + = iselCondCode(env, stmt->Ist.Exit.guard); + ARM64AMode* amPC + = mk_baseblock_64bit_access_amode(stmt->Ist.Exit.offsIP); + + /* Case: boring transfer to known address */ + if (stmt->Ist.Exit.jk == Ijk_Boring + /*ATC || stmt->Ist.Exit.jk == Ijk_Call */ + /*ATC || stmt->Ist.Exit.jk == Ijk_Ret */ ) { + if (env->chainingAllowed) { + /* .. almost always true .. */ + /* Skip the event check at the dst if this is a forwards + edge. */ + Bool toFastEP + = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga; + if (0) vex_printf("%s", toFastEP ? "Y" : ","); + addInstr(env, ARM64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64, + amPC, cc, toFastEP)); + } else { + /* .. very occasionally .. */ + /* We can't use chaining, so ask for an assisted transfer, + as that's the only alternative that is allowable. */ + HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst)); + addInstr(env, ARM64Instr_XAssisted(r, amPC, cc, Ijk_Boring)); + } + return; + } + +//ZZ /* Case: assisted transfer to arbitrary address */ +//ZZ switch (stmt->Ist.Exit.jk) { +//ZZ /* Keep this list in sync with that in iselNext below */ +//ZZ case Ijk_ClientReq: +//ZZ case Ijk_NoDecode: +//ZZ case Ijk_NoRedir: +//ZZ case Ijk_Sys_syscall: +//ZZ case Ijk_InvalICache: +//ZZ case Ijk_Yield: +//ZZ { +//ZZ HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst)); +//ZZ addInstr(env, ARMInstr_XAssisted(r, amR15T, cc, +//ZZ stmt->Ist.Exit.jk)); +//ZZ return; +//ZZ } +//ZZ default: +//ZZ break; +//ZZ } + + /* Do we ever expect to see any other kind? */ + goto stmt_fail; + } + + default: break; + } + stmt_fail: + ppIRStmt(stmt); + vpanic("iselStmt"); +} + + +/*---------------------------------------------------------*/ +/*--- ISEL: Basic block terminators (Nexts) ---*/ +/*---------------------------------------------------------*/ + +static void iselNext ( ISelEnv* env, + IRExpr* next, IRJumpKind jk, Int offsIP ) +{ + if (vex_traceflags & VEX_TRACE_VCODE) { + vex_printf( "\n-- PUT(%d) = ", offsIP); + ppIRExpr( next ); + vex_printf( "; exit-"); + ppIRJumpKind(jk); + vex_printf( "\n"); + } + + /* Case: boring transfer to known address */ + if (next->tag == Iex_Const) { + IRConst* cdst = next->Iex.Const.con; + vassert(cdst->tag == Ico_U64); + if (jk == Ijk_Boring || jk == Ijk_Call) { + /* Boring transfer to known address */ + ARM64AMode* amPC = mk_baseblock_64bit_access_amode(offsIP); + if (env->chainingAllowed) { + /* .. almost always true .. */ + /* Skip the event check at the dst if this is a forwards + edge. */ + Bool toFastEP + = ((Addr64)cdst->Ico.U64) > env->max_ga; + if (0) vex_printf("%s", toFastEP ? "X" : "."); + addInstr(env, ARM64Instr_XDirect(cdst->Ico.U64, + amPC, ARM64cc_AL, + toFastEP)); + } else { + /* .. very occasionally .. 
*/ + /* We can't use chaining, so ask for an assisted transfer, + as that's the only alternative that is allowable. */ + HReg r = iselIntExpr_R(env, next); + addInstr(env, ARM64Instr_XAssisted(r, amPC, ARM64cc_AL, + Ijk_Boring)); + } + return; + } + } + + /* Case: call/return (==boring) transfer to any address */ + switch (jk) { + case Ijk_Boring: case Ijk_Ret: case Ijk_Call: { + HReg r = iselIntExpr_R(env, next); + ARM64AMode* amPC = mk_baseblock_64bit_access_amode(offsIP); + if (env->chainingAllowed) { + addInstr(env, ARM64Instr_XIndir(r, amPC, ARM64cc_AL)); + } else { + addInstr(env, ARM64Instr_XAssisted(r, amPC, ARM64cc_AL, + Ijk_Boring)); + } + return; + } + default: + break; + } + + /* Case: assisted transfer to arbitrary address */ + switch (jk) { + /* Keep this list in sync with that for Ist_Exit above */ + case Ijk_ClientReq: + case Ijk_NoDecode: + case Ijk_NoRedir: + case Ijk_Sys_syscall: + case Ijk_InvalICache: + case Ijk_FlushDCache: +//ZZ case Ijk_Yield: + { + HReg r = iselIntExpr_R(env, next); + ARM64AMode* amPC = mk_baseblock_64bit_access_amode(offsIP); + addInstr(env, ARM64Instr_XAssisted(r, amPC, ARM64cc_AL, jk)); + return; + } + default: + break; + } + + vex_printf( "\n-- PUT(%d) = ", offsIP); + ppIRExpr( next ); + vex_printf( "; exit-"); + ppIRJumpKind(jk); + vex_printf( "\n"); + vassert(0); // are we expecting any other kind? +} + + +/*---------------------------------------------------------*/ +/*--- Insn selector top-level ---*/ +/*---------------------------------------------------------*/ + +/* Translate an entire SB to arm64 code. */ + +HInstrArray* iselSB_ARM64 ( IRSB* bb, + VexArch arch_host, + VexArchInfo* archinfo_host, + VexAbiInfo* vbi/*UNUSED*/, + Int offs_Host_EvC_Counter, + Int offs_Host_EvC_FailAddr, + Bool chainingAllowed, + Bool addProfInc, + Addr64 max_ga ) +{ + Int i, j; + HReg hreg, hregHI; + ISelEnv* env; + UInt hwcaps_host = archinfo_host->hwcaps; + ARM64AMode *amCounter, *amFailAddr; + + /* sanity ... */ + vassert(arch_host == VexArchARM64); + + /* guard against unexpected space regressions */ + vassert(sizeof(ARM64Instr) <= 32); + + /* Make up an initial environment to use. */ + env = LibVEX_Alloc(sizeof(ISelEnv)); + env->vreg_ctr = 0; + + /* Set up output code array. */ + env->code = newHInstrArray(); + + /* Copy BB's type env. */ + env->type_env = bb->tyenv; + + /* Make up an IRTemp -> virtual HReg mapping. This doesn't + change as we go along. */ + env->n_vregmap = bb->tyenv->types_used; + env->vregmap = LibVEX_Alloc(env->n_vregmap * sizeof(HReg)); + env->vregmapHI = LibVEX_Alloc(env->n_vregmap * sizeof(HReg)); + + /* and finally ... */ + env->chainingAllowed = chainingAllowed; + env->hwcaps = hwcaps_host; + env->previous_rm = NULL; + env->max_ga = max_ga; + + /* For each IR temporary, allocate a suitably-kinded virtual + register. 
*/ + j = 0; + for (i = 0; i < env->n_vregmap; i++) { + hregHI = hreg = INVALID_HREG; + switch (bb->tyenv->types[i]) { + case Ity_I1: + case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64: + hreg = mkHReg(j++, HRcInt64, True); + break; + case Ity_I128: + hreg = mkHReg(j++, HRcInt64, True); + hregHI = mkHReg(j++, HRcInt64, True); + break; + case Ity_F32: // we'll use HRcFlt64 regs for F32 too + case Ity_F64: + hreg = mkHReg(j++, HRcFlt64, True); + break; + case Ity_V128: + hreg = mkHReg(j++, HRcVec128, True); + break; + default: + ppIRType(bb->tyenv->types[i]); + vpanic("iselBB(arm64): IRTemp type"); + } + env->vregmap[i] = hreg; + env->vregmapHI[i] = hregHI; + } + env->vreg_ctr = j; + + /* The very first instruction must be an event check. */ + amCounter = ARM64AMode_RI9(hregARM64_X21(), offs_Host_EvC_Counter); + amFailAddr = ARM64AMode_RI9(hregARM64_X21(), offs_Host_EvC_FailAddr); + addInstr(env, ARM64Instr_EvCheck(amCounter, amFailAddr)); + + /* Possibly a block counter increment (for profiling). At this + point we don't know the address of the counter, so just pretend + it is zero. It will have to be patched later, but before this + translation is used, by a call to LibVEX_patchProfCtr. */ + if (addProfInc) { + vassert(0); + //addInstr(env, ARM64Instr_ProfInc()); + } + + /* Ok, finally we can iterate over the statements. */ + for (i = 0; i < bb->stmts_used; i++) + iselStmt(env, bb->stmts[i]); + + iselNext(env, bb->next, bb->jumpkind, bb->offsIP); + + /* record the number of vregs we used. */ + env->code->n_vregs = env->vreg_ctr; + return env->code; +} + + +/*---------------------------------------------------------------*/ +/*--- end host_arm64_isel.c ---*/ +/*---------------------------------------------------------------*/ Index: priv/host_arm_defs.c =================================================================== --- priv/host_arm_defs.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/host_arm_defs.c (.../trunk) (revision 2863) @@ -790,6 +790,7 @@ case ARMneon_VTBL: return "vtbl"; case ARMneon_VRECPS: return "vrecps"; case ARMneon_VRSQRTS: return "vrecps"; + case ARMneon_INVALID: return "??invalid??"; /* ... */ default: vpanic("showARMNeonBinOp"); } @@ -3334,7 +3335,7 @@ //case Ijk_EmWarn: trcval = VEX_TRC_JMP_EMWARN; break; //case Ijk_MapFail: trcval = VEX_TRC_JMP_MAPFAIL; break; case Ijk_NoDecode: trcval = VEX_TRC_JMP_NODECODE; break; - case Ijk_TInval: trcval = VEX_TRC_JMP_TINVAL; break; + case Ijk_InvalICache: trcval = VEX_TRC_JMP_INVALICACHE; break; case Ijk_NoRedir: trcval = VEX_TRC_JMP_NOREDIR; break; //case Ijk_SigTRAP: trcval = VEX_TRC_JMP_SIGTRAP; break; //case Ijk_SigSEGV: trcval = VEX_TRC_JMP_SIGSEGV; break; Index: priv/host_arm_defs.h =================================================================== --- priv/host_arm_defs.h (.../tags/VEX_3_9_0) (revision 2863) +++ priv/host_arm_defs.h (.../trunk) (revision 2863) @@ -468,6 +468,7 @@ ARMneon_VQDMULL, ARMneon_VRECPS, ARMneon_VRSQRTS, + ARMneon_INVALID /* ... 
*/ } ARMNeonBinOp; Index: priv/host_arm_isel.c =================================================================== --- priv/host_arm_isel.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/host_arm_isel.c (.../trunk) (revision 2863) @@ -4254,26 +4254,11 @@ return res; } case Iop_Abs32Fx4: { - DECLARE_PATTERN(p_vabd_32fx4); - DEFINE_PATTERN(p_vabd_32fx4, - unop(Iop_Abs32Fx4, - binop(Iop_Sub32Fx4, - bind(0), - bind(1)))); - if (matchIRExpr(&mi, p_vabd_32fx4, e)) { - HReg res = newVRegV(env); - HReg argL = iselNeonExpr(env, mi.bindee[0]); - HReg argR = iselNeonExpr(env, mi.bindee[1]); - addInstr(env, ARMInstr_NBinary(ARMneon_VABDFP, - res, argL, argR, 0, True)); - return res; - } else { - HReg res = newVRegV(env); - HReg argL = iselNeonExpr(env, e->Iex.Unop.arg); - addInstr(env, ARMInstr_NUnary(ARMneon_VABSFP, - res, argL, 0, True)); - return res; - } + HReg res = newVRegV(env); + HReg argL = iselNeonExpr(env, e->Iex.Unop.arg); + addInstr(env, ARMInstr_NUnary(ARMneon_VABSFP, + res, argL, 0, True)); + return res; } case Iop_Rsqrte32Fx4: { HReg res = newVRegV(env); @@ -4457,15 +4442,6 @@ res, argL, argR, size, True)); return res; } - case Iop_Add32Fx4: { - HReg res = newVRegV(env); - HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); - HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); - UInt size = 0; - addInstr(env, ARMInstr_NBinary(ARMneon_VADDFP, - res, argL, argR, size, True)); - return res; - } case Iop_Recps32Fx4: { HReg res = newVRegV(env); HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); @@ -4632,15 +4608,6 @@ res, argL, argR, size, True)); return res; } - case Iop_Sub32Fx4: { - HReg res = newVRegV(env); - HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); - HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); - UInt size = 0; - addInstr(env, ARMInstr_NBinary(ARMneon_VSUBFP, - res, argL, argR, size, True)); - return res; - } case Iop_QSub8Ux16: case Iop_QSub16Ux8: case Iop_QSub32Ux4: @@ -5083,15 +5050,6 @@ res, argL, argR, size, True)); return res; } - case Iop_Mul32Fx4: { - HReg res = newVRegV(env); - HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); - HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); - UInt size = 0; - addInstr(env, ARMInstr_NBinary(ARMneon_VMULFP, - res, argL, argR, size, True)); - return res; - } case Iop_Mull8Ux8: case Iop_Mull16Ux4: case Iop_Mull32Ux2: { @@ -5352,6 +5310,23 @@ res, argL, argR, imm4, True)); return res; } + case Iop_Mul32Fx4: + case Iop_Sub32Fx4: + case Iop_Add32Fx4: { + HReg res = newVRegV(env); + HReg argL = iselNeonExpr(env, triop->arg2); + HReg argR = iselNeonExpr(env, triop->arg3); + UInt size = 0; + ARMNeonBinOp op = ARMneon_INVALID; + switch (triop->op) { + case Iop_Mul32Fx4: op = ARMneon_VMULFP; break; + case Iop_Sub32Fx4: op = ARMneon_VSUBFP; break; + case Iop_Add32Fx4: op = ARMneon_VADDFP; break; + default: vassert(0); + } + addInstr(env, ARMInstr_NBinary(op, res, argL, argR, size, True)); + return res; + } default: break; } @@ -6218,7 +6193,7 @@ case Ijk_NoDecode: case Ijk_NoRedir: case Ijk_Sys_syscall: - case Ijk_TInval: + case Ijk_InvalICache: case Ijk_Yield: { HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst)); @@ -6310,7 +6285,7 @@ case Ijk_NoDecode: case Ijk_NoRedir: case Ijk_Sys_syscall: - case Ijk_TInval: + case Ijk_InvalICache: case Ijk_Yield: { HReg r = iselIntExpr_R(env, next); Index: priv/host_generic_reg_alloc2.c =================================================================== --- priv/host_generic_reg_alloc2.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/host_generic_reg_alloc2.c (.../trunk) (revision 2863) @@ -399,9 
+399,9 @@ not at each insn processed. */ Bool do_sanity_check; - vassert(0 == (guest_sizeB % 32)); - vassert(0 == (LibVEX_N_SPILL_BYTES % 32)); - vassert(0 == (N_SPILL64S % 4)); + vassert(0 == (guest_sizeB % 16)); + vassert(0 == (LibVEX_N_SPILL_BYTES % 16)); + vassert(0 == (N_SPILL64S % 2)); /* The live range numbers are signed shorts, and so limiting the number of insns to 15000 comfortably guards against them Index: priv/host_generic_simd64.c =================================================================== --- priv/host_generic_simd64.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/host_generic_simd64.c (.../trunk) (revision 2863) @@ -1553,7 +1553,11 @@ /* ----------------------------------------------------- */ /* Signed and unsigned integer division, that behave like - the ARMv7 UDIV ansd SDIV instructions. */ + the ARMv7 UDIV ansd SDIV instructions. + + sdiv32 also behaves like 64-bit v8 SDIV on w-regs. + udiv32 also behaves like 64-bit v8 UDIV on w-regs. +*/ /* ----------------------------------------------------- */ UInt h_calc_udiv32_w_arm_semantics ( UInt x, UInt y ) @@ -1564,11 +1568,19 @@ return x / y; } +ULong h_calc_udiv64_w_arm_semantics ( ULong x, ULong y ) +{ + // Division by zero --> zero + if (UNLIKELY(y == 0)) return 0; + // C requires rounding towards zero, which is also what we need. + return x / y; +} + Int h_calc_sdiv32_w_arm_semantics ( Int x, Int y ) { // Division by zero --> zero if (UNLIKELY(y == 0)) return 0; - // The single case that produces an unpresentable result + // The single case that produces an unrepresentable result if (UNLIKELY( ((UInt)x) == ((UInt)0x80000000) && ((UInt)y) == ((UInt)0xFFFFFFFF) )) return (Int)(UInt)0x80000000; @@ -1579,7 +1591,22 @@ return x / y; } +Long h_calc_sdiv64_w_arm_semantics ( Long x, Long y ) +{ + // Division by zero --> zero + if (UNLIKELY(y == 0)) return 0; + // The single case that produces an unrepresentable result + if (UNLIKELY( ((ULong)x) == ((ULong)0x8000000000000000ULL ) + && ((ULong)y) == ((ULong)0xFFFFFFFFFFFFFFFFULL ) )) + return (Long)(ULong)0x8000000000000000ULL; + // Else return the result rounded towards zero. C89 says + // this is implementation defined (in the signed case), but gcc + // promises to round towards zero. Nevertheless, at startup, + // in main_main.c, do a check for that. + return x / y; +} + /*---------------------------------------------------------------*/ /*--- end host_generic_simd64.c ---*/ /*---------------------------------------------------------------*/ Index: priv/host_generic_simd64.h =================================================================== --- priv/host_generic_simd64.h (.../tags/VEX_3_9_0) (revision 2863) +++ priv/host_generic_simd64.h (.../trunk) (revision 2863) @@ -166,9 +166,12 @@ // Signed and unsigned integer division, that behave like // the ARMv7 UDIV and SDIV instructions. 
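Illustrative sketch, not part of the patch itself: the new 64-bit helpers h_calc_udiv64_w_arm_semantics / h_calc_sdiv64_w_arm_semantics above follow the ARMv8 UDIV/SDIV conventions -- division by zero yields zero, and the single signed case that overflows, INT64_MIN / -1, returns INT64_MIN rather than trapping. A minimal standalone C restatement of those edge cases (the names udiv64_arm/sdiv64_arm are invented for this example) is:

#include <assert.h>

typedef unsigned long long ULong;
typedef long long Long;

/* Same contract as h_calc_udiv64_w_arm_semantics above. */
static ULong udiv64_arm ( ULong x, ULong y )
{
   if (y == 0) return 0;            /* UDIV: divide by zero --> 0 */
   return x / y;                    /* else C division already rounds towards zero */
}

/* Same contract as h_calc_sdiv64_w_arm_semantics above. */
static Long sdiv64_arm ( Long x, Long y )
{
   if (y == 0) return 0;            /* SDIV: divide by zero --> 0 */
   if ((ULong)x == 0x8000000000000000ULL && y == -1LL)
      return x;                     /* the one unrepresentable case: INT64_MIN */
   return x / y;
}

int main ( void )
{
   assert(udiv64_arm(12345ULL, 0ULL) == 0);
   assert(sdiv64_arm(-7LL, 0LL) == 0);
   assert(sdiv64_arm((Long)0x8000000000000000ULL, -1LL)
          == (Long)0x8000000000000000ULL);
   assert(sdiv64_arm(-7LL, 2LL) == -3LL);   /* rounds towards zero, like SDIV */
   return 0;
}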
-extern UInt h_calc_udiv32_w_arm_semantics ( UInt, UInt ); -extern Int h_calc_sdiv32_w_arm_semantics ( Int, Int ); +extern UInt h_calc_udiv32_w_arm_semantics ( UInt, UInt ); +extern ULong h_calc_udiv64_w_arm_semantics ( ULong, ULong ); +extern Int h_calc_sdiv32_w_arm_semantics ( Int, Int ); +extern Long h_calc_sdiv64_w_arm_semantics ( Long, Long ); + #endif /* ndef __VEX_HOST_GENERIC_SIMD64_H */ /*---------------------------------------------------------------*/ Index: priv/host_mips_defs.c =================================================================== --- priv/host_mips_defs.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/host_mips_defs.c (.../trunk) (revision 2863) @@ -37,7 +37,7 @@ #include "host_mips_defs.h" /* guest_COND offset. */ -#define COND_OFFSET(__mode64) (__mode64 ? 612 : 316) +#define COND_OFFSET(__mode64) (__mode64 ? 612 : 448) /* Register number for guest state pointer in host code. */ #define GuestSP 23 @@ -81,7 +81,7 @@ /* But specific for real regs. */ vassert(hregClass(reg) == HRcInt32 || hregClass(reg) == HRcInt64 || - hregClass(reg) == HRcFlt32 || hregClass(reg) == HRcFlt64); + hregClass(reg) == HRcFlt32 || hregClass(reg) == HRcFlt64); /* But specific for real regs. */ switch (hregClass(reg)) { @@ -91,7 +91,6 @@ vex_printf("%s", ireg32_names[r]); return; case HRcInt64: - vassert(mode64); r = hregNumber (reg); vassert (r >= 0 && r < 32); vex_printf ("%s", ireg32_names[r]); @@ -773,6 +772,12 @@ case Mfp_CVTWD: ret = "cvt.w.d"; break; + case Mfp_CVTLD: + ret = "cvt.l.d"; + break; + case Mfp_CVTLS: + ret = "cvt.l.s"; + break; case Mfp_TRUWD: ret = "trunc.w.d"; break; @@ -797,10 +802,20 @@ case Mfp_CEILLD: ret = "ceil.l.d"; break; - case Mfp_CMP: - ret = "C.cond.d"; + case Mfp_CMP_UN: + ret = "c.un.d"; break; + case Mfp_CMP_EQ: + ret = "c.eq.d"; + break; + case Mfp_CMP_LT: + ret = "c.lt.d"; + break; + case Mfp_CMP_NGT: + ret = "c.ngt.d"; + break; default: + vex_printf("Unknown op: %d", op); vpanic("showMIPSFpOp"); break; } @@ -1497,8 +1512,7 @@ } -MIPSInstr *MIPSInstr_FpCompare(MIPSFpOp op, HReg dst, HReg srcL, HReg srcR, - UChar cond1) +MIPSInstr *MIPSInstr_FpCompare(MIPSFpOp op, HReg dst, HReg srcL, HReg srcR) { MIPSInstr *i = LibVEX_Alloc(sizeof(MIPSInstr)); i->tag = Min_FpCompare; @@ -1506,7 +1520,6 @@ i->Min.FpCompare.dst = dst; i->Min.FpCompare.srcL = srcL; i->Min.FpCompare.srcR = srcR; - i->Min.FpCompare.cond1 = cond1; return i; } @@ -1811,7 +1824,6 @@ ppHRegMIPS(i->Min.FpCompare.srcL, mode64); vex_printf(","); ppHRegMIPS(i->Min.FpCompare.srcR, mode64); - vex_printf(" cond: %c", i->Min.FpCompare.cond1); return; case Min_FpMulAcc: vex_printf("%s ", showMIPSFpOp(i->Min.FpMulAcc.op)); @@ -1864,7 +1876,7 @@ return; } case Min_FpGpMove: { - vex_printf("%s", showMIPSFpGpMoveOp(i->Min.FpGpMove.op)); + vex_printf("%s ", showMIPSFpGpMoveOp(i->Min.FpGpMove.op)); ppHRegMIPS(i->Min.FpGpMove.dst, mode64); vex_printf(", "); ppHRegMIPS(i->Min.FpGpMove.src, mode64); @@ -2101,7 +2113,7 @@ addHRegUse(u, HRmRead, i->Min.FpGpMove.src); return; case Min_MoveCond: - addHRegUse(u, HRmWrite, i->Min.MoveCond.dst); + addHRegUse(u, HRmModify, i->Min.MoveCond.dst); addHRegUse(u, HRmRead, i->Min.MoveCond.src); addHRegUse(u, HRmRead, i->Min.MoveCond.cond); return; @@ -2380,7 +2392,6 @@ static UChar fregNo(HReg r, Bool mode64) { UInt n; - vassert(hregClass(r) == (mode64 ? 
HRcFlt64 : HRcFlt32)); vassert(!hregIsVirtual(r)); n = hregNumber(r); vassert(n <= 31); @@ -2390,7 +2401,6 @@ static UChar dregNo(HReg r) { UInt n; - vassert(hregClass(r) == HRcFlt64); vassert(!hregIsVirtual(r)); n = hregNumber(r); vassert(n <= 31); @@ -3455,8 +3465,9 @@ case Ijk_EmFail: trcval = VEX_TRC_JMP_EMFAIL; break; /* case Ijk_MapFail: trcval = VEX_TRC_JMP_MAPFAIL; break; */ case Ijk_NoDecode: trcval = VEX_TRC_JMP_NODECODE; break; - case Ijk_TInval: trcval = VEX_TRC_JMP_TINVAL; break; + case Ijk_InvalICache: trcval = VEX_TRC_JMP_INVALICACHE; break; case Ijk_NoRedir: trcval = VEX_TRC_JMP_NOREDIR; break; + case Ijk_SigILL: trcval = VEX_TRC_JMP_SIGILL; break; case Ijk_SigTRAP: trcval = VEX_TRC_JMP_SIGTRAP; break; /* case Ijk_SigSEGV: trcval = VEX_TRC_JMP_SIGSEGV; break; */ case Ijk_SigBUS: trcval = VEX_TRC_JMP_SIGBUS; break; @@ -3886,8 +3897,13 @@ p = mkFormR(p, 0x11, 0x15, 0, fr_src, fr_dst, 0x20); break; case Mfp_CVTLS: - fr_dst = fregNo(i->Min.FpConvert.dst, mode64); - fr_src = dregNo(i->Min.FpConvert.src); + if (mode64) { + fr_dst = fregNo(i->Min.FpConvert.dst, mode64); + fr_src = dregNo(i->Min.FpConvert.src); + } else { + fr_dst = dregNo(i->Min.FpConvert.dst); + fr_src = fregNo(i->Min.FpConvert.src, mode64); + } p = mkFormR(p, 0x11, 0x10, 0, fr_src, fr_dst, 0x25); break; case Mfp_CVTLD: @@ -3973,19 +3989,35 @@ } case Min_FpCompare: { - UInt r_dst = iregNo(i->Min.FpCompare.dst, mode64); + UInt r_dst = iregNo(i->Min.FpCompare.dst, mode64); UInt fr_srcL = dregNo(i->Min.FpCompare.srcL); UInt fr_srcR = dregNo(i->Min.FpCompare.srcR); + UInt op; switch (i->Min.FpConvert.op) { - case Mfp_CMP: - p = mkFormR(p, 0x11, 0x11, fr_srcL, fr_srcR, 0, - (i->Min.FpCompare.cond1 + 48)); - p = mkFormR(p, 0x11, 0x2, r_dst, 31, 0, 0); + case Mfp_CMP_UN: + op = 1; break; + case Mfp_CMP_EQ: + op = 2; + break; + case Mfp_CMP_LT: + op = 12; + break; + case Mfp_CMP_NGT: + op = 15; + break; default: goto bad; } + /* c.cond.d fr_srcL, fr_srcR + cfc1 r_dst, $31 + srl r_dst, r_dst, 23 + andi r_dst, r_dst, 1 */ + p = mkFormR(p, 0x11, 0x11, fr_srcL, fr_srcR, 0, op + 48); + p = mkFormR(p, 0x11, 0x2, r_dst, 31, 0, 0); + p = mkFormS(p, 0, r_dst, 0, r_dst, 23, 2); + p = mkFormI(p, 12, r_dst, r_dst, 1); goto done; } Index: priv/host_mips_defs.h =================================================================== --- priv/host_mips_defs.h (.../tags/VEX_3_9_0) (revision 2863) +++ priv/host_mips_defs.h (.../trunk) (revision 2863) @@ -366,9 +366,12 @@ Mfp_CVTSD, Mfp_CVTSW, Mfp_CVTWD, Mfp_CVTWS, Mfp_CVTDL, Mfp_CVTSL, Mfp_CVTLS, Mfp_CVTLD, Mfp_TRULS, Mfp_TRULD, Mfp_TRUWS, Mfp_TRUWD, Mfp_FLOORWS, Mfp_FLOORWD, Mfp_ROUNDWS, Mfp_ROUNDWD, - Mfp_CVTDW, Mfp_CMP, Mfp_CEILWS, Mfp_CEILWD, Mfp_CEILLS, Mfp_CEILLD, - Mfp_CVTDS, Mfp_ROUNDLD, Mfp_FLOORLD + Mfp_CVTDW, Mfp_CEILWS, Mfp_CEILWD, Mfp_CEILLS, Mfp_CEILLD, Mfp_CVTDS, + Mfp_ROUNDLD, Mfp_FLOORLD, + /* FP compare */ + Mfp_CMP_UN, Mfp_CMP_EQ, Mfp_CMP_LT, Mfp_CMP_NGT + } MIPSFpOp; extern const HChar *showMIPSFpOp(MIPSFpOp); @@ -664,7 +667,7 @@ HReg src2, HReg src3 ); extern MIPSInstr *MIPSInstr_FpConvert(MIPSFpOp op, HReg dst, HReg src); extern MIPSInstr *MIPSInstr_FpCompare(MIPSFpOp op, HReg dst, HReg srcL, - HReg srcR, UChar cond1); + HReg srcR); extern MIPSInstr *MIPSInstr_FpMulAcc(MIPSFpOp op, HReg dst, HReg srcML, HReg srcMR, HReg srcAcc); extern MIPSInstr *MIPSInstr_FpLdSt(Bool isLoad, UChar sz, HReg, MIPSAMode *); Index: priv/host_mips_isel.c =================================================================== --- priv/host_mips_isel.c (.../tags/VEX_3_9_0) (revision 2863) +++ 
priv/host_mips_isel.c (.../trunk) (revision 2863) @@ -47,12 +47,14 @@ ZERO0 Reserved GPR12:22 Allocateable 23 GuestStatePointer - 23 Allocateable SP StackFramePointer RA LinkRegister */ static Bool mode64 = False; +/* Host CPU has FPU and 32 dbl. prec. FP registers. */ +static Bool fp_mode64 = False; + /* GPR register class for mips32/64 */ #define HRcGPR(__mode64) (__mode64 ? HRcInt64 : HRcInt32) @@ -60,7 +62,7 @@ #define HRcFPR(__mode64) (__mode64 ? HRcFlt64 : HRcFlt32) /* guest_COND offset */ -#define COND_OFFSET(__mode64) (__mode64 ? 612 : 316) +#define COND_OFFSET(__mode64) (__mode64 ? 612 : 448) /*---------------------------------------------------------*/ /*--- ISelEnv ---*/ @@ -117,6 +119,7 @@ UInt hwcaps; Bool mode64; + Bool fp_mode64; Bool chainingAllowed; Addr64 max_ga; @@ -180,7 +183,7 @@ static HReg newVRegF(ISelEnv * env) { - HReg reg = mkHReg(env->vreg_ctr, HRcFPR(env->mode64), + HReg reg = mkHReg(env->vreg_ctr, HRcFPR(env->fp_mode64), True /*virtual reg */ ); env->vreg_ctr++; return reg; @@ -230,12 +233,13 @@ static MIPSRH *iselWordExpr_RH_wrk(ISelEnv * env, Bool syned, IRExpr * e); static MIPSRH *iselWordExpr_RH(ISelEnv * env, Bool syned, IRExpr * e); -/* Compute an I8 into a reg-or-5-bit-unsigned-immediate, the latter being an immediate in - the range 1 .. 31 inclusive. Used for doing shift amounts. */ +/* Compute an I8 into a reg-or-5-bit-unsigned-immediate, the latter being an + immediate in the range 1 .. 31 inclusive. Used for doing shift amounts. */ static MIPSRH *iselWordExpr_RH5u_wrk(ISelEnv * env, IRExpr * e); static MIPSRH *iselWordExpr_RH5u(ISelEnv * env, IRExpr * e); -/* In 64-bit mode ONLY */ +/* Compute an I8 into a reg-or-6-bit-unsigned-immediate, the latter being an + immediate in the range 1 .. 63 inclusive. Used for doing shift amounts. */ static MIPSRH *iselWordExpr_RH6u_wrk(ISelEnv * env, IRExpr * e); static MIPSRH *iselWordExpr_RH6u(ISelEnv * env, IRExpr * e); @@ -1119,29 +1123,24 @@ /* Create in dst, the IRCmpF64Result encoded result. 
*/ /* chech for EQ */ - addInstr(env, MIPSInstr_FpCompare(Mfp_CMP, tmp, r_srcL, r_srcR, - toUChar(2))); - addInstr(env, MIPSInstr_Shft(Mshft_SRA, True, r_ccMIPS, tmp, - MIPSRH_Imm(False, 22))); + addInstr(env, MIPSInstr_FpCompare(Mfp_CMP_EQ, tmp, r_srcL, r_srcR)); + addInstr(env, MIPSInstr_Shft(Mshft_SLL, True, r_ccMIPS, tmp, + MIPSRH_Imm(False, 1))); /* chech for UN */ - addInstr(env, MIPSInstr_FpCompare(Mfp_CMP, tmp, r_srcL, r_srcR, - toUChar(1))); - addInstr(env, MIPSInstr_Shft(Mshft_SRA, True, tmp, tmp, - MIPSRH_Imm(False, 23))); + addInstr(env, MIPSInstr_FpCompare(Mfp_CMP_UN, tmp, r_srcL, r_srcR)); addInstr(env, MIPSInstr_Alu(Malu_OR, r_ccMIPS, r_ccMIPS, MIPSRH_Reg(tmp))); /* chech for LT */ - addInstr(env, MIPSInstr_FpCompare(Mfp_CMP, tmp, r_srcL, r_srcR, - toUChar(12))); - addInstr(env, MIPSInstr_Shft(Mshft_SRA, True, tmp, - tmp, MIPSRH_Imm(False, 21))); + addInstr(env, MIPSInstr_FpCompare(Mfp_CMP_LT, tmp, r_srcL, r_srcR)); + addInstr(env, MIPSInstr_Shft(Mshft_SLL, True, tmp, + tmp, MIPSRH_Imm(False, 2))); addInstr(env, MIPSInstr_Alu(Malu_OR, r_ccMIPS, r_ccMIPS, MIPSRH_Reg(tmp))); /* chech for GT */ - addInstr(env, MIPSInstr_FpCompare(Mfp_CMP, tmp, r_srcL, r_srcR, - toUChar(15))); - addInstr(env, MIPSInstr_Shft(Mshft_SRA, True, tmp, tmp, - MIPSRH_Imm(False, 20))); + addInstr(env, MIPSInstr_FpCompare(Mfp_CMP_NGT, + tmp, r_srcL, r_srcR)); + addInstr(env, MIPSInstr_Shft(Mshft_SLL, True, tmp, tmp, + MIPSRH_Imm(False, 3))); addInstr(env, MIPSInstr_Alu(Malu_NOR, tmp, tmp, MIPSRH_Reg(tmp))); addInstr(env, MIPSInstr_Alu(Malu_AND, tmp, tmp, @@ -1789,34 +1788,14 @@ if ((ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32 || ((ty == Ity_I64))) && typeOfIRExpr(env->type_env, e->Iex.ITE.cond) == Ity_I1) { + HReg r_dst = iselWordExpr_R(env, e->Iex.ITE.iffalse); + HReg r1 = iselWordExpr_R(env, e->Iex.ITE.iftrue); + HReg r_cond = iselWordExpr_R(env, e->Iex.ITE.cond); /* - * r_dst = cond && r1 - * cond = not(cond) - * tmp = cond && r0 - * r_dst = tmp + r_dst + * r_dst = r0 + * movn r_dst, r1, r_cond */ - HReg r0 = iselWordExpr_R(env, e->Iex.ITE.iffalse); - HReg r1 = iselWordExpr_R(env, e->Iex.ITE.iftrue); - HReg r_cond_1 = iselWordExpr_R(env, e->Iex.ITE.cond); - HReg r_cond = newVRegI(env); - HReg mask = newVRegI(env); - HReg r_dst = newVRegI(env); - HReg r_tmp = newVRegI(env); - HReg r_tmp1 = newVRegI(env); - HReg r_cond_neg = newVRegI(env); - /* r_cond = 0 - r_cond_1 */ - addInstr(env, MIPSInstr_LI(mask, 0x0)); - addInstr(env, MIPSInstr_Alu(Malu_SUB, r_cond, - mask, MIPSRH_Reg(r_cond_1))); - - addInstr(env, MIPSInstr_Alu(Malu_AND, r_tmp, r_cond, MIPSRH_Reg(r1))); - addInstr(env, MIPSInstr_Alu(Malu_NOR, r_cond_neg, r_cond, - MIPSRH_Reg(r_cond))); - addInstr(env, MIPSInstr_Alu(Malu_AND, r_tmp1, r_cond_neg, - MIPSRH_Reg(r0))); - addInstr(env, MIPSInstr_Alu(Malu_ADD, r_dst, r_tmp, - MIPSRH_Reg(r_tmp1))); - + addInstr(env, MIPSInstr_MoveCond(MMoveCond_movn, r_dst, r1, r_cond)); return r_dst; } break; @@ -2009,7 +1988,6 @@ static MIPSRH *iselWordExpr_RH6u ( ISelEnv * env, IRExpr * e ) { MIPSRH *ri; - vassert(env->mode64); ri = iselWordExpr_RH6u_wrk(env, e); /* sanity checks ... */ switch (ri->tag) { @@ -2436,7 +2414,8 @@ /* Check if borrow is nedded. 
*/ addInstr(env, MIPSInstr_Cmp(False, size32, borrow, xLo, yLo, cc)); - addInstr(env, MIPSInstr_Alu(Malu_ADD, yHi, yHi, MIPSRH_Reg(borrow))); + addInstr(env, MIPSInstr_Alu(Malu_ADD, yHi, yHi, + MIPSRH_Reg(borrow))); addInstr(env, MIPSInstr_Alu(Malu_SUB, tHi, xHi, MIPSRH_Reg(yHi))); *rHi = tHi; @@ -2505,177 +2484,309 @@ } case Iop_Shr64: { - HReg xLo, xHi; - HReg tLo = newVRegI(env); - HReg tLo1 = newVRegI(env); - HReg tHi = newVRegI(env); - HReg tmp = newVRegI(env); - HReg tmp2 = newVRegI(env); - HReg tmp3 = newVRegI(env); - HReg mask = newVRegI(env); - HReg tMask = newVRegI(env); - HReg discard = newVRegI(env); - HReg discard1 = newVRegI(env); +#if defined (_MIPSEL) + /* 64-bit logical shift right based on what gcc generates: + : + nor v0, zero, a2 + sll a3, a1, 0x1 + sllv a3, a3, v0 + srlv v0, a0, a2 + srlv v1, a1, a2 + andi a0, a2, 0x20 + or v0, a3, v0 + movn v0, v1, a0 + jr ra + movn v1, zero, a0 + */ + HReg a0, a1; + HReg a0tmp = newVRegI(env); + HReg a2 = newVRegI(env); + HReg a3 = newVRegI(env); + HReg v0 = newVRegI(env); + HReg v1 = newVRegI(env); + HReg zero = newVRegI(env); + MIPSRH *sa = NULL; - /* We assume any literal values are on the second operand. */ - iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1); - MIPSRH *ri_srcR = NULL; - MIPSRH *ri_srcR_sub = NULL; + iselInt64Expr(&a1, &a0, env, e->Iex.Binop.arg1); + sa = iselWordExpr_RH6u(env, e->Iex.Binop.arg2); - ri_srcR = iselWordExpr_RH5u(env, e->Iex.Binop.arg2); - ri_srcR_sub = iselWordExpr_RH(env, True /*signed */ , - e->Iex.Binop.arg2); + if (sa->tag == Mrh_Imm) { + addInstr(env, MIPSInstr_LI(a2, sa->Mrh.Imm.imm16)); + } + else { + addInstr(env, MIPSInstr_Alu(Malu_AND, a2, sa->Mrh.Reg.reg, + MIPSRH_Imm(False, 0x3f))); + } - /* Steps: - 1. Take shift-amount (arg2) least significant bits from upper - half of 64bit input value (arg1) - 2. Shift upper half - 3. Shift lower half - 4. Put discarded bits (those from step 1) to most significant - bit positions of lower half */ + addInstr(env, MIPSInstr_LI(zero, 0x00000000)); + /* nor v0, zero, a2 */ + addInstr(env, MIPSInstr_Alu(Malu_NOR, v0, zero, MIPSRH_Reg(a2))); + /* sll a3, a1, 0x1 */ + addInstr(env, MIPSInstr_Shft(Mshft_SLL, True /* 32bit shift */, + a3, a1, MIPSRH_Imm(False, 0x1))); + /* sllv a3, a3, v0 */ + addInstr(env, MIPSInstr_Shft(Mshft_SLL, True /* 32bit shift */, + a3, a3, MIPSRH_Reg(v0))); + /* srlv v0, a0, a2 */ + addInstr(env, MIPSInstr_Shft(Mshft_SRL, True /* 32bit shift */, + v0, a0, MIPSRH_Reg(a2))); + /* srlv v1, a1, a2 */ + addInstr(env, MIPSInstr_Shft(Mshft_SRL, True /* 32bit shift */, + v1, a1, MIPSRH_Reg(a2))); + /* andi a0, a2, 0x20 */ + addInstr(env, MIPSInstr_Alu(Malu_AND, a0tmp, a2, + MIPSRH_Imm(False, 0x20))); + /* or v0, a3, v0 */ + addInstr(env, MIPSInstr_Alu(Malu_OR, v0, a3, MIPSRH_Reg(v0))); - /* Mask for extraction of bits that will be discarded. */ - addInstr(env, MIPSInstr_LI(tmp, 0xffffffff)); - addInstr(env, MIPSInstr_Shft(Mshft_SLL, True /*32bit shift */, - tMask, tmp, ri_srcR)); - addInstr(env, MIPSInstr_Alu(Malu_NOR, mask, - tMask, MIPSRH_Reg(tMask))); + /* movn v0, v1, a0 */ + addInstr(env, MIPSInstr_MoveCond(MMoveCond_movn, v0, v1, a0tmp)); + /* movn v1, zero, a0 */ + addInstr(env, MIPSInstr_MoveCond(MMoveCond_movn, v1, zero, a0tmp)); - /* Extraction of bits that will be discarded. */ - addInstr(env, MIPSInstr_Alu(Malu_AND, discard, xHi, - MIPSRH_Reg(mask))); - /* Position discarded bits to most significant bit positions. 
*/ - addInstr(env, MIPSInstr_LI(tmp3, 32)); - addInstr(env, MIPSInstr_Alu(Malu_SUB, tmp2, - tmp3, ri_srcR_sub)); - addInstr(env, MIPSInstr_Shft(Mshft_SLL, True /*32bit shift */, - discard1, discard, MIPSRH_Reg(tmp2))); + *rHi = v1; + *rLo = v0; + return; +#elif defined (_MIPSEB) + /* 64-bit logical shift right based on what gcc generates: + : + nor v0, zero, a2 + sll a3, a0, 0x1 + sllv a3, a3, v0 + srlv v1, a1, a2 + andi v0, a2, 0x20 + or v1, a3, v1 + srlv a2, a0, a2 + movn v1, a2, v0 + movn a2, zero, v0 + jr ra + move v0, a2 + */ + HReg a0, a1; + HReg a2 = newVRegI(env); + HReg a2tmp = newVRegI(env); + HReg a3 = newVRegI(env); + HReg v0 = newVRegI(env); + HReg v1 = newVRegI(env); + HReg zero = newVRegI(env); + MIPSRH *sa = NULL; - addInstr(env, MIPSInstr_Shft(Mshft_SRL, True /*32bit shift */, - tHi, xHi, ri_srcR)); - addInstr(env, MIPSInstr_Shft(Mshft_SRL, True /*32bit shift */, - tLo1, xLo, ri_srcR)); + iselInt64Expr(&a0, &a1, env, e->Iex.Binop.arg1); + sa = iselWordExpr_RH6u(env, e->Iex.Binop.arg2); - addInstr(env, MIPSInstr_Alu(Malu_OR, tLo, - tLo1, MIPSRH_Reg(discard1))); - *rHi = tHi; - *rLo = tLo; + if (sa->tag == Mrh_Imm) { + addInstr(env, MIPSInstr_LI(a2, sa->Mrh.Imm.imm16)); + } + else { + addInstr(env, MIPSInstr_Alu(Malu_AND, a2, sa->Mrh.Reg.reg, + MIPSRH_Imm(False, 0x3f))); + } + + addInstr(env, MIPSInstr_LI(zero, 0x00000000)); + /* nor v0, zero, a2 */ + addInstr(env, MIPSInstr_Alu(Malu_NOR, v0, zero, MIPSRH_Reg(a2))); + /* sll a3, a0, 0x1 */ + addInstr(env, MIPSInstr_Shft(Mshft_SLL, True /* 32bit shift */, + a3, a0, MIPSRH_Imm(False, 0x1))); + /* sllv a3, a3, v0 */ + addInstr(env, MIPSInstr_Shft(Mshft_SLL, True /* 32bit shift */, + a3, a3, MIPSRH_Reg(v0))); + /* srlv v1, a1, a2 */ + addInstr(env, MIPSInstr_Shft(Mshft_SRL, True /* 32bit shift */, + v1, a1, MIPSRH_Reg(a2))); + /* andi v0, a2, 0x20 */ + addInstr(env, MIPSInstr_Alu(Malu_AND, v0, a2, + MIPSRH_Imm(False, 0x20))); + /* or v1, a3, v1 */ + addInstr(env, MIPSInstr_Alu(Malu_OR, v1, a3, MIPSRH_Reg(v1))); + /* srlv a2, a0, a2 */ + addInstr(env, MIPSInstr_Shft(Mshft_SRL, True /* 32bit shift */, + a2tmp, a0, MIPSRH_Reg(a2))); + + /* movn v1, a2, v0 */ + addInstr(env, MIPSInstr_MoveCond(MMoveCond_movn, v1, a2tmp, v0)); + /* movn a2, zero, v0 */ + addInstr(env, MIPSInstr_MoveCond(MMoveCond_movn, a2tmp, zero, v0)); + /* move v0, a2 */ + addInstr(env, mk_iMOVds_RR(v0, a2tmp)); + + *rHi = v0; + *rLo = v1; return; +#endif } + case Iop_Shl64: { - HReg xLo, xHi; - HReg tLo = newVRegI(env); - HReg tHi1 = newVRegI(env); - HReg tHi = newVRegI(env); - HReg tmp = newVRegI(env); - HReg tmp2 = newVRegI(env); - HReg tmp3 = newVRegI(env); - HReg mask = newVRegI(env); - HReg tMask = newVRegI(env); - HReg discard = newVRegI(env); - HReg discard1 = newVRegI(env); + /* 64-bit shift left based on what gcc generates: + : + nor v0,zero,a2 + srl a3,a0,0x1 + srlv a3,a3,v0 + sllv v1,a1,a2 + andi v0,a2,0x20 + or v1,a3,v1 + sllv a2,a0,a2 + movn v1,a2,v0 + movn a2,zero,v0 + jr ra + move v0,a2 + */ + HReg a0, a1; + HReg a2 = newVRegI(env); + HReg a3 = newVRegI(env); + HReg v0 = newVRegI(env); + HReg v1 = newVRegI(env); + HReg zero = newVRegI(env); + MIPSRH *sa = NULL; - /* We assume any literal values are on the second operand. 
*/ - iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1); - MIPSRH *ri_srcR = NULL; - MIPSRH *ri_srcR_sub = NULL; + iselInt64Expr(&a1, &a0, env, e->Iex.Binop.arg1); + sa = iselWordExpr_RH6u(env, e->Iex.Binop.arg2); - ri_srcR = iselWordExpr_RH5u(env, e->Iex.Binop.arg2); - ri_srcR_sub = iselWordExpr_RH(env, True /*signed */ , - e->Iex.Binop.arg2); + if (sa->tag == Mrh_Imm) { + addInstr(env, MIPSInstr_LI(a2, sa->Mrh.Imm.imm16)); + } + else { + addInstr(env, MIPSInstr_Alu(Malu_AND, a2, sa->Mrh.Reg.reg, + MIPSRH_Imm(False, 0x3f))); + } - /* Steps: - 1. Take shift-amount (arg2) most significant bits from lower - half of 64bit input value (arg1) - 2. Shift lower half - 3. Shift upper half - 4. Put discarded bits (those from step 1) to least significant - bit positions of upper half */ + addInstr(env, MIPSInstr_LI(zero, 0x00000000)); + /* nor v0, zero, a2 */ + addInstr(env, MIPSInstr_Alu(Malu_NOR, v0, zero, MIPSRH_Reg(a2))); + /* srl a3, a0, 0x1 */ + addInstr(env, MIPSInstr_Shft(Mshft_SRL, True /* 32bit shift */, + a3, a0, MIPSRH_Imm(False, 0x1))); + /* srlv a3, a3, v0 */ + addInstr(env, MIPSInstr_Shft(Mshft_SRL, True /* 32bit shift */, + a3, a3, MIPSRH_Reg(v0))); + /* sllv v1, a1, a2 */ + addInstr(env, MIPSInstr_Shft(Mshft_SLL, True /* 32bit shift */, + v1, a1, MIPSRH_Reg(a2))); + /* andi v0, a2, 0x20 */ + addInstr(env, MIPSInstr_Alu(Malu_AND, v0, a2, + MIPSRH_Imm(False, 0x20))); + /* or v1, a3, v1 */ + addInstr(env, MIPSInstr_Alu(Malu_OR, v1, a3, MIPSRH_Reg(v1))); + /* sllv a2, a0, a2 */ + addInstr(env, MIPSInstr_Shft(Mshft_SLL, True /* 32bit shift */, + a2, a0, MIPSRH_Reg(a2))); - /* Mask for extraction of bits that will be discarded. */ - addInstr(env, MIPSInstr_LI(tmp, 0xffffffff)); - addInstr(env, MIPSInstr_Shft(Mshft_SRL, True /*32bit shift */, - tMask, tmp, ri_srcR)); - addInstr(env, MIPSInstr_Alu(Malu_NOR, mask, - tMask, MIPSRH_Reg(tMask))); + /* movn v1, a2, v0 */ + addInstr(env, MIPSInstr_MoveCond(MMoveCond_movn, v1, a2, v0)); + /* movn a2, zero, v0 */ + addInstr(env, MIPSInstr_MoveCond(MMoveCond_movn, a2, zero, v0)); + addInstr(env, mk_iMOVds_RR(v0, a2)); - /* Extraction of bits that will be discarded. */ - addInstr(env, MIPSInstr_Alu(Malu_AND, discard, xLo, - MIPSRH_Reg(mask))); - /* Position discarded bits to least significant bit positions. 
*/ - addInstr(env, MIPSInstr_LI(tmp3, 32)); - addInstr(env, MIPSInstr_Alu(Malu_SUB, tmp2, - tmp3, ri_srcR_sub)); - addInstr(env, MIPSInstr_Shft(Mshft_SRL, True /*32bit shift */, - discard1, discard, MIPSRH_Reg(tmp2))); + *rHi = v1; + *rLo = v0; + return; + } - addInstr(env, MIPSInstr_Shft(Mshft_SLL, True /*32bit shift */, - tHi1, xHi, ri_srcR)); - addInstr(env, MIPSInstr_Shft(Mshft_SLL, True /*32bit shift */, - tLo, xLo, ri_srcR)); + case Iop_Sar64: { + /* 64-bit arithmetic shift right based on what gcc generates: + : + nor v0, zero, a2 + sll a3, a1, 0x1 + sllv a3, a3, v0 + srlv v0, a0, a2 + srav v1, a1, a2 + andi a0, a2, 0x20 + sra a1, a1, 0x1f + or v0, a3, v0 + movn v0, v1, a0 + jr ra + movn v1, a1, a0 + */ + HReg a0, a1; + HReg a0tmp = newVRegI(env); + HReg a1tmp = newVRegI(env); + HReg a2 = newVRegI(env); + HReg a3 = newVRegI(env); + HReg v0 = newVRegI(env); + HReg v1 = newVRegI(env); + HReg zero = newVRegI(env); + MIPSRH *sa = NULL; - addInstr(env, MIPSInstr_Alu(Malu_OR, tHi, - tHi1, MIPSRH_Reg(discard1))); - *rHi = tHi; - *rLo = tLo; + iselInt64Expr(&a1, &a0, env, e->Iex.Binop.arg1); + sa = iselWordExpr_RH6u(env, e->Iex.Binop.arg2); + + if (sa->tag == Mrh_Imm) { + addInstr(env, MIPSInstr_LI(a2, sa->Mrh.Imm.imm16)); + } + else { + addInstr(env, MIPSInstr_Alu(Malu_AND, a2, sa->Mrh.Reg.reg, + MIPSRH_Imm(False, 0x3f))); + } + + addInstr(env, MIPSInstr_LI(zero, 0x00000000)); + /* nor v0, zero, a2 */ + addInstr(env, MIPSInstr_Alu(Malu_NOR, v0, zero, MIPSRH_Reg(a2))); + /* sll a3, a1, 0x1 */ + addInstr(env, MIPSInstr_Shft(Mshft_SLL, True /* 32bit shift */, + a3, a1, MIPSRH_Imm(False, 0x1))); + /* sllv a3, a3, v0 */ + addInstr(env, MIPSInstr_Shft(Mshft_SLL, True /* 32bit shift */, + a3, a3, MIPSRH_Reg(v0))); + /* srlv v0, a0, a2 */ + addInstr(env, MIPSInstr_Shft(Mshft_SRL, True /* 32bit shift */, + v0, a0, MIPSRH_Reg(a2))); + /* srav v1, a1, a2 */ + addInstr(env, MIPSInstr_Shft(Mshft_SRA, True /* 32bit shift */, + v1, a1, MIPSRH_Reg(a2))); + /* andi a0, a2, 0x20 */ + addInstr(env, MIPSInstr_Alu(Malu_AND, a0tmp, a2, + MIPSRH_Imm(False, 0x20))); + /* sra a1, a1, 0x1f */ + addInstr(env, MIPSInstr_Shft(Mshft_SRA, True /* 32bit shift */, + a1tmp, a1, MIPSRH_Imm(False, 0x1f))); + /* or v0, a3, v0 */ + addInstr(env, MIPSInstr_Alu(Malu_OR, v0, a3, MIPSRH_Reg(v0))); + + /* movn v0, v1, a0 */ + addInstr(env, MIPSInstr_MoveCond(MMoveCond_movn, v0, v1, a0tmp)); + /* movn v1, a1, a0 */ + addInstr(env, MIPSInstr_MoveCond(MMoveCond_movn, v1, a1tmp, a0tmp)); + + *rHi = v1; + *rLo = v0; return; } - case Iop_Sar64: { - HReg xLo, xHi; - HReg tLo = newVRegI(env); - HReg tLo1 = newVRegI(env); - HReg tHi = newVRegI(env); - HReg tmp = newVRegI(env); - HReg tmp2 = newVRegI(env); - HReg tmp3 = newVRegI(env); - HReg mask = newVRegI(env); - HReg tMask = newVRegI(env); - HReg discard = newVRegI(env); - HReg discard1 = newVRegI(env); - /* We assume any literal values are on the second operand. */ - iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1); - MIPSRH *ri_srcR = NULL; - MIPSRH *ri_srcR_sub = NULL; + case Iop_F32toI64S: { + HReg tmpD = newVRegD(env); + HReg valF = iselFltExpr(env, e->Iex.Binop.arg2); + HReg tLo = newVRegI(env); + HReg tHi = newVRegI(env); + MIPSAMode *am_addr; - ri_srcR = iselWordExpr_RH5u(env, e->Iex.Binop.arg2); - ri_srcR_sub = iselWordExpr_RH(env, True /*signed */ , - e->Iex.Binop.arg2); + /* CVTLS tmpD, valF */ + set_MIPS_rounding_mode(env, e->Iex.Binop.arg1); + addInstr(env, MIPSInstr_FpConvert(Mfp_CVTLS, tmpD, valF)); + set_MIPS_rounding_default(env); - /* Steps: - 1. 
Take shift-amount (arg2) least significant bits from upper - half of 64bit input value (arg1) - 2. Shift upper half - 3. Shift lower half - 4. Put discarded bits (those from step 1) to most significant - bit positions of lower half */ + sub_from_sp(env, 16); /* Move SP down 16 bytes */ + am_addr = MIPSAMode_IR(0, StackPointer(mode64)); - /* Mask for extraction of bits that will be discarded. */ - addInstr(env, MIPSInstr_LI(tmp, 0xffffffff)); - addInstr(env, MIPSInstr_Shft(Mshft_SLL, True /*32bit shift */, - tMask, tmp, ri_srcR)); - addInstr(env, MIPSInstr_Alu(Malu_NOR, mask, - tMask, MIPSRH_Reg(tMask))); + /* store as F64 */ + addInstr(env, MIPSInstr_FpLdSt(False /*store */ , 8, tmpD, + am_addr)); + /* load as 2xI32 */ +#if defined (_MIPSEL) + addInstr(env, MIPSInstr_Load(4, tLo, am_addr, mode64)); + addInstr(env, MIPSInstr_Load(4, tHi, nextMIPSAModeFloat(am_addr), + mode64)); +#elif defined (_MIPSEB) + addInstr(env, MIPSInstr_Load(4, tHi, am_addr, mode64)); + addInstr(env, MIPSInstr_Load(4, tLo, nextMIPSAModeFloat(am_addr), + mode64)); +#endif - /* Extraction of bits that will be discarded. */ - addInstr(env, MIPSInstr_Alu(Malu_AND, discard, xHi, - MIPSRH_Reg(mask))); - /* Position discarded bits to most significant bit positions. */ - addInstr(env, MIPSInstr_LI(tmp3, 32)); - addInstr(env, MIPSInstr_Alu(Malu_SUB, tmp2, - tmp3, ri_srcR_sub)); - addInstr(env, MIPSInstr_Shft(Mshft_SLL, True /*32bit shift */, - discard1, discard, MIPSRH_Reg(tmp2))); + /* Reset SP */ + add_to_sp(env, 16); - addInstr(env, MIPSInstr_Shft(Mshft_SRA, True /*32bit shift */, - tHi, xHi, ri_srcR)); - addInstr(env, MIPSInstr_Shft(Mshft_SRL, True /*32bit shift */, - tLo1, xLo, ri_srcR)); - - addInstr(env, MIPSInstr_Alu(Malu_OR, tLo, - tLo1, MIPSRH_Reg(discard1))); *rHi = tHi; *rLo = tLo; + return; } @@ -2695,7 +2806,7 @@ addInstr(env, MIPSInstr_Shft(Mshft_SLL, True, tmp, src, MIPSRH_Imm(False, 31))); - addInstr(env, MIPSInstr_Shft(Mshft_SRA, True, tmp, src, + addInstr(env, MIPSInstr_Shft(Mshft_SRA, True, tmp, tmp, MIPSRH_Imm(False, 31))); addInstr(env, mk_iMOVds_RR(tHi, tmp)); @@ -2748,35 +2859,31 @@ } case Iop_Left64: { - HReg yLo, yHi, borrow; + HReg yHi, yLo; HReg tHi = newVRegI(env); HReg tLo = newVRegI(env); + HReg tmp = newVRegI(env); + HReg tmp1 = newVRegI(env); + HReg tmp2 = newVRegI(env); HReg zero = newVRegI(env); - Bool size32 = True; MIPSCondCode cc = MIPScc_LO; - borrow = newVRegI(env); - /* yHi:yLo = arg */ iselInt64Expr(&yHi, &yLo, env, e->Iex.Unop.arg); /* zero = 0 */ addInstr(env, MIPSInstr_LI(zero, 0x00000000)); - /* tLo = 0 - yLo */ - addInstr(env, MIPSInstr_Alu(Malu_SUB, tLo, zero, MIPSRH_Reg(yLo))); + /* tmp2:tmp1 = 0 - (yHi:yLo)*/ + addInstr(env, MIPSInstr_Alu(Malu_SUB, tmp2, zero, MIPSRH_Reg(yLo))); + addInstr(env, MIPSInstr_Cmp(False, True, tmp1, zero, tmp2, cc)); + addInstr(env, MIPSInstr_Alu(Malu_SUB, tmp, zero, MIPSRH_Reg(yHi))); + addInstr(env, MIPSInstr_Alu(Malu_SUB, tmp1, tmp, MIPSRH_Reg(tmp1))); - /* Check if borrow is needed. */ - addInstr(env, MIPSInstr_Cmp(False, size32, borrow, zero, yLo, cc)); - - /* tHi = 0 - (yHi + borrow) */ - addInstr(env, MIPSInstr_Alu(Malu_ADD, - yHi, yHi, MIPSRH_Reg(borrow))); - addInstr(env, MIPSInstr_Alu(Malu_SUB, tHi, zero, MIPSRH_Reg(yHi))); - /* So now we have tHi:tLo = -arg. To finish off, or 'arg' + /* So now we have tmp2:tmp1 = -arg. To finish off, or 'arg' back in, so as to give the final result tHi:tLo = arg | -arg. 
*/ - addInstr(env, MIPSInstr_Alu(Malu_OR, tHi, tHi, MIPSRH_Reg(yHi))); - addInstr(env, MIPSInstr_Alu(Malu_OR, tLo, tLo, MIPSRH_Reg(yLo))); + addInstr(env, MIPSInstr_Alu(Malu_OR, tHi, yHi, MIPSRH_Reg(tmp1))); + addInstr(env, MIPSInstr_Alu(Malu_OR, tLo, yLo, MIPSRH_Reg(tmp2))); *rHi = tHi; *rLo = tLo; return; @@ -2865,7 +2972,7 @@ static HReg iselFltExpr_wrk(ISelEnv * env, IRExpr * e) { IRType ty = typeOfIRExpr(env->type_env, e); - vassert(ty == Ity_F32 || (ty == Ity_F64 && mode64)); + vassert(ty == Ity_F32 || (ty == Ity_F64 && fp_mode64)); if (e->tag == Iex_RdTmp) { return lookupIRTemp(env, e->Iex.RdTmp.tmp); @@ -2872,26 +2979,31 @@ } if (e->tag == Iex_Load) { - MIPSAMode *am_addr; - HReg r_dst = newVRegF(env); vassert(e->Iex.Load.ty == Ity_F32 - || (e->Iex.Load.ty == Ity_F64 && mode64)); - am_addr = iselWordExpr_AMode(env, e->Iex.Load.addr, ty); - if (mode64 && e->Iex.Load.ty == Ity_F64) + || (e->Iex.Load.ty == Ity_F64 && fp_mode64)); + HReg r_dst; + MIPSAMode *am_addr = iselWordExpr_AMode(env, e->Iex.Load.addr, ty); + if (e->Iex.Load.ty == Ity_F64) { + r_dst = newVRegD(env); addInstr(env, MIPSInstr_FpLdSt(True /*load */, 8, r_dst, am_addr)); - else + } else { + r_dst = newVRegF(env); addInstr(env, MIPSInstr_FpLdSt(True /*load */, 4, r_dst, am_addr)); + } return r_dst; } if (e->tag == Iex_Get) { - HReg r_dst = newVRegF(env); MIPSAMode *am_addr = MIPSAMode_IR(e->Iex.Get.offset, GuestStatePointer(mode64)); - if (mode64) + HReg r_dst; + if (e->Iex.Load.ty == Ity_F64) { + r_dst = newVRegD(env); addInstr(env, MIPSInstr_FpLdSt(True /*load */, 8, r_dst, am_addr)); - else + } else { + r_dst = newVRegF(env); addInstr(env, MIPSInstr_FpLdSt(True /*load */, 4, r_dst, am_addr)); + } return r_dst; } @@ -2908,7 +3020,7 @@ return r_dst; } case Iop_F32toF64: { - vassert(mode64); + vassert(fp_mode64); HReg src = iselFltExpr(env, e->Iex.Unop.arg); HReg dst = newVRegD(env); @@ -2916,24 +3028,29 @@ return dst; } case Iop_ReinterpI64asF64: { - vassert(mode64); - HReg fr_src = iselWordExpr_R(env, e->Iex.Unop.arg); - HReg r_dst = newVRegF(env); - - /* Move Doubleword to Floating Point - dmtc1 r_dst, fr_src */ - addInstr(env, MIPSInstr_FpGpMove(MFpGpMove_dmtc1, r_dst, fr_src)); - + HReg r_dst; + if (mode64) { + HReg fr_src = iselWordExpr_R(env, e->Iex.Unop.arg); + r_dst = newVRegF(env); + /* Move Doubleword to Floating Point + dmtc1 r_dst, fr_src */ + addInstr(env, MIPSInstr_FpGpMove(MFpGpMove_dmtc1, r_dst, fr_src)); + } else { + HReg Hi, Lo; + r_dst = newVRegD(env); + iselInt64Expr(&Hi, &Lo, env, e->Iex.Unop.arg); + r_dst = mk_LoadRR32toFPR(env, Hi, Lo); /* 2*I32 -> F64 */ + } return r_dst; } case Iop_I32StoF64: { - vassert(mode64); + vassert(fp_mode64); HReg dst = newVRegF(env); HReg tmp = newVRegF(env); HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg); /* Move Word to Floating Point - mtc1 tmp1, r_src */ + mtc1 tmp, r_src */ addInstr(env, MIPSInstr_FpGpMove(MFpGpMove_mtc1, tmp, r_src)); /* and do convert */ @@ -3010,7 +3127,7 @@ op = Mfp_DIVS; break; case Iop_DivF64: - vassert(mode64); + vassert(fp_mode64); op = Mfp_DIVD; break; case Iop_MulF32: @@ -3017,7 +3134,7 @@ op = Mfp_MULS; break; case Iop_MulF64: - vassert(mode64); + vassert(fp_mode64); op = Mfp_MULD; break; case Iop_AddF32: @@ -3024,7 +3141,7 @@ op = Mfp_ADDS; break; case Iop_AddF64: - vassert(mode64); + vassert(fp_mode64); op = Mfp_ADDD; break; case Iop_SubF32: @@ -3031,7 +3148,7 @@ op = Mfp_SUBS; break; case Iop_SubF64: - vassert(mode64); + vassert(fp_mode64); op = Mfp_SUBD; break; default: @@ -3101,24 +3218,30 @@ case Iop_I64StoF64: { HReg r_dst = 
newVRegF(env); - MIPSAMode *am_addr; - HReg fr_src = iselWordExpr_R(env, e->Iex.Binop.arg2); - HReg tmp = newVRegF(env); + HReg tmp, fr_src; + if (mode64) { + tmp = newVRegF(env); + fr_src = iselWordExpr_R(env, e->Iex.Binop.arg2); + /* Move SP down 8 bytes */ + sub_from_sp(env, 8); + am_addr = MIPSAMode_IR(0, StackPointer(mode64)); - /* Move SP down 8 bytes */ - sub_from_sp(env, 8); - am_addr = MIPSAMode_IR(0, StackPointer(mode64)); + /* store as I64 */ + addInstr(env, MIPSInstr_Store(8, am_addr, fr_src, mode64)); - /* store as I64 */ - addInstr(env, MIPSInstr_Store(8, am_addr, fr_src, mode64)); + /* load as Ity_F64 */ + addInstr(env, MIPSInstr_FpLdSt(True /*load */, 8, tmp, am_addr)); - /* load as Ity_F64 */ - addInstr(env, MIPSInstr_FpLdSt(True /*load */, 8, tmp, am_addr)); + /* Reset SP */ + add_to_sp(env, 8); + } else { + HReg Hi, Lo; + tmp = newVRegD(env); + iselInt64Expr(&Hi, &Lo, env, e->Iex.Binop.arg2); + tmp = mk_LoadRR32toFPR(env, Hi, Lo); /* 2*I32 -> F64 */ + } - /* Reset SP */ - add_to_sp(env, 8); - set_MIPS_rounding_mode(env, e->Iex.Binop.arg1); addInstr(env, MIPSInstr_FpConvert(Mfp_CVTDL, r_dst, tmp)); set_MIPS_rounding_default(env); @@ -3128,24 +3251,30 @@ case Iop_I64StoF32: { HReg r_dst = newVRegF(env); - MIPSAMode *am_addr; - HReg fr_src = iselWordExpr_R(env, e->Iex.Binop.arg2); - HReg tmp = newVRegF(env); + HReg fr_src, tmp; + if (mode64) { + tmp = newVRegF(env); + fr_src = iselWordExpr_R(env, e->Iex.Binop.arg2); + /* Move SP down 8 bytes */ + sub_from_sp(env, 8); + am_addr = MIPSAMode_IR(0, StackPointer(mode64)); - /* Move SP down 8 bytes */ - sub_from_sp(env, 8); - am_addr = MIPSAMode_IR(0, StackPointer(mode64)); + /* store as I64 */ + addInstr(env, MIPSInstr_Store(8, am_addr, fr_src, mode64)); - /* store as I64 */ - addInstr(env, MIPSInstr_Store(8, am_addr, fr_src, mode64)); + /* load as Ity_F64 */ + addInstr(env, MIPSInstr_FpLdSt(True /*load */, 8, tmp, am_addr)); - /* load as Ity_F64 */ - addInstr(env, MIPSInstr_FpLdSt(True /*load */, 8, tmp, am_addr)); + /* Reset SP */ + add_to_sp(env, 8); + } else { + HReg Hi, Lo; + tmp = newVRegD(env); + iselInt64Expr(&Hi, &Lo, env, e->Iex.Binop.arg2); + tmp = mk_LoadRR32toFPR(env, Hi, Lo); /* 2*I32 -> F64 */ + } - /* Reset SP */ - add_to_sp(env, 8); - set_MIPS_rounding_mode(env, e->Iex.Binop.arg1); addInstr(env, MIPSInstr_FpConvert(Mfp_CVTSL, r_dst, tmp)); set_MIPS_rounding_default(env); @@ -3155,7 +3284,6 @@ case Iop_SqrtF32: case Iop_SqrtF64: { - /* first arg is rounding mode; we ignore it. */ Bool sz32 = e->Iex.Binop.op == Iop_SqrtF32; HReg src = iselFltExpr(env, e->Iex.Binop.arg2); HReg dst = newVRegF(env); @@ -3368,26 +3496,22 @@ if (e->tag == Iex_Binop) { switch (e->Iex.Binop.op) { case Iop_RoundF64toInt: { - HReg valD = iselDblExpr(env, e->Iex.Binop.arg2); - MIPSRH *fmt = iselWordExpr_RH(env, False, e->Iex.Binop.arg1); - HReg valD1 = newVRegD(env); + HReg src = iselDblExpr(env, e->Iex.Binop.arg2); + HReg dst = newVRegD(env); - if (fmt->Mrh.Imm.imm16 == 0x3) - addInstr(env, MIPSInstr_FpConvert(Mfp_TRULD, valD1, valD)); - else if (fmt->Mrh.Imm.imm16 == 0x2) - addInstr(env, MIPSInstr_FpConvert(Mfp_CEILLD, valD1, valD)); - else if (fmt->Mrh.Imm.imm16 == 0x0) - addInstr(env, MIPSInstr_FpConvert(Mfp_ROUNDLD, valD1, valD)); - else - vassert(0); - return valD1; + set_MIPS_rounding_mode(env, e->Iex.Binop.arg1); + addInstr(env, MIPSInstr_FpConvert(Mfp_CVTLD, dst, src)); + set_MIPS_rounding_default(env); + + return dst; } case Iop_SqrtF64: { - /* first arg is rounding mode; we ignore it. 
*/ HReg src = iselDblExpr(env, e->Iex.Binop.arg2); HReg dst = newVRegD(env); + set_MIPS_rounding_mode(env, e->Iex.Binop.arg1); addInstr(env, MIPSInstr_FpUnary(Mfp_SQRTD, dst, src)); + set_MIPS_rounding_default(env); return dst; } @@ -3412,6 +3536,9 @@ case Iop_DivF64: op = Mfp_DIVD; break; + case Iop_DivF32: + op = Mfp_DIVS; + break; case Iop_MulF64: op = Mfp_MULD; break; @@ -3424,7 +3551,9 @@ default: vassert(0); } + set_MIPS_rounding_mode(env, e->Iex.Triop.details->arg1); addInstr(env, MIPSInstr_FpBinary(op, dst, argL, argR)); + set_MIPS_rounding_default(env); return dst; } default: @@ -3872,7 +4001,7 @@ case Ijk_SigFPE_IntDiv: case Ijk_SigFPE_IntOvf: case Ijk_Sys_syscall: - case Ijk_TInval: + case Ijk_InvalICache: { HReg r = iselWordExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst)); addInstr(env, MIPSInstr_XAssisted(r, amPC, cc, @@ -3971,11 +4100,12 @@ case Ijk_NoDecode: case Ijk_NoRedir: case Ijk_SigBUS: + case Ijk_SigILL: case Ijk_SigTRAP: case Ijk_SigFPE_IntDiv: case Ijk_SigFPE_IntOvf: case Ijk_Sys_syscall: - case Ijk_TInval: { + case Ijk_InvalICache: { HReg r = iselWordExpr_R(env, next); MIPSAMode* amPC = MIPSAMode_IR(offsIP, GuestStatePointer(env->mode64)); addInstr(env, MIPSInstr_XAssisted(r, amPC, MIPScc_AL, jk)); @@ -4021,11 +4151,16 @@ || VEX_PRID_COMP_NETLOGIC); mode64 = arch_host != VexArchMIPS32; +#if (__mips_fpr==64) + fp_mode64 = ((VEX_MIPS_REV(hwcaps_host) == VEX_PRID_CPU_32FPR) + || arch_host == VexArchMIPS64); +#endif /* Make up an initial environment to use. */ env = LibVEX_Alloc(sizeof(ISelEnv)); env->vreg_ctr = 0; env->mode64 = mode64; + env->fp_mode64 = fp_mode64; /* Set up output code array. */ env->code = newHInstrArray(); @@ -4090,6 +4225,7 @@ default: ppIRType(bb->tyenv->types[i]); vpanic("iselBB(mips): IRTemp type"); + break; } env->vregmap[i] = hreg; env->vregmapHI[i] = hregHI; Index: priv/host_ppc_defs.c =================================================================== --- priv/host_ppc_defs.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/host_ppc_defs.c (.../trunk) (revision 2863) @@ -4270,7 +4270,7 @@ case Ijk_EmFail: trcval = VEX_TRC_JMP_EMFAIL; break; //case Ijk_MapFail: trcval = VEX_TRC_JMP_MAPFAIL; break; case Ijk_NoDecode: trcval = VEX_TRC_JMP_NODECODE; break; - case Ijk_TInval: trcval = VEX_TRC_JMP_TINVAL; break; + case Ijk_InvalICache: trcval = VEX_TRC_JMP_INVALICACHE; break; case Ijk_NoRedir: trcval = VEX_TRC_JMP_NOREDIR; break; case Ijk_SigTRAP: trcval = VEX_TRC_JMP_SIGTRAP; break; //case Ijk_SigSEGV: trcval = VEX_TRC_JMP_SIGSEGV; break; Index: priv/host_ppc_isel.c =================================================================== --- priv/host_ppc_isel.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/host_ppc_isel.c (.../trunk) (revision 2863) @@ -4929,11 +4929,8 @@ } } - case Iop_Add32Fx4: fpop = Pavfp_ADDF; goto do_32Fx4; - case Iop_Sub32Fx4: fpop = Pavfp_SUBF; goto do_32Fx4; case Iop_Max32Fx4: fpop = Pavfp_MAXF; goto do_32Fx4; case Iop_Min32Fx4: fpop = Pavfp_MINF; goto do_32Fx4; - case Iop_Mul32Fx4: fpop = Pavfp_MULF; goto do_32Fx4; case Iop_CmpEQ32Fx4: fpop = Pavfp_CMPEQF; goto do_32Fx4; case Iop_CmpGT32Fx4: fpop = Pavfp_CMPGTF; goto do_32Fx4; case Iop_CmpGE32Fx4: fpop = Pavfp_CMPGEF; goto do_32Fx4; @@ -5213,6 +5210,25 @@ return dst; } + case Iop_Add32Fx4: fpop = Pavfp_ADDF; goto do_32Fx4_with_rm; + case Iop_Sub32Fx4: fpop = Pavfp_SUBF; goto do_32Fx4_with_rm; + case Iop_Mul32Fx4: fpop = Pavfp_MULF; goto do_32Fx4_with_rm; + do_32Fx4_with_rm: + { + HReg argL = iselVecExpr(env, triop->arg2); + HReg argR = iselVecExpr(env, triop->arg3); + HReg dst = 
newVRegV(env); + /* FIXME: this is bogus, in the sense that Altivec ignores + FPSCR.RM, at least for some FP operations. So setting the + RM is pointless. This is only really correct in the case + where the RM is known, at JIT time, to be Irrm_NEAREST, + since -- at least for Altivec FP add/sub/mul -- the + emitted insn is hardwired to round to nearest. */ + set_FPU_rounding_mode(env, triop->arg1); + addInstr(env, PPCInstr_AvBin32Fx4(fpop, dst, argL, argR)); + return dst; + } + default: break; } /* switch (e->Iex.Triop.op) */ @@ -5746,7 +5762,7 @@ case Ijk_SigBUS: case Ijk_SigTRAP: case Ijk_Sys_syscall: - case Ijk_TInval: + case Ijk_InvalICache: { HReg r = iselWordExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst)); addInstr(env, PPCInstr_XAssisted(r, amCIA, cc, @@ -5846,7 +5862,7 @@ case Ijk_SigBUS: case Ijk_SigTRAP: case Ijk_Sys_syscall: - case Ijk_TInval: + case Ijk_InvalICache: { HReg r = iselWordExpr_R(env, next); PPCAMode* amCIA = PPCAMode_IR(offsIP, hregPPC_GPR31(env->mode64)); Index: priv/host_s390_defs.c =================================================================== --- priv/host_s390_defs.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/host_s390_defs.c (.../trunk) (revision 2863) @@ -273,7 +273,11 @@ } -/* Construct an AMODE for accessing the guest state at OFFSET */ +/* Construct an AMODE for accessing the guest state at OFFSET. + OFFSET can be at most 3 * sizeof(VexGuestS390XState) + LibVEX_N_SPILL_BYTES + which may be too large for a B12 addressing mode. + Use a B20 amode as a fallback which will be safe for any offset. +*/ s390_amode * s390_amode_for_guest_state(Int offset) { @@ -280,6 +284,9 @@ if (fits_unsigned_12bit(offset)) return s390_amode_b12(offset, s390_hreg_guest_state_pointer()); + if (fits_signed_20bit(offset)) + return s390_amode_b20(offset, s390_hreg_guest_state_pointer()); + vpanic("invalid guest state offset"); } @@ -458,7 +465,6 @@ s390_amode *am; vassert(offsetB >= 0); - vassert(offsetB <= (1 << 12)); /* because we use b12 amode */ vassert(!hregIsVirtual(rreg)); *i1 = *i2 = NULL; @@ -485,7 +491,6 @@ s390_amode *am; vassert(offsetB >= 0); - vassert(offsetB <= (1 << 12)); /* because we use b12 amode */ vassert(!hregIsVirtual(rreg)); *i1 = *i2 = NULL; @@ -5861,7 +5866,6 @@ } else { /* From 16 bytes to smaller size */ vassert(is_valid_fp128_regpair(op_hi, op_lo)); - vassert(hregIsInvalid(dst_lo)); } insn->tag = S390_INSN_BFP_CONVERT; @@ -5891,11 +5895,11 @@ s390_insn * -s390_insn_bfp128_convert_from(UChar size, s390_bfp_conv_t tag, HReg dst, - HReg op_hi, HReg op_lo, +s390_insn_bfp128_convert_from(UChar size, s390_bfp_conv_t tag, HReg dst_hi, + HReg dst_lo, HReg op_hi, HReg op_lo, s390_bfp_round_t rounding_mode) { - return s390_insn_bfp128_convert(size, tag, dst, INVALID_HREG, op_hi, op_lo, + return s390_insn_bfp128_convert(size, tag, dst_hi, dst_lo, op_hi, op_lo, rounding_mode); } @@ -6192,7 +6196,6 @@ } else { /* From 16 bytes to smaller size */ vassert(is_valid_fp128_regpair(op_hi, op_lo)); - vassert(hregIsInvalid(dst_lo)); } insn->tag = S390_INSN_DFP_CONVERT; @@ -6222,11 +6225,11 @@ s390_insn * -s390_insn_dfp128_convert_from(UChar size, s390_dfp_conv_t tag, HReg dst, - HReg op_hi, HReg op_lo, +s390_insn_dfp128_convert_from(UChar size, s390_dfp_conv_t tag, HReg dst_hi, + HReg dst_lo, HReg op_hi, HReg op_lo, s390_dfp_round_t rounding_mode) { - return s390_insn_dfp128_convert(size, tag, dst, INVALID_HREG, op_hi, op_lo, + return s390_insn_dfp128_convert(size, tag, dst_hi, dst_lo, op_hi, op_lo, rounding_mode); } @@ -6461,7 +6464,7 @@ case Ijk_EmFail: return 
"EmFail"; case Ijk_NoDecode: return "NoDecode"; case Ijk_MapFail: return "MapFail"; - case Ijk_TInval: return "Invalidate"; + case Ijk_InvalICache: return "Invalidate"; case Ijk_NoRedir: return "NoRedir"; case Ijk_SigTRAP: return "SigTRAP"; case Ijk_SigSEGV: return "SigSEGV"; @@ -9780,7 +9783,7 @@ case Ijk_EmFail: trcval = VEX_TRC_JMP_EMFAIL; break; case Ijk_MapFail: trcval = VEX_TRC_JMP_MAPFAIL; break; case Ijk_NoDecode: trcval = VEX_TRC_JMP_NODECODE; break; - case Ijk_TInval: trcval = VEX_TRC_JMP_TINVAL; break; + case Ijk_InvalICache: trcval = VEX_TRC_JMP_INVALICACHE; break; case Ijk_NoRedir: trcval = VEX_TRC_JMP_NOREDIR; break; case Ijk_SigTRAP: trcval = VEX_TRC_JMP_SIGTRAP; break; case Ijk_SigSEGV: trcval = VEX_TRC_JMP_SIGSEGV; break; Index: priv/host_s390_defs.h =================================================================== --- priv/host_s390_defs.h (.../tags/VEX_3_9_0) (revision 2863) +++ priv/host_s390_defs.h (.../trunk) (revision 2863) @@ -665,8 +665,8 @@ s390_insn *s390_insn_bfp128_convert_to(UChar size, s390_bfp_conv_t, HReg dst_hi, HReg dst_lo, HReg op); s390_insn *s390_insn_bfp128_convert_from(UChar size, s390_bfp_conv_t, - HReg dst, HReg op_hi, HReg op_lo, - s390_bfp_round_t); + HReg dst_hi, HReg dst_lo, HReg op_hi, + HReg op_lo, s390_bfp_round_t); s390_insn *s390_insn_dfp_binop(UChar size, s390_dfp_binop_t, HReg dst, HReg op2, HReg op3, s390_dfp_round_t rounding_mode); @@ -699,8 +699,8 @@ s390_insn *s390_insn_dfp128_convert_to(UChar size, s390_dfp_conv_t, HReg dst_hi, HReg dst_lo, HReg op); s390_insn *s390_insn_dfp128_convert_from(UChar size, s390_dfp_conv_t, - HReg dst, HReg op_hi, HReg op_lo, - s390_dfp_round_t); + HReg dst_hi, HReg dst_lo, HReg op_hi, + HReg op_lo, s390_dfp_round_t); s390_insn *s390_insn_dfp128_reround(UChar size, HReg dst_hi, HReg dst_lo, HReg op2, HReg op3_hi, HReg op3_lo, s390_dfp_round_t); Index: priv/host_s390_isel.c =================================================================== --- priv/host_s390_isel.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/host_s390_isel.c (.../trunk) (revision 2863) @@ -1257,7 +1257,8 @@ addInstr(env, s390_insn_move(8, f15, op_lo)); rounding_mode = get_bfp_rounding_mode(env, arg1); - addInstr(env, s390_insn_bfp128_convert_from(size, conv, res, f13, f15, + addInstr(env, s390_insn_bfp128_convert_from(size, conv, res, + INVALID_HREG, f13, f15, rounding_mode)); return res; } @@ -1290,7 +1291,8 @@ addInstr(env, s390_insn_move(8, f15, op_lo)); rounding_mode = get_dfp_rounding_mode(env, arg1); - addInstr(env, s390_insn_dfp128_convert_from(size, dconv, res, f13, + addInstr(env, s390_insn_dfp128_convert_from(size, dconv, res, + INVALID_HREG, f13, f15, rounding_mode)); return res; } @@ -2455,7 +2457,7 @@ case Iop_F128toF64: case Iop_F128toF32: { - HReg op_hi, op_lo, f13, f15; + HReg op_hi, op_lo, f12, f13, f14, f15; s390_bfp_round_t rounding_mode; conv = op == Iop_F128toF32 ? S390_BFP_F128_TO_F32 @@ -2463,8 +2465,10 @@ s390_isel_float128_expr(&op_hi, &op_lo, env, left); - /* We use non-virtual registers as pairs (f13, f15) */ + /* We use non-virtual registers as pairs (f13, f15) and (f12, f14)) */ + f12 = make_fpr(12); f13 = make_fpr(13); + f14 = make_fpr(14); f15 = make_fpr(15); /* operand --> (f13, f15) */ @@ -2471,7 +2475,8 @@ addInstr(env, s390_insn_move(8, f13, op_hi)); addInstr(env, s390_insn_move(8, f15, op_lo)); - dst = newVRegF(env); + /* result --> (f12, f14) */ + /* load-rounded has a rounding mode field when the floating point extension facility is installed. 
*/ if (s390_host_has_fpext) { @@ -2480,8 +2485,12 @@ set_bfp_rounding_mode_in_fpc(env, irrm); rounding_mode = S390_BFP_ROUND_PER_FPC; } - addInstr(env, s390_insn_bfp128_convert_from(size, conv, dst, f13, f15, - rounding_mode)); + + addInstr(env, s390_insn_bfp128_convert_from(size, conv, f12, f14, + f13, f15, rounding_mode)); + dst = newVRegF(env); + addInstr(env, s390_insn_move(8, dst, f12)); + return dst; } } @@ -3044,7 +3053,7 @@ } case Iop_D128toD64: { - HReg op_hi, op_lo, f13, f15; + HReg op_hi, op_lo, f12, f13, f14, f15; s390_dfp_round_t rounding_mode; conv = S390_DFP_D128_TO_D64; @@ -3051,8 +3060,10 @@ s390_isel_dfp128_expr(&op_hi, &op_lo, env, left); - /* We use non-virtual registers as pairs (f13, f15) */ + /* We use non-virtual registers as pairs (f13, f15) and (f12, f14) */ + f12 = make_fpr(12); f13 = make_fpr(13); + f14 = make_fpr(14); f15 = make_fpr(15); /* operand --> (f13, f15) */ @@ -3059,7 +3070,8 @@ addInstr(env, s390_insn_move(8, f13, op_hi)); addInstr(env, s390_insn_move(8, f15, op_lo)); - dst = newVRegF(env); + /* result --> (f12, f14) */ + /* load-rounded has a rounding mode field when the floating point extension facility is installed. */ if (s390_host_has_fpext) { @@ -3068,8 +3080,11 @@ set_dfp_rounding_mode_in_fpc(env, irrm); rounding_mode = S390_DFP_ROUND_PER_FPC_0; } - addInstr(env, s390_insn_dfp128_convert_from(size, conv, dst, f13, f15, - rounding_mode)); + addInstr(env, s390_insn_dfp128_convert_from(size, conv, f12, f14, + f13, f15, rounding_mode)); + dst = newVRegF(env); + addInstr(env, s390_insn_move(8, dst, f12)); + return dst; } @@ -3921,7 +3936,7 @@ case Ijk_EmFail: case Ijk_EmWarn: case Ijk_NoDecode: - case Ijk_TInval: + case Ijk_InvalICache: case Ijk_Sys_syscall: case Ijk_ClientReq: case Ijk_NoRedir: @@ -4036,7 +4051,7 @@ case Ijk_EmFail: case Ijk_EmWarn: case Ijk_NoDecode: - case Ijk_TInval: + case Ijk_InvalICache: case Ijk_Sys_syscall: case Ijk_ClientReq: case Ijk_NoRedir: Index: priv/host_x86_defs.c =================================================================== --- priv/host_x86_defs.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/host_x86_defs.c (.../trunk) (revision 2863) @@ -2023,11 +2023,25 @@ case Xfp_COS: *p++ = 0xD9; *p++ = 0xFF; break; case Xfp_2XM1: *p++ = 0xD9; *p++ = 0xF0; break; case Xfp_MOV: break; - case Xfp_TAN: p = do_ffree_st7(p); /* since fptan pushes 1.0 */ - *p++ = 0xD9; *p++ = 0xF2; /* fptan */ - *p++ = 0xD9; *p++ = 0xF7; /* fincstp */ - break; - default: vpanic("do_fop1_st: unknown op"); + case Xfp_TAN: + /* fptan pushes 1.0 on the FP stack, except when the argument + is out of range. Hence we have to do the instruction, + then inspect C2 to see if there is an out of range + condition. If there is, we skip the fincstp that is used + by the in-range case to get rid of this extra 1.0 + value. 
*/ + p = do_ffree_st7(p); /* since fptan sometimes pushes 1.0 */ + *p++ = 0xD9; *p++ = 0xF2; // fptan + *p++ = 0x50; // pushl %eax + *p++ = 0xDF; *p++ = 0xE0; // fnstsw %ax + *p++ = 0x66; *p++ = 0xA9; + *p++ = 0x00; *p++ = 0x04; // testw $0x400,%ax + *p++ = 0x75; *p++ = 0x02; // jnz after_fincstp + *p++ = 0xD9; *p++ = 0xF7; // fincstp + *p++ = 0x58; // after_fincstp: popl %eax + break; + default: + vpanic("do_fop1_st: unknown op"); } return p; } @@ -2539,7 +2553,7 @@ case Ijk_EmWarn: trcval = VEX_TRC_JMP_EMWARN; break; case Ijk_MapFail: trcval = VEX_TRC_JMP_MAPFAIL; break; case Ijk_NoDecode: trcval = VEX_TRC_JMP_NODECODE; break; - case Ijk_TInval: trcval = VEX_TRC_JMP_TINVAL; break; + case Ijk_InvalICache: trcval = VEX_TRC_JMP_INVALICACHE; break; case Ijk_NoRedir: trcval = VEX_TRC_JMP_NOREDIR; break; case Ijk_SigTRAP: trcval = VEX_TRC_JMP_SIGTRAP; break; case Ijk_SigSEGV: trcval = VEX_TRC_JMP_SIGSEGV; break; Index: priv/host_x86_isel.c =================================================================== --- priv/host_x86_isel.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/host_x86_isel.c (.../trunk) (revision 2863) @@ -3147,6 +3147,11 @@ HReg src = iselDblExpr(env, e->Iex.Binop.arg2); /* XXXROUNDINGFIXME */ /* set roundingmode here */ + /* Note that X86Instr_FpUnary(Xfp_TAN,..) sets the condition + codes. I don't think that matters, since this insn + selector never generates such an instruction intervening + between an flag-setting instruction and a flag-using + instruction. */ addInstr(env, X86Instr_FpUnary(fpop,src,res)); if (fpop != Xfp_SQRT && fpop != Xfp_NEG && fpop != Xfp_ABS) @@ -3554,12 +3559,8 @@ case Iop_CmpLT32Fx4: op = Xsse_CMPLTF; goto do_32Fx4; case Iop_CmpLE32Fx4: op = Xsse_CMPLEF; goto do_32Fx4; case Iop_CmpUN32Fx4: op = Xsse_CMPUNF; goto do_32Fx4; - case Iop_Add32Fx4: op = Xsse_ADDF; goto do_32Fx4; - case Iop_Div32Fx4: op = Xsse_DIVF; goto do_32Fx4; case Iop_Max32Fx4: op = Xsse_MAXF; goto do_32Fx4; case Iop_Min32Fx4: op = Xsse_MINF; goto do_32Fx4; - case Iop_Mul32Fx4: op = Xsse_MULF; goto do_32Fx4; - case Iop_Sub32Fx4: op = Xsse_SUBF; goto do_32Fx4; do_32Fx4: { HReg argL = iselVecExpr(env, e->Iex.Binop.arg1); @@ -3574,12 +3575,8 @@ case Iop_CmpLT64Fx2: op = Xsse_CMPLTF; goto do_64Fx2; case Iop_CmpLE64Fx2: op = Xsse_CMPLEF; goto do_64Fx2; case Iop_CmpUN64Fx2: op = Xsse_CMPUNF; goto do_64Fx2; - case Iop_Add64Fx2: op = Xsse_ADDF; goto do_64Fx2; - case Iop_Div64Fx2: op = Xsse_DIVF; goto do_64Fx2; case Iop_Max64Fx2: op = Xsse_MAXF; goto do_64Fx2; case Iop_Min64Fx2: op = Xsse_MINF; goto do_64Fx2; - case Iop_Mul64Fx2: op = Xsse_MULF; goto do_64Fx2; - case Iop_Sub64Fx2: op = Xsse_SUBF; goto do_64Fx2; do_64Fx2: { HReg argL = iselVecExpr(env, e->Iex.Binop.arg1); @@ -3790,6 +3787,50 @@ } /* switch (e->Iex.Binop.op) */ } /* if (e->tag == Iex_Binop) */ + + if (e->tag == Iex_Triop) { + IRTriop *triop = e->Iex.Triop.details; + switch (triop->op) { + + case Iop_Add32Fx4: op = Xsse_ADDF; goto do_32Fx4_w_rm; + case Iop_Sub32Fx4: op = Xsse_SUBF; goto do_32Fx4_w_rm; + case Iop_Mul32Fx4: op = Xsse_MULF; goto do_32Fx4_w_rm; + case Iop_Div32Fx4: op = Xsse_DIVF; goto do_32Fx4_w_rm; + do_32Fx4_w_rm: + { + HReg argL = iselVecExpr(env, triop->arg2); + HReg argR = iselVecExpr(env, triop->arg3); + HReg dst = newVRegV(env); + addInstr(env, mk_vMOVsd_RR(argL, dst)); + /* XXXROUNDINGFIXME */ + /* set roundingmode here */ + addInstr(env, X86Instr_Sse32Fx4(op, argR, dst)); + return dst; + } + + case Iop_Add64Fx2: op = Xsse_ADDF; goto do_64Fx2_w_rm; + case Iop_Sub64Fx2: op = Xsse_SUBF; goto do_64Fx2_w_rm; + 
case Iop_Mul64Fx2: op = Xsse_MULF; goto do_64Fx2_w_rm; + case Iop_Div64Fx2: op = Xsse_DIVF; goto do_64Fx2_w_rm; + do_64Fx2_w_rm: + { + HReg argL = iselVecExpr(env, triop->arg2); + HReg argR = iselVecExpr(env, triop->arg3); + HReg dst = newVRegV(env); + REQUIRE_SSE2; + addInstr(env, mk_vMOVsd_RR(argL, dst)); + /* XXXROUNDINGFIXME */ + /* set roundingmode here */ + addInstr(env, X86Instr_Sse64Fx2(op, argR, dst)); + return dst; + } + + default: + break; + } /* switch (triop->op) */ + } /* if (e->tag == Iex_Triop) */ + + if (e->tag == Iex_ITE) { // VFD HReg r1 = iselVecExpr(env, e->Iex.ITE.iftrue); HReg r0 = iselVecExpr(env, e->Iex.ITE.iffalse); @@ -4244,8 +4285,9 @@ case Ijk_Sys_int128: case Ijk_Sys_int129: case Ijk_Sys_int130: + case Ijk_Sys_syscall: case Ijk_Sys_sysenter: - case Ijk_TInval: + case Ijk_InvalICache: case Ijk_Yield: { HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst)); @@ -4342,8 +4384,9 @@ case Ijk_Sys_int128: case Ijk_Sys_int129: case Ijk_Sys_int130: + case Ijk_Sys_syscall: case Ijk_Sys_sysenter: - case Ijk_TInval: + case Ijk_InvalICache: case Ijk_Yield: { HReg r = iselIntExpr_R(env, next); Index: priv/ir_defs.c =================================================================== --- priv/ir_defs.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/ir_defs.c (.../trunk) (revision 2863) @@ -640,6 +640,7 @@ case Iop_Recps32Fx2: vex_printf("VRecps32Fx2"); return; case Iop_Recps32Fx4: vex_printf("VRecps32Fx4"); return; case Iop_Abs32Fx4: vex_printf("Abs32Fx4"); return; + case Iop_Abs64Fx2: vex_printf("Abs64Fx2"); return; case Iop_Rsqrts32Fx4: vex_printf("VRsqrts32Fx4"); return; case Iop_Rsqrts32Fx2: vex_printf("VRsqrts32Fx2"); return; @@ -685,6 +686,7 @@ case Iop_CmpLE64F0x2: vex_printf("CmpLE64F0x2"); return; case Iop_CmpUN64F0x2: vex_printf("CmpUN64F0x2"); return; + case Iop_Neg64Fx2: vex_printf("Neg64Fx2"); return; case Iop_Neg32Fx4: vex_printf("Neg32Fx4"); return; case Iop_Neg32Fx2: vex_printf("Neg32Fx2"); return; @@ -695,6 +697,11 @@ case Iop_64UtoV128: vex_printf("64UtoV128"); return; case Iop_SetV128lo64: vex_printf("SetV128lo64"); return; + case Iop_ZeroHI64ofV128: vex_printf("ZeroHI64ofV128"); return; + case Iop_ZeroHI96ofV128: vex_printf("ZeroHI96ofV128"); return; + case Iop_ZeroHI112ofV128: vex_printf("ZeroHI112ofV128"); return; + case Iop_ZeroHI120ofV128: vex_printf("ZeroHI120ofV128"); return; + case Iop_32UtoV128: vex_printf("32UtoV128"); return; case Iop_V128to32: vex_printf("V128to32"); return; case Iop_SetV128lo32: vex_printf("SetV128lo32"); return; @@ -1405,8 +1412,10 @@ case Ijk_EmFail: vex_printf("EmFail"); break; case Ijk_NoDecode: vex_printf("NoDecode"); break; case Ijk_MapFail: vex_printf("MapFail"); break; - case Ijk_TInval: vex_printf("Invalidate"); break; + case Ijk_InvalICache: vex_printf("InvalICache"); break; + case Ijk_FlushDCache: vex_printf("FlushDCache"); break; case Ijk_NoRedir: vex_printf("NoRedir"); break; + case Ijk_SigILL: vex_printf("SigILL"); break; case Ijk_SigTRAP: vex_printf("SigTRAP"); break; case Ijk_SigSEGV: vex_printf("SigSEGV"); break; case Ijk_SigBUS: vex_printf("SigBUS"); break; @@ -2734,7 +2743,7 @@ case Iop_RoundF32x4_RP: case Iop_RoundF32x4_RN: case Iop_RoundF32x4_RZ: - case Iop_Abs32Fx4: + case Iop_Abs64Fx2: case Iop_Abs32Fx4: case Iop_Rsqrte32Fx4: case Iop_Rsqrte32x4: UNARY(Ity_V128, Ity_V128); @@ -2789,19 +2798,19 @@ case Iop_CmpEQ64F0x2: case Iop_CmpLT64F0x2: case Iop_CmpLE32F0x4: case Iop_CmpUN32F0x4: case Iop_CmpLE64F0x2: case Iop_CmpUN64F0x2: - case Iop_Add32Fx4: case Iop_Add32F0x4: - case Iop_Add64Fx2: case 
Iop_Add64F0x2: - case Iop_Div32Fx4: case Iop_Div32F0x4: - case Iop_Div64Fx2: case Iop_Div64F0x2: + case Iop_Add32F0x4: + case Iop_Add64F0x2: + case Iop_Div32F0x4: + case Iop_Div64F0x2: case Iop_Max32Fx4: case Iop_Max32F0x4: case Iop_PwMax32Fx4: case Iop_PwMin32Fx4: case Iop_Max64Fx2: case Iop_Max64F0x2: case Iop_Min32Fx4: case Iop_Min32F0x4: case Iop_Min64Fx2: case Iop_Min64F0x2: - case Iop_Mul32Fx4: case Iop_Mul32F0x4: - case Iop_Mul64Fx2: case Iop_Mul64F0x2: - case Iop_Sub32Fx4: case Iop_Sub32F0x4: - case Iop_Sub64Fx2: case Iop_Sub64F0x2: + case Iop_Mul32F0x4: + case Iop_Mul64F0x2: + case Iop_Sub32F0x4: + case Iop_Sub64F0x2: case Iop_AndV128: case Iop_OrV128: case Iop_XorV128: case Iop_Add8x16: case Iop_Add16x8: case Iop_Add32x4: case Iop_Add64x2: @@ -2900,10 +2909,12 @@ case Iop_Reverse64_8x16: case Iop_Reverse64_16x8: case Iop_Reverse64_32x4: case Iop_Reverse32_8x16: case Iop_Reverse32_16x8: case Iop_Reverse16_8x16: - case Iop_Neg32Fx4: + case Iop_Neg64Fx2: case Iop_Neg32Fx4: case Iop_Abs8x16: case Iop_Abs16x8: case Iop_Abs32x4: case Iop_CipherSV128: case Iop_PwBitMtxXpose64x2: + case Iop_ZeroHI64ofV128: case Iop_ZeroHI96ofV128: + case Iop_ZeroHI112ofV128: case Iop_ZeroHI120ofV128: UNARY(Ity_V128, Ity_V128); case Iop_ShlV128: case Iop_ShrV128: @@ -2966,7 +2977,7 @@ case Iop_QDMulLong16Sx4: case Iop_QDMulLong32Sx2: BINARY(Ity_I64, Ity_I64, Ity_V128); - /* s390 specific */ + /* s390 specific */ case Iop_MAddF32: case Iop_MSubF32: QUATERNARY(ity_RMode,Ity_F32,Ity_F32,Ity_F32, Ity_F32); @@ -2984,6 +2995,18 @@ case Iop_DivF128: TERNARY(ity_RMode,Ity_F128,Ity_F128, Ity_F128); + case Iop_Add64Fx2: case Iop_Sub64Fx2: + case Iop_Mul64Fx2: case Iop_Div64Fx2: + case Iop_Add32Fx4: case Iop_Sub32Fx4: + case Iop_Mul32Fx4: case Iop_Div32Fx4: + TERNARY(ity_RMode,Ity_V128,Ity_V128, Ity_V128); + + case Iop_Add64Fx4: case Iop_Sub64Fx4: + case Iop_Mul64Fx4: case Iop_Div64Fx4: + case Iop_Add32Fx8: case Iop_Sub32Fx8: + case Iop_Mul32Fx8: case Iop_Div32Fx8: + TERNARY(ity_RMode,Ity_V256,Ity_V256, Ity_V256); + case Iop_NegF128: case Iop_AbsF128: UNARY(Ity_F128, Ity_F128); @@ -3203,10 +3226,6 @@ case Iop_64x4toV256: QUATERNARY(Ity_I64, Ity_I64, Ity_I64, Ity_I64, Ity_V256); - case Iop_Add64Fx4: case Iop_Sub64Fx4: - case Iop_Mul64Fx4: case Iop_Div64Fx4: - case Iop_Add32Fx8: case Iop_Sub32Fx8: - case Iop_Mul32Fx8: case Iop_Div32Fx8: case Iop_AndV256: case Iop_OrV256: case Iop_XorV256: case Iop_Max32Fx8: case Iop_Min32Fx8: @@ -4465,6 +4484,17 @@ } } +IRType integerIRTypeOfSize ( Int szB ) +{ + switch (szB) { + case 8: return Ity_I64; + case 4: return Ity_I32; + case 2: return Ity_I16; + case 1: return Ity_I8; + default: vpanic("integerIRTypeOfSize"); + } +} + IRExpr* mkIRExpr_HWord ( HWord hw ) { vassert(sizeof(void*) == sizeof(HWord)); Index: priv/ir_opt.c =================================================================== --- priv/ir_opt.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/ir_opt.c (.../trunk) (revision 2863) @@ -1178,6 +1178,30 @@ && e->Iex.Const.con->Ico.U32 == 0); } +/* Is this literally IRExpr_Const(IRConst_U64(0)) ? */ +static Bool isZeroU64 ( IRExpr* e ) +{ + return toBool( e->tag == Iex_Const + && e->Iex.Const.con->tag == Ico_U64 + && e->Iex.Const.con->Ico.U64 == 0); +} + +/* Is this literally IRExpr_Const(IRConst_V128(0)) ? */ +static Bool isZeroV128 ( IRExpr* e ) +{ + return toBool( e->tag == Iex_Const + && e->Iex.Const.con->tag == Ico_V128 + && e->Iex.Const.con->Ico.V128 == 0x0000); +} + +/* Is this literally IRExpr_Const(IRConst_V256(0)) ? 
*/ +static Bool isZeroV256 ( IRExpr* e ) +{ + return toBool( e->tag == Iex_Const + && e->Iex.Const.con->tag == Ico_V256 + && e->Iex.Const.con->Ico.V256 == 0x00000000); +} + /* Is this an integer constant with value 0 ? */ static Bool isZeroU ( IRExpr* e ) { @@ -1224,9 +1248,11 @@ case Iop_Xor16: return IRExpr_Const(IRConst_U16(0)); case Iop_Sub32: case Iop_Xor32: return IRExpr_Const(IRConst_U32(0)); + case Iop_And64: case Iop_Sub64: case Iop_Xor64: return IRExpr_Const(IRConst_U64(0)); - case Iop_XorV128: return IRExpr_Const(IRConst_V128(0)); + case Iop_XorV128: + case Iop_AndV128: return IRExpr_Const(IRConst_V128(0)); default: vpanic("mkZeroOfPrimopResultType: bad primop"); } } @@ -1990,6 +2016,17 @@ } break; } + /* Same reasoning for the 256-bit version. */ + case Iop_V128HLtoV256: { + IRExpr* argHi = e->Iex.Binop.arg1; + IRExpr* argLo = e->Iex.Binop.arg2; + if (isZeroV128(argHi) && isZeroV128(argLo)) { + e2 = IRExpr_Const(IRConst_V256(0)); + } else { + goto unhandled; + } + break; + } /* -- V128 stuff -- */ case Iop_InterleaveLO8x16: { @@ -2114,6 +2151,13 @@ break; } break; + case Iop_Sub8x16: + /* Sub8x16(x,0) ==> x */ + if (isZeroV128(e->Iex.Binop.arg2)) { + e2 = e->Iex.Binop.arg1; + break; + } + break; case Iop_And64: case Iop_And32: @@ -2149,6 +2193,19 @@ e2 = e->Iex.Binop.arg1; break; } + /* Deal with either arg zero. Could handle other And + cases here too. */ + if (e->Iex.Binop.op == Iop_And64 + && (isZeroU64(e->Iex.Binop.arg1) + || isZeroU64(e->Iex.Binop.arg2))) { + e2 = mkZeroOfPrimopResultType(e->Iex.Binop.op); + break; + } else if (e->Iex.Binop.op == Iop_AndV128 + && (isZeroV128(e->Iex.Binop.arg1) + || isZeroV128(e->Iex.Binop.arg2))) { + e2 = mkZeroOfPrimopResultType(e->Iex.Binop.op); + break; + } break; case Iop_OrV128: @@ -2158,6 +2215,29 @@ e2 = e->Iex.Binop.arg1; break; } + /* OrV128(t,0) ==> t */ + if (e->Iex.Binop.op == Iop_OrV128) { + if (isZeroV128(e->Iex.Binop.arg2)) { + e2 = e->Iex.Binop.arg1; + break; + } + if (isZeroV128(e->Iex.Binop.arg1)) { + e2 = e->Iex.Binop.arg2; + break; + } + } + /* OrV256(t,0) ==> t */ + if (e->Iex.Binop.op == Iop_OrV256) { + if (isZeroV256(e->Iex.Binop.arg2)) { + e2 = e->Iex.Binop.arg1; + break; + } + //Disabled because there's no known test case right now. 
+ //if (isZeroV256(e->Iex.Binop.arg1)) { + // e2 = e->Iex.Binop.arg2; + // break; + //} + } break; case Iop_Xor8: Index: priv/main_main.c =================================================================== --- priv/main_main.c (.../tags/VEX_3_9_0) (revision 2863) +++ priv/main_main.c (.../trunk) (revision 2863) @@ -38,6 +38,7 @@ #include "libvex_guest_x86.h" #include "libvex_guest_amd64.h" #include "libvex_guest_arm.h" +#include "libvex_guest_arm64.h" #include "libvex_guest_ppc32.h" #include "libvex_guest_ppc64.h" #include "libvex_guest_s390x.h" @@ -53,6 +54,7 @@ #include "host_amd64_defs.h" #include "host_ppc_defs.h" #include "host_arm_defs.h" +#include "host_arm64_defs.h" #include "host_s390_defs.h" #include "host_mips_defs.h" @@ -60,6 +62,7 @@ #include "guest_x86_defs.h" #include "guest_amd64_defs.h" #include "guest_arm_defs.h" +#include "guest_arm64_defs.h" #include "guest_ppc_defs.h" #include "guest_s390_defs.h" #include "guest_mips_defs.h" @@ -89,6 +92,7 @@ void LibVEX_default_VexControl ( /*OUT*/ VexControl* vcon ) { + vex_bzero(vcon, sizeof(*vcon)); vcon->iropt_verbosity = 0; vcon->iropt_level = 2; vcon->iropt_register_updates = VexRegUpdUnwindregsAtMemAccess; @@ -233,7 +237,7 @@ HInstrArray* vcode; HInstrArray* rcode; Int i, j, k, out_used, guest_sizeB; - Int offB_TISTART, offB_TILEN, offB_GUEST_IP, szB_GUEST_IP; + Int offB_CMSTART, offB_CMLEN, offB_GUEST_IP, szB_GUEST_IP; Int offB_HOST_EvC_COUNTER, offB_HOST_EvC_FAILADDR; UChar insn_bytes[128]; IRType guest_word_type; @@ -259,8 +263,8 @@ disInstrFn = NULL; guest_word_type = Ity_INVALID; host_word_type = Ity_INVALID; - offB_TISTART = 0; - offB_TILEN = 0; + offB_CMSTART = 0; + offB_CMLEN = 0; offB_GUEST_IP = 0; szB_GUEST_IP = 0; offB_HOST_EvC_COUNTER = 0; @@ -417,6 +421,30 @@ vassert(are_valid_hwcaps(VexArchARM, vta->archinfo_host.hwcaps)); break; + case VexArchARM64: + mode64 = True; + getAllocableRegs_ARM64 ( &n_available_real_regs, + &available_real_regs ); + isMove = (Bool(*)(HInstr*,HReg*,HReg*)) isMove_ARM64Instr; + getRegUsage = (void(*)(HRegUsage*,HInstr*, Bool)) + getRegUsage_ARM64Instr; + mapRegs = (void(*)(HRegRemap*,HInstr*, Bool)) + mapRegs_ARM64Instr; + genSpill = (void(*)(HInstr**,HInstr**,HReg,Int,Bool)) + genSpill_ARM64; + genReload = (void(*)(HInstr**,HInstr**,HReg,Int,Bool)) + genReload_ARM64; + ppInstr = (void(*)(HInstr*, Bool)) ppARM64Instr; + ppReg = (void(*)(HReg)) ppHRegARM64; + iselSB = iselSB_ARM64; + emit = (Int(*)(Bool*,UChar*,Int,HInstr*,Bool, + void*,void*,void*,void*)) + emit_ARM64Instr; + host_is_bigendian = False; + host_word_type = Ity_I64; + vassert(are_valid_hwcaps(VexArchARM64, vta->archinfo_host.hwcaps)); + break; + case VexArchMIPS32: mode64 = False; getAllocableRegs_MIPS ( &n_available_real_regs, @@ -479,8 +507,8 @@ guest_sizeB = sizeof(VexGuestX86State); guest_word_type = Ity_I32; guest_layout = &x86guest_layout; - offB_TISTART = offsetof(VexGuestX86State,guest_TISTART); - offB_TILEN = offsetof(VexGuestX86State,guest_TILEN); + offB_CMSTART = offsetof(VexGuestX86State,guest_CMSTART); + offB_CMLEN = offsetof(VexGuestX86State,guest_CMLEN); offB_GUEST_IP = offsetof(VexGuestX86State,guest_EIP); szB_GUEST_IP = sizeof( ((VexGuestX86State*)0)->guest_EIP ); offB_HOST_EvC_COUNTER = offsetof(VexGuestX86State,host_EvC_COUNTER); @@ -487,8 +515,8 @@ offB_HOST_EvC_FAILADDR = offsetof(VexGuestX86State,host_EvC_FAILADDR); vassert(are_valid_hwcaps(VexArchX86, vta->archinfo_guest.hwcaps)); vassert(0 == sizeof(VexGuestX86State) % 16); - vassert(sizeof( ((VexGuestX86State*)0)->guest_TISTART) == 4); - 
vassert(sizeof( ((VexGuestX86State*)0)->guest_TILEN ) == 4); + vassert(sizeof( ((VexGuestX86State*)0)->guest_CMSTART) == 4); + vassert(sizeof( ((VexGuestX86State*)0)->guest_CMLEN ) == 4); vassert(sizeof( ((VexGuestX86State*)0)->guest_NRADDR ) == 4); break; @@ -499,8 +527,8 @@ guest_sizeB = sizeof(VexGuestAMD64State); guest_word_type = Ity_I64; guest_layout = &amd64guest_layout; - offB_TISTART = offsetof(VexGuestAMD64State,guest_TISTART); - offB_TILEN = offsetof(VexGuestAMD64State,guest_TILEN); + offB_CMSTART = offsetof(VexGuestAMD64State,guest_CMSTART); + offB_CMLEN = offsetof(VexGuestAMD64State,guest_CMLEN); offB_GUEST_IP = offsetof(VexGuestAMD64State,guest_RIP); szB_GUEST_IP = sizeof( ((VexGuestAMD64State*)0)->guest_RIP ); offB_HOST_EvC_COUNTER = offsetof(VexGuestAMD64State,host_EvC_COUNTER); @@ -507,8 +535,8 @@ offB_HOST_EvC_FAILADDR = offsetof(VexGuestAMD64State,host_EvC_FAILADDR); vassert(are_valid_hwcaps(VexArchAMD64, vta->archinfo_guest.hwcaps)); vassert(0 == sizeof(VexGuestAMD64State) % 16); - vassert(sizeof( ((VexGuestAMD64State*)0)->guest_TISTART ) == 8); - vassert(sizeof( ((VexGuestAMD64State*)0)->guest_TILEN ) == 8); + vassert(sizeof( ((VexGuestAMD64State*)0)->guest_CMSTART ) == 8); + vassert(sizeof( ((VexGuestAMD64State*)0)->guest_CMLEN ) == 8); vassert(sizeof( ((VexGuestAMD64State*)0)->guest_NRADDR ) == 8); break; @@ -519,8 +547,8 @@ guest_sizeB = sizeof(VexGuestPPC32State); guest_word_type = Ity_I32; guest_layout = &ppc32Guest_layout; - offB_TISTART = offsetof(VexGuestPPC32State,guest_TISTART); - offB_TILEN = offsetof(VexGuestPPC32State,guest_TILEN); + offB_CMSTART = offsetof(VexGuestPPC32State,guest_CMSTART); + offB_CMLEN = offsetof(VexGuestPPC32State,guest_CMLEN); offB_GUEST_IP = offsetof(VexGuestPPC32State,guest_CIA); szB_GUEST_IP = sizeof( ((VexGuestPPC32State*)0)->guest_CIA ); offB_HOST_EvC_COUNTER = offsetof(VexGuestPPC32State,host_EvC_COUNTER); @@ -527,8 +555,8 @@ offB_HOST_EvC_FAILADDR = offsetof(VexGuestPPC32State,host_EvC_FAILADDR); vassert(are_valid_hwcaps(VexArchPPC32, vta->archinfo_guest.hwcaps)); vassert(0 == sizeof(VexGuestPPC32State) % 16); - vassert(sizeof( ((VexGuestPPC32State*)0)->guest_TISTART ) == 4); - vassert(sizeof( ((VexGuestPPC32State*)0)->guest_TILEN ) == 4); + vassert(sizeof( ((VexGuestPPC32State*)0)->guest_CMSTART ) == 4); + vassert(sizeof( ((VexGuestPPC32State*)0)->guest_CMLEN ) == 4); vassert(sizeof( ((VexGuestPPC32State*)0)->guest_NRADDR ) == 4); break; @@ -539,8 +567,8 @@ guest_sizeB = sizeof(VexGuestPPC64State); guest_word_type = Ity_I64; guest_layout = &ppc64Guest_layout; - offB_TISTART = offsetof(VexGuestPPC64State,guest_TISTART); - offB_TILEN = offsetof(VexGuestPPC64State,guest_TILEN); + offB_CMSTART = offsetof(VexGuestPPC64State,guest_CMSTART); + offB_CMLEN = offsetof(VexGuestPPC64State,guest_CMLEN); offB_GUEST_IP = offsetof(VexGuestPPC64State,guest_CIA); szB_GUEST_IP = sizeof( ((VexGuestPPC64State*)0)->guest_CIA ); offB_HOST_EvC_COUNTER = offsetof(VexGuestPPC64State,host_EvC_COUNTER); @@ -547,8 +575,8 @@ offB_HOST_EvC_FAILADDR = offsetof(VexGuestPPC64State,host_EvC_FAILADDR); vassert(are_valid_hwcaps(VexArchPPC64, vta->archinfo_guest.hwcaps)); vassert(0 == sizeof(VexGuestPPC64State) % 16); - vassert(sizeof( ((VexGuestPPC64State*)0)->guest_TISTART ) == 8); - vassert(sizeof( ((VexGuestPPC64State*)0)->guest_TILEN ) == 8); + vassert(sizeof( ((VexGuestPPC64State*)0)->guest_CMSTART ) == 8); + vassert(sizeof( ((VexGuestPPC64State*)0)->guest_CMLEN ) == 8); vassert(sizeof( ((VexGuestPPC64State*)0)->guest_NRADDR ) == 8); vassert(sizeof( 
((VexGuestPPC64State*)0)->guest_NRADDR_GPR2) == 8); break; @@ -560,8 +588,8 @@ guest_sizeB = sizeof(VexGuestS390XState); guest_word_type = Ity_I64; guest_layout = &s390xGuest_layout; - offB_TISTART = offsetof(VexGuestS390XState,guest_TISTART); - offB_TILEN = offsetof(VexGuestS390XState,guest_TILEN); + offB_CMSTART = offsetof(VexGuestS390XState,guest_CMSTART); + offB_CMLEN = offsetof(VexGuestS390XState,guest_CMLEN); offB_GUEST_IP = offsetof(VexGuestS390XState,guest_IA); szB_GUEST_IP = sizeof( ((VexGuestS390XState*)0)->guest_IA); offB_HOST_EvC_COUNTER = offsetof(VexGuestS390XState,host_EvC_COUNTER); @@ -568,8 +596,8 @@ offB_HOST_EvC_FAILADDR = offsetof(VexGuestS390XState,host_EvC_FAILADDR); vassert(are_valid_hwcaps(VexArchS390X, vta->archinfo_guest.hwcaps)); vassert(0 == sizeof(VexGuestS390XState) % 16); - vassert(sizeof( ((VexGuestS390XState*)0)->guest_TISTART ) == 8); - vassert(sizeof( ((VexGuestS390XState*)0)->guest_TILEN ) == 8); + vassert(sizeof( ((VexGuestS390XState*)0)->guest_CMSTART ) == 8); + vassert(sizeof( ((VexGuestS390XState*)0)->guest_CMLEN ) == 8); vassert(sizeof( ((VexGuestS390XState*)0)->guest_NRADDR ) == 8); break; @@ -580,8 +608,8 @@ guest_sizeB = sizeof(VexGuestARMState); guest_word_type = Ity_I32; guest_layout = &armGuest_layout; - offB_TISTART = offsetof(VexGuestARMState,guest_TISTART); - offB_TILEN = offsetof(VexGuestARMState,guest_TILEN); + offB_CMSTART = offsetof(VexGuestARMState,guest_CMSTART); + offB_CMLEN = offsetof(VexGuestARMState,guest_CMLEN); offB_GUEST_IP = offsetof(VexGuestARMState,guest_R15T); szB_GUEST_IP = sizeof( ((VexGuestARMState*)0)->guest_R15T ); offB_HOST_EvC_COUNTER = offsetof(VexGuestARMState,host_EvC_COUNTER); @@ -588,11 +616,31 @@ offB_HOST_EvC_FAILADDR = offsetof(VexGuestARMState,host_EvC_FAILADDR); vassert(are_valid_hwcaps(VexArchARM, vta->archinfo_guest.hwcaps)); vassert(0 == sizeof(VexGuestARMState) % 16); - vassert(sizeof( ((VexGuestARMState*)0)->guest_TISTART) == 4); - vassert(sizeof( ((VexGuestARMState*)0)->guest_TILEN ) == 4); + vassert(sizeof( ((VexGuestARMState*)0)->guest_CMSTART) == 4); + vassert(sizeof( ((VexGuestARMState*)0)->guest_CMLEN ) == 4); vassert(sizeof( ((VexGuestARMState*)0)->guest_NRADDR ) == 4); break; + case VexArchARM64: + preciseMemExnsFn = guest_arm64_state_requires_precise_mem_exns; + disInstrFn = disInstr_ARM64; + specHelper = guest_arm64_spechelper; + guest_sizeB = sizeof(VexGuestARM64State); + guest_word_type = Ity_I64; + guest_layout = &arm64Guest_layout; + offB_CMSTART = offsetof(VexGuestARM64State,guest_CMSTART); + offB_CMLEN = offsetof(VexGuestARM64State,guest_CMLEN); + offB_GUEST_IP = offsetof(VexGuestARM64State,guest_PC); + szB_GUEST_IP = sizeof( ((VexGuestARM64State*)0)->guest_PC ); + offB_HOST_EvC_COUNTER = offsetof(VexGuestARM64State,host_EvC_COUNTER); + offB_HOST_EvC_FAILADDR = offsetof(VexGuestARM64State,host_EvC_FAILADDR); + vassert(are_valid_hwcaps(VexArchARM64, vta->archinfo_guest.hwcaps)); + vassert(0 == sizeof(VexGuestARM64State) % 16); + vassert(sizeof( ((VexGuestARM64State*)0)->guest_CMSTART) == 8); + vassert(sizeof( ((VexGuestARM64State*)0)->guest_CMLEN ) == 8); + vassert(sizeof( ((VexGuestARM64State*)0)->guest_NRADDR ) == 8); + break; + case VexArchMIPS32: preciseMemExnsFn = guest_mips32_state_requires_precise_mem_exns; disInstrFn = disInstr_MIPS; @@ -600,8 +648,8 @@ guest_sizeB = sizeof(VexGuestMIPS32State); guest_word_type = Ity_I32; guest_layout = &mips32Guest_layout; - offB_TISTART = offsetof(VexGuestMIPS32State,guest_TISTART); - offB_TILEN = offsetof(VexGuestMIPS32State,guest_TILEN); + 
offB_CMSTART = offsetof(VexGuestMIPS32State,guest_CMSTART); + offB_CMLEN = offsetof(VexGuestMIPS32State,guest_CMLEN); offB_GUEST_IP = offsetof(VexGuestMIPS32State,guest_PC); szB_GUEST_IP = sizeof( ((VexGuestMIPS32State*)0)->guest_PC ); offB_HOST_EvC_COUNTER = offsetof(VexGuestMIPS32State,host_EvC_COUNTER); @@ -608,8 +656,8 @@ offB_HOST_EvC_FAILADDR = offsetof(VexGuestMIPS32State,host_EvC_FAILADDR); vassert(are_valid_hwcaps(VexArchMIPS32, vta->archinfo_guest.hwcaps)); vassert(0 == sizeof(VexGuestMIPS32State) % 16); - vassert(sizeof( ((VexGuestMIPS32State*)0)->guest_TISTART) == 4); - vassert(sizeof( ((VexGuestMIPS32State*)0)->guest_TILEN ) == 4); + vassert(sizeof( ((VexGuestMIPS32State*)0)->guest_CMSTART) == 4); + vassert(sizeof( ((VexGuestMIPS32State*)0)->guest_CMLEN ) == 4); vassert(sizeof( ((VexGuestMIPS32State*)0)->guest_NRADDR ) == 4); break; @@ -620,8 +668,8 @@ guest_sizeB = sizeof(VexGuestMIPS64State); guest_word_type = Ity_I64; guest_layout = &mips64Guest_layout; - offB_TISTART = offsetof(VexGuestMIPS64State,guest_TISTART); - offB_TILEN = offsetof(VexGuestMIPS64State,guest_TILEN); + offB_CMSTART = offsetof(VexGuestMIPS64State,guest_CMSTART); + offB_CMLEN = offsetof(VexGuestMIPS64State,guest_CMLEN); offB_GUEST_IP = offsetof(VexGuestMIPS64State,guest_PC); szB_GUEST_IP = sizeof( ((VexGuestMIPS64State*)0)->guest_PC ); offB_HOST_EvC_COUNTER = offsetof(VexGuestMIPS64State,host_EvC_COUNTER); @@ -628,8 +676,8 @@ offB_HOST_EvC_FAILADDR = offsetof(VexGuestMIPS64State,host_EvC_FAILADDR); vassert(are_valid_hwcaps(VexArchMIPS64, vta->archinfo_guest.hwcaps)); vassert(0 == sizeof(VexGuestMIPS64State) % 16); - vassert(sizeof( ((VexGuestMIPS64State*)0)->guest_TISTART) == 8); - vassert(sizeof( ((VexGuestMIPS64State*)0)->guest_TILEN ) == 8); + vassert(sizeof( ((VexGuestMIPS64State*)0)->guest_CMSTART) == 8); + vassert(sizeof( ((VexGuestMIPS64State*)0)->guest_CMLEN ) == 8); vassert(sizeof( ((VexGuestMIPS64State*)0)->guest_NRADDR ) == 8); break; @@ -675,8 +723,8 @@ guest_word_type, vta->needs_self_check, vta->preamble_function, - offB_TISTART, - offB_TILEN, + offB_CMSTART, + offB_CMLEN, offB_GUEST_IP, szB_GUEST_IP ); @@ -958,6 +1006,8 @@ chainXDirect = chainXDirect_AMD64; break; case VexArchARM: chainXDirect = chainXDirect_ARM; break; + case VexArchARM64: + chainXDirect = chainXDirect_ARM64; break; case VexArchS390X: chainXDirect = chainXDirect_S390; break; case VexArchPPC32: @@ -999,6 +1049,8 @@ unchainXDirect = unchainXDirect_AMD64; break; case VexArchARM: unchainXDirect = unchainXDirect_ARM; break; + case VexArchARM64: + unchainXDirect = unchainXDirect_ARM64; break; case VexArchS390X: unchainXDirect = unchainXDirect_S390; break; case VexArchPPC32: @@ -1038,6 +1090,8 @@ cached = evCheckSzB_AMD64(); break; case VexArchARM: cached = evCheckSzB_ARM(); break; + case VexArchARM64: + cached = evCheckSzB_ARM64(); break; case VexArchS390X: cached = evCheckSzB_S390(); break; case VexArchPPC32: @@ -1152,6 +1206,7 @@ case VexArchX86: return "X86"; case VexArchAMD64: return "AMD64"; case VexArchARM: return "ARM"; + case VexArchARM64: return "ARM64"; case VexArchPPC32: return "PPC32"; case VexArchPPC64: return "PPC64"; case VexArchS390X: return "S390X"; @@ -1171,14 +1226,16 @@ /* Write default settings info *vai. 
*/ void LibVEX_default_VexArchInfo ( /*OUT*/VexArchInfo* vai ) { + vex_bzero(vai, sizeof(*vai)); vai->hwcaps = 0; vai->ppc_icache_line_szB = 0; vai->ppc_dcbz_szB = 0; vai->ppc_dcbzl_szB = 0; - + vai->arm64_dMinLine_lg2_szB = 0; + vai->arm64_iMinLine_lg2_szB = 0; vai->hwcache_info.num_levels = 0; vai->hwcache_info.num_caches = 0; - vai->hwcache_info.caches = NULL; + vai->hwcache_info.caches = NULL; vai->hwcache_info.icaches_maintain_coherence = True; // whatever } @@ -1185,6 +1242,7 @@ /* Write default settings info *vbi. */ void LibVEX_default_VexAbiInfo ( /*OUT*/VexAbiInfo* vbi ) { + vex_bzero(vbi, sizeof(*vbi)); vbi->guest_stack_redzone_size = 0; vbi->guest_amd64_assume_fs_is_zero = False; vbi->guest_amd64_assume_gs_is_0x60 = False; @@ -1381,6 +1439,15 @@ return NULL; } +static const HChar* show_hwcaps_arm64 ( UInt hwcaps ) +{ + /* Since there are no variants, just insist that hwcaps is zero, + and declare it invalid otherwise. */ + if (hwcaps == 0) + return "baseline"; + return NULL; +} + static const HChar* show_hwcaps_s390x ( UInt hwcaps ) { static const HChar prefix[] = "s390x"; @@ -1472,6 +1539,7 @@ case VexArchPPC32: return show_hwcaps_ppc32(hwcaps); case VexArchPPC64: return show_hwcaps_ppc64(hwcaps); case VexArchARM: return show_hwcaps_arm(hwcaps); + case VexArchARM64: return show_hwcaps_arm64(hwcaps); case VexArchS390X: return show_hwcaps_s390x(hwcaps); case VexArchMIPS32: return show_hwcaps_mips32(hwcaps); case VexArchMIPS64: return show_hwcaps_mips64(hwcaps); Index: pub/libvex.h =================================================================== --- pub/libvex.h (.../tags/VEX_3_9_0) (revision 2863) +++ pub/libvex.h (.../trunk) (revision 2863) @@ -55,6 +55,7 @@ VexArchX86, VexArchAMD64, VexArchARM, + VexArchARM64, VexArchPPC32, VexArchPPC64, VexArchS390X, @@ -172,6 +173,9 @@ /* Get an ARM architecure level from HWCAPS */ #define VEX_ARM_ARCHLEVEL(x) ((x) & 0x3f) +/* ARM64: baseline capability is AArch64 v8. */ +/* (no definitions since no variants so far) */ + /* MIPS baseline capability */ /* Assigned Company values for bits 23:16 of the PRId Register (CP0 register 15, select 0). As of the MIPS32 and MIPS64 specs from @@ -196,10 +200,15 @@ #define VEX_PRID_IMP_34K 0x9500 #define VEX_PRID_IMP_74K 0x9700 +/* CPU has FPU and 32 dbl. prec. FP registers */ +#define VEX_PRID_CPU_32FPR 0x00000040 + /* Get MIPS Company ID from HWCAPS */ #define VEX_MIPS_COMP_ID(x) ((x) & 0x00FF0000) /* Get MIPS Processor ID from HWCAPS */ -#define VEX_MIPS_PROC_ID(x) ((x) & 0x0000FFFF) +#define VEX_MIPS_PROC_ID(x) ((x) & 0x0000FF00) +/* Get MIPS Revision from HWCAPS */ +#define VEX_MIPS_REV(x) ((x) & 0x000000FF) /* Check if the processor supports DSP ASE Rev 2. */ #define VEX_MIPS_PROC_DSP2(x) ((VEX_MIPS_COMP_ID(x) == VEX_PRID_COMP_MIPS) && \ (VEX_MIPS_PROC_ID(x) == VEX_PRID_IMP_74K)) @@ -213,6 +222,7 @@ extern const HChar* LibVEX_ppVexArch ( VexArch ); extern const HChar* LibVEX_ppVexHwCaps ( VexArch, UInt ); + /* The various kinds of caches */ typedef enum { DATA_CACHE, @@ -266,9 +276,14 @@ /* PPC32/PPC64 only: size of instruction cache line */ Int ppc_icache_line_szB; /* PPC32/PPC64 only: sizes zeroed by the dcbz/dcbzl instructions - * (bug#135264) */ + (bug#135264) */ UInt ppc_dcbz_szB; UInt ppc_dcbzl_szB; /* 0 means unsupported (SIGILL) */ + /* ARM64: I- and D- minimum line sizes in log2(bytes), as + obtained from ctr_el0.DminLine and .IminLine. For example, a + line size of 64 bytes would be encoded here as 6. 
*/ + UInt arm64_dMinLine_lg2_szB; + UInt arm64_iMinLine_lg2_szB; } VexArchInfo; @@ -516,7 +531,7 @@ typedef struct { /* Total size of the guest state, in bytes. Must be - 8-aligned. */ + 16-aligned. */ Int total_sizeB; /* Whereabouts is the stack pointer? */ Int offset_SP; @@ -907,13 +922,25 @@ ~~~~~ Same as ppc32. + arm32 + ~~~~~ + r8 is GSP. + + arm64 + ~~~~~ + r21 is GSP. + ALL GUEST ARCHITECTURES ~~~~~~~~~~~~~~~~~~~~~~~ - The guest state must contain two pseudo-registers, guest_TISTART - and guest_TILEN. These are used to pass the address of areas of - guest code, translations of which are to be invalidated, back to - the despatcher. Both pseudo-regs must have size equal to the guest - word size. + The guest state must contain two pseudo-registers, guest_CMSTART + and guest_CMLEN. These are used to specify guest address ranges, + either of code to be invalidated, when used in conjunction with + Ijk_InvalICache, or of d-cache ranges to be flushed, when used in + conjunction with Ijk_FlushDCache. In such cases, the two _CM + pseudo-regs should be filled in by the IR, and then an exit with + one of the two abovementioned Ijk_ kinds should happen, so that the + dispatcher can action them. Both pseudo-regs must have size equal + to the guest word size. The architecture must a third pseudo-register, guest_NRADDR, also guest-word-sized. This is used to record the unredirected guest Index: pub/libvex_basictypes.h =================================================================== --- pub/libvex_basictypes.h (.../tags/VEX_3_9_0) (revision 2863) +++ pub/libvex_basictypes.h (.../trunk) (revision 2863) @@ -192,28 +192,24 @@ # define VEX_HOST_WORDSIZE 4 # define VEX_REGPARM(_n) /* */ -#elif defined(__arm__) +#elif defined(__arm__) && !defined(__aarch64__) # define VEX_HOST_WORDSIZE 4 # define VEX_REGPARM(_n) /* */ -#elif defined(_AIX) && !defined(__64BIT__) -# define VEX_HOST_WORDSIZE 4 +#elif defined(__aarch64__) && !defined(__arm__) +# define VEX_HOST_WORDSIZE 8 # define VEX_REGPARM(_n) /* */ -#elif defined(_AIX) && defined(__64BIT__) +#elif defined(__s390x__) # define VEX_HOST_WORDSIZE 8 # define VEX_REGPARM(_n) /* */ -#elif defined(__s390x__) +#elif defined(__mips__) && (__mips == 64) # define VEX_HOST_WORDSIZE 8 # define VEX_REGPARM(_n) /* */ -#elif defined(__mips__) -#if (__mips==64) -# define VEX_HOST_WORDSIZE 8 -#else +#elif defined(__mips__) && (__mips != 64) # define VEX_HOST_WORDSIZE 4 -#endif # define VEX_REGPARM(_n) /* */ #else Index: pub/libvex_guest_amd64.h =================================================================== --- pub/libvex_guest_amd64.h (.../tags/VEX_3_9_0) (revision 2863) +++ pub/libvex_guest_amd64.h (.../trunk) (revision 2863) @@ -138,8 +138,8 @@ compilation breakage. On amd64, these two fields are set to zero by LibVEX_GuestAMD64_initialise and then should be ignored forever thereafter. */ - ULong guest_TISTART; - ULong guest_TILEN; + ULong guest_CMSTART; + ULong guest_CMLEN; /* Used to record the unredirected guest address at the start of a translation whose start has been redirected. 
By reading Index: pub/libvex_guest_arm.h =================================================================== --- pub/libvex_guest_arm.h (.../tags/VEX_3_9_0) (revision 2863) +++ pub/libvex_guest_arm.h (.../trunk) (revision 2863) @@ -94,9 +94,9 @@ /* Emulation notes */ UInt guest_EMNOTE; - /* For clflush: record start and length of area to invalidate */ - UInt guest_TISTART; - UInt guest_TILEN; + /* For clinval/clflush: record start and length of area */ + UInt guest_CMSTART; + UInt guest_CMLEN; /* Used to record the unredirected guest address at the start of a translation whose start has been redirected. By reading @@ -193,12 +193,8 @@ */ UInt guest_ITSTATE; - /* Padding to make it have an 32-aligned size */ + /* Padding to make it have an 16-aligned size */ UInt padding1; - UInt padding2; - UInt padding3; - UInt padding4; - UInt padding5; } VexGuestARMState; Index: pub/libvex_guest_arm64.h =================================================================== --- pub/libvex_guest_arm64.h (.../tags/VEX_3_9_0) (revision 0) +++ pub/libvex_guest_arm64.h (.../trunk) (revision 2863) @@ -0,0 +1,190 @@ + +/*---------------------------------------------------------------*/ +/*--- begin libvex_guest_arm64.h ---*/ +/*---------------------------------------------------------------*/ + +/* + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2013-2013 OpenWorks + info@open-works.net + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + + The GNU General Public License is contained in the file COPYING. +*/ + +#ifndef __LIBVEX_PUB_GUEST_ARM64_H +#define __LIBVEX_PUB_GUEST_ARM64_H + +#include "libvex_basictypes.h" + + +/*---------------------------------------------------------------*/ +/*--- Vex's representation of the ARM64 CPU state. ---*/ +/*---------------------------------------------------------------*/ + +typedef + struct { + /* Event check fail addr and counter. */ + /* 0 */ ULong host_EvC_FAILADDR; + /* 8 */ UInt host_EvC_COUNTER; + /* 12 */ UInt pad0; + /* 16 */ + ULong guest_X0; + ULong guest_X1; + ULong guest_X2; + ULong guest_X3; + ULong guest_X4; + ULong guest_X5; + ULong guest_X6; + ULong guest_X7; + ULong guest_X8; + ULong guest_X9; + ULong guest_X10; + ULong guest_X11; + ULong guest_X12; + ULong guest_X13; + ULong guest_X14; + ULong guest_X15; + ULong guest_X16; + ULong guest_X17; + ULong guest_X18; + ULong guest_X19; + ULong guest_X20; + ULong guest_X21; + ULong guest_X22; + ULong guest_X23; + ULong guest_X24; + ULong guest_X25; + ULong guest_X26; + ULong guest_X27; + ULong guest_X28; + ULong guest_X29; + ULong guest_X30; /* link register */ + ULong guest_XSP; + ULong guest_PC; + + /* 4-word thunk used to calculate N(sign) Z(zero) C(carry, + unsigned overflow) and V(signed overflow) flags. 
*/ + ULong guest_CC_OP; + ULong guest_CC_DEP1; + ULong guest_CC_DEP2; + ULong guest_CC_NDEP; + + /* User-space thread register? */ + ULong guest_TPIDR_EL0; + + /* FP/SIMD state */ + U128 guest_Q0; + U128 guest_Q1; + U128 guest_Q2; + U128 guest_Q3; + U128 guest_Q4; + U128 guest_Q5; + U128 guest_Q6; + U128 guest_Q7; + U128 guest_Q8; + U128 guest_Q9; + U128 guest_Q10; + U128 guest_Q11; + U128 guest_Q12; + U128 guest_Q13; + U128 guest_Q14; + U128 guest_Q15; + U128 guest_Q16; + U128 guest_Q17; + U128 guest_Q18; + U128 guest_Q19; + U128 guest_Q20; + U128 guest_Q21; + U128 guest_Q22; + U128 guest_Q23; + U128 guest_Q24; + U128 guest_Q25; + U128 guest_Q26; + U128 guest_Q27; + U128 guest_Q28; + U128 guest_Q29; + U128 guest_Q30; + U128 guest_Q31; + + /* Various pseudo-regs mandated by Vex or Valgrind. */ + /* Emulation notes */ + UInt guest_EMNOTE; + + /* For clflush/clinval: record start and length of area */ + ULong guest_CMSTART; + ULong guest_CMLEN; + + /* Used to record the unredirected guest address at the start of + a translation whose start has been redirected. By reading + this pseudo-register shortly afterwards, the translation can + find out what the corresponding no-redirection address was. + Note, this is only set for wrap-style redirects, not for + replace-style ones. */ + ULong guest_NRADDR; + + /* Needed for Darwin (but mandated for all guest architectures): + program counter at the last syscall insn (int 0x80/81/82, + sysenter, syscall, svc). Used when backing up to restart a + syscall that has been interrupted by a signal. */ + ULong guest_IP_AT_SYSCALL; + + /* The complete FPCR. Default value seems to be zero. We + ignore all bits except 23 and 22, which are the rounding + mode. The guest is unconstrained in what values it can write + to and read from this register, but the emulation only takes + note of bits 23 and 22. */ + UInt guest_FPCR; + + /* The complete FPSR. As with FPCR, the guest may write and + read any values here, and the emulation ignores it, with the + exception of bit 27 (QC, the sticky saturation bit) which + does get set when required. */ + UInt guest_FPSR; + + /* Padding to make it have an 16-aligned size */ + UInt pad_end_0; + ULong pad_end_1; + } + VexGuestARM64State; + + +/*---------------------------------------------------------------*/ +/*--- Utility functions for ARM64 guest stuff. ---*/ +/*---------------------------------------------------------------*/ + +/* ALL THE FOLLOWING ARE VISIBLE TO LIBRARY CLIENT */ + +/* Initialise all guest ARM64 state. */ + +extern +void LibVEX_GuestARM64_initialise ( /*OUT*/VexGuestARM64State* vex_state ); + +/* Calculate the ARM64 flag state from the saved data, in the format + 32x0:n:z:c:v:28x0. 
*/ +extern +ULong LibVEX_GuestARM64_get_nzcv ( /*IN*/ + const VexGuestARM64State* vex_state ); + +#endif /* ndef __LIBVEX_PUB_GUEST_ARM64_H */ + + +/*---------------------------------------------------------------*/ +/*--- libvex_guest_arm64.h ---*/ +/*---------------------------------------------------------------*/ Index: pub/libvex_guest_mips32.h =================================================================== --- pub/libvex_guest_mips32.h (.../tags/VEX_3_9_0) (revision 2863) +++ pub/libvex_guest_mips32.h (.../trunk) (revision 2863) @@ -41,82 +41,82 @@ typedef struct { /* CPU Registers */ - /* 0 */ UInt guest_r0; /* Hardwired to 0 */ - /* 4 */ UInt guest_r1; /* Assembler temporary */ - /* 8 */ UInt guest_r2; /* Values for function returns ...*/ - /* 12 */ UInt guest_r3; /* ...and expression evaluation */ - /* 16 */ UInt guest_r4; /* Function arguments */ - /* 20 */ UInt guest_r5; - /* 24 */ UInt guest_r6; - /* 28 */ UInt guest_r7; - /* 32 */ UInt guest_r8; /* Temporaries */ - /* 36 */ UInt guest_r9; - /* 40 */ UInt guest_r10; - /* 44 */ UInt guest_r11; - /* 48 */ UInt guest_r12; - /* 52 */ UInt guest_r13; - /* 56 */ UInt guest_r14; - /* 60 */ UInt guest_r15; - /* 64 */ UInt guest_r16; /* Saved temporaries */ - /* 68 */ UInt guest_r17; - /* 72 */ UInt guest_r18; - /* 76 */ UInt guest_r19; - /* 80 */ UInt guest_r20; - /* 84 */ UInt guest_r21; - /* 88 */ UInt guest_r22; - /* 92 */ UInt guest_r23; - /* 96 */ UInt guest_r24; /* Temporaries */ - /* 100 */ UInt guest_r25; - /* 104 */ UInt guest_r26; /* Reserved for OS kernel */ - /* 108 */ UInt guest_r27; - /* 112 */ UInt guest_r28; /* Global pointer */ - /* 116 */ UInt guest_r29; /* Stack pointer */ - /* 120 */ UInt guest_r30; /* Frame pointer */ - /* 124 */ UInt guest_r31; /* Return address */ - /* 128 */ UInt guest_PC; /* Program counter */ - /* 132 */ UInt guest_HI;/* Multiply and divide register higher result */ - /* 136 */ UInt guest_LO;/* Multiply and divide register lower result */ + /* 0 */ UInt guest_r0; /* Hardwired to 0 */ + /* 4 */ UInt guest_r1; /* Assembler temporary */ + /* 8 */ UInt guest_r2; /* Values for function returns ...*/ + /* 12 */ UInt guest_r3; /* ...and expression evaluation */ + /* 16 */ UInt guest_r4; /* Function arguments */ + /* 20 */ UInt guest_r5; + /* 24 */ UInt guest_r6; + /* 28 */ UInt guest_r7; + /* 32 */ UInt guest_r8; /* Temporaries */ + /* 36 */ UInt guest_r9; + /* 40 */ UInt guest_r10; + /* 44 */ UInt guest_r11; + /* 48 */ UInt guest_r12; + /* 52 */ UInt guest_r13; + /* 56 */ UInt guest_r14; + /* 60 */ UInt guest_r15; + /* 64 */ UInt guest_r16; /* Saved temporaries */ + /* 68 */ UInt guest_r17; + /* 72 */ UInt guest_r18; + /* 76 */ UInt guest_r19; + /* 80 */ UInt guest_r20; + /* 84 */ UInt guest_r21; + /* 88 */ UInt guest_r22; + /* 92 */ UInt guest_r23; + /* 96 */ UInt guest_r24; /* Temporaries */ + /* 100 */ UInt guest_r25; + /* 104 */ UInt guest_r26; /* Reserved for OS kernel */ + /* 108 */ UInt guest_r27; + /* 112 */ UInt guest_r28; /* Global pointer */ + /* 116 */ UInt guest_r29; /* Stack pointer */ + /* 120 */ UInt guest_r30; /* Frame pointer */ + /* 124 */ UInt guest_r31; /* Return address */ + /* 128 */ UInt guest_PC; /* Program counter */ + /* 132 */ UInt guest_HI; /* Multiply and divide register higher result */ + /* 136 */ UInt guest_LO; /* Multiply and divide register lower result */ /* FPU Registers */ - /* 140 */ UInt guest_f0; /* Floting point general purpose registers */ - /* 144 */ UInt guest_f1; - /* 148 */ UInt guest_f2; - /* 152 */ UInt guest_f3; - /* 156 */ UInt guest_f4; - /* 
160 */ UInt guest_f5; - /* 164 */ UInt guest_f6; - /* 168 */ UInt guest_f7; - /* 172 */ UInt guest_f8; - /* 176 */ UInt guest_f9; - /* 180 */ UInt guest_f10; - /* 184 */ UInt guest_f11; - /* 188 */ UInt guest_f12; - /* 192 */ UInt guest_f13; - /* 196 */ UInt guest_f14; - /* 200 */ UInt guest_f15; - /* 204 */ UInt guest_f16; - /* 208 */ UInt guest_f17; - /* 212 */ UInt guest_f18; - /* 216 */ UInt guest_f19; - /* 220 */ UInt guest_f20; - /* 224 */ UInt guest_f21; - /* 228 */ UInt guest_f22; - /* 232 */ UInt guest_f23; - /* 236 */ UInt guest_f24; - /* 240 */ UInt guest_f25; - /* 244 */ UInt guest_f26; - /* 248 */ UInt guest_f27; - /* 252 */ UInt guest_f28; - /* 256 */ UInt guest_f29; - /* 260 */ UInt guest_f30; - /* 264 */ UInt guest_f31; - - /* 268 */ UInt guest_FIR; - /* 272 */ UInt guest_FCCR; - /* 276 */ UInt guest_FEXR; - /* 280 */ UInt guest_FENR; - /* 284 */ UInt guest_FCSR; + /* 144 */ ULong guest_f0; /* Floating point general purpose registers */ + /* 152 */ ULong guest_f1; + /* 160 */ ULong guest_f2; + /* 168 */ ULong guest_f3; + /* 176 */ ULong guest_f4; + /* 184 */ ULong guest_f5; + /* 192 */ ULong guest_f6; + /* 200 */ ULong guest_f7; + /* 208 */ ULong guest_f8; + /* 216 */ ULong guest_f9; + /* 224 */ ULong guest_f10; + /* 232 */ ULong guest_f11; + /* 240 */ ULong guest_f12; + /* 248 */ ULong guest_f13; + /* 256 */ ULong guest_f14; + /* 264 */ ULong guest_f15; + /* 272 */ ULong guest_f16; + /* 280 */ ULong guest_f17; + /* 288 */ ULong guest_f18; + /* 296 */ ULong guest_f19; + /* 304 */ ULong guest_f20; + /* 312 */ ULong guest_f21; + /* 320 */ ULong guest_f22; + /* 328 */ ULong guest_f23; + /* 336 */ ULong guest_f24; + /* 344 */ ULong guest_f25; + /* 352 */ ULong guest_f26; + /* 360 */ ULong guest_f27; + /* 368 */ ULong guest_f28; + /* 376 */ ULong guest_f29; + /* 384 */ ULong guest_f30; + /* 392 */ ULong guest_f31; + /* 400 */ UInt guest_FIR; + /* 404 */ UInt guest_FCCR; + /* 408 */ UInt guest_FEXR; + /* 412 */ UInt guest_FENR; + /* 416 */ UInt guest_FCSR; + /* TLS pointer for the thread. It's read-only in user space. On Linux it is set in user space by various thread-related syscalls. @@ -126,29 +126,28 @@ environments, the UserLocal register is a pointer to a thread-specific storage block. */ - /* 288 */ UInt guest_ULR; + /* 420 */ UInt guest_ULR; /* Emulation notes */ - UInt guest_EMNOTE; /* 292 */ + /* 424 */ UInt guest_EMNOTE; /* For clflush: record start and length of area to invalidate */ - UInt guest_TISTART; /* 296 */ - UInt guest_TILEN; /* 300 */ - UInt guest_NRADDR; /* 304 */ + /* 428 */ UInt guest_CMSTART; + /* 432 */ UInt guest_CMLEN; + /* 436 */ UInt guest_NRADDR; - UInt host_EvC_FAILADDR; /* 308 */ - UInt host_EvC_COUNTER; /* 312 */ - UInt guest_COND; /* 316 */ + /* 440 */ UInt host_EvC_FAILADDR; + /* 444 */ UInt host_EvC_COUNTER; + /* 448 */ UInt guest_COND; - UInt padding1; /* MIPS32 DSP ASE(r2) specific registers. */ - UInt guest_DSPControl; /* 324 */ - ULong guest_ac0; /* 328 */ - ULong guest_ac1; /* 336 */ - ULong guest_ac2; /* 344 */ - ULong guest_ac3; /* 352 */ - - UInt padding[6]; + /* 452 */ UInt guest_DSPControl; + /* 456 */ ULong guest_ac0; + /* 464 */ ULong guest_ac1; + /* 472 */ ULong guest_ac2; + /* 480 */ ULong guest_ac3; + + UInt padding; } VexGuestMIPS32State; /*---------------------------------------------------------------*/ /*--- Utility functions for MIPS32 guest stuff. 
---*/ Index: pub/libvex_guest_mips64.h =================================================================== --- pub/libvex_guest_mips64.h (.../tags/VEX_3_9_0) (revision 2863) +++ pub/libvex_guest_mips64.h (.../trunk) (revision 2863) @@ -137,8 +137,8 @@ UInt guest_EMNOTE; /* 568 */ /* For clflush: record start and length of area to invalidate */ - ULong guest_TISTART; /* 576 */ - ULong guest_TILEN; /* 584 */ + ULong guest_CMSTART; /* 576 */ + ULong guest_CMLEN; /* 584 */ ULong guest_NRADDR; /* 592 */ @@ -145,7 +145,7 @@ ULong host_EvC_FAILADDR; /* 600 */ UInt host_EvC_COUNTER; /* 608 */ UInt guest_COND; /* 612 */ - UInt padding[6]; + UInt padding[2]; } VexGuestMIPS64State; /*---------------------------------------------------------------*/ Index: pub/libvex_guest_ppc32.h =================================================================== --- pub/libvex_guest_ppc32.h (.../tags/VEX_3_9_0) (revision 2863) +++ pub/libvex_guest_ppc32.h (.../trunk) (revision 2863) @@ -210,8 +210,8 @@ /* 1196 */ UInt guest_EMNOTE; /* For icbi: record start and length of area to invalidate */ - /* 1200 */ UInt guest_TISTART; - /* 1204 */ UInt guest_TILEN; + /* 1200 */ UInt guest_CMSTART; + /* 1204 */ UInt guest_CMLEN; /* Used to record the unredirected guest address at the start of a translation whose start has been redirected. By reading @@ -242,7 +242,7 @@ /* 1368 */ ULong guest_TEXASR; // Transaction EXception And Summary Register /* 1376 */ ULong guest_TFIAR; // Transaction Failure Instruction Address Register - /* Padding to make it have an 8-aligned size */ + /* Padding to make it have an 16-aligned size */ /* 1384 */ UInt padding2; } Index: pub/libvex_guest_ppc64.h =================================================================== --- pub/libvex_guest_ppc64.h (.../tags/VEX_3_9_0) (revision 2863) +++ pub/libvex_guest_ppc64.h (.../trunk) (revision 2863) @@ -252,8 +252,8 @@ /* 1340 */ UInt padding; /* For icbi: record start and length of area to invalidate */ - /* 1344 */ ULong guest_TISTART; - /* 1352 */ ULong guest_TILEN; + /* 1344 */ ULong guest_CMSTART; + /* 1352 */ ULong guest_CMLEN; /* Used to record the unredirected guest address at the start of a translation whose start has been redirected. By reading Index: pub/libvex_guest_s390x.h =================================================================== --- pub/libvex_guest_s390x.h (.../tags/VEX_3_9_0) (revision 2863) +++ pub/libvex_guest_s390x.h (.../trunk) (revision 2863) @@ -132,8 +132,8 @@ /* See comments at bottom of libvex.h */ /* 384 */ ULong guest_NRADDR; - /* 392 */ ULong guest_TISTART; - /* 400 */ ULong guest_TILEN; + /* 392 */ ULong guest_CMSTART; + /* 400 */ ULong guest_CMLEN; /* Used when backing up to restart a syscall that has been interrupted by a signal. 
See also comment in @@ -148,11 +148,11 @@ /* 424 */ ULong host_EvC_FAILADDR; /*------------------------------------------------------------*/ -/*--- Force alignment to 32 bytes ---*/ +/*--- Force alignment to 16 bytes ---*/ /*------------------------------------------------------------*/ - /* 432 */ UChar padding[16]; + /* 432 */ UChar padding[0]; - /* 448 */ /* This is the size of the guest state */ + /* 432 */ /* This is the size of the guest state */ } VexGuestS390XState; Index: pub/libvex_guest_x86.h =================================================================== --- pub/libvex_guest_x86.h (.../tags/VEX_3_9_0) (revision 2863) +++ pub/libvex_guest_x86.h (.../trunk) (revision 2863) @@ -199,9 +199,9 @@ /* Emulation notes */ UInt guest_EMNOTE; - /* For clflush: record start and length of area to invalidate */ - UInt guest_TISTART; - UInt guest_TILEN; + /* For clflush/clinval: record start and length of area */ + UInt guest_CMSTART; + UInt guest_CMLEN; /* Used to record the unredirected guest address at the start of a translation whose start has been redirected. By reading @@ -220,8 +220,8 @@ been interrupted by a signal. */ UInt guest_IP_AT_SYSCALL; - /* Padding to make it have an 32-aligned size */ - UInt padding[5]; + /* Padding to make it have an 16-aligned size */ + UInt padding1; } VexGuestX86State; Index: pub/libvex_ir.h =================================================================== --- pub/libvex_ir.h (.../tags/VEX_3_9_0) (revision 2863) +++ pub/libvex_ir.h (.../trunk) (revision 2863) @@ -242,7 +242,11 @@ /* Get the size (in bytes) of an IRType */ extern Int sizeofIRType ( IRType ); +/* Translate 1/2/4/8 into Ity_I{8,16,32,64} respectively. Asserts on + any other input. */ +extern IRType integerIRTypeOfSize ( Int szB ); + /* ------------------ Endianness ------------------ */ /* IREndness is used in load IRExprs and store IRStmts. */ @@ -481,9 +485,11 @@ Iop_DivS32, // ditto, signed Iop_DivU64, // :: I64,I64 -> I64 (simple div, no mod) Iop_DivS64, // ditto, signed - Iop_DivU64E, // :: I64,I64 -> I64 (dividend is 64-bit arg (hi) concat with 64 0's (low)) + Iop_DivU64E, // :: I64,I64 -> I64 (dividend is 64-bit arg (hi) + // concat with 64 0's (low)) Iop_DivS64E, // ditto, signed - Iop_DivU32E, // :: I32,I32 -> I32 (dividend is 32-bit arg (hi) concat with 32 0's (low)) + Iop_DivU32E, // :: I32,I32 -> I32 (dividend is 32-bit arg (hi) + // concat with 32 0's (low)) Iop_DivS32E, // ditto, signed Iop_DivModU64to32, // :: I64,I32 -> I64 @@ -1240,8 +1246,8 @@ /* BCD arithmetic instructions, (V128, V128) -> V128 * The BCD format is the same as that used in the BCD<->DPB conversion - * routines, except using 124 digits (vs 60) plus the trailing 4-bit signed code. - * */ + * routines, except using 124 digits (vs 60) plus the trailing 4-bit + * signed code. */ Iop_BCDAdd, Iop_BCDSub, /* Conversion I64 -> D64 */ @@ -1254,8 +1260,10 @@ /* --- 32x4 vector FP --- */ + /* ternary :: IRRoundingMode(I32) x V128 x V128 -> V128 */ + Iop_Add32Fx4, Iop_Sub32Fx4, Iop_Mul32Fx4, Iop_Div32Fx4, + /* binary */ - Iop_Add32Fx4, Iop_Sub32Fx4, Iop_Mul32Fx4, Iop_Div32Fx4, Iop_Max32Fx4, Iop_Min32Fx4, Iop_Add32Fx2, Iop_Sub32Fx2, /* Note: For the following compares, the ppc and arm front-ends assume a @@ -1263,13 +1271,11 @@ Iop_CmpEQ32Fx4, Iop_CmpLT32Fx4, Iop_CmpLE32Fx4, Iop_CmpUN32Fx4, Iop_CmpGT32Fx4, Iop_CmpGE32Fx4, - /* Vector Absolute */ - Iop_Abs32Fx4, - /* Pairwise Max and Min. See integer pairwise operations for details. 
*/ Iop_PwMax32Fx4, Iop_PwMin32Fx4, /* unary */ + Iop_Abs32Fx4, Iop_Sqrt32Fx4, Iop_RSqrt32Fx4, Iop_Neg32Fx4, @@ -1296,9 +1302,9 @@ /* Unlike the standard fp conversions, these irops take no rounding mode argument. Instead the irop trailers _R{M,P,N,Z} indicate the mode: {-inf, +inf, nearest, zero} respectively. */ - Iop_I32UtoFx4, Iop_I32StoFx4, /* I32x4 -> F32x4 */ + Iop_I32UtoFx4, Iop_I32StoFx4, /* I32x4 -> F32x4 */ Iop_FtoI32Ux4_RZ, Iop_FtoI32Sx4_RZ, /* F32x4 -> I32x4 */ - Iop_QFtoI32Ux4_RZ, Iop_QFtoI32Sx4_RZ, /* F32x4 -> I32x4 (with saturation) */ + Iop_QFtoI32Ux4_RZ, Iop_QFtoI32Sx4_RZ, /* F32x4 -> I32x4 (saturating) */ Iop_RoundF32x4_RM, Iop_RoundF32x4_RP, /* round to fp integer */ Iop_RoundF32x4_RN, Iop_RoundF32x4_RZ, /* round to fp integer */ /* Fixed32 format is floating-point number with fixed number of fraction @@ -1326,14 +1332,21 @@ /* --- 64x2 vector FP --- */ + /* ternary :: IRRoundingMode(I32) x V128 x V128 -> V128 */ + Iop_Add64Fx2, Iop_Sub64Fx2, Iop_Mul64Fx2, Iop_Div64Fx2, + /* binary */ - Iop_Add64Fx2, Iop_Sub64Fx2, Iop_Mul64Fx2, Iop_Div64Fx2, Iop_Max64Fx2, Iop_Min64Fx2, Iop_CmpEQ64Fx2, Iop_CmpLT64Fx2, Iop_CmpLE64Fx2, Iop_CmpUN64Fx2, /* unary */ - Iop_Recip64Fx2, Iop_Sqrt64Fx2, Iop_RSqrt64Fx2, + Iop_Abs64Fx2, + Iop_Sqrt64Fx2, Iop_RSqrt64Fx2, + Iop_Neg64Fx2, + /* Vector Reciprocal Estimate */ + Iop_Recip64Fx2, + /* --- 64x2 lowest-lane-only scalar FP --- */ /* In binary cases, upper half is copied from first operand. In @@ -1357,6 +1370,12 @@ Iop_64UtoV128, Iop_SetV128lo64, + /* Copies lower 64/32/16/8 bits, zeroes out the rest. */ + Iop_ZeroHI64ofV128, // :: V128 -> V128 + Iop_ZeroHI96ofV128, // :: V128 -> V128 + Iop_ZeroHI112ofV128, // :: V128 -> V128 + Iop_ZeroHI120ofV128, // :: V128 -> V128 + /* 32 <-> 128 bit vector */ Iop_32UtoV128, Iop_V128to32, // :: V128 -> I32, lowest lane @@ -1405,8 +1424,8 @@ Iop_QRDMulHi16Sx8, Iop_QRDMulHi32Sx4, /* Doubling saturating multiplication (long) (I64, I64) -> V128 */ Iop_QDMulLong16Sx4, Iop_QDMulLong32Sx2, - /* Plynomial multiplication treats it's arguments as coefficients of - polynoms over {0, 1}. */ + /* Polynomial multiplication treats its arguments as + coefficients of polynomials over {0, 1}. */ Iop_PolynomialMul8x16, /* (V128, V128) -> V128 */ Iop_PolynomialMull8x8, /* (I64, I64) -> V128 */ @@ -1519,7 +1538,8 @@ /* NARROWING (unary) -- narrow V128 into I64 */ Iop_NarrowUn16to8x8, Iop_NarrowUn32to16x4, Iop_NarrowUn64to32x2, - /* Saturating narrowing from signed source to signed/unsigned destination */ + /* Saturating narrowing from signed source to signed/unsigned + destination */ Iop_QNarrowUn16Sto8Sx8, Iop_QNarrowUn32Sto16Sx4, Iop_QNarrowUn64Sto32Sx2, Iop_QNarrowUn16Sto8Ux8, Iop_QNarrowUn32Sto16Ux4, Iop_QNarrowUn64Sto32Ux2, /* Saturating narrowing from unsigned source to unsigned destination */ @@ -1657,15 +1677,11 @@ Iop_SHA512, Iop_SHA256, /* ------------------ 256-bit SIMD FP. 
------------------ */ - Iop_Add64Fx4, - Iop_Sub64Fx4, - Iop_Mul64Fx4, - Iop_Div64Fx4, - Iop_Add32Fx8, - Iop_Sub32Fx8, - Iop_Mul32Fx8, - Iop_Div32Fx8, + /* ternary :: IRRoundingMode(I32) x V256 x V256 -> V256 */ + Iop_Add64Fx4, Iop_Sub64Fx4, Iop_Mul64Fx4, Iop_Div64Fx4, + Iop_Add32Fx8, Iop_Sub32Fx8, Iop_Mul32Fx8, Iop_Div32Fx8, + Iop_Sqrt32Fx8, Iop_Sqrt64Fx4, Iop_RSqrt32Fx8, @@ -1691,7 +1707,7 @@ Irrm_PosINF = 2, // Round to positive infinity Irrm_ZERO = 3, // Round toward zero Irrm_NEAREST_TIE_AWAY_0 = 4, // Round to nearest, ties away from 0 - Irrm_PREPARE_SHORTER = 5, // Round to prepare for storter + Irrm_PREPARE_SHORTER = 5, // Round to prepare for shorter // precision Irrm_AWAY_FROM_ZERO = 6, // Round to away from 0 Irrm_NEAREST_TIE_TOWARD_0 = 7 // Round to nearest, ties towards 0 @@ -2059,13 +2075,18 @@ /* This describes hints which can be passed to the dispatcher at guest control-flow transfer points. - Re Ijk_TInval: the guest state _must_ have two pseudo-registers, - guest_TISTART and guest_TILEN, which specify the start and length - of the region to be invalidated. These are both the size of a - guest word. It is the responsibility of the relevant toIR.c to - ensure that these are filled in with suitable values before issuing - a jump of kind Ijk_TInval. + Re Ijk_InvalICache and Ijk_FlushDCache: the guest state _must_ have + two pseudo-registers, guest_CMSTART and guest_CMLEN, which specify + the start and length of the region to be invalidated. CM stands + for "Cache Management". These are both the size of a guest word. + It is the responsibility of the relevant toIR.c to ensure that + these are filled in with suitable values before issuing a jump of + kind Ijk_InvalICache or Ijk_FlushDCache. + Ijk_InvalICache requests invalidation of translations taken from + the requested range. Ijk_FlushDCache requests flushing of the D + cache for the specified range. + Re Ijk_EmWarn and Ijk_EmFail: the guest state must have a pseudo-register guest_EMNOTE, which is 32-bits regardless of the host or guest word size. That register should be made to hold a @@ -2093,8 +2114,10 @@ Ijk_EmFail, /* emulation critical (FATAL) error; give up */ Ijk_NoDecode, /* current instruction cannot be decoded */ Ijk_MapFail, /* Vex-provided address translation failed */ - Ijk_TInval, /* Invalidate translations before continuing. */ + Ijk_InvalICache, /* Inval icache for range [CMSTART, +CMLEN) */ + Ijk_FlushDCache, /* Flush dcache for range [CMSTART, +CMLEN) */ Ijk_NoRedir, /* Jump to un-redirected guest addr */ + Ijk_SigILL, /* current instruction synths SIGILL */ Ijk_SigTRAP, /* current instruction synths SIGTRAP */ Ijk_SigSEGV, /* current instruction synths SIGSEGV */ Ijk_SigBUS, /* current instruction synths SIGBUS */ @@ -2102,7 +2125,7 @@ Ijk_SigFPE_IntOvf, /* current instruction synths SIGFPE - IntOvf */ /* Unfortunately, various guest-dependent syscall kinds. They all mean: do a syscall before continuing. 
*/ - Ijk_Sys_syscall, /* amd64 'syscall', ppc 'sc', arm 'svc #0' */ + Ijk_Sys_syscall, /* amd64/x86 'syscall', ppc 'sc', arm 'svc #0' */ Ijk_Sys_int32, /* amd64/x86 'int $0x20' */ Ijk_Sys_int128, /* amd64/x86 'int $0x80' */ Ijk_Sys_int129, /* amd64/x86 'int $0x81' */ @@ -2849,12 +2872,12 @@ /*---------------------------------------------------------------*/ /*--- IR injection ---*/ /*---------------------------------------------------------------*/ + void vex_inject_ir(IRSB *, IREndness); #endif /* ndef __LIBVEX_IR_H */ - /*---------------------------------------------------------------*/ /*--- libvex_ir.h ---*/ /*---------------------------------------------------------------*/ Index: pub/libvex_trc_values.h =================================================================== --- pub/libvex_trc_values.h (.../tags/VEX_3_9_0) (revision 2863) +++ pub/libvex_trc_values.h (.../trunk) (revision 2863) @@ -46,15 +46,12 @@ These values should be 61 or above so as not to conflict with Valgrind's VG_TRC_ values, which are 60 or below. - - These values *must* be odd (have bit 0 set) because the dispatchers - (coregrind/m_dispatch/dispatch-*-*.S) use this fact to distinguish - a TRC value from the unchanged baseblock pointer -- which has 0 as - its lowest bit. */ -#define VEX_TRC_JMP_TINVAL 61 /* invalidate translations before - continuing */ +#define VEX_TRC_JMP_INVALICACHE 61 /* invalidate icache (translations) + before continuing */ +#define VEX_TRC_JMP_FLUSHDCACHE 103 /* flush dcache before continuing */ + #define VEX_TRC_JMP_NOREDIR 81 /* jump to undirected guest addr */ #define VEX_TRC_JMP_SIGTRAP 85 /* deliver trap (SIGTRAP) before continuing */ @@ -68,6 +65,9 @@ #define VEX_TRC_JMP_SIGFPE_INTOVF 99 /* deliver SIGFPE (integer overflow) before continuing */ +#define VEX_TRC_JMP_SIGILL 101 /* deliver SIGILL (Illegal instruction) + before continuing */ + #define VEX_TRC_JMP_EMWARN 63 /* deliver emulation warning before continuing */ #define VEX_TRC_JMP_EMFAIL 83 /* emulation fatal error; abort system */
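
Illustrative sketch (not part of the patch above): the libvex.h and libvex_ir.h comments in this change say that a front-end should fill in the renamed guest_CMSTART/guest_CMLEN pseudo-registers and then exit with Ijk_InvalICache or Ijk_FlushDCache so the dispatcher can act on the range. A minimal sketch of that pattern is below, assuming the public IR constructors from libvex_ir.h and the AMD64 guest layout; the function name and the 'irsb', 'start', 'len' and 'next_insn_addr' parameters are hypothetical.

#include <stddef.h>                   /* offsetof */
#include "libvex_ir.h"
#include "libvex_guest_amd64.h"

static void request_icache_inval ( IRSB* irsb, Addr64 start, ULong len,
                                   Addr64 next_insn_addr )
{
   /* Fill in the two cache-management pseudo-registers ... */
   addStmtToIRSB(
      irsb,
      IRStmt_Put(offsetof(VexGuestAMD64State, guest_CMSTART),
                 IRExpr_Const(IRConst_U64(start))));
   addStmtToIRSB(
      irsb,
      IRStmt_Put(offsetof(VexGuestAMD64State, guest_CMLEN),
                 IRExpr_Const(IRConst_U64(len))));

   /* ... then end the superblock with an Ijk_InvalICache exit, so the
      dispatcher discards translations overlapping [CMSTART, +CMLEN).
      A real front-end would also write the guest program counter
      before exiting. */
   irsb->next     = IRExpr_Const(IRConst_U64(next_insn_addr));
   irsb->jumpkind = Ijk_InvalICache;
}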
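
Illustrative sketch (not part of the patch above): the new VexArchInfo fields arm64_dMinLine_lg2_szB and arm64_iMinLine_lg2_szB hold the ARM64 minimum cache line sizes as log2(bytes), so 6 encodes 64 bytes. A small sketch of decoding them is below; 'vai' is a hypothetical, already-populated VexArchInfo, and plain stdio is used for output rather than any VEX facility.

#include <stdio.h>
#include "libvex.h"

static void show_arm64_line_sizes ( const VexArchInfo* vai )
{
   /* Each field is log2 of the line size in bytes (6 -> 64 bytes);
      LibVEX_default_VexArchInfo initialises both fields to 0. */
   UInt dszB = 1u << vai->arm64_dMinLine_lg2_szB;
   UInt iszB = 1u << vai->arm64_iMinLine_lg2_szB;
   printf("DminLine = %u bytes, IminLine = %u bytes\n", dszB, iszB);
}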
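
Illustrative sketch (not part of the patch above): with the revised libvex.h macros, a MIPS hwcaps word now splits into a company ID (bits 23:16), a processor ID (bits 15:8) and a revision (bits 7:0). The sketch below reads those fields back with the public macros; 'hwcaps' is a hypothetical value of the kind carried in VexArchInfo.hwcaps.

#include <stdio.h>
#include "libvex.h"

static void show_mips_hwcaps_fields ( UInt hwcaps )
{
   /* Company ID in bits 23:16, processor ID in bits 15:8, revision in
      bits 7:0, per VEX_MIPS_COMP_ID / VEX_MIPS_PROC_ID / VEX_MIPS_REV. */
   printf("company 0x%06x  proc 0x%04x  rev 0x%02x  dsp2:%s\n",
          VEX_MIPS_COMP_ID(hwcaps),
          VEX_MIPS_PROC_ID(hwcaps),
          VEX_MIPS_REV(hwcaps),
          VEX_MIPS_PROC_DSP2(hwcaps) ? "yes" : "no");
}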