Compare commits

...

81 Commits

Samuel Thibault
8ddc5bf9e5 curses: Use cursesw instead of curses
Use the ncursesw package instead of curses on non-mingw, and check a few
functions.
Also take cflags from pkg-config, since cursesw headers may be in a
separate, non-default directory.

Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
Message-id: 20161015195308.20473-3-samuel.thibault@ens-lyon.org
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-10-28 11:19:38 +02:00
Samuel Thibault
697783a736 curses: fix left/right arrow translation
In the default VGA font, the left/right arrows are glyphs 0x1a and 0x1b, not 0x0a
and 0x0b.
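
A minimal sketch of the mapping this implies (illustrative only; the table name is an
assumption, not QEMU's console code):

#include <wchar.h>

/* Hypothetical table: VGA font glyph -> wide character for the curses display. */
static const wchar_t vga_glyph_to_wchar[256] = {
    [0x1a] = L'\u2192',   /* right arrow glyph in the default VGA font */
    [0x1b] = L'\u2190',   /* left arrow glyph in the default VGA font */
    /* 0x0a/0x0b are line feed and vertical tab, not arrows */
};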

Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
Message-id: 20161015195308.20473-2-samuel.thibault@ens-lyon.org
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-10-28 11:19:38 +02:00
Thomas Huth
8561788520 ui/gtk: Fix non-working DELETE key
GTK generates key events for the delete key with key->string[0] = 0x7f
... but this does not work right with the readline_handle_byte()
function in util/readline.c, which treats the keycode 127 as
backspace. So let's add a special case for the GTK delete key to make
this key behave correctly in the monitor interface of the GTK UI.
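
A hedged sketch of such a special case (the callback and helper names are assumptions;
only GDK_KEY_Delete, key->string and key->length are real GTK/GDK identifiers, and the
escape sequence shown is one plausible choice, not necessarily what the patch sends):

#include <gtk/gtk.h>
#include <gdk/gdkkeysyms.h>

static void forward_vc_key(const GdkEventKey *key,
                           void (*put_chars)(const char *buf, int len))
{
    if (key->keyval == GDK_KEY_Delete) {
        put_chars("\033[3~", 4);            /* terminal "delete" sequence, not 0x7f */
    } else if (key->length > 0) {
        put_chars(key->string, key->length);
    }
}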

Buglink: https://bugs.launchpad.net/qemu/+bug/1619438
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-id: 1477570647-7100-1-git-send-email-thuth@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-10-28 11:19:38 +02:00
Alberto Garcia
76d8f93b4a gtk: fix compilation warning with gtk 3.22.2
gdk_screen_get_width() is deprecated since gtk 3.22.2; use
gdk_monitor_get_geometry() instead if it's available.
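
A sketch of the version-guarded replacement (the helper name is illustrative; the GDK
calls are the public GTK+ 3.22 API):

#include <gtk/gtk.h>

static int widget_screen_width(GtkWidget *widget)
{
#if GTK_CHECK_VERSION(3, 22, 0)
    GdkDisplay *dpy = gtk_widget_get_display(widget);
    GdkWindow *win = gtk_widget_get_window(widget);
    GdkRectangle geometry;

    gdk_monitor_get_geometry(gdk_display_get_monitor_at_window(dpy, win), &geometry);
    return geometry.width;
#else
    return gdk_screen_get_width(gtk_widget_get_screen(widget));
#endif
}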

Signed-off-by: Alberto Garcia <berto@igalia.com>
Message-id: 20161026152108.12364-1-berto@igalia.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-10-28 11:19:38 +02:00
Samuel Thibault
cb78d4a1ef Defer BrlAPI tty acquisition to when guest starts using device
We do not want to catch the BrlAPI input/output immediately, but only
when the guest has started talking to our virtual device.

This notably fixes input before the guest driver has started.
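
A hedged sketch of the deferred acquisition (BaumDevice and its fields are assumptions,
not the actual baum.c structures; brlapi__enterTtyMode() is the real BrlAPI call):

#include <stdbool.h>
#include <brlapi.h>

typedef struct {
    brlapi_handle_t *handle;    /* assumed field: the BrlAPI connection */
    bool tty_acquired;          /* assumed field: lazily-set flag */
} BaumDevice;

/* Called the first time the guest talks to the virtual device. */
static void baum_ensure_tty(BaumDevice *d)
{
    if (!d->tty_acquired &&
        brlapi__enterTtyMode(d->handle, BRLAPI_TTY_DEFAULT, NULL) >= 0) {
        d->tty_acquired = true;
    }
}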

Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-10-28 11:19:38 +02:00
Samuel Thibault
0fb7c8828a Add dots keypresses support to the baum braille device
Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-10-28 11:19:38 +02:00
Fam Zheng
db4df20de8 trace: Fix 'char **' compilation error in simple backend
Currently, the generated function body will do "strlen(arg)", but the
argument could be 'char **' or 'char * const *'. Avoid that by excluding
such cases in the is_string check.

Reported by patchew's "make docker-test-mingw@fedora".

Suggested-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 1477453806-21097-1-git-send-email-famz@redhat.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-10-27 19:24:15 +01:00
Peter Maydell
835f3d24b4 Merge remote-tracking branch 'remotes/kraxel/tags/pull-audio-20161027-1' into staging
audio: intel-hda: check stream entry count during transfer

# gpg: Signature made Thu 27 Oct 2016 15:30:51 BST
# gpg:                using RSA key 0x4CB6D8EED3E87138
# gpg: Good signature from "Gerd Hoffmann (work) <kraxel@redhat.com>"
# gpg:                 aka "Gerd Hoffmann <gerd@kraxel.org>"
# gpg:                 aka "Gerd Hoffmann (private) <kraxel@gmail.com>"
# Primary key fingerprint: A032 8CFF B93A 17A7 9901  FE7D 4CB6 D8EE D3E8 7138

* remotes/kraxel/tags/pull-audio-20161027-1:
  audio: intel-hda: check stream entry count during transfer

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-10-27 17:24:29 +01:00
Peter Maydell
5929d7e8a0 Merge remote-tracking branch 'remotes/rth/tags/pull-atomic-20161026' into staging
cmpxchg emulation of atomics, v8

# gpg: Signature made Wed 26 Oct 2016 16:30:03 BST
# gpg:                using RSA key 0xAD1270CC4DD0279B
# gpg: Good signature from "Richard Henderson <rth7680@gmail.com>"
# gpg:                 aka "Richard Henderson <rth@redhat.com>"
# gpg:                 aka "Richard Henderson <rth@twiddle.net>"
# Primary key fingerprint: 9CB1 8DDA F8E8 49AD 2AFC  16A4 AD12 70CC 4DD0 279B

* remotes/rth/tags/pull-atomic-20161026: (37 commits)
  target-alpha: Emulate LL/SC using cmpxchg helpers
  target-alpha: Introduce MMU_PHYS_IDX
  target-arm: remove EXCP_STREX + cpu_exclusive_{test, info}
  linux-user: remove handling of aarch64's EXCP_STREX
  linux-user: remove handling of ARM's EXCP_STREX
  target-arm: emulate aarch64's LL/SC using cmpxchg helpers
  target-arm: emulate SWP with atomic_xchg helper
  target-arm: emulate LL/SC using cmpxchg helpers
  target-arm: Rearrange aa32 load and store functions
  tests: add atomic_add-bench
  target-i386: remove helper_lock()
  target-i386: emulate XCHG using atomic helper
  target-i386: emulate LOCK'ed BTX ops using atomic helpers
  target-i386: emulate LOCK'ed XADD using atomic helper
  target-i386: emulate LOCK'ed NEG using cmpxchg helper
  target-i386: emulate LOCK'ed NOT using atomic helper
  target-i386: emulate LOCK'ed INC using atomic helper
  target-i386: emulate LOCK'ed OP instructions using atomic helpers
  target-i386: emulate LOCK'ed cmpxchg using cmpxchg helpers
  tcg: Emit barriers with parallel_cpus
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-10-27 14:06:34 +01:00
Peter Maydell
8f9d84df97 Merge remote-tracking branch 'remotes/jasowang/tags/net-pull-request' into staging
# gpg: Signature made Wed 26 Oct 2016 03:19:06 BST
# gpg:                using RSA key 0xEF04965B398D6211
# gpg: Good signature from "Jason Wang (Jason Wang on RedHat) <jasowang@redhat.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg:          It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 215D 46F4 8246 689E C77F  3562 EF04 965B 398D 6211

* remotes/jasowang/tags/net-pull-request:
  colo-proxy: fix memory leak
  net: rtl8139: limit processing of ring descriptors
  net: vmxnet: initialise local tx descriptor
  e1000e: Don't zero out buffer address in rx descriptor
  net: rocker: set limit to DMA buffer size
  net: eepro100: fix memory leak in device uninit
  tap-bsd: OpenBSD uses tap(4) now
  net: pcnet: fix source formatting and indentation
  net: pcnet: check rx/tx descriptor ring length

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-10-27 12:45:45 +01:00
Peter Maydell
991a97ac74 Merge remote-tracking branch 'remotes/vivier/tags/m68k-part1-pull-request' into staging
# gpg: Signature made Tue 25 Oct 2016 19:58:46 BST
# gpg:                using RSA key 0xF30C38BD3F2FBE3C
# gpg: Good signature from "Laurent Vivier <lvivier@redhat.com>"
# gpg:                 aka "Laurent Vivier <laurent@vivier.eu>"
# gpg:                 aka "Laurent Vivier (Red Hat) <lvivier@redhat.com>"
# Primary key fingerprint: CD2F 75DD C8E3 A4DC 2E4F  5173 F30C 38BD 3F2F BE3C

* remotes/vivier/tags/m68k-part1-pull-request: (23 commits)
  target-m68k: Optimize gen_flush_flags
  target-m68k: Optimize some comparisons
  target-m68k: Use setcond for scc
  target-m68k: Introduce DisasCompare
  target-m68k: Reorg flags handling
  target-m68k: Remove incorrect clearing of cc_x
  target-m68k: Some fixes to SR and flags management
  target-m68k: Print flags properly
  target-m68k: update CPU flags management
  target-m68k: don't update cc_dest in helpers
  target-m68k: update move to/from ccr/sr
  target-m68k: remove m68k_cpu_exec_enter() and m68k_cpu_exec_exit()
  target-m68k: Replace helper_xflag_lt with setcond
  target-m68k: allow to update flags with operation on words and bytes
  target-m68k: REG() macro cleanup
  target-m68k: set PAGE_BITS to 12 for m68k
  target-m68k: define operand sizes
  target-m68k: set disassembler mode to 680x0 or coldfire
  target-m68k: introduce read_imXX() functions
  target-m68k: manage scaled index
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-10-27 11:58:43 +01:00
Richard Henderson
ed2839166c target-alpha: Emulate LL/SC using cmpxchg helpers
Emulating LL/SC with cmpxchg is not correct, since it can
suffer from the ABA problem.  However, portable parallel
code is written assuming only cmpxchg, which means that in
practice this is a viable alternative.
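
A self-contained illustration of the scheme and of the ABA limitation (plain C11, not
QEMU's helpers; the per-CPU state is reduced to a single variable here):

#include <stdatomic.h>
#include <stdint.h>

static uint64_t ll_value;                    /* value observed by the last "LL" */

static uint64_t emulated_ll(_Atomic uint64_t *addr)
{
    ll_value = atomic_load(addr);
    return ll_value;
}

static int emulated_sc(_Atomic uint64_t *addr, uint64_t newv)
{
    uint64_t expected = ll_value;
    /* Succeeds whenever the value matches, even if it changed and changed
     * back in between: the ABA case a real SC would reject. */
    return atomic_compare_exchange_strong(addr, &expected, newv);
}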

Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:02 -07:00
Richard Henderson
6a73ecf5cf target-alpha: Introduce MMU_PHYS_IDX
Rather than using helpers for physical accesses, use a mmu index.
The primary cleanup is with store-conditional on physical addresses.

Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:02 -07:00
Emilio G. Cota
05188cc72f target-arm: remove EXCP_STREX + cpu_exclusive_{test, info}
The exception is not emitted anymore; remove it and the associated
TCG variables.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Message-Id: <1467054136-10430-31-git-send-email-cota@braap.org>
2016-10-26 08:29:02 -07:00
Emilio G. Cota
f4e6eb7ffe linux-user: remove handling of aarch64's EXCP_STREX
The exception is not emitted anymore.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Message-Id: <1467054136-10430-30-git-send-email-cota@braap.org>
2016-10-26 08:29:02 -07:00
Emilio G. Cota
b50b82fc48 linux-user: remove handling of ARM's EXCP_STREX
The exception is not emitted anymore.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Message-Id: <1467054136-10430-29-git-send-email-cota@braap.org>
2016-10-26 08:29:02 -07:00
Emilio G. Cota
1dd089d0ee target-arm: emulate aarch64's LL/SC using cmpxchg helpers
Emulating LL/SC with cmpxchg is not correct, since it can
suffer from the ABA problem. Portable parallel code, however,
is written assuming only cmpxchg--and not LL/SC--is available.
This means that in practice emulating LL/SC with cmpxchg is
a viable alternative.

The appended patch emulates LL/SC pairs in aarch64 with cmpxchg helpers.
This works in both user and system mode. In usermode, it avoids
pausing all other CPUs to perform the LL/SC pair. The subsequent
performance and scalability improvement is significant, as the
plots below show. They plot the throughput of atomic_add-bench
compiled for ARM and executed on a 64-core x86 machine.

Hi-res plots: http://imgur.com/a/JVc8Y

                atomic_add-bench: 1000000 ops/thread, [0,1] range

  18 ++---------+----------+---------+----------+----------+----------+---++
     +cmpxchg +-E--+       +         +          +          +          +    |
  16 ++master +-H--+                                                      ++
     ||                                                                    |
  14 ++                                                                   ++
     | |                                                                   |
  12 ++|                                                                  ++
     | |                                                                   |
  10 ++++                                                                 ++
   8 ++E                                                                  ++
     |+++                                                                  |
   6 ++ |                                                                 ++
     |  |                                                                  |
   4 ++ |                                                                 ++
     |   |                                                                 |
   2 +H++E+---                                                            ++
     + |     +E++----+E+---+--+E+----++E+------+E+------+E++----+E+---+--+E|
   0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++
     0          10         20        30         40         50         60
                                Number of threads

                atomic_add-bench: 1000000 ops/thread, [0,2] range

  18 ++---------+----------+---------+----------+----------+----------+---++
     +cmpxchg +-E--+       +         +          +          +          +    |
  16 ++master +-H--+                                                      ++
     | |                                                                   |
  14 ++E                                                                  ++
     | |                                                                   |
  12 ++|                                                                  ++
     |+++                                                                  |
  10 ++ |                                                                 ++
   8 ++ |                                                                 ++
     |  |                                                                  |
   6 ++ |                                                                 ++
     |   |                                                                 |
   4 ++  |                                                                ++
     |  +E+---                                                             |
   2 +H+     +E+-----+++              +++      +++   ---+E+-----+E+------+++
     +++        +    +E+---+--+E+----++E+------+E+---   ++++    +++   +  +E|
   0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++
     0          10         20        30         40         50         60
                                Number of threads

               atomic_add-bench: 1000000 ops/thread, [0,128] range

  70 ++---------+----------+---------+----------+----------+----------+---++
     +cmpxchg +-E--+       +         +          +          +          +    |
  60 ++master +-H--+                  +++            ---+E+-----+E+------+E+
     |                        +E+------E-------+E+---                      |
     |                     ---        +++                                  |
  50 ++              +++---                                               ++
     |              -+E+                                                   |
  40 ++      +++----                                                      ++
     |        E-                                                           |
     |      --|                                                            |
  30 ++   -- +++                                                          ++
     |  +E+                                                                |
  20 ++E+                                                                 ++
     |E+                                                                   |
     |                                                                     |
  10 ++                                                                   ++
     +          +          +         +          +          +          +    |
   0 +HH-H----H-+-----H----+---------+----------+----------+----------+---++
     0          10         20        30         40         50         60
                                Number of threads

              atomic_add-bench: 1000000 ops/thread, [0,1024] range

  160 ++---------+---------+----------+---------+----------+----------+---++
      +cmpxchg +-E--+      +          +         +          +          +    |
  140 ++master +-H--+                                           +++      +++
      |                                                -+E+-----+E+-------E|
  120 ++                                       +++ ----                  +++
      |                                +++  ----E--                        |
  100 ++                              --E---   +++                        ++
      |                       +++ ---- +++                                 |
   80 ++                     --E--                                        ++
      |                  ---- +++                                          |
      |              -+E+                                                  |
   60 ++         ---- +++                                                 ++
      |      +E+-                                                          |
   40 ++   --                                                             ++
      |  +E+                                                               |
   20 +EE+                                                                ++
      +++        +         +          +         +          +          +    |
    0 +HH-H---H--+-----H---+----------+---------+----------+----------+---++
      0          10        20         30        40         50         60
                                Number of threads

[rth: Rearrange 128-bit cmpxchg helper.  Enforce alignment on LL.]

Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-28-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:02 -07:00
Emilio G. Cota
cf12bce088 target-arm: emulate SWP with atomic_xchg helper
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-25-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:02 -07:00
Emilio G. Cota
354161b37c target-arm: emulate LL/SC using cmpxchg helpers
Emulating LL/SC with cmpxchg is not correct, since it can
suffer from the ABA problem. Portable parallel code, however,
is written assuming only cmpxchg--and not LL/SC--is available.
This means that in practice emulating LL/SC with cmpxchg is
a viable alternative.

The appended patch emulates LL/SC pairs in ARM with cmpxchg helpers.
This works in both user and system mode. In usermode, it avoids
pausing all other CPUs to perform the LL/SC pair. The subsequent
performance and scalability improvement is significant, as the
plots below show. They plot the throughput of atomic_add-bench
compiled for ARM and executed on a 64-core x86 machine.

Hi-res plots: http://imgur.com/a/aNQpB

               atomic_add-bench: 1000000 ops/thread, [0,1] range

  9 ++---------+----------+----------+----------+----------+----------+---++
    +cmpxchg +-E--+       +          +          +          +          +    |
  8 +Emaster +-H--+                                                       ++
    | |                                                                    |
  7 ++E                                                                   ++
    | |                                                                    |
  6 ++++                                                                  ++
    |  |                                                                   |
  5 ++ |                                                                  ++
  4 ++ |                                                                  ++
    |  |                                                                   |
  3 ++ |                                                                  ++
    |   |                                                                  |
  2 ++  |                                                                 ++
    |H++E+---                                  +++  ---+E+------+E+------+E|
  1 +++     +E+-----+E+------+E+------+E+------+E+--   +++      +++       ++
    ++H+       +    +++   +  +++     ++++       +          +          +    |
  0 ++--H----H-+-----H----+----------+----------+----------+----------+---++
    0          10         20         30         40         50         60
                               Number of threads

                atomic_add-bench: 1000000 ops/thread, [0,2] range

  16 ++---------+----------+---------+----------+----------+----------+---++
     +cmpxchg +-E--+       +         +          +          +          +    |
  14 ++master +-H--+                                                      ++
     | |                                                                   |
  12 ++|                                                                  ++
     | E                                                                   |
  10 ++|                                                                  ++
     | |                                                                   |
   8 ++++                                                                 ++
     |E+|                                                                  |
     |  |                                                                  |
   6 ++ |                                                                 ++
     |   |                                                                 |
   4 ++  |                                                                ++
     |  +E+---       +++      +++              +++           ---+E+------+E|
   2 +H+     +E+------E-------+E+-----+E+------+E+------+E+--            +++
     + |        +    +++   +         ++++       +          +          +    |
   0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++
     0          10         20        30         40         50         60
                                Number of threads

               atomic_add-bench: 1000000 ops/thread, [0,128] range

  70 ++---------+----------+---------+----------+----------+----------+---++
     +cmpxchg +-E--+       +         +          +       ++++          +    |
  60 ++master +-H--+                                 ----E------+E+-------++
     |                                        -+E+---   +++     +++      +E|
     |                                +++ ---- +++                       ++|
  50 ++                       +++  ---+E+-                                ++
     |                        -E---                                        |
  40 ++                    ---+++                                         ++
     |               +++---                                                |
     |              -+E+                                                   |
  30 ++      +++----                                                      ++
     |       +E+                                                           |
  20 ++ +++--                                                             ++
     |  +E+                                                                |
     |+E+                                                                  |
  10 +E+                                                                  ++
     +          +          +         +          +          +          +    |
   0 +HH-H----H-+-----H----+---------+----------+----------+----------+---++
     0          10         20        30         40         50         60
                                Number of threads

              atomic_add-bench: 1000000 ops/thread, [0,1024] range

  120 ++---------+---------+----------+---------+----------+----------+---++
      +cmpxchg +-E--+      +          +         +          +          +    |
      | master +-H--+                                                    ++|
  100 ++                                                              ----E+
      |                                                 +++  ---+E+---   ++|
      |                                                --E---   +++        |
   80 ++                                           ---- +++               ++
      |                                     ---+E+-                        |
   60 ++                              -+E+--                              ++
      |                       +++ ---- +++                                 |
      |                      -+E+-                                         |
   40 ++              +++----                                             ++
      |      +++   ---+E+                                                  |
      |     -+E+---                                                        |
   20 ++ +E+                                                              ++
      |+E+++                                                               |
      +E+        +         +          +         +          +          +    |
    0 +HH-H---H--+-----H---+----------+---------+----------+----------+---++
      0          10        20         30        40         50         60
                                Number of threads

[rth: Enforce alignment for ldrexd.]

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-23-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:02 -07:00
Richard Henderson
7f5616f538 target-arm: Rearrange aa32 load and store functions
Stop specializing on TARGET_LONG_BITS == 32; unconditionally allocate
a temp and expand with tcg_gen_extu_i32_tl.  Split out gen_aa32_addr,
gen_aa32_frob64, gen_aa32_ld_i32 and gen_aa32_st_i32 as separate interfaces.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:02 -07:00
Emilio G. Cota
070e3edcea tests: add atomic_add-bench
With this microbenchmark we can measure the overhead of emulating atomic
instructions with a configurable degree of contention.

The benchmark spawns $n threads, each performing $o atomic ops (additions)
in a loop. Each atomic operation is performed on a different cache line
(assuming lines are 64 bytes long) that is randomly selected from a range [0, $r).

[ Note: each $foo corresponds to a -foo flag ]
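
A compact sketch of the measurement loop described above (plain C11 atomics; names are
illustrative and this is not the actual tests/atomic_add-bench source):

#include <stdatomic.h>
#include <stdint.h>
#include <stdlib.h>

/* One counter per 64-byte cache line; each op picks a random line in [0, range). */
struct line { _Atomic long count; char pad[64 - sizeof(_Atomic long)]; };

static struct line *lines;          /* calloc(range, sizeof(struct line)) elsewhere */
static long ops_per_thread, range;

/* Each of the $n threads runs this as its start routine. */
static void *worker(void *arg)
{
    unsigned seed = (unsigned)(uintptr_t)arg;
    for (long i = 0; i < ops_per_thread; i++) {
        atomic_fetch_add(&lines[rand_r(&seed) % range].count, 1);
    }
    return NULL;
}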

Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Message-Id: <1467054136-10430-20-git-send-email-cota@braap.org>
2016-10-26 08:29:01 -07:00
Emilio G. Cota
37b995f6e7 target-i386: remove helper_lock()
It's been superseded by the atomic helpers.

The use of the atomic helpers provides a significant performance and scalability
improvement. Below is the result of running the atomic_add-bench microbenchmark with:
 $ x86_64-linux-user/qemu-x86_64 tests/atomic_add-bench -o 5000000 -r $r -n $n
where $n is the number of threads and $r is the allowed range for the additions.

The scenarios measured are:
- atomic: implements x86's ADDL with the atomic_add helper (i.e. this patchset)
- cmpxchg: implements x86's ADDL with a TCG loop using the cmpxchg helper
- master: before this patchset

Results sorted in ascending range, i.e. descending degree of contention.
Y axis is Throughput in Mops/s. Tests are run on an AMD machine with 64
Opteron 6376 cores.

                atomic_add-bench: 5000000 ops/thread, [0,1] range

  25 ++---------+----------+---------+----------+----------+----------+---++
     + atomic +-E--+       +         +          +          +          +    |
     |cmpxchg +-H--+                                                       |
  20 +Emaster +-N--+                                                      ++
     ||                                                                    |
     |++                                                                   |
     ||                                                                    |
  15 +++                                                                  ++
     |N|                                                                   |
     |+|                                                                   |
  10 ++|                                                                  ++
     |+|+                                                                  |
     | |    -+E+------        +++  ---+E+------+E+------+E+-----+E+------+E|
     |+E+E+- +++     +E+------+E+--                                        |
   5 ++|+                                                                 ++
     |+N+H+---                                 +++                         |
     ++++N+--+H++----+++   +  +++  --++H+------+H+------+H++----+H+---+--- |
   0 ++---------+-----H----+---H-----+----------+----------+----------+---H+
     0          10         20        30         40         50         60
                                Number of threads

                atomic_add-bench: 5000000 ops/thread, [0,2] range

  25 ++---------+----------+---------+----------+----------+----------+---++
     ++atomic +-E--+       +         +          +          +          +    |
     |cmpxchg +-H--+                                                       |
  20 ++master +-N--+                                                      ++
     |E|                                                                   |
     |++                                                                   |
     ||E                                                                   |
  15 ++|                                                                  ++
     |N||                                                                  |
     |+||                                   ---+E+------+E+-----+E+------+E|
  10 ++| |        ---+E+------+E+-----+E+---                    +++      +++
     ||H+E+--+E+--                                                         |
     |+++++                                                                |
     | ||                                                                  |
   5 ++|+H+--                                  +++                        ++
     |+N+    -                              ---+H+------+H+------          |
     +  +N+--+H++----+H+---+--+H+----++H+---    +          +    +H+---+--+H|
   0 ++---------+----------+---------+----------+----------+----------+---++
     0          10         20        30         40         50         60
                                Number of threads

                atomic_add-bench: 5000000 ops/thread, [0,8] range

  40 ++---------+----------+---------+----------+----------+----------+---++
     ++atomic +-E--+       +         +          +          +          +    |
  35 +cmpxchg +-H--+                                                      ++
     | master +-N--+               ---+E+------+E+------+E+-----+E+------+E|
  30 ++|                   ---+E+--   +++                                 ++
     | |            -+E+---                                                |
  25 ++E        ---- +++                                                  ++
     |+++++ -+E+                                                           |
  20 +E+ E-- +++                                                          ++
     |H|+++                                                                |
     |+|                                       +H+-------                  |
  15 ++H+                                   ---+++      +H+------         ++
     |N++H+--                         +++---                    +H+------++|
  10 ++ +++  -       +++           ---+H+                       +++      +H+
     | |     +H+-----+H+------+H+--                                        |
   5 ++|                      +++                                         ++
     ++N+N+--+N++          +         +          +          +          +    |
   0 ++---------+----------+---------+----------+----------+----------+---++
     0          10         20        30         40         50         60
                                Number of threads

               atomic_add-bench: 5000000 ops/thread, [0,128] range

  160 ++---------+---------+----------+---------+----------+----------+---++
      + atomic +-E--+      +          +         +          +          +    |
  140 +cmpxchg +-H--+                          +++      +++               ++
      | master +-N--+                           E--------E------+E+------++|
  120 ++                                      --|        |      +++       E+
      |                                     -- +++      +++              ++|
  100 ++                                   -                              ++
      |                                +++-                     +++      ++|
   80 ++                              -+E+    -+H+------+H+------H--------++
      |                           ----    ----                  +++       H|
      |            ---+E+-----+E+-  ---+H+                               ++|
   60 ++     +E+---   +++  ---+H+---                                      ++
      |    --+++   ---+H+--                                                |
   40 ++ +E+-+H+---                                                       ++
      |  +H+                                                               |
   20 +EE+                                                                ++
      +N+        +         +          +         +          +          +    |
    0 ++N-N---N--+---------+----------+---------+----------+----------+---++
      0          10        20         30        40         50         60
                                Number of threads

              atomic_add-bench: 5000000 ops/thread, [0,1024] range

  350 ++---------+---------+----------+---------+----------+----------+---++
      + atomic +-E--+      +          +         +          +          +    |
  300 +cmpxchg +-H--+                                                    +++
      | master +-N--+                                           +++       ||
      |                                                 +++      |    ----E|
  250 ++                                                 |   ----E----    ++
      |                                              ----E---    |    ---+H|
  200 ++                                      -+E+---   +++  ---+H+---    ++
      |                                   ----         -+H+--              |
      |                                +E+     +++ ---- +++                |
  150 ++                            ---+++  ---+H+-                       ++
      |                          ---  -+H+--                               |
  100 ++                   ---+E+ ---- +++                                ++
      |      +++   ---+E+-----+H+-                                         |
      |     -+E+------+H+--                                                |
   50 ++ +E+                                                              ++
      +EE+       +         +          +         +          +          +    |
    0 ++N-N---N--+---------+----------+---------+----------+----------+---++
      0          10        20         30        40         50         60
                                Number of threads

  hi-res: http://imgur.com/a/fMRmq

I stopped measuring master after 8 threads, because there is little
point in measuring the well-known performance collapse of a contended lock.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-21-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:01 -07:00
Emilio G. Cota
ea97ebe89f target-i386: emulate XCHG using atomic helper
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-19-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:01 -07:00
Emilio G. Cota
cfe819d309 target-i386: emulate LOCK'ed BTX ops using atomic helpers
[rth: Avoid redundant qemu_ld in locked case.  Fix previously unnoticed
incorrect zero-extension of address in register-offset case.]

Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-18-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:01 -07:00
Emilio G. Cota
f53b01817f target-i386: emulate LOCK'ed XADD using atomic helper
[rth: Move load of reg value to common location.]

Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-17-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:01 -07:00
Emilio G. Cota
8eb8c73856 target-i386: emulate LOCK'ed NEG using cmpxchg helper
[rth: Move redundant qemu_load out of cmpxchg loop.]

Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-16-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:01 -07:00
Emilio G. Cota
2a5fe8ae14 target-i386: emulate LOCK'ed NOT using atomic helper
[rth: Avoid qemu_load that's redundant with the atomic op.]

Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-15-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:01 -07:00
Emilio G. Cota
60e573462f target-i386: emulate LOCK'ed INC using atomic helper
[rth: Merge gen_inc_locked back into gen_inc to share cc update.]

Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-14-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:01 -07:00
Emilio G. Cota
a7cee522f3 target-i386: emulate LOCK'ed OP instructions using atomic helpers
[rth: Eliminate some unnecessary temporaries.]

Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-13-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:01 -07:00
Emilio G. Cota
ae03f8de45 target-i386: emulate LOCK'ed cmpxchg using cmpxchg helpers
The diff here is uglier than necessary. All this does is turn

FOO

into:

if (s->prefix & PREFIX_LOCK) {
  BAR
} else {
  FOO
}

where FOO is the original implementation of an unlocked cmpxchg.

[rth: Adjust unlocked cmpxchg to use movcond instead of branches.
Adjust helpers to use atomic helpers.]

Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-6-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:01 -07:00
Richard Henderson
91682118aa tcg: Emit barriers with parallel_cpus
Reviewed-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:01 -07:00
Richard Henderson
df79b996a7 tcg: Add CONFIG_ATOMIC64
Allow qemu to build on 32-bit hosts without 64-bit atomic ops.

Even if we only allow 32-bit hosts to multi-thread emulate 32-bit
guests, we still need some way to handle the 32-bit guest using a
64-bit atomic operation.  Do so by dropping back to single-step.

Reviewed-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:01 -07:00
Richard Henderson
7ebee43ee3 tcg: Add atomic128 helpers
Force the use of cmpxchg16b on x86_64.

Wikipedia suggests that only very old AMD64 (circa 2004) did not have
this instruction.  Further, it's required by Windows 8, so no new CPUs
will ever omit it.

If we truly care about these, then we could check this at startup time
and then avoid executing paths that use it.
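
A self-contained illustration of what forcing cmpxchg16b means in practice (a sketch,
not QEMU's atomic128 code): with GCC or Clang and -mcx16, the 16-byte __sync builtin
compiles to cmpxchg16b on x86_64.

/* Build with: cc -mcx16 -c cas128.c */
typedef unsigned __int128 u128;

static inline u128 cmpxchg128(u128 *ptr, u128 cmp, u128 newv)
{
    return __sync_val_compare_and_swap(ptr, cmp, newv);
}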

Reviewed-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:01 -07:00
Richard Henderson
c482cb117c tcg: Add atomic helpers
Add all of cmpxchg, op_fetch, fetch_op, and xchg.
Handle both endiannesses and sizes up to 8.
Handle expanding non-atomically when emulating in serial.

Reviewed-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:01 -07:00
Richard Henderson
c86c6e4c80 cputlb: Tidy some macros
TGT_LE and TGT_BE are not size dependent and do not need to be
redefined.  The others are no longer used at all.

Reviewed-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:00 -07:00
Richard Henderson
82a45b96a2 cputlb: Move most of iotlb code out of line
Saves 2k code size off of a cold path.

Reviewed-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:00 -07:00
Richard Henderson
4097842885 cputlb: Remove includes from softmmu_template.h
We already include exec/address-spaces.h and exec/memory.h in
cputlb.c; the include of qemu/timer.h appears to be a fossil.

Reviewed-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:00 -07:00
Richard Henderson
3b08f0a925 cputlb: Move probe_write out of softmmu_template.h
Reviewed-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:00 -07:00
Richard Henderson
dea2198201 cputlb: Replace SHIFT with DATA_SIZE
Reviewed-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:00 -07:00
Alex Bennée
b67cb68ba5 linux-user: enable parallel code generation on clone
The variable parallel_cpus controls the generation of thread aware
atomic code.  We only need to set it once we clone our first thread.
At this point any existing translations need to be thrown away.

Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:00 -07:00
Richard Henderson
fdbc2b5722 tcg: Add EXCP_ATOMIC
When we cannot emulate an atomic operation within a parallel
context, this exception allows us to stop the world and try
again in a serial context.

Reviewed-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:00 -07:00
Richard Henderson
1edaeee095 int128: Add int128_make128
Allows Int128 to be used more generally, rather than having to
begin with 64-bit inputs and accumulate.
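
A minimal sketch of such a constructor for the case where the compiler provides
__int128 (the typedef is an assumption here; QEMU's Int128 also has a non-__int128
fallback):

#include <stdint.h>

typedef unsigned __int128 Int128;   /* assumption: native 128-bit support */

static inline Int128 int128_make128(uint64_t lo, uint64_t hi)
{
    return ((Int128)hi << 64) | lo;
}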

Reviewed-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:00 -07:00
Richard Henderson
0846beb366 int128: Use __int128 if available
Reviewed-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:00 -07:00
Richard Henderson
258dfaaad0 exec: Avoid direct references to Int128 parts
Reviewed-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:00 -07:00
Richard Henderson
84bca3927b atomics: Add __nocheck atomic operations
While the check against sizeof(void *) is appropriate for
normal usage within qemu, there are places in which we want
wider operations and have checked for their existence.

Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:28:57 -07:00
Emilio G. Cota
83d0c719f8 atomics: add atomic_op_fetch variants
This paves the way for upcoming work.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Message-Id: <1467054136-10430-9-git-send-email-cota@braap.org>
2016-10-26 08:28:57 -07:00
Emilio G. Cota
61696ddbdc atomics: add atomic_xor
This paves the way for upcoming work.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Message-Id: <1467054136-10430-8-git-send-email-cota@braap.org>
2016-10-26 08:28:56 -07:00
Richard Henderson
d1a9f2d12f atomics: Add parameters to macros
Making these function-like rather than object-like macros will
prevent later problems with complex macro expansion.

Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:28:46 -07:00
Prasad J Pandit
0c0fc2b5fd audio: intel-hda: check stream entry count during transfer
The Intel HDA emulator uses a stream of buffers during DMA data
transfers. Each entry has a buffer length and a buffer pointer
position, which are used to derive the number of bytes to 'copy'. If
this length and buffer pointer were the same, 'copy' could be
set to zero (0), leading to an infinite loop. Add a check to
avoid it.
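
A generic, self-contained sketch of that kind of guard (the structure and names are
illustrative, not the device's actual transfer loop): an entry that would contribute
zero bytes terminates the walk instead of looping forever.

#include <stddef.h>
#include <stdint.h>

struct bdl_entry { uint32_t len; uint32_t pos; };

static size_t transfer(const struct bdl_entry *e, size_t n, size_t left)
{
    for (size_t i = 0; i < n && left > 0; i++) {
        uint32_t copy = e[i].len > e[i].pos ? e[i].len - e[i].pos : 0;
        if (copy == 0) {
            break;                  /* malformed entry: bail out */
        }
        if (copy > left) {
            copy = (uint32_t)left;
        }
        left -= copy;
    }
    return left;                    /* bytes that could not be transferred */
}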

Reported-by: Huawei PSIRT <psirt@huawei.com>
Signed-off-by: Prasad J Pandit <pjp@fedoraproject.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 1476949224-6865-1-git-send-email-ppandit@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-10-26 14:51:44 +02:00
Zhang Chen
2061c14c9b colo-proxy: fix memory leak
Fix memory leaks in colo-compare.c and filter-rewriter.c
reported by Coverity, and add some comments.

Signed-off-by: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
Reviewed-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2016-10-26 09:58:02 +08:00
Prasad J Pandit
c7c3591669 net: rtl8139: limit processing of ring descriptors
The RTL8139 ethernet controller in C+ mode supports multiple
descriptor rings, each with a maximum of 64 descriptors. While
processing the transmit descriptor ring in 'rtl8139_cplus_transmit',
the emulator does not limit the descriptor count and can run forever.
Add a check to avoid it.

Reported-by: Andrew Henderson <hendersa@icculus.org>
Signed-off-by: Prasad J Pandit <pjp@fedoraproject.org>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2016-10-26 09:57:59 +08:00
Li Qiang
fdda170e50 net: vmxnet: initialise local tx descriptor
In the Vmxnet3 device emulator, while processing the transmit (tx) queue,
when it reaches the end of a packet it calls vmxnet3_complete_packet.
There the local 'txcq_descr' object is not initialised, which could
leak host memory bytes to a guest.

Reported-by: Li Qiang <liqiang6-s@360.cn>
Signed-off-by: Prasad J Pandit <pjp@fedoraproject.org>
Reviewed-by: Dmitry Fleytman <dmitry@daynix.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2016-10-26 09:57:59 +08:00
Kevin Wolf
c89d416a2b e1000e: Don't zero out buffer address in rx descriptor
The e1000e emulation zeroes out any used rx descriptor and then writes a
completely newly constructed value there. By doing this, it doesn't only
update the write-back area of the descriptors (as it's supposed to do),
but it also clears the buffer address, which real hardware doesn't do.

The spec explicitly mentions in chapter 7.1.8 that it is valid for a
driver to reuse a descriptor and only update the status field while
doing so, i.e. reusing the old buffer address:

    If software statically allocates buffers, and uses memory read to
    check for completed descriptors, it simply has to zero the status
    byte in the descriptor to make it ready for reuse by hardware.

This patch fixes the behaviour to leave the buffer address in
descriptors unchanged even after the descriptor has been used.

Signed-off-by: Kevin Wolf <mail@kevin-wolf.de>
Reviewed-by: Dmitry Fleytman <dmitry@daynix.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2016-10-26 09:57:59 +08:00
Prasad J Pandit
8caed3d564 net: rocker: set limit to DMA buffer size
The Rocker network switch emulator has test registers to help debug
DMA operations. While testing host DMA access, a buffer address
is written to register 'TEST_DMA_ADDR' and its size is written to
register 'TEST_DMA_SIZE'. When performing the TEST_DMA_CTRL_INVERT
test, if the DMA buffer size is greater than 'INT_MAX', it leads to
an invalid buffer access. Limit the DMA buffer size to avoid it.

Reported-by: Huawei PSIRT <psirt@huawei.com>
Signed-off-by: Prasad J Pandit <pjp@fedoraproject.org>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2016-10-26 09:57:59 +08:00
Li Qiang
2634ab7fe2 net: eepro100: fix memory leak in device uninit
The exit dispatch of the eepro100 network card device doesn't free
the 's->vmstate' field, which was allocated in device realize, thus
leading to a host memory leak. This patch avoids that.

Signed-off-by: Li Qiang <liqiang6-s@360.cn>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2016-10-26 09:57:59 +08:00
Brad Smith
9463c0778b tap-bsd: OpenBSD uses tap(4) now
Update the tap-bsd code now that OpenBSD uses tap(4).

Signed-off-by: Brad Smith <brad@comstyle.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2016-10-26 09:57:59 +08:00
Prasad J Pandit
67aa449344 net: pcnet: fix source formatting and indentation
Fix indentation and source formatting in a few places. Add braces
around 'if' and 'while' statements.

Signed-off-by: Prasad J Pandit <pjp@fedoraproject.org>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2016-10-26 09:57:59 +08:00
Prasad J Pandit
34e29ce754 net: pcnet: check rx/tx descriptor ring length
The AMD PC-Net II emulator has a set of control and status (CSR)
registers. Of these, CSR76 and CSR78 hold the receive and transmit
descriptor ring lengths respectively. These ring lengths can range
from 1 to 65535. Setting a ring length to zero leads to an infinite
loop in pcnet_rdra_addr() or pcnet_transmit(). Add a check to avoid it.
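
A tiny sketch of the check (illustrative only, not the actual pcnet register handling):
a ring length of zero is clamped so the descriptor walk always advances.

#include <stdint.h>

static inline uint16_t ring_length(uint16_t csr_value)
{
    return csr_value ? csr_value : 1;
}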

Reported-by: Li Qiang <liqiang6-s@360.cn>
Signed-off-by: Prasad J Pandit <pjp@fedoraproject.org>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2016-10-26 09:57:59 +08:00
Richard Henderson
36f0399d46 target-m68k: Optimize gen_flush_flags
Signed-off-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
2016-10-25 20:54:47 +02:00
Richard Henderson
9d896621c1 target-m68k: Optimize some comparisons
Signed-off-by: Richard Henderson <rth@twiddle.net>
[laurent: fixed VC and VS: assign v1, not v2]
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
2016-10-25 20:54:47 +02:00
Richard Henderson
b459e3eccf target-m68k: Use setcond for scc
Signed-off-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
2016-10-25 20:54:47 +02:00
Richard Henderson
6a432295d7 target-m68k: Introduce DisasCompare
Signed-off-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
2016-10-25 20:54:47 +02:00
Richard Henderson
620c6cf665 target-m68k: Reorg flags handling
Separate all ccr bits.  Continue to batch updates via cc_op.

Signed-off-by: Richard Henderson <rth@twiddle.net>

Fix gen_logic_cc() to really extend the size of the result.
Fix gen_get_ccr(): update cc_op as it is used by the helper.
Factorize flags computing and src/ccr cleanup

Signed-off-by: Laurent Vivier <laurent@vivier.eu>

target-m68k: sr/ccr cleanup

Signed-off-by: Laurent Vivier <laurent@vivier.eu>
2016-10-25 20:54:47 +02:00
Richard Henderson
18dd87f26b target-m68k: Remove incorrect clearing of cc_x
The CF docs certainly don't suggest this is true.

Signed-off-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
2016-10-25 20:54:47 +02:00
Richard Henderson
99c514485b target-m68k: Some fixes to SR and flags management
Signed-off-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
2016-10-25 20:54:47 +02:00
Richard Henderson
8e394ccabd target-m68k: Print flags properly
Signed-off-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
2016-10-25 20:54:47 +02:00
Laurent Vivier
9fdb533fb1 target-m68k: update CPU flags management
Copied from target-i386

Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Reviewed-by: Richard Henderson <rth@twiddle.net>
2016-10-25 20:54:47 +02:00
Laurent Vivier
91f90d7191 target-m68k: don't update cc_dest in helpers
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Reviewed-by: Richard Henderson <rth@twiddle.net>
2016-10-25 20:54:47 +02:00
Laurent Vivier
7c0eb318bd target-m68k: update move to/from ccr/sr
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Reviewed-by: Richard Henderson <rth@twiddle.net>
2016-10-25 20:54:47 +02:00
Laurent Vivier
20a8856eba target-m68k: remove m68k_cpu_exec_enter() and m68k_cpu_exec_exit()
Update cc_op directly from tcg_gen_insn_start() and
restore_state_to_opc()

Copied from target-i386

Signed-off-by: Laurent Vivier <laurent@vivier.eu>
2016-10-25 20:54:47 +02:00
Richard Henderson
f908351903 target-m68k: Replace helper_xflag_lt with setcond
Signed-off-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
2016-10-25 20:54:47 +02:00
Laurent Vivier
5dbb6784b7 target-m68k: allow to update flags with operation on words and bytes
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
2016-10-25 20:54:47 +02:00
Laurent Vivier
bcc098b0c2 target-m68k: REG() macro cleanup
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Reviewed-by: Richard Henderson <rth@twiddle.net>
2016-10-25 20:54:47 +02:00
Laurent Vivier
2b04e85a34 target-m68k: set PAGE_BITS to 12 for m68k
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Reviewed-by: Richard Henderson <rth@twiddle.net>
2016-10-25 20:54:47 +02:00
Laurent Vivier
7ef25cdd6c target-m68k: define operand sizes
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Reviewed-by: Richard Henderson <rth@twiddle.net>
2016-10-25 20:54:47 +02:00
Laurent Vivier
4d558f5d58 target-m68k: set disassembler mode to 680x0 or coldfire
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Reviewed-by: Richard Henderson <rth@twiddle.net>
2016-10-25 20:54:47 +02:00
Laurent Vivier
28b68cd79e target-m68k: introduce read_imXX() functions
Read an 8-, 16- or 32-bit immediate constant.

An immediate constant is stored in the instruction opcode and
can be in one or two extension words.

Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Reviewed-by: Richard Henderson <rth@twiddle.net>
2016-10-25 20:54:47 +02:00
Laurent Vivier
d8633620a1 target-m68k: manage scaled index
Scaled index is not supported by 68000, 68008, and 68010.

    EA = (bd + PC) + Xn.SIZE*SCALE + od

Ignore it:

M68000 FAMILY PROGRAMMER’S REFERENCE MANUAL
2.4 BRIEF EXTENSION WORD FORMAT COMPATIBILITY

"If the MC68000 were to execute an instruction that
 encoded a scaling factor, the scaling factor would be
 ignored and would not access the desired memory address.
 The earlier microprocessors do not recognize the brief
 extension word formats implemented by newer processors.
 Although they can detect illegal instructions, they do not
 decode invalid encodings of the brief extension word formats
 as exceptions."

Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Reviewed-by: Richard Henderson <rth@twiddle.net>
2016-10-25 20:54:47 +02:00
Laurent Vivier
f076803bbf target-m68k: define m680x0 CPUs and features
This patch defines eight new features:

    - M68K_FEATURE_SCALED_INDEX, scaled address index register
    - M68K_FEATURE_LONG_MULDIV, 32bit multiply/divide
    - M68K_FEATURE_QUAD_MULDIV, 64bit multiply/divide
    - M68K_FEATURE_BCCL, long conditional branches
    - M68K_FEATURE_BITFIELD, bit field instructions
    - M68K_FEATURE_FPU, FPU instructions
    - M68K_FEATURE_CAS, cas instruction
    - M68K_FEATURE_BKPT, bkpt instruction

Original patch from Andreas Schwab <schwab@linux-m68k.org>
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
2016-10-25 20:54:47 +02:00
John Paul Adrian Glaubitz
b208525797 target-m68k: Build the opcode table only once to avoid multithreading issues
Signed-off-by: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Reviewed-by: Richard Henderson <rth@twiddle.net>
2016-10-25 20:54:47 +02:00
Laurent Vivier
a1ff193020 target-m68k: fix DEBUG_DISPATCH
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Reviewed-by: Richard Henderson <rth@twiddle.net>
2016-10-25 20:54:47 +02:00
65 changed files with 3590 additions and 2031 deletions


@@ -89,7 +89,7 @@ endif
 #######################################################################
 # Target-independent parts used in system and user emulation
-common-obj-y += tcg-runtime.o cpus-common.o
+common-obj-y += cpus-common.o
 common-obj-y += hw/
 common-obj-y += qom/
 common-obj-y += disas/


@@ -94,6 +94,7 @@ obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o
 obj-y += fpu/softfloat.o
 obj-y += target-$(TARGET_BASE_ARCH)/
 obj-y += disas.o
+obj-y += tcg-runtime.o
 obj-$(call notempty,$(TARGET_XML_FILES)) += gdbstub-xml.o
 obj-$(call lnot,$(CONFIG_KVM)) += kvm-stub.o

atomic_template.h (new file, 215 lines)

@@ -0,0 +1,215 @@
/*
 * Atomic helper templates
 * Included from tcg-runtime.c and cputlb.c.
 *
 * Copyright (c) 2016 Red Hat, Inc
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#if DATA_SIZE == 16
# define SUFFIX     o
# define DATA_TYPE  Int128
# define BSWAP      bswap128
#elif DATA_SIZE == 8
# define SUFFIX     q
# define DATA_TYPE  uint64_t
# define BSWAP      bswap64
#elif DATA_SIZE == 4
# define SUFFIX     l
# define DATA_TYPE  uint32_t
# define BSWAP      bswap32
#elif DATA_SIZE == 2
# define SUFFIX     w
# define DATA_TYPE  uint16_t
# define BSWAP      bswap16
#elif DATA_SIZE == 1
# define SUFFIX     b
# define DATA_TYPE  uint8_t
# define BSWAP
#else
# error unsupported data size
#endif

#if DATA_SIZE >= 4
# define ABI_TYPE  DATA_TYPE
#else
# define ABI_TYPE  uint32_t
#endif

/* Define host-endian atomic operations.  Note that END is used within
   the ATOMIC_NAME macro, and redefined below.  */
#if DATA_SIZE == 1
# define END
#elif defined(HOST_WORDS_BIGENDIAN)
# define END  _be
#else
# define END  _le
#endif

ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
                              ABI_TYPE cmpv, ABI_TYPE newv EXTRA_ARGS)
{
    DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
    return atomic_cmpxchg__nocheck(haddr, cmpv, newv);
}

#if DATA_SIZE >= 16
ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS)
{
    DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP;
    __atomic_load(haddr, &val, __ATOMIC_RELAXED);
    return val;
}

void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr,
                     ABI_TYPE val EXTRA_ARGS)
{
    DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
    __atomic_store(haddr, &val, __ATOMIC_RELAXED);
}
#else
ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr,
                           ABI_TYPE val EXTRA_ARGS)
{
    DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
    return atomic_xchg__nocheck(haddr, val);
}

#define GEN_ATOMIC_HELPER(X)                                        \
ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,      \
                        ABI_TYPE val EXTRA_ARGS)                    \
{                                                                   \
    DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;                           \
    return atomic_##X(haddr, val);                                  \
}

GEN_ATOMIC_HELPER(fetch_add)
GEN_ATOMIC_HELPER(fetch_and)
GEN_ATOMIC_HELPER(fetch_or)
GEN_ATOMIC_HELPER(fetch_xor)
GEN_ATOMIC_HELPER(add_fetch)
GEN_ATOMIC_HELPER(and_fetch)
GEN_ATOMIC_HELPER(or_fetch)
GEN_ATOMIC_HELPER(xor_fetch)

#undef GEN_ATOMIC_HELPER
#endif /* DATA SIZE >= 16 */

#undef END

#if DATA_SIZE > 1

/* Define reverse-host-endian atomic operations.  Note that END is used
   within the ATOMIC_NAME macro.  */
#ifdef HOST_WORDS_BIGENDIAN
# define END  _le
#else
# define END  _be
#endif

ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
                              ABI_TYPE cmpv, ABI_TYPE newv EXTRA_ARGS)
{
    DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
    return BSWAP(atomic_cmpxchg__nocheck(haddr, BSWAP(cmpv), BSWAP(newv)));
}

#if DATA_SIZE >= 16
ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS)
{
    DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP;
    __atomic_load(haddr, &val, __ATOMIC_RELAXED);
    return BSWAP(val);
}

void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr,
                     ABI_TYPE val EXTRA_ARGS)
{
    DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
    val = BSWAP(val);
    __atomic_store(haddr, &val, __ATOMIC_RELAXED);
}
#else
ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr,
                           ABI_TYPE val EXTRA_ARGS)
{
    DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
    return BSWAP(atomic_xchg__nocheck(haddr, BSWAP(val)));
}

#define GEN_ATOMIC_HELPER(X)                                        \
ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,      \
                        ABI_TYPE val EXTRA_ARGS)                    \
{                                                                   \
    DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;                           \
    return BSWAP(atomic_##X(haddr, BSWAP(val)));                    \
}

GEN_ATOMIC_HELPER(fetch_and)
GEN_ATOMIC_HELPER(fetch_or)
GEN_ATOMIC_HELPER(fetch_xor)
GEN_ATOMIC_HELPER(and_fetch)
GEN_ATOMIC_HELPER(or_fetch)
GEN_ATOMIC_HELPER(xor_fetch)
#undef GEN_ATOMIC_HELPER
/* Note that for addition, we need to use a separate cmpxchg loop instead
of bswaps for the reverse-host-endian helpers. */
ABI_TYPE ATOMIC_NAME(fetch_add)(CPUArchState *env, target_ulong addr,
ABI_TYPE val EXTRA_ARGS)
{
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
DATA_TYPE ldo, ldn, ret, sto;
ldo = atomic_read__nocheck(haddr);
while (1) {
ret = BSWAP(ldo);
sto = BSWAP(ret + val);
ldn = atomic_cmpxchg__nocheck(haddr, ldo, sto);
if (ldn == ldo) {
return ret;
}
ldo = ldn;
}
}
ABI_TYPE ATOMIC_NAME(add_fetch)(CPUArchState *env, target_ulong addr,
ABI_TYPE val EXTRA_ARGS)
{
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
DATA_TYPE ldo, ldn, ret, sto;
ldo = atomic_read__nocheck(haddr);
while (1) {
ret = BSWAP(ldo) + val;
sto = BSWAP(ret);
ldn = atomic_cmpxchg__nocheck(haddr, ldo, sto);
if (ldn == ldo) {
return ret;
}
ldo = ldn;
}
}
#endif /* DATA_SIZE >= 16 */
#undef END
#endif /* DATA_SIZE > 1 */
#undef BSWAP
#undef ABI_TYPE
#undef DATA_TYPE
#undef SUFFIX
#undef DATA_SIZE

View File

@@ -1,7 +1,7 @@
/*
* QEMU Baum Braille Device
*
* Copyright (c) 2008 Samuel Thibault
* Copyright (c) 2008, 2010-2011, 2016 Samuel Thibault
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -92,6 +92,7 @@ typedef struct {
brlapi_handle_t *brlapi;
int brlapi_fd;
unsigned int x, y;
bool deferred_init;
uint8_t in_buf[BUF_SIZE];
uint8_t in_buf_used;
@@ -102,8 +103,11 @@ typedef struct {
} BaumDriverState;
/* Let's assume NABCC by default */
static const uint8_t nabcc_translation[256] = {
[0] = ' ',
enum way {
DOTS2ASCII,
ASCII2DOTS
};
static const uint8_t nabcc_translation[2][256] = {
#ifndef BRLAPI_DOTS
#define BRLAPI_DOTS(d1,d2,d3,d4,d5,d6,d7,d8) \
((d1?BRLAPI_DOT1:0)|\
@@ -115,107 +119,154 @@ static const uint8_t nabcc_translation[256] = {
(d7?BRLAPI_DOT7:0)|\
(d8?BRLAPI_DOT8:0))
#endif
[BRLAPI_DOTS(1,0,0,0,0,0,0,0)] = 'a',
[BRLAPI_DOTS(1,1,0,0,0,0,0,0)] = 'b',
[BRLAPI_DOTS(1,0,0,1,0,0,0,0)] = 'c',
[BRLAPI_DOTS(1,0,0,1,1,0,0,0)] = 'd',
[BRLAPI_DOTS(1,0,0,0,1,0,0,0)] = 'e',
[BRLAPI_DOTS(1,1,0,1,0,0,0,0)] = 'f',
[BRLAPI_DOTS(1,1,0,1,1,0,0,0)] = 'g',
[BRLAPI_DOTS(1,1,0,0,1,0,0,0)] = 'h',
[BRLAPI_DOTS(0,1,0,1,0,0,0,0)] = 'i',
[BRLAPI_DOTS(0,1,0,1,1,0,0,0)] = 'j',
[BRLAPI_DOTS(1,0,1,0,0,0,0,0)] = 'k',
[BRLAPI_DOTS(1,1,1,0,0,0,0,0)] = 'l',
[BRLAPI_DOTS(1,0,1,1,0,0,0,0)] = 'm',
[BRLAPI_DOTS(1,0,1,1,1,0,0,0)] = 'n',
[BRLAPI_DOTS(1,0,1,0,1,0,0,0)] = 'o',
[BRLAPI_DOTS(1,1,1,1,0,0,0,0)] = 'p',
[BRLAPI_DOTS(1,1,1,1,1,0,0,0)] = 'q',
[BRLAPI_DOTS(1,1,1,0,1,0,0,0)] = 'r',
[BRLAPI_DOTS(0,1,1,1,0,0,0,0)] = 's',
[BRLAPI_DOTS(0,1,1,1,1,0,0,0)] = 't',
[BRLAPI_DOTS(1,0,1,0,0,1,0,0)] = 'u',
[BRLAPI_DOTS(1,1,1,0,0,1,0,0)] = 'v',
[BRLAPI_DOTS(0,1,0,1,1,1,0,0)] = 'w',
[BRLAPI_DOTS(1,0,1,1,0,1,0,0)] = 'x',
[BRLAPI_DOTS(1,0,1,1,1,1,0,0)] = 'y',
[BRLAPI_DOTS(1,0,1,0,1,1,0,0)] = 'z',
#define DO(dots, ascii) \
[DOTS2ASCII][dots] = ascii, \
[ASCII2DOTS][ascii] = dots
DO(0, ' '),
DO(BRLAPI_DOTS(1, 0, 0, 0, 0, 0, 0, 0), 'a'),
DO(BRLAPI_DOTS(1, 1, 0, 0, 0, 0, 0, 0), 'b'),
DO(BRLAPI_DOTS(1, 0, 0, 1, 0, 0, 0, 0), 'c'),
DO(BRLAPI_DOTS(1, 0, 0, 1, 1, 0, 0, 0), 'd'),
DO(BRLAPI_DOTS(1, 0, 0, 0, 1, 0, 0, 0), 'e'),
DO(BRLAPI_DOTS(1, 1, 0, 1, 0, 0, 0, 0), 'f'),
DO(BRLAPI_DOTS(1, 1, 0, 1, 1, 0, 0, 0), 'g'),
DO(BRLAPI_DOTS(1, 1, 0, 0, 1, 0, 0, 0), 'h'),
DO(BRLAPI_DOTS(0, 1, 0, 1, 0, 0, 0, 0), 'i'),
DO(BRLAPI_DOTS(0, 1, 0, 1, 1, 0, 0, 0), 'j'),
DO(BRLAPI_DOTS(1, 0, 1, 0, 0, 0, 0, 0), 'k'),
DO(BRLAPI_DOTS(1, 1, 1, 0, 0, 0, 0, 0), 'l'),
DO(BRLAPI_DOTS(1, 0, 1, 1, 0, 0, 0, 0), 'm'),
DO(BRLAPI_DOTS(1, 0, 1, 1, 1, 0, 0, 0), 'n'),
DO(BRLAPI_DOTS(1, 0, 1, 0, 1, 0, 0, 0), 'o'),
DO(BRLAPI_DOTS(1, 1, 1, 1, 0, 0, 0, 0), 'p'),
DO(BRLAPI_DOTS(1, 1, 1, 1, 1, 0, 0, 0), 'q'),
DO(BRLAPI_DOTS(1, 1, 1, 0, 1, 0, 0, 0), 'r'),
DO(BRLAPI_DOTS(0, 1, 1, 1, 0, 0, 0, 0), 's'),
DO(BRLAPI_DOTS(0, 1, 1, 1, 1, 0, 0, 0), 't'),
DO(BRLAPI_DOTS(1, 0, 1, 0, 0, 1, 0, 0), 'u'),
DO(BRLAPI_DOTS(1, 1, 1, 0, 0, 1, 0, 0), 'v'),
DO(BRLAPI_DOTS(0, 1, 0, 1, 1, 1, 0, 0), 'w'),
DO(BRLAPI_DOTS(1, 0, 1, 1, 0, 1, 0, 0), 'x'),
DO(BRLAPI_DOTS(1, 0, 1, 1, 1, 1, 0, 0), 'y'),
DO(BRLAPI_DOTS(1, 0, 1, 0, 1, 1, 0, 0), 'z'),
[BRLAPI_DOTS(1,0,0,0,0,0,1,0)] = 'A',
[BRLAPI_DOTS(1,1,0,0,0,0,1,0)] = 'B',
[BRLAPI_DOTS(1,0,0,1,0,0,1,0)] = 'C',
[BRLAPI_DOTS(1,0,0,1,1,0,1,0)] = 'D',
[BRLAPI_DOTS(1,0,0,0,1,0,1,0)] = 'E',
[BRLAPI_DOTS(1,1,0,1,0,0,1,0)] = 'F',
[BRLAPI_DOTS(1,1,0,1,1,0,1,0)] = 'G',
[BRLAPI_DOTS(1,1,0,0,1,0,1,0)] = 'H',
[BRLAPI_DOTS(0,1,0,1,0,0,1,0)] = 'I',
[BRLAPI_DOTS(0,1,0,1,1,0,1,0)] = 'J',
[BRLAPI_DOTS(1,0,1,0,0,0,1,0)] = 'K',
[BRLAPI_DOTS(1,1,1,0,0,0,1,0)] = 'L',
[BRLAPI_DOTS(1,0,1,1,0,0,1,0)] = 'M',
[BRLAPI_DOTS(1,0,1,1,1,0,1,0)] = 'N',
[BRLAPI_DOTS(1,0,1,0,1,0,1,0)] = 'O',
[BRLAPI_DOTS(1,1,1,1,0,0,1,0)] = 'P',
[BRLAPI_DOTS(1,1,1,1,1,0,1,0)] = 'Q',
[BRLAPI_DOTS(1,1,1,0,1,0,1,0)] = 'R',
[BRLAPI_DOTS(0,1,1,1,0,0,1,0)] = 'S',
[BRLAPI_DOTS(0,1,1,1,1,0,1,0)] = 'T',
[BRLAPI_DOTS(1,0,1,0,0,1,1,0)] = 'U',
[BRLAPI_DOTS(1,1,1,0,0,1,1,0)] = 'V',
[BRLAPI_DOTS(0,1,0,1,1,1,1,0)] = 'W',
[BRLAPI_DOTS(1,0,1,1,0,1,1,0)] = 'X',
[BRLAPI_DOTS(1,0,1,1,1,1,1,0)] = 'Y',
[BRLAPI_DOTS(1,0,1,0,1,1,1,0)] = 'Z',
DO(BRLAPI_DOTS(1, 0, 0, 0, 0, 0, 1, 0), 'A'),
DO(BRLAPI_DOTS(1, 1, 0, 0, 0, 0, 1, 0), 'B'),
DO(BRLAPI_DOTS(1, 0, 0, 1, 0, 0, 1, 0), 'C'),
DO(BRLAPI_DOTS(1, 0, 0, 1, 1, 0, 1, 0), 'D'),
DO(BRLAPI_DOTS(1, 0, 0, 0, 1, 0, 1, 0), 'E'),
DO(BRLAPI_DOTS(1, 1, 0, 1, 0, 0, 1, 0), 'F'),
DO(BRLAPI_DOTS(1, 1, 0, 1, 1, 0, 1, 0), 'G'),
DO(BRLAPI_DOTS(1, 1, 0, 0, 1, 0, 1, 0), 'H'),
DO(BRLAPI_DOTS(0, 1, 0, 1, 0, 0, 1, 0), 'I'),
DO(BRLAPI_DOTS(0, 1, 0, 1, 1, 0, 1, 0), 'J'),
DO(BRLAPI_DOTS(1, 0, 1, 0, 0, 0, 1, 0), 'K'),
DO(BRLAPI_DOTS(1, 1, 1, 0, 0, 0, 1, 0), 'L'),
DO(BRLAPI_DOTS(1, 0, 1, 1, 0, 0, 1, 0), 'M'),
DO(BRLAPI_DOTS(1, 0, 1, 1, 1, 0, 1, 0), 'N'),
DO(BRLAPI_DOTS(1, 0, 1, 0, 1, 0, 1, 0), 'O'),
DO(BRLAPI_DOTS(1, 1, 1, 1, 0, 0, 1, 0), 'P'),
DO(BRLAPI_DOTS(1, 1, 1, 1, 1, 0, 1, 0), 'Q'),
DO(BRLAPI_DOTS(1, 1, 1, 0, 1, 0, 1, 0), 'R'),
DO(BRLAPI_DOTS(0, 1, 1, 1, 0, 0, 1, 0), 'S'),
DO(BRLAPI_DOTS(0, 1, 1, 1, 1, 0, 1, 0), 'T'),
DO(BRLAPI_DOTS(1, 0, 1, 0, 0, 1, 1, 0), 'U'),
DO(BRLAPI_DOTS(1, 1, 1, 0, 0, 1, 1, 0), 'V'),
DO(BRLAPI_DOTS(0, 1, 0, 1, 1, 1, 1, 0), 'W'),
DO(BRLAPI_DOTS(1, 0, 1, 1, 0, 1, 1, 0), 'X'),
DO(BRLAPI_DOTS(1, 0, 1, 1, 1, 1, 1, 0), 'Y'),
DO(BRLAPI_DOTS(1, 0, 1, 0, 1, 1, 1, 0), 'Z'),
[BRLAPI_DOTS(0,0,1,0,1,1,0,0)] = '0',
[BRLAPI_DOTS(0,1,0,0,0,0,0,0)] = '1',
[BRLAPI_DOTS(0,1,1,0,0,0,0,0)] = '2',
[BRLAPI_DOTS(0,1,0,0,1,0,0,0)] = '3',
[BRLAPI_DOTS(0,1,0,0,1,1,0,0)] = '4',
[BRLAPI_DOTS(0,1,0,0,0,1,0,0)] = '5',
[BRLAPI_DOTS(0,1,1,0,1,0,0,0)] = '6',
[BRLAPI_DOTS(0,1,1,0,1,1,0,0)] = '7',
[BRLAPI_DOTS(0,1,1,0,0,1,0,0)] = '8',
[BRLAPI_DOTS(0,0,1,0,1,0,0,0)] = '9',
DO(BRLAPI_DOTS(0, 0, 1, 0, 1, 1, 0, 0), '0'),
DO(BRLAPI_DOTS(0, 1, 0, 0, 0, 0, 0, 0), '1'),
DO(BRLAPI_DOTS(0, 1, 1, 0, 0, 0, 0, 0), '2'),
DO(BRLAPI_DOTS(0, 1, 0, 0, 1, 0, 0, 0), '3'),
DO(BRLAPI_DOTS(0, 1, 0, 0, 1, 1, 0, 0), '4'),
DO(BRLAPI_DOTS(0, 1, 0, 0, 0, 1, 0, 0), '5'),
DO(BRLAPI_DOTS(0, 1, 1, 0, 1, 0, 0, 0), '6'),
DO(BRLAPI_DOTS(0, 1, 1, 0, 1, 1, 0, 0), '7'),
DO(BRLAPI_DOTS(0, 1, 1, 0, 0, 1, 0, 0), '8'),
DO(BRLAPI_DOTS(0, 0, 1, 0, 1, 0, 0, 0), '9'),
[BRLAPI_DOTS(0,0,0,1,0,1,0,0)] = '.',
[BRLAPI_DOTS(0,0,1,1,0,1,0,0)] = '+',
[BRLAPI_DOTS(0,0,1,0,0,1,0,0)] = '-',
[BRLAPI_DOTS(1,0,0,0,0,1,0,0)] = '*',
[BRLAPI_DOTS(0,0,1,1,0,0,0,0)] = '/',
[BRLAPI_DOTS(1,1,1,0,1,1,0,0)] = '(',
[BRLAPI_DOTS(0,1,1,1,1,1,0,0)] = ')',
DO(BRLAPI_DOTS(0, 0, 0, 1, 0, 1, 0, 0), '.'),
DO(BRLAPI_DOTS(0, 0, 1, 1, 0, 1, 0, 0), '+'),
DO(BRLAPI_DOTS(0, 0, 1, 0, 0, 1, 0, 0), '-'),
DO(BRLAPI_DOTS(1, 0, 0, 0, 0, 1, 0, 0), '*'),
DO(BRLAPI_DOTS(0, 0, 1, 1, 0, 0, 0, 0), '/'),
DO(BRLAPI_DOTS(1, 1, 1, 0, 1, 1, 0, 0), '('),
DO(BRLAPI_DOTS(0, 1, 1, 1, 1, 1, 0, 0), ')'),
[BRLAPI_DOTS(1,1,1,1,0,1,0,0)] = '&',
[BRLAPI_DOTS(0,0,1,1,1,1,0,0)] = '#',
DO(BRLAPI_DOTS(1, 1, 1, 1, 0, 1, 0, 0), '&'),
DO(BRLAPI_DOTS(0, 0, 1, 1, 1, 1, 0, 0), '#'),
[BRLAPI_DOTS(0,0,0,0,0,1,0,0)] = ',',
[BRLAPI_DOTS(0,0,0,0,1,1,0,0)] = ';',
[BRLAPI_DOTS(1,0,0,0,1,1,0,0)] = ':',
[BRLAPI_DOTS(0,1,1,1,0,1,0,0)] = '!',
[BRLAPI_DOTS(1,0,0,1,1,1,0,0)] = '?',
[BRLAPI_DOTS(0,0,0,0,1,0,0,0)] = '"',
[BRLAPI_DOTS(0,0,1,0,0,0,0,0)] ='\'',
[BRLAPI_DOTS(0,0,0,1,0,0,0,0)] = '`',
[BRLAPI_DOTS(0,0,0,1,1,0,1,0)] = '^',
[BRLAPI_DOTS(0,0,0,1,1,0,0,0)] = '~',
[BRLAPI_DOTS(0,1,0,1,0,1,1,0)] = '[',
[BRLAPI_DOTS(1,1,0,1,1,1,1,0)] = ']',
[BRLAPI_DOTS(0,1,0,1,0,1,0,0)] = '{',
[BRLAPI_DOTS(1,1,0,1,1,1,0,0)] = '}',
[BRLAPI_DOTS(1,1,1,1,1,1,0,0)] = '=',
[BRLAPI_DOTS(1,1,0,0,0,1,0,0)] = '<',
[BRLAPI_DOTS(0,0,1,1,1,0,0,0)] = '>',
[BRLAPI_DOTS(1,1,0,1,0,1,0,0)] = '$',
[BRLAPI_DOTS(1,0,0,1,0,1,0,0)] = '%',
[BRLAPI_DOTS(0,0,0,1,0,0,1,0)] = '@',
[BRLAPI_DOTS(1,1,0,0,1,1,0,0)] = '|',
[BRLAPI_DOTS(1,1,0,0,1,1,1,0)] ='\\',
[BRLAPI_DOTS(0,0,0,1,1,1,0,0)] = '_',
DO(BRLAPI_DOTS(0, 0, 0, 0, 0, 1, 0, 0), ','),
DO(BRLAPI_DOTS(0, 0, 0, 0, 1, 1, 0, 0), ';'),
DO(BRLAPI_DOTS(1, 0, 0, 0, 1, 1, 0, 0), ':'),
DO(BRLAPI_DOTS(0, 1, 1, 1, 0, 1, 0, 0), '!'),
DO(BRLAPI_DOTS(1, 0, 0, 1, 1, 1, 0, 0), '?'),
DO(BRLAPI_DOTS(0, 0, 0, 0, 1, 0, 0, 0), '"'),
DO(BRLAPI_DOTS(0, 0, 1, 0, 0, 0, 0, 0), '\''),
DO(BRLAPI_DOTS(0, 0, 0, 1, 0, 0, 0, 0), '`'),
DO(BRLAPI_DOTS(0, 0, 0, 1, 1, 0, 1, 0), '^'),
DO(BRLAPI_DOTS(0, 0, 0, 1, 1, 0, 0, 0), '~'),
DO(BRLAPI_DOTS(0, 1, 0, 1, 0, 1, 1, 0), '['),
DO(BRLAPI_DOTS(1, 1, 0, 1, 1, 1, 1, 0), ']'),
DO(BRLAPI_DOTS(0, 1, 0, 1, 0, 1, 0, 0), '{'),
DO(BRLAPI_DOTS(1, 1, 0, 1, 1, 1, 0, 0), '}'),
DO(BRLAPI_DOTS(1, 1, 1, 1, 1, 1, 0, 0), '='),
DO(BRLAPI_DOTS(1, 1, 0, 0, 0, 1, 0, 0), '<'),
DO(BRLAPI_DOTS(0, 0, 1, 1, 1, 0, 0, 0), '>'),
DO(BRLAPI_DOTS(1, 1, 0, 1, 0, 1, 0, 0), '$'),
DO(BRLAPI_DOTS(1, 0, 0, 1, 0, 1, 0, 0), '%'),
DO(BRLAPI_DOTS(0, 0, 0, 1, 0, 0, 1, 0), '@'),
DO(BRLAPI_DOTS(1, 1, 0, 0, 1, 1, 0, 0), '|'),
DO(BRLAPI_DOTS(1, 1, 0, 0, 1, 1, 1, 0), '\\'),
DO(BRLAPI_DOTS(0, 0, 0, 1, 1, 1, 0, 0), '_'),
};
/* The guest OS has started discussing with us, finish initializing BrlAPI */
static int baum_deferred_init(BaumDriverState *baum)
{
#if defined(CONFIG_SDL)
#if SDL_COMPILEDVERSION < SDL_VERSIONNUM(2, 0, 0)
SDL_SysWMinfo info;
#endif
#endif
int tty;
if (baum->deferred_init) {
return 1;
}
if (brlapi__getDisplaySize(baum->brlapi, &baum->x, &baum->y) == -1) {
brlapi_perror("baum: brlapi__getDisplaySize");
return 0;
}
#if defined(CONFIG_SDL)
#if SDL_COMPILEDVERSION < SDL_VERSIONNUM(2, 0, 0)
memset(&info, 0, sizeof(info));
SDL_VERSION(&info.version);
if (SDL_GetWMInfo(&info)) {
tty = info.info.x11.wmwindow;
} else {
#endif
#endif
tty = BRLAPI_TTY_DEFAULT;
#if defined(CONFIG_SDL)
#if SDL_COMPILEDVERSION < SDL_VERSIONNUM(2, 0, 0)
}
#endif
#endif
if (brlapi__enterTtyMode(baum->brlapi, tty, NULL) == -1) {
brlapi_perror("baum: brlapi__enterTtyMode");
return 0;
}
baum->deferred_init = 1;
return 1;
}
/* The serial port can receive more of our data */
static void baum_accept_input(struct CharDriverState *chr)
{
@@ -346,8 +397,10 @@ static int baum_eat_packet(BaumDriverState *baum, const uint8_t *buf, int len)
cursor = i + 1;
c &= ~(BRLAPI_DOT7|BRLAPI_DOT8);
}
if (!(c = nabcc_translation[c]))
c = nabcc_translation[DOTS2ASCII][c];
if (!c) {
c = '?';
}
text[i] = c;
}
timer_del(baum->cellCount_timer);
@@ -440,6 +493,8 @@ static int baum_write(CharDriverState *chr, const uint8_t *buf, int len)
return 0;
if (!baum->brlapi)
return len;
if (!baum_deferred_init(baum))
return len;
while (len) {
/* Complete our buffer as much as possible */
@@ -476,6 +531,13 @@ static void baum_send_key(BaumDriverState *baum, uint8_t type, uint8_t value) {
baum_write_packet(baum, packet, sizeof(packet));
}
static void baum_send_key2(BaumDriverState *baum, uint8_t type, uint8_t value,
uint8_t value2) {
uint8_t packet[] = { type, value, value2 };
DPRINTF("writing key %x %x\n", type, value);
baum_write_packet(baum, packet, sizeof(packet));
}
/* We got some data on the BrlAPI socket */
static void baum_chr_read(void *opaque)
{
@@ -484,6 +546,8 @@ static void baum_chr_read(void *opaque)
int ret;
if (!baum->brlapi)
return;
if (!baum_deferred_init(baum))
return;
while ((ret = brlapi__readKey(baum->brlapi, 0, &code)) == 1) {
DPRINTF("got key %"BRLAPI_PRIxKEYCODE"\n", code);
/* Emulate */
@@ -540,7 +604,17 @@ static void baum_chr_read(void *opaque)
}
break;
case BRLAPI_KEY_TYPE_SYM:
break;
{
brlapi_keyCode_t keysym = code & BRLAPI_KEY_CODE_MASK;
if (keysym < 0x100) {
uint8_t dots = nabcc_translation[ASCII2DOTS][keysym];
if (dots) {
baum_send_key2(baum, BAUM_RSP_EntryKeys, 0, dots);
baum_send_key2(baum, BAUM_RSP_EntryKeys, 0, 0);
}
}
break;
}
}
}
if (ret == -1 && (brlapi_errno != BRLAPI_ERROR_LIBCERR || errno != EINTR)) {
@@ -573,12 +647,6 @@ static CharDriverState *chr_baum_init(const char *id,
BaumDriverState *baum;
CharDriverState *chr;
brlapi_handle_t *handle;
#if defined(CONFIG_SDL)
#if SDL_COMPILEDVERSION < SDL_VERSIONNUM(2, 0, 0)
SDL_SysWMinfo info;
#endif
#endif
int tty;
chr = qemu_chr_alloc(common, errp);
if (!chr) {
@@ -601,39 +669,14 @@ static CharDriverState *chr_baum_init(const char *id,
brlapi_strerror(brlapi_error_location()));
goto fail_handle;
}
baum->deferred_init = 0;
baum->cellCount_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, baum_cellCount_timer_cb, baum);
if (brlapi__getDisplaySize(handle, &baum->x, &baum->y) == -1) {
error_setg(errp, "brlapi__getDisplaySize: %s",
brlapi_strerror(brlapi_error_location()));
goto fail;
}
#if defined(CONFIG_SDL)
#if SDL_COMPILEDVERSION < SDL_VERSIONNUM(2, 0, 0)
memset(&info, 0, sizeof(info));
SDL_VERSION(&info.version);
if (SDL_GetWMInfo(&info))
tty = info.info.x11.wmwindow;
else
#endif
#endif
tty = BRLAPI_TTY_DEFAULT;
if (brlapi__enterTtyMode(handle, tty, NULL) == -1) {
error_setg(errp, "brlapi__enterTtyMode: %s",
brlapi_strerror(brlapi_error_location()));
goto fail;
}
qemu_set_fd_handler(baum->brlapi_fd, baum_chr_read, NULL, baum);
return chr;
fail:
timer_free(baum->cellCount_timer);
brlapi__closeConnection(handle);
fail_handle:
g_free(handle);
g_free(chr);

91
configure vendored
View File

@@ -1216,7 +1216,10 @@ case "$cpu" in
cc_i386='$(CC) -m32'
;;
x86_64)
CPU_CFLAGS="-m64"
# ??? Only extremely old AMD cpus do not have cmpxchg16b.
# If we truly care, we should simply detect this case at
# runtime and generate the fallback to serial emulation.
CPU_CFLAGS="-m64 -mcx16"
LDFLAGS="-m64 $LDFLAGS"
cc_i386='$(CC) -m32'
;;
@@ -2914,27 +2917,38 @@ fi
# curses probe
if test "$curses" != "no" ; then
if test "$mingw32" = "yes" ; then
curses_list="$($pkg_config --libs ncurses 2>/dev/null):-lpdcurses"
curses_inc_list="$($pkg_config --cflags ncurses 2>/dev/null):"
curses_lib_list="$($pkg_config --libs ncurses 2>/dev/null):-lpdcurses"
else
curses_list="$($pkg_config --libs ncurses 2>/dev/null):-lncurses:-lcurses"
curses_inc_list="$($pkg_config --cflags ncursesw 2>/dev/null):"
curses_lib_list="$($pkg_config --libs ncursesw 2>/dev/null):-lncursesw:-lcursesw"
fi
curses_found=no
cat > $TMPC << EOF
#include <locale.h>
#include <curses.h>
#include <wchar.h>
int main(void) {
const char *s = curses_version();
wchar_t wch = L'w';
setlocale(LC_ALL, "");
resize_term(0, 0);
addwstr(L"wide chars\n");
addnwstr(&wch, 1);
return s != 0;
}
EOF
IFS=:
for curses_lib in $curses_list; do
unset IFS
if compile_prog "" "$curses_lib" ; then
curses_found=yes
libs_softmmu="$curses_lib $libs_softmmu"
break
fi
for curses_inc in $curses_inc_list; do
for curses_lib in $curses_lib_list; do
unset IFS
if compile_prog "$curses_inc" "$curses_lib" ; then
curses_found=yes
QEMU_CFLAGS="$curses_inc $QEMU_CFLAGS"
libs_softmmu="$curses_lib $libs_softmmu"
break
fi
done
done
unset IFS
if test "$curses_found" = "yes" ; then
@@ -4521,6 +4535,55 @@ if compile_prog "" "" ; then
int128=yes
fi
#########################################
# See if 128-bit atomic operations are supported.
atomic128=no
if test "$int128" = "yes"; then
cat > $TMPC << EOF
int main(void)
{
unsigned __int128 x = 0, y = 0;
y = __atomic_load_16(&x, 0);
__atomic_store_16(&x, y, 0);
__atomic_compare_exchange_16(&x, &y, x, 0, 0, 0);
return 0;
}
EOF
if compile_prog "" "" ; then
atomic128=yes
fi
fi
#########################################
# See if 64-bit atomic operations are supported.
# Note that without __atomic builtins, we can only
# assume atomic loads/stores max at pointer size.
cat > $TMPC << EOF
#include <stdint.h>
int main(void)
{
uint64_t x = 0, y = 0;
#ifdef __ATOMIC_RELAXED
y = __atomic_load_8(&x, 0);
__atomic_store_8(&x, y, 0);
__atomic_compare_exchange_8(&x, &y, x, 0, 0, 0);
__atomic_exchange_8(&x, y, 0);
__atomic_fetch_add_8(&x, y, 0);
#else
typedef char is_host64[sizeof(void *) >= sizeof(uint64_t) ? 1 : -1];
__sync_lock_test_and_set(&x, y);
__sync_val_compare_and_swap(&x, y, 0);
__sync_fetch_and_add(&x, y);
#endif
return 0;
}
EOF
if compile_prog "" "" ; then
atomic64=yes
fi
########################################
# check if getauxval is available.
@@ -5483,6 +5546,14 @@ if test "$int128" = "yes" ; then
echo "CONFIG_INT128=y" >> $config_host_mak
fi
if test "$atomic128" = "yes" ; then
echo "CONFIG_ATOMIC128=y" >> $config_host_mak
fi
if test "$atomic64" = "yes" ; then
echo "CONFIG_ATOMIC64=y" >> $config_host_mak
fi
if test "$getauxval" = "yes" ; then
echo "CONFIG_GETAUXVAL=y" >> $config_host_mak
fi

View File

@@ -77,3 +77,9 @@ void cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc)
}
siglongjmp(cpu->jmp_env, 1);
}
void cpu_loop_exit_atomic(CPUState *cpu, uintptr_t pc)
{
cpu->exception_index = EXCP_ATOMIC;
cpu_loop_exit_restore(cpu, pc);
}

View File

@@ -151,12 +151,6 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
&& qemu_log_in_addr_range(itb->pc)) {
#if defined(TARGET_I386)
log_cpu_state(cpu, CPU_DUMP_CCOP);
#elif defined(TARGET_M68K)
/* ??? Should not modify env state for dumping. */
cpu_m68k_flush_flags(env, env->cc_op);
env->cc_op = CC_OP_FLAGS;
env->sr = (env->sr & 0xffe0) | env->cc_dest | (env->cc_x << 4);
log_cpu_state(cpu, 0);
#else
log_cpu_state(cpu, 0);
#endif
@@ -222,6 +216,36 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
}
#endif
static void cpu_exec_step(CPUState *cpu)
{
CPUArchState *env = (CPUArchState *)cpu->env_ptr;
TranslationBlock *tb;
target_ulong cs_base, pc;
uint32_t flags;
cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
tb = tb_gen_code(cpu, pc, cs_base, flags,
1 | CF_NOCACHE | CF_IGNORE_ICOUNT);
tb->orig_tb = NULL;
/* execute the generated code */
trace_exec_tb_nocache(tb, pc);
cpu_tb_exec(cpu, tb);
tb_phys_invalidate(tb, -1);
tb_free(tb);
}
void cpu_exec_step_atomic(CPUState *cpu)
{
start_exclusive();
/* Since we got here, we know that parallel_cpus must be true. */
parallel_cpus = false;
cpu_exec_step(cpu);
parallel_cpus = true;
end_exclusive();
}
struct tb_desc {
target_ulong pc;
target_ulong cs_base;

2
cpus.c
View File

@@ -1497,6 +1497,8 @@ static void tcg_exec_all(void)
if (r == EXCP_DEBUG) {
cpu_handle_guest_debug(cpu);
break;
} else if (r == EXCP_ATOMIC) {
cpu_exec_step_atomic(cpu);
}
} else if (cpu->stop || cpu->stopped) {
if (cpu->unplug) {

203
cputlb.c
View File

@@ -23,15 +23,15 @@
#include "exec/memory.h"
#include "exec/address-spaces.h"
#include "exec/cpu_ldst.h"
#include "exec/cputlb.h"
#include "exec/memory-internal.h"
#include "exec/ram_addr.h"
#include "exec/exec-all.h"
#include "tcg/tcg.h"
#include "qemu/error-report.h"
#include "exec/log.h"
#include "exec/helper-proto.h"
#include "qemu/atomic.h"
/* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */
/* #define DEBUG_TLB */
@@ -498,6 +498,43 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env1, target_ulong addr)
return qemu_ram_addr_from_host_nofail(p);
}
static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
target_ulong addr, uintptr_t retaddr, int size)
{
CPUState *cpu = ENV_GET_CPU(env);
hwaddr physaddr = iotlbentry->addr;
MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs);
uint64_t val;
physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
cpu->mem_io_pc = retaddr;
if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
cpu_io_recompile(cpu, retaddr);
}
cpu->mem_io_vaddr = addr;
memory_region_dispatch_read(mr, physaddr, &val, size, iotlbentry->attrs);
return val;
}
static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
uint64_t val, target_ulong addr,
uintptr_t retaddr, int size)
{
CPUState *cpu = ENV_GET_CPU(env);
hwaddr physaddr = iotlbentry->addr;
MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs);
physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
cpu_io_recompile(cpu, retaddr);
}
cpu->mem_io_vaddr = addr;
cpu->mem_io_pc = retaddr;
memory_region_dispatch_write(mr, physaddr, val, size, iotlbentry->attrs);
}
/* Return true if ADDR is present in the victim tlb, and has been copied
back to the main tlb. */
static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
@@ -527,34 +564,178 @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
victim_tlb_hit(env, mmu_idx, index, offsetof(CPUTLBEntry, TY), \
(ADDR) & TARGET_PAGE_MASK)
/* Probe for whether the specified guest write access is permitted.
* If it is not permitted then an exception will be taken in the same
* way as if this were a real write access (and we will not return).
* Otherwise the function will return, and there will be a valid
* entry in the TLB for this access.
*/
void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
uintptr_t retaddr)
{
int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
if ((addr & TARGET_PAGE_MASK)
!= (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
/* TLB entry is for a different page */
if (!VICTIM_TLB_HIT(addr_write, addr)) {
tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
}
}
}
/* Probe for a read-modify-write atomic operation. Do not allow unaligned
* operations, or io operations to proceed. Return the host address. */
static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
TCGMemOpIdx oi, uintptr_t retaddr)
{
size_t mmu_idx = get_mmuidx(oi);
size_t index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
CPUTLBEntry *tlbe = &env->tlb_table[mmu_idx][index];
target_ulong tlb_addr = tlbe->addr_write;
TCGMemOp mop = get_memop(oi);
int a_bits = get_alignment_bits(mop);
int s_bits = mop & MO_SIZE;
/* Adjust the given return address. */
retaddr -= GETPC_ADJ;
/* Enforce guest required alignment. */
if (unlikely(a_bits > 0 && (addr & ((1 << a_bits) - 1)))) {
/* ??? Maybe indicate atomic op to cpu_unaligned_access */
cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
mmu_idx, retaddr);
}
/* Enforce qemu required alignment. */
if (unlikely(addr & ((1 << s_bits) - 1))) {
/* We get here if guest alignment was not requested,
or was not enforced by cpu_unaligned_access above.
We might widen the access and emulate, but for now
mark an exception and exit the cpu loop. */
goto stop_the_world;
}
/* Check TLB entry and enforce page permissions. */
if ((addr & TARGET_PAGE_MASK)
!= (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
if (!VICTIM_TLB_HIT(addr_write, addr)) {
tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
}
tlb_addr = tlbe->addr_write;
}
/* Notice an IO access, or a notdirty page. */
if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
/* There's really nothing that can be done to
support this apart from stop-the-world. */
goto stop_the_world;
}
/* Let the guest notice RMW on a write-only page. */
if (unlikely(tlbe->addr_read != tlb_addr)) {
tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_LOAD, mmu_idx, retaddr);
/* Since we don't support reads and writes to different addresses,
and we do have the proper page loaded for write, this shouldn't
ever return. But just in case, handle via stop-the-world. */
goto stop_the_world;
}
return (void *)((uintptr_t)addr + tlbe->addend);
stop_the_world:
cpu_loop_exit_atomic(ENV_GET_CPU(env), retaddr);
}
#ifdef TARGET_WORDS_BIGENDIAN
# define TGT_BE(X) (X)
# define TGT_LE(X) BSWAP(X)
#else
# define TGT_BE(X) BSWAP(X)
# define TGT_LE(X) (X)
#endif
#define MMUSUFFIX _mmu
#define SHIFT 0
#define DATA_SIZE 1
#include "softmmu_template.h"
#define SHIFT 1
#define DATA_SIZE 2
#include "softmmu_template.h"
#define SHIFT 2
#define DATA_SIZE 4
#include "softmmu_template.h"
#define SHIFT 3
#define DATA_SIZE 8
#include "softmmu_template.h"
/* First set of helpers allows passing in of OI and RETADDR. This makes
them callable from other helpers. */
#define EXTRA_ARGS , TCGMemOpIdx oi, uintptr_t retaddr
#define ATOMIC_NAME(X) \
HELPER(glue(glue(glue(atomic_ ## X, SUFFIX), END), _mmu))
#define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, oi, retaddr)
#define DATA_SIZE 1
#include "atomic_template.h"
#define DATA_SIZE 2
#include "atomic_template.h"
#define DATA_SIZE 4
#include "atomic_template.h"
#ifdef CONFIG_ATOMIC64
#define DATA_SIZE 8
#include "atomic_template.h"
#endif
#ifdef CONFIG_ATOMIC128
#define DATA_SIZE 16
#include "atomic_template.h"
#endif
/* Second set of helpers are directly callable from TCG as helpers. */
#undef EXTRA_ARGS
#undef ATOMIC_NAME
#undef ATOMIC_MMU_LOOKUP
#define EXTRA_ARGS , TCGMemOpIdx oi
#define ATOMIC_NAME(X) HELPER(glue(glue(atomic_ ## X, SUFFIX), END))
#define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, oi, GETPC())
#define DATA_SIZE 1
#include "atomic_template.h"
#define DATA_SIZE 2
#include "atomic_template.h"
#define DATA_SIZE 4
#include "atomic_template.h"
#ifdef CONFIG_ATOMIC64
#define DATA_SIZE 8
#include "atomic_template.h"
#endif
/* Code access functions. */
#undef MMUSUFFIX
#define MMUSUFFIX _cmmu
#undef GETPC
#define GETPC() ((uintptr_t)0)
#define SOFTMMU_CODE_ACCESS
#define SHIFT 0
#define DATA_SIZE 1
#include "softmmu_template.h"
#define SHIFT 1
#define DATA_SIZE 2
#include "softmmu_template.h"
#define SHIFT 2
#define DATA_SIZE 4
#include "softmmu_template.h"
#define SHIFT 3
#define DATA_SIZE 8
#include "softmmu_template.h"

4
exec.c
View File

@@ -352,9 +352,9 @@ static inline bool section_covers_addr(const MemoryRegionSection *section,
/* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
* the section must cover the entire address space.
*/
return section->size.hi ||
return int128_gethi(section->size) ||
range_covers_byte(section->offset_within_address_space,
section->size.lo, addr);
int128_getlo(section->size), addr);
}
static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr addr,

View File

@@ -416,7 +416,8 @@ static bool intel_hda_xfer(HDACodecDevice *dev, uint32_t stnr, bool output,
}
left = len;
while (left > 0) {
s = st->bentries;
while (left > 0 && s-- > 0) {
copy = left;
if (copy > st->bsize - st->lpib)
copy = st->bsize - st->lpib;

View File

@@ -1278,11 +1278,10 @@ e1000e_write_lgcy_rx_descr(E1000ECore *core, uint8_t *desc,
struct e1000_rx_desc *d = (struct e1000_rx_desc *) desc;
memset(d, 0, sizeof(*d));
assert(!rss_info->enabled);
d->length = cpu_to_le16(length);
d->csum = 0;
e1000e_build_rx_metadata(core, pkt, pkt != NULL,
rss_info,
@@ -1291,6 +1290,7 @@ e1000e_write_lgcy_rx_descr(E1000ECore *core, uint8_t *desc,
&d->special);
d->errors = (uint8_t) (le32_to_cpu(status_flags) >> 24);
d->status = (uint8_t) le32_to_cpu(status_flags);
d->special = 0;
}
static inline void
@@ -1301,7 +1301,7 @@ e1000e_write_ext_rx_descr(E1000ECore *core, uint8_t *desc,
{
union e1000_rx_desc_extended *d = (union e1000_rx_desc_extended *) desc;
memset(d, 0, sizeof(*d));
memset(&d->wb, 0, sizeof(d->wb));
d->wb.upper.length = cpu_to_le16(length);
@@ -1325,7 +1325,7 @@ e1000e_write_ps_rx_descr(E1000ECore *core, uint8_t *desc,
union e1000_rx_desc_packet_split *d =
(union e1000_rx_desc_packet_split *) desc;
memset(d, 0, sizeof(*d));
memset(&d->wb, 0, sizeof(d->wb));
d->wb.middle.length0 = cpu_to_le16((*written)[0]);

View File

@@ -1843,6 +1843,7 @@ static void pci_nic_uninit(PCIDevice *pci_dev)
EEPRO100State *s = DO_UPCAST(EEPRO100State, dev, pci_dev);
vmstate_unregister(&pci_dev->qdev, s->vmstate, s);
g_free(s->vmstate);
eeprom93xx_free(&pci_dev->qdev, s->eeprom);
qemu_del_nic(s->nic);
}

View File

@@ -302,7 +302,7 @@ static inline void pcnet_tmd_load(PCNetState *s, struct pcnet_TMD *tmd,
uint32_t tbadr;
int16_t length;
int16_t status;
} xda;
} xda;
s->phys_mem_read(s->dma_opaque, addr, (void *)&xda, sizeof(xda), 0);
tmd->tbadr = le32_to_cpu(xda.tbadr) & 0xffffff;
tmd->length = le16_to_cpu(xda.length);
@@ -664,7 +664,9 @@ static inline int ladr_match(PCNetState *s, const uint8_t *buf, int size)
static inline hwaddr pcnet_rdra_addr(PCNetState *s, int idx)
{
while (idx < 1) idx += CSR_RCVRL(s);
while (idx < 1) {
idx += CSR_RCVRL(s);
}
return s->rdra + ((CSR_RCVRL(s) - idx) * (BCR_SWSTYLE(s) ? 16 : 8));
}
@@ -672,8 +674,10 @@ static inline int64_t pcnet_get_next_poll_time(PCNetState *s, int64_t current_ti
{
int64_t next_time = current_time +
(65536 - (CSR_SPND(s) ? 0 : CSR_POLL(s))) * 30;
if (next_time <= current_time)
if (next_time <= current_time) {
next_time = current_time + 1;
}
return next_time;
}
@@ -795,13 +799,13 @@ static void pcnet_init(PCNetState *s)
mode = le16_to_cpu(initblk.mode);
rlen = initblk.rlen >> 4;
tlen = initblk.tlen >> 4;
ladrf[0] = le16_to_cpu(initblk.ladrf[0]);
ladrf[1] = le16_to_cpu(initblk.ladrf[1]);
ladrf[2] = le16_to_cpu(initblk.ladrf[2]);
ladrf[3] = le16_to_cpu(initblk.ladrf[3]);
padr[0] = le16_to_cpu(initblk.padr[0]);
padr[1] = le16_to_cpu(initblk.padr[1]);
padr[2] = le16_to_cpu(initblk.padr[2]);
ladrf[0] = le16_to_cpu(initblk.ladrf[0]);
ladrf[1] = le16_to_cpu(initblk.ladrf[1]);
ladrf[2] = le16_to_cpu(initblk.ladrf[2]);
ladrf[3] = le16_to_cpu(initblk.ladrf[3]);
padr[0] = le16_to_cpu(initblk.padr[0]);
padr[1] = le16_to_cpu(initblk.padr[1]);
padr[2] = le16_to_cpu(initblk.padr[2]);
rdra = le32_to_cpu(initblk.rdra);
tdra = le32_to_cpu(initblk.tdra);
} else {
@@ -809,13 +813,13 @@ static void pcnet_init(PCNetState *s)
s->phys_mem_read(s->dma_opaque, PHYSADDR(s,CSR_IADR(s)),
(uint8_t *)&initblk, sizeof(initblk), 0);
mode = le16_to_cpu(initblk.mode);
ladrf[0] = le16_to_cpu(initblk.ladrf[0]);
ladrf[1] = le16_to_cpu(initblk.ladrf[1]);
ladrf[2] = le16_to_cpu(initblk.ladrf[2]);
ladrf[3] = le16_to_cpu(initblk.ladrf[3]);
padr[0] = le16_to_cpu(initblk.padr[0]);
padr[1] = le16_to_cpu(initblk.padr[1]);
padr[2] = le16_to_cpu(initblk.padr[2]);
ladrf[0] = le16_to_cpu(initblk.ladrf[0]);
ladrf[1] = le16_to_cpu(initblk.ladrf[1]);
ladrf[2] = le16_to_cpu(initblk.ladrf[2]);
ladrf[3] = le16_to_cpu(initblk.ladrf[3]);
padr[0] = le16_to_cpu(initblk.padr[0]);
padr[1] = le16_to_cpu(initblk.padr[1]);
padr[2] = le16_to_cpu(initblk.padr[2]);
rdra = le32_to_cpu(initblk.rdra);
tdra = le32_to_cpu(initblk.tdra);
rlen = rdra >> 29;
@@ -858,12 +862,12 @@ static void pcnet_start(PCNetState *s)
printf("pcnet_start\n");
#endif
if (!CSR_DTX(s))
if (!CSR_DTX(s)) {
s->csr[0] |= 0x0010; /* set TXON */
if (!CSR_DRX(s))
}
if (!CSR_DRX(s)) {
s->csr[0] |= 0x0020; /* set RXON */
}
s->csr[0] &= ~0x0004; /* clear STOP bit */
s->csr[0] |= 0x0002;
pcnet_poll_timer(s);
@@ -925,8 +929,7 @@ static void pcnet_rdte_poll(PCNetState *s)
crda);
}
} else {
printf("pcnet: BAD RMD RDA=0x" TARGET_FMT_plx "\n",
crda);
printf("pcnet: BAD RMD RDA=0x" TARGET_FMT_plx "\n", crda);
#endif
}
}
@@ -1168,10 +1171,11 @@ ssize_t pcnet_receive(NetClientState *nc, const uint8_t *buf, size_t size_)
#endif
while (pktcount--) {
if (CSR_RCVRC(s) <= 1)
if (CSR_RCVRC(s) <= 1) {
CSR_RCVRC(s) = CSR_RCVRL(s);
else
} else {
CSR_RCVRC(s)--;
}
}
pcnet_rdte_poll(s);
@@ -1207,7 +1211,7 @@ static void pcnet_transmit(PCNetState *s)
s->tx_busy = 1;
txagain:
txagain:
if (pcnet_tdte_poll(s)) {
struct pcnet_TMD tmd;
@@ -1251,7 +1255,7 @@ static void pcnet_transmit(PCNetState *s)
s->phys_mem_read(s->dma_opaque, PHYSADDR(s, tmd.tbadr),
s->buffer + s->xmit_pos, bcnt, CSR_BSWP(s));
s->xmit_pos += bcnt;
if (!GET_FIELD(tmd.status, TMDS, ENP)) {
goto txdone;
}
@@ -1276,21 +1280,22 @@ static void pcnet_transmit(PCNetState *s)
s->csr[4] |= 0x0004; /* set TXSTRT */
s->xmit_pos = -1;
txdone:
txdone:
SET_FIELD(&tmd.status, TMDS, OWN, 0);
TMDSTORE(&tmd, PHYSADDR(s,CSR_CXDA(s)));
if (!CSR_TOKINTD(s) || (CSR_LTINTEN(s) && GET_FIELD(tmd.status, TMDS, LTINT)))
if (!CSR_TOKINTD(s)
|| (CSR_LTINTEN(s) && GET_FIELD(tmd.status, TMDS, LTINT))) {
s->csr[0] |= 0x0200; /* set TINT */
if (CSR_XMTRC(s)<=1)
}
if (CSR_XMTRC(s) <= 1) {
CSR_XMTRC(s) = CSR_XMTRL(s);
else
} else {
CSR_XMTRC(s)--;
if (count--)
}
if (count--) {
goto txagain;
} else
if (s->xmit_pos >= 0) {
}
} else if (s->xmit_pos >= 0) {
struct pcnet_TMD tmd;
TMDLOAD(&tmd, xmit_cxda);
SET_FIELD(&tmd.misc, TMDM, BUFF, 1);
@@ -1301,9 +1306,9 @@ static void pcnet_transmit(PCNetState *s)
s->csr[0] |= 0x0200; /* set TINT */
if (!CSR_DXSUFLO(s)) {
s->csr[0] &= ~0x0010;
} else
if (count--)
goto txagain;
} else if (count--) {
goto txagain;
}
}
s->tx_busy = 0;
@@ -1315,13 +1320,11 @@ static void pcnet_poll(PCNetState *s)
pcnet_rdte_poll(s);
}
if (CSR_TDMD(s) ||
(CSR_TXON(s) && !CSR_DPOLL(s) && pcnet_tdte_poll(s)))
{
if (CSR_TDMD(s) || (CSR_TXON(s) && !CSR_DPOLL(s) && pcnet_tdte_poll(s))) {
/* prevent recursion */
if (s->tx_busy)
if (s->tx_busy) {
return;
}
pcnet_transmit(s);
}
}
@@ -1340,15 +1343,16 @@ static void pcnet_poll_timer(void *opaque)
if (!CSR_STOP(s) && !CSR_SPND(s) && !CSR_DPOLL(s)) {
uint64_t now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) * 33;
if (!s->timer || !now)
if (!s->timer || !now) {
s->timer = now;
else {
} else {
uint64_t t = now - s->timer + CSR_POLL(s);
if (t > 0xffffLL) {
pcnet_poll(s);
CSR_POLL(s) = CSR_PINT(s);
} else
} else {
CSR_POLL(s) = t;
}
}
timer_mod(s->poll_timer,
pcnet_get_next_poll_time(s,qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL)));
@@ -1371,21 +1375,21 @@ static void pcnet_csr_writew(PCNetState *s, uint32_t rap, uint32_t new_value)
val = (val & 0x007f) | (s->csr[0] & 0x7f00);
/* IFF STOP, STRT and INIT are set, clear STRT and INIT */
if ((val&7) == 7)
val &= ~3;
if (!CSR_STOP(s) && (val & 4))
if ((val & 7) == 7) {
val &= ~3;
}
if (!CSR_STOP(s) && (val & 4)) {
pcnet_stop(s);
if (!CSR_INIT(s) && (val & 1))
}
if (!CSR_INIT(s) && (val & 1)) {
pcnet_init(s);
if (!CSR_STRT(s) && (val & 2))
}
if (!CSR_STRT(s) && (val & 2)) {
pcnet_start(s);
if (CSR_TDMD(s))
}
if (CSR_TDMD(s)) {
pcnet_transmit(s);
}
return;
case 1:
case 2:
@@ -1429,12 +1433,16 @@ static void pcnet_csr_writew(PCNetState *s, uint32_t rap, uint32_t new_value)
case 47: /* POLLINT */
case 72:
case 74:
break;
case 76: /* RCVRL */
case 78: /* XMTRL */
val = (val > 0) ? val : 512;
break;
case 112:
if (CSR_STOP(s) || CSR_SPND(s))
break;
return;
if (CSR_STOP(s) || CSR_SPND(s)) {
break;
}
return;
case 3:
break;
case 4:
@@ -1651,8 +1659,7 @@ void pcnet_ioport_writel(void *opaque, uint32_t addr, uint32_t val)
pcnet_bcr_writew(s, s->rap, val & 0xffff);
break;
}
} else
if ((addr & 0x0f) == 0) {
} else if ((addr & 0x0f) == 0) {
/* switch device to dword i/o mode */
pcnet_bcr_writew(s, BCR_BSBC, pcnet_bcr_readw(s, BCR_BSBC) | 0x0080);
#ifdef PCNET_DEBUG_IO

View File

@@ -860,7 +860,7 @@ static void rocker_io_writel(void *opaque, hwaddr addr, uint32_t val)
rocker_msix_irq(r, val);
break;
case ROCKER_TEST_DMA_SIZE:
r->test_dma_size = val;
r->test_dma_size = val & 0xFFFF;
break;
case ROCKER_TEST_DMA_ADDR + 4:
r->test_dma_addr = ((uint64_t)val) << 32 | r->lower32;

View File

@@ -2350,7 +2350,7 @@ static void rtl8139_cplus_transmit(RTL8139State *s)
{
int txcount = 0;
while (rtl8139_cplus_transmit_one(s))
while (txcount < 64 && rtl8139_cplus_transmit_one(s))
{
++txcount;
}

View File

@@ -531,6 +531,7 @@ static void vmxnet3_complete_packet(VMXNET3State *s, int qidx, uint32_t tx_ridx)
VMXNET3_RING_DUMP(VMW_RIPRN, "TXC", qidx, &s->txq_descr[qidx].comp_ring);
memset(&txcq_descr, 0, sizeof(txcq_descr));
txcq_descr.txdIdx = tx_ridx;
txcq_descr.gen = vmxnet3_ring_curr_gen(&s->txq_descr[qidx].comp_ring);

View File

@@ -31,6 +31,7 @@
#define EXCP_DEBUG 0x10002 /* cpu stopped after a breakpoint or singlestep */
#define EXCP_HALTED 0x10003 /* cpu is halted (waiting for external event) */
#define EXCP_YIELD 0x10004 /* cpu wants to yield timeslice to another */
#define EXCP_ATOMIC 0x10005 /* stop-the-world and emulate atomic */
/* some important defines:
*

View File

@@ -59,6 +59,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
void QEMU_NORETURN cpu_loop_exit(CPUState *cpu);
void QEMU_NORETURN cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc);
void QEMU_NORETURN cpu_loop_exit_atomic(CPUState *cpu, uintptr_t pc);
#if !defined(CONFIG_USER_ONLY)
void cpu_reloading_memory_map(void);

View File

@@ -80,6 +80,7 @@ void tcg_exec_init(unsigned long tb_size);
bool tcg_enabled(void);
void cpu_exec_init_all(void);
void cpu_exec_step_atomic(CPUState *cpu);
/**
* set_preferred_target_page_bits:

View File

@@ -99,15 +99,21 @@
* no effect on the generated code but not using the atomic primitives
* will get flagged by sanitizers as a violation.
*/
#define atomic_read__nocheck(ptr) \
__atomic_load_n(ptr, __ATOMIC_RELAXED)
#define atomic_read(ptr) \
({ \
QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \
__atomic_load_n(ptr, __ATOMIC_RELAXED); \
atomic_read__nocheck(ptr); \
})
#define atomic_set__nocheck(ptr, i) \
__atomic_store_n(ptr, i, __ATOMIC_RELAXED)
#define atomic_set(ptr, i) do { \
QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \
__atomic_store_n(ptr, i, __ATOMIC_RELAXED); \
atomic_set__nocheck(ptr, i); \
} while(0)
/* See above: most compilers currently treat consume and acquire the
@@ -151,20 +157,27 @@
/* All the remaining operations are fully sequentially consistent */
#define atomic_xchg__nocheck(ptr, i) ({ \
__atomic_exchange_n(ptr, (i), __ATOMIC_SEQ_CST); \
})
#define atomic_xchg(ptr, i) ({ \
QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \
__atomic_exchange_n(ptr, i, __ATOMIC_SEQ_CST); \
atomic_xchg__nocheck(ptr, i); \
})
/* Returns the eventual value, failed or not */
#define atomic_cmpxchg(ptr, old, new) \
({ \
QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \
#define atomic_cmpxchg__nocheck(ptr, old, new) ({ \
typeof_strip_qual(*ptr) _old = (old); \
__atomic_compare_exchange_n(ptr, &_old, new, false, \
__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); \
_old; \
})
})
#define atomic_cmpxchg(ptr, old, new) ({ \
QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \
atomic_cmpxchg__nocheck(ptr, old, new); \
})
/* Provide shorter names for GCC atomic builtins, return old value */
#define atomic_fetch_inc(ptr) __atomic_fetch_add(ptr, 1, __ATOMIC_SEQ_CST)
@@ -173,6 +186,15 @@
#define atomic_fetch_sub(ptr, n) __atomic_fetch_sub(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_fetch_and(ptr, n) __atomic_fetch_and(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_fetch_or(ptr, n) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_fetch_xor(ptr, n) __atomic_fetch_xor(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_inc_fetch(ptr) __atomic_add_fetch(ptr, 1, __ATOMIC_SEQ_CST)
#define atomic_dec_fetch(ptr) __atomic_sub_fetch(ptr, 1, __ATOMIC_SEQ_CST)
#define atomic_add_fetch(ptr, n) __atomic_add_fetch(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_sub_fetch(ptr, n) __atomic_sub_fetch(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_and_fetch(ptr, n) __atomic_and_fetch(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_or_fetch(ptr, n) __atomic_or_fetch(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_xor_fetch(ptr, n) __atomic_xor_fetch(ptr, n, __ATOMIC_SEQ_CST)
/* And even shorter names that return void. */
#define atomic_inc(ptr) ((void) __atomic_fetch_add(ptr, 1, __ATOMIC_SEQ_CST))
@@ -181,6 +203,7 @@
#define atomic_sub(ptr, n) ((void) __atomic_fetch_sub(ptr, n, __ATOMIC_SEQ_CST))
#define atomic_and(ptr, n) ((void) __atomic_fetch_and(ptr, n, __ATOMIC_SEQ_CST))
#define atomic_or(ptr, n) ((void) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST))
#define atomic_xor(ptr, n) ((void) __atomic_fetch_xor(ptr, n, __ATOMIC_SEQ_CST))
#else /* __ATOMIC_RELAXED */
@@ -269,8 +292,11 @@
/* These will only be atomic if the processor does the fetch or store
* in a single issue memory operation
*/
#define atomic_read(ptr) (*(__typeof__(*ptr) volatile*) (ptr))
#define atomic_set(ptr, i) ((*(__typeof__(*ptr) volatile*) (ptr)) = (i))
#define atomic_read__nocheck(p) (*(__typeof__(*(p)) volatile*) (p))
#define atomic_set__nocheck(p, i) ((*(__typeof__(*(p)) volatile*) (p)) = (i))
#define atomic_read(ptr) atomic_read__nocheck(ptr)
#define atomic_set(ptr, i) atomic_set__nocheck(ptr,i)
/**
* atomic_rcu_read - reads a RCU-protected pointer to a local variable
@@ -331,15 +357,27 @@
#define atomic_xchg(ptr, i) (smp_mb(), __sync_lock_test_and_set(ptr, i))
#endif
#endif
#define atomic_xchg__nocheck atomic_xchg
/* Provide shorter names for GCC atomic builtins. */
#define atomic_fetch_inc(ptr) __sync_fetch_and_add(ptr, 1)
#define atomic_fetch_dec(ptr) __sync_fetch_and_add(ptr, -1)
#define atomic_fetch_add __sync_fetch_and_add
#define atomic_fetch_sub __sync_fetch_and_sub
#define atomic_fetch_and __sync_fetch_and_and
#define atomic_fetch_or __sync_fetch_and_or
#define atomic_cmpxchg __sync_val_compare_and_swap
#define atomic_fetch_add(ptr, n) __sync_fetch_and_add(ptr, n)
#define atomic_fetch_sub(ptr, n) __sync_fetch_and_sub(ptr, n)
#define atomic_fetch_and(ptr, n) __sync_fetch_and_and(ptr, n)
#define atomic_fetch_or(ptr, n) __sync_fetch_and_or(ptr, n)
#define atomic_fetch_xor(ptr, n) __sync_fetch_and_xor(ptr, n)
#define atomic_inc_fetch(ptr) __sync_add_and_fetch(ptr, 1)
#define atomic_dec_fetch(ptr) __sync_add_and_fetch(ptr, -1)
#define atomic_add_fetch(ptr, n) __sync_add_and_fetch(ptr, n)
#define atomic_sub_fetch(ptr, n) __sync_sub_and_fetch(ptr, n)
#define atomic_and_fetch(ptr, n) __sync_and_and_fetch(ptr, n)
#define atomic_or_fetch(ptr, n) __sync_or_and_fetch(ptr, n)
#define atomic_xor_fetch(ptr, n) __sync_xor_and_fetch(ptr, n)
#define atomic_cmpxchg(ptr, old, new) __sync_val_compare_and_swap(ptr, old, new)
#define atomic_cmpxchg__nocheck(ptr, old, new) atomic_cmpxchg(ptr, old, new)
/* And even shorter names that return void. */
#define atomic_inc(ptr) ((void) __sync_fetch_and_add(ptr, 1))
@@ -348,6 +386,7 @@
#define atomic_sub(ptr, n) ((void) __sync_fetch_and_sub(ptr, n))
#define atomic_and(ptr, n) ((void) __sync_fetch_and_and(ptr, n))
#define atomic_or(ptr, n) ((void) __sync_fetch_and_or(ptr, n))
#define atomic_xor(ptr, n) ((void) __sync_fetch_and_xor(ptr, n))
#endif /* __ATOMIC_RELAXED */

View File

@@ -1,6 +1,149 @@
#ifndef INT128_H
#define INT128_H
#ifdef CONFIG_INT128
#include "qemu/bswap.h"
typedef __int128_t Int128;
static inline Int128 int128_make64(uint64_t a)
{
return a;
}
static inline Int128 int128_make128(uint64_t lo, uint64_t hi)
{
return (__uint128_t)hi << 64 | lo;
}
static inline uint64_t int128_get64(Int128 a)
{
uint64_t r = a;
assert(r == a);
return r;
}
static inline uint64_t int128_getlo(Int128 a)
{
return a;
}
static inline int64_t int128_gethi(Int128 a)
{
return a >> 64;
}
static inline Int128 int128_zero(void)
{
return 0;
}
static inline Int128 int128_one(void)
{
return 1;
}
static inline Int128 int128_2_64(void)
{
return (Int128)1 << 64;
}
static inline Int128 int128_exts64(int64_t a)
{
return a;
}
static inline Int128 int128_and(Int128 a, Int128 b)
{
return a & b;
}
static inline Int128 int128_rshift(Int128 a, int n)
{
return a >> n;
}
static inline Int128 int128_add(Int128 a, Int128 b)
{
return a + b;
}
static inline Int128 int128_neg(Int128 a)
{
return -a;
}
static inline Int128 int128_sub(Int128 a, Int128 b)
{
return a - b;
}
static inline bool int128_nonneg(Int128 a)
{
return a >= 0;
}
static inline bool int128_eq(Int128 a, Int128 b)
{
return a == b;
}
static inline bool int128_ne(Int128 a, Int128 b)
{
return a != b;
}
static inline bool int128_ge(Int128 a, Int128 b)
{
return a >= b;
}
static inline bool int128_lt(Int128 a, Int128 b)
{
return a < b;
}
static inline bool int128_le(Int128 a, Int128 b)
{
return a <= b;
}
static inline bool int128_gt(Int128 a, Int128 b)
{
return a > b;
}
static inline bool int128_nz(Int128 a)
{
return a != 0;
}
static inline Int128 int128_min(Int128 a, Int128 b)
{
return a < b ? a : b;
}
static inline Int128 int128_max(Int128 a, Int128 b)
{
return a > b ? a : b;
}
static inline void int128_addto(Int128 *a, Int128 b)
{
*a += b;
}
static inline void int128_subfrom(Int128 *a, Int128 b)
{
*a -= b;
}
static inline Int128 bswap128(Int128 a)
{
return int128_make128(bswap64(int128_gethi(a)), bswap64(int128_getlo(a)));
}
#else /* !CONFIG_INT128 */
typedef struct Int128 Int128;
@@ -14,12 +157,27 @@ static inline Int128 int128_make64(uint64_t a)
return (Int128) { a, 0 };
}
static inline Int128 int128_make128(uint64_t lo, uint64_t hi)
{
return (Int128) { lo, hi };
}
static inline uint64_t int128_get64(Int128 a)
{
assert(!a.hi);
return a.lo;
}
static inline uint64_t int128_getlo(Int128 a)
{
return a.lo;
}
static inline int64_t int128_gethi(Int128 a)
{
return a.hi;
}
static inline Int128 int128_zero(void)
{
return int128_make64(0);
@@ -53,9 +211,9 @@ static inline Int128 int128_rshift(Int128 a, int n)
}
h = a.hi >> (n & 63);
if (n >= 64) {
return (Int128) { h, h >> 63 };
return int128_make128(h, h >> 63);
} else {
return (Int128) { (a.lo >> n) | ((uint64_t)a.hi << (64 - n)), h };
return int128_make128((a.lo >> n) | ((uint64_t)a.hi << (64 - n)), h);
}
}
@@ -69,18 +227,18 @@ static inline Int128 int128_add(Int128 a, Int128 b)
*
* So the carry is lo < a.lo.
*/
return (Int128) { lo, (uint64_t)a.hi + b.hi + (lo < a.lo) };
return int128_make128(lo, (uint64_t)a.hi + b.hi + (lo < a.lo));
}
static inline Int128 int128_neg(Int128 a)
{
uint64_t lo = -a.lo;
return (Int128) { lo, ~(uint64_t)a.hi + !lo };
return int128_make128(lo, ~(uint64_t)a.hi + !lo);
}
static inline Int128 int128_sub(Int128 a, Int128 b)
{
return (Int128){ a.lo - b.lo, (uint64_t)a.hi - b.hi - (a.lo < b.lo) };
return int128_make128(a.lo - b.lo, (uint64_t)a.hi - b.hi - (a.lo < b.lo));
}
static inline bool int128_nonneg(Int128 a)
@@ -143,4 +301,5 @@ static inline void int128_subfrom(Int128 *a, Int128 b)
*a = int128_sub(*a, b);
}
#endif
#endif /* CONFIG_INT128 */
#endif /* INT128_H */

View File

@@ -354,6 +354,9 @@ void cpu_loop(CPUX86State *env)
}
}
break;
case EXCP_ATOMIC:
cpu_exec_step_atomic(cs);
break;
default:
pc = env->segs[R_CS].base + env->eip;
EXCP_DUMP(env, "qemu: 0x%08lx: unhandled CPU exception 0x%x - aborting\n",
@@ -550,94 +553,6 @@ do_kernel_trap(CPUARMState *env)
return 0;
}
/* Store exclusive handling for AArch32 */
static int do_strex(CPUARMState *env)
{
uint64_t val;
int size;
int rc = 1;
int segv = 0;
uint32_t addr;
start_exclusive();
if (env->exclusive_addr != env->exclusive_test) {
goto fail;
}
/* We know we're always AArch32 so the address is in uint32_t range
* unless it was the -1 exclusive-monitor-lost value (which won't
* match exclusive_test above).
*/
assert(extract64(env->exclusive_addr, 32, 32) == 0);
addr = env->exclusive_addr;
size = env->exclusive_info & 0xf;
switch (size) {
case 0:
segv = get_user_u8(val, addr);
break;
case 1:
segv = get_user_data_u16(val, addr, env);
break;
case 2:
case 3:
segv = get_user_data_u32(val, addr, env);
break;
default:
abort();
}
if (segv) {
env->exception.vaddress = addr;
goto done;
}
if (size == 3) {
uint32_t valhi;
segv = get_user_data_u32(valhi, addr + 4, env);
if (segv) {
env->exception.vaddress = addr + 4;
goto done;
}
if (arm_cpu_bswap_data(env)) {
val = deposit64((uint64_t)valhi, 32, 32, val);
} else {
val = deposit64(val, 32, 32, valhi);
}
}
if (val != env->exclusive_val) {
goto fail;
}
val = env->regs[(env->exclusive_info >> 8) & 0xf];
switch (size) {
case 0:
segv = put_user_u8(val, addr);
break;
case 1:
segv = put_user_data_u16(val, addr, env);
break;
case 2:
case 3:
segv = put_user_data_u32(val, addr, env);
break;
}
if (segv) {
env->exception.vaddress = addr;
goto done;
}
if (size == 3) {
val = env->regs[(env->exclusive_info >> 12) & 0xf];
segv = put_user_data_u32(val, addr + 4, env);
if (segv) {
env->exception.vaddress = addr + 4;
goto done;
}
}
rc = 0;
fail:
env->regs[15] += 4;
env->regs[(env->exclusive_info >> 4) & 0xf] = rc;
done:
end_exclusive();
return segv;
}
void cpu_loop(CPUARMState *env)
{
CPUState *cs = CPU(arm_env_get_cpu(env));
@@ -812,11 +727,6 @@ void cpu_loop(CPUARMState *env)
case EXCP_INTERRUPT:
/* just indicate that signals should be handled asap */
break;
case EXCP_STREX:
if (!do_strex(env)) {
break;
}
/* fall through for segv */
case EXCP_PREFETCH_ABORT:
case EXCP_DATA_ABORT:
addr = env->exception.vaddress;
@@ -851,6 +761,9 @@ void cpu_loop(CPUARMState *env)
case EXCP_YIELD:
/* nothing to do here for user-mode, just resume guest code */
break;
case EXCP_ATOMIC:
cpu_exec_step_atomic(cs);
break;
default:
error:
EXCP_DUMP(env, "qemu: unhandled CPU exception 0x%x - aborting\n", trapnr);
@@ -862,124 +775,6 @@ void cpu_loop(CPUARMState *env)
#else
/*
* Handle AArch64 store-release exclusive
*
* rs = gets the status result of store exclusive
* rt = is the register that is stored
* rt2 = is the second register store (in STP)
*
*/
static int do_strex_a64(CPUARMState *env)
{
uint64_t val;
int size;
bool is_pair;
int rc = 1;
int segv = 0;
uint64_t addr;
int rs, rt, rt2;
start_exclusive();
/* size | is_pair << 2 | (rs << 4) | (rt << 9) | (rt2 << 14)); */
size = extract32(env->exclusive_info, 0, 2);
is_pair = extract32(env->exclusive_info, 2, 1);
rs = extract32(env->exclusive_info, 4, 5);
rt = extract32(env->exclusive_info, 9, 5);
rt2 = extract32(env->exclusive_info, 14, 5);
addr = env->exclusive_addr;
if (addr != env->exclusive_test) {
goto finish;
}
switch (size) {
case 0:
segv = get_user_u8(val, addr);
break;
case 1:
segv = get_user_u16(val, addr);
break;
case 2:
segv = get_user_u32(val, addr);
break;
case 3:
segv = get_user_u64(val, addr);
break;
default:
abort();
}
if (segv) {
env->exception.vaddress = addr;
goto error;
}
if (val != env->exclusive_val) {
goto finish;
}
if (is_pair) {
if (size == 2) {
segv = get_user_u32(val, addr + 4);
} else {
segv = get_user_u64(val, addr + 8);
}
if (segv) {
env->exception.vaddress = addr + (size == 2 ? 4 : 8);
goto error;
}
if (val != env->exclusive_high) {
goto finish;
}
}
/* handle the zero register */
val = rt == 31 ? 0 : env->xregs[rt];
switch (size) {
case 0:
segv = put_user_u8(val, addr);
break;
case 1:
segv = put_user_u16(val, addr);
break;
case 2:
segv = put_user_u32(val, addr);
break;
case 3:
segv = put_user_u64(val, addr);
break;
}
if (segv) {
goto error;
}
if (is_pair) {
/* handle the zero register */
val = rt2 == 31 ? 0 : env->xregs[rt2];
if (size == 2) {
segv = put_user_u32(val, addr + 4);
} else {
segv = put_user_u64(val, addr + 8);
}
if (segv) {
env->exception.vaddress = addr + (size == 2 ? 4 : 8);
goto error;
}
}
rc = 0;
finish:
env->pc += 4;
/* rs == 31 encodes a write to the ZR, thus throwing away
* the status return. This is rather silly but valid.
*/
if (rs < 31) {
env->xregs[rs] = rc;
}
error:
/* instruction faulted, PC does not advance */
/* either way a strex releases any exclusive lock we have */
env->exclusive_addr = -1;
end_exclusive();
return segv;
}
/* AArch64 main loop */
void cpu_loop(CPUARMState *env)
{
@@ -1021,11 +816,6 @@ void cpu_loop(CPUARMState *env)
info._sifields._sigfault._addr = env->pc;
queue_signal(env, info.si_signo, QEMU_SI_FAULT, &info);
break;
case EXCP_STREX:
if (!do_strex_a64(env)) {
break;
}
/* fall through for segv */
case EXCP_PREFETCH_ABORT:
case EXCP_DATA_ABORT:
info.si_signo = TARGET_SIGSEGV;
@@ -1051,6 +841,9 @@ void cpu_loop(CPUARMState *env)
case EXCP_YIELD:
/* nothing to do here for user-mode, just resume guest code */
break;
case EXCP_ATOMIC:
cpu_exec_step_atomic(cs);
break;
default:
EXCP_DUMP(env, "qemu: unhandled CPU exception 0x%x - aborting\n", trapnr);
abort();
@@ -1058,8 +851,6 @@ void cpu_loop(CPUARMState *env)
process_pending_signals(env);
/* Exception return on AArch64 always clears the exclusive monitor,
* so any return to running guest code implies this.
* A strex (successful or otherwise) also clears the monitor, so
* we don't need to specialcase EXCP_STREX.
*/
env->exclusive_addr = -1;
}
@@ -1142,6 +933,9 @@ void cpu_loop(CPUUniCore32State *env)
}
}
break;
case EXCP_ATOMIC:
cpu_exec_step_atomic(cs);
break;
default:
goto error;
}
@@ -1415,6 +1209,9 @@ void cpu_loop (CPUSPARCState *env)
}
}
break;
case EXCP_ATOMIC:
cpu_exec_step_atomic(cs);
break;
default:
printf ("Unhandled trap: 0x%x\n", trapnr);
cpu_dump_state(cs, stderr, fprintf, 0);
@@ -1954,6 +1751,9 @@ void cpu_loop(CPUPPCState *env)
case EXCP_INTERRUPT:
/* just indicate that signals should be handled asap */
break;
case EXCP_ATOMIC:
cpu_exec_step_atomic(cs);
break;
default:
cpu_abort(cs, "Unknown exception 0x%x. Aborting\n", trapnr);
break;
@@ -2649,6 +2449,9 @@ done_syscall:
}
}
break;
case EXCP_ATOMIC:
cpu_exec_step_atomic(cs);
break;
default:
error:
EXCP_DUMP(env, "qemu: unhandled CPU exception 0x%x - aborting\n", trapnr);
@@ -2736,6 +2539,9 @@ void cpu_loop(CPUOpenRISCState *env)
case EXCP_NR:
qemu_log_mask(CPU_LOG_INT, "\nNR\n");
break;
case EXCP_ATOMIC:
cpu_exec_step_atomic(cs);
break;
default:
EXCP_DUMP(env, "\nqemu: unhandled CPU exception %#x - aborting\n",
trapnr);
@@ -2812,6 +2618,9 @@ void cpu_loop(CPUSH4State *env)
queue_signal(env, info.si_signo, QEMU_SI_FAULT, &info);
break;
case EXCP_ATOMIC:
cpu_exec_step_atomic(cs);
break;
default:
printf ("Unhandled trap: 0x%x\n", trapnr);
cpu_dump_state(cs, stderr, fprintf, 0);
@@ -2879,6 +2688,9 @@ void cpu_loop(CPUCRISState *env)
}
}
break;
case EXCP_ATOMIC:
cpu_exec_step_atomic(cs);
break;
default:
printf ("Unhandled trap: 0x%x\n", trapnr);
cpu_dump_state(cs, stderr, fprintf, 0);
@@ -2995,6 +2807,9 @@ void cpu_loop(CPUMBState *env)
}
}
break;
case EXCP_ATOMIC:
cpu_exec_step_atomic(cs);
break;
default:
printf ("Unhandled trap: 0x%x\n", trapnr);
cpu_dump_state(cs, stderr, fprintf, 0);
@@ -3098,6 +2913,9 @@ void cpu_loop(CPUM68KState *env)
}
}
break;
case EXCP_ATOMIC:
cpu_exec_step_atomic(cs);
break;
default:
EXCP_DUMP(env, "qemu: unhandled CPU exception 0x%x - aborting\n", trapnr);
abort();
@@ -3108,51 +2926,6 @@ void cpu_loop(CPUM68KState *env)
#endif /* TARGET_M68K */
#ifdef TARGET_ALPHA
static void do_store_exclusive(CPUAlphaState *env, int reg, int quad)
{
target_ulong addr, val, tmp;
target_siginfo_t info;
int ret = 0;
addr = env->lock_addr;
tmp = env->lock_st_addr;
env->lock_addr = -1;
env->lock_st_addr = 0;
start_exclusive();
mmap_lock();
if (addr == tmp) {
if (quad ? get_user_s64(val, addr) : get_user_s32(val, addr)) {
goto do_sigsegv;
}
if (val == env->lock_value) {
tmp = env->ir[reg];
if (quad ? put_user_u64(tmp, addr) : put_user_u32(tmp, addr)) {
goto do_sigsegv;
}
ret = 1;
}
}
env->ir[reg] = ret;
env->pc += 4;
mmap_unlock();
end_exclusive();
return;
do_sigsegv:
mmap_unlock();
end_exclusive();
info.si_signo = TARGET_SIGSEGV;
info.si_errno = 0;
info.si_code = TARGET_SEGV_MAPERR;
info._sifields._sigfault._addr = addr;
queue_signal(env, TARGET_SIGSEGV, QEMU_SI_FAULT, &info);
}
void cpu_loop(CPUAlphaState *env)
{
CPUState *cs = CPU(alpha_env_get_cpu(env));
@@ -3327,13 +3100,12 @@ void cpu_loop(CPUAlphaState *env)
queue_signal(env, info.si_signo, QEMU_SI_FAULT, &info);
}
break;
case EXCP_STL_C:
case EXCP_STQ_C:
do_store_exclusive(env, env->error_code, trapnr - EXCP_STL_C);
break;
case EXCP_INTERRUPT:
/* Just indicate that signals should be handled asap. */
break;
case EXCP_ATOMIC:
cpu_exec_step_atomic(cs);
break;
default:
printf ("Unhandled trap: 0x%x\n", trapnr);
cpu_dump_state(cs, stderr, fprintf, 0);
@@ -3463,6 +3235,9 @@ void cpu_loop(CPUS390XState *env)
queue_signal(env, info.si_signo, QEMU_SI_FAULT, &info);
break;
case EXCP_ATOMIC:
cpu_exec_step_atomic(cs);
break;
default:
fprintf(stderr, "Unhandled trap: 0x%x\n", trapnr);
cpu_dump_state(cs, stderr, fprintf, 0);
@@ -3717,6 +3492,9 @@ void cpu_loop(CPUTLGState *env)
case TILEGX_EXCP_REG_UDN_ACCESS:
gen_sigill_reg(env);
break;
case EXCP_ATOMIC:
cpu_exec_step_atomic(cs);
break;
default:
fprintf(stderr, "trapnr is %d[0x%x].\n", trapnr, trapnr);
g_assert_not_reached();

View File

@@ -6164,6 +6164,14 @@ static int do_fork(CPUArchState *env, unsigned int flags, abi_ulong newsp,
sigfillset(&sigmask);
sigprocmask(SIG_BLOCK, &sigmask, &info.sigmask);
/* If this is our first additional thread, we need to ensure we
* generate code for parallel execution and flush old translations.
*/
if (!parallel_cpus) {
parallel_cpus = true;
tb_flush(cpu);
}
ret = pthread_create(&info.thread, &attr, clone_func, &info);
/* TODO: Free new CPU state if thread creation failed. */
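The hunk above turns on parallel code generation the first time an extra guest thread is created and throws away translations built under the single-threaded assumption. A minimal standalone sketch of that pattern, assuming thread creation is already serialized by the caller (parallel, invalidate_cached_code and start_worker are hypothetical names, not QEMU APIs):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static bool parallel;                    /* single-threaded until proven otherwise */

static void invalidate_cached_code(void)
{
    /* stand-in for tb_flush(): drop code generated without atomic sequences */
}

static int start_worker(void *(*fn)(void *), void *arg, pthread_t *tid)
{
    if (!parallel) {                     /* first additional thread */
        parallel = true;                 /* future codegen must be thread-safe */
        invalidate_cached_code();        /* old code assumed no concurrency */
    }
    return pthread_create(tid, NULL, fn, arg);
}

static void *worker(void *arg)
{
    puts("worker running");
    return NULL;
}

int main(void)
{
    pthread_t tid;
    if (start_worker(worker, NULL, &tid) == 0) {
        pthread_join(tid, NULL);
    }
    return 0;
}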

View File

@@ -188,7 +188,6 @@ static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt)
{
struct tcphdr *ptcp, *stcp;
int res;
char *sdebug, *ddebug;
trace_colo_compare_main("compare tcp");
if (ppkt->size != spkt->size) {
@@ -219,24 +218,21 @@ static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt)
(spkt->size - ETH_HLEN));
if (res != 0 && trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE)) {
sdebug = strdup(inet_ntoa(ppkt->ip->ip_src));
ddebug = strdup(inet_ntoa(ppkt->ip->ip_dst));
fprintf(stderr, "%s: src/dst: %s/%s p: seq/ack=%u/%u"
" s: seq/ack=%u/%u res=%d flags=%x/%x\n",
__func__, sdebug, ddebug,
(unsigned int)ntohl(ptcp->th_seq),
(unsigned int)ntohl(ptcp->th_ack),
(unsigned int)ntohl(stcp->th_seq),
(unsigned int)ntohl(stcp->th_ack),
res, ptcp->th_flags, stcp->th_flags);
trace_colo_compare_pkt_info(inet_ntoa(ppkt->ip->ip_src),
inet_ntoa(ppkt->ip->ip_dst),
ntohl(ptcp->th_seq),
ntohl(ptcp->th_ack),
ntohl(stcp->th_seq),
ntohl(stcp->th_ack),
res, ptcp->th_flags,
stcp->th_flags,
ppkt->size,
spkt->size);
fprintf(stderr, "Primary len = %d\n", ppkt->size);
qemu_hexdump((char *)ppkt->data, stderr, "colo-compare", ppkt->size);
fprintf(stderr, "Secondary len = %d\n", spkt->size);
qemu_hexdump((char *)spkt->data, stderr, "colo-compare", spkt->size);
g_free(sdebug);
g_free(ddebug);
qemu_hexdump((char *)ppkt->data, stderr,
"colo-compare ppkt", ppkt->size);
qemu_hexdump((char *)spkt->data, stderr,
"colo-compare spkt", spkt->size);
}
return res;

View File

@@ -68,15 +68,11 @@ static int handle_primary_tcp_pkt(NetFilterState *nf,
tcp_pkt = (struct tcphdr *)pkt->transport_header;
if (trace_event_get_state(TRACE_COLO_FILTER_REWRITER_DEBUG)) {
char *sdebug, *ddebug;
sdebug = strdup(inet_ntoa(pkt->ip->ip_src));
ddebug = strdup(inet_ntoa(pkt->ip->ip_dst));
trace_colo_filter_rewriter_pkt_info(__func__, sdebug, ddebug,
trace_colo_filter_rewriter_pkt_info(__func__,
inet_ntoa(pkt->ip->ip_src), inet_ntoa(pkt->ip->ip_dst),
ntohl(tcp_pkt->th_seq), ntohl(tcp_pkt->th_ack),
tcp_pkt->th_flags);
trace_colo_filter_rewriter_conn_offset(conn->offset);
g_free(sdebug);
g_free(ddebug);
}
if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) {
@@ -116,15 +112,11 @@ static int handle_secondary_tcp_pkt(NetFilterState *nf,
tcp_pkt = (struct tcphdr *)pkt->transport_header;
if (trace_event_get_state(TRACE_COLO_FILTER_REWRITER_DEBUG)) {
char *sdebug, *ddebug;
sdebug = strdup(inet_ntoa(pkt->ip->ip_src));
ddebug = strdup(inet_ntoa(pkt->ip->ip_dst));
trace_colo_filter_rewriter_pkt_info(__func__, sdebug, ddebug,
trace_colo_filter_rewriter_pkt_info(__func__,
inet_ntoa(pkt->ip->ip_src), inet_ntoa(pkt->ip->ip_dst),
ntohl(tcp_pkt->th_seq), ntohl(tcp_pkt->th_ack),
tcp_pkt->th_flags);
trace_colo_filter_rewriter_conn_offset(conn->offset);
g_free(sdebug);
g_free(ddebug);
}
if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN))) {
@@ -162,6 +154,7 @@ static ssize_t colo_rewriter_receive_iov(NetFilterState *nf,
iov_to_buf(iov, iovcnt, 0, buf, size);
pkt = packet_new(buf, size);
g_free(buf);
/*
* if we get tcp packet

View File

@@ -35,6 +35,10 @@
#include <net/if_tap.h>
#endif
#if defined(__OpenBSD__)
#include <sys/param.h>
#endif
#ifndef __FreeBSD__
int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
int vnet_hdr_required, int mq_required, Error **errp)
@@ -55,7 +59,7 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
if (*ifname) {
snprintf(dname, sizeof dname, "/dev/%s", ifname);
} else {
#if defined(__OpenBSD__)
#if defined(__OpenBSD__) && OpenBSD < 201605
snprintf(dname, sizeof dname, "/dev/tun%d", i);
#else
snprintf(dname, sizeof dname, "/dev/tap%d", i);

View File

@@ -13,6 +13,7 @@ colo_compare_icmp_miscompare(const char *sta, int size) ": %s = %d"
colo_compare_ip_info(int psize, const char *sta, const char *stb, int ssize, const char *stc, const char *std) "ppkt size = %d, ip_src = %s, ip_dst = %s, spkt size = %d, ip_src = %s, ip_dst = %s"
colo_old_packet_check_found(int64_t old_time) "%" PRId64
colo_compare_miscompare(void) ""
colo_compare_pkt_info(const char *src, const char *dst, uint32_t pseq, uint32_t pack, uint32_t sseq, uint32_t sack, int res, uint32_t pflag, uint32_t sflag, int psize, int ssize) "src/dst: %s/%s p: seq/ack=%u/%u s: seq/ack=%u/%u res=%d flags=%x/%x ppkt_size: %d spkt_size: %d\n"
# net/filter-rewriter.c
colo_filter_rewriter_debug(void) ""

View File

@@ -21,7 +21,8 @@ PUBLIC = True
def is_string(arg):
strtype = ('const char*', 'char*', 'const char *', 'char *')
if arg.lstrip().startswith(strtype):
arg_strip = arg.lstrip()
if arg_strip.startswith(strtype) and arg_strip.count('*') == 1:
return True
else:
return False

View File

@@ -21,12 +21,6 @@
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "qemu/timer.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
#define DATA_SIZE (1 << SHIFT)
#if DATA_SIZE == 8
#define SUFFIX q
#define LSUFFIX q
@@ -84,14 +78,6 @@
# define BSWAP(X) (X)
#endif
#ifdef TARGET_WORDS_BIGENDIAN
# define TGT_BE(X) (X)
# define TGT_LE(X) BSWAP(X)
#else
# define TGT_BE(X) BSWAP(X)
# define TGT_LE(X) (X)
#endif
#if DATA_SIZE == 1
# define helper_le_ld_name glue(glue(helper_ret_ld, USUFFIX), MMUSUFFIX)
# define helper_be_ld_name helper_le_ld_name
@@ -108,35 +94,14 @@
# define helper_be_st_name glue(glue(helper_be_st, SUFFIX), MMUSUFFIX)
#endif
#ifdef TARGET_WORDS_BIGENDIAN
# define helper_te_ld_name helper_be_ld_name
# define helper_te_st_name helper_be_st_name
#else
# define helper_te_ld_name helper_le_ld_name
# define helper_te_st_name helper_le_st_name
#endif
#ifndef SOFTMMU_CODE_ACCESS
static inline DATA_TYPE glue(io_read, SUFFIX)(CPUArchState *env,
CPUIOTLBEntry *iotlbentry,
size_t mmu_idx, size_t index,
target_ulong addr,
uintptr_t retaddr)
{
uint64_t val;
CPUState *cpu = ENV_GET_CPU(env);
hwaddr physaddr = iotlbentry->addr;
MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs);
physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
cpu->mem_io_pc = retaddr;
if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
cpu_io_recompile(cpu, retaddr);
}
cpu->mem_io_vaddr = addr;
memory_region_dispatch_read(mr, physaddr, &val, 1 << SHIFT,
iotlbentry->attrs);
return val;
CPUIOTLBEntry *iotlbentry = &env->iotlb[mmu_idx][index];
return io_readx(env, iotlbentry, addr, retaddr, DATA_SIZE);
}
#endif
@@ -167,15 +132,13 @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr,
/* Handle an IO access. */
if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
CPUIOTLBEntry *iotlbentry;
if ((addr & (DATA_SIZE - 1)) != 0) {
goto do_unaligned_access;
}
iotlbentry = &env->iotlb[mmu_idx][index];
/* ??? Note that the io helpers always read data in the target
byte ordering. We should push the LE/BE request down into io. */
res = glue(io_read, SUFFIX)(env, iotlbentry, addr, retaddr);
res = glue(io_read, SUFFIX)(env, mmu_idx, index, addr, retaddr);
res = TGT_LE(res);
return res;
}
@@ -236,15 +199,13 @@ WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr,
/* Handle an IO access. */
if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
CPUIOTLBEntry *iotlbentry;
if ((addr & (DATA_SIZE - 1)) != 0) {
goto do_unaligned_access;
}
iotlbentry = &env->iotlb[mmu_idx][index];
/* ??? Note that the io helpers always read data in the target
byte ordering. We should push the LE/BE request down into io. */
res = glue(io_read, SUFFIX)(env, iotlbentry, addr, retaddr);
res = glue(io_read, SUFFIX)(env, mmu_idx, index, addr, retaddr);
res = TGT_BE(res);
return res;
}
@@ -295,24 +256,13 @@ WORD_TYPE helper_be_lds_name(CPUArchState *env, target_ulong addr,
#endif
static inline void glue(io_write, SUFFIX)(CPUArchState *env,
CPUIOTLBEntry *iotlbentry,
size_t mmu_idx, size_t index,
DATA_TYPE val,
target_ulong addr,
uintptr_t retaddr)
{
CPUState *cpu = ENV_GET_CPU(env);
hwaddr physaddr = iotlbentry->addr;
MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs);
physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
cpu_io_recompile(cpu, retaddr);
}
cpu->mem_io_vaddr = addr;
cpu->mem_io_pc = retaddr;
memory_region_dispatch_write(mr, physaddr, val, 1 << SHIFT,
iotlbentry->attrs);
CPUIOTLBEntry *iotlbentry = &env->iotlb[mmu_idx][index];
return io_writex(env, iotlbentry, val, addr, retaddr, DATA_SIZE);
}
void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
@@ -340,16 +290,14 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
/* Handle an IO access. */
if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
CPUIOTLBEntry *iotlbentry;
if ((addr & (DATA_SIZE - 1)) != 0) {
goto do_unaligned_access;
}
iotlbentry = &env->iotlb[mmu_idx][index];
/* ??? Note that the io helpers always read data in the target
byte ordering. We should push the LE/BE request down into io. */
val = TGT_LE(val);
glue(io_write, SUFFIX)(env, iotlbentry, val, addr, retaddr);
glue(io_write, SUFFIX)(env, mmu_idx, index, val, addr, retaddr);
return;
}
@@ -418,16 +366,14 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
/* Handle an IO access. */
if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
CPUIOTLBEntry *iotlbentry;
if ((addr & (DATA_SIZE - 1)) != 0) {
goto do_unaligned_access;
}
iotlbentry = &env->iotlb[mmu_idx][index];
/* ??? Note that the io helpers always read data in the target
byte ordering. We should push the LE/BE request down into io. */
val = TGT_BE(val);
glue(io_write, SUFFIX)(env, iotlbentry, val, addr, retaddr);
glue(io_write, SUFFIX)(env, mmu_idx, index, val, addr, retaddr);
return;
}
@@ -466,33 +412,9 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
glue(glue(st, SUFFIX), _be_p)((uint8_t *)haddr, val);
}
#endif /* DATA_SIZE > 1 */
#if DATA_SIZE == 1
/* Probe for whether the specified guest write access is permitted.
* If it is not permitted then an exception will be taken in the same
* way as if this were a real write access (and we will not return).
* Otherwise the function will return, and there will be a valid
* entry in the TLB for this access.
*/
void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
uintptr_t retaddr)
{
int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
if ((addr & TARGET_PAGE_MASK)
!= (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
/* TLB entry is for a different page */
if (!VICTIM_TLB_HIT(addr_write, addr)) {
tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
}
}
}
#endif
#endif /* !defined(SOFTMMU_CODE_ACCESS) */
#undef READ_ACCESS_TYPE
#undef SHIFT
#undef DATA_TYPE
#undef SUFFIX
#undef LSUFFIX
@@ -503,15 +425,9 @@ void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
#undef USUFFIX
#undef SSUFFIX
#undef BSWAP
#undef TGT_BE
#undef TGT_LE
#undef CPU_BE
#undef CPU_LE
#undef helper_le_ld_name
#undef helper_be_ld_name
#undef helper_le_lds_name
#undef helper_be_lds_name
#undef helper_le_st_name
#undef helper_be_st_name
#undef helper_te_ld_name
#undef helper_te_st_name

View File

@@ -201,7 +201,7 @@ enum {
/* MMU modes definitions */
/* Alpha has 5 MMU modes: PALcode, kernel, executive, supervisor, and user.
/* Alpha has 5 MMU modes: PALcode, Kernel, Executive, Supervisor, and User.
The Unix PALcode only exposes the kernel and user modes; presumably
executive and supervisor are used by VMS.
@@ -209,22 +209,18 @@ enum {
there are PALmode instructions that can access data via physical mode
or via an os-installed "alternate mode", which is one of the 4 above.
QEMU does not currently properly distinguish between code/data when
looking up addresses. To avoid having to address this issue, our
emulated PALcode will cheat and use the KSEG mapping for its code+data
rather than physical addresses.
That said, we're only emulating Unix PALcode, and not attempting VMS,
so we don't need to implement Executive and Supervisor. QEMU's own
PALcode cheats and uses the KSEG mapping for its code+data rather than
physical addresses. */
Moreover, we're only emulating Unix PALcode, and not attempting VMS.
All of which allows us to drop all but kernel and user modes.
Elide the unused MMU modes to save space. */
#define NB_MMU_MODES 2
#define NB_MMU_MODES 3
#define MMU_MODE0_SUFFIX _kernel
#define MMU_MODE1_SUFFIX _user
#define MMU_KERNEL_IDX 0
#define MMU_USER_IDX 1
#define MMU_PHYS_IDX 2
typedef struct CPUAlphaState CPUAlphaState;
@@ -234,7 +230,6 @@ struct CPUAlphaState {
uint64_t pc;
uint64_t unique;
uint64_t lock_addr;
uint64_t lock_st_addr;
uint64_t lock_value;
/* The FPCR, and disassembled portions thereof. */
@@ -350,9 +345,6 @@ enum {
EXCP_ARITH,
EXCP_FEN,
EXCP_CALL_PAL,
/* For Usermode emulation. */
EXCP_STL_C,
EXCP_STQ_C,
};
/* Alpha-specific interrupt pending bits. */

View File

@@ -126,6 +126,14 @@ static int get_physical_address(CPUAlphaState *env, target_ulong addr,
int prot = 0;
int ret = MM_K_ACV;
/* Handle physical accesses. */
if (mmu_idx == MMU_PHYS_IDX) {
phys = addr;
prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC;
ret = -1;
goto exit;
}
/* Ensure that the virtual address is properly sign-extended from
the last implemented virtual address bit. */
if (saddr >> TARGET_VIRT_ADDR_SPACE_BITS != saddr >> 63) {
@@ -298,12 +306,6 @@ void alpha_cpu_do_interrupt(CPUState *cs)
case EXCP_CALL_PAL:
name = "call_pal";
break;
case EXCP_STL_C:
name = "stl_c";
break;
case EXCP_STQ_C:
name = "stq_c";
break;
}
qemu_log("INT %6d: %s(%#x) pc=%016" PRIx64 " sp=%016" PRIx64 "\n",
++count, name, env->error_code, env->pc, env->ir[IR_SP]);

View File

@@ -92,15 +92,6 @@ DEF_HELPER_FLAGS_2(ieee_input_cmp, TCG_CALL_NO_WG, void, env, i64)
DEF_HELPER_FLAGS_2(ieee_input_s, TCG_CALL_NO_WG, void, env, i64)
#if !defined (CONFIG_USER_ONLY)
DEF_HELPER_2(ldl_phys, i64, env, i64)
DEF_HELPER_2(ldq_phys, i64, env, i64)
DEF_HELPER_2(ldl_l_phys, i64, env, i64)
DEF_HELPER_2(ldq_l_phys, i64, env, i64)
DEF_HELPER_3(stl_phys, void, env, i64, i64)
DEF_HELPER_3(stq_phys, void, env, i64, i64)
DEF_HELPER_3(stl_c_phys, i64, env, i64, i64)
DEF_HELPER_3(stq_c_phys, i64, env, i64, i64)
DEF_HELPER_FLAGS_1(tbia, TCG_CALL_NO_RWG, void, env)
DEF_HELPER_FLAGS_2(tbis, TCG_CALL_NO_RWG, void, env, i64)
DEF_HELPER_FLAGS_1(tb_flush, TCG_CALL_NO_RWG, void, env)

View File

@@ -45,8 +45,6 @@ static VMStateField vmstate_env_fields[] = {
VMSTATE_UINTTL(unique, CPUAlphaState),
VMSTATE_UINTTL(lock_addr, CPUAlphaState),
VMSTATE_UINTTL(lock_value, CPUAlphaState),
/* Note that lock_st_addr is not saved; it is a temporary
used during the execution of the st[lq]_c insns. */
VMSTATE_UINT8(ps, CPUAlphaState),
VMSTATE_UINT8(intr_flag, CPUAlphaState),

View File

@@ -25,79 +25,6 @@
/* Softmmu support */
#ifndef CONFIG_USER_ONLY
uint64_t helper_ldl_phys(CPUAlphaState *env, uint64_t p)
{
CPUState *cs = CPU(alpha_env_get_cpu(env));
return (int32_t)ldl_phys(cs->as, p);
}
uint64_t helper_ldq_phys(CPUAlphaState *env, uint64_t p)
{
CPUState *cs = CPU(alpha_env_get_cpu(env));
return ldq_phys(cs->as, p);
}
uint64_t helper_ldl_l_phys(CPUAlphaState *env, uint64_t p)
{
CPUState *cs = CPU(alpha_env_get_cpu(env));
env->lock_addr = p;
return env->lock_value = (int32_t)ldl_phys(cs->as, p);
}
uint64_t helper_ldq_l_phys(CPUAlphaState *env, uint64_t p)
{
CPUState *cs = CPU(alpha_env_get_cpu(env));
env->lock_addr = p;
return env->lock_value = ldq_phys(cs->as, p);
}
void helper_stl_phys(CPUAlphaState *env, uint64_t p, uint64_t v)
{
CPUState *cs = CPU(alpha_env_get_cpu(env));
stl_phys(cs->as, p, v);
}
void helper_stq_phys(CPUAlphaState *env, uint64_t p, uint64_t v)
{
CPUState *cs = CPU(alpha_env_get_cpu(env));
stq_phys(cs->as, p, v);
}
uint64_t helper_stl_c_phys(CPUAlphaState *env, uint64_t p, uint64_t v)
{
CPUState *cs = CPU(alpha_env_get_cpu(env));
uint64_t ret = 0;
if (p == env->lock_addr) {
int32_t old = ldl_phys(cs->as, p);
if (old == (int32_t)env->lock_value) {
stl_phys(cs->as, p, v);
ret = 1;
}
}
env->lock_addr = -1;
return ret;
}
uint64_t helper_stq_c_phys(CPUAlphaState *env, uint64_t p, uint64_t v)
{
CPUState *cs = CPU(alpha_env_get_cpu(env));
uint64_t ret = 0;
if (p == env->lock_addr) {
uint64_t old = ldq_phys(cs->as, p);
if (old == env->lock_value) {
stq_phys(cs->as, p, v);
ret = 1;
}
}
env->lock_addr = -1;
return ret;
}
void alpha_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
MMUAccessType access_type,
int mmu_idx, uintptr_t retaddr)

View File

@@ -99,7 +99,6 @@ static TCGv cpu_std_ir[31];
static TCGv cpu_fir[31];
static TCGv cpu_pc;
static TCGv cpu_lock_addr;
static TCGv cpu_lock_st_addr;
static TCGv cpu_lock_value;
#ifndef CONFIG_USER_ONLY
@@ -116,7 +115,6 @@ void alpha_translate_init(void)
static const GlobalVar vars[] = {
DEF_VAR(pc),
DEF_VAR(lock_addr),
DEF_VAR(lock_st_addr),
DEF_VAR(lock_value),
};
@@ -198,6 +196,23 @@ static TCGv dest_sink(DisasContext *ctx)
return ctx->sink;
}
static void free_context_temps(DisasContext *ctx)
{
if (!TCGV_IS_UNUSED_I64(ctx->sink)) {
tcg_gen_discard_i64(ctx->sink);
tcg_temp_free(ctx->sink);
TCGV_UNUSED_I64(ctx->sink);
}
if (!TCGV_IS_UNUSED_I64(ctx->zero)) {
tcg_temp_free(ctx->zero);
TCGV_UNUSED_I64(ctx->zero);
}
if (!TCGV_IS_UNUSED_I64(ctx->lit)) {
tcg_temp_free(ctx->lit);
TCGV_UNUSED_I64(ctx->lit);
}
}
static TCGv load_gpr(DisasContext *ctx, unsigned reg)
{
if (likely(reg < 31)) {
@@ -392,59 +407,40 @@ static inline void gen_store_mem(DisasContext *ctx,
}
static ExitStatus gen_store_conditional(DisasContext *ctx, int ra, int rb,
int32_t disp16, int quad)
int32_t disp16, int mem_idx,
TCGMemOp op)
{
TCGv addr;
if (ra == 31) {
/* ??? Don't bother storing anything. The user can't tell
the difference, since the zero register always reads zero. */
return NO_EXIT;
}
#if defined(CONFIG_USER_ONLY)
addr = cpu_lock_st_addr;
#else
addr = tcg_temp_local_new();
#endif
TCGLabel *lab_fail, *lab_done;
TCGv addr, val;
addr = tcg_temp_new_i64();
tcg_gen_addi_i64(addr, load_gpr(ctx, rb), disp16);
free_context_temps(ctx);
#if defined(CONFIG_USER_ONLY)
/* ??? This is handled via a complicated version of compare-and-swap
in the cpu_loop. Hopefully one day we'll have a real CAS opcode
in TCG so that this isn't necessary. */
return gen_excp(ctx, quad ? EXCP_STQ_C : EXCP_STL_C, ra);
#else
/* ??? In system mode we are never multi-threaded, so CAS can be
implemented via a non-atomic load-compare-store sequence. */
{
TCGLabel *lab_fail, *lab_done;
TCGv val;
lab_fail = gen_new_label();
lab_done = gen_new_label();
tcg_gen_brcond_i64(TCG_COND_NE, addr, cpu_lock_addr, lab_fail);
tcg_temp_free_i64(addr);
lab_fail = gen_new_label();
lab_done = gen_new_label();
tcg_gen_brcond_i64(TCG_COND_NE, addr, cpu_lock_addr, lab_fail);
val = tcg_temp_new_i64();
tcg_gen_atomic_cmpxchg_i64(val, cpu_lock_addr, cpu_lock_value,
load_gpr(ctx, ra), mem_idx, op);
free_context_temps(ctx);
val = tcg_temp_new();
tcg_gen_qemu_ld_i64(val, addr, ctx->mem_idx, quad ? MO_LEQ : MO_LESL);
tcg_gen_brcond_i64(TCG_COND_NE, val, cpu_lock_value, lab_fail);
tcg_gen_qemu_st_i64(ctx->ir[ra], addr, ctx->mem_idx,
quad ? MO_LEQ : MO_LEUL);
tcg_gen_movi_i64(ctx->ir[ra], 1);
tcg_gen_br(lab_done);
gen_set_label(lab_fail);
tcg_gen_movi_i64(ctx->ir[ra], 0);
gen_set_label(lab_done);
tcg_gen_movi_i64(cpu_lock_addr, -1);
tcg_temp_free(addr);
return NO_EXIT;
if (ra != 31) {
tcg_gen_setcond_i64(TCG_COND_EQ, ctx->ir[ra], val, cpu_lock_value);
}
#endif
tcg_temp_free_i64(val);
tcg_gen_br(lab_done);
gen_set_label(lab_fail);
if (ra != 31) {
tcg_gen_movi_i64(ctx->ir[ra], 0);
}
gen_set_label(lab_done);
tcg_gen_movi_i64(cpu_lock_addr, -1);
return NO_EXIT;
}
static bool in_superpage(DisasContext *ctx, int64_t addr)
@@ -2423,19 +2419,19 @@ static ExitStatus translate_one(DisasContext *ctx, uint32_t insn)
switch ((insn >> 12) & 0xF) {
case 0x0:
/* Longword physical access (hw_ldl/p) */
gen_helper_ldl_phys(va, cpu_env, addr);
tcg_gen_qemu_ld_i64(va, addr, MMU_PHYS_IDX, MO_LESL);
break;
case 0x1:
/* Quadword physical access (hw_ldq/p) */
gen_helper_ldq_phys(va, cpu_env, addr);
tcg_gen_qemu_ld_i64(va, addr, MMU_PHYS_IDX, MO_LEQ);
break;
case 0x2:
/* Longword physical access with lock (hw_ldl_l/p) */
gen_helper_ldl_l_phys(va, cpu_env, addr);
gen_qemu_ldl_l(va, addr, MMU_PHYS_IDX);
break;
case 0x3:
/* Quadword physical access with lock (hw_ldq_l/p) */
gen_helper_ldq_l_phys(va, cpu_env, addr);
gen_qemu_ldq_l(va, addr, MMU_PHYS_IDX);
break;
case 0x4:
/* Longword virtual PTE fetch (hw_ldl/v) */
@@ -2674,27 +2670,34 @@ static ExitStatus translate_one(DisasContext *ctx, uint32_t insn)
#ifndef CONFIG_USER_ONLY
REQUIRE_TB_FLAG(TB_FLAGS_PAL_MODE);
{
TCGv addr = tcg_temp_new();
va = load_gpr(ctx, ra);
vb = load_gpr(ctx, rb);
tcg_gen_addi_i64(addr, vb, disp12);
switch ((insn >> 12) & 0xF) {
case 0x0:
/* Longword physical access */
gen_helper_stl_phys(cpu_env, addr, va);
va = load_gpr(ctx, ra);
vb = load_gpr(ctx, rb);
tmp = tcg_temp_new();
tcg_gen_addi_i64(tmp, vb, disp12);
tcg_gen_qemu_st_i64(va, tmp, MMU_PHYS_IDX, MO_LESL);
tcg_temp_free(tmp);
break;
case 0x1:
/* Quadword physical access */
gen_helper_stq_phys(cpu_env, addr, va);
va = load_gpr(ctx, ra);
vb = load_gpr(ctx, rb);
tmp = tcg_temp_new();
tcg_gen_addi_i64(tmp, vb, disp12);
tcg_gen_qemu_st_i64(va, tmp, MMU_PHYS_IDX, MO_LEQ);
tcg_temp_free(tmp);
break;
case 0x2:
/* Longword physical access with lock */
gen_helper_stl_c_phys(dest_gpr(ctx, ra), cpu_env, addr, va);
ret = gen_store_conditional(ctx, ra, rb, disp12,
MMU_PHYS_IDX, MO_LESL);
break;
case 0x3:
/* Quadword physical access with lock */
gen_helper_stq_c_phys(dest_gpr(ctx, ra), cpu_env, addr, va);
ret = gen_store_conditional(ctx, ra, rb, disp12,
MMU_PHYS_IDX, MO_LEQ);
break;
case 0x4:
/* Longword virtual access */
@@ -2733,7 +2736,6 @@ static ExitStatus translate_one(DisasContext *ctx, uint32_t insn)
/* Invalid */
goto invalid_opc;
}
tcg_temp_free(addr);
break;
}
#else
@@ -2797,11 +2799,13 @@ static ExitStatus translate_one(DisasContext *ctx, uint32_t insn)
break;
case 0x2E:
/* STL_C */
ret = gen_store_conditional(ctx, ra, rb, disp16, 0);
ret = gen_store_conditional(ctx, ra, rb, disp16,
ctx->mem_idx, MO_LESL);
break;
case 0x2F:
/* STQ_C */
ret = gen_store_conditional(ctx, ra, rb, disp16, 1);
ret = gen_store_conditional(ctx, ra, rb, disp16,
ctx->mem_idx, MO_LEQ);
break;
case 0x30:
/* BR */
@@ -2906,6 +2910,10 @@ void gen_intermediate_code(CPUAlphaState *env, struct TranslationBlock *tb)
/* Similarly for flush-to-zero. */
ctx.tb_ftz = -1;
TCGV_UNUSED_I64(ctx.zero);
TCGV_UNUSED_I64(ctx.sink);
TCGV_UNUSED_I64(ctx.lit);
num_insns = 0;
max_insns = tb->cflags & CF_COUNT_MASK;
if (max_insns == 0) {
@@ -2940,23 +2948,9 @@ void gen_intermediate_code(CPUAlphaState *env, struct TranslationBlock *tb)
}
insn = cpu_ldl_code(env, ctx.pc);
TCGV_UNUSED_I64(ctx.zero);
TCGV_UNUSED_I64(ctx.sink);
TCGV_UNUSED_I64(ctx.lit);
ctx.pc += 4;
ret = translate_one(ctxp, insn);
if (!TCGV_IS_UNUSED_I64(ctx.sink)) {
tcg_gen_discard_i64(ctx.sink);
tcg_temp_free(ctx.sink);
}
if (!TCGV_IS_UNUSED_I64(ctx.zero)) {
tcg_temp_free(ctx.zero);
}
if (!TCGV_IS_UNUSED_I64(ctx.lit)) {
tcg_temp_free(ctx.lit);
}
free_context_temps(ctxp);
/* If we reach a page boundary, are single stepping,
or exhaust instruction count, stop generation. */

View File

@@ -46,7 +46,6 @@
#define EXCP_BKPT 7
#define EXCP_EXCEPTION_EXIT 8 /* Return from v7M exception. */
#define EXCP_KERNEL_TRAP 9 /* Jumped to kernel code page. */
#define EXCP_STREX 10
#define EXCP_HVC 11 /* HyperVisor Call */
#define EXCP_HYP_TRAP 12
#define EXCP_SMC 13 /* Secure Monitor Call */
@@ -475,10 +474,6 @@ typedef struct CPUARMState {
uint64_t exclusive_addr;
uint64_t exclusive_val;
uint64_t exclusive_high;
#if defined(CONFIG_USER_ONLY)
uint64_t exclusive_test;
uint32_t exclusive_info;
#endif
/* iwMMXt coprocessor state. */
struct {

View File

@@ -27,6 +27,10 @@
#include "qemu/bitops.h"
#include "internals.h"
#include "qemu/crc32c.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
#include "qemu/int128.h"
#include "tcg.h"
#include <zlib.h> /* For crc32 */
/* C2.4.7 Multiply and divide */
@@ -444,3 +448,112 @@ uint64_t HELPER(crc32c_64)(uint64_t acc, uint64_t val, uint32_t bytes)
/* Linux crc32c converts the output to one's complement. */
return crc32c(acc, buf, bytes) ^ 0xffffffff;
}
/* Returns 0 on success; 1 otherwise. */
uint64_t HELPER(paired_cmpxchg64_le)(CPUARMState *env, uint64_t addr,
uint64_t new_lo, uint64_t new_hi)
{
uintptr_t ra = GETPC();
Int128 oldv, cmpv, newv;
bool success;
cmpv = int128_make128(env->exclusive_val, env->exclusive_high);
newv = int128_make128(new_lo, new_hi);
if (parallel_cpus) {
#ifndef CONFIG_ATOMIC128
cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
#else
int mem_idx = cpu_mmu_index(env, false);
TCGMemOpIdx oi = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
oldv = helper_atomic_cmpxchgo_le_mmu(env, addr, cmpv, newv, oi, ra);
success = int128_eq(oldv, cmpv);
#endif
} else {
uint64_t o0, o1;
#ifdef CONFIG_USER_ONLY
/* ??? Enforce alignment. */
uint64_t *haddr = g2h(addr);
o0 = ldq_le_p(haddr + 0);
o1 = ldq_le_p(haddr + 1);
oldv = int128_make128(o0, o1);
success = int128_eq(oldv, cmpv);
if (success) {
stq_le_p(haddr + 0, int128_getlo(newv));
stq_le_p(haddr + 1, int128_gethi(newv));
}
#else
int mem_idx = cpu_mmu_index(env, false);
TCGMemOpIdx oi0 = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
TCGMemOpIdx oi1 = make_memop_idx(MO_LEQ, mem_idx);
o0 = helper_le_ldq_mmu(env, addr + 0, oi0, ra);
o1 = helper_le_ldq_mmu(env, addr + 8, oi1, ra);
oldv = int128_make128(o0, o1);
success = int128_eq(oldv, cmpv);
if (success) {
helper_le_stq_mmu(env, addr + 0, int128_getlo(newv), oi1, ra);
helper_le_stq_mmu(env, addr + 8, int128_gethi(newv), oi1, ra);
}
#endif
}
return !success;
}
uint64_t HELPER(paired_cmpxchg64_be)(CPUARMState *env, uint64_t addr,
uint64_t new_lo, uint64_t new_hi)
{
uintptr_t ra = GETPC();
Int128 oldv, cmpv, newv;
bool success;
cmpv = int128_make128(env->exclusive_val, env->exclusive_high);
newv = int128_make128(new_lo, new_hi);
if (parallel_cpus) {
#ifndef CONFIG_ATOMIC128
cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
#else
int mem_idx = cpu_mmu_index(env, false);
TCGMemOpIdx oi = make_memop_idx(MO_BEQ | MO_ALIGN_16, mem_idx);
oldv = helper_atomic_cmpxchgo_be_mmu(env, addr, cmpv, newv, oi, ra);
success = int128_eq(oldv, cmpv);
#endif
} else {
uint64_t o0, o1;
#ifdef CONFIG_USER_ONLY
/* ??? Enforce alignment. */
uint64_t *haddr = g2h(addr);
o1 = ldq_be_p(haddr + 0);
o0 = ldq_be_p(haddr + 1);
oldv = int128_make128(o0, o1);
success = int128_eq(oldv, cmpv);
if (success) {
stq_be_p(haddr + 0, int128_gethi(newv));
stq_be_p(haddr + 1, int128_getlo(newv));
}
#else
int mem_idx = cpu_mmu_index(env, false);
TCGMemOpIdx oi0 = make_memop_idx(MO_BEQ | MO_ALIGN_16, mem_idx);
TCGMemOpIdx oi1 = make_memop_idx(MO_BEQ, mem_idx);
o1 = helper_be_ldq_mmu(env, addr + 0, oi0, ra);
o0 = helper_be_ldq_mmu(env, addr + 8, oi1, ra);
oldv = int128_make128(o0, o1);
success = int128_eq(oldv, cmpv);
if (success) {
helper_be_stq_mmu(env, addr + 0, int128_gethi(newv), oi1, ra);
helper_be_stq_mmu(env, addr + 8, int128_getlo(newv), oi1, ra);
}
#endif
}
return !success;
}
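The helpers above follow the general load-exclusive/store-exclusive emulation scheme: remember the address and value seen by the load, then let a compare-and-swap against the remembered value decide whether the store succeeds. A minimal single-monitor sketch with GCC __atomic builtins (illustrative only, not the QEMU code):

#include <stdint.h>
#include <stdbool.h>

static uint64_t excl_addr = (uint64_t)-1;   /* -1 means no monitor held */
static uint64_t excl_val;

static uint64_t load_exclusive(uint64_t *p)
{
    excl_addr = (uint64_t)(uintptr_t)p;
    excl_val = __atomic_load_n(p, __ATOMIC_SEQ_CST);
    return excl_val;
}

/* Returns 0 on success, 1 on failure, like the architectural status result. */
static int store_exclusive(uint64_t *p, uint64_t newval)
{
    bool ok = false;

    if ((uint64_t)(uintptr_t)p == excl_addr) {
        uint64_t expected = excl_val;
        ok = __atomic_compare_exchange_n(p, &expected, newval, false,
                                         __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
    }
    excl_addr = (uint64_t)-1;               /* any store-exclusive clears the monitor */
    return ok ? 0 : 1;
}

Pairing (two 64-bit words) simply widens the comparison, which is what the _le/_be helpers above do with Int128 values.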

View File

@@ -46,3 +46,5 @@ DEF_HELPER_FLAGS_2(frecpx_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
DEF_HELPER_FLAGS_2(fcvtx_f64_to_f32, TCG_CALL_NO_RWG, f32, f64, env)
DEF_HELPER_FLAGS_3(crc32_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)
DEF_HELPER_FLAGS_3(crc32c_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)
DEF_HELPER_FLAGS_4(paired_cmpxchg64_le, TCG_CALL_NO_WG, i64, env, i64, i64, i64)
DEF_HELPER_FLAGS_4(paired_cmpxchg64_be, TCG_CALL_NO_WG, i64, env, i64, i64, i64)

View File

@@ -46,8 +46,7 @@ static inline bool excp_is_internal(int excp)
|| excp == EXCP_HALTED
|| excp == EXCP_EXCEPTION_EXIT
|| excp == EXCP_KERNEL_TRAP
|| excp == EXCP_SEMIHOST
|| excp == EXCP_STREX;
|| excp == EXCP_SEMIHOST;
}
/* Exception names for debug logging; note that not all of these
@@ -63,7 +62,6 @@ static const char * const excnames[] = {
[EXCP_BKPT] = "Breakpoint",
[EXCP_EXCEPTION_EXIT] = "QEMU v7M exception exit",
[EXCP_KERNEL_TRAP] = "QEMU intercept of kernel commpage",
[EXCP_STREX] = "QEMU intercept of STREX",
[EXCP_HVC] = "Hypervisor Call",
[EXCP_HYP_TRAP] = "Hypervisor Trap",
[EXCP_SMC] = "Secure Monitor Call",

View File

@@ -1839,37 +1839,41 @@ static void disas_b_exc_sys(DisasContext *s, uint32_t insn)
}
}
/*
* Load/Store exclusive instructions are implemented by remembering
* the value/address loaded, and seeing if these are the same
* when the store is performed. This is not actually the architecturally
* mandated semantics, but it works for typical guest code sequences
* and avoids having to monitor regular stores.
*
* In system emulation mode only one CPU will be running at once, so
* this sequence is effectively atomic. In user emulation mode we
* throw an exception and handle the atomic operation elsewhere.
*/
static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
TCGv_i64 addr, int size, bool is_pair)
{
TCGv_i64 tmp = tcg_temp_new_i64();
TCGMemOp memop = s->be_data + size;
TCGMemOp be = s->be_data;
g_assert(size <= 3);
tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s), memop);
if (is_pair) {
TCGv_i64 addr2 = tcg_temp_new_i64();
TCGv_i64 hitmp = tcg_temp_new_i64();
g_assert(size >= 2);
tcg_gen_addi_i64(addr2, addr, 1 << size);
tcg_gen_qemu_ld_i64(hitmp, addr2, get_mem_index(s), memop);
tcg_temp_free_i64(addr2);
if (size == 3) {
TCGv_i64 addr2 = tcg_temp_new_i64();
tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s),
MO_64 | MO_ALIGN_16 | be);
tcg_gen_addi_i64(addr2, addr, 8);
tcg_gen_qemu_ld_i64(hitmp, addr2, get_mem_index(s),
MO_64 | MO_ALIGN | be);
tcg_temp_free_i64(addr2);
} else {
g_assert(size == 2);
tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s),
MO_64 | MO_ALIGN | be);
if (be == MO_LE) {
tcg_gen_extr32_i64(tmp, hitmp, tmp);
} else {
tcg_gen_extr32_i64(hitmp, tmp, tmp);
}
}
tcg_gen_mov_i64(cpu_exclusive_high, hitmp);
tcg_gen_mov_i64(cpu_reg(s, rt2), hitmp);
tcg_temp_free_i64(hitmp);
} else {
tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s), size | MO_ALIGN | be);
}
tcg_gen_mov_i64(cpu_exclusive_val, tmp);
@@ -1879,16 +1883,6 @@ static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
tcg_gen_mov_i64(cpu_exclusive_addr, addr);
}
#ifdef CONFIG_USER_ONLY
static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
TCGv_i64 addr, int size, int is_pair)
{
tcg_gen_mov_i64(cpu_exclusive_test, addr);
tcg_gen_movi_i32(cpu_exclusive_info,
size | is_pair << 2 | (rd << 4) | (rt << 9) | (rt2 << 14));
gen_exception_internal_insn(s, 4, EXCP_STREX);
}
#else
static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
TCGv_i64 inaddr, int size, int is_pair)
{
@@ -1916,46 +1910,42 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
tcg_gen_brcond_i64(TCG_COND_NE, addr, cpu_exclusive_addr, fail_label);
tmp = tcg_temp_new_i64();
tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s), s->be_data + size);
tcg_gen_brcond_i64(TCG_COND_NE, tmp, cpu_exclusive_val, fail_label);
tcg_temp_free_i64(tmp);
if (is_pair) {
TCGv_i64 addrhi = tcg_temp_new_i64();
TCGv_i64 tmphi = tcg_temp_new_i64();
tcg_gen_addi_i64(addrhi, addr, 1 << size);
tcg_gen_qemu_ld_i64(tmphi, addrhi, get_mem_index(s),
s->be_data + size);
tcg_gen_brcond_i64(TCG_COND_NE, tmphi, cpu_exclusive_high, fail_label);
tcg_temp_free_i64(tmphi);
tcg_temp_free_i64(addrhi);
}
/* We seem to still have the exclusive monitor, so do the store */
tcg_gen_qemu_st_i64(cpu_reg(s, rt), addr, get_mem_index(s),
s->be_data + size);
if (is_pair) {
TCGv_i64 addrhi = tcg_temp_new_i64();
tcg_gen_addi_i64(addrhi, addr, 1 << size);
tcg_gen_qemu_st_i64(cpu_reg(s, rt2), addrhi,
get_mem_index(s), s->be_data + size);
tcg_temp_free_i64(addrhi);
if (size == 2) {
TCGv_i64 val = tcg_temp_new_i64();
tcg_gen_concat32_i64(tmp, cpu_reg(s, rt), cpu_reg(s, rt2));
tcg_gen_concat32_i64(val, cpu_exclusive_val, cpu_exclusive_high);
tcg_gen_atomic_cmpxchg_i64(tmp, addr, val, tmp,
get_mem_index(s),
size | MO_ALIGN | s->be_data);
tcg_gen_setcond_i64(TCG_COND_NE, tmp, tmp, val);
tcg_temp_free_i64(val);
} else if (s->be_data == MO_LE) {
gen_helper_paired_cmpxchg64_le(tmp, cpu_env, addr, cpu_reg(s, rt),
cpu_reg(s, rt2));
} else {
gen_helper_paired_cmpxchg64_be(tmp, cpu_env, addr, cpu_reg(s, rt),
cpu_reg(s, rt2));
}
} else {
TCGv_i64 val = cpu_reg(s, rt);
tcg_gen_atomic_cmpxchg_i64(tmp, addr, cpu_exclusive_val, val,
get_mem_index(s),
size | MO_ALIGN | s->be_data);
tcg_gen_setcond_i64(TCG_COND_NE, tmp, tmp, cpu_exclusive_val);
}
tcg_temp_free_i64(addr);
tcg_gen_movi_i64(cpu_reg(s, rd), 0);
tcg_gen_mov_i64(cpu_reg(s, rd), tmp);
tcg_temp_free_i64(tmp);
tcg_gen_br(done_label);
gen_set_label(fail_label);
tcg_gen_movi_i64(cpu_reg(s, rd), 1);
gen_set_label(done_label);
tcg_gen_movi_i64(cpu_exclusive_addr, -1);
}
#endif
/* Update the Sixty-Four bit (SF) register size. This logic is derived

* from the ARMv8 specs for LDR (Shared decode for all encodings).

View File

@@ -65,10 +65,6 @@ static TCGv_i32 cpu_R[16];
TCGv_i32 cpu_CF, cpu_NF, cpu_VF, cpu_ZF;
TCGv_i64 cpu_exclusive_addr;
TCGv_i64 cpu_exclusive_val;
#ifdef CONFIG_USER_ONLY
TCGv_i64 cpu_exclusive_test;
TCGv_i32 cpu_exclusive_info;
#endif
/* FIXME: These should be removed. */
static TCGv_i32 cpu_F0s, cpu_F1s;
@@ -102,12 +98,6 @@ void arm_translate_init(void)
offsetof(CPUARMState, exclusive_addr), "exclusive_addr");
cpu_exclusive_val = tcg_global_mem_new_i64(cpu_env,
offsetof(CPUARMState, exclusive_val), "exclusive_val");
#ifdef CONFIG_USER_ONLY
cpu_exclusive_test = tcg_global_mem_new_i64(cpu_env,
offsetof(CPUARMState, exclusive_test), "exclusive_test");
cpu_exclusive_info = tcg_global_mem_new_i32(cpu_env,
offsetof(CPUARMState, exclusive_info), "exclusive_info");
#endif
a64_translate_init();
}
@@ -932,145 +922,103 @@ static inline void store_reg_from_load(DisasContext *s, int reg, TCGv_i32 var)
* These functions work like tcg_gen_qemu_{ld,st}* except
* that the address argument is TCGv_i32 rather than TCGv.
*/
#if TARGET_LONG_BITS == 32
#define DO_GEN_LD(SUFF, OPC, BE32_XOR) \
static inline void gen_aa32_ld##SUFF(DisasContext *s, TCGv_i32 val, \
TCGv_i32 addr, int index) \
{ \
TCGMemOp opc = (OPC) | s->be_data; \
/* Not needed for user-mode BE32, where we use MO_BE instead. */ \
if (!IS_USER_ONLY && s->sctlr_b && BE32_XOR) { \
TCGv addr_be = tcg_temp_new(); \
tcg_gen_xori_i32(addr_be, addr, BE32_XOR); \
tcg_gen_qemu_ld_i32(val, addr_be, index, opc); \
tcg_temp_free(addr_be); \
return; \
} \
tcg_gen_qemu_ld_i32(val, addr, index, opc); \
}
#define DO_GEN_ST(SUFF, OPC, BE32_XOR) \
static inline void gen_aa32_st##SUFF(DisasContext *s, TCGv_i32 val, \
TCGv_i32 addr, int index) \
{ \
TCGMemOp opc = (OPC) | s->be_data; \
/* Not needed for user-mode BE32, where we use MO_BE instead. */ \
if (!IS_USER_ONLY && s->sctlr_b && BE32_XOR) { \
TCGv addr_be = tcg_temp_new(); \
tcg_gen_xori_i32(addr_be, addr, BE32_XOR); \
tcg_gen_qemu_st_i32(val, addr_be, index, opc); \
tcg_temp_free(addr_be); \
return; \
} \
tcg_gen_qemu_st_i32(val, addr, index, opc); \
}
static inline void gen_aa32_ld64(DisasContext *s, TCGv_i64 val,
TCGv_i32 addr, int index)
static inline TCGv gen_aa32_addr(DisasContext *s, TCGv_i32 a32, TCGMemOp op)
{
TCGv addr = tcg_temp_new();
tcg_gen_extu_i32_tl(addr, a32);
/* Not needed for user-mode BE32, where we use MO_BE instead. */
if (!IS_USER_ONLY && s->sctlr_b && (op & MO_SIZE) < MO_32) {
tcg_gen_xori_tl(addr, addr, 4 - (1 << (op & MO_SIZE)));
}
return addr;
}
static void gen_aa32_ld_i32(DisasContext *s, TCGv_i32 val, TCGv_i32 a32,
int index, TCGMemOp opc)
{
TCGv addr = gen_aa32_addr(s, a32, opc);
tcg_gen_qemu_ld_i32(val, addr, index, opc);
tcg_temp_free(addr);
}
static void gen_aa32_st_i32(DisasContext *s, TCGv_i32 val, TCGv_i32 a32,
int index, TCGMemOp opc)
{
TCGv addr = gen_aa32_addr(s, a32, opc);
tcg_gen_qemu_st_i32(val, addr, index, opc);
tcg_temp_free(addr);
}
#define DO_GEN_LD(SUFF, OPC) \
static inline void gen_aa32_ld##SUFF(DisasContext *s, TCGv_i32 val, \
TCGv_i32 a32, int index) \
{ \
gen_aa32_ld_i32(s, val, a32, index, OPC | s->be_data); \
}
#define DO_GEN_ST(SUFF, OPC) \
static inline void gen_aa32_st##SUFF(DisasContext *s, TCGv_i32 val, \
TCGv_i32 a32, int index) \
{ \
gen_aa32_st_i32(s, val, a32, index, OPC | s->be_data); \
}
static inline void gen_aa32_frob64(DisasContext *s, TCGv_i64 val)
{
TCGMemOp opc = MO_Q | s->be_data;
tcg_gen_qemu_ld_i64(val, addr, index, opc);
/* Not needed for user-mode BE32, where we use MO_BE instead. */
if (!IS_USER_ONLY && s->sctlr_b) {
tcg_gen_rotri_i64(val, val, 32);
}
}
static inline void gen_aa32_st64(DisasContext *s, TCGv_i64 val,
TCGv_i32 addr, int index)
static void gen_aa32_ld_i64(DisasContext *s, TCGv_i64 val, TCGv_i32 a32,
int index, TCGMemOp opc)
{
TCGMemOp opc = MO_Q | s->be_data;
TCGv addr = gen_aa32_addr(s, a32, opc);
tcg_gen_qemu_ld_i64(val, addr, index, opc);
gen_aa32_frob64(s, val);
tcg_temp_free(addr);
}
static inline void gen_aa32_ld64(DisasContext *s, TCGv_i64 val,
TCGv_i32 a32, int index)
{
gen_aa32_ld_i64(s, val, a32, index, MO_Q | s->be_data);
}
static void gen_aa32_st_i64(DisasContext *s, TCGv_i64 val, TCGv_i32 a32,
int index, TCGMemOp opc)
{
TCGv addr = gen_aa32_addr(s, a32, opc);
/* Not needed for user-mode BE32, where we use MO_BE instead. */
if (!IS_USER_ONLY && s->sctlr_b) {
TCGv_i64 tmp = tcg_temp_new_i64();
tcg_gen_rotri_i64(tmp, val, 32);
tcg_gen_qemu_st_i64(tmp, addr, index, opc);
tcg_temp_free_i64(tmp);
return;
} else {
tcg_gen_qemu_st_i64(val, addr, index, opc);
}
tcg_gen_qemu_st_i64(val, addr, index, opc);
}
#else
#define DO_GEN_LD(SUFF, OPC, BE32_XOR) \
static inline void gen_aa32_ld##SUFF(DisasContext *s, TCGv_i32 val, \
TCGv_i32 addr, int index) \
{ \
TCGMemOp opc = (OPC) | s->be_data; \
TCGv addr64 = tcg_temp_new(); \
tcg_gen_extu_i32_i64(addr64, addr); \
/* Not needed for user-mode BE32, where we use MO_BE instead. */ \
if (!IS_USER_ONLY && s->sctlr_b && BE32_XOR) { \
tcg_gen_xori_i64(addr64, addr64, BE32_XOR); \
} \
tcg_gen_qemu_ld_i32(val, addr64, index, opc); \
tcg_temp_free(addr64); \
}
#define DO_GEN_ST(SUFF, OPC, BE32_XOR) \
static inline void gen_aa32_st##SUFF(DisasContext *s, TCGv_i32 val, \
TCGv_i32 addr, int index) \
{ \
TCGMemOp opc = (OPC) | s->be_data; \
TCGv addr64 = tcg_temp_new(); \
tcg_gen_extu_i32_i64(addr64, addr); \
/* Not needed for user-mode BE32, where we use MO_BE instead. */ \
if (!IS_USER_ONLY && s->sctlr_b && BE32_XOR) { \
tcg_gen_xori_i64(addr64, addr64, BE32_XOR); \
} \
tcg_gen_qemu_st_i32(val, addr64, index, opc); \
tcg_temp_free(addr64); \
}
static inline void gen_aa32_ld64(DisasContext *s, TCGv_i64 val,
TCGv_i32 addr, int index)
{
TCGMemOp opc = MO_Q | s->be_data;
TCGv addr64 = tcg_temp_new();
tcg_gen_extu_i32_i64(addr64, addr);
tcg_gen_qemu_ld_i64(val, addr64, index, opc);
/* Not needed for user-mode BE32, where we use MO_BE instead. */
if (!IS_USER_ONLY && s->sctlr_b) {
tcg_gen_rotri_i64(val, val, 32);
}
tcg_temp_free(addr64);
tcg_temp_free(addr);
}
static inline void gen_aa32_st64(DisasContext *s, TCGv_i64 val,
TCGv_i32 addr, int index)
TCGv_i32 a32, int index)
{
TCGMemOp opc = MO_Q | s->be_data;
TCGv addr64 = tcg_temp_new();
tcg_gen_extu_i32_i64(addr64, addr);
/* Not needed for user-mode BE32, where we use MO_BE instead. */
if (!IS_USER_ONLY && s->sctlr_b) {
TCGv tmp = tcg_temp_new();
tcg_gen_rotri_i64(tmp, val, 32);
tcg_gen_qemu_st_i64(tmp, addr64, index, opc);
tcg_temp_free(tmp);
} else {
tcg_gen_qemu_st_i64(val, addr64, index, opc);
}
tcg_temp_free(addr64);
gen_aa32_st_i64(s, val, a32, index, MO_Q | s->be_data);
}
#endif
DO_GEN_LD(8s, MO_SB, 3)
DO_GEN_LD(8u, MO_UB, 3)
DO_GEN_LD(16s, MO_SW, 2)
DO_GEN_LD(16u, MO_UW, 2)
DO_GEN_LD(32u, MO_UL, 0)
/* 'a' variants include an alignment check */
DO_GEN_LD(16ua, MO_UW | MO_ALIGN, 2)
DO_GEN_LD(32ua, MO_UL | MO_ALIGN, 0)
DO_GEN_ST(8, MO_UB, 3)
DO_GEN_ST(16, MO_UW, 2)
DO_GEN_ST(32, MO_UL, 0)
DO_GEN_LD(8s, MO_SB)
DO_GEN_LD(8u, MO_UB)
DO_GEN_LD(16s, MO_SW)
DO_GEN_LD(16u, MO_UW)
DO_GEN_LD(32u, MO_UL)
DO_GEN_ST(8, MO_UB)
DO_GEN_ST(16, MO_UW)
DO_GEN_ST(32, MO_UL)
static inline void gen_set_pc_im(DisasContext *s, target_ulong val)
{
@@ -7759,45 +7707,30 @@ static void gen_logicq_cc(TCGv_i32 lo, TCGv_i32 hi)
/* Load/Store exclusive instructions are implemented by remembering
the value/address loaded, and seeing if these are the same
when the store is performed. This should be sufficient to implement
when the store is performed. This should be sufficient to implement
the architecturally mandated semantics, and avoids having to monitor
regular stores.
In system emulation mode only one CPU will be running at once, so
this sequence is effectively atomic. In user emulation mode we
throw an exception and handle the atomic operation elsewhere. */
regular stores. The compare vs the remembered value is done during
the cmpxchg operation, but we must compare the addresses manually. */
static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
TCGv_i32 addr, int size)
{
TCGv_i32 tmp = tcg_temp_new_i32();
TCGMemOp opc = size | MO_ALIGN | s->be_data;
s->is_ldex = true;
switch (size) {
case 0:
gen_aa32_ld8u(s, tmp, addr, get_mem_index(s));
break;
case 1:
gen_aa32_ld16ua(s, tmp, addr, get_mem_index(s));
break;
case 2:
case 3:
gen_aa32_ld32ua(s, tmp, addr, get_mem_index(s));
break;
default:
abort();
}
if (size == 3) {
TCGv_i32 tmp2 = tcg_temp_new_i32();
TCGv_i32 tmp3 = tcg_temp_new_i32();
TCGv_i64 t64 = tcg_temp_new_i64();
tcg_gen_addi_i32(tmp2, addr, 4);
gen_aa32_ld32u(s, tmp3, tmp2, get_mem_index(s));
tcg_temp_free_i32(tmp2);
tcg_gen_concat_i32_i64(cpu_exclusive_val, tmp, tmp3);
store_reg(s, rt2, tmp3);
gen_aa32_ld_i64(s, t64, addr, get_mem_index(s), opc);
tcg_gen_mov_i64(cpu_exclusive_val, t64);
tcg_gen_extr_i64_i32(tmp, tmp2, t64);
tcg_temp_free_i64(t64);
store_reg(s, rt2, tmp2);
} else {
gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), opc);
tcg_gen_extu_i32_i64(cpu_exclusive_val, tmp);
}
@@ -7810,23 +7743,15 @@ static void gen_clrex(DisasContext *s)
tcg_gen_movi_i64(cpu_exclusive_addr, -1);
}
#ifdef CONFIG_USER_ONLY
static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
TCGv_i32 addr, int size)
{
tcg_gen_extu_i32_i64(cpu_exclusive_test, addr);
tcg_gen_movi_i32(cpu_exclusive_info,
size | (rd << 4) | (rt << 8) | (rt2 << 12));
gen_exception_internal_insn(s, 4, EXCP_STREX);
}
#else
static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
TCGv_i32 addr, int size)
{
TCGv_i32 tmp;
TCGv_i64 val64, extaddr;
TCGv_i32 t0, t1, t2;
TCGv_i64 extaddr;
TCGv taddr;
TCGLabel *done_label;
TCGLabel *fail_label;
TCGMemOp opc = size | MO_ALIGN | s->be_data;
/* if (env->exclusive_addr == addr && env->exclusive_val == [addr]) {
[addr] = {Rt};
@@ -7841,69 +7766,45 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
tcg_gen_brcond_i64(TCG_COND_NE, extaddr, cpu_exclusive_addr, fail_label);
tcg_temp_free_i64(extaddr);
tmp = tcg_temp_new_i32();
switch (size) {
case 0:
gen_aa32_ld8u(s, tmp, addr, get_mem_index(s));
break;
case 1:
gen_aa32_ld16u(s, tmp, addr, get_mem_index(s));
break;
case 2:
case 3:
gen_aa32_ld32u(s, tmp, addr, get_mem_index(s));
break;
default:
abort();
}
val64 = tcg_temp_new_i64();
taddr = gen_aa32_addr(s, addr, opc);
t0 = tcg_temp_new_i32();
t1 = load_reg(s, rt);
if (size == 3) {
TCGv_i32 tmp2 = tcg_temp_new_i32();
TCGv_i32 tmp3 = tcg_temp_new_i32();
tcg_gen_addi_i32(tmp2, addr, 4);
gen_aa32_ld32u(s, tmp3, tmp2, get_mem_index(s));
tcg_temp_free_i32(tmp2);
tcg_gen_concat_i32_i64(val64, tmp, tmp3);
tcg_temp_free_i32(tmp3);
TCGv_i64 o64 = tcg_temp_new_i64();
TCGv_i64 n64 = tcg_temp_new_i64();
t2 = load_reg(s, rt2);
tcg_gen_concat_i32_i64(n64, t1, t2);
tcg_temp_free_i32(t2);
gen_aa32_frob64(s, n64);
tcg_gen_atomic_cmpxchg_i64(o64, taddr, cpu_exclusive_val, n64,
get_mem_index(s), opc);
tcg_temp_free_i64(n64);
gen_aa32_frob64(s, o64);
tcg_gen_setcond_i64(TCG_COND_NE, o64, o64, cpu_exclusive_val);
tcg_gen_extrl_i64_i32(t0, o64);
tcg_temp_free_i64(o64);
} else {
tcg_gen_extu_i32_i64(val64, tmp);
t2 = tcg_temp_new_i32();
tcg_gen_extrl_i64_i32(t2, cpu_exclusive_val);
tcg_gen_atomic_cmpxchg_i32(t0, taddr, t2, t1, get_mem_index(s), opc);
tcg_gen_setcond_i32(TCG_COND_NE, t0, t0, t2);
tcg_temp_free_i32(t2);
}
tcg_temp_free_i32(tmp);
tcg_gen_brcond_i64(TCG_COND_NE, val64, cpu_exclusive_val, fail_label);
tcg_temp_free_i64(val64);
tmp = load_reg(s, rt);
switch (size) {
case 0:
gen_aa32_st8(s, tmp, addr, get_mem_index(s));
break;
case 1:
gen_aa32_st16(s, tmp, addr, get_mem_index(s));
break;
case 2:
case 3:
gen_aa32_st32(s, tmp, addr, get_mem_index(s));
break;
default:
abort();
}
tcg_temp_free_i32(tmp);
if (size == 3) {
tcg_gen_addi_i32(addr, addr, 4);
tmp = load_reg(s, rt2);
gen_aa32_st32(s, tmp, addr, get_mem_index(s));
tcg_temp_free_i32(tmp);
}
tcg_gen_movi_i32(cpu_R[rd], 0);
tcg_temp_free_i32(t1);
tcg_temp_free(taddr);
tcg_gen_mov_i32(cpu_R[rd], t0);
tcg_temp_free_i32(t0);
tcg_gen_br(done_label);
gen_set_label(fail_label);
tcg_gen_movi_i32(cpu_R[rd], 1);
gen_set_label(done_label);
tcg_gen_movi_i64(cpu_exclusive_addr, -1);
}
#endif
/* gen_srs:
* @env: CPUARMState
@@ -8878,25 +8779,27 @@ static void disas_arm_insn(DisasContext *s, unsigned int insn)
}
tcg_temp_free_i32(addr);
} else {
TCGv taddr;
TCGMemOp opc = s->be_data;
/* SWP instruction */
rm = (insn) & 0xf;
/* ??? This is not really atomic. However we know
we never have multiple CPUs running in parallel,
so it is good enough. */
addr = load_reg(s, rn);
tmp = load_reg(s, rm);
tmp2 = tcg_temp_new_i32();
if (insn & (1 << 22)) {
gen_aa32_ld8u(s, tmp2, addr, get_mem_index(s));
gen_aa32_st8(s, tmp, addr, get_mem_index(s));
opc |= MO_UB;
} else {
gen_aa32_ld32u(s, tmp2, addr, get_mem_index(s));
gen_aa32_st32(s, tmp, addr, get_mem_index(s));
opc |= MO_UL | MO_ALIGN;
}
tcg_temp_free_i32(tmp);
addr = load_reg(s, rn);
taddr = gen_aa32_addr(s, addr, opc);
tcg_temp_free_i32(addr);
store_reg(s, rd, tmp2);
tmp = load_reg(s, rm);
tcg_gen_atomic_xchg_i32(tmp, taddr, tmp,
get_mem_index(s), opc);
tcg_temp_free(taddr);
store_reg(s, rd, tmp);
}
}
} else {
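One detail worth calling out from the gen_aa32_addr change above: in legacy BE32 mode (SCTLR.B set), sub-word accesses are redirected to the opposite byte lanes by XORing the low address bits. In plain C the adjustment looks roughly like this (hypothetical helper, sizes in bytes):

#include <stdint.h>

static uint32_t be32_adjust(uint32_t addr, unsigned size_bytes)
{
    if (size_bytes < 4) {
        addr ^= 4 - size_bytes;   /* XOR 3 for byte accesses, 2 for halfwords */
    }
    return addr;                  /* word-sized accesses need no address change */
}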

View File

@@ -79,10 +79,6 @@ extern TCGv_env cpu_env;
extern TCGv_i32 cpu_NF, cpu_ZF, cpu_CF, cpu_VF;
extern TCGv_i64 cpu_exclusive_addr;
extern TCGv_i64 cpu_exclusive_val;
#ifdef CONFIG_USER_ONLY
extern TCGv_i64 cpu_exclusive_test;
extern TCGv_i32 cpu_exclusive_info;
#endif
static inline int arm_dc_feature(DisasContext *dc, int feature)
{

View File

@@ -1,8 +1,6 @@
DEF_HELPER_FLAGS_4(cc_compute_all, TCG_CALL_NO_RWG_SE, tl, tl, tl, tl, int)
DEF_HELPER_FLAGS_4(cc_compute_c, TCG_CALL_NO_RWG_SE, tl, tl, tl, tl, int)
DEF_HELPER_0(lock, void)
DEF_HELPER_0(unlock, void)
DEF_HELPER_3(write_eflags, void, env, tl, i32)
DEF_HELPER_1(read_eflags, tl, env)
DEF_HELPER_2(divb_AL, void, env, tl)
@@ -74,8 +72,10 @@ DEF_HELPER_3(boundw, void, env, tl, int)
DEF_HELPER_3(boundl, void, env, tl, int)
DEF_HELPER_1(rsm, void, env)
DEF_HELPER_2(into, void, env, int)
DEF_HELPER_2(cmpxchg8b_unlocked, void, env, tl)
DEF_HELPER_2(cmpxchg8b, void, env, tl)
#ifdef TARGET_X86_64
DEF_HELPER_2(cmpxchg16b_unlocked, void, env, tl)
DEF_HELPER_2(cmpxchg16b, void, env, tl)
#endif
DEF_HELPER_1(single_step, void, env)

View File

@@ -22,87 +22,146 @@
#include "exec/helper-proto.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
#include "qemu/int128.h"
#include "tcg.h"
/* broken thread support */
#if defined(CONFIG_USER_ONLY)
QemuMutex global_cpu_lock;
void helper_lock(void)
void helper_cmpxchg8b_unlocked(CPUX86State *env, target_ulong a0)
{
qemu_mutex_lock(&global_cpu_lock);
}
void helper_unlock(void)
{
qemu_mutex_unlock(&global_cpu_lock);
}
void helper_lock_init(void)
{
qemu_mutex_init(&global_cpu_lock);
}
#else
void helper_lock(void)
{
}
void helper_unlock(void)
{
}
void helper_lock_init(void)
{
}
#endif
void helper_cmpxchg8b(CPUX86State *env, target_ulong a0)
{
uint64_t d;
uintptr_t ra = GETPC();
uint64_t oldv, cmpv, newv;
int eflags;
eflags = cpu_cc_compute_all(env, CC_OP);
d = cpu_ldq_data_ra(env, a0, GETPC());
if (d == (((uint64_t)env->regs[R_EDX] << 32) | (uint32_t)env->regs[R_EAX])) {
cpu_stq_data_ra(env, a0, ((uint64_t)env->regs[R_ECX] << 32)
| (uint32_t)env->regs[R_EBX], GETPC());
cmpv = deposit64(env->regs[R_EAX], 32, 32, env->regs[R_EDX]);
newv = deposit64(env->regs[R_EBX], 32, 32, env->regs[R_ECX]);
oldv = cpu_ldq_data_ra(env, a0, ra);
newv = (cmpv == oldv ? newv : oldv);
/* always do the store */
cpu_stq_data_ra(env, a0, newv, ra);
if (oldv == cmpv) {
eflags |= CC_Z;
} else {
/* always do the store */
cpu_stq_data_ra(env, a0, d, GETPC());
env->regs[R_EDX] = (uint32_t)(d >> 32);
env->regs[R_EAX] = (uint32_t)d;
env->regs[R_EAX] = (uint32_t)oldv;
env->regs[R_EDX] = (uint32_t)(oldv >> 32);
eflags &= ~CC_Z;
}
CC_SRC = eflags;
}
#ifdef TARGET_X86_64
void helper_cmpxchg16b(CPUX86State *env, target_ulong a0)
void helper_cmpxchg8b(CPUX86State *env, target_ulong a0)
{
uint64_t d0, d1;
#ifdef CONFIG_ATOMIC64
uint64_t oldv, cmpv, newv;
int eflags;
eflags = cpu_cc_compute_all(env, CC_OP);
cmpv = deposit64(env->regs[R_EAX], 32, 32, env->regs[R_EDX]);
newv = deposit64(env->regs[R_EBX], 32, 32, env->regs[R_ECX]);
#ifdef CONFIG_USER_ONLY
{
uint64_t *haddr = g2h(a0);
cmpv = cpu_to_le64(cmpv);
newv = cpu_to_le64(newv);
oldv = atomic_cmpxchg__nocheck(haddr, cmpv, newv);
oldv = le64_to_cpu(oldv);
}
#else
{
uintptr_t ra = GETPC();
int mem_idx = cpu_mmu_index(env, false);
TCGMemOpIdx oi = make_memop_idx(MO_TEQ, mem_idx);
oldv = helper_atomic_cmpxchgq_le_mmu(env, a0, cmpv, newv, oi, ra);
}
#endif
if (oldv == cmpv) {
eflags |= CC_Z;
} else {
env->regs[R_EAX] = (uint32_t)oldv;
env->regs[R_EDX] = (uint32_t)(oldv >> 32);
eflags &= ~CC_Z;
}
CC_SRC = eflags;
#else
cpu_loop_exit_atomic(ENV_GET_CPU(env), GETPC());
#endif /* CONFIG_ATOMIC64 */
}
#ifdef TARGET_X86_64
void helper_cmpxchg16b_unlocked(CPUX86State *env, target_ulong a0)
{
uintptr_t ra = GETPC();
Int128 oldv, cmpv, newv;
uint64_t o0, o1;
int eflags;
bool success;
if ((a0 & 0xf) != 0) {
raise_exception_ra(env, EXCP0D_GPF, GETPC());
}
eflags = cpu_cc_compute_all(env, CC_OP);
d0 = cpu_ldq_data_ra(env, a0, GETPC());
d1 = cpu_ldq_data_ra(env, a0 + 8, GETPC());
if (d0 == env->regs[R_EAX] && d1 == env->regs[R_EDX]) {
cpu_stq_data_ra(env, a0, env->regs[R_EBX], GETPC());
cpu_stq_data_ra(env, a0 + 8, env->regs[R_ECX], GETPC());
cmpv = int128_make128(env->regs[R_EAX], env->regs[R_EDX]);
newv = int128_make128(env->regs[R_EBX], env->regs[R_ECX]);
o0 = cpu_ldq_data_ra(env, a0 + 0, ra);
o1 = cpu_ldq_data_ra(env, a0 + 8, ra);
oldv = int128_make128(o0, o1);
success = int128_eq(oldv, cmpv);
if (!success) {
newv = oldv;
}
cpu_stq_data_ra(env, a0 + 0, int128_getlo(newv), ra);
cpu_stq_data_ra(env, a0 + 8, int128_gethi(newv), ra);
if (success) {
eflags |= CC_Z;
} else {
/* always do the store */
cpu_stq_data_ra(env, a0, d0, GETPC());
cpu_stq_data_ra(env, a0 + 8, d1, GETPC());
env->regs[R_EDX] = d1;
env->regs[R_EAX] = d0;
env->regs[R_EAX] = int128_getlo(oldv);
env->regs[R_EDX] = int128_gethi(oldv);
eflags &= ~CC_Z;
}
CC_SRC = eflags;
}
void helper_cmpxchg16b(CPUX86State *env, target_ulong a0)
{
uintptr_t ra = GETPC();
if ((a0 & 0xf) != 0) {
raise_exception_ra(env, EXCP0D_GPF, ra);
} else {
#ifndef CONFIG_ATOMIC128
cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
#else
int eflags = cpu_cc_compute_all(env, CC_OP);
Int128 cmpv = int128_make128(env->regs[R_EAX], env->regs[R_EDX]);
Int128 newv = int128_make128(env->regs[R_EBX], env->regs[R_ECX]);
int mem_idx = cpu_mmu_index(env, false);
TCGMemOpIdx oi = make_memop_idx(MO_TEQ | MO_ALIGN_16, mem_idx);
Int128 oldv = helper_atomic_cmpxchgo_le_mmu(env, a0, cmpv,
newv, oi, ra);
if (int128_eq(oldv, cmpv)) {
eflags |= CC_Z;
} else {
env->regs[R_EAX] = int128_getlo(oldv);
env->regs[R_EDX] = int128_gethi(oldv);
eflags &= ~CC_Z;
}
CC_SRC = eflags;
#endif
}
}
#endif
void helper_boundw(CPUX86State *env, target_ulong a0, int v)
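For reference, the behaviour helper_cmpxchg8b_unlocked reproduces is the architectural CMPXCHG8B: compare EDX:EAX with the 64-bit memory operand, store ECX:EBX and set ZF on a match, otherwise load the operand into EDX:EAX and clear ZF, with the store to memory happening either way. A simplified, non-atomic model (the regs struct is a stand-in, not CPUX86State):

#include <stdint.h>
#include <stdbool.h>

struct regs { uint32_t eax, ebx, ecx, edx; bool zf; };

static void cmpxchg8b_model(struct regs *r, uint64_t *mem)
{
    uint64_t cmpv = ((uint64_t)r->edx << 32) | r->eax;
    uint64_t newv = ((uint64_t)r->ecx << 32) | r->ebx;
    uint64_t oldv = *mem;

    *mem = (oldv == cmpv) ? newv : oldv;   /* the store always happens */
    r->zf = (oldv == cmpv);
    if (!r->zf) {
        r->eax = (uint32_t)oldv;           /* mismatch: report the current value */
        r->edx = (uint32_t)(oldv >> 32);
    }
}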

View File

@@ -1257,55 +1257,95 @@ static void gen_op(DisasContext *s1, int op, TCGMemOp ot, int d)
{
if (d != OR_TMP0) {
gen_op_mov_v_reg(ot, cpu_T0, d);
} else {
} else if (!(s1->prefix & PREFIX_LOCK)) {
gen_op_ld_v(s1, ot, cpu_T0, cpu_A0);
}
switch(op) {
case OP_ADCL:
gen_compute_eflags_c(s1, cpu_tmp4);
tcg_gen_add_tl(cpu_T0, cpu_T0, cpu_T1);
tcg_gen_add_tl(cpu_T0, cpu_T0, cpu_tmp4);
gen_op_st_rm_T0_A0(s1, ot, d);
if (s1->prefix & PREFIX_LOCK) {
tcg_gen_add_tl(cpu_T0, cpu_tmp4, cpu_T1);
tcg_gen_atomic_add_fetch_tl(cpu_T0, cpu_A0, cpu_T0,
s1->mem_index, ot | MO_LE);
} else {
tcg_gen_add_tl(cpu_T0, cpu_T0, cpu_T1);
tcg_gen_add_tl(cpu_T0, cpu_T0, cpu_tmp4);
gen_op_st_rm_T0_A0(s1, ot, d);
}
gen_op_update3_cc(cpu_tmp4);
set_cc_op(s1, CC_OP_ADCB + ot);
break;
case OP_SBBL:
gen_compute_eflags_c(s1, cpu_tmp4);
tcg_gen_sub_tl(cpu_T0, cpu_T0, cpu_T1);
tcg_gen_sub_tl(cpu_T0, cpu_T0, cpu_tmp4);
gen_op_st_rm_T0_A0(s1, ot, d);
if (s1->prefix & PREFIX_LOCK) {
tcg_gen_add_tl(cpu_T0, cpu_T1, cpu_tmp4);
tcg_gen_neg_tl(cpu_T0, cpu_T0);
tcg_gen_atomic_add_fetch_tl(cpu_T0, cpu_A0, cpu_T0,
s1->mem_index, ot | MO_LE);
} else {
tcg_gen_sub_tl(cpu_T0, cpu_T0, cpu_T1);
tcg_gen_sub_tl(cpu_T0, cpu_T0, cpu_tmp4);
gen_op_st_rm_T0_A0(s1, ot, d);
}
gen_op_update3_cc(cpu_tmp4);
set_cc_op(s1, CC_OP_SBBB + ot);
break;
case OP_ADDL:
tcg_gen_add_tl(cpu_T0, cpu_T0, cpu_T1);
gen_op_st_rm_T0_A0(s1, ot, d);
if (s1->prefix & PREFIX_LOCK) {
tcg_gen_atomic_add_fetch_tl(cpu_T0, cpu_A0, cpu_T1,
s1->mem_index, ot | MO_LE);
} else {
tcg_gen_add_tl(cpu_T0, cpu_T0, cpu_T1);
gen_op_st_rm_T0_A0(s1, ot, d);
}
gen_op_update2_cc();
set_cc_op(s1, CC_OP_ADDB + ot);
break;
case OP_SUBL:
tcg_gen_mov_tl(cpu_cc_srcT, cpu_T0);
tcg_gen_sub_tl(cpu_T0, cpu_T0, cpu_T1);
gen_op_st_rm_T0_A0(s1, ot, d);
if (s1->prefix & PREFIX_LOCK) {
tcg_gen_neg_tl(cpu_T0, cpu_T1);
tcg_gen_atomic_fetch_add_tl(cpu_cc_srcT, cpu_A0, cpu_T0,
s1->mem_index, ot | MO_LE);
tcg_gen_sub_tl(cpu_T0, cpu_cc_srcT, cpu_T1);
} else {
tcg_gen_mov_tl(cpu_cc_srcT, cpu_T0);
tcg_gen_sub_tl(cpu_T0, cpu_T0, cpu_T1);
gen_op_st_rm_T0_A0(s1, ot, d);
}
gen_op_update2_cc();
set_cc_op(s1, CC_OP_SUBB + ot);
break;
default:
case OP_ANDL:
tcg_gen_and_tl(cpu_T0, cpu_T0, cpu_T1);
gen_op_st_rm_T0_A0(s1, ot, d);
if (s1->prefix & PREFIX_LOCK) {
tcg_gen_atomic_and_fetch_tl(cpu_T0, cpu_A0, cpu_T1,
s1->mem_index, ot | MO_LE);
} else {
tcg_gen_and_tl(cpu_T0, cpu_T0, cpu_T1);
gen_op_st_rm_T0_A0(s1, ot, d);
}
gen_op_update1_cc();
set_cc_op(s1, CC_OP_LOGICB + ot);
break;
case OP_ORL:
tcg_gen_or_tl(cpu_T0, cpu_T0, cpu_T1);
gen_op_st_rm_T0_A0(s1, ot, d);
if (s1->prefix & PREFIX_LOCK) {
tcg_gen_atomic_or_fetch_tl(cpu_T0, cpu_A0, cpu_T1,
s1->mem_index, ot | MO_LE);
} else {
tcg_gen_or_tl(cpu_T0, cpu_T0, cpu_T1);
gen_op_st_rm_T0_A0(s1, ot, d);
}
gen_op_update1_cc();
set_cc_op(s1, CC_OP_LOGICB + ot);
break;
case OP_XORL:
tcg_gen_xor_tl(cpu_T0, cpu_T0, cpu_T1);
gen_op_st_rm_T0_A0(s1, ot, d);
if (s1->prefix & PREFIX_LOCK) {
tcg_gen_atomic_xor_fetch_tl(cpu_T0, cpu_A0, cpu_T1,
s1->mem_index, ot | MO_LE);
} else {
tcg_gen_xor_tl(cpu_T0, cpu_T0, cpu_T1);
gen_op_st_rm_T0_A0(s1, ot, d);
}
gen_op_update1_cc();
set_cc_op(s1, CC_OP_LOGICB + ot);
break;
@@ -1321,21 +1361,23 @@ static void gen_op(DisasContext *s1, int op, TCGMemOp ot, int d)
/* if d == OR_TMP0, it means memory operand (address in A0) */
static void gen_inc(DisasContext *s1, TCGMemOp ot, int d, int c)
{
if (d != OR_TMP0) {
gen_op_mov_v_reg(ot, cpu_T0, d);
if (s1->prefix & PREFIX_LOCK) {
tcg_gen_movi_tl(cpu_T0, c > 0 ? 1 : -1);
tcg_gen_atomic_add_fetch_tl(cpu_T0, cpu_A0, cpu_T0,
s1->mem_index, ot | MO_LE);
} else {
gen_op_ld_v(s1, ot, cpu_T0, cpu_A0);
if (d != OR_TMP0) {
gen_op_mov_v_reg(ot, cpu_T0, d);
} else {
gen_op_ld_v(s1, ot, cpu_T0, cpu_A0);
}
tcg_gen_addi_tl(cpu_T0, cpu_T0, (c > 0 ? 1 : -1));
gen_op_st_rm_T0_A0(s1, ot, d);
}
gen_compute_eflags_c(s1, cpu_cc_src);
if (c > 0) {
tcg_gen_addi_tl(cpu_T0, cpu_T0, 1);
set_cc_op(s1, CC_OP_INCB + ot);
} else {
tcg_gen_addi_tl(cpu_T0, cpu_T0, -1);
set_cc_op(s1, CC_OP_DECB + ot);
}
gen_op_st_rm_T0_A0(s1, ot, d);
tcg_gen_mov_tl(cpu_cc_dst, cpu_T0);
set_cc_op(s1, (c > 0 ? CC_OP_INCB : CC_OP_DECB) + ot);
}
static void gen_shift_flags(DisasContext *s, TCGMemOp ot, TCGv result,
@@ -4494,10 +4536,6 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
s->aflag = aflag;
s->dflag = dflag;
/* lock generation */
if (prefixes & PREFIX_LOCK)
gen_helper_lock();
/* now check op code */
reswitch:
switch(b) {
@@ -4632,10 +4670,15 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
rm = (modrm & 7) | REX_B(s);
op = (modrm >> 3) & 7;
if (mod != 3) {
if (op == 0)
if (op == 0) {
s->rip_offset = insn_const_size(ot);
}
gen_lea_modrm(env, s, modrm);
gen_op_ld_v(s, ot, cpu_T0, cpu_A0);
/* For those below that handle locked memory, don't load here. */
if (!(s->prefix & PREFIX_LOCK)
|| op != 2) {
gen_op_ld_v(s, ot, cpu_T0, cpu_A0);
}
} else {
gen_op_mov_v_reg(ot, cpu_T0, rm);
}
@@ -4648,19 +4691,58 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
set_cc_op(s, CC_OP_LOGICB + ot);
break;
case 2: /* not */
tcg_gen_not_tl(cpu_T0, cpu_T0);
if (mod != 3) {
gen_op_st_v(s, ot, cpu_T0, cpu_A0);
if (s->prefix & PREFIX_LOCK) {
if (mod == 3) {
goto illegal_op;
}
tcg_gen_movi_tl(cpu_T0, ~0);
tcg_gen_atomic_xor_fetch_tl(cpu_T0, cpu_A0, cpu_T0,
s->mem_index, ot | MO_LE);
} else {
gen_op_mov_reg_v(ot, rm, cpu_T0);
tcg_gen_not_tl(cpu_T0, cpu_T0);
if (mod != 3) {
gen_op_st_v(s, ot, cpu_T0, cpu_A0);
} else {
gen_op_mov_reg_v(ot, rm, cpu_T0);
}
}
break;
case 3: /* neg */
tcg_gen_neg_tl(cpu_T0, cpu_T0);
if (mod != 3) {
gen_op_st_v(s, ot, cpu_T0, cpu_A0);
if (s->prefix & PREFIX_LOCK) {
TCGLabel *label1;
TCGv a0, t0, t1, t2;
if (mod == 3) {
goto illegal_op;
}
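/* Locked NEG has no single atomic op: run a compare-and-swap loop,
 * retrying until the value that was negated is still the one in memory. */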
a0 = tcg_temp_local_new();
t0 = tcg_temp_local_new();
label1 = gen_new_label();
tcg_gen_mov_tl(a0, cpu_A0);
tcg_gen_mov_tl(t0, cpu_T0);
gen_set_label(label1);
t1 = tcg_temp_new();
t2 = tcg_temp_new();
tcg_gen_mov_tl(t2, t0);
tcg_gen_neg_tl(t1, t0);
tcg_gen_atomic_cmpxchg_tl(t0, a0, t0, t1,
s->mem_index, ot | MO_LE);
tcg_temp_free(t1);
tcg_gen_brcond_tl(TCG_COND_NE, t0, t2, label1);
tcg_temp_free(t2);
tcg_temp_free(a0);
tcg_gen_mov_tl(cpu_T0, t0);
tcg_temp_free(t0);
} else {
gen_op_mov_reg_v(ot, rm, cpu_T0);
tcg_gen_neg_tl(cpu_T0, cpu_T0);
if (mod != 3) {
gen_op_st_v(s, ot, cpu_T0, cpu_A0);
} else {
gen_op_mov_reg_v(ot, rm, cpu_T0);
}
}
gen_op_update_neg_cc();
set_cc_op(s, CC_OP_SUBB + ot);
@@ -5048,19 +5130,24 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
modrm = cpu_ldub_code(env, s->pc++);
reg = ((modrm >> 3) & 7) | rex_r;
mod = (modrm >> 6) & 3;
gen_op_mov_v_reg(ot, cpu_T0, reg);
if (mod == 3) {
rm = (modrm & 7) | REX_B(s);
gen_op_mov_v_reg(ot, cpu_T0, reg);
gen_op_mov_v_reg(ot, cpu_T1, rm);
tcg_gen_add_tl(cpu_T0, cpu_T0, cpu_T1);
gen_op_mov_reg_v(ot, reg, cpu_T1);
gen_op_mov_reg_v(ot, rm, cpu_T0);
} else {
gen_lea_modrm(env, s, modrm);
gen_op_mov_v_reg(ot, cpu_T0, reg);
gen_op_ld_v(s, ot, cpu_T1, cpu_A0);
tcg_gen_add_tl(cpu_T0, cpu_T0, cpu_T1);
gen_op_st_v(s, ot, cpu_T0, cpu_A0);
if (s->prefix & PREFIX_LOCK) {
tcg_gen_atomic_fetch_add_tl(cpu_T1, cpu_A0, cpu_T0,
s->mem_index, ot | MO_LE);
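/* The fetch_add above returned the old memory value in T1; the add below
 * recomputes reg + old in T0 so the flags see the value actually stored. */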
tcg_gen_add_tl(cpu_T0, cpu_T0, cpu_T1);
} else {
gen_op_ld_v(s, ot, cpu_T1, cpu_A0);
tcg_gen_add_tl(cpu_T0, cpu_T0, cpu_T1);
gen_op_st_v(s, ot, cpu_T0, cpu_A0);
}
gen_op_mov_reg_v(ot, reg, cpu_T1);
}
gen_op_update2_cc();
@@ -5069,57 +5156,58 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
case 0x1b0:
case 0x1b1: /* cmpxchg Ev, Gv */
{
TCGLabel *label1, *label2;
TCGv t0, t1, t2, a0;
TCGv oldv, newv, cmpv;
ot = mo_b_d(b, dflag);
modrm = cpu_ldub_code(env, s->pc++);
reg = ((modrm >> 3) & 7) | rex_r;
mod = (modrm >> 6) & 3;
t0 = tcg_temp_local_new();
t1 = tcg_temp_local_new();
t2 = tcg_temp_local_new();
a0 = tcg_temp_local_new();
gen_op_mov_v_reg(ot, t1, reg);
if (mod == 3) {
rm = (modrm & 7) | REX_B(s);
gen_op_mov_v_reg(ot, t0, rm);
} else {
oldv = tcg_temp_new();
newv = tcg_temp_new();
cmpv = tcg_temp_new();
gen_op_mov_v_reg(ot, newv, reg);
tcg_gen_mov_tl(cmpv, cpu_regs[R_EAX]);
if (s->prefix & PREFIX_LOCK) {
if (mod == 3) {
goto illegal_op;
}
gen_lea_modrm(env, s, modrm);
tcg_gen_mov_tl(a0, cpu_A0);
gen_op_ld_v(s, ot, t0, a0);
rm = 0; /* avoid warning */
}
label1 = gen_new_label();
tcg_gen_mov_tl(t2, cpu_regs[R_EAX]);
gen_extu(ot, t0);
gen_extu(ot, t2);
tcg_gen_brcond_tl(TCG_COND_EQ, t2, t0, label1);
label2 = gen_new_label();
if (mod == 3) {
gen_op_mov_reg_v(ot, R_EAX, t0);
tcg_gen_br(label2);
gen_set_label(label1);
gen_op_mov_reg_v(ot, rm, t1);
tcg_gen_atomic_cmpxchg_tl(oldv, cpu_A0, cmpv, newv,
s->mem_index, ot | MO_LE);
gen_op_mov_reg_v(ot, R_EAX, oldv);
} else {
/* perform no-op store cycle like physical cpu; must be
before changing accumulator to ensure idempotency if
the store faults and the instruction is restarted */
gen_op_st_v(s, ot, t0, a0);
gen_op_mov_reg_v(ot, R_EAX, t0);
tcg_gen_br(label2);
gen_set_label(label1);
gen_op_st_v(s, ot, t1, a0);
if (mod == 3) {
rm = (modrm & 7) | REX_B(s);
gen_op_mov_v_reg(ot, oldv, rm);
} else {
gen_lea_modrm(env, s, modrm);
gen_op_ld_v(s, ot, oldv, cpu_A0);
rm = 0; /* avoid warning */
}
gen_extu(ot, oldv);
gen_extu(ot, cmpv);
/* store value = (old == cmp ? new : old); */
tcg_gen_movcond_tl(TCG_COND_EQ, newv, oldv, cmpv, newv, oldv);
if (mod == 3) {
gen_op_mov_reg_v(ot, R_EAX, oldv);
gen_op_mov_reg_v(ot, rm, newv);
} else {
/* Perform an unconditional store cycle like physical cpu;
must be before changing accumulator to ensure
idempotency if the store faults and the instruction
is restarted */
gen_op_st_v(s, ot, newv, cpu_A0);
gen_op_mov_reg_v(ot, R_EAX, oldv);
}
}
gen_set_label(label2);
tcg_gen_mov_tl(cpu_cc_src, t0);
tcg_gen_mov_tl(cpu_cc_srcT, t2);
tcg_gen_sub_tl(cpu_cc_dst, t2, t0);
tcg_gen_mov_tl(cpu_cc_src, oldv);
tcg_gen_mov_tl(cpu_cc_srcT, cmpv);
tcg_gen_sub_tl(cpu_cc_dst, cmpv, oldv);
set_cc_op(s, CC_OP_SUBB + ot);
tcg_temp_free(t0);
tcg_temp_free(t1);
tcg_temp_free(t2);
tcg_temp_free(a0);
tcg_temp_free(oldv);
tcg_temp_free(newv);
tcg_temp_free(cmpv);
}
break;
case 0x1c7: /* cmpxchg8b */
@@ -5132,14 +5220,22 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
if (!(s->cpuid_ext_features & CPUID_EXT_CX16))
goto illegal_op;
gen_lea_modrm(env, s, modrm);
gen_helper_cmpxchg16b(cpu_env, cpu_A0);
if ((s->prefix & PREFIX_LOCK) && parallel_cpus) {
gen_helper_cmpxchg16b(cpu_env, cpu_A0);
} else {
gen_helper_cmpxchg16b_unlocked(cpu_env, cpu_A0);
}
} else
#endif
{
if (!(s->cpuid_features & CPUID_CX8))
goto illegal_op;
gen_lea_modrm(env, s, modrm);
gen_helper_cmpxchg8b(cpu_env, cpu_A0);
if ((s->prefix & PREFIX_LOCK) && parallel_cpus) {
gen_helper_cmpxchg8b(cpu_env, cpu_A0);
} else {
gen_helper_cmpxchg8b_unlocked(cpu_env, cpu_A0);
}
}
set_cc_op(s, CC_OP_EFLAGS);
break;
@@ -5464,12 +5560,8 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
gen_lea_modrm(env, s, modrm);
gen_op_mov_v_reg(ot, cpu_T0, reg);
/* for xchg, lock is implicit */
if (!(prefixes & PREFIX_LOCK))
gen_helper_lock();
gen_op_ld_v(s, ot, cpu_T1, cpu_A0);
gen_op_st_v(s, ot, cpu_T0, cpu_A0);
if (!(prefixes & PREFIX_LOCK))
gen_helper_unlock();
tcg_gen_atomic_xchg_tl(cpu_T1, cpu_A0, cpu_T0,
s->mem_index, ot | MO_LE);
gen_op_mov_reg_v(ot, reg, cpu_T1);
}
break;
@@ -6555,7 +6647,9 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
if (mod != 3) {
s->rip_offset = 1;
gen_lea_modrm(env, s, modrm);
gen_op_ld_v(s, ot, cpu_T0, cpu_A0);
if (!(s->prefix & PREFIX_LOCK)) {
gen_op_ld_v(s, ot, cpu_T0, cpu_A0);
}
} else {
gen_op_mov_v_reg(ot, cpu_T0, rm);
}
@@ -6585,44 +6679,69 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
rm = (modrm & 7) | REX_B(s);
gen_op_mov_v_reg(MO_32, cpu_T1, reg);
if (mod != 3) {
gen_lea_modrm(env, s, modrm);
AddressParts a = gen_lea_modrm_0(env, s, modrm);
/* specific case: we need to add a displacement */
gen_exts(ot, cpu_T1);
tcg_gen_sari_tl(cpu_tmp0, cpu_T1, 3 + ot);
tcg_gen_shli_tl(cpu_tmp0, cpu_tmp0, ot);
tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0);
gen_op_ld_v(s, ot, cpu_T0, cpu_A0);
tcg_gen_add_tl(cpu_A0, gen_lea_modrm_1(a), cpu_tmp0);
gen_lea_v_seg(s, s->aflag, cpu_A0, a.def_seg, s->override);
if (!(s->prefix & PREFIX_LOCK)) {
gen_op_ld_v(s, ot, cpu_T0, cpu_A0);
}
} else {
gen_op_mov_v_reg(ot, cpu_T0, rm);
}
bt_op:
tcg_gen_andi_tl(cpu_T1, cpu_T1, (1 << (3 + ot)) - 1);
tcg_gen_shr_tl(cpu_tmp4, cpu_T0, cpu_T1);
switch(op) {
case 0:
break;
case 1:
tcg_gen_movi_tl(cpu_tmp0, 1);
tcg_gen_shl_tl(cpu_tmp0, cpu_tmp0, cpu_T1);
tcg_gen_or_tl(cpu_T0, cpu_T0, cpu_tmp0);
break;
case 2:
tcg_gen_movi_tl(cpu_tmp0, 1);
tcg_gen_shl_tl(cpu_tmp0, cpu_tmp0, cpu_T1);
tcg_gen_andc_tl(cpu_T0, cpu_T0, cpu_tmp0);
break;
default:
case 3:
tcg_gen_movi_tl(cpu_tmp0, 1);
tcg_gen_shl_tl(cpu_tmp0, cpu_tmp0, cpu_T1);
tcg_gen_xor_tl(cpu_T0, cpu_T0, cpu_tmp0);
break;
}
if (op != 0) {
if (mod != 3) {
gen_op_st_v(s, ot, cpu_T0, cpu_A0);
} else {
gen_op_mov_reg_v(ot, rm, cpu_T0);
tcg_gen_movi_tl(cpu_tmp0, 1);
tcg_gen_shl_tl(cpu_tmp0, cpu_tmp0, cpu_T1);
if (s->prefix & PREFIX_LOCK) {
switch (op) {
case 0: /* bt */
/* Needs no atomic ops; we suppressed the normal
memory load for LOCK above, so do it now. */
gen_op_ld_v(s, ot, cpu_T0, cpu_A0);
break;
case 1: /* bts */
tcg_gen_atomic_fetch_or_tl(cpu_T0, cpu_A0, cpu_tmp0,
s->mem_index, ot | MO_LE);
break;
case 2: /* btr */
tcg_gen_not_tl(cpu_tmp0, cpu_tmp0);
tcg_gen_atomic_fetch_and_tl(cpu_T0, cpu_A0, cpu_tmp0,
s->mem_index, ot | MO_LE);
break;
default:
case 3: /* btc */
tcg_gen_atomic_fetch_xor_tl(cpu_T0, cpu_A0, cpu_tmp0,
s->mem_index, ot | MO_LE);
break;
}
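/* Each path leaves the pre-modification value in T0, so the tested bit
 * can be shifted out below for CF exactly as in the non-LOCK path. */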
tcg_gen_shr_tl(cpu_tmp4, cpu_T0, cpu_T1);
} else {
tcg_gen_shr_tl(cpu_tmp4, cpu_T0, cpu_T1);
switch (op) {
case 0: /* bt */
/* Data already loaded; nothing to do. */
break;
case 1: /* bts */
tcg_gen_or_tl(cpu_T0, cpu_T0, cpu_tmp0);
break;
case 2: /* btr */
tcg_gen_andc_tl(cpu_T0, cpu_T0, cpu_tmp0);
break;
default:
case 3: /* btc */
tcg_gen_xor_tl(cpu_T0, cpu_T0, cpu_tmp0);
break;
}
if (op != 0) {
if (mod != 3) {
gen_op_st_v(s, ot, cpu_T0, cpu_A0);
} else {
gen_op_mov_reg_v(ot, rm, cpu_T0);
}
}
}
@@ -8088,20 +8207,11 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
default:
goto unknown_op;
}
/* lock generation */
if (s->prefix & PREFIX_LOCK)
gen_helper_unlock();
return s->pc;
illegal_op:
if (s->prefix & PREFIX_LOCK)
gen_helper_unlock();
/* XXX: ensure that no lock was generated */
gen_illegal_opcode(s);
return s->pc;
unknown_op:
if (s->prefix & PREFIX_LOCK)
gen_helper_unlock();
/* XXX: ensure that no lock was generated */
gen_unknown_opcode(env, s);
return s->pc;
}
@@ -8193,8 +8303,6 @@ void tcg_x86_init(void)
offsetof(CPUX86State, bnd_regs[i].ub),
bnd_regu_names[i]);
}
helper_lock_init();
}
/* generate intermediate code for basic block 'tb'. */


@@ -58,15 +58,20 @@ static void m68k_cpu_reset(CPUState *s)
#endif
m68k_switch_sp(env);
/* ??? FP regs should be initialized to NaN. */
env->cc_op = CC_OP_FLAGS;
cpu_m68k_set_ccr(env, 0);
/* TODO: We should set PC from the interrupt vector. */
env->pc = 0;
tlb_flush(s, 1);
}
static void m68k_cpu_disas_set_info(CPUState *cpu, disassemble_info *info)
static void m68k_cpu_disas_set_info(CPUState *s, disassemble_info *info)
{
M68kCPU *cpu = M68K_CPU(s);
CPUM68KState *env = &cpu->env;
info->print_insn = print_insn_m68k;
if (m68k_feature(env, M68K_FEATURE_M68000)) {
info->mach = bfd_mach_m68040;
}
}
/* CPU models */
@@ -98,6 +103,57 @@ static void m5206_cpu_initfn(Object *obj)
m68k_set_feature(env, M68K_FEATURE_CF_ISA_A);
}
static void m68000_cpu_initfn(Object *obj)
{
M68kCPU *cpu = M68K_CPU(obj);
CPUM68KState *env = &cpu->env;
m68k_set_feature(env, M68K_FEATURE_M68000);
m68k_set_feature(env, M68K_FEATURE_USP);
m68k_set_feature(env, M68K_FEATURE_WORD_INDEX);
}
static void m68020_cpu_initfn(Object *obj)
{
M68kCPU *cpu = M68K_CPU(obj);
CPUM68KState *env = &cpu->env;
m68k_set_feature(env, M68K_FEATURE_M68000);
m68k_set_feature(env, M68K_FEATURE_USP);
m68k_set_feature(env, M68K_FEATURE_WORD_INDEX);
m68k_set_feature(env, M68K_FEATURE_QUAD_MULDIV);
m68k_set_feature(env, M68K_FEATURE_BRAL);
m68k_set_feature(env, M68K_FEATURE_BCCL);
m68k_set_feature(env, M68K_FEATURE_BITFIELD);
m68k_set_feature(env, M68K_FEATURE_EXT_FULL);
m68k_set_feature(env, M68K_FEATURE_SCALED_INDEX);
m68k_set_feature(env, M68K_FEATURE_LONG_MULDIV);
m68k_set_feature(env, M68K_FEATURE_FPU);
m68k_set_feature(env, M68K_FEATURE_CAS);
m68k_set_feature(env, M68K_FEATURE_BKPT);
}
#define m68030_cpu_initfn m68020_cpu_initfn
#define m68040_cpu_initfn m68020_cpu_initfn
static void m68060_cpu_initfn(Object *obj)
{
M68kCPU *cpu = M68K_CPU(obj);
CPUM68KState *env = &cpu->env;
m68k_set_feature(env, M68K_FEATURE_M68000);
m68k_set_feature(env, M68K_FEATURE_USP);
m68k_set_feature(env, M68K_FEATURE_WORD_INDEX);
m68k_set_feature(env, M68K_FEATURE_BRAL);
m68k_set_feature(env, M68K_FEATURE_BCCL);
m68k_set_feature(env, M68K_FEATURE_BITFIELD);
m68k_set_feature(env, M68K_FEATURE_EXT_FULL);
m68k_set_feature(env, M68K_FEATURE_SCALED_INDEX);
m68k_set_feature(env, M68K_FEATURE_LONG_MULDIV);
m68k_set_feature(env, M68K_FEATURE_FPU);
m68k_set_feature(env, M68K_FEATURE_CAS);
m68k_set_feature(env, M68K_FEATURE_BKPT);
}
static void m5208_cpu_initfn(Object *obj)
{
M68kCPU *cpu = M68K_CPU(obj);
@@ -148,6 +204,11 @@ typedef struct M68kCPUInfo {
} M68kCPUInfo;
static const M68kCPUInfo m68k_cpus[] = {
{ .name = "m68000", .instance_init = m68000_cpu_initfn },
{ .name = "m68020", .instance_init = m68020_cpu_initfn },
{ .name = "m68030", .instance_init = m68030_cpu_initfn },
{ .name = "m68040", .instance_init = m68040_cpu_initfn },
{ .name = "m68060", .instance_init = m68060_cpu_initfn },
{ .name = "m5206", .instance_init = m5206_cpu_initfn },
{ .name = "m5208", .instance_init = m5208_cpu_initfn },
{ .name = "cfv4e", .instance_init = cfv4e_cpu_initfn },
@@ -220,8 +281,6 @@ static void m68k_cpu_class_init(ObjectClass *c, void *data)
#else
cc->get_phys_page_debug = m68k_cpu_get_phys_page_debug;
#endif
cc->cpu_exec_enter = m68k_cpu_exec_enter;
cc->cpu_exec_exit = m68k_cpu_exec_exit;
cc->disas_set_info = m68k_cpu_disas_set_info;
cc->gdb_num_core_regs = 18;


@@ -30,6 +30,14 @@
#include "cpu-qom.h"
#include "fpu/softfloat.h"
#define OS_BYTE 0
#define OS_WORD 1
#define OS_LONG 2
#define OS_SINGLE 3
#define OS_DOUBLE 4
#define OS_EXTENDED 5
#define OS_PACKED 6
#define MAX_QREGS 32
#define EXCP_ACCESS 2 /* Access (MMU) error. */
@@ -53,6 +61,7 @@
#define EXCP_HALT_INSN 0x101
#define NB_MMU_MODES 2
#define TARGET_INSN_START_EXTRA_WORDS 1
typedef struct CPUM68KState {
uint32_t dregs[8];
@@ -66,9 +75,11 @@ typedef struct CPUM68KState {
/* Condition flags. */
uint32_t cc_op;
uint32_t cc_dest;
uint32_t cc_src;
uint32_t cc_x;
uint32_t cc_x; /* always 0/1 */
uint32_t cc_n; /* in bit 31 (i.e. negative) */
uint32_t cc_v; /* in bit 31, unused, or computed from cc_n and cc_v */
uint32_t cc_c; /* either 0/1, unused, or computed from cc_n and cc_v */
uint32_t cc_z; /* == 0 or unused */
float64 fregs[8];
float64 fp_result;
@@ -141,9 +152,6 @@ hwaddr m68k_cpu_get_phys_page_debug(CPUState *cpu, vaddr addr);
int m68k_cpu_gdb_read_register(CPUState *cpu, uint8_t *buf, int reg);
int m68k_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg);
void m68k_cpu_exec_enter(CPUState *cs);
void m68k_cpu_exec_exit(CPUState *cs);
void m68k_tcg_init(void);
void m68k_cpu_init_gdb(M68kCPU *cpu);
M68kCPU *cpu_m68k_init(const char *cpu_model);
@@ -152,7 +160,8 @@ M68kCPU *cpu_m68k_init(const char *cpu_model);
is returned if the signal was handled by the virtual CPU. */
int cpu_m68k_signal_handler(int host_signum, void *pinfo,
void *puc);
void cpu_m68k_flush_flags(CPUM68KState *, int);
uint32_t cpu_m68k_get_ccr(CPUM68KState *env);
void cpu_m68k_set_ccr(CPUM68KState *env, uint32_t);
/* Instead of computing the condition codes after each m68k instruction,
@@ -162,18 +171,25 @@ void cpu_m68k_flush_flags(CPUM68KState *, int);
* using this information. Condition codes are not generated if they
* are only needed for conditional branches.
*/
enum {
CC_OP_DYNAMIC, /* Use env->cc_op */
CC_OP_FLAGS, /* CC_DEST = CVZN, CC_SRC = unused */
CC_OP_LOGIC, /* CC_DEST = result, CC_SRC = unused */
CC_OP_ADD, /* CC_DEST = result, CC_SRC = source */
CC_OP_SUB, /* CC_DEST = result, CC_SRC = source */
CC_OP_CMPB, /* CC_DEST = result, CC_SRC = source */
CC_OP_CMPW, /* CC_DEST = result, CC_SRC = source */
CC_OP_ADDX, /* CC_DEST = result, CC_SRC = source */
CC_OP_SUBX, /* CC_DEST = result, CC_SRC = source */
CC_OP_SHIFT, /* CC_DEST = result, CC_SRC = carry */
};
typedef enum {
/* Translator only -- use env->cc_op. */
CC_OP_DYNAMIC = -1,
/* Each flag bit computed into cc_[xcnvz]. */
CC_OP_FLAGS,
/* X in cc_x, C = X, N in cc_n, Z in cc_n, V via cc_n/cc_v. */
CC_OP_ADD,
CC_OP_SUB,
/* X in cc_x, {N,Z,C,V} via cc_n/cc_v. */
CC_OP_CMP,
/* X in cc_x, C = 0, V = 0, N in cc_n, Z in cc_n. */
CC_OP_LOGIC,
CC_OP_NB
} CCOp;
#define CCF_C 0x01
#define CCF_V 0x02
@@ -215,6 +231,7 @@ void do_m68k_semihosting(CPUM68KState *env, int nr);
ISA revisions mentioned. */
enum m68k_features {
M68K_FEATURE_M68000,
M68K_FEATURE_CF_ISA_A,
M68K_FEATURE_CF_ISA_B, /* (ISA B or C). */
M68K_FEATURE_CF_ISA_APLUSC, /* BIT/BITREV, FF1, STRLDSR (ISA A+ or C). */
@@ -225,7 +242,15 @@ enum m68k_features {
M68K_FEATURE_CF_EMAC_B, /* Revision B EMAC (dual accumulate). */
M68K_FEATURE_USP, /* User Stack Pointer. (ISA A+, B or C). */
M68K_FEATURE_EXT_FULL, /* 68020+ full extension word. */
M68K_FEATURE_WORD_INDEX /* word sized address index registers. */
M68K_FEATURE_WORD_INDEX, /* word sized address index registers. */
M68K_FEATURE_SCALED_INDEX, /* scaled address index registers. */
M68K_FEATURE_LONG_MULDIV, /* 32 bit multiply/divide. */
M68K_FEATURE_QUAD_MULDIV, /* 64 bit multiply/divide. */
M68K_FEATURE_BCCL, /* Long conditional branches. */
M68K_FEATURE_BITFIELD, /* Bit field insns. */
M68K_FEATURE_FPU,
M68K_FEATURE_CAS,
M68K_FEATURE_BKPT,
};
static inline int m68k_feature(CPUM68KState *env, int feature)
@@ -238,8 +263,11 @@ void m68k_cpu_list(FILE *f, fprintf_function cpu_fprintf);
void register_m68k_insns (CPUM68KState *env);
#ifdef CONFIG_USER_ONLY
/* Linux uses 8k pages. */
#define TARGET_PAGE_BITS 13
/* Coldfire Linux uses 8k pages
 * and m68k Linux uses 4k pages;
 * use the smaller one.
 */
#define TARGET_PAGE_BITS 12
#else
/* Smallest TLB entry size is 1k. */
#define TARGET_PAGE_BITS 10

View File

@@ -132,87 +132,6 @@ void m68k_cpu_init_gdb(M68kCPU *cpu)
/* TODO: Add [E]MAC registers. */
}
void cpu_m68k_flush_flags(CPUM68KState *env, int cc_op)
{
M68kCPU *cpu = m68k_env_get_cpu(env);
int flags;
uint32_t src;
uint32_t dest;
uint32_t tmp;
#define HIGHBIT 0x80000000u
#define SET_NZ(x) do { \
if ((x) == 0) \
flags |= CCF_Z; \
else if ((int32_t)(x) < 0) \
flags |= CCF_N; \
} while (0)
#define SET_FLAGS_SUB(type, utype) do { \
SET_NZ((type)dest); \
tmp = dest + src; \
if ((utype) tmp < (utype) src) \
flags |= CCF_C; \
if ((1u << (sizeof(type) * 8 - 1)) & (tmp ^ dest) & (tmp ^ src)) \
flags |= CCF_V; \
} while (0)
flags = 0;
src = env->cc_src;
dest = env->cc_dest;
switch (cc_op) {
case CC_OP_FLAGS:
flags = dest;
break;
case CC_OP_LOGIC:
SET_NZ(dest);
break;
case CC_OP_ADD:
SET_NZ(dest);
if (dest < src)
flags |= CCF_C;
tmp = dest - src;
if (HIGHBIT & (src ^ dest) & ~(tmp ^ src))
flags |= CCF_V;
break;
case CC_OP_SUB:
SET_FLAGS_SUB(int32_t, uint32_t);
break;
case CC_OP_CMPB:
SET_FLAGS_SUB(int8_t, uint8_t);
break;
case CC_OP_CMPW:
SET_FLAGS_SUB(int16_t, uint16_t);
break;
case CC_OP_ADDX:
SET_NZ(dest);
if (dest <= src)
flags |= CCF_C;
tmp = dest - src - 1;
if (HIGHBIT & (src ^ dest) & ~(tmp ^ src))
flags |= CCF_V;
break;
case CC_OP_SUBX:
SET_NZ(dest);
tmp = dest + src + 1;
if (tmp <= src)
flags |= CCF_C;
if (HIGHBIT & (tmp ^ dest) & (tmp ^ src))
flags |= CCF_V;
break;
case CC_OP_SHIFT:
SET_NZ(dest);
if (src)
flags |= CCF_C;
break;
default:
cpu_abort(CPU(cpu), "Bad CC_OP %d", cc_op);
}
env->cc_op = CC_OP_FLAGS;
env->cc_dest = flags;
}
void HELPER(movec)(CPUM68KState *env, uint32_t reg, uint32_t val)
{
M68kCPU *cpu = m68k_env_get_cpu(env);
@@ -349,140 +268,111 @@ uint32_t HELPER(ff1)(uint32_t x)
return n;
}
uint32_t HELPER(sats)(uint32_t val, uint32_t ccr)
uint32_t HELPER(sats)(uint32_t val, uint32_t v)
{
/* The result has the opposite sign to the original value. */
if (ccr & CCF_V)
if ((int32_t)v < 0) {
val = (((int32_t)val) >> 31) ^ SIGNBIT;
}
return val;
}
uint32_t HELPER(subx_cc)(CPUM68KState *env, uint32_t op1, uint32_t op2)
{
uint32_t res;
uint32_t old_flags;
uint32_t res, new_x;
old_flags = env->cc_dest;
if (env->cc_x) {
env->cc_x = (op1 <= op2);
env->cc_op = CC_OP_SUBX;
new_x = (op1 <= op2);
res = op1 - (op2 + 1);
} else {
env->cc_x = (op1 < op2);
env->cc_op = CC_OP_SUB;
new_x = (op1 < op2);
res = op1 - op2;
}
env->cc_dest = res;
env->cc_src = op2;
cpu_m68k_flush_flags(env, env->cc_op);
/* !Z is sticky. */
env->cc_dest &= (old_flags | ~CCF_Z);
env->cc_x = new_x;
env->cc_c = new_x;
env->cc_n = res;
env->cc_z |= res; /* !Z is sticky */
env->cc_v = (res ^ op1) & (op1 ^ op2);
return res;
}
uint32_t HELPER(addx_cc)(CPUM68KState *env, uint32_t op1, uint32_t op2)
{
uint32_t res;
uint32_t old_flags;
uint32_t res, new_x;
old_flags = env->cc_dest;
if (env->cc_x) {
res = op1 + op2 + 1;
env->cc_x = (res <= op2);
env->cc_op = CC_OP_ADDX;
new_x = (res <= op2);
} else {
res = op1 + op2;
env->cc_x = (res < op2);
env->cc_op = CC_OP_ADD;
new_x = (res < op2);
}
env->cc_dest = res;
env->cc_src = op2;
cpu_m68k_flush_flags(env, env->cc_op);
/* !Z is sticky. */
env->cc_dest &= (old_flags | ~CCF_Z);
return res;
}
env->cc_x = new_x;
env->cc_c = new_x;
env->cc_n = res;
env->cc_z |= res; /* !Z is sticky. */
env->cc_v = (res ^ op1) & ~(op1 ^ op2);
uint32_t HELPER(xflag_lt)(uint32_t a, uint32_t b)
{
return a < b;
return res;
}
void HELPER(set_sr)(CPUM68KState *env, uint32_t val)
{
env->sr = val & 0xffff;
env->sr = val & 0xffe0;
cpu_m68k_set_ccr(env, val);
m68k_switch_sp(env);
}
uint32_t HELPER(shl_cc)(CPUM68KState *env, uint32_t val, uint32_t shift)
{
uint32_t result;
uint32_t cf;
uint64_t result;
shift &= 63;
if (shift == 0) {
result = val;
cf = env->cc_src & CCF_C;
} else if (shift < 32) {
result = val << shift;
cf = (val >> (32 - shift)) & 1;
} else if (shift == 32) {
result = 0;
cf = val & 1;
} else /* shift > 32 */ {
result = 0;
cf = 0;
}
env->cc_src = cf;
env->cc_x = (cf != 0);
env->cc_dest = result;
result = (uint64_t)val << shift;
env->cc_c = (result >> 32) & 1;
env->cc_n = result;
env->cc_z = result;
env->cc_v = 0;
env->cc_x = shift ? env->cc_c : env->cc_x;
return result;
}
uint32_t HELPER(shr_cc)(CPUM68KState *env, uint32_t val, uint32_t shift)
{
uint64_t temp;
uint32_t result;
uint32_t cf;
shift &= 63;
if (shift == 0) {
result = val;
cf = env->cc_src & CCF_C;
} else if (shift < 32) {
result = val >> shift;
cf = (val >> (shift - 1)) & 1;
} else if (shift == 32) {
result = 0;
cf = val >> 31;
} else /* shift > 32 */ {
result = 0;
cf = 0;
}
env->cc_src = cf;
env->cc_x = (cf != 0);
env->cc_dest = result;
temp = (uint64_t)val << 32 >> shift;
result = temp >> 32;
env->cc_c = (temp >> 31) & 1;
env->cc_n = result;
env->cc_z = result;
env->cc_v = 0;
env->cc_x = shift ? env->cc_c : env->cc_x;
return result;
}
uint32_t HELPER(sar_cc)(CPUM68KState *env, uint32_t val, uint32_t shift)
{
uint64_t temp;
uint32_t result;
uint32_t cf;
shift &= 63;
if (shift == 0) {
result = val;
cf = (env->cc_src & CCF_C) != 0;
} else if (shift < 32) {
result = (int32_t)val >> shift;
cf = (val >> (shift - 1)) & 1;
} else /* shift >= 32 */ {
result = (int32_t)val >> 31;
cf = val >> 31;
}
env->cc_src = cf;
env->cc_x = cf;
env->cc_dest = result;
temp = (int64_t)val << 32 >> shift;
result = temp >> 32;
env->cc_c = (temp >> 31) & 1;
env->cc_n = result;
env->cc_z = result;
env->cc_v = result ^ val;
env->cc_x = shift ? env->cc_c : env->cc_x;
return result;
}
@@ -734,9 +624,92 @@ void HELPER(mac_set_flags)(CPUM68KState *env, uint32_t acc)
}
}
#define COMPUTE_CCR(op, x, n, z, v, c) { \
switch (op) { \
case CC_OP_FLAGS: \
/* Everything in place. */ \
break; \
case CC_OP_ADD: \
res = n; \
src2 = v; \
src1 = res - src2; \
c = x; \
z = n; \
v = (res ^ src1) & ~(src1 ^ src2); \
break; \
case CC_OP_SUB: \
res = n; \
src2 = v; \
src1 = res + src2; \
c = x; \
z = n; \
v = (res ^ src1) & (src1 ^ src2); \
break; \
case CC_OP_CMP: \
src1 = n; \
src2 = v; \
res = src1 - src2; \
n = res; \
z = res; \
c = src1 < src2; \
v = (res ^ src1) & (src1 ^ src2); \
break; \
case CC_OP_LOGIC: \
c = v = 0; \
z = n; \
break; \
default: \
cpu_abort(CPU(m68k_env_get_cpu(env)), "Bad CC_OP %d", op); \
} \
} while (0)
uint32_t cpu_m68k_get_ccr(CPUM68KState *env)
{
uint32_t x, c, n, z, v;
uint32_t res, src1, src2;
x = env->cc_x;
c = env->cc_c;
n = env->cc_n;
z = env->cc_z;
v = env->cc_v;
COMPUTE_CCR(env->cc_op, x, n, z, v, c);
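/* cc_n and cc_v carry their flag in bit 31 and cc_z holds a raw result
 * (zero means Z set); reduce each to a 0/1 value before assembling the CCR. */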
n = n >> 31;
v = v >> 31;
z = (z == 0);
return x * CCF_X + n * CCF_N + z * CCF_Z + v * CCF_V + c * CCF_C;
}
uint32_t HELPER(get_ccr)(CPUM68KState *env)
{
return cpu_m68k_get_ccr(env);
}
void cpu_m68k_set_ccr(CPUM68KState *env, uint32_t ccr)
{
env->cc_x = (ccr & CCF_X ? 1 : 0);
env->cc_n = (ccr & CCF_N ? -1 : 0);
env->cc_z = (ccr & CCF_Z ? 0 : 1);
env->cc_v = (ccr & CCF_V ? -1 : 0);
env->cc_c = (ccr & CCF_C ? 1 : 0);
env->cc_op = CC_OP_FLAGS;
}
void HELPER(set_ccr)(CPUM68KState *env, uint32_t ccr)
{
cpu_m68k_set_ccr(env, ccr);
}
void HELPER(flush_flags)(CPUM68KState *env, uint32_t cc_op)
{
cpu_m68k_flush_flags(env, cc_op);
uint32_t res, src1, src2;
COMPUTE_CCR(cc_op, env->cc_x, env->cc_n, env->cc_z, env->cc_v, env->cc_c);
env->cc_op = CC_OP_FLAGS;
}
uint32_t HELPER(get_macf)(CPUM68KState *env, uint64_t val)
@@ -866,23 +839,3 @@ void HELPER(set_mac_extu)(CPUM68KState *env, uint32_t val, uint32_t acc)
res |= (uint64_t)(val & 0xffff0000) << 16;
env->macc[acc + 1] = res;
}
void m68k_cpu_exec_enter(CPUState *cs)
{
M68kCPU *cpu = M68K_CPU(cs);
CPUM68KState *env = &cpu->env;
env->cc_op = CC_OP_FLAGS;
env->cc_dest = env->sr & 0xf;
env->cc_x = (env->sr >> 4) & 1;
}
void m68k_cpu_exec_exit(CPUState *cs)
{
M68kCPU *cpu = M68K_CPU(cs);
CPUM68KState *env = &cpu->env;
cpu_m68k_flush_flags(env, env->cc_op);
env->cc_op = CC_OP_FLAGS;
env->sr = (env->sr & 0xffe0) | env->cc_dest | (env->cc_x << 4);
}


@@ -1,6 +1,6 @@
DEF_HELPER_1(bitrev, i32, i32)
DEF_HELPER_1(ff1, i32, i32)
DEF_HELPER_2(sats, i32, i32, i32)
DEF_HELPER_FLAGS_2(sats, TCG_CALL_NO_RWG_SE, i32, i32, i32)
DEF_HELPER_2(divu, void, env, i32)
DEF_HELPER_2(divs, void, env, i32)
DEF_HELPER_3(addx_cc, i32, env, i32, i32)
@@ -8,7 +8,6 @@ DEF_HELPER_3(subx_cc, i32, env, i32, i32)
DEF_HELPER_3(shl_cc, i32, env, i32, i32)
DEF_HELPER_3(shr_cc, i32, env, i32, i32)
DEF_HELPER_3(sar_cc, i32, env, i32, i32)
DEF_HELPER_2(xflag_lt, i32, i32, i32)
DEF_HELPER_2(set_sr, void, env, i32)
DEF_HELPER_3(movec, void, env, i32, i32)
@@ -47,4 +46,6 @@ DEF_HELPER_3(set_mac_exts, void, env, i32, i32)
DEF_HELPER_3(set_mac_extu, void, env, i32, i32)
DEF_HELPER_2(flush_flags, void, env, i32)
DEF_HELPER_2(set_ccr, void, env, i32)
DEF_HELPER_FLAGS_1(get_ccr, TCG_CALL_NO_WG_SE, i32, env)
DEF_HELPER_2(raise_exception, void, env, i32)


@@ -63,9 +63,9 @@ static void do_rte(CPUM68KState *env)
fmt = cpu_ldl_kernel(env, sp);
env->pc = cpu_ldl_kernel(env, sp + 4);
sp |= (fmt >> 28) & 3;
env->sr = fmt & 0xffff;
env->aregs[7] = sp + 8;
m68k_switch_sp(env);
helper_set_sr(env, fmt);
}
static void do_interrupt_all(CPUM68KState *env, int is_hw)
@@ -112,6 +112,7 @@ static void do_interrupt_all(CPUM68KState *env, int is_hw)
fmt |= 0x40000000;
fmt |= vector << 16;
fmt |= env->sr;
fmt |= cpu_m68k_get_ccr(env);
env->sr |= SR_S;
if (is_hw) {
@@ -184,7 +185,6 @@ void HELPER(divu)(CPUM68KState *env, uint32_t word)
uint32_t den;
uint32_t quot;
uint32_t rem;
uint32_t flags;
num = env->div1;
den = env->div2;
@@ -194,16 +194,14 @@ void HELPER(divu)(CPUM68KState *env, uint32_t word)
}
quot = num / den;
rem = num % den;
flags = 0;
if (word && quot > 0xffff)
flags |= CCF_V;
if (quot == 0)
flags |= CCF_Z;
else if ((int32_t)quot < 0)
flags |= CCF_N;
env->cc_v = (word && quot > 0xffff ? -1 : 0);
env->cc_z = quot;
env->cc_n = quot;
env->cc_c = 0;
env->div1 = quot;
env->div2 = rem;
env->cc_dest = flags;
}
void HELPER(divs)(CPUM68KState *env, uint32_t word)
@@ -212,7 +210,6 @@ void HELPER(divs)(CPUM68KState *env, uint32_t word)
int32_t den;
int32_t quot;
int32_t rem;
int32_t flags;
num = env->div1;
den = env->div2;
@@ -221,14 +218,12 @@ void HELPER(divs)(CPUM68KState *env, uint32_t word)
}
quot = num / den;
rem = num % den;
flags = 0;
if (word && quot != (int16_t)quot)
flags |= CCF_V;
if (quot == 0)
flags |= CCF_Z;
else if (quot < 0)
flags |= CCF_N;
env->cc_v = (word && quot != (int16_t)quot ? -1 : 0);
env->cc_z = quot;
env->cc_n = quot;
env->cc_c = 0;
env->div1 = quot;
env->div2 = rem;
env->cc_dest = flags;
}


@@ -2,9 +2,11 @@ DEFF64(FP_RESULT, fp_result)
DEFO32(PC, pc)
DEFO32(SR, sr)
DEFO32(CC_OP, cc_op)
DEFO32(CC_DEST, cc_dest)
DEFO32(CC_SRC, cc_src)
DEFO32(CC_X, cc_x)
DEFO32(CC_C, cc_c)
DEFO32(CC_N, cc_n)
DEFO32(CC_V, cc_v)
DEFO32(CC_Z, cc_z)
DEFO32(DIV1, div1)
DEFO32(DIV2, div2)
DEFO32(MACSR, macsr)

File diff suppressed because it is too large

@@ -23,17 +23,10 @@
*/
#include "qemu/osdep.h"
#include "qemu/host-utils.h"
/* This file is compiled once, and thus we can't include the standard
"exec/helper-proto.h", which has includes that are target specific. */
#include "exec/helper-head.h"
#define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2) \
dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2));
#include "tcg-runtime.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "exec/cpu_ldst.h"
#include "exec/exec-all.h"
/* 32-bit helpers */
@@ -107,3 +100,62 @@ int64_t HELPER(mulsh_i64)(int64_t arg1, int64_t arg2)
muls64(&l, &h, arg1, arg2);
return h;
}
void HELPER(exit_atomic)(CPUArchState *env)
{
cpu_loop_exit_atomic(ENV_GET_CPU(env), GETPC());
}
#ifndef CONFIG_SOFTMMU
/* The softmmu versions of these helpers are in cputlb.c. */
/* Do not allow unaligned operations to proceed. Return the host address. */
static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
int size, uintptr_t retaddr)
{
/* Enforce qemu required alignment. */
if (unlikely(addr & (size - 1))) {
cpu_loop_exit_atomic(ENV_GET_CPU(env), retaddr);
}
return g2h(addr);
}
/* Macro to call the above, with local variables from the use context. */
#define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, DATA_SIZE, GETPC())
#define ATOMIC_NAME(X) HELPER(glue(glue(atomic_ ## X, SUFFIX), END))
#define EXTRA_ARGS
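/* atomic_template.h expands to the full set of helpers for one operand
 * size, so it is re-included once per DATA_SIZE below. */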
#define DATA_SIZE 1
#include "atomic_template.h"
#define DATA_SIZE 2
#include "atomic_template.h"
#define DATA_SIZE 4
#include "atomic_template.h"
#ifdef CONFIG_ATOMIC64
#define DATA_SIZE 8
#include "atomic_template.h"
#endif
/* The following is only callable from other helpers, and matches up
with the softmmu version. */
#ifdef CONFIG_ATOMIC128
#undef EXTRA_ARGS
#undef ATOMIC_NAME
#undef ATOMIC_MMU_LOOKUP
#define EXTRA_ARGS , TCGMemOpIdx oi, uintptr_t retaddr
#define ATOMIC_NAME(X) \
HELPER(glue(glue(glue(atomic_ ## X, SUFFIX), END), _mmu))
#define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, DATA_SIZE, retaddr)
#define DATA_SIZE 16
#include "atomic_template.h"
#endif /* CONFIG_ATOMIC128 */
#endif /* !CONFIG_SOFTMMU */


@@ -150,17 +150,7 @@ void tcg_gen_op6(TCGContext *ctx, TCGOpcode opc, TCGArg a1, TCGArg a2,
void tcg_gen_mb(TCGBar mb_type)
{
bool emit_barriers = true;
#ifndef CONFIG_USER_ONLY
/* TODO: When MTTCG is available for system mode, we will check
* the following condition and enable emit_barriers
* (qemu_tcg_mttcg_enabled() && smp_cpus > 1)
*/
emit_barriers = false;
#endif
if (emit_barriers) {
if (parallel_cpus) {
tcg_gen_op1(&tcg_ctx, INDEX_op_mb, mb_type);
}
}
@@ -1975,3 +1965,345 @@ void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, TCGMemOp memop)
addr, trace_mem_get_info(memop, 1));
gen_ldst_i64(INDEX_op_qemu_st_i64, val, addr, memop, idx);
}
static void tcg_gen_ext_i32(TCGv_i32 ret, TCGv_i32 val, TCGMemOp opc)
{
switch (opc & MO_SSIZE) {
case MO_SB:
tcg_gen_ext8s_i32(ret, val);
break;
case MO_UB:
tcg_gen_ext8u_i32(ret, val);
break;
case MO_SW:
tcg_gen_ext16s_i32(ret, val);
break;
case MO_UW:
tcg_gen_ext16u_i32(ret, val);
break;
default:
tcg_gen_mov_i32(ret, val);
break;
}
}
static void tcg_gen_ext_i64(TCGv_i64 ret, TCGv_i64 val, TCGMemOp opc)
{
switch (opc & MO_SSIZE) {
case MO_SB:
tcg_gen_ext8s_i64(ret, val);
break;
case MO_UB:
tcg_gen_ext8u_i64(ret, val);
break;
case MO_SW:
tcg_gen_ext16s_i64(ret, val);
break;
case MO_UW:
tcg_gen_ext16u_i64(ret, val);
break;
case MO_SL:
tcg_gen_ext32s_i64(ret, val);
break;
case MO_UL:
tcg_gen_ext32u_i64(ret, val);
break;
default:
tcg_gen_mov_i64(ret, val);
break;
}
}
#ifdef CONFIG_SOFTMMU
typedef void (*gen_atomic_cx_i32)(TCGv_i32, TCGv_env, TCGv,
TCGv_i32, TCGv_i32, TCGv_i32);
typedef void (*gen_atomic_cx_i64)(TCGv_i64, TCGv_env, TCGv,
TCGv_i64, TCGv_i64, TCGv_i32);
typedef void (*gen_atomic_op_i32)(TCGv_i32, TCGv_env, TCGv,
TCGv_i32, TCGv_i32);
typedef void (*gen_atomic_op_i64)(TCGv_i64, TCGv_env, TCGv,
TCGv_i64, TCGv_i32);
#else
typedef void (*gen_atomic_cx_i32)(TCGv_i32, TCGv_env, TCGv, TCGv_i32, TCGv_i32);
typedef void (*gen_atomic_cx_i64)(TCGv_i64, TCGv_env, TCGv, TCGv_i64, TCGv_i64);
typedef void (*gen_atomic_op_i32)(TCGv_i32, TCGv_env, TCGv, TCGv_i32);
typedef void (*gen_atomic_op_i64)(TCGv_i64, TCGv_env, TCGv, TCGv_i64);
#endif
#ifdef CONFIG_ATOMIC64
# define WITH_ATOMIC64(X) X,
#else
# define WITH_ATOMIC64(X)
#endif
static void * const table_cmpxchg[16] = {
[MO_8] = gen_helper_atomic_cmpxchgb,
[MO_16 | MO_LE] = gen_helper_atomic_cmpxchgw_le,
[MO_16 | MO_BE] = gen_helper_atomic_cmpxchgw_be,
[MO_32 | MO_LE] = gen_helper_atomic_cmpxchgl_le,
[MO_32 | MO_BE] = gen_helper_atomic_cmpxchgl_be,
WITH_ATOMIC64([MO_64 | MO_LE] = gen_helper_atomic_cmpxchgq_le)
WITH_ATOMIC64([MO_64 | MO_BE] = gen_helper_atomic_cmpxchgq_be)
};
void tcg_gen_atomic_cmpxchg_i32(TCGv_i32 retv, TCGv addr, TCGv_i32 cmpv,
TCGv_i32 newv, TCGArg idx, TCGMemOp memop)
{
memop = tcg_canonicalize_memop(memop, 0, 0);
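/* When vCPUs are not running in parallel, no other guest access can
 * interleave with this sequence, so a plain load/compare/store is enough. */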
if (!parallel_cpus) {
TCGv_i32 t1 = tcg_temp_new_i32();
TCGv_i32 t2 = tcg_temp_new_i32();
tcg_gen_ext_i32(t2, cmpv, memop & MO_SIZE);
tcg_gen_qemu_ld_i32(t1, addr, idx, memop & ~MO_SIGN);
tcg_gen_movcond_i32(TCG_COND_EQ, t2, t1, t2, newv, t1);
tcg_gen_qemu_st_i32(t2, addr, idx, memop);
tcg_temp_free_i32(t2);
if (memop & MO_SIGN) {
tcg_gen_ext_i32(retv, t1, memop);
} else {
tcg_gen_mov_i32(retv, t1);
}
tcg_temp_free_i32(t1);
} else {
gen_atomic_cx_i32 gen;
gen = table_cmpxchg[memop & (MO_SIZE | MO_BSWAP)];
tcg_debug_assert(gen != NULL);
#ifdef CONFIG_SOFTMMU
{
TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop & ~MO_SIGN, idx));
gen(retv, tcg_ctx.tcg_env, addr, cmpv, newv, oi);
tcg_temp_free_i32(oi);
}
#else
gen(retv, tcg_ctx.tcg_env, addr, cmpv, newv);
#endif
if (memop & MO_SIGN) {
tcg_gen_ext_i32(retv, retv, memop);
}
}
}
void tcg_gen_atomic_cmpxchg_i64(TCGv_i64 retv, TCGv addr, TCGv_i64 cmpv,
TCGv_i64 newv, TCGArg idx, TCGMemOp memop)
{
memop = tcg_canonicalize_memop(memop, 1, 0);
if (!parallel_cpus) {
TCGv_i64 t1 = tcg_temp_new_i64();
TCGv_i64 t2 = tcg_temp_new_i64();
tcg_gen_ext_i64(t2, cmpv, memop & MO_SIZE);
tcg_gen_qemu_ld_i64(t1, addr, idx, memop & ~MO_SIGN);
tcg_gen_movcond_i64(TCG_COND_EQ, t2, t1, t2, newv, t1);
tcg_gen_qemu_st_i64(t2, addr, idx, memop);
tcg_temp_free_i64(t2);
if (memop & MO_SIGN) {
tcg_gen_ext_i64(retv, t1, memop);
} else {
tcg_gen_mov_i64(retv, t1);
}
tcg_temp_free_i64(t1);
} else if ((memop & MO_SIZE) == MO_64) {
#ifdef CONFIG_ATOMIC64
gen_atomic_cx_i64 gen;
gen = table_cmpxchg[memop & (MO_SIZE | MO_BSWAP)];
tcg_debug_assert(gen != NULL);
#ifdef CONFIG_SOFTMMU
{
TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop, idx));
gen(retv, tcg_ctx.tcg_env, addr, cmpv, newv, oi);
tcg_temp_free_i32(oi);
}
#else
gen(retv, tcg_ctx.tcg_env, addr, cmpv, newv);
#endif
#else
gen_helper_exit_atomic(tcg_ctx.tcg_env);
#endif /* CONFIG_ATOMIC64 */
} else {
TCGv_i32 c32 = tcg_temp_new_i32();
TCGv_i32 n32 = tcg_temp_new_i32();
TCGv_i32 r32 = tcg_temp_new_i32();
tcg_gen_extrl_i64_i32(c32, cmpv);
tcg_gen_extrl_i64_i32(n32, newv);
tcg_gen_atomic_cmpxchg_i32(r32, addr, c32, n32, idx, memop & ~MO_SIGN);
tcg_temp_free_i32(c32);
tcg_temp_free_i32(n32);
tcg_gen_extu_i32_i64(retv, r32);
tcg_temp_free_i32(r32);
if (memop & MO_SIGN) {
tcg_gen_ext_i64(retv, retv, memop);
}
}
}
static void do_nonatomic_op_i32(TCGv_i32 ret, TCGv addr, TCGv_i32 val,
TCGArg idx, TCGMemOp memop, bool new_val,
void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32))
{
TCGv_i32 t1 = tcg_temp_new_i32();
TCGv_i32 t2 = tcg_temp_new_i32();
memop = tcg_canonicalize_memop(memop, 0, 0);
tcg_gen_qemu_ld_i32(t1, addr, idx, memop & ~MO_SIGN);
gen(t2, t1, val);
tcg_gen_qemu_st_i32(t2, addr, idx, memop);
tcg_gen_ext_i32(ret, (new_val ? t2 : t1), memop);
tcg_temp_free_i32(t1);
tcg_temp_free_i32(t2);
}
static void do_atomic_op_i32(TCGv_i32 ret, TCGv addr, TCGv_i32 val,
TCGArg idx, TCGMemOp memop, void * const table[])
{
gen_atomic_op_i32 gen;
memop = tcg_canonicalize_memop(memop, 0, 0);
gen = table[memop & (MO_SIZE | MO_BSWAP)];
tcg_debug_assert(gen != NULL);
#ifdef CONFIG_SOFTMMU
{
TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop & ~MO_SIGN, idx));
gen(ret, tcg_ctx.tcg_env, addr, val, oi);
tcg_temp_free_i32(oi);
}
#else
gen(ret, tcg_ctx.tcg_env, addr, val);
#endif
if (memop & MO_SIGN) {
tcg_gen_ext_i32(ret, ret, memop);
}
}
static void do_nonatomic_op_i64(TCGv_i64 ret, TCGv addr, TCGv_i64 val,
TCGArg idx, TCGMemOp memop, bool new_val,
void (*gen)(TCGv_i64, TCGv_i64, TCGv_i64))
{
TCGv_i64 t1 = tcg_temp_new_i64();
TCGv_i64 t2 = tcg_temp_new_i64();
memop = tcg_canonicalize_memop(memop, 1, 0);
tcg_gen_qemu_ld_i64(t1, addr, idx, memop & ~MO_SIGN);
gen(t2, t1, val);
tcg_gen_qemu_st_i64(t2, addr, idx, memop);
tcg_gen_ext_i64(ret, (new_val ? t2 : t1), memop);
tcg_temp_free_i64(t1);
tcg_temp_free_i64(t2);
}
static void do_atomic_op_i64(TCGv_i64 ret, TCGv addr, TCGv_i64 val,
TCGArg idx, TCGMemOp memop, void * const table[])
{
memop = tcg_canonicalize_memop(memop, 1, 0);
if ((memop & MO_SIZE) == MO_64) {
#ifdef CONFIG_ATOMIC64
gen_atomic_op_i64 gen;
gen = table[memop & (MO_SIZE | MO_BSWAP)];
tcg_debug_assert(gen != NULL);
#ifdef CONFIG_SOFTMMU
{
TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop & ~MO_SIGN, idx));
gen(ret, tcg_ctx.tcg_env, addr, val, oi);
tcg_temp_free_i32(oi);
}
#else
gen(ret, tcg_ctx.tcg_env, addr, val);
#endif
#else
gen_helper_exit_atomic(tcg_ctx.tcg_env);
#endif /* CONFIG_ATOMIC64 */
} else {
TCGv_i32 v32 = tcg_temp_new_i32();
TCGv_i32 r32 = tcg_temp_new_i32();
tcg_gen_extrl_i64_i32(v32, val);
do_atomic_op_i32(r32, addr, v32, idx, memop & ~MO_SIGN, table);
tcg_temp_free_i32(v32);
tcg_gen_extu_i32_i64(ret, r32);
tcg_temp_free_i32(r32);
if (memop & MO_SIGN) {
tcg_gen_ext_i64(ret, ret, memop);
}
}
}
#define GEN_ATOMIC_HELPER(NAME, OP, NEW) \
static void * const table_##NAME[16] = { \
[MO_8] = gen_helper_atomic_##NAME##b, \
[MO_16 | MO_LE] = gen_helper_atomic_##NAME##w_le, \
[MO_16 | MO_BE] = gen_helper_atomic_##NAME##w_be, \
[MO_32 | MO_LE] = gen_helper_atomic_##NAME##l_le, \
[MO_32 | MO_BE] = gen_helper_atomic_##NAME##l_be, \
WITH_ATOMIC64([MO_64 | MO_LE] = gen_helper_atomic_##NAME##q_le) \
WITH_ATOMIC64([MO_64 | MO_BE] = gen_helper_atomic_##NAME##q_be) \
}; \
void tcg_gen_atomic_##NAME##_i32 \
(TCGv_i32 ret, TCGv addr, TCGv_i32 val, TCGArg idx, TCGMemOp memop) \
{ \
if (parallel_cpus) { \
do_atomic_op_i32(ret, addr, val, idx, memop, table_##NAME); \
} else { \
do_nonatomic_op_i32(ret, addr, val, idx, memop, NEW, \
tcg_gen_##OP##_i32); \
} \
} \
void tcg_gen_atomic_##NAME##_i64 \
(TCGv_i64 ret, TCGv addr, TCGv_i64 val, TCGArg idx, TCGMemOp memop) \
{ \
if (parallel_cpus) { \
do_atomic_op_i64(ret, addr, val, idx, memop, table_##NAME); \
} else { \
do_nonatomic_op_i64(ret, addr, val, idx, memop, NEW, \
tcg_gen_##OP##_i64); \
} \
}
GEN_ATOMIC_HELPER(fetch_add, add, 0)
GEN_ATOMIC_HELPER(fetch_and, and, 0)
GEN_ATOMIC_HELPER(fetch_or, or, 0)
GEN_ATOMIC_HELPER(fetch_xor, xor, 0)
GEN_ATOMIC_HELPER(add_fetch, add, 1)
GEN_ATOMIC_HELPER(and_fetch, and, 1)
GEN_ATOMIC_HELPER(or_fetch, or, 1)
GEN_ATOMIC_HELPER(xor_fetch, xor, 1)
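/* xchg reuses the same machinery: its non-atomic "op" is a move that
 * ignores the old value, so the generated ops store the new value and
 * return the old one. */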
static void tcg_gen_mov2_i32(TCGv_i32 r, TCGv_i32 a, TCGv_i32 b)
{
tcg_gen_mov_i32(r, b);
}
static void tcg_gen_mov2_i64(TCGv_i64 r, TCGv_i64 a, TCGv_i64 b)
{
tcg_gen_mov_i64(r, b);
}
GEN_ATOMIC_HELPER(xchg, mov2, 0)
#undef GEN_ATOMIC_HELPER


@@ -854,6 +854,30 @@ static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index)
tcg_gen_qemu_st_i64(arg, addr, mem_index, MO_TEQ);
}
void tcg_gen_atomic_cmpxchg_i32(TCGv_i32, TCGv, TCGv_i32, TCGv_i32,
TCGArg, TCGMemOp);
void tcg_gen_atomic_cmpxchg_i64(TCGv_i64, TCGv, TCGv_i64, TCGv_i64,
TCGArg, TCGMemOp);
void tcg_gen_atomic_xchg_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
void tcg_gen_atomic_xchg_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
void tcg_gen_atomic_fetch_add_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
void tcg_gen_atomic_fetch_add_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
void tcg_gen_atomic_fetch_and_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
void tcg_gen_atomic_fetch_and_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
void tcg_gen_atomic_fetch_or_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
void tcg_gen_atomic_fetch_or_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
void tcg_gen_atomic_fetch_xor_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
void tcg_gen_atomic_fetch_xor_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
void tcg_gen_atomic_add_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
void tcg_gen_atomic_add_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
void tcg_gen_atomic_and_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
void tcg_gen_atomic_and_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
void tcg_gen_atomic_or_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
void tcg_gen_atomic_or_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
void tcg_gen_atomic_xor_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
#if TARGET_LONG_BITS == 64
#define tcg_gen_movi_tl tcg_gen_movi_i64
#define tcg_gen_mov_tl tcg_gen_mov_i64
@@ -932,6 +956,16 @@ static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index)
#define tcg_gen_sub2_tl tcg_gen_sub2_i64
#define tcg_gen_mulu2_tl tcg_gen_mulu2_i64
#define tcg_gen_muls2_tl tcg_gen_muls2_i64
#define tcg_gen_atomic_cmpxchg_tl tcg_gen_atomic_cmpxchg_i64
#define tcg_gen_atomic_xchg_tl tcg_gen_atomic_xchg_i64
#define tcg_gen_atomic_fetch_add_tl tcg_gen_atomic_fetch_add_i64
#define tcg_gen_atomic_fetch_and_tl tcg_gen_atomic_fetch_and_i64
#define tcg_gen_atomic_fetch_or_tl tcg_gen_atomic_fetch_or_i64
#define tcg_gen_atomic_fetch_xor_tl tcg_gen_atomic_fetch_xor_i64
#define tcg_gen_atomic_add_fetch_tl tcg_gen_atomic_add_fetch_i64
#define tcg_gen_atomic_and_fetch_tl tcg_gen_atomic_and_fetch_i64
#define tcg_gen_atomic_or_fetch_tl tcg_gen_atomic_or_fetch_i64
#define tcg_gen_atomic_xor_fetch_tl tcg_gen_atomic_xor_fetch_i64
#else
#define tcg_gen_movi_tl tcg_gen_movi_i32
#define tcg_gen_mov_tl tcg_gen_mov_i32
@@ -1009,6 +1043,16 @@ static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index)
#define tcg_gen_sub2_tl tcg_gen_sub2_i32
#define tcg_gen_mulu2_tl tcg_gen_mulu2_i32
#define tcg_gen_muls2_tl tcg_gen_muls2_i32
#define tcg_gen_atomic_cmpxchg_tl tcg_gen_atomic_cmpxchg_i32
#define tcg_gen_atomic_xchg_tl tcg_gen_atomic_xchg_i32
#define tcg_gen_atomic_fetch_add_tl tcg_gen_atomic_fetch_add_i32
#define tcg_gen_atomic_fetch_and_tl tcg_gen_atomic_fetch_and_i32
#define tcg_gen_atomic_fetch_or_tl tcg_gen_atomic_fetch_or_i32
#define tcg_gen_atomic_fetch_xor_tl tcg_gen_atomic_fetch_xor_i32
#define tcg_gen_atomic_add_fetch_tl tcg_gen_atomic_add_fetch_i32
#define tcg_gen_atomic_and_fetch_tl tcg_gen_atomic_and_fetch_i32
#define tcg_gen_atomic_or_fetch_tl tcg_gen_atomic_or_fetch_i32
#define tcg_gen_atomic_xor_fetch_tl tcg_gen_atomic_xor_fetch_i32
#endif
#if UINTPTR_MAX == UINT32_MAX


@@ -14,3 +14,112 @@ DEF_HELPER_FLAGS_2(sar_i64, TCG_CALL_NO_RWG_SE, s64, s64, s64)
DEF_HELPER_FLAGS_2(mulsh_i64, TCG_CALL_NO_RWG_SE, s64, s64, s64)
DEF_HELPER_FLAGS_2(muluh_i64, TCG_CALL_NO_RWG_SE, i64, i64, i64)
DEF_HELPER_FLAGS_1(exit_atomic, TCG_CALL_NO_WG, noreturn, env)
#ifdef CONFIG_SOFTMMU
DEF_HELPER_FLAGS_5(atomic_cmpxchgb, TCG_CALL_NO_WG,
i32, env, tl, i32, i32, i32)
DEF_HELPER_FLAGS_5(atomic_cmpxchgw_be, TCG_CALL_NO_WG,
i32, env, tl, i32, i32, i32)
DEF_HELPER_FLAGS_5(atomic_cmpxchgw_le, TCG_CALL_NO_WG,
i32, env, tl, i32, i32, i32)
DEF_HELPER_FLAGS_5(atomic_cmpxchgl_be, TCG_CALL_NO_WG,
i32, env, tl, i32, i32, i32)
DEF_HELPER_FLAGS_5(atomic_cmpxchgl_le, TCG_CALL_NO_WG,
i32, env, tl, i32, i32, i32)
#ifdef CONFIG_ATOMIC64
DEF_HELPER_FLAGS_5(atomic_cmpxchgq_be, TCG_CALL_NO_WG,
i64, env, tl, i64, i64, i32)
DEF_HELPER_FLAGS_5(atomic_cmpxchgq_le, TCG_CALL_NO_WG,
i64, env, tl, i64, i64, i32)
#endif
#ifdef CONFIG_ATOMIC64
#define GEN_ATOMIC_HELPERS(NAME) \
DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), b), \
TCG_CALL_NO_WG, i32, env, tl, i32, i32) \
DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_le), \
TCG_CALL_NO_WG, i32, env, tl, i32, i32) \
DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_be), \
TCG_CALL_NO_WG, i32, env, tl, i32, i32) \
DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_le), \
TCG_CALL_NO_WG, i32, env, tl, i32, i32) \
DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_be), \
TCG_CALL_NO_WG, i32, env, tl, i32, i32) \
DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), q_le), \
TCG_CALL_NO_WG, i64, env, tl, i64, i32) \
DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), q_be), \
TCG_CALL_NO_WG, i64, env, tl, i64, i32)
#else
#define GEN_ATOMIC_HELPERS(NAME) \
DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), b), \
TCG_CALL_NO_WG, i32, env, tl, i32, i32) \
DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_le), \
TCG_CALL_NO_WG, i32, env, tl, i32, i32) \
DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_be), \
TCG_CALL_NO_WG, i32, env, tl, i32, i32) \
DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_le), \
TCG_CALL_NO_WG, i32, env, tl, i32, i32) \
DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_be), \
TCG_CALL_NO_WG, i32, env, tl, i32, i32)
#endif /* CONFIG_ATOMIC64 */
#else
DEF_HELPER_FLAGS_4(atomic_cmpxchgb, TCG_CALL_NO_WG, i32, env, tl, i32, i32)
DEF_HELPER_FLAGS_4(atomic_cmpxchgw_be, TCG_CALL_NO_WG, i32, env, tl, i32, i32)
DEF_HELPER_FLAGS_4(atomic_cmpxchgw_le, TCG_CALL_NO_WG, i32, env, tl, i32, i32)
DEF_HELPER_FLAGS_4(atomic_cmpxchgl_be, TCG_CALL_NO_WG, i32, env, tl, i32, i32)
DEF_HELPER_FLAGS_4(atomic_cmpxchgl_le, TCG_CALL_NO_WG, i32, env, tl, i32, i32)
#ifdef CONFIG_ATOMIC64
DEF_HELPER_FLAGS_4(atomic_cmpxchgq_be, TCG_CALL_NO_WG, i64, env, tl, i64, i64)
DEF_HELPER_FLAGS_4(atomic_cmpxchgq_le, TCG_CALL_NO_WG, i64, env, tl, i64, i64)
#endif
#ifdef CONFIG_ATOMIC64
#define GEN_ATOMIC_HELPERS(NAME) \
DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), b), \
TCG_CALL_NO_WG, i32, env, tl, i32) \
DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), w_le), \
TCG_CALL_NO_WG, i32, env, tl, i32) \
DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), w_be), \
TCG_CALL_NO_WG, i32, env, tl, i32) \
DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), l_le), \
TCG_CALL_NO_WG, i32, env, tl, i32) \
DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), l_be), \
TCG_CALL_NO_WG, i32, env, tl, i32) \
DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), q_le), \
TCG_CALL_NO_WG, i64, env, tl, i64) \
DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), q_be), \
TCG_CALL_NO_WG, i64, env, tl, i64)
#else
#define GEN_ATOMIC_HELPERS(NAME) \
DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), b), \
TCG_CALL_NO_WG, i32, env, tl, i32) \
DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), w_le), \
TCG_CALL_NO_WG, i32, env, tl, i32) \
DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), w_be), \
TCG_CALL_NO_WG, i32, env, tl, i32) \
DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), l_le), \
TCG_CALL_NO_WG, i32, env, tl, i32) \
DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), l_be), \
TCG_CALL_NO_WG, i32, env, tl, i32)
#endif /* CONFIG_ATOMIC64 */
#endif /* CONFIG_SOFTMMU */
GEN_ATOMIC_HELPERS(fetch_add)
GEN_ATOMIC_HELPERS(fetch_and)
GEN_ATOMIC_HELPERS(fetch_or)
GEN_ATOMIC_HELPERS(fetch_xor)
GEN_ATOMIC_HELPERS(add_fetch)
GEN_ATOMIC_HELPERS(and_fetch)
GEN_ATOMIC_HELPERS(or_fetch)
GEN_ATOMIC_HELPERS(xor_fetch)
GEN_ATOMIC_HELPERS(xchg)
#undef GEN_ATOMIC_HELPERS


@@ -704,6 +704,7 @@ struct TCGContext {
};
extern TCGContext tcg_ctx;
extern bool parallel_cpus;
static inline void tcg_set_insn_param(int op_idx, int arg, TCGArg v)
{
@@ -1176,6 +1177,90 @@ uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr,
# define helper_ret_ldq_cmmu helper_le_ldq_cmmu
#endif
uint32_t helper_atomic_cmpxchgb_mmu(CPUArchState *env, target_ulong addr,
uint32_t cmpv, uint32_t newv,
TCGMemOpIdx oi, uintptr_t retaddr);
uint32_t helper_atomic_cmpxchgw_le_mmu(CPUArchState *env, target_ulong addr,
uint32_t cmpv, uint32_t newv,
TCGMemOpIdx oi, uintptr_t retaddr);
uint32_t helper_atomic_cmpxchgl_le_mmu(CPUArchState *env, target_ulong addr,
uint32_t cmpv, uint32_t newv,
TCGMemOpIdx oi, uintptr_t retaddr);
uint64_t helper_atomic_cmpxchgq_le_mmu(CPUArchState *env, target_ulong addr,
uint64_t cmpv, uint64_t newv,
TCGMemOpIdx oi, uintptr_t retaddr);
uint32_t helper_atomic_cmpxchgw_be_mmu(CPUArchState *env, target_ulong addr,
uint32_t cmpv, uint32_t newv,
TCGMemOpIdx oi, uintptr_t retaddr);
uint32_t helper_atomic_cmpxchgl_be_mmu(CPUArchState *env, target_ulong addr,
uint32_t cmpv, uint32_t newv,
TCGMemOpIdx oi, uintptr_t retaddr);
uint64_t helper_atomic_cmpxchgq_be_mmu(CPUArchState *env, target_ulong addr,
uint64_t cmpv, uint64_t newv,
TCGMemOpIdx oi, uintptr_t retaddr);
#define GEN_ATOMIC_HELPER(NAME, TYPE, SUFFIX) \
TYPE helper_atomic_ ## NAME ## SUFFIX ## _mmu \
(CPUArchState *env, target_ulong addr, TYPE val, \
TCGMemOpIdx oi, uintptr_t retaddr);
#ifdef CONFIG_ATOMIC64
#define GEN_ATOMIC_HELPER_ALL(NAME) \
GEN_ATOMIC_HELPER(NAME, uint32_t, b) \
GEN_ATOMIC_HELPER(NAME, uint32_t, w_le) \
GEN_ATOMIC_HELPER(NAME, uint32_t, w_be) \
GEN_ATOMIC_HELPER(NAME, uint32_t, l_le) \
GEN_ATOMIC_HELPER(NAME, uint32_t, l_be) \
GEN_ATOMIC_HELPER(NAME, uint64_t, q_le) \
GEN_ATOMIC_HELPER(NAME, uint64_t, q_be)
#else
#define GEN_ATOMIC_HELPER_ALL(NAME) \
GEN_ATOMIC_HELPER(NAME, uint32_t, b) \
GEN_ATOMIC_HELPER(NAME, uint32_t, w_le) \
GEN_ATOMIC_HELPER(NAME, uint32_t, w_be) \
GEN_ATOMIC_HELPER(NAME, uint32_t, l_le) \
GEN_ATOMIC_HELPER(NAME, uint32_t, l_be)
#endif
GEN_ATOMIC_HELPER_ALL(fetch_add)
GEN_ATOMIC_HELPER_ALL(fetch_sub)
GEN_ATOMIC_HELPER_ALL(fetch_and)
GEN_ATOMIC_HELPER_ALL(fetch_or)
GEN_ATOMIC_HELPER_ALL(fetch_xor)
GEN_ATOMIC_HELPER_ALL(add_fetch)
GEN_ATOMIC_HELPER_ALL(sub_fetch)
GEN_ATOMIC_HELPER_ALL(and_fetch)
GEN_ATOMIC_HELPER_ALL(or_fetch)
GEN_ATOMIC_HELPER_ALL(xor_fetch)
GEN_ATOMIC_HELPER_ALL(xchg)
#undef GEN_ATOMIC_HELPER_ALL
#undef GEN_ATOMIC_HELPER
#endif /* CONFIG_SOFTMMU */
#ifdef CONFIG_ATOMIC128
#include "qemu/int128.h"
/* These aren't really "proper" helpers because TCG cannot manage Int128.
However, use the same format as the others, for use by the backends. */
Int128 helper_atomic_cmpxchgo_le_mmu(CPUArchState *env, target_ulong addr,
Int128 cmpv, Int128 newv,
TCGMemOpIdx oi, uintptr_t retaddr);
Int128 helper_atomic_cmpxchgo_be_mmu(CPUArchState *env, target_ulong addr,
Int128 cmpv, Int128 newv,
TCGMemOpIdx oi, uintptr_t retaddr);
Int128 helper_atomic_ldo_le_mmu(CPUArchState *env, target_ulong addr,
TCGMemOpIdx oi, uintptr_t retaddr);
Int128 helper_atomic_ldo_be_mmu(CPUArchState *env, target_ulong addr,
TCGMemOpIdx oi, uintptr_t retaddr);
void helper_atomic_sto_le_mmu(CPUArchState *env, target_ulong addr, Int128 val,
TCGMemOpIdx oi, uintptr_t retaddr);
void helper_atomic_sto_be_mmu(CPUArchState *env, target_ulong addr, Int128 val,
TCGMemOpIdx oi, uintptr_t retaddr);
#endif /* CONFIG_ATOMIC128 */
#endif /* TCG_H */

tests/.gitignore

@@ -1,3 +1,4 @@
atomic_add-bench
check-qdict
check-qfloat
check-qint


@@ -460,7 +460,8 @@ test-obj-y = tests/check-qint.o tests/check-qstring.o tests/check-qdict.o \
tests/test-opts-visitor.o tests/test-qmp-event.o \
tests/rcutorture.o tests/test-rcu-list.o \
tests/test-qdist.o \
tests/test-qht.o tests/qht-bench.o tests/test-qht-par.o
tests/test-qht.o tests/qht-bench.o tests/test-qht-par.o \
tests/atomic_add-bench.o
$(test-obj-y): QEMU_INCLUDES += -Itests
QEMU_CFLAGS += -I$(SRC_PATH)/tests
@@ -507,6 +508,7 @@ tests/test-qht$(EXESUF): tests/test-qht.o $(test-util-obj-y)
tests/test-qht-par$(EXESUF): tests/test-qht-par.o tests/qht-bench$(EXESUF) $(test-util-obj-y)
tests/qht-bench$(EXESUF): tests/qht-bench.o $(test-util-obj-y)
tests/test-bufferiszero$(EXESUF): tests/test-bufferiszero.o $(test-util-obj-y)
tests/atomic_add-bench$(EXESUF): tests/atomic_add-bench.o $(test-util-obj-y)
tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \
hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\

tests/atomic_add-bench.c

@@ -0,0 +1,163 @@
#include "qemu/osdep.h"
#include "qemu/thread.h"
#include "qemu/host-utils.h"
#include "qemu/processor.h"
struct thread_info {
uint64_t r;
} QEMU_ALIGNED(64);
struct count {
unsigned long val;
} QEMU_ALIGNED(64);
static QemuThread *threads;
static struct thread_info *th_info;
static unsigned int n_threads = 1;
static unsigned int n_ready_threads;
static struct count *counts;
static unsigned int duration = 1;
static unsigned int range = 1024;
static bool test_start;
static bool test_stop;
static const char commands_string[] =
" -n = number of threads\n"
" -d = duration in seconds\n"
" -r = range (will be rounded up to pow2)";
static void usage_complete(char *argv[])
{
fprintf(stderr, "Usage: %s [options]\n", argv[0]);
fprintf(stderr, "options:\n%s\n", commands_string);
}
/*
* From: https://en.wikipedia.org/wiki/Xorshift
* This is faster than rand_r(), and gives us a wider range (RAND_MAX is only
* guaranteed to be >= 32767).
*/
static uint64_t xorshift64star(uint64_t x)
{
x ^= x >> 12; /* a */
x ^= x << 25; /* b */
x ^= x >> 27; /* c */
return x * UINT64_C(2685821657736338717);
}
static void *thread_func(void *arg)
{
struct thread_info *info = arg;
atomic_inc(&n_ready_threads);
while (!atomic_read(&test_start)) {
cpu_relax();
}
while (!atomic_read(&test_stop)) {
unsigned int index;
info->r = xorshift64star(info->r);
index = info->r & (range - 1);
atomic_inc(&counts[index].val);
}
return NULL;
}
static void run_test(void)
{
unsigned int remaining;
unsigned int i;
while (atomic_read(&n_ready_threads) != n_threads) {
cpu_relax();
}
atomic_set(&test_start, true);
do {
remaining = sleep(duration);
} while (remaining);
atomic_set(&test_stop, true);
for (i = 0; i < n_threads; i++) {
qemu_thread_join(&threads[i]);
}
}
static void create_threads(void)
{
unsigned int i;
threads = g_new(QemuThread, n_threads);
th_info = g_new(struct thread_info, n_threads);
counts = qemu_memalign(64, sizeof(*counts) * range);
memset(counts, 0, sizeof(*counts) * range);
for (i = 0; i < n_threads; i++) {
struct thread_info *info = &th_info[i];
info->r = (i + 1) ^ time(NULL);
qemu_thread_create(&threads[i], NULL, thread_func, info,
QEMU_THREAD_JOINABLE);
}
}
static void pr_params(void)
{
printf("Parameters:\n");
printf(" # of threads: %u\n", n_threads);
printf(" duration: %u\n", duration);
printf(" ops' range: %u\n", range);
}
static void pr_stats(void)
{
unsigned long long val = 0;
unsigned int i;
double tx;
for (i = 0; i < range; i++) {
val += counts[i].val;
}
tx = val / duration / 1e6;
printf("Results:\n");
printf("Duration: %u s\n", duration);
printf(" Throughput: %.2f Mops/s\n", tx);
printf(" Throughput/thread: %.2f Mops/s/thread\n", tx / n_threads);
}
static void parse_args(int argc, char *argv[])
{
int c;
for (;;) {
c = getopt(argc, argv, "hd:n:r:");
if (c < 0) {
break;
}
switch (c) {
case 'h':
usage_complete(argv);
exit(0);
case 'd':
duration = atoi(optarg);
break;
case 'n':
n_threads = atoi(optarg);
break;
case 'r':
range = pow2ceil(atoi(optarg));
break;
}
}
}
int main(int argc, char *argv[])
{
parse_args(argc, argv);
pr_params();
create_threads();
run_test();
pr_stats();
return 0;
}
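Once the Makefile hunk above has been built, the benchmark can be run straight from the build tree, for example as ./tests/atomic_add-bench -n 4 -d 2 -r 4096 (the option letters come from parse_args() above; the concrete values are only illustrative). That invocation starts four worker threads for two seconds, each atomically incrementing a randomly chosen counter in a 4096-entry, already power-of-two array. To make pr_stats() concrete: if the threads managed 40 million increments in those 2 seconds, tx = 40e6 / 2 / 1e6, so the program would report a throughput of 20.00 Mops/s and 5.00 Mops/s/thread.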

View File

@@ -41,7 +41,7 @@ static Int128 expand(uint32_t x)
uint64_t l, h;
l = expand16(x & 65535);
h = expand16(x >> 16);
return (Int128) {l, h};
return (Int128) int128_make128(l, h);
};
static void test_and(void)
@@ -54,8 +54,8 @@ static void test_and(void)
Int128 b = expand(tests[j]);
Int128 r = expand(tests[i] & tests[j]);
Int128 s = int128_and(a, b);
g_assert_cmpuint(r.lo, ==, s.lo);
g_assert_cmpuint(r.hi, ==, s.hi);
g_assert_cmpuint(int128_getlo(r), ==, int128_getlo(s));
g_assert_cmpuint(int128_gethi(r), ==, int128_gethi(s));
}
}
}
@@ -70,8 +70,8 @@ static void test_add(void)
Int128 b = expand(tests[j]);
Int128 r = expand(tests[i] + tests[j]);
Int128 s = int128_add(a, b);
g_assert_cmpuint(r.lo, ==, s.lo);
g_assert_cmpuint(r.hi, ==, s.hi);
g_assert_cmpuint(int128_getlo(r), ==, int128_getlo(s));
g_assert_cmpuint(int128_gethi(r), ==, int128_gethi(s));
}
}
}
@@ -86,8 +86,8 @@ static void test_sub(void)
Int128 b = expand(tests[j]);
Int128 r = expand(tests[i] - tests[j]);
Int128 s = int128_sub(a, b);
g_assert_cmpuint(r.lo, ==, s.lo);
g_assert_cmpuint(r.hi, ==, s.hi);
g_assert_cmpuint(int128_getlo(r), ==, int128_getlo(s));
g_assert_cmpuint(int128_gethi(r), ==, int128_gethi(s));
}
}
}
@@ -100,8 +100,8 @@ static void test_neg(void)
Int128 a = expand(tests[i]);
Int128 r = expand(-tests[i]);
Int128 s = int128_neg(a);
g_assert_cmpuint(r.lo, ==, s.lo);
g_assert_cmpuint(r.hi, ==, s.hi);
g_assert_cmpuint(int128_getlo(r), ==, int128_getlo(s));
g_assert_cmpuint(int128_gethi(r), ==, int128_gethi(s));
}
}
@@ -180,8 +180,8 @@ test_rshift_one(uint32_t x, int n, uint64_t h, uint64_t l)
{
Int128 a = expand(x);
Int128 r = int128_rshift(a, n);
g_assert_cmpuint(r.lo, ==, l);
g_assert_cmpuint(r.hi, ==, h);
g_assert_cmpuint(int128_getlo(r), ==, l);
g_assert_cmpuint(int128_gethi(r), ==, h);
}
static void test_rshift(void)
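All of these hunks make the same substitution: the tests stop reading Int128's lo/hi fields directly and go through int128_getlo()/int128_gethi() (plus int128_make128() for construction), so they keep compiling when Int128 is a native 128-bit integer rather than a two-member struct. A minimal self-contained sketch of that native-type case, assuming a GCC/Clang __int128 and written only to illustrate why the field accesses had to go (it is not a copy of QEMU's int128.h):

#include <stdint.h>

/* With a native 128-bit type there are no .lo/.hi members to poke at,
 * so construction and inspection must go through helper functions. */
typedef __int128_t Int128;

static inline Int128 int128_make128(uint64_t lo, uint64_t hi)
{
    return ((__uint128_t)hi << 64) | lo;
}

static inline uint64_t int128_getlo(Int128 a)
{
    return (uint64_t)a;
}

static inline uint64_t int128_gethi(Int128 a)
{
    return (uint64_t)((__uint128_t)a >> 64);
}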

View File

@@ -118,6 +118,7 @@ static void *l1_map[V_L1_MAX_SIZE];
/* code generation context */
TCGContext tcg_ctx;
bool parallel_cpus;
/* translation block context */
#ifdef CONFIG_USER_ONLY

View File

@@ -369,10 +369,10 @@ static void curses_setup(void)
/* ACS_* is not constant. So, we can't initialize statically. */
vga_to_curses['\0'] = ' ';
vga_to_curses[0x04] = ACS_DIAMOND;
vga_to_curses[0x0a] = ACS_RARROW;
vga_to_curses[0x0b] = ACS_LARROW;
vga_to_curses[0x18] = ACS_UARROW;
vga_to_curses[0x19] = ACS_DARROW;
vga_to_curses[0x1a] = ACS_RARROW;
vga_to_curses[0x1b] = ACS_LARROW;
vga_to_curses[0x9c] = ACS_STERLING;
vga_to_curses[0xb0] = ACS_BOARD;
vga_to_curses[0xb1] = ACS_CKBOARD;
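The table patched here maps VGA font glyph indices to curses chtype values at display time. A hypothetical consumer is sketched below purely to show the lookup-with-fallback idea; the helper name and fallback policy are assumptions for illustration, not code from ui/curses.c:

#include <curses.h>

/* Illustrative only: a 256-entry translation table like the one being
 * patched, and a helper that falls back to the raw byte for glyphs that
 * have no special curses mapping. */
static chtype vga_to_curses[256];

static chtype glyph_to_chtype(unsigned char glyph)
{
    return vga_to_curses[glyph] ? vga_to_curses[glyph] : glyph;
}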

View File

@@ -912,9 +912,28 @@ static gboolean gd_motion_event(GtkWidget *widget, GdkEventMotion *motion,
if (!qemu_input_is_absolute() && s->ptr_owner == vc) {
GdkScreen *screen = gtk_widget_get_screen(vc->gfx.drawing_area);
int screen_width, screen_height;
int x = (int)motion->x_root;
int y = (int)motion->y_root;
#if GTK_CHECK_VERSION(3, 22, 0)
{
GdkDisplay *dpy = gtk_widget_get_display(widget);
GdkWindow *win = gtk_widget_get_window(widget);
GdkMonitor *monitor = gdk_display_get_monitor_at_window(dpy, win);
GdkRectangle geometry;
gdk_monitor_get_geometry(monitor, &geometry);
screen_width = geometry.width;
screen_height = geometry.height;
}
#else
{
screen_width = gdk_screen_get_width(screen);
screen_height = gdk_screen_get_height(screen);
}
#endif
/* In relative mode check to see if client pointer hit
* one of the screen edges, and if so move it back by
* 200 pixels. This is important because the pointer
@@ -928,10 +947,10 @@ static gboolean gd_motion_event(GtkWidget *widget, GdkEventMotion *motion,
if (y == 0) {
y += 200;
}
if (x == (gdk_screen_get_width(screen) - 1)) {
if (x == (screen_width - 1)) {
x -= 200;
}
if (y == (gdk_screen_get_height(screen) - 1)) {
if (y == (screen_height - 1)) {
y -= 200;
}
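The geometry lookup that replaces the deprecated gdk_screen_get_width()/gdk_screen_get_height() calls can also be read in isolation as a small helper. The GDK functions below are the real GTK+ 3.22 API used in the hunk above; the helper itself is only an illustrative sketch, not a function from ui/gtk.c:

#include <gtk/gtk.h>

/* Return the geometry of the monitor showing 'widget', preferring the
 * GdkMonitor API that superseded gdk_screen_get_width() in GTK+ 3.22. */
static void get_monitor_size(GtkWidget *widget, int *width, int *height)
{
#if GTK_CHECK_VERSION(3, 22, 0)
    GdkDisplay *dpy = gtk_widget_get_display(widget);
    GdkWindow *win = gtk_widget_get_window(widget);
    GdkMonitor *monitor = gdk_display_get_monitor_at_window(dpy, win);
    GdkRectangle geometry;

    gdk_monitor_get_geometry(monitor, &geometry);
    *width = geometry.width;
    *height = geometry.height;
#else
    GdkScreen *screen = gtk_widget_get_screen(widget);

    *width = gdk_screen_get_width(screen);
    *height = gdk_screen_get_height(screen);
#endif
}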
@@ -1051,7 +1070,9 @@ static gboolean gd_text_key_down(GtkWidget *widget,
VirtualConsole *vc = opaque;
QemuConsole *con = vc->gfx.dcl.con;
if (key->length) {
if (key->keyval == GDK_KEY_Delete) {
kbd_put_qcode_console(con, Q_KEY_CODE_DELETE);
} else if (key->length) {
kbd_put_string_console(con, key->string, key->length);
} else {
int num = gd_map_keycode(vc->s, gtk_widget_get_display(widget),